Python Crawler: Downloading Videos from Popular Short-Video Platforms

Posted by 爬虫小白0514


1. Development Tools

Python 3.9

the requests library

a few Python standard-library modules (re, os, tkinter, threading, time)

PyCharm

2. Third-Party Libraries

Install the only third-party dependency:

pip install requests

3. Approach

1. Use the tkinter library to build a GUI with prompt labels, input boxes, radio buttons, and action buttons.

2. Send a GET request with requests and resolve the video's download link.

3. Save the downloaded file locally. (A minimal sketch of steps 2 and 3 follows.)
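
Every platform-specific crawler below boils down to the same pattern: resolve a direct media URL, then write the response bytes to disk. A minimal sketch of that pattern, assuming video_url is already a direct .mp4 link that needs no cookies:

import os
import requests

def download(video_url, name, folder='video\\'):
    # Create the output folder on first use
    if not os.path.exists(folder):
        os.mkdir(folder)
    # Stream the response so a large video is not held in memory all at once
    with requests.get(video_url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(folder + name + '.mp4', mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)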


4. Results

(Screenshots of the finished GUI and the downloaded files are omitted here.)

5. Implementation

1. Bilibili video crawler

import requests
import re
import os

# Create the video folder if it does not already exist
filename = 'video\\'
if not os.path.exists(filename):
    os.mkdir(filename)

# Crawler function, called from the main program
def UrlCrawler(url, name):
    # Request headers (the cookie expires, so keep it up to date)
    headers = {
        "cookie": "buvid3=0D3353AC-5B77-680A-697F-8B66493826D160198infoc; b_nut=1670493160; CURRENT_FNVAL=4048; _uuid=AA102510B8-6113-12F5-10674-C7E67642D65561585infoc; rpdid=|(YukRR|mR|0J'uY~|RmJuYk; i-wanna-go-back=-1; fingerprint=9a9c4cc60b4c3b41bce4cf46c57c55ea; buvid_fp_plain=undefined; buvid4=60372B7A-A671-65D8-6993-2FB5D6E3B2CD61117-022120817-lih1xoB%2FrWiTqxe5epW4Zg%3D%3D; buvid_fp=9a9c4cc60b4c3b41bce4cf46c57c55ea; nostalgia_conf=-1; DedeUserID=3493087921833988; DedeUserID__ckMd5=f79b1c5b37110e69; b_ut=5; bp_video_offset_3493087921833988=undefined; PVID=1; SESSDATA=9b3f3db2%2C1691137529%2C03f5b%2A22; bili_jct=5587e773eeb7161f147d72322112dd01; b_lsid=46D44D61_18620C2A453; innersign=1; sid=nxtxooce",
        "origin": "https://www.bilibili.com",
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36",
    }
    try:
        requests.head(url=url)
    except requests.exceptions.ConnectionError:
        return "Error"
    else:
        # Parse the video address out of the page source
        response = requests.get(url=url, headers=headers)
        date = re.findall('readyVideoUrl":"(.*?)","readyDuration', response.text)[0]
        date1 = requests.get(url=date, headers=headers).content
        # Write the downloaded data to a file
        with open('video\\' + name + '.mp4', mode='wb') as f:
            f.write(date1)
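
A hypothetical call for reference (the BV id is a placeholder; the cookie above must still be valid for the request to succeed):

result = UrlCrawler('https://www.bilibili.com/video/BVxxxxxxxxxx', 'demo')
if result == "Error":
    print('Connection failed; check the link and the network.')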

2. Douyin video crawler

import requests
import re
import os

# Create the video folder if it does not already exist
filename = 'video\\'
if not os.path.exists(filename):
    os.mkdir(filename)

# Crawler function, called from the main program
def TikTok(url, name):
    response = requests.get(url=url)
    response = response.url
    # Check that the link passed in from the main program is valid
    try:
        id = re.findall(r'video/(.*)/\?', response)[0]
    except IndexError:
        return 1
    # e.g. https://www.douyin.com/video/7197438641520610595
    print(id)
    Url = 'https://www.douyin.com/video/' + id
    # Request headers (the cookie expires, so keep it up to date)
    headers = {
        'cookie': 'douyin.com; ttcid=7a9f94f5337c4fb6a4937e5937748bb021; passport_csrf_token=a0ac1ccb642a36adda5944f1c015d48e; passport_csrf_token_default=a0ac1ccb642a36adda5944f1c015d48e; s_v_web_id=verify_lcbu5s5w_28wEiVmM_z3Vd_40ES_B51w_NzAlySlNxpX2; xgplayer_user_id=242308523073; ttwid=1%7CC40qHPAKUiS-rZzMjoNnaRQLVAmjwFKkQYJKPFgoP8w%7C1675328733%7C7c9d6834b4963ac8874725a61b87e3dd9557431c57f3f7751fac875fdc5db078; d_ticket=3c3e91316b0d2f2293d28fe6652d905cc7869; passport_assist_user=CkEfjpnNrq_dwTxeUvZDgvxiJJGX4vK_V2bvkG4hXT2Y93RD7N7Wpv9DwPWJ2-RIiM6ryeR0t7a7jrfKZx1645n2bRpICjwfL4B7J61soinEUkLo0zIVrDu52cNfrSTRckp__Zh7a6qAaiefB0n-jw85LEuN7fTxx3zgjB_uycKsyg4Qrb-oDRiJr9ZUIgEDxmngyQ%3D%3D; n_mh=EyHH0OrPqAYMNqnG7-FHeaRIMyHdxcz5bczy1ihDzJY; sso_auth_status=44d73d0e0c92093e78edc25f7dbd4ffe; sso_auth_status_ss=44d73d0e0c92093e78edc25f7dbd4ffe; sso_uid_tt=ace0d555e37a065789e9cfd86cc68d49; sso_uid_tt_ss=ace0d555e37a065789e9cfd86cc68d49; toutiao_sso_user=4410a2614fa77ef6cbbe4c90d8319abe; toutiao_sso_user_ss=4410a2614fa77ef6cbbe4c90d8319abe; sid_ucp_sso_v1=1.0.0-KDIwNGI4ZjQzMDZjZWY3ZjVmOWIxNTExNjQwOTcyMmMzNTA4MDE4OTEKHwjohYGDiY3dAxD47_6eBhjvMSAMMISOpJQGOAJA8QcaAmxmIiA0NDEwYTI2MTRmYTc3ZWY2Y2JiZTRjOTBkODMxOWFiZQ; ssid_ucp_sso_v1=1.0.0-KDIwNGI4ZjQzMDZjZWY3ZjVmOWIxNTExNjQwOTcyMmMzNTA4MDE4OTEKHwjohYGDiY3dAxD47_6eBhjvMSAMMISOpJQGOAJA8QcaAmxmIiA0NDEwYTI2MTRmYTc3ZWY2Y2JiZTRjOTBkODMxOWFiZQ; odin_tt=07cb42e67dc0b9fa65d040b535cc327a7a483b2242828f4c84668fe6a4fcae69b5a3ef54e4dfbdba322ef19aab7c1e73a008921d15e1a48378be1dd2dfa28fb1; passport_auth_status=61c50bbfa9400bbf2fef96292e2465be%2Cded43c4315dd52db289d582351d64d63; passport_auth_status_ss=61c50bbfa9400bbf2fef96292e2465be%2Cded43c4315dd52db289d582351d64d63; uid_tt=159fc791276b24a5528a79ac5776dcf7; uid_tt_ss=159fc791276b24a5528a79ac5776dcf7; sid_tt=748a279e94b51380eefa350d30df8041; sessionid=748a279e94b51380eefa350d30df8041; sessionid_ss=748a279e94b51380eefa350d30df8041; _tea_utm_cache_2018=undefined; LOGIN_STATUS=1; store-region=cn-gs; store-region-src=uid; sid_guard=748a279e94b51380eefa350d30df8041%7C1675606018%7C5183990%7CThu%2C+06-Apr-2023+14%3A06%3A48+GMT; sid_ucp_v1=1.0.0-KGFjZTk1YjdlZTQzZjhjMmM0NDM4MDYxMGExNmJiNTQyYjBjZDZiYTEKGQjohYGDiY3dAxCC8P6eBhjvMSAMOAJA8QcaAmhsIiA3NDhhMjc5ZTk0YjUxMzgwZWVmYTM1MGQzMGRmODA0MQ; ssid_ucp_v1=1.0.0-KGFjZTk1YjdlZTQzZjhjMmM0NDM4MDYxMGExNmJiNTQyYjBjZDZiYTEKGQjohYGDiY3dAxCC8P6eBhjvMSAMOAJA8QcaAmhsIiA3NDhhMjc5ZTk0YjUxMzgwZWVmYTM1MGQzMGRmODA0MQ; download_guide=%223%2F20230205%22; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAAbAE0Tv5yvVMPsAjb-4wSSB90utPobsmULQ_7kgLejFDqnfufpNzDNwIfkvJpjuCt%2F1675699200000%2F1675620287678%2F0%2F1675647531415%22; SEARCH_RESULT_LIST_TYPE=%22single%22; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAAbAE0Tv5yvVMPsAjb-4wSSB90utPobsmULQ_7kgLejFDqnfufpNzDNwIfkvJpjuCt%2F1675699200000%2F1675649307074%2F1675649217197%2F0%22; live_can_add_dy_2_desktop=%220%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1676254485724%2C%22type%22%3A1%7D; __ac_nonce=063e3575c005659758d70; __ac_signature=_02B4Z6wo00f01D6bw0AAAIDBLsUmeUz5Ijg-u8fAAGxMkIVlgPVhkXvACKDrW5PQhox9NT7.sU9JfmICX4vwHkzh6YJTURiVvfV0V6JSqJjgtexaAwvibswH5m4jxG-hbyvx.CQFY7vWHr9Obb; passport_fe_beating_status=true; csrf_session_id=7b1abe19e2b6358087568b75dd1a0f95; strategyABtestKey=%221675844690.634%22; home_can_add_dy_2_desktop=%221%22; msToken=GJXwPYvB3xxwqGpTA9SHiEyyNOtqkIOLQ-aC53WzuItS77HThruQXqUa8KWSorSeTMCWREe_-H06gJ1D4iOk4wV1iOiJT6wRTyo_nTX7c129ED0TB2BjmeLdw5qIWaQ=; msToken=6p8d3ygLZuKLiISQm_63XijKvLSI0sqW04sHI1LzOhZLbRhIaYsqS59QJwZs6y6eEmEYSAuTNpmz9BhVG0t5I1LuUvaWbBxZyrCjlItMH9yZm2RaYk9ZonDx62JygVw=; tt_scid=2FhmuwuvP-leuEyOg46jFNIcPED5l4jUxFsh3H9PwiHLvTImQ1lgmXM5N3.33RFac36f',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
    }
    response = requests.get(url=Url, headers=headers)
    date = response.text
    # Parse the RENDER_DATA JSON embedded in the redirected page
    try:
        html_date = re.findall('<script id="RENDER_DATA" type="application/json">(.*?)</script', date)[0]
    except IndexError:
        return 1
    html_date = requests.utils.unquote(html_date)
    try:
        date1 = re.findall('playAddr(.*?),', html_date)[0]
        # Parse out the video address
        video_url = 'https:' + re.findall('"src":"(.*?)"', date1)[0]
    except IndexError:
        return 1
    print(video_url)
    video = requests.get(url=video_url, headers=headers)
    date1 = video.content
    with open('video\\' + name + '.mp4', mode='wb') as f:
        f.write(date1)
        return 0
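
Regexing into the decoded RENDER_DATA string is brittle; since it URL-decodes to JSON, a more robust sketch is to parse it and search for the key. Where 'playAddr' actually sits is an assumption here, since the page's exact schema isn't shown in this post:

import json

def find_key(obj, key):
    # Depth-first search of nested dicts/lists for the first value stored under `key`
    if isinstance(obj, dict):
        if key in obj:
            return obj[key]
        obj = list(obj.values())
    if isinstance(obj, list):
        for item in obj:
            found = find_key(item, key)
            if found is not None:
                return found
    return None

# In TikTok() this would replace the two re.findall calls:
# data = json.loads(requests.utils.unquote(html_date))
# print(find_key(data, 'playAddr'))
print(find_key(json.loads('{"page": [{"playAddr": [{"src": "//demo/video.mp4"}]}]}'), 'playAddr'))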

3. Kuaishou video crawler

import requests
import re
import os

# Create the video folder if it does not already exist
filename = 'video\\'
if not os.path.exists(filename):
    os.mkdir(filename)

def kuaishou(url, name):
    # Parse the subdomain keyword out of the video link
    judgement_date = re.findall(r'//(.*?)\.kuaishou\.com', url)[0]
    print(judgement_date)
    # The keyword tells us where the link came from (PC web vs. Android app share)
    if judgement_date == 'v':
        headers = {
            'Cookie': 'did=web_c449a2a18b2b6ce9264294f6ae305723; didv=1675675109000',
            'Host': 'v.kuaishou.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        }
        # Resolve the redirected address
        reresponse = requests.get(url=url, headers=headers)
        url1 = reresponse.url
        url2 = re.findall('photo(.*)', url1)[0]
        # Rebuild the PC short-video URL
        url3 = 'https://www.kuaishou.com/short-video' + url2 + '&utm_source=app_share&utm_medium=app_share&utm_campaign=app_share&location=app_share'
        headers1 = {
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_c449a2a18b2b6ce9264294f6ae305723; didv=1675675109000; kpn=KUAISHOU_VISION',
            'Host': 'www.kuaishou.com',
            'Referer': 'https://kphm5nf3.m.chenzhongtech.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        }
        reresponse = requests.get(url=url3, headers=headers1)
        date = re.findall('"representation":(.*?)"backupUrl":', reresponse.text)[0]
        date = re.findall('"url":"(.*?)",', date)[0]
        # Turn the \uXXXX escape sequences in the JSON back into real characters
        video_url = date.encode('utf-8').decode('unicode_escape')
        headers2 = {
            'origin': 'https://www.kuaishou.com',
            'referer': 'https://www.kuaishou.com/short-video/3xwrthmgyqg3fvk?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&docId=9&kpn=NEBULA&subBiz=BROWSE_SLIDE_PHOTO&photoId=3xwrthmgyqg3fvk&shareId=17357027357264&shareToken=X-34jhKPf33QT2fT&shareResourceType=PHOTO_OTHER&userId=3x64pacztgf3z2q&shareType=1&et=1_i%2F2001957724826908914_sl6508bl%24s&shareMode=APP&originShareId=17357027357264&appType=21&shareObjectId=5204190936495876828&shareUrlOpened=0&timestamp=1675943654763&utm_source=app_share&utm_medium=app_share&utm_campaign=app_share&location=app_share',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        }
        date1 = requests.get(url=video_url, headers=headers2)
        date1 = date1.content
        with open('video\\' + name + '.mp4', mode='wb') as f:
            f.write(date1)
    elif judgement_date == 'www':
        headers = {
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_c449a2a18b2b6ce9264294f6ae305723; didv=1675675109000; userId=3303107795; kpn=KUAISHOU_VISION; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqAB4kEeE5ZGYsgUmcOaQwDSehsaKGZt2r70Wor6gLe3oXplBL9DZkFXFF8OTo3xCjXuELSpsclmnaq5YJ5qogAv97vzwiopbY7iv0Z4exgMIuQK2nxD4cabvEFGNJdUtnfahJkQpp8zZko5-oACQKxUfhQZowUpqxEByX1x8SkRi6dX17oHOaNeGjul_YRT7g4h6rrh_QoMidetWqPNzUiHVxoSsguEA2pmac6i3oLJsA9rNwKEIiBRsnjw8H54hs6pJjw4Yj19heswDBgDRePckr2ppBA8ECgFMAE; kuaishou.server.web_ph=d031276ae48aadaed85f1733fb5804924f4e',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        }
        # Sample share links:
        # url = 'https://www.kuaishou.com/f/X4WR6tV40LEX1N6'
        # https://v.kuaishou.com/JeArJH
        response = requests.get(url=url, headers=headers)
        date = response.url
        response1 = requests.get(url=date, headers=headers)
        html_date = response1.text
        html_date = re.findall('"representation":(.*?)"backupUrl"', html_date)[0]
        html_date = re.findall('"url":"(.*?)",', html_date)[0]
        video_url = html_date.encode('utf-8').decode('unicode_escape')
        headers1 = {
            'origin': 'https://www.kuaishou.com',
            'range': 'bytes=0-',
            'referer': 'https://www.kuaishou.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
        }
        date1 = requests.get(url=video_url, headers=headers1)
        date1 = date1.content
        with open('video\\' + name + '.mp4', mode='wb') as f:
            f.write(date1)
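
Both branches rely on the encode('utf-8').decode('unicode_escape') round-trip: the page JSON escapes / as \u002F, and the unicode_escape codec turns those sequences back into literal characters. A quick illustration, with example.com standing in for the real CDN host:

s = 'https:\\u002F\\u002Fexample.com\\u002Fvideo.mp4'
print(s.encode('utf-8').decode('unicode_escape'))
# prints: https://example.com/video.mp4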

4. Pipixia video crawler

import requests
import re
import os

# Create the video folder if it does not already exist
filename = 'video\\'
if not os.path.exists(filename):
    os.mkdir(filename)

def pipixia(url, name):
    print(url)
    # Request headers (keep the cookie up to date)
    headers = {
        "cookie": "_ga=GA1.2.819352039.1675836585; _gid=GA1.2.815125706.1675836585; MONITOR_WEB_ID=50071578-68a0-493a-8db2-3eb818b40948",
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36",
    }
    response_home_page = requests.get(url=url, headers=headers)
    date = response_home_page.url
    # Regex the item id out of the redirected address
    id = re.findall(r'item/(.*?)\?', date)[0]
    # Build the detail-API address
    url_1 = f"https://h5.pipix.com/bds/webapi/item/detail/?item_id={id}&source=share"
    response_url = requests.get(url=url_1, headers=headers)
    date = response_url.text
    # Regex the video address out of the response
    video_url = re.findall('http://v6(.*?)u', date)[0]
    video_url = 'http://v6' + video_url
    video_response = requests.get(url=video_url, headers=headers)
    # Write the data to a file
    video_date = video_response.content
    with open('video\\' + name + '.mp4', mode='wb') as f:
        f.write(video_date)

5. Haokan video crawler

import requests
import re
import os

# Create the video folder if it does not already exist
filename = 'video\\'
if not os.path.exists(filename):
    os.mkdir(filename)

def Crawler(url, name):
    # Request headers (keep the cookie up to date)
    headers = {
        'cookie': 'BIDUPSID=C4EC17093B39D9C14DA27F2C5EB5E971; PSTM=1665404090; BAIDUID=C4EC17093B39D9C1AE6281A256712DF2:FG=1; BAIDUID_BFESS=C4EC17093B39D9C1AE6281A256712DF2:FG=1; ZFY=S73y0dle7IUenE66ZTiRDG5oPbcGuVoCVD3d1DpoCl8:C; __bid_n=184432843e24d427a34207; PC_TAB_LOG=video_details_page; COMMON_LID=20d6fdf7fb2da7f9480d684df4d1ccbf; Hm_lvt_4aadd610dfd2f5972f1efee2653a2bc5=1675650075,1675659083; BDUSS=1JuMmRQb1o2YlloajV3Y1NsdS05V0lxQkFSb0hmSjFtcG94UUZGVFpPSnRGQWhrRUFBQUFBJCQAAAAAAQAAAAEAAABjUK55aGVoc3ZzODUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG2H4GNth-Bja; BDUSS_BFESS=1JuMmRQb1o2YlloajV3Y1NsdS05V0lxQkFSb0hmSjFtcG94UUZGVFpPSnRGQWhrRUFBQUFBJCQAAAAAAQAAAAEAAABjUK55aGVoc3ZzODUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG2H4GNth-Bja; hkpcvideolandquery=DJ%u52B2%u7206%u821E%u66F2%uFF0C%u6124%u6012%u7684%u60C5%u4EBA%28DJ%u7248%29; Hm_lpvt_4aadd610dfd2f5972f1efee2653a2bc5=1675661411; ariaDefaultTheme=undefined; ab_sr=1.0.1_NWEyYWJjZTA4ZjMxN2I1NDJhZDk0ZGVjYmEwMTkyOGJjZGFiYzA5MDM3NDZhNDIxMzVjMmJkMTE0OTk0YjczZjA3NDUwODNiOTA4MWQ4YWYxNjBmZTdlYzc5MmVjM2FhYzRlNzYzYjkxY2NiMjI5NTQwOWNkNWU5MGVjMmQ4OGE0YzNjZDg3OTk2YjZjZjIwZDZiNzA5MWZlN2JkZGI3ZQ==; reptileData=%7B%22data%22%3A%22e8e9b6023742835f0e561432225f0e7a9a4914d19bf05d633fc2c9dbb0d2621c76ee8600fcf0b64559b108f8658605f4268b0093a8129458e37ff2bc85d28b7d70f4f49d66367684ead60f00ae201824e25c511a619895de14546b0522203ef3%22%2C%22key_id%22%3A%2230%22%2C%22sign%22%3A%22f719e71b%22%7D; RT="z=1&dm=baidu.com&si=57e4d3b7-0362-48c5-80ae-0c5b89bcbe22&ss=ldsc5hkc&sl=77&tt=1v38&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=1ebc8&ul=1ec6h"',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
    }
    response = requests.get(url=url, headers=headers)
    m = re.findall('playurl":"(.*?).mp4', response.text)[0]
    # The page JSON escapes "/" as "\/"; strip the backslashes to get a plain URL
    m = m.replace("\\", "")
    m = m + ".mp4"
    print(m)
    video = requests.get(url=m, headers=headers)
    date1 = video.content
    with open('video\\' + name + '.mp4', mode='wb') as f:
        f.write(date1)

6. Main program

import tkinter  # GUI module
import re  # regex module
from tkinter import *
import crawler_bilibili  # Bilibili crawler module
import crawler_hao  # Haokan video crawler module
import kuaishou
import pipixia
import tiktok
import threading
import time

'''
How it works:
1. Build a window with the tkinter module.
2. The submit button runs button(), which hands the inputs to the matching crawler module.
3. Radio buttons select the target site.
'''

win = tkinter.Tk()
win.title("爬虫")
win.geometry("540x360")
l1 = Label(win, text="请 输 入 网 址", font="宋体", height="2")
l1.grid(row=0, column=0, columnspan=100, padx=60)
text = StringVar()
text2 = StringVar()
text5 = StringVar()
text6 = StringVar()
e1 = Entry(win, width=40, textvariable=text, font="song -20", background="#E6E6E6")
e1.grid(row=1, column=0, columnspan=100, padx=60)
l2 = Label(win, text="请 输 入 文 件 名", font="宋体", height="2")
l2.grid(row=2, column=0, columnspan=100)
e2 = Entry(win, width=40, textvariable=text2, font="song -20", background="#E6E6E6")
e2.grid(row=3, column=0, columnspan=100)
l3 = Label(win, textvariable=text5, font='宋体')
l3.grid(row=7, column=0, columnspan=100)
e3 = Entry(win, width=40, textvariable=text6, state='disabled', font="song -20")
e3.grid(row=9, column=0, columnspan=100)
l4 = Label(win, text='联系我们', font='宋体')
l4.grid(row=10, column=0, columnspan=100)

chore = True

# Dispatch the URL and file name to the crawler module picked by the radio buttons
def crawler(text3, name, Num):
    Num2 = Num - 1
    list3 = [crawler_bilibili.UrlCrawler, crawler_hao.Crawler, pipixia.pipixia, tiktok.TikTok, kuaishou.kuaishou]
    response = list3[Num2](text3, name)
    if response == 1:
        text5.set('未知错误')
    elif response == 0:
        text5.set('下载完成')
        global chore
        chore = False
        text6.set('*' * 41)

# Simple textual progress bar, run in its own thread
def progress_bar():
    i = 1
    while chore:
        text6.set("*" * i)
        i += 1
        time.sleep(0.1)
        if i > 37:
            break

if __name__ == '__main__':
    def Del():
        text2.set('')
        text6.set('')
        text.set('')
        text5.set('')

    def button():
        if Num1 == 0:
            text5.set('请选择站点')
        else:
            t1 = threading.Thread(target=progress_bar, args=())
            try:
                text1 = text.get()
            except UnicodeDecodeError:
                text5.set('请删除链接中的特殊符号')
                return
            if len(text1) == 0:
                text5.set('网址为空')
                return
            list1 = text1.split(' ')
            j = 0
            for i in list1:
                src = list1[j]
                list2 = re.findall('https(.*)', src)
                if len(list2) == 0:
                    j += 1
                else:
                    text3 = 'https' + list2[0]
            name = text2.get()
            if len(name) == 0:
                text5.set('文件名为空')
                return
            t2 = threading.Thread(target=crawler, args=(text3, name, Num1))
            t1.start()
            t2.start()

    but1 = Button(win, text="提交", command=button, font="宋体")
    but2 = Button(win, text="清除", command=Del, font="宋体")
    but1.grid(row=6, column=0)
    but2.grid(row=6, column=6)
    Num1 = 0

    # Radio buttons selecting the target site
    def date():
        global Num1
        Num1 = v.get()

    v = IntVar()
    r1 = Radiobutton(win, text="B站 ", variable=v, value=1, command=date, font="song -20")
    r2 = Radiobutton(win, text="好看视频", variable=v, value=2, command=date, font="song -20")
    r3 = Radiobutton(win, text="皮皮虾", variable=v, value=3, command=date, font="song -20")
    r4 = Radiobutton(win, text="抖音", variable=v, value=4, command=date, font="song -20")
    r5 = Radiobutton(win, text="快手", variable=v, value=5, command=date, font="song -20")
    r1.grid(row=5, column=0)
    r2.grid(row=5, column=1)
    r3.grid(row=5, column=3)
    r4.grid(row=5, column=4)
    r5.grid(row=5, column=5)
    win.mainloop()

7. Packaging the .py Files into an .exe

1. Install the pyinstaller library:

pip install pyinstaller

2. Package (-F bundles everything into a single executable, -w hides the console window):

pyinstaller -F -w <filename>
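
For example, packaging the main GUI script (main.py is an assumed name; PyInstaller writes the result to the dist folder, e.g. dist\main.exe):

pyinstaller -F -w main.py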


Python: Scraping Task Listings from Major Outsourcing Sites


Preface

To sharpen my data-handling skills, I am starting a series of small Python web-crawler projects.

  • Driven by small projects and small requirements
  • Summarizing the common techniques:
    • Parsing data out of the page source (XPath, Bs4, PyQuery, regex)
    • Reading data returned by an API

I. Requirements

Scrape the posted task listings from several outsourcing sites.

II. Analysis

一品威客 (task.epwk.com)
1. Inspect the page source
2. Locate the data
3. Fetch the detail page (reward, task requirements, description, status)

软件项目交易网 (sxsoft.com)
1. Inspect the page source
2. Search the source globally for the data

YesPMP platform
1. Inspect the page source
2. Search the source globally for the data

码市 (Codemart)
1. Capture the API with F12 (DevTools)
2. Construct the request to fetch the data

III. Processing

一品威客 (task.epwk.com)
1. Tasks from the listing page
2. Detail pages (handle direct-hire tasks, which cannot be viewed)
3. Extract the reward, task requirements, and time window

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from bs4 import BeautifulSoup
import re


HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url):
    res = requests.request("GET", url=url, headers=HEADERS)
    res.encoding = "utf-8"
    return res.text

# Instantiate bs4
def method_bs4(html):
    page = BeautifulSoup(html, "html.parser")
    return page


# Detect direct-hire tasks via the page's meta description
def method_zz(code):
    deal = re.compile(r'<meta name="description" content="(?P<is_direct>.*?)" />', re.S)
    result = deal.finditer(code)
    for i in result:
        check = i.group("is_direct")
        if "直接雇佣任务" in check:
            return True


def get_task_url(html):
    page = method_bs4(html)
    # Grab each task div on the page by class
    div = page.select(".title.marginLeft")
    # Map task name -> detail-page URL
    url_list = {}
    for _div in div:
        # Extract the URL
        content_url = _div.find("a")["href"]
        content = _div.text
        task = content.split("【数据采集】")[1]
        url_list[task] = content_url
    return url_list


def get_task_content(url_dict):
    with open("一品威客任务.txt", mode="a+", encoding="utf-8") as f:
        for name, url in url_dict.items():
            # print(name, url)
            code_source = get_index_source(url)
            page = method_bs4(code_source)
            # Extract the reward
            money = page.select(".nummoney.f_l span")
            for _money in money:
                task_money = _money.text.strip("\n").strip(" ")
                print(task_money)
            # Direct-hire tasks have no viewable detail page, so handle them separately
            result = method_zz(code_source)
            if result:
                f.write(f"直接雇佣-{name}{task_money}\n")
            # Extract the start and end times
            time = page.select("#TimeCountdown")
            for _time in time:
                start_time = _time["starttime"]
                end_time = _time["endtime"]
                print(start_time, end_time)
            # Extract the task description
            content = page.select(".task-info-content p")
            for _content in content:
                content_data = _content.text
                print(content_data)
            f.write(f"{name}---{content_data},{task_money},{start_time},{end_time}\n")


if __name__ == '__main__':
    url = "https://task.epwk.com/sjcj/"
    html = get_index_source(url)
    url_dict = get_task_url(html)
    get_task_content(url_dict)

软件项目交易网 (sxsoft.com)
XPath alone is enough to pull out the data:

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from lxml import etree


HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url):
    res = requests.request("GET", url=url, headers=HEADERS)
    res.encoding = "utf-8"
    return res.text

# Instantiate etree
def method_xpath(html):
    parse = etree.HTML(html)
    return parse


def get_task_info(html):
    with open("软件交易网站需求.txt", mode="w", encoding="utf-8") as f:
        # Instantiate the XPath parser
        parse = method_xpath(html)
        # Locate each task <li> by XPath
        result = parse.xpath('//*[@id="projectLists"]/div/ul/li')
        for li in result:
            # Task status
            status = li.xpath('./div[@class="left_2"]/span/text()')[1]
            # Strip whitespace and stray symbols
            status = status.strip()
            # Task title
            task = li.xpath('./div[@class="left_8"]/h4/a/text()')
            task_content = task[-1].strip()
            # Budget
            bond = li.xpath('./div[@class="left_8"]/span[1]/em/text()')[0]
            # Popularity
            hot = li.xpath('./div[@class="left_8"]/span[2]/em/text()')[0]
            # Publish date
            start_time = li.xpath('./div[@class="left_8"]/span[3]/em/text()')[0]
            # Deadline
            end_time = li.xpath('./div[@class="left_8"]/span[4]/em/text()')[0]
            f.write(f"{status},{task_content},{bond},{hot},{start_time},{end_time}\n")


if __name__ == '__main__':
    url = "https://www.sxsoft.com/page/project"
    html = get_index_source(url)
    get_task_info(html)

YesPMP platform tasks
PyQuery alone is enough to pull out the data:

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from pyquery import PyQuery as pq


HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url):
    res = requests.request("GET", url=url, headers=HEADERS)
    res.encoding = "utf-8"
    return res.text

# Instantiate pq
def method_pq(html):
    parse = pq(html)
    return parse


def get_task_info(html):
    with open("yespmp网站需求.txt", mode="a", encoding="utf-8") as f:
        parse = method_pq(html)
        # Locate the task blocks by class
        result = parse.find(".promain")
        # print(result)
        for _ in result.items():
            # Task name
            task_name = _.find(".name").text()
            # Reward
            price = _.find(".price").text()
            # Project duration
            date = _.find(".date").text()
            # Number of bidders
            bid_num = _.find(".num").text()
            f.write(f"{task_name},{price},{date},{bid_num}\n")


if __name__ == '__main__':
    for i in range(2, 10):
        # Paginated listing URL; the page number is interpolated into the path
        url = f"https://www.yespmp.com/project/index_i{i}.html"
        html = get_index_source(url)
        get_task_info(html)

码市 (Codemart)
A basic requests call (request headers plus parameters):

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
import json

headers = {
    'cookie': 'mid=6c15e915-d258-41fc-93d9-939a767006da; JSESSIONID=1hfpjvpxsef73sbjoak5g5ehi; _gid=GA1.2.846977299.1668222244; _hjSessionUser_2257705=eyJpZCI6ImI3YzVkMTc5LWM3ZDktNTVmNS04NGZkLTY0YzUxNGY3Mzk5YyIsImNyZWF0ZWQiOjE2NjgyMjIyNDM0NzgsImV4aXN0aW5nIjp0cnVlfQ==; _ga_991F75Z0FG=GS1.1.1668245580.3.1.1668245580.0.0.0; _ga=GA1.2.157466615.1668222243; _gat=1',
    'referer': 'https://codemart.com/projects?labelId=&page=1',
    'accept': 'application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}


def get_data():
    url = "https://codemart.com/api/project?labelId=&page=1"
    payload = {}
    response = requests.request("GET", url, headers=headers, data=payload)
    print(json.loads(response.text))


if __name__ == '__main__':
    get_data()
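
Since the endpoint returns JSON, the raw printout can be replaced with structured access. The field names below ("rows", "name", "price") are assumptions for illustration; check the actual payload in DevTools before relying on them:

def print_projects():
    # Hypothetical field names; verify them against the real response
    url = "https://codemart.com/api/project?labelId=&page=1"
    response = requests.request("GET", url, headers=headers)
    for project in response.json().get("rows", []):
        print(project.get("name"), project.get("price"))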

IV. Summary

  • XPath
    • Best when the target data sits under a clear tag hierarchy: follow the path to the node, then loop over the matches
  • Bs4
    • Best when the data is scattered but reachable through selectors (a unique class or id)
  • PyQuery
    • Best when the data is scattered but reachable through selectors (a unique class or id)
  • Regex
    • A (.*?) group copes with stale element paths or picks out small amounts of hard-to-locate data
    • Unsuitable when the page source is cluttered with other symbols and the anchors stop matching
  • API data
    • If the API is not encrypted, a requests call built from the captured request is enough
    • The key things to watch are the request-header parameters

A side-by-side sketch of the four page-source techniques follows.
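
To make the comparison concrete, here is a minimal sketch extracting the same task titles from one HTML snippet with all four techniques (the snippet and class names are invented for illustration):

import re
from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery as pq

html = '<ul><li class="task"><a href="/t/1">Task A</a></li><li class="task"><a href="/t/2">Task B</a></li></ul>'

# XPath: walk the tag hierarchy
print(etree.HTML(html).xpath('//li[@class="task"]/a/text()'))

# Bs4: CSS selector on the unique class
print([a.text for a in BeautifulSoup(html, "html.parser").select(".task a")])

# PyQuery: jQuery-style selector
print([a.text() for a in pq(html)(".task a").items()])

# Regex: lazy group between known anchors
print(re.findall(r'<a href=".*?">(.*?)</a>', html))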

