Python爬虫学习教程,批量爬取下载抖音视频
Posted xiaoyiq
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫学习教程,批量爬取下载抖音视频相关的知识,希望对你有一定的参考价值。
这篇文章主要为大家详细介绍了如何使用 Python 批量爬取并下载抖音视频,具有一定的参考价值,感兴趣的小伙伴们可以参考一下。
项目源码展示:
# -*- coding:utf-8 -*-
"""Batch-download Douyin videos for a user, identified by numeric UID.

The script reads the user's share page, signs every API request with a
Node.js helper script, collects the play URLs of the user's favourite or
posted videos and streams each video to disk.

Original author's note: questions can be asked in the Python study QQ
group 934109170, which also shares tutorials, development tools and
e-books.
"""
from contextlib import closing
from ipaddress import ip_address
from subprocess import PIPE, Popen
import json
import os
import random
import re
import sys
# ``import urllib`` alone does not guarantee that the ``request`` submodule
# is loaded; import it explicitly because ``urlretrieve`` is used below.
import urllib.request

import requests


class DouYin(object):
    """Interactive downloader for Douyin videos of a single user."""

    # Node.js script that computes the ``_signature`` request parameter.
    SIGN_SCRIPT_URL = ('https://raw.githubusercontent.com/Jack-Cherish/'
                       'python-spider/master/douyin/fuck-byted-acrawler.js')
    SIGN_SCRIPT_NAME = 'fuck-byted-acrawler.js'
    # Template of the (watermark-free) play URL; ``%s`` is the video URI.
    PLAY_URL = ('https://aweme.snssdk.com/aweme/v1/play/?video_id=%s&line=0'
                '&ratio=720p&media_type=4&vr_type=0&test_cdn=None'
                '&improve_bitrate=0')
    # Characters that are not allowed in Windows file names.
    WINDOWS_FORBIDDEN_CHARS = r'\/:*?"<>|'
    # Upper bound for re-sending a request that does not answer HTTP 200.
    MAX_HTTP_RETRIES = 20

    _DYTK_RE = re.compile(r"dytk\s*:\s*'(.+)'")
    _NICKNAME_RE = re.compile(r'<p class="nickname">(.+?)<\/p>')

    def __init__(self, width=500, height=300):
        """Build the request headers, including a random spoofed client IP.

        Parameters:
            width: unused; kept for backward compatibility.
            height: unused; kept for backward compatibility.
        """
        # 0.0.0.0 counts as private, so the loop body runs at least once and
        # repeats until a non-private address has been drawn.
        rip = ip_address('0.0.0.0')
        while rip.is_private:
            rip = ip_address(
                '.'.join(str(random.randint(0, 255)) for _ in range(4)))
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Linux; U; android 5.1.1; zh-cn; MI 4S Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.1.3',
            'X-Real-IP': str(rip),
            'X-Forwarded-For': str(rip),
        }

    @staticmethod
    def _extract(pattern, text, what):
        """Return group 1 of ``pattern`` in ``text`` or raise ValueError."""
        match = pattern.search(text)
        if match is None:
            raise ValueError('could not find %s in the share page' % what)
        return match.group(1)

    @staticmethod
    def _strip_separators(name):
        """Remove both kinds of path separator from a file or folder name."""
        return name.replace('\\', '').replace('/', '')

    @classmethod
    def _sanitize_for_windows(cls, text, strip_dots=False):
        """Drop characters Windows forbids in file names from ``text``.

        With ``strip_dots`` set, leading/trailing backslashes and dots are
        removed as well (used for the nickname, which becomes a folder name).
        """
        for char in cls.WINDOWS_FORBIDDEN_CHARS:
            text = text.replace(char, '').strip()
            if strip_dots:
                text = text.strip('\\.')
        return text

    def _get_ok(self, url):
        """GET ``url`` until it answers HTTP 200, at most MAX_HTTP_RETRIES times.

        Raises:
            RuntimeError: if no attempt returned status 200.
        """
        for _ in range(self.MAX_HTTP_RETRIES):
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
        raise RuntimeError('no HTTP 200 from %s after %d attempts'
                           % (url, self.MAX_HTTP_RETRIES))

    def _sign(self, user_id):
        """Run the Node.js helper and return the signature it prints."""
        process = Popen(['node', self.SIGN_SCRIPT_NAME, str(user_id)],
                        stdout=PIPE, stderr=PIPE)
        return process.communicate()[0].decode().strip('\r\n')

    def _fetch_page(self, url_prefix, user_id, max_cursor, dytk):
        """Request one freshly signed page of the video list, parsed from JSON."""
        user_url = url_prefix + '/?user_id=%s&max_cursor=%s&count=21&aid=1128&_signature=%s&dytk=%s' % (
            user_id, max_cursor, self._sign(user_id), dytk)
        return json.loads(self._get_ok(user_url).text)

    def get_video_urls(self, user_id, type_flag='f'):
        """Collect the file names and URLs of a user's videos.

        Parameters:
            user_id: UID of the user to query.
            type_flag: 'f' for the user's favourites, anything else for the
                videos the user posted.
        Returns:
            video_names: list of target file names.
            video_urls: list of play URLs.
            share_urls: list of share-page URLs.
            nickname: the user's nickname.
        Raises:
            ValueError: if dytk or the nickname is missing from the share page.
            RuntimeError: if a request never answers HTTP 200.
        """
        video_names = []
        video_urls = []
        share_urls = []
        max_cursor = 0
        has_more = 1
        share_user_url = 'https://www.douyin.com/share/user/%s' % user_id
        share_user = self._get_ok(share_user_url)
        dytk = self._extract(self._DYTK_RE, share_user.text, 'dytk')
        nickname = self._extract(self._NICKNAME_RE, share_user.text, 'nickname')
        urllib.request.urlretrieve(self.SIGN_SCRIPT_URL, self.SIGN_SCRIPT_NAME)
        try:
            Popen(['node', '-v'], stdout=PIPE, stderr=PIPE).communicate()
        except (OSError, IOError):
            print('请先安装 node.js: https://nodejs.org/')
            sys.exit()
        user_url_prefix = ('https://www.douyin.com/aweme/v1/aweme/favorite'
                           if type_flag == 'f'
                           else 'https://www.douyin.com/aweme/v1/aweme/post')
        print('解析视频链接中')
        while has_more != 0:
            html = self._fetch_page(user_url_prefix, user_id, max_cursor, dytk)
            # An empty list is retried with a fresh signature.  No exception
            # is swallowed here, so Ctrl+C really stops the program as the
            # message promises.
            retries = 0
            while not html.get('aweme_list'):
                retries += 1
                sys.stdout.write('已重新链接' + str(retries) + '次 (若超过100次,请ctrl+c强制停止再重来)' + '\r')
                sys.stdout.flush()
                html = self._fetch_page(user_url_prefix, user_id, max_cursor, dytk)
            for each in html['aweme_list']:
                try:
                    uri = each['video']['play_addr']['uri']
                except (KeyError, TypeError):
                    # Entry without a playable video: skip it.
                    continue
                video_url = self.PLAY_URL % uri
                share_desc = each['share_info']['share_desc']
                if os.name == 'nt':
                    share_desc = self._sanitize_for_windows(share_desc)
                share_id = each['aweme_id']
                if share_desc in ['抖音-原创音乐短视频社区', 'TikTok', '']:
                    video_names.append('%s.mp4' % share_id)
                else:
                    video_names.append('%s-%s.mp4' % (share_id, share_desc))
                share_urls.append(each['share_info']['share_url'])
                video_urls.append(video_url)
            max_cursor = html['max_cursor']
            has_more = html['has_more']
        if os.name == 'nt':
            # Sanitised once here instead of once per video.
            nickname = self._sanitize_for_windows(nickname, strip_dots=True)
        return video_names, video_urls, share_urls, nickname

    def get_download_url(self, video_url, watermark_flag):
        """Return the download URL for the requested watermark variant.

        Parameters:
            video_url: play URL of the video (with or without watermark).
            watermark_flag: truthy for the watermarked ('/playwm/') URL,
                falsy for the watermark-free ('/play/') URL.
        Returns:
            download_url: URL of the requested variant.
        """
        if watermark_flag:
            return video_url.replace('/play/', '/playwm/')
        return video_url.replace('/playwm/', '/play/')

    def video_downloader(self, video_url, video_name, watermark_flag=False):
        """Stream one video to disk while printing the progress.

        Parameters:
            video_url: play URL of the video.
            video_name: path of the file to write.
            watermark_flag: whether to download the watermarked variant.
        Returns:
            None
        """
        video_url = self.get_download_url(video_url, watermark_flag=watermark_flag)
        chunk_size = 1024
        with closing(requests.get(video_url, headers=self.headers, stream=True)) as response:
            if response.status_code != 200:
                sys.stdout.write(' [下载失败]:HTTP %d\n' % response.status_code)
                return
            # The header may be absent (e.g. chunked transfer); progress is
            # then skipped instead of failing or dividing by zero.
            content_size = int(response.headers.get('content-length', 0))
            if content_size:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))
            # Download into a temporary file so that an interrupted transfer
            # is never mistaken for a finished video on the next run.
            part_path = video_name + '.part'
            size = 0
            with open(part_path, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    if content_size:
                        sys.stdout.write(' [下载进度]:%.2f%%' % (size / content_size * 100) + '\r')
                        sys.stdout.flush()
            os.replace(part_path, video_name)

    def run(self):
        """Ask for the download options and download every video found.

        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        print('搜索api需要登录,暂时使用UID下载\n分享用户页面,用浏览器打开短链接,原始链接中/share/user/后的数字即是UID')
        user_id = input('请输入ID (例如95006183):') or '95006183'
        answer = input('是否下载带水印的视频 (0-否(默认), 1-是):')
        try:
            watermark_flag = bool(int(answer)) if answer != '' else False
        except ValueError:
            # Anything that is not a number falls back to the default.
            watermark_flag = False
        type_flag = input('f-收藏的(默认), p-上传的:') or 'f'
        save_dir = input('保存路径 (例如"E:/Download/", 默认"./Download/"):') or "./Download/"
        video_names, video_urls, share_urls, nickname = self.get_video_urls(user_id, type_flag)
        # The nickname comes from the remote page: keep it from escaping
        # save_dir or producing an empty folder name.
        nickname = self._strip_separators(nickname)
        if nickname in ('', '.', '..'):
            nickname = str(user_id)
        nickname_dir = os.path.join(save_dir, nickname)
        target_dir = os.path.join(nickname_dir, 'favorite') if type_flag == 'f' else nickname_dir
        os.makedirs(target_dir, exist_ok=True)
        print('视频下载中:共有%d个作品!\n' % len(video_urls))
        entries = zip(video_names, video_urls, share_urls)
        for num, (name, url, share_url) in enumerate(entries, 1):
            print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num, share_url))
            video_path = os.path.join(target_dir, self._strip_separators(name))
            if os.path.isfile(video_path):
                print('视频已存在')
            else:
                self.video_downloader(url, video_path, watermark_flag)
            print('\n')
        print('下载完成!')

    def hello(self):
        """Print the welcome banner.

        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('\t\t作者:Jack Cui、steven7851')
        print('*' * 100)


if __name__ == '__main__':
    douyin = DouYin()
    douyin.run()
运行结果:
爬取结果截图
以上是关于Python爬虫学习教程,批量爬取下载抖音视频的主要内容,如果未能解决你的问题,请参考以下文章
python爬虫抖某音爬取视频 Airtest+fiddler