怎么用Python爬取抖音小视频? 资深程序员都这样爬取的(附源码)

Posted 2020-11-09 娇兮心有之

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了怎么用Python爬取抖音小视频? 资深程序员都这样爬取的(附源码)相关的知识，希望对你有一定的参考价值。

简介

抖音是一款可以拍摄短视频的音乐创意短视频社交软件，于2016年9月上线，是一个专注年轻人的15秒音乐短视频社区。用户可以通过这款软件选择歌曲，拍摄15秒的音乐短视频，形成自己的作品。此APP已在Android各大应用商店和App Store上线。

今天咱们就用Python爬取抖音视频

准备：

环境：Python3.6+Windows

IDE：你开心就好，喜欢用哪个就用哪个

模块：

1 from splinter.driver.webdriver.chrome import Options, Chrome
2 from splinter.browser import Browser
3 from contextlib import closing
4 import requests, json, time, re, os, sys, time
5 from bs4 import BeautifulSoup

获得视频播放地址

user_id：查询的用户ID
video_names：视频名字列表
video_urls：视频链接列表
nickname：用户昵称

 1     def get_video_urls(self, user_id):
 2 
 3 +        video_names = []
 4 +        video_urls = []
 5 +        unique_id = \'\'
 6 +        while unique_id != user_id:
 7 +            search_url = \'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622\' % user_id
 8 +            req = requests.get(url = search_url, verify = False)
 9 +            html = json.loads(req.text)
10 +            aweme_count = html[\'user_list\'][0][\'user_info\'][\'aweme_count\']
11 +            uid = html[\'user_list\'][0][\'user_info\'][\'uid\']
12 +            nickname = html[\'user_list\'][0][\'user_info\'][\'nickname\']
13 +            unique_id = html[\'user_list\'][0][\'user_info\'][\'unique_id\']
14 +        user_url = \'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s\' % (uid, aweme_count)
15 +        req = requests.get(url = user_url, verify = False)
16 +        html = json.loads(req.text)
17 +        i = 1
18 +        for each in html[\'aweme_list\']:
19 +            share_desc = each[\'share_info\'][\'share_desc\']
20 +            if \'抖音-原创音乐短视频社区\' == share_desc:
21 +                video_names.append(str(i) + \'.mp4\')
22 +                i += 1
23 +            else:
24 +                video_names.append(share_desc + \'.mp4\')
25 +            video_urls.append(each[\'share_info\'][\'share_url\'])
26 +
27 +        return video_names, video_urls, nickname

获得带水印的视频播放地址

video_url：带水印的视频播放地址
download_url: 带水印的视频下载地址

1     def get_download_url(self, video_url):
2 
3 +        req = requests.get(url = video_url, verify = False)
4 +        bf = BeautifulSoup(req.text, \'lxml\')
5 +        script = bf.find_all(\'script\')[-1]
6 +        video_url_js = re.findall(\'var data = \\[(.+)\\];\', str(script))[0]
7 +        video_html = json.loads(video_url_js)
8 +        download_url = video_html[\'video\'][\'play_addr\'][\'url_list\'][0]
9 +        return download_url

视频下载

video_url: 带水印的视频地址
video_name: 视频名
watermark_flag: 是否下载不带水印的视频

 1     def video_downloader(self, video_url, video_name, watermark_flag=True):
 2 +        """
 3 +        视频下载
 4 +        Parameters:
 5 +            video_url: 带水印的视频地址
 6 +            video_name: 视频名
 7 +            watermark_flag: 是否下载不带水印的视频
 8 +        Returns:
 9 +            无
10 +        """
11 +        size = 0
12 +        if watermark_flag == True:
13 +            video_url = self.remove_watermark(video_url)
14 +        else:
15 +            video_url = self.get_download_url(video_url)
16 +        with closing(requests.get(video_url, stream=True, verify = False)) as response:
17 +            chunk_size = 1024
18 +            content_size = int(response.headers[\'content-length\']) 
19 +            if response.status_code == 200:
20 +                sys.stdout.write(\'  [文件大小]:%0.2f MB\\n\' % (content_size / chunk_size / 1024))
21 +
22 +                with open(video_name, "wb") as file:  
23 +                    for data in response.iter_content(chunk_size = chunk_size):
24 +                        file.write(data)
25 +                        size += len(data)
26 +                        file.flush()
27 +
28 +                        sys.stdout.write(\'  [下载进度]:%.2f%%\' % float(size / content_size * 100) + \'\\r\')
29 +                        sys.stdout.flush()

获得无水印的视频播放地址

 1     def remove_watermark(self, video_url):
 2 +        """
 3 +        获得无水印的视频播放地址
 4 +        Parameters:
 5 +            video_url: 带水印的视频地址
 6 +        Returns:
 7 +            无水印的视频下载地址
 8 +        """
 9 +        self.driver.visit(\'http://douyin.iiilab.com/\')
10 +        self.driver.find_by_tag(\'input\').fill(video_url)
11 +        self.driver.find_by_xpath(\'//button[@class="btn btn-default"]\').click()
12 +        html = self.driver.find_by_xpath(\'//div[@class="thumbnail"]/div/p\')[0].html
13 +        bf = BeautifulSoup(html, \'lxml\')
14 +        return bf.find(\'a\').get(\'href\')

下载视频

 1     def run(self):
 2 +        """
 3 +        运行函数
 4 +        Parameters:
 5 +            None
 6 +        Returns:
 7 +            None
 8 +        """
 9 +        self.hello()
10 +        user_id = input(\'请输入ID(例如40103580):\')
11 +        video_names, video_urls, nickname = self.get_video_urls(user_id)
12 +        if nickname not in os.listdir():
13 +            os.mkdir(nickname)
14 +        print(\'视频下载中:共有%d个作品!\\n\' % len(video_urls))
15 +        for num in range(len(video_urls)):
16 +            print(\'  解析第%d个视频链接 [%s] 中，请稍后!\\n\' % (num+1, video_urls[num]))
17 +            if \'\\\\\' in video_names[num]:
18 +                video_name = video_names[num].replace(\'\\\\\', \'\')
19 +            elif \'/\' in video_names[num]:
20 +                video_name = video_names[num].replace(\'/\', \'\')
21 +            else:
22 +                video_name = video_names[num]
23 +            self.video_downloader(video_urls[num], os.path.join(nickname, video_name))
24 +            print(\'\\n\')
25 +
26 +        print(\'下载完成!\')

全部代码

  1 +# -*- coding:utf-8 -*-
  2 
  3 +# Python学习交流群：125240963
  4 +# Python学习交流群：125240963
  5 +# Python学习交流群：125240963
  6 
  7 +from splinter.driver.webdriver.chrome import Options, Chrome
  8 +from splinter.browser import Browser
  9 +from contextlib import closing
 10 +import requests, json, time, re, os, sys, time
 11 +from bs4 import BeautifulSoup
 12 +
 13  class DouYin(object):
 14     def __init__(self, width = 500, height = 300):
 15 +        """
 16 +        抖音App视频下载
 17 +        """
 18 +        # 无头浏览器
 19 +        chrome_options = Options()
 20 +        chrome_options.add_argument(\'user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"\')
 21 +        self.driver = Browser(driver_name=\'chrome\', executable_path=\'D:/chromedriver\', options=chrome_options, headless=True)
 22 +
 23     def get_video_urls(self, user_id):
 24 +        """
 25 +        获得视频播放地址
 26 +        Parameters:
 27 +            user_id：查询的用户ID
 28 +        Returns:
 29 +            video_names: 视频名字列表
 30 +            video_urls: 视频链接列表
 31 +            nickname: 用户昵称
 32 +        """
 33 +        video_names = []
 34 +        video_urls = []
 35 +        unique_id = \'\'
 36 +        while unique_id != user_id:
 37 +            search_url = \'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622\' % user_id
 38 +            req = requests.get(url = search_url, verify = False)
 39 +            html = json.loads(req.text)
 40 +            aweme_count = html[\'user_list\'][0][\'user_info\'][\'aweme_count\']
 41 +            uid = html[\'user_list\'][0][\'user_info\'][\'uid\']
 42 +            nickname = html[\'user_list\'][0][\'user_info\'][\'nickname\']
 43 +            unique_id = html[\'user_list\'][0][\'user_info\'][\'unique_id\']
 44 +        user_url = \'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s\' % (uid, aweme_count)
 45 +        req = requests.get(url = user_url, verify = False)
 46 +        html = json.loads(req.text)
 47 +        i = 1
 48 +        for each in html[\'aweme_list\']:
 49 +            share_desc = each[\'share_info\'][\'share_desc\']
 50 +            if \'抖音-原创音乐短视频社区\' == share_desc:
 51 +                video_names.append(str(i) + \'.mp4\')
 52 +                i += 1
 53 +            else:
 54 +                video_names.append(share_desc + \'.mp4\')
 55 +            video_urls.append(each[\'share_info\'][\'share_url\'])
 56 +
 57 +        return video_names, video_urls, nickname
 58 +
 59     def get_download_url(self, video_url):
 60 +        """
 61 +        获得带水印的视频播放地址
 62 +        Parameters:
 63 +            video_url：带水印的视频播放地址
 64 +        Returns:
 65 +            download_url: 带水印的视频下载地址
 66 +        """
 67 +        req = requests.get(url = video_url, verify = False)
 68 +        bf = BeautifulSoup(req.text, \'lxml\')
 69 +        script = bf.find_all(\'script\')[-1]
 70 +        video_url_js = re.findall(\'var data = \\[(.+)\\];\', str(script))[0]
 71 +        video_html = json.loads(video_url_js)
 72 +        download_url = video_html[\'video\'][\'play_addr\'][\'url_list\'][0]
 73 +        return download_url
 74 +
 75     def video_downloader(self, video_url, video_name, watermark_flag=True):
 76 +        """
 77 +        视频下载
 78 +        Parameters:
 79 +            video_url: 带水印的视频地址
 80 +            video_name: 视频名
 81 +            watermark_flag: 是否下载不带水印的视频
 82 +        Returns:
 83 +            无
 84 +        """
 85 +        size = 0
 86 +        if watermark_flag == True:
 87 +            video_url = self.remove_watermark(video_url)
 88 +        else:
 89 +            video_url = self.get_download_url(video_url)
 90 +        with closing(requests.get(video_url, stream=True, verify = False)) as response:
 91 +            chunk_size = 1024
 92 +            content_size = int(response.headers[\'content-length\']) 
 93 +            if response.status_code == 200:
 94 +                sys.stdout.write(\'  [文件大小]:%0.2f MB\\n\' % (content_size / chunk_size / 1024))
 95 +
 96 +                with open(video_name, "wb") as file:  
 97 +                    for data in response.iter_content(chunk_size = chunk_size):
 98 +                        file.write(data)
 99 +                        size += len(data)
100 +                        file.flush()
101 +
102 +                        sys.stdout.write(\'  [下载进度]:%.2f%%\' % float(size / content_size * 100) + \'\\r\')
103 +                        sys.stdout.flush()
104 +
105 +
106     def remove_watermark(self, video_url):
107 +        """
108 +        获得无水印的视频播放地址
109 +        Parameters:
110 +            video_url: 带水印的视频地址
111 +        Returns:
112 +            无水印的视频下载地址
113 +        """
114 +        self.driver.visit(\'http://douyin.iiilab.com/\')
115 +        self.driver.find_by_tag(\'input\').fill(video_url)
116 +        self.driver.find_by_xpath(\'//button[@class="btn btn-default"]\').click()
117 +        html = self.driver.find_by_xpath(\'//div[@class="thumbnail"]/div/p\')[0].html
118 +        bf = BeautifulSoup(html, \'lxml\')
119 +        return bf.find(\'a\').get(\'href\')
120 +
121     def run(self):
122 +        """
123 +        运行函数
124 +        Parameters:
125 +            None
126 +        Returns:
127 +            None
128 +        """
129 +        self.hello()
130 +        user_id = input(\'请输入ID(例如40103580):\')
131 +        video_names, video_urls, nickname = self.get_video_urls(user_id)
132 +        if nickname not in os.listdir():
133 +            os.mkdir(nickname)
134 +        print(\'视频下载中:共有%d个作品!\\n\' % len(video_urls))
135 +        for num in range(len(video_urls)):
136 +            print(\'  解析第%d个视频链接 [%s] 中，请稍后!\\n\' % (num+1, video_urls[num]))
137 +            if \'\\\\\' in video_names[num]:
138 +                video_name = video_names[num].replace(\'\\\\\', \'\')
139 +            elif \'/\' in video_names[num]:
140 +                video_name = video_names[num].replace(\'/\', \'\')
141 +            else:
142 +                video_name = video_names[num]
143 +            self.video_downloader(video_urls[num], os.path.join(nickname, video_name))
144 +            print(\'\\n\')
145 +
146 +        print(\'下载完成!\')
147 +
148     def hello(self):
149 +        """
150 +        打印欢迎界面
151 +        Parameters:
152 +            None
153 +        Returns:
154 +            None
155 +        """
156 +        print(\'*\' * 100)
157 +        print(\'\\t\\t\\t\\t抖音App视频下载小助手\')
158 +        print(\'\\t\\t作者:Python学习交流群：125240963\')
159 +        print(\'*\' * 100)
160 +
161 +
if __name__ == '__main__':
    douyin = DouYin()
    try:
        douyin.run()
    finally:
        # Always shut down the headless Chrome started in DouYin.__init__;
        # otherwise the browser process outlives the script.
        douyin.driver.quit()

以上是关于怎么用Python爬取抖音小视频? 资深程序员都这样爬取的(附源码)的主要内容，如果未能解决你的问题，请参考以下文章

还在考虑去哪找小视频？Python爬虫带你爬取数百万部国产小视频！

抖音爬虫从0到1-第三弹：爬取抖音用户详细数据