Python crawler practice 18: an approach to scraping video, part 2
Posted by 编程路上
To scrape a video, the plan is:
- find the m3u8 (by whatever means necessary)
- download the ts segments listed in the m3u8
- merge the ts files into a single mp4 by any means, not just programmatically (a sample m3u8 is sketched right after this list)
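For orientation, here is a minimal made-up example of what the two layers of an m3u8 playlist look like; every path and file name in it is hypothetical. The first layer (the master playlist) points at a per-resolution playlist, and the second layer (the media playlist) lists the actual ts segments. Lines starting with # are HLS tags, which is why the parsing code later in this article skips them:

First layer (master playlist):
#EXTM3U
#EXT-X-STREAM-INF:BANDWIDTH=2000000,RESOLUTION=1280x720
/20210801/abcd/1280x720/index.m3u8

Second layer (media playlist):
#EXTM3U
#EXT-X-TARGETDURATION:10
#EXTINF:10.000,
hls/segment0000.ts
#EXTINF:10.000,
hls/segment0001.ts
#EXT-X-ENDLIST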
Open a video page and look at the page source together with the F12 network panel.
You can grab a link there, but it turns out not to be the real m3u8.
The real m3u8 is inside the response of that link; from it we can assemble the real m3u8 download address.
Let's do this part in code first:
import re
import requests
from fake_useragent import UserAgent

url = 'http://48ys.top/vodplay/cW7JJJJN-1-1.html'
headers = {'User-Agent': str(UserAgent().random)}
resp = requests.get(url, headers=headers)
res = re.compile(r'"link_pre":"","url":"(?P<url>.*?)",', re.S)  # re.S lets . match newlines too
m3u8_url = res.search(resp.text).group('url')
m3u8_url = m3u8_url.replace('\\', '')  # strip the escaped slashes (http:\/\/ -> http://)
truth_left = m3u8_url.split('/2')[0]   # keep the scheme+host part, before the date path
resp1 = requests.get(m3u8_url, headers=headers)
truth_url = truth_left + resp1.text.split('1280x720\n')[1]  # path of the 720p playlist
print(truth_url)
The output is the real m3u8 address.
Now use that address to download the m3u8 file:
resp2 = requests.get(truth_url.replace('\n', ''), headers=headers)  # strip the trailing \n
with open('monv2.m3u8', mode="wb") as f:
    f.write(resp2.content)
# no explicit f.close() needed: the with block closes the file
Take a look at the real video addresses inside the downloaded file: the segment lines are relative paths.
So we need to process them a bit to turn them into real video download addresses.
# 3.2 download the second-layer m3u8 file
with open("monv2.m3u8", mode="r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("#"):
            continue
        else:
            line = line.strip()  # strip whitespace/newlines, e.g. hls/index.m3u8
            # assemble the second-layer download path
            second_m3u8_url = truth_left + line
            print(second_m3u8_url)
print('ok')
So the video addresses are all obtained, but downloading them one by one is far too slow, so we download them concurrently with coroutines. The complete code is as follows:
import requests
import re
import asyncio
import aiohttp
import aiofiles
import os
from fake_useragent import UserAgent


def main():
    print('ok')


def m3u8_allurl(name, truth_left):
    # 3.2 build the segment URLs from the second-layer m3u8 file
    urls = list()
    with open(name, mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            else:
                line = line.strip()  # strip whitespace/newlines, e.g. hls/index.m3u8
                # assemble the second-layer download path
                second_m3u8_url = truth_left + line
                urls.append(second_m3u8_url)
    return urls


def m3u8_download(name, truth_url):
    headers = {'User-Agent': str(UserAgent().random)}
    resp2 = requests.get(truth_url.replace('\n', ''), headers=headers)  # strip the trailing \n
    with open(name, mode="wb") as f:
        f.write(resp2.content)
    print(name, ' download!!!')


def m3u8_get(url):
    headers = {'User-Agent': str(UserAgent().random)}
    resp = requests.get(url, headers=headers)
    res = re.compile(r'"link_pre":"","url":"(?P<url>.*?)",', re.S)  # re.S lets . match newlines too
    m3u8_url = res.search(resp.text).group('url')
    m3u8_url = m3u8_url.replace('\\', '')  # strip the escaped slashes
    truth_left = m3u8_url.split('/2')[0]
    resp1 = requests.get(m3u8_url, headers=headers)
    truth_url = truth_left + resp1.text.split('1280x720\n')[1]
    return truth_left, truth_url.strip()


async def download_ts(url, name, session):
    async with session.get(url) as resp:
        async with aiofiles.open(name, mode="wb") as f:
            await f.write(await resp.content.read())  # write the downloaded bytes to the file
    print(f"{name} downloaded")


async def aio_download(allurls, name):
    tasks = []
    n = 1
    async with aiohttp.ClientSession() as session:  # prepare the session up front
        for i_url in allurls:
            task = asyncio.create_task(download_ts(i_url, name + f'_{n}.ts', session))  # create a task
            n += 1
            tasks.append(task)
        await asyncio.wait(tasks)  # wait for all tasks to finish


if __name__ == '__main__':
    url = 'http://48ys.top/vodplay/cW7JJJJN-1-1.html'
    truth_left, m3u8 = m3u8_get(url)
    print(m3u8)
    m3u8_download('monv2.m3u8', m3u8)
    allurls = m3u8_allurl('monv2.m3u8', truth_left)
    print(allurls)
    os.makedirs('monv2', exist_ok=True)  # the output folder must exist before writing ts files
    asyncio.run(aio_download(allurls, 'monv2/monv2'))
    print(len(allurls))
    main()
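If the server throttles or drops connections when several hundred segments are requested at once, the concurrency can be capped with a semaphore. This is a minimal sketch under that assumption, not part of the original script; the limit of 16 is an arbitrary choice:

import asyncio
import aiohttp
import aiofiles

async def aio_download_limited(allurls, name, limit=16):
    sem = asyncio.Semaphore(limit)  # at most `limit` downloads in flight
    async with aiohttp.ClientSession() as session:
        async def one(url, fname):
            async with sem:  # wait for a free slot before downloading
                async with session.get(url) as resp:
                    async with aiofiles.open(fname, mode="wb") as f:
                        await f.write(await resp.content.read())
        tasks = [asyncio.create_task(one(u, name + f'_{i}.ts'))
                 for i, u in enumerate(allurls, start=1)]
        await asyncio.gather(*tasks)  # unlike wait(), gather propagates exceptions

Swapping this in for aio_download changes nothing else in the script.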
For merging into an mp4 I recommend using a dedicated tool; doing it in code is a bit slow. Also, my .py file has to run inside the folder that contains the ts files for the merge to work; passing folder/filename does not work, and I am not sure why. Another problem: merging all the ts files in one go fails silently (no error, but no video is produced), so I merge every 100 ts files into one big ts, then merge those big ts files into a single mp4.
import os

lst = []
# put all the segment file names into a list
with open("monv2.m3u8", mode="r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("#"):
            continue
        line = line.strip()
        lname = line.split('/')[-1]
        lst.append(lname)

# group the ts file names, 100 per sublist
k = [[] for i in range(0, 100)]
num = -1
for i in range(0, len(lst)):
    if i % 100 == 0:
        num += 1
    k[num].append(lst[i])

# first merge each group of 100 ts files into one big ts
ts = []
for i in range(0, num + 1):
    s = "+".join(k[i])
    print(s)
    os.system(f"copy /b {s} hei{i}.ts")  # Windows binary concatenation
    ts.append(f'hei{i}.ts')
    print(f'{i} is ok')
print(ts)

# merge the big ts files into one mp4
s = "+".join(ts)
os.system(f"copy /b {s} he.mp4")
print('he.mp4 is ok')
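As an alternative to copy /b, ffmpeg can do the whole merge in one pass and remuxes into a proper mp4 container. This is a hedged sketch, not the author's method; it assumes ffmpeg is installed and on PATH, and reuses the lst list built above:

import subprocess

# write the segment list in the format expected by ffmpeg's concat demuxer
with open("filelist.txt", mode="w", encoding="utf-8") as f:
    for lname in lst:  # lst from the merging script above
        f.write(f"file '{lname}'\n")

# -f concat reads the list file; -c copy remuxes without re-encoding;
# -safe 0 permits file names ffmpeg would otherwise reject
subprocess.run(["ffmpeg", "-f", "concat", "-safe", "0",
                "-i", "filelist.txt", "-c", "copy", "he.mp4"], check=True)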
Here is the successfully merged video:
Python crawler in practice: scraping all movies from Tencent Video
Preface
Author: Python新手学习之家
A crawler, written in Python, that scrapes all movies from Tencent Video.
# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import string, time
import pymongo

NUM = 0  # global: number of movies found
m_type = u''  # global: movie genre
m_site = u'qq'  # global: movie site


# fetch the page content for a given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html
#从电影分类列表页面获取电影分类
-
def gettags(html):
-
global m_type
-
soup = BeautifulSoup(html) #过滤出分类内容
-
#print soup
-
#<ul class="clearfix _group" gname="mi_type" gtype="1">
-
tags_all = soup.find_all(‘ul‘, {‘class‘ : ‘clearfix _group‘ , ‘gname‘ : ‘mi_type‘})
-
#print len(tags_all), tags_all
-
#print str(tags_all[1]).replace(‘ ‘, ‘‘)
-
-
#<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html"title="动作" tvalue="0">动作</a>
-
re_tags = r‘<a _hot="tag.sub" class="_gtag _hotkey" href="(.+?)" title="(.+?)" tvalue="(.+?)">.+?</a>‘
-
p = re.compile(re_tags, re.DOTALL)
-
-
tags = p.findall(str(tags_all[0]))
-
if tags:
-
tags_url = {}
-
#print tags
-
for tag in tags:
-
tag_url = tag[0].decode(‘utf-8‘)
-
#print tag_url
-
m_type = tag[1].decode(‘utf-8‘)
-
tags_url[m_type] = tag_url
-
-
else:
-
print "Not Find"
-
return tags_url
-
-
#获取每个分类的页数
-
def get_pages(tag_url):
-
tag_html = gethtml(tag_url)
-
#div class="paginator
-
soup = BeautifulSoup(tag_html) #过滤出标记页面的html
-
#print soup
-
#<div class="mod_pagenav" id="pager">
-
div_page = soup.find_all(‘div‘, {‘class‘ : ‘mod_pagenav‘, ‘id‘ : ‘pager‘})
-
#print div_page #len(div_page), div_page[0]
-
-
#<a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
-
re_pages = r‘<a class=.+?><span>(.+?)</span></a>‘
-
p = re.compile(re_pages, re.DOTALL)
-
pages = p.findall(str(div_page[0]))
-
#print pages
-
if len(pages) > 1:
-
return pages[-2]
-
else:
-
return 1
-
-
-
def getmovielist(html):
-
soup = BeautifulSoup(html)
-
-
#<ul class="mod_list_pic_130">
-
divs = soup.find_all(‘ul‘, {‘class‘ : ‘mod_list_pic_130‘})
-
#print divs
-
for div_html in divs:
-
div_html = str(div_html).replace(‘ ‘, ‘‘)
-
#print div_html
-
getmovie(div_html)
-
-
-
def getmovie(html):
-
global NUM
-
global m_type
-
global m_site
-
-
re_movie = r‘<li><a class="mod_poster_130" href="(.+?)" target="_blank" title="(.+?)"><img.+?</li>‘
-
p = re.compile(re_movie, re.DOTALL)
-
movies = p.findall(html)
-
if movies:
-
conn = pymongo.Connection(‘localhost‘, 27017)
-
movie_db = conn.dianying
-
playlinks = movie_db.playlinks
-
#print movies
-
for movie in movies:
-
#print movie
-
NUM += 1
-
print "%s : %d" % ("=" * 70, NUM)
-
values = dict(
-
movie_title = movie[1],
-
movie_url = movie[0],
-
movie_site = m_site,
-
movie_type = m_type
-
)
-
print values
-
playlinks.insert(values)
-
print "_" * 70
-
NUM += 1
-
print "%s : %d" % ("=" * 70, NUM)
-
-
#else:
-
# print "Not Find"
-
-
def getmovieinfo(url):
-
html = gethtml(url)
-
soup = BeautifulSoup(html)
-
-
#pack pack_album album_cover
-
divs = soup.find_all(‘div‘, {‘class‘ : ‘pack pack_album album_cover‘})
-
#print divs[0]
-
-
#<a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="《血滴子》独家纪录片" wl="1"> </a>
-
re_info = r‘<a href="(.+?)" target="new" title="(.+?)" wl=".+?"> </a>‘
-
p_info = re.compile(re_info, re.DOTALL)
-
m_info = p_info.findall(str(divs[0]))
-
if m_info:
-
return m_info
-
else:
-
print "Not find movie info"
-
-
return m_info
-
-
-
def insertdb(movieinfo):
-
global conn
-
movie_db = conn.dianying_at
-
movies = movie_db.movies
-
movies.insert(movieinfo)
-
-
if __name__ == "__main__":
-
global conn
-
-
tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
-
#print tags_url
-
tags_html = gethtml(tags_url)
-
#print tags_html
-
tag_urls = gettags(tags_html)
-
#print tag_urls
-
-
-
for url in tag_urls.items():
-
print str(url[1]).encode(‘utf-8‘) #,url[0]
-
maxpage = int(get_pages(str(url[1]).encode(‘utf-8‘)))
-
print maxpage
-
-
for x in range(0, maxpage):
-
#http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
-
m_url = str(url[1]).replace(‘0_20_0_-1_0.html‘, ‘‘)
-
movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
-
print movie_url
-
movie_html = gethtml(movie_url.encode(‘utf-8‘))
-
#print movie_html
-
getmovielist(movie_html)
-
time.sleep(0.1)
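Note that the script above is Python 2 code (urllib2, print statements, the long-removed pymongo.Connection). As a hedged aside for readers on Python 3, the fetch helper would look roughly like this; the rest of the script needs similar treatment (print(...), pymongo.MongoClient):

import urllib.request

def gethtml(url):
    # Python 3 counterpart of gethtml above: urllib2 was folded into urllib.request
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        return response.read()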