梨视频,进程池线程池爬取
Posted kai-
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了梨视频,进程池线程池爬取相关的知识,希望对你有一定的参考价值。
[TOC]
进程池(注:下面代码使用的 multiprocessing.dummy.Pool 实际上是基于线程的池,只是沿用了进程池 Pool 的接口)
import requests, re, time
from multiprocessing.dummy import Pool
import random
# Proxy pool in the ``proxies`` format accepted by requests.
# NOTE(review): every entry only has an 'http' key, so requests does not route
# the https:// requests below through these proxies. Add an 'https' key to
# really use them - confirm the proxies are alive first.
IpPool = [
    {'http': '183.147.230.104:8118'},
    {'http': '60.217.64.237:31923'},
    {'http': '221.193.50.166:8118'},
]

BASE_URL = 'https://www.pearvideo.com/'
url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'

# (connect, read) timeouts in seconds, so a dead server cannot hang a worker.
REQUEST_TIMEOUT = (5, 30)
# Download in 64 KiB pieces; iter_content() without a size yields single bytes.
CHUNK_SIZE = 64 * 1024

# Compiled once instead of being rebuilt on every loop iteration.
LINK_RE = re.compile(r'<a href="(.*?)" class="vervideo-lilink actplay">')
SRC_RE = re.compile(r'srcUrl="(.*?)",vdoUrl=srcUrl')

start = time.time()
video_list = []


def parse_video_paths(html):
    """Return the relative detail-page paths found in the listing HTML."""
    return LINK_RE.findall(html)


def parse_video_info(html):
    """Extract the mp4 address from a detail page.

    Returns a dict ``{'v_name': <file name>, 'v_url': <mp4 url>}`` built from
    the first ``srcUrl="..."`` occurrence, or ``None`` when the page does not
    contain one (the original indexed ``[0]`` and crashed with IndexError).
    """
    match = SRC_RE.search(html)
    if match is None:
        return None
    mp4_url = match.group(1)
    # The file name is everything after the last '/' of the mp4 address.
    return {'v_name': mp4_url.rsplit('/', 1)[-1], 'v_url': mp4_url}


def collect_videos():
    """Scrape the category page and append one entry per video to ``video_list``.

    Detail pages that fail to load or carry no mp4 address are skipped with a
    message instead of aborting the whole run. Returns ``video_list``.
    """
    ret = requests.get(url, timeout=REQUEST_TIMEOUT)
    ret.raise_for_status()
    video_paths = parse_video_paths(ret.text)
    print(video_paths)
    # A distinct loop name keeps the module-level ``url`` intact.
    for path in video_paths:
        proxy = random.choice(IpPool)
        print(proxy)
        try:
            ret_detail = requests.get(BASE_URL + path, proxies=proxy,
                                      timeout=REQUEST_TIMEOUT)
            ret_detail.raise_for_status()
        except requests.RequestException as exc:
            print(f'跳过{path}: {exc}')
            continue
        dic = parse_video_info(ret_detail.text)
        if dic is None:
            print(f'{path}未找到视频地址,已跳过')
            continue
        video_list.append(dic)
    print(video_list)
    return video_list


def get_video(dic):
    """Download one video described by ``dic`` into the current directory.

    ``dic`` must provide 'v_url' (the mp4 address) and 'v_name' (the target
    file name). Network and file errors are reported and swallowed so that a
    single failing download does not affect the other pool workers.
    """
    video_url = dic['v_url']
    name = dic['v_name']
    print(f'开始下载{name}')
    print(video_url)
    try:
        # stream=True avoids loading the whole video into memory first.
        with requests.get(url=video_url, stream=True,
                          timeout=REQUEST_TIMEOUT) as video_data:
            # Do not save an HTTP error page under an .mp4 name.
            video_data.raise_for_status()
            with open(name, 'wb') as f:
                for chunk in video_data.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print(f'{name}下载失败: {exc}')
        return
    print(f'{name}下载完成')


def main():
    """Collect the video list, download everything with 12 workers, print the elapsed time."""
    collect_videos()
    pools = Pool(12)
    try:
        pools.map(get_video, video_list)
    finally:
        pools.close()
        pools.join()
    # Reported once all downloads have finished, so it covers the whole run.
    ctime = time.time() - start
    print(ctime)


if __name__ == '__main__':
    main()
线程池
import requests
import re
import random
from concurrent.futures import ThreadPoolExecutor
import time
start = time.time()
pool = ThreadPoolExecutor(12)

# Proxy pool in the ``proxies`` format accepted by requests.
# NOTE(review): every entry only has an 'http' key, so requests does not route
# the https:// requests below through these proxies. Add an 'https' key to
# really use them - confirm the proxies are alive first.
IpPool = [
    {'http': '183.147.230.104:8118'},
    {'http': '60.217.64.237:31923'},
    {'http': '221.193.50.166:8118'},
]

BASE_URL = 'https://www.pearvideo.com/'
url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'

# (connect, read) timeouts in seconds, so a dead server cannot hang a worker.
REQUEST_TIMEOUT = (5, 30)
# Download in 64 KiB pieces; iter_content() without a size yields single bytes.
CHUNK_SIZE = 64 * 1024

# Compiled once instead of being rebuilt on every loop iteration.
LINK_RE = re.compile(r'<a href="(.*?)" class="vervideo-lilink actplay">')
SRC_RE = re.compile(r'srcUrl="(.*?)",vdoUrl=srcUrl')

video_list = []


def parse_video_paths(html):
    """Return the relative detail-page paths found in the listing HTML."""
    return LINK_RE.findall(html)


def parse_video_info(html):
    """Extract the mp4 address from a detail page.

    Returns a dict ``{'v_name': <file name>, 'v_url': <mp4 url>}`` built from
    the first ``srcUrl="..."`` occurrence, or ``None`` when the page does not
    contain one (the original indexed ``[0]`` and crashed with IndexError).
    """
    match = SRC_RE.search(html)
    if match is None:
        return None
    mp4_url = match.group(1)
    # The file name is everything after the last '/' of the mp4 address.
    return {'v_name': mp4_url.rsplit('/', 1)[-1], 'v_url': mp4_url}


def collect_videos():
    """Scrape the category page and append one entry per video to ``video_list``.

    Detail pages that fail to load or carry no mp4 address are skipped with a
    message instead of aborting the whole run. Returns ``video_list``.
    """
    ret = requests.get(url, timeout=REQUEST_TIMEOUT)
    ret.raise_for_status()
    video_paths = parse_video_paths(ret.text)
    print(video_paths)
    # A distinct loop name keeps the module-level ``url`` intact.
    for path in video_paths:
        proxy = random.choice(IpPool)
        print(proxy)
        try:
            ret_detail = requests.get(BASE_URL + path, proxies=proxy,
                                      timeout=REQUEST_TIMEOUT)
            ret_detail.raise_for_status()
        except requests.RequestException as exc:
            print(f'跳过{path}: {exc}')
            continue
        dic = parse_video_info(ret_detail.text)
        if dic is None:
            print(f'{path}未找到视频地址,已跳过')
            continue
        video_list.append(dic)
    print(video_list)
    return video_list


def get_video(dic):
    """Download one video described by ``dic`` into the current directory.

    ``dic`` must provide 'v_url' (the mp4 address) and 'v_name' (the target
    file name). Network and file errors are reported here because an exception
    raised inside a submitted task would otherwise stay hidden in its future.
    """
    video_url = dic['v_url']
    name = dic['v_name']
    print(f'开始下载{name}')
    print(video_url)
    try:
        # stream=True avoids loading the whole video into memory first.
        with requests.get(url=video_url, stream=True,
                          timeout=REQUEST_TIMEOUT) as video_data:
            # Do not save an HTTP error page under an .mp4 name.
            video_data.raise_for_status()
            with open(name, 'wb') as f:
                for chunk in video_data.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print(f'{name}下载失败: {exc}')
        return
    print(f'{name}下载完成')


def main():
    """Submit one download task per entry of ``video_list`` to ``pool``.

    Returns the list of submitted futures (empty when there is nothing to
    download); the caller is responsible for shutting the pool down.
    """
    return [pool.submit(get_video, dic) for dic in video_list]


if __name__ == '__main__':
    collect_videos()
    try:
        main()
    finally:
        # Blocks until every submitted download has finished.
        pool.shutdown(wait=True)
    # Reported once all downloads have finished, so it covers the whole run.
    ctime = time.time() - start
    print(ctime)
以上是关于梨视频,进程池线程池爬取的主要内容,如果未能解决你的问题,请参考以下文章
使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中