python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)
Posted cui_yonghua
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)相关的知识,希望对你有一定的参考价值。
一. 多进程爬取
import requests
import time
import multiprocessing
import random
from lxml import etree
from multiprocessing import Queue
class DouBanSpider:
    """Crawler that collects Douban Top250 movie titles and scores using child processes."""

    def __init__(self):
        # NOTE: the scraped original lost this `__init__` line; without it the
        # `self.*` assignments are a NameError at class-definition time.
        # Desktop UA so Douban serves the regular HTML page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: process-safe queue holding one "score<TAB>title" string per movie.
        self.dataQueue = Queue()
        # Running counter used only when printing results.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Sleep 0-1 s at random to throttle requests and avoid getting banned.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one result page and push (score, title) records to the queue.

        When *url* is the start URL, also return the list of pagination
        URLs found on that page; for any other page return None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title
            title = node.xpath(".//span[@class='title']/text()")[0]
            # Movie rating
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Tab-separated record (original had a doubled backslash: "\\t").
            self.dataQueue.put(score + "\t" + title)
        # Only the first page yields the full list of pagination links;
        # subsequent pages do not re-collect them.
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page synchronously, then the remaining pages in child processes."""
        link_list = self.parsePage(self.baseURL)
        process_list = []
        for link in link_list:
            process = multiprocessing.Process(target=self.parsePage, args=[link])
            process.start()
            process_list.append(process)
        # Parent waits for every child process before reading results.
        for process in process_list:
            process.join()
        # Drain the queue and print each record until it is empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1
if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # BUGFIX: the original f-string was "{stop} - {start}", which printed the
    # two raw timestamps instead of the elapsed duration. Also restore the
    # real newline escape ("\n") that the scrape doubled into "\\n".
    print(f"\n[LOG]: {stop - start} seconds...")
二. 多线程爬取
import requests
import random
import time
import threading
from lxml import etree
from queue import Queue
class DouBanSpider:
    """Crawler that collects Douban Top250 movie titles and scores using worker threads."""

    def __init__(self):
        # NOTE: the scraped original lost this `__init__` line; without it the
        # `self.*` assignments are a NameError at class-definition time.
        # Desktop UA so Douban serves the regular HTML page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: thread-safe queue holding one "score<TAB>title" string per movie.
        self.dataQueue = Queue()
        # Running counter used only when printing results.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Sleep 0-1 s at random to throttle requests and avoid getting banned.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one result page and push (score, title) records to the queue.

        When *url* is the start URL, also return the list of pagination
        URLs found on that page; for any other page return None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")
        for node in node_list:
            title = node.xpath(".//span[@class='title']/text()")[0]
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Tab-separated record (original had a doubled backslash: "\\t").
            self.dataQueue.put(score + "\t" + title)
        # Only the first page yields the full list of pagination links;
        # subsequent pages do not re-collect them.
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page synchronously, then the remaining pages in threads."""
        link_list = self.parsePage(self.baseURL)
        thread_list = []
        for link in link_list:
            thread = threading.Thread(target=self.parsePage, args=[link])
            thread.start()
            thread_list.append(thread)
        # Main thread waits for every worker before reading results.
        for thread in thread_list:
            thread.join()
        # Drain the queue and print each record until it is empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1
if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # Restore the real newline escape ("\n") that the scrape doubled into "\\n".
    print(f"\n[LOG]: {stop - start} seconds...")
三. 协程爬取
"""协程爬虫"""
import time
import gevent
import requests
import random
from lxml import etree
from queue import Queue
from gevent import monkey
# gevent 让我们可以按同步的方式来写异步程序
# monkey.patch_all() 会在Python程序执行时动态的将网络库(socket,select,thread)
# 替换掉,变成异步的库,让我们的程序可以异步的方式处理网络相关的任务
monkey.patch_all()
class DouBanSpider:
    """Crawler that collects Douban Top250 movie titles and scores using gevent greenlets."""

    def __init__(self):
        # NOTE: the scraped original lost this `__init__` line; without it the
        # `self.*` assignments are a NameError at class-definition time.
        # Desktop UA so Douban serves the regular HTML page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: queue holding one "score<TAB>title" string per movie.
        self.dataQueue = Queue()
        # Running counter used only when printing results.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Sleep 0-1 s at random to throttle requests; with monkey.patch_all()
        # applied, this sleep cooperatively yields to other greenlets.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one result page and push (score, title) records to the queue.

        When *url* is the start URL, also return the list of pagination
        URLs found on that page; for any other page return None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")
        for node in node_list:
            title = node.xpath(".//span[@class='title']/text()")[0]
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Tab-separated record (original had a doubled backslash: "\\t").
            self.dataQueue.put(score + "\t" + title)
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page synchronously, then the remaining pages as greenlets."""
        link_list = self.parsePage(self.baseURL)
        print(link_list)
        jobs = [gevent.spawn(self.parsePage, link) for link in link_list]
        # Block until every greenlet has finished before reading results.
        gevent.joinall(jobs)
        # Drain the queue and print each record until it is empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1
if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # BUGFIX: the original f-string was "{stop} - {start}", which printed the
    # two raw timestamps instead of the elapsed duration. Also restore the
    # real newline escape ("\n") that the scrape doubled into "\\n".
    print(f"\n[LOG]: {stop - start} seconds...")
以上是关于python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)的主要内容,如果未能解决你的问题,请参考以下文章
python爬虫之多线程threading多进程multiprocessing协程aiohttp 批量下载图片