python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)

Posted cui_yonghua

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)相关的知识,希望对你有一定的参考价值。

一. 多进程爬取

import requests
import time
import multiprocessing
import random
from lxml import etree
from multiprocessing import Queue


class DouBanSpider:
    """Spider that scrapes Douban Top250 movie titles and ratings using multiprocessing."""

    def __init__(self):
        # Browser-like User-Agent so requests are less likely to be rejected as a bot.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: process-safe queue collecting the scraped "score<TAB>title" rows.
        self.dataQueue = Queue()
        # Running counter used when printing the collected rows.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Random 0-1s sleep to throttle the crawl and reduce the chance of a ban.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one listing page, pushing rows to the queue.

        On the start page only, returns the list of pagination URLs;
        for other pages it returns None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")
        for node in node_list:
            # Each movie's title
            title = node.xpath(".//span[@class='title']/text()")[0]
            # Each movie's rating
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Store "score<TAB>title" in the shared queue.
            self.dataQueue.put(score + "\t" + title)

        # Only the first page yields the pagination URL list; later pages don't.
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page, fan out one process per remaining page, then print results."""
        link_list = self.parsePage(self.baseURL)
        process_list = []
        for link in link_list:
            process = multiprocessing.Process(target=self.parsePage, args=[link])
            process.start()
            process_list.append(process)

        # Parent waits for every child process before draining the queue.
        for process in process_list:
            process.join()

        # Drain the queue and print each row with its index until empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1


if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # Report elapsed wall-clock time (stop - start), not the two raw timestamps.
    print(f"\n[LOG]: {stop - start} seconds...")

二. 多线程爬取

import requests
import random
import time
import threading
from lxml import etree
from queue import Queue


class DouBanSpider:
    """Spider that scrapes Douban Top250 movie titles and ratings using threads."""

    def __init__(self):
        # Browser-like User-Agent so requests are less likely to be rejected as a bot.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: thread-safe queue collecting the scraped "score<TAB>title" rows.
        self.dataQueue = Queue()
        # Running counter used when printing the collected rows.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Random 0-1s sleep to throttle the crawl and reduce the chance of a ban.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one listing page, pushing rows to the queue.

        On the start page only, returns the list of pagination URLs;
        for other pages it returns None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")
        for node in node_list:
            title = node.xpath(".//span[@class='title']/text()")[0]
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Store "score<TAB>title" in the shared queue.
            self.dataQueue.put(score + "\t" + title)

        # Only the first page yields the pagination URL list; later pages don't.
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page, fan out one thread per remaining page, then print results."""
        link_list = self.parsePage(self.baseURL)
        thread_list = []
        for link in link_list:
            thread = threading.Thread(target=self.parsePage, args=[link])
            thread.start()
            thread_list.append(thread)

        # Main thread waits for every worker thread before draining the queue.
        for thread in thread_list:
            thread.join()

        # Drain the queue and print each row with its index until empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1


if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # Report elapsed wall-clock time for the whole crawl.
    print(f"\n[LOG]: {stop - start} seconds...")

三. 协程爬取

"""协程爬虫"""

import time
import gevent  
import requests
import random
from lxml import etree
from queue import Queue
from gevent import monkey

# gevent 让我们可以按同步的方式来写异步程序
# monkey.patch_all() 会在Python程序执行时动态的将网络库(socket,select,thread)
# 替换掉,变成异步的库,让我们的程序可以异步的方式处理网络相关的任务
monkey.patch_all()


class DouBanSpider:
    """Spider that scrapes Douban Top250 movie titles and ratings using gevent coroutines."""

    def __init__(self):
        # Browser-like User-Agent so requests are less likely to be rejected as a bot.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.baseURL = "https://movie.douban.com/top250"
        # dataQueue: queue collecting the scraped "score<TAB>title" rows.
        self.dataQueue = Queue()
        # Running counter used when printing the collected rows.
        self.num = 1

    def loadPage(self, url):
        """Send a GET request to *url* and return the raw response body (bytes)."""
        # Random 0-1s sleep to throttle the crawl; patched by gevent so it yields.
        time.sleep(random.random())
        return requests.get(url, headers=self.headers).content

    def parsePage(self, url):
        """Parse one listing page, pushing rows to the queue.

        On the start page only, returns the list of pagination URLs;
        for other pages it returns None.
        """
        content = self.loadPage(url)
        html = etree.HTML(content)
        node_list = html.xpath("//div[@class='info']")

        for node in node_list:
            title = node.xpath(".//span[@class='title']/text()")[0]
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # Store "score<TAB>title" in the shared queue.
            self.dataQueue.put(score + "\t" + title)

        # Only the first page yields the pagination URL list; later pages don't.
        if url == self.baseURL:
            return [self.baseURL + link for link in html.xpath("//div[@class='paginator']/a/@href")]

    def startWork(self):
        """Crawl the first page, spawn one greenlet per remaining page, then print results."""
        link_list = self.parsePage(self.baseURL)
        print(link_list)
        jobs = [gevent.spawn(self.parsePage, link) for link in link_list]
        # Block until every greenlet has finished before draining the queue.
        gevent.joinall(jobs)
        # Drain the queue and print each row with its index until empty.
        while not self.dataQueue.empty():
            print(self.num)
            print(self.dataQueue.get())
            self.num += 1


if __name__ == "__main__":
    spider = DouBanSpider()
    start = time.time()
    spider.startWork()
    stop = time.time()
    # Report elapsed wall-clock time (stop - start), not the two raw timestamps.
    print(f"\n[LOG]: {stop - start} seconds...")

以上是关于python3分别用多进程,多线程,协程爬取豆瓣top250数据(python经典编程案例)的主要内容,如果未能解决你的问题,请参考以下文章

利用协程多任务协程爬取前几页投诉网

利用协程多任务协程爬取前几页投诉网

python爬虫之多线程threading多进程multiprocessing协程aiohttp 批量下载图片

python爬虫之多线程threading多进程multiprocessing协程aiohttp 批量下载图片

单线程多线程多进程协程比较,以爬取新浪军事历史为例

爬虫协程爬取