Scrapy Learning 2: Spider Middleware and Downloader Middleware (Adding Proxies)
Posted by 王大拿
Middleware
Note: all of these middlewares live in the project's middlewares.py.
Downloader middleware
Purpose: downloader middleware hooks into every request on its way to the downloader (process_request) and every response on its way back (process_response), which makes it the natural place to attach proxies, custom headers, cookies, or client certificates.
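For orientation, a minimal skeleton of a downloader middleware is sketched below; the class name is made up for illustration, while the three method names and signatures are Scrapy's standard downloader-middleware hooks:

# middlewares.py -- illustrative skeleton only

class MyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for every request before it is handed to the downloader.
        # Return None to continue, or a Response/Request object to short-circuit.
        return None

    def process_response(self, request, response, spider):
        # Called for every response on its way back to the engine and the spider.
        return response

    def process_exception(self, request, exception, spider):
        # Called when the download handler or an earlier process_request() raises.
        return None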
Example:
The proxy IP gets banned, so add proxies.
Method 1: use Scrapy's built-in proxy support
# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        os.environ['HTTP_PROXY'] = "http://192.168.11.11"
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response)
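The built-in HttpProxyMiddleware also honors a proxy set per request in request.meta, which avoids touching process-wide environment variables. A minimal sketch, written as a drop-in replacement for the start_requests method of the ChoutiSpider above (the proxy address is the same placeholder used there):

    def start_requests(self):
        for url in self.start_urls:
            # meta['proxy'] is picked up by Scrapy's built-in HttpProxyMiddleware
            yield Request(url=url, callback=self.parse,
                          meta={'proxy': 'http://192.168.11.11'})

Note that the environment-variable approach is per scheme: for the https:// start URL above, the HTTPS_PROXY variable is the one that applies.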
Method 2: add proxies with a custom middleware. Use this when you have several proxies and want to rotate through them at random to avoid getting banned; this is the approach used most often.
Add the custom proxy class (and its process_request method) to middlewares.py.
The code is as follows:
import base64
import random

import six


def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`.

    If `text` is already a bytes object, return it as-is.
    """
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class MyProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        proxy_list = [
            {'ip_port': '111.11.228.75:80', 'user_pass': 'xxx:123'},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(proxy_list)
        request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
        if proxy['user_pass']:
            # base64.encodestring() was removed in Python 3.9; b64encode is the
            # portable replacement. Both operands of the header value must be bytes.
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass

Configuration:

DOWNLOADER_MIDDLEWARES = {
    'xiaohan.middlewares.MyProxyDownloaderMiddleware': 543,
}
Problem 2: HTTPS certificates
If the target site uses a certificate bought from a trusted CA (the certificate exists so that data sent by users cannot be intercepted and read in transit; without the matching keys it cannot be decrypted), you can crawl it normally with no extra work.
If the site did not pay for a certificate and signed its own, the crawler must carry the client certificate files when it sends requests, otherwise the data cannot be fetched.
Method: first write the code below into the project (an https.py module containing the MySSLFactory class), then add the two corresponding lines to the settings file.
20. HTTPS access

    There are two cases when accessing HTTPS sites:

    1. The target site uses a trusted certificate (supported by default):
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

    2. The target site uses a custom (self-signed) certificate:
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

        # https.py
        from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
        from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)


        class MySSLFactory(ScrapyClientContextFactory):
            def getCertificateOptions(self):
                from OpenSSL import crypto
                v1 = crypto.load_privatekey(
                    crypto.FILETYPE_PEM,
                    open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                v2 = crypto.load_certificate(
                    crypto.FILETYPE_PEM,
                    open('/Users/wupeiqi/client.pem', mode='r').read())
                return CertificateOptions(
                    privateKey=v1,    # a PKey object
                    certificate=v2,   # an X509 object
                    verify=False,
                    method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                )

    Other:
        Related classes:
            scrapy.core.downloader.handlers.http.HttpDownloadHandler
            scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
            scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
        Related settings:
            DOWNLOADER_HTTPCLIENTFACTORY
            DOWNLOADER_CLIENTCONTEXTFACTORY
Spider middleware
Note: spider middleware wraps the spider's input and output. process_spider_input runs after each download finishes but before the parse callback executes, and process_start_requests runs only once, when the spider's start_requests is first consumed.
Code
# middlewares.py

class XiaohanSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    # Runs after each download completes, before the parse callback executes.
    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        print('process_spider_input', response)
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        print('process_spider_output', response)
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # Triggered once, when the spider starts and start_requests is first executed.
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        print('process_start_requests')
        for r in start_requests:
            yield r
Configuration in settings.py
SPIDER_MIDDLEWARES = {
    'xiaohan.middlewares.XiaohanSpiderMiddleware': 543,
}
Extensions and signals
A plain extension:
(does nothing useful on its own)
# extends.py

class MyExtension(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        return obj

Configuration:

EXTENSIONS = {
    'xiaohan.extends.MyExtension': 500,
}
Extension + signals:
extends.py
from scrapy import signals


class MyExtension(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        # When the spider opens, every function connected to the spider_opened
        # signal is triggered: here, xxxxxxxxxxx1.
        crawler.signals.connect(obj.xxxxxxxxxxx1, signal=signals.spider_opened)
        # When the spider closes, every function connected to the spider_closed
        # signal is triggered: here, uuuuuuuuuu.
        crawler.signals.connect(obj.uuuuuuuuuu, signal=signals.spider_closed)
        return obj

    def xxxxxxxxxxx1(self, spider):
        print('open')

    def uuuuuuuuuu(self, spider):
        print('close')
Configuration:
EXTENSIONS = {
    'xiaohan.extends.MyExtension': 500,
}
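The same pattern works with any of Scrapy's other built-in signals. As a hedged illustration (the class name and the counter are made up, while signals.item_scraped and its item/response/spider handler arguments are Scrapy's), an extension that counts scraped items might look like this:

from scrapy import signals


class ItemCounterExtension(object):
    """Illustrative sketch: count items with the item_scraped signal."""

    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        crawler.signals.connect(obj.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        return obj

    def item_scraped(self, item, response, spider):
        # called once for every item the spider yields
        self.count += 1

    def spider_closed(self, spider):
        print('items scraped:', self.count)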
7. Custom commands
- Create a directory (any name will do, e.g. commands) at the same level as the spiders directory.
- Inside it, create a crawlall.py file (the file name becomes the name of the custom command).
- Register the directory and run the command, as shown in the sketch after the code below.
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        # the syntax the command supports
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        # get the names of every spider in the project
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            # crawler_process is the entry point for running crawlers,
            # e.g. self.crawler_process.crawl('chouti')
            self.crawler_process.crawl(name, **opts.__dict__)
        # start the crawl
        self.crawler_process.start()
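To make Scrapy pick the command up, register the directory through the COMMANDS_MODULE setting; a minimal sketch, assuming the project and directory are named xiaohan and commands as above:

# settings.py
COMMANDS_MODULE = 'xiaohan.commands'

After that, running scrapy crawlall from the project directory starts every spider in the project.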
11. TinyScrapy
The three snippets below rebuild a miniature version of Scrapy's engine on top of Twisted: first plain Deferreds with callbacks, then the same flow with @defer.inlineCallbacks, and finally a scheduler-like loop that keeps issuing requests until a stop condition fires.
from twisted.web.client import getPage
from twisted.internet import reactor
from twisted.internet import defer

url_list = ['http://www.bing.com', 'http://www.baidu.com', ]


def callback(arg):
    print('one response back', arg)


defer_list = []
for url in url_list:
    ret = getPage(bytes(url, encoding='utf8'))
    ret.addCallback(callback)
    defer_list.append(ret)


def stop(arg):
    print('all downloads finished', arg)
    reactor.stop()


d = defer.DeferredList(defer_list)
d.addBoth(stop)
reactor.run()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from twisted.web.client import getPage
from twisted.internet import reactor
from twisted.internet import defer


@defer.inlineCallbacks
def task(url):
    ret = getPage(bytes(url, encoding='utf8'))
    ret.addCallback(callback)
    yield ret


def callback(arg):
    print('one response back', arg)


url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
defer_list = []
for url in url_list:
    ret = task(url)
    defer_list.append(ret)


def stop(arg):
    print('all downloads finished', arg)
    reactor.stop()


d = defer.DeferredList(defer_list)
d.addBoth(stop)
reactor.run()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor


def _next_request():
    _next_request_from_scheduler()


def _next_request_from_scheduler():
    ret = getPage(bytes('http://www.chouti.com', encoding='utf8'))
    ret.addCallback(callback)
    ret.addCallback(lambda _: reactor.callLater(0, _next_request))


_closewait = None


@defer.inlineCallbacks
def engine_start():
    global _closewait
    _closewait = defer.Deferred()
    yield _closewait


@defer.inlineCallbacks
def task(url):
    reactor.callLater(0, _next_request)
    yield engine_start()


counter = 0


def callback(arg):
    global counter
    counter += 1
    if counter == 10:
        _closewait.callback(None)
    print('one', len(arg))


def stop(arg):
    print('all done', arg)
    reactor.stop()


if __name__ == '__main__':
    url = 'http://www.cnblogs.com'
    defer_list = []
    deferObj = task(url)
    defer_list.append(deferObj)
    v = defer.DeferredList(defer_list)
    v.addBoth(stop)
    reactor.run()
Additional notes
How to create a second spider in the same project: run scrapy genspider <name> <domain> again from the project directory and a new spider module is generated under spiders/.
Deduplication with scrapy-redis
The principle is to put the fingerprint of every visited URL into a set, then check whether each new request has already been visited.
(Redis itself is covered here only as a supplement.)
Source-code walkthrough:
Configuration file (to use scrapy-redis, these are the settings you modify; a sketch follows below)
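A rough, hedged sketch of what that configuration usually looks like (the setting names follow the scrapy-redis documentation; the host, port and persistence values are placeholders, not the teacher's exact configuration):

# settings.py
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # redis-backed dedup
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # redis-backed scheduler
SCHEDULER_PERSIST = True       # keep the queue and fingerprints in redis between runs
REDIS_HOST = '127.0.0.1'       # placeholder connection info
REDIS_PORT = 6379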
Code (in the teacher's notes)
If you want to customize the deduplication rule, or extend it (a sketch follows below):
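A minimal sketch of a custom dedup filter, assuming Scrapy's BaseDupeFilter interface and the request_fingerprint helper (the class name and the in-memory set are illustrative; scrapy-redis does the same thing with a Redis set so several crawler processes can share it). It is wired in through the DUPEFILTER_CLASS setting:

# dupefilters.py (hypothetical module in the xiaohan project)
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class MyDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.visited = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # the fingerprint is a stable hash of method + url + body
        fp = request_fingerprint(request)
        if fp in self.visited:
            return True        # seen before, so the scheduler drops the request
        self.visited.add(fp)
        return False

# settings.py
# DUPEFILTER_CLASS = 'xiaohan.dupefilters.MyDupeFilter'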
Supplementary Redis knowledge
Blog
Custom scheduler
Location: the scheduler class is plugged in through the SCHEDULER setting (scrapy-redis ships its own in scrapy_redis.scheduler).
The three ways of combining the scheduler with the dedup rule: the teacher's code (in the notes).