Scrapy Crawler
Posted by 123why
1. Installation

(For CentOS 7 see: https://www.jb51.net/article/136478.htm)

On Windows:

a. pip3 install wheel
b. pip3 install Twisted
c. pip3 install pywin32
d. pip3 install scrapy

Check that the install worked:

    C:\Users\12164> scrapy
    Scrapy 1.8.0 - no active project

    Usage:
      scrapy <command> [options] [args]

    Available commands:
      bench         Run quick benchmark test
      fetch         Fetch a URL using the Scrapy downloader
      genspider     Generate new spider using pre-defined templates
      runspider     Run a self-contained spider (without creating a project)
      settings      Get settings values
      shell         Interactive scraping console
      startproject  Create new project            # create a new crawler project
      version       Print Scrapy version
      view          Open URL in browser, as seen by Scrapy
      crawl         Run a spider                  # must be run from inside the project directory

      [ more ]      More commands available when run from project directory

    Use "scrapy <command> -h" to see more info about a command

2. Creating a project

a. Open cmd, cd into the directory where the project should live, then run:

    scrapy startproject chouti

    New Scrapy project 'chouti', using template directory 'd:\python37\lib\site-packages\scrapy\templates\project', created in:
        E:\python\chouti

    You can start your first spider with:
        cd chouti
        scrapy genspider example example.com

b. cd chouti

c. scrapy genspider get_chouti chouti.com
   # "Cannot create a spider with the same name as your project": the spider name must differ from the project name

    Created spider 'get_chouti' using template 'basic' in module:
      chouti.spiders.get_chouti

d. Open the generated spider and edit it. The project layout:

    chouti
    |__ chouti
    |   |__ spiders
    |   |   |__ __pycache__
    |   |   |__ __init__.py
    |   |   |__ get_chouti.py
    |   |__ __pycache__
    |   |__ __init__.py
    |   |__ items.py         # item classes used to structure the scraped response
    |   |__ middlewares.py
    |   |__ pipelines.py     # persistence: items built from the response get written to disk here
    |   |__ settings.py      # project configuration
    |__ scrapy.cfg

e. scrapy crawl <name> --nolog      # --nolog suppresses the log output
   # If nothing is printed even though the spider contains print() calls, re-run with logging:
   # a 403 error usually means you need to set USER_AGENT in settings.py.

Example: scrape the first 25 entries of the Douban Books Top 250.

get_chouti.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChoutiItem
import time


class GetChoutiSpider(scrapy.Spider):
    name = 'get_chouti'
    # Follow-up requests must stay under this domain; pass dont_filter=True on a Request to bypass the filter.
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/top250?icn=index-book250-all']

    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield Request(url, callback=self.parse)

    def parse(self, response):
        # parse() is the default callback; it can be replaced by overriding start_requests()
        table_hxs = Selector(response).xpath('//div[@class="article"]/div[@class="indent"]/table')  # selector
        # title, author and other basic information
        for one_table in table_hxs:
            book_item = ChoutiItem()
            title = one_table.xpath('.//a/@title').extract()[0]
            info = one_table.xpath('.//p[@class="pl"]/text()').extract()[0]
            # print(title)
            book_item["title"] = title
            book_item["info"] = info
            # link to the book's detail page
            time.sleep(10)  # Douban rate-limits frequent requests
            url = one_table.xpath('.//a[re:test(@href, "https://book.douban.com/subject/\d+")]/@href').extract()
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
            # hand the link to the scheduler
            yield Request(url=url[0], method="GET", headers=headers,
                          callback=self.get_book_msg, meta={"book_item": book_item})

    def get_book_msg(self, response, *args, **kwargs):
        book_item_used = response.meta["book_item"]
        # book introduction and author introduction
        info_hxs = Selector(response).xpath("//div[@class='related_info']//div[re:test(@class,'indent*')]")
        content_hxs = info_hxs[0].xpath(".//div[@class='intro']/p/text()").extract()
        content = ''
        for i in content_hxs:
            if i not in content:
                content += i
        book_item_used['introduction'] = content
        auth_hxs = info_hxs[1].xpath(".//div[@class='intro']/p/text()").extract()
        auth_info = ''
        for i in auth_hxs:
            if i not in auth_info:
                auth_info += i
        book_item_used['author'] = auth_info
        # yield the item into the pipeline; the pipeline must be registered in settings.py:
        # ITEM_PIPELINES = {
        #     'chouti.pipelines.ChoutiPipeline': 300,   # 300 is the priority
        # }
        yield book_item_used
        print(response)
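A side note, not part of the original example: the spider above leaves start_requests() commented out. As a minimal sketch (spider name and header value are my own placeholders), start_requests() can also be used to attach the User-Agent to the very first request, which is handy when the seed URL itself comes back with a 403 before USER_AGENT is set in settings.py:

import scrapy

class HeaderDemoSpider(scrapy.Spider):
    # hypothetical spider, only to illustrate start_requests()
    name = 'header_demo'
    start_urls = ['https://book.douban.com/top250?icn=index-book250-all']

    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # example value
        for url in self.start_urls:
            # dont_filter=True keeps the scheduler from filtering out the seed URL
            yield scrapy.Request(url, headers=headers, dont_filter=True, callback=self.parse)

    def parse(self, response):
        self.logger.info("fetched %s with status %s", response.url, response.status)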
items.py

import scrapy


class ChoutiItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    info = scrapy.Field()
    introduction = scrapy.Field()
    author = scrapy.Field()


pipelines.py

class ChoutiPipeline(object):
    def process_item(self, item, spider):
        book = "Title: %s\nInfo: %s\nIntroduction: %s\nAuthor: %s\n" % (
            item["title"], item["info"], item["introduction"], item["author"])
        # print(book)
        with open("new.json", "a") as f:
            f.write(book)


# Custom commands
"""
a. Create a directory named commands at the same level as the spiders directory.
b. Inside it create crawlall.py -- the file name becomes the command name:

    from scrapy.commands import ScrapyCommand
    from scrapy.utils.project import get_project_settings

    class Command(ScrapyCommand):
        # runs every spider in the project
        requires_project = True

        def syntax(self):
            return '[options]'

        def short_desc(self):
            return 'Runs all of the spiders'

        def run(self, args, opts):
            spider_list = self.crawler_process.spiders.list()
            for name in spider_list:
                self.crawler_process.crawl(name, **opts.__dict__)
            self.crawler_process.start()

c. Register it in settings.py:  COMMANDS_MODULE = 'project_name.directory_name'
"""

# Pipelines
"""
The pipeline above opens and closes the file for every single item it persists. Instead, open the file
once when the spider starts and close it once when the spider finishes:

class ChoutiPipeline(object):
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings.get("NAME") reads any variable defined in settings.py; names must be upper-case
        return cls(crawler)

    def open_spider(self, spider):
        self.f = open(name, mode)

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(data)
        # return item       -> hands the item to the pipeline whose priority comes right after this one
        # raise DropItem()  -> from scrapy.exceptions import DropItem; raise it if the item should go no further
"""
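A concrete sketch of that open-once/close-once idea (file name, encoding and output format are my own choices, not from the original post); it would be registered in ITEM_PIPELINES the same way as ChoutiPipeline:

class ChoutiFilePipeline(object):
    def open_spider(self, spider):
        # called exactly once, when the spider starts
        self.f = open("books.txt", mode="a", encoding="utf-8")

    def close_spider(self, spider):
        # called exactly once, when the spider closes
        self.f.close()

    def process_item(self, item, spider):
        self.f.write("%s | %s\n" % (item["title"], item["info"]))
        return item  # pass the item on to any lower-priority pipeline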
# HTTPS
"""
There are two situations when crawling HTTPS sites:

1. The site uses a certificate signed by a trusted CA (supported by default): the server buys the
   certificate, the client already ships with the issuing CA, and the certificate returned by the server
   is validated on access:

    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

2. The site uses a custom certificate (or a client certificate must be presented), so a custom context
   factory is needed:

    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

    # https.py
    from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
    from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

    class MySSLFactory(ScrapyClientContextFactory):
        def getCertificateOptions(self):
            from OpenSSL import crypto
            v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
            v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
            return CertificateOptions(
                privateKey=v1,    # PKey object
                certificate=v2,   # X509 object
                verify=False,
                method=getattr(self, 'method', getattr(self, '_ssl_method', None))
            )

Related classes:
    scrapy.core.downloader.handlers.http.HttpDownloadHandler
    scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
    scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
Related settings:
    DOWNLOADER_HTTPCLIENTFACTORY
    DOWNLOADER_CLIENTCONTEXTFACTORY
"""

# Selectors
"""
//                    # anywhere among descendants of the document root
.//                   # among descendants of the current node
/                     # direct children
/div                  # a div among the direct children
/div[@id="i1"]        # a child div whose id is i1
obj.extract()         # convert every object in the list to a string  => list
obj.extract_first()   # convert and return only the first element of the list
//div/text()          # the text of a tag
//a[@id='top']/@href  # the a tag with id "top"; take its href attribute
//div[re:test(@class,'indent*')]   # match an attribute with a regular expression

# from scrapy.selector import Selector                                   # import the module
# hxs = Selector(response)                                               # wrap the response in a Selector
# hxs = Selector(response=response).xpath('//a')                         # every a tag among descendants
# hxs = Selector(response=response).xpath('//a[2]')                      # the second a tag (XPath indexes from 1)
# hxs = Selector(response=response).xpath('//a[@id]')                    # a tags that have an id attribute
# hxs = Selector(response=response).xpath('//a[@id="i1"]')               # a tags whose id is i1
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')  # match several attributes at once
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')      # href contains "link"
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')   # href starts with "link"
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')         # regular-expression match on id
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()  # convert the matches to strings
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()   # /@href extracts the attribute
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()        # first element of the list
"""
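A small sketch of my own, not from the post: inside a spider you rarely need to build Selector(response) by hand, because the response object exposes the same API, and Scrapy 1.8 also understands CSS selectors and the .get()/.getall() spellings of extract_first()/extract():

def parse(self, response):
    # response.xpath()/response.css() return a SelectorList, just like Selector(response).xpath()
    first_href = response.xpath('//a[@id="top"]/@href').get(default='')   # '' if nothing matched
    titles = response.css('div.article a::attr(title)').getall()          # list of strings
    self.logger.info("first href: %s, %d titles", first_href, len(titles))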
# settings.py
"""
DEPTH_LIMIT = 2                                        # how many levels deep to crawl
DUPEFILTER_CLASS = "chouti.dupefilter.Repeatfilter"    # use a custom duplicate filter
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'   # pretend to be a browser

ITEM_PIPELINES = {
    'chouti.pipelines.ChoutiPipeline': 300,   # 300 is the priority; an item passes through every registered
}                                             # pipeline in priority order (a pipeline can also drop it)

EXTENSIONS = {                                # hook extra behaviour onto signals
    'scrapy.extensions.telnet.TelnetConsole': None,
}

#===> Part 1: basic configuration <===

#1. Project name; the default USER_AGENT is built from it, and it is also used as the logger name
BOT_NAME = 'Amazon'

#2. Spider module paths
SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'

#3. Client User-Agent request header
#USER_AGENT = 'Amazon (+http://www.yourdomain.com)'

#4. Obey robots.txt rules?
ROBOTSTXT_OBEY = False

#5. Cookie support (handled through a cookiejar); enabled by default
#COOKIES_ENABLED = False

#6. Telnet console for inspecting and controlling the running crawler: telnet <ip> <port>, then issue commands
#TELNETCONSOLE_ENABLED = False
#TELNETCONSOLE_HOST = '127.0.0.1'
#TELNETCONSOLE_PORT = [6023,]

#7. Default headers used for every HTTP request Scrapy sends
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

#===> Part 2: concurrency and delays <===

#1. Maximum number of concurrent requests handled by the downloader; default 16
#CONCURRENT_REQUESTS = 32

#2. Maximum concurrent requests per domain; default 8
#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#3. Maximum concurrent requests per IP; default 0 (unlimited). Two things to note:
#   I.  If non-zero, CONCURRENT_REQUESTS_PER_DOMAIN is ignored: concurrency is limited per IP, not per domain.
#   II. It also affects DOWNLOAD_DELAY: when non-zero, the delay is applied per IP rather than per domain.
#CONCURRENT_REQUESTS_PER_IP = 16

#4. Without AutoThrottle, a fixed number of seconds to wait between requests to the same site
#DOWNLOAD_DELAY = 3

#===> Part 3: smart throttling (the AutoThrottle extension) <===
# from scrapy.extensions.throttle import AutoThrottle
# http://scrapy.readthedocs.io/en/latest/topics/autothrottle.html#topics-autothrottle

# Goals:
# 1. Be nicer to the target site than a fixed download delay.
# 2. Automatically adjust Scrapy to the optimal crawl speed, so the user only has to define the maximum
#    concurrency and the extension takes care of the delays.

# How it works:
# Scrapy measures the download latency as the time between establishing the TCP connection and receiving
# the HTTP headers. Because Scrapy may be busy running spider callbacks or temporarily unable to download,
# these latencies are hard to measure exactly in a cooperative multitasking environment, but they are still
# a reasonable indicator of how busy Scrapy (and the server) is, and the extension builds on that premise.

# Throttling algorithm:
# 1. Spiders start with a download delay of AUTOTHROTTLE_START_DELAY.
# 2. When a response arrives, the target delay = response latency / AUTOTHROTTLE_TARGET_CONCURRENCY.
# 3. The delay for the next request is the average of the target delay and the previous delay.
# 4. The delay is not allowed to decrease until 200 responses have been received.
# 5. The delay can never go below DOWNLOAD_DELAY or above AUTOTHROTTLE_MAX_DELAY.

AUTOTHROTTLE_ENABLED = True            # enable (default False)
AUTOTHROTTLE_START_DELAY = 5           # initial delay
DOWNLOAD_DELAY = 3                     # minimum delay
AUTOTHROTTLE_MAX_DELAY = 10            # maximum delay
# Average number of requests sent in parallel; must not exceed CONCURRENT_REQUESTS_PER_DOMAIN or
# CONCURRENT_REQUESTS_PER_IP. Raising it increases throughput but hits the target site harder; lowering it
# is more polite. At any given moment the actual concurrency may be above or below this value -- it is a
# target the crawler tries to reach, not a hard limit.
AUTOTHROTTLE_TARGET_CONCURRENCY = 16.0
AUTOTHROTTLE_DEBUG = True              # debug output
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

#===> Part 4: crawl depth and crawl order <===

#1. Maximum depth the spider may reach (0 = unlimited); the current depth is available via request.meta
# DEPTH_LIMIT = 3

#2. Crawl order: 0 = depth-first, LIFO (default); 1 = breadth-first, FIFO
# depth-first (last in, first out)
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
# breadth-first (first in, first out)
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

#3. Scheduler queue
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler

#4. URL deduplication
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'

#===> Part 5: middlewares, pipelines, extensions <===

#1. Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Amazon.middlewares.AmazonSpiderMiddleware': 543,
#}

#2. Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'Amazon.middlewares.DownMiddleware1': 543,
}

#3. Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

#4. Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'Amazon.pipelines.CustomPipeline': 200,
}

#===> Part 6: caching <===
# Cache requests/responses that have already been downloaded so they can be reused later.
# from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
# from scrapy.extensions.httpcache import DummyPolicy
# from scrapy.extensions.httpcache import FilesystemCacheStorage

# Enable the HTTP cache
# HTTPCACHE_ENABLED = True

# Cache policy: cache every request; identical requests are later served from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"

# Cache policy: respect the HTTP caching headers (Cache-Control, Last-Modified, ...)
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# Cache expiry in seconds (0 = never expire)
# HTTPCACHE_EXPIRATION_SECS = 0

# Directory where the cache is stored
# HTTPCACHE_DIR = 'httpcache'

# HTTP status codes that are never cached
# HTTPCACHE_IGNORE_HTTP_CODES = []

# Storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
"""
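To tie the settings above together, here is a minimal settings.py fragment I would start from for a polite crawl with local caching during development; the values are illustrative choices of mine, not recommendations from the original post:

# settings.py (fragment) -- illustrative values only
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'   # placeholder browser UA

DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 4

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 30
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# cache everything locally while developing, so re-runs do not hit the site again
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_DIR = 'httpcache'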
# Cookies
"""
# import
from scrapy.http.cookies import CookieJar

# usage
cj = CookieJar()
cj.extract_cookies(response, response.request)
print(cj._cookies)      # the cookies extracted from the response
"""

# Extensions
"""
Every stage of the crawl can be extended: run something when the spider starts, when a Request is handed
to the scheduler via yield Request(), and so on.

a. Create a .py file containing a class (any name) and connect its methods to signals. Scrapy's own
   telnet console (excerpted below from the Scrapy source) shows the pattern:

    from scrapy import signals

    class TelnetConsole(protocol.ServerFactory):

        def __init__(self, crawler):
            if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
                raise NotConfigured
            if not TWISTED_CONCH_AVAILABLE:
                raise NotConfigured(
                    'TELNETCONSOLE_ENABLED setting is True but required twisted '
                    'modules failed to import: ' + _TWISTED_CONCH_TRACEBACK)
            self.crawler = crawler
            self.noisy = False
            self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
            self.host = crawler.settings['TELNETCONSOLE_HOST']
            self.username = crawler.settings['TELNETCONSOLE_USERNAME']
            self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
            if not self.password:
                self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
                logger.info('Telnet Password: %s', self.password)
            # this is the registration: the first argument is the function to run, the second is the signal
            self.crawler.signals.connect(self.start_listening, signals.engine_started)

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def start_listening(self):
            pass

b. The available signals:

    engine_started = object()
    engine_stopped = object()
    spider_opened = object()
    spider_idle = object()
    spider_closed = object()
    spider_error = object()
    request_scheduled = object()
    request_dropped = object()
    request_reached_downloader = object()
    response_received = object()
    response_downloaded = object()
    item_scraped = object()
    item_dropped = object()
    item_error = object()
    # for backwards compatibility
    stats_spider_opened = spider_opened
    stats_spider_closing = spider_closed
    stats_spider_closed = spider_closed
    item_passed = item_scraped
    request_received = request_scheduled
"""
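The telnet console is a fairly heavy example of the pattern; a minimal sketch of my own (class name and module path are hypothetical) would be enabled with EXTENSIONS = {'myproject.extensions.SpiderLogger': 500} and look like this:

from scrapy import signals


class SpiderLogger(object):
    # minimal extension: log when a spider opens and closes
    def __init__(self, crawler):
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.info("spider %s opened", spider.name)

    def spider_closed(self, spider, reason):
        spider.logger.info("spider %s closed (%s)", spider.name, reason)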
# Proxies
"""
The built-in proxy middleware reads the proxy from environment variables:
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

Option 1: use the default middleware and set the environment, e.g. via os.environ:
    http_proxy  = http://root:woshiniba@192.168.11.11:9999/
    https_proxy = http://192.168.11.11:9999/

Option 2: write a custom downloader middleware:

import base64
import random
import six

def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass'] is not None:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            # base64.encodestring is deprecated; on newer Pythons use base64.encodebytes
            encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass'])).strip()
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])

# Register it in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'step8_king.middlewares.ProxyMiddleware': 500,
}
"""

# URL deduplication
"""
First point settings.py at the custom filter class:

    DUPEFILTER_CLASS = "chouti.dupefilter.Repeatfilter"

Then create a .py file with the filter class. request_seen() decides whether a request is a duplicate:
return True to mark it as already seen (the request is dropped), return False to let it through.

class Repeatfilter(object):

    @classmethod
    def from_settings(cls, settings):
        # Scrapy instantiates the filter through this hook
        return cls()

    def request_seen(self, request):
        # inspect request.url and decide whether it has been seen before
        return False

    def open(self):
        # can return deferred
        pass

    def close(self, reason):
        # can return a deferred
        pass

    def log(self, request, spider):
        # log that a request has been filtered
        pass
"""
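For reference, a sketch of what request_seen() typically does: keep a set of request fingerprints in memory, the way Scrapy's own RFPDupeFilter works (the in-memory set and the logging are my simplifications, not from the post):

from scrapy.utils.request import request_fingerprint


class Repeatfilter(object):
    def __init__(self):
        self.fingerprints = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fp = request_fingerprint(request)   # stable hash of method + URL + body
        if fp in self.fingerprints:
            return True                     # True => the scheduler drops the request
        self.fingerprints.add(fp)
        return False

    def open(self):
        pass

    def close(self, reason):
        pass

    def log(self, request, spider):
        spider.logger.debug("Filtered duplicate request: %s", request.url)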