Web crawling: CrawlSpider
A CrawlSpider-based crawler for the Sunshine complaint site (阳光投诉网)
Full-site data crawling with CrawlSpider
- CrawlSpider is simply another kind of spider class; it is a subclass of Spider.
- Create a CrawlSpider-based spider file (the generated template is sketched below):
    - scrapy genspider -t crawl spiderName www.xxx.com
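Running that command produces a spider file roughly like the following sketch. The class name and URLs come from the command's arguments; the exact template text varies slightly between Scrapy versions, so treat this as an approximation rather than the verbatim output:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SpidernameSpider(CrawlSpider):
    name = 'spiderName'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # One Rule per link pattern: the LinkExtractor finds links matching
    # the allow regex, and the callback parses each resulting response.
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item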
sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, SunProDetail


# Basic full-site crawl (pagination only):
# class SunSpider(CrawlSpider):
#     name = 'sun'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
#     # Link extractor:
#     # extracts links according to the specified rule (allow: a regex)
#     link = LinkExtractor(allow=r'type=4&page=\d+')
#     rules = (
#         # Rule parser:
#         # parses the page source behind each extracted link with the given callback
#         Rule(link, callback='parse_item', follow=True),
#         # follow=True: keep applying the link extractor to the pages
#         # reached through the links it has already extracted
#     )
#
#     def parse_item(self, response):
#         print(response)


# Deep crawl (list pages plus detail pages)
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Link extractor: extracts links according to the specified rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # A second link extractor for the detail-page links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # Rule parser: parses the page source behind each extracted link
        # with the given callback
        Rule(link, callback='parse_item', follow=False),
        # follow=True would keep applying the link extractor to the pages
        # it discovers; here we stop at the extracted pages themselves
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = SunProDetail()
        item['content'] = content
        item['num'] = num
        yield item
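Before this spider will run, the project settings usually need a few tweaks. A minimal settings.py sketch, assuming the project is named sunPro (as the import above suggests); the user-agent string is a placeholder of your choice:

# settings.py -- minimal sketch, assuming a project named sunPro
BOT_NAME = 'sunPro'
ROBOTSTXT_OBEY = False           # the site's robots.txt would otherwise block the crawl
LOG_LEVEL = 'ERROR'              # keep console output readable
USER_AGENT = 'Mozilla/5.0 ...'   # placeholder: any browser-like UA string
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
}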
items.py
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()


class SunProDetail(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
pipelines.py
class SunproPipeline(object):
    def process_item(self, item, spider):
        # Both item types pass through the same pipeline, so branch on the class
        if item.__class__.__name__ == 'SunProDetail':
            content = item['content']
            num = item['num']
        else:
            title = item['title']   # SunproItem carries title and num, not content
            num = item['num']
        return item
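Because the list page and the detail page arrive as two separate item types, the shared num field is what lets you join them downstream. Below is a hypothetical sketch of that idea: the SunproMergePipeline name and the in-memory dict (standing in for a real database keyed on num) are assumptions, not part of the original code. It also uses isinstance, which is a more idiomatic check than comparing __class__.__name__, at the cost of importing the item class into the pipeline module:

from sunPro.items import SunProDetail  # needed for the isinstance check


class SunproMergePipeline(object):
    """Hypothetical sketch: merge list-page and detail-page items on num."""

    def open_spider(self, spider):
        self.records = {}  # num -> partially assembled record

    def process_item(self, item, spider):
        record = self.records.setdefault(item['num'], {})
        if isinstance(item, SunProDetail):
            record['content'] = item['content']
        else:
            record['title'] = item['title']
        return item

    def close_spider(self, spider):
        # persist self.records here (file, database, ...)
        pass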