Scrapy 爬取保险条款 -《狗嗨默示录》-
Posted 李·狗嗨
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Scrapy 爬取保险条款 -《狗嗨默示录》-相关的知识,希望对你有一定的参考价值。
items.py
class IachinaItem(scrapy.Item):
    """Container for one insurance-clause record scraped from old.iachina.cn.

    Fields are filled incrementally along the crawl chain
    company -> type -> product -> clause.
    """
    COMPANY = scrapy.Field()     # insurance company name
    TYPE = scrapy.Field()        # product category under the company
    PRODUCT = scrapy.Field()     # individual product name
    CLAUSE = scrapy.Field()      # clause title text
    CLAUSE_URL = scrapy.Field()  # absolute URL of the clause document
iachina.py
# -*- coding: utf-8 -*-
import scrapy

from IAChina.items import IachinaItem


class IachinaSpider(scrapy.Spider):
    """Crawl insurance clauses from old.iachina.cn.

    Drill-down chain: company list -> type list -> product list -> clause
    table.  One IachinaItem is built up through request meta along the way
    and yielded once the clause page is reached.
    """
    name = 'iachina'
    allowed_domains = ['old.iachina.cn']
    # The company listing is paginated; pages 1..3 are crawled.
    start_urls = ['http://old.iachina.cn/product.php?action=company&ttype=2&page={}'.format(i)
                  for i in range(1, 4)]

    def parse(self, response):
        """Parse a company-list page and follow each company link."""
        # NOTE(review): a scrapy Response is always truthy, so this guard
        # never fires; kept for parity with the original structure.
        if not response:
            self.log("Company Page error -- %s" % response.url)
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            item = IachinaItem()
            item['COMPANY'] = sel.xpath('text()').extract()
            company_href = sel.xpath('@href').extract_first()
            company_url = response.urljoin(company_href)
            yield scrapy.Request(url=company_url, meta={'item': item},
                                 callback=self.parse_type)

    def parse_type(self, response):
        """Parse a type-list page for one company and follow each type."""
        if not response:
            # fixed typo: "erroe" -> "error"
            self.log("Type Page error -- %s" % response.url)
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            # BUG FIX: copy the carried item instead of mutating the shared
            # instance.  Requests run asynchronously, so reusing one item
            # across loop iterations clobbers TYPE for sibling branches.
            item = response.meta['item'].copy()
            item['TYPE'] = sel.xpath('text()').extract()
            type_href = sel.xpath('@href').extract_first()
            type_url = response.urljoin(type_href)
            yield scrapy.Request(url=type_url, meta={'item': item},
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Parse a product-list page for one type and follow each product."""
        if not response:
            self.log("Product Page error -- %s" % response.url)
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            # Same shared-item fix as parse_type.
            item = response.meta['item'].copy()
            item['PRODUCT'] = sel.xpath('text()').extract()
            product_href = sel.xpath('@href').extract_first()
            product_url = response.urljoin(product_href)
            yield scrapy.Request(url=product_url, meta={'item': item},
                                 callback=self.parse_clause)

    def parse_clause(self, response):
        """Parse the clause table and yield one completed item per clause."""
        if not response:
            self.log("Clause Page error -- %s" % response.url)
        for sel in response.xpath('//div[@class="prolist"]/table/tr[2]/td/a'):
            # Copy here too: several clause rows share one upstream item.
            item = response.meta['item'].copy()
            item['CLAUSE'] = sel.xpath('text()').extract()
            clause_href = sel.xpath('@href').extract_first()
            item['CLAUSE_URL'] = response.urljoin(clause_href)
            yield item
以上是关于Scrapy 爬取保险条款 -《狗嗨默示录》-的主要内容,如果未能解决你的问题,请参考以下文章