Scrapy case study: crawling the 翼蜂网络 (cnyifeng.net) news list and detail pages
Posted by zqrios
This post walks through a Scrapy spider that crawls the news list pages on www.cnyifeng.net, follows each entry to its detail page, and collects the title, summary, date, and article body into a DemoItem.
# -*- coding: utf-8 -*-
import scrapy

from Demo.items import DemoItem


class AbcSpider(scrapy.Spider):
    name = 'abcd'
    allowed_domains = ['www.cnyifeng.net']
    # start_urls = ['http://abc.com/']

    # Build the first list-page URL from a page-number template
    baseURL = "http://www.cnyifeng.net/news/1/{}.html"
    offset = 1
    start_urls = [baseURL.format(offset)]

    def parse(self, response):
        # Each news entry on the list page is a dl.news_dl block inside div.news_con
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")
        for node in node_list:
            item = DemoItem()
            if len(node.xpath(".//a[@class='dt_1']//text()")):
                item['title'] = node.xpath(".//a[@class='dt_1']//text()").extract()[0]
            else:
                item['title'] = ''
            if len(node.xpath("./dd//text()")):
                item['zhaiyao'] = node.xpath("./dd//text()").extract()[0]
            else:
                item['zhaiyao'] = ''
            item['times'] = node.xpath(".//span//text()").extract()[0]

            # Follow the link to the detail page, passing the partially filled item along in meta
            mainUrl = 'http://www.cnyifeng.net'
            erUrl = mainUrl + node.xpath(".//a[@class='dt_1']/@href").extract()[0]
            yield scrapy.Request(erUrl, callback=self.parse_detail_info, meta={'item': item})

        # Pagination: if the pager has no disabled span, keep following its last link;
        # otherwise only continue while the disabled span is not the "next page" button
        if len(response.xpath("//div[@class='flickr']//span[@class='disabled']")) == 0:
            url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
            yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)
        else:
            to_next = response.xpath("//div[@class='flickr']//span[@class='disabled']//text()").extract()[0]
            if to_next != '下一页?':
                url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
                yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)

    def parse_detail_info(self, response):
        # Receive the item passed from the list page
        item = response.meta['item']
        item['viewcount'] = '90'
        if len(response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()")):
            content_list = response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()").extract()
            content_str = ''
            for model in content_list:
                content_str = content_str + str(model).strip()
            item['content'] = content_str
        yield item
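The spider imports DemoItem from Demo.items, but the post does not show that module. A minimal sketch of what items.py could look like, assuming it declares only the fields the spider actually fills (title, zhaiyao, times, viewcount, content), is:

# -*- coding: utf-8 -*-
# Demo/items.py -- a sketch inferred from the fields used in the spider above;
# the original post does not show this file, so treat it as an assumption.
import scrapy


class DemoItem(scrapy.Item):
    title = scrapy.Field()      # headline text from a.dt_1 on the list page
    zhaiyao = scrapy.Field()    # summary text from the dd element
    times = scrapy.Field()      # publish date from the span element
    viewcount = scrapy.Field()  # view count (hard-coded to '90' in the detail callback)
    content = scrapy.Field()    # article body collected on the detail page

With a standard project layout, the crawl can then be started with scrapy crawl abcd (the name defined on the spider), optionally adding -o news.json to export the scraped items.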