python_scrapy_爬虫
Posted by longpy
Topics: Python, getting started with the Scrapy framework, XPath parsing, and JSON export. Crawling detail pages is also involved.
Directory structure:
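A rough sketch of a typical Scrapy layout for this project (the top-level package name `bqg` is an assumption; only the files discussed below are listed):

```
bqg_project/
├── scrapy.cfg
├── starts.py              # launcher script (see below)
└── bqg/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── kaoshi_bqg.py
```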
kaoshi_bqg.py
```python
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from ..items import BookBQGItem


class KaoshiBqgSpider(scrapy.Spider):
    name = 'kaoshi_bqg'
    allowed_domains = ['biquge5200.cc']
    start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']

    rules = (
        # rule matching the book list pages
        Rule(LinkExtractor(allow=r'https://www.biquge5200.cc/xuanhuanxiaoshuo/'), follow=True),
        # rule matching the book detail pages
        Rule(LinkExtractor(allow=r'.+/[0-9]{1,3}_[0-9]{2,6}/'), callback='parse_item', follow=False),
    )

    # book titles on the list page
    def parse(self, response):
        a_list = response.xpath('//*[@id="newscontent"]/div[1]/ul//li//span[1]/a')
        for li in a_list:
            name = li.xpath(".//text()").get()
            detail_url = li.xpath(".//@href").get()
            yield scrapy.Request(url=detail_url, callback=self.parse_book, meta={'info': name})

    # all chapter names of a single book
    def parse_book(self, response):
        name = response.meta.get('info')
        list_a = response.xpath('//*[@id="list"]/dl/dd[position()>20]//a')
        for li in list_a:
            chapter = li.xpath(".//text()").get()
            url = li.xpath(".//@href").get()
            yield scrapy.Request(url=url, callback=self.parse_content, meta={'info': (name, chapter)})

    # content of each chapter
    def parse_content(self, response):
        name, chapter = response.meta.get('info')
        content = response.xpath('//*[@id="content"]//p/text()').getall()
        item = BookBQGItem(name=name, chapter=chapter, content=content)
        yield item
```
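One thing worth noting: the `rules` tuple only takes effect on a `CrawlSpider`. Since this spider subclasses plain `scrapy.Spider` and drives everything through its own callback chain (`parse` → `parse_book` → `parse_content`), the two `Rule` definitions are effectively unused here.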
items.py
Code:
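A minimal sketch of `items.py`, inferred from the fields the spider populates (`name`, `chapter`, `content`):

```python
import scrapy


class BookBQGItem(scrapy.Item):
    # fields filled in by KaoshiBqgSpider.parse_content()
    name = scrapy.Field()     # book title
    chapter = scrapy.Field()  # chapter title
    content = scrapy.Field()  # list of paragraph strings for the chapter
```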
pipelines.py
```python
from scrapy.exporters import JsonLinesItemExporter


class BqgPipeline(object):
    def __init__(self):
        self.fp = open("biquge.json", 'wb')
        # JsonLinesItemExporter writes one JSON object per line
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # note: the hook Scrapy calls is close_spider, not close_item
    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束")


# class XmlyPipeline(object):
#     def __init__(self):
#         self.fp = open("xmly.json", 'wb')
#         self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print("爬虫结束")
```
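For the pipeline to run it also has to be enabled in `settings.py`. A minimal sketch, assuming the project package is named `bqg` (the actual package name isn't shown in the post):

```python
# settings.py (relevant part only; 'bqg' is an assumed package name)
ITEM_PIPELINES = {
    'bqg.pipelines.BqgPipeline': 300,  # lower numbers run earlier in the pipeline chain
}
```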
starts.py
```python
from scrapy import cmdline

cmdline.execute("scrapy crawl kaoshi_bqg".split())
# cmdline.execute("scrapy crawl xmly".split())
```
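Running `python starts.py` from the project root is equivalent to typing `scrapy crawl kaoshi_bqg` in a terminal, which makes it convenient to launch and debug the spider from an IDE.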
The scraped data:
biquge.json
xmly.json
A few small problems I ran into while crawling:
- When scraping the detail pages, I initially didn't know how to get the detail-page URL or carry over the field already scraped on the previous page; in other words, I hadn't really understood the arguments of the `scrapy.Request` yielded for the detail page (see the sketch below).
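A compact, self-contained sketch of that pattern (the spider name, URL and selectors here are made up for illustration): the field scraped on the list page travels to the detail-page callback through `meta`.

```python
import scrapy


class DetailDemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate passing data through meta
    name = 'detail_demo'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        for a in response.xpath('//a'):
            name = a.xpath('.//text()').get()                        # field scraped on the list page
            detail_url = response.urljoin(a.xpath('.//@href').get())  # detail-page URL from the link
            # attach the already-scraped field to the request for the detail page
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'info': name})

    def parse_detail(self, response):
        name = response.meta.get('info')                              # read it back on the detail page
        yield {'name': name, 'url': response.url}
```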