Scraping the novel Daomu Biji (盗墓笔记) with Scrapy

Posted by 道高一尺


Preface: this article was compiled by the editors at cha138.com. It walks through scraping the novel Daomu Biji with Scrapy, and is offered as a reference.

# -*- coding: utf-8 -*-
import scrapy
import requests
from daomu.items import DaomuItem
from pyquery import PyQuery as pq


class DaomuspiderSpider(scrapy.Spider):
    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ["http://www.daomubiji.com/"]
    index_url = "http://www.daomubiji.com/"

    def start_requests(self):
        # Start from the index page, which lists every book of the series
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        # Each link under .article-content points to one book's chapter list
        for url in response.css(".article-content a"):
            book_url = url.css("a::attr(href)").extract_first()
            yield scrapy.Request(url=book_url, callback=self.parse_chapter)

    def parse_chapter(self, response):
        book_title = response.css(".focusbox .container h1::text").extract_first()
        book_info = response.css(".focusbox .container .focusbox-text::text").extract_first()
        book_url = response.url

        for chapter in response.css(".excerpts-wrapper .excerpts .excerpt"):
            # The link text is space-separated; join the second token and
            # the last token with a colon to build the chapter title
            chapter_text = chapter.css("a::text").extract_first()
            chapter_title = chapter_text.split(" ")[1] + ":" + chapter_text.split(" ")[-1]
            chapter_url = chapter.css("a::attr(href)").extract_first()
            content = self.parse_detail(chapter_url)

            # Create a fresh item for each chapter; reusing a single instance
            # across iterations would let later chapters overwrite earlier ones
            item = DaomuItem()
            item["book_title"] = book_title
            item["book_info"] = book_info
            item["book_url"] = book_url
            item["chapter_title"] = chapter_title
            item["chapter_url"] = chapter_url
            item["content"] = content
            yield item

    def parse_detail(self, url):
        # Blocking fetch with requests, parsed with pyquery; note this stalls
        # Scrapy's event loop while each chapter body downloads
        response = requests.get(url)
        doc = pq(response.text)
        content = doc(".article-content p").text()
        return content
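
The spider imports DaomuItem from daomu.items, but the item definition itself is not shown in the post. A minimal items.py sketch, assuming one scrapy.Field per key assigned in parse_chapter above (the field names are taken directly from the spider):

import scrapy

class DaomuItem(scrapy.Item):
    # One Field per key assigned in parse_chapter
    book_title = scrapy.Field()
    book_info = scrapy.Field()
    book_url = scrapy.Field()
    chapter_title = scrapy.Field()
    chapter_url = scrapy.Field()
    content = scrapy.Field()
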
import pymongo


class DaomuPipeline(object):

    def __init__(self):
        self.mongo_uri = "localhost"
        self.mongo_db = "daomu"

    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(
    #         mongo_uri=crawler.settings.get("MONGO_URI"),
    #         mongo_db=crawler.settings.get("MONGO_DB")
    #     )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name as the collection name; note that the item
        # must be converted to a dict before inserting
        name = item.__class__.__name__
        # insert_one replaces the long-deprecated Collection.insert
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
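
For the pipeline to receive items it must be enabled in the project's settings.py, which the post does not show. A minimal sketch, assuming the project module is named daomu (matching the import at the top of the spider) and using the MONGO_URI / MONGO_DB setting names that the commented-out from_crawler variant reads:

# settings.py (sketch; the pipeline priority 300 is an arbitrary choice)
ITEM_PIPELINES = {
    "daomu.pipelines.DaomuPipeline": 300,
}

# Only consulted if the from_crawler variant above is re-enabled
MONGO_URI = "localhost"
MONGO_DB = "daomu"

With the pipeline registered, the crawl runs with scrapy crawl daomuspider, matching the spider's name attribute. Note that re-enabling from_crawler would also require __init__ to accept mongo_uri and mongo_db parameters instead of hard-coding them.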


The above is the main content on scraping the novel Daomu Biji with Scrapy. If it did not solve your problem, the following articles may help:

Python 2.7 crawler: scraping Daomu Biji chapters and URLs into a MySQL database (2016-12-01)

Scraping novel information with the Scrapy framework

A first look at Scrapy: scraping a novel

Crawler study notes (13): scrapy-redis, storing to MySQL, and Scrapy project deployment

Scraping novels from 杰书网 with Scrapy

Python crawler Scrapy series (14): scraping the ZH novel site in practice, with multi-page crawling