Crawling novels with the Scrapy framework and storing them in a database

Posted by 诡道!!!


My steps:

1>settings.py:

BOT_NAME = 'newding'

SPIDER_MODULES = ['newding.spiders']
NEWSPIDER_MODULE = 'newding.spiders'


ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'newding.pipelines.NewdingPipeline': 300,
}

The settings above are generated automatically when the project is created.

The following are the database connection settings added for the storage stage:

MYSQL_USER = 'root'
MYSQL_PASSWORD = '12345678'
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DB = 'xiaoshuo'
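
A side note: instead of importing these constants straight from settings.py (which is what sql.py below does), a Scrapy component can also read them through crawler.settings. A minimal sketch of that alternative (the class name here is made up, not part of this project):

# Hypothetical pipeline skeleton that reads the MySQL settings above
# through crawler.settings instead of importing them from settings.py.
import mysql.connector


class MySQLSettingsPipeline(object):
    def __init__(self, host, port, user, password, db):
        self.conn = mysql.connector.connect(
            host=host, port=port, user=user, password=password, database=db
        )

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(
            host=s.get('MYSQL_HOST'),
            port=s.getint('MYSQL_PORT'),
            user=s.get('MYSQL_USER'),
            password=s.get('MYSQL_PASSWORD'),
            db=s.get('MYSQL_DB'),
        )

    def process_item(self, item, spider):
        return item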


2>RUN.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'newding1s'])  # equivalent to running "scrapy crawl newding1s" in the project directory
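
If you prefer not to go through cmdline.execute, the same crawl can be started with CrawlerProcess; a minimal sketch:

# Alternative to cmdline.execute: run the spider in-process.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('newding1s')  # spider name, same as in newding1s.py
    process.start()             # blocks until the crawl finishes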


3>items.py
import scrapy


class NewdingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # novel title
    types = scrapy.Field()     # novel category
    zijie = scrapy.Field()     # novel length/size info
    book_url = scrapy.Field()  # link to the book's reading page
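
A scrapy.Item behaves much like a dict, but only the fields declared above can be assigned; anything else raises a KeyError. A quick check, purely for illustration:

from newding.items import NewdingItem

item = NewdingItem(title='some book', types='fantasy')
item['zijie'] = '123456'        # declared field: fine
# item['author'] = 'someone'    # not declared in items.py: raises KeyError
print(dict(item))               # the populated fields as a plain dict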


4>sql.py
from newding.settings import MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB
import mysql.connector

db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                             host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DB)
cursor = db.cursor(buffered=True)


class Sql(object):
    @classmethod
    def insert_book(cls, title, types, zijie, book_url):
        # parameterized query: the driver handles quoting, which also avoids SQL injection
        sql = ("INSERT INTO book_table (`title`, `types`, `zijie`, `book_url`) "
               "VALUES (%s, %s, %s, %s)")
        cursor.execute(sql, (title, types, zijie, book_url))  # run the statement
        db.commit()  # commit the insert

    @classmethod
    def select_book(cls, book_url):
        # checks whether this book_url is already in the table:
        # EXISTS returns 1 if a matching row exists, 0 otherwise
        sql = "SELECT EXISTS (SELECT 1 FROM book_table WHERE book_url = %s)"
        cursor.execute(sql, (book_url,))
        return cursor.fetchall()  # e.g. [(1,)] or [(0,)]
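
insert_book assumes that a book_table already exists in the xiaoshuo database. A one-off script along these lines can create it (the column types and sizes are my own guess, not from the original post):

# One-off setup script (hypothetical column sizes): creates the table sql.py writes into.
import mysql.connector
from newding.settings import MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB

db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                             host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DB)
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS book_table (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        types VARCHAR(64),
        zijie VARCHAR(64),
        book_url VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4
""")
# a UNIQUE index on book_url would also be reasonable, since the pipeline dedups on it
db.commit()
cursor.close()
db.close()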

5>pipelines.py
from .sql import Sql  # import the Sql helper defined in sql.py


class NewdingPipeline(object):
    def process_item(self, item, spider):
        title = item['title']
        types = item['types']
        zijie = item['zijie']
        book_url = item['book_url']

        if not Sql.select_book(book_url)[0][0]:  # 0 means this book_url is not stored yet
            Sql.insert_book(title, types, zijie, book_url)
        else:
            print('该小说已存在')  # "this novel already exists"
        return item  # return the item so any later pipelines still receive it
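
Printing a message works, but the more idiomatic Scrapy way to skip a duplicate is to raise DropItem, so the dropped items show up in the crawl stats. A sketch of that variant, using the same Sql helper:

# Variant of the pipeline that drops duplicates instead of just printing.
from scrapy.exceptions import DropItem
from .sql import Sql


class NewdingPipeline(object):
    def process_item(self, item, spider):
        if Sql.select_book(item['book_url'])[0][0]:
            raise DropItem('book already stored: %s' % item['book_url'])
        Sql.insert_book(item['title'], item['types'], item['zijie'], item['book_url'])
        return item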

6>newding1s.py (the spider file)
import scrapy
from scrapy.http import Request  # CrawlSpider + Rule could walk the whole site automatically,
                                 # but plain Requests are enough here
from newding.items import NewdingItem


class Newding1sSpider(scrapy.Spider):
    # name, allowed_domains and start_urls are fixed attribute names and must not be renamed
    name = 'newding1s'
    allowed_domains = ['23us.so']
    start_urls = ['http://www.23us.so/']

    def parse(self, response):
        start_urls = "http://www.23us.so/list/"
        end_url = ".html"
        for i in range(1, 10):  # loop over the Dingdian category sections
            the_url = start_urls + str(i) + '_1' + end_url  # build the first-page URL of each section
            yield Request(the_url, self.san)  # hand the response on to san()

    def san(self, response):
        yeshu = response.xpath('//*[@id="pagelink"]/a/text()').extract()[-1]  # largest page number of the section
        yeshu_url = response.xpath('//*[@id="pagelink"]/a/@href').extract()[0]  # a page link, used to build the URL prefix
        qie = yeshu_url[:-6]  # e.g. http://www.23us.so/list/1_

        for i in range(1, int(yeshu) + 1):  # loop over every page of the section
            qie_html = qie + str(i) + ".html"  # section prefix + page number
            yield Request(qie_html, self.si)

    def si(self, response):
        # follow the link to the book's detail page
        shu_url = response.xpath('//*[@id="content"]/dd[1]/table/tr[2]/td[1]/a/@href').extract()[0]
        yield Request(shu_url, self.wu)

    def wu(self, response):
        item = NewdingItem()  # the item with the fields defined in items.py
        types = response.xpath('//*[@id="at"]/tr[1]/td[1]/a/text()').extract()[0]  # novel category
        zijie = response.xpath('//*[@id="at"]/tr[2]/td[2]/text()').extract()[0].replace('\xa0', '')  # strip non-breaking spaces to avoid garbled output
        title = response.xpath('//*[@id="content"]/dd[1]/h1/text()').extract()[0]
        book_url = response.xpath('//a[@class="read"]/@href').extract()[0]

        item['title'] = title
        item['types'] = types
        item['zijie'] = zijie
        item['book_url'] = book_url

        return item
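
The XPath expressions are the fragile part of the spider: if the page layout changes they return empty lists and extract()[0] raises an IndexError. They can be tried out offline with Scrapy's Selector before being dropped into the spider; a small sketch against hand-written HTML (the fragment below is made up, only the mechanics matter):

# Offline check of an XPath against a hand-written fragment.
from scrapy.selector import Selector

html = '''
<div id="pagelink">
    <a href="http://www.23us.so/list/1_1.html">1</a>
    <a href="http://www.23us.so/list/1_2.html">2</a>
    <a href="http://www.23us.so/list/1_728.html">728</a>
</div>
'''

sel = Selector(text=html)
print(sel.xpath('//*[@id="pagelink"]/a/text()').extract()[-1])  # last page number
print(sel.xpath('//*[@id="pagelink"]/a/@href').extract()[0])    # first page link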



This project stores Dingdian novel information in a database (the strength of the Scrapy framework is that the structure is clear; the downside is that the wiring between the parts is fairly involved).
For comparison, see my other code note that scrapes the Dingdian pages with plain XPath; the difference between the two approaches is obvious.
 


