Crawling the full Jianshu site with asynchronous saving to MySQL


# Jianshu (jianshu.com)
# Save the data in MySQL; integrate selenium + chromedriver into Scrapy; crawl the whole site
# Scrape AJAX-loaded data

# Spider file
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem

class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']  # start crawling from the homepage

    rules = (
        # On a detail page, the recommended articles below it link directly to /p/<12-char id>
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # print(response.text)
        title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
        # print(title)
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        # print(avatar)
        author = response.xpath("//span[@class='name']/a/text()").get()
        # print(author)
        pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*", "")
        # print(pub_time)

        # Normally the URL contains only one '?'
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # print(article_id)

        # Keep the HTML tags in the content so it can be rendered later
        content = response.xpath("//div[@class='show-content']").get()
        # print(content)
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content
        )
        yield item

# Items file
import scrapy

class ArticleItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    
    
# Pipelines file: save the data in MySQL
import pymysql
from twisted.enterprise import adbapi       # Twisted module for asynchronous database access
from pymysql import cursors

class JianshuSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        # **dbparams is equivalent to writing host='127.0.0.1', port=3306, ... inside the call

        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                       item['pub_time'], item['origin_url'], item['article_id']))
        self.conn.commit()  # this commit is synchronous, which is slow
        return item

    @property
    def sql(self):
        if not self._sql:  # build the statement on first use
            self._sql = '''
            insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql
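
# The pipeline assumes an article2 table already exists in the jianshu database, but the post
# never shows its schema. A minimal sketch for creating it is given below; the column types are
# guesses based on the fields being inserted, not part of the original post.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        create table if not exists article2(
            id int primary key auto_increment,
            title varchar(255),
            content longtext,
            author varchar(64),
            avatar varchar(255),
            pub_time varchar(64),
            origin_url varchar(255),
            article_id varchar(32)
        ) default charset=utf8
    ''')
conn.close()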

 

# Optimize the pipeline above so the data is saved asynchronously
# Use the ConnectionPool provided by Twisted (adbapi) to turn the insert into an asynchronous operation (a good point to bring up in interviews)

# The storage above is synchronous and slow; the version below is asynchronous
class JianshuTwistedPipeline(object):
    def __init__(self):
        # create the connection pool
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:  # build the statement on first use
            self._sql = '''
            insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql

    def process_item(self, item, spider):
        # runInteraction executes the insert asynchronously
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):  # insert into the database
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                  item['pub_time'], item['origin_url'], item['article_id']))

    def handle_error(self, error, item, spider):
        print("=" * 20)
        print("error:", error)
        print("=" * 20)

# Update the ITEM_PIPELINES setting in settings.py
ITEM_PIPELINES = {
   # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
   'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,  # save data asynchronously
}
 

# Handle the dynamically loaded data (fields loaded via AJAX)
# Render the pages with selenium + chromedriver
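
# The post says selenium + chromedriver should be integrated into Scrapy, but the middleware
# itself is never shown. Below is a minimal sketch; the class name SeleniumDownloadMiddleware,
# the middlewares.py location and having chromedriver on PATH are assumptions, not from the post.

# middlewares.py
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumDownloadMiddleware(object):
    def __init__(self):
        # assumes chromedriver is on PATH; otherwise pass its path to webdriver.Chrome()
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        # let Chrome render the page (including the AJAX-loaded parts),
        # then hand the rendered HTML back to Scrapy as the response
        self.driver.get(request.url)
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            request=request, encoding='utf-8')

# settings.py: enable the middleware so requests are fetched through Chrome
DOWNLOADER_MIDDLEWARES = {
    'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
}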


# Spider file: also grab the read count, like count, word count, collections (subjects) and comment count, and save them in the item
    def parse_detail(self, response):
        # print(response.text)
        title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
        print(title)
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        # print(avatar)
        author = response.xpath("//span[@class='name']/a/text()").get()
        # print(author)
        pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*", "")
        # print(pub_time)

        # Normally the URL contains only one '?'
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # print(article_id)

        # Keep the HTML tags in the content so it can be rendered later
        content = response.xpath("//div[@class='show-content']").get()
        # print(content)

        # These fields are loaded dynamically (available once the page is rendered)
        word_count = response.xpath("//span[@class='wordage']/text()").get().split(" ")[-1]
        read_count = response.xpath("//span[@class='views-count']/text()").get().split(" ")[-1]
        comment_count = response.xpath("//span[@class='comments-count']/text()").get().split(" ")[-1]
        like_count = response.xpath("//span[@class='likes-count']/text()").get().split(" ")[-1]
        subject = response.xpath("//div[@class='include-collection']/a/div/text()").getall()
        # subject comes back as a list; MySQL cannot store a list directly, so join it into a string
        subject = ",".join(subject)

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content,

            word_count=word_count,
            read_count=read_count,
            comment_count=comment_count,
            like_count=like_count,
            subject=subject,
        )
        yield item
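
# Note: the parse_detail above fills in fields that the ArticleItem shown earlier does not declare.
# The post does not show the updated items.py; presumably it was extended roughly like this sketch.

# items.py
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    # fields for the dynamically loaded data
    word_count = scrapy.Field()
    read_count = scrapy.Field()
    comment_count = scrapy.Field()
    like_count = scrapy.Field()
    subject = scrapy.Field()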



# Pipelines file
# The same asynchronous pipeline as above, extended to store the new fields
class JianshuTwistedPipeline(object):
    def __init__(self):
        # create the connection pool
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:  # build the statement on first use
            self._sql = '''
            insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id,read_count,word_count,like_count,comment_count,subject)
            values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql

    def process_item(self, item, spider):
        # runInteraction executes the insert asynchronously
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):  # insert into the database
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                  item['pub_time'], item['origin_url'], item['article_id'],
                                  item['read_count'], item['word_count'], item['like_count'],
                                  item['comment_count'], item['subject']))

    def handle_error(self, error, item, spider):
        print("=" * 20)
        print("error:", error)
        print("=" * 20)
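
# The extended insert statement references columns that the article2 sketch above does not have.
# A minimal sketch for adding them; the spider keeps these values as strings, so varchar columns
# are assumed here rather than integer types.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        alter table article2
            add column read_count varchar(32),
            add column word_count varchar(32),
            add column like_count varchar(32),
            add column comment_count varchar(32),
            add column subject varchar(255)
    ''')
conn.close()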

 
