爬取知名社区技术文章_pipelines_4
Posted 北门吹雪
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取知名社区技术文章_pipelines_4相关的知识,希望对你有一定的参考价值。
对爬取到的字段进行存储处理,并获取图片的下载路径
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Scrapy item pipelines: record image download paths and store articles in MySQL."""
import logging

import gevent
import pymysql
import pymysql.cursors
from gevent import monkey  # noqa: F401  (imported by the original file, unused here)
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi  # noqa: F401  (imported by the original file, unused here)

logger = logging.getLogger(__name__)


class JobboleImagerPipeline(ImagesPipeline):
    """Images pipeline that records where a downloaded image was stored."""

    def item_completed(self, results, item, info):
        """Copy the stored image path onto the item.

        Args:
            results: iterable of ``(ok, value)`` pairs, one per requested
                image. On success ``value`` is a dict that contains a
                ``'path'`` key; on failure it is an error object.
            item: the scraped item being processed.
            info: media pipeline bookkeeping object (unused).

        Returns:
            The same item. ``item['img_path']`` is set to the path of the
            last successfully downloaded image, if there was one.
        """
        if 'img_url' in item:
            for ok, value in results:
                # A failed download carries an error object instead of a
                # dict, so indexing it with 'path' would raise.
                if not ok:
                    continue
                item['img_path'] = value['path']
        return item


# Reference only: the earlier, plain synchronous way of writing to the
# database.
# WARNING: get_sql() below builds the statement with string formatting, which
# is open to SQL injection; the active SqlSave class uses a parameterized
# query instead.
#
# class SqlSave(object):
#     """Store items in the database synchronously."""
#     def __init__(self):
#         SQL_DBA = {
#             'host': 'localhost',
#             'db': 'jobole',
#             'user': 'root',
#             'password': 'jiayuan95814',
#             'use_unicode': True,
#             'charset': 'utf8'
#         }
#         self.conn = pymysql.connect(**SQL_DBA)
#         self.cursor = self.conn.cursor()
#
#     def process_item(self, item, spider):
#         sql = self.get_sql(item)
#         print(sql)
#         self.cursor.execute(sql)
#         self.conn.commit()
#
#         return item
#
#     def get_sql(self, item):
#         sql = """insert into article(cont_id, cont_url, title, publish_time, cont, img_url, img_path, like_num, collection_num, comment_num) value ('%s','%s','%s','%s','%s','%s','%s', %d, %d, %d)
#         """ % (item['cont_id'], item['cont_url'],item['title'],item['publish_time'],item['cont'],item['img_url'][0],item['img_path'],item['link_num'],item['collection_num'],item['comment_num'],)
#         return sql


class SqlSave(object):
    """Insert scraped articles into MySQL, running each insert in a greenlet."""

    # Fallback connection parameters, used when none are supplied.
    # NOTE(review): credentials are hard-coded here; prefer defining a
    # SQL_DBA dict in settings.py, which from_crawler() picks up.
    DEFAULT_SQL_DBA = {
        'host': 'localhost',
        'db': 'jobole',
        'user': 'root',
        'password': 'jiayuan95814',
        'use_unicode': True,
        'charset': 'utf8',
    }

    def __init__(self, sql_dba=None):
        """Open the database connection and a cursor.

        Args:
            sql_dba: optional dict of keyword arguments for
                ``pymysql.connect``. When omitted, ``DEFAULT_SQL_DBA`` is
                used, which matches the previous hard-coded behaviour.
        """
        params = dict(self.DEFAULT_SQL_DBA if sql_dba is None else sql_dba)
        self.conn = pymysql.connect(**params)
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``SQL_DBA`` setting when it is defined.

        Falls back to ``DEFAULT_SQL_DBA`` when the setting is absent or empty.
        """
        return cls(crawler.settings.getdict('SQL_DBA') or None)

    def process_item(self, item, spider):
        """Insert ``item`` into the ``article`` table and return it unchanged.

        The insert runs in a single greenlet and ``gevent.joinall`` waits for
        that greenlet, so this method returns only after the insert has been
        attempted.
        """
        sql = self.__get_sql(item)
        gevent.joinall([
            gevent.spawn(self.__go_sql, self.cursor, self.conn, sql, item),
        ])
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

    def __go_sql(self, cursor, conn, sql, item):
        """Execute the parameterized insert and commit it.

        Failures are logged and the transaction is rolled back; the exception
        is not propagated, so a bad item does not abort the crawl.
        """
        try:
            # NOTE(review): the item field is 'link_num' while the column is
            # 'like_num'; confirm the field name against the Item definition.
            params = (
                item['cont_id'],
                item['cont_url'],
                item['title'],
                item['publish_time'],
                item['cont'],
                item['img_url'][0],
                item['img_path'],
                item['link_num'],
                item['collection_num'],
                item['comment_num'],
            )
            cursor.execute(sql, params)
            conn.commit()
        except Exception:
            logger.exception('Failed to insert article into MySQL')
            # Discard the failed statement so it cannot leak into the next
            # commit on this shared connection.
            conn.rollback()

    def __get_sql(self, item):
        """Return the parameterized insert statement (``item`` is unused)."""
        sql = (
            "insert into article(cont_id, cont_url, title, publish_time, cont, "
            "img_url, img_path, like_num, collection_num, comment_num) "
            "value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        return sql
以上是关于爬取知名社区技术文章_pipelines_4的主要内容,如果未能解决你的问题,请参考以下文章