[置顶]使用scrapy_redis,自动实时增量更新东方头条网全站新闻

Posted 超越梦想

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了[置顶]使用scrapy_redis,自动实时增量更新东方头条网全站新闻相关的知识,希望对你有一定的参考价值。

存储使用mysql,增量更新东方头条全站新闻的标题 新闻简介 发布时间 新闻的每一页的内容 以及新闻内的所有图片。项目文件结构。

 

这是run.py的内容

 1 #coding=utf-8
 2 from scrapy import cmdline
 3 import redis,time,threading
 4 from multiprocessing import Process
 5 #import scrapy.log
 6 
 7 #cmdline.execute("scrapy crawl baoxian -s LOG_FILE=scrapy10.log".split())
 8 
 9 #scrapy crawl myspider -s LOG_FILE=scrapy2.log
10 
11 
12 start_urls = [\'http://mini.eastday.com/\',
13                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0010&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603095010319,170603093955594-2,170603093955594&jsonpcallback=jQuery18303164258797187358_1496455837718&_=1496455838146\', #国内
14                     \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0011&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603142336718-2,170603142336718,170603122752716&jsonpcallback=jQuery18307262756496202201_1496477922458&_=1496477923254\', #国际
15                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0005&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18302500620267819613_1496483754044&_=1496483755277\',#军事
16                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0003&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183026658024708740413_1496480575988&_=1496480576634\',#社会
17                     \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0002&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery1830691694314358756_1496480816841&_=1496480817500\',#娱乐
18                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0019&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18303703077440150082_1496480892188&_=1496480892581\',#健康
19                     \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0015&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183023222095426172018_1496480961781&_=1496480962307\',#时尚
20                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0008&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183017557532875798643_1496481013410&_=1496481013824\',#科技
21                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0012&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18308183211348950863_1496481106550&_=1496481106993\',#汽车
22                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0018&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18309359942991286516_1496481227742&_=1496481228242\',#人文
23                   \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0007&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183019699203735217452_1496481313637&_=1496481314077\',#游戏
24                     \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0020&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18307782149398699403_1496481413006&_=1496481413401\',#星座
25                     \'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0021&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18306590236281044781_1496481467020&_=1496481467496\',#家居
26 
27                   ]
28 
29 r = redis.Redis(host=\'127.0.0.1\',port=6379,db=0)
30 
31 
32 
33 def check_redis_requsts():
34     while(1):
35         \'\'\'
36         for url in start_urls:
37             r.rpush(\'eastdayspider:start_urls\',url)
38         print u\'插入到start_urls的:\',r.lrange(\'eastdayspider:start_urls\',0,-1)
39         \'\'\'
40         for url in start_urls:
41             r.sadd(\'eastdayspider:start_urls\',url)
42         print u\'插入到start_urls的:\',r.smembers(\'eastdayspider:start_urls\')
43 
44         count=0
45         while (count<30):
46             if  r.exists(\'eastdayspider:requests\'):
47                 time.sleep(60)
48                 count=0
49             else:
50                 count+=1
51                 time.sleep(10)
52 
53 def run_spider():
54     cmdline.execute("scrapy crawl eastdayspider".split())
55 
56 
57 if __name__==\'__main__\':
58     pass
59 
60     
61     p1= Process(target=check_redis_requsts)
62     p2=Process(target=run_spider)
63 
64     p1.start()
65     time.sleep(5)
66     p2.start()
67 
68     p1.join()
69     p2.join()

 

这是settings.py

1 # -*- coding: utf-8 -*-
  2 
  3 # Scrapy settings for eastday project
  4 #
  5 # For simplicity, this file contains only settings considered important or
  6 # commonly used. You can find more settings consulting the documentation:
  7 #
  8 #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 
 12 BOT_NAME = \'eastday\'
 13 
 14 SPIDER_MODULES = [\'eastday.spiders\']
 15 NEWSPIDER_MODULE = \'eastday.spiders\'
 16 
 17 DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
 18 SCHEDULER = "scrapy_redis.scheduler.Scheduler"
 19 REDIS_START_URLS_AS_SET=True   #shezhi strat_urls键是集合,默认是false是列表
 20 SCHEDULER_PERSIST = True
 21 
 22 DEPTH_PRIORITY=0
 23 RETRY_TIMES = 20
 24 
 25 IMAGES_STORE = \'d:/\'
 26 IMAGES_EXPIRES = 90
 27 
 28 REDIS_HOST = \'localhost\'
 29 REDIS_PORT = 6379
 30 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 31 #USER_AGENT = \'eastday (+http://www.yourdomain.com)\'
 32 
 33 # Obey robots.txt rules
 34 ROBOTSTXT_OBEY = False
 35 
 36 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 37 CONCURRENT_REQUESTS = 10
 38 
 39 # Configure a delay for requests for the same website (default: 0)
 40 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 41 # See also autothrottle settings and docs
 42 DOWNLOAD_DELAY = 0
 43 # The download delay setting will honor only one of:
 44 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 45 #CONCURRENT_REQUESTS_PER_IP = 16
 46 
 47 # Disable cookies (enabled by default)
 48 #COOKIES_ENABLED = False
 49 
 50 # Disable Telnet Console (enabled by default)
 51 #TELNETCONSOLE_ENABLED = False
 52 
 53 # Override the default request headers:
 54 #DEFAULT_REQUEST_HEADERS = {
 55 #   \'Accept\': \'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\',
 56 #   \'Accept-Language\': \'en\',
 57 #}
 58 
 59 # Enable or disable spider middlewares
 60 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 61 #SPIDER_MIDDLEWARES = {
 62 #    \'eastday.middlewares.EastdaySpiderMiddleware\': 543,
 63 #}
 64 
 65 # Enable or disable downloader middlewares
 66 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 67 
 68 DOWNLOADER_MIDDLEWARES = {
 69     "eastday.middlewares.UserAgentMiddleware": 401,
 70     #"eastday.middlewares.CookiesMiddleware": 402,
 71 }
 72 
 73 
 74 
 75 # Enable or disable extensions
 76 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 77 #EXTENSIONS = {
 78 #    \'scrapy.extensions.telnet.TelnetConsole\': None,
 79 #}
 80 
 81 # Configure item pipelines
 82 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 83 ITEM_PIPELINES = {
 84     #\'eastday.pipelines.EastdayPipeline\': 300,
 85     \'eastday.pipelines.MysqlDBPipeline\':400,
 86    \'eastday.pipelines.DownloadImagesPipeline\':200,
 87     #\'scrapy_redis.pipelines.RedisPipeline\': 400,
 88 
 89 }
 90 
 91 # Enable and configure the AutoThrottle extension (disabled by default)
 92 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 93 #AUTOTHROTTLE_ENABLED = True
 94 # The initial download delay
 95 #AUTOTHROTTLE_START_DELAY = 5
 96 # The maximum download delay to be set in case of high latencies
 97 #AUTOTHROTTLE_MAX_DELAY = 60
 98 # The average number of requests Scrapy should be sending in parallel to
 99 # each remote server
100 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
101 # Enable showing throttling stats for every response received:
102 #AUTOTHROTTLE_DEBUG = False
103 
104 # Enable and configure HTTP caching (disabled by default)
105 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
106 #HTTPCACHE_ENABLED = True
107 #HTTPCACHE_EXPIRATION_SECS = 0
108 #HTTPCACHE_DIR = \'httpcache\'
109 #HTTPCACHE_IGNORE_HTTP_CODES = []
110 #HTTPCACHE_STORAGE = \'scrapy.extensions.httpcache.FilesystemCacheStorage\'

 

这是pipelines.py,里面包含建表语句。其中有一条用mysql检查url是否已存在的语句,其实是多余的,因为url已经在redis中去重了。

1 # -*- coding: utf-8 -*-
  2 
  3 # Define your item pipelines here
  4 #
  5 # Don\'t forget to add your pipeline to the ITEM_PIPELINES setting
  6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
  7 import time,json,pymysql,re
  8 from items import EastdayItem
  9 from scrapy import Request
 10 from scrapy.pipelines.images import ImagesPipeline
 11 from scrapy.exceptions import DropItem
 12 
 13 
 14 \'\'\'
 15 CREATE TABLE `eastday` (
 16   `id` INT(10) NOT NULL AUTO_INCREMENT,
 17   `title` VARCHAR(255) DEFAULT NULL,
 18   `url` VARCHAR(80) DEFAULT NULL,
 19   `tag` VARCHAR(30) DEFAULT NULL,
 20   `brief` VARCHAR(300) DEFAULT NULL,
 21    pubdate      DATETIME,
 22    origin    VARCHAR(50),
 23    crawled_time  DATETIME,
 24 
 25   `miniimg` VARCHAR(500) DEFAULT NULL,
 26   `img_urls` TEXT,
 27   `article` TEXT,
 28   PRIMARY KEY (`id`)
 29 ) ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8
 30 \'\'\'
 31 
 32 class EastdayPipeline(object):
 33 
 34     def process_item(self, item, spider):
 35         print \'----------------------------%s\'%json.dumps(dict(item),ensure_ascii=False)
 36         item["crawled_time"] = time.strftime(\'%Y-%m-%d %H:%M:%S\', time.localtime(time.time()))
 37         return item
 38 
 39 
 40 class MysqlDBPipeline(object):
 41     def __init__(self):
 42 
 43 
 44         self.conn = pymysql.connect(
 45                     host=\'localhost\',
 46                     port=3306,
 47                     user=\'root\',
 48 
 49                     passwd=\'123456\',
 50                     db=\'test\',
 51                     charset=\'utf8\',
 52                     )
 53         self.cur = self.conn.cursor()
 54 
 55     def process_item(self, item, spider):
 56 
 57 
 58         if isinstance(item, EastdayItem):
 59             item["crawled_time"] = time.strftime(\'%Y-%m-%d %H:%M:%S\', time.localtime(time.time()))
 60             print item[\'pubdate\']
 61 
 62             try:
 63                 for key in dict(item):
 64                     pass
 65                     item[key]=str(item[key]).replace("\'", "\\\\\\\'")
 66                     item[key] = str(item[key]).replace(\'"\',\'\\\\\\"\')
 67 
 68                 sql="""insert into eastday values(NULL,"{title}","{url}","{tag}","{brief}","{pubdate}","{origin}","{crawled_time}","{miniimg}","{img_urls}","{article}")""".format(title=item[\'title\'],url=item[\'url\'],tag=item[\'tag\'],brief=item[\'brief\'],pubdate=item[\'pubdate\'],origin=item[\'origin\'],crawled_time=item[\'crawled_time\'],miniimg=item[\'miniimg\'],img_urls=item[\'img_urls\'],article=item[\'article\'])
 69                 sql2 = \'select 1 from eastday where url="%s"\'%item[\'url\']
 70                 print \'sql:\',sql
 71 
 72                 self.cur.execute(sql2)
 73                 is_exist = self.cur.fetchone()
 74                 if is_exist==(1,):
 75                     print \'已存在%s\'%item[\'url\']
 76 
 77                 else:
 78                     self.cur.execute(sql)
 79                     self.conn.commit()
 80                     print \'插入成功\'
 81 
 82             except Exception as e:
 83                 print  u\'数据库error:\',e
 84                 pass
 85 
 86 
 87         else:
 88             print \'nonnonono\'
 89 
 90 
 91 
 92 class DownloadImagesPipeline(ImagesPipeline):
 93 
 94 
 95     def get_media_requests(self, item, info):
 96 
 97 
 98         if item[\'img_urls\']:
 99             for img_url in item[\'img_urls\']:
100 
101                 yield Request(img_url,meta={\'name\':img_url})
102 
103     def item_completed(self, results, item, info):
104         image_paths = [x[\'path\'] for ok, x in results if ok]
105         if not image_paths:
106             raise DropItem("Item contains no images")
107         return item
108 
109     def file_path(self, request, response=None, info=None):
110         m=request.meta
111         img_name=re.findall(\'/([a-z_0-9]*)\\.[(jpeg)|(jpg)|(png)|(bmp)|(gif)|(JPEG)|(JPG)|(PNG)|(BMP)|(GIF)]\',m[\'name\'])[-1]
112         #print \'img_name\',img_name
113         filename = \'full3/%s.jpg\'%img_name
114         return filename

 

这是items.py

1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # http://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
11 class EastdayItem(scrapy.Item):
12     # define the fields for your item here like:
13     # name = scrapy.Field()
14     title=scrapy.Field()
15     url=scrapy.Field()
16     tag=scrapy.Field()
17     article=scrapy.Field()
18     img_urls=scrapy.Field()
19     crawled_time=scrapy.Field()
20     pubdate=scrapy.Field()
21     origin=scrapy.Field()
22 
23 
24     brief = scrapy.Field()
25     miniimg = scrapy.Field()
26 
27 
28     pass
29 
30 \'\'\'
31 class GuoneiItem(scrapy.Item):
32     # define the fields for your item here like:
33     # name = scrapy.Field()
34     title=scrapy.Field()
35     url=scrapy.Field()
36     tag=scrapy.Field()
37     article=scrapy.Field()
38     img_urls=scrapy.Field()
39     crawled_time=scrapy.Field()
40 
41     brief=scrapy.Field()
42     miniimg=scrapy.Field()
43 
44 
45     pass
46 \'\'\'

 

文件太多啦,不一一贴了,源码文件已打包已上传到博客园,但没找到分享文件链接的地方,如果要源码的可以评论中留言。

 

这是mysql的存储结果:

 

东方头条的内容也是采集自其他网站和报刊的,内容还是很丰富,把东方头条的内容爬下来差不多可以做一个资讯内容的app了。

 

文章图片采用的是新闻中图片链接的源文件名,方便前端开发在页面中展示正确的图片。

 

以上是关于[置顶]使用scrapy_redis,自动实时增量更新东方头条网全站新闻的主要内容,如果未能解决你的问题,请参考以下文章

mysql实时增量备份 binlog日志备份

重置 PK 自动增量列

scrapy和scrapy_redis入门

关于SyncNavigator数据库实时同步软件新手使用教程推荐阅读

使用函数计算对表格存储中数据做简单清洗

scrapy-redis 分布式 案例一