[Pinned] Using scrapy_redis to automatically and incrementally crawl site-wide news from Eastday (东方头条) in real time
Posted by 超越梦想
Storage uses MySQL. The crawl incrementally updates, for every news item on the Eastday site: the title, the news brief, the publish time, the full article text across all of its pages, and every image inside the article. Project file structure:
Here is run.py:
# coding=utf-8
from scrapy import cmdline
import redis, time
from multiprocessing import Process

# Seed URLs: the mobile home page plus one JSONP feed per channel.
start_urls = [
    'http://mini.eastday.com/',
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0010&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603095010319,170603093955594-2,170603093955594&jsonpcallback=jQuery18303164258797187358_1496455837718&_=1496455838146',  # domestic
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0011&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603142336718-2,170603142336718,170603122752716&jsonpcallback=jQuery18307262756496202201_1496477922458&_=1496477923254',  # international
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0005&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18302500620267819613_1496483754044&_=1496483755277',  # military
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0003&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183026658024708740413_1496480575988&_=1496480576634',  # society
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0002&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery1830691694314358756_1496480816841&_=1496480817500',  # entertainment
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0019&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18303703077440150082_1496480892188&_=1496480892581',  # health
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0015&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183023222095426172018_1496480961781&_=1496480962307',  # fashion
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0008&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183017557532875798643_1496481013410&_=1496481013824',  # technology
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0012&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18308183211348950863_1496481106550&_=1496481106993',  # cars
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0018&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18309359942991286516_1496481227742&_=1496481228242',  # culture
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0007&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183019699203735217452_1496481313637&_=1496481314077',  # games
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0020&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18307782149398699403_1496481413006&_=1496481413401',  # horoscope
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0021&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18306590236281044781_1496481467020&_=1496481467496',  # home
]

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


def check_redis_requests():
    while True:
        # Seed the scrapy_redis start_urls key. REDIS_START_URLS_AS_SET is
        # enabled in settings.py, so the key is a set and we use sadd
        # (with a list it would be rpush instead).
        for url in start_urls:
            r.sadd('eastdayspider:start_urls', url)
        print u'seeded eastdayspider:start_urls with:', r.smembers('eastdayspider:start_urls')

        # Re-seed the start URLs only after the requests queue has stayed
        # empty for 30 consecutive checks (about 5 minutes); while the spider
        # still has pending requests, just keep waiting.
        count = 0
        while count < 30:
            if r.exists('eastdayspider:requests'):
                time.sleep(60)
                count = 0
            else:
                count += 1
                time.sleep(10)


def run_spider():
    cmdline.execute("scrapy crawl eastdayspider".split())


if __name__ == '__main__':
    p1 = Process(target=check_redis_requests)
    p2 = Process(target=run_spider)

    p1.start()
    time.sleep(5)
    p2.start()

    p1.join()
    p2.join()
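For reference, the Redis keys that this loop relies on can be inspected directly. The snippet below is just a quick sketch: the key names follow the scrapy_redis defaults for a spider named eastdayspider, and with the default priority queue the requests key is a sorted set.

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
print 'pending start URLs :', r.scard('eastdayspider:start_urls')   # seeded by check_redis_requests()
print 'scheduled requests :', r.zcard('eastdayspider:requests')     # sorted set with the default priority queue
print 'seen fingerprints  :', r.scard('eastdayspider:dupefilter')   # written by the RFPDupeFilter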
Here is settings.py:
# -*- coding: utf-8 -*-

# Scrapy settings for eastday project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'eastday'

SPIDER_MODULES = ['eastday.spiders']
NEWSPIDER_MODULE = 'eastday.spiders'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
REDIS_START_URLS_AS_SET = True  # make the start_urls key a Redis set; the default (False) is a list
SCHEDULER_PERSIST = True

DEPTH_PRIORITY = 0
RETRY_TIMES = 20

IMAGES_STORE = 'd:/'
IMAGES_EXPIRES = 90

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'eastday (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'eastday.middlewares.EastdaySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "eastday.middlewares.UserAgentMiddleware": 401,
    #"eastday.middlewares.CookiesMiddleware": 402,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'eastday.pipelines.EastdayPipeline': 300,
    'eastday.pipelines.MysqlDBPipeline': 400,
    'eastday.pipelines.DownloadImagesPipeline': 200,
    #'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
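One point worth spelling out: SCHEDULER_PERSIST = True keeps the request queue and the dupefilter in Redis across restarts, which is what makes the crawl incremental instead of starting from scratch on every run. If a full re-crawl is ever needed, the persisted keys have to be removed by hand; a small sketch, assuming the default key names for a spider called eastdayspider:

import redis

r = redis.Redis(host='localhost', port=6379, db=0)
# Deleting these keys discards the de-duplication history and any queued
# requests, so the next run behaves like a fresh crawl.
r.delete('eastdayspider:requests', 'eastdayspider:dupefilter', 'eastdayspider:start_urls')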
Here is pipelines.py, which also contains the table-creation SQL in a comment. The MySQL query that checks whether a URL already exists is actually redundant, because URLs have already been de-duplicated in Redis.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import time, json, pymysql, re
from items import EastdayItem
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


'''
CREATE TABLE `eastday` (
  `id` INT(10) NOT NULL AUTO_INCREMENT,
  `title` VARCHAR(255) DEFAULT NULL,
  `url` VARCHAR(80) DEFAULT NULL,
  `tag` VARCHAR(30) DEFAULT NULL,
  `brief` VARCHAR(300) DEFAULT NULL,
  `pubdate` DATETIME,
  `origin` VARCHAR(50),
  `crawled_time` DATETIME,
  `miniimg` VARCHAR(500) DEFAULT NULL,
  `img_urls` TEXT,
  `article` TEXT,
  PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8
'''


class EastdayPipeline(object):

    def process_item(self, item, spider):
        print '----------------------------%s' % json.dumps(dict(item), ensure_ascii=False)
        item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return item


class MysqlDBPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',
            db='test',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        if isinstance(item, EastdayItem):
            item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            print item['pubdate']

            try:
                # Escape quotes so the hand-built SQL string below does not break.
                for key in dict(item):
                    item[key] = str(item[key]).replace("'", "\\'")
                    item[key] = str(item[key]).replace('"', '\\"')

                sql = """insert into eastday values(NULL,"{title}","{url}","{tag}","{brief}","{pubdate}","{origin}","{crawled_time}","{miniimg}","{img_urls}","{article}")""".format(
                    title=item['title'], url=item['url'], tag=item['tag'], brief=item['brief'],
                    pubdate=item['pubdate'], origin=item['origin'], crawled_time=item['crawled_time'],
                    miniimg=item['miniimg'], img_urls=item['img_urls'], article=item['article'])
                # Redundant in practice: scrapy_redis already de-duplicates requests.
                sql2 = 'select 1 from eastday where url="%s"' % item['url']
                print 'sql:', sql

                self.cur.execute(sql2)
                is_exist = self.cur.fetchone()
                if is_exist == (1,):
                    print 'already stored: %s' % item['url']
                else:
                    self.cur.execute(sql)
                    self.conn.commit()
                    print 'inserted'
            except Exception as e:
                print u'database error:', e
        else:
            print 'not an EastdayItem'
        return item


class DownloadImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        if item['img_urls']:
            for img_url in item['img_urls']:
                yield Request(img_url, meta={'name': img_url})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        m = request.meta
        # Keep the original file name (stem) from the image URL.
        img_name = re.findall(r'/([a-z_0-9]+)\.(?:jpe?g|png|bmp|gif)', m['name'], re.I)[-1]
        filename = 'full3/%s.jpg' % img_name
        return filename
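As noted above, the SELECT-before-INSERT check is redundant because scrapy_redis already de-duplicates at the request level: RFPDupeFilter stores a fingerprint of every scheduled request in a Redis set. A minimal sketch of that mechanism (the key name assumes the default %(spider)s:dupefilter pattern):

import redis
from scrapy import Request
from scrapy.utils.request import request_fingerprint

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def already_seen(url):
    # Same fingerprint the dupefilter computes before scheduling a request.
    fp = request_fingerprint(Request(url))
    return r.sismember('eastdayspider:dupefilter', fp)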
Here is items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EastdayItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    img_urls = scrapy.Field()
    crawled_time = scrapy.Field()
    pubdate = scrapy.Field()
    origin = scrapy.Field()

    brief = scrapy.Field()
    miniimg = scrapy.Field()
There are too many files to paste them all. The full source has been packaged and uploaded to cnblogs, but I couldn't find where to share a file link; if you want the source, leave a message in the comments.
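Since the spider module itself is not pasted here, a rough sketch of how a scrapy_redis spider for this project is typically wired up may help readers new to the library. The class body below is an assumption inferred from the Redis key names used in run.py, not the actual source:

# coding=utf-8
from scrapy_redis.spiders import RedisSpider


class EastdaySpider(RedisSpider):
    name = 'eastdayspider'
    # scrapy_redis feeds this spider from the Redis key below instead of a
    # hard-coded start_urls list; with REDIS_START_URLS_AS_SET = True the key
    # is read as a set.
    redis_key = 'eastdayspider:start_urls'

    def parse(self, response):
        # Parse the channel feed / article page, yield EastdayItem objects and
        # follow-up Requests (details omitted in this sketch).
        pass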
Here is what the stored data looks like in MySQL:
Eastday itself aggregates articles from other sites and newspapers, so the content is quite rich; with the whole Eastday site crawled, you could almost build a news-feed app on top of it.
Article images are saved under the original file name taken from the image URL inside the article, which makes it easier for the front end to display the correct image on the page.
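For example, with the filename regex used in DownloadImagesPipeline.file_path, an image URL maps to a local path like this (the URL below is made up purely for illustration):

import re

url = 'http://images.eastday.example/mobile/20170603/20170603_a1b2c3d4e5.jpeg'  # hypothetical URL
stem = re.findall(r'/([a-z_0-9]+)\.(?:jpe?g|png|bmp|gif)', url, re.I)[-1]
print 'full3/%s.jpg' % stem        # -> full3/20170603_a1b2c3d4e5.jpg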