scrapy-redis: making Redis store more than just URLs
Posted by 疯疯癫癫的小可爱
First, look at the scrapy-redis source:
# Excerpt from scrapy_redis/spiders.py (imports added for context).
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection, defaults
from .utils import bytes_to_str


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available."""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
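For context, this is how these classes are typically wired up. A minimal sketch, not from the article, assuming a local Redis instance and a hypothetical spider named company:

# settings.py (sketch)
REDIS_URL = "redis://localhost:6379"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # schedule requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # deduplicate through Redis

# spiders/company.py (sketch)
from scrapy_redis.spiders import RedisSpider

class CompanySpider(RedisSpider):
    name = "company"
    # With REDIS_START_URLS_KEY left at its default, this spider reads
    # start messages from the Redis key "company:start_urls".

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

With this in place, the spider idles until messages appear under company:start_urls, which is exactly the hook the override below exploits.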
Reading the source carefully, you will notice that

make_request_from_data(self, data)

is the method that turns a message popped from Redis into a Request instance; by default the message is assumed to be a plain (encoded) URL. The trick is to override this method so that what gets passed through to

self.make_requests_from_url

is a JSON string rather than a bare URL. Inside that method you can then parse the string and either take the request URL directly from it or build the URL from its other fields. The code looks like this:
# Both methods are overrides defined on the RedisSpider subclass.
import json

from scrapy import Request
from scrapy_redis.utils import bytes_to_str


def make_request_from_data(self, data):
    """
    :param data: bytes, message from redis
    """
    company = bytes_to_str(data, self.redis_encoding)
    return self.make_requests_from_url(company)

def make_requests_from_url(self, company):
    # The message is a JSON string: parse it with json.loads rather than
    # eval (eval breaks on true/false/null and is unsafe on untrusted input).
    data = json.loads(company)
    url = data["url"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/webp,image/apng,*/*;q=0.8",
    }
    # Keep the full payload in meta so the callback can use the extra fields.
    return Request(url, self.parse, meta={"data": data},
                   dont_filter=True, headers=headers)
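To feed the spider, push a JSON string onto the start-urls key instead of a bare URL. A minimal sketch using the redis-py client; the key matches the hypothetical company spider above, and the payload fields are made up for illustration:

import json

import redis

r = redis.StrictRedis(host="localhost", port=6379, db=0)

# Any extra fields ride along with the request via meta={"data": data}.
payload = {
    "url": "https://example.com/company/123",
    "company_name": "Example Inc.",
    "company_id": 123,
}
r.lpush("company:start_urls", json.dumps(payload))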
One thing worth noting: you cannot construct and return a Request (or any third-party request class) directly inside make_request_from_data. In my tests the method simply fails to take effect, without raising any exception. Overriding both make_request_from_data and make_requests_from_url together, as above, does work.
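Once requests are scheduled this way, the whole payload is available to the callback through response.meta. A hypothetical parse sketch (field names are illustrative):

def parse(self, response):
    # The full JSON payload pushed to Redis travels with the request.
    data = response.meta["data"]
    yield {
        "company_id": data.get("company_id"),      # hypothetical field
        "company_name": data.get("company_name"),  # hypothetical field
        "title": response.css("title::text").get(),
        "url": response.url,
    }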