Crawler download script
Posted by 下丶雨天
The download script
#!/usr/bin/python
# _*_ coding: utf-8 _*_
import urlparse
import urllib2
import random
import time
from datetime import datetime, timedelta
import socket
import disk_cache

DEFAULT_AGENT = 'WSWP'   # default user agent
DEFAULT_DELAY = 5        # delay between downloads, to limit the crawl rate
DEFAULT_RETRIES = 1      # number of retries after a server error
DEFAULT_TIMEOUT = 60     # socket timeout in seconds
CACHE = disk_cache.DiskCache()


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                 proxies=None, num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT, opener=None, cache=CACHE):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this URL is not yet in the cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # cached result was a server error, so re-download
                    result = None
        if result is None:
            # result was not loaded from the cache, so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save the result to the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            code = getattr(e, 'code', None)
            if num_retries > 0 and code is not None and 500 <= code < 600:
                # retry 5XX HTTP errors
                return self.download(url, headers, proxy, num_retries - 1, data)
        return {'html': html, 'code': code}


class Throttle:
    """Enforce a minimum delay between downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # timestamp of the last access per domain

    def wait(self, url):
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


p = Downloader()
x = p('http://www.meituan.com')
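As a quick illustration, here is a minimal sketch of how the Downloader might be configured; the proxy address and delay values below are hypothetical placeholders, not values from the script:

proxies = ['http://127.0.0.1:8080']  # hypothetical proxy, for illustration only
d = Downloader(delay=2, proxies=proxies, num_retries=2)
html = d('http://example.webscraping.com')  # throttled download, result saved to the disk cache
html = d('http://example.webscraping.com')  # the repeat call should be served from the cache

Because __call__ checks the cache before downloading, repeated calls for the same URL hit the disk instead of the network, while Throttle still spaces out any real downloads to the same domain.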
The disk_cache caching script
import os
import re
import urlparse
import shutil
import zlib
from datetime import datetime, timedelta
try:
    import cPickle as pickle
except ImportError:
    import pickle


class DiskCache:
    """
    Dictionary interface that stores cached
    values in the file system rather than in memory.
    The file path is formed from the components of the URL.

    >>> cache = DiskCache()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = DiskCache(expires=timedelta())
    >>> cache[url] = result
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com has expired'
    >>> cache.clear()
    """

    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Load data from disk for this URL
        """
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for this URL
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)

        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        """Remove the value at this key and any empty parent sub-directories
        """
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        """Create a file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when the path is empty, use /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict the maximum length of each path segment
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    cache = DiskCache()
    print cache
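To see how cache keys map to files on disk, a small sketch (assuming the default cache_dir='cache'; the URLs are illustrative):

cache = DiskCache()
print cache.url_to_path('http://example.webscraping.com/view/1')
# cache/example.webscraping.com/view/1
print cache.url_to_path('http://example.webscraping.com')
# an empty path becomes /index.html: cache/example.webscraping.com/index.html

Sanitizing invalid characters and truncating each segment to 255 characters keeps the generated paths valid on common file systems, at the cost of possible collisions between very long or unusual URLs.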