Python Web Scraping (1)
Most websites define a robots.txt file to let crawlers know what restrictions apply when crawling the site. You can view it by appending /robots.txt to the site's address.
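As a minimal sketch, the standard library's urllib.robotparser can check whether a given user agent is allowed to fetch a given path. The URL below is only a placeholder; substitute the site you actually intend to crawl.

import urllib.robotparser

# Load and parse the site's robots.txt (placeholder URL)
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()

# can_fetch() reports whether this user agent may crawl the given URL
print(rp.can_fetch('wswp', 'http://example.com/index.html'))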
Identifying the technologies a website is built with: use the builtwith package.
Because this package was written for Python 2, it needs some modifications to work under Python 3. We also use the chardet package to detect the page encoding. The full modified module is shown below (you can overwrite the original file with it directly):
import sys
import os
import re
import json
import urllib.request
import urllib.error
import chardet


def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technology used to build a website

    >>> builtwith('http://wordpress.com')
    {u'blogs': [u'PHP', u'WordPress'], u'font-scripts': [u'Google Font API'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress']}
    >>> builtwith('http://webscraping.com')
    {u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'web-frameworks': [u'Twitter Bootstrap'], u'web-servers': [u'Nginx']}
    >>> builtwith('http://microsoft.com')
    {u'javascript-frameworks': [u'jQuery'], u'mobile-frameworks': [u'jQuery Mobile'], u'operating-systems': [u'Windows Server'], u'web-servers': [u'IIS']}
    >>> builtwith('http://jquery.com')
    {u'cdn': [u'CloudFlare'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress'], u'blogs': [u'PHP', u'WordPress']}
    >>> builtwith('http://joomla.org')
    {u'font-scripts': [u'Google Font API'], u'miscellaneous': [u'Gravatar'], u'web-servers': [u'LiteSpeed'], u'javascript-frameworks': [u'jQuery'], u'programming-languages': [u'PHP'], u'web-frameworks': [u'Twitter Bootstrap'], u'cms': [u'Joomla'], u'video-players': [u'YouTube']}
    """
    techs = {}

    # check URL
    for app_name, app_spec in data['apps'].items():
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(techs, app_name, app_spec)

    # download content
    if None in (headers, html):
        try:
            request = urllib.request.Request(url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need to make HEAD request for headers
                request.get_method = lambda: 'HEAD'
            response = urllib.request.urlopen(request)
            if headers is None:
                headers = response.headers
            if html is None:
                html = response.read()
                # detect the encoding of the raw bytes and decode to str
                encode_type = chardet.detect(html)
                if encode_type['encoding'] == 'utf-8':
                    html = html.decode('utf-8')
                else:
                    html = html.decode('gbk')
        except Exception as e:
            print('Error:', e)
            request = None

    # check headers
    if headers:
        for app_name, app_spec in data['apps'].items():
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(techs, app_name, app_spec)

    # check html
    if html:
        for app_name, app_spec in data['apps'].items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        break

        # check meta
        # XXX add proper meta data parsing
        metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>',
                                re.IGNORECASE).findall(html))
        for app_name, app_spec in data['apps'].items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(techs, app_name, app_spec)
                        break

    return techs


parse = builtwith


def add_app(techs, app_name, app_spec):
    """Add this app to technology
    """
    for category in get_categories(app_spec):
        if category not in techs:
            techs[category] = []
        if app_name not in techs[category]:
            techs[category].append(app_name)
    implies = app_spec.get('implies', [])
    if not isinstance(implies, list):
        implies = [implies]
    for app_name in implies:
        add_app(techs, app_name, data['apps'][app_name])


def get_categories(app_spec):
    """Return category names for this app_spec
    """
    return [data['categories'][str(c_id)] for c_id in app_spec['cats']]


def contains(v, regex):
    """Removes meta data from regex then checks for a regex match
    """
    return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)


def contains_dict(d1, d2):
    """Takes 2 dictionaries

    Returns True if d1 contains all items in d2"""
    for k2, v2 in d2.items():
        v1 = d1.get(k2)
        if v1:
            if not contains(v1, v2):
                return False
        else:
            return False
    return True


def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer)
    """
    # get the path of this filename relative to the current script
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    return json.load(open(filename))


data = load_apps()

if __name__ == '__main__':
    urls = sys.argv[1:]
    if urls:
        for url in urls:
            results = builtwith(url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
    else:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
After that, we can use builtwith to inspect the technology stack of a website:
import builtwith
builtwith.parse(url)
There is a package called whois that can be used to look up ownership information for a website:

import whois
print(whois.whois(url))
Now let's build a download function:
import urllib.request, urllib.error

def download(url):
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html
When the download fails, this function catches the exception and returns None.
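A quick usage sketch (both URLs are placeholders; the second uses a non-resolving .invalid domain to force a failure):

# Hypothetical usage: a reachable page returns its raw bytes,
# while a host that cannot be resolved prints the error and yields None.
html = download('http://example.com')
broken = download('http://nonexistent.example.invalid/')
print(broken is None)  # True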
Now consider the following: download errors come in different types. 5xx errors indicate a problem on the server side, while 4xx errors indicate a problem with the request itself. When the problem is on the server side, it is worth retrying the download, so the improved version is as follows:
import urllib.request, urllib.error

def download(url, num_retries=2):
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on 5xx server errors
                return download(url, num_retries - 1)
    return html
Here num_retries is the number of retry attempts we allow.
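To see the retry logic in action, point the function at URLs that deliberately return error status codes. The httpbin.org endpoints below are only an assumption (any URL returning 5xx or 4xx responses will do):

# Hypothetical test URLs, assuming httpbin.org is reachable:
download('http://httpbin.org/status/500')  # 5xx: prints "Downloading:" three times in total
download('http://httpbin.org/status/404')  # 4xx: prints "Downloading:" once, no retry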
Next, we need to consider the user agent. By default, urllib identifies itself as Python-urllib/x.y, where x.y is the Python version number. Some websites block this user agent, so we should set our own. The improved version is as follows:
import urllib.request, urllib.error

def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on 5xx server errors, keeping the same user agent
                return download(url, user_agent, num_retries - 1)
    return html
Note that urlopen() accepts either a URL string or a Request object as its argument.
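For illustration, the two call styles below are equivalent apart from the custom header (example.com is only a placeholder URL):

import urllib.request

# 1) Pass a plain URL string
html = urllib.request.urlopen('http://example.com').read()

# 2) Pass a Request object, which lets us attach a custom User-Agent header
request = urllib.request.Request('http://example.com',
                                 headers={'User-Agent': 'wswp'})
html = urllib.request.urlopen(request).read()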
With that, our download function is complete: it catches exceptions, retries failed downloads on server errors, and sets a custom user agent.