Crawler: Downloading Pages
Introduction
A crawler that downloads web pages.
Code
Simple download
#!/usr/bin/env python
# coding=utf-8
import urllib2


def download(url):
    print 'Download:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html


if __name__ == '__main__':
    download('http://www.baidu.com')
It doesn't seem to actually have downloaded Baidu's HTML, though.
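To check what download() actually returned, a quick sanity check (just a minimal sketch) is to print the size and the first bytes of the result:

if __name__ == '__main__':
    html = download('http://www.baidu.com')
    if html is not None:
        # show how much was fetched and a peek at the beginning of the page
        print 'got %d bytes' % len(html)
        print html[:100]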
Retrying the download on 5XX server errors, and setting a User-Agent
Many websites don't like being visited by crawlers, but they have no way to block them completely, so they put some anti-crawling measures in place. One of them involves the User Agent (UA). The User Agent is sent in the request headers, and the server inspects it to decide who is making the request.
Different browsers send different User Agents. A crawler that doesn't set one at all is easy to identify and may get its access restricted. The usual approach is to collect a number of real User Agent strings and pick one at random for each request.
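As a sketch of that idea (the pool and the User-Agent strings below are only illustrative, not a curated list), one could keep a small pool and pick one at random for each request:

import random

# a small hand-collected pool of User-Agent strings (examples only)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
]

def random_user_agent():
    # return one UA string chosen at random from the pool
    return random.choice(USER_AGENTS)

The chosen string can then be passed as the user_agent argument of the download() version below.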
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html
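To exercise the retry branch, the function can be pointed at a URL that deliberately answers with a 5XX status. httpbin.org is used here purely as a convenient public test endpoint; it is not part of the original post:

if __name__ == '__main__':
    # /status/500 always answers with HTTP 500, so download() retries twice before giving up
    download('http://httpbin.org/status/500')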
Downloading the pages listed in the sitemap
import re


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        print link
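A usage sketch, assuming the sitemap is exposed at the conventional /sitemap.xml path (which is the case for the example.webscraping.com test site):

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')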
The site may ignore the string part in front of the ID, so the crawler can iterate over just the trailing number.
import itertools


def crawl_string():
    # iterate over the numeric IDs until a page fails to download
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
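Breaking out on the first missing ID is fragile when some records have been deleted. A common variant, sketched below (it is not part of the original post, and crawl_ids is just an illustrative name), tolerates a few consecutive failures before concluding the IDs are exhausted:

def crawl_ids(max_errors=5):
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                # too many consecutive gaps: assume we have reached the end
                break
        else:
            # a successful download resets the error counter
            num_errors = 0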
Crawling the site by following the links on each page
import urlparse


def get_links(html):
    # return a list of links found in the html
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        print 'getlinks', get_links(html)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)')
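The urlparse.urljoin() call matters because the matched href values are usually relative paths, and it resolves them against the seed URL. For example (the path below is just a hypothetical link of the kind the regex matches):

link = '/places/default/view/1'   # hypothetical relative link
print urlparse.urljoin('http://example.webscraping.com', link)
# prints: http://example.webscraping.com/places/default/view/1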
Supporting robots.txt parsing
import robotparser


def link_crawler(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print 'getlinks', get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
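To see what can_fetch() does with a concrete file, here is a sketch that feeds robotparser a hypothetical robots.txt directly (modelled on the kind of rules the example site uses) instead of fetching one over the network:

robots_txt = """User-agent: BadCrawler
Disallow: /

User-agent: *
Crawl-delay: 5
"""

rp = robotparser.RobotFileParser()
rp.parse(robots_txt.splitlines())
print rp.can_fetch('BadCrawler', 'http://example.webscraping.com/')  # False
print rp.can_fetch('wswp', 'http://example.webscraping.com/')        # True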
Proxies
def link_crawler(seed_url, link_regex, proxy=False):
    if proxy:  # this proxy does not work at the moment
        proxy_info = {
            'host': '106.12.38.133',
            'port': 22
        }
        # we create a handler for the proxy
        proxy_support = urllib2.ProxyHandler({'http': 'http://%(host)s:%(port)d' % proxy_info})
        # we create an opener which uses this handler:
        opener = urllib2.build_opener(proxy_support)
        # then we install this opener as the default opener for urllib2:
        urllib2.install_opener(opener)

        # if the proxy requires authentication
        proxy_info = {
            'host': '106.12.38.133',
            'port': 20,
            'user': 'root',
            'pass': 'Woaini7758258!'
        }
        proxy_support = urllib2.ProxyHandler({'http': 'http://%(user)s:%(pass)s@%(host)s:%(port)d' % proxy_info})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print 'getlinks', get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
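Note that urllib2.install_opener() replaces the default opener for every later urllib2 call in the process. If the proxy should only apply to the crawler's own requests, an alternative sketch is to keep the opener local and call it explicitly (same placeholder host and port as above):

proxy_support = urllib2.ProxyHandler({'http': 'http://106.12.38.133:22'})
opener = urllib2.build_opener(proxy_support)
# use the opener directly instead of installing it globally
request = urllib2.Request('http://example.webscraping.com', headers={'User-agent': 'wswp'})
html = opener.open(request).read()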
Download throttling
import datetime
import time


class Throttle:
    """Add a delay between downloads to the same domain; call wait() before each download."""

    def __init__(self, delay):
        self.delay = delay
        # timestamp of the last access for each domain
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
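A usage sketch: create one Throttle for the whole crawl and call wait() right before each download() (the URLs are just examples):

throttle = Throttle(5)  # keep at least 5 seconds between requests to the same domain
for url in ['http://example.webscraping.com/places/default/view/1',
            'http://example.webscraping.com/places/default/view/2']:
    throttle.wait(url)   # sleeps if this domain was hit less than 5 seconds ago
    html = download(url)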
References
https://tieba.baidu.com/p/5832236970