python 一个简单的异步爬虫(Spider)类,基于 Tornado 框架
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Python 一个简单的异步爬虫类(基于 Tornado 框架)相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
from HTMLParser import HTMLParser
from urlparse import urljoin, urldefrag
class BaseSpider(object):
    """A simple asynchronous web-spider base class built on Tornado.

    Subclasses must implement :meth:`fetch` and :meth:`parse_response`.
    URLs are crawled concurrently by ``concurrency`` worker coroutines
    that share a single ``tornado.queues.Queue``.
    """

    def __init__(self, base_url, concurrency=10):
        self.base_url = base_url        # seed URL of the crawl
        self.concurrency = concurrency  # number of worker coroutines
        self.queue = queues.Queue()     # pending URLs (unbounded)
        self._fetching = set()          # URLs a worker has picked up
        self._fetched = set()           # URLs whose fetch attempt completed

    def fetch(self, url):
        """Start fetching *url*; must return a future. Must be overridden."""
        raise NotImplementedError

    def parse_response(self, url, response):
        """Parse *response* and return an iterable of new links. Must be overridden."""
        raise NotImplementedError

    @gen.coroutine
    def get_page(self, url):
        """Fetch one page via self.fetch.

        NOTE: on failure the *exception object itself* is returned in
        place of a response (Tornado's HTTPError carries a ``.code``
        attribute, which parse_response may inspect) -- an unusual but
        deliberate contract of this class.
        """
        try:
            response = yield self.fetch(url)
        except Exception as e:
            print('fetch exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self, timeout):
        @gen.coroutine
        def fetch_url():
            current_url = yield self.queue.get()
            try:
                if current_url in self._fetching:
                    return  # another worker is already handling this URL
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self._fetched.add(current_url)
                print('fetched success: %s, status %s' % (current_url, response.code))
                new_urls = self.parse_response(current_url, response)
                for new_url in new_urls:
                    yield self.queue.put(new_url)
            except Exception as e:
                print('parse exception: %s %s' % (e, current_url))
            finally:
                # Always mark the task done, or queue.join() never completes.
                self.queue.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        start = time.time()
        # Seed the crawl; yield the put's future (was previously dropped).
        yield self.queue.put(self.base_url)
        for _ in range(self.concurrency):
            worker()  # fire-and-forget worker coroutines
        try:
            yield self.queue.join(timeout=timeout)
        except gen.TimeoutError:
            pass  # deadline reached -- report whatever was crawled so far
        # Report URLs picked up but never completed (e.g. cut off by the
        # timeout).  Was an ``assert``, which vanishes under ``python -O``.
        if self._fetching != self._fetched:
            print('remaining fetching url: {}'.format(self._fetching - self._fetched))
        print('Done in %d seconds, fetched %s URLs.' % (
            time.time() - start, len(self._fetched)))

    def run(self, timeout):
        """Run the crawl on the current IOLoop until done or *timeout* elapses."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(lambda: self._run(timeout))
class MySpider(BaseSpider):
    """Concrete spider: fetches pages with Tornado's AsyncHTTPClient and
    follows ``<a href>`` links that stay under ``base_url``."""

    def __init__(self, base_url, **kwargs):
        # BUG FIX: this was misspelled ``__int__`` and therefore never ran
        # as the constructor.  ``concurrency`` may now also be passed as a
        # keyword (default 10, matching the old hard-coded value).
        super(MySpider, self).__init__(
            base_url, concurrency=kwargs.pop('concurrency', 10))
        print(kwargs)

    def fetch(self, url):
        """Return a future for the HTTP GET of *url*."""
        return httpclient.AsyncHTTPClient().fetch(url)

    def parse_response(self, url, response):
        """Return an iterable of new same-site links to crawl.

        ``response`` may also be an exception object carrying ``.code``
        (see BaseSpider.get_page).
        """
        if response.code == 200:
            html = response.body if isinstance(response.body, str) \
                else response.body.decode()
            new_links = self.get_links_from_html(url, html)
            print('url {} content length is {}'.format(url, len(html)))
            return new_links
        elif response.code == 599:
            # 599: Tornado's connection/timeout error -- requeue the URL
            # for another attempt.
            return [url]
        # BUG FIX: other status codes previously fell through and returned
        # ``None``, making the caller crash on iteration.
        return []

    def get_links_from_html(self, url, html):
        """Extract hrefs from *html*, absolutized against *url* and
        restricted to URLs under ``self.base_url``."""
        result_set = set()
        for new_url in self._get_links(html):
            format_url = urljoin(url, self._remove_fragment(new_url))
            if format_url.startswith(self.base_url):
                result_set.add(format_url)
        return result_set

    def _remove_fragment(self, url):
        """Strip the ``#fragment`` part of *url*, if any."""
        pure_url, _frag = urldefrag(url)
        return pure_url

    def _get_links(self, html):
        """Return the href of every ``<a>`` tag in *html*, in document order."""
        class URLSeeker(HTMLParser):
            def __init__(self):
                HTMLParser.__init__(self)
                self.urls = []

            def handle_starttag(self, tag, attrs):
                href = dict(attrs).get('href')
                if href and tag == 'a':
                    self.urls.append(href)

        url_seeker = URLSeeker()
        url_seeker.feed(html)
        return url_seeker.urls
if __name__ == '__main__':
    # Crawl qq.com, stopping after at most ten seconds.
    spider = MySpider("http://www.qq.com/")
    spider.run(timeout=timedelta(seconds=10))
以上是关于 Python 一个简单的异步爬虫类(基于 Tornado 框架)的主要内容,如果未能解决你的问题,请参考以下文章