python A simple asynchronous spider class, based on Tornado


This post presents a simple asynchronous spider class in Python, built on Tornado. A BaseSpider feeds URLs through a tornado.queues.Queue to a fixed pool of worker coroutines; subclasses supply the fetch() and parse_response() methods, and the _fetching/_fetched sets deduplicate URLs and flag unfinished work.

# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues

# Python 2 standard-library modules (html.parser / urllib.parse on Python 3)
from HTMLParser import HTMLParser
from urlparse import urljoin, urldefrag


class BaseSpider(object):
    """A simple class of asynchronous spider."""

    def __init__(self, base_url, concurrency=10):
        self.base_url = base_url
        self.concurrency = concurrency
        self.queue = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def fetch(self, url):
        """fetch, should be overwrite"""
        raise NotImplementedError

    def parse_response(self, url, response):
        """parse response, return new links"""
        raise NotImplementedError

    @gen.coroutine
    def get_page(self, url):
        """async fetch action, call self.fetch"""
        try:
            response = yield self.fetch(url)
        except Exception as e:
            print('fetch exception: %s %s' % (e, url))
            # Return the exception itself: tornado's HTTPError carries a
            # .code attribute (599 for connection/timeout errors) that
            # parse_response can inspect.
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self, timeout):

        @gen.coroutine
        def fetch_url():
            current_url = yield self.queue.get()
            try:
                # Skip URLs that are already in flight or finished.
                if current_url in self._fetching:
                    return

                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self._fetched.add(current_url)
                print('fetch succeeded: %s, status %s' % (current_url, response.code))

                new_urls = self.parse_response(current_url, response)
                for new_url in new_urls:
                    yield self.queue.put(new_url)
            except Exception as e:
                print('parse exception: %s %s' % (e, current_url))
            finally:
                self.queue.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        start = time.time()
        self.queue.put(self.base_url)
        # Spawn the worker coroutines; their futures are intentionally
        # discarded -- queue.join() below tracks the outstanding work.
        for _ in range(self.concurrency):
            worker()

        try:
            yield self.queue.join(timeout=timeout)
        except gen.TimeoutError:
            # Give up on whatever is still queued once the timeout expires.
            pass

        if self._fetching != self._fetched:
            print('remaining fetching url: {}'.format(self._fetching - self._fetched))

        print('Done in %d seconds, fetched %s URLs.' % (
            time.time() - start, len(self._fetched)))

    def run(self, timeout):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(lambda: self._run(timeout))


class MySpider(BaseSpider):

    def __init__(self, base_url, **kwargs):
        super(MySpider, self).__init__(base_url, concurrency=10)
        print(kwargs)

    def fetch(self, url):
        return httpclient.AsyncHTTPClient().fetch(url)
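
    # A possible variant (assumption: the standard connect_timeout /
    # request_timeout arguments of tornado.httpclient) that bounds
    # per-request time, so a stalled request fails fast with code 599
    # and is re-queued by parse_response below:
    #
    #     def fetch(self, url):
    #         return httpclient.AsyncHTTPClient().fetch(
    #             url, connect_timeout=5, request_timeout=10)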

    def parse_response(self, url, response):
        if response.code == 200:
            html = response.body if isinstance(response.body, str) \
                else response.body.decode()
            new_links = self.get_links_from_html(url, html)
            print('url {} content length is {}'.format(url, len(html)))
            return new_links
        elif response.code == 599:
            # Connection/timeout error: drop the URL from the bookkeeping
            # sets so the re-queued copy is not skipped by the duplicate
            # check in fetch_url.
            self._fetching.discard(url)
            self._fetched.discard(url)
            return [url]
        # Any other status code: nothing new to crawl.
        return []

    def get_links_from_html(self, url, html):
        result_set = set()
        for new_url in self._get_links(html):
            format_url = urljoin(url, self._remove_fragment(new_url))
            if format_url.startswith(self.base_url):
                result_set.add(format_url)
        return result_set

    def _remove_fragment(self, url):
        pure_url, frag = urldefrag(url)
        return pure_url

    def _get_links(self, html):
        class URLSeeker(HTMLParser):
            """Collect href values from <a> tags."""

            def __init__(self):
                HTMLParser.__init__(self)
                self.urls = []

            def handle_starttag(self, tag, attrs):
                href = dict(attrs).get('href')
                if href and tag == 'a':
                    self.urls.append(href)

        url_seeker = URLSeeker()
        url_seeker.feed(html)
        return url_seeker.urls


if __name__ == '__main__':
    s = MySpider("http://www.qq.com/")
    s.run(timeout=timedelta(seconds=10))
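
The listing above targets Python 2 and the pre-async/await Tornado coroutine style (gen.coroutine, raise gen.Return, the HTMLParser and urlparse module names). For comparison, here is a minimal sketch of the same queue-plus-worker-pool pattern on Python 3 with modern Tornado (5 or later) and native coroutines; the names are illustrative, not from the original post, and link extraction is elided:

# -*- coding: utf-8 -*-
# Minimal Python 3 / Tornado >= 5 sketch of the same pattern.
import time
from datetime import timedelta

from tornado import gen, httpclient, ioloop, queues


async def crawl(base_url, concurrency=10, timeout=timedelta(seconds=10)):
    queue = queues.Queue()
    seen = set()

    async def fetch_url(url):
        if url in seen:
            return
        seen.add(url)
        # raise_error=False returns a response object even on failure,
        # with .code set (599 for connection/timeout errors).
        response = await httpclient.AsyncHTTPClient().fetch(
            url, raise_error=False)
        print('fetched: %s, status %s' % (url, response.code))
        # Extract links here and `await queue.put(link)` for each one.

    async def worker():
        async for url in queue:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print('exception: %s %s' % (e, url))
            finally:
                queue.task_done()

    start = time.time()
    await queue.put(base_url)
    workers = gen.multi([worker() for _ in range(concurrency)])
    try:
        await queue.join(timeout=timeout)
    except gen.TimeoutError:
        pass
    for _ in range(concurrency):
        await queue.put(None)  # tell each worker to exit
    await workers
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(seen)))


if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(lambda: crawl('http://www.qq.com/'))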
