python小爬虫练手

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python小爬虫练手相关的知识,希望对你有一定的参考价值。

一个人无聊,写了个小爬虫爬取不可描述图片....

代码太短,就暂时先往这里贴一下做备份吧。

 

#! /usr/bin/python

import chardet
import urllib3
import uuid
import os
import logging
import time
import sys
import re
import threading
from bs4 import BeautifulSoup


"""
http://www.qiubaichengren.com/1.html
"""


class PageNotFoundException(BaseException):
    """Signal that a requested web page answered with HTTP 404.

    The class derives from ``BaseException`` rather than ``Exception``, so
    an ``except Exception`` clause does not intercept it.
    """


class ResponseStatusException(BaseException):
    """Signal an HTTP response whose status code was not the expected one.

    The class derives from ``BaseException`` rather than ``Exception``, so
    an ``except Exception`` clause does not intercept it.
    """


class QiuBaiChengRenSpider:
    """Crawl numbered listing pages and download the images found on them.

    Listing pages are fetched one after another by substituting an
    increasing page number into a URL template.  Every image found on a
    page is downloaded by its own thread and written to ``img_save_dir``.
    """

    # Connection pool shared by every request made through this class.
    http_pool_manager = urllib3.PoolManager()

    # Directory that downloaded images are written to.
    img_save_dir = 'D:/QiuBaiChengRen/'

    # Logger shared by all instances of the class.
    logger = logging.getLogger('QiuBaiChengRenSpider')

    # How many times in a row a listing page may fail to download before
    # the crawl gives up instead of retrying forever.
    max_fetch_attempts = 3

    # Characters stripped from generated file names: whitespace, '+', ',',
    # quotes, path separators and the characters Windows forbids in names.
    _unsafe_name_chars = re.compile(r"""[\s+,"'\\/:*?<>|]""")

    def __init__(self):
        self.init_log()

    def init_log(self):
        """Send log records of the shared logger to stdout at DEBUG level."""
        # The logger is a class attribute, so attaching a handler on every
        # instantiation would print each record once per instance created.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))
        self.logger.setLevel(logging.DEBUG)

    def get(self, url):
        """Fetch ``url`` with a GET request and return the response body.

        Returns:
            The raw response data on HTTP 200, or ``''`` when the request
            itself failed (the failure is logged).

        Raises:
            PageNotFoundException: the server answered with 404.
            ResponseStatusException: the server answered with any other
                non-200 status.
        """
        try:
            http_response = self.http_pool_manager.request('GET', url)
        except Exception:
            self.logger.info(u'获取网页的时候发生了异常', exc_info=True)
            return ''

        # The status checks live outside the ``try`` so that these two
        # errors always reach the caller, whatever their base class is.
        if http_response.status == 404:
            raise PageNotFoundException('404')
        if http_response.status != 200:
            raise ResponseStatusException(http_response.status)
        return http_response.data

    def extract_img(self, html_doc):
        """Return the ``<img>`` tags matched by ``div.mala-text img``."""
        return BeautifulSoup(html_doc, 'lxml').select('div.mala-text img')

    def save_img(self, img_tag):
        """Download the image referenced by ``img_tag`` and store it on disk.

        The file name is built from the tag's ``alt`` text, a random UUID
        and the extension of the image URL.  Nothing is written when the
        tag has no ``src`` or when the download yields no data.

        Raises:
            PageNotFoundException, ResponseStatusException: propagated
                from ``get`` when the image URL answers with a non-200
                status.
        """
        img_link = (img_tag.get('src') or '').strip()
        if not img_link:
            self.logger.info('Skip img without src')
            return

        # Take the extension from the last path segment only; otherwise a
        # query string or a dot in the host name would end up in it.
        link_path = img_link.split('?', 1)[0].split('#', 1)[0]
        extension = os.path.splitext(link_path.rsplit('/', 1)[-1])[1]

        # Sanitise the file name only, never the configured directory.
        file_name = self._unsafe_name_chars.sub(
            '', (img_tag.get('alt') or '') + '___' + uuid.uuid4().hex + extension)
        save_name = os.path.join(self.img_save_dir, file_name)

        self.logger.info('Save img: %s %s', save_name, img_link)

        img_byte = self.get(img_link)
        if not img_byte:
            return

        self._ensure_save_dir()
        with open(save_name, 'wb') as img_file:
            img_file.write(img_byte)

    def _ensure_save_dir(self):
        """Create ``img_save_dir`` if it does not exist yet."""
        if os.path.isdir(self.img_save_dir):
            return
        try:
            os.makedirs(self.img_save_dir)
        except OSError:
            # Another download thread may have created it in the meantime.
            if not os.path.isdir(self.img_save_dir):
                raise

    def _save_img_safely(self, img_tag):
        """Thread entry point: run ``save_img`` and log instead of raising."""
        try:
            self.save_img(img_tag)
        except (PageNotFoundException, ResponseStatusException) as exc:
            self.logger.info('Skip img, HTTP status %s', exc)
        except Exception:
            self.logger.exception('Failed to save img')

    def list_visitor(self, seed):
        """Walk the listing pages and download every image they contain.

        Args:
            seed: URL template containing a ``%(page)d`` placeholder; page
                numbers start at 1 and grow by one per processed page.

        The walk stops at the first page answering 404, at any other
        non-200 status, at an unexpected error, or after
        ``max_fetch_attempts`` consecutive failed downloads of the same
        page.  All started download threads are joined before returning.
        """
        threads = []
        page = 1
        failed_attempts = 0
        try:
            while True:
                try:
                    url = seed % {'page': page}
                    self.logger.info('Begin process:%s', url)

                    html_doc = self.get(url)
                    if html_doc == '':
                        # Retry the same page, but only a bounded number
                        # of times so a dead network cannot spin forever.
                        failed_attempts += 1
                        if failed_attempts >= self.max_fetch_attempts:
                            self.logger.info(
                                'Give up after %d failed attempts: %s',
                                failed_attempts, url)
                            break
                        continue
                    failed_attempts = 0

                    for img in self.extract_img(html_doc):
                        worker = threading.Thread(
                            target=self._save_img_safely, args=(img,))
                        worker.start()
                        threads.append(worker)
                    page += 1
                except PageNotFoundException:
                    self.logger.info('404')
                    break
                except ResponseStatusException as exc:
                    self.logger.info('Unexpected HTTP status %s', exc)
                    break
                except Exception:
                    # Stop the crawl as before, but leave a trace of why.
                    self.logger.exception('Crawl aborted')
                    break
        finally:
            for worker in threads:
                worker.join()


if __name__ == '__main__':
    # Script entry point: crawl the listing pages, starting at page 1,
    # by filling the page number into the URL template below.
    spider = QiuBaiChengRenSpider()
    spider.list_visitor('http://www.qiubaichengren.com/%(page)d.html')


以上是关于python小爬虫练手的主要内容,如果未能解决你的问题,请参考以下文章

scrapy主动退出爬虫的代码片段(python3)

拿爱奇艺练手Python爬虫,是在法律边缘试探吗?爬虫技巧学习

拿爱奇艺练手Python爬虫,是在法律边缘试探吗?爬虫技巧学习

Python实战手把手有教你写爬虫爬虫练手:看看爱奇艺的评论都在干啥(爬虫+词云分析)

python scrapy 管道学习,并拿在行练手爬虫项目

python scrapy 管道学习,并拿在行练手爬虫项目