xiaohuar.spider

Posted 2021-01-15 sw-z

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了xiaohuar.spider相关的知识，希望对你有一定的参考价值。

import requests, re
from requests.exceptions import RequestException


def get_one_page(url, agent, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        agent: Mapping of HTTP request headers sent with the request
            (the callers in this file pass a ``User-Agent`` entry).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled server.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None``. A short message is printed on failure.
    """
    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(url, headers=agent, timeout=timeout)
    except RequestException:
        print('网站出错')
        return None

    if response.status_code == 200:
        return response.text

    # The request succeeded at transport level but the status is not 200.
    print('网站出错1')
    return None


def reg(x):
    """Convert raw ``src="..."`` matches into site-relative image paths.

    Each item is expected to look like ``src="http://www.xiaohuar.com/d/..."``
    or ``src="/d/..."``. The ``src="`` wrapper, the trailing quote, the
    site origin and any leading slash are removed, leaving e.g.
    ``d/file/a.jpg``.

    Args:
        x: Iterable of matched attribute strings.

    Returns:
        A list with one cleaned path per input item, in the same order.
    """
    site = 'http://www.xiaohuar.com'
    lis = []
    for raw in x:
        path = raw.rstrip('"')
        # Remove real prefixes. str.lstrip() strips a *set of characters*,
        # so using it here would also eat leading path characters that
        # happen to occur in the prefix text (e.g. "/upload" -> "load").
        for prefix in ('src="', site):
            if path.startswith(prefix):
                path = path[len(prefix):]
        lis.append(path.lstrip('/'))
    return lis


def main():
    """Fetch the listing page and collect the image ``src`` attributes.

    Returns:
        A list of matched attribute strings of the form ``src="..."`` whose
        target is an absolute ``http://www.xiaohuar.com/d...`` URL or a
        site-relative ``/d...`` path. The list is empty when the page could
        not be downloaded.
    """
    url = 'http://www.xiaohuar.com/2014.html'
    agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    html = get_one_page(url, agent)
    if not html:
        # get_one_page returns None on failure; re.findall would raise
        # TypeError on it.
        return []
    # [^"]+ stops at the attribute's closing quote; a greedy ".+" would run
    # on to the last quote on the line and swallow following attributes.
    return re.findall(r'src="(?:http://www\.xiaohuar\.com)?/d[^"]+"', html)


def read_beauty(lis, timeout=10):
    """Download every image path in ``lis`` into the working directory.

    Files are named ``1.jpg``, ``2.jpg``, ... following the order of
    ``lis``. The download stops at the first failure (a request error or
    a non-200 response); images fetched before that point are kept.

    Args:
        lis: Iterable of image paths relative to ``http://www.xiaohuar.com/``.
        timeout: Seconds to wait for each request before giving up, so a
            stalled server cannot block the run forever.

    Returns:
        None.
    """
    # The headers never change, so build them once outside the loop.
    agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                           ' (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    for count, path in enumerate(lis, 1):
        name = '%s.jpg' % count
        # Drop any leading slash so the joined URL has no "//" in its path.
        url = 'http://www.xiaohuar.com/' + path.lstrip('/')
        try:
            reson = requests.get(url, headers=agent, timeout=timeout)
        except RequestException:
            print('网站出错')
            return
        if reson.status_code != 200:
            print('网站出错1')
            return
        with open(name, 'wb') as f:
            f.write(reson.content)
        print('完成1次')


if __name__ == '__main__':
    # Pipeline: scrape the raw src attributes, reduce them to relative
    # paths, then download each image.
    matches = main()
    paths = reg(matches)
    read_beauty(paths)
    print('全部完成')

以上是关于xiaohuar.spider的主要内容，如果未能解决你的问题，请参考以下文章。