win4000-spider

Posted 2021-01-15 sw-z

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了win4000-spider相关的知识，希望对你有一定的参考价值。

import os
import re

import requests
from requests.exceptions import RequestException

# Image URLs gathered by get_html() and later passed to download().
lis = []
# Browser-like headers sent with every HTTP request made by this script.
# The literals use plain ASCII quotes (typographic quotes are a syntax error)
# and the engine token is spelled "KHTML", matching a real Chrome User-Agent.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                      ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
def win4000_next_page(url, timeout=10):
    """Fetch a page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (a message is printed in that case).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        # A timeout surfaces as a RequestException subclass and is handled
        # below like any other request failure.
        resp = requests.get(url, headers=head, timeout=timeout)
    except RequestException:
        print('请求出错')
        return None
    if resp.status_code == 200:
        return resp.text
    print('网站出错')
    return None

def get_html(start_url='http://www.win4000.com/meinv163726.html', limit=50):
    """Walk a gallery page by page and collect the picture URLs.

    Starting at ``start_url``, the picture URL found on each page is appended
    to the module-level ``lis`` list and the "下一张" (next picture) link is
    followed.  The walk stops after ``limit`` pictures, or earlier when a page
    cannot be fetched or does not contain the expected markup.

    Args:
        start_url: First gallery page to visit.
        limit: Maximum number of pictures to collect.
    """
    # Capture groups extract the attribute values directly.  str.lstrip() and
    # str.rstrip() remove *sets of characters*, not prefixes/suffixes, and a
    # greedy ".+" can run across neighbouring attributes on the same line.
    pic_re = re.compile(r'url="(http://pic1\.win4000\.com/pic[^"]+)"')
    next_re = re.compile(r'href="([^"]+)"[^>]*>下一张')

    html = win4000_next_page(start_url)
    # range(1, limit + 1) yields exactly `limit` iterations; the earlier
    # `while count <= 50` loop with a pre-increment ran 51 times.
    for count in range(1, limit + 1):
        if html is None:
            # The fetch failed and already printed why; keep what was found.
            break
        pic = pic_re.search(html)
        if pic is None:
            # No picture on this page: stop instead of raising IndexError.
            break
        lis.append(pic.group(1))
        following = next_re.search(html)
        if following is None:
            # No "next picture" link, so the gallery has no further page.
            break
        html = win4000_next_page(following.group(1))
        print('下一张%s' % count)



def download(lis, out_dir='piclib'):
    """Download every image URL in ``lis`` into ``out_dir``.

    Files are named ``1.jpg``, ``2.jpg``, ... after the 1-based position of
    the URL in ``lis``.  URLs whose download fails are skipped.

    Args:
        lis: Iterable of image URLs.
        out_dir: Directory that receives the files; created when missing.
    """
    # The former hard-coded Windows path contained no directory separators
    # (apparently lost) and was flagged as needing manual edits, so the
    # destination is a parameter with a relative default instead.
    os.makedirs(out_dir, exist_ok=True)
    for count, url in enumerate(lis, start=1):
        data = grab_pic(url)
        if data is None:
            # grab_pic already reported the failure.  Writing None would
            # raise TypeError and leave an empty file behind.
            continue
        with open(os.path.join(out_dir, '%s.jpg' % count), 'wb') as f:
            f.write(data)
        print('写入完成%s' % count)

def grab_pic(url, timeout=10):
    """Fetch a single image and return its raw bytes.

    Args:
        url: Address of the image to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None`` (a message is printed in that case).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=head, timeout=timeout)
    except RequestException:
        print('请求出错')
        return None
    if response.status_code == 200:
        return response.content
    print('网站出错')
    return None

if __name__ == '__main__':
    # Collect the picture URLs into `lis`, then fetch and store each one.
    get_html()
    download(lis)
    print('全部完成')

以上是关于win4000-spider的主要内容，如果未能解决你的问题，请参考以下文章