Python爬虫 壁纸下载

Posted 流星蝴蝶没有剑

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫 壁纸下载相关的知识,希望对你有一定的参考价值。

下载所有的壁纸,并且将壁纸和描述保存到数据库

数据库字段:id, type, small, big, title

运行时请注意保留延时,不要暴力运行,以免影响他人正常使用

import json
import os
import random
import time

import pymysql
import requests
from bs4 import BeautifulSoup

# Target site; Host/Referer headers make the requests look like normal browsing.
host = 'www.netbian.com'
Referer = 'http://www.netbian.com/dongman/index_2.htm'
# Two alternative header sets; pa() picks one at random to vary the User-Agent.
# (The original listing lost the dict braces to HTML scraping; restored here.)
headers = [
    {
        'Referer': Referer,
        'Host': host,
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (Khtml, like Gecko) Version/5.1 Safari/534.50',
    },
    {
        'Referer': Referer,
        'Host': host,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    },
]
# Shared MySQL connection used by inserSql(); closed in the __main__ guard.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='taici',
    charset='utf8mb4',
)


def download_img(img_url):
    """Download one image into ./img/, skipping files that already exist.

    img_url: absolute URL of an image hosted on img.netbian.com.
    """
    # Restored dict braces (lost in the scraped listing).
    h = {
        'Referer': Referer,
        'Host': "img.netbian.com",
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    }
    r = requests.get(img_url, headers=h, stream=True)
    print(img_url)
    print(r.status_code)  # HTTP status of the image request
    if r.status_code == 200:
        path = './img/%s' % getUrlFileName(img_url)
        # Skip files downloaded on a previous run so reruns are cheap.
        if not os.path.exists(path):
            # Context manager closes the file handle deterministically
            # (the original `open(...).write(...)` leaked it).
            with open(path, 'wb') as f:
                f.write(r.content)
        print("done")
    r.close()  # release the connection back to the pool


def getUrlFileName(url):
    """Return the file-name component of *url* (everything after the last slash)."""
    file_name = os.path.basename(url)
    return file_name


def inserSql(type, small, big, title):
    """Insert one wallpaper record into table `bizhi`.

    type: category label (e.g. '动漫'); small/big: file names of the
    thumbnail and the full-size image; title: image description scraped
    from the page.
    """
    global db
    cursor = db.cursor()
    # Parameterized query: title comes from scraped HTML, so %-formatting
    # the values into the SQL string breaks on quotes and invites injection.
    sql = ("INSERT INTO bizhi(type, small, big, title) "
           "VALUES (%s, %s, %s, %s)")
    try:
        cursor.execute(sql, (type, small, big, title))
    except Exception as e:
        print("插入数据失败:", e)
        db.rollback()  # discard the failed statement so later inserts succeed
    else:
        db.commit()
    finally:
        cursor.close()


def pa(url):
    """GET *url* using one of the prepared header sets, chosen at random,
    and return the response parsed as a BeautifulSoup tree."""
    pick = random.randint(0, 1)
    session = requests.session()
    chosen_headers = headers[pick]
    page_bytes = session.request('get', url, headers=chosen_headers).content
    return BeautifulSoup(page_bytes, 'html.parser')


def main():
    """Crawl pages 2..141 of the 动漫 wallpaper listing.

    For each thumbnail on a page: download it, follow the detail link,
    download the full-size image, and record both file names in the DB.
    """
    page = 141
    # Deliberately starts at page 2; the index-1 branch is kept in case the
    # range is changed back to start at 1.
    for index in range(2, page + 1):
        time.sleep(3)  # throttle between listing pages — be polite to the server
        if index != 1:
            url = 'http://www.netbian.com/dongman/index_%d.htm' % index
        else:
            url = 'http://www.netbian.com/dongman/index.htm'
        print(url)
        html = pa(url)
        # Restored attrs-dict braces (lost in the scraped listing).
        ce = html.find("div", {"class": "list"})
        table = ce.find("ul")
        liArr = table.findAll("li")
        # Walk every thumbnail entry on this page.
        for lis in liArr:
            time.sleep(1)  # throttle between images
            # bs4 exposes the class attribute as a list of tokens, so the
            # original string comparison never matched; membership test works.
            if "nextpage" in lis.attrs.get("class", []):
                break
            li = lis.find("img")
            small = li.attrs["src"]
            # Thumbnail first; the detail page holds the full-size URL.
            download_img(small)
            title = li.attrs["alt"]
            href = lis.find("a").attrs["href"]
            if href[0] == '/':
                href = "http://www.netbian.com" + href
                imgHref = pa(href)
                big = imgHref.find("div", {"class": "pic"}).find("img").attrs["src"]
                download_img(big)
                inserSql('动漫', getUrlFileName(small), getUrlFileName(big), title)
        print("OK", index)


if __name__ == '__main__':
    # Release the MySQL connection even if the crawl aborts with an exception
    # (the original skipped db.close() on any error in main()).
    try:
        main()
    finally:
        db.close()

以上是关于Python爬虫 壁纸下载的主要内容,如果未能解决你的问题,请参考以下文章

Python爬虫下载美女图片(多种方法)

python爬虫.3.下载网页图片

如何用Python爬虫实现百度图片自动下载?

python爬虫,一段完整的python爬虫批量下载网站图片资源的代码

python爬虫实战—喜欢下载什么小说,由自己说了算,超详细小说下载爬虫教程

python爬虫之:淘宝商品搜索爬虫(收集商品信息/下载详情图)