Python Crawler: Wallpaper Downloader
Posted by 流星蝴蝶没有剑
This script downloads all the wallpapers and saves each image together with its description to a database.
Database fields: id, type, small, big, title
Keep the delays in place when you run it; do not hammer the site, or you will disrupt other users.
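The script assumes a table named bizhi already exists in the taici database, with the fields listed above. The table and column names below come from the INSERT statement in the code; the column types and sizes are assumptions, so adjust them to taste. A minimal one-off setup sketch:

import pymysql

# Setup sketch: create the bizhi table the crawler writes to.
# Table/column names come from the script; the types are assumptions.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     db='taici', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS bizhi (
            id    INT AUTO_INCREMENT PRIMARY KEY,
            type  VARCHAR(32)  NOT NULL,
            small VARCHAR(255) NOT NULL,
            big   VARCHAR(255) NOT NULL,
            title VARCHAR(255) NOT NULL
        ) DEFAULT CHARSET = utf8mb4
    """)
db.commit()
db.close()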
import os
import random
import time
import pymysql
import requests
from bs4 import BeautifulSoup
host = 'www.netbian.com'
Referer = 'http://www.netbian.com/dongman/index_2.htm'
# Two header profiles; pa() picks one at random per request.
headers = [
    {
        'Referer': Referer,
        'Host': host,
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    },
    {
        'Referer': Referer,
        'Host': host,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    },
]
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='taici',
    charset='utf8mb4',
)
def download_img(img_url):
    h = {
        'Referer': Referer,
        'Host': 'img.netbian.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    }
    r = requests.get(img_url, headers=h, stream=True)
    print(img_url)
    print(r.status_code)  # HTTP status code
    if r.status_code == 200:
        path = './img/%s' % getUrlFileName(img_url)
        # skip images that are already on disk
        if not os.path.exists(path):
            with open(path, 'wb') as f:
                f.write(r.content)  # write the image bytes to disk
            print("done")
    del r
def getUrlFileName(url):
    # keep only the file-name part of the URL
    return os.path.basename(url)
def inserSql(type, small, big, title):
    global db
    cursor = db.cursor()
    # parameterized query: the driver escapes values, avoiding SQL injection
    sql = "INSERT INTO bizhi(type, small, big, title) VALUES (%s, %s, %s, %s)"
    try:
        cursor.execute(sql, (type, small, big, title))
    except Exception as e:
        print("insert failed:", e)
        db.rollback()
    else:
        db.commit()
def pa(url):
    # fetch a page with a randomly chosen header set and return parsed HTML
    ra = random.randint(0, 1)
    s = requests.session()
    h = headers[ra]
    response = s.request('get', url, headers=h).content
    html = BeautifulSoup(response, 'html.parser')
    return html
def main():
    os.makedirs('./img', exist_ok=True)  # make sure the download folder exists
    page = 141
    # starts from page 2; set the range start to 1 to include the first page
    for index in range(2, page + 1):
        time.sleep(3)  # throttle the list-page requests
        if index != 1:
            url = 'http://www.netbian.com/dongman/index_%d.htm' % index
        else:
            url = 'http://www.netbian.com/dongman/index.htm'
        print(url)
        html = pa(url)
        ce = html.find("div", {"class": "list"})
        table = ce.find("ul")
        liArr = table.findAll("li")
        # walk every thumbnail on the current page
        for lis in liArr:
            time.sleep(1)
            # the "nextpage" <li> holds the pager link, not a wallpaper;
            # BeautifulSoup returns the class attribute as a list
            if "nextpage" in lis.get("class", []):
                break
            li = lis.find("img")
            small = li.attrs["src"]
            # download the thumbnail
            download_img(small)
            title = li.attrs["alt"]
            href = lis.find("a").attrs["href"]
            if href[0] == '/':
                href = "http://www.netbian.com" + href
            # open the detail page and grab the full-size image
            imgHref = pa(href)
            big = imgHref.find("div", {"class": "pic"}).find("img").attrs["src"]
            download_img(big)
            inserSql('动漫', getUrlFileName(small), getUrlFileName(big), title)  # '动漫' = anime category
        print("OK", index)
if __name__ == '__main__':
    main()
    db.close()
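After a run (or during one), a quick way to spot-check what has been saved; a minimal sketch that reuses the same connection settings as above:

import pymysql

# Sketch: print the five most recently inserted wallpapers.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     db='taici', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute("SELECT id, type, small, big, title FROM bizhi "
                   "ORDER BY id DESC LIMIT 5")
    for row in cursor.fetchall():
        print(row)
db.close()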