Python: crawling wallpapers from bing.ioliu.cn
This post walks through a small asyncio + aiohttp script that crawls the Bing wallpaper mirror at bing.ioliu.cn page by page and downloads the 1920x1080 rendition of every wallpaper. Besides the standard library, it relies on the third-party aiohttp and lxml packages.
import re
import os
import aiohttp
import asyncio
import time
from lxml import etree
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
exception_list = []  # download errors are collected here and reported at the end

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            return await response.text()

async def get_html_url_list():
    url_base = "https://bing.ioliu.cn/?p={}"
    html = await fetch('https://bing.ioliu.cn/?p=1')
    selector = etree.HTML(html)
    # the pager <span> on the first page holds text like "1 / 126"; the
    # class-based XPath locates it and text() pulls out that string
    content = selector.xpath('/html/body/div[@class="page"]/span/text()')
    if content:
        page = int(str(content[0]).split('/')[-1])
    else:
        page = 70  # fall back to a fixed page count if the pager is not found
    return (url_base.format(i) for i in range(1, page + 1))

async def get_image_and_save(session, url):
    # 'YOU_PATH' is a placeholder from the original post; replace it with your download directory
    filename = os.path.join('YOU_PATH', url.split('/')[-1])
    try:
        async with session.get(url, headers=headers) as resp:
            with open(filename, 'wb') as fd:
                # stream the image to disk in 1 KB chunks
                while True:
                    chunk = await resp.content.read(1024)
                    if not chunk:
                        break
                    fd.write(chunk)
    except Exception as e:
        exception_list.append(e)

async def get_all_images(image_url_list):
    # a semaphore caps the number of downloads in flight; the original awaited
    # each download inside the semaphore one at a time, which made them sequential
    sem = asyncio.Semaphore(50)
    conn = aiohttp.TCPConnector(ssl=False)  # verify_ssl= is deprecated in aiohttp 3.x
    async with aiohttp.ClientSession(connector=conn) as session:
        async def bounded_download(url):
            async with sem:
                await get_image_and_save(session, url)
        await asyncio.gather(*(bounded_download(url) for url in image_url_list))

def get_image_url_list(html_content_list):
    # each wallpaper's URL appears several times in the page markup, so the
    # [1::3] slice keeps one copy per image
    regex = r"http://h1\.ioliu\.cn.*?\.jpg"
    image_url_list = []
    for html in html_content_list:
        for match in re.finditer(regex, html):
            image_url_list.append(match.group(0))
    image_url_list = image_url_list[1::3]
    # the URLs end in a size suffix; swap it for the 1920x1080 rendition
    return ['_'.join(url.split('_')[:-1]) + '_1920x1080.jpg' for url in image_url_list]

def main():
    start_time = time.time()
    loop = asyncio.get_event_loop()
    url_list = loop.run_until_complete(get_html_url_list())
    html_content_tasks = [fetch(url) for url in url_list]
    html_content_list = loop.run_until_complete(asyncio.gather(*html_content_tasks))
    image_url_list = get_image_url_list(html_content_list)
    loop.run_until_complete(get_all_images(image_url_list))
    cost_time = time.time() - start_time
    print('cost time:', round(cost_time, 2), 'seconds')
    for e in exception_list:
        print(e)
    print("length: {}".format(len(exception_list)))
    loop.close()

if __name__ == '__main__':
    main()
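
On Python 3.7+ the same pipeline can be driven with asyncio.run(), which creates and closes the event loop itself, instead of the get_event_loop() / run_until_complete() / close() sequence above. A minimal sketch, reusing the functions defined earlier (the main_async name is my own):

async def main_async():
    # fetch every gallery page concurrently, then download all the images
    url_list = await get_html_url_list()
    html_content_list = await asyncio.gather(*(fetch(url) for url in url_list))
    image_url_list = get_image_url_list(html_content_list)
    await get_all_images(image_url_list)

asyncio.run(main_async())

Note that the semaphore lives inside get_all_images, so the page fetches here are still unbounded; if the site rate-limits you, the same Semaphore pattern can be applied around fetch() as well.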