利用协程多任务协程爬取前几页投诉网
Posted kingofcattle
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了利用协程多任务协程爬取前几页投诉网相关的知识,希望对你有一定的参考价值。
import asyncio
import aiohttp
from lxml import etree
# Target pages: the first three listing pages of the complaint board on
# wz.sun0769.com.  The scraped original used curly quotes (‘...‘) as string
# delimiters, which is a SyntaxError in Python; restored to plain ASCII
# quotes and generalized to a comprehension over the page number.
urls = [
    f'http://wz.sun0769.com/political/index/politicsNewest?id=1&page={page}'
    for page in range(1, 4)
]
async def get_request(url):
    """Fetch *url* asynchronously and return the response body as text.

    Every ``with`` that manages an async resource becomes ``async with``,
    and every blocking operation is ``await``-ed, so this coroutine yields
    control to the event loop while network I/O is in flight.

    Parameters
    ----------
    url : str
        Page URL to download.

    Returns
    -------
    str
        The decoded HTML text of the response.
    """
    async with aiohttp.ClientSession() as sess:
        # BUG FIX: sess.get(url) already returns an async context manager;
        # the original's extra ``await`` before it was redundant.
        async with sess.get(url) as response:
            page_text = await response.text()
            return page_text
def parse(task):
    """Done-callback: extract and print complaint rows from a finished task.

    Parameters
    ----------
    task : asyncio.Task
        A finished task whose ``result()`` is the HTML text of one listing
        page (as returned by ``get_request``).
    """
    page_text = task.result()  # HTML the bound coroutine returned
    # BUG FIX: lxml exposes the HTML parser as etree.HTML (uppercase);
    # ``etree.html`` raises AttributeError.  The curly quotes from the
    # scraped original are also restored to valid ASCII quotes.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('/html/body/div[2]/div[3]/ul[2]/li')
    for li in li_list:
        # Column meanings below are inferred from the site layout — TODO
        # confirm against a live page.
        no = li.xpath('./span[1]//text()')[0]      # complaint number
        status = li.xpath('./span[2]//text()')[0]  # processing status
        title = li.xpath('./span[3]//text()')[0]   # complaint title
        when = li.xpath('./span[5]//text()')[0]    # submission time
        print(no, status, title, when)
if __name__ == '__main__':
    # NOTE: curly quotes around __main__ in the scraped original were a
    # SyntaxError; restored to ASCII quotes.
    tasks = []
    for url in urls:
        coro = get_request(url)             # calling the coroutine function builds a coroutine object
        task = asyncio.ensure_future(coro)  # wrap it into a schedulable Task
        task.add_done_callback(parse)       # parse() fires when the fetch completes
        tasks.append(task)
    # NOTE(review): creating tasks before the loop exists relies on the
    # implicit-loop behavior deprecated in newer Python; on 3.10+ prefer an
    # async main() driven by asyncio.run().
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))  # run all fetches concurrently
以上是关于利用协程多任务协程爬取前几页投诉网的主要内容,如果未能解决你的问题,请参考以下文章