爬取知乎话题async使用协程
Posted by erick-l
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取知乎话题async使用协程相关的知识,希望对你有一定的参考价值。
import asyncio
import json
import multiprocessing
import time
from collections import OrderedDict
from functools import partial

import pandas as pd
import requests
from pyquery import PyQuery

# Scrapes the answers of a Zhihu topic feed and writes them to an Excel file.
# The feed is paginated: page URLs are collected sequentially (each page names
# the next one), then the pages are downloaded concurrently by running the
# blocking ``requests.get`` calls in the event loop's default executor.

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT = 10

init_url = 'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10'
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (Khtml, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    # Placeholder value: supply a real session cookie before running.
    'Cookie': '**',
    'Referer': 'https://www.zhihu.com/topic/19606409/hot',
    'Host': 'www.zhihu.com',
    'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA=',
}

# Shared accumulators filled by get_all_url() and get_all_data().
url_list = []
data_list = []


def get_all_url(url):
    """Collect the URL of every page of the feed, starting at ``url``.

    Each page is fetched once to read its ``paging`` record; the page's own
    URL is appended to the module-level ``url_list`` and the ``next`` link is
    followed until the API reports ``is_end``.

    Args:
        url: URL of the first feed page to visit.

    Returns:
        The module-level ``url_list`` (always, not only on the last page).

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        KeyError: if the response lacks the expected ``paging`` fields.
    """
    # Iterative rather than recursive so long feeds cannot hit the recursion
    # limit; ``seen`` stops a feed whose ``next`` link loops back on itself.
    seen = set(url_list)
    while url and url not in seen:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        paging = json.loads(res.text)['paging']
        seen.add(url)
        url_list.append(url)
        print(len(url_list))
        if paging['is_end']:
            break
        url = paging['next']
    return url_list


async def get_all_data(url):
    """Download one feed page and append its answers to ``data_list``.

    Only feed items whose target type is ``'answer'`` are kept. For each one
    the question title, the answer text (HTML stripped via PyQuery), the
    comment count and the vote-up count are stored in an ``OrderedDict``.

    Args:
        url: URL of the feed page to download.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        KeyError: if the response lacks the expected fields.
    """
    # Inside a coroutine this returns the loop that is running it, so the
    # function no longer depends on a module-level ``loop`` variable.
    loop = asyncio.get_event_loop()
    fetch = partial(requests.get, url, headers=headers, timeout=REQUEST_TIMEOUT)
    # requests is blocking; run it in the default executor so pages overlap.
    res = await loop.run_in_executor(None, fetch)
    res.raise_for_status()
    data = json.loads(res.text)
    print(len(data_list))
    for item in data['data']:
        target = item['target']
        if target['type'] != 'answer':
            continue
        final_data = OrderedDict()
        final_data['title'] = target['question']['title'] or ''
        try:
            final_data['content'] = PyQuery(target['content']).text()
        except Exception:
            # Best-effort fallback kept from the original: when the full
            # content is missing or cannot be parsed, use the excerpt instead.
            final_data['content'] = PyQuery(target['excerpt']).text()
        final_data['comment_count'] = target['comment_count']
        final_data['voteup_count'] = target['voteup_count']
        data_list.append(final_data)


async def _scrape_all(urls):
    """Run get_all_data() for every URL concurrently and report failed pages."""
    results = await asyncio.gather(
        *(get_all_data(page_url) for page_url in urls),
        return_exceptions=True
    )
    # One failing page must not discard the pages that succeeded, but it
    # should not disappear silently either.
    for page_url, result in zip(urls, results):
        if isinstance(result, Exception):
            print('failed: {} ({!r})'.format(page_url, result))


def main():
    """Collect the page URLs, scrape them concurrently, export to Excel."""
    get_all_url(init_url)
    # Create the loop explicitly instead of relying on an implicit one.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_scrape_all(list(url_list)))
    finally:
        loop.close()
    df1 = pd.DataFrame(data_list)
    df1.to_excel('保险' + time.strftime("%Y%m%d%H%M%S") + '.xlsx', index=False)
    print('done')


if __name__ == '__main__':
    main()
以上是关于爬取知乎话题async使用协程的主要内容,如果未能解决你的问题,请参考以下文章: