爬取知乎话题async使用协程

Posted erick-l

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取知乎话题async使用协程相关的知识,希望对你有一定的参考价值。

import requests
import json
import time
from pyquery import PyQuery
import pandas as pd
from collections import OrderedDict
import multiprocessing
import asyncio
from functools import partial
# cookies = input('Please enter Cookie: ')
# url = input('Please enter url: ')

# Entry point of the Zhihu topic feed API; offset/limit control pagination.
init_url = "https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10"

# Request headers mimicking a mobile Safari browser; Cookie is a placeholder
# that must be filled in with a real logged-in session cookie.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (Khtml, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Cookie": "**",  # TODO: replace with a valid Zhihu session cookie
    "Referer": "https://www.zhihu.com/topic/19606409/hot",
    "Host": "www.zhihu.com",
    "X-UDID": "AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA=",
}

def get_all_url(url):
    """Walk the feed pagination starting at *url*, collecting page URLs.

    Each page's ``paging.next`` URL is appended to the module-level
    ``url_list`` until the API reports ``paging.is_end``.

    Returns the accumulated ``url_list``.

    Note: the original recursive version dropped the return value of the
    recursive call (returning None for multi-page topics) and risked
    hitting the recursion limit; this iterative form fixes both.
    """
    while True:
        res = requests.get(url, headers=headers)
        data = json.loads(res.text)
        next_page_url = data["paging"]["next"]
        url_list.append(next_page_url)
        print(len(url_list))  # progress indicator
        if data["paging"]["is_end"]:
            return url_list
        url = next_page_url



async def get_all_data(url):
    """Fetch one feed page and append its answer records to ``data_list``.

    The blocking ``requests.get`` call is dispatched to the default executor
    so the event loop stays responsive. Only items whose target type is
    ``"answer"`` are kept; each becomes an OrderedDict with title, content,
    comment_count and voteup_count.
    """
    # Run the blocking HTTP request off the event loop thread.
    future = loop.run_in_executor(None, partial(requests.get, url, headers=headers))
    res = await future
    data = json.loads(res.text)
    res_data = data["data"]
    print(len(data_list))  # progress indicator
    for item in res_data:
        final_data = OrderedDict()
        # Renamed from `type` to avoid shadowing the builtin.
        target_type = item["target"]["type"]
        if target_type == "answer":
            final_data["title"] = item["target"]["question"]["title"] or ""
            try:
                final_data["content"] = PyQuery(item["target"]["content"]).text()
            except Exception:
                # Fall back to the excerpt when full content is missing
                # or cannot be parsed.
                final_data["content"] = PyQuery(item["target"]["excerpt"]).text()
            final_data["comment_count"] = item["target"]["comment_count"]
            final_data["voteup_count"] = item["target"]["voteup_count"]
            data_list.append(final_data)

if __name__ == "__main__":
    # Shared mutable state used by get_all_url / get_all_data.
    data_list = []
    url_list = []

    # Collect every page URL synchronously first.
    get_all_url(init_url)

    # Create the loop before building tasks: get_all_data references the
    # module-level `loop` for run_in_executor.
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(get_all_data(page_url)) for page_url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

    # Dump the scraped answers to a timestamped Excel file.
    df1 = pd.DataFrame(data_list)
    df1.to_excel("保险" + time.strftime("%Y%m%d%H%M%S") + ".xlsx", index=False)
    print("done")

 

以上是关于爬取知乎话题async使用协程的主要内容,如果未能解决你的问题,请参考以下文章

爬取知乎热度搜索标题并数据分析及可视化

python爬取知乎首页问题

爬取知乎热搜

Python 爬取知乎用户属性生成词语

常用数据存储的介绍和使用

使用scrapy爬取知乎图片