头条抓取

Posted 2022-05-08 victorstudy

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了头条抓取相关的知识，希望对你有一定的参考价值。

import requests
from lxml import etree
import json
import time


class Toutiao(object):
    """Download Toutiao feed JSON and append item titles/abstracts to text files.

    Inputs are read from files relative to the current working directory:
    ``./key_word.txt`` (keywords) and ``./url.txt`` (one feed URL per line).
    Outputs are appended to ``../bt/bt.txt`` (titles) and ``../bt/nr.txt``
    (abstracts).
    """

    def __init__(self):
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_key_word(self):
        """Return the lines of ``./key_word.txt`` as a list.

        Line terminators are kept, exactly as ``readlines()`` yields them.

        Raises:
            OSError: if the file cannot be opened.
        """
        with open('./key_word.txt', 'r') as f:
            data = f.readlines()
        return data

    # NOTE: disabled URL generator kept for reference only. As written the
    # template contains no ``{}`` placeholder, so ``format(da)`` is a no-op.
    # def get_url(self, data):
    #     for da in data:
    #         url_list = 'https://www.toutiao.com/api/pc/feed/?category=&utm_source=toutiao''&widen=1&max_behot_time=0&max_behot_time_tmp=0''&tadrequire=true&as=A1256D1346DF1B7&cp=5D360F210BD7CE1''&_signature=NQO3JgAAaDs8T80zj26hTjUDtz'.format(
    #             da)
    #         for url in url_list:
    #             with open('./url.txt', 'a') as f:
    #                 f.write(url)

    def get_content(self):
        """Fetch every URL listed in ``./url.txt`` and store titles and abstracts.

        Each response body is decoded as UTF-8 JSON; for every entry of its
        ``"data"`` list the ``title`` is appended to ``../bt/bt.txt`` and the
        ``abstract`` to ``../bt/nr.txt``, one per line.

        Failures for a single URL (network error, invalid JSON, missing key,
        unwritable output file) are printed and the crawl moves on to the
        next URL.

        Raises:
            OSError: if ``./url.txt`` cannot be opened.
        """
        with open('./url.txt', 'r') as f:
            # readlines() would keep the trailing newline, which then ends up
            # inside the requested URL; strip it and drop blank lines.
            url_list = [line.strip() for line in f if line.strip()]

        for url in url_list:
            try:
                # A timeout keeps one stalled connection from hanging the crawl.
                response = requests.get(url=url, headers=self.headers, timeout=10)
                time.sleep(1)  # throttle: pause after each successful request
                payload = json.loads(response.content.decode('utf-8'))
                items = payload["data"]

                # Open each output file once per response, not once per item.
                with open('../bt/bt.txt', 'a+', encoding='utf-8') as titles, \
                        open('../bt/nr.txt', 'a+', encoding='utf-8') as abstracts:
                    for item in items:
                        titles.write(item['title'] + '\n')
                        abstracts.write(item['abstract'] + '\n')
            except Exception as e:
                # Deliberate best-effort: report the problem and continue.
                print(e)


if __name__ == '__main__':
    # Script entry point: load the keywords, then crawl the URLs in ./url.txt.
    toutiao = Toutiao()
    # `data` is only consumed by the disabled get_url() step below.
    data = toutiao.get_key_word()
    # toutiao.get_url(data)
    toutiao.get_content()

以上是关于头条抓取的主要内容，如果未能解决你的问题，请参考以下文章。