贴吧爬取
Posted by erick-l
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了贴吧爬取相关的知识,希望对你有一定的参考价值。
# coding=utf-8
#
# Baidu Tieba crawler.
#
# Step 1: walk the mobile thread-list pages of one forum and collect the URL
#         of every thread that appears in them (gen_all_urls).
# Step 2: open each thread and collect the text of its reply elements
#         (get_single).
# Step 3: dump everything collected into a timestamped .xlsx file.
import re
import time

import pandas as pd
import requests
from requests_html import HTMLSession

session = HTMLSession()

# Seconds to wait for a server response before giving up, so that one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

# NOTE(review): the Cookie below embeds what looks like a logged-in session
# (BDUSS / STOKEN). Secrets should not live in source control -- consider
# loading it from an environment variable or a local config file instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Cookie': 'BAIDUID=0AD95F29B28B1C69CF12212918D35FC5:FG=1; BDUSS=xRTTRqU2poYXJxZmx5bTF0dm5iVERtdWRnTC1hbDJIbnltcGlOcmtuejk1VDViQVFBQUFBJCQAAAAAAAAAAAEAAAC4ED841cW4o8H6MjAxM8zs0KsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP1YF1v9WBdbV0; BIDUPSID=0AD95F29B28B1C69CF12212918D35FC5; PSTM=1528257025; TIEBAUID=eaa5821fe8cd6332e9f74ebe; TIEBA_USERTYPE=4fe0d47f0a8a56b9153531e1; bdshare_firstime=1529484152117; STOKEN=fb86f516529f2e700875d976398014ccffa45fc25536938272acb3cef065221a; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; UM_distinctid=1651dfa3911746-0d266b95e90a93-163f6952-13c680-1651dfa391242f; Hm_lvt_addc40d255fca71b9b06a07c2397b42a=1533006153,1533094604,1533611406,1533637141; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=1421_21080_26921_20927; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1533711844,1533711977,1533781285,1533868202; 943657144_FRSVideoUploadTip=1; mo_originid=2; IS_NEW_USER=121622ee0999d777aa2e3fa8; BAIDU_WISE_UID=wapp_1533868860558_698; CLIENTWIDTH=375; CLIENTHEIGHT=667; LASW=375; fixedbarautopop=1; recommend_item_click=0; wise_device=1; pb_prompt=1; SET_PB_IMAGE_WIDTH=355; SEENKW=%E6%89%AB%E7%A0%81%23%C9%A8%C2%EB; CNZZDATA1272960286=201730737-1529483780-null%7C1533869631; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1533870061',
}

# Thread-list endpoint; the trailing ``pn={}`` is filled with the paging offset.
# NOTE(review): the template also carries a fixed ``pn=0`` earlier in the query
# string -- presumably the server honours the last one; confirm.
url_first = 'https://tieba.baidu.com/mo/q/m?kw=%E6%89%AB%E7%A0%81&pn=0&lp=5024&forum_recommend=1&lm=0&cid=0&has_url_param=0&pn={}&is_ajax=1'

# Offsets 50, 100, ..., 3550.
# NOTE(review): offset 0 (the first list page) is never requested -- confirm
# whether that is intentional.
all_first_urls = [url_first.format(50 * i) for i in range(1, 72)]

# Absolute thread URLs accumulated by gen_all_urls().
all_fina_url = []

# Rows (one dict per reply) accumulated by get_single().
all_fina_data = []

# Relative thread links look like ``/p/5819837590?lp=5027&mo_device=1&...``.
# ``\d+`` matches the thread id and ``\?`` the literal question mark; without
# the backslashes the pattern can never match a real link.
_POST_LINK_RE = re.compile(r'href="(/p/\d+\?lp=5027&mo_device=1&is_jingpost=0)"')


def gen_all_urls(url):
    """Fetch one thread-list page and record the thread URLs found in it.

    The response is parsed as JSON and the HTML fragment stored under
    ``data.content`` is scanned for thread links. Every match is turned into
    an absolute URL and appended to the module-level ``all_fina_url`` list.

    Args:
        url: Thread-list URL, e.g. one of ``all_first_urls``.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the response body is not valid JSON.
        KeyError: if the JSON lacks ``data`` / ``content``.
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    content = res.json()['data']['content']
    relative_links = _POST_LINK_RE.findall(content)
    all_fina_url.extend('https://tieba.baidu.com' + link for link in relative_links)


def get_single(url):
    """Fetch one thread page and record the text of each reply element.

    Every element matching ``div[lz="0"]`` contributes one row, stored as a
    fresh ``{'回复': text}`` dict appended to the module-level
    ``all_fina_data`` list. A new dict per reply is required: reusing a single
    dict would make every row of a thread alias the same object and show only
    the last reply's text.

    Args:
        url: Absolute thread URL, e.g. one collected by ``gen_all_urls``.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    r = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    for reply in r.html.find('div[lz="0"]'):
        all_fina_data.append({'回复': reply.text})


def main():
    """Run the full crawl and write the collected replies to an Excel file."""
    for first_url in all_first_urls:
        gen_all_urls(first_url)
    for fina_url in all_fina_url:
        get_single(fina_url)
    df1 = pd.DataFrame(all_fina_data)
    # The file name carries a minute-resolution timestamp, e.g. ...201808101530.xlsx
    df1.to_excel('扫码贴吧信息' + time.strftime("%Y%m%d%H%M") + '.xlsx', index=False)
    print('done')


if __name__ == '__main__':
    main()
以上是关于贴吧爬取的主要内容,如果未能解决你的问题,请参考以下文章: