使用mitmproxy做今日头条爬虫链接分析
Posted by proceduremonkey
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用mitmproxy做今日头条爬虫链接分析相关的知识,希望对你有一定的参考价值。
"""mitmproxy script for capturing request URLs from the Toutiao (今日头条) app.

Run with:  mitmdump -s this_script.py

Every intercepted request URL is logged via mitmproxy's ctx logger and
appended to ``other.txt`` for later offline analysis.
"""
from mitmproxy import ctx

# Output files used by earlier iterations of this script (header dump /
# response-body dump).  Kept for reference; only other.txt is written today.
heads_file = "header.txt"
body_file = "body.txt"

# Example mobile User-Agent seen in captures:
# Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/8.8.31)


def request(flow):
    """mitmproxy request hook: log the URL and append it to other.txt.

    Args:
        flow: the mitmproxy HTTPFlow for the intercepted request.
    """
    url = str(flow.request.url)
    ctx.log.info("url:" + url)
    # NOTE(review): the original appended ``url + " "`` — a space separator,
    # possibly a newline collapsed by the page scrape.  Preserved as-is;
    # confirm the intended record delimiter before changing it.
    with open("other.txt", encoding="utf-8", mode="a") as fileother:
        fileother.write(url + " ")
以上是关于使用mitmproxy做今日头条爬虫链接分析的主要内容,如果未能解决你的问题,请参考以下文章
Python3网络爬虫开发实战 分析Ajax爬取今日头条街拍美图