Ajax数据爬取
Posted 糕事情
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Ajax数据爬取相关的知识,希望对你有一定的参考价值。
Ajax 即“Asynchronous JavaScript And XML”（异步 JavaScript 和 XML），是指一种创建交互式、快速动态网页应用的网页开发技术，无需重新加载整个网页的情况下，能够更新部分网页的技术。
通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新。
1. 爬取微博页面Ajax数据
"""Crawl Weibo comment data loaded via Ajax and store it in MongoDB.

Fetches pages from Weibo's comment Ajax endpoint, parses each comment's
author, text and timestamp out of the HTML fragment embedded in the JSON
response, and persists the records to the ``weibo.weibo`` collection.
"""
import json
from urllib.parse import urlencode

import pymongo
import requests
from pyquery import PyQuery as pq


def get_ajax_page(page):
    """Request one page of the Weibo comment Ajax endpoint.

    :param page: page number (str or int), sent as the ``page`` query param.
    :return: decoded JSON dict on HTTP 200, otherwise ``None``.
    """
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        # NOTE(review): original UA said "Khtml"; corrected to the standard
        # "KHTML" token so the header matches a real browser UA.
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'),
        'X-Requested-With': 'XMLHttpRequest',
        # Session cookie copied from a logged-in browser; it expires over time
        # and must be refreshed for the endpoint to keep answering.
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567',
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': '6',
        'id': '4483557667874538',
        'root_comment_max_id_type': '0',
        'page': page,
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)
    # Explicit: non-200 response or network failure.
    return None


def parse_page(js):
    """Yield ``{'name', 'content', 'datetime'}`` dicts from one Ajax response.

    Returns without yielding when *js* is ``None`` or lacks the expected
    ``data``/``html`` keys (``get_ajax_page`` returns ``None`` on failure,
    which previously crashed this function with AttributeError).
    """
    if not js:
        return
    html = (js.get('data') or {}).get('html')
    if not html:
        return
    doc = pq(html)
    for item in doc('div.list_con').items():
        text = item('.WB_text').text()
        # Split only on the FIRST colon: the original split(':')[1] truncated
        # comments containing colons and raised IndexError when none existed.
        name, _, content = text.partition(':')
        yield {
            'name': name,
            'content': content,
            'datetime': item('div.WB_from.S_txt2').text(),
        }


def collection_mongo(host='localhost', port=27017):
    """Return a MongoClient connected to ``host:port``."""
    return pymongo.MongoClient(host=host, port=port)


def save_mongo(client, data):
    """Insert one comment record into the ``weibo.weibo`` collection."""
    collection = client.weibo.weibo
    # insert_one replaces the deprecated Collection.insert (removed in PyMongo 4).
    if collection.insert_one(data).acknowledged:
        print('Save to mongo')


def search_mongo(client):
    """Return a cursor over every document in ``weibo.weibo``."""
    return client.weibo.weibo.find()


def main():
    """Crawl comment pages 1-10 and persist every parsed record."""
    # One client for the whole run; the original reconnected per record.
    client = collection_mongo('10.0.0.100')
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for record in parse_page(js):
            save_mongo(client, record)


if __name__ == '__main__':
    # main()
    client = collection_mongo('10.0.0.100')
    for document in search_mongo(client):
        print(document)
2. Ajax爬取头条街拍图片（注：下方示例代码与第 1 节完全相同，实际仍为微博评论的 Ajax 爬取代码，并非头条街拍图片的爬取实现）
"""Crawl Weibo comment data loaded via Ajax and store it in MongoDB.

NOTE(review): despite the surrounding heading ("Toutiao street-photo
crawling"), this script is a byte-for-byte duplicate of section 1's Weibo
comment crawler; it is fixed here identically for consistency.
"""
import json
from urllib.parse import urlencode

import pymongo
import requests
from pyquery import PyQuery as pq


def get_ajax_page(page):
    """Request one page of the Weibo comment Ajax endpoint.

    :param page: page number (str or int), sent as the ``page`` query param.
    :return: decoded JSON dict on HTTP 200, otherwise ``None``.
    """
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'),
        'X-Requested-With': 'XMLHttpRequest',
        # Session cookie copied from a logged-in browser; it expires over time
        # and must be refreshed for the endpoint to keep answering.
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567',
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': '6',
        'id': '4483557667874538',
        'root_comment_max_id_type': '0',
        'page': page,
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)
    # Explicit: non-200 response or network failure.
    return None


def parse_page(js):
    """Yield ``{'name', 'content', 'datetime'}`` dicts from one Ajax response.

    Returns without yielding when *js* is ``None`` or lacks the expected
    ``data``/``html`` keys (``get_ajax_page`` returns ``None`` on failure,
    which previously crashed this function with AttributeError).
    """
    if not js:
        return
    html = (js.get('data') or {}).get('html')
    if not html:
        return
    doc = pq(html)
    for item in doc('div.list_con').items():
        text = item('.WB_text').text()
        # Split only on the FIRST colon: the original split(':')[1] truncated
        # comments containing colons and raised IndexError when none existed.
        name, _, content = text.partition(':')
        yield {
            'name': name,
            'content': content,
            'datetime': item('div.WB_from.S_txt2').text(),
        }


def collection_mongo(host='localhost', port=27017):
    """Return a MongoClient connected to ``host:port``."""
    return pymongo.MongoClient(host=host, port=port)


def save_mongo(client, data):
    """Insert one comment record into the ``weibo.weibo`` collection."""
    collection = client.weibo.weibo
    # insert_one replaces the deprecated Collection.insert (removed in PyMongo 4).
    if collection.insert_one(data).acknowledged:
        print('Save to mongo')


def search_mongo(client):
    """Return a cursor over every document in ``weibo.weibo``."""
    return client.weibo.weibo.find()


def main():
    """Crawl comment pages 1-10 and persist every parsed record."""
    # One client for the whole run; the original reconnected per record.
    client = collection_mongo('10.0.0.100')
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for record in parse_page(js):
            save_mongo(client, record)


if __name__ == '__main__':
    # main()
    client = collection_mongo('10.0.0.100')
    for document in search_mongo(client):
        print(document)
以上是关于Ajax数据爬取的主要内容,如果未能解决你的问题,请参考以下文章