Ajax数据爬取

Posted 糕事情

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Ajax数据爬取相关的知识,希望对你有一定的参考价值。

 

 

    Ajax 即“Asynchronous JavaScript and XML”(异步 JavaScript 和 XML),是指一种创建交互式、快速动态网页应用的网页开发技术,无需重新加载整个网页的情况下,能够更新部分网页的技术。

 

    通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新。

 

 

 

1. 爬取微博页面Ajax数据

技术图片
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import json, pymongo

def get_ajax_page(page):
    """Fetch one page of Weibo comments from the Ajax endpoint.

    :param page: page number (str or int), forwarded as the ``page`` query param.
    :return: the decoded JSON body (dict) on HTTP 200, otherwise None.
    """
    # NOTE: quotes were stripped from this snippet when it was pasted into the
    # article; all string literals below are restored from the visible tokens.
    headers = {
        "Host": "weibo.com",
        "Referer": "https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",  # marks the request as Ajax
        # Session cookie copied from a logged-in browser session; expires over time.
        "Cookie": "SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567",
    }
    url = "https://weibo.com/aj/v6/comment/big"
    params = {
        "ajwvr": 6,
        "id": 4483557667874538,  # the weibo post whose comments are crawled
        "root_comment_max_id_type": 0,
        "page": page,
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print("error", e.args)


def parse_page(js):
    """Yield one comment dict per entry in the Ajax JSON payload.

    :param js: dict returned by the comment endpoint; ``data.html`` holds
        an HTML fragment with one ``div.list_con`` node per comment.
    :yield: ``{"name": ..., "content": ..., "datetime": ...}`` per comment.
    """
    data = js.get("data")
    html = data.get("html")
    doc = pq(html)
    for item in doc("div.list_con").items():
        # .WB_text is "author comment-text"; split once instead of twice.
        parts = item(".WB_text").text().split()
        msg = {
            "name": parts[0],
            "content": parts[1],
            "datetime": item("div.WB_from.S_txt2").text(),
        }
        yield msg

def collection_mongo(host="localhost", port=27017):
    """Return a :class:`pymongo.MongoClient` connected to ``host:port``."""
    client = pymongo.MongoClient(host=host, port=port)

    return client


def save_mongo(client, data):
    """Insert one comment document into the ``weibo.weibo`` collection.

    :param client: an open MongoClient.
    :param data: the comment dict produced by ``parse_page``.
    """
    db = client.weibo
    collection = db.weibo

    # Collection.insert() was removed in pymongo 3.x; insert_one is the
    # supported API and its result object is truthy on success.
    if collection.insert_one(data):
        print("Save to mongo")

def search_mongo(client):
    # Return a cursor over every document in the weibo.weibo collection.
    db = client.weibo
    collection = db.weibo
    result = collection.find()
    return result

def main():
    """Crawl comment pages 1-10 and persist every parsed comment to MongoDB."""
    # One shared connection: the original opened a new MongoClient per comment.
    client = collection_mongo("10.0.0.100")
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for result in parse_page(js):
            save_mongo(client, result)

if __name__ == "__main__":
    # main()  # uncomment to crawl and store; below only dumps stored data
    client = collection_mongo("10.0.0.100")
    data = search_mongo(client)
    for item in data:
        print(item)
View Code

 

 

 

2. Ajax爬取头条街拍图片

技术图片
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import json, pymongo

def get_ajax_page(page):
    """Fetch one page of Weibo comments from the Ajax endpoint.

    :param page: page number (str or int), forwarded as the ``page`` query param.
    :return: the decoded JSON body (dict) on HTTP 200, otherwise None.
    """
    # NOTE: quotes were stripped from this snippet when it was pasted into the
    # article; all string literals below are restored from the visible tokens.
    headers = {
        "Host": "weibo.com",
        "Referer": "https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",  # marks the request as Ajax
        # Session cookie copied from a logged-in browser session; expires over time.
        "Cookie": "SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567",
    }
    url = "https://weibo.com/aj/v6/comment/big"
    params = {
        "ajwvr": 6,
        "id": 4483557667874538,  # the weibo post whose comments are crawled
        "root_comment_max_id_type": 0,
        "page": page,
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print("error", e.args)


def parse_page(js):
    """Yield one comment dict per entry in the Ajax JSON payload.

    :param js: dict returned by the comment endpoint; ``data.html`` holds
        an HTML fragment with one ``div.list_con`` node per comment.
    :yield: ``{"name": ..., "content": ..., "datetime": ...}`` per comment.
    """
    data = js.get("data")
    html = data.get("html")
    doc = pq(html)
    for item in doc("div.list_con").items():
        # .WB_text is "author comment-text"; split once instead of twice.
        parts = item(".WB_text").text().split()
        msg = {
            "name": parts[0],
            "content": parts[1],
            "datetime": item("div.WB_from.S_txt2").text(),
        }
        yield msg

def collection_mongo(host="localhost", port=27017):
    """Return a :class:`pymongo.MongoClient` connected to ``host:port``."""
    client = pymongo.MongoClient(host=host, port=port)

    return client


def save_mongo(client, data):
    """Insert one comment document into the ``weibo.weibo`` collection.

    :param client: an open MongoClient.
    :param data: the comment dict produced by ``parse_page``.
    """
    db = client.weibo
    collection = db.weibo

    # Collection.insert() was removed in pymongo 3.x; insert_one is the
    # supported API and its result object is truthy on success.
    if collection.insert_one(data):
        print("Save to mongo")

def search_mongo(client):
    # Return a cursor over every document in the weibo.weibo collection.
    db = client.weibo
    collection = db.weibo
    result = collection.find()
    return result

def main():
    """Crawl comment pages 1-10 and persist every parsed comment to MongoDB."""
    # One shared connection: the original opened a new MongoClient per comment.
    client = collection_mongo("10.0.0.100")
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for result in parse_page(js):
            save_mongo(client, result)

if __name__ == "__main__":
    # main()  # uncomment to crawl and store; below only dumps stored data
    client = collection_mongo("10.0.0.100")
    data = search_mongo(client)
    for item in data:
        print(item)
View Code

 

以上是关于Ajax数据爬取的主要内容,如果未能解决你的问题,请参考以下文章

以知乎为例教你如何爬取AJAX数据

分析Ajax爬取

如何用JAVA爬取AJAX加载后的页面

爬虫实例 利用Ajax爬取微博数据

Ajax数据爬取基本原理

Ajax数据爬取