TS爬虫，爬取博客园demo，全网最详细！

Posted 2022-03-04 邱天_henry

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了“TS爬虫，爬取博客园demo，全网最详细！”的相关知识，希望对你有一定的参考价值。

/**
 * Returns the data produced by the script node.
 *
 * The node may return a `DbTableInfo` directly (its `ddl` holds the query SQL
 * and its `table` holds the corresponding physical table name), or a
 * two-dimensional array of rows.
 *
 * Use cases:
 * 1. Call an API to process GIS data updates and expose the resulting
 *    physical table as a model.
 * 2. Call an API to query data, using dashboard or report filter conditions
 *    as parameters, and return the live query result.
 *
 * This implementation requests the first three pages of the cnblogs home-page
 * post list, parses each returned HTML fragment with jsoup, and returns one
 * row per post:
 * `[id, title, summary, authorUrl, authorId, authorName, createDate,
 *   diggCount, commentCount, viewCount]`.
 *
 * NOTE(review): the braces of this listing were lost in extraction; the
 * `post({ url, data, headers })` options-object shape is reconstructed from
 * the surviving property labels - confirm against the host `post` helper.
 *
 * @param context Script-node context supplied by the data-flow runtime (not read here).
 * @returns A two-dimensional array with one row per scraped post.
 */
function onProcessData(context: IDataFlowScriptNodeContext): DbTableInfo | any[][] {
  const Jsoup = Java.type("org.jsoup.Jsoup");
  const url = "https://www.cnblogs.com/";
  const data: any[][] = [];

  // Pages 1..3 of the aggregated home-page post list.
  for (let i = 1; i <= 3; i++) {
    const res: string = post({
      url: "https://www.cnblogs.com/AggSite/AggSitePostList",
      data: {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": i.toFixed(),
        "TotalPostCount": 4000,
        "ItemListActionName": "AggSitePostList"
      },
      headers: {
        "Content-Type": "application/json"
      }
    }).responseText;

    const document = Jsoup.parse(res);
    const elements = document.getElementsByClass("post-item");

    for (const element of elements) {
      const titleElement = element.getElementsByClass("post-item-title");
      const summaryElement = element.getElementsByClass("post-item-summary");
      const authorElement = element.select(".post-item-foot > .post-item-author");
      const createDateElement = element.select("section > footer > span.post-meta-item > span");
      const commentCountElement = element.select("section > footer > a:nth-child(4) > span");
      const viewCountElement = element.select("section > footer > a:nth-child(5) > span");

      // Post ID
      const id = element.attr("data-post-id");
      // Post title
      const title = titleElement.text();
      // Summary: only the element's own text, excluding child elements
      const summary = summaryElement.get(0).ownText();
      // Coerce to a script string so `.length` below is the string property
      // no matter how the engine hands back the Java String from jsoup.
      const authorUrl = String(authorElement.select("a").get(0).attr("href"));
      // Author ID: drop the leading site URL and the final character.
      // Presumably the href looks like "https://www.cnblogs.com/<id>/" - verify.
      // `length` is a property; the original called it as a method.
      const authorId = authorUrl.substring(url.length, authorUrl.length - 1);
      // Author display name
      const authorName = authorElement.select("span").get(0).text();
      // Creation time
      const createDate = createDateElement.text();
      // Digg (like) count
      const diggCount = element.getElementById("digg_count_" + id).text();
      // Comment count
      const commentCount = commentCountElement.text();
      // View count
      const viewCount = viewCountElement.text();

      print("---------------------------------------------")

      const row = [
        id,
        title,
        summary,
        authorUrl,
        authorId,
        authorName,
        createDate,
        diggCount,
        commentCount,
        viewCount
      ];
      print("row======" + row);
      data.push(row);
    }
  }

  return data;
}

========================================================================

通过调用 API 获取 cnode 数据

/**
 * Returns the field structure of the script node.
 *
 * Use cases:
 * 1. A script crawls data into the data warehouse and needs to produce a
 *    predefined field structure.
 * 2. A script parses JSON data and needs to pre-parse a few rows to derive
 *    the fields.
 *
 * This implementation declares no fields and returns an empty array.
 *
 * @param context Script-node context supplied by the data-flow runtime (not read here).
 * @returns The field definitions for the node; currently empty.
 */
function onProcessFields(context: IDataFlowScriptNodeContext): DbFieldInfo[] {
  return [
  ];
}

/**
 * Returns the data produced by the script node.
 *
 * The node may return a `DbTableInfo` directly (its `ddl` holds the query SQL
 * and its `table` holds the corresponding physical table name), or a
 * two-dimensional array of rows.
 *
 * Use cases:
 * 1. Call an API to process GIS data updates and expose the resulting
 *    physical table as a model.
 * 2. Call an API to query data, using dashboard or report filter conditions
 *    as parameters, and return the live query result.
 *
 * This implementation requests the first three pages of the cnblogs home-page
 * post list, parses each returned HTML fragment with jsoup, and returns one
 * row per post:
 * `[id, title, summary, authorUrl, authorId, authorName, createDate,
 *   diggCount, commentCount, viewCount]`.
 *
 * NOTE(review): the braces of this listing were lost in extraction; the
 * `post({ url, data, headers })` options-object shape is reconstructed from
 * the surviving property labels - confirm against the host `post` helper.
 *
 * @param context Script-node context supplied by the data-flow runtime (not read here).
 * @returns A two-dimensional array with one row per scraped post.
 */
function onProcessData(context: IDataFlowScriptNodeContext): DbTableInfo | any[][] {
  const Jsoup = Java.type("org.jsoup.Jsoup");
  const url = "https://www.cnblogs.com/";
  const data: any[][] = [];

  // Pages 1..3 of the aggregated home-page post list.
  for (let i = 1; i <= 3; i++) {
    const res: string = post({
      url: "https://www.cnblogs.com/AggSite/AggSitePostList",
      data: {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": i.toFixed(),
        "TotalPostCount": 4000,
        "ItemListActionName": "AggSitePostList"
      },
      headers: {
        "Content-Type": "application/json"
      }
    }).responseText;

    const document = Jsoup.parse(res);
    const elements = document.getElementsByClass("post-item");

    for (const element of elements) {
      const titleElement = element.getElementsByClass("post-item-title");
      const summaryElement = element.getElementsByClass("post-item-summary");
      const authorElement = element.select(".post-item-foot > .post-item-author");
      const createDateElement = element.select("section > footer > span.post-meta-item > span");
      const commentCountElement = element.select("section > footer > a:nth-child(4) > span");
      const viewCountElement = element.select("section > footer > a:nth-child(5) > span");

      // Post ID
      const id = element.attr("data-post-id");
      // Post title
      const title = titleElement.text();
      // Summary: only the element's own text, excluding child elements
      const summary = summaryElement.get(0).ownText();
      // Coerce to a script string so `.length` below is the string property
      // no matter how the engine hands back the Java String from jsoup.
      const authorUrl = String(authorElement.select("a").get(0).attr("href"));
      // Author ID: drop the leading site URL and the final character.
      // Presumably the href looks like "https://www.cnblogs.com/<id>/" - verify.
      // `length` is a property; the original called it as a method.
      const authorId = authorUrl.substring(url.length, authorUrl.length - 1);
      // Author display name
      const authorName = authorElement.select("span").get(0).text();
      // Creation time
      const createDate = createDateElement.text();
      // Digg (like) count
      const diggCount = element.getElementById("digg_count_" + id).text();
      // Comment count
      const commentCount = commentCountElement.text();
      // View count
      const viewCount = viewCountElement.text();

      print("---------------------------------------------")

      const row = [
        id,
        title,
        summary,
        authorUrl,
        authorId,
        authorName,
        createDate,
        diggCount,
        commentCount,
        viewCount
      ];
      print("row======" + row);
      data.push(row);
    }
  }

  return data;
}

以上是关于“TS爬虫，爬取博客园demo，全网最详细！”的主要内容，如果未能解决你的问题，请参考以下文章：