篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了微信公众号 过滤 typescript cheerio相关的知识,希望对你有一定的参考价值。
E:\公众号文章采集\fi_filter_过滤器\src\exact_新浪博客手机版提取连接.js
// Collects every hyperlink inside `#js_content` of each file under ./html and
// appends it, one URL per CRLF-terminated line, to ./urls.txt.
const fs = require('fs');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;

// Variants of the same loop that were used for other page types
// (selector -> attribute); those variants wrote `第一章\r\n` before each link:
//   topic pages:                  '#js_content_overlay ul li' -> data-link
//   Sina blog (mobile):           'body ul li a'              -> href
//   WeChat official account home: 'span[hrefs]'               -> hrefs
fs.readdir('./html', function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    fs.readFile('./html/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require('jQuery')(window);
      // Counts the successful appends of this file for the progress log.
      let index = 1;
      // NOTE: an unused createWriteStream('./urls.txt') used to be opened here.
      // Its default 'w' flag truncated the file while appendFile calls from
      // other input files were still in flight, so it has been removed.
      $('#js_content a').each(function () {
        fs.appendFile('./urls.txt', `${$(this).attr('href')}\r\n`, (err) => {
          if (err) {
            return console.log('append txt failed');
          }
          console.log(index++ + '__append file success');
        });
      });
    });
  });
});
// Builds a Markdown link list ("[title](url)" + blank line) from the Sina blog
// list found in every *.txt file under ./html. Entries are emitted in reverse
// page order and written to ./html/目录/新浪博客目录.txt.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// The output folder must exist before a write stream can be opened in it.
fs.mkdirSync(path + '/目录', { recursive: true });
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split('.')[1] so dotted file names are not skipped.
    if (!file.endsWith('.txt')) {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // NOTE(review): every input file writes to the same output file, so with
      // more than one *.txt input the last finished write wins.
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      const items = $('#pl-home-bloglist > article > ul>li').get().reverse();
      $(items).each((_index, ele) => {
        // .html() yields null and .attr() yields undefined when the node is
        // missing; writing either to a stream throws, so fall back to ''.
        const title = $(ele).find('h2').html() ?? '';
        const url = $(ele).find('a').attr('data-link') ?? '';
        writeStream.write(`[${title}](${url})\n\n`);
        console.log(title);
        console.log(url);
      });
      writeStream.end();
    });
  });
});
// For every saved WeChat history page (*.html) in `path`, writes the links of
// original (原创) article messages (APPMSG), ordered by ascending msgid, to
// "<name>_讯飞有声_超链接_正序.txt" in the same folder.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
const suffixReg = /\.(html)$/;
for (let file of fileList) {
  if (!suffixReg.test(file)) {
    continue;
  }
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(suffixReg, '') + '_讯飞有声_超链接_正序.txt',
    'utf-8'
  );
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    let link = $1('h4').attr('hrefs');
    let isOrigin = $1('#copyright_logo').html();
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    let type = $1('h4').parent().attr('data-type');
    if (isOrigin === '原创' && type === 'APPMSG') {
      console.log(link);
      // A missing hrefs attribute would make write(undefined) throw.
      writeStream.write(`${link ?? ''}\n`);
    }
  }
  writeStream.end();
}
const fs = require(\'fs\');
const cheerio = require(\'cheerio\');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt)
var o =
\'M+\': this.getMonth() + 1, //月份
\'d+\': this.getDate(), //日
\'h+\': this.getHours(), //小时
\'m+\': this.getMinutes(), //分
\'s+\': this.getSeconds(), //秒
\'q+\': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
;
if (/(y+)/.test(fmt))
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + \'\').substr(4 - RegExp.$1.length)
);
for (var k in o)
if (new RegExp(\'(\' + k + \')\').test(fmt))
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : (\'00\' + o[k]).substr((\'\' + o[k]).length)
);
return fmt;
;
// For every saved WeChat history page (*.html) in `path`, writes one Markdown
// link per original (原创) article message (APPMSG) to "<name>_link_正序.md",
// ordered by ascending msgid. Each line looks like "[<date>_<title>](<link>)".
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
const suffixReg = /\.(html)$/;
for (let file of fileList) {
  if (!suffixReg.test(file)) {
    continue;
  }
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(suffixReg, '') + '_link_正序.md',
    'utf-8'
  );
  // All message cards of the page, sorted by ascending msgid.
  let js_history_list = $('#js_history_list').children();
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    let isOrigin = $1('#copyright_logo').html();
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    let type = $1('h4').parent().attr('data-type');
    // Only original article messages are exported. Checking this first keeps
    // cards without a date or title element from crashing the loop.
    if (isOrigin !== '原创' || type !== 'APPMSG') {
      continue;
    }
    let link = $1('h4').attr('hrefs');
    // .html() yields null when the element is missing.
    let time = ($1('p.weui_media_extra_info').html() ?? '').split('<span')[0].trim();
    // Normalise the date, e.g. 2020年7月2日 --> 2020年07月02日.
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title is the text following the copyright <span> inside the <h4>.
    let title = (($1('h4').html() ?? '').split('</span>')[1] ?? '').trim();
    console.log(time + '_' + title);
    // Write the Markdown link.
    writeStream.write(`[${time}_${title}](${link ?? ''})`);
    writeStream.write('\n');
  }
  writeStream.end();
}
const fs = require(\'fs\');
const cheerio = require(\'cheerio\');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt)
var o =
\'M+\': this.getMonth() + 1, //月份
\'d+\': this.getDate(), //日
\'h+\': this.getHours(), //小时
\'m+\': this.getMinutes(), //分
\'s+\': this.getSeconds(), //秒
\'q+\': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
;
if (/(y+)/.test(fmt))
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + \'\').substr(4 - RegExp.$1.length)
);
for (var k in o)
if (new RegExp(\'(\' + k + \')\').test(fmt))
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : (\'00\' + o[k]).substr((\'\' + o[k]).length)
);
return fmt;
;
// For every saved WeChat history page (*.txt) next to this script, writes one
// Markdown link per original (原创) article message (APPMSG) to
// "<name>_link_正序.md", ordered by ascending msgid.
// Work inside the folder that contains this script.
let path = __dirname;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
const suffixReg = /\.(txt)$/;
for (let file of fileList) {
  console.log('---------' + file + '----------');
  if (!suffixReg.test(file)) {
    continue;
  }
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // No cards means this is not a history page (e.g. a topic page or a
  // hand-made link list). A cheerio selection is always truthy, so the length
  // has to be tested for this guard to have any effect.
  if (js_history_list.length === 0) continue;
  // Create the output stream.
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(suffixReg, '') + '_link_正序.md',
    'utf-8'
  );
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    let isOrigin = $1('#copyright_logo').html();
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    let type = $1('h4').parent().attr('data-type');
    // Only original article messages are exported. Checking this first keeps
    // cards without a date or title element from crashing the loop.
    if (isOrigin !== '原创' || type !== 'APPMSG') {
      continue;
    }
    let link = $1('h4').attr('hrefs');
    // .html() yields null when the element is missing.
    let time = ($1('p.weui_media_extra_info').html() ?? '').split('<span')[0].trim();
    // Normalise the date, e.g. 2020年7月2日 --> 2020年07月02日.
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title is the text following the copyright <span> inside the <h4>.
    let title = (($1('h4').html() ?? '').split('</span>')[1] ?? '').trim();
    console.log(time + '_' + title);
    // Write the Markdown link.
    writeStream.write(`[${time}_${title}](${link ?? ''})`);
    writeStream.write('\n');
  }
  writeStream.end();
}
const fs = require(\'fs\');
const cheerio = require(\'cheerio\');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt)
var o =
\'M+\': this.getMonth() + 1, //月份
\'d+\': this.getDate(), //日
\'h+\': this.getHours(), //小时
\'m+\': this.getMinutes(), //分
\'s+\': this.getSeconds(), //秒
\'q+\': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
;
if (/(y+)/.test(fmt))
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + \'\').substr(4 - RegExp.$1.length)
);
for (var k in o)
if (new RegExp(\'(\' + k + \')\').test(fmt))
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : (\'00\' + o[k]).substr((\'\' + o[k]).length)
);
return fmt;
;
// Merges the original (原创) article links of every saved WeChat history page
// (*.txt) in `path` into one Markdown file. Each input file contributes a
// level-1 heading followed by one "[<date>_<title>](<link>) " line per
// original article message, with cards ordered by ascending msgid.
// Alternative: work inside the folder that contains this script.
// let path = __dirname;
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
// Create the single output stream.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
const suffixReg = /\.(txt)$/;
for (let file of fileList) {
  console.log('---------' + file + '----------');
  if (!suffixReg.test(file)) {
    continue;
  }
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // No cards means this is not a history page. A cheerio selection is always
  // truthy, so the length has to be tested for this guard to have any effect.
  if (js_history_list.length === 0) continue;
  // The file name (without suffix) becomes a level-1 Markdown heading.
  writeStream.write(`# ${file.replace(suffixReg, '')}\n`);
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    // Date of the card; .html() yields null when the header is missing.
    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    const time = rawTime === null ? '' : timeConvert(rawTime);
    const msgList = $1('.weui_msg_card_bd').children();
    for (let msg of msgList) {
      const $2 = cheerio.load(msg);
      // Message type: article (APPMSG), video (VIDEO) or text (TEXT).
      let type = $2('h4').parent().attr('data-type');
      // Non-article messages have no <h4> title; skip them before touching it.
      if (type !== 'APPMSG') continue;
      // Original articles carry a copyright <span> inside the <h4>.
      let isOrigin = $2('#copyright_logo').html();
      if (isOrigin !== '原创') continue;
      // The title is the text following that <span>.
      let title = (($2('h4').html() ?? '').split('</span>')[1] ?? '').trim();
      let link = $2('h4').attr('hrefs');
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link ?? ''}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
/**
 * Normalises a Chinese date, e.g. 2020年7月2日 --> 2020年07月02日.
 *
 * The input is rewritten to "y/m/d", parsed with `new Date`, and rendered
 * through the `format` method this file adds to Date.prototype.
 *
 * @param time date text such as '2020年7月2日'
 * @returns the date as 'yyyy年MM月dd日'
 */
function timeConvert(time) {
  const slashed = time.replace('年', '/').replace('月', '/').replace('日', '');
  return new Date(slashed).format('yyyy年MM月dd日');
}
const fs = require(\'fs\');
const cheerio = require(\'cheerio\');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt)
var o =
\'M+\': this.getMonth() + 1, //月份
\'d+\': this.getDate(), //日
\'h+\': this.getHours(), //小时
\'m+\': this.getMinutes(), //分
\'s+\': this.getSeconds(), //秒
\'q+\': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
;
if (/(y+)/.test(fmt))
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + \'\').substr(4 - RegExp.$1.length)
);
for (var k in o)
if (new RegExp(\'(\' + k + \')\').test(fmt))
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : (\'00\' + o[k]).substr((\'\' + o[k]).length)
);
return fmt;
;
// Merges the original (原创) article links of every saved WeChat history page
// (*.txt) next to this script into one Markdown file. Each input file
// contributes a level-1 heading followed by one "[<date>_<title>](<link>) "
// line per original article message, with cards ordered by ascending msgid.
// Work inside the folder that contains this script.
let path = __dirname;
// let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
// Create the single output stream.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
const suffixReg = /\.(txt)$/;
for (let file of fileList) {
  console.log('---------' + file + '----------');
  if (!suffixReg.test(file)) {
    continue;
  }
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // No cards means this is not a history page. A cheerio selection is always
  // truthy, so the length has to be tested for this guard to have any effect.
  if (js_history_list.length === 0) continue;
  // The file name (without suffix) becomes a level-1 Markdown heading.
  writeStream.write(`# ${file.replace(suffixReg, '')}\n`);
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    // Date of the card; .html() yields null when the header is missing.
    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    const time = rawTime === null ? '' : timeConvert(rawTime);
    const msgList = $1('.weui_msg_card_bd').children();
    for (let msg of msgList) {
      const $2 = cheerio.load(msg);
      // Message type: article (APPMSG), video (VIDEO) or text (TEXT).
      let type = $2('h4').parent().attr('data-type');
      // Only article messages are considered.
      if (type !== 'APPMSG') continue;
      // Original articles carry a copyright <span> inside the <h4>.
      let isOrigin = $2('#copyright_logo').html();
      if (isOrigin !== '原创') continue;
      // The title is the text following that <span>.
      let title = (($2('h4').html() ?? '').split('</span>')[1] ?? '').trim();
      let link = $2('h4').attr('hrefs');
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link ?? ''}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
/**
 * Normalises a Chinese date, e.g. 2020年7月2日 --> 2020年07月02日.
 *
 * The input is rewritten to "y/m/d", parsed with `new Date`, and rendered
 * through the `format` method this file adds to Date.prototype.
 *
 * @param time date text such as '2020年7月2日'
 * @returns the date as 'yyyy年MM月dd日'
 */
function timeConvert(time) {
  const slashed = time.replace('年', '/').replace('月', '/').replace('日', '');
  return new Date(slashed).format('yyyy年MM月dd日');
}
// For every *.txt file under ./html, writes the `hrefs` attribute of each
// `#js_history_list h4` element, one per line, to ./html/目录/<same name>.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// The output folder must exist before a write stream can be opened in it.
fs.mkdirSync(path + '/目录', { recursive: true });
for (let file of fileList) {
  console.log(file);
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith('.txt')) {
    continue;
  }
  const data = fs.readFileSync(path + '/' + file, 'utf8');
  const $ = cheerio.load(data);
  // 1. Extract the index.
  const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
  $('#js_history_list h4').each((index, ele) => {
    console.log('--------' + index);
    writeStream.write($(ele).attr('hrefs') ?? '');
    writeStream.write('\n');
  });
  writeStream.end();
}
// Data cleaning: removes every <script> and <link> element from each *.txt
// file under ./html and writes the inner HTML of <html> back to the same file.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split('.')[1] so dotted file names are not skipped.
    if (!file.endsWith('.txt')) {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      $('script').remove();
      $('link').remove();
      // The file is overwritten only after the cleaned markup is ready, so a
      // failure while parsing can no longer leave an empty file behind.
      fs.writeFile(path + '/' + file, $('html').html() ?? '', 'utf-8', (err) => {
        if (err) {
          console.error(err);
        }
      });
    });
  });
});
const fs = require("fs");
const cheerio = require("cheerio");
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// console.log(fileList);
// Disabled: for every *.txt file, write the `data-link` attribute of each
// topic-album list item, one per line, to ./html/目录/<same name>.
// for (let file of fileList) {
//   console.log(file);
//   if (file.split('.')[1] === 'txt') {
//     fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
//       const $ = cheerio.load(data);
//       // 1. Extract the index.
//       const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
//       $(
//         '#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li'
//       ).each((index, ele) => {
//         writeStream.write($(ele).attr('data-link'));
//         writeStream.write('\n');
//       });
//       writeStream.end();
//     });
//   }
// }
// For every *.html file under ./html, writes the href of each link inside
// `#js_content`, one per line, to ./html/目录/<same name>.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// The output folder must exist before a write stream can be opened in it.
fs.mkdirSync(path + '/目录', { recursive: true });
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split('.')[1] so dotted file names are not skipped.
    if (!file.endsWith('.html')) {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // 1. Extract the index.
      const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
      $('#js_content a').each((index, ele) => {
        // An <a> without href would make write(undefined) throw.
        writeStream.write($(ele).attr('href') ?? '');
        writeStream.write('\n');
      });
      writeStream.end();
    });
  });
});
// For every *.txt file under ./html, writes the href of each link inside
// `#js_content`, one per line, to ./html/目录/<same name>.
// (A callback-based variant of this loop for *.html files used to be kept
// here as commented-out code.)
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// The output folder must exist before a write stream can be opened in it.
fs.mkdirSync(path + '/目录', { recursive: true });
for (let file of fileList) {
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith('.txt')) {
    continue;
  }
  // Read the file content.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  // Parse it into a cheerio DOM.
  const $ = cheerio.load(fileContent);
  // Create the output stream.
  const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf8');
  // Write one link per line.
  $('#js_content a').each((index, ele) => {
    // An <a> without href would make write(undefined) throw.
    writeStream.write($(ele).attr('href') ?? '');
    writeStream.write('\n');
  });
  // Close the output stream.
  writeStream.end();
}
// Builds a Markdown link list ("[title](url)" + blank line) from the Sina blog
// list found in every *.txt file under ./html, in page order, and writes it
// to ./html/目录/新浪博客目录.txt.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// The output folder must exist before a write stream can be opened in it.
fs.mkdirSync(path + '/目录', { recursive: true });
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split('.')[1] so dotted file names are not skipped.
    if (!file.endsWith('.txt')) {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // NOTE(review): every input file writes to the same output file, so with
      // more than one *.txt input the last finished write wins.
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      $('#pl-home-bloglist > article > ul>li').each((_index, ele) => {
        // .html() yields null and .attr() yields undefined when the node is
        // missing; writing either to a stream throws, so fall back to ''.
        const title = $(ele).find('h2').html() ?? '';
        const url = $(ele).find('a').attr('data-link') ?? '';
        writeStream.write(`[${title}](${url})\n\n`);
        console.log(title);
        console.log(url);
      });
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html.js
// Strips boilerplate elements from every *.html article in `path` (jsdom +
// jQuery) and writes the inner HTML of <html> back to the same file.
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;

// Selectors removed from each article, applied in this order.
const REMOVE_SELECTORS = [
  // Elements containing specific boilerplate text mixed into the body.
  "span:contains('***')",
  "span:contains('--- TBC ---')",
  "span:contains('支持原创翻译')",
  "span:contains('节选自室利·萨马塔·罗摩达斯')",
  "a:contains('阅读全文')",
  "p:contains('因此,在《给弟子的忠告》')",
  "#activity-name", // article title
  "section", // all audio tags
  "h3", // all h3 headings
  "#js_tags", // topic tags
  "img", // all images
  "script", // all scripts
  "div#meta_content", // account name and publish time
  "div.comment", // comments at the bottom
];

fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split(".")[1] so dotted file names are not skipped.
    if (!file.endsWith(".html")) {
      return;
    }
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      REMOVE_SELECTORS.forEach((selector) => $(selector).remove());
      // The file is overwritten only after filtering finished, so a failure
      // while filtering can no longer leave an empty file behind.
      fs.writeFile(path + "/" + file, $("html").html(), "utf-8", (err) => {
        if (err) {
          console.error(err);
        }
      });
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio.js
// Removes every <script> and <link> element from each *.html file in `path`
// and writes the inner HTML of <html> back to the same file.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
for (let file of fileList) {
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith('.html')) {
    continue;
  }
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Filter the content: all script and link tags.
  $('script').remove();
  $('link').remove();
  // Write the HTML back; .html() yields null when there is no <html> element.
  fs.writeFileSync(path + '/' + file, $('html').html() ?? '', 'utf-8');
}
function filterContent($)
// 包含特定文字,掺杂在正文中的p标签,或者span标签
// $("section:contains(\'相关阅读\')").remove();
// $("span:contains(\'--- TBC ---\')").remove();
// $("span:contains(\'支持原创翻译\')").remove();
// $("span:contains(\'节选自室利·萨马塔·罗摩达斯\')").remove();
// $("a:contains(\'阅读全文\')").remove();
// $("p:contains(\'因此,在《给弟子的忠告》\')").remove();
//标题移除
// $(\'#activity-name\').remove();
// 所有音频标签
// $(\'section\').remove();
// 所有的h3标签
// $(\'h3\').remove();
// 话题标签
// $(\'#js_tags\').remove();
// 所有的img图片;
// $(\'img\').remove();
// 所有的script标签
$(\'script\').remove();
//公众号名称 时间信息
// $(\'div#meta_content\').remove();
//底部评论信息
// $(\'div.comment\').remove();
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_听心坊.js
// For every *.html article in `path`, reads the index that follows '·' in the
// text of the span containing '【明亮说' and writes the inner HTML of <html> to
// a new file named "<index><original file name>" in the same folder.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\听心坊\\`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
for (let file of fileList) {
  console.log(file);
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith('.html')) {
    continue;
  }
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Disabled filters (text-based):
  // $("span:contains('如欲与陈明亮先生交流,请登陆:')").remove();
  // $("span:contains('facebook.com/profile.php?id=100039436871466')").remove();
  // $("span:contains('更多陈明亮的文章,请前往')").remove();
  // $("span:contains('更多的音频')").remove();
  // $("span:contains('摄影')").remove();
  // $("a:contains('阅读全文')").remove();
  // $("p:contains('因此,在《给弟子的忠告》')").remove();
  // Disabled filters (structural): '#activity-name' (title), 'section' (audio),
  // 'h3', '#js_tags' (topic tags), 'img', 'script',
  // 'div#meta_content' (account name and time), 'div.comment' (comments).

  // Numeric index embedded in the article text.
  let idx = $("span:contains('【明亮说')").text().split('·')[1];
  console.log(idx);
  // Without an index the output would be named "undefined<file>"; skip the
  // article instead of producing a misnamed copy.
  if (idx === undefined) {
    continue;
  }
  // Write the HTML; .html() yields null when there is no <html> element.
  fs.writeFileSync(path + '/' + idx + file, $('html').html() ?? '', 'utf-8');
}
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_旭然之光.js
// Parses every *.html article in `path` with cheerio and writes the inner HTML
// of <html> back to the same file. All content filters are disabled for now.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\煦然之光`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split('.')[1] so dotted file names are not skipped.
    if (!file.endsWith('.html')) {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // Disabled filters (text-based):
      // $("section:contains('相关阅读')").remove();
      // $("span:contains('--- TBC ---')").remove();
      // $("span:contains('支持原创翻译')").remove();
      // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
      // $("a:contains('阅读全文')").remove();
      // $("p:contains('因此,在《给弟子的忠告》')").remove();
      // Disabled filters (structural): '#activity-name' (title),
      // 'section' (audio), 'h3', '#js_tags' (topic tags), 'img', 'script',
      // 'div#meta_content' (account name and time), 'div.comment' (comments).

      // The file is overwritten only after parsing succeeded.
      fs.writeFile(path + '/' + file, $('html').html() ?? '', 'utf-8', (err) => {
        if (err) {
          console.error(err);
        }
      });
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_阿知事业林.js
// Strips images, scripts, metadata, comments, topic tags, the sponsor area and
// all links from every *.html article in `path`, then writes the inner HTML
// of <html> back to the same file.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Selectors removed from each article, applied in this order.
const REMOVE_SELECTORS = [
  'img', // all images
  'script', // all scripts
  '#meta_content', // author and publish time
  '.comment', // comments
  '#js_tags', // topic tags
  '#js_sponsor_ad_area', // sponsor / reward area
  'a', // hyperlinks
];
// Disabled filters (text-based):
// $("section:contains('相关阅读')").remove();
// $("span:contains('--- TBC ---')").remove();
// $("span:contains('支持原创翻译')").remove();
// $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
// $("a:contains('阅读全文')").remove();
// $("p:contains('因此,在《给弟子的忠告》')").remove();
// Disabled filters (structural): 'section' (audio), 'h3'.

for (let file of fileList) {
  console.log(file);
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith('.html')) {
    continue;
  }
  // Read the file and parse it into a cheerio DOM.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  const $ = cheerio.load(fileContent);
  for (const selector of REMOVE_SELECTORS) {
    $(selector).remove();
  }
  // Write the HTML back; .html() yields null when there is no <html> element.
  fs.writeFileSync(path + '/' + file, $('html').html() ?? '', 'utf-8');
}
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_不死甘露.js
// Strips boilerplate elements from every *.html article in `path` (jsdom +
// jQuery) and writes the inner HTML of <html> back to the same file.
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;

// Selectors removed from each article, applied in this order.
const REMOVE_SELECTORS = [
  // Elements containing specific boilerplate text mixed into the body.
  "span:contains('***')",
  "span:contains('--- TBC ---')",
  "span:contains('支持原创翻译')",
  "a:contains('阅读全文')",
  "strong:contains('不死甘露')",
  "strong:contains('关于永恒的开示录')",
  "strong:contains('THE NECTAR OF IMMORTALITY')",
  "span:contains('室利·尼萨迦达塔·马哈拉吉 著')",
  "span:contains('灵智宝鬘翻译团队 中译')",
  "p:contains('喜欢作者')",
  "p:contains('——')",
  "#activity-name", // article title
  "section", // all audio tags
  "h3", // all h3 headings
  "#js_tags", // topic tags
  "img", // all images
  "script", // all scripts
  "div#meta_content", // account name and publish time
  "div.comment", // comments at the bottom
];

fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split(".")[1] so dotted file names are not skipped.
    if (!file.endsWith(".html")) {
      return;
    }
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      REMOVE_SELECTORS.forEach((selector) => $(selector).remove());
      // The file is overwritten only after filtering finished, so a failure
      // while filtering can no longer leave an empty file behind.
      fs.writeFile(path + "/" + file, $("html").html(), "utf-8", (err) => {
        if (err) {
          console.error(err);
        }
      });
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\html2txt.js
// Converts every *.html article in `path` to a .txt file holding the text of
// `#activity-name` (title) followed by the text of `#js_content` (body).
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\养猫学习`;
const HTML_SUFFIX = ".html";
fs.readdir(path, function (err, files) {
  if (err) {
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    // Suffix test instead of split(".")[1] so dotted file names are not skipped.
    if (!file.endsWith(HTML_SUFFIX)) {
      return;
    }
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      // Only the trailing ".html" is replaced, so dots inside the article name
      // no longer truncate the output file name.
      const txtName = file.slice(0, -HTML_SUFFIX.length) + ".txt";
      const writeStream = fs.createWriteStream(path + "/" + txtName, "utf-8");
      // Title.
      writeStream.write($("#activity-name").text());
      // Body.
      writeStream.write($("#js_content").text());
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\html2txt_cheerio.js
// Converts every *.html article in `path` to a .txt file holding the text of
// <body> followed by the marker 'endendend'.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
const HTML_SUFFIX = '.html';
for (let file of fileList) {
  console.log(file);
  // Suffix test instead of split('.')[1] so dotted file names are not skipped.
  if (!file.endsWith(HTML_SUFFIX)) {
    continue;
  }
  // Read the file and parse it into a cheerio DOM.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  const $ = cheerio.load(fileContent);
  // Only the trailing ".html" is replaced, so dots inside the article name
  // no longer truncate the output file name.
  const txtName = file.slice(0, -HTML_SUFFIX.length) + '.txt';
  // Disabled: write the title first.
  // $('#activity-name').text()
  // Body text plus end marker.
  fs.writeFileSync(path + '/' + txtName, $('body').text() + 'endendend', 'utf-8');
}
E:\公众号文章采集\fi_filter_过滤器\src\任意路径文件写入.js
const fs = require(\'fs\');
const writeFileRecursive = function (path, buffer, callback)
// 前面的文件路径
let lastPath = path.substring(0, path.lastIndexOf(\'/\'));
// 递归创建目录
fs.mkdir(lastPath, recursive: true , (err) =>
if (err) return callback(err);
fs.writeFile(path, buffer, function (err)
if (err) return callback(err);
return callback(null);
);
);
;
// Demo: write 'hello' to a nested path whose folders may not exist yet.
const buffer = 'hello';
writeFileRecursive('./public/test/test.txt', buffer, (err) => {
  if (err) {
    // Stop here: previously 'write success' was logged even after a failure.
    console.error(err);
    return;
  }
  console.info('write success');
});
E:\公众号文章采集\fi_filter_过滤器\src\crawler\crawler.ts
import superagent from "superagent";
import { load, CheerioAPI } from "cheerio";
import { log } from "console";
import { createWriteStream } from "fs";

/**
 * Downloads one page and exposes a few helpers to read parts of it.
 *
 * Usage: `setUrl(url)`, then `await init()`, then any of the getters / `save`.
 */
export default class Crawler {
  private url = ``;
  // Parsed page; stays undefined until init() has completed.
  private $: CheerioAPI | undefined;

  constructor() {}

  /** Sets the address fetched by init(). */
  setUrl(url: string) {
    this.url = url;
  }

  /** Fetches the configured URL and parses the response text with cheerio. */
  async init() {
    const res = await superagent.get(this.url);
    this.$ = load(res.text);
  }

  /**
   * Writes the inner HTML of the page's <html> element to `path`.
   *
   * Previously the Cheerio selection object itself was handed to the stream,
   * which only accepts strings and buffers.
   */
  save(path: string) {
    const html = this.page("html").html() ?? "";
    const writeStream = createWriteStream(path, "utf-8");
    writeStream.write(html);
    writeStream.end();
  }

  /** Logs and returns the text of `#activity-name`. */
  getTitle() {
    const title = this.page("#activity-name").text();
    log(title);
    return title;
  }

  /** Logs the text of the script that contains `function htmlDecode(str)`. */
  getTime() {
    log(this.page("script:contains('function htmlDecode(str)')").text());
  }

  /** Returns the text of every span that contains '萨特桑指出'. */
  getContent() {
    // 萨特桑指出
    const quotes = this.page("span:contains('萨特桑指出')");
    return quotes.text();
  }

  /** The parsed page; throws a descriptive error when init() has not run. */
  private get page(): CheerioAPI {
    if (!this.$) {
      throw new Error("Crawler.init() must complete before the page is read");
    }
    return this.$;
  }
}
E:\公众号文章采集\fi_filter_过滤器\src\crawler\index.ts
import { log } from "console";
import Crawler from "./crawler";

/**
 * Fetches one article and logs the script text that getTime() extracts.
 * Wrapped in a function so a failed download is reported instead of surfacing
 * as an unhandled rejection, and so no top-level await is required.
 */
async function main(): Promise<void> {
  const crawler = new Crawler();
  crawler.setUrl("https://mp.weixin.qq.com/s/EgZhFJTzsgfYzZZ-4_SI4Q");
  await crawler.init();
  crawler.getTime();
  // crawler.getTitle();
  // crawler.save("");
  // const content = crawler.getContent();
  // log(content);
}

main().catch((err: unknown) => {
  log(err);
});
E:\公众号文章采集\fi_filter_过滤器\src\filter\01_灵智宝鬘_话题_尼萨迦达塔.ts
import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// Filters every *.html article directly inside `basePath` with filterDom and
// writes the inner HTML of <html> to a file of the same name under
// `<basePath>/out`.

// process.cwd() is the folder the program is started from.
// const basePath = process.cwd();
// E:\公众号文章采集\公众号HTML\灵智宝鬘
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘";
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Read the folder; returns an array of file names.
const fileList = readdirSync(basePath);
// Absolute paths of the regular files that end in ".html".
const pureFilePathList = fileList
  .map((fileName) => join(basePath, fileName))
  .filter(
    (filePath) =>
      lstatSync(filePath).isFile() && extname(filePath) === ".html"
  );
// pureFilePathList.forEach((filePath) => {
//   extractTopic(filePath);
// });
for (const filePath of pureFilePathList) {
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  const outFilePath = getOutFilePath(filePath);
  const writeStream = createWriteStream(outFilePath, "utf-8");
  // .html() yields null when there is no <html> element; a stream rejects null.
  writeStream.write($("html").html() ?? "");
  writeStream.end();
}
// ====================================================================================================================
/**
 * Removes boilerplate elements from a loaded article.
 *
 * Rule of thumb for this list: remove as few <p> elements as possible, since
 * a broad <p> selector can take body text with it.
 *
 * @param $ the loaded document; matching elements are removed in place
 */
function filterDom($: CheerioAPI) {
  const selectors = [
    // Topic tags.
    "#js_tags",
    // Span containing the translator credit.
    "span:contains('灵智宝鬘翻译团队 中译')",
    // Centered paragraph containing '我是那'.
    "p[style*='white-space: normal;text-align: center;']:contains('我是那')",
    // Centered paragraph containing the subtitle.
    "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')",
    // Span coloured rgb(136, 136, 136) that contains the author credit.
    "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')",
    // Red span and strong elements.
    "span[style*='color: rgb(255, 76, 65)']",
    "strong[style*='color: rgb(255, 76, 65)']",
    // div elements with class "comment".
    "div.comment",
  ];
  for (const selector of selectors) {
    $(selector).remove();
  }
}
/**
 * Reads a file as UTF-8 text and parses it with cheerio.
 *
 * @param filePath path of the HTML file
 * @returns the loaded document
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-jump_url` attribute of every `#js_articles > div`.
 *
 * @param $ the loaded document
 * @returns the URLs in document order; elements without the attribute are
 *          skipped, and an empty array is returned when nothing matches
 */
function extractLink($: CheerioAPI) {
  const linkArr: string[] = [];
  $("#js_articles > div").each((_i, oLink) => {
    const url = $(oLink).attr("data-jump_url");
    if (url) {
      linkArr.push(url);
    }
  });
  return linkArr;
}
/**
 * Makes sure the folder `absPath` exists, creating missing parents as needed.
 *
 * `recursive: true` creates the whole folder chain and is a no-op when the
 * folder already exists, so no prior stat call is needed. Unlike the earlier
 * stat-then-create version, a path occupied by a regular file now raises
 * instead of being silently accepted.
 *
 * @param absPath folder to create
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
/**
 * Returns the current local date and time as
 * "<yyyy>年<MM>月<dd>日<h>时<m>分<s>秒".
 *
 * Month and day are zero-padded to two digits; hour, minute and second are
 * emitted without padding.
 */
function getCurDate() {
  const now = new Date();
  const twoDigits = (value: number) => ("0" + value).slice(-2);
  const year = now.getFullYear();
  const month = twoDigits(now.getMonth() + 1);
  const day = twoDigits(now.getDate());
  const hour = now.getHours();
  const minute = now.getMinutes();
  const second = now.getSeconds();
  return `${year}年${month}月${day}日${hour}时${minute}分${second}秒`;
}
/**
 * Maps an input file to the file of the same name inside `outPath`.
 *
 * @param filePath path of the input file
 * @returns `outPath` joined with the base name of `filePath`
 */
function getOutFilePath(filePath: string) {
  return join(outPath, basename(filePath));
}
E:\公众号文章采集\fi_filter_过滤器\src\topic\01_非推送_链接_一行一个.ts
import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// process.cwd() is the directory the program is executed from.
const basePath = process.cwd();
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Read the directory; readdirSync returns an array of entry names.
const fileList = readdirSync(basePath);
const pureFilePathList = fileList
  // Keep regular files only.
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  // Keep only .txt and .html inputs.
  .filter((fileName) => {
    const fileExt = extname(fileName);
    return fileExt === ".txt" || fileExt === ".html";
  })
  .map((fileName) => join(basePath, fileName));

pureFilePathList.forEach((filePath) => {
  extractTopic(filePath);
});
/**
 * Extracts the links from one input file and writes them, one per line, to
 * the output file derived from `filePath`.
 *
 * No output file is created when the document yields no links, so unrelated
 * inputs in the directory do not produce empty files.
 *
 * @param filePath - Path of the HTML/text file to process.
 */
function extractTopic(filePath: string): void {
  const $ = loadHtmlDom(filePath);
  const urlArr = extractLink($);
  if (urlArr.length === 0) return;

  const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const url of urlArr) {
    writeStream.write(`${url}\n`);
  }
  writeStream.end();
}
/**
 * Reads a file from disk as UTF-8 and parses it with cheerio.
 *
 * @param filePath - Path of the file to read.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-link` attribute of every `li` found under the album
 * content list inside `#js_content_overlay`.
 *
 * @param $ - Cheerio document to query.
 * @returns One entry per matched `li`, in document order; an element without
 *          the attribute contributes an empty string.
 */
function extractLink($: CheerioAPI): string[] {
  const items = $(
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li"
  );
  const links: string[] = [];
  items.each((_, item) => {
    links.push($(item).attr("data-link") ?? "");
  });
  return links;
}
/**
 * Makes sure a directory exists at `absPath`.
 *
 * `statSync` is used as the existence probe; if it throws, the directory is
 * created. `recursive: true` lets `mkdirSync` create missing parent
 * directories as well.
 *
 * @param absPath - Directory path to check or create.
 */
function exitsFolder(absPath: string): void {
  try {
    statSync(absPath);
  } catch {
    mkdirSync(absPath, { recursive: true });
  }
}
/**
 * Formats the current local date and time as `YYYY年MM月DD日HH时MM分SS秒`.
 *
 * Every field except the year is zero-padded to two digits so the result has
 * a fixed width and sorts chronologically as plain text.
 *
 * @returns The formatted timestamp.
 */
function getCurDate(): string {
  const now = new Date();
  const pad = (value: number): string => String(value).padStart(2, "0");
  // getMonth() is zero-based, hence the + 1.
  const datePart = `${now.getFullYear()}年${pad(now.getMonth() + 1)}月${pad(now.getDate())}日`;
  const timePart = `${pad(now.getHours())}时${pad(now.getMinutes())}分${pad(now.getSeconds())}秒`;
  return datePart + timePart;
}
/**
 * Builds the output path for an input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is stripped, so a name that contains other dots
 * (e.g. `a.b.html`) keeps its full stem instead of being cut at the first dot.
 *
 * @param filePath - Path of the input file.
 * @returns Path of the corresponding output file.
 */
function getOutFilePath(filePath: string): string {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, `${getCurDate()}_目录_${stem}.txt`);
}
E:\公众号文章采集\fi_filter_过滤器\src\topic\02_推送_链接_一行一个.ts
import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// process.cwd() is the directory the program is executed from.
const basePath = process.cwd();
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Read the directory; readdirSync returns an array of entry names.
const fileList = readdirSync(basePath);
const pureFilePathList = fileList
  // Keep regular files only.
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  // Keep only .txt and .html inputs.
  .filter((fileName) => {
    const fileExt = extname(fileName);
    return fileExt === ".txt" || fileExt === ".html";
  })
  .map((fileName) => join(basePath, fileName));

pureFilePathList.forEach((filePath) => {
  extractTopic(filePath);
});
/**
 * Extracts the links from one input file and writes them, one per line, to
 * the output file derived from `filePath`.
 *
 * Nothing is written (and no file is created) when the document yields no
 * links.
 *
 * @param filePath - Path of the HTML/text file to process.
 */
function extractTopic(filePath: string): void {
  const $ = loadHtmlDom(filePath);
  const urlArr = extractLink($);
  if (urlArr.length === 0) return;

  const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const url of urlArr) {
    writeStream.write(`${url}\n`);
  }
  writeStream.end();
}
/**
 * Reads a file from disk as UTF-8 and parses it with cheerio.
 *
 * @param filePath - Path of the file to read.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-jump_url` attribute of every direct `div` child of
 * `#js_articles`.
 *
 * @param $ - Cheerio document to query.
 * @returns The non-empty attribute values in document order; an empty array
 *          when nothing matches.
 */
function extractLink($: CheerioAPI): string[] {
  const links: string[] = [];
  $("#js_articles > div").each((_, card) => {
    const url = $(card).attr("data-jump_url");
    // Elements without the attribute (or with an empty value) are skipped.
    if (url) {
      links.push(url);
    }
  });
  return links;
}
/**
 * Makes sure a directory exists at `absPath`.
 *
 * `statSync` is used as the existence probe; if it throws, the directory is
 * created. `recursive: true` lets `mkdirSync` create missing parent
 * directories as well.
 *
 * @param absPath - Directory path to check or create.
 */
function exitsFolder(absPath: string): void {
  try {
    statSync(absPath);
  } catch {
    mkdirSync(absPath, { recursive: true });
  }
}
/**
 * Formats the current local date and time as `YYYY年MM月DD日HH时MM分SS秒`.
 *
 * Every field except the year is zero-padded to two digits so the result has
 * a fixed width and sorts chronologically as plain text.
 *
 * @returns The formatted timestamp.
 */
function getCurDate(): string {
  const now = new Date();
  const pad = (value: number): string => String(value).padStart(2, "0");
  // getMonth() is zero-based, hence the + 1.
  const datePart = `${now.getFullYear()}年${pad(now.getMonth() + 1)}月${pad(now.getDate())}日`;
  const timePart = `${pad(now.getHours())}时${pad(now.getMinutes())}分${pad(now.getSeconds())}秒`;
  return datePart + timePart;
}
/**
 * Builds the output path for an input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is stripped, so a name that contains other dots
 * (e.g. `a.b.html`) keeps its full stem instead of being cut at the first dot.
 *
 * @param filePath - Path of the input file.
 * @returns Path of the corresponding output file.
 */
function getOutFilePath(filePath: string): string {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, `${getCurDate()}_目录_${stem}.txt`);
}
E:\公众号文章采集\fi_filter_过滤器\src\txt\01_合集.ts
import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// process.cwd() is the directory the program is executed from.
// const basePath = process.cwd();
// E:\公众号文章采集\公众号HTML\灵智宝鬘
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘\\out";
const outFileName = "灵智宝鬘_尼萨迦达塔_我是那";
const outPath = join(basePath, "txt");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Read the directory; readdirSync returns an array of entry names.
const fileList = readdirSync(basePath);
const pureFilePathList = fileList
  // Keep regular files only.
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  // Keep only .html inputs.
  .filter((fileName) => extname(fileName) === ".html")
  .map((fileName) => join(basePath, fileName));

// pureFilePathList.forEach((filePath) => {
//   extractTopic(filePath);
// });

// All inputs are appended to one output file, each as a numbered chapter
// consisting of title, publish date and body text on separate lines.
const outFilePath = getOutFilePath();
const writeStream = createWriteStream(outFilePath, "utf-8");
for (const [index, filePath] of pureFilePathList.entries()) {
  const $: CheerioAPI = loadHtmlDom(filePath);
  const textContent = extractText($);
  writeStream.write("\n");
  // Chapter numbers are 1-based; the log line below uses the 0-based index.
  writeStream.write(`第${index + 1}章`);
  writeStream.write("\n");
  writeStream.write(textContent.title);
  writeStream.write("\n");
  writeStream.write(textContent.pubDate);
  writeStream.write("\n");
  writeStream.write(textContent.content);
  writeStream.write("\n");
  log(`${index}_${filePath}`);
}
writeStream.end();
// ====================================================================================================================
/**
 * Reads a file from disk as UTF-8 and parses it with cheerio.
 *
 * @param filePath - Path of the file to read.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Makes sure a directory exists at `absPath`.
 *
 * `statSync` is used as the existence probe; if it throws, the directory is
 * created. `recursive: true` lets `mkdirSync` create missing parent
 * directories as well.
 *
 * @param absPath - Directory path to check or create.
 */
function exitsFolder(absPath: string): void {
  try {
    statSync(absPath);
  } catch {
    mkdirSync(absPath, { recursive: true });
  }
}
/**
 * Builds the path of the single combined output file:
 * `<outPath>/<outFileName>.txt`, both taken from module-level constants.
 *
 * @returns Path of the output text file.
 */
function getOutFilePath(): string {
  return join(outPath, `${outFileName}.txt`);
}
/**
 * Pulls the title, publish date and body text out of a loaded document.
 *
 * The three values are the text content of `#activity-name`, `#publish_time`
 * and `#js_content`; the body text is additionally passed through
 * `handleContent`.
 *
 * @param $ - Cheerio document to query.
 * @returns An object with `title`, `pubDate` and the processed `content`.
 */
function extractText($: CheerioAPI): {
  title: string;
  pubDate: string;
  content: string;
} {
  const title = $("#activity-name").text();
  const pubDate = $("#publish_time").text();
  const rawContent = $("#js_content").text();
  return {
    title,
    pubDate,
    content: handleContent(rawContent),
  };
}
/**
 * Rewrites every speaker label in `content` into a marker block produced by
 * `replaceContent`, normalising the short labels ("尼:", "问:") to their
 * full forms.
 *
 * All occurrences are rewritten. `String.prototype.replace` with a string
 * pattern only touches the first match, so split/join is used instead.
 *
 * @param content - Raw body text.
 * @returns The text with every speaker label replaced.
 */
function handleContent(content: string): string {
  // [label as it appears in the text, label to emit]
  const speakerRules: ReadonlyArray<readonly [string, string]> = [
    ["尼萨迦达塔:", "尼萨迦达塔:"],
    ["尼:", "尼萨迦达塔:"],
    ["提问者:", "提问者:"],
    ["问:", "提问者:"],
  ];
  // Rules run in order. No emitted label contains a later rule's pattern, so
  // an already rewritten label is never wrapped a second time.
  return speakerRules.reduce(
    (text, [label, emitted]) =>
      text.split(label).join(replaceContent(emitted, 1000)),
    content
  );
}

/**
 * Builds the marker block that replaces a speaker label:
 * newline, `[p<time>]`, newline, the keyword, newline.
 *
 * @param keyword - Speaker label to emit.
 * @param time - Number placed inside the `[p…]` tag (default 1000).
 *               NOTE(review): presumably a pause length for a downstream
 *               consumer — confirm.
 * @returns The marker block.
 */
function replaceContent(keyword: string, time: number = 1000): string {
  return `\n[p${time.toString()}]\n${keyword}\n`;
}