微信公众号 过滤 typescript cheerio

Posted 一顿操作猛如虎

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了微信公众号 过滤 typescript cheerio相关的知识,希望对你有一定的参考价值。

E:\\公众号文章采集\\fi_filter_过滤器\\src\\exact_新浪博客手机版提取连接.js

// Read every file under ./html, build a DOM with jsdom + jQuery, and append
// each <a href> found inside #js_content to ./urls.txt.
// NOTE(review): reconstructed from a scraped listing that stripped all braces
// and escaped quotes; brace placement follows the obvious JS structure.
const fs = require('fs');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;

fs.readdir('./html', function (err, files) {
  files.forEach((file) => {
    fs.readFile('./html/' + file, 'utf-8', (err, data) => {
      const { window } = new JSDOM(data);
      const $ = require('jQuery')(window);
      const writeStream = fs.createWriteStream('./urls.txt', 'utf-8');
      let index = 1;
      // #js_content a  href  hyperlinks

      $('#js_content a').each(function () {
        fs.appendFile('./urls.txt', `${$(this).attr('href')}\r\n`, (err) => {
          if (err) {
            return console.log('append txt failed');
          }
          console.log(index++ + '__append file success');
        });
      });

      //--------------------------------------------------------
      // #js_content_overlay ul li  data-link  topic pages

      // $("#js_content_overlay ul li").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("data-link")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });

      //-----------------------------------------------
      // Sina Blog mobile version: extract links
      // $("body ul li a").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("href")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });
      //----------------------------------------
      // WeChat official-account home page: extract links
      // $("span[hrefs]").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("hrefs")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });

      writeStream.end();
    });
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_倒序.js

// Build a Markdown table of contents for a saved Sina Blog list page, with
// the entries reversed (oldest first). Output goes to ./html/目录/新浪博客目录.txt.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] === 'txt') {
      fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
        const $ = cheerio.load(data);
        // const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
        const writeStream = fs.createWriteStream(
          path + '/目录/' + '新浪博客目录.txt',
          'utf-8'
        );
        // Reverse the <li> list so entries come out oldest-first.
        $($('#pl-home-bloglist > article > ul>li').get().reverse()).each(
          (data, ele) => {
            let title = $(ele).find('h2').html();
            let url = $(ele).find('a').attr('data-link');
            // Write one Markdown link: [title](url)
            writeStream.write('[');
            writeStream.write(title);
            writeStream.write(']');
            writeStream.write('(');
            writeStream.write(url);
            writeStream.write(')');
            writeStream.write('\n');
            writeStream.write('\n');
            console.log(title);
            console.log(url);
          }
        );
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_公众号_历史消息_原创_时间正序.js

// From saved WeChat official-account history pages, collect the links of
// original (原创) image-text (APPMSG) messages, sorted oldest-first by msgid.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

// Read the folder; returns an array of file names (non-recursive).
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  let suffixReg = /\.(html)$/;
  if (suffixReg.test(file)) {
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    const writeStream = fs.createWriteStream(
      path + '/' + file.split('.')[0] + '_讯飞有声_超链接_正序.txt',
      'utf-8'
    );
    // All message cards on the history page.
    let js_history_list = $('#js_history_list').children();
    // Sort cards ascending by msgid, i.e. reverse the page's newest-first order.
    let history_list = Array.from(js_history_list).sort((a, b) => {
      return a.attribs.msgid - b.attribs.msgid;
    });
    // Walk the message cards.
    for (let ele of history_list) {
      const $1 = cheerio.load(ele);
      let link = $1('h4').attr('hrefs');
      let isOrigin = $1('#copyright_logo').html();
      // Card type: APPMSG (image-text), VIDEO, or TEXT.
      let type = $1('h4').parent().attr('data-type');
      if (isOrigin === '原创' && type === 'APPMSG') {
        console.log(link);
        writeStream.write(link);
        writeStream.write('\n');
      }
    }
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_公众号_历史消息_原创_时间正序_markdown.js

// From saved WeChat official-account history pages, write a Markdown list of
// original (原创) image-text (APPMSG) articles as [date_title](link),
// sorted oldest-first by msgid.
const fs = require('fs');
const cheerio = require('cheerio');
// Add a date-formatting helper to Date.prototype.
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, // month
    'd+': this.getDate(), // day
    'h+': this.getHours(), // hour
    'm+': this.getMinutes(), // minute
    's+': this.getSeconds(), // second
    'q+': Math.floor((this.getMonth() + 3) / 3), // quarter
    S: this.getMilliseconds(), // millisecond
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};

let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  let suffixReg = /\.(html)$/;
  if (suffixReg.test(file)) {
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    const writeStream = fs.createWriteStream(
      path + '/' + file.split('.')[0] + '_link_正序.md',
      'utf-8'
    );
    // All message cards.
    let js_history_list = $('#js_history_list').children();
    // Sort ascending by msgid (oldest first).
    let history_list = Array.from(js_history_list).sort((a, b) => {
      return a.attribs.msgid - b.attribs.msgid;
    });
    // Walk the message cards.
    for (let ele of history_list) {
      const $1 = cheerio.load(ele);
      let link = $1('h4').attr('hrefs');
      let isOrigin = $1('#copyright_logo').html();
      let time = $1('p.weui_media_extra_info').html().split('<span')[0].trim();
      // Card type: APPMSG (image-text), VIDEO, or TEXT.
      let type = $1('h4').parent().attr('data-type');
      // Only original image-text articles.
      if (isOrigin === '原创' && type === 'APPMSG') {
        // Normalize the date: 2020年7月2日 --> 2020年07月02日.
        time = time.replace('年', '/').replace('月', '/').replace('日', '');
        time = new Date(time).format('yyyy年MM月dd日');
        // Title must be read here: TEXT cards have no title.
        let title = $1('h4').html().split('</span>')[1].trim();
        console.log(time + '_' + title);
        // Write the Markdown entry.
        writeStream.write(`[${time}_${title}](${link})`);
        writeStream.write('\n');
      }
    }
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹.js

// Same as the markdown extractor above, but operating on the script's own
// folder (__dirname) and on .txt files, with a guard for titles missing a
// </span> separator.
const fs = require('fs');
const cheerio = require('cheerio');
// Add a date-formatting helper to Date.prototype.
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, // month
    'd+': this.getDate(), // day
    'h+': this.getHours(), // hour
    'm+': this.getMinutes(), // minute
    's+': this.getSeconds(), // second
    'q+': Math.floor((this.getMonth() + 3) / 3), // quarter
    S: this.getMilliseconds(), // millisecond
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Work in the current folder.
let path = __dirname;
// Read the folder; returns an array of file names (non-recursive).
const fileList = fs.readdirSync(path);
// Iterate over the file list.
for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (suffixReg.test(file)) {
    // Read and parse the file.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    // All message cards.
    let js_history_list = $('#js_history_list').children();
    // Empty card list => not a history page; skip for robustness.
    if (!js_history_list) continue;
    // Create the output stream.
    const writeStream = fs.createWriteStream(
      path + '/' + file.split('.')[0] + '_link_正序.md',
      'utf-8'
    );
    // Sort ascending by msgid (oldest first).
    let history_list = Array.from(js_history_list).sort((a, b) => {
      return a.attribs.msgid - b.attribs.msgid;
    });
    // Walk the message cards.
    for (let ele of history_list) {
      const $1 = cheerio.load(ele);
      let link = $1('h4').attr('hrefs');
      let isOrigin = $1('#copyright_logo').html();
      let time = $1('p.weui_media_extra_info').html().split('<span')[0].trim();
      // Card type: APPMSG (image-text), VIDEO, or TEXT.
      let type = $1('h4').parent().attr('data-type');
      // Only original image-text articles.
      if (isOrigin === '原创' && type === 'APPMSG') {
        // Normalize the date: 2020年7月2日 --> 2020年07月02日.
        time = time.replace('年', '/').replace('月', '/').replace('日', '');
        time = new Date(time).format('yyyy年MM月dd日');
        // Title must be read here: TEXT cards have no title.
        let title = $1('h4').html().split('</span>')[1];
        if (title !== undefined) {
          title = title.trim();
        }
        console.log(time + '_' + title);
        // Write the Markdown entry.
        writeStream.write(`[${time}_${title}](${link})`);
        writeStream.write('\n');
      }
    }
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹_合并.js

// Merge variant: walks every .txt history dump in a folder and writes ONE
// combined Markdown file, with each source file as a level-1 heading and its
// original APPMSG articles as [date_title](link) entries.
const fs = require('fs');
const cheerio = require('cheerio');
// Add a date-formatting helper to Date.prototype.
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, // month
    'd+': this.getDate(), // day
    'h+': this.getHours(), // hour
    'm+': this.getMinutes(), // minute
    's+': this.getSeconds(), // second
    'q+': Math.floor((this.getMonth() + 3) / 3), // quarter
    S: this.getMilliseconds(), // millisecond
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Work in the current folder.
// let path = __dirname;
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder; returns an array of file names (non-recursive).
const fileList = fs.readdirSync(path);
// Single combined output stream.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
// Iterate over the file list.
for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (suffixReg.test(file)) {
    // Read and parse the file.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    // All message cards.
    let js_history_list = $('#js_history_list').children();
    // Empty card list => not a history page; skip for robustness.
    if (!js_history_list) continue;

    // File name becomes a level-1 Markdown heading.
    writeStream.write(`# ${file.split('.')[0]}\n`);
    // Sort ascending by msgid (oldest first).
    let history_list = Array.from(js_history_list).sort((a, b) => {
      return a.attribs.msgid - b.attribs.msgid;
    });
    // Walk the message cards.
    for (let ele of history_list) {
      const $1 = cheerio.load(ele);
      // Date of this card.
      let time = $1('.weui_msg_card_hd:first-child').html();
      time = timeConvert(time);
      const msgList = $1('.weui_msg_card_bd').children();
      for (let msg of msgList) {
        const $2 = cheerio.load(msg);
        // Card type: APPMSG (image-text), VIDEO, or TEXT.
        let type = $2('h4').parent().attr('data-type');
        // Original articles carry a span inside the h4 title; others do not.
        let isOrigin = $2('#copyright_logo').html();
        // Title of this entry.
        let title = $2('h4').html();
        // Strip the copyright span from original titles.
        if (isOrigin) {
          title = title.split('</span>')[1].trim();
        } else {
          title = title.trim();
        }
        // Article link.
        let link = $2('h4').attr('hrefs');
        // Only original image-text articles.
        if (isOrigin === '原创' && type === 'APPMSG') {
          console.log(time + '_' + title);
          writeStream.write(`[${time}_${title}](${link}) `);
          writeStream.write('\n');
        }
      }
    }
  }
}

writeStream.end();

// Normalize the date: 2020年7月2日 --> 2020年07月02日.
function timeConvert(time) {
  time = time.replace('年', '/').replace('月', '/').replace('日', '');
  let newTime = new Date(time).format('yyyy年MM月dd日');
  return newTime;
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹_合并_优化.js

// Optimized merge variant: same combined-Markdown output as the 合并 script,
// but checks the card type (APPMSG) first before doing any title work.
const fs = require('fs');
const cheerio = require('cheerio');
// Add a date-formatting helper to Date.prototype.
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, // month
    'd+': this.getDate(), // day
    'h+': this.getHours(), // hour
    'm+': this.getMinutes(), // minute
    's+': this.getSeconds(), // second
    'q+': Math.floor((this.getMonth() + 3) / 3), // quarter
    S: this.getMilliseconds(), // millisecond
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Work in the current folder.
let path = __dirname;
// let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder; returns an array of file names (non-recursive).
const fileList = fs.readdirSync(path);
// Single combined output stream.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
// Iterate over the file list.
for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (suffixReg.test(file)) {
    // Read and parse the file.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    // All message cards.
    let js_history_list = $('#js_history_list').children();

    // Empty card list => not a history page; skip for robustness.
    if (!js_history_list) continue;

    // File name becomes a level-1 Markdown heading.
    writeStream.write(`# ${file.split('.')[0]}\n`);
    // Sort ascending by msgid (oldest first).
    let history_list = Array.from(js_history_list).sort((a, b) => {
      return a.attribs.msgid - b.attribs.msgid;
    });
    // Walk the message cards.
    for (let ele of history_list) {
      const $1 = cheerio.load(ele);
      // Date of this card.
      let time = $1('.weui_msg_card_hd:first-child').html();
      time = timeConvert(time);
      const msgList = $1('.weui_msg_card_bd').children();
      for (let msg of msgList) {
        const $2 = cheerio.load(msg);
        // Card type: APPMSG (image-text), VIDEO, or TEXT.
        let type = $2('h4').parent().attr('data-type');
        // Only image-text cards can hold articles.
        if (type === 'APPMSG') {
          // Original articles carry a span inside the h4 title.
          let isOrigin = $2('#copyright_logo').html();
          // Title of this entry.
          let title = $2('h4').html();
          // Strip the copyright span from original titles.
          if (isOrigin) {
            title = title.split('</span>')[1].trim();
          } else {
            title = title.trim();
          }
          // Article link.
          let link = $2('h4').attr('hrefs');
          // Only original articles are written out.
          if (isOrigin === '原创') {
            console.log(time + '_' + title);
            writeStream.write(`[${time}_${title}](${link}) `);
            writeStream.write('\n');
          }
        }
      }
    }
  }
}

writeStream.end();

// Normalize the date: 2020年7月2日 --> 2020年07月02日.
function timeConvert(time) {
  time = time.replace('年', '/').replace('月', '/').replace('日', '');
  let newTime = new Date(time).format('yyyy年MM月dd日');
  return newTime;
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_微信公众号_历史消息_提取链接.js

// Extract every h4 'hrefs' link from saved WeChat history dumps (.txt) into
// a same-named file under ./html/目录/.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'txt') {
    const data = fs.readFileSync(path + '/' + file, 'utf8');
    const $ = cheerio.load(data);
    // 1. extract the table of contents
    const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
    $('#js_history_list h4').each((index, ele) => {
      console.log('--------' + index);
      // Fall back to '' so a missing attribute never crashes the write.
      writeStream.write($(ele).attr('hrefs') ?? '');
      writeStream.write('\n');
    });
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_微信公众号_数据清洗.js

// Data cleaning: rewrite each ./html/*.txt in place with all <script> and
// <link> tags removed.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] === 'txt') {
      fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
        const $ = cheerio.load(data);

        // 1. Overwrite the same file, stripped of script/link tags.
        const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
        $('script').remove();
        $('link').remove();
        writeStream.write($('html').html());
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_微信公众号_话题_提取链接.js

// Extract data-link URLs from WeChat topic (话题) pages. The main loop is
// currently commented out; only the directory listing is prepared.
const fs = require("fs");
const cheerio = require("cheerio");
let path = `./html`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// console.log(fileList);

// Iterate over the file list.
// for (let file of fileList) {
//   console.log(file);
//   if (file.split('.')[1] === 'txt') {
//     fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
//       const $ = cheerio.load(data);

//       // 1. extract the table of contents
//       const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
//       $(
//         '#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li'
//       ).each((index, ele) => {
//         writeStream.write($(ele).attr('data-link'));
//         writeStream.write('\n');
//       });
//       writeStream.end();
//     });
//   }
// }

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_微信公众号_页面_提取链接.js

// Extract every <a href> inside #js_content of each ./html/*.html page into
// a same-named file under ./html/目录/.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] === 'html') {
      fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
        const $ = cheerio.load(data);

        // 1. extract the table of contents
        const writeStream = fs.createWriteStream(
          path + '/目录/' + file,
          'utf-8'
        );
        $('#js_content a').each((index, ele) => {
          writeStream.write($(ele).attr('href'));
          writeStream.write('\n');
        });
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_微信公众号_页面_提取链接_同步.js

// Synchronous variant: extract every <a href> inside #js_content of each
// ./html/*.txt page into a same-named file under ./html/目录/.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  if (file.split('.')[1] === 'txt') {
    // Read the file content.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
    // Build the DOM with cheerio.
    const $ = cheerio.load(fileContent);
    // Create the output stream.
    const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf8');
    // Write one link per line.
    $('#js_content a').each((index, ele) => {
      writeStream.write($(ele).attr('href'));
      writeStream.write('\n');
    });
    // Close the stream.
    writeStream.end();
  }
}

// Previous async version, kept for reference:
// fs.readdir(path, function (err, files) {
//   files.forEach((file) => {
//     console.log(file);
//     if (file.split('.')[1] === 'html') {
//       fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
//         const $ = cheerio.load(data);

//         // 1. extract the table of contents
//         const writeStream = fs.createWriteStream(
//           path + '/目录/' + file,
//           'utf-8'
//         );
//         $('#js_content a').each((index, ele) => {
//           writeStream.write($(ele).attr('href'));
//           writeStream.write('\n');
//         });
//         writeStream.end();
//       });
//     }
//   });
// });

E:\\公众号文章采集\\fi_filter_过滤器\\src\\extract_link_正序.js

// Build a Markdown table of contents for a Sina Blog list page in page order
// (no reversal). Output goes to ./html/目录/新浪博客目录.txt.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] === 'txt') {
      fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
        const $ = cheerio.load(data);
        // const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
        const writeStream = fs.createWriteStream(
          path + '/目录/' + '新浪博客目录.txt',
          'utf-8'
        );
        $('#pl-home-bloglist > article > ul>li').each((data, ele) => {
          let title = $(ele).find('h2').html();
          let url = $(ele).find('a').attr('data-link');
          // Write one Markdown link: [title](url)
          writeStream.write('[');
          writeStream.write(title);
          writeStream.write(']');
          writeStream.write('(');
          writeStream.write(url);
          writeStream.write(')');
          writeStream.write('\n');
          writeStream.write('\n');
          console.log(title);
          console.log(url);
        });
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html.js

// Clean saved WeChat article HTML in place (jsdom + jQuery): strip boilerplate
// text nodes, titles, media, scripts, metadata and comments, then rewrite the
// same file with the filtered markup.
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] === "html") {
      fs.readFile(path + "/" + file, "utf-8", (err, data) => {
        const { window } = new JSDOM(data);
        const $ = require("jQuery")(window);
        const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");
        // p/span elements that carry boilerplate text mixed into the body.
        $("span:contains('***')").remove();
        $("span:contains('--- TBC ---')").remove();
        $("span:contains('支持原创翻译')").remove();
        $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
        $("a:contains('阅读全文')").remove();
        $("p:contains('因此,在《给弟子的忠告》')").remove();
        // Article title.
        $("#activity-name").remove();
        // All audio sections.
        $("section").remove();
        // All h3 headings.
        $("h3").remove();
        // Topic tags.
        $("#js_tags").remove();
        // All images.
        $("img").remove();
        // All scripts.
        $("script").remove();
        // Account name / timestamp block.
        $("div#meta_content").remove();
        // Bottom comment section.
        $("div.comment").remove();
        // Write the cleaned HTML back.
        writeStream.write($("html").html());
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html_cheerio.js

// Cheerio version of the HTML cleaner: rewrite each *.html in place with
// <script> and <link> tags removed. filterContent keeps the (mostly disabled)
// full filter list for reference.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  if (file.split('.')[1] === 'html') {
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);
    const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
    // Filtering:
    // all script tags
    $('script').remove();
    $('link').remove();
    // Write the cleaned HTML back.
    writeStream.write($('html').html());
    writeStream.end();
  }
}

// Reference filter list; only the script removal is active.
function filterContent($) {
  // p/span elements that carry boilerplate text mixed into the body.
  // $("section:contains('相关阅读')").remove();
  // $("span:contains('--- TBC ---')").remove();
  // $("span:contains('支持原创翻译')").remove();
  // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
  // $("a:contains('阅读全文')").remove();
  // $("p:contains('因此,在《给弟子的忠告》')").remove();
  // Article title.
  // $('#activity-name').remove();
  // All audio sections.
  // $('section').remove();
  // All h3 headings.
  // $('h3').remove();
  // Topic tags.
  // $('#js_tags').remove();
  // All images.
  // $('img').remove();
  // All scripts.
  $('script').remove();
  // Account name / timestamp block.
  // $('div#meta_content').remove();
  // Bottom comment section.
  // $('div.comment').remove();
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html_cheerio_听心坊.js

// 听心坊 variant: read each *.html, pull the article's numeric index out of
// the '【明亮说' span, and save a copy whose file name is prefixed with that
// index. The filter list itself is currently disabled.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\听心坊\\`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'html') {
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);

    // Filtering ---------------------------------------------------------
    // p/span elements that carry boilerplate text mixed into the body.
    // $("span:contains('如欲与陈明亮先生交流,请登陆:')").remove();
    // $("span:contains('facebook.com/profile.php?id=100039436871466')").remove();
    // $("span:contains('更多陈明亮的文章,请前往')").remove();
    // $("span:contains('更多的音频')").remove();
    // $("span:contains('摄影')").remove();
    // Numeric index inside the article text, e.g. 【明亮说·NN】.
    // console.log($("span:contains('【明亮说')").text());
    let idx = $("span:contains('【明亮说')").text().split('·')[1];
    console.log(idx);

    // // debug breakpoint
    // writeStream.write($('html').html());
    // writeStream.end();
    // break;
    // // debug breakpoint

    // $("a:contains('阅读全文')").remove();
    // $("p:contains('因此,在《给弟子的忠告》')").remove();
    // Article title.
    // $('#activity-name').remove();
    // All audio sections.
    // $('section').remove();
    // All h3 headings.
    // $('h3').remove();
    // Topic tags.
    // $('#js_tags').remove();
    // All images.
    // $('img').remove();
    // All scripts.
    // $('script').remove();
    // Account name / timestamp block.
    // $('div#meta_content').remove();
    // Bottom comment section.
    // $('div.comment').remove();
    // Write the HTML out, index-prefixed ------------------------------
    const writeStream = fs.createWriteStream(path + '/' + idx + file, 'utf-8');
    writeStream.write($('html').html());
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html_cheerio_旭然之光.js

// 煦然之光 variant: rewrite each *.html in place. All filter rules are
// currently commented out, so this is effectively a parse-and-rewrite pass.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\煦然之光`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] === 'html') {
      fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
        const $ = cheerio.load(data);
        const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
        // p/span elements that carry boilerplate text mixed into the body.
        // $("section:contains('相关阅读')").remove();
        // $("span:contains('--- TBC ---')").remove();
        // $("span:contains('支持原创翻译')").remove();
        // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
        // $("a:contains('阅读全文')").remove();
        // $("p:contains('因此,在《给弟子的忠告》')").remove();
        // Article title.
        // $('#activity-name').remove();
        // All audio sections.
        // $('section').remove();
        // All h3 headings.
        // $('h3').remove();
        // Topic tags.
        // $('#js_tags').remove();
        // All images.
        // $('img').remove();
        // All scripts.
        // $('script').remove();
        // Account name / timestamp block.
        // $('div#meta_content').remove();
        // Bottom comment section.
        // $('div.comment').remove();
        // Write the HTML back.
        writeStream.write($('html').html());
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html_cheerio_阿知事业林.js

// 阿知事业林 variant: rewrite each *.html in place with images, scripts,
// metadata, comments, topic tags, sponsor area and links removed.
const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);

// Iterate over the file list.
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'html') {
    // Read the file content.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
    // Build the DOM with cheerio.
    const $ = cheerio.load(fileContent);
    // Create the output stream (overwrites the same file).
    const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
    // All images.
    $('img').remove();
    // All scripts.
    $('script').remove();
    // ---------------------
    // p/span elements that carry boilerplate text mixed into the body.
    // $("section:contains('相关阅读')").remove();
    // $("span:contains('--- TBC ---')").remove();
    // $("span:contains('支持原创翻译')").remove();
    // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
    // $("a:contains('阅读全文')").remove();
    // $("p:contains('因此,在《给弟子的忠告》')").remove();
    // Author / publish-time block.
    $('#meta_content').remove();
    // Comment section.
    $('.comment').remove();
    // Topic tags.
    $('#js_tags').remove();
    // Sponsor/ad area.
    $('#js_sponsor_ad_area').remove();
    // Hyperlinks.
    $('a').remove();
    // All audio sections.
    // $('section').remove();
    // All h3 headings.
    // $('h3').remove();
    // Topic tags.
    // $('#js_tags').remove();
    // Account name / timestamp block.
    // $('div#meta_content').remove();
    // Bottom comment section.
    // $('div.comment').remove();
    // Write the cleaned HTML back.
    writeStream.write($('html').html());
    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter_html_不死甘露.js

// 不死甘露 variant of the jsdom cleaner: same pipeline as filter_html.js with
// an extended list of boilerplate strings to strip before rewriting in place.
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] === "html") {
      fs.readFile(path + "/" + file, "utf-8", (err, data) => {
        const { window } = new JSDOM(data);
        const $ = require("jQuery")(window);
        const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");

        // p/span elements that carry boilerplate text mixed into the body.
        $("span:contains('***')").remove();
        $("span:contains('--- TBC ---')").remove();
        $("span:contains('支持原创翻译')").remove();
        $("a:contains('阅读全文')").remove();
        $("strong:contains('不死甘露')").remove();
        $("strong:contains('关于永恒的开示录')").remove();
        $("strong:contains('THE NECTAR OF IMMORTALITY')").remove();

        $("span:contains('室利·尼萨迦达塔·马哈拉吉 著')").remove();
        $("span:contains('灵智宝鬘翻译团队 中译')").remove();
        $("p:contains('喜欢作者')").remove();
        $("p:contains('——')").remove();

        // Article title.
        $("#activity-name").remove();
        // All audio sections.
        $("section").remove();
        // All h3 headings.
        $("h3").remove();
        // Topic tags.
        $("#js_tags").remove();
        // All images.
        $("img").remove();
        // All scripts.
        $("script").remove();
        // Account name / timestamp block.
        $("div#meta_content").remove();
        // Bottom comment section.
        $("div.comment").remove();
        // Write the cleaned HTML back.
        writeStream.write($("html").html());
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\html2txt.js

// Convert each saved WeChat article (*.html) to plain text (*.txt) by
// extracting the title (#activity-name) and body (#js_content) via jsdom+jQuery.
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\养猫学习`;

fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] === "html") {
      fs.readFile(path + "/" + file, "utf-8", (err, data) => {
        const { window } = new JSDOM(data);
        const $ = require("jQuery")(window);
        const writeStream = fs.createWriteStream(
          path + "/" + file.split(".")[0] + ".txt",
          "utf-8"
        );
        // Title.
        writeStream.write($("#activity-name").text());
        // Body.
        writeStream.write($("#js_content").text());
        writeStream.end();
      });
    }
  });
});

E:\\公众号文章采集\\fi_filter_过滤器\\src\\html2txt_cheerio.js

// html2txt_cheerio.js — synchronous variant: batch-convert HTML files in a
// folder to plain text using cheerio instead of jsdom/jquery.
const fs = require('fs');
const cheerio = require('cheerio');

let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

// Read the folder; returns an array of entry names.
const fileList = fs.readdirSync(path);

// Walk the file list.
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'html') {
    // Read the file contents.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
    // Load into cheerio to build the DOM tree.
    const $ = cheerio.load(fileContent);
    // Create a write stream for the .txt output.
    const writeStream = fs.createWriteStream(
      path + '/' + file.split('.')[0] + '.txt',
      'utf-8'
    );
    // // Title
    // writeStream.write($('#activity-name').text());
    // // Body
    writeStream.write($('body').text());
    writeStream.write('endendend');

    writeStream.end();
  }
}

E:\\公众号文章采集\\fi_filter_过滤器\\src\\任意路径文件写入.js

const fs = require(\'fs\');

const writeFileRecursive = function (path, buffer, callback) 
  // 前面的文件路径
  let lastPath = path.substring(0, path.lastIndexOf(\'/\'));
  // 递归创建目录
  fs.mkdir(lastPath,  recursive: true , (err) => 
    if (err) return callback(err);
    fs.writeFile(path, buffer, function (err) 
      if (err) return callback(err);
      return callback(null);
    );
  );
;

const buffer = \'hello\';
writeFileRecursive(\'./public/test/test.txt\', buffer, (err) => 
  if (err) console.error(err);
  console.info(\'write success\');
);

E:\\公众号文章采集\\fi_filter_过滤器\\src\\crawler\\crawler.ts

import superagent from "superagent";
import  load, CheerioAPI  from "cheerio";

import  log  from "console";
import  createWriteStream  from "fs";

export default class Crawler 
  private url = ``;
  private $: CheerioAPI;

  constructor() 

  setUrl(url: string) 
    this.url = url;
  

  async init() 
    const res = await superagent.get(this.url);
    this.$ = load(res.text);
  

  save(path: string) 
    const writeStream = createWriteStream(path, "utf-8");
    writeStream.write(this.$("html"));
    writeStream.end();
  
  getTitle() 
    log(this.$("#activity-name").text());
    return this.$("#activity-name").text();
  
  getTime() 
    log(this.$("script:contains(\'function htmlDecode(str)\')").text());
  

  getContent() 
    // 萨特桑指出
    const quotes = this.$("span:contains(\'萨特桑指出\')");
    return quotes.text();
  


E:\\公众号文章采集\\fi_filter_过滤器\\src\\crawler\\index.ts

import { log } from "console";
import Crawler from "./crawler";

// Entry point: fetch one article and print its publish-time script.
// Top-level await requires this file to be compiled as an ES module.
const crawler = new Crawler();

crawler.setUrl("https://mp.weixin.qq.com/s/EgZhFJTzsgfYzZZ-4_SI4Q");
await crawler.init();
crawler.getTime();
// crawler.getTitle();
// crawler.save("");

// const content = crawler.getContent();

// log(content);

E:\\公众号文章采集\\fi_filter_过滤器\\src\\filter\\01_灵智宝鬘_话题_尼萨迦达塔.ts

import  log  from "console";
import 
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
 from "fs";
import  basename, extname, join, resolve  from "path";
import  load, CheerioAPI  from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// const basePath = process.cwd();
// E:\\公众号文章采集\\公众号HTML\\灵智宝鬘
const basePath = "E:\\\\公众号文章采集\\\\公众号HTML\\\\灵智宝鬘";
const outPath = join(basePath, "out");

try 
  exitsFolder(outPath);
 catch (e) 
  log(e);


// 读取文件夹,返回一个文件列表数组
const fileList = readdirSync(basePath);

const pureFilePathList = fileList
  .filter((fileName) => 
    return lstatSync(join(basePath, fileName)).isFile();
  )
  .filter((fileName) => 
    const fileExt = extname(fileName);
    return fileExt === ".html";
  )
  .map((fileName) => 
    return join(basePath, fileName);
  );

// pureFilePathList.forEach((filePath) => 
//   extractTopic(filePath);
// );

for (let filePath of pureFilePathList) 
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  const outFilePath = getOutFilePath(filePath);
  const writeStream = createWriteStream(outFilePath, "utf-8");
  writeStream.write($("html").html());
  writeStream.end();


// ====================================================================================================================
// 移除标签的原则,尽量少移除p标签,有可能会误把正文内容移除
function filterDom($: CheerioAPI) 
  // 话题标签
  $("#js_tags").remove();
  // 包含特定文字的span标签
  $("span:contains(\'灵智宝鬘翻译团队 中译\')").remove();
  // 
  $(
    "p[style*=\'white-space: normal;text-align: center;\']:contains(\'我是那\')"
  ).remove();
  // 室利·尼萨迦达塔·马哈拉吉的开示录
  $(
    "p[style*=\'white-space: normal;text-align: center;\']:contains(\'室利·尼萨迦达塔·马哈拉吉的开示录\')"
  ).remove();
  // 文字颜色是 color: rgb(136, 136, 136)   ,且包含" 室利·尼萨迦达塔·马哈拉吉 著"的span标签
  $(
    "span[style*=\'color: rgb(136, 136, 136)\']:contains(\'室利·尼萨迦达塔·马哈拉吉 著\')"
  ).remove();
  // 红色的span和strong标签
  $("span[style*=\'color: rgb(255, 76, 65)\']").remove();
  $("strong[style*=\'color: rgb(255, 76, 65)\']").remove();
  // 类名是comment的div标签
  $("div.comment").remove();


function loadHtmlDom(filePath: string): CheerioAPI 
  const htmlText = readFileSync(filePath, "utf-8");
  return load(htmlText);


function extractLink($: CheerioAPI) 
  const oLinkList = $("#js_articles > div");
  if (!oLinkList.length) return [];
  const linkArr: string[] = [];
  oLinkList.each((i, oLink) => 
    const url = $(oLink).attr("data-jump_url");
    if (!url) return;
    linkArr.push(url);
  );

  return linkArr;


function exitsFolder(absPath: string) 
  try 
    statSync(absPath);
   catch (e) 
    // 不存在文件夹,直接创建 recursive: true 这个配置项是配置自动创建多个文件夹
    mkdirSync(absPath,  recursive: true );
  


function getCurDate() 
  const d_t = new Date();

  let year = d_t.getFullYear();
  let month = ("0" + (d_t.getMonth() + 1)).slice(-2);
  let day = ("0" + d_t.getDate()).slice(-2);
  let hour = d_t.getHours();
  let minute = d_t.getMinutes();
  let second = d_t.getSeconds();

  // prints date & time in YYYY-MM-DD HH:MM:SS format
  return (
    year +
    "年" +
    month +
    "月" +
    day +
    "日" +
    hour +
    "时" +
    minute +
    "分" +
    second +
    "秒"
  );


function getOutFilePath(filePath: string) 
  return join(outPath, basename(filePath));


E:\\公众号文章采集\\fi_filter_过滤器\\src\\topic\\01_非推送_链接_一行一个.ts

import  log  from "console";
import 
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
 from "fs";
import  basename, extname, join, resolve  from "path";
import  load, CheerioAPI  from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
const basePath = process.cwd();
const outPath = join(basePath, "out");
try 
  exitsFolder(outPath);
 catch (e) 
  log(e);


// 读取文件夹,返回一个文件列表数组
const fileList = readdirSync(basePath);
const pureFilePathList = fileList
  .filter((fileName) => 
    return lstatSync(join(basePath, fileName)).isFile();
  )
  .filter((fileName) => 
    const fileExt = extname(fileName);
    return fileExt === ".txt" || fileExt === ".html";
  )
  .map((fileName) => 
    return join(basePath, fileName);
  );

pureFilePathList.forEach((filePath) => 
  extractTopic(filePath);
);

function extractTopic(filePath: string) 
  const $: CheerioAPI = loadHtmlDom(filePath);
  const urlArr = extractLink($);
  const outFilePath = getOutFilePath(filePath);
  const writeStream = createWriteStream(outFilePath, "utf-8");
  urlArr.forEach((url) => 
    writeStream.write(url);
    writeStream.write("\\n");
  );
  writeStream.end();


function loadHtmlDom(filePath: string): CheerioAPI 
  const htmlText = readFileSync(filePath, "utf-8");
  return load(htmlText);


function extractLink($: CheerioAPI) 
  const oLinkList = $(
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li"
  );

  const linkArr: string[] = [];

  oLinkList.each((i, oLink) => 
    const url = $(oLink).attr("data-link");
    linkArr.push(url ? url : "");
  );

  return linkArr;


function exitsFolder(absPath: string) 
  try 
    statSync(absPath);
   catch (e) 
    // 不存在文件夹,直接创建 recursive: true 这个配置项是配置自动创建多个文件夹
    mkdirSync(absPath,  recursive: true );
  


function getCurDate() 
  const d_t = new Date();

  let year = d_t.getFullYear();
  let month = ("0" + (d_t.getMonth() + 1)).slice(-2);
  let day = ("0" + d_t.getDate()).slice(-2);
  let hour = d_t.getHours();
  let minute = d_t.getMinutes();
  let second = d_t.getSeconds();

  // prints date & time in YYYY-MM-DD HH:MM:SS format
  return (
    year +
    "年" +
    month +
    "月" +
    day +
    "日" +
    hour +
    "时" +
    minute +
    "分" +
    second +
    "秒"
  );


function getOutFilePath(filePath: string) 
  return join(
    outPath,
    getCurDate() + "_目录_" + basename(filePath).split(".")[0] + ".txt"
  );


E:\\公众号文章采集\\fi_filter_过滤器\\src\\topic\\02_推送_链接_一行一个.ts

import  log  from "console";
import 
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
 from "fs";
import  basename, extname, join, resolve  from "path";
import  load, CheerioAPI  from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
const basePath = process.cwd();
const outPath = join(basePath, "out");
try 
  exitsFolder(outPath);
 catch (e) 
  log(e);


// 读取文件夹,返回一个文件列表数组
const fileList = readdirSync(basePath);
const pureFilePathList = fileList
  .filter((fileName) => 
    return lstatSync(join(basePath, fileName)).isFile();
  )
  .filter((fileName) => 
    const fileExt = extname(fileName);
    return fileExt === ".txt" || fileExt === ".html";
  )
  .map((fileName) => 
    return join(basePath, fileName);
  );

pureFilePathList.forEach((filePath) => 
  extractTopic(filePath);
);

function extractTopic(filePath: string) 
  const $: CheerioAPI = loadHtmlDom(filePath);
  const urlArr = extractLink($)!;
  if (!urlArr.length) return;

  const outFilePath = getOutFilePath(filePath);
  const writeStream = createWriteStream(outFilePath, "utf-8");
  urlArr.forEach((url) => 
    writeStream.write(url);
    writeStream.write("\\n");
  );
  writeStream.end();


function loadHtmlDom(filePath: string): CheerioAPI 
  const htmlText = readFileSync(filePath, "utf-8");
  return load(htmlText);


function extractLink($: CheerioAPI) 
  const oLinkList = $("#js_articles > div");
  if (!oLinkList.length) return [];
  const linkArr: string[] = [];
  oLinkList.each((i, oLink) => 
    const url = $(oLink).attr("data-jump_url");
    if (!url) return;
    linkArr.push(url);
  );

  return linkArr;


function exitsFolder(absPath: string) 
  try 
    statSync(absPath);
   catch (e) 
    // 不存在文件夹,直接创建 recursive: true 这个配置项是配置自动创建多个文件夹
    mkdirSync(absPath,  recursive: true );
  


function getCurDate() 
  const d_t = new Date();

  let year = d_t.getFullYear();
  let month = ("0" + (d_t.getMonth() + 1)).slice(-2);
  let day = ("0" + d_t.getDate()).slice(-2);
  let hour = d_t.getHours();
  let minute = d_t.getMinutes();
  let second = d_t.getSeconds();

  // prints date & time in YYYY-MM-DD HH:MM:SS format
  return (
    year +
    "年" +
    month +
    "月" +
    day +
    "日" +
    hour +
    "时" +
    minute +
    "分" +
    second +
    "秒"
  );


function getOutFilePath(filePath: string) 
  return join(
    outPath,
    getCurDate() + "_目录_" + basename(filePath).split(".")[0] + ".txt"
  );


E:\\公众号文章采集\\fi_filter_过滤器\\src\\txt\\01_合集.ts

import  log  from "console";
import 
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
 from "fs";
import  basename, extname, join, resolve  from "path";
import  load, CheerioAPI  from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// const basePath = process.cwd();
// E:\\公众号文章采集\\公众号HTML\\灵智宝鬘
const basePath = "E:\\\\公众号文章采集\\\\公众号HTML\\\\灵智宝鬘\\\\out";
const outFileName = "灵智宝鬘_尼萨迦达塔_我是那";
const outPath = join(basePath, "txt");

try 
  exitsFolder(outPath);
 catch (e) 
  log(e);


// 读取文件夹,返回一个文件列表数组
const fileList = readdirSync(basePath);

const pureFilePathList = fileList
  .filter((fileName) => 
    return lstatSync(join(basePath, fileName)).isFile();
  )
  .filter((fileName) => 
    const fileExt = extname(fileName);
    return fileExt === ".html";
  )
  .map((fileName) => 
    return join(basePath, fileName);
  );

// pureFilePathList.forEach((filePath) => 
//   extractTopic(filePath);
// );

const outFilePath = getOutFilePath();
const writeStream = createWriteStream(outFilePath, "utf-8");

for (let [index, filePath] of pureFilePathList.entries()) 
  const $: CheerioAPI = loadHtmlDom(filePath);
  const textContent = extractText($);

  writeStream.write("\\n");
  writeStream.write(`第$index + 1章`);
  writeStream.write("\\n");

  writeStream.write(textContent.title);
  writeStream.write("\\n");

  writeStream.write(textContent.pubDate);
  writeStream.write("\\n");

  writeStream.write(textContent.content);
  writeStream.write("\\n");
  log(`$index_$filePath`);

writeStream.end();

// ====================================================================================================================

function loadHtmlDom(filePath: string): CheerioAPI 
  const htmlText = readFileSync(filePath, "utf-8");
  return load(htmlText);


function exitsFolder(absPath: string) 
  try 
    statSync(absPath);
   catch (e) 
    // 不存在文件夹,直接创建 recursive: true 这个配置项是配置自动创建多个文件夹
    mkdirSync(absPath,  recursive: true );
  


function getOutFilePath() 
  return join(outPath, outFileName + ".txt");


function extractText($: CheerioAPI) 
  // #activity-name
  const title = $("#activity-name").text();
  // #publish_time
  const pubDate = $("#publish_time").text();
  const content = $("#js_content").text();
  const splitContent = handleContent(content);
  return 
    title,
    pubDate,
    content: splitContent,
  ;


function handleContent(content: string) 
  return content
    .replace("尼萨迦达塔:", replaceContent("尼萨迦达塔:", 1000))
    .replace("尼:", replaceContent("尼萨迦达塔:", 1000))
    .replace("提问者:", replaceContent("提问者:", 1000))
    .replace("问:", replaceContent("提问者:", 1000));


function replaceContent(keyword: string, time: number = 1000) 
  return `\\n[p$time.toString()]\\n$keyword\\n`;


微信公众号的文章爬取有三种方式


a. 通过微信订阅号后台在发布文章时查找公众号的文章,方式见微信链接,但阅读数、点赞数、评论数仍无法抓取。

b. 通过搜狗微信搜索微信公众号,但是文章篇幅仍然有限制,点赞数、阅读数和评论数无法抓取。

c. 通过“中间人方式”对数据进行拦截,过滤解析后进行抓取。

这里就是利用第三种(c)方式对数据进行抓取。

思路:

1. 安装代理AnProxy,在手机端安装CA证书,启动代理,设置手机代理;

2. 获取目标微信公众号的__biz;

3. 进入微信公众号的历史页面;

4. 使用Monkeyrunner控制滑屏;获取更多的历史消息;

5. 记录文章标题,摘要,创建时间,创作类型,地址等等;

6. 文章列表获取完成后,利用Monkeyrunner逐一打开列表中的文章;

7. 记录文章的阅读数,点赞数,评论数等;

8. 重复以上操作。

以上是关于微信公众号 过滤 typescript cheerio的主要内容,如果未能解决你的问题,请参考以下文章

微信公众号推送的图文消息里面的正文可以插入html标签吗?比如iframe a

微信公众号的文章爬取有三种方式

微信公众号缓存问题解决方案

如何用手机添加微信公众号?怎么用微信添加微信公众号?

微信公众号怎么搜索不了 微信搜索不到公众号是啥原因

微信公众号如何登陆