javascript Node.js的Naver新闻抓取工具

Posted

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 JavaScript Node.js 的 Naver 新闻抓取工具相关的知识，希望对你有一定的参考价值。

const fs = require('fs');
const jsdom = require('jsdom');
const parallel = require('parallel-tasks');
const path = require('path');
const request = require('request');

const { JSDOM } = jsdom;

/**
 * Renders a date as a compact `YYYYMMDD` string.
 *
 * The year, month and day are read through the local-time getters of `date`;
 * month and day are left-padded with `0` to two digits.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The formatted date, e.g. `20200105`.
 */
const formatDate = (date) => {
  const year = date.getFullYear();
  // getMonth() is zero-based, so shift it to the 1-12 range before padding.
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
};

// Maps a newspaper name (Korean) to the numeric id that NewsCrawler zero-pads
// to three digits and sends as the `oid` query parameter of Naver News URLs.
// The trailing comments are romanizations of the keys, for readers who do not
// read Korean.
const NewsType = {
  경향신문: 32, // Kyunghyang Shinmun
  국민일보: 5, // Kukmin Ilbo
  동아일보: 20, // Dong-A Ilbo
  문화일보: 21, // Munhwa Ilbo
  서울신문: 81, // Seoul Shinmun
  세계일보: 22, // Segye Ilbo
  조선일보: 23, // Chosun Ilbo
  중앙일보: 25, // JoongAng Ilbo
  한겨레: 28, // Hankyoreh
  한국일보: 469, // Hankook Ilbo
};

/**
 * Scraper for Naver News article pages.
 *
 * All members are static. `getLastestArticleId` looks up the id of the first
 * article in today's listing for a newspaper; `run` downloads a descending
 * range of article ids and stores the title and plain-text body of every page
 * that could be parsed as `news<index>.json` in a directory named after the
 * newspaper, next to this file.
 */
class NewsCrawler {
  /**
   * Resolves a newspaper name to the zero-padded, 3-digit `oid` value used in
   * Naver News URLs.
   *
   * @param {string} type - A key of `NewsType`.
   * @returns {string} The 3-digit id, e.g. `'032'`.
   * @throws {Error} If `type` is not a key of `NewsType`.
   */
  static resolveTypeId(type) {
    if (!Object.prototype.hasOwnProperty.call(NewsType, type)) {
      throw new Error(`Unknown news type: ${type}`);
    }
    return NewsType[type].toString().padStart(3, '0');
  }

  /**
   * Extracts the numeric `aid` query parameter from an article link.
   *
   * Only the digits directly after `aid=` are read, so further query
   * parameters following `aid` do not affect the result.
   *
   * @param {string} href - Article URL (absolute or relative).
   * @returns {?number} The article id, or `null` when `href` has no numeric `aid`.
   */
  static parseArticleId(href) {
    const match = /[?&]aid=(\d+)/.exec(String(href));
    return match ? Number(match[1]) : null;
  }

  /**
   * Converts the inner HTML of an article body into plain text.
   *
   * @param {string} html - Inner HTML of the article body element.
   * @returns {string} The text with comments and tags removed, trimmed.
   */
  static cleanArticleHtml(html) {
    return html
      // Non-greedy so that text sitting between two comments is kept; also
      // covers comments that span several lines.
      .replace(/<!--[\s\S]*?-->/g, '')
      // Drops the first `//` through end-of-line run only (no `g` flag);
      // presumably the comment line of the inline script block — confirm.
      .replace(/\/\/[^\n]*\n/, '')
      // Runs of <br> become a single line break.
      .replace(/(?:<br>)+/g, '\n')
      // Strip every remaining tag, including tags broken across lines.
      .replace(/<(?:.|\n)*?>/gm, '')
      // Leftover body of the inline script once its tags have been stripped.
      .replace('function _flash_removeCallback() {}', '')
      .trim();
  }

  /**
   * Fetches today's article listing of a newspaper and returns the id of the
   * first article linked in it.
   *
   * @param {string} type - A key of `NewsType`.
   * @returns {Promise<number>} Resolves with the article id. Rejects when
   *   `type` is unknown, the request fails, or the listing contains no article
   *   link with a numeric `aid`.
   */
  static getLastestArticleId(type) {
    return new Promise((resolve, reject) => {
      // A throw inside this synchronous executor turns into a rejection.
      const typeId = NewsCrawler.resolveTypeId(type);
      const listUrl = `http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&oid=${typeId}&listType=title&date=${formatDate(new Date())}`;
      request.get(listUrl, (err, httpResponse, body) => {
        if (err) {
          reject(err);
          return;
        }
        // Anything thrown in this callback would escape the promise, so parse
        // inside try/catch and route failures to reject().
        try {
          const { document } = (new JSDOM(body)).window;
          const link = document.querySelector('.list_body > ul.type02 > li > a');
          const articleId = link ? NewsCrawler.parseArticleId(link.href) : null;
          if (articleId === null) {
            reject(new Error(`No article link found in today's listing for ${type}`));
            return;
          }
          resolve(articleId);
        } catch (parseErr) {
          reject(parseErr);
        }
      });
    });
  }

  /**
   * Downloads `articleAmount` article pages, starting at `articleId` and
   * counting down, and writes each parsable one to `<type>/news<index>.json`
   * as `{ title, content }`.
   *
   * The crawl is best-effort: a page that cannot be fetched, parsed or saved
   * is reported on stderr and skipped; pages without the expected title/body
   * elements are skipped silently.
   *
   * @param {string} type - A key of `NewsType`; also the output directory name.
   * @param {number} articleId - Highest article id to fetch.
   * @param {number} [articleAmount=100] - Number of consecutive ids to fetch.
   * @returns {Promise<boolean>} Resolves with `true` once every task finished.
   *   Rejects when `type` is unknown or the output directory cannot be created.
   */
  static async run(type, articleId, articleAmount = 100) {
    const typeId = NewsCrawler.resolveTypeId(type);
    const newsDir = path.join(__dirname, type);
    if (!fs.existsSync(newsDir)) {
      fs.mkdirSync(newsDir);
    }
    const urls = Array.from(
      { length: articleAmount },
      (unused, index) => `http://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=${typeId}&aid=${(articleId - index).toString().padStart(10, '0')}`,
    );
    // Each task is a thunk so that parallel.run decides when a request starts.
    const createTask = (url, index) => () => new Promise((done) => {
      console.log(url);
      // `encoding: null` asks `request` for the raw bytes instead of a decoded
      // string; presumably so jsdom can detect the page charset — confirm.
      request.get({ url, encoding: null }, (err, httpResponse, body) => {
        if (err) {
          console.error(`Failed to fetch ${url}: ${err.message}`);
          done();
          return;
        }
        try {
          const { document } = (new JSDOM(body)).window;
          const titleElement = document.querySelector('.article_header > .article_info > #articleTitle');
          const contentElement = document.querySelector('#articleBody > #articleBodyContents');
          if (titleElement && contentElement) {
            const title = titleElement.innerHTML.trim();
            const content = NewsCrawler.cleanArticleHtml(contentElement.innerHTML);
            fs.writeFileSync(path.join(newsDir, `news${index}.json`), JSON.stringify({ title, content }));
          }
        } catch (saveErr) {
          // Keep the crawl going; one bad page must not take the process down.
          console.error(`Failed to process ${url}: ${saveErr.message}`);
        }
        done();
      });
    });
    await parallel.run(urls.map(createTask));
    return true;
  }
}

// Script entry point: crawls 1000 articles of the Kyunghyang Shinmun
// ('경향신문'), counting down from the hard-coded article id 2854930.
// To start from today's newest article instead, await
// NewsCrawler.getLastestArticleId('경향신문') and pass its result as the id.
// NOTE(review): this runs whenever the file is loaded, including through
// `require` — consider guarding it with `require.main === module`.
(async () => {
  try {
    await NewsCrawler.run('경향신문', 2854930, 1000);
  } catch (err) {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
  }
})();

module.exports = NewsCrawler;

以上是关于 JavaScript Node.js 的 Naver 新闻抓取工具的主要内容，如果未能解决你的问题，请参考以下文章：

新 V8 即将推出和 Node.js

新 V8 为 NODE.JS 带来的性能变化

使用Chrome DevTools直接调试Node.js与JavaScript(并行)

准备:新V8即将到来,Node.js的性能正在改变

Node.js

node.js 初体验