Node.js——简单小说爬虫实现

Posted 2020-09-13 tgxh的博客

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Nodejs——简单小说爬虫实现相关的知识，希望对你有一定的参考价值。

 1 //引入模块
 2 const http = require(\'http\')
 3 const fs = require(\'fs\')
 4 const cheerio = require(\'cheerio\')
 5 const iconv = require(\'iconv-lite\')
 6 //第一章url
 7 const url = \'http://www.81zw.com/book/8634/745331.html\'
 8 //开始章节数
 9 let i = 1
10 //最大获取章节数
11 let num = 100
12 
/**
 * Crawl entry point: hands the given chapter URL to startRequest().
 *
 * startRequest() calls back into this function for every following chapter.
 *
 * @param {string} url - URL of the chapter page to fetch.
 */
function main(url) {
    startRequest(url)
}
16 
/**
 * Fetch one chapter page, save it, and schedule the fetch of the next chapter.
 *
 * The page is downloaded as raw bytes, decoded from GBK, parsed with cheerio and
 * split into paragraphs. The chapter object is handed to saveContent(); the
 * next-chapter link is then followed through main() until the global counter
 * `i` exceeds `num`. The crawl also stops (with a logged message instead of an
 * uncaught exception) when the request fails, the server does not answer 200,
 * the page has no #content element, or there is no next-chapter link.
 *
 * @param {string} url - Absolute URL of the chapter page to download.
 */
function startRequest(url) {
    const request = http.get(url, res => {
        if (res.statusCode !== 200) {
            // Discard the body so the socket is released, and do not try to parse an error page
            res.resume()
            console.error(`Request failed with status ${res.statusCode}: ${url}`)
            return
        }
        // Raw Buffer chunks; decoding happens once at the end so a multi-byte GBK
        // character split across two chunks is never corrupted
        const chunks = []
        res.on('data', chunk => {
            chunks.push(chunk)
        })
        res.on('error', err => {
            console.error(`Response error for ${url}: ${err.message}`)
        })
        res.on('end', () => {
            // iconv-lite decodes a single Buffer, so join the chunks first
            const page = iconv.decode(Buffer.concat(chunks), 'gbk')
            // cheerio exposes a jQuery-like API over the parsed HTML
            const $ = cheerio.load(page, {decodeEntities: false})
            const content = $('#content').html()
            if (content == null) {
                // Not a chapter page (e.g. the book index after the last chapter)
                console.error(`No #content element found on ${url}; stopping`)
                return
            }
            const title = $('.bookname h1').text()
            const bookName = $('.con_top a').eq(2).text()
            // Paragraphs are separated by double line breaks in the page markup
            const paragraphs = content.split('<br><br>').map(part => trim(part))
            // Object that gets persisted for this chapter
            const obj = {
                id: i,
                err: 0,
                bookName,
                title,
                content: paragraphs
            }

            // The third link in .bottem2 points to the next chapter; resolve it against
            // the current page so relative and absolute hrefs both work
            const href = $('.bottem2 a').eq(2).attr('href')
            const nextLink = href ? new URL(href, url).href : null
            saveContent(obj, nextLink)
            if (nextLink === null) {
                console.error(`No next-chapter link found on ${url}; stopping`)
                return
            }
            console.log(`第${i + 1}章：${nextLink}`)
            i++
            if (i <= num) {
                // Short pause between requests before fetching the next chapter
                setTimeout(() => {
                    main(nextLink)
                }, 100)
            }
        })
    })
    // Without this listener a DNS/connection failure is an unhandled 'error' event
    request.on('error', err => {
        console.error(`Request error for ${url}: ${err.message}`)
    })
}
66 
/**
 * Persist one chapter as ./data/<bookName>/chapter<id>.json.
 *
 * The book folder (and a missing ./data parent) is created on demand. The file
 * name and the progress log use `obj.id`, so the function does not depend on
 * the module-level chapter counter.
 *
 * NOTE(review): `obj.bookName` comes from the scraped page and is used verbatim
 * as a folder name — consider sanitizing path separators.
 *
 * @param {{id: number, bookName: string, title: string}} obj - Chapter object; serialized whole as JSON.
 * @param {string} nextLink - URL of the next chapter; accepted for caller compatibility, not used here.
 */
function saveContent(obj, nextLink) {
    console.log(`${obj.id}--${obj.title}`)
    const dir = `./data/${obj.bookName}`
    // recursive: true also creates ./data when it is missing and does nothing if the folder exists
    fs.mkdirSync(dir, {recursive: true})
    // Write the chapter as a JSON file
    fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
        if (err) throw err
    })
}
78 
/**
 * Remove every `&nbsp;` entity from an HTML fragment and strip the whitespace
 * at both ends.
 *
 * Entities are removed before trimming so that whitespace sitting next to a
 * leading or trailing `&nbsp;` is stripped as well.
 *
 * @param {string} str - Raw HTML fragment.
 * @returns {string} The fragment without `&nbsp;` entities and without surrounding whitespace.
 */
function trim(str) {
    return str.replace(/&nbsp;/g, '').trim()
}
82 
//Start the crawl from the first-chapter URL defined at the top of the script
83 main(url)

生成文件：脚本运行后，每一章会保存为 data/<书名>/chapter<章节序号>.json。

以上是关于Nodejs——简单小说爬虫实现的主要内容，如果未能解决你的问题，请参考以下文章