Node.js:用 http 模块和 cheerio 模块实现小爬虫
Posted cb_za
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何用 Node.js 的 http 模块和 cheerio 模块实现一个小爬虫,希望对你有一定的参考价值。
代码:
var http = require("http");
var cheerio = require("cheerio");

// Course page whose chapter/video list is scraped.
var url = 'http://www.imooc.com/learn/348';

/**
 * Removes carriage returns / line feeds and trims surrounding whitespace.
 *
 * @param {string} text - Raw text extracted from the page.
 * @returns {string} The text with CR/LF removed and both ends trimmed.
 */
function cleanText(text) {
    return text.replace(/\r|\n/g, "").trim();
}

/**
 * Extracts the chapter/video structure from the course page markup.
 *
 * Each `.chapter` element yields one entry; its title is read from
 * `h3 strong` and its videos from the `li` children of `.video`.
 * List items that carry no `.J-media-item` link are skipped instead of
 * crashing on a missing `href`.
 *
 * @param {string} html - Full HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One object per chapter, in document order.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = [];

    // All chapters on the page.
    $('.chapter').each(function () {
        var chapter = $(this);
        var chapterData = {
            chapterTitle: cleanText(chapter.find('h3 strong').text()),
            videos: []
        };

        chapter.find(".video").children('li').each(function () {
            var video = $(this).find('.J-media-item');
            var href = video.attr('href');

            // attr() returns undefined when the link is absent; calling
            // split() on it would throw, so ignore such list items.
            if (!href) {
                return;
            }

            chapterData.videos.push({
                title: cleanText(video.text()),
                // The id is whatever follows "video/" in the link target.
                id: href.split('video/')[1]
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Prints every chapter title followed by its videos as "[id]title" lines.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>} courseData
 *   Structure produced by filterChapters.
 * @returns {void}
 */
function printCourseInfo(courseData) {
    courseData.forEach(function (item) {
        console.log(item.chapterTitle + '\n');

        item.videos.forEach(function (video) {
            console.log(' [' + video.id + ']' + video.title);
        });
    });
}

// Fetch the page, then parse and print it once the whole body has arrived.
http.get(url, function (res) {
    if (res.statusCode !== 200) {
        // Drain the body so the socket is released, then report the failure
        // rather than parsing a redirect or error page as course data.
        res.resume();
        console.error("获取课程数据出错!", "HTTP " + res.statusCode);
        return;
    }

    // Decode as UTF-8 at the stream level so multi-byte characters that
    // straddle two chunks are not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');

    var html = '';

    res.on('data', function (chunk) {
        html += chunk;
    });

    res.on('end', function () {
        var courseData = filterChapters(html);

        printCourseInfo(courseData);
        console.log(courseData);
    });
}).on('error', function (err) {
    // Include the underlying reason instead of discarding the error object.
    console.error("获取课程数据出错!", err.message);
});
运行:
----------------------------------------------------------------------
参考链接:
以上是关于用 Node.js 的 http 模块和 cheerio 模块实现小爬虫的主要内容。如果未能解决你的问题,请参考以下文章: