nodejs小爬虫

来源：互联网发布：想开淘宝店怎么找货源编辑：程序博客网时间：2024/05/18 00:34

最近看到很多前端招聘都要求会 node，于是狠下心抽空看了教程。看到慕课网的基础教程里有个简单版本的爬虫，便照着练习了一遍。

现在放出我自己修改后的源码。目前只能抓取一个页面，等以后深入学习了再研究多个页面的抓取。

相关 js 文件如下，运行前需要先安装 cheerio 模块（npm install cheerio）：

// imooc Node.js crawler exercise: fetch one course page and print its
// chapter / video outline to the console.

// Built-in HTTP client.
const http = require('http');
// cheerio: jQuery-like HTML parser (third-party, must be installed).
const cheerio = require('cheerio');

// Page to crawl.
const url = 'http://www.imooc.com/learn/348';

/**
 * Extract the chapter / video outline from the course page markup.
 *
 * Every `.chapter` element becomes one entry; its title is the text of the
 * nested `<strong>` with all whitespace removed. Every `.video li` inside the
 * chapter contributes one video, read from its `.J-media-item` link.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>}
 *   One object per chapter, in document order.
 */
function filterChapter(html) {
    const $ = cheerio.load(html);
    const courseData = [];

    $('.chapter').each(function () {
        const chapter = $(this);
        const chapterData = {
            chapterTitle: chapter.find('strong').text().replace(/\s/g, ''),
            videos: []
        };

        chapter.find('.video li').each(function () {
            const video = $(this).find('.J-media-item');
            const href = video.attr('href');

            // An <li> without a link, or a link without a "video/" segment,
            // has no id to extract; skip it instead of throwing a TypeError.
            if (!href || href.indexOf('video/') === -1) {
                return;
            }

            chapterData.videos.push({
                // Link text without whitespace and without the "开始学习" label.
                title: video.text().replace(/\s|(开始学习)/g, ''),
                // The id is whatever follows "video/" in the href.
                id: href.split('video/')[1].replace(/\s/g, '')
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Print the outline: each chapter title surrounded by blank lines, followed
 * by one "[id]    title" line per video.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 *   Result of filterChapter.
 */
function printInfo(courseData) {
    courseData.forEach(function (item) {
        console.log('\n' + item.chapterTitle + '\n');

        item.videos.forEach(function (v) {
            console.log('[' + v.id + ']' + '    ' + v.title);
        });
    });
}

http.get(url, function (res) {
    // Decode as UTF-8 at the stream level so that a multi-byte character
    // split across two chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');

    let html = '';

    res.on('data', function (data) {
        html += data;
    });

    res.on('end', function () {
        printInfo(filterChapter(html));
    });

    // Failure while reading the response body.
    res.on('error', function () {
        console.log('获取数据错误');
    });
}).on('error', function () {
    // Connection-level failures (DNS lookup, refused or reset connection)
    // are emitted on the request object, not on the response.
    console.log('获取数据错误');
});

--------------------------------------2017-10-29---------------------------

这一次为 node 爬虫增加两个方法：文章内容保存和图片保存，需要另外安装 cheerio 模块和 request 模块。不得不吐槽一下，相关的 node 爬虫教程实在太少。

保存文件：

 // Append the text `x` to ./data/<chapterTitle>.txt (UTF-8).
 // A failed write is only logged; nothing is retried or rethrown.
 fs.appendFile('./data/' + chapterTitle + '.txt', x, 'utf-8', function onAppended(err) {
     if (!err) {
         return;
     }
     console.log(err);
 });

保存图片：

/**
 * Download a fixed image and store it locally as
 * ./image/<news_title>---img.jpg.
 *
 * The image is streamed straight from the HTTP response into the file.
 * Errors on either side (download or file write, e.g. when ./image does not
 * exist) are logged instead of being left as unhandled 'error' events.
 *
 * @param {*} $ - Unused; kept so existing callers keep working.
 * @param {string} news_title - Title used as the file-name prefix.
 */
function savedImg($, news_title) {
    // URL of the image to download (hard-coded).
    const img_src = 'http://img3.mukewang.com/5344e6d10001867401400140-80-80.jpg';

    const logError = function (err) {
        console.log(err);
    };

    const output = fs.createWriteStream('./image/' + news_title + '---' + 'img.jpg');
    output.on('error', logError);

    // A single GET is enough: its own 'error' event reports download
    // failures, so no separate HEAD request is issued beforehand.
    request(img_src)
        .on('error', logError)
        .pipe(output);
}

完整代码：

// imooc crawler exercise, second iteration: besides parsing the course
// outline it writes placeholder text files and downloads one image.

// Built-in HTTP client.
const http = require('http');
// cheerio: third-party HTML parser.
const cheerio = require('cheerio');
// File-system module, used to persist the scraped data.
const fs = require('fs');
// request: third-party HTTP client, used for the image download.
const request = require('request');

// Start URL (the original author notes the site did not seem to block
// crawlers, which makes it convenient for practice).
const url = 'http://www.imooc.com/learn/348';

// cheerio handle of the most recently parsed page; assigned by
// filterChapter and handed to savedImg by printInfo.
let $ = '';

/**
 * Extract the chapter / video outline from the course page markup.
 *
 * Every `.chapter` element becomes one entry; its title is the text of the
 * nested `<strong>` with all whitespace removed. Every `.video li` inside the
 * chapter contributes one video, read from its `.J-media-item` link.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>}
 *   One object per chapter; each video `id` is the link's href prefixed with
 *   'http://www.imooc.com'.
 */
const filterChapter = (html) => {
    $ = cheerio.load(html);
    const courseData = [];

    $('.chapter').each(function () {
        const chapter = $(this);
        const chapterData = {
            chapterTitle: chapter.find('strong').text().replace(/\s/g, ''),
            videos: []
        };

        chapter.find('.video li').each(function () {
            const video = $(this).find('.J-media-item');
            const href = video.attr('href');

            // Without an href the concatenation below would produce the
            // bogus id "http://www.imooc.comundefined"; skip such entries.
            if (!href) {
                return;
            }

            chapterData.videos.push({
                // Link text without whitespace and without the "开始学习" label.
                title: video.text().replace(/\s|(开始学习)/g, ''),
                id: 'http://www.imooc.com' + href
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
};

/**
 * Persist the outline: for every video, append a placeholder text to
 * ./data/<chapterTitle>.txt, then download one image named after the last
 * chapter.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 *   Result of filterChapter.
 */
function printInfo(courseData) {
    // Placeholder content written once per video.
    const x = '测试内容，2017-10-29';
    let lastChapterTitle;

    courseData.forEach(function (item) {
        const chapterTitle = item.chapterTitle;
        lastChapterTitle = chapterTitle;

        item.videos.forEach(function () {
            fs.appendFile('./data/' + chapterTitle + '.txt', x, 'utf-8', (err) => {
                if (err) {
                    console.log(err);
                }
            });
        });
    });

    // With no chapters there is no title to name the image after; skip the
    // download rather than writing "undefined---img.jpg".
    if (lastChapterTitle !== undefined) {
        savedImg($, lastChapterTitle);
    }
}

/**
 * Download a fixed image and store it locally as
 * ./image/<news_title>---img.jpg.
 *
 * The image is streamed straight from the HTTP response into the file.
 * Errors on either side (download or file write, e.g. when ./image does not
 * exist) are logged instead of being left as unhandled 'error' events.
 *
 * @param {*} $ - Unused; kept so existing callers keep working.
 * @param {string} news_title - Title used as the file-name prefix.
 */
function savedImg($, news_title) {
    // URL of the image to download (hard-coded).
    const img_src = 'http://img3.mukewang.com/5344e6d10001867401400140-80-80.jpg';

    const logError = function (err) {
        console.log(err);
    };

    const output = fs.createWriteStream('./image/' + news_title + '---' + 'img.jpg');
    output.on('error', logError);

    // A single GET is enough: its own 'error' event reports download
    // failures, so no separate HEAD request is issued beforehand.
    request(img_src)
        .on('error', logError)
        .pipe(output);
}

// Crawl the start page, then parse and persist the result.
http.get(url, (res) => {
    // Decode as UTF-8 at the stream level so that a multi-byte character
    // split across two chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');

    let html = '';

    // The body arrives in chunks; accumulate them.
    res.on('data', (data) => {
        html += data;
    });

    // Whole body received.
    res.on('end', () => {
        printInfo(filterChapter(html));
    });
}).on('error', (err) => {
    // Connection-level failures (DNS lookup, refused or reset connection)
    // are emitted on the request object; log them instead of crashing.
    console.log(err);
});

参考链接：

http://blog.csdn.net/yezhenxu1992/article/details/50820629

http://www.cnblogs.com/hustskyking/p/spider-with-node.html

代码链接

代码链接2

预防链接挂掉，放出我自己的资源链接

http://download.csdn.net/detail/wujimiao/9883438

http://download.csdn.net/download/wujimiao/10044125

阅读全文

0 0