nodejs 简单爬虫(二)

来源:互联网 发布:手机行车记录软件 编辑:程序博客网 时间:2024/06/01 16:45

如果爬的网站编码不是UTF-8,就得花一点功夫了,今天研究了一下,准备爬个gb2312编码的网站。

nodejs原生不支持gbk编码,linux下可以通过Iconv解决,win下可以通过Iconv addon解决或者iconv-lite+bufferhelper解决

这里尝试一下iconv-lite+bufferhelper

准备爬它:


是gb2312的编码,我要爬它的新闻来放到我的毕业设计里面,顺带把爬到的文章展示在我的前端

package.json:

{  "name": "crawler",  "version": "0.0.0",  "private": true,  "scripts": {    "start": "node ./bin/www"  },  "dependencies": {    "express": "~4.9.0",    "body-parser": "~1.8.1",    "cookie-parser": "~1.3.3",    "morgan": "~1.3.0",    "serve-favicon": "~2.1.3",    "debug": "~2.0.0",    "ejs": "~0.8.5",    "cheerio": "~0.18.0",    "iconv-lite": "~0.4.5",    "bufferhelper": "~0.2.0"  }}
index.js:

var express = require('express');
var router = express.Router();
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var BufferHelper = require('bufferhelper');
var http = require('http');

// Page whose headline list is scraped. It is not served as UTF-8, so the raw
// bytes have to be collected and decoded explicitly.
var NEWS_URL = 'http://futures.hexun.com/integratednews/';

/**
 * Pulls the headline links out of the decoded page markup.
 *
 * @param {string} html - Page markup, already decoded to a JavaScript string.
 * @returns {Array<{title: string, href: (string|undefined)}>} One entry per
 *   anchor matched by the selector, in document order.
 */
function parseNews(html) {
  var $ = cheerio.load(html);
  var news = [];
  $('div.temp01>ul>li>a').each(function (index, element) {
    var $element = $(element);
    news.push({
      title: $element.text(),
      href: $element.attr('href')
    });
  });
  return news;
}

/* GET home page: fetch the news page, decode it and render the headline list. */
router.get('/', function (req, res, next) {
  // Guards against answering the same request twice (e.g. an error firing
  // after the response has already been rendered, or two error events).
  var finished = false;

  function fail(err) {
    if (finished) {
      return;
    }
    finished = true;
    next(err);
  }

  var request = http.get(NEWS_URL, function (response) {
    if (response.statusCode !== 200) {
      // Drain the body so the socket is released, then report the failure
      // instead of parsing an error page as if it were the news list.
      response.resume();
      fail(new Error('Unexpected status ' + response.statusCode + ' from ' + NEWS_URL));
      return;
    }

    // Collect raw Buffer chunks. Decoding must happen once on the complete
    // byte sequence, because a multi-byte character may be split across chunks.
    var bufferHelper = new BufferHelper();

    response.on('data', function (chunk) {
      bufferHelper.concat(chunk);
    }).on('error', fail).on('end', function () {
      if (finished) {
        return;
      }
      finished = true;

      var body = iconv.decode(bufferHelper.toBuffer(), 'GBK');
      var news = parseNews(body);
      console.log(news);
      res.render('index', {title: "crawler", news: news});
    });
  });

  // Without this handler a DNS/connection failure would surface as an
  // unhandled 'error' event and the client would never receive a reply.
  request.on('error', fail);
});

module.exports = router;

index.ejs:

<!DOCTYPE html>
<html>
  <head>
    <title><%= title %></title>
    <link rel='stylesheet' href='/stylesheets/style.css' />
  </head>
  <body>
    <h1><%= title %></h1>
    <p>Welcome to <%= title %></p>
    <!-- One paragraph per entry of `news`; each link opens the article in a new tab. -->
    <% for (var i = 0; i < news.length; i++) { %>
      <p>
        <a href="<%= news[i].href %>" target="_blank" rel="noopener noreferrer">
          <%= news[i].title %>
        </a>
      </p>
    <% } %>
  </body>
</html>
跑跑看:


done!点击文章标题即可转到文章界面


0 0
原创粉丝点击