菜鸟教程:nodeJs爬虫

来源:互联网 发布:什么软件小额贷款成功 编辑:程序博客网 时间:2024/05/21 09:59

一、代码

var https = require('https');
var cheerio = require('cheerio');
var mysql = require('mysql');
var table = "job";
var moment = require('moment');
var today = moment();
var year = today.format('YYYY年');
// clone() first: moment's subtract() mutates the instance it is called on.
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日');
var urlTool = require("url");
var qs = require('querystring');

// MySQL connection settings used by handleDisconnect().
var db_config = {
    host: '127.0.0.1',
    user: 'root',
    password: '123456',
    port: '3306',
    database: 'node'
};

// The single shared MySQL connection; (re)created by handleDisconnect().
var connection;

// Separator that the listing markup places between the fields of one <p>.
var VLINE = '<em class="vline"></em>';

/**
 * Creates the shared MySQL connection and keeps it alive.
 *
 * If the initial connect fails, it retries every 2 seconds. If an established
 * connection is lost (PROTOCOL_CONNECTION_LOST), it reconnects immediately.
 * Any other connection-level error is rethrown.
 */
function handleDisconnect() {
    connection = mysql.createConnection(db_config);

    connection.connect(function (err) {
        if (err) {
            console.log('进行断线重连:' + new Date());
            setTimeout(handleDisconnect, 2000); // retry every 2 seconds
            return;
        }
    });

    connection.on('error', function (err) {
        console.log('db error', err);
        if (err.code === 'PROTOCOL_CONNECTION_LOST') {
            handleDisconnect();
        } else {
            throw err;
        }
    });
}

/**
 * Extracts the insert parameters for one job <li>.
 *
 * @param {Object} item - cheerio selection wrapping a single job list item.
 * @returns {Array|null} values in the column order of the insert statement,
 *     or null when the item lacks the expected link or paragraphs.
 */
function parseJobItem(item) {
    var primaryInfo = item.find('.job-primary').find('.info-primary');
    var companyInfo = item.find('.job-primary').find('.info-company');
    var nameLink = primaryInfo.find('.name').find('a');

    var href = nameLink.attr('href');
    var idSegment = href ? href.split('/')[2] : undefined;
    var msg = primaryInfo.find('p').html();
    var companyMsg = companyInfo.find('p').html();

    // Items without the expected markup would otherwise crash on .split().
    if (!idSegment || typeof msg !== 'string' || typeof companyMsg !== 'string') {
        return null;
    }

    var id = idSegment.split('.')[0];
    var titleAndSalarys = nameLink.text().split(' ');
    var msgs = msg.split(VLINE);
    var companyShortName = companyInfo.find('.company-text').find('.name').find('a').text();
    var companyMsgs = companyMsg.split(VLINE);

    // Drop the first three characters of the job-time text, then normalise:
    // '昨天' becomes yesterday's full date, anything else gets the year prefixed.
    var date = item.find('.job-time').text().slice(3);
    if ('昨天' === date) {
        date = yesterday;
    } else {
        date = year + date;
    }

    return [
        id,
        titleAndSalarys[0], titleAndSalarys[1],
        msgs[0], msgs[1], msgs[2],
        companyShortName,
        companyMsgs[0], companyMsgs[1], companyMsgs[2],
        date
    ];
}

/**
 * Parses one listing page and inserts every job found into the `job` table.
 *
 * @param {string} html - HTML source of a listing page.
 * @returns {Array} one parameter array per inserted job, in the column order
 *     of the insert statement.
 */
function filterJobItem(html) {
    var $ = cheerio.load(html, { decodeEntities: false });
    var jobItem = $('.job-list').find('ul').children('li');
    var jobData = [];
    var addJob = 'insert into job(titleId, title, salary, city, workYear, education, companyName, industry, financing, companySize,date) values(?,?,?,?,?,?,?,?,?,?,?)';

    // Connect once and reuse the connection; opening a new connection per
    // row leaks connections and can exhaust the server's limit.
    if (jobItem.length > 0 && !connection) {
        handleDisconnect();
    }

    jobItem.each(function () {
        var param = parseJobItem($(this));
        if (param === null) {
            console.log('skipped a job item with unexpected markup');
            return;
        }
        jobData.push(param);
        connection.query(addJob, param, function (error, result) {
            if (error) {
                console.log(error.message);
            } else {
                // The mysql driver reports the generated key as insertId.
                console.log('insert id: ' + result.insertId);
            }
        });
    });

    return jobData;
}

// Base address of the listing page, including the trailing '?'. It is left
// unset here and must be assigned before robot() can fetch anything.
var url;

/**
 * Downloads one page over HTTPS and hands the HTML to filterJobItem().
 *
 * @param {string} url - full address of the page to fetch.
 */
function httpGet(url) {
    https.get(url, function (res) {
        var html = '';
        // Decode as UTF-8 in the stream so multi-byte characters split
        // across chunk boundaries are not corrupted.
        res.setEncoding('utf8');
        res.on('data', function (data) {
            html += data;
        });
        res.on('end', function () {
            filterJobItem(html);
        });
    }).on('error', function (err) {
        console.log('出错了!', err && err.message);
    });
}

// Page numbers to crawl, e.g. [1, 2, 3].
var pages = [];

/**
 * Fetches every page listed in `pages`.
 *
 * @throws {Error} when there are pages to fetch but `url` has not been set.
 */
function robot() {
    if (pages.length === 0) {
        return;
    }
    if (!url) {
        throw new Error('url must be set to the listing base address before crawling');
    }
    pages.forEach(function (page) {
        // Build each page address from the unchanged base; do not append to
        // `url` itself, or the query strings pile up from page to page.
        var pageUrl = url + qs.stringify({ query: 'Java', page: page, ka: 'page-1' }, '&');
        httpGet(pageUrl);
    });
}

robot();

二、操作

1.1、引入依赖

var https = require('https'); // use 'http' or 'https' to match the scheme of the site being crawled
var cheerio = require('cheerio'); // jQuery-like API for querying HTML
var mysql = require('mysql');
var moment = require('moment'); // date formatting
var urlTool = require("url");
var qs = require('querystring'); // url and querystring are Node's built-in helpers for working with URLs

1.2、依赖的简单使用

1.2.1  moment
var today = moment();
var year = today.format('YYYY年'); // e.g. 2017年
// clone() first: subtract() mutates the moment it is called on.
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日'); // yesterday's date, e.g. 2017年11月24日
1.2.2 url和querystring
// Take the query string out of a URL and parse it into an object.
var getQuery = urlTool.parse(url).query;
var getData = qs.parse(getQuery); // { query: 'Java', page: '1', ka: 'page-1' }
// Serialise an object back into a query string.
getData = qs.stringify({ query: 'Java', page: '4', ka: 'page-1' }, '&'); // query=Java&page=4&ka=page-1
1.2.3 mysql
// Reconnecting to MySQL from Node.js after the connection drops.
var db_config = {
    host: '127.0.0.1',
    user: 'root',
    password: '123456',
    port: '3306',
    database: 'node'
};

var connection;

/**
 * Creates the shared MySQL connection and keeps it alive.
 *
 * If the initial connect fails, it retries every 2 seconds. If an established
 * connection is lost (PROTOCOL_CONNECTION_LOST), it reconnects immediately.
 * Any other connection-level error is rethrown.
 */
function handleDisconnect() {
    connection = mysql.createConnection(db_config);

    connection.connect(function (err) {
        if (err) {
            console.log('进行断线重连:' + new Date());
            setTimeout(handleDisconnect, 2000); // retry every 2 seconds
            return;
        }
    });

    connection.on('error', function (err) {
        console.log('db error', err);
        if (err.code === 'PROTOCOL_CONNECTION_LOST') {
            handleDisconnect();
        } else {
            throw err;
        }
    });
}
1.2.4  cheerio(用法类似jquery)
// Wrap the current list element so it can be queried like a jQuery object.
item = $(this);
// Both values below are read from the anchor inside the ".name" element.
var nameAnchor = primaryInfo.find('.name').find('a');
// The id is the third path segment of the href with its extension removed.
var hrefSegments = nameAnchor.attr('href').split('/');
id = hrefSegments[2].split('.')[0];
titleAndSalary = nameAnchor.text();

三、爬虫的使用

将要爬取的页码以数组的形式赋给 pages(例如 var pages = [1, 2, 3];),然后在命令行执行 node server.js 即可。

原创粉丝点击