nodejs 爬虫
来源:互联网 发布:数据透视表实时更新 编辑:程序博客网 时间:2024/05/23 22:59
arrayOpr.js
/**
 * Returns a new array with duplicate values removed, preserving
 * first-occurrence order. The original array is not modified.
 *
 * Fix: the original scanned the result with indexOf on every element
 * (O(n^2)); a Set gives O(n). Side effect of the fix: NaN is now
 * deduplicated too (indexOf can never locate NaN, so the original
 * kept duplicate NaNs).
 *
 * @returns {Array} new array of unique elements
 */
Array.prototype.uniquelize = function () {
  var seen = new Set();
  var result = [];
  for (var i = 0; i < this.length; i++) {
    if (!seen.has(this[i])) {
      seen.add(this[i]);
      result.push(this[i]);
    }
  }
  return result;
};
/**
 * Maps fn over the array and collects every result that is not null
 * or undefined. Extra arguments after fn are appended to each call:
 * fn(element, index, ...extraArgs), with `this` bound to the array.
 *
 * Fix: the original defaulted fn to Function.K, which does not exist
 * in standard JavaScript, so calling each() with no callback threw a
 * TypeError. fn now defaults to the identity function.
 *
 * @param {Function} [fn] callback; defaults to identity
 * @returns {Array} collected non-null results
 */
Array.prototype.each = function (fn) {
  fn = fn || function (o) { return o; };
  var collected = [];
  var extra = Array.prototype.slice.call(arguments, 1);
  for (var i = 0; i < this.length; i++) {
    var res = fn.apply(this, [this[i], i].concat(extra));
    // Intentionally loose != : drops both null and undefined results.
    if (res != null) collected.push(res);
  }
  return collected;
};
/**
 * Set union of two arrays: all distinct elements of a followed by the
 * distinct elements of b not already present, in encounter order.
 *
 * Fix: self-contained Set-based dedupe instead of the O(n^2)
 * concat().uniquelize() chain, and no longer depends on the
 * Array.prototype extension being installed.
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {Array} deduplicated concatenation of a and b
 */
Array.union = function (a, b) {
  // Set preserves insertion order, so a's elements still come first.
  return Array.from(new Set(a.concat(b)));
};
/**
 * Set intersection of two arrays: distinct elements of a that also
 * appear in b, in a's encounter order.
 *
 * Fixes: O(n^2) indexOf-inside-loop replaced with Set lookups; the
 * original also silently dropped null/undefined elements even when
 * present in both arrays, because each() filters out null results.
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {Array} deduplicated common elements
 */
Array.intersect = function (a, b) {
  var inB = new Set(b);
  var seen = new Set();
  var out = [];
  for (var i = 0; i < a.length; i++) {
    var v = a[i];
    if (inB.has(v) && !seen.has(v)) {
      seen.add(v);
      out.push(v);
    }
  }
  return out;
};
/**
 * Set difference: distinct elements of a that do NOT appear in b,
 * in a's encounter order.
 *
 * Fixes: O(n^2) indexOf-inside-loop replaced with Set lookups; the
 * original also silently dropped null/undefined elements from the
 * result, because each() filters out null results.
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {Array} deduplicated elements of a minus b
 */
Array.minus = function (a, b) {
  var inB = new Set(b);
  var seen = new Set();
  var out = [];
  for (var i = 0; i < a.length; i++) {
    var v = a[i];
    if (!inB.has(v) && !seen.has(v)) {
      seen.add(v);
      out.push(v);
    }
  }
  return out;
};
/**
 * Symmetric difference of two arrays: every element that appears in
 * exactly one of a and b (union minus intersection).
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {Array} elements unique to one side
 */
Array.complement = function (a, b) {
  var everything = Array.union(a, b);
  var shared = Array.intersect(a, b);
  return Array.minus(everything, shared);
};
asynSpider.js
var http = require('http');
var promise = require('promise');
var cheerio = require('cheerio');
var fs = require("fs");
const arrayOpr = require('./arrayOpr.js');
// Queue one page-list fetch per index page (currently only page 1).
var url = "http://pe.pedaily.cn/vcpe/";
var list = [];
for (var i = 1; i <= 1; i++) {
  // Number is coerced to string by +, yielding e.g. ".../vcpe/1".
  list.push(getPageList(url + i));
}
/**
 * Fetches one news index page and extracts {title, url} for every
 * article headline (h3 > a) inside #newslist-all.
 *
 * Fixes:
 *  - res.setEncoding('utf8') was called inside the 'data' handler on
 *    every chunk; it belongs once, before data starts flowing.
 *  - the 'error' listener was attached to the response; connection
 *    errors (DNS failure, refused socket) are emitted on the request
 *    object returned by http.get, so they previously crashed the
 *    process instead of rejecting the promise.
 *  - .map() was used purely for side effects; .each() expresses that.
 *
 * @param {string} url index page URL
 * @returns {Promise<Array<{title: string, url: string}>>}
 */
function getPageList(url) {
  return new Promise(function (resolve, reject) {
    http.get(url, function (res) {
      res.setEncoding('utf8');
      var body = '';
      res.on('data', function (chunk) {
        body += chunk;
      });
      res.on('end', function () {
        var spiderContent = [];
        var $ = cheerio.load(body);
        $('h3', '#newslist-all').each(function () {
          var link = $(this).children('a');
          spiderContent.push({
            title: link.text(),
            url: link.attr('href')
          });
        });
        resolve(spiderContent);
      });
    }).on('error', function (e) {
      reject(e.message);
    });
  });
}
/**
 * Fetches one article page and extracts its subject and body text.
 * The filename used for saving is the last path segment of the URL.
 *
 * Fixes:
 *  - res.setEncoding('utf8') moved out of the 'data' handler (it was
 *    re-invoked on every chunk).
 *  - connection errors are emitted on the request object returned by
 *    http.get, not on the response; the 'error' listener now sits on
 *    the request so such failures reject instead of crashing.
 *
 * @param {string} url article URL
 * @returns {Promise<{url: string, subject: string, content: string, filename: string}>}
 */
function getUrlContent(url) {
  return new Promise(function (resolve, reject) {
    http.get(url, function (res) {
      res.setEncoding('utf8');
      var body = '';
      res.on('data', function (chunk) {
        body += chunk;
      });
      res.on('end', function () {
        var $ = cheerio.load(body);
        resolve({
          url: url,
          subject: $('div.subject').text(),
          content: $('div.news-content').text(),
          filename: url.substr(url.lastIndexOf('/') + 1)
        });
      });
    }).on('error', function (e) {
      reject(e.message);
    });
  });
}
// Wait for every index page, persist the article list, then download
// each article and write it to ./spiderdata/ as a minimal HTML file.
//
// Fixes: newsList.json was re-parsed with JSON.parse on EVERY loop
// iteration (in both the loop condition and the body) — it is now
// parsed once; the chain also gained a .catch so a failed fetch no
// longer surfaces as an unhandled rejection.
Promise
  .all(list)
  .then(function (data) {
    fs.writeFile('newsList.json', JSON.stringify(data), function (err) {
      if (err) {
        return console.error(err);
      }
      fs.readFile('newsList.json', function (err, raw) {
        if (err) {
          return console.error(err);
        }
        // raw is an array of per-page arrays of {title, url}.
        var pages = JSON.parse(raw);
        var urls = [];
        pages.forEach(function (page) {
          page.forEach(function (item) {
            urls.push(item.url);
          });
        });
        fs.writeFile('urls.json', JSON.stringify(urls), function (err) {
          if (err) {
            return console.error(err);
          }
        });
        var detail = urls.map(function (u) {
          return getUrlContent(u);
        });
        console.log(`本次爬虫共计下载页面${urls.length}个`);
        Promise
          .all(detail)
          .then(function (articles) {
            // NOTE(review): assumes ./spiderdata/ already exists —
            // fs.writeFile does not create directories.
            articles.forEach(function (article) {
              var html = '<html><body><h1>' + article.subject +
                '</h1><br/><br/><h2>' + article.content +
                '</h2></body></html>';
              fs.writeFile('./spiderdata/' + article.filename, html, function (err) {
                if (err) {
                  return console.log(err);
                }
              });
            });
          })
          .catch(function (err) {
            console.error(err);
          });
      });
    });
  })
  .catch(function (err) {
    console.error(err);
  });
checkUpdate.js
const http = require('http');
const promise = require('promise');
const cheerio = require('cheerio');
const fs = require("fs");
const arrayOpr = require('./arrayOpr.js');
// Queue a page-list re-fetch for each index page being checked.
let urlprefix = "http://pe.pedaily.cn/vcpe/";
let checklist = [];
let checkcount = 1;
for (let page = 1; page <= checkcount; page++) {
  checklist.push(getPageList(urlprefix + page));
}
/**
 * Fetches one news index page and extracts {title, url} for every
 * article headline (h3 > a) inside #newslist-all.
 *
 * Fixes:
 *  - res.setEncoding('utf8') was called inside the 'data' handler on
 *    every chunk; it belongs once, before data starts flowing.
 *  - the 'error' listener was attached to the response; connection
 *    errors (DNS failure, refused socket) are emitted on the request
 *    object returned by http.get, so they previously crashed the
 *    process instead of rejecting the promise.
 *  - .map() was used purely for side effects; .each() expresses that.
 *
 * @param {string} url index page URL
 * @returns {Promise<Array<{title: string, url: string}>>}
 */
function getPageList(url) {
  return new Promise(function (resolve, reject) {
    http.get(url, function (res) {
      res.setEncoding('utf8');
      var body = '';
      res.on('data', function (chunk) {
        body += chunk;
      });
      res.on('end', function () {
        var spiderContent = [];
        var $ = cheerio.load(body);
        $('h3', '#newslist-all').each(function () {
          var link = $(this).children('a');
          spiderContent.push({
            title: link.text(),
            url: link.attr('href')
          });
        });
        resolve(spiderContent);
      });
    }).on('error', function (e) {
      reject(e.message);
    });
  });
}
// Re-crawl the index pages, diff the current article URLs against the
// previously-saved urls.json, download only the new articles, and fold
// their URLs back into urls.json.
//
// Fixes: updateList.json and urls.json were each re-parsed with
// JSON.parse on EVERY loop iteration (loop condition and body) — each
// is now parsed once; the chain gained a .catch so a failed fetch no
// longer surfaces as an unhandled rejection.
Promise
  .all(checklist)
  .then(function (data) {
    fs.writeFile('updateList.json', JSON.stringify(data), function (err) {
      if (err) {
        return console.error(err);
      }
      fs.readFile('updateList.json', function (err, rawUpdate) {
        if (err) {
          return console.error(err);
        }
        // rawUpdate is an array of per-page arrays of {title, url}.
        let pages = JSON.parse(rawUpdate);
        let urls = [];
        pages.forEach(function (page) {
          page.forEach(function (item) {
            urls.push(item.url);
          });
        });
        fs.readFile('urls.json', function (err, rawExist) {
          if (err) {
            return console.error(err);
          }
          let existUrls = JSON.parse(rawExist);
          // URLs present now but missing from the saved set are new.
          let known = Array.intersect(urls, existUrls);
          let updateurl = Array.minus(urls, known);
          let updateDetail = updateurl.map(function (u) {
            return getUrlContent(u);
          });
          if (updateDetail.length > 0) {
            console.log(`共发现${updateDetail.length}条新记录`);
            Promise
              .all(updateDetail)
              .then(function (articles) {
                articles.forEach(function (article) {
                  let html = '<html><body><h1>' + article.subject +
                    '</h1><br/><br/><h2>' + article.content +
                    '</h2></body></html>';
                  console.log(`已更新文件${article.filename}`);
                  fs.writeFile('./spiderdata/' + article.filename, html, function (err) {
                    if (err) {
                      return console.log(err);
                    }
                  });
                });
                // Persist the enlarged URL set for the next check run.
                let finalurls = Array.union(updateurl, existUrls);
                fs.writeFile('urls.json', JSON.stringify(finalurls), function (err) {
                  if (err) {
                    return console.log(err);
                  }
                });
              })
              .catch(function (err) {
                console.error(err);
              });
          } else {
            console.log('未发现有更新的数据');
          }
        });
      });
    });
  })
  .catch(function (err) {
    console.error(err);
  });
/**
 * Fetches one article page and extracts its subject and body text.
 * The filename used for saving is the last path segment of the URL.
 *
 * Fixes:
 *  - res.setEncoding('utf8') moved out of the 'data' handler (it was
 *    re-invoked on every chunk).
 *  - connection errors are emitted on the request object returned by
 *    http.get, not on the response; the 'error' listener now sits on
 *    the request so such failures reject instead of crashing.
 *
 * @param {string} url article URL
 * @returns {Promise<{url: string, subject: string, content: string, filename: string}>}
 */
function getUrlContent(url) {
  return new Promise(function (resolve, reject) {
    http.get(url, function (res) {
      res.setEncoding('utf8');
      var body = '';
      res.on('data', function (chunk) {
        body += chunk;
      });
      res.on('end', function () {
        var $ = cheerio.load(body);
        resolve({
          url: url,
          subject: $('div.subject').text(),
          content: $('div.news-content').text(),
          filename: url.substr(url.lastIndexOf('/') + 1)
        });
      });
    }).on('error', function (e) {
      reject(e.message);
    });
  });
}
- nodejs 爬虫
- nodejs 爬虫
- nodejs 爬虫
- nodejs豆瓣爬虫
- nodejs豆瓣爬虫
- NodeJs爬虫02
- NodeJS制作爬虫全过程
- NodeJs编写小爬虫
- nodejs爬虫编码问题
- 用Nodejs做爬虫
- nodejs 学习 - 简单爬虫
- nodejs简易爬虫
- NodeJS爬虫系统
- nodejs+cheerio 爬虫入门
- NodeJs妹子图爬虫
- NodeJS-妹子图爬虫
- Nodejs简单爬虫
- 网络爬虫 Nodejs
- 源码阅读技巧篇
- IE F12 debug using
- c语言操作符
- c++primer5 题3.24
- Invalid bound statement (not found)
- nodejs 爬虫
- asp.net 共享文件夹 网络文件夹 文件读写
- 员工人事档案组成要素介绍
- 欢迎使用CSDN-markdown编辑器
- Java内部类详解
- 宏定义中的##
- guava 集合
- 测试流程。。。
- IDEA下从零开始搭建SpringBoot工程