nodejs使用async来进行优化
来源:互联网 发布:js 设置dialog button 编辑:程序博客网 时间:2024/06/06 03:47
原来的爬取方式会同时向服务器发起大量连接,从而导致连接错误。改为使用 async 的 queue 之后,始终保持最多只有两个下载任务处于激活状态,就不会再出现这个问题了。
未使用async前的代码:
/*
 * Crawl gallery images from zngirls.com with request + cheerio.
 *
 * This span holds the two listings of the article:
 *   1. legacyCrawlWithoutQueue() - the crawler before async.queue was used.
 *      It starts one HTTP download per image immediately, so a gallery opens
 *      many connections at once and the downloads fail with connection errors.
 *      It is kept for comparison only and is NOT invoked.
 *   2. The top-level Crawl below it - the same crawler with every image
 *      download routed through an async.queue limited to two workers.
 */

// ---------------------------------------------------------------------------
// Listing 1: before async.queue (reference only, never called here).
// ---------------------------------------------------------------------------

/**
 * Defines and runs the original, unthrottled crawler.
 *
 * The body is the published code, reformatted. Experiments that were present
 * only as commented-out code are summarized instead of reproduced:
 *   - request(url, cb) + fs.writeFileSync(file, body) per image;
 *   - request.get(opt).pipe(fs.createWriteStream(file)), optionally with
 *     opt.pool = {maxSockets: 2} (also as a local helper named `down`);
 *   - a hand-written throttle (put / downloadTick / resetFlag) using five
 *     slot flags and an EventEmitter 'resetflag' event.
 */
function legacyCrawlWithoutQueue() {
    const request = require('request');
    const http = require('http');
    const fs = require('fs');
    const cheerio = require('cheerio');
    const url = require('url');
    const util = require('util');
    const path = require('path');
    const process = require('process');
    const events = require('events');
    const EventEmitter = events.EventEmitter;

    function Crawl(girlID) {
        this.girlID = girlID;
        this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';
    }

    Crawl.prototype = {
        start: function () {
            // Crawl the personal home page.
            var self = this;
            request.get(this.U(this.getGirlUrl()), function (err, response, body) {
                if (err) {
                    console.error('错误信息:', err);
                } else {
                    var $ = cheerio.load(body);
                    $('.igalleryli_link').each(function (i) {
                        var link = $(this);
                        var href = link.attr('href');
                        // Follow the gallery link to crawl the gallery itself.
                        var hrefID = href.match(/\/g\/(\d+)/)[1];
                        var downDir = path.join('' + self.girlID, hrefID);
                        var hostname = url.parse(response.request.href).hostname;
                        var galleryUrl = url.format({
                            hostname: hostname,
                            pathname: href,
                            protocol: 'http',
                        });
                        // Create the download directory if it does not exist.
                        var arrDir = downDir.split(path.sep);
                        var startDir = arrDir.shift();
                        while (true) {
                            if (!fs.existsSync(startDir)) {
                                fs.mkdirSync(startDir);
                            }
                            if (arrDir.length === 0) break;
                            startDir = startDir + path.sep + arrDir.shift();
                        }
                        // Crawl the gallery page.
                        request.get(self.U(galleryUrl), function (err, response, body) {
                            if (err) {
                                // NOTE: `response` is undefined when `err` is set, so
                                // this line throws; the queue version below fixes it.
                                console.error('下载错误:' + response.url, err);
                                process.exit(-1);
                            }
                            var $ = cheerio.load(body);
                            var images = $('#hgallery > img');
                            if (images) {
                                var im = $(images[0]);
                                var src = im.attr('src');
                                var preUrl = src.slice(0, src.lastIndexOf('/') + 1);
                                // Derive every image URL and download them all at once.
                                $('#dinfo > span').each(function (i) {
                                    var span = $(this);
                                    var matched = span.text().match(/(\d+).*/);
                                    if (matched) {
                                        // Number of images in this gallery.
                                        var count = matched[1];
                                        for (var i = 0; i < count; ++i) {
                                            var jpgFile = self.formatIndex(i) + '.jpg';
                                            var jpgUrl = preUrl + jpgFile;
                                            var jpgDownFile = path.join(downDir, jpgFile);
                                            // This is the problem the article is about: every
                                            // image starts its own request right here, with
                                            // no limit on how many run at the same time.
                                            self.download2(jpgUrl, jpgDownFile);
                                        }
                                    }
                                });
                            }
                        });
                        console.log('爬取影集执行完毕');
                    });
                }
            });
            console.log('个人全部影集执行完毕');
        },

        formatIndex: function (i) {
            var si = i + '';
            if (i === 0) {
                return si;
            }
            while (si.length < 3) {
                si = '0' + si;
            }
            return si;
        },

        getGirlUrl: function () {
            return util.format(this.girlUrlFmt, this.girlID);
        },

        U: function (_url) {
            return {
                url: _url,
                headers: {
                    referer: 'http://www.baidu.com',
                    connection: 'keep-alive',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
                }
            };
        },

        download: function (_url, filename) {
            console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
            var opt = this.U(_url);
            request.get(opt).on('error', function (err) {
                console.error('下载错误:', err);
                process.exit(-1000);
            }).pipe(fs.createWriteStream(filename));
            // Busy-wait before returning (the loop spins for 4000 ms although the
            // original comment said 3 seconds). This blocks the event loop, so the
            // request started above cannot make progress while it spins.
            var t1 = (new Date()).getTime();
            var t2 = (new Date()).getTime();
            while ((t2 - t1) < 4000) {
                t2 = (new Date()).getTime();
            }
        },

        download2: function (jpgUrl, jpgDownFile) {
            var jpgUrlP = url.parse(jpgUrl);
            http.get({
                host: jpgUrlP.host,
                port: 80,
                headers: {
                    referer: 'http://www.baidu.com',
                },
                path: jpgUrl,
            }, function (res) {
                var buffers = [];
                res.on('data', function (data) {
                    buffers.push(data);
                });
                res.on('end', function () {
                    var body = Buffer.concat(buffers);
                    fs.writeFileSync(jpgDownFile, body);
                    console.log('完成图片下载:' + jpgDownFile);
                });
            });
        }
    };

    var girlID = 19705;
    var crawl = new Crawl(girlID);
    crawl.start();
    //crawl.download2('http://t1.zngirls.com/gallery/19705/19815/019.jpg', '119.jpg');
    console.log('主程序执行完毕');
}

// ---------------------------------------------------------------------------
// Listing 2: the crawler using async.queue.
// ---------------------------------------------------------------------------

const request = require('request');
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const url = require('url');
const util = require('util');
const path = require('path');
const process = require('process');
const events = require('events');
const EventEmitter = events.EventEmitter;
const async = require('async');

// Number of image downloads allowed to be in flight at the same time.
const MAX_CONCURRENT_DOWNLOADS = 2;

/**
 * Creates `dir` and any missing parent directories, one path segment at a time.
 *
 * @param {string} dir - Directory path using the platform separator.
 */
function ensureDirSync(dir) {
    const segments = dir.split(path.sep);
    let current = segments.shift();
    for (;;) {
        // `current` is '' for the leading segment of an absolute path; skip it.
        if (current && !fs.existsSync(current)) {
            fs.mkdirSync(current);
        }
        if (segments.length === 0) {
            break;
        }
        current = current + path.sep + segments.shift();
    }
}

/**
 * Crawler for one profile page and all galleries linked from it.
 *
 * @param {number} girlID - Profile id substituted into the profile URL.
 */
function Crawl(girlID) {
    this.girlID = girlID;
    this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';
    // Every image download goes through this queue. A failed download is logged
    // by download2; the worker always reports success to the queue so that one
    // bad image does not affect the remaining tasks.
    this.queue = async.queue((task, callback) => {
        console.log('url:' + task.url + ' file:' + task.file);
        this.download2(task.url, task.file, function () {
            callback();
        });
    }, MAX_CONCURRENT_DOWNLOADS);
}

Crawl.prototype = {
    /**
     * Fetches the profile page and schedules a crawl of every gallery link
     * (elements matching `.igalleryli_link`) found on it.
     */
    start: function () {
        const self = this;
        request.get(this.U(this.getGirlUrl()), function (err, response, body) {
            if (err) {
                console.error('错误信息:', err);
                return;
            }
            const $ = cheerio.load(body);
            const hostname = url.parse(response.request.href).hostname;
            $('.igalleryli_link').each(function (index, element) {
                const href = $(element).attr('href');
                // The gallery id is the number after "/g/"; skip links without one.
                const idMatch = href ? href.match(/\/g\/(\d+)/) : null;
                if (!idMatch) {
                    return;
                }
                const downDir = path.join('' + self.girlID, idMatch[1]);
                const galleryUrl = url.format({
                    hostname: hostname,
                    pathname: href,
                    protocol: 'http',
                });
                ensureDirSync(downDir);
                self.crawlGallery(galleryUrl, downDir);
                // Printed once the gallery request has been issued, not completed.
                console.log('爬取影集执行完毕');
            });
        });
        // Printed once the profile request has been issued, not completed.
        console.log('个人全部影集执行完毕');
    },

    /**
     * Fetches one gallery page and enqueues a download task per image.
     *
     * The URL prefix comes from the first `#hgallery > img`; the image count is
     * the first run of digits in each `#dinfo > span` that contains one.
     *
     * @param {string} galleryUrl - Absolute URL of the gallery page.
     * @param {string} downDir - Existing directory that receives the images.
     */
    crawlGallery: function (galleryUrl, downDir) {
        const self = this;
        request.get(this.U(galleryUrl), function (err, response, body) {
            if (err) {
                // `response` is undefined on error, so report the requested URL.
                console.error('下载错误:' + galleryUrl, err);
                process.exit(-1);
            }
            const $ = cheerio.load(body);
            const src = $('#hgallery > img').first().attr('src');
            if (!src) {
                // No image on the page: nothing to derive the URL prefix from.
                return;
            }
            const preUrl = src.slice(0, src.lastIndexOf('/') + 1);
            $('#dinfo > span').each(function (index, element) {
                const matched = $(element).text().match(/(\d+).*/);
                if (!matched) {
                    return;
                }
                const count = parseInt(matched[1], 10);
                for (let n = 0; n < count; ++n) {
                    const jpgFile = self.formatIndex(n) + '.jpg';
                    self.queue.push({
                        url: preUrl + jpgFile,
                        file: path.join(downDir, jpgFile),
                    });
                }
            });
        });
    },

    /**
     * Builds the image file stem for index `i`: 0 stays "0", every other index
     * is left-padded with zeros to three characters (1 -> "001").
     *
     * @param {number} i - Zero-based image index.
     * @returns {string} File name without extension.
     */
    formatIndex: function (i) {
        let text = String(i);
        if (i === 0) {
            return text;
        }
        while (text.length < 3) {
            text = '0' + text;
        }
        return text;
    },

    /** @returns {string} The profile URL for this crawler's girlID. */
    getGirlUrl: function () {
        return util.format(this.girlUrlFmt, this.girlID);
    },

    /**
     * Builds the options object passed to `request` for `_url`.
     *
     * @param {string} _url - URL to request.
     * @returns {{url: string, headers: Object}} Request options.
     */
    U: function (_url) {
        return {
            url: _url,
            headers: {
                referer: 'http://www.baidu.com',
                connection: 'keep-alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            }
        };
    },

    /**
     * Streams `_url` into `filename` using `request`. Not used by the queue;
     * exits the process on a request error.
     *
     * @param {string} _url - Image URL.
     * @param {string} filename - Destination file path.
     */
    download: function (_url, filename) {
        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
        const opt = this.U(_url);
        request.get(opt).on('error', function (err) {
            console.error('下载错误:', err);
            process.exit(-1000);
        }).pipe(fs.createWriteStream(filename)).on('close', function () {
            console.log('完成图片下载:' + filename);
        });
    },

    /**
     * Downloads `jpgUrl` with the core http module and writes it to
     * `jpgDownFile`. Request, response and write failures, as well as non-2xx
     * responses, are logged and reported to `callback`.
     *
     * @param {string} jpgUrl - Absolute http URL of the image.
     * @param {string} jpgDownFile - Destination file path.
     * @param {function(?Error)=} callback - Called exactly once when finished.
     */
    download2: function (jpgUrl, jpgDownFile, callback) {
        let settled = false;
        // Funnel every outcome through one place so `callback` runs only once.
        const finish = function (err) {
            if (settled) {
                return;
            }
            settled = true;
            if (err) {
                console.error('下载错误:', err);
            } else {
                console.log('完成图片下载:' + jpgDownFile);
            }
            if (callback) {
                callback(err);
            }
        };

        const target = url.parse(jpgUrl);
        const req = http.get({
            hostname: target.hostname,
            port: target.port || 80,
            path: target.path,
            headers: {
                referer: 'http://www.baidu.com',
            },
        }, function (res) {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Drain the body so the socket is released, and do not save an
                // error page under a .jpg name.
                res.resume();
                finish(new Error('HTTP ' + res.statusCode + ' ' + jpgUrl));
                return;
            }
            const buffers = [];
            res.on('data', function (chunk) {
                buffers.push(chunk);
            });
            res.on('error', finish);
            res.on('end', function () {
                fs.writeFile(jpgDownFile, Buffer.concat(buffers), finish);
            });
        });
        // Without this listener a socket error is an unhandled 'error' event.
        req.on('error', finish);
    }
};

var girlID = 19705;
var crawl = new Crawl(girlID);
crawl.start();
使用async的queue优化
0 0
- nodejs使用async来进行优化
- nodejs中使用async来对异步操作进行同步,避免多个异步同时启动引发连接错误
- nodejs 使用async进行BT吧最新电影数据爬取
- nodejs async 库使用
- nodejs中Async库使用
- nodejs + async
- 【nodejs】async
- NodeJS中使用async控制并发-@CAOLAN
- nodejs使用eventproxy和async控制并发
- SQL Server 使用索引来对数据访问进行优化
- nodejs Async 详解
- nodejs Async 详解
- nodejs Async 详解
- nodejs Async 详解
- nodejs Async 详解
- Nodejs 异步框架async
- nodejs 异步编程 async
- nodejs Async 详解
- 2016-10-09课后练习
- Linux经典书籍推荐
- 对象的创建(字面量形式)
- STM32系统时钟设置详解
- 77. Combinations
- nodejs使用async来进行优化
- JavaScript 事件处理
- C语言复习总结(3)
- svn主干与分支学习
- Web 应用程序状态管理
- Reverse Integer
- oracle中exp,imp的使用详解
- 深入了解memcache
- Android开发代码规范