nodejs使用async来进行优化

来源:互联网 发布:js 设置dialog button 编辑:程序博客网 时间:2024/06/06 03:47

原来的爬取方式会让大量连接同时访问服务器,从而导致连接错误。改为使用 async 的 queue 之后,始终保持最多只有两个下载任务处于激活状态,就不会再出现这个问题了。

使用 async 之前的代码:

/* 使用request + cheerio来爬取zngirls网站上的数据 */const request = require('request');const http = require('http');const fs = require('fs');const cheerio = require('cheerio');const url = require('url');const util = require('util');const path = require('path');const process = require('process');const events = require('events');const EventEmitter = events.EventEmitter;function Crawl(girlID) {    this.girlID = girlID;    this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';    //允许最多同时连接5个,根据标志位来判断是否有空余未使用    //this.cacheFlag = [false, false, false, false, false];    //this.cacheData = new Array(this.cacheFlag.length);    //this.cache = [];    //this.emitter = new EventEmitter();    //this.emitter.on('resetflag', this.resetFlag);}Crawl.prototype = {    start: function () {        //爬取个人主页        var self = this;        request.get(this.U(this.getGirlUrl()), function (err, response, body) {            if (err) {                console.error('错误信息:', err);            } else {                var $ = cheerio.load(body);                $('.igalleryli_link').each(function (i) {                    var link = $(this);                    var href = link.attr('href');                    //根据gallery的地址来继续爬取gallery                    var hrefID = href.match(/\/g\/(\d+)/)[1];                    var downDir = path.join('' + self.girlID, hrefID);                    var hostname = url.parse(response.request.href).hostname;                    var galleryUrl = url.format({                        hostname: hostname,                        pathname: href,                        protocol: 'http',                    });                    //创建下载目录(如果不存在)                    var arrDir = downDir.split(path.sep);                    var startDir = arrDir.shift();                    while (true) {                        if (!fs.existsSync(startDir)) {                            fs.mkdirSync(startDir);                        }                        if (arrDir.length === 0) break;                        
startDir = startDir + path.sep + arrDir.shift();                    }                    //爬取影集                    request.get(self.U(galleryUrl), function (err, response, body) {                        if (err) {                            console.error('下载错误:' + response.url, err);                            process.exit(-1);                        }                        var $ = cheerio.load(body);                        var images = $('#hgallery > img');                        if (images) {                            var im = $(images[0]);                            var src = im.attr('src')                            var preUrl = src.slice(0, src.lastIndexOf('/') + 1);                            //爬取所有的图片并异步下载                            $('#dinfo > span').each(function (i) {                                    var span = $(this);                                    var matched = span.text().match(/(\d+).*/);                                    if (matched) {                                        //该影集的数目                                        var count = matched[1];                                        for (var i = 0; i < count; ++i) {                                            var jpgFile = self.formatIndex(i) + '.jpg';                                            var jpgUrl = preUrl + jpgFile;                                            var jpgDownFile = path.join(downDir, jpgFile);                                            //console.log('下载图片从: ' + jpgUrl + '到: ' + jpgDownFile);                                            //将下载文件加入到下载队列中                                            //self.put(jpgUrl, jpgDownFile);                                            //如果这么下载会导致,和服务器存在大量的连接而导致无法同时下载这么多                                            //需要想办法减少连接数量                                            /*                                             request(self.U(jpgUrl), function (err, response, body) {                                             //fs.writeFile(jpgDownFile, 
body, function (err) {                                             //    if (err) {                                             //        console.error('写入文件错误:' + jpgDownFile, err);                                             //    }                                             //});                                             fs.writeFileSync(jpgDownFile, body);                                             });                                             */                                            //如果下载的图片太多的话,这边是会出问题的                                            //var opt = self.U(jpgUrl);                                            //设置最大的socket连接数目                                            //opt.pool = {maxSockets: 2};                                            //request.get(opt).on('error', function (err){                                            //    console.error('下载错误:', err);                                            //    process.exit(-1000);                                            //}).pipe(fs.createWriteStream(jpgDownFile))                                            function down(jpgUrl, jpgDownFile) {                                                //=====================================================                                                /*                                                 request(self.U(jpgUrl), function (err, response, body) {                                                 //fs.writeFile(jpgDownFile, body, function (err) {                                                 //    if (err) {                                                 //        console.error('写入文件错误:' + jpgDownFile, err);                                                 //    }                                                 //});                                                 fs.writeFileSync(jpgDownFile, body);                                                 console.log('完成图片下载:' + jpgDownFile);                                                 });     
                                            */                                                //=====================================================                                                //scheme02                                                console.log('jpgUrl=' + jpgUrl + ' jpgFile=' + jpgDownFile);                                                var opt = self.U(jpgUrl);                                                //opt.pool = {maxSockets: 2};                                                request.get(opt).on('error', function (err) {                                                    console.error('下载错误:', err);                                                    process.exit(-1000);                                                }).pipe(fs.createWriteStream(jpgDownFile));                                                console.log('完成图片下载:' + jpgDownFile);                                                //=====================================================                                                //增加一个函数闭包后没有问题(因为数据进行了复制)                                                /*scheme03                                                 var jpgUrlP = url.parse(jpgUrl);                                                 http.get({                                                 host: jpgUrlP.host,                                                 port: 80,                                                 headers: {                                                 referer: 'http://www.baidu.com',                                                 },                                                 path: jpgUrl,                                                 }, function (res) {                                                 var buffers = [];                                                 res.on('data', function (data) {                                                 buffers.push(data);                                                 });                         
                        res.on('end', function () {                                                 var body = Buffer.concat(buffers);                                                 fs.writeFileSync(jpgDownFile, body);                                                 console.log('完成图片下载:' + jpgDownFile);                                                 });                                                 });                                                 */                                            };                                            //down(jpgUrl, jpgDownFile);                                            //使用函数来下载                                            self.download2(jpgUrl, jpgDownFile);                                        }                                        ;                                    }                                }                            );                        }                    });                    console.log('爬取影集执行完毕');                });            }        });        console.log('个人全部影集执行完毕');    },    formatIndex: function (i) {        var si = i + '';        if (i === 0) {            return si;        }        while (si.length < 3) {            si = '0' + si;        }        return si;    },    getGirlUrl: function () {        return util.format(this.girlUrlFmt, this.girlID);    },    U: function (_url) {        return {            url: _url,            headers: {                referer: 'http://www.baidu.com',                connection: 'keep-alive',                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',            }        };    },    //将下载内容存放到cache中    /*     put: function (_url, filename) {     this.cache.push({url: _url, filename: filename});     //process.nextTick(this.downloadTick);     this.downloadTick();     },     downloadTick: function () {     var self = this;     if (this.cache.length <= 0) {     
console.log('当前没有缓存任何请求..');     return;     }     for (var i = 0; i < this.cacheFlag.length; ++i) {     if (!this.cacheFlag[i]) {     var first = this.cache.shift();     var flag = i;     if (first) {     this.cacheFlag[i] = true;     this.cacheData[i] = first;     console.log(first);     request(self.U(first.url), function (err, response, body) {     var cacheData = self.cacheData[flag];     console.log(cacheData);     console.log('下载文件到:' + cacheData.filename);     fs.writeFileSync(cacheData.filename, body);     self.emitter.emit('resetflag', flag)     });     }     }     }     },     resetFlag: function (flagIndex) {     console.log(this);     console.log(this.cacheFlag);     if (flagIndex >= 0 && flagIndex < this.cacheFlag.length)     this.cacheFlag[flagIndex] = false;     else     console.error('错误的标志位' + flagIndex);     //process.nextTick(this.downloadTick);     this.downloadTick();     },     */    download: function (_url, filename) {        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);        var opt = this.U(_url);        //opt.pool = {maxSockets: 2};        request.get(opt).on('error', function (err) {            console.error('下载错误:', err);            process.exit(-1000);        }).pipe(fs.createWriteStream(filename));        //console.log('完成图片下载:' + filename);        //等待        var t1 = (new Date()).getTime();        var t2 = (new Date()).getTime();        //3秒以后再继续执行        while((t2-t1) < 4000){            t2 = (new Date()).getTime();        };    },    download2: function(jpgUrl, jpgDownFile){        var jpgUrlP = url.parse(jpgUrl);        http.get({            host: jpgUrlP.host,            port: 80,            headers: {                referer: 'http://www.baidu.com',            },            path: jpgUrl,        }, function (res) {            var buffers = [];            res.on('data', function (data) {                buffers.push(data);            });            res.on('end', function () {                var body = 
Buffer.concat(buffers);                fs.writeFileSync(jpgDownFile, body);                console.log('完成图片下载:' + jpgDownFile);<pre name="code" class="javascript">/* 使用request + cheerio来爬取zngirls网站上的数据 */const request = require('request');const http = require('http');const fs = require('fs');const cheerio = require('cheerio');const url = require('url');const util = require('util');const path = require('path');const process = require('process');const events = require('events');const EventEmitter = events.EventEmitter;const async = require('async');function Crawl(girlID) {    this.girlID = girlID;    this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';    //同时可以执行两个    var self = this;    this.queue = async.queue(function (task, callback) {        //console.log('Hello' + task.name);        //执行操作        console.log('url:' + task.url + ' file:' + task.file);        self.download2(task.url, task.file, callback);    }, 2);}Crawl.prototype = {    start: function () {        //爬取个人主页        var self = this;        request.get(this.U(this.getGirlUrl()), function (err, response, body) {            if (err) {                console.error('错误信息:', err);            } else {                var $ = cheerio.load(body);                $('.igalleryli_link').each(function (i) {                    var link = $(this);                    var href = link.attr('href');                    //根据gallery的地址来继续爬取gallery                    var hrefID = href.match(/\/g\/(\d+)/)[1];                    var downDir = path.join('' + self.girlID, hrefID);                    var hostname = url.parse(response.request.href).hostname;                    var galleryUrl = url.format({                        hostname: hostname,                        pathname: href,                        protocol: 'http',                    });                    //创建下载目录(如果不存在)                    var arrDir = downDir.split(path.sep);                    var startDir = arrDir.shift();                    while (true) {      
                  if (!fs.existsSync(startDir)) {                            fs.mkdirSync(startDir);                        }                        if (arrDir.length === 0) break;                        startDir = startDir + path.sep + arrDir.shift();                    }                    //爬取影集                    request.get(self.U(galleryUrl), function (err, response, body) {                        if (err) {                            console.error('下载错误:' + response.url, err);                            process.exit(-1);                        }                        var $ = cheerio.load(body);                        var images = $('#hgallery > img');                        if (images) {                            var im = $(images[0]);                            var src = im.attr('src')                            var preUrl = src.slice(0, src.lastIndexOf('/') + 1);                            //爬取所有的图片并异步下载                            $('#dinfo > span').each(function (i) {                                    var span = $(this);                                    var matched = span.text().match(/(\d+).*/);                                    if (matched) {                                        //该影集的数目                                        var count = matched[1];                                        for (var i = 0; i < count; ++i) {                                            var jpgFile = self.formatIndex(i) + '.jpg';                                            var jpgUrl = preUrl + jpgFile;                                            var jpgDownFile = path.join(downDir, jpgFile);                                            //self.download2(jpgUrl, jpgDownFile);                                            self.queue.push({url:jpgUrl, file:jpgDownFile});                                        }                                        ;                                    }                                }                            );                        }      
              });                    console.log('爬取影集执行完毕');                });            }        });        console.log('个人全部影集执行完毕');    },    formatIndex: function (i) {        var si = i + '';        if (i === 0) {            return si;        }        while (si.length < 3) {            si = '0' + si;        }        return si;    },    getGirlUrl: function () {        return util.format(this.girlUrlFmt, this.girlID);    },    U: function (_url) {        return {            url: _url,            headers: {                referer: 'http://www.baidu.com',                connection: 'keep-alive',                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',            }        };    },    download: function (_url, filename) {        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);        var opt = this.U(_url);        //opt.pool = {maxSockets: 2};        request.get(opt).on('error', function (err) {            console.error('下载错误:', err);            process.exit(-1000);        }).pipe(fs.createWriteStream(filename)).on('close', function () {            console.log('完成图片下载:' + filename);        });    },    download2: function (jpgUrl, jpgDownFile,callback) {        var jpgUrlP = url.parse(jpgUrl);        http.get({            host: jpgUrlP.host,            port: 80,            headers: {                referer: 'http://www.baidu.com',            },            path: jpgUrl,        }, function (res) {            var buffers = [];            res.on('data', function (data) {                buffers.push(data);            });            res.on('end', function () {                var body = Buffer.concat(buffers);                fs.writeFileSync(jpgDownFile, body);                if(callback){                    //保证同步                    callback();                }                console.log('完成图片下载:' + jpgDownFile);            });        });    }};var girlID = 19705;var crawl = 
new Crawl(girlID);crawl.start();

}); }); //等待 //var t1 = (new Date()).getTime(); //var t2 = (new Date()).getTime(); ////3秒以后再继续执行 //while((t2-t1) < 4000){ // t2 = (new Date()).getTime(); //}; }};var girlID = 19705;var crawl = new Crawl(girlID);crawl.start();//crawl.download2('http://t1.zngirls.com/gallery/19705/19815/019.jpg', '119.jpg');console.log('主程序执行完毕');



使用async的queue优化


0 0
原创粉丝点击