使用nodejs发起get或post请求抓取网页内容,支持中文编码

来源:互联网 发布:快乐星球电影知乎 编辑:程序博客网 时间:2024/05/22 02:22

使用 Node.js 发起 GET 或 POST 请求抓取网页内容:支持中文编码(GBK、GB2312)、请求超时设置、HTTP 与 HTTPS 协议,支持以 POST 方式发送 JSON 数据,并可指定请求数据的编码格式。

1. [代码]单元测试:     跳至 [1] [2] [全屏预览]

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
 * @fileOverview tool/httpHelper单元测试
 * @author 菱角
 * @module test/tool/httpHelper
 */
 
var should = require('chai').should();
var httpHelper = require("../../tool/httpHelper");
//var url='http://www.yunhosting.com/index.asp';
var url = 'https://github.com/visionmedia/express';

// Browser-like User-Agent shared by the tests that send custom headers.
var userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36';

/**
 *  @function tool/httpHelper
 *  @description Unit tests for tool/httpHelper. These tests issue real
 *  network requests against `url` and http://www.google.com/.
 */
describe('test tool/httpHelper', function() {

    /**
     *  @function get
     *  @description tool/httpHelper get
     */
    describe('#get', function() {
        it('get方式请求页面不报错', function(done) {
            httpHelper.get(url, 1000, function(err, data) {
                if (err) {
                    return done(err);
                }
                should.exist(data);
                done();
            }, 'gbk', {'User-Agent': userAgent});
        });

        it('get方式请求Google首页超时', function(done) {
            // A 10 ms timeout is expected to expire before any response arrives.
            httpHelper.get('http://www.google.com/', 10, function(err, data) {
                if (err && err.message === 'request timeout') {
                    return done();
                }

                done(new Error('超时时间设置无效'));
            }, 'gbk');
        });

    });

    /**
     *  @function post
     *  @description tool/httpHelper post
     */
    describe('#post', function() {
        it('post方式请求页面不报错', function(done) {
            httpHelper.post(url, 1000, {}, function(err, data) {
                if (err) {
                    return done(err);
                }
                should.exist(data);
                done();
            }, 'gbk', {'User-Agent': userAgent});
        });

        it('post方式给页面发送json数据不报错', function(done) {
            httpHelper.post(url, 1000, {
                username: 'username',
                pwd: 'pwd'
            }, function(err, data) {
                if (err) {
                    return done(err);
                }
                should.exist(data);
                done();
            }, 'gbk', undefined, 'gbk', true);
        });

    });

    /**
     *  @function request
     *  @description tool/httpHelper request
     */
    describe('#request', function() {

        it('get方式请求,直接返回二进制数据', function(done) {
            var options = require('url').parse(url);
            options.method = 'GET';
            // buffer=true asks the helper to skip decoding and hand back raw bytes.
            options.buffer = true;
            httpHelper.request(options, 1000, {}, function(err, data) {
                if (err) {
                    return done(err);
                }
                should.exist(data);
                (typeof data).should.equals('object');
                Buffer.isBuffer(data).should.equal(true);
                done();
            });
        });
    });
});

2. [文件] httpHelper.js ~ 5KB     下载(25)     

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/**
 * @fileOverview http请求的工具操作集,包含请求超时时间设置
 * @author menglb
 * @module tool/httpHelper
 */
 
var http = require('http');
var https = require('https');
var qs = require('querystring');
var iconv = require('iconv-lite');
var BufferHelper = require('bufferhelper');
 
/**
 * @exports tool/httpHelper
 */
var httpHelper = {

    /**
     * @description Base method for issuing a remote request. The callback is
     * invoked at most once: with the response body on success, or with an error
     * on request failure, response-stream failure, decode failure or timeout.
     * @param {Object} options Request options, passed through to http(s).request
     * @param {String} [options.protocol='http:'] Protocol; the https module is used only when this is exactly 'https:'
     * @param {String} [options.method='GET'] Request method (get, post, ...), case-insensitive
     * @param {Object=} options.headers Request headers; copied (never mutated) when POST headers are added
     * @param {String=} options.encode Encoding of the request data; when 'gbk', values are encoded with escape()
     * @param {Boolean=} [options.json=false] Whether the data is sent as JSON
     * @param {Boolean=} [options.buffer=false] Whether to return the raw Buffer instead of a decoded string
     * @param {Number=} timeout Timeout in milliseconds; values that are falsy or <= 0 disable it
     * @param {Object=} data Data object to send with the request
     * @param {RequestCallback} callback Callback handling the response, see {@link RequestCallback}
     * @param {String} [encoding='utf-8'] Encoding used to decode the response body
     */
    request: function(options, timeout, data, callback, encoding) {
        var httpLib = options.protocol === 'https:' ? https : http;
        // Honour the documented default instead of throwing when method is absent.
        var method = options.method || 'GET';
        var wantsBuffer = Boolean(options.buffer);

        var content;
        if (options.json) {
            content = JSON.stringify(data);
            // JSON.stringify yields undefined for undefined/function input, which
            // would crash Buffer.byteLength and req.write below.
            if (content === undefined) {
                content = '';
            }
        } else if (options.encode && options.encode.toLowerCase() === 'gbk') {
            // NOTE(review): escape() emits %uXXXX for non-Latin-1 characters, which is
            // not GBK percent-encoding; kept because it is the documented behaviour.
            content = qs.stringify(data, null, null, {encodeURIComponent: escape});
        } else {
            content = qs.stringify(data);
        }

        if (method.toLowerCase() === 'post') {
            // Work on a copy so the caller's header object is left untouched.
            var supplied = options.headers || {};
            var headers = {};
            Object.keys(supplied).forEach(function(name) {
                headers[name] = supplied[name];
            });
            headers['Content-Type'] = options.json ? 'application/json' : 'application/x-www-form-urlencoded';
            headers['Content-Length'] = Buffer.byteLength(content);
            options.headers = headers;
        }

        // Several events can fire for one request (e.g. timeout followed by the
        // error raised by destroying the socket); only the first one is reported.
        var settled = false;
        var done = function() {
            if (settled) {
                return;
            }
            settled = true;
            callback.apply(null, arguments);
        };

        var req = httpLib.request(options, function(res) {
            var chunks = [];
            res.on('data', function(chunk) {
                chunks.push(chunk);
            });
            res.on('error', function(err) {
                done(err);
            });
            res.on('end', function() {
                var raw = Buffer.concat(chunks);
                var body;
                if (wantsBuffer) {
                    body = raw;
                } else {
                    try {
                        body = iconv.decode(raw, encoding == null ? 'utf-8' : encoding);
                    } catch (decodeErr) {
                        // Report decode failures through the callback rather than
                        // throwing from inside an event handler.
                        return done(decodeErr);
                    }
                }
                done(null, body, res, req);
            });
        });

        req.on('error', function(err) {
            done(err);
        });

        if (timeout && timeout > 0) {
            req.setTimeout(timeout, function() {
                // Report first, then tear the request down so the socket is released;
                // the resulting 'error' event is swallowed by the once-guard.
                done(new Error('request timeout'), '');
                req.destroy();
            });
        }

        if (content) {
            req.write(content);
        }

        req.end();
    },

    /**
     * @description Issue a remote request using GET
     * @param {String} url Request URL
     * @param {Number=} timeout Timeout in milliseconds
     * @param {RequestCallback} callback Callback handling the response, see {@link RequestCallback}
     * @param {String} [encoding='utf-8'] Encoding used to decode the response body
     * @param {Object=} header Request headers object
     */
    get: function(url, timeout, callback, encoding, header) {
        var options = require('url').parse(url);
        options.method = 'GET';
        if (header) {
            options.headers = header;
        }

        // Reference the helper directly so the method also works when detached from the object.
        httpHelper.request(options, timeout, {}, callback, encoding);
    },

    /**
     * @description Issue a remote request using POST
     * @param {String} url Request URL
     * @param {Number=} timeout Timeout in milliseconds
     * @param {Object=} data Data object to send with the request
     * @param {RequestCallback} callback Callback handling the response, see {@link RequestCallback}
     * @param {String} [encoding='utf-8'] Encoding used to decode the response body
     * @param {Object=} header Request headers object
     * @param {String=} reqEncoding Encoding of the request data; when 'gbk', values are encoded with escape()
     * @param {Boolean=} [json=false] Whether the data is sent as JSON
     */
    post: function(url, timeout, data, callback, encoding, header, reqEncoding, json) {
        var options = require('url').parse(url);
        options.method = 'POST';
        if (header) {
            options.headers = header;
        }
        if (reqEncoding) {
            options.encode = reqEncoding;
        }
        if (json) {
            options.json = json;
        }
        httpHelper.request(options, timeout, data, callback, encoding);
    }
};
 
/**
 * @description Callback that handles the outcome of a request
 * @callback RequestCallback
 * @param {Object} err Error object from the request or response; null on success
 * @param {string} data Response data (a raw Buffer when options.buffer is set; '' on timeout)
 * @param {Object} res Response stream object (only passed on success)
 * @param {Object} req Request object (only passed on success)
 */
 
module.exports = httpHelper;
0 0