Node.js HTTP requests with gzip/deflate compression



One of my recent projects involved scraping some web data for offline processing. I started using the excellent request library by Mikeal Rogers, which has a number of nice and convenient improvements over the default Node http library.

As I unleashed my first prototype on the web, the database started growing much faster than I had planned. I started by storing raw and uncompressed response data, so an immediate optimization was to use the Accept-Encoding HTTP request header to fetch compressed data from the server.

Unfortunately, some of my target servers sometimes sent back uncompressed data (which they're entitled to do under the HTTP spec; it's just slightly annoying). I needed a way to conditionally handle compressed data based on the Content-Encoding response header. I found a solution that worked with the default Node.js HTTP library, but it wasn't immediately obvious how to port that to Mikeal's request library.
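
I won't reproduce that solution verbatim, but a minimal sketch of the idea with the built-in http and zlib modules looks something like this (the host is just a placeholder, and piping to stdout stands in for whatever you do with the body):

var http = require('http'),
    zlib = require('zlib');

// Ask for compressed data, then check Content-Encoding on the response
// and pipe through the matching zlib stream only if needed.
http.get({
    host: 'example.com',
    path: '/',
    headers: { 'accept-encoding': 'gzip,deflate' }
}, function(res) {
    var encoding = res.headers['content-encoding'];
    var output;

    if (encoding == 'gzip') {
        output = res.pipe(zlib.createGunzip());
    } else if (encoding == 'deflate') {
        output = res.pipe(zlib.createInflate());
    } else {
        output = res;
    }

    output.pipe(process.stdout);
});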

Approach 1: no streams

My first solution collected data chunks into a Buffer, then passed that into the relevant zlib functions if needed. It’s more code than I wanted, but it works well.

Note: for simplicity, I’ve left out the logic that writes the compressed response body to the database.

var request = require('request'),
    zlib = require('zlib');

var headers = {
    "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "accept-language": "en-US,en;q=0.8",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "accept-encoding": "gzip,deflate"
};

var options = {
    url: "http://google.com",
    headers: headers
};

var requestWithEncoding = function(options, callback) {
    var req = request.get(options);

    req.on('response', function(res) {
        // Collect the (possibly compressed) response body into memory.
        var chunks = [];
        res.on('data', function(chunk) {
            chunks.push(chunk);
        });

        res.on('end', function() {
            var buffer = Buffer.concat(chunks);
            // Decompress only if the server actually sent compressed data.
            var encoding = res.headers['content-encoding'];
            if (encoding == 'gzip') {
                zlib.gunzip(buffer, function(err, decoded) {
                    callback(err, decoded && decoded.toString());
                });
            } else if (encoding == 'deflate') {
                zlib.inflate(buffer, function(err, decoded) {
                    callback(err, decoded && decoded.toString());
                });
            } else {
                callback(null, buffer.toString());
            }
        });
    });

    req.on('error', function(err) {
        callback(err);
    });
};

requestWithEncoding(options, function(err, data) {
    if (err) console.log(err);
    else console.log(data);
});

Approach 2: streams

The downside of the first approach is that all of the response data is buffered in memory. That was fine for my use case, but it can cause memory issues if you're scraping sites with very large response bodies.

A better approach is to use streams, as Mikeal suggested. Streams are a wonderful abstraction that, among other things, can help you manage memory consumption. There are two great introductions to Node streams here and here. Keep in mind that streams in Node.js are somewhat intricate and still evolving (for example, Node 0.10 introduced streams2, which is not entirely backwards compatible with older versions of Node).

Here’s a working solution that pipes response data into a zlib stream, then pipes that into a final destination (a file, in this case). Notice that the code is cleaner and more readable.

var request = require('request'),
    zlib = require('zlib'),
    fs = require('fs');

var headers = {
    "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "accept-language": "en-US,en;q=0.8",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "accept-encoding": "gzip,deflate"
}

var options = {
    url: "http://google.com",
    headers: headers
}

var compressedRequest = function(options, outStream) {
    var req = request(options)

    req.on('response', function(res) {
        if (res.statusCode !== 200) throw new Error('Status not 200')

        // Pipe through the matching zlib stream only when the response
        // is actually compressed.
        var encoding = res.headers['content-encoding']
        if (encoding == 'gzip') {
            res.pipe(zlib.createGunzip()).pipe(outStream)
        } else if (encoding == 'deflate') {
            res.pipe(zlib.createInflate()).pipe(outStream)
        } else {
            res.pipe(outStream)
        }
    })

    req.on('error', function(err) {
        throw err
    })
}

// Dummy write stream. Substitute with any other writable stream.
var outStream = fs.createWriteStream('./sample.html')
compressedRequest(options, outStream)


Summary

Both of those approaches will get the job done with Mikeal’s library, and the one you choose depends on the use case. In my project, I needed to save the compressed response data as a field of a Mongoose document, then further process the decompressed data. Streams don’t suit this use case well, so I used the first approach.
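
For context, saving the compressed body with Mongoose can look roughly like the sketch below. The model and field names here are hypothetical, since the actual schema from the project isn't shown in this post:

var mongoose = require('mongoose');

// Hypothetical schema: keep the raw (possibly gzipped) bytes in a Buffer
// field, along with the Content-Encoding so it can be decoded later.
var pageSchema = new mongoose.Schema({
    url:             String,
    contentEncoding: String,
    compressedBody:  Buffer
});

var Page = mongoose.model('Page', pageSchema);

// Inside the 'end' handler from approach 1, the raw buffer is available
// before decompression, so it can be stored directly:
//   new Page({ url: options.url, contentEncoding: encoding,
//              compressedBody: buffer }).save(callback);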
