node.js使用url下载文件

来源：互联网发布：java三维地质建模编辑：程序博客网时间：2024/06/16 21:56

采用http协议来实现文件下载。其优点在于不需要依赖额外的程序来下载文件。

[javascript] view plain copy
// Dependencies  
var fs = require('fs');  
var url = require('url');  
var http = require('http');  
var exec = require('child_process').exec;  
var spawn = require('child_process').spawn;  
  
// App variables
var file_url = 'http://www.sina.com/tmp.jpg';
var DOWNLOAD_DIR = './downloads/';

// We will be downloading the files to a directory, so make sure it's there.
// This step is not required if you have manually created the directory.
//
// fs.mkdir with `recursive: true` acts like `mkdir -p`: missing parent
// directories are created and an already existing directory is not an error.
// Doing it through fs avoids spawning a shell and an external `mkdir`
// program, so the script also works where `mkdir -p` is unavailable.
fs.mkdir(DOWNLOAD_DIR, { recursive: true }, function(err) {
    // Nothing can be saved without the target directory, so fail loudly.
    if (err) throw err;
    // download_file_httpget is assigned further down in this script; that is
    // fine because this callback only runs after the script body has finished.
    download_file_httpget(file_url);
});
  
// Function to download file using HTTP.get
//
// Fetches `file_url` over plain HTTP and stores the response body in
// DOWNLOAD_DIR, named after the last segment of the URL path.
// Failures (network error, non-200 status, file system error) are reported
// on the console; nothing is thrown and nothing is returned.
//
// @param {string} file_url  absolute http:// URL of the file to download
var download_file_httpget = function(file_url) {
    var parsed = url.parse(file_url);
    var options = {
        // Use hostname + port separately: `host` would carry ":port" along.
        hostname: parsed.hostname,
        // Honour an explicit port in the URL, fall back to the HTTP default.
        port: parsed.port || 80,
        // `path` keeps the query string, which `pathname` would drop.
        path: parsed.path
    };

    var file_name = parsed.pathname.split('/').pop();

    http.get(options, function(res) {
        // Only a 200 response carries the file itself; redirects and error
        // pages must not be saved under the file's name.
        if (res.statusCode !== 200) {
            res.resume(); // drain the body so the socket is released
            console.log('Failed: HTTP ' + res.statusCode + ' for ' + file_url);
            return;
        }

        // Create the file only once we know there is something to store.
        var file = fs.createWriteStream(DOWNLOAD_DIR + file_name);

        file.on('error', function(err) {
            console.log('Failed: ' + err.message);
        });
        // 'finish' fires after all data has been flushed to the file.
        file.on('finish', function() {
            console.log(file_name + ' downloaded to ' + DOWNLOAD_DIR);
        });
        res.on('error', function(err) {
            console.log('Failed: ' + err.message);
        });

        // pipe() handles backpressure and ends the file when the response ends.
        res.pipe(file);
    }).on('error', function(err) {
        // Connection-level failures (DNS, refused connection, ...).
        console.log('Failed: ' + err.message);
    });
};

使用curl来下载文件，我们需要采用child_process模块中的spawn方法。

[javascript] view plain copy
// Function to download file using curl
//
// Runs `curl <file_url>` as a child process and streams its standard output
// into DOWNLOAD_DIR, named after the last segment of the URL path.
// The outcome is reported on the console; nothing is thrown or returned.
//
// @param {string} file_url  URL handed to curl
var download_file_curl = function(file_url) {

    // extract the file name
    var file_name = url.parse(file_url).pathname.split('/').pop();
    // create an instance of writable stream
    var file = fs.createWriteStream(DOWNLOAD_DIR + file_name);
    file.on('error', function(err) {
        console.log('Failed: ' + err.message);
    });

    // execute curl using child_process' spawn function
    //  - "--" ends option parsing, so a URL starting with "-" cannot be
    //    mistaken for a curl option
    //  - stderr is inherited instead of piped: a piped stream that nobody
    //    reads can fill up and stall curl on long downloads
    var curl = spawn('curl', ['--', file_url], {
        stdio: ['ignore', 'pipe', 'inherit']
    });

    // Set when spawning itself failed, to avoid reporting the failure twice.
    var spawn_failed = false;

    // stream curl's output into the file; pipe() handles backpressure and
    // ends the file when curl's stdout ends
    curl.stdout.pipe(file);

    // emitted when the process could not be started (e.g. curl is not installed)
    curl.on('error', function(err) {
        spawn_failed = true;
        console.log('Failed: ' + err.message);
    });

    // 'close' fires once the process has ended and its stdio streams are
    // closed, so the exit code decides which message is printed
    curl.on('close', function(code) {
        if (spawn_failed) {
            return;
        }
        if (code !== 0) {
            console.log('Failed: ' + code);
            return;
        }
        console.log(file_name + ' downloaded to ' + DOWNLOAD_DIR);
    });
};

另外的方法是使用wget，这种方法的代码非常简洁。

[javascript] view plain copy
// Function to download file using wget
//
// Runs wget so that it saves `file_url` into DOWNLOAD_DIR (wget picks the
// local file name itself). The outcome is reported on the console; nothing
// is thrown or returned.
//
// @param {string} file_url  URL handed to wget
var download_file_wget = function(file_url) {
    // execFile runs the program directly, without a shell, so characters in
    // file_url (spaces, ';', '$(...)', ...) cannot alter the command.
    var execFile = require('child_process').execFile;

    // extract the file name (only used for the log message below)
    var file_name = url.parse(file_url).pathname.split('/').pop();

    // compose the wget arguments
    //  - --no-verbose keeps wget's progress output small; the output is
    //    buffered in memory and a long progress log could exceed maxBuffer
    //  - "--" ends option parsing, so a URL starting with "-" is not
    //    mistaken for a wget option
    var args = ['--no-verbose', '-P', DOWNLOAD_DIR, '--', file_url];

    // execute wget using child_process' execFile function
    execFile('wget', args, function(err, stdout, stderr) {
        if (err) {
            // Throwing from this asynchronous callback could not be caught by
            // the caller and would take the whole process down; report instead.
            console.log('Failed: ' + err.message);
            return;
        }
        console.log(file_name + ' downloaded to ' + DOWNLOAD_DIR);
    });
};

child_process.spawn 与 child_process.exec 最大的不同之处在于返回值：spawn returns a stream and exec returns a buffer.

child_process.spawn returns an object with stdout and stderr streams. You can tap on the stdout stream to read data that the child process sends back to Node. stdout, being a stream, has the "data", "end", and other events that streams have. spawn is best used when you want the child process to return a large amount of data to Node - image processing, reading binary data etc.

child_process.spawn is "asynchronously asynchronous", meaning it starts sending back data from the child process in a stream as soon as the child process starts executing.

child_process.exec returns the whole buffer output from the child process. By default the buffer size is set at 200k (this was the default in older Node.js releases; newer releases default to 1 MB). If the child process returns anything more than that, your program will crash with the error message "Error: maxBuffer exceeded". You can fix that problem by setting a bigger buffer size in the exec options. But you should not do it because exec is not meant for processes that return HUGE buffers to Node. You should use spawn for that. So what do you use exec for? Use it to run programs that return result statuses, instead of data.

child_process.exec is "synchronously asynchronous", meaning although the exec is asynchronous, it waits for the child process to end and tries to return all the buffered data at once. If the buffer size of exec is not set big enough, it fails with a "maxBuffer exceeded" error.

另外，node作为server可以实现upload以及download功能。

如下的code是download的功能。

[javascript] view plain copy
// Download route: looks up the stored file named by the :fileid URL parameter
// and streams it back to the client as an attachment.
app.get('/upload/:fileid', function(req, res){
    gfs.getGridFile(req.params.fileid, function(err, file){
        // Lookup failed: answer with an error instead of dereferencing `file`.
        if (err) {
            return res.status(500).send('Failed to read file');
        }
        // NOTE(review): assumes getGridFile passes a falsy `file` when the id
        // is unknown - confirm against the gfs helper.
        if (!file) {
            return res.status(404).send('File not found');
        }

        // The stored name ends up inside a quoted header value: replace quotes,
        // backslashes and line breaks so it cannot break out of the quotes or
        // inject additional headers.
        var safe_name = String(file.filename).replace(/["\\\r\n]/g, '_');

        res.header('Content-Type', file.contentType);
        // "attachment" is what makes the browser download instead of display.
        res.header('Content-Disposition', 'attachment; filename="' + safe_name + '"');
        return file.stream(true).pipe(res);
    });
});

下载功能的关键在于响应头 Content-Disposition 中的 attachment 选项。

如下的code是upload功能。 Upload的文件在req.files中体现。

对于上传的文件，一种方法是保存在server的指定目录下，另外一种方法是保存在MongoDB中。

[javascript] view plain copy
    app.post('/upload/create', function(req, res){  
        //1. save the upload file to folder public/doc/  
//      var tmppath = req.files.file.path;  
//      var targetpath = /public/doc/'+req.files.file.name;  
//  
//      fs.rename(tmppath, targetpath, function(err){  
//          if (err) throw err;  
//  
//          fs.unlink(tmppath, function(){  
//              if (err) throw err;  
//  
//              //res.send({filename: req.files.file.name});  
//          });  
//      });  
  
        //2. save the upload file to mongoDB GridFile.  
        var opts = {content_type: req.files.file.type};  
  
        gfs.putGridFileByPath(req.files.file.path, req.files.file.name, opts, function(err, result){  

阅读全文

0 0