老司机开车记

来源:互联网 发布:数学必修三算法循环 编辑:程序博客网 时间:2024/04/28 10:21

node.js+jsdom 小爬虫 并不是框架>.<

解决某些个人问题:

利用了 DFS(深度优先搜索)递归遍历页面中的链接:遇到目录链接就继续深入抓取,遇到文件链接就下载保存。


"use strict";let https=require('https');let fs=require('fs');let path=require('path');let jsdom=require('jsdom');let visted=[];function isDir(url){  if(url.indexOf('.')!==-1){    return false;  }  try{    fs.mkdirSync(url);    console.log('mkdir:'+url);  }catch(e){    console.log(`IOerror:${e}`)  }  return true;}function isUrlVisited(url){  for(let i=0;i<visted.length;i++){    if(visted[i]===url){      return true;    }  }  visted.push(url);  return false;}function writeFile(url){  if(isDir(url)&&isUrlVisited(url)){    return;  }  let filePath=path.parse(url);  try{    urltoFile(url);  }catch(error){    console.log(error)  }}function urltoFile(url){  https.request(root+url,  (res)=>{    res.on('data',(data)=>{      fs.appendFileSync(url,data);    });  }).on('error',(e)=>{    console.log(e);  }).end();}function dealUrl(url){  isUrlVisited(url);  isDir(url);  writeFile(url);}let root='https://www.seryox.com';function applyUrl(url){  jsdom.env({    url: url,    scripts: ["http://code.jquery.com/jquery.js"],    done: function (err, window) {        let $=window.$;        console.log('done');        try{        let arr=$('a');        console.log(arr.length)        for(let i=0;i<arr.length;i++){          let href=$(arr[i]).attr('href');            if(href.match(/^\/pic/)&&!isUrlVisited(href)){              console.log(href);              if(isDir(href)){                console.log('dir:'+url+href);                applyUrl(root+href);              }else{                console.log('file:'+url+href);                writeFile(href);              }            }        }      }catch(e){        console.log(e+'@'+url);      }    }  });}applyUrl(root+'/');

抓取后生成的文件夹名称仍然是 UTF-8 的 URL 编码(百分号编码)形式,例如 %E5%9B%BE 这样的名字。

可以用下面这个脚本把这些名称批量解码,还原成可读的名字:

"use strict";let fs=require('fs');let root='/pic';function isDir(url){  if(url.indexOf('.')!==-1){    return false;  }  return true;}function rename(path,name){  if(name.indexOf('%')!==-1)  console.log(path+'/'+name+' to '+path+'/'+decodeURI(name))    fs.renameSync(path+'/'+name,path+'/'+decodeURI(name));}function main(path){  let fd=fs.readdirSync(path);  for(let i=0;i<fd.length;i++){    if(isDir(fd[i])){      main(path+'/'+fd[i]);    }    rename(path,fd[i]);  }}main(root);


0 0
原创粉丝点击