Jsoup数据抓取

来源:互联网 发布:淘宝卖家开通花呗要求 编辑:程序博客网 时间:2024/05/17 19:21
  1. /*** 
  2.      * 美图抓取 
  3.      * http://www.tupianzj.com/meinv/  
  4.      * @time 2014-9-5上午11:10:25 
  5.      */  
  6.     public static void search2() {  
  7.           
  8.         String httpUrl = "http://www.tupianzj.com/meinv/";  
  9.           
  10.         try {  
  11.               
  12.             Document doc = Jsoup.connect(httpUrl).get();  
  13.               
  14.             Elements items = doc.select(".meinv970 dl");  
  15.               
  16.             System.out.println(items);  
  17.               
  18.             for(Element item:items){  
  19.                   
  20.                 Elements childItemsDT = item.select("dt h2 a");  
  21.                   
  22.                 String title=childItemsDT.html();  
  23.                       
  24.                 System.out.println(childItemsDT.html());  
  25.                   
  26.                 Elements childItemsDD1 = item.select("dd li a img");  
  27.                   
  28.                 for(Element childItemdd1:childItemsDD1){  
  29.                       
  30.                     String picUrl0=childItemdd1.attr("src");  
  31.                       
  32.                     saveImg("小",title, picUrl0);  
  33.                       
  34.                     System.out.println(picUrl0);  
  35.                 }  
  36.                   
  37.                 Elements childItemsDD2 = item.select("dd li .moxflashtext a");  
  38.                   
  39.                 System.out.println(childItemsDD2);  
  40.                   
  41.                 for(Element childItem:childItemsDD2){  
  42.                       
  43.                     String secondUrl="http://www.tupianzj.com"+childItem.attr("href");  
  44.                       
  45.                     Document childDoc = Jsoup.connect(secondUrl).get();  
  46.                       
  47.                     String picUrl=childDoc.select(".pictu900 img").attr("src");  
  48.                       
  49.                     System.out.println(picUrl);  
  50.                       
  51.                     saveImg("大",title, picUrl);  
  52.                       
  53.                     Elements thirdChilds=childDoc.select(".pages li a");  
  54.                       
  55.                     for(Element thirdChild:thirdChilds){  
  56.                           
  57.                         String isHave = thirdChild.attr("href");  
  58.                           
  59.                         if(!"".endsWith(isHave) && !"javascript:dPlayPre();".endsWith(isHave) && !"#".endsWith(isHave) && !"#".endsWith(isHave)){  
  60.                               
  61.                             String url=secondUrl.substring(0,secondUrl.lastIndexOf("/")+1)+isHave;  
  62.                               
  63.                             Document secondChildDoc = Jsoup.connect(url).get();  
  64.                               
  65.                             String picUrl1=secondChildDoc.select(".pictu900 img").attr("src");  
  66.                               
  67.                             System.out.println(picUrl1);  
  68.                               
  69.                             saveImg("大",title, picUrl1);  
  70.                               
  71.                         }  
  72.                           
  73.                     }  
  74.                 }  
  75.                   
  76.             }  
  77.               
  78.         } catch (IOException e) {  
  79.               
  80.             e.printStackTrace();  
  81.         }  
  82.     }  
  83.       
  84.       
  85.     /*** 
  86.      * 保存图片 
  87.      * @time 2014-9-5上午11:10:25 
  88.      */  
  89.      public static String saveImg(String tag,String name,String picUrl) {  
  90.               
  91.             String fileName = "";  
  92.               
  93.             fileName = tag+System.currentTimeMillis()+".jpg";  
  94.               
  95.             File realDirectory = new File("D:/pic/"+name+"/");  
  96.               
  97.             if (!realDirectory.exists()) {  
  98.                 realDirectory.mkdirs();  
  99.             }  
  100.               
  101.             try {  
  102.                 // 构造URL  
  103.                 URL url = new URL(picUrl);  
  104.                 // 打开连接  
  105.                 URLConnection con = url.openConnection();  
  106.                 // 输入流  
  107.                 InputStream is = con.getInputStream();  
  108.                 // 1K的数据缓冲  
  109.                 byte[] bs = new byte[1024];  
  110.                 // 读取到的数据长度  
  111.                 int len;  
  112.                 // 输出的文件流  
  113.                 OutputStream os = new FileOutputStream("D:/pic/"+name+"/"+fileName);  
  114.                 // 开始读取  
  115.                 while ((len = is.read(bs)) != -1) {  
  116.                     os.write(bs, 0, len);  
  117.                 }  
  118.                 // 完毕,关闭所有链接  
  119.                 os.close();  
  120.                 is.close();  
  121.                   
  122.             } catch (Exception e) {  
  123.                   
  124.                 e.printStackTrace();  
  125.             }   
  126.               
  127.             return fileName;  
  128.               
  129.         }  
0 0
原创粉丝点击