xxxx

来源:互联网 发布:windows phone 支付宝 编辑:程序博客网 时间:2024/05/18 13:46
本文介绍如何用 Java 抓取网页上的图片: 
1. 利用正则表达式匹配网页上图片的路径和标题; 
2. 根据匹配到的路径下载图片; 
3. 将下载的图片保存(上传)到服务器。 

Java代码  收藏代码
  /**
   * A picture scraped from a web page.
   *
   * <p>Plain mutable bean: every field starts as {@code null} and is filled in
   * through its setter.
   */
  public class Picture {

      /** Title captured for the picture. */
      private String title;

      /** URL the picture is downloaded from. */
      private String source;

      /** Path the downloaded picture was saved to. */
      private String upPath;

      /**
       * @return the picture title, or {@code null} if it has not been set
       */
      public String getTitle() {
          return title;
      }

      /**
       * @param title the picture title
       */
      public void setTitle(String title) {
          this.title = title;
      }

      /**
       * @return the URL of the picture, or {@code null} if it has not been set
       */
      public String getSource() {
          return source;
      }

      /**
       * @param source the URL of the picture
       */
      public void setSource(String source) {
          this.source = source;
      }

      /**
       * @return the path the picture was saved to, or {@code null} if it has not been set
       */
      public String getUpPath() {
          return upPath;
      }

      /**
       * @param upPath the path the picture was saved to
       */
      public void setUpPath(String upPath) {
          this.upPath = upPath;
      }
  }


Java代码  收藏代码
  import java.io.BufferedReader;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStream;
  import java.net.URL;
  import java.net.URLConnection;
  import java.util.ArrayList;
  import java.util.Calendar;
  import java.util.List;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  import com.sun.xml.internal.fastinfoset.stax.events.Util;
  15.   
  16. public class CatchPicture {  
  17.   
  18.     /** 
  19.      * @param args 
  20.      */  
  21.     public static void main(String[] args) {  
  22.         // TODO Auto-generated method stub  
  23.         //定义抓取图片的 正则表达式  
  24.         String regular="[*]<b>.*?</b><br/><img src=\"(.*?)\" border=0 alt=\'(.*?)\' style=\".*?\" class=\".*?\">  
  25. ";  
  26.         List<Picture> list=new CatchPicture().lookWeiboPic("http://gaoxiao.jokeji.cn/GrapHtml/dongtai/20120921221658.htm","GBK",regular,"2,1");  
  27.         System.out.println(list.size());  
  28.     }  
  29.     //根据URL查看网站上的图片  
  30.     public List<Picture> lookWeiboPic(String url,String charset,String regular,String attIndex){  
  31.         List<Picture> list=new ArrayList<Picture>();  
  32.         try {  
  33.             //获取填写的url  
  34.             //判断所属网站 获取 正则表达式  
  35.             //获取图片存放到 list集合  
  36.             if(!Util.isEmptyString(url)){  
  37.                     String htmls = getPageSource(url.trim(),charset);  
  38.                     Pattern pattern =null;  
  39.                     pattern = Pattern.compile(regular.trim());  
  40.                     if(!Util.isEmptyString(htmls)){  
  41.                         Matcher matcher = pattern.matcher(htmls);  
  42.                           
  43.                         //得到参数属性顺序  
  44.                         String[] sort = regular.trim().split(","); //下标:0 表示 标题title , 1 表示 图片路径   
  45.                         //判断后缀后 得到网站的请求头部 http://www.moonbasa.com/p-032111106.html-->得到 http://www.moonbasa.com  
  46.                         String[] suffix;  
  47.                         suffix =url.trim().split("cn");  
  48.                         String httphread = "";  
  49.                         if (suffix.length > 1) {  
  50.                             httphread = suffix[0] + "cn";  
  51.       
  52.                         } else {  
  53.                             suffix = url.trim().split("com");  
  54.                             httphread = suffix[0] + "com";  
  55.                         }  
  56.                         //循环匹配找到的  
  57.                         while(matcher.find()){  
  58.                             Picture picture=new Picture();  
  59.                               
  60.                             //匹配出title  
  61.                             if (-1 == Integer.parseInt(sort[0])) {  
  62.                                 // 页面上抓不到标题  
  63.                                 picture.setTitle("");  
  64.                             } else {  
  65.                                 // 去标题的#  
  66.                                 String title=matcher.group(Integer.parseInt(sort[0])).replace("#"" ");  
  67.                                 picture.setTitle(title);  
  68.                             }  
  69.                               
  70.                             //匹配出source  
  71.                             if (-1 == Integer.parseInt(sort[1])) {  
  72.                                 // 页面上抓不到图片路径  
  73.                                 picture.setSource("");  
  74.                             }else{  
  75.                                 String webImgUrl=matcher.group(Integer.parseInt(sort[1]));  
  76.                                 //判断是绝对路径还是相对路径  
  77.                                 String[] pathType=webImgUrl.split(":");  
  78.                                 if(pathType.length>1){  
  79.                                     //绝对路径  
  80.                                     picture.setSource(webImgUrl);  
  81.                                 }else{  
  82.                                     //判断相对路径是否含有..  
  83.                                     pathType=webImgUrl.split("\\.\\.");  
  84.                                     if(pathType.length>1){  
  85.                                         picture.setSource(httphread+pathType[1]);  
  86.                                     }else{  
  87.                                         if(webImgUrl.startsWith("/")){  
  88.                                             picture.setSource(httphread+pathType[0]);  
  89.                                         }else{  
  90.                                             picture.setSource(httphread+"/"+pathType[0]);  
  91.                                         }  
  92.                                     }  
  93.                                 }  
  94.                             }  
  95.                             String upPath=upload(picture.getSource(),"d:\\image\\");  
  96.                             picture.setUpPath(upPath);  
  97.                             list.add(picture);  
  98.                         }//--end while  
  99.                     }  
  100.           
  101.                 }  
  102.             }catch (Exception e) {  
  103.                 e.printStackTrace();  
  104.             }  
  105.         return list;  
  106.     }   
  107.       
  108.     /** 
  109.      * 根据网路路径获取 页面源码 
  110.      * @param pageUrl 
  111.      * @param encoding 
  112.      * @return 
  113.      */  
  114.     public String getPageSource(String pageUrl,String encoding) {      
  115.     StringBuffer sb = new StringBuffer();      
  116.     try {      
  117.         //构建一URL对象      
  118.         URL url = new URL(pageUrl);      
  119.         //使用openStream得到一输入流并由此构造一个BufferedReader对象      
  120.         BufferedReader in = new BufferedReader(new InputStreamReader(url      
  121.                 .openStream(), encoding));      
  122.         String line;      
  123.         //读取www资源      
  124.         while ((line = in.readLine()) != null) {      
  125.             sb.append(line);      
  126.             sb.append("\n");    
  127.         }      
  128.         in.close();      
  129.     } catch (Exception ex) {      
  130.         System.err.println(ex);      
  131.     }      
  132.     return sb.toString();      
  133. }     
  134.       
  135.     /** 
  136.      * 上传 图片  
  137.      * @param urlStr 
  138.      * @param path 
  139.      * @return 
  140.      * @throws Exception  
  141.      */  
  142.     public String upload(String urlStr,String path) throws Exception{  
  143.         Calendar calendar = Calendar.getInstance();  
  144.         String month = calendar.get(Calendar.YEAR) + "/"  
  145.                 + (calendar.get(Calendar.MONTH) + 1);  
  146.         String filename = java.util.UUID.randomUUID().toString()  
  147.                 + getExtension(urlStr);  
  148.         path =path + month + "/";  
  149.         download(urlStr,path,filename);  
  150.         return path+month + "/" + filename;  
  151.     }  
  152.     /** 
  153.      * 根据路径 下载图片 然后 保存到对应的目录下 
  154.      * @param urlString 
  155.      * @param filename 
  156.      * @param savePath 
  157.      * @return 
  158.      * @throws Exception 
  159.      */  
  160.     public void download(String urlString, String filename,String savePath) throws Exception {  
  161.         // 构造URL  
  162.         URL url = new URL(urlString);  
  163.         // 打开连接  
  164.         URLConnection con = url.openConnection();  
  165.         //设置请求的路径  
  166.         con.setConnectTimeout(5*1000);  
  167.         // 输入流  
  168.         InputStream is = con.getInputStream();  
  169.       
  170.         // 1K的数据缓冲  
  171.         byte[] bs = new byte[1024];  
  172.         // 读取到的数据长度  
  173.         int len;  
  174.         // 输出的文件流  
  175.        File sf=new File(savePath);  
  176.        if(!sf.exists()){  
  177.            sf.mkdirs();  
  178.        }  
  179.        OutputStream os = new FileOutputStream(sf.getPath()+"\\"+filename);  
  180.         // 开始读取  
  181.         while ((len = is.read(bs)) != -1) {  
  182.           os.write(bs, 0, len);  
  183.         }  
  184.         // 完毕,关闭所有链接  
  185.         os.close();  
  186.           
  187.         is.close();  
  188.     }   
  189.       
  190. /** 
  191.  * 根据文件名 获取文件的后缀名 
  192.  * @param fileUrl 
  193.  * @return 
  194.  */  
  195.  public String getExtension(String fileUrl){  
  196.      return fileUrl.substring(fileUrl.lastIndexOf("."), fileUrl.length());  
  197.  }  
  198. }  
0 0
原创粉丝点击