网络爬虫之获取图片到本地

来源：互联网 | 发布：防范电信网络诈骗宣传 | 编辑：程序博客网 | 时间：2024/04/30 13:09

/*
* Created on Aug 26, 2011 2:41:26 PM
*
* HtmlSourceGetter.java
*
* NOTICE OF PROPRIETARY RIGHTS
*
* This program is a confidential trade secret and the property of author. Use, examination,
* reproduction, disassembly, decompiling, transfer and/or disclosure to others of
* all or any part of this software program are strictly prohibited except by express
* written agreement with author.
*
* --------------------------------------------------------------------------------------
* Modification History
* Date           Author       Version       Description
* Aug 26, 2011       Cross       1.0       New
* --------------------------------------------------------------------------------------
*/

package com.cross.tools;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

public class HtmlSourceGetter {

    // Class-wide handles for an HTTP connection and the streams used with it.
    // They are static and mutable, so every caller of this class shares the same
    // references; all three start out as null until a method assigns them.
    private static HttpURLConnection con = null;
    private static BufferedInputStream bis = null;
    private static OutputStream out = null;

    public static void getSource(String url) {

    public static void parseHTML(String url, String keyword) {

    private static void processNodeList(NodeList list, String keyword) {

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "c:/cross/";

    /**
     * Downloads every image referenced by an {@code <img>} tag of the given page.
     *
     * <p>The page is parsed as UTF-8 and each image tag found is printed to
     * standard output (URL and raw tag text) before its image is saved into
     * {@link #IMAGE_DIR} as {@code <index>_<currentTimeMillis><extension>}.
     *
     * <p>This method does not throw: a failure to fetch or store one image is
     * reported on standard error and the remaining images are still processed;
     * any other failure (for example the page cannot be parsed) is reported on
     * standard error and ends the run.
     *
     * @param url address of the HTML page to scan for images
     */
    public static void extractLinks(String url) {
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("UTF-8");

            // Only <img> tags are of interest here.
            NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
            NodeList nodeList = parser.extractAllNodesThatMatch(imageFilter);

            File targetDir = new File(IMAGE_DIR);
            if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
                // Keep going: each download below reports its own failure.
                System.err.println("Could not create image directory: " + targetDir);
            }

            for (int i = 0; i < nodeList.size(); i++) {
                ImageTag image = (ImageTag) nodeList.elementAt(i);
                String imageURL = image.getImageURL();
                String imageText = image.getText();
                System.out.println("imageURL:" + imageURL);
                System.out.println("imageText:" + imageText);

                try {
                    downloadImage(imageURL, targetDir, i);
                } catch (IOException e) {
                    // One broken image must not stop the remaining downloads.
                    System.err.println("Failed to download image: " + imageURL);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            // Boundary catch: callers of this method have never had to handle exceptions.
            e.printStackTrace();
        }
    }

    /**
     * Fetches one image over HTTP and writes it into {@code targetDir}.
     * The connection and both streams are released even when the transfer fails.
     *
     * @param imageURL  absolute URL of the image
     * @param targetDir directory the image file is created in
     * @param index     position of the image on the page, used as file-name prefix
     * @throws IOException if the URL is malformed, or the image cannot be read or written
     */
    private static void downloadImage(String imageURL, File targetDir, int index) throws IOException {
        URL source = new URL(imageURL);
        // Derive the extension from the URL path only, so query strings and host names never leak into the file name.
        String fileName = index + "_" + System.currentTimeMillis() + extensionOf(source.getPath());
        File target = new File(targetDir, fileName);

        HttpURLConnection connection = (HttpURLConnection) source.openConnection();
        try {
            connection.connect();
            BufferedInputStream in = new BufferedInputStream(connection.getInputStream());
            try {
                OutputStream fileOut = new FileOutputStream(target);
                try {
                    byte[] buf = new byte[1024];
                    int size;
                    while ((size = in.read(buf)) != -1) {
                        fileOut.write(buf, 0, size);
                    }
                } finally {
                    fileOut.close();
                }
            } finally {
                in.close();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Returns the file extension (including the leading dot) of the last segment
     * of a URL path, or an empty string when that segment contains no dot.
     *
     * @param path path component of a URL, e.g. {@code /img/logo.png}
     * @return the extension such as {@code .png}, or {@code ""}
     */
    private static String extensionOf(String path) {
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        return dot < 0 ? "" : lastSegment.substring(dot);
    }


    /**
     * Command-line entry point: runs the image extraction against the page
     * served at {@code http://localhost:8080/}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Other entry points, e.g. parseHTML(url, keyword), can be tried here in the same way.
        final String startPage = "http://localhost:8080/";
        extractLinks(startPage);
    }

}