打开浏览器与抓取网站内容

来源：互联网 | 发布：linux 服务启动命令 | 编辑：程序博客网 | 时间：2024/06/05 21:03

import java.awt.Desktop;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Demonstrates three ways of reaching a web page from Java: opening it in the
 * desktop's default browser, fetching it with Apache Commons HttpClient 3.x,
 * and fetching it with the JDK's own {@link HttpURLConnection}.
 */
public class OpenWin {

    /** Page used by the demos in {@link #open1()} and {@link #main(String[])}. */
    private static final String DEMO_URL = "http://localhost:8080/TestS/login.jsp";

    /** Connect and read timeout, in milliseconds, used by {@link #getPageContent}. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Opens the demo page in the user's default browser.
     *
     * <p>Does nothing when the {@link Desktop} API is not supported on this
     * platform. Any failure is printed to standard error and not propagated.
     * {@code Desktop.browse} is used rather than launching a specific browser
     * executable so the call does not depend on which browser is installed.
     */
    public static void open1() {
        try {
            URI uri = new URI(DEMO_URL);
            if (Desktop.isDesktopSupported()) {
                Desktop.getDesktop().browse(uri);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches a page with an HTTP POST using Apache Commons HttpClient and
     * prints both the raw body and a copy with HTML entities and tags removed.
     *
     * <p>Requires commons-httpclient.jar, commons-codec.jar and
     * commons-logging-1.1.1.jar on the classpath.
     *
     * <p>The HTTP status code is not inspected; whatever body the server sends
     * is returned.
     *
     * @param url   address to POST to
     * @param param currently NOT sent with the request. To submit it as a form
     *              field, call {@code postMethod.setRequestBody(new NameValuePair[]
     *              { new NameValuePair("keyword", param) })} before executing.
     * @return the raw response body, or {@code null} if the request failed or
     *         the response had no body
     */
    public static String createhttpClient(String url, String param) {
        HttpClient client = new HttpClient();
        PostMethod postMethod = new PostMethod(url);
        String response = null;

        try {
            client.executeMethod(postMethod);
            // HttpClient has already decoded the body into a String (per its
            // documentation, using the charset of the response Content-Type),
            // so re-encoding it to UTF-8 bytes and back would change nothing.
            response = postMethod.getResponseBodyAsString();
            System.out.println("---response==" + response);
            if (response != null) {
                // Drop named HTML entities first, then every tag.
                String p = response.replaceAll("\\&[a-zA-Z]{1,10};", "")
                        .replaceAll("<[^>]*>", "");
                System.out.println(p);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // HttpClient 3.x keeps the connection checked out until it is
            // released explicitly, whether or not the request succeeded.
            postMethod.releaseConnection();
        }
        return response;
    }

    /**
     * Fetches a page using only the JDK's {@link HttpURLConnection}, prints the
     * text that was read, and returns it trimmed.
     *
     * <p>When {@code strPostRequest} is non-empty it is written as the request
     * body in UTF-8; enabling output makes {@code HttpURLConnection} issue a
     * POST. The response is decoded with the charset named in its Content-Type
     * header, falling back to UTF-8 when none is given or it is unknown.
     *
     * <p>Connect and read timeouts of 5 seconds are set on this connection
     * only; no JVM-wide system property is modified.
     *
     * @param strUrl         address of the page
     * @param strPostRequest request body; {@code null} or empty means no body
     * @param maxLength      maximum number of characters to read; zero or a
     *                       negative value means no limit
     * @return the trimmed page text; on any failure, the exception's message
     *         (or its {@code toString()} when it has no message) is returned
     *         instead, so callers cannot tell an error from content by type
     */
    public static String getPageContent(String strUrl, String strPostRequest,
            int maxLength) {
        HttpURLConnection hConnect = null;
        try {
            URL newUrl = new URL(strUrl);
            hConnect = (HttpURLConnection) newUrl.openConnection();
            hConnect.setConnectTimeout(TIMEOUT_MILLIS);
            hConnect.setReadTimeout(TIMEOUT_MILLIS);

            // Optional request body.
            if (strPostRequest != null && strPostRequest.length() > 0) {
                hConnect.setDoOutput(true);
                hConnect.setRequestProperty("Charset", "UTF-8");
                // Encode with the charset advertised above, not the platform default.
                try (OutputStreamWriter out = new OutputStreamWriter(
                        hConnect.getOutputStream(), StandardCharsets.UTF_8)) {
                    out.write(strPostRequest);
                }
            }

            // Read the response, stopping once maxLength characters are collected.
            Charset charset = responseCharset(hConnect.getContentType());
            StringBuilder buffer = new StringBuilder();
            try (BufferedReader rd = new BufferedReader(
                    new InputStreamReader(hConnect.getInputStream(), charset))) {
                char[] chunk = new char[4096];
                while (maxLength <= 0 || buffer.length() < maxLength) {
                    int wanted = maxLength <= 0
                            ? chunk.length
                            : Math.min(chunk.length, maxLength - buffer.length());
                    int got = rd.read(chunk, 0, wanted);
                    if (got < 0) {
                        break;
                    }
                    buffer.append(chunk, 0, got);
                }
            }

            String s = buffer.toString();
            System.out.println(s);
            return s.trim();
        } catch (Exception e) {
            String message = e.getMessage() != null ? e.getMessage() : e.toString();
            System.out.println(message);
            return message;
        } finally {
            if (hConnect != null) {
                hConnect.disconnect();
            }
        }
    }

    /**
     * Picks the charset named by a Content-Type header value such as
     * {@code text/html; charset=GBK}; returns UTF-8 when the header is absent,
     * has no charset parameter, or names a charset this JVM does not support.
     */
    private static Charset responseCharset(String contentType) {
        if (contentType != null) {
            String key = "charset=";
            for (String part : contentType.split(";")) {
                String token = part.trim();
                if (token.regionMatches(true, 0, key, 0, key.length())) {
                    String name = token.substring(key.length()).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: use the default below.
                        break;
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Runs the {@link HttpURLConnection} demo against the demo page. The other
     * two demos can be tried by calling {@link #open1()} or
     * {@link #createhttpClient(String, String)} here instead.
     */
    public static void main(String[] args) {
        OpenWin.getPageContent(DEMO_URL, "post", 100500);
    }
}

打开浏览器 与 抓取网站内容

打开浏览器与抓取网站内容