URL、URLConnection、HttpURLConnection 实例

来源:互联网 发布:紫砂壶淘宝店推荐 编辑:程序博客网 时间:2024/06/06 16:32

以下是urlconnection监控赶集网上售价低于200的洗衣机的例子,同时解决了从web读取数据的编码问题

1、找到赶集网上洗衣机的网址,然后得到售价<300的网址
2、获取到网址的源码
3、解析源码,得到我们想要的信息


/**
 * Value holder for one listing collected from a Ganji page.
 *
 * <p>All string properties start out as the empty string and the price starts at 0.
 * The class performs no validation; setters store exactly what they are given.
 */
public class GanJiEntity {

    /** Date text of the listing. */
    private String date;

    /** Link to the listing. */
    private String http;

    /** Listing title / description text. */
    private String content;

    /** Address (city area) text of the listing. */
    private String cityAdress;

    /** Listing price. */
    private int price;

    /** Creates an entity with empty strings and a price of zero. */
    public GanJiEntity() {
        this.date = "";
        this.http = "";
        this.content = "";
        this.cityAdress = "";
        this.price = 0;
    }

    /** @return the date text */
    public String getDate() {
        return this.date;
    }

    /** @param date the date text to store */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the listing link */
    public String getHttp() {
        return this.http;
    }

    /** @param http the listing link to store */
    public void setHttp(String http) {
        this.http = http;
    }

    /** @return the listing title / description */
    public String getContent() {
        return this.content;
    }

    /** @param content the listing title / description to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the address text */
    public String getcityAdress() {
        return this.cityAdress;
    }

    /** @param cityAdress the address text to store */
    public void setcityAdress(String cityAdress) {
        this.cityAdress = cityAdress;
    }

    /** @return the price */
    public int getPrice() {
        return this.price;
    }

    /** @param price the price to store */
    public void setPrice(int price) {
        this.price = price;
    }
}


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.sun.corba.se.impl.encoding.CodeSetConversion.BTCConverter;


public class GanJiCrawl {
 
 String urlString = "";
 String urlPrefix = "";
 URL url = null;
// URLConnection urlCon = null;
 HttpURLConnection http = null;
 BufferedReader br = null;
 public GanJiCrawl(String urlString, String urlPrefix){
  this.urlString = urlString;
  this.urlPrefix = urlPrefix;
  try {
   url = new URL(urlString);
//   urlCon = url.openConnection();
//   urlCon.connect();
   http = (HttpURLConnection)url.openConnection();
   http.connect();
   System.out.println(http.getContentEncoding());
  } catch (MalformedURLException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }
 }
 
 public String getUrlContent(){
  try {
   
   /**
    * 可行的方法
    * */
//  br = new BufferedReader(new InputStreamReader(urlCon.getInputStream()));//若不知道读入的流里改变编码
   br = new BufferedReader(new InputStreamReader(http.getInputStream(), "utf-8"));//通过在读取的流里改变字符编码,可以解决输出乱码,不用再对lineString转换字符编码
   String lineString = "";
   StringBuffer buffer = new StringBuffer();
   while((lineString = br.readLine()) != null){
    //而以如下方式,通过在读取后的lineString中再转化字符编码是行不通的
//    lineString = new String(lineString.getBytes(),"UTF-8");
//    byte[] b = lineString.getBytes();
//    lineString = new String(b, "utf-8");
    buffer.append(lineString + "\n");
   }
//   System.out.println(buffer.toString());
//   byte[] b = buffer.toString().getBytes("ISO-8859-1");
//   return new String(b,"GB2312");
//   return new String(buffer.toString().getBytes("8859_1"),"GB2321");
   return buffer.toString();
   
   /**
    * 对比上面的
    * 若不知道读入的流里改变编码
    * 而以如下方式,通过在读取后的lineString中再转化字符编码是行不通的,
    * 因为java默认的读取编码是ISO-8859-1(这个不确定)
    * */
//   br = new BufferedReader(new InputStreamReader(http.getInputStream()));
//   String lineString = "";
//   StringBuffer buffer = new StringBuffer();
//   while((lineString = br.readLine()) != null){
//    lineString = new String(lineString.getBytes(),"UTF-8");
////    byte[] b = lineString.getBytes("iso-8859-1");
////    lineString = new String(b, "utf-8");
//    buffer.append(lineString + "\n");
//   }
//   System.out.println(buffer.toString());
   
   /**
    * 这个是从网上找的通过byte数组转换的,中文是正确的读取到了,但是没有读取到完整的网页源码,应该是in.available()不准确性
    * */
//   InputStream in = http.getInputStream();
////   int   all   =   in.available();
////   String   webpage=null;
////   while   (all   >   0)   {
////    byte[]   b   =   new   byte[all];
////    in.read(b);
////    webpage   =   new   String(b,   "UTF-8");
////    System.out.println(webpage);
////    all   =   in.available();
//// //   Thread.sleep(2000);//给它点下载的时间,每两秒钟读取一次
////   }
////   System.out.println(webpage);
//   
   /**
    * 这个是通过InputStream读取字节流的形式,通用的方法
    * */
//   InputStream in = http.getInputStream();
//   int byteread = 0;
//   byte[] b = new byte[1024];
//   StringBuffer br = new StringBuffer();
//   while((byteread = in.read(b)) != -1){
//    String webPage = new String(b, "utf-8");
////    System.out.println(webPage);
//    br.append(webPage);
//    b = new byte[1024];
//   }
//   in.close();
//   return br.toString();
////   System.out.println(br.toString());
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }
  
  return "";
 }
 
 public List<GanJiEntity> parseUrlContent(String content){
  List<GanJiEntity> list = new ArrayList<GanJiEntity>();
  String regex = "<p>\\s+" +
    "<i>(.*)</i>\\s+" +
    "<a\\s+href=\"(.*?)\".*\\s+" +
    "(.*)</a>\\s+" +
    ".*class=\"adds\">(\\S+)</a></span>\\s+" +
    "-(.*)元\\s+</p>";
  Pattern pattern = Pattern.compile(regex);
  Matcher matcher = pattern.matcher(content);
  while(matcher.find()){
   GanJiEntity entity = new GanJiEntity();
   int price = Integer.parseInt(matcher.group(5).trim());
   if(price < 200){
//    System.out.println(matcher.group(1));
//    System.out.println(matcher.group(2));
//    System.out.println(matcher.group(3));
//    System.out.println(matcher.group(4));
//    System.out.println(matcher.group(5));
    entity.setDate(matcher.group(1).trim());
    entity.setHttp(urlPrefix  + matcher.group(2).trim());
    entity.setContent(matcher.group(3).trim());
    entity.setcityAdress(matcher.group(4).trim());
    entity.setPrice(price);
    list.add(entity);
   }
  }
  return list;
 }

 /**
  * @param args
  */
 public static void main(String[] args) {
  // TODO Auto-generated method stub
  GanJiCrawl gj = new GanJiCrawl("http://sy.ganji.com/xiyiji/p1/", "http://sy.ganji.com");
  String content = gj.getUrlContent();
//  System.out.println(content);
  List<GanJiEntity> list = gj.parseUrlContent(content);
  for (GanJiEntity entity : list) {
   System.out.println(entity.getDate());
   System.out.println(entity.getHttp());
   System.out.println(entity.getContent());
   System.out.println(entity.getcityAdress());
   System.out.println(entity.getPrice());
   try {
//    Runtime.getRuntime().exec("E:\\Program Files\\SogouExplorer\\SogouExplorer.exe "+entity.getHttp());//等效于下面两行的代码
    String[] exec = {"E:\\Program Files\\SogouExplorer\\SogouExplorer.exe ",entity.getHttp()};
    Runtime.getRuntime().exec(exec);
   } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
   }
  }

 }

}

原创粉丝点击