URL、URLConnection、HttpURLConnection 实例
来源:互联网 发布:紫砂壶淘宝店推荐 编辑:程序博客网 时间:2024/06/06 16:32
以下是使用 HttpURLConnection 监控赶集网上售价低于200元的洗衣机的例子,同时解决了从 Web 读取数据时的编码(乱码)问题
1、找到赶集网上洗衣机的网址,然后得到售价<300的网址
2、获取到网址的源码
3、解析源码,得到我们想要的信息
/**
 * Entity class holding the data of interest collected from Ganji.
 *
 * <p>A plain mutable holder: four string fields that start out as the empty
 * string and an integer price that starts out as {@code 0}. The setters store
 * whatever they are given without validation.
 */
public class GanJiEntity {

    private String date = "";
    private String http = "";
    private String content = "";
    private String cityAdress = "";
    private int price = 0;

    /** @return the stored date text */
    public String getDate() {
        return this.date;
    }

    /** @param date the date text to store */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the stored link */
    public String getHttp() {
        return this.http;
    }

    /** @param http the link to store */
    public void setHttp(String http) {
        this.http = http;
    }

    /** @return the stored content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the content text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored city/address text */
    public String getcityAdress() {
        return this.cityAdress;
    }

    /** @param cityAdress the city/address text to store */
    public void setcityAdress(String cityAdress) {
        this.cityAdress = cityAdress;
    }

    /** @return the stored price */
    public int getPrice() {
        return this.price;
    }

    /** @param price the price to store */
    public void setPrice(int price) {
        this.price = price;
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.sun.corba.se.impl.encoding.CodeSetConversion.BTCConverter;
/**
 * Downloads one listing page over HTTP and extracts the entries whose price is
 * below a limit.
 *
 * <p>Usage: construct with the page URL and the prefix to prepend to the
 * relative links found in the page, call {@link #getUrlContent()} to fetch the
 * HTML, then pass it to {@link #parseUrlContent(String)}.
 *
 * <p>Failures are reported with {@code printStackTrace()} and never thrown:
 * a failed connection leaves {@link #http} {@code null}, and
 * {@link #getUrlContent()} then returns the empty string.
 */
public class GanJiCrawl {

    /** Price limit (exclusive) used by {@link #parseUrlContent(String)}. */
    private static final int DEFAULT_MAX_PRICE = 200;

    /**
     * Pattern for one listing entry, compiled once for all calls.
     * Capture groups, in the order they are copied into a {@link GanJiEntity}:
     * 1 = date, 2 = link (relative), 3 = content, 4 = city/address, 5 = price.
     */
    private static final Pattern LISTING_PATTERN = Pattern.compile(
            "<p>\\s+" +
            "<i>(.*)</i>\\s+" +
            "<a\\s+href=\"(.*?)\".*\\s+" +
            "(.*)</a>\\s+" +
            ".*class=\"adds\">(\\S+)</a></span>\\s+" +
            "-(.*)元\\s+</p>");

    String urlString = "";
    String urlPrefix = "";
    URL url = null;
    HttpURLConnection http = null;
    BufferedReader br = null;

    /**
     * Opens an HTTP connection to {@code urlString} and prints the response's
     * content encoding to standard output.
     *
     * <p>If the URL is malformed or the connection fails, the stack trace is
     * printed and the instance is still created; {@link #getUrlContent()} will
     * then return the empty string.
     *
     * @param urlString address of the page to download
     * @param urlPrefix text prepended to every link extracted from the page
     */
    public GanJiCrawl(String urlString, String urlPrefix) {
        this.urlString = urlString;
        this.urlPrefix = urlPrefix;
        try {
            url = new URL(urlString);
            http = (HttpURLConnection) url.openConnection();
            http.connect();
            System.out.println(http.getContentEncoding());
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so this one
            // handler covers both the bad-URL and the connection-failure case.
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole response body as text, decoded as UTF-8, with every line
     * terminated by {@code '\n'}.
     *
     * <p>The reader (and with it the connection's input stream) is closed
     * before returning, so the body can be read only once per instance.
     *
     * @return the page source, or the empty string if no connection was
     *         established or reading failed
     */
    public String getUrlContent() {
        if (http == null) {
            // The constructor could not create the connection; report failure
            // the same way an I/O error is reported instead of throwing an NPE.
            return "";
        }
        BufferedReader reader = null;
        try {
            // Decode while reading: the charset is handed to the
            // InputStreamReader so the bytes become the right characters the
            // first time. Re-encoding each line after it has been read
            // (new String(line.getBytes(), "UTF-8")) does not repair text that
            // was already decoded with the wrong charset, and an earlier
            // attempt based on InputStream.available() returned incomplete
            // pages.
            reader = new BufferedReader(new InputStreamReader(http.getInputStream(), "utf-8"));
            br = reader;
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append('\n');
            }
            return page.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Extracts the listing entries priced below 200 from the page source.
     *
     * @param content page source as returned by {@link #getUrlContent()}
     * @return the matching entries in page order; empty if there are none
     */
    public List<GanJiEntity> parseUrlContent(String content) {
        return parseUrlContent(content, DEFAULT_MAX_PRICE);
    }

    /**
     * Extracts the listing entries whose price is strictly below
     * {@code maxPrice}.
     *
     * <p>Entries whose price text is not a plain integer are skipped rather
     * than aborting the whole parse.
     *
     * @param content  page source; {@code null} is treated as an empty page
     * @param maxPrice exclusive upper bound on the price
     * @return the matching entries in page order; empty if there are none
     */
    public List<GanJiEntity> parseUrlContent(String content, int maxPrice) {
        List<GanJiEntity> list = new ArrayList<GanJiEntity>();
        if (content == null) {
            return list;
        }
        Matcher matcher = LISTING_PATTERN.matcher(content);
        while (matcher.find()) {
            int price;
            try {
                price = Integer.parseInt(matcher.group(5).trim());
            } catch (NumberFormatException e) {
                // The price group is a greedy ".*" and may capture text that is
                // not a number; ignore that entry and keep scanning.
                continue;
            }
            if (price >= maxPrice) {
                continue;
            }
            GanJiEntity entity = new GanJiEntity();
            entity.setDate(matcher.group(1).trim());
            entity.setHttp(urlPrefix + matcher.group(2).trim());
            entity.setContent(matcher.group(3).trim());
            entity.setcityAdress(matcher.group(4).trim());
            entity.setPrice(price);
            list.add(entity);
        }
        return list;
    }

    /** Closes the reader if there is one; a failure to close is only logged. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the first washing-machine listing page, prints every entry
     * priced below 200 and opens each entry's link in an external browser.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GanJiCrawl gj = new GanJiCrawl("http://sy.ganji.com/xiyiji/p1/", "http://sy.ganji.com");
        String content = gj.getUrlContent();
        List<GanJiEntity> list = gj.parseUrlContent(content);
        for (GanJiEntity entity : list) {
            System.out.println(entity.getDate());
            System.out.println(entity.getHttp());
            System.out.println(entity.getContent());
            System.out.println(entity.getcityAdress());
            System.out.println(entity.getPrice());
            try {
                // Array form of exec: program and argument are passed
                // separately instead of as one concatenated command string.
                // NOTE(review): the executable path literal ends with a space,
                // which looks unintended - confirm before changing it.
                String[] exec = {"E:\\Program Files\\SogouExplorer\\SogouExplorer.exe ", entity.getHttp()};
                Runtime.getRuntime().exec(exec);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
- url,urlconnection,httpurlconneciton实例
- java之URL(URL,URLConnection)实例
- URL URLConnection
- 使用URL和URLConnection
- HttpURL connection URLconnection URL
- URL,URLConnection类
- URL,URLConnection,HttpURLConnection
- URL和URLConnection
- 网络编程--URL,URLConnection
- URL和URLConnection案例
- URL、URLConnection 和HttpURLConnection
- URL、URLConnection、HttpURLConnection理解
- Java网络编程URL&URLConnection
- URL,URLConnection,HttPURLConnection的使用
- URLConnection 访问url 返回内容
- URL,URLConnection,HttPURLConnection的使用
- URL,URLConnection,HttPURLConnection的使用
- URL,URLConnection,HttPURLConnection的使用
- opengl贪吃蛇源代码
- 聚类问题的次优解算法
- javascript 之 父子窗口 交互 动态增加option
- 创建地图
- 单词分界符
- url,urlconnection,httpurlconneciton实例
- 顺序表的实现(包含插入,删除,及查找)
- sybase中文乱码解决方案
- Openmesh halfedge find vertex
- 中断和异常
- DCMTK
- 黑马程序员_repeater控件研究入门
- linux学习
- USACO section 1.5.2 Prime Palindromes