抓取网页内容
来源:互联网 发布:萤石软件下载 编辑:程序博客网 时间:2024/06/06 20:45
package cn.shb.test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 解析http://cpc.people.com.cn/xuexi/GB/387488/index.html
* 中的标题、日期、来源、内容
* @author Administrator
*
*/
public class Jsoup_HttpClient_Test {
public static void main(String[] args) throws Exception {
//第一步:根据url使用httpclient获取页面信息,方法getHtmlByUrl();
String html = getHtmlByUrl("http://cpc.people.com.cn/xuexi/GB/387488/index.html");
if (html!=null&&!"".equals(html)) {
//第二步:使用jsoup解析html,获取内容
Document doc = Jsoup.parse(html);
Elements linksElements = doc.select("ul[id=tiles]>li>div[class=con]");
for (Element ele:linksElements) {
String title = ele.select(">h3>a").text();
String source = ele.select(">em>a").text();
ele.select(">em>a").empty();
String date = ele.select(">em").text().replace("来源:", "").replace("(", "").replace(")", "");
String text = ele.select(">span>a").text();
System.out.println(title);
System.out.println(source);
System.out.println(date);
System.out.println(text);
System.out.println();
System.out.println();
}
}
}
/**
* 根据URL获得所有的html信息
* @param url
* @return
*/
public static String getHtmlByUrl(String url){
String html = null;
//创建httpClient对象
HttpClient httpClient = new DefaultHttpClient();
//以get方式请求该URL
HttpGet httpget = new HttpGet(url);
try {
//得到responce对象
HttpResponse responce = httpClient.execute(httpget);
//返回码
int resStatu = responce.getStatusLine().getStatusCode();
if (resStatu==HttpStatus.SC_OK) {//200正常 其他就不对
//获得输入流
InputStream entity = responce.getEntity().getContent();
if (entity!=null) {
//通过输入流转为字符串获得html源代码 注:可以获得实体,然后通过 EntityUtils.toString方法获得html
//但是有可能出现乱码,因此在这里采用了这种方式
html=getStreamString(entity);
System.out.println(html);
}
}
} catch (Exception e) {
System.out.println("访问【"+url+"】出现异常!");
e.printStackTrace();
} finally {
httpClient.getConnectionManager().shutdown();
}
return html;
}
/**
* 将一个输入流转化为字符串
*/
public static String getStreamString(InputStream tInputStream){
if (tInputStream != null){
try{
BufferedReader tBufferedReader = new BufferedReader(new InputStreamReader(tInputStream));
StringBuffer tStringBuffer = new StringBuffer();
String sTempOneLine = new String("");
while ((sTempOneLine = tBufferedReader.readLine()) != null){
tStringBuffer.append(sTempOneLine+"\n");
}
return tStringBuffer.toString();
}catch (Exception ex){
ex.printStackTrace();
}
}
return null;
}
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 解析http://cpc.people.com.cn/xuexi/GB/387488/index.html
* 中的标题、日期、来源、内容
* @author Administrator
*
*/
public class Jsoup_HttpClient_Test {
public static void main(String[] args) throws Exception {
//第一步:根据url使用httpclient获取页面信息,方法getHtmlByUrl();
String html = getHtmlByUrl("http://cpc.people.com.cn/xuexi/GB/387488/index.html");
if (html!=null&&!"".equals(html)) {
//第二步:使用jsoup解析html,获取内容
Document doc = Jsoup.parse(html);
Elements linksElements = doc.select("ul[id=tiles]>li>div[class=con]");
for (Element ele:linksElements) {
String title = ele.select(">h3>a").text();
String source = ele.select(">em>a").text();
ele.select(">em>a").empty();
String date = ele.select(">em").text().replace("来源:", "").replace("(", "").replace(")", "");
String text = ele.select(">span>a").text();
System.out.println(title);
System.out.println(source);
System.out.println(date);
System.out.println(text);
System.out.println();
System.out.println();
}
}
}
/**
* 根据URL获得所有的html信息
* @param url
* @return
*/
public static String getHtmlByUrl(String url){
String html = null;
//创建httpClient对象
HttpClient httpClient = new DefaultHttpClient();
//以get方式请求该URL
HttpGet httpget = new HttpGet(url);
try {
//得到responce对象
HttpResponse responce = httpClient.execute(httpget);
//返回码
int resStatu = responce.getStatusLine().getStatusCode();
if (resStatu==HttpStatus.SC_OK) {//200正常 其他就不对
//获得输入流
InputStream entity = responce.getEntity().getContent();
if (entity!=null) {
//通过输入流转为字符串获得html源代码 注:可以获得实体,然后通过 EntityUtils.toString方法获得html
//但是有可能出现乱码,因此在这里采用了这种方式
html=getStreamString(entity);
System.out.println(html);
}
}
} catch (Exception e) {
System.out.println("访问【"+url+"】出现异常!");
e.printStackTrace();
} finally {
httpClient.getConnectionManager().shutdown();
}
return html;
}
/**
* 将一个输入流转化为字符串
*/
public static String getStreamString(InputStream tInputStream){
if (tInputStream != null){
try{
BufferedReader tBufferedReader = new BufferedReader(new InputStreamReader(tInputStream));
StringBuffer tStringBuffer = new StringBuffer();
String sTempOneLine = new String("");
while ((sTempOneLine = tBufferedReader.readLine()) != null){
tStringBuffer.append(sTempOneLine+"\n");
}
return tStringBuffer.toString();
}catch (Exception ex){
ex.printStackTrace();
}
}
return null;
}
}
jar:
commons-codec-1.9.jar
commons-logging-1.2.jar
fluent-hc-4.5.jar
httpclient-4.5.jar
httpclient-cache-4.5.jar
httpclient-win-4.5.jar
httpcore-4.4.1.jar
httpmime-4.5.jar
jna-4.1.0.jar
jna-platform-4.1.0.jar
jsoup-1.8.1.jar
阅读全文
0 0
- JAVA 抓取网页内容
- 【JAVA】 抓取网页内容
- 网页内容抓取
- 网页内容抓取
- fsockopen 抓取网页内容
- JAVA 抓取网页内容
- 抓取网页内容
- lotusscript 抓取网页内容
- C# 抓取网页内容
- PHP抓取网页内容
- java 抓取网页内容
- 抓取网页内容
- 抓取网页内容
- 读取 抓取 网页内容
- 网页内容抓取
- C# 抓取网页内容
- 抓取网页内容
- php抓取网页内容
- 统计学习精要 (Elements of Statistical Learning ) 习题 5.13
- spark快速入门
- Java类与对象
- LeetCode 350. Intersection of Two Array
- C语言程序设计(28)
- 抓取网页内容
- 套接字总结
- C和C++混合编译
- python数据可视化
- php面向对象必知会
- C语音基础-typedef 24
- Java调用Dll
- 通信框架Netty的详细介绍及应用
- Linux下C语言开发(已知进程名得到其PID号)