Java抓取网页内容

来源:互联网 发布:淘宝旺旺聊天生成器 编辑:程序博客网 时间:2024/04/28 20:00

Web 项目中jsoup的使用


下载地址:http://download.csdn.net/detail/start_baby/5132499

例如:我要抓取东方财富网上某一只股票的详细信息,页面地址为 http://data.eastmoney.com/xg/xg/detail/300461.html ,其中 300461 是股票代码。该代码可能已经过时,可以进入 http://data.eastmoney.com/xg/xg/default.html 查看最新的股票代码,替换一下即可。

1.下载好包,放入项目的lib文件夹下。

2.java类

package com.collect.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the content of a URL and extracts the detail sections of an
 * eastmoney.com new-issue stock page using jsoup.
 */
public class CollectFromHtml {

    /** Prefix of the detail page URL; the stock code plus ".html" is appended to it. */
    private static final String DETAIL_URL_PREFIX = "http://data.eastmoney.com/xg/xg/detail/";

    /** Character encoding used to decode the detail page. */
    private static final String DETAIL_PAGE_ENCODING = "gb2312";

    /** Number of leading {@code div.content} elements that make up the result. */
    private static final int SECTION_COUNT = 4;

    /** Results that are not longer than this many characters are treated as "no data". */
    private static final int MIN_VALID_LENGTH = 1000;

    /** Sentinel returned by {@link #getDatasMsg(String)} when no usable data was collected. */
    private static final String NO_DATA = "no";

    /** Upper bound for establishing the connection, so a dead host cannot block forever. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;

    /** Upper bound for a single blocking read, so a stalled server cannot block forever. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /**
     * Downloads the resource behind {@code url} and returns it as text.
     *
     * <p>Every line read from the resource is terminated with a single {@code '\n'} in the
     * result, so adjacent lines are never glued together. Failures are not propagated: the
     * stack trace is printed and whatever was read so far (possibly the empty string) is
     * returned.
     *
     * @param url address of the resource to download
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded content, or the part read before a failure occurred
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder page = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            // The raw stream is declared as its own resource so that it is closed even when
            // creating the reader fails (for example because of an unsupported charset name).
            try (InputStream raw = connection.getInputStream();
                    BufferedReader reader =
                            new BufferedReader(new InputStreamReader(raw, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    page.append(line).append('\n');
                }
            }
        } catch (IOException | RuntimeException e) {
            // Best-effort download: report the problem and fall through to return what we have.
            e.printStackTrace();
        }
        return page.toString();
    }

    /**
     * Collects the detail sections of a stock page, e.g.
     * http://data.eastmoney.com/xg/xg/detail/300461.html
     *
     * <p>The result is the concatenated outer HTML of the first four {@code div.content}
     * elements of the page.
     *
     * @param code stock code; single quotes contained in it are removed
     * @return the concatenated HTML when it is longer than 1000 characters, otherwise the
     *         string {@code "no"} (also returned when the page has fewer than four
     *         {@code div.content} elements or cannot be processed)
     */
    public static String getDatasMsg(String code) {
        // Single quotes are dropped from the page name; presumably callers may pass a quoted
        // code - verify against the calling action.
        String pageName = (code + ".html").replace("'", "");
        String sections = "";
        try {
            String html = getHtmlResourceByUrl(DETAIL_URL_PREFIX + pageName, DETAIL_PAGE_ENCODING);
            Document document = Jsoup.parse(html);
            Elements contentDivs = document.select("div.content");
            // With fewer than four sections the page is unusable; keep the empty result.
            if (contentDivs.size() >= SECTION_COUNT) {
                StringBuilder joined = new StringBuilder();
                for (int i = 0; i < SECTION_COUNT; i++) {
                    Element section = contentDivs.get(i);
                    joined.append(section.toString());
                }
                sections = joined.toString();
            }
        } catch (RuntimeException e) {
            // Keep the "no" contract for callers, but do not hide the cause.
            e.printStackTrace();
        }
        return sections.length() > MIN_VALID_LENGTH ? sections : NO_DATA;
    }

    /** Manual smoke test: prints the collected sections for one stock code. */
    public static void main(String[] args) {
        System.out.println(getDatasMsg("002024"));
    }
}


3.控制器页面

这个获取页面的java类中的方法都是静态方法,可以直接在action里面调用,把返回结果放入作用域,然后跳转到指定的页面,在页面用EL表达式取出来就可以了。

4.jsp页面类

<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
<%-- Page that displays the "msg" attribute placed in scope before forwarding to this JSP. --%>
<%
// Build the absolute base URL of the web application so that the relative
// links below resolve against the context root.
String path = request.getContextPath();
String basePath = request.getScheme() + "://"
+ request.getServerName() + ":" + request.getServerPort()
+ path + "/";
%>
<%@taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core"%>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<base href="<%=basePath%>">
<title>My JSP 'allShares' starting page</title>
<meta http-equiv="pragma" content="no-cache">
<meta http-equiv="cache-control" content="no-cache">
<meta http-equiv="expires" content="0">
<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
<meta http-equiv="description" content="This is my page">
<!--
<link rel="stylesheet" type="text/css" href="styles.css">
-->
<link rel="stylesheet" type="text/css" href="skin/css/base.css">
<style type="text/css">
body{
padding:0;
margin:0;
background-color:#ECE5D8;
}
<!--
body,table {
font-size: 12px;
}

table {
table-layout: fixed;
empty-cells: show;
border-collapse: collapse;
margin: 0 auto;
}

td {
height: 20px;
}

h1,h2,h3 {
font-size: 15px;
margin: 0;
padding: 3px;
}

.title {
background: #FFF;
border: 1px solid #9DB3C5;
padding: 1px;
width: 90%;
margin: 20px auto;
}

.title h1 {
line-height: 31px;
text-align: center;
background: #2F589C url(th_bg2.gif);
background-repeat: repeat-x;
background-position: 0 0;
color: #FFF;
}

.title th,.title td {
border: 1px solid #CAD9EA;
padding: 5px;
}

/* These table styles are borrowed from a forum's stylesheet. */
table.tab1 {
border: 1px solid #cad9ea;
color: #666;
width: 800px;
margin-top: 20px;
}
table.tab1 th {
background-image: url(th_bg1.gif);
background-repeat: repeat-x;
height: 30px;
}

table.tab1 td,table.tab1 th {
border: 1px solid #cad9ea;
padding: 0 1em 0;
}

table.tab1 tr td.tdtitle {
background-color: #f5fafe;
font-weight: 800;
}
-->
</style>
</head>
<body leftmargin="8" topmargin="8" background='skin/images/allbg.gif'>
<center>
<%-- NOTE(review): msg is written to the page without escaping; presumably it holds the
     HTML fragment scraped from the remote site, so it is third-party markup - confirm it
     is trusted or sanitize it before it is placed in scope. --%>
${msg}
</center>
</body>
</html>
0 0
原创粉丝点击