初步使用HTMLParser工具包【部分代码来自网络】
来源:互联网 发布:北京11选5遗漏数据查询 编辑:程序博客网 时间:2024/05/28 06:05
先上代码:
</pre><pre name="code" class="java">import java.io.BufferedReader;import java.io.InputStreamReader;import java.io.FileInputStream;import java.io.File;import java.net.HttpURLConnection;import java.net.URL;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.OrFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.util.NodeList;import org.htmlparser.visitors.TextExtractingVisitor;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;/*** @author www.baizeju.com*/public class TestHtmlParser { private static String ENCODE = "GBK"; private static void message( String szMsg ) { try{ System.out.println(new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding"))); } catch(Exception e ){ e.printStackTrace(); } } public static String openFile( String szFileName ) { try { BufferedReader bis = new BufferedReader(new InputStreamReader( new FileInputStream( new File(szFileName)), ENCODE) ); String szContent=""; String szTemp; while ( (szTemp = bis.readLine()) != null) { szContent+=szTemp+"\n"; } bis.close(); return szContent; } catch( Exception e ) { return ""; } } public static void main(String[] args) { String szContent = openFile( "D:\\28GAME\\BeiJing10\\GuanFang\\temp\\2015-1-1.html"); try{ //Parser parser = Parser.createParser(szContent, ENCODE); Parser parser = new Parser( szContent ); //Parser parser = new Parser( (HttpURLConnection) (new URL("http://127.0.0.1:8080/HTMLParserTester.html")).openConnection() ); /*TextExtractingVisitor visitor = new TextExtractingVisitor(); parser.visitAllNodesWith(visitor); String textInPage = visitor.getExtractedText(); message(textInPage);*/ //下面提取表格中的数据 NodeFilter filter1 = new HasAttributeFilter("class","bgcolor1"); NodeFilter filter2 = new HasAttributeFilter("class","bgcolor2"); NodeFilter filter3 = new OrFilter(filter1,filter2); NodeFilter filter = new AndFilter(new TagNameFilter("tr"),filter3); NodeList 
nodelist = parser.parse(filter);//过滤出符合filter_text的节点LISTNode[] nodes = nodelist.toNodeArray();//转化为数组StringBuffer buftext = new StringBuffer();String line = null;for(int i=0; i<nodes.length; i++){//循环加到buftext上System.out.println(nodes[i].toHtml());System.out.println("---------------------------");line = nodes[i].toPlainTextString(); if(line != null){ buftext.append(line); }}String body = buftext.toString();System.out.println(body);//输出 } catch( Exception e ) { } }}
测试的文本:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="X-UA-Compatible" content="IE=edge" /><meta name="apple-itunes-app" content="app-id=427927518" /><meta property="qc:admins" content="2012211377645053116375" /><title>PK拾开奖信息 - 百度乐彩 - PK拾|开奖公告|开奖结果|开奖查询|历史开奖</title> <meta name="description" content="PK拾开奖信息提供PK拾开奖结果,开奖公告,历史开奖的详情" /> <meta name="keywords" content="PK拾,PK拾投注,开奖公告,开奖结果,开奖查询,历史开奖" /> <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/detail.css?v=2.9.103" type="text/css" media="screen" /> <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/list.css?v=2.9.103" type="text/css" media="screen" /> </head> <body> <table id="draw_list"> <thead> <tr> <td class="td1">开奖日期</td> <td class="td2">期号</td> <td class="td3">开奖号码</td> <td class="td4">本期销量</td> </tr> </thead> <tbody><tr class="bgcolor1"><td class="td1">2015-01-01</td> <td class="td2">466997</td><td class="td3"> <span class="result"> <span class="ball_1">06</span> <span class="ball_1">03</span> <span class="ball_1">01</span> <span class="ball_1">07</span> <span class="ball_1">10</span> <span class="ball_1">04</span> <span class="ball_1">08</span> <span class="ball_1">09</span> <span class="ball_1">02</span> <span class="ball_1">05</span> </span></td> <td class="td4">0</td></tr><tr class="bgcolor2"> <td class="td1">2015-01-01</td> <td class="td2">466996</td> <td class="td3"><span class="result"> <span class="ball_1">01</span> <span class="ball_1">06</span> <span class="ball_1">04</span> <span class="ball_1">02</span> <span class="ball_1">05</span> <span class="ball_1">07</span> <span class="ball_1">10</span> <span class="ball_1">03</span> <span class="ball_1">09</span> <span class="ball_1">08</span></span></td> <td class="td4">0</td></tr> </tbody> </table> 
</body></html>
结果:
<tr class="bgcolor1"><td class="td1">2015-01-01</td>
<td class="td2">466997</td><td class="td3">
<span class="result">
<span class="ball_1">06</span>
<span class="ball_1">03</span>
<span class="ball_1">01</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">04</span>
<span class="ball_1">08</span>
<span class="ball_1">09</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
</span></td>
<td class="td4">0</td>
</tr>
---------------------------
<tr class="bgcolor2">
<td class="td1">2015-01-01</td>
<td class="td2">466996</td>
<td class="td3">
<span class="result">
<span class="ball_1">01</span>
<span class="ball_1">06</span>
<span class="ball_1">04</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">03</span>
<span class="ball_1">09</span>
<span class="ball_1">08</span>
</span></td>
<td class="td4">0</td>
</tr>
---------------------------
2015-01-01
466997
06
03
01
07
10
04
08
09
02
05
0
2015-01-01
466996
01
06
04
02
05
07
10
03
09
08
0
- 初步使用HTMLParser工具包【部分代码来自网络】
- 初步使用HTMLParser工具包【部分代码来自网络】
- HTMLparser的初步使用
- 网络爬虫---HTMLParser使用详解
- 网络爬虫---HTMLParser使用举例
- 网络爬虫---HTMLParser使用详解
- 对Swing工具包使用的初步了解
- 3d(三维)验证码实现(部分代码来自网络),旋转未完成,有待改进(一)
- 3d(三维)验证码实现(部分代码来自网络),旋转未完成,有待改进(二)
- 使用HttpClient和HtmlParser实现网络爬虫
- HtmlParser初步研究
- HtmlParser初步研究(转载)
- HtmlParser初步研究
- HtmlParser初步研究
- htmlParser初步研究
- HTMLParser初步研究
- percona-toolkit工具包的安装和初步使用
- [webkit] htmlparser 部分
- Leetcode[3] Longest Substring Without Repeating Characters
- 九度1014排名(结构体)
- 算法设计读书笔记第二章
- hdu 1757 A Simple Math Problem 矩阵快速幂
- Egret Wing实战教程
- 初步使用HTMLParser工具包【部分代码来自网络】
- RMAN duplicate数据库
- 苹果公司的专利战其实是营销战略?
- Ubuntu14.04安装Maven3.2.5
- Spring MVC防止数据重复提交
- 并查集的详解(转)
- VS2010 cocos2d BOX2D环境搭建
- struct和typedef struct区别
- 进程与线程关系