初步使用HTMLParser工具包【部分代码来自网络】

来源:互联网 发布:北京11选5遗漏数据查询 编辑:程序博客网 时间:2024/05/28 06:05
先上代码:
</pre><pre name="code" class="java">import java.io.BufferedReader;import java.io.InputStreamReader;import java.io.FileInputStream;import java.io.File;import java.net.HttpURLConnection;import java.net.URL;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.OrFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.util.NodeList;import org.htmlparser.visitors.TextExtractingVisitor;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;/*** @author www.baizeju.com*/public class TestHtmlParser {    private static String ENCODE = "GBK";    private static void message( String szMsg ) {        try{        System.out.println(new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding")));        }         catch(Exception e ){        e.printStackTrace();        }    }    public static String openFile( String szFileName ) {        try {            BufferedReader bis = new BufferedReader(new InputStreamReader(            new FileInputStream( new File(szFileName)), ENCODE) );            String szContent="";            String szTemp;                        while ( (szTemp = bis.readLine()) != null) {                szContent+=szTemp+"\n";            }            bis.close();            return szContent;        }        catch( Exception e ) {            return "";        }    }       public static void main(String[] args) {                String szContent = openFile( "D:\\28GAME\\BeiJing10\\GuanFang\\temp\\2015-1-1.html");                try{            //Parser parser = Parser.createParser(szContent, ENCODE);        Parser parser = new Parser( szContent );           //Parser parser = new Parser( (HttpURLConnection) (new URL("http://127.0.0.1:8080/HTMLParserTester.html")).openConnection() );                    /*TextExtractingVisitor visitor = new TextExtractingVisitor();            parser.visitAllNodesWith(visitor);            String textInPage = 
visitor.getExtractedText();            message(textInPage);*/        //下面提取表格中的数据        NodeFilter filter1 = new HasAttributeFilter("class","bgcolor1");        NodeFilter filter2 = new HasAttributeFilter("class","bgcolor2");        NodeFilter filter3 = new OrFilter(filter1,filter2);        NodeFilter filter = new AndFilter(new TagNameFilter("tr"),filter3);                NodeList nodelist = parser.parse(filter);//过滤出符合filter_text的节点LISTNode[] nodes = nodelist.toNodeArray();//转化为数组StringBuffer buftext = new StringBuffer();String line = null;for(int i=0; i<nodes.length; i++){//循环加到buftext上System.out.println(nodes[i].toHtml());System.out.println("---------------------------");line = nodes[i].toPlainTextString(); if(line != null){ buftext.append(line); }}String body = buftext.toString();System.out.println(body);//输出                }        catch( Exception e ) {                    }    }}


测试的文本:


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml">  <head>    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="X-UA-Compatible" content="IE=edge" /><meta name="apple-itunes-app" content="app-id=427927518" /><meta property="qc:admins" content="2012211377645053116375" /><title>PK拾开奖信息 - 百度乐彩 - PK拾|开奖公告|开奖结果|开奖查询|历史开奖</title>    <meta name="description" content="PK拾开奖信息提供PK拾开奖结果,开奖公告,历史开奖的详情" />    <meta name="keywords" content="PK拾,PK拾投注,开奖公告,开奖结果,开奖查询,历史开奖" />    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/detail.css?v=2.9.103" type="text/css" media="screen" />    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/list.css?v=2.9.103" type="text/css" media="screen" /> </head>  <body>                       <table id="draw_list">                        <thead>                            <tr>                                <td class="td1">开奖日期</td>                                <td class="td2">期号</td>                                <td class="td3">开奖号码</td>                                <td class="td4">本期销量</td>                            </tr>                        </thead>                        <tbody><tr class="bgcolor1"><td class="td1">2015-01-01</td>                                <td class="td2">466997</td><td class="td3">             <span class="result">                    <span class="ball_1">06</span>                    <span class="ball_1">03</span>                    <span class="ball_1">01</span>                    <span class="ball_1">07</span>                    <span class="ball_1">10</span>                    <span class="ball_1">04</span>                    <span class="ball_1">08</span>                    <span class="ball_1">09</span>                    <span class="ball_1">02</span>                    <span class="ball_1">05</span>            
</span></td>                                <td class="td4">0</td></tr><tr class="bgcolor2">                                <td class="td1">2015-01-01</td>                                <td class="td2">466996</td>                                <td class="td3"><span class="result">                    <span class="ball_1">01</span>                    <span class="ball_1">06</span>                    <span class="ball_1">04</span>                    <span class="ball_1">02</span>                    <span class="ball_1">05</span>                    <span class="ball_1">07</span>                    <span class="ball_1">10</span>                    <span class="ball_1">03</span>                    <span class="ball_1">09</span>                    <span class="ball_1">08</span></span></td>                               <td class="td4">0</td></tr>                                                    </tbody>                    </table>  </body></html>

结果:


<tr class="bgcolor1"><td class="td1">2015-01-01</td>
                                <td class="td2">466997</td><td class="td3">
             <span class="result">
                    <span class="ball_1">06</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">01</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">08</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
            </span></td>
                                <td class="td4">0</td>
</tr>
---------------------------
<tr class="bgcolor2">
                                <td class="td1">2015-01-01</td>
                                <td class="td2">466996</td>
                                <td class="td3">
<span class="result">
                    <span class="ball_1">01</span>
                    <span class="ball_1">06</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">08</span>
</span></td>
                               <td class="td4">0</td>
</tr>
---------------------------
2015-01-01
                                466997
             
                    06
                    03
                    01
                    07
                    10
                    04
                    08
                    09
                    02
                    05
            
                                0


                                2015-01-01
                                466996
                                


                    01
                    06
                    04
                    02
                    05
                    07
                    10
                    03
                    09
                    08


                               0


0 0
原创粉丝点击