htmlparser实例

来源:互联网 发布:java 字节码增强 编辑:程序博客网 时间:2024/06/07 05:12

import java.net.URL;

import junit.framework.TestCase;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;
30.    
31.public class T extends TestCase{     
32.    
33.  private static final Logger logger =Logger.getLogger(T.class);     
34.    
35.  public T(String name){     
36.   super(name);     
37.     
38.    
39.     
42.  public void testImageVisitor(){     
43.    try{     
44.     ImageTagimgLink;     
45.     ObjectFindingVisitor visitor = newObjectFindingVisitor(ImageTag.class);     
46.     Parser parser = newParser();     
47.     parser.setURL("http://www.google.com");     
48.     parser.setEncoding(parser.getEncoding());     
49.     parser.visitAllNodesWith(visitor);     
50.     Node[] nodes =visitor.getTags();     
51.     for (int i = 0; i < nodes.length; i++){     
52.       imgLink = (ImageTag)nodes[i];     
53.       logger.fatal("testImageVisitor() ImageURL = " +imgLink.getImageURL());     
54.       logger.fatal("testImageVisitor() ImageLocation = " +imgLink.extractImageLocn());     
55.       logger.fatal("testImageVisitor() SRC = " +imgLink.getAttribute("SRC"));     
56.         
57.    } catch(Exception e){     
58.     e.printStackTrace();     
59.       
60.     
61.    
62.     
65.  public void testNodeFilter(){     
66.    try{     
67.     NodeFilter filter = newTagNameFilter("IMG");     
68.     Parser parser = newParser();     
69.     parser.setURL("http://www.google.com");     
70.     parser.setEncoding(parser.getEncoding());     
71.     NodeList list =parser.extractAllNodesThatMatch(filter);     
72.     for (int i = 0; i < list.size(); i++){     
73.       logger.fatal("testNodeFilter() " +list.elementAt(i).toHtml());     
74.         
75.    } catch(Exception e){     
76.     e.printStackTrace();     
77.       
78.    
79.     
80.    
81.     
84.  public void testLinkTag(){     
85.    try{     
86.    
87.     NodeFilter filter = newNodeClassFilter(LinkTag.class);     
88.     Parser parser = newParser();     
89.     parser.setURL("http://www.google.com");     
90.     parser.setEncoding(parser.getEncoding());     
91.     NodeList list =parser.extractAllNodesThatMatch(filter);     
92.     for (int i = 0; i < list.size(); i++){     
93.       LinkTag node = (LinkTag)list.elementAt(i);     
94.       logger.fatal("testLinkTag() Link is :" +node.extractLink());     
95.         
96.    } catch(Exception e){     
97.     e.printStackTrace();     
98.       
99.    
100.     
101.    
102.     
105.  public void testLinkCSS(){     
106.    try{     
107.    
108.     Parser parser = newParser();     
109.     parser.setInputHTML("<head><title>LinkTest</title>"    
110.         + "<link href="’/test01/css.css"mce_href="’/test01/css.css"' text='text/css' rel='stylesheet'/>"    
111.         + "<link href="/test02/css.css"mce_href="test02/css.css" text='text/css' rel='stylesheet'/>" +"</head>"    
112.         +"<body>");     
113.     parser.setEncoding(parser.getEncoding());     
114.    
115.     for (NodeIterator e = parser.elements(); e.hasMoreNodes();){     
116.       Node node =e.nextNode();     
117.       logger.fatal("testLinkCSS()" + node.getText() +node.getClass());     
118.    
119.         
120.    } catch(Exception e){     
121.     e.printStackTrace();     
122.       
123.     
124.    
125.     
128.  public void testOrFilter(){     
129.   NodeFilter inputFilter = newNodeClassFilter(InputTag.class);     
130.   NodeFilter selectFilter = newNodeClassFilter(SelectTag.class);     
131.    
132.    NodeListnodeList =null;     
133.    
134.    try{     
135.     Parser parser = newParser();     
136.     parser     
137.         .setInputHTML("<head><title>OrFilterTest</title>"    
138.             + "<link href="/test01/css.css"mce_href="test01/css.css" text='text/css' rel='stylesheet'/>"    
139.             + "<link href="/test02/css.css"mce_href="test02/css.css" text='text/css' rel='stylesheet'/>"    
140.             +"</head>"    
141.             +"<body>"    
142.             + "<input type='text' value='text1&prime;name='text1&prime;/>"    
143.             + "<input type='text' value='text2&prime;name='text2&prime;/>"    
144.             + "<select><optionid='1&prime;>1</option><optionid='2&prime;>2</option><optionid='3&prime;></option></select>"    
145.             + "<a href="http://www.yeeach.com"mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>"+"</body>");     
146.    
147.     parser.setEncoding(parser.getEncoding());     
148.     OrFilter lastFilter = newOrFilter();     
149.     lastFilter.setPredicates(new NodeFilter[] { selectFilter,inputFilter});     
150.     nodeList =parser.parse(lastFilter);     
151.     for (int i = 0; i <= nodeList.size(); i++){     
152.       if (nodeList.elementAt(i) instanceof InputTag){     
153.         InputTag tag = (InputTag)nodeList.elementAt(i);     
154.         logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tagvalueis:"    
155.             +tag.getAttribute("value"));     
156.           
157.       if (nodeList.elementAt(i) instanceof SelectTag){     
158.         SelectTag tag = (SelectTag)nodeList.elementAt(i);     
159.         NodeList list =tag.getChildren();     
160.    
161.         for (int j = 0; j < list.size(); j++){     
162.           OptionTag option = (OptionTag)list.elementAt(j);     
163.           logger.fatal("OrFilter Option" +option.getOptionText());     
164.             
165.    
166.           
167.         
168.    
169.    } catch(ParserException e){     
170.     e.printStackTrace();     
171.       
172.     
173.    
174.     
177.  public void testTable(){     
178.    ParsermyParser;     
179.    NodeListnodeList =null;     
180.    myParser= Parser.createParser("<body> " +"<table id='table1&prime;>"    
181.       +"<tr><td>1-11</td><td>1-12</td><td>1-13</td>"    
182.       +"<tr><td>1-21</td><td>1-22</td><td>1-23</td>"    
183.       +"<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"+ "<table id='table2&prime;>"    
184.       +"<tr><td>2-11</td><td>2-12</td><td>2-13</td>"    
185.       +"<tr><td>2-21</td><td>2-22</td><td>2-23</td>"    
186.       +"<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"+ "</body>","GBK");     
187.   NodeFilter tableFilter = newNodeClassFilter(TableTag.class);     
188.    OrFilterlastFilter = newOrFilter();     
189.   lastFilter.setPredicates(new NodeFilter[] { tableFilter});     
190.    try{     
191.     nodeList =myParser.parse(lastFilter);     
192.     for (int i = 0; i <= nodeList.size(); i++){     
193.       if (nodeList.elementAt(i) instanceof TableTag){     
194.         TableTag tag = (TableTag)nodeList.elementAt(i);     
195.         TableRow[] rows =tag.getRows();     
196.    
197.         for (int j = 0; j < rows.length; j++){     
198.           TableRow tr = (TableRow)rows[j];     
199.           TableColumn[] td =tr.getColumns();     
200.           for (int k = 0; k < td.length; k++){     
201.             logger.fatal("<td>" +td[k].toPlainTextString());     
202.               
203.    
204.             
205.    
206.           
207.         
208.    
209.    } catch(ParserException e){     
210.     e.printStackTrace();     
211.       
212.     
213.    
214.     
217.  public void testVisitorAll(){     
218.    try{     
219.     Parser parser = newParser();     
220.     parser.setURL("http://www.google.com");     
221.     parser.setEncoding(parser.getEncoding());     
222.     NodeVisitor visitor = new NodeVisitor(){     
223.       public void visitTag(Tag tag){     
224.         logger.fatal("testVisitorAll()  Tag name is :" +tag.getTagName() + " \n Class is:"    
225.             +tag.getClass());     
226.           
227.    
228.     };     
229.    
230.     parser.visitAllNodesWith(visitor);     
231.    } catch(ParserException e){     
232.     e.printStackTrace();     
233.       
234.     
235.    
236.     
239.  public void testTagVisitor(){     
240.    try{     
241.    
242.     Parser parser = newParser("<head><title>dddd</title>"    
243.         + "<link href="/test01/css.css"mce_href="test01/css.css" text='text/css' rel='stylesheet'/>"    
244.         + "<link href="/test02/css.css"mce_href="test02/css.css" text='text/css' rel='stylesheet'/>" +"</head>"    
245.         + "<body>" + "<ahref="http://www.yeeach.com"mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>"+"</body>");     
246.     NodeVisitor visitor = new NodeVisitor(){     
247.       public void visitTag(Tag tag){     
248.         if (tag instanceof HeadTag){     
249.           logger.fatal("visitTag() HeadTag : Tag name is :" +tag.getTagName()     
250.               + " \n Class is :" + tag.getClass() + "\n Text is :" +tag.getText());     
251.         } else if (tag instanceof TitleTag){     
252.           logger.fatal("visitTag() TitleTag : Tag name is :" +tag.getTagName()     
253.               + " \n Class is :" + tag.getClass() + "\n Text is :" +tag.getText());     
254.    
255.         } else if (tag instanceof LinkTag){     
256.           logger.fatal("visitTag() LinkTag : Tag name is :" +tag.getTagName()     
257.               + " \n Class is :" + tag.getClass() + "\n Text is :" +tag.getText()     
258.               + " \n getAttribute is :" +tag.getAttribute("href"));     
259.         } else{     
260.           logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " \nClass is:"    
261.               + tag.getClass() + "\n Text is :" +tag.getText());     
262.             
263.    
264.           
265.    
266.     };     
267.    
268.     parser.visitAllNodesWith(visitor);     
269.    } catch(Exception e){     
270.     e.printStackTrace();     
271.       
272.     
273.    
274.     
277.  public void testHtmlPage(){     
278.    StringinputHTML = "<html>" +"<head>"    
279.       + "<title>Welcome to the HTMLParserwebsite</title>" +"</head>" +"<body>"    
280.       + "Welcome to HTMLParser" + "<table id='table1&prime;>"    
281.       +"<tr><td>1-11</td><td>1-12</td><td>1-13</td>"    
282.       +"<tr><td>1-21</td><td>1-22</td><td>1-23</td>"    
283.       +"<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"+ "<table id='table2&prime;>"    
284.       +"<tr><td>2-11</td><td>2-12</td><td>2-13</td>"    
285.       +"<tr><td>2-21</td><td>2-22</td><td>2-23</td>"    
286.       +"<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"+ "</body>" +"</html>";     
287.    Parserparser = newParser();     
288.    try{     
289.     parser.setInputHTML(inputHTML);     
290.     parser.setEncoding(parser.getURL());     
291.     HtmlPage page = newHtmlPage(parser);     
292.     parser.visitAllNodesWith(page);     
293.     logger.fatal("testHtmlPage -title is :" +page.getTitle());     
294.     NodeList list =page.getBody();     
295.    
296.     for (NodeIterator iterator = list.elements();iterator.hasMoreNodes();){     
297.       Node node =iterator.nextNode();     
298.       logger.fatal("testHtmlPage -node  is :" +node.toHtml());     
299.         
300.    
301.    } catch(ParserException e){     
302.     // TODO Auto-generated catchblock     
303.     e.printStackTrace();     
304.       
305.     
306.    
307.     
310.  public void testLinkBean(){     
311.    Parserparser = newParser();     
312.    
313.    LinkBeanlinkBean = newLinkBean();     
314.   linkBean.setURL("http://www.google.com");     
315.    URL[]urls =linkBean.getLinks();     
316.    
317.    for (inti = 0; i < urls.length; i++){     
318.     URL url =urls[i];     
319.     logger.fatal("testLinkBean() -url  is :" +url);     
320.       
321.    
322.     
323.    
324.}   


本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/tianhewulei/archive/2009/10/14/4670460.aspx

原创粉丝点击