html解释器-demo(htmlparser使用)

来源:互联网 发布:javascript英文怎么读 编辑:程序博客网 时间:2024/06/04 18:20
package org.sam.util;

import java.net.URL;

import junit.framework.TestCase;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;
import org.junit.Test;

/**
 * Demonstrations of the HTMLParser API (visitors, filters, beans), written as
 * JUnit 3 style test methods. Every test works on a {@link Parser} that
 * {@link #setUp()} points at http://www.csdn.net/, so the tests need network
 * access and only print what they find; they do not assert anything.
 */
public class HtmlToolCase extends TestCase {

    /** Encoding handed to the parser and used by {@link #msg(String)}. */
    private static final String ENCODE = "GBK";

    /** Parser shared by the test methods; re-created by {@link #setUp()}. */
    private Parser parser;

    /**
     * Creates a fresh parser for http://www.csdn.net/ using {@link #ENCODE}.
     * A {@link ParserException} is reported on stderr and otherwise ignored.
     *
     * <p>Note from the original author: the core class of HTMLParser is
     * org.htmlparser.Parser, which offers these constructors/factories:
     * <pre>
     * public Parser ();
     * public Parser (Lexer lexer, ParserFeedback fb);
     * public Parser (URLConnection connection, ParserFeedback fb) throws ParserException;
     * public Parser (String resource, ParserFeedback feedback) throws ParserException;
     * public Parser (String resource) throws ParserException;
     * public Parser (Lexer lexer);
     * public Parser (URLConnection connection) throws ParserException;
     * public static Parser createParser (String html, String charset); // static factory
     * </pre>
     */
    @Override
    public void setUp() {
        // This is a JUnit 3 fixture method (the class extends TestCase), so it
        // must not carry @Test: it is not a test itself.
        parser = new Parser();
        try {
            parser.setURL("http://www.csdn.net/");
            parser.setEncoding(ENCODE);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * Prints a message on stdout after encoding it to {@link #ENCODE} bytes and
     * decoding those bytes with the charset named by the "file.encoding" system
     * property. If that conversion throws, the raw message is written to stderr
     * prefixed with the encoding name.
     *
     * @param msg text to print
     */
    public void msg(String msg) {
        try {
            byte[] encoded = msg.getBytes(ENCODE);
            System.out.println(new String(encoded, System.getProperty("file.encoding")));
        } catch (Exception e) {
            System.err.println(ENCODE + "|" + msg);
        }
    }

    /**
     * ObjectFindingVisitor: collects every node of a given class; the result is
     * read back with getTags(). Here it dumps several views of each image tag.
     */
    public void testObjectFindingVisitor() {
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String label = "------>Image(" + (i + 1) + ")";
            msg(label + "toHtml=" + it.toHtml());
            msg(label + "toPlainTextString=" + it.toPlainTextString());
            msg(label + "toTagHtml=" + it.toTagHtml());
            msg(label + "toHtml-TRUE=" + it.toHtml(true));
            msg(label + "toHtml-FALSE=" + it.toHtml(false));
            msg(label + "Text=" + it.getText());
            msg(label + "URL=" + it.getImageURL());
            msg(label + "Location=" + it.extractImageLocn());
            msg(label + "src=" + it.getAttribute("src"));
        }
    }

    /**
     * Predicate-style filters, demonstrated with TagNameFilter (match by tag
     * name). Others listed by the original author: HasAttributeFilter,
     * HasChildFilter, HasParentFilter, HasSiblingFilter, IsEqualFilter.
     */
    public void testTagNameFilter() {
        NodeFilter nf = new TagNameFilter("img");
        try {
            NodeList nl = parser.extractAllNodesThatMatch(nf);
            for (int i = 0; i < nl.size(); i++) {
                msg("------>Image(" + (i + 1) + ")toHtml=" + nl.elementAt(i).toHtml());
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * Logical filters, demonstrated with OrFilter (matches when any predicate
     * matches). Others listed by the original author: AndFilter, NotFilter,
     * XorFilter.
     */
    public void testOrFilter() {
        NodeFilter input = new NodeClassFilter(InputTag.class);
        NodeFilter image = new NodeClassFilter(ImageTag.class);
        NodeFilter select = new NodeClassFilter(SelectTag.class);
        OrFilter of = new OrFilter();
        of.setPredicates(new NodeFilter[] {input, image, select});
        try {
            NodeList nl = parser.extractAllNodesThatMatch(of);
            for (int i = 0; i < nl.size(); i++) {
                Node node = nl.elementAt(i);
                if (node instanceof InputTag) {
                    InputTag it = (InputTag) node;
                    msg("------>InputTag(" + (i + 1) + ")name=" + it.getTagName()
                            + " | value=" + it.getAttribute("value"));
                } else if (node instanceof ImageTag) {
                    ImageTag it = (ImageTag) node;
                    msg("------>Image(" + (i + 1) + ")toHtml=" + it.toHtml());
                } else if (node instanceof SelectTag) {
                    printSelect((SelectTag) node, i + 1);
                } else {
                    msg("------>Unknown(" + (i + 1) + ")toHtml=" + node.toHtml());
                }
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * Prints a select tag and each of its option children.
     *
     * @param st    the select tag
     * @param index 1-based position of the tag in the filtered list
     */
    private void printSelect(SelectTag st, int index) {
        msg("------>SelectTag(" + index + ")toHtml=" + st.toHtml());
        NodeList childList = st.getChildren();
        if (childList == null) {
            return; // a select without children has nothing more to print
        }
        int optionNo = 0;
        for (int k = 0; k < childList.size(); k++) {
            Node child = childList.elementAt(k);
            // Children are not guaranteed to be options (e.g. text between the
            // tags), so only cast what really is an OptionTag.
            if (child instanceof OptionTag) {
                OptionTag ot = (OptionTag) child;
                optionNo++;
                msg("------>OptionTag(" + index + "-" + optionNo + ")value=" + ot.getValue()
                        + " | text=" + ot.getOptionText());
            }
        }
    }

    /**
     * Other filters, demonstrated with NodeClassFilter on link (a) tags. Others
     * listed by the original author: StringFilter, LinkStringFilter,
     * LinkRegexFilter, RegexFilter, CssSelectorNodeFilter.
     */
    public void testNodeClassFilter() {
        NodeFilter nf = new NodeClassFilter(LinkTag.class); // <a> tags
        try {
            NodeList nl = parser.extractAllNodesThatMatch(nf);
            for (int i = 0; i < nl.size(); i++) {
                LinkTag lt = (LinkTag) nl.elementAt(i);
                msg("------>LinkTag(" + (i + 1) + ")toHtml=" + lt.toHtml());
                msg("------>LinkTag(" + (i + 1) + ")extractLink=" + lt.extractLink());
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * RegexFilter usage example: prints nodes matched by a date pattern or by a
     * rough URL pattern.
     */
    public void testRegexFilter() {
        // Dates such as 2011-12-09 or 2011/12/9.
        RegexFilter rfDate = new RegexFilter("\\d{4}[\\/-]\\d{1,2}[\\/-]\\d{1,2}");
        // URLs; the original author notes this pattern is far from complete.
        RegexFilter rfURL =
                new RegexFilter("(http:|https:|ftp:)//[^[A-Za-z0-9\\._\\?%&+\\-=/#]]*");
        OrFilter of = new OrFilter();
        of.setPredicates(new NodeFilter[] {rfDate, rfURL});
        try {
            NodeList nl = parser.extractAllNodesThatMatch(of);
            for (int i = 0; i < nl.size(); i++) {
                msg("------>Regex(" + (i + 1) + ")toHtml=" + nl.elementAt(i).toHtml());
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * Parsing of {@code <table><tr><td></td></tr></table>} structures: prints
     * every row of every table and every column of every row.
     */
    public void testTable() {
        NodeFilter table = new NodeClassFilter(TableTag.class);
        OrFilter of = new OrFilter();
        of.setPredicates(new NodeFilter[] {table});
        try {
            NodeList nl = parser.extractAllNodesThatMatch(of);
            for (int i = 0; i < nl.size(); i++) {
                Node node = nl.elementAt(i);
                if (!(node instanceof TableTag)) {
                    continue;
                }
                TableRow[] trs = ((TableTag) node).getRows();
                for (int k = 0; k < trs.length; k++) {
                    TableRow tr = trs[k];
                    msg("------>tr(" + (i + 1) + "-" + (k + 1) + ")toHtml=" + tr.toHtml());
                    for (TableColumn tc : tr.getColumns()) {
                        msg("------>------>td-toHtml=" + tc.toHtml());
                    }
                }
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * HtmlPage usage: prints the page title and the class/text of each node in
     * the body list.
     */
    public void testHtmlPage() {
        HtmlPage hp = new HtmlPage(parser);
        try {
            parser.visitAllNodesWith(hp);
            msg("HtmlPage------>title=" + hp.getTitle());
            NodeList nl = hp.getBody();
            for (NodeIterator ni = nl.elements(); ni.hasMoreNodes();) {
                Node n = ni.nextNode();
                msg("Node------>Class=" + n.getClass() + "------>Text=" + n.getText());
            }
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /**
     * Bean-style access: LinkBean lists the links of http://www.csdn.net, then
     * each linked page is parsed with its own parser and its tags are printed.
     */
    public void testLinkBean() {
        LinkBean lb = new LinkBean();
        lb.setURL("http://www.csdn.net");
        URL[] urls = lb.getLinks();
        for (int i = 0; i < urls.length; i++) {
            URL url = urls[i];
            System.err.println("第" + (i + 1) + "个超链接:" + url);
            Parser p = new Parser();
            try {
                p.setURL(url.toString());
                p.setEncoding(ENCODE);
                // Visit the parser configured for THIS link; visiting the shared
                // field would re-walk the start page on every iteration.
                p.visitAllNodesWith(tagPrintingVisitor());
            } catch (ParserException pe) {
                System.err.println(pe.getMessage());
            }
        }
    }

    /**
     * Visitor-style access: prints the name and text of every tag on the page.
     */
    public void testNodeVisitor() {
        try {
            parser.visitAllNodesWith(tagPrintingVisitor());
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
    }

    /** Builds a visitor that prints the name and text of each visited tag. */
    private NodeVisitor tagPrintingVisitor() {
        return new NodeVisitor() {
            @Override
            public void visitTag(Tag t) {
                msg("Tag------>name=" + t.getTagName() + "------>Text=" + t.getText());
            }
        };
    }
}
/*
 * Node API quick reference (notes from the original author).
 *
 * Tree traversal functions, the easiest ones to understand:
 *   Node getParent (): returns the parent node
 *   NodeList getChildren (): returns the list of child nodes
 *   Node getFirstChild (): returns the first child node
 *   Node getLastChild (): returns the last child node
 *   Node getPreviousSibling (): returns the previous sibling
 *   Node getNextSibling (): returns the next sibling
 *
 * Functions that return a node's content:
 *   String getText (): returns the text
 *   String toPlainTextString (): returns the plain-text content
 *   String toHtml (): returns the HTML (original HTML)
 *   String toHtml (boolean verbatim): returns the HTML (original HTML)
 *   String toString (): returns a string representation (original HTML)
 *   Page getPage (): returns the Page object this node belongs to
 *   int getStartPosition (): returns the node's start position in the HTML page
 *   int getEndPosition (): returns the node's end position in the HTML page
 *
 * Function used for filtering:
 *   void collectInto (NodeList list, NodeFilter filter): filters this node with
 *   the given filter; matching nodes are added to list.
 *
 * Function used for visitor traversal:
 *   void accept (NodeVisitor visitor): applies the visitor to this node
 *
 * Functions that modify content (rarely used):
 *   void setPage (Page page): sets the Page object for this node
 *   void setText (String text): sets the text
 *   void setChildren (NodeList children): sets the list of child nodes
 *
 * Other functions:
 *   void doSemanticAction (): performs the action associated with this node
 *   Object clone (): the abstract function of the Clone interface
 */

package org.sam.util;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;

/**
 * Grabs the images referenced by an HTML page and stores them in a directory.
 *
 * @author ssh_kobe  Date: 2011-12-09
 */
public class SnatchImg {

    /** Encoding handed to the HTML parser. */
    private static final String ENCODE = "GBK";

    /** Copy buffer size in bytes. */
    private static final int SIZE = 1024;

    /** Downloads smaller than this many bytes (10 KB) are thrown away. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Downloads the images of a fixed sample page into E:/img/.
     *
     * @param ss unused
     */
    public static void main(String[] ss) {
        // The desktop-path computation that used to live here was never used
        // and threw StringIndexOutOfBoundsException whenever java.io.tmpdir did
        // not contain the user name, so it has been dropped.
        catchImage("http://fj.sina.com.cn/news/p/p/2011-11-02/1056114622_5.html", "E:/img/");
    }

    /**
     * Downloads one file over HTTP into {@code savePath}.
     *
     * <p>The file name is the part of {@code urlPath} after the last '/'; when a
     * file of that name already exists a "(n)" suffix is inserted before the
     * extension. Only responses with status 200 are stored, and files smaller
     * than 10 KB are deleted again. Failures are reported on stderr, never
     * thrown.
     *
     * @param urlPath  absolute URL of the file; {@code null} is reported as an
     *                 invalid path
     * @param savePath target directory; it is concatenated directly with the
     *                 file name, so it must end with a path separator. It is
     *                 created when missing.
     */
    public static void saveFile(String urlPath, String savePath) {
        if (urlPath == null
                || (!urlPath.startsWith("http://") && urlPath.lastIndexOf("/") < 0)) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        HttpURLConnection conn = null;
        try {
            URLConnection raw = new URL(urlPath).openConnection(); // open the remote connection
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs, which this method cannot handle
                System.err.println("文件:" + urlPath + " 路径错误!");
                return;
            }
            conn = (HttpURLConnection) raw;
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(60000); // 1 minute
            if (conn.getResponseCode() == 200) {
                File dir = new File(savePath);
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    System.err.println("目录:" + savePath + " 创建失败!");
                    return;
                }
                String name = urlPath.substring(urlPath.lastIndexOf("/") + 1);
                File outFile = new File(reName(savePath, name));
                long total = copyToFile(conn, outFile);
                if (total < MIN_BYTES) { // smaller than 10 KB
                    outFile.delete();
                } else {
                    System.out.println("文件名:" + outFile.getName());
                }
            }
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Grabs every image referenced by the page at {@code url} and saves it with
     * {@link #saveFile(String, String)}.
     *
     * <p>Note from the original author: this approach does not work in many
     * cases, and the fetched images may be distorted.
     *
     * @param url  address of the HTML page
     * @param save target directory, passed on to {@code saveFile}
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String imgSrc = toAbsolute(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Turns an image source into an absolute URL string.
     *
     * @param pageUrl address of the page the image was found on
     * @param src     image source as reported by the tag
     * @return the absolute URL, or {@code null} when src is empty or cannot be
     *         resolved (the problem is reported on stderr)
     */
    private static String toAbsolute(String pageUrl, String src) {
        if (src == null || src.length() == 0) {
            return null;
        }
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        try {
            // Standard relative-URL resolution: handles "/abs", "rel/path",
            // "../up" and "//host/path" instead of gluing src onto the host.
            return new URL(new URL(pageUrl), src).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + src + " 路径错误!");
            return null;
        }
    }

    /**
     * Streams the response body of {@code conn} into {@code target}.
     *
     * @return number of bytes written
     * @throws IOException when reading or writing fails; both streams are
     *         closed and the partial file is removed in that case
     */
    private static long copyToFile(HttpURLConnection conn, File target) throws IOException {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        boolean complete = false;
        try {
            in = new BufferedInputStream(conn.getInputStream());
            out = new FileOutputStream(target);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = in.read(buff)) != -1) {
                out.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer than
                // SIZE bytes, so counting calls over-estimates the file size.
                total += len;
            }
            out.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (!complete) {
                target.delete();
            }
        }
    }

    /** Closes a stream, ignoring {@code null} and close failures. */
    private static void closeQuietly(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException ignored) {
            // nothing useful can be done when close itself fails
        }
    }

    /**
     * Picks a file name inside {@code path} that is not taken yet.
     *
     * @param path directory prefix, concatenated directly with the name
     * @param name wanted file name
     * @return {@code path + name}, or {@code path + base(n)ext} with the first
     *         n >= 1 for which no such file exists
     */
    private static String reName(String path, String name) {
        // Split at the LAST dot so "a.b.jpg" becomes "a.b(1).jpg".
        int dot = name.lastIndexOf('.');
        String pre = dot >= 0 ? name.substring(0, dot) : name;
        String ext = dot >= 0 ? name.substring(dot) : "";
        String candidate = name;
        // Probing each candidate directly avoids File.listFiles(), which
        // returns null (and used to cause an NPE) for a missing directory.
        for (int i = 1; new File(path + candidate).exists(); i++) {
            candidate = pre + "(" + i + ")" + ext;
        }
        return path + candidate;
    }
}



package org.sam.util;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;

/**
 * Grabs the images referenced by an HTML page and stores them in a directory.
 * This variant skips files that already exist and reports the download time.
 *
 * @author ssh_kobe  Date: 2011-12-09
 */
public class SnatchImg {

    /** Encoding handed to the HTML parser. */
    private static final String ENCODE = "GBK";

    /** Copy buffer size in bytes. */
    private static final int SIZE = 1024;

    /** Downloads smaller than this many bytes (10 KB) are thrown away. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Entry point. The page URL is an empty placeholder; fill it in (or loop
     * over several pages) before running.
     *
     * @param ss unused
     */
    public static void main(String[] ss) {
        catchImage("", "E:/img/");
    }

    /**
     * Downloads one file over HTTP into {@code savePath} unless a file of the
     * same name is already there.
     *
     * <p>The file name is the part of {@code urlPath} after the last '/'. Only
     * responses with status 200 are stored, and files smaller than 10 KB are
     * deleted again. Failures are reported on stderr, never thrown. No connect
     * or read timeout is configured, so a stalled server blocks the call.
     *
     * @param urlPath  absolute URL of the file; {@code null} is reported as an
     *                 invalid path
     * @param savePath target directory; it is concatenated directly with the
     *                 file name, so it must end with a path separator. It is
     *                 created when missing.
     * @return {@code "fail"} when the response was 200 but a file with the same
     *         name already exists (nothing is downloaded); {@code "ok"} in every
     *         other case, including reported errors
     */
    public static String saveFile(String urlPath, String savePath) {
        if (urlPath == null
                || (!urlPath.startsWith("http://") && urlPath.lastIndexOf("/") < 0)) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return "ok";
        }
        HttpURLConnection conn = null;
        try {
            long start = System.currentTimeMillis();
            URLConnection raw = new URL(urlPath).openConnection(); // open the remote connection
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs, which this method cannot handle
                System.err.println("文件:" + urlPath + " 路径错误!");
                return "ok";
            }
            conn = (HttpURLConnection) raw;
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            if (conn.getResponseCode() == 200) {
                String name = urlPath.substring(urlPath.lastIndexOf("/") + 1);
                String lastName = reName(savePath, name);
                if (!lastName.equals(savePath + name)) {
                    // reName had to pick another name, i.e. the file is already
                    // present. The finally block still releases the connection.
                    return "fail";
                }
                File dir = new File(savePath);
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    System.err.println("目录:" + savePath + " 创建失败!");
                    return "ok";
                }
                File outFile = new File(lastName);
                long total = copyToFile(conn, outFile);
                if (total < MIN_BYTES) { // smaller than 10 KB
                    outFile.delete();
                } else {
                    System.out.println("文件名:" + outFile.getName() + " 用时ms:"
                            + (System.currentTimeMillis() - start));
                }
            }
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
        return "ok";
    }

    /**
     * Grabs every image referenced by the page at {@code url} and saves it with
     * {@link #saveFile(String, String)}.
     *
     * <p>Note from the original author: this approach does not work in many
     * cases, and the fetched images may be distorted.
     *
     * @param url  address of the HTML page
     * @param save target directory, passed on to {@code saveFile}
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String imgSrc = toAbsolute(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Turns an image source into an absolute URL string.
     *
     * @param pageUrl address of the page the image was found on
     * @param src     image source as reported by the tag
     * @return the absolute URL, or {@code null} when src is empty or cannot be
     *         resolved (the problem is reported on stderr)
     */
    private static String toAbsolute(String pageUrl, String src) {
        if (src == null || src.length() == 0) {
            return null;
        }
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        try {
            // Standard relative-URL resolution: handles "/abs", "rel/path",
            // "../up" and "//host/path" instead of gluing src onto the host.
            return new URL(new URL(pageUrl), src).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + src + " 路径错误!");
            return null;
        }
    }

    /**
     * Streams the response body of {@code conn} into {@code target}.
     *
     * @return number of bytes written
     * @throws IOException when reading or writing fails; both streams are
     *         closed and the partial file is removed in that case
     */
    private static long copyToFile(HttpURLConnection conn, File target) throws IOException {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        boolean complete = false;
        try {
            in = new BufferedInputStream(conn.getInputStream());
            out = new FileOutputStream(target);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = in.read(buff)) != -1) {
                out.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer than
                // SIZE bytes, so counting calls over-estimates the file size.
                total += len;
            }
            out.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (!complete) {
                target.delete();
            }
        }
    }

    /** Closes a stream, ignoring {@code null} and close failures. */
    private static void closeQuietly(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException ignored) {
            // nothing useful can be done when close itself fails
        }
    }

    /**
     * Picks a file name inside {@code path} that is not taken yet.
     *
     * @param path directory prefix, concatenated directly with the name
     * @param name wanted file name
     * @return {@code path + name}, or {@code path + base(n)ext} with the first
     *         n >= 1 for which no such file exists
     */
    private static String reName(String path, String name) {
        // Split at the LAST dot so "a.b.jpg" becomes "a.b(1).jpg".
        int dot = name.lastIndexOf('.');
        String pre = dot >= 0 ? name.substring(0, dot) : name;
        String ext = dot >= 0 ? name.substring(dot) : "";
        String candidate = name;
        // Probing each candidate directly avoids File.listFiles(), which
        // returns null (and used to cause an NPE) for a missing directory.
        for (int i = 1; new File(path + candidate).exists(); i++) {
            candidate = pre + "(" + i + ")" + ext;
        }
        return path + candidate;
    }
}

****************************************************************************************************************************另外

package org.sam.util;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;

/**
 * Grabs the images referenced by an HTML page and stores them in a directory.
 *
 * @author ssh_kobe Date: 2011-12-09
 */
public class SnatchImg {

    /** Encoding handed to the HTML parser. */
    private static final String ENCODE = "GBK";

    /** Copy buffer size in bytes. */
    private static final int SIZE = 1024;

    /** Downloads smaller than this many bytes (10 KB) are thrown away. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Downloads the images of a fixed sample page into E:/img/.
     *
     * @param ss unused
     */
    public static void main(String[] ss) {
        // The unused desktop-path computation was removed: its
        // substring(0, indexOf(userName)) call threw whenever java.io.tmpdir
        // did not contain the user name.
        catchImage("http://fj.sina.com.cn/news/p/p/2011-11-02/1056114622_5.html",
                "E:/img/");
    }

    /**
     * Downloads one file over HTTP into {@code savePath}.
     *
     * <p>The file name is the part of {@code urlPath} after the last '/'; when a
     * file of that name already exists a "(n)" suffix is inserted before the
     * extension. Only responses with status 200 are stored, and files smaller
     * than 10 KB are deleted again. Failures are reported on stderr, never
     * thrown.
     *
     * @param urlPath  absolute URL of the file; {@code null} is reported as an
     *                 invalid path
     * @param savePath target directory; it is concatenated directly with the
     *                 file name, so it must end with a path separator. It is
     *                 created when missing.
     */
    public static void saveFile(String urlPath, String savePath) {
        boolean noHttpPrefix = urlPath == null || !urlPath.startsWith("http://");
        if (urlPath == null || (noHttpPrefix && urlPath.lastIndexOf("/") < 0)) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        HttpURLConnection conn = null;
        try {
            URLConnection raw = new URL(urlPath).openConnection(); // open the remote connection
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs, which this method cannot handle
                System.err.println("文件:" + urlPath + " 路径错误!");
                return;
            }
            conn = (HttpURLConnection) raw;
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(60000); // 1 minute
            if (conn.getResponseCode() != 200) {
                return; // only successful responses are stored
            }
            File dir = new File(savePath);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                System.err.println("目录:" + savePath + " 创建失败!");
                return;
            }
            String name = urlPath.substring(urlPath.lastIndexOf("/") + 1);
            File outFile = new File(reName(savePath, name));
            long total = copyToFile(conn, outFile);
            if (total < MIN_BYTES) { // smaller than 10 KB
                outFile.delete();
            } else {
                System.out.println("文件名:" + outFile.getName());
            }
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Grabs every image referenced by the page at {@code url} and saves it with
     * {@link #saveFile(String, String)}.
     *
     * <p>Note from the original author: this approach does not work in many
     * cases, and the fetched images may be distorted.
     *
     * @param url  address of the HTML page
     * @param save target directory, passed on to {@code saveFile}
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        for (Node node : ofv.getTags()) {
            ImageTag it = (ImageTag) node; // image node
            String imgSrc = toAbsolute(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Turns an image source into an absolute URL string.
     *
     * @param pageUrl address of the page the image was found on
     * @param src     image source as reported by the tag
     * @return the absolute URL, or {@code null} when src is empty or cannot be
     *         resolved (the problem is reported on stderr)
     */
    private static String toAbsolute(String pageUrl, String src) {
        if (src == null || src.length() == 0) {
            return null;
        }
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        try {
            // Standard relative-URL resolution: handles "/abs", "rel/path",
            // "../up" and "//host/path" instead of gluing src onto the host.
            return new URL(new URL(pageUrl), src).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + src + " 路径错误!");
            return null;
        }
    }

    /**
     * Streams the response body of {@code conn} into {@code target}.
     *
     * @return number of bytes written
     * @throws IOException when reading or writing fails; both streams are
     *         closed and the partial file is removed in that case
     */
    private static long copyToFile(HttpURLConnection conn, File target) throws IOException {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        boolean complete = false;
        try {
            in = new BufferedInputStream(conn.getInputStream());
            out = new FileOutputStream(target);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = in.read(buff)) != -1) {
                out.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer than
                // SIZE bytes, so counting calls over-estimates the file size.
                total += len;
            }
            out.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (!complete) {
                target.delete();
            }
        }
    }

    /** Closes a stream, ignoring {@code null} and close failures. */
    private static void closeQuietly(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException ignored) {
            // nothing useful can be done when close itself fails
        }
    }

    /**
     * Picks a file name inside {@code path} that is not taken yet.
     *
     * @param path directory prefix, concatenated directly with the name
     * @param name wanted file name
     * @return {@code path + name}, or {@code path + base(n)ext} with the first
     *         n >= 1 for which no such file exists
     */
    private static String reName(String path, String name) {
        // Split at the LAST dot so "a.b.jpg" becomes "a.b(1).jpg".
        int dot = name.lastIndexOf('.');
        String pre = dot >= 0 ? name.substring(0, dot) : name;
        String ext = dot >= 0 ? name.substring(dot) : "";
        String candidate = name;
        // Probing each candidate directly avoids File.listFiles(), which
        // returns null (and used to cause an NPE) for a missing directory.
        for (int i = 1; new File(path + candidate).exists(); i++) {
            candidate = pre + "(" + i + ")" + ext;
        }
        return path + candidate;
    }
}