使用HotSAX解析html

来源:互联网 发布:单身为什么不好 知乎 编辑:程序博客网 时间:2024/04/29 17:30

KeyWord: HotSAX  Java解析html


补充:原来不需要HotSAX也可以解析,真是蛋疼,罪过,罪过...

HotSAX对中文文章支持较差


以下源码需要HotSAX的支持。HotSAX是GPL协议。

下载HotSAX: http://hotsax.sourceforge.net/

下载的包是源码,没有打成jar包。图方便的话,可以把HotSAX的整个源码目录复制到你的工程下;另外HotSAX还依赖于hotsax.jar,该文件位于下载包的lib目录中。

以下是源码,其作用是解析一串html字符串,并且查找其中的指定文本,并将包含这些文本的节点路径输出。


package t1;import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;import java.util.List;import java.util.Stack;import org.xml.sax.Attributes;import org.xml.sax.ContentHandler;import org.xml.sax.InputSource;import org.xml.sax.Locator;import org.xml.sax.SAXException;import org.xml.sax.XMLReader;import org.xml.sax.helpers.XMLReaderFactory;/** * 解析html字符串,需要HotSAX插件的支持(from:http://hotsax.sourceforge.net/)<br/> * @author TaoPeng * */public class HtmlParserDemo {/** * @param args */public static void main(String[] args) {String html = "<html><head><title>|中国|</title></head><body><div id=\"firstDiv\">中国</div></body></html>";String keyWord = "中国";try {new HtmlParserDemo().test1(html, keyWord);} catch (Exception e) {e.printStackTrace();}}/** * 在html文本中查询文本keyWord,并输出包含这些文本的节点路径 * @param html * @param keyWord * @throws IOException * @throws SAXException */public void test1(String html, String keyWord) throws IOException, SAXException{MyContentHandler mch = new MyContentHandler();mch.setKeyword(keyWord);//XMLReader parser = XMLReaderFactory.createXMLReader("hotsax.html.sax.SaxParser");XMLReader parser = XMLReaderFactory.createXMLReader();//使用这个构造,直接忽略hotsaxparser.setContentHandler(mch);StringReader sr = new StringReader(html);InputSource is = new InputSource(sr);parser.parse(is);List<String> tps = mch.getTagPath();for(String tp : tps){System.out.println(tp);}}}class MyContentHandler implements ContentHandler{/** * 查询的关键字 */private String keyword;private List<String> tagPath = new ArrayList<String>(10);private Stack<String> tagStack = new Stack<String>();public String getKeyword() {return keyword;}public void setKeyword(String keyword) {this.keyword = keyword;}public List<String> getTagPath() {return tagPath;}public void setDocumentLocator(Locator locator) {// TODO Auto-generated method stub}public void startDocument() throws SAXException {// TODO Auto-generated method stub}public void endDocument() throws SAXException {// TODO Auto-generated method 
stub}public void startPrefixMapping(String prefix, String uri)throws SAXException {// TODO Auto-generated method stub}public void endPrefixMapping(String prefix) throws SAXException {// TODO Auto-generated method stub}public void startElement(String uri, String localName, String qName,Attributes atts) throws SAXException {String tag = localName;String id = atts.getValue("id"); if( id != null && id.length() > 0 ){tag = tag + "(#" + id + ")";}tagStack.push(tag);}public void endElement(String uri, String localName, String qName)throws SAXException {tagStack.pop();}public void characters(char[] ch, int start, int length)throws SAXException {if(keyword == null || length <= 0){return;}String text = new String(ch);if (text.indexOf(keyword) >= 0) {int size = tagStack.size();StringBuffer sb = new StringBuffer(size);for (int i = 0; i < size; i++) {if (sb.length() > 0) {sb.append(" > ");}sb.append(tagStack.get(i));}tagPath.add(sb.toString());}}public void ignorableWhitespace(char[] ch, int start, int length)throws SAXException {// TODO Auto-generated method stub}public void processingInstruction(String target, String data)throws SAXException {// TODO Auto-generated method stub}public void skippedEntity(String name) throws SAXException {// TODO Auto-generated method stub}}