HtmlParser抽取页面文本内容的方法总结

来源:互联网 发布:corel有什么软件 编辑:程序博客网 时间:2024/06/16 11:06

 

Code:
  1. //方法test1应该说是最有效的,避免了出现很多空格。   
  2. package   test;     
  3.       
  4.       
  5.   import   java.io.BufferedReader;     
  6.   import   java.io.File;     
  7.   import   java.io.FileInputStream;     
  8.   import   java.io.InputStreamReader;     
  9.       
  10.   import   org.htmlparser.Node;     
  11.   import   org.htmlparser.NodeFilter;     
  12.   import   org.htmlparser.Parser;     
  13.   import   org.htmlparser.filters.NodeClassFilter;     
  14.   import   org.htmlparser.filters.OrFilter;     
  15.   import   org.htmlparser.nodes.TextNode;     
  16.   import   org.htmlparser.parserapplications.StringExtractor;     
  17.   import   org.htmlparser.tags.LinkTag;     
  18.   import   org.htmlparser.util.NodeList;     
  19.   import   org.htmlparser.util.ParserException;     
  20.   import   org.htmlparser.visitors.HtmlPage;     
  21.   import   org.htmlparser.visitors.TextExtractingVisitor;     
  22.       
  23.       
  24.   /**    
  25.     *   演示了Html   Parse的应用.    
  26.     */     
  27.       
  28.   public   class   ParseHtmlTest     
  29.   {     
  30.       
  31.           public   static   void   main(String[]   args)   throws   Exception     
  32.           {     
  33.                   String   aFile   =   "D://Eclipse//workspace//search//test001//content_1349887.htm";     
  34.       
  35.                   String   content   =   readTextFile(aFile,   "GBK");     
  36.                   StringExtractor   se;     
  37.                   se   =   new   StringExtractor   (aFile);     
  38.                   System.out.println(se.extractStrings(false));     
  39.       
  40.                   test1(content);     
  41.                   System.out.println("=====Test1==============================");     
  42.       
  43.                   test2(content);     
  44.                   System.out.println("=====Test2==========================");     
  45.       
  46.                   test3(content);     
  47.                   System.out.println("=====Test3===============================");     
  48.       
  49.                   test4(content);     
  50.                   System.out.println("=====Test4===============================");     
  51.       
  52.                   test5(aFile);     
  53.                 System.out.println("======Test5==============================");     
  54.       
  55.                   //访问外部资源,相对慢     
  56.                   test5("http://www.medlink.com.cn");       
  57.                   System.out.println("====================================");     
  58.       
  59.           }     
  60.       
  61.           /**    
  62.             *   读取文件的方式来分析内容.    
  63.             *   filePath也可以是一个Url.    
  64.             *      
  65.             *   @param   resource   文件/Url    
  66.             */     
  67.           public   static   void   test5(String   resource)   throws   Exception     
  68.           {     
  69.                   Parser   myParser   =   new   Parser(resource);     
  70.       
  71.                   //设置编码     
  72.                   myParser.setEncoding("GBK");     
  73.       
  74.                   HtmlPage   visitor   =   new   HtmlPage(myParser);     
  75.       
  76.                   myParser.visitAllNodesWith(visitor);     
  77.       
  78.                   String   textInPage   =   visitor.getBody().toString();     
  79.       
  80.                   System.out.println(textInPage);     
  81.           }     
  82.       
  83.           /**    
  84.             *   按页面方式处理.对一个标准的Html页面,推荐使用此种方式.    
  85.             */     
  86.           public   static   void   test4(String   content)   throws   Exception     
  87.           {     
  88.                   Parser   myParser;     
  89.                   myParser   =   Parser.createParser(content,   "GBK");     
  90.       
  91.                   HtmlPage   visitor   =   new   HtmlPage(myParser);     
  92.       
  93.                   myParser.visitAllNodesWith(visitor);     
  94.       
  95.                   String   textInPage   =   visitor.getTitle();     
  96.       
  97.                   System.out.println(textInPage);     
  98.                   System.out.println("--------------------");     
  99.                   System.out.println(visitor.getBody());     
  100.           }     
  101.       
  102.           /**    
  103.             *   利用Visitor模式解析html页面.    
  104.             *    
  105.             *   小优点:翻译了<>等符号      
  106.             *   缺点:好多空格,无法提取link    
  107.             *          
  108.             */     
  109.           public   static   void   test3(String   content)   throws   Exception     
  110.           {     
  111.                   Parser   myParser;     
  112.                   myParser   =   Parser.createParser(content,   "GBK");     
  113.       
  114.                   TextExtractingVisitor   visitor   =   new   TextExtractingVisitor();     
  115.       
  116.                   myParser.visitAllNodesWith(visitor);     
  117.       
  118.                   String   textInPage   =   visitor.getExtractedText();     
  119.       
  120.                   System.out.println(textInPage);     
  121.           }     
  122.       
  123.           /**    
  124.             *   得到普通文本和链接的内容.    
  125.             *      
  126.             *   使用了过滤条件.    
  127.             */     
  128.           public   static   void   test2(String   content)   throws   ParserException     
  129.           {     
  130.                   Parser   myParser;     
  131.                   NodeList   nodeList   =   null;     
  132.       
  133.                   myParser   =   Parser.createParser(content,   "GBK");     
  134.       
  135.                   NodeFilter   textFilter   =   new   NodeClassFilter(TextNode.class);     
  136.                   NodeFilter   linkFilter   =   new   NodeClassFilter(LinkTag.class);     
  137.       
  138.                   //暂时不处理   meta     
  139.                   //NodeFilter   metaFilter   =   new   NodeClassFilter(MetaTag.class);     
  140.       
  141.                   OrFilter   lastFilter   =   new   OrFilter();     
  142.                   lastFilter.setPredicates(new   NodeFilter[]   {   textFilter,   linkFilter   });     
  143.       
  144.                   nodeList   =   myParser.parse(lastFilter);     
  145.       
  146.                   Node[]   nodes   =   nodeList.toNodeArray();     
  147.       
  148.                   for   (int   i   =   0;   i   <   nodes.length;   i++)     
  149.                   {     
  150.                           Node   anode   =   (Node)   nodes[i];     
  151.       
  152.                           String   line   =   "";     
  153.                           if   (anode   instanceof   TextNode)     
  154.                           {     
  155.                                   TextNode   textnode   =   (TextNode)   anode;     
  156.                                   line   =   textnode.toPlainTextString().trim();     
  157.                                   //line   =   textnode.getText();     
  158.                           }     
  159.   //                         else   if   (anode   instanceof   LinkTag)     
  160.   //                         {     
  161.   //                                 LinkTag   linknode   =   (LinkTag)   anode;     
  162.   //     
  163.   //                                 line   =   linknode.getLink();     
  164.   //                                 //@todo   过滤jsp标签:可以自己实现这个函数     
  165.   //                                 //line   =   StringFunc.replace(line,   "<%.*%>",   "");     
  166.   //                         }     
  167.       
  168.                           if   (isTrimEmpty(line))     
  169.                                   continue;     
  170.       
  171.                           System.out.println(line);     
  172.                   }     
  173.           }     
  174.       
  175.           /**    
  176.             *   解析普通文本节点.    
  177.             *      
  178.             *   @param   content    
  179.             *   @throws   ParserException    
  180.             */     
  181.           public   static   void   test1(String   content)   throws   ParserException     
  182.           {     
  183.                   Parser   myParser;     
  184.                   Node[]   nodes   =   null;     
  185.       
  186.                   myParser   =   Parser.createParser(content,   null);     
  187.       
  188.                   nodes   =   myParser.extractAllNodesThatAre(TextNode.class);   //exception   could   be   thrown   here     
  189.       
  190.                   for   (int   i   =   0;   i   <   nodes.length;   i++)     
  191.                   {     
  192.                           TextNode   textnode   =   (TextNode)   nodes[i];     
  193.                           String   line   =   textnode.toPlainTextString().trim();     
  194.                           if   (line.equals(""))     
  195.                                   continue;     
  196.                           System.out.println(line);     
  197.                   }     
  198.       
  199.           }     
  200.       
  201.           /**    
  202.             *   读取一个文件到字符串里.    
  203.             *      
  204.             *   @param   sFileName     文件名    
  205.             *   @param   sEncode       String    
  206.             *   @return   文件内容    
  207.             */     
  208.           public   static   String   readTextFile(String   sFileName,   String   sEncode)     
  209.           {     
  210.                   StringBuffer   sbStr   =   new   StringBuffer();     
  211.       
  212.                   try     
  213.                   {     
  214.                           File   ff   =   new   File(sFileName);     
  215.                           InputStreamReader   read   =   new   InputStreamReader(new   FileInputStream(ff),     
  216.                                           sEncode);     
  217.                           BufferedReader   ins   =   new   BufferedReader(read);     
  218.       
  219.                           String   dataLine   =   "";     
  220.                           while   (null   !=   (dataLine   =   ins.readLine()))     
  221.                           {     
  222.                                   sbStr.append(dataLine);     
  223.                                   sbStr.append("/r/n");     
  224.                           }     
  225.       
  226.                           ins.close();     
  227.                   }     
  228.                   catch   (Exception   e)     
  229.                   {     
  230.                         //   LogMan.error("read   Text   File   Error",   e);     
  231.                   }     
  232.       
  233.                   return   sbStr.toString();     
  234.           }     
  235.       
  236.           /**    
  237.             *   去掉左右空格后字符串是否为空    
  238.             *   @param   astr   String    
  239.             *   @return   boolean    
  240.             */     
  241.           public   static   boolean   isTrimEmpty(String   astr)     
  242.           {     
  243.                   if   ((null   ==   astr)   ||   (astr.length()   ==   0))     
  244.                   {     
  245.                           return   true;     
  246.                   }     
  247.                   if   (isBlank(astr.trim()))     
  248.                   {     
  249.                           return   true;     
  250.                   }     
  251.                   return   false;     
  252.           }     
  253.       
  254.           /**    
  255.             *   字符串是否为空:null或者长度为0.    
  256.             *   @param   astr   源字符串.    
  257.             *   @return   boolean    
  258.             */     
  259.           public   static   boolean   isBlank(String   astr)     
  260.           {     
  261.                   if   ((null   ==   astr)   ||   (astr.length()   ==   0))     
  262.                   {     
  263.                           return   true;     
  264.                   }     
  265.                   else     
  266.                   {     
  267.                           return   false;     
  268.                   }     
  269.           }     
  270.       
  271.   }  

原创粉丝点击