Lucene 课程设计:检索 .doc、.pdf、.html、.xls(Excel)、.txt 格式文件

来源:互联网 发布:淘宝不清洗第二次排查 编辑:程序博客网 时间:2024/04/29 14:32

      又花了大概一个星期的时间,终于写完了信息检索的课程设计。快考试了,还没复习,整天写各门课的实验...

      利用Lucene开源软件,实现了检索.doc、.pdf、.html、.xls(Excel)、.txt等常见格式的文件,检索结果给出文件所在的路径

      课设中用到的各软件包为:lucene2.4.0,apache-tomcat-6.0.16,poi-bin-3.2-FINAL-20081019(用于解析office类文件,如doc,.xls等),PDFBox-0.7.3(用于解析.pdf 文件),开发环境为Eclipse Version: 3.4.1

     需要注意的是,用到的包都要放到WebContent/WEB-INF的lib目录下,否则会报NoClassDefFoundError;其次,要在eclipse开发页面的Project Explorer的空白处右键单击,然后单击refresh,否则即使把用到的包放到lib目录下,仍会报NoClassDefFoundError错误。就这个“小小”的错误,折磨了我一天半的时间,狂上网查原因,用尽了能够搜集到的方法,也不行,都快崩溃了。最后坐在电脑前发呆,无意中发现之前变动过的docs(用于存放待检索的文件)在eclipse的Project Explorer中显示的仍是之前的那几个文件,这才找到原因,轻轻地点击了refresh,终于解决了,既兴奋又无语,就这点差错。之前没怎么用过这类集成开发工具(vc6.0除外),怎么评价呢,这种集成开发工具的确带给了我们很高的开发效率,它给了你很多提示,但就是配置环境比较烦人,稍有点错误就不行,而且这种错误通常很难解决,有利有弊吧(我最欣赏这类开发工具的就是它们对格式都是自动控制的,这点很好,不像用.txt时那烦人的格式控制..)

下面给出代码:

configuration.jsp

 

  1. <%@ page language="java" contentType="text/html; charset=GB18030"
  2.     pageEncoding="GB18030"%> <%-- Configuration page: collects a document directory and an index directory and submits them to header.jsp. --%>
  3. <html>
  4.     <head>
  5.         <title>
  6.             Welcome to LuceneWeb - Configuration Page
  7.         </title>
  8.     </head>
  9.     <body background="D:/Program Files/MIR Design/eclise workplace/LuceneWebApplication/bg.jpg"> <%-- NOTE(review): the background is an absolute path on the author's disk; presumably it will not load for other clients - confirm. --%>
  10.         <center>
  11.             <font face="Monotype Corsiva"  size="20" color="#9966ff">
  12.                 <b>
  13.                     Welcome to LuceneWeb
  14.                 </b>
  15.             </font>
  16.         </center>
  17.     <h2 align="right">
  18.         <b>
  19.             <font face="Monotype Corsiva" color="#9966ff">
  20.                 Configuration Page
  21.             </font>
  22.         </b>
  23.     </h2>
  24.     <center>
  25.         <form name="Configuration" action="header.jsp" method="get"> <%-- Sends DocumentDirectory and IndexDirectory to header.jsp as GET query parameters. --%>
  26.             <%for(int i=0;i<7;i++){%> <%-- Emits seven line breaks as vertical spacing above the form fields. --%>
  27.                 <br>
  28.             <%} %>
  29.         <p>
  30.             <font face="楷体_GB2312" size="5">
  31.                 文档路径:
  32.             </font>
  33.             <input type="text" name="DocumentDirectory" size="40"/> <%-- Label above reads "document path"; header.jsp substitutes a default when this is left blank. --%>
  34.         </p>
  35.         <p>
  36.             <br>
  37.             <font face="楷体_GB2312" size="5">
  38.                 索引路径:
  39.             </font>
  40.             <input type="text" name="IndexDirectory" size="40"/> <%-- Label above reads "index path"; header.jsp substitutes a default when this is left blank. --%>
  41.         </p>
  42.             <br>
  43.             <center>
  44.                 <input type="submit" value="建立索引"/> <%-- Button caption reads "build index". --%>
  45.             </center>
  46.         </form>
  47.         </center>
  48.     </body>
  49. </html>

  header.jsp

  1. <%@page language="java" contentType="text/html; charset=GB18030"
  2.     pageEncoding="GB18030"%>
  3. <%@page import="Index.CreateIndex" %>  
  4. <%@page import ="java.io.*,org.apache.poi.hwpf.extractor.*,org.apache.lucene.analysis.*,org.apache.lucene.analysis.standard.StandardAnalyzer,org.apache.lucene.document.*,org.apache.lucene.index.*, org.apache.lucene.search.*,org.apache.lucene.queryParser.*,org.apache.lucene.demo.*,org.apache.lucene.demo.html.Entities,java.net.URLEncoder" %>
  5. <%--
  6.     Search page. Builds the index from the directories submitted by configuration.jsp
  7.     (falling back to built-in defaults) and then renders the search form, which sends
  8.     the query text and the index location to index.jsp.
  9. 
  10.     NOTE(review): both directories come straight from request parameters and the index
  11.     is rebuilt on every GET of this page; validate the paths before exposing this page
  12.     to untrusted users.
  13. --%>
  14. <head>
  15.     <title>Welcome to LuceneWeb - Search Page</title>
  16. </head>
  17. <%
  18.         String document_path=request.getParameter("DocumentDirectory");
  19.         String index_path=request.getParameter("IndexDirectory");
  20.         // Use the default locations when a parameter is missing (page opened directly)
  21.         // or was left blank in the configuration form. An empty text box arrives as ""
  22.         // rather than null, so both conditions have to be checked.
  23.         if(document_path==null||document_path.length()<2)
  24.         {
  25.             document_path="D://Program Files//MIR Design//eclise workplace//LuceneWebApplication//docs";
  26.         }
  27.         if(index_path==null||index_path.length()<2)
  28.         {
  29.             index_path="D://Program Files//MIR Design//eclise workplace//LuceneWebApplication//index";
  30.         }
  31.         CreateIndex create=new CreateIndex();
  32.         create.create_index(document_path,index_path);
  33.         String indexPath=create.get_index();
  34.         // The path is written into an HTML attribute below, so escape the characters
  35.         // that could terminate the attribute or start a tag.
  36.         String indexPathAttr=indexPath.replace("&","&amp;").replace("\"","&quot;").replace("<","&lt;").replace(">","&gt;");
  37. %>
  38. <body background="D:/Program Files/MIR Design/eclise workplace/LuceneWebApplication/luceneweb.jpg">
  39.     <center>
  40.         <font face="Monotype Corsiva" color="#00ffff" size="150">
  41.             <b>
  42.                 LuceneWeb
  43.             </b>
  44.         </font>
  45.     </center>
  46.     <center> 
  47.     <%-- Ten line breaks of vertical spacing between the banner and the search form. --%>
  48.     <%
  49.         for(int i=0;i<10;i++)
  50.         {
  51.     %>
  52.         <br>
  53.     <%
  54.         }
  55.     %>
  56.         <form name="Search" action="index.jsp" method="get">
  57.             <p>                     
  58.                 <font face="楷体_GB2312" size="5">
  59.                     查询关键字:
  60.                 </font>
  61.                 <input name="QueryInput" size="50"/>
  62.                 <%-- Carries the index location to index.jsp together with the query. --%>
  63.                 <input type="hidden" name="indexPath" value="<%=indexPathAttr %>"/>
  64.                 <br>
  65.                 <br>
  66.                 <br>
  67.             </p>
  68.                 <center>
  69.                     <input type="submit" value="Search"/>
  70.                 </center>
  71.      </form>
  72.     </center>
  73. </body>

index.jsp

  1. <%@ page import = "  javax.servlet.*, javax.servlet.http.*, java.io.*, org.apache.lucene.analysis.*, org.apache.lucene.analysis.standard.StandardAnalyzer, org.apache.lucene.document.*, org.apache.lucene.index.*, org.apache.lucene.search.*, org.apache.lucene.queryParser.*, org.apache.lucene.demo.*, org.apache.lucene.demo.html.Entities, java.net.URLEncoder" %>
  2. <%@page language="java" contentType="text/html; charset=GB18030"
  3.     pageEncoding="GB18030"%>
  4. <%@include file="header_frame.jsp" %>
  5.     <%--
  6.         Results page: runs the submitted query against the index at indexPath and
  7.         renders one page of hits plus first / previous / next / last navigation links.
  8. 
  9.         Request parameters:
  10.           QueryInput - query text, parsed against the "contents" field
  11.           indexPath  - directory of the index to search
  12.           startat    - offset of the first hit to show (default 0)
  13.           maxresults - number of hits per page (default 10)
  14. 
  15.         NOTE(review): indexPath comes straight from the request, so a caller can point
  16.         the searcher at any directory the server can read; consider keeping it server-side.
  17.     --%>
  18.     <%
  19.         String indexName = request.getParameter("indexPath");
  20.         String queryString = request.getParameter("QueryInput");
  21.         int startindex = 0;   // offset of the first hit on this page
  22.         int maxpage = 10;     // number of hits per page
  23. 
  24.         // Parse each paging parameter separately so that a missing or malformed value
  25.         // only resets that one parameter to its default.
  26.         try {
  27.             maxpage = Integer.parseInt(request.getParameter("maxresults"));
  28.         } catch (NumberFormatException ignored) {
  29.             maxpage = 10;
  30.         }
  31.         try {
  32.             startindex = Integer.parseInt(request.getParameter("startat"));
  33.         } catch (NumberFormatException ignored) {
  34.             startindex = 0;
  35.         }
  36.         // A page size below 1 would divide by zero in the last-page computation below.
  37.         if (maxpage < 1) {
  38.             maxpage = 10;
  39.         }
  40.         if (startindex < 0) {
  41.             startindex = 0;
  42.         }
  43. 
  44.         boolean error = false;
  45.         IndexSearcher searcher = null;
  46.         Query query = null;
  47.         Hits hits = null;
  48.         // The outer try/finally guarantees the searcher is closed even if searching throws.
  49.         try {
  50.             try {
  51.                 searcher = new IndexSearcher(indexName);
  52.             } catch (Exception e) {
  53.                 error = true;
  54.     %>
  55.                 <p>Notice:error opening the Index</p>
  56.     <%
  57.             }
  58.             // Stop here when no query was entered; handing a null query to the parser would throw.
  59.             if (!error && (queryString == null || queryString.trim().length() == 0)) {
  60.                 error = true;
  61.     %>
  62.                 <h3 align="left">
  63.                     <font face="Monotype Corsiva"  color="#9966ff">
  64.                         <b>
  65.                             请输入查询关键字...
  66.                         </b>
  67.                     </font>
  68.                 </h3>
  69.     <%
  70.             }
  71.             if (!error) {
  72.                 try {
  73.                     QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
  74.                     query = qp.parse(queryString);
  75.                 } catch (ParseException e) {
  76.                     error = true;
  77.     %>
  78.                 <p>Notice:error parsing the query</p>
  79.     <%
  80.                 }
  81.             }
  82.             if (!error) {
  83.                 hits = searcher.search(query);
  84.                 if (hits.length() == 0) {
  85.                     error = true;
  86.     %>
  87.                 <p>对不起,没有你想查询的结果...</p>
  88.     <%
  89.                 }
  90.             }
  91.             if (!error) {
  92.                 int total = hits.length();
  93.                 // Start offset of the last page; using (total - 1) keeps an exact multiple
  94.                 // of the page size from producing an empty trailing page.
  95.                 int lastStart = ((total - 1) / maxpage) * maxpage;
  96.                 if (startindex > lastStart) {
  97.                     startindex = lastStart;
  98.                 }
  99.                 int pageEnd = Math.min(startindex + maxpage, total);
  100.                 // Prefix shared by every navigation link; the start offset is appended last.
  101.                 // Values are URL-encoded with the page charset, and the separators are
  102.                 // written as &amp; because the link is emitted inside an href attribute.
  103.                 String linkBase = "index.jsp?QueryInput=" + URLEncoder.encode(queryString, "GB18030")
  104.                         + "&amp;maxresults=" + maxpage
  105.                         + "&amp;indexPath=" + URLEncoder.encode(indexName, "GB18030")
  106.                         + "&amp;startat=";
  107.     %>
  108.     <h3 align="left">
  109.         <font face="Monotype Corsiva"  color="#9966ff">
  110.             <b>
  111.                 总共有<%=total%>条查询结果...
  112.             </b>
  113.         </font>
  114.     </h3>
  115.                 <table>
  116.     <%
  117.                 for (int i = startindex; i < pageEnd; i++) {
  118.                     Document doc = hits.doc(i);
  119.                     String doctitle = doc.get("title");
  120.                     String url = doc.get("path");
  121.                     // Drop the leading "../webapps" so the link is relative to the server root.
  122.                     if (url != null && url.startsWith("../webapps/")) {
  123.                         url = url.substring(10);
  124.                     }
  125.                     // Documents without a title are listed by their path instead.
  126.                     if ((doctitle == null) || doctitle.equals("")) {
  127.                         doctitle = url;
  128.                     }
  129.     %>
  130.                     <tr>
  131.                             <td><a href="<%=url%>"><%=doctitle%></a></td>
  132.                     </tr>
  133.     <%
  134.                 }
  135.     %>
  136.                 </table>
  137.     <%
  138.                 for (int i = 0; i < 7; i++) {
  139.     %>
  140.                     <br>
  141.     <%
  142.                 }
  143.     %>
  144.                     <p align="left">
  145.     <%
  146.                 // "First" and "previous" are links only when there is an earlier page.
  147.                 if (startindex > 0) {
  148.     %>
  149.                             <a href="<%=linkBase + 0%>">首页</a>
  150.                             <a href="<%=linkBase + Math.max(0, startindex - maxpage)%>">上一页</a>
  151.     <%
  152.                 } else {
  153.     %>
  154.                     首页
  155.                         上一页
  156.     <%
  157.                 }
  158.                 // "Next" and "last" are links only when hits remain after this page.
  159.                 if ((startindex + maxpage) < total) {
  160.     %>
  161.                             <a href="<%=linkBase + (startindex + maxpage)%>">下一页</a>
  162.                             <a href="<%=linkBase + lastStart%>">尾页</a>
  163.     <%
  164.                 } else {
  165.     %>
  166.                         下一页
  167.                             尾页
  168.     <%
  169.                 }
  170.     %>
  171.      </p>
  172.     <%
  173.             }
  174.         } finally {
  175.             if (searcher != null) {
  176.                 searcher.close();
  177.             }
  178.         }
  179.     %>
  180. </body>
  185.      

 

header_frame.jsp

 

  1. <head> <%-- Page header fragment that index.jsp pulls in with an include directive: title, banner and the "Search Result:" heading. --%>
  2.     <title>Welcome to LuceneWeb - Results Page</title>
  3. </head>
  4. <body background="D:/Program Files/MIR Design/eclise workplace/LuceneWebApplication/luceneweb.jpg"> <%-- The body element is opened here and is not closed inside this fragment. NOTE(review): the background is an absolute path on the author's disk - confirm it resolves for clients. --%>
  5.     <center>
  6.         <font face="Monotype Corsiva" color="#00ffff" size="100">
  7.             <b>
  8.                 LuceneWeb
  9.             </b>
  10.         </font>
  11.     </center>
  12.     <h1 align="left">
  13.         <font face="Monotype Corsiva"  color="#9966ff">
  14.             <b>
  15.                 Search Result:
  16.             </b>
  17.         </font>
  18.     </h1>

CreateIndex.java(这个类主要用于生成各种文件格式的索引)

  1. package Index;
  2. //生成Index并返回Index的地址,这部分健壮性(robust)严重不行,因为没有考虑异常问题,只是做个演示,假设一切都按常规操作,有待改进,现在时间太紧
  3. import java.io.BufferedInputStream;
  4. import java.io.File;
  5. import java.io.FileInputStream;
  6. import java.io.FileNotFoundException;
  7. import java.io.FileReader;
  8. import java.io.IOException;
  9. import java.io.Reader;
  10. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  11. import org.apache.lucene.demo.html.HTMLParser;
  12. import org.apache.lucene.document.Document;
  13. import org.apache.lucene.document.Field;
  14. import org.apache.lucene.index.IndexWriter;
  15. import org.apache.poi.hslf.HSLFSlideShow;
  16. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  17. import org.apache.poi.hslf.model.TextRun;
  18. import org.apache.poi.hslf.usermodel.SlideShow;
  19. import org.apache.poi.hssf.model.Workbook;
  20. import org.apache.poi.hwpf.extractor.WordExtractor;
  21. import org.pdfbox.cos.COSDocument;
  22. import org.pdfbox.pdfparser.PDFParser;
  23. import org.pdfbox.pdmodel.PDDocument;
  24. import org.pdfbox.searchengine.lucene.LucenePDFDocument;
  25. import org.pdfbox.util.PDFTextStripper;
  26. import org.apache.poi.hssf.extractor.ExcelExtractor;
  27. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  28. public class CreateIndex{
  29.     private String IndexPath="begin1";
  30.     String index_directory=null;
  31.     String document_directory=null;
  32.     String execu=null;
  33.     //用于创建索引,.txt,.pdf,.doc,.html,.ppt,.execl格式
  34.     public String get_index()
  35.     {
  36.         return IndexPath;
  37.     }
  38.     public void create_index(String document_directory,String index_directory) throws FileNotFoundException,IOException
  39.     {
  40.         this.document_directory=document_directory;
  41.         this.index_directory=index_directory;
  42.         File documentDir=new File(this.document_directory);
  43.         File indexDir=new File(this.index_directory);
  44.         StandardAnalyzer luceneAnalyzer=new StandardAnalyzer();
  45.         File datafiles[]=documentDir.listFiles();
  46.         IndexWriter indexWriter;
  47.         indexWriter=new IndexWriter(indexDir,luceneAnalyzer,true);
  48.         for(int i=0;i<datafiles.length;i++)
  49.         {
  50.             //创建.TXT文件的索引
  51.             if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".txt"))
  52.             {
  53.                 Document document=new Document();
  54.                 Reader txtReader=new FileReader(datafiles[i]);
  55.                 document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  56.                 document.add(new Field("contents",txtReader));
  57.                 indexWriter.addDocument(document);
  58.             }
  59.             //创建.HTML文件的索引
  60.             if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".html"))
  61.             {
  62.                 Document document = new Document();
  63.                 FileInputStream file_input_stream = new FileInputStream(datafiles[i]);
  64.                 HTMLParser parser = new HTMLParser(file_input_stream);
  65.                 document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  66.                 document.add(new Field("contents", parser.getReader()));
  67.                 indexWriter.addDocument(document);
  68.             }
  69.             //创建.PDF文件的索引
  70.             if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".pdf"))
  71.             {
  72.                 Document document = new Document();
  73.                 FileInputStream file_input_stream = new FileInputStream(datafiles[i]);
  74.                 PDFParser parser=new PDFParser(file_input_stream);
  75.                 parser.parse();
  76.                 COSDocument cosdoc=parser.getDocument();
  77.                 PDFTextStripper stripper=new PDFTextStripper();
  78.                 String docText=stripper.getText(new PDDocument(cosdoc));
  79.                 document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  80.                 document.add(new Field("contents",docText,Field.Store.YES,Field.Index.TOKENIZED));
  81.                 //document = LucenePDFDocument.getDocument(datafiles[i]);
  82.                 indexWriter.addDocument(document);  
  83.             }
  84.             //创建.DOC文件的索引
  85.             if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".doc"))
  86.             {
  87.                 Document document = new Document();
  88.                 FileInputStream file_input_stream = new FileInputStream(datafiles[i]);
  89.                 BufferedInputStream input_stream_buffer = new BufferedInputStream(file_input_stream);
  90.                 WordExtractor doc_extractor = new WordExtractor(input_stream_buffer);  
  91.                 document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  92.                 document.add(new Field("contents",doc_extractor.getText(),Field.Store.YES,Field.Index.TOKENIZED));
  93.                 indexWriter.addDocument(document);
  94.             }
  95.             //创建.PPT文件的索引    这地方一直有问题,提示“no such entry: "PowerPoint Document",
  96.             //网上查了下,大概是环境的问题,感觉解析DOC,EXECL,PPT的方法应该都一样,利用POI带的.doc|.ppt|.xlsExtractor
  97.             //就能对这三种文件进行解析,但不知就是PPT不行
  98.             /*if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".ppt"));
  99.             {
  100.                Document document=new Document();
  101.                InputStream file_input_stream = new FileInputStream(datafiles[i]);
  102.                BufferedInputStream input_stream_buffer = new BufferedInputStream(file_input_stream);
  103.                //HSLFSlideShow slide_show=new HSLFSlideShow(input_stream_buffer);
  104.                PowerPointExtractor ppt_extractor = new PowerPointExtractor(input_stream_buffer);  
  105.                document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  106.                document.add(new Field("contents",ppt_extractor.getText(),Field.Store.NO,Field.Index.TOKENIZED));
  107.                indexWriter.addDocument(document);
  108.             }*/
  109.             if(datafiles[i].isFile()&&datafiles[i].getName().endsWith(".xls"))
  110.             {
  111.                 Document document=new Document();
  112.                 FileInputStream file_input_stream = new FileInputStream(datafiles[i]);
  113.                 BufferedInputStream input_stream_buffer = new BufferedInputStream(file_input_stream);
  114.                 HSSFWorkbook hssf=new HSSFWorkbook(input_stream_buffer);
  115.                 ExcelExtractor xls_extractor=new ExcelExtractor(hssf);
  116.                 document.add(new Field("path",datafiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
  117.                 document.add(new Field("contents",xls_extractor.getText(),Field.Store.NO,Field.Index.TOKENIZED));
  118.                 indexWriter.addDocument(document);
  119.             }   
  120.         }   
  121.         IndexPath=index_directory;
  122.         indexWriter.optimize();
  123.         indexWriter.close();
  124.     }   
  125. }   

最后说下,这个程序的健壮性不行,因为基本上没有对异常进行处理,没时间考虑那么多了...