Lucene索引doc pdf html

来源：互联网 | 发布：实现双向数据绑定 | 编辑：程序博客网 | 时间：2024/04/29 04:14

索引Doc：tm-extractors-0.4

这原是Apache的POI类库的一部分——HWPF。POI提供了一组操作MS-Word/Excel等文件的方法；在最近的release版本中HWPF被移出，需要单独下载tm-extractors-0.4。下面的代码实现了方法getDocument(File)，返回Lucene的Document类型结果；它主要通过调用WordExtractor类的成员方法extractText来完成，该方法返回一个包含所解析doc文件内容的String类型实例。

public Document getDocument(File doc) {String docPath = doc.getAbsolutePath();String title = doc.getName();InputStream inputStream = null;Reader contents = null;// 创建DocumentDocument document = new Document();try {inputStream = new FileInputStream(doc);} catch (FileNotFoundException e) {e.printStackTrace();}WordExtractor extractor = new WordExtractor();try {contents = new StringReader(extractor.extractText(inputStream));} catch (Exception e) {e.printStackTrace();}document.add(new Field("title", title, Field.Store.YES,Field.Index.TOKENIZED));document.add(new Field("contents", contents));document.add(new Field("path", docPath, Field.Store.YES,Field.Index.TOKENIZED));return document;}

索引pdf：使用pdfbox 1.2.1

PDFBox为使用Lucene的开发者专门提供了LucenePDFDocument类，它的static方法getDocument能够直接返回一个Lucene的Document类型结果。所以在为一个pdf文件（例子中为File类型实例pdfFile）创建索引时只要写下如下语句就可以了：

document = LucenePDFDocument.getDocument(pdfFile);

使用pdfbox 1.2.1时，上述getDocument方法已被修改，所以需要先提取pdf的文本内容，再建立索引。

public Document getPdf(File pdf) {String pdfpath = pdf.getAbsolutePath();// 创建输入流读取pdf文件String title = pdf.getName();String result = "";FileInputStream is = null;PDDocument doc = null;try {is = new FileInputStream(pdf);PDFParser parser = new PDFParser(is);parser.parse();doc = parser.getPDDocument();PDFTextStripper stripper = new PDFTextStripper();result = stripper.getText(doc);} catch (Exception e) {e.printStackTrace();} finally {if (is != null) {try {is.close();} catch (Exception e) {e.printStackTrace();}}if (doc != null) {try {doc.close();} catch (Exception e) {e.printStackTrace();}}}Document document = new Document();document.add(new Field("title", title, Field.Store.YES,Field.Index.TOKENIZED));document.add(new Field("contents", result, Field.Store.YES,Field.Index.ANALYZED));document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));return document;}

索引Html

使用htmlparser来解析，同样在代码中定义了方法getHtml(File)，返回Document类型。

/**
 * Builds a Lucene {@code Document} for an HTML file using htmlparser.
 *
 * <p>The page is visited with an {@code HtmlPage} visitor; the plain text of
 * every body node becomes the {@code contents} field and the page title
 * becomes the {@code title} field. The absolute path is stored, but not
 * indexed, in the {@code path} field.
 *
 * <p>Parsing is best-effort. If the parser cannot be created at all, the
 * stack trace is printed and the document is built with empty contents and
 * the file name as its title, instead of dereferencing a null parser.
 *
 * @param html the HTML file to index
 * @return a Lucene document describing {@code html}
 */
public Document getHtml(File html) {
    String htmlPath = html.getAbsolutePath();
    // Fallback title, used only when the page cannot be parsed.
    String title = html.getName();
    // Accumulate in a builder; repeated String += in the loop is quadratic.
    StringBuilder text = new StringBuilder();

    Parser parser = null;
    try {
        parser = new Parser(htmlPath);
    } catch (ParserException e) {
        e.printStackTrace();
    }

    if (parser != null) {
        try {
            // NOTE(review): the encoding is hard-coded; pages in another
            // charset will be decoded incorrectly.
            parser.setEncoding("GB2312");
        } catch (ParserException e) {
            e.printStackTrace();
        }
        HtmlPage visitor = new HtmlPage(parser);
        try {
            parser.visitAllNodesWith(visitor);
        } catch (ParserException e) {
            // Keep whatever the visitor collected before the failure.
            e.printStackTrace();
        }
        NodeList nodes = visitor.getBody();
        if (nodes != null) {
            int size = nodes.size();
            for (int i = 0; i < size; i++) {
                Node node = nodes.elementAt(i);
                text.append(node.toPlainTextString());
            }
        }
        String pageTitle = visitor.getTitle();
        if (pageTitle != null) {
            title = pageTitle;
        }
    }

    Reader contents = new StringReader(text.toString());
    Document document = new Document();
    document.add(new Field("title", title, Field.Store.YES,
            Field.Index.TOKENIZED));
    document.add(new Field("contents", contents));
    document.add(new Field("path", htmlPath, Field.Store.YES,
            Field.Index.NO));
    return document;
}

创建索引

public static String INDEX_FILE_PATH = "D:/file";public static String INDEX_STORE_PATH = "D:/index";@SuppressWarnings("deprecation")public static void main(String[] args) throws Exception {test create = new test();IndexWriter writer = new IndexWriter(INDEX_STORE_PATH,new StandardAnalyzer(), true);create.writeToIndex(INDEX_FILE_PATH, writer);writer.close();}public void writeToIndex(String path, IndexWriter writer) throws Exception {File folder = new File(path);String[] files = folder.list();for (int i = 0; i < files.length; i++) {File file = new File(folder, files[i]);String s = file.getAbsolutePath();System.out.println(s);if (s.contains(".")) {int index = s.indexOf(".");String s1 = s.substring(index + 1);if (s1.equals("doc")) {Document doc = getDocument(file);writer.addDocument(doc);writer.optimize();} else if (s1.equals("html")) {Document doc = getHtml(file);writer.addDocument(doc);writer.optimize();} else if (s1.equals("pdf")) {Document doc = getPdf(file);writer.addDocument(doc);writer.optimize();} else if (s1.equals("txt")) {Document doc = new Document();FileInputStream is = new FileInputStream(file);Reader reader = new BufferedReader(new InputStreamReader(is));doc.add(new Field("content", reader));doc.add(new Field("path", s, Field.Store.YES,Field.Index.ANALYZED));writer.addDocument(doc);writer.optimize();} elsecontinue;} else if (!s.contains(".")) {writeToIndex(s, writer);}}System.out.println("共"+files.length+"个文件被索引。");}

查询：

public class indexsearch {public static String INDEX_STORE_PATH = "D:/index";public static void main(String[] args) throws Exception {String s = "科研";indexsearch search = new indexsearch();search.indexSearcher(s);}public void indexSearcher(String s) throws Exception {// System.out.println(s);QueryParser paser = new QueryParser("contents", new StandardAnalyzer());Query query = paser.parse(s); System.out.println(query.toString());Searcher searcher = new IndexSearcher(INDEX_STORE_PATH);Hits hit = searcher.search(query); System.out.println(hit.length());for (int i = 0; i < hit.length(); i++) {Document d = hit.doc(i);String dname = d.get("path");System.out.println(dname + " ");}}}

参考：http://blog.csdn.net/lilice/archive/2007/05/12/1605731.aspx