lucene pdf+doc+ppt+xls+txt+多层文件
来源:互联网 发布:iphone怎么传文件到mac 编辑:程序博客网 时间:2024/04/26 06:11
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package stringtest;
import java.io.*;
import java.io.FileInputStream;
import java.io.File;
import org.apache.poi.hssf.extractor.*;
import org.apache.poi.hssf.usermodel.*;//包含生成Excel文档的各个类.
import org.apache.poi.hwpf.extractor.*;//对word文档进行处理的包
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;//对ppt文档进行处理的包
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;//对pdf文档进行处理的包
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;//lucene包
/**
 * Builds a Lucene index from the documents found under a directory tree and
 * runs a simple query against it.
 *
 * <p>Supported file types (matched by extension, case-insensitively) are
 * {@code xls}, {@code doc}, {@code ppt}, {@code pdf} and {@code txt}. Every
 * indexed document gets two fields: {@code content} (the extracted text) and
 * {@code path} (the absolute path of the source file).
 *
 * @author hp
 */
public class StringTest {

    /** Root directory whose files are indexed. */
    public static String INDEX_FILE_PATH = "F://index2";

    /** Directory in which the Lucene index is stored. */
    public static String INDEX_STORE_PATH = "F://store2";

    /**
     * Rebuilds the index from {@link #INDEX_FILE_PATH} and then prints the
     * paths of all documents matching a sample query.
     *
     * @param args the command line arguments (unused)
     * @throws Exception if indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        StringTest test = new StringTest();
        IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new StandardAnalyzer(), true);
        try {
            test.writeToIndex(INDEX_FILE_PATH, writer);
        } finally {
            // Always release the index write lock, even when indexing fails.
            writer.close();
        }
        test.indexSearcher("雒琛");
    }

    /**
     * Recursively indexes every supported file below {@code path} and then
     * optimizes the index once.
     *
     * @param path   directory to scan, including all nested sub-directories
     * @param writer open index writer that receives the documents; it is not
     *               closed by this method
     * @throws FileNotFoundException if {@code path} is not an existing directory
     * @throws Exception             if reading a file or writing to the index fails
     */
    public void writeToIndex(String path, IndexWriter writer) throws Exception {
        File root = new File(path);
        if (!root.isDirectory()) {
            throw new FileNotFoundException("Not a readable directory: " + path);
        }
        indexDirectory(root, writer);
        // Optimizing is expensive; do it once per run instead of once per document.
        writer.optimize();
    }

    /** Walks {@code folder} depth-first and indexes each regular file in it. */
    private void indexDirectory(File folder, IndexWriter writer) throws Exception {
        File[] entries = folder.listFiles();
        if (entries == null) {
            // listFiles() returns null on I/O error or when access is denied.
            return;
        }
        for (int i = 0; i < entries.length; i++) {
            File entry = entries[i];
            if (entry.isDirectory()) {
                indexDirectory(entry, writer);
            } else if (entry.isFile()) {
                indexFile(entry, writer);
            }
        }
    }

    /** Adds {@code file} to the index if its extension is one of the supported types. */
    private void indexFile(File file, IndexWriter writer) throws Exception {
        String name = file.getName();
        // Use the LAST dot of the file name only, so dots in directory names
        // or in the middle of the file name do not confuse the type detection.
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            return; // no extension: unsupported
        }
        String extension = name.substring(dot + 1);
        String absolutePath = file.getAbsolutePath();

        if (extension.equalsIgnoreCase("xls")) {
            writer.addDocument(readExcel(absolutePath));
        } else if (extension.equalsIgnoreCase("doc")) {
            writer.addDocument(readDoc(absolutePath));
        } else if (extension.equalsIgnoreCase("ppt")) {
            writer.addDocument(readPpt(absolutePath));
        } else if (extension.equalsIgnoreCase("pdf")) {
            writer.addDocument(readPdf(absolutePath));
        } else if (extension.equalsIgnoreCase("txt")) {
            indexTextFile(file, absolutePath, writer);
        }
    }

    /**
     * Indexes a plain-text file by streaming it into the {@code content} field.
     * The content is tokenized but not stored. The file is decoded with the
     * platform default charset, as in the original implementation.
     */
    private void indexTextFile(File file, String absolutePath, IndexWriter writer) throws Exception {
        FileInputStream is = new FileInputStream(file);
        try {
            Reader reader = new BufferedReader(new InputStreamReader(is));
            Document doc = new Document();
            doc.add(new Field("content", reader));
            doc.add(new Field("path", absolutePath, Field.Store.YES, Field.Index.ANALYZED));
            // The reader is consumed while the document is added, so the stream
            // must stay open until addDocument() has returned.
            writer.addDocument(doc);
        } finally {
            is.close();
        }
    }

    /**
     * Searches the {@code content} field of the index at
     * {@link #INDEX_STORE_PATH} and prints the {@code path} of every hit to
     * standard output, one per line.
     *
     * @param s query string in Lucene query-parser syntax
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public void indexSearcher(String s) throws Exception {
        QueryParser parser = new QueryParser("content", new StandardAnalyzer());
        Query query = parser.parse(s);
        Searcher searcher = new IndexSearcher(INDEX_STORE_PATH);
        try {
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++) {
                Document hit = hits.doc(i);
                System.out.println(hit.get("path") + " ");
            }
        } finally {
            searcher.close();
        }
    }

    /**
     * Extracts the text of an Excel workbook into a Lucene document.
     * Formulas are extracted as formulas (not their results) and sheet names
     * are omitted.
     *
     * @param xls path of the {@code .xls} file
     * @return document with {@code content} and {@code path} fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readExcel(String xls) throws Exception {
        String text;
        InputStream in = new FileInputStream(xls);
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(in);
            ExcelExtractor extractor = new ExcelExtractor(workbook);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } finally {
            in.close();
        }
        Document docexcel = new Document();
        docexcel.add(new Field("content", text, Field.Store.YES, Field.Index.ANALYZED));
        docexcel.add(new Field("path", xls, Field.Store.YES, Field.Index.ANALYZED));
        return docexcel;
    }

    /**
     * Extracts the text of a Word document into a Lucene document.
     *
     * @param doc path of the {@code .doc} file
     * @return document with {@code content} and {@code path} fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readDoc(String doc) throws Exception {
        String text;
        FileInputStream in = new FileInputStream(doc);
        try {
            WordExtractor extractor = new WordExtractor(in);
            text = extractor.getText();
        } finally {
            in.close();
        }
        Document docdoc = new Document();
        docdoc.add(new Field("content", text, Field.Store.YES, Field.Index.ANALYZED));
        docdoc.add(new Field("path", doc, Field.Store.YES, Field.Index.ANALYZED));
        return docdoc;
    }

    /**
     * Extracts the text runs of every slide of a PowerPoint presentation into
     * a Lucene document. The runs are concatenated without a separator.
     *
     * @param ppt path of the {@code .ppt} file
     * @return document with {@code content} and {@code path} fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readPpt(String ppt) throws Exception {
        StringBuilder text = new StringBuilder();
        FileInputStream is = new FileInputStream(ppt);
        try {
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    text.append(runs[j].getText());
                }
            }
        } finally {
            is.close();
        }
        Document docppt = new Document();
        docppt.add(new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED));
        docppt.add(new Field("path", ppt, Field.Store.YES, Field.Index.ANALYZED));
        return docppt;
    }

    /**
     * Extracts the text of a PDF file into a Lucene document.
     *
     * <p>Extraction is best-effort: if the file cannot be read or parsed the
     * error is reported on standard error and the returned document has an
     * empty {@code content} field, so that one broken PDF does not abort the
     * whole indexing run.
     *
     * @param pdf path of the {@code .pdf} file
     * @return document with {@code content} and {@code path} fields
     */
    public Document readPdf(String pdf) {
        String result = "";
        FileInputStream is = null;
        PDDocument document = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            document = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(document);
        } catch (Exception e) {
            System.err.println("Failed to extract text from PDF: " + pdf);
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        Document docpdf = new Document();
        docpdf.add(new Field("content", result, Field.Store.YES, Field.Index.ANALYZED));
        docpdf.add(new Field("path", pdf, Field.Store.YES, Field.Index.ANALYZED));
        return docpdf;
    }
}
- lucene pdf+doc+ppt+xls+txt+多层文件
- "doc", "docx", "xls", "xlsx", "ppt", "pptx",txt。等文件转化为pdf
- C#读取doc,pdf,ppt,TXT文件
- VC 读取 doc,xls,ppt,pdf等格式的文件
- 在线打开.PDF、.TXT等文件,.doc、.xls自动下载
- lucene实现pdf,doc,xls,ppt,htm,html等格式文件的检索
- office(doc,xls,txt,pdf,ppt)文档在线预览及转换(office2pdf) - PHP版
- POI解析文档内容(txt,doc,docx,xls,xlsx,ppt,pdf)
- Android中pdf,doc,docx,xls,xlsx,ppt,pptx等office文件预览
- Flash在线文档阅读器::pdf、doc、docx、xls、xlsx、ppt、pptx、htm、txt、rtf、epub、csv、xdoc等
- Flash在线文档阅读器::pdf、doc、docx、xls、xlsx、ppt、pptx、htm、txt、rtf、epub、csv、xdoc等
- (.doc .xls .txt .pdf等文件都可以正常生成SWF格式)[转载]
- PHP读取或者创建txt,doc,xls,pdf各个类型文件
- android使用webview预览png,pdf,doc,xls,txt,等文件
- PHP读取或者创建txt,doc,xls,pdf各个类型文件 by cubeking
- doc[x]、xls[x]、ppt[x] 转pdf
- openoffic+java+spring 多线程 转换doc,ppt,xls -> html/pdf
- C# 打开pdf、doc。xls.文件
- 百度之星程序设计大赛试题
- 为Select设置查询蚕食
- 程序员逛银饰店笑话一则(关注一下你周围可爱的程序员)
- 111001011011111110000011111001101000001110000101
- 浏览器缓存内幕与getLastModified方法
- lucene pdf+doc+ppt+xls+txt+多层文件
- ORACLE分解IP地址
- Lucene索引前对doc pdf html文件的预处理
- Visual Studio 2008 Intellisense for Extjs3.0
- 稳定,彷徨
- 线性常系数差分方程的Matlab递推求解
- 继续摘抄:时髦不能动
- 各种数据库分页查询Sql汇总
- excel与java报表开发