一步一步跟我学习lucene（15）---java读取word excel pdf及lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

来源：互联网发布：古玩类网址知多少编辑：程序博客网时间：2024/04/30 22:27

今天快下班的时候收到了一个群友的问题，大意是读取文本文件中的内容，找出文件中的手机号和邮箱，我自己写了一个读取文档的内容的正则查询示例，用于匹配文件中是否含有邮箱或者手机号，这个等于是对之前的文本处理工具的一个梳理，同时结合lucene内部提供的正则匹配查询RegexQuery；

废话不多说了，直接上代码，这里先对文件内容读取分类处理，分为pdf word excel 和普通文本四类，不同的种类读取文本内容不一样

pdf利用pdfbox读取内容，word和excel利用poi进行读取内容，文本文档利用jdk自带的读取

读取pdf、word、excel和普通文本文档内容（支持word excel 2007）

这里代码做了一点调整，主要是对excel格式的空行和空列的过滤

package com.lucene.index.util;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.nio.charset.Charset;import java.nio.file.Files;import java.nio.file.Paths;import java.util.LinkedList;import java.util.List;import org.apache.pdfbox.PDFReader;import org.apache.pdfbox.pdmodel.PDDocument;import org.apache.pdfbox.util.PDFTextStripper;import org.apache.poi.EncryptedDocumentException;import org.apache.poi.POIXMLDocument;import org.apache.poi.POIXMLTextExtractor;import org.apache.poi.hssf.usermodel.HSSFCell;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.openxml4j.exceptions.InvalidFormatException;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.ss.usermodel.Cell;import org.apache.poi.ss.usermodel.CellStyle;import org.apache.poi.ss.usermodel.Sheet;import org.apache.poi.ss.usermodel.Workbook;import org.apache.poi.ss.usermodel.WorkbookFactory;import org.apache.poi.xssf.usermodel.XSSFCell;import org.apache.poi.xssf.usermodel.XSSFRow;import org.apache.poi.xssf.usermodel.XSSFSheet;import org.apache.poi.xssf.usermodel.XSSFWorkbook;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.xmlbeans.XmlException;import com.lucene.bean.FileBean;public class FileUtil {/**读取文件信息和下属文件夹 * @param folder * @return * @throws IOException * @throws OpenXML4JException  * @throws XmlException  */public static List<FileBean> getFolderFiles(String folder) throws Exception {List<FileBean> fileBeans = new LinkedList<FileBean>();File file = new File(folder);if(file.isDirectory()){File[] files = file.listFiles();if(files != null){for (File file2 : files) {fileBeans.addAll(getFolderFiles(file2.getAbsolutePath()));}}}else{FileBean bean = new FileBean();String filePath = file.getAbsolutePath();bean.setPath(file.getAbsolutePath());bean.setModified(file.lastModified());String content = "";if(filePath.endsWith(".doc") || filePath.endsWith(".docx")){content = readDoc(file);}else if(filePath.endsWith(".xls") || filePath.endsWith(".xlsx")){content = readExcel(file);}else if(filePath.endsWith(".pdf")){content = readPdf(file);}else{content = new String(Files.readAllBytes(Paths.get(folder)));}bean.setContent(content);fileBeans.add(bean);}return fileBeans;}/**讀取excel文件 * @param file * @return * @throws IOException  * @throws InvalidFormatException  * @throws EncryptedDocumentException  */public static String readExcel(File file) throws Exception {String filePath = file.getAbsolutePath();StringBuffer content = new StringBuffer("");if(filePath.endsWith(".xls")){InputStream inp = new FileInputStream(filePath);    Workbook wb = WorkbookFactory.create(inp);       Sheet sheet = wb.getSheetAt(0);    for(int i = sheet.getFirstRowNum();i<= sheet.getPhysicalNumberOfRows();i++){      HSSFRow row = (HSSFRow) sheet.getRow(i);      if (row == null) {        continue;      }    for (int j = row.getFirstCellNum(); j <= row.getLastCellNum(); j++) {     if(j < 0){    continue;//增加下标判断    }    HSSFCell cell = row.getCell(j);      if (cell == null) {        continue;      }    content.append(cell.getStringCellValue());        }    }    wb.close();    inp.close();}else{XSSFWorkbook xwb = new XSSFWorkbook(file.getAbsolutePath());XSSFSheet sheet = xwb.getSheetAt(0);  // 定义 row、cell  XSSFRow row;  String cell;  // 循环输出表格中的内容  for (int i = sheet.getFirstRowNum(); i < sheet.getPhysicalNumberOfRows(); i++) {      row = sheet.getRow(i);      if(row == null){    continue;    }    for (int j = row.getFirstCellNum(); j < row.getPhysicalNumberOfCells(); j++) {          // 通过 row.getCell(j).toString() 获取单元格内容，    if(j<0){    continue;    }    XSSFCell xfcell = row.getCell(j);    if(xfcell == null){    continue;    }    xfcell.setCellType(Cell.CELL_TYPE_STRING);//数值型的转成文本型        cell = xfcell.getStringCellValue();        content.append(cell+" ");    }  }  }return content.toString();}/**讀取word內容 * @param file * @return * @throws IOException  * @throws OpenXML4JException  * @throws XmlException  */public static String readDoc(File file) throws IOException, XmlException, OpenXML4JException {String filePath = file.getAbsolutePath();if(filePath.endsWith(".doc")){InputStream is = new FileInputStream(file);WordExtractor ex = new WordExtractor(is);  String text2003 = ex.getText();  ex.close();is.close();return text2003;}else{OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);  POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);  String text2007 = extractor.getText();  extractor.close();return text2007;}}/**讀取pdf內容 * @param file * @return * @throws IOException */public static String readPdf(File file) throws IOException{PDDocument doc = PDDocument.load(file.getAbsolutePath());PDFTextStripper stripper = new PDFTextStripper();String content = stripper.getText(doc);doc.close();return content;}}

正则查询query构建

在原有lucene查询的工具类的基础上加入正则查询的构建

/**获取regexQuery对象 * @param field * @param regex * @return */public static Query getRegexExpQuery(String field,String regex){Query query = null;Term term = new Term(field, regex);query = new RegexpQuery(term);return query;}

最终的searchUtil的内容为

package com.lucene.search;import java.io.File;import java.io.IOException;import java.nio.file.Paths;import java.util.concurrent.ExecutorService;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.MultiReader;import org.apache.lucene.index.Term;import org.apache.lucene.search.BooleanClause.Occur;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.MatchAllDocsQuery;import org.apache.lucene.search.NumericRangeQuery;import org.apache.lucene.search.Query;import org.apache.lucene.search.RegexpQuery;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.Sort;import org.apache.lucene.search.SortField;import org.apache.lucene.search.SortField.Type;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.TopFieldCollector;import org.apache.lucene.store.FSDirectory;import org.wltea.analyzer.lucene.IKAnalyzer;import com.lucene.index.IndexUtil;public class SearchUtil {public static final Analyzer analyzer = new IKAnalyzer();/**获取IndexSearcher对象（适合单索引目录查询使用） * @param indexPath 索引目录 * @return * @throws IOException * @throws InterruptedException  */public static IndexSearcher getIndexSearcher(String indexPath,ExecutorService service,boolean realtime) throws IOException, InterruptedException{DirectoryReader reader = DirectoryReader.open(IndexUtil.getIndexWriter(indexPath, true), realtime);IndexSearcher searcher = new IndexSearcher(reader,service);if(service != null){service.shutdown();}return searcher;}/**多目录多线程查询 * @param parentPath 父级索引目录 * @param service 多线程查询 * @return * @throws IOException * @throws InterruptedException  */public static IndexSearcher getMultiSearcher(String parentPath,ExecutorService service,boolean realtime) throws IOException, InterruptedException{MultiReader multiReader;File file = new File(parentPath);File[] files = file.listFiles();IndexReader[] readers = new IndexReader[files.length];if(!realtime){for (int i = 0 ; i < files.length ; i ++) {readers[i] = DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath(), new String[0])));}}else{for (int i = 0 ; i < files.length ; i ++) {readers[i] = DirectoryReader.open(IndexUtil.getIndexWriter(files[i].getPath(), true), true);}}multiReader = new MultiReader(readers);IndexSearcher searcher = new IndexSearcher(multiReader,service);if(service != null){service.shutdown();}return searcher;}/**从指定配置项中查询 * @return * @param analyzer 分词器 * @param field 字段 * @param fieldType字段类型 * @param queryStr 查询条件 * @param range 是否区间查询 * @return */public static Query getQuery(String field,String fieldType,String queryStr,boolean range){Query q = null;if(queryStr != null && !"".equals(queryStr)){if(range){String[] strs = queryStr.split("\\|");if("int".equals(fieldType)){int min = new Integer(strs[0]);int max = new Integer(strs[1]);q = NumericRangeQuery.newIntRange(field, min, max, true, true);}else if("double".equals(fieldType)){Double min = new Double(strs[0]);Double max = new Double(strs[1]);q = NumericRangeQuery.newDoubleRange(field, min, max, true, true);}else if("float".equals(fieldType)){Float min = new Float(strs[0]);Float max = new Float(strs[1]);q = NumericRangeQuery.newFloatRange(field, min, max, true, true);}else if("long".equals(fieldType)){Long min = new Long(strs[0]);Long max = new Long(strs[1]);q = NumericRangeQuery.newLongRange(field, min, max, true, true);}}else{if("int".equals(fieldType)){q = NumericRangeQuery.newIntRange(field, new Integer(queryStr), new Integer(queryStr), true, true);}else if("double".equals(fieldType)){q = NumericRangeQuery.newDoubleRange(field, new Double(queryStr), new Double(queryStr), true, true);}else if("float".equals(fieldType)){q = NumericRangeQuery.newFloatRange(field, new Float(queryStr), new Float(queryStr), true, true);}else{Term term = new Term(field, queryStr);q = new TermQuery(term);}}}else{q= new MatchAllDocsQuery();}System.out.println(q);return q;}/**多条件查询类似于sql in * @param querys * @return */public static Query getMultiQueryLikeSqlIn(Query ... querys){BooleanQuery query = new BooleanQuery();for (Query subQuery : querys) {query.add(subQuery,Occur.SHOULD);}return query;}/**获取regexQuery对象 * @param field * @param regex * @return */public static Query getRegexExpQuery(String field,String regex){Query query = null;Term term = new Term(field, regex);query = new RegexpQuery(term);return query;}/**多条件查询类似于sql and * @param querys * @return */public static Query getMultiQueryLikeSqlAnd(Query ... querys){BooleanQuery query = new BooleanQuery();for (Query subQuery : querys) {query.add(subQuery,Occur.MUST);}return query;}/**对多个条件进行排序构建排序条件 * @param fields * @param type * @param reverses * @return */public static Sort getSortInfo(String[] fields,Type[] types,boolean[] reverses){SortField[] sortFields = null;int fieldLength = fields.length;int typeLength = types.length;int reverLength = reverses.length;if(!(fieldLength == typeLength) || !(fieldLength == reverLength)){return null;}else{sortFields = new SortField[fields.length];for (int i = 0; i < fields.length; i++) {sortFields[i] = new SortField(fields[i], types[i], reverses[i]);}}return new Sort(sortFields);}/**根据查询器、查询条件、每页数、排序条件进行查询 * @param query 查询条件 * @param first 起始值 * @param max 最大值 * @param sort 排序条件 * @return */public static TopDocs getScoreDocsByPerPageAndSortField(IndexSearcher searcher,Query query, int first,int max, Sort sort){try {if(query == null){System.out.println(" Query is null return null ");return null;}TopFieldCollector collector = null;if(sort != null){collector = TopFieldCollector.create(sort, first+max, false, false, false);}else{sort = new Sort(new SortField[]{new SortField("modified", SortField.Type.LONG)});collector = TopFieldCollector.create(sort, first+max, false, false, false);}searcher.search(query, collector);return collector.topDocs(first, max);} catch (IOException e) {// TODO Auto-generated catch block}return null;}/**获取上次索引的id,增量更新使用 * @return */public static Integer getLastIndexBeanID(IndexReader multiReader){Query query = new MatchAllDocsQuery();IndexSearcher searcher = null;searcher = new IndexSearcher(multiReader);SortField sortField = new SortField("id", SortField.Type.INT,true);Sort sort = new Sort(new SortField[]{sortField});TopDocs docs = getScoreDocsByPerPageAndSortField(searcher,query, 0, 1, sort);ScoreDoc[] scoreDocs = docs.scoreDocs;int total = scoreDocs.length;if(total > 0){ScoreDoc scoreDoc = scoreDocs[0];Document doc = null;try {doc = searcher.doc(scoreDoc.doc);} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}return new Integer(doc.get("id"));}return 0;}}

正则查询测试

正则查询测试类，主要是测试是否包含手机号或邮箱号，这里的手机号验证有点粗糙，希望不要介意

package com.lucene.index.test;import java.io.IOException;import java.util.concurrent.Executors;import org.apache.lucene.document.Document;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import com.lucene.search.SearchUtil;public class TestSearch {public static void main(String[] args) {try {IndexSearcher searcher = SearchUtil.getMultiSearcher("index", Executors.newCachedThreadPool(), false);Query phoneQuery = SearchUtil.getRegexExpQuery("content", "1[0-9]{10}");Query mailQuery = SearchUtil.getRegexExpQuery("content", "([a-z0-9A-Z]+[-_|\\.]?)+[a-z0-9A-Z]*@([a-z0-9A-Z]+(-[a-z0-9A-Z]+)?\\.)+[a-zA-Z]{2,}");Query finaQuery = SearchUtil.getMultiQueryLikeSqlIn(new Query[]{phoneQuery,mailQuery}); TopDocs topDocs = SearchUtil.getScoreDocsByPerPageAndSortField(searcher, finaQuery, 0, 20, null);System.out.println("符合条件的数据总数："+topDocs.totalHits);System.out.println("本次查询到的数目为："+topDocs.scoreDocs.length);ScoreDoc[] scoreDocs = topDocs.scoreDocs;for (ScoreDoc scoreDoc : scoreDocs) {Document doc = searcher.doc(scoreDoc.doc);System.out.println(doc.get("path")+"    "+doc.get("content"));}} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (InterruptedException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}

最终测试结果如下：

content:/1[0-9]{10}/content:/([a-z0-9A-Z]+[-_|\.]?)+[a-z0-9A-Z]*@([a-z0-9A-Z]+(-[a-z0-9A-Z]+)?\.)+[a-zA-Z]{2,}/符合条件的数据总数：6本次查询到的数目为：6D:\hadoop\lucene_regexSearch\testDir\2.txt.txt    电话号码：18519237811D:\hadoop\lucene_regexSearch\testDir\3.txt.txt    电子邮箱yinggui_Wu@163.comD:\hadoop\lucene_regexSearch\testDir\1.docx    邮箱内容yinggui_Wu@163.comD:\hadoop\lucene_regexSearch\testDir\1.pdf    邮箱内容 yinggui_Wu@163.com D:\hadoop\lucene_regexSearch\testDir\1.xlsx    1 2 3 18510539956 D:\hadoop\lucene_regexSearch\testDir\1.txt.txt    <a target=_blank href="mailto:fanyi@qq.com">fanyi@qq.com</a>

代码下载地址

http://download.csdn.net/detail/wuyinggui10000/8746407

一步一步跟我学习lucene是对近期做lucene索引的总结，大家有问题的话联系本人的Q-Q: 891922381，同时本人新建Q-Q群：106570134（lucene,solr,netty,hadoop），如蒙加入，不胜感激，大家共同探讨,本人争取每日一博，希望大家持续关注，会带给大家惊喜的

3 1