Lucene索引的增删改查和二次检索
来源:互联网 发布:nodejs java 分离设计 编辑:程序博客网 时间:2024/06/07 09:16
本文分享本人使用 Lucene 时编写的工具类,包含索引的增删改查、对检索结果的多次子检索(二次检索)以及检索结果高亮显示等功能。
//本示例取数据的bean (在此用的是档案文件bean) 自行从数据库封装数据public class DocumentBean { private String id; //数据id private String archiveTypeId; //档案类型id private String archiveId; //档案id private String itemNo; //文件号 private String titleProper; //标题 private String fileSize; //文件大小 private String fileFormat; //文件格式 private String path; //文件路径 private String officeArchiveCode; //文号 private String docType; //文件类型 private String browseCount; //浏览次数 private String downloadCount; //下载次数 private String fileLastDate; //最后更改日期 private String archiveTypeName; //档案类型名称 private String uploadUserName; private String uploadDeptName; private String content; private String realName; //该文件存入数据库的名称 private String belongArch;//所属档案 /**省略getter setter**/}//lucene用到静态变量类public class GlobalBean { // lucene 配置 public static final String LUCENE_INDEX_PATH = "/index";// 索引存放路径 public static final String LUCENE_FILE_PATH = "/uploadFile"; // 添加到索引的文件路径 public static final String LUCENE_FIELD_ID = "id"; // 索引字段id public static final String LUCENE_FIELD_TITLE_PROPER = "titleProper"; // 索引字段标题 public static final String LUCENE_FIELD_UPLOAD_USER = "uploadUser"; // 索引字段上传用户 public static final String LUCENE_FIELD_FILE_FORMAT = "fileFormat"; // 索引字段文件类型 public static final String LUCENE_FIELD_ARCHIVE_TYPE = "archiveType"; // 索引字段档案类型 public static final String LUCENE_FIELD_BELONG_FILE = "belongFile"; // 索引字段所属文件 public static final String LUCENE_FIELD_DOC_TYPE = "docType"; // 文件mine类型 public static final String LUCENE_FIELD_PATH = "path"; // 文件路径 public static final String LUCENE_FIELD_CONTENT = "content"; // 索引字段档案内容 public static final String LUCENE_FIELD_TYPE_ID = "typeId"; // 索引字段档案类型id // 检索时参与检索的索引字段 public static final String LUCENE_INDEX_FIELDS = LUCENE_FIELD_TITLE_PROPER + " " + LUCENE_FIELD_UPLOAD_USER + " " + LUCENE_FIELD_ARCHIVE_TYPE + " " + LUCENE_FIELD_CONTENT; // 文件上传服务器真实路径 public static String LUCENE_FACT_FILE_PATH = ""; // 添加到索引的文件路径 (你们填写上自己的真实路径,我这边是服务器启动初始化过了) // 创建索引服务器真实路径 public static String 
LUCENE_FACT_INDEX_PATH = ""; // 添加到索引的文件路径(你们填写上自己的真实路径,我这边是服务器启动初始化过了) // 高亮显示上下文 字段显示字数 public static int LUCENE_HIGHLIGHT_CONTEXT_COUNT = 80; }//lucene操作工具类public class LuceneUtil { private static IndexWriter writer = null; private static IndexSearcher searcher = null; private static IndexReader reader = null; private static final IKAnalyzer analyzer = new IKAnalyzer(); /** * 创建索引 * * @param fileMap * 存放数据库查询的上传文件 信息 的 名称(key) 文件bean对象(value) 集合 * @throws SystemGlobalException * 系统全局异常 */ public static void createIndex(Map<String, DocumentBean> fileMap) throws SystemGlobalException { String unDataBaseFileMsg = ""; // 数据库中不存在的文件提示信息 String unExistsFileMsg = ""; // 数据库存在文件实体丢失的提示信息 List<String> entiryList = null; // 存放实体列表 try { File[] files = getFiles(); entiryList = new ArrayList<String>(); for (File file : files) { if (!file.canRead()) throw new SystemGlobalException("文件" + file.getName() + "不可读!"); // 当文件不是数据库存在时.不创建该文件索引 DocumentBean dataBaseDoc = fileMap.get(file.getName()); if (CommonUtil.isEmpty(dataBaseDoc)) { unDataBaseFileMsg += "[" + file.getName() + "],"; continue; } entiryList.add(file.getName()); if (file.isFile()) { saveOrUpdateDocument(dataBaseDoc, file); } } for (Iterator<String> file = fileMap.keySet().iterator(); file .hasNext();) { String dataFileName = file.next(); // 说明数据库中存在,但文件丢失 if (!entiryList.contains(dataFileName)) { unExistsFileMsg += "[" + fileMap.get(dataFileName).getTitleProper() + "],"; } } if (!CommonUtil.isEmpty(unDataBaseFileMsg)) { unDataBaseFileMsg = "创建索引完成!其中" + StringUtil.subStr(unDataBaseFileMsg, 1) + "创建失败!原因:文件不在数据库中存在!"; } if (!CommonUtil.isEmpty(unExistsFileMsg)) { unExistsFileMsg = "创建索引完成!其中" + StringUtil.subStr(unExistsFileMsg, 1) + "创建失败!原因:本地文件丢失!"; } getWriter().commit(); // getWriter().close(); if (!CommonUtil.isEmpty(unDataBaseFileMsg) || !CommonUtil.isEmpty(unExistsFileMsg)) { throw new SystemGlobalException(unDataBaseFileMsg + "\n" + unExistsFileMsg); } } catch (IOException e) { throw new 
SystemGlobalException("文件读写异常!"); } } /** * 删除所有索引 * * @param indexWriter * 索引操作对象 * @throws SystemGlobalException */ public static void deleteAllIndex() throws SystemGlobalException { try { getWriter().deleteAll(); } catch (IOException e) { throw new SystemGlobalException("删除索引异常"); } } /** * 重建索引库 先 删除所有 ,后添加所有 * * @param indexWriter * 索引操作对象 * @param useSmart * 是否使用智能切分 false 为最细密度切分 */ public static void rebuildIndex(Map<String, DocumentBean> fileMap) throws SystemGlobalException { try { deleteAllIndex(); createIndex(fileMap); System.out.println("-----重建索引库成功!"); } catch (SystemGlobalException e) { throw new SystemGlobalException("重建索引库异常!"); } } /** * 更新索引库所有文件更改过的索引 * * @param beanMap * 数据库 电子全文对象map集合 */ public static Map<String, File> updateBatchIndex( Map<String, DocumentBean> beanMap) throws SystemGlobalException { Map<String, File> modifiedMap = new HashMap<String, File>(); // 获取配置目录下的文件 File[] files = getFiles(); // 判断文件是否修改过 for (File file : files) { Date d = new Date(file.lastModified()); String lastDate = DateUtil.convertDateToStr(d, DateUtil.YYYYMMDDHHMMSS); DocumentBean bean = beanMap.get(file.getName()); if (bean != null) { if (!CommonUtil.isEmpty(lastDate) && !lastDate.equals(bean.getFileLastDate())) { // 文件修改过.更新索引将电子全文id添加至list集合 返回批量数据库修改 // saveOrUpdateDocument(indexWriter, bean, file); modifiedMap.put(bean.getId(), file); } } } return modifiedMap; } /** * 保存或更新索引档案信息 * * @param dataBaseDoc * 电子全文对象 * @param file * 电子全文文件 */ public static void saveOrUpdateDocument(DocumentBean dataBaseDoc, File file) { System.out .println("正在创建文件[" + dataBaseDoc.getTitleProper() + "]的索引..."); try { Document doc = new Document(); Tika tika = new Tika(); // id 用于标识唯一 记录 doc.add(new TextField(GlobalBean.LUCENE_FIELD_ID, CommonUtil .objToStr(dataBaseDoc.getId()), Store.YES)); // 存放文件标题 doc.add(new TextField(GlobalBean.LUCENE_FIELD_TITLE_PROPER, CommonUtil.objToStr(dataBaseDoc.getTitleProper()), Store.YES)); // 存放上传人 doc.add(new 
TextField(GlobalBean.LUCENE_FIELD_UPLOAD_USER, CommonUtil.objToStr(dataBaseDoc.getUploadUserName()), Store.YES)); // 存放电子档案归类名 doc.add(new TextField(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE, CommonUtil.objToStr(dataBaseDoc.getArchiveTypeName()), Store.YES)); // 将文件中所有信息存放入文档对象content中 String formatter = CommonUtil.objToStr(dataBaseDoc.getFileFormat()); String content = ""; if (dataBaseDoc.getDocType().indexOf("image/") != -1) {// if (formatter.length() > 0){// content = OCRUtil.recognizeText(// new File(GlobalBean.LUCENE_FACT_FILE_PATH+dataBaseDoc.getPath().replace("uploadFile", "")),// formatter.substring(1)).trim();// if(content == null) content = "";// } } else { content = tika.parseToString(file).trim(); } doc.add(new TextField(GlobalBean.LUCENE_FIELD_CONTENT, content, Store.YES)); doc.add(new TextField(GlobalBean.LUCENE_FIELD_FILE_FORMAT, formatter, Store.YES)); doc.add(new TextField(GlobalBean.LUCENE_FIELD_BELONG_FILE, CommonUtil.objToStr(dataBaseDoc.getBelongArch()), Store.YES)); doc.add(new TextField(GlobalBean.LUCENE_FIELD_DOC_TYPE, CommonUtil .objToStr(dataBaseDoc.getDocType()), Store.YES)); doc.add(new TextField(GlobalBean.LUCENE_FIELD_PATH, CommonUtil .objToStr(dataBaseDoc.getPath()), Store.YES)); doc.add(new TextField(GlobalBean.LUCENE_FIELD_TYPE_ID, CommonUtil .objToStr(dataBaseDoc.getArchiveTypeId()), Store.YES)); // 添加该档案对象到索引中 如果未存在添加,存在则修改为最新的 getWriter().updateDocument( new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId()), doc); System.out.println("文件[" + dataBaseDoc.getTitleProper() + "]的索引创建完成!"); } catch (Exception e) { System.out.println(e.getMessage() + "\n不能添加空索引值!"); } } /** * 保存索引 * * @param bean * @param file */ public static void saveOrUpdateIndex(DocumentBean bean, File file) { try { saveOrUpdateDocument(bean, file); getWriter().commit(); } catch (Exception e) { System.out.println("文件[" + bean.getTitleProper() + "]的索引创建出错!错误原因:" + e.getMessage()); } } /** * 获取配置目录下的文件 * * @return */ public static File[] getFiles() { // 上传的文件目录路径 File 
fileDir = new File(GlobalBean.LUCENE_FACT_FILE_PATH); return fileDir.listFiles(); } /** * 获取指定目录下的文件 * * @return */ public static File[] getFiles(String path) { // 上传的文件目录路径 File fileDir = new File(path); return fileDir.listFiles(); } /** * 查询 * * @param queryStr */ public static List<DocumentBean> search(String queryStr) { System.out.println("正在检索关键字为[" + queryStr + "]的索引文件!"); Map<String, DocumentBean> map = new HashMap<String, DocumentBean>(); List<String> sortList = new ArrayList<String>(); List<DocumentBean> sortedList = new ArrayList<DocumentBean>(); try { long start = System.currentTimeMillis(); if (!CommonUtil.isEmpty(queryStr)) { String[] fields = GlobalBean.LUCENE_INDEX_FIELDS.split(" "); BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length]; for (int index = 0; index < fields.length; index++) { flags[index] = BooleanClause.Occur.SHOULD; } // 多字段查询 Query query = MultiFieldQueryParser.parse(queryStr, fields, flags, analyzer); QueryWrapperFilter filter = new QueryWrapperFilter(query); search(query, filter, map, sortList); } long end = System.currentTimeMillis(); System.out.println("检索结束,耗时:" + (end - start) + "ms"); for (String id : sortList) { sortedList.add(map.get(id)); } } catch (Exception e) { e.printStackTrace(); } return sortedList; } /** * 高级结果查询 不定长 */ public static List<DocumentBean> seniorSearchInResult(List<String> params) { List<DocumentBean> list = null; List<DocumentBean> tempList = new ArrayList<DocumentBean>(); List<String> sortList = new ArrayList<String>(); List<String> tempSortList = new ArrayList<String>(); for (int index = 0; index < params.size() - 1; index++) { list = new ArrayList<DocumentBean>(); tempList = searchInResult(params.get(index), params.get(index + 1), tempSortList); if (index == 0) { sortList.addAll(tempSortList); } if (!CommonUtil.isEmpty(sortList)) { for (String id : tempSortList) { int subIndex = tempSortList.size() - tempList.size(); if (!sortList.contains(id)) tempList.remove(tempSortList.indexOf(id) 
- subIndex); } } if (sortList.size() > 0) { list.addAll(tempList); } } return list; } /** * @desc 二次搜索 在上次搜索的结果缓存的基础上进行再次检索 * @param newQueryString * @param oldQueryString */ public static List<DocumentBean> searchInResult(String oldStr, String newStr, List<String> sortList) { sortList.clear(); System.out .println("正在检索关键字[" + oldStr + "]结果中关键字为[" + newStr + "]的文件!"); Map<String, DocumentBean> map = new HashMap<String, DocumentBean>(); List<DocumentBean> sortedList = new ArrayList<DocumentBean>(); try { long start = System.currentTimeMillis(); String[] fields = GlobalBean.LUCENE_INDEX_FIELDS.split(" "); BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length]; for (int index = 0; index < fields.length; index++) { flags[index] = BooleanClause.Occur.SHOULD; } // 多字段查询 Query query = MultiFieldQueryParser.parse(newStr.trim(), fields, flags, analyzer); Query oldQuery = MultiFieldQueryParser.parse(oldStr.trim(), fields, flags, analyzer); QueryWrapperFilter oldFilter = new QueryWrapperFilter(oldQuery); CachingWrapperFilter filter = new CachingWrapperFilter(oldFilter); search(query, filter, map, sortList); // for (String field : GlobalBean.LUCENE_INDEX_FIELDS.split(" ")) { // QueryParser queryParser = new QueryParser(field, analyzer); // Query query = queryParser.parse(newStr.trim()); // Query oldQuery = queryParser.parse(oldStr.trim()); // // 查询包装过滤器 // QueryWrapperFilter oldFilter = new QueryWrapperFilter(oldQuery); // CachingWrapperFilter filter = new CachingWrapperFilter( // oldFilter); // search(query, filter, map, sortList); // } long end = System.currentTimeMillis(); System.out.println("检索结束,耗时:" + (end - start) + "ms"); for (String id : sortList) { sortedList.add(map.get(id)); } } catch (Exception e) { } return sortedList; } /** * 全文检索 * * @param query * @param filter * @return */ private static void search(Query query, Filter filter, Map<String, DocumentBean> map, List<String> sortList) { try { TopDocs topDocs = getSearcher().search(query, filter, 
1000000); ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (scoreDocs != null && scoreDocs.length != 0) { DocumentBean bean = null; for (int i = 0; i < scoreDocs.length; i++) { bean = new DocumentBean(); Document document = getSearcher().doc(scoreDocs[i].doc); String id = document.get(GlobalBean.LUCENE_FIELD_ID); if (!map.containsKey(id)) { String titleProper = document .get(GlobalBean.LUCENE_FIELD_TITLE_PROPER); String uploadUser = document .get(GlobalBean.LUCENE_FIELD_UPLOAD_USER); String archiveType = document .get(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE); String content = document .get(GlobalBean.LUCENE_FIELD_CONTENT); String fileFormat = document .get(GlobalBean.LUCENE_FIELD_FILE_FORMAT); String belongFile = document .get(GlobalBean.LUCENE_FIELD_BELONG_FILE); String docType = document .get(GlobalBean.LUCENE_FIELD_DOC_TYPE); String path = document .get(GlobalBean.LUCENE_FIELD_PATH); String archTypeId = document .get(GlobalBean.LUCENE_FIELD_TYPE_ID); String tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_ID)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_ID, id, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setId(CommonUtil.isEmpty(tempStr) ? id : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_TITLE_PROPER)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_TITLE_PROPER, titleProper, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setTitleProper(CommonUtil.isEmpty(tempStr) ? titleProper : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_UPLOAD_USER)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_UPLOAD_USER, uploadUser, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setUploadUserName(CommonUtil.isEmpty(tempStr) ? 
uploadUser : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE, archiveType, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setArchiveTypeName(CommonUtil.isEmpty(tempStr) ? archiveType : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_CONTENT)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_CONTENT, content, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setContent(CommonUtil.isEmpty(tempStr) ? content : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_FILE_FORMAT)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_FILE_FORMAT, fileFormat, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setFileFormat(CommonUtil.isEmpty(tempStr) ? fileFormat : tempStr); tempStr = ""; if (GlobalBean.LUCENE_INDEX_FIELDS .contains(GlobalBean.LUCENE_FIELD_BELONG_FILE)) { tempStr = toHighlight(query, analyzer, GlobalBean.LUCENE_FIELD_BELONG_FILE, belongFile, GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT); } bean.setBelongArch(CommonUtil.isEmpty(tempStr) ? 
belongFile : tempStr); bean.setDocType(docType); bean.setPath(path); bean.setArchiveTypeId(archTypeId); map.put(id, bean); sortList.add(id);// 用于排序 } } // reader.close(); } } catch (Exception e) { e.printStackTrace(); } } /** * 获取操作对象 * * @return * @throws SystemGlobalException */ public static IndexWriter getWriter() throws SystemGlobalException { if (writer == null) { // 存放索引路径 IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_4_10_3, analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); File fileindex = new File(GlobalBean.LUCENE_FACT_INDEX_PATH); FSDirectory directory = null; try { directory = FSDirectory.open(fileindex); writer = new IndexWriter(directory, config); } catch (IOException e) { throw new SystemGlobalException("索引操作对象创建失败!"); } } return writer; } /** * 获取检索对象 * * @return * @throws SystemGlobalException */ public static IndexSearcher getSearcher() throws SystemGlobalException { // 存放索引路径 File indexDir = new File(GlobalBean.LUCENE_FACT_INDEX_PATH); try { reader = DirectoryReader.open(FSDirectory.open(indexDir)); searcher = new IndexSearcher(reader); } catch (IOException e) { throw new SystemGlobalException("索引搜索对象创建失败!"); } return searcher; } public static void deleteFileIndex(DocumentBean dataBaseDoc) { System.out .println("正在删除文件[" + dataBaseDoc.getTitleProper() + "]的索引..."); try { getWriter().deleteDocuments( new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId())); System.out.println("文件[" + dataBaseDoc.getTitleProper() + "]的索引删除完成!"); getWriter().commit(); } catch (Exception e) { } } /** * 查询结果高亮 * * @param query * @param analyzer * @param fieldName * @param text * @param length * @return * @throws Exception */ private static String toHighlight(Query query, Analyzer analyzer, String fieldName, String text, int length) throws Exception { Highlighter highLighter = new Highlighter(new SimpleHTMLFormatter( "<font color='red'>", "</font>"), new QueryScorer(query)); Fragmenter fragmenter = new 
SimpleFragmenter(length); highLighter.setTextFragmenter(fragmenter); return highLighter.getBestFragment(analyzer, fieldName, text); } /** * 二次转码 * * @param str * @return */ public static String secondParseCode(String str) { if (!CommonUtil.isEmpty(str)) { try { return new String(str.getBytes("gbk"), "utf-8"); } catch (UnsupportedEncodingException e) { } } return str; }}
如有疑问,欢迎留言
0 0
- Lucene索引的增删改查和二次检索
- lucene索引的增删改查
- 02-lucene索引的增删改查
- lucene索引的增删改查/lucene索引维护
- 全文检索之lucene的优化篇--增删改查
- 全文检索之lucene的优化篇--增删改查
- lucene索引库的增删改查操作
- lucene(二) 索引的创建、增删改查
- Lucene实现索引数据的增删改查
- Lucene之索引的增删改查-yellowcong
- Lucene使用(二)索引的增删改查
- lucene 的 增删改查
- Lucene的增删查改
- lucene的增删改查
- Lucene全文检索 之创建索引、增删改查(纯代码)
- 搜索学习入门--Lucene初体验(Lucene索引的增删改查)
- lucene增删改查
- 【手把手教你全文检索】Lucene索引的【增、删、改、查】
- Flex与servlet交互2(数据的解析Json)
- cocos2d-x游戏实例(2)-主角根据输入移动
- 《破坏之王—DDoS攻击与防范深度剖析》
- cocos2d-x游戏实例(3)-获得地图索引
- OpenFlow协议之殇?
- Lucene索引的增删改查和二次检索
- cocos2d-x游戏实例(4)-地图碰撞
- 关于尾递归
- Android最佳性能实践(
- Java中native关键字
- cocos2d中文显示问题的解决方法
- Flex与servlet数据交互3(xml解析)
- // 插入排序 源码
- leetcode-237-Delete Node in a Linked List