Lucene索引的增删改查和二次检索

来源:互联网 发布:nodejs java 分离设计 编辑:程序博客网 时间:2024/06/07 09:16

本博客是本人使用lucene时写的工具类。包含对索引的增删改查,以及对检索结果的多次子检索,检索结果高亮显示等。

//本示例取数据的bean (在此用的是档案文件bean)  自行从数据库封装数据public class DocumentBean {    private String id;                   //数据id    private String archiveTypeId;        //档案类型id    private String archiveId;            //档案id    private String itemNo;               //文件号    private String titleProper;          //标题    private String fileSize;             //文件大小    private String fileFormat;           //文件格式    private String path;                 //文件路径    private String officeArchiveCode;    //文号    private String docType;              //文件类型    private String browseCount;          //浏览次数    private String downloadCount;        //下载次数    private String fileLastDate;         //最后更改日期    private String archiveTypeName;      //档案类型名称    private String uploadUserName;    private String uploadDeptName;    private String content;    private String realName;  //该文件存入数据库的名称    private String belongArch;//所属档案    /**省略getter setter**/}//lucene用到静态变量类public class GlobalBean {    // lucene 配置    public static final String LUCENE_INDEX_PATH = "/index";// 索引存放路径    public static final String LUCENE_FILE_PATH = "/uploadFile"; // 添加到索引的文件路径    public static final String LUCENE_FIELD_ID = "id"; // 索引字段id    public static final String LUCENE_FIELD_TITLE_PROPER = "titleProper"; // 索引字段标题    public static final String LUCENE_FIELD_UPLOAD_USER = "uploadUser"; // 索引字段上传用户    public static final String LUCENE_FIELD_FILE_FORMAT = "fileFormat"; // 索引字段文件类型    public static final String LUCENE_FIELD_ARCHIVE_TYPE = "archiveType"; // 索引字段档案类型    public static final String LUCENE_FIELD_BELONG_FILE = "belongFile"; // 索引字段所属文件    public static final String LUCENE_FIELD_DOC_TYPE = "docType"; // 文件mine类型    public static final String LUCENE_FIELD_PATH = "path"; // 文件路径    public static final String LUCENE_FIELD_CONTENT = "content"; // 索引字段档案内容    public static final String LUCENE_FIELD_TYPE_ID = "typeId"; // 索引字段档案类型id    // 检索时参与检索的索引字段    public static final String LUCENE_INDEX_FIELDS = LUCENE_FIELD_TITLE_PROPER            + " " + LUCENE_FIELD_UPLOAD_USER + " " + LUCENE_FIELD_ARCHIVE_TYPE            + " " + LUCENE_FIELD_CONTENT;    // 文件上传服务器真实路径    public static String LUCENE_FACT_FILE_PATH = ""; // 添加到索引的文件路径 (你们填写上自己的真实路径,我这边是服务器启动初始化过了)    // 创建索引服务器真实路径    public static String LUCENE_FACT_INDEX_PATH = ""; // 添加到索引的文件路径(你们填写上自己的真实路径,我这边是服务器启动初始化过了)    // 高亮显示上下文 字段显示字数    public static int LUCENE_HIGHLIGHT_CONTEXT_COUNT = 80;  }//lucene操作工具类public class LuceneUtil {    private static IndexWriter writer = null;    private static IndexSearcher searcher = null;    private static IndexReader reader = null;    private static final IKAnalyzer analyzer = new IKAnalyzer();    /**     * 创建索引     *      * @param fileMap     *            存放数据库查询的上传文件 信息 的 名称(key) 文件bean对象(value) 集合     * @throws SystemGlobalException     *             系统全局异常     */    public static void createIndex(Map<String, DocumentBean> fileMap)            throws SystemGlobalException {        String unDataBaseFileMsg = ""; // 数据库中不存在的文件提示信息        String unExistsFileMsg = ""; // 数据库存在文件实体丢失的提示信息        List<String> entiryList = null; // 存放实体列表        try {            File[] files = getFiles();            entiryList = new ArrayList<String>();            for (File file : files) {                if (!file.canRead())                    throw new SystemGlobalException("文件" + file.getName()                            + "不可读!");                // 当文件不是数据库存在时.不创建该文件索引                DocumentBean dataBaseDoc = fileMap.get(file.getName());                if (CommonUtil.isEmpty(dataBaseDoc)) {                    unDataBaseFileMsg += "[" + file.getName() + "],";                    continue;                }                entiryList.add(file.getName());                if (file.isFile()) {                    saveOrUpdateDocument(dataBaseDoc, file);                }            }            for (Iterator<String> file = fileMap.keySet().iterator(); file                    .hasNext();) {                String dataFileName = file.next();                // 说明数据库中存在,但文件丢失                if (!entiryList.contains(dataFileName)) {                    unExistsFileMsg += "["                            + fileMap.get(dataFileName).getTitleProper() + "],";                }            }            if (!CommonUtil.isEmpty(unDataBaseFileMsg)) {                unDataBaseFileMsg = "创建索引完成!其中"                        + StringUtil.subStr(unDataBaseFileMsg, 1)                        + "创建失败!原因:文件不在数据库中存在!";            }            if (!CommonUtil.isEmpty(unExistsFileMsg)) {                unExistsFileMsg = "创建索引完成!其中"                        + StringUtil.subStr(unExistsFileMsg, 1)                        + "创建失败!原因:本地文件丢失!";            }            getWriter().commit();            // getWriter().close();            if (!CommonUtil.isEmpty(unDataBaseFileMsg)                    || !CommonUtil.isEmpty(unExistsFileMsg)) {                throw new SystemGlobalException(unDataBaseFileMsg + "\n"                        + unExistsFileMsg);            }        } catch (IOException e) {            throw new SystemGlobalException("文件读写异常!");        }    }    /**     * 删除所有索引     *      * @param indexWriter     *            索引操作对象     * @throws SystemGlobalException     */    public static void deleteAllIndex() throws SystemGlobalException {        try {            getWriter().deleteAll();        } catch (IOException e) {            throw new SystemGlobalException("删除索引异常");        }    }    /**     * 重建索引库 先 删除所有 ,后添加所有     *      * @param indexWriter     *            索引操作对象     * @param useSmart     *            是否使用智能切分 false 为最细密度切分     */    public static void rebuildIndex(Map<String, DocumentBean> fileMap)            throws SystemGlobalException {        try {            deleteAllIndex();            createIndex(fileMap);            System.out.println("-----重建索引库成功!");        } catch (SystemGlobalException e) {            throw new SystemGlobalException("重建索引库异常!");        }    }    /**     * 更新索引库所有文件更改过的索引     *      * @param beanMap     *            数据库 电子全文对象map集合     */    public static Map<String, File> updateBatchIndex(            Map<String, DocumentBean> beanMap) throws SystemGlobalException {        Map<String, File> modifiedMap = new HashMap<String, File>();        // 获取配置目录下的文件        File[] files = getFiles();        // 判断文件是否修改过        for (File file : files) {            Date d = new Date(file.lastModified());            String lastDate = DateUtil.convertDateToStr(d,                    DateUtil.YYYYMMDDHHMMSS);            DocumentBean bean = beanMap.get(file.getName());            if (bean != null) {                if (!CommonUtil.isEmpty(lastDate)                        && !lastDate.equals(bean.getFileLastDate())) {                    // 文件修改过.更新索引将电子全文id添加至list集合 返回批量数据库修改                    // saveOrUpdateDocument(indexWriter, bean, file);                    modifiedMap.put(bean.getId(), file);                }            }        }        return modifiedMap;    }    /**     * 保存或更新索引档案信息     *      * @param dataBaseDoc     *            电子全文对象     * @param file     *            电子全文文件     */    public static void saveOrUpdateDocument(DocumentBean dataBaseDoc, File file) {        System.out                .println("正在创建文件[" + dataBaseDoc.getTitleProper() + "]的索引...");        try {            Document doc = new Document();            Tika tika = new Tika();            // id 用于标识唯一 记录            doc.add(new TextField(GlobalBean.LUCENE_FIELD_ID, CommonUtil                    .objToStr(dataBaseDoc.getId()), Store.YES));            // 存放文件标题            doc.add(new TextField(GlobalBean.LUCENE_FIELD_TITLE_PROPER,                    CommonUtil.objToStr(dataBaseDoc.getTitleProper()),                    Store.YES));            // 存放上传人            doc.add(new TextField(GlobalBean.LUCENE_FIELD_UPLOAD_USER,                    CommonUtil.objToStr(dataBaseDoc.getUploadUserName()),                    Store.YES));            // 存放电子档案归类名            doc.add(new TextField(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE,                    CommonUtil.objToStr(dataBaseDoc.getArchiveTypeName()),                    Store.YES));            // 将文件中所有信息存放入文档对象content中            String formatter = CommonUtil.objToStr(dataBaseDoc.getFileFormat());            String content = "";            if (dataBaseDoc.getDocType().indexOf("image/") != -1) {//              if (formatter.length() > 0){//                  content = OCRUtil.recognizeText(//                          new File(GlobalBean.LUCENE_FACT_FILE_PATH+dataBaseDoc.getPath().replace("uploadFile", "")),//                          formatter.substring(1)).trim();//                  if(content == null) content = "";//              }            } else {                content = tika.parseToString(file).trim();            }            doc.add(new TextField(GlobalBean.LUCENE_FIELD_CONTENT, content,                    Store.YES));            doc.add(new TextField(GlobalBean.LUCENE_FIELD_FILE_FORMAT,                    formatter, Store.YES));            doc.add(new TextField(GlobalBean.LUCENE_FIELD_BELONG_FILE,                    CommonUtil.objToStr(dataBaseDoc.getBelongArch()), Store.YES));            doc.add(new TextField(GlobalBean.LUCENE_FIELD_DOC_TYPE, CommonUtil                    .objToStr(dataBaseDoc.getDocType()), Store.YES));            doc.add(new TextField(GlobalBean.LUCENE_FIELD_PATH, CommonUtil                    .objToStr(dataBaseDoc.getPath()), Store.YES));            doc.add(new TextField(GlobalBean.LUCENE_FIELD_TYPE_ID, CommonUtil                    .objToStr(dataBaseDoc.getArchiveTypeId()), Store.YES));            // 添加该档案对象到索引中 如果未存在添加,存在则修改为最新的            getWriter().updateDocument(                    new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId()),                    doc);            System.out.println("文件[" + dataBaseDoc.getTitleProper()                    + "]的索引创建完成!");        } catch (Exception e) {            System.out.println(e.getMessage() + "\n不能添加空索引值!");        }    }    /**     * 保存索引     *      * @param bean     * @param file     */    public static void saveOrUpdateIndex(DocumentBean bean, File file) {        try {            saveOrUpdateDocument(bean, file);            getWriter().commit();        } catch (Exception e) {            System.out.println("文件[" + bean.getTitleProper() + "]的索引创建出错!错误原因:"                    + e.getMessage());        }    }    /**     * 获取配置目录下的文件     *      * @return     */    public static File[] getFiles() {        // 上传的文件目录路径        File fileDir = new File(GlobalBean.LUCENE_FACT_FILE_PATH);        return fileDir.listFiles();    }    /**     * 获取指定目录下的文件     *      * @return     */    public static File[] getFiles(String path) {        // 上传的文件目录路径        File fileDir = new File(path);        return fileDir.listFiles();    }    /**     * 查询     *      * @param queryStr     */    public static List<DocumentBean> search(String queryStr) {        System.out.println("正在检索关键字为[" + queryStr + "]的索引文件!");        Map<String, DocumentBean> map = new HashMap<String, DocumentBean>();        List<String> sortList = new ArrayList<String>();        List<DocumentBean> sortedList = new ArrayList<DocumentBean>();        try {            long start = System.currentTimeMillis();            if (!CommonUtil.isEmpty(queryStr)) {                String[] fields = GlobalBean.LUCENE_INDEX_FIELDS.split(" ");                BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length];                for (int index = 0; index < fields.length; index++) {                    flags[index] = BooleanClause.Occur.SHOULD;                }                // 多字段查询                Query query = MultiFieldQueryParser.parse(queryStr, fields,                        flags, analyzer);                QueryWrapperFilter filter = new QueryWrapperFilter(query);                search(query, filter, map, sortList);            }            long end = System.currentTimeMillis();            System.out.println("检索结束,耗时:" + (end - start) + "ms");            for (String id : sortList) {                sortedList.add(map.get(id));            }        } catch (Exception e) {            e.printStackTrace();        }        return sortedList;    }    /**     * 高级结果查询 不定长     */    public static List<DocumentBean> seniorSearchInResult(List<String> params) {        List<DocumentBean> list = null;        List<DocumentBean> tempList = new ArrayList<DocumentBean>();        List<String> sortList = new ArrayList<String>();        List<String> tempSortList = new ArrayList<String>();        for (int index = 0; index < params.size() - 1; index++) {            list = new ArrayList<DocumentBean>();            tempList = searchInResult(params.get(index), params.get(index + 1),                    tempSortList);            if (index == 0) {                sortList.addAll(tempSortList);            }            if (!CommonUtil.isEmpty(sortList)) {                for (String id : tempSortList) {                    int subIndex = tempSortList.size() - tempList.size();                    if (!sortList.contains(id))                        tempList.remove(tempSortList.indexOf(id) - subIndex);                }            }            if (sortList.size() > 0) {                list.addAll(tempList);            }        }        return list;    }    /**     * @desc 二次搜索 在上次搜索的结果缓存的基础上进行再次检索     * @param newQueryString     * @param oldQueryString     */    public static List<DocumentBean> searchInResult(String oldStr,            String newStr, List<String> sortList) {        sortList.clear();        System.out                .println("正在检索关键字[" + oldStr + "]结果中关键字为[" + newStr + "]的文件!");        Map<String, DocumentBean> map = new HashMap<String, DocumentBean>();        List<DocumentBean> sortedList = new ArrayList<DocumentBean>();        try {            long start = System.currentTimeMillis();            String[] fields = GlobalBean.LUCENE_INDEX_FIELDS.split(" ");            BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length];            for (int index = 0; index < fields.length; index++) {                flags[index] = BooleanClause.Occur.SHOULD;            }            // 多字段查询            Query query = MultiFieldQueryParser.parse(newStr.trim(), fields,                    flags, analyzer);            Query oldQuery = MultiFieldQueryParser.parse(oldStr.trim(), fields,                    flags, analyzer);            QueryWrapperFilter oldFilter = new QueryWrapperFilter(oldQuery);            CachingWrapperFilter filter = new CachingWrapperFilter(oldFilter);            search(query, filter, map, sortList);            // for (String field : GlobalBean.LUCENE_INDEX_FIELDS.split(" ")) {            // QueryParser queryParser = new QueryParser(field, analyzer);            // Query query = queryParser.parse(newStr.trim());            // Query oldQuery = queryParser.parse(oldStr.trim());            // // 查询包装过滤器            // QueryWrapperFilter oldFilter = new QueryWrapperFilter(oldQuery);            // CachingWrapperFilter filter = new CachingWrapperFilter(            // oldFilter);            // search(query, filter, map, sortList);            // }            long end = System.currentTimeMillis();            System.out.println("检索结束,耗时:" + (end - start) + "ms");            for (String id : sortList) {                sortedList.add(map.get(id));            }        } catch (Exception e) {        }        return sortedList;    }    /**     * 全文检索     *      * @param query     * @param filter     * @return     */    private static void search(Query query, Filter filter,            Map<String, DocumentBean> map, List<String> sortList) {        try {            TopDocs topDocs = getSearcher().search(query, filter, 1000000);            ScoreDoc[] scoreDocs = topDocs.scoreDocs;            if (scoreDocs != null && scoreDocs.length != 0) {                DocumentBean bean = null;                for (int i = 0; i < scoreDocs.length; i++) {                    bean = new DocumentBean();                    Document document = getSearcher().doc(scoreDocs[i].doc);                    String id = document.get(GlobalBean.LUCENE_FIELD_ID);                    if (!map.containsKey(id)) {                        String titleProper = document                                .get(GlobalBean.LUCENE_FIELD_TITLE_PROPER);                        String uploadUser = document                                .get(GlobalBean.LUCENE_FIELD_UPLOAD_USER);                        String archiveType = document                                .get(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE);                        String content = document                                .get(GlobalBean.LUCENE_FIELD_CONTENT);                        String fileFormat = document                                .get(GlobalBean.LUCENE_FIELD_FILE_FORMAT);                        String belongFile = document                                .get(GlobalBean.LUCENE_FIELD_BELONG_FILE);                        String docType = document                                .get(GlobalBean.LUCENE_FIELD_DOC_TYPE);                        String path = document                                .get(GlobalBean.LUCENE_FIELD_PATH);                        String archTypeId = document                                .get(GlobalBean.LUCENE_FIELD_TYPE_ID);                        String tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_ID)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_ID, id,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setId(CommonUtil.isEmpty(tempStr) ? id : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_TITLE_PROPER)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_TITLE_PROPER,                                    titleProper,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setTitleProper(CommonUtil.isEmpty(tempStr) ? titleProper                                : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_UPLOAD_USER)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_UPLOAD_USER,                                    uploadUser,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setUploadUserName(CommonUtil.isEmpty(tempStr) ? uploadUser                                : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE,                                    archiveType,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setArchiveTypeName(CommonUtil.isEmpty(tempStr) ? archiveType                                : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_CONTENT)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_CONTENT, content,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setContent(CommonUtil.isEmpty(tempStr) ? content                                : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_FILE_FORMAT)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_FILE_FORMAT,                                    fileFormat,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setFileFormat(CommonUtil.isEmpty(tempStr) ? fileFormat                                : tempStr);                        tempStr = "";                        if (GlobalBean.LUCENE_INDEX_FIELDS                                .contains(GlobalBean.LUCENE_FIELD_BELONG_FILE)) {                            tempStr = toHighlight(query, analyzer,                                    GlobalBean.LUCENE_FIELD_BELONG_FILE,                                    belongFile,                                    GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);                        }                        bean.setBelongArch(CommonUtil.isEmpty(tempStr) ? belongFile                                : tempStr);                        bean.setDocType(docType);                        bean.setPath(path);                        bean.setArchiveTypeId(archTypeId);                        map.put(id, bean);                        sortList.add(id);// 用于排序                    }                }                // reader.close();            }        } catch (Exception e) {            e.printStackTrace();        }    }    /**     * 获取操作对象     *      * @return     * @throws SystemGlobalException     */    public static IndexWriter getWriter() throws SystemGlobalException {        if (writer == null) {            // 存放索引路径            IndexWriterConfig config = new IndexWriterConfig(                    Version.LUCENE_4_10_3, analyzer);            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);            File fileindex = new File(GlobalBean.LUCENE_FACT_INDEX_PATH);            FSDirectory directory = null;            try {                directory = FSDirectory.open(fileindex);                writer = new IndexWriter(directory, config);            } catch (IOException e) {                throw new SystemGlobalException("索引操作对象创建失败!");            }        }        return writer;    }    /**     * 获取检索对象     *      * @return     * @throws SystemGlobalException     */    public static IndexSearcher getSearcher() throws SystemGlobalException {        // 存放索引路径        File indexDir = new File(GlobalBean.LUCENE_FACT_INDEX_PATH);        try {            reader = DirectoryReader.open(FSDirectory.open(indexDir));            searcher = new IndexSearcher(reader);        } catch (IOException e) {            throw new SystemGlobalException("索引搜索对象创建失败!");        }        return searcher;    }    public static void deleteFileIndex(DocumentBean dataBaseDoc) {        System.out                .println("正在删除文件[" + dataBaseDoc.getTitleProper() + "]的索引...");        try {            getWriter().deleteDocuments(                    new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId()));            System.out.println("文件[" + dataBaseDoc.getTitleProper()                    + "]的索引删除完成!");            getWriter().commit();        } catch (Exception e) {        }    }    /**     * 查询结果高亮     *      * @param query     * @param analyzer     * @param fieldName     * @param text     * @param length     * @return     * @throws Exception     */    private static String toHighlight(Query query, Analyzer analyzer,            String fieldName, String text, int length) throws Exception {        Highlighter highLighter = new Highlighter(new SimpleHTMLFormatter(                "<font color='red'>", "</font>"), new QueryScorer(query));        Fragmenter fragmenter = new SimpleFragmenter(length);        highLighter.setTextFragmenter(fragmenter);        return highLighter.getBestFragment(analyzer, fieldName, text);    }    /**     * 二次转码     *      * @param str     * @return     */    public static String secondParseCode(String str) {        if (!CommonUtil.isEmpty(str)) {            try {                return new String(str.getBytes("gbk"), "utf-8");            } catch (UnsupportedEncodingException e) {            }        }        return str;    }}

如有疑问,欢迎留言

0 0