搜索引擎之全文搜索算法功能实现(基于Lucene)

来源:互联网 发布:非常完美知乎 编辑:程序博客网 时间:2024/04/27 01:05

之前做去转盘网的时候,我已经公开了非全文搜索的代码,有需要的朋友可以前往我的博客阅读。本文主要讨论如何实现全文搜索:我花了很长时间设计了新作"观点",而观点对全文搜索的要求很高,所以我又花了不少时间研究全文搜索,你可以先体验一下:点我搜索。废话不多说,直接上代码:

public Map<String,Object>  articleSearchAlgorithms(SearchCondition condition,IndexSearcher searcher) throws ParseException, IOException{                     Map<String,Object> map =new HashMap<String,Object>();             String[] filedsList=condition.getFiledsList();             String keyWord=condition.getKeyWord();             int currentPage=condition.getCurrentPage();             int pageSize=condition.getPageSize();             String sortField=condition.getSortField();             boolean isASC=condition.isDESC();             String sDate=condition.getsDate();            String eDate=condition.geteDate();            String classify=condition.getClassify();                                     //过滤终结字符            keyWord=escapeExprSpecialWord(keyWord);                        BooleanQuery q1 = new BooleanQuery();            BooleanQuery q2 = new BooleanQuery();             BooleanQuery booleanQuery = new BooleanQuery(); //boolean查询                          if(classify!=null&&(classify.equals("guanzhi")||classify.equals("opinion")||classify.equals("write"))){                 String typeId="1";//默认言论                 if(classify.equals("guanzhi")){                     typeId="2";                 }                 if(classify.equals("opinion")){                     typeId="3";                 }                 Query termQuery = new TermQuery(new Term("typeId",typeId));                  q1.add(termQuery,BooleanClause.Occur.MUST);             }             if(sDate!=null&&eDate!=null){//是否范围查询由这两个参数决定                Query rangeQuery = new TermRangeQuery("writingTime", new BytesRef(sDate), new BytesRef(eDate),true, true);                q1.add(rangeQuery,BooleanClause.Occur.MUST);             }            Sort sort = new Sort(); // 排序            sort.setSort(SortField.FIELD_SCORE);            if(sortField!=null){                sort.setSort(new SortField(sortField, SortField.Type.STRING, isASC));            }                        int start = (currentPage - 1) * 
pageSize;            int hm = start + pageSize;                        TopFieldCollector res = TopFieldCollector.create(sort,hm,false, false, false, false);            //完全匹配查询            Term t0=new Term(filedsList[1],keyWord);            TermQuery termQuery = new TermQuery(t0);//两种高度匹配的查询            q2.add(termQuery,BooleanClause.Occur.SHOULD);                        //前缀匹配            Term t1=new Term(filedsList[1],keyWord);            PrefixQuery prefixQuery=new PrefixQuery(t1);            q2.add(prefixQuery,BooleanClause.Occur.SHOULD);                        //短语,相似度匹配,适用于分词的内容            for(int i=0;i<filedsList.length;i++){ //多字段term查询算法                if(i!=1){                    PhraseQuery phraseQuery=new PhraseQuery();                    Term ts0=new Term(filedsList[i],keyWord);                    phraseQuery.add(ts0);                                        FuzzyQuery fQuery=new FuzzyQuery(new Term(filedsList[i],keyWord),2);//最后相似度查询                                        q2.add(phraseQuery,BooleanClause.Occur.SHOULD);                    q2.add(fQuery,BooleanClause.Occur.SHOULD);//后缀相似的拿出来                }            }            MultiFieldQueryParser  queryParser = new MultiFieldQueryParser(Version.LUCENE_47,filedsList,analyzer);            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);            Query query = queryParser.parse(keyWord);            q2.add(query,BooleanClause.Occur.SHOULD);                        //必须加逻辑判断,否则结果是不同的            if(q1!=null && q1.toString().length()>0){                booleanQuery.add(q1,BooleanClause.Occur.MUST);            }            if(q2!=null && q2.toString().length()>0){                 booleanQuery.add(q2,BooleanClause.Occur.MUST);            }                        searcher.search(booleanQuery, res);            long amount = res.getTotalHits();             TopDocs tds = res.topDocs(start, pageSize);            map.put("amount",amount);            map.put("tds",tds);            
map.put("query",booleanQuery);            return map;    }
注意:上面代码中的搜索条件(SearchCondition)是按观点网的具体需求设计的,您可以按照自己的搜索条件做改动,这里很难适配所有读者。

public Map<String, Object> searchArticle(SearchCondition condition) throws Exception{                    Map<String,Object> map =new HashMap<String,Object>();        List<Write> list=new ArrayList<Write>();                 DirectoryReader reader=condition.getReader();         String URL=condition.getURL();         boolean isHighligth=condition.isHighlight();         String keyWord=condition.getKeyWord();         IndexSearcher searcher=getSearcher(reader,URL);                try{            Map<String,Object> output=articleSearchAlgorithms(condition,searcher);            if(output==null){                map.put("amount",0L);                map.put("source",null);                return map;            }                        map.put("amount", output.get("amount"));            TopDocs tds = (TopDocs) output.get("tds");            ScoreDoc[] sd = tds.scoreDocs;            Query query =(Query) output.get("query");                        for (int i = 0; i < sd.length; i++) {                                Document doc = searcher.doc(sd[i].doc);                String id = doc.get("id");                /**********************start*************************需要处理的放一块儿********************/                String temp=doc.get("title");                String title =temp; //默认不高亮                if(isHighligth){                    //高亮文章标题                    Highlighter highlighterTitle = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));                    highlighterTitle.setTextFragmenter(new SimpleFragmenter(40)); // 字长度                    TokenStream ts = analyzer.tokenStream("title", new StringReader(temp));                    title= highlighterTitle.getBestFragment(ts,temp);                     if(title==null){                        title=temp.replace(keyWord,"<span style='color:red'>"+keyWord+"</span>");//高亮处理插件bug,加这句话避免                    }                }                                String temp1=HtmlEnDecode.htmlEncode(doc.get("content"));                
String content=temp1;//使用自己封装的方法来转义                                if(isHighligth){                    //做高亮处理,content                    Highlighter highlighterContent = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));                    highlighterContent.setTextFragmenter(new SimpleFragmenter(Constant.HIGHLIGHT_CONTENT_LENGTH)); // 字长度                    //temp1=StringEscapeUtils.escapeHtml(temp1);//将汉字转义导致高亮失效                    TokenStream ts1 = analyzer.tokenStream("content", new StringReader(temp1));                    content = highlighterContent.getBestFragment(ts1,temp1);                                        if(content==null){                        content=temp1.replace(keyWord,"<span style='color:red'>"+keyWord+"</span>");//高亮处理插件bug,加这句话避免                                                //假设遇上这种情况做处理,其他的高亮器会自动截图                        content=subContent(content);//截取处理                        content=HtmlEnDecode.htmldecode(content);//html解码                        content=SubStringHTML.sub(content,Constant.HIGHLIGHT_CONTENT_LENGTH);                    }                }                /*---------------------------------------不断变动的数据放一块儿----------------------------*/                                Write write=writeDao.getArticle(Long.parseLong(id));                if(write!=null){                    write.setTitle(title);                    write.setContent(content);                                        Date writingTime=write.getWritingTime();                    String timeGap=DateUtil.dateGap(writingTime);//timeGap                    write.setTimeGap(timeGap);                                        list.add(write);                }            }                    }catch(Exception e){            e.printStackTrace();        }        map.put("source",list);        return map;    }

注意:上面是具体的搜索代码。不同的应用场景有不同的需求,请您按照自己的需求封装对象、查询数据库等。代码毫无保留,绝对可用。

如果有什么疑问,可以加 QQ 群:284205104。如果群满了,麻烦到去转盘网找一下最新的群加入即可,谢谢您的阅读。



阅读全文
0 0