Solr进阶之拼写纠错功能的实现基础拼音

来源:互联网 发布:淘宝阿里旺旺下载安装 编辑:程序博客网 时间:2024/05/01 15:03
思路:
1.当汉字个数小于等于三个汉字时,使用单个词库进行匹配(最大匹配法)
将汉字转为同音字,查询单个词库中的数据,选出音一样的词语列表,加上最小距离算法(保证至少一个汉字一样),得出一个列表,按照一定的算法排序后,选出最好的那个词语.
(词语库中词语定时更新,索引对应词语的查询结果)
2.当汉字个数在4到6个数目时,使用最大匹配切词法进行切词处理,切分为单个最大词后,使用1中的排序法则,排序后,(这里词语的组合如何去做?? 先不做,涉及nlp),直接得到最佳的三个词语吧,进行查询,得到结果数.再次使用最小距离算法的方式(solr自身的)进行一次纠错,将两次纠错结果进行算法排序,得出最优化结果
3.当查询汉字个数大于6个时,使用切词的方式进行处理,将词库中需要查询到的汉字预先使用n-gram处理后,将汉字和拼音为一个单位放入多值字段中.
将查询的汉字使用n-gram切分后,转为拼音,进行上述字段的查询,对每个拼音对应的进行高亮处理,取出对应的列表,将一系列汉字组合后,假设经过优化后(最小距离,前后关系 等)数目是3,将这个三个词语再次查询,查出相应的结果数目,
再次使用solr自身的最小距离算法模块,查出一个纠错列表
将两次结果经过算法处理后,得出最优解,给出建议!


准备工作:
1.建立一个词库(这里简单的使用商品相关信息加分词器完成)
2.在商品库中添加一个多值字段,形式如下:洗 xi,衣 yi,机 ji ...(每个汉字与其拼音为一个单元)


代码实现:

将思路中的 1、2 两种情况合并了一下,原理都是一样的。

 @Override    public ResultData<String> productSpellCheck(SearchParams params) {        String spellWrod =null;        //判断查询词的长度根据不同长度使用不同的处理方式        int length = params.getQ().length();        if(length>1 && length<7){//最大匹配分词法            spellWrod = this.maxMatching(params.getQ());            if(spellWrod==null || spellWrod.length()==0){                spellWrod = this.minSpilt(params.getQ());            }        }else if(length>6 && length <=10){//最小切割法            spellWrod=this.minSpilt(params.getQ());        }        ResultData<String> resultData = new ResultData<String>();        resultData.setData(spellWrod);        if(spellWrod !=null && spellWrod.length()>0){            resultData.setSuccess(true);        }else{            resultData.setSuccess(false);        }        return resultData;    }

短词纠错部分实现      /**     * @描述:最大匹配法 ---拼音加最小距离算法       * 电视机  名师辅导 权威名师辅导     * @param word     * @return String     * @createTime:2016年9月12日     * @author: songqinghu     */    private String maxMatching(String word){        Map<String, String> oldWord = NGramTokenizerUtil.analyzer(word);        String spellword="";        List<SpellWordTree> spellWords = new ArrayList<SpellWordTree>();        matching(oldWord,  word.length(), spellWords);        if(spellWords.size()>0){            int start = spellWords.get(spellWords.size()-1).getStart();            if(start!=0){                SpellWordTree spellWordTree = new SpellWordTree();                spellWordTree.setKey(0+"-"+start);                ArrayList<String> wordList = new ArrayList<String>();                wordList.add(oldWord.get(spellWordTree.getKey()));                spellWordTree.setWord(wordList);                spellWords.add(spellWordTree);            }            List<String> words = new ArrayList<String>();            words.add(spellword);//种子            for (int i = spellWords.size()-1; i >=0; i--) {                //spellword=spellword + spellWords.get(i).getWord();                List<String> temp = new ArrayList<String>();                for (String minword : spellWords.get(i).getWord()) {                     for (String seedword : words) {                            seedword = seedword + minword;                            temp.add(seedword);                        }                }                words = temp;            }            if(words.size()==1){                return words.get(0);            }            Long maxCount = -1l;            for (String string : words) {                Long count = findSpellWrodResult(string);                if(count>maxCount){                    spellword = string;                    maxCount = count;                }            }        }        //所有的结束//        for (SpellWordTree spellWord : spellWords) {//            System.out.println(spellWord.getKey() 
+ " : " + spellWord.getWord());//        }        return spellword;    }    /**     * @描述:循环最大匹配     * @param oldWord     * @param max     * @param spellWords     * @return SpellWordTree     * @exception     * @createTime:2016年9月13日     * @author: songqinghu     */    private void matching(Map<String, String> oldWord,int max,List<SpellWordTree> spellWords){        for (int i = max; i >=0; i--) {            for (int j = 0; j < i-1; j++) {                String key = j+"-"+i;                String value = oldWord.get(key);              if(value.length()>1){                  SpellWordTree spellWordTree = new SpellWordTree();                  if(singleWordhandle(value, spellWordTree)){                      //组成树状结构 --后期组合出词语 0 -max区间                      spellWordTree.setKey(key);                      //补全前面的词语                      if(spellWordTree.getEnd()<max){                          SpellWordTree extraWord = new SpellWordTree();                          extraWord.setKey(spellWordTree.getEnd() +"-"+max);                          ArrayList<String> word = new ArrayList<String>();                          word.add(oldWord.get(spellWordTree.getEnd() +"-"+max));                          extraWord.setWord(word);                          spellWords.add(extraWord);                      }                      spellWords.add(spellWordTree);                      matching(oldWord, spellWordTree.getStart(), spellWords);                      return;                  }              }            }        }    }   /**     *      * @描述:查询纠错词的结果数 --选择最优结果     * @param word     * @return Long     * @createTime:2016年9月13日     * @author: songqinghu     */    private Long findSpellWrodResult(String word){        Formula f = new Formula();        f.append(new Query(ProductBean.Fd.name.name(), word)).tagO();        f.append(new Query(ProductBean.Fd.multiple.name(), word));        SolrQuery query = new SolrQuery();        query.set(CommonParams.Q, f.toString());        query.setStart(0);    
    query.setRows(0);        try {            QueryResponse response = productClient.query(query);            return response.getResults().getNumFound();        } catch (SolrServerException | IOException e) {            e.printStackTrace();        }        return 0l;    }

长词匹配部分 --使用solr的高亮来完成   /**     *      * @描述:ngram 切割 转为拼音后处理     * @param word     * @return String     * @createTime:2016年9月14日     * @author: songqinghu     */    private String minSpilt(String oldWord){        //分词        Map<String, String> terms = NGram11TokenizerUtil.analyzer(oldWord);        //输入原始词汇组合高亮 等待比较        Map<String, String> newTerms = new HashMap<String,String>();        //高亮词        List<Set<String>> highlightingWord = highlightingWord(terms,newTerms,oldWord);        //校验词        TreeSet<String> treewords = new TreeSet<String>();        for (Set<String> highterms : highlightingWord) {            List<String> highList = new ArrayList<String>();            highList.addAll(highterms);            Set<String> keys = newTerms.keySet();            List<String> deletekeys = new ArrayList<String>();            Map<String, String> temp = new HashMap<String,String>();            for (String key : keys) {                if(!highList.contains(newTerms.get(key))){                    temp.put(key, newTerms.get(key));//待纠正                }else{                    deletekeys.add(newTerms.get(key));//待删除                }            }            for (String deletekey : deletekeys) {                highList.remove(deletekey);//纠正词            }            HashMap<String, List<String>> words = new HashMap<String,List<String>>();//纠正后的词            for (String high : highList) {//获取位置                String[] pinyins = high.split(" ");                for (String key : temp.keySet()) {                   String[] oldPinyins = temp.get(key).split(" ");                   for (int i = 1; i < oldPinyins.length; i++) {                       for (int j = 1; j < pinyins.length; j++) {                        if(oldPinyins[i].equals(pinyins[j])){//找到位置                            if(!words.containsKey(key)){                                List<String> word = new ArrayList<String>();                                word.add(pinyins[0]);//汉字                                
words.put(key, word);                            }else{                                words.get(key).add(pinyins[0]);                            }                            i=oldPinyins.length;                            j=pinyins.length;                        }                    }                   }                }            }            //组合 words terms            List<String> newWords = new ArrayList<String>();            newWords.add("");//种子            for (int i = 0; i < terms.size(); i++) {                if(words.containsKey(i+"-"+(i+1))){                    List<String> list = words.get(i+"-"+(i+1));                    List<String> tempword = new ArrayList<String>();                    for (String word : list) {                       for (String seed : newWords) {                             seed =  seed +word;                             tempword.add(seed);                        }                    }                    newWords =tempword;                }else{                    String word = terms.get(i+"-"+(i+1));                    List<String> tempword = new ArrayList<String>();                    for (String seed : newWords) {                        seed =  seed +word;                        tempword.add(seed);                    }                    newWords =tempword;                }            }             treewords.addAll(newWords);        }        Long maxCount =-1l;        String result="";        for (String word : treewords) {            Long count = findSpellWrodResult(word);            if(count>maxCount){                result=word;            }        }        return result;    }   /**     * @描述:获取高亮结果集合     * @return void     * @createTime:2016年9月14日     * @author: songqinghu     */    private List<Set<String>> highlightingWord(Map<String, String> terms, Map<String, String> newTerms,String oldWord){        //按照顺序取出 转换为拼音 组合        StringBuffer value = new StringBuffer("( ");        //组装        for (int i = 0; i < 
oldWord.length(); i++) {            String term = terms.get(i+"-"+(i+1));            Set<String> pinyins = Pinyin4jUtil.converterToSpellToSet(term);            StringBuffer oldtemp = new StringBuffer();            oldtemp.append("<").append(term).append(">");            for (String pinyin : pinyins) {                oldtemp.append(" <").append(pinyin).append(">");            }            newTerms.put(i+"-"+(i+1), oldtemp.toString());            StringBuffer temp = new StringBuffer();            Iterator<String> iterator = pinyins.iterator();            while (iterator.hasNext()) {                temp.append(iterator.next());                if(iterator.hasNext()){                    temp.append(" OR ");                }            }           // String one = temp.toString();          //  temp.append(" OR ").append(term);           // String two = temp.toString();            // value.append("(( "+temp.toString()+" )" + " AND " + " ( " + two +" ))");            value.append("(( ").append(temp.toString()).append(" )").append(" AND ")            .append(" ( ").append(temp.append(" OR ").append(term)).append(" ))");            if(i<oldWord.length()-1){                value.append(" AND ");            }        }        value.append(")");        SolrQuery query = new SolrQuery();        query.set(CommonParams.Q, "spellWords : "+ value.toString());        query.setHighlight(true);        query.addHighlightField("spellWords");        query.setHighlightSimplePre("<");        query.setHighlightSimplePost(">");        query.setHighlightSnippets(100);        query.set(CommonParams.FL, ProductBean.Fd.id.name());        query.setRows(3);        try {            QueryResponse response = productClient.query(query);            SolrDocumentList docs = response.getResults();            Map<String, Map<String, List<String>>> highlighting = response.getHighlighting();            if(docs.size()>0){                List<Set<String>> hightermList = new ArrayList<Set<String>>();               
 for (SolrDocument doc : docs) {                    Set<String> hightterms = new TreeSet<String>();                    String id = doc.getFieldValue("id").toString();                    //具体纠错处理                    Map<String, List<String>> map = highlighting.get(id);                    List<String> wordList = map.get("spellWords");                    for (String spell : wordList) {                        hightterms.add(spell);//收集去重复                    }                    //return hightterms; //先做一个                    hightermList.add(hightterms);                }                return hightermList;            }        } catch (SolrServerException | IOException e) {            e.printStackTrace();        }         return null;    }


使用到的工具类:


// 拼音 (pinyin) utility:
package cn.com.mx.gome.suggest.util;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.com.mx.gome.intensive.log.RsysLog;
import cn.com.mx.gome.intensive.log.RsysLog.RsysExecLog;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Chinese-to-pinyin conversion helpers built on pinyin4j.
 * Handles polyphonic characters by producing every reading combination,
 * e.g. first letters of 长沙市长 -> cssc, zssz, zssc, cssz.
 * ASCII characters pass through unchanged; other non-Chinese characters
 * are dropped by pinyin4j.
 */
public class Pinyin4jUtil {

    private static RsysExecLog logger = RsysLog.getInstance().getRsysExecLog();

    /**
     * Converts Chinese text to the FIRST LETTERS of its pinyin readings.
     *
     * @param chines Chinese text
     * @return comma-separated reading combinations as a single string
     */
    public static String converterToFirstSpell(String chines) {
        return parseTheChineseByObject(discountTheChinese(MinMethod(chines).toString()));
    }

    /**
     * Same as {@link #converterToFirstSpell(String)} but returns the
     * combinations as a Set (used when building index fields).
     *
     * @param chines Chinese text
     * @return set of reading combinations; may be null for empty input
     */
    public static Set<String> converterToFirstSpellToSet(String chines) {
        return parseTheChineseByObjectToSet(discountTheChinese(MinMethod(chines).toString()));
    }

    /**
     * Shared first-letter extraction. Output format: per input character,
     * comma-separated alternative letters; characters separated by a space.
     *
     * @param chines Chinese text
     * @return buffer like "x d,z " for multi-reading characters
     */
    public static StringBuffer MinMethod(String chines) {
        StringBuffer pinyinName = new StringBuffer();
        char[] nameChar = chines.toCharArray();
        HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
        defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
        for (int i = 0; i < nameChar.length; i++) {
            if (nameChar[i] > 128) { // 1-127 are ASCII digits/letters/punctuation
                try {
                    // All full-pinyin readings of the current character.
                    String[] strs = PinyinHelper.toHanyuPinyinStringArray(
                            nameChar[i], defaultFormat);
                    if (strs != null) {
                        for (int j = 0; j < strs.length; j++) {
                            pinyinName.append(strs[j].charAt(0)); // first letter only
                            if (j != strs.length - 1) {
                                pinyinName.append(",");
                            }
                        }
                    }
                } catch (BadHanyuPinyinOutputFormatCombination e) {
                    logger.error("", e);
                }
            } else {
                pinyinName.append(nameChar[i]);
            }
            pinyinName.append(" ");
        }
        return pinyinName;
    }

    /**
     * Converts Chinese text to FULL pinyin, expanding polyphonic readings,
     * e.g. 重当参 -> zhongdangcen, chongdangcan, ...
     *
     * @param chines Chinese text
     * @return comma-separated reading combinations as a single string
     */
    public static String converterToSpell(String chines) {
        return parseTheChineseByObject(discountTheChinese(midConoverterToSpell(chines).toString()));
    }

    /**
     * Same as {@link #converterToSpell(String)} but returns a Set.
     *
     * @param chines Chinese text
     * @return set of reading combinations; may be null for empty input
     */
    public static Set<String> converterToSpellToSet(String chines) {
        return parseTheChineseByObjectToSet(discountTheChinese(midConoverterToSpell(chines).toString()));
    }

    /**
     * Shared full-pinyin extraction; same output format as
     * {@link #MinMethod(String)} but with complete syllables.
     */
    private static StringBuffer midConoverterToSpell(String chines) {
        StringBuffer pinyinName = new StringBuffer();
        char[] nameChar = chines.toCharArray();
        HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
        defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
        for (int i = 0; i < nameChar.length; i++) {
            if (nameChar[i] > 128) {
                try {
                    String[] strs = PinyinHelper.toHanyuPinyinStringArray(
                            nameChar[i], defaultFormat);
                    if (strs != null) {
                        for (int j = 0; j < strs.length; j++) {
                            pinyinName.append(strs[j]);
                            if (j != strs.length - 1) {
                                pinyinName.append(",");
                            }
                        }
                    }
                } catch (BadHanyuPinyinOutputFormatCombination e) {
                    logger.error("", e);
                }
            } else {
                pinyinName.append(nameChar[i]);
            }
            pinyinName.append(" ");
        }
        return pinyinName;
    }

    /**
     * Deduplicates the polyphonic readings of each character.
     *
     * @param theStr space-separated characters, comma-separated readings
     * @return one map per character: reading -> occurrence count
     */
    private static List<Map<String, Integer>> discountTheChinese(String theStr) {
        List<Map<String, Integer>> mapList = new ArrayList<Map<String, Integer>>();
        Map<String, Integer> onlyOne = null;
        String[] firsts = theStr.split(" ");
        for (String str : firsts) {
            onlyOne = new Hashtable<String, Integer>();
            String[] china = str.split(",");
            for (String s : china) {
                Integer count = onlyOne.get(s);
                if (count == null) {
                    // FIX: Integer.valueOf over deprecated new Integer(int).
                    onlyOne.put(s, Integer.valueOf(1));
                } else {
                    // FIX: put() already replaces; the original remove()+put()
                    // pair was redundant.
                    onlyOne.put(s, count + 1);
                }
            }
            mapList.add(onlyOne);
        }
        return mapList;
    }

    /**
     * Combines the per-character reading maps into a comma-separated string
     * of all reading combinations.
     */
    private static String parseTheChineseByObject(
            List<Map<String, Integer>> list) {
        Map<String, Integer> first = MinparseTheChineseByObject(list);
        // FIX: StringBuilder instead of String concatenation in a loop.
        StringBuilder returnStr = new StringBuilder();
        if (first != null) {
            for (String str : first.keySet()) {
                if (returnStr.length() > 0) {
                    returnStr.append(",");
                }
                returnStr.append(str);
            }
        }
        return returnStr.toString();
    }

    /**
     * Combines the per-character reading maps into a Set of combinations.
     *
     * @return combinations, or null when the input produced nothing
     *         (NOTE(review): callers should guard against null)
     */
    private static Set<String> parseTheChineseByObjectToSet(
            List<Map<String, Integer>> list) {
        Map<String, Integer> first = MinparseTheChineseByObject(list);
        Set<String> result = null;
        if (first != null && first.keySet().size() > 0) {
            // FIX: defensive copy — the original leaked the live keySet()
            // view of the internal Hashtable.
            result = new HashSet<String>(first.keySet());
        }
        return result;
    }

    /**
     * Core combination step: folds the list of per-character reading maps
     * into one map whose keys are every cross-product concatenation.
     *
     * @param list one reading map per character
     * @return combined map (keys are the combinations), or null for empty input
     */
    private static Map<String, Integer> MinparseTheChineseByObject(
            List<Map<String, Integer>> list) {
        Map<String, Integer> first = null; // accumulated combinations so far
        for (int i = 0; i < list.size(); i++) {
            // Combinations of the accumulator with the current character.
            Map<String, Integer> temp = new Hashtable<String, Integer>();
            if (first != null) {
                for (String s : first.keySet()) {
                    for (String s1 : list.get(i).keySet()) {
                        temp.put(s + s1, 1);
                    }
                }
                // Release the previous generation before swapping.
                if (temp.size() > 0) {
                    first.clear();
                }
            } else {
                // First character: seed the accumulator with its readings.
                for (String s : list.get(i).keySet()) {
                    temp.put(s, 1);
                }
            }
            if (temp.size() > 0) {
                first = temp;
            }
        }
        return first;
    }
}

// 分词器 (tokenizer):
package cn.com.mx.gome.suggest.util.analyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import cn.com.mx.gome.intensive.log.RsysLog;
import cn.com.mx.gome.intensive.log.RsysLog.RsysExecLog;

/**
 * 1-gram splitter backed by Lucene's NGramTokenizer(1,1).
 * Returns tokens keyed by their "startOffset-endOffset" positions.
 * A single shared tokenizer instance is reused; all access is serialized
 * on {@code lock}, so the class is thread-safe but not concurrent.
 *
 * @author songqinghu
 */
public class NGram11TokenizerUtil {

    private static final RsysExecLog logger = RsysLog.getInstance().getRsysExecLog();

    // Shared instance; rebuilt lazily if a failure forced it closed.
    private static Tokenizer tokenizer = new NGramTokenizer(1, 1);
    private static final Object lock = new Object();

    private static Tokenizer getTokenizer() {
        if (tokenizer == null) {
            tokenizer = new NGramTokenizer(1, 1);
        }
        return tokenizer;
    }

    /**
     * Splits the text into 1-grams.
     *
     * @param text input text
     * @return map of "start-end" offsets to the single-character token
     */
    public static Map<String, String> analyzer(String text) {
        synchronized (lock) {
            StringReader sr = new StringReader(text);
            // N-gram tokenizer: attach the new reader, then consume.
            getTokenizer().setReader(sr);
            return tokenizer(getTokenizer());
        }
    }

    /**
     * Consumes the token stream into an offset-keyed map, following the
     * Lucene contract: reset -> incrementToken* -> end -> close; the
     * tokenizer is reusable afterwards via setReader().
     */
    private static Map<String, String> tokenizer(Tokenizer tokenizer) {
        Map<String, String> words = new HashMap<String, String>();
        try {
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
                OffsetAttribute offsetAttribute = tokenizer.addAttribute(OffsetAttribute.class);
                words.put(offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset(),
                        charTermAttribute.toString());
            }
            // FIX: the original called reset() a second time and never end();
            // Lucene requires end() + close() after consumption. setReader()
            // may be called again after close(), so the instance stays reusable.
            tokenizer.end();
            tokenizer.close();
        } catch (IOException e) {
            logger.error("tokenizer occor error :  " + e);
            try {
                tokenizer.close();
            } catch (IOException e1) {
                // FIX: log the close failure (the original logged the outer e).
                logger.error("tokenizer close error :  " + e1);
            }
            // FIX: the original nulled the method PARAMETER, leaving the
            // shared, now-closed instance in the static field; clear the
            // field so the next call builds a fresh tokenizer.
            NGram11TokenizerUtil.tokenizer = null;
        }
        return words;
    }
}


纠错演示:
短词纠错:



长词纠错:



到这里基本功能已经实现,剩余就是持续的优化工作了







2 0
原创粉丝点击