一个文本字符串相似度计算的小程序

来源:互联网 发布:异构系统数据交互 编辑:程序博客网 时间:2024/05/30 02:22

帮人写了一个简单的小程序,用于计算两个文本字符串的相似度。计算方式非常简单:先使用Lucene提供的StandardAnalyzer分词器将两个字符串分别分词、去除停用词,再用PorterStemFilter做词干归一化,然后统计第一个文本的词项中有多少个也出现在第二个文本中,相似度 = 重复词项个数 / 第一个文本词项总数(注意该度量不对称,交换两个文本的顺序,结果可能不同)。原理非常简单,类似词袋模型,但还是有些用处。

代码如下:

import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;import java.util.Collection;import java.util.HashSet;import java.util.List;import java.util.Set;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.PorterStemFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.util.Version;/** *  * @author zhangxichuan * */public class SimCalculator {        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);        /**     *      * @param content1     * @param content2     * @return     */    public double calculate(String content1, String content2) {        List<String> tokenStream1 = getTokenizedList(content1);        List<String> tokenStream2 = getTokenizedList(content2);        if (isEmpty(tokenStream1) || isEmpty(tokenStream2)) {            return 0d;        }        Set<String> result2Set = new HashSet<String>(tokenStream2.size() / 4 * 3);        for (String token : tokenStream2) {            result2Set.add(token);        }        double simCount = 0d;        for (String token : tokenStream1) {            if (result2Set.contains(token)) {                simCount++;            }        }        return simCount / tokenStream1.size();    }        private static boolean isEmpty(Collection<?> c) {        if ( c == null || c.isEmpty() ) {            return true;        }        return false;    }    private List<String> getTokenizedList(String content) {        List<String> result = new ArrayList<String>();        TokenStream stream  = analyzer.tokenStream(content, new StringReader(content));        stream = new PorterStemFilter(stream);        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);        try {            stream.reset();            while(stream.incrementToken()) {                String term = 
charTermAttribute.toString();                result.add(term);            }        }        catch(IOException e) {            // not thrown b/c we're using a string reader...        }        return result;    }        public static void main(String[] args) {        String[] str = new String[10];        str[0] = "Indexing Relational Databases Content Offline for Efficient Keyword-Based Search.";        str[1] = "Efficient IR-style Keyword Search over Relational Database.";                System.out.println( new SimCalculator().calculate(str[0], str[1]) );            }}



0 0
原创粉丝点击