简单实现:基于 TF-IDF 思想(此处仅使用词频 TF)计算语句相似度

来源:互联网 发布:太阳系比较模拟软件 编辑:程序博客网 时间:2024/06/06 09:04

使用词频向量的余弦相似度来计算两个语句的相似度,假设两个语句已经使用分词工具完成分词。

/**
 * Sentence similarity based on term-frequency vectors.
 *
 * <p>Each sentence is given as an already-tokenized list of words. The
 * similarity is the cosine of the angle between the two word-count vectors,
 * so it ranges from 0 (no word in common) to 1 (same relative word
 * frequencies).
 */
public class Similar {

    /**
     * Computes the cosine similarity between the word-count vectors of two
     * tokenized sentences.
     *
     * @param T1 tokens of the first sentence
     * @param T2 tokens of the second sentence
     * @return the cosine similarity in the range [0, 1]; 0 if either argument
     *         is {@code null} or empty (a message is printed to standard
     *         output in that case)
     * @throws Exception never thrown by this implementation; kept in the
     *         signature so existing callers continue to compile unchanged
     */
    public static double getSimilarity(Vector<String> T1, Vector<String> T2) throws Exception {
        if (T1 == null || T1.isEmpty() || T2 == null || T2.isEmpty()) {
            System.out.println("参数有误");
            return 0;
        }

        // One pass over each sentence instead of rescanning both sentences
        // for every distinct word.
        Map<String, Integer> freq1 = countWords(T1);
        Map<String, Integer> freq2 = countWords(T2);

        // Only words present in both sentences contribute to the dot product.
        double dot = 0;
        for (Map.Entry<String, Integer> entry : freq1.entrySet()) {
            Integer other = freq2.get(entry.getKey());
            if (other != null) {
                // Widen before multiplying so large counts cannot overflow int.
                dot += (double) entry.getValue() * other;
            }
        }

        // Both inputs are non-empty, so both norms are strictly positive.
        return dot / Math.sqrt(sumOfSquares(freq1) * sumOfSquares(freq2));
    }

    /** Counts how many times each token occurs; null tokens are counted under the null key. */
    private static Map<String, Integer> countWords(Vector<String> tokens) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String token : tokens) {
            Integer seen = counts.get(token);
            counts.put(token, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /** Returns the squared Euclidean norm of a word-count vector. */
    private static double sumOfSquares(Map<String, Integer> counts) {
        double total = 0;
        for (Integer count : counts.values()) {
            total += (double) count * count;
        }
        return total;
    }

    /** Small demonstration using two hand-tokenized sentences. */
    public static void main(String[] args) {
        // Tokens of the first sentence (assumed to be produced by a tokenizer).
        Vector<String> T1 = new Vector<String>();
        // Tokens of the second sentence.
        Vector<String> T2 = new Vector<String>();
        T1.add("i");
        T1.add("kill");
        T1.add("you");
        T1.add("kill");
        T1.add("you");
        T1.add("fop");
        T1.add("tip");
        T1.add("cnce");
        T2.add("kill");
        T2.add("you");
        try {
            System.out.println(getSimilarity(T1, T2));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}


                                             
0 0
原创粉丝点击