Java进行语义相似度分析

来源：互联网发布：软件设计师参考书编辑：程序博客网时间：2024/05/09 11:23

这是发的第二篇博客，之前那篇还没有通过审核呢，无所谓。

想说说这个题目，还在上大四，自然语言处理和信息检索的知识在我们学校是研究生的课程，而且这个实验室很NB哦，可惜我不能保研，只能远远看着实验室门牌号拿着纸巾……擦眼泪了。好在是良心学院，大四没有什么基础课程了，却开了好多专业限选课，就是各个实验室都能拿出来一些入门课程，派博士生或者直接教授上阵，拿到本科课堂来。

不多说了，这是课上老师留的作业：给定文本input.txt ，其中有750对英文句子，以" 句子1 + Tab + 句子2 +Enter "形式给出。现在要求用余弦向量法，求每对英文句子的相似度，并且输出到output.txt。完成上一个任务后，老师还会给出一个针对上述750对句子，人工给出的相似度评分文件standardAnalysis.txt（750个数，人工写上去的？老师唬我，说，用了什么高端技术），现在又要求利用Pearson相关系数法，分析output.txt与standardAnalysis.txt中得到的语义相似度的相关性如何。

以上就是题目要求，听说，这应该是这项课程最入门的程序和思想了吧，思想去百度 “红字儿”，剩下的就是java的基本操作了，对于java还没有入门的我来说，这才是我的难题，好在两个下午给弄出来了，有些地方写的可能很可笑，可我还发现不了，以后回来看的时候用来当茶余饭后的谈资吧。。。

废话少说，贴代码，注释写的还挺清楚的

/*Author:NaData :2014/11/29*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Scores tab-separated sentence pairs with a bag-of-words cosine similarity and
 * compares the resulting scores with a reference score file using the Pearson
 * correlation coefficient.
 *
 * <p>Files are read and written with the platform default charset.
 */
class Similarity {
    /** Most recent line read by {@link #fileOperation}; {@code null} once the input is exhausted. */
    String sentence = "";
    /** Tab-separated fields of the most recent non-blank line read by {@link #fileOperation}. */
    String[] part = new String[2];
    String input, output, standardAnalysis;
    /** Raw score strings (first column) read from the output file. */
    List<String> comDataList = new ArrayList<String>();
    /** Raw score strings read from the reference file. */
    List<String> userDataList = new ArrayList<String>();
    /** Computed scores, replaced by {@link #PearsonFileOperation}. */
    static float[] comData = new float[1000];
    /** Reference scores, replaced by {@link #PearsonFileOperation}. */
    static float[] userData = new float[1000];

    /**
     * @param input            path of the file holding one "sentence1 TAB sentence2" pair per line
     * @param output           path of the file the similarity scores are written to
     * @param standardAnalysis path of the file holding one reference score per line
     */
    public Similarity(String input, String output, String standardAnalysis) {
        this.input = input;
        this.output = output;
        this.standardAnalysis = standardAnalysis;
    }

    public static void main(String args[]) throws IOException {
        Similarity s = new Similarity("E://input.txt", "E://output.txt", "E://standardAnalysis.txt");
        s.fileOperation(s.input, s.output);
        System.out.println("语义的向量相似度分析完毕" + "\n" + "请查看文件：" + s.output);
        System.out.println("......" + "\n" + "Pearson correlation分析如下");
        System.out.println("程序计算结果与人工打分结果之间的相关度是：");
        s.PearsonFileOperation(s.output, s.standardAnalysis);
        float pearson = s.Pearson(comData, userData);
        System.out.println(pearson);
    }

    // ---------------------------------------------------------------------
    // Chapter 1: cosine similarity
    // ---------------------------------------------------------------------

    /**
     * Reads sentence pairs from {@code inputPath} and writes one line per pair to
     * {@code outputPath} in the form {@code score TAB sentence1 TAB sentence2 CRLF}.
     *
     * <p>Blank lines are skipped. Only the first two tab-separated fields of a line are used.
     *
     * @param inputPath  file with one "sentence1 TAB sentence2" pair per line
     * @param outputPath file to (over)write with the scored pairs
     * @throws FileNotFoundException if {@code inputPath} does not exist or is a directory
     * @throws IOException           if a non-blank line has fewer than two tab-separated
     *                               fields, or if reading or writing fails
     */
    public void fileOperation(String inputPath, String outputPath) throws IOException {
        File inputFile = new File(inputPath);
        if (!inputFile.exists() || inputFile.isDirectory()) {
            throw new FileNotFoundException(inputPath);
        }

        // try-with-resources closes both streams even when a line fails to parse.
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile));
             FileWriter writer = new FileWriter(new File(outputPath))) {
            int lineNumber = 0;
            while ((sentence = reader.readLine()) != null) {
                lineNumber++;
                if (sentence.trim().isEmpty()) {
                    continue;
                }
                String[] fields = sentence.split("\t");
                if (fields.length < 2) {
                    // Fail with the offending line instead of an anonymous
                    // ArrayIndexOutOfBoundsException.
                    throw new IOException("Line " + lineNumber + " of " + inputPath
                            + " is not of the form sentence1<TAB>sentence2");
                }
                part = fields;
                float result = cosVector(part[0], part[1]);
                writer.write(result + "\t" + part[0] + "\t" + part[1] + "\r\n");
            }
        }
    }

    /**
     * Tells whether {@code str} is one of the keys of {@code wordWeight}.
     *
     * @param wordWeight map to look in
     * @param str        candidate key
     * @return {@code true} if the map contains the key
     */
    public boolean isIn(Map<String, int[]> wordWeight, String str) {
        return wordWeight.containsKey(str);
    }

    /**
     * Computes the cosine of the angle between the word-frequency vectors of two sentences.
     *
     * <p>Each sentence is tokenised with {@code split(" ")}; tokens are compared
     * case-sensitively and no stop words are removed.
     *
     * @param sentence1 first sentence
     * @param sentence2 second sentence
     * @return the cosine similarity, or {@code 0} when either sentence yields no tokens
     *         (the cosine is undefined for a zero vector and would otherwise be NaN)
     */
    public float cosVector(String sentence1, String sentence2) {
        // word -> {occurrences in sentence1, occurrences in sentence2}
        Map<String, int[]> wordWeight = new HashMap<String, int[]>();
        countWords(sentence1.split(" "), 0, wordWeight);
        countWords(sentence2.split(" "), 1, wordWeight);

        float dotProduct = 0.0f;
        float squaredNorm1 = 0.0f;
        float squaredNorm2 = 0.0f;
        for (int[] counts : wordWeight.values()) {
            dotProduct += counts[0] * counts[1];
            squaredNorm1 += counts[0] * counts[0];
            squaredNorm2 += counts[1] * counts[1];
        }

        if (squaredNorm1 == 0.0f || squaredNorm2 == 0.0f) {
            return 0.0f;
        }
        return (float) (dotProduct / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2)));
    }

    /** Adds one to slot {@code slot} of the counter of every word in {@code words}. */
    private static void countWords(String[] words, int slot, Map<String, int[]> counts) {
        for (String word : words) {
            int[] entry = counts.get(word);
            if (entry == null) {
                entry = new int[2];
                counts.put(word, entry);
            }
            entry[slot]++;
        }
    }

    // ---------------------------------------------------------------------
    // Chapter 2: Pearson correlation
    // ---------------------------------------------------------------------

    /**
     * Computes the Pearson correlation coefficient of two samples.
     *
     * <p>When the arrays differ in length only the first {@code min(x.length, y.length)}
     * elements of each are used, for every term of the formula. A one-line summary of
     * the intermediate sums is printed to standard output.
     *
     * @param x first sample
     * @param y second sample
     * @return the coefficient; NaN when the compared length is zero or either sample
     *         has zero variance
     */
    public float Pearson(float[] x, float[] y) {
        int len = Math.min(x.length, y.length);
        // Every sum must cover the same prefix, otherwise the means and the
        // products describe different samples.
        float sumX = sumPrefix(x, len);
        float sumY = sumPrefix(y, len);
        float sumXX = Mutipl(x, x, len);
        float sumYY = Mutipl(y, y, len);
        float sumXY = Mutipl(x, y, len);
        float upside = sumXY - sumX * sumY / len;
        float downside = (float) Math.sqrt(
                (sumXX - (Math.pow(sumX, 2)) / len) * (sumYY - (Math.pow(sumY, 2)) / len));
        System.out.println(len + " " + sumX + " " + sumY + " " + sumXX + " " + sumYY + " " + sumXY);
        return upside / downside;
    }

    /** Sums the first {@code len} elements of {@code arr}. */
    private static float sumPrefix(float[] arr, int len) {
        float total = 0.0f;
        for (int i = 0; i < len; i++) {
            total += arr[i];
        }
        return total;
    }

    /**
     * @param arr values to add up
     * @return the sum of all elements of {@code arr}
     */
    public float Sum(float[] arr) {
        float total = 0.0f;
        for (float ele : arr) {
            total += ele;
        }
        return total;
    }

    /**
     * @param arr1 first factor array
     * @param arr2 second factor array
     * @param len  number of leading elements to use
     * @return the sum of {@code arr1[i] * arr2[i]} for {@code i} in {@code [0, len)}
     */
    public float Mutipl(float[] arr1, float[] arr2, int len) {
        float total = 0.0f;
        for (int i = 0; i < len; i++) {
            total += arr1[i] * arr2[i];
        }
        return total;
    }

    /**
     * Parses every string of the list with {@link Float#parseFloat(String)}.
     *
     * @param str decimal strings
     * @return the parsed values, in list order
     * @throws NumberFormatException if an element is not a parsable float
     */
    public float[] strToFloat(List<String> str) {
        int len = str.size();
        float[] floatArr = new float[len];
        for (int i = 0; i < len; i++) {
            floatArr[i] = Float.parseFloat(str.get(i));
        }
        return floatArr;
    }

    /**
     * Loads the computed scores (first tab-separated column of {@code outputPath}) into
     * {@link #comData} and the reference scores (one per line of
     * {@code standardAnalysisPath}) into {@link #userData}. Blank lines are skipped, and
     * data from a previous call is discarded.
     *
     * <p>If reading fails with an {@link IOException}, the error is printed and the
     * static arrays are left unchanged.
     *
     * @param outputPath           file written by {@link #fileOperation}
     * @param standardAnalysisPath file with one reference score per line
     * @return always {@code 0}; call {@link #Pearson} on the loaded arrays for the coefficient
     * @throws FileNotFoundException if either path does not exist or is a directory
     */
    public float PearsonFileOperation(String outputPath, String standardAnalysisPath)
            throws FileNotFoundException {
        File outputFile = new File(outputPath);
        if (!outputFile.exists() || outputFile.isDirectory()) {
            throw new FileNotFoundException(outputPath);
        }
        File standardAnalysisFile = new File(standardAnalysisPath);
        if (!standardAnalysisFile.exists() || standardAnalysisFile.isDirectory()) {
            throw new FileNotFoundException(standardAnalysisPath);
        }

        // The lists are instance state; without clearing, a second call would
        // append to the first call's data.
        comDataList.clear();
        userDataList.clear();

        try (BufferedReader scoreReader = new BufferedReader(new FileReader(outputFile));
             BufferedReader referenceReader =
                     new BufferedReader(new FileReader(standardAnalysisFile))) {
            String row;
            while ((row = scoreReader.readLine()) != null) {
                if (row.trim().isEmpty()) {
                    continue;
                }
                comDataList.add(row.split("\t")[0]);
            }
            while ((row = referenceReader.readLine()) != null) {
                if (row.trim().isEmpty()) {
                    continue;
                }
                userDataList.add(row);
            }
            comData = strToFloat(comDataList);
            userData = strToFloat(userDataList);
        } catch (FileNotFoundException e) {
            // Declared in the signature: let callers see it rather than the generic handler.
            throw e;
        } catch (IOException e) {
            System.out.println("错误");
            e.printStackTrace();
        }
        return 0;
    }
}

在做语义相似度分析时，上面的代码漏掉了一个很重要的点，就是要把平时常用词汇（停用词）去除掉，这是后来想起来的，懒得去加了，其实很简单，只需要再建立一个保存常用词汇的map，计算频数时，去掉常用词即可。

0 0