基于字的文本相似度算法——余弦定理

来源：互联网　发布：ps软件基础教程　编辑：程序博客网　时间：2024/06/05 07:43

一、算法原理

基于字的文本相似度余弦定理算法的原理是：

（1）分别统计两个比较文本中所有字出现的频率，从而得出两个文本对应的向量
（2）利用余弦定理计算这两个向量的夹角余弦值

（3）根据自行设置的阈值判断两个文本是否相似。

二、算法的C++实现

这里引用的StringUtil.hpp文件引自：

https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp

/* * CosineSimilarity.hpp * *  Created: 2016年10月2日 *   Author: tang */#ifndef SRC_COSINE_SIMILARITY_HPP_#define SRC_COSINE_SIMILARITY_HPP_#include <iostream>#include <vector>#include <map>#include <math.h>#include "StringUtil.hpp"using namespace std;class CosineSimilarity{public:CosineSimilarity(){}double CalculateTextSimilarity(string &str1,string &str2){vector<uint16_t> words_for_str1;vector<uint16_t> words_for_str2;vector<uint16_t>::iterator it;if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) || !utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) ){cout<<"TransCode Error"<<endl;return 0.;}map< uint16_t,pair<int,int> >seq_map;map< uint16_t,pair<int,int> >::iterator map_it;for(it=words_for_str1.begin();it!=words_for_str1.end();++it){if(isHanzi(*it)){map_it=seq_map.find(*it);if(map_it!=seq_map.end()){map_it->second.first++;}else{pair<int,int> seq;seq.first=1;seq.second=0;seq_map[*it]=seq;}}}for(it=words_for_str2.begin();it!=words_for_str2.end();++it)                {if(isHanzi(*it))                        {                                map_it=seq_map.find(*it);                                        if(map_it!=seq_map.end())                                {                                        map_it->second.second++;                                }                                else                                {pair<int,int> seq;                                        seq.first=0;                                        seq.second=1;                                        seq_map[*it]=seq;                                }                        }                }double sqdoc1 = 0.;                 double sqdoc2 = 0.;                 double denominator = 0.;for(map_it=seq_map.begin();map_it!=seq_map.end();++map_it){pair<int,int> c=map_it->second;denominator +=(c.first * c.second);sqdoc1+=(c.first * c.first);sqdoc2+=(c.second * c.second);} if(0==sqdoc1 * sqdoc2)return -1.0;return denominator/sqrt(sqdoc1 * sqdoc2);}bool codeFilter(int code) {   
     if ((code < 0x4e00 || code > 0x9fa5) && !(code >= '0' && code <= '9') && !(code >= 'a' && code <= 'z') && !(code >= 'A' && code <= 'Z'))            return false;                return true;}bool isHanzi(uint16_t ch){return (ch >= 0x4E00 && ch <= 0x9FA5);}};

三、算法的Java实现

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Character-level cosine similarity for Chinese text.
 *
 * <p>Each document is reduced to a frequency vector over the Chinese
 * characters it contains (only characters in U+4E00..U+9FA5 that are also
 * encodable as two bytes in GB2312 are counted); the similarity is the cosine
 * of the angle between the two vectors.
 */
public class CosineSimilarity {

    /** Returned when a document contributes no countable character (zero vector). */
    private static final double UNDEFINED_SIMILARITY = -1.0;

    /** GB2312 double-byte codes start at 0xA1 in both bytes. */
    private static final int GB2312_BYTE_OFFSET = 0xA1;

    /** Number of positions per GB2312 row, used to flatten (row, column) into one index. */
    private static final int GB2312_ROW_SIZE = 94;

    /** GB2312 charset resolved once, or {@code null} when this JVM does not provide it. */
    private static final Charset GB2312 = lookupGb2312();

    /**
     * Computes the cosine similarity of the character-frequency vectors of two texts.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return the cosine of the two frequency vectors, or {@code -1.0} when at
     *         least one text contains no countable Chinese character (the
     *         cosine is undefined for a zero vector)
     * @throws NullPointerException if either text is {@code null} or contains
     *         only whitespace
     */
    public double CalculateTextSim(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0
                || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }

        // GB2312 index -> {occurrences in doc1, occurrences in doc2}
        Map<Integer, int[]> frequencies = new HashMap<Integer, int[]>();
        countCharacters(doc1, 0, frequencies);
        countCharacters(doc2, 1, frequencies);

        double sqdoc1 = 0;
        double sqdoc2 = 0;
        double dotProduct = 0;
        for (int[] counts : frequencies.values()) {
            // Widen before multiplying so large counts cannot overflow int.
            double first = counts[0];
            double second = counts[1];
            dotProduct += first * second;
            sqdoc1 += first * first;
            sqdoc2 += second * second;
        }

        // Without this guard the division below would be 0/0 and yield NaN.
        if (sqdoc1 == 0 || sqdoc2 == 0) {
            return UNDEFINED_SIMILARITY;
        }
        return dotProduct / Math.sqrt(sqdoc1 * sqdoc2);
    }

    /**
     * Tells whether a character lies in the range U+4E00..U+9FA5.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is in that range, {@code false} otherwise
     */
    public boolean isHanZi(char ch) {
        return (ch >= 0x4E00 && ch <= 0x9FA5);
    }

    /**
     * Maps a character to its flattened position in the GB2312 double-byte table.
     *
     * @param ch the character to look up
     * @return {@code (b0 - 0xA1) * 94 + (b1 - 0xA1)} for the two GB2312 bytes
     *         {@code b0}, {@code b1} of {@code ch}; {@code -1} when the
     *         character does not encode to exactly two bytes or when the
     *         GB2312 charset is unavailable
     */
    public static short getGB2312Id(char ch) {
        if (GB2312 == null) {
            return -1;
        }
        byte[] buffer = Character.toString(ch).getBytes(GB2312);
        if (buffer.length != 2) {
            // Anything that is not a two-byte sequence is treated as unknown.
            return -1;
        }
        int b0 = (buffer[0] & 0xFF) - GB2312_BYTE_OFFSET;
        int b1 = (buffer[1] & 0xFF) - GB2312_BYTE_OFFSET;
        return (short) (b0 * GB2312_ROW_SIZE + b1);
    }

    /** Adds the countable characters of {@code doc} to column {@code slot} of the table. */
    private void countCharacters(String doc, int slot, Map<Integer, int[]> frequencies) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            int charIndex = getGB2312Id(ch);
            if (charIndex == -1) {
                continue;
            }
            Integer key = Integer.valueOf(charIndex);
            int[] counts = frequencies.get(key);
            if (counts == null) {
                counts = new int[2];
                frequencies.put(key, counts);
            }
            counts[slot]++;
        }
    }

    /** Resolves the GB2312 charset once; warns on stderr and yields null if it is missing. */
    private static Charset lookupGb2312() {
        if (Charset.isSupported("GB2312")) {
            return Charset.forName("GB2312");
        }
        System.err.println("CosineSimilarity: charset GB2312 is not supported by this JVM; "
                + "every character will be treated as unknown");
        return null;
    }
}

1 0