字符串相似度算法
来源:互联网 发布:js导出excel代码 编辑:程序博客网 时间:2024/05/01 14:20
原文:http://blog.csdn.net/guffey/article/details/6750494
2011-09-05 17:30 74人阅读 评论(0)收藏 举报
2011-09-05 17:30 74人阅读 评论(0)收藏 举报
字符串相似度算法( Levenshtein Distance算法)
题目: 一个字符串可以通过增加一个字符,删除一个字符,替换一个字符得到另外一个字符串,假设,我们把从字符串A转换成字符串B,前面3种操作所执行的最少次数称为AB相似度
如 abc adc 度为 1
ababababa babababab 度为 2
abcd acdb 度为2
字符串相似度可以使用 Levenshtein Distance 算法(中文翻译:编辑距离算法)来计算。该算法是由俄国科学家 Levenshtein 提出的,其步骤如下:
1. Set n to be the length of s.
   Set m to be the length of t.
   If n = 0, return m and exit.
   If m = 0, return n and exit.
   Construct a matrix containing 0..m rows and 0..n columns.
2. Initialize the first row to 0..n.
   Initialize the first column to 0..m.
3. Examine each character of s (i from 1 to n).
4. Examine each character of t (j from 1 to m).
5. If s[i] equals t[j], the cost is 0.
   If s[i] doesn't equal t[j], the cost is 1.
6. Set cell d[i,j] of the matrix equal to the minimum of:
   a. The cell immediately above plus 1: d[i-1,j] + 1.
   b. The cell immediately to the left plus 1: d[i,j-1] + 1.
   c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.
7. After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m].
C++实现如下
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
//算法
// Computes the Levenshtein (edit) distance between two strings.
//
// The result is the minimum number of single-character insertions, deletions
// and substitutions needed to turn `source` into `target`. Characters are
// compared as individual `char` values.
//
// Parameters:
//   source - the string to transform.
//   target - the string to obtain.
// Returns:
//   The edit distance; 0 when both strings are equal.
//
// Time and memory are O(n*m), where n and m are the two string lengths.
// NOTE(review): the lengths are converted to int, so inputs are assumed to be
// shorter than INT_MAX characters.
int ldistance(const std::string& source, const std::string& target)
{
    typedef std::string::size_type size_type;

    // step 1: an empty string is reached (or left) with one edit per character
    const size_type n = source.length();
    const size_type m = target.length();
    if (m == 0) return static_cast<int>(n);
    if (n == 0) return static_cast<int>(m);

    // Construct the matrix: matrix[i][j] is the distance between the first i
    // characters of source and the first j characters of target.
    std::vector< std::vector<int> > matrix(n + 1, std::vector<int>(m + 1, 0));

    // step 2: the first column and the first row describe conversions against
    // the empty prefix
    for (size_type i = 1; i <= n; ++i) matrix[i][0] = static_cast<int>(i);
    for (size_type j = 1; j <= m; ++j) matrix[0][j] = static_cast<int>(j);

    // step 3: every character of source
    for (size_type i = 1; i <= n; ++i)
    {
        const char si = source[i - 1];

        // step 4: every character of target
        for (size_type j = 1; j <= m; ++j)
        {
            // step 5: equal characters cost nothing, different ones cost 1
            const int cost = (si == target[j - 1]) ? 0 : 1;

            // step 6: take the cheapest of the three neighbouring cells
            const int above = matrix[i - 1][j] + 1;
            const int left = matrix[i][j - 1] + 1;
            const int diag = matrix[i - 1][j - 1] + cost;
            matrix[i][j] = std::min(above, std::min(left, diag));
        }
    }

    // step 7: the bottom-right cell holds the distance of the full strings
    return matrix[n][m];
}
// Interactive driver: reads two whitespace-delimited words from standard
// input and prints the edit distance between them.
// Returns 0 on success and 1 when either word cannot be read.
int main(){
    std::string s;
    std::string d;

    std::cout<<"source=";
    if(!(std::cin>>s)){
        std::cerr<<"error: could not read the source string"<<std::endl;
        return 1;
    }

    // The second word is the string the source is converted into.
    std::cout<<"target=";
    if(!(std::cin>>d)){
        std::cerr<<"error: could not read the target string"<<std::endl;
        return 1;
    }

    const int dist=ldistance(s,d);
    std::cout<<"dist="<<dist<<std::endl;
    return 0;
}
#include <iostream>
#include <vector>
#include <string>
using namespace std;
//算法
// Computes the Levenshtein (edit) distance between two strings.
//
// The result is the minimum number of single-character insertions, deletions
// and substitutions needed to turn `source` into `target`. Characters are
// compared as individual `char` values.
//
// Parameters:
//   source - the string to transform.
//   target - the string to obtain.
// Returns:
//   The edit distance; 0 when both strings are equal.
//
// Time and memory are O(n*m), where n and m are the two string lengths.
// NOTE(review): the lengths are converted to int, so inputs are assumed to be
// shorter than INT_MAX characters.
int ldistance(const std::string& source, const std::string& target)
{
    typedef std::string::size_type size_type;

    // step 1: an empty string is reached (or left) with one edit per character
    const size_type n = source.length();
    const size_type m = target.length();
    if (m == 0) return static_cast<int>(n);
    if (n == 0) return static_cast<int>(m);

    // Construct the matrix: matrix[i][j] is the distance between the first i
    // characters of source and the first j characters of target.
    std::vector< std::vector<int> > matrix(n + 1, std::vector<int>(m + 1, 0));

    // step 2: the first column and the first row describe conversions against
    // the empty prefix
    for (size_type i = 1; i <= n; ++i) matrix[i][0] = static_cast<int>(i);
    for (size_type j = 1; j <= m; ++j) matrix[0][j] = static_cast<int>(j);

    // step 3: every character of source
    for (size_type i = 1; i <= n; ++i)
    {
        const char si = source[i - 1];

        // step 4: every character of target
        for (size_type j = 1; j <= m; ++j)
        {
            // step 5: equal characters cost nothing, different ones cost 1
            const int cost = (si == target[j - 1]) ? 0 : 1;

            // step 6: take the cheapest of the three neighbouring cells
            const int above = matrix[i - 1][j] + 1;
            const int left = matrix[i][j - 1] + 1;
            const int diag = matrix[i - 1][j - 1] + cost;
            matrix[i][j] = std::min(above, std::min(left, diag));
        }
    }

    // step 7: the bottom-right cell holds the distance of the full strings
    return matrix[n][m];
}
// Interactive driver: reads two whitespace-delimited words from standard
// input and prints the edit distance between them.
// Returns 0 on success and 1 when either word cannot be read.
int main(){
    std::string s;
    std::string d;

    std::cout<<"source=";
    if(!(std::cin>>s)){
        std::cerr<<"error: could not read the source string"<<std::endl;
        return 1;
    }

    // The second word is the string the source is converted into.
    std::cout<<"target=";
    if(!(std::cin>>d)){
        std::cerr<<"error: could not read the target string"<<std::endl;
        return 1;
    }

    const int dist=ldistance(s,d);
    std::cout<<"dist="<<dist<<std::endl;
    return 0;
}
#include <vector>
#include <string>
using namespace std;
//算法
// Computes the Levenshtein (edit) distance between two strings.
//
// The result is the minimum number of single-character insertions, deletions
// and substitutions needed to turn `source` into `target`. Characters are
// compared as individual `char` values.
//
// Parameters:
//   source - the string to transform.
//   target - the string to obtain.
// Returns:
//   The edit distance; 0 when both strings are equal.
//
// Time and memory are O(n*m), where n and m are the two string lengths.
// NOTE(review): the lengths are converted to int, so inputs are assumed to be
// shorter than INT_MAX characters.
int ldistance(const std::string& source, const std::string& target)
{
    typedef std::string::size_type size_type;

    // step 1: an empty string is reached (or left) with one edit per character
    const size_type n = source.length();
    const size_type m = target.length();
    if (m == 0) return static_cast<int>(n);
    if (n == 0) return static_cast<int>(m);

    // Construct the matrix: matrix[i][j] is the distance between the first i
    // characters of source and the first j characters of target.
    std::vector< std::vector<int> > matrix(n + 1, std::vector<int>(m + 1, 0));

    // step 2: the first column and the first row describe conversions against
    // the empty prefix
    for (size_type i = 1; i <= n; ++i) matrix[i][0] = static_cast<int>(i);
    for (size_type j = 1; j <= m; ++j) matrix[0][j] = static_cast<int>(j);

    // step 3: every character of source
    for (size_type i = 1; i <= n; ++i)
    {
        const char si = source[i - 1];

        // step 4: every character of target
        for (size_type j = 1; j <= m; ++j)
        {
            // step 5: equal characters cost nothing, different ones cost 1
            const int cost = (si == target[j - 1]) ? 0 : 1;

            // step 6: take the cheapest of the three neighbouring cells
            const int above = matrix[i - 1][j] + 1;
            const int left = matrix[i][j - 1] + 1;
            const int diag = matrix[i - 1][j - 1] + cost;
            matrix[i][j] = std::min(above, std::min(left, diag));
        }
    }

    // step 7: the bottom-right cell holds the distance of the full strings
    return matrix[n][m];
}
// Interactive driver: reads two whitespace-delimited words from standard
// input and prints the edit distance between them.
// Returns 0 on success and 1 when either word cannot be read.
int main(){
    std::string s;
    std::string d;

    std::cout<<"source=";
    if(!(std::cin>>s)){
        std::cerr<<"error: could not read the source string"<<std::endl;
        return 1;
    }

    // The second word is the string the source is converted into.
    std::cout<<"target=";
    if(!(std::cin>>d)){
        std::cerr<<"error: could not read the target string"<<std::endl;
        return 1;
    }

    const int dist=ldistance(s,d);
    std::cout<<"dist="<<dist<<std::endl;
    return 0;
}
#include <iostream>
#include <vector>
#include <string>
using namespace std;
//算法
// Computes the Levenshtein (edit) distance between two strings.
//
// The result is the minimum number of single-character insertions, deletions
// and substitutions needed to turn `source` into `target`. Characters are
// compared as individual `char` values.
//
// Parameters:
//   source - the string to transform.
//   target - the string to obtain.
// Returns:
//   The edit distance; 0 when both strings are equal.
//
// Time and memory are O(n*m), where n and m are the two string lengths.
// NOTE(review): the lengths are converted to int, so inputs are assumed to be
// shorter than INT_MAX characters.
int ldistance(const std::string& source, const std::string& target)
{
    typedef std::string::size_type size_type;

    // step 1: an empty string is reached (or left) with one edit per character
    const size_type n = source.length();
    const size_type m = target.length();
    if (m == 0) return static_cast<int>(n);
    if (n == 0) return static_cast<int>(m);

    // Construct the matrix: matrix[i][j] is the distance between the first i
    // characters of source and the first j characters of target.
    std::vector< std::vector<int> > matrix(n + 1, std::vector<int>(m + 1, 0));

    // step 2: the first column and the first row describe conversions against
    // the empty prefix
    for (size_type i = 1; i <= n; ++i) matrix[i][0] = static_cast<int>(i);
    for (size_type j = 1; j <= m; ++j) matrix[0][j] = static_cast<int>(j);

    // step 3: every character of source
    for (size_type i = 1; i <= n; ++i)
    {
        const char si = source[i - 1];

        // step 4: every character of target
        for (size_type j = 1; j <= m; ++j)
        {
            // step 5: equal characters cost nothing, different ones cost 1
            const int cost = (si == target[j - 1]) ? 0 : 1;

            // step 6: take the cheapest of the three neighbouring cells
            const int above = matrix[i - 1][j] + 1;
            const int left = matrix[i][j - 1] + 1;
            const int diag = matrix[i - 1][j - 1] + cost;
            matrix[i][j] = std::min(above, std::min(left, diag));
        }
    }

    // step 7: the bottom-right cell holds the distance of the full strings
    return matrix[n][m];
}
// Interactive driver: reads two whitespace-delimited words from standard
// input and prints the edit distance between them.
// Returns 0 on success and 1 when either word cannot be read.
int main(){
    std::string s;
    std::string d;

    std::cout<<"source=";
    if(!(std::cin>>s)){
        std::cerr<<"error: could not read the source string"<<std::endl;
        return 1;
    }

    // The second word is the string the source is converted into.
    std::cout<<"target=";
    if(!(std::cin>>d)){
        std::cerr<<"error: could not read the target string"<<std::endl;
        return 1;
    }

    const int dist=ldistance(s,d);
    std::cout<<"dist="<<dist<<std::endl;
    return 0;
}
java 字符串编辑距离算法实现:
public static int getLevenshteinDistance (String s, String t) { if (s == null || t == null) { throw new IllegalArgumentException("Strings must not be null"); } /* The difference between this impl. and the previous is that, rather than creating and retaining a matrix of size s.length()+1 by t.length()+1, we maintain two single-dimensional arrays of length s.length()+1. The first, d, is the 'current working' distance array that maintains the newest distance cost counts as we iterate through the characters of String s. Each time we increment the index of String t we are comparing, d is copied to p, the second int[]. Doing so allows us to retain the previous cost counts as required by the algorithm (taking the minimum of the cost count to the left, up one, and diagonally up and to the left of the current cost count being calculated). (Note that the arrays aren't really copied anymore, just switched...this is clearly much better than cloning an array or doing a System.arraycopy() each time through the outer loop.) Effectively, the difference between the two implementations is this one does not cause an out of memory condition when calculating the LD over two very large strings. */ int n = s.length(); // length of s int m = t.length(); // length of t if (n == 0) { return m; } else if (m == 0) { return n; } int p[] = new int[n+1]; //'previous' cost array, horizontally int d[] = new int[n+1]; // cost array, horizontally int _d[]; //placeholder to assist in swapping p and d // indexes into strings s and t int i; // iterates through s int j; // iterates through t char t_j; // jth character of t int cost; // cost for (i = 0; i<=n; i++) { p[i] = i; } for (j = 1; j<=m; j++) { t_j = t.charAt(j-1); d[0] = j; for (i=1; i<=n; i++) { cost = s.charAt(i-1)==t_j ? 
0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); } // copy current distance counts to 'previous row' distance counts _d = p; p = d; d = _d; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return p[n];}字符串相似度=1-(编辑距离/(MAX(字符串1长度,字符串2的长度))
oracle 11提供了计算字符串编辑距离和相似度的函数:
参见http://psoug.org/reference/utl_match.html
Oracle UTL_MATCH
Version 11.1

General Information
The four functions included in the package use different methods to compare a source string and destination string, and return an assessment of what it would take to turn the source string into the destination string.
Source: $ORACLE_HOME/rdbms/admin/utlmatch.sql

EDIT_DISTANCE
Returns the number of changes required to turn the source string into the destination string using the Levenshtein Distance algorithm.
utl_match.edit_distance(s1 IN VARCHAR2, s2 IN VARCHAR2) RETURN PLS_INTEGER;
SELECT utl_match.edit_distance('expresso', 'espresso') DIST
FROM dual;

EDIT_DISTANCE_SIMILARITY
Returns an integer between 0 and 100, where 0 indicates no similarity at all and 100 indicates a perfect match.
utl_match.edit_distance_similarity(s1 IN VARCHAR2, s2 IN VARCHAR2) RETURN PLS_INTEGER;
SELECT utl_match.edit_distance_similarity('expresso', 'espresso') SIM
FROM dual;

JARO_WINKLER
Instead of simply calculating the number of steps required to change the source string to the destination string, determines how closely the two strings agree with each other and tries to take into account the possibility of a data entry error.
utl_match.jaro_winkler(s1 IN VARCHAR2, s2 IN VARCHAR2) RETURN BINARY_DOUBLE;
SELECT utl_match.jaro_winkler('expresso', 'espresso') DIST
FROM dual;

JARO_WINKLER_SIMILARITY
Returns an integer between 0 and 100, where 0 indicates no similarity at all and 100 indicates a perfect match but tries to take into account possible data entry errors.
utl_match.jaro_winkler_similarity(s1 IN VARCHAR2, s2 IN VARCHAR2) RETURN PLS_INTEGER;
SELECT utl_match.jaro_winkler_similarity('expresso', 'expresso') SIM
FROM dual;
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度算法
- 字符串相似度的算法
- 字符串相似度Levenshtein算法
- 字符串相似度算法介绍
- 字符串相似度算法介绍
- 字符串相似度算法介绍(整理)
- 字符串相似度算法(Levenshtein Distance)
- 字符串相似度算法介绍(整理)
- 比较两字符串相似度算法
- sublime text
- ios学习--Three20
- word中如何将所有字母一次修改成新罗马字体
- ArcSDE多服务都指向一个实例的解决方法
- inspiration 使用方法(二)——Symbol
- 字符串相似度算法
- servlet和jsp的区别,servlet和Action的区别,servlet的线程安全性
- linux c/c++ undefined reference to 'pthread_create' 未定义pthread_create问题
- 《高雄度谈话》笔记……
- java多线程备忘
- 设计模式遵循的七大原则
- 自动化测试学习笔记
- 几则小笑话引出的WEB用户体验问题
- C - char与signed char, unsigned char的区别