编辑距离(Levenshtein Distance)
来源:互联网 发布:alias软件购买 编辑:程序博客网 时间:2024/05/21 14:01
http://www.cnitblog.com/ictfly/archive/2005/12/27/5828.aspx
搞自然语言处理的应该不会对这个概念感到陌生,编辑距离就是用来计算从原串(s)转换到目标串(t)所需要的最少的插入,删除和替换的数目,在NLP中应用比较广泛,如一些评测方法中就用到了(wer,mWer等),同时也常用来计算你对原文本所作的改动数。
编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫Levenshtein Distance。
Levenshtein distance (LD) is a measure of the similarity between two strings, which we will refer to as the source string (s) and the target string (t). The distance is the number of deletions, insertions, or substitutions required to transform s into t. For example,
- If s is "test" and t is "test", then LD(s,t) = 0, because no transformations are needed. The strings are already identical.
- If s is "test" and t is "tent", then LD(s,t) = 1, because one substitution (change "s" to "n") is sufficient to transform s into t.
The greater the Levenshtein distance, the more different the strings are.
Levenshtein distance is named after the Russian scientist Vladimir Levenshtein, who devised the algorithm in 1965. If you can't spell or pronounce Levenshtein, the metric is also sometimes called edit distance.
The Levenshtein distance algorithm has been used in:
- Spell checking
- Speech recognition
- DNA analysis
- Plagiarism detection
The Algorithm
Steps
Step
Description
1
Set n to be the length of s.
Set m to be the length of t.
If n = 0, return m and exit.
If m = 0, return n and exit.
Construct a matrix containing 0..m rows and 0..n columns.
2
Initialize the first row to 0..n.
Initialize the first column to 0..m.
3
Examine each character of s (i from 1 to n).
4
Examine each character of t (j from 1 to m).
5
If s[i] equals t[j], the cost is 0.
If s[i] doesn't equal t[j], the cost is 1.
6
Set cell d[i,j] of the matrix equal to the minimum of:
a. The cell immediately above plus 1: d[i-1,j] + 1.
b. The cell immediately to the left plus 1: d[i,j-1] + 1.
c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.
7
After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m].
Example
This section shows how the Levenshtein distance is computed when the source string is "GUMBO" and the target string is "GAMBOL".
Steps 1 and 2
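|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 |   |   |   |   |   |
| A | 2 |   |   |   |   |   |
| M | 3 |   |   |   |   |   |
| B | 4 |   |   |   |   |   |
| O | 5 |   |   |   |   |   |
| L | 6 |   |   |   |   |   |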
Steps 3 to 6 When i = 1
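|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 | 0 |   |   |   |   |
| A | 2 | 1 |   |   |   |   |
| M | 3 | 2 |   |   |   |   |
| B | 4 | 3 |   |   |   |   |
| O | 5 | 4 |   |   |   |   |
| L | 6 | 5 |   |   |   |   |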
Steps 3 to 6 When i = 2
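|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 | 0 | 1 |   |   |   |
| A | 2 | 1 | 1 |   |   |   |
| M | 3 | 2 | 2 |   |   |   |
| B | 4 | 3 | 3 |   |   |   |
| O | 5 | 4 | 4 |   |   |   |
| L | 6 | 5 | 5 |   |   |   |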
Steps 3 to 6 When i = 3
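|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 | 0 | 1 | 2 |   |   |
| A | 2 | 1 | 1 | 2 |   |   |
| M | 3 | 2 | 2 | 1 |   |   |
| B | 4 | 3 | 3 | 2 |   |   |
| O | 5 | 4 | 4 | 3 |   |   |
| L | 6 | 5 | 5 | 4 |   |   |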
Steps 3 to 6 When i = 4
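|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 | 0 | 1 | 2 | 3 |   |
| A | 2 | 1 | 1 | 2 | 3 |   |
| M | 3 | 2 | 2 | 1 | 2 |   |
| B | 4 | 3 | 3 | 2 | 1 |   |
| O | 5 | 4 | 4 | 3 | 2 |   |
| L | 6 | 5 | 5 | 4 | 3 |   |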
Steps 3 to 6 When i = 5
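|   |   | G | U | M | B | O |
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| G | 1 | 0 | 1 | 2 | 3 | 4 |
| A | 2 | 1 | 1 | 2 | 3 | 4 |
| M | 3 | 2 | 2 | 1 | 2 | 3 |
| B | 4 | 3 | 3 | 2 | 1 | 2 |
| O | 5 | 4 | 4 | 3 | 2 | 1 |
| L | 6 | 5 | 5 | 4 | 3 | 2 |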
Step 7
The distance is in the lower right hand corner of the matrix, i.e. 2. This corresponds to our intuitive realization that "GUMBO" can be transformed into "GAMBOL" by substituting "A" for "U" and adding "L" (one substitution and 1 insertion = 2 changes).
由于我在实际应用中要处理中文,而每个汉字在内存中占两个字节,如果单纯用上述算法逐字节进行比较,就会出现一些容易被人忽视的微小错误。例如汉字“啊”和“阿”有一个字节相同、一个字节不同,用上述算法统计出的更改数除以2就会出现“半个字”。所以,对汉英混合文本统计更改数时,需先判断当前进行比较的两个字符是汉字还是西文字母,然后填写一个代价矩阵;填写时,如果是汉字,要把其相邻两个字节对应的代价矩阵单元赋为同一个值。具体做法请看代码:
/*
 * LD - Levenshtein distance between s and t, computed over bytes, with a
 * substitution-cost matrix that tries to treat the two bytes of a
 * double-byte character (e.g. a Chinese character) as a single unit.
 *
 * Parameters:
 *   s - source string, NUL-terminated.
 *   t - target string, NUL-terminated.
 *
 * Returns:
 *   The value found in cell d[n, m] of the distance matrix, where n and m are
 *   the byte lengths of s and t.  If either string is empty the length of the
 *   other one is returned.  Returns -1 if the matrices cannot be allocated.
 *
 * How the cost matrix works:
 *   Every cost cell starts at the sentinel value 2 ("not decided yet").  The
 *   first time a cell (i, j) is visited its cost (0 or 1) is decided and, when
 *   the byte of s and/or t at that position is not printable ASCII, the same
 *   cost is copied into the still-undecided neighbouring cell(s) that belong
 *   to the following byte, so both bytes of a double-byte character share one
 *   cost.
 *
 * GetAt, PutAt and Minimum are defined elsewhere.  From the calls below they
 * are used as GetAt(matrix, col, row, n), PutAt(matrix, col, row, n, value)
 * and Minimum(a, b, c) - NOTE(review): confirm against their definitions.
 *
 * NOTE(review): a byte counts as "ASCII" only when it lies in ' '..'~', so
 * control characters such as '\t' or '\n' take the double-byte path; nothing
 * here tracks whether a non-ASCII byte is the first or the second byte of a
 * character.  Both behaviours are kept unchanged from the original.
 */
int LD(const char *s, const char *t)
{
    enum { COST_UNSET = 2 };  // sentinel: cost cell has not been decided yet

    int *d;       // pointer to distance matrix
    int *cost;    // pointer to substitution-cost matrix
    int n;        // length of s
    int m;        // length of t
    int i;        // iterates through s
    int j;        // iterates through t
    int result;   // value returned to the caller
    size_t cells; // number of cells in each matrix

    // Step 1
    n = strlen (s);
    m = strlen (t);
    if (n == 0)
    {
        return m;
    }
    if (m == 0)
    {
        return n;
    }

    // Refuse sizes whose byte count would not fit in size_t.
    if ((size_t) m + 1 > ((size_t) -1 / sizeof (int)) / ((size_t) n + 1))
    {
        return -1;
    }
    cells = ((size_t) n + 1) * ((size_t) m + 1);

    // The casts keep the code valid when compiled as C++ as well.
    d = (int *) malloc (cells * sizeof (int));
    cost = (int *) malloc (cells * sizeof (int));
    if (d == NULL || cost == NULL)
    {
        free (d);
        free (cost);
        return -1;
    }

    // Step 2
    for (i = 0; i <= n; i++)
    {
        PutAt (d, i, 0, n, i);
    }
    for (j = 0; j <= m; j++)
    {
        PutAt (d, 0, j, n, j);
    }
    // Mark every cost cell as undecided so later code can tell whether a
    // cell has already been assigned.
    for (int g = 0; g <= m; g++)
    {
        for (int h = 0; h <= n; h++)
        {
            PutAt (cost, h, g, n, COST_UNSET);
        }
    }

    // Step 3
    for (i = 1; i <= n; i++)
    {
        char s_i1 = s[i-1];  // ith byte of s
        char s_i2 = s[i];    // byte after it (the terminating NUL when i == n)
        // true when the byte is printable ASCII (punctuation, digit, letter)
        bool s_ascii = (s_i1 >= ' ' && s_i1 <= '~');

        // Step 4
        for (j = 1; j <= m; j++)
        {
            char t_j1 = t[j-1];  // jth byte of t
            char t_j2 = t[j];    // byte after it (the terminating NUL when j == m)
            bool t_ascii = (t_j1 >= ' ' && t_j1 <= '~');
            int value;    // cost this position would get if still undecided
            int curcost;  // cost actually used for cell (i, j)
            int above;    // contents of cell d[i-1, j]
            int left;     // contents of cell d[i, j-1]
            int diag;     // contents of cell d[i-1, j-1]

            // Step 5
            if (s_ascii && t_ascii)
            {
                // two single-byte characters: compare one byte
                value = (s_i1 == t_j1) ? 0 : 1;
            }
            else if (!s_ascii && !t_ascii)
            {
                // both non-ASCII: equal only if both bytes of the pair match
                value = (s_i1 == t_j1 && s_i2 == t_j2) ? 0 : 1;
            }
            else
            {
                // one ASCII, one non-ASCII: always a mismatch
                value = 1;
            }

            curcost = GetAt (cost, i, j, n);
            if (curcost == COST_UNSET)
            {
                curcost = value;
                PutAt (cost, i, j, n, value);

                // Propagate the decision to the cells of the following byte,
                // but never outside the (n+1) x (m+1) matrix: the original
                // code touched column n+1 / row m+1 here, which is past the
                // end of the allocation.
                if (!s_ascii && i < n
                    && GetAt (cost, i+1, j, n) == COST_UNSET)
                {
                    PutAt (cost, i+1, j, n, value);
                }
                if (!t_ascii && j < m
                    && GetAt (cost, i, j+1, n) == COST_UNSET)
                {
                    PutAt (cost, i, j+1, n, value);
                }
                if (!s_ascii && !t_ascii && i < n && j < m
                    && GetAt (cost, i+1, j+1, n) == COST_UNSET)
                {
                    PutAt (cost, i+1, j+1, n, value);
                }
            }

            // Step 6
            above = GetAt (d, i-1, j, n);
            left = GetAt (d, i, j-1, n);
            diag = GetAt (d, i-1, j-1, n);
            PutAt (d, i, j, n, Minimum (above + 1, left + 1, diag + curcost));
        }
    }

    // Step 7
    result = GetAt (d, n, m, n);
    free (d);
    free (cost);  // the original leaked this matrix on every call
    return result;
}
- 编辑距离(Levenshtein Distance)
- 编辑距离(Levenshtein Distance)
- Levenshtein Distance(编辑距离)
- 编辑距离(Edit Distance | Levenshtein距离)
- 编辑距离(Levenshtein Distance) (转)
- 编辑距离算法 Levenshtein Distance
- 编辑距离算法(Levenshtein distance)
- Minimum edit distance(levenshtein distance)(最小编辑距离)初探
- 字符串相似度算法(编辑距离算法 Levenshtein Distance)
- Java算法之Levenshtein Distance(编辑距离)算法
- iNLP源代码之编辑距离算法(Levenshtein distance)
- 字符串相似度算法(编辑距离算法 Levenshtein Distance)
- Levenshtein(编辑) 距离
- Levenshtein distance最小编辑距离算法实现
- Levenshtein distance最小编辑距离算法实现
- 最短编辑距离问题 : Levenshtein Distance
- Levenshtein distance最小编辑距离算法实现
- Levenshtein距离(编辑距离)
- poj 1988 Cube Stacking
- Makefile 中几种等号的用法
- BW: How To Delete Workbook
- 类的初始化列表
- LINQ查询一
- 编辑距离(Levenshtein Distance)
- LoadRunner 下载
- 似て非なる言葉「オブジェクト」と「エンティティ」の関係
- Capture video from USB by using OpenCV videoInput
- 安装RFT 8.1安装过程失败,提示:JVMJ9VM015W 初始化实例失败的解决方法
- MVC框架中的模型-视图分离问题(五) —— 分离之评测
- OBJ文件格式分析(一)
- AddressBook 联系人字母排序法
- OBJ文件格式(二)