字符串相似度算法 levenshtein distance 编辑距离算法
来源:互联网 发布:h.323 端口 编辑:程序博客网 时间:2024/05/16 06:13
参考:http://www.merriampark.com/ld.htm#WHATIS
http://en.wikipedia.org/wiki/Levenshtein_distance
* Java
* C++
* Visual Basic
* Python
http://en.wikipedia.org/wiki/Levenshtein_distance
* Java
* C++
* Visual Basic
* Python
- Java
- public class Distance {
- //****************************
- // Get minimum of three values
- //****************************
- private int Minimum (int a, int b, int c) {
- int mi;
- mi = a;
- if (b < mi) {
- mi = b;
- }
- if (c < mi) {
- mi = c;
- }
- return mi;
- }
- //*****************************
- // Compute Levenshtein distance
- //*****************************
- public int LD (String s, String t) {
- int d[][]; // matrix
- int n; // length of s
- int m; // length of t
- int i; // iterates through s
- int j; // iterates through t
- char s_i; // ith character of s
- char t_j; // jth character of t
- int cost; // cost
- // Step 1
- n = s.length ();
- m = t.length ();
- if (n == 0) {
- return m;
- }
- if (m == 0) {
- return n;
- }
- d = new int[n+1][m+1];
- // Step 2
- for (i = 0; i <= n; i++) {
- d[i][0] = i;
- }
- for (j = 0; j <= m; j++) {
- d[0][j] = j;
- }
- // Step 3
- for (i = 1; i <= n; i++) {
- s_i = s.charAt (i - 1);
- // Step 4
- for (j = 1; j <= m; j++) {
- t_j = t.charAt (j - 1);
- // Step 5
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
- // Step 6
- d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);
- }
- }
- // Step 7
- return d[n][m];
- }
- }
- C++
- In C++, the size of an array must be a constant, and this code fragment causes an error at compile time:
- int sz = 5;
- int arr[sz];
- This limitation makes the following C++ code slightly more complicated than it would be if the matrix could simply be declared as a two-dimensional array, with a size determined at run-time.
- In C++ it's more idiomatic to use the System Template Library's vector class, as Anders Sewerin Johansen has done in an alternative C++ implementation.
- Here is the definition of the class (distance.h):
- class Distance
- {
- public:
- int LD (char const *s, char const *t);
- private:
- int Minimum (int a, int b, int c);
- int *GetCellPointer (int *pOrigin, int col, int row, int nCols);
- int GetAt (int *pOrigin, int col, int row, int nCols);
- void PutAt (int *pOrigin, int col, int row, int nCols, int x);
- };
- Here is the implementation of the class (distance.cpp):
- #include "distance.h"
- #include <string.h>
- #include <malloc.h>
- //****************************
- // Get minimum of three values
- //****************************
- int Distance::Minimum (int a, int b, int c)
- {
- int mi;
- mi = a;
- if (b < mi) {
- mi = b;
- }
- if (c < mi) {
- mi = c;
- }
- return mi;
- }
- //**************************************************
- // Get a pointer to the specified cell of the matrix
- //**************************************************
- int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols)
- {
- return pOrigin + col + (row * (nCols + 1));
- }
- //*****************************************************
- // Get the contents of the specified cell in the matrix
- //*****************************************************
- int Distance::GetAt (int *pOrigin, int col, int row, int nCols)
- {
- int *pCell;
- pCell = GetCellPointer (pOrigin, col, row, nCols);
- return *pCell;
- }
- //*******************************************************
- // Fill the specified cell in the matrix with the value x
- //*******************************************************
- void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x)
- {
- int *pCell;
- pCell = GetCellPointer (pOrigin, col, row, nCols);
- *pCell = x;
- }
- //*****************************
- // Compute Levenshtein distance
- //*****************************
- int Distance::LD (char const *s, char const *t)
- {
- int *d; // pointer to matrix
- int n; // length of s
- int m; // length of t
- int i; // iterates through s
- int j; // iterates through t
- char s_i; // ith character of s
- char t_j; // jth character of t
- int cost; // cost
- int result; // result
- int cell; // contents of target cell
- int above; // contents of cell immediately above
- int left; // contents of cell immediately to left
- int diag; // contents of cell immediately above and to left
- int sz; // number of cells in matrix
- // Step 1
- n = strlen (s);
- m = strlen (t);
- if (n == 0) {
- return m;
- }
- if (m == 0) {
- return n;
- }
- sz = (n+1) * (m+1) * sizeof (int);
- d = (int *) malloc (sz);
- // Step 2
- for (i = 0; i <= n; i++) {
- PutAt (d, i, 0, n, i);
- }
- for (j = 0; j <= m; j++) {
- PutAt (d, 0, j, n, j);
- }
- // Step 3
- for (i = 1; i <= n; i++) {
- s_i = s[i-1];
- // Step 4
- for (j = 1; j <= m; j++) {
- t_j = t[j-1];
- // Step 5
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
- // Step 6
- above = GetAt (d,i-1,j, n);
- left = GetAt (d,i, j-1, n);
- diag = GetAt (d, i-1,j-1, n);
- cell = Minimum (above + 1, left + 1, diag + cost);
- PutAt (d, i, j, n, cell);
- }
- }
- // Step 7
- result = GetAt (d, n, m, n);
- free (d);
- return result;
- }
- Visual Basic
- '*******************************
- '*** Get minimum of three values
- '*******************************
- Private Function Minimum(ByVal a As Integer, _
- ByVal b As Integer, _
- ByVal c As Integer) As Integer
- Dim mi As Integer
- mi = a
- If b < mi Then
- mi = b
- End If
- If c < mi Then
- mi = c
- End If
- Minimum = mi
- End Function
- '********************************
- '*** Compute Levenshtein Distance
- '********************************
- Public Function LD(ByVal s As String, ByVal t As String) As Integer
- Dim d() As Integer ' matrix
- Dim m As Integer ' length of t
- Dim n As Integer ' length of s
- Dim i As Integer ' iterates through s
- Dim j As Integer ' iterates through t
- Dim s_i As String ' ith character of s
- Dim t_j As String ' jth character of t
- Dim cost As Integer ' cost
- ' Step 1
- n = Len(s)
- m = Len(t)
- If n = 0 Then
- LD = m
- Exit Function
- End If
- If m = 0 Then
- LD = n
- Exit Function
- End If
- ReDim d(0 To n, 0 To m) As Integer
- ' Step 2
- For i = 0 To n
- d(i, 0) = i
- Next i
- For j = 0 To m
- d(0, j) = j
- Next j
- ' Step 3
- For i = 1 To n
- s_i = Mid$(s, i, 1)
- ' Step 4
- For j = 1 To m
- t_j = Mid$(t, j, 1)
- ' Step 5
- If s_i = t_j Then
- cost = 0
- Else
- cost = 1
- End If
- ' Step 6
- d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost)
- Next j
- Next i
- ' Step 7
- LD = d(n, m)
- Erase d
- End Function
Javapublic class Distance { //**************************** // Get minimum of three values //**************************** private int Minimum (int a, int b, int c) { int mi; mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi; } //***************************** // Compute Levenshtein distance //***************************** public int LD (String s, String t) { int d[][]; // matrix int n; // length of s int m; // length of t int i; // iterates through s int j; // iterates through t char s_i; // ith character of s char t_j; // jth character of t int cost; // cost // Step 1 n = s.length (); m = t.length (); if (n == 0) { return m; } if (m == 0) { return n; } d = new int[n+1][m+1]; // Step 2 for (i = 0; i <= n; i++) { d[i][0] = i; } for (j = 0; j <= m; j++) { d[0][j] = j; } // Step 3 for (i = 1; i <= n; i++) { s_i = s.charAt (i - 1); // Step 4 for (j = 1; j <= m; j++) { t_j = t.charAt (j - 1); // Step 5 if (s_i == t_j) { cost = 0; } else { cost = 1; } // Step 6 d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost); } } // Step 7 return d[n][m]; }}C++In C++, the size of an array must be a constant, and this code fragment causes an error at compile time:int sz = 5;int arr[sz];This limitation makes the following C++ code slightly more complicated than it would be if the matrix could simply be declared as a two-dimensional array, with a size determined at run-time.In C++ it's more idiomatic to use the System Template Library's vector class, as Anders Sewerin Johansen has done in an alternative C++ implementation.Here is the definition of the class (distance.h):class Distance{ public: int LD (char const *s, char const *t); private: int Minimum (int a, int b, int c); int *GetCellPointer (int *pOrigin, int col, int row, int nCols); int GetAt (int *pOrigin, int col, int row, int nCols); void PutAt (int *pOrigin, int col, int row, int nCols, int x);}; Here is the implementation of the class (distance.cpp):#include "distance.h"#include <string.h>#include <malloc.h>//****************************// Get minimum of three values//****************************int Distance::Minimum (int a, int b, int c){int mi; mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi;}//**************************************************// Get a pointer to the specified cell of the matrix//************************************************** int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols){ return pOrigin + col + (row * (nCols + 1));}//*****************************************************// Get the contents of the specified cell in the matrix //*****************************************************int Distance::GetAt (int *pOrigin, int col, int row, int nCols){int *pCell; pCell = GetCellPointer (pOrigin, col, row, nCols); return *pCell;}//*******************************************************// Fill the specified cell in the matrix with the value x//*******************************************************void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x){int *pCell; pCell = GetCellPointer (pOrigin, col, row, nCols); *pCell = x;}//*****************************// Compute Levenshtein distance//*****************************int Distance::LD (char const *s, char const *t){int *d; // pointer to matrixint n; // length of sint m; // length of tint i; // iterates through sint j; // iterates through tchar s_i; // ith character of schar t_j; // jth character of tint cost; // costint result; // resultint cell; // contents of target cellint above; // contents of cell immediately aboveint left; // contents of cell immediately to leftint diag; // contents of cell immediately above and to leftint sz; // number of cells in matrix // Step 1 n = strlen (s); m = strlen (t); if (n == 0) { return m; } if (m == 0) { return n; } sz = (n+1) * (m+1) * sizeof (int); d = (int *) malloc (sz); // Step 2 for (i = 0; i <= n; i++) { PutAt (d, i, 0, n, i); } for (j = 0; j <= m; j++) { PutAt (d, 0, j, n, j); } // Step 3 for (i = 1; i <= n; i++) { s_i = s[i-1]; // Step 4 for (j = 1; j <= m; j++) { t_j = t[j-1]; // Step 5 if (s_i == t_j) { cost = 0; } else { cost = 1; } // Step 6 above = GetAt (d,i-1,j, n); left = GetAt (d,i, j-1, n); diag = GetAt (d, i-1,j-1, n); cell = Minimum (above + 1, left + 1, diag + cost); PutAt (d, i, j, n, cell); } } // Step 7 result = GetAt (d, n, m, n); free (d); return result;}Visual Basic'*******************************'*** Get minimum of three values'*******************************Private Function Minimum(ByVal a As Integer, _ ByVal b As Integer, _ ByVal c As Integer) As IntegerDim mi As Integer mi = a If b < mi Then mi = b End If If c < mi Then mi = c End If Minimum = mi End Function'********************************'*** Compute Levenshtein Distance'********************************Public Function LD(ByVal s As String, ByVal t As String) As IntegerDim d() As Integer ' matrixDim m As Integer ' length of tDim n As Integer ' length of sDim i As Integer ' iterates through sDim j As Integer ' iterates through tDim s_i As String ' ith character of sDim t_j As String ' jth character of tDim cost As Integer ' cost ' Step 1 n = Len(s) m = Len(t) If n = 0 Then LD = m Exit Function End If If m = 0 Then LD = n Exit Function End If ReDim d(0 To n, 0 To m) As Integer ' Step 2 For i = 0 To n d(i, 0) = i Next i For j = 0 To m d(0, j) = j Next j ' Step 3 For i = 1 To n s_i = Mid$(s, i, 1) ' Step 4 For j = 1 To m t_j = Mid$(t, j, 1) ' Step 5 If s_i = t_j Then cost = 0 Else cost = 1 End If ' Step 6 d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost) Next j Next i ' Step 7 LD = d(n, m) Erase dEnd Function
- #!/user/bin/env python
- # -*- coding: utf-8 -*-
- class arithmetic():
- def __init__(self):
- pass
- ''''' 【编辑距离算法】 【levenshtein distance】 【字符串相似度算法】 '''
- def levenshtein(self,first,second):
- if len(first) > len(second):
- first,second = second,first
- if len(first) == 0:
- return len(second)
- if len(second) == 0:
- return len(first)
- first_length = len(first) + 1
- second_length = len(second) + 1
- distance_matrix = [range(second_length) for x in range(first_length)]
- #print distance_matrix
- for i in range(1,first_length):
- for j in range(1,second_length):
- deletion = distance_matrix[i-1][j] + 1
- insertion = distance_matrix[i][j-1] + 1
- substitution = distance_matrix[i-1][j-1]
- if first[i-1] != second[j-1]:
- substitution += 1
- distance_matrix[i][j] = min(insertion,deletion,substitution)
- print distance_matrix
- return distance_matrix[first_length-1][second_length-1]
- if __name__ == "__main__":
- arith = arithmetic()
- print arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa'
- 字符串相似度算法 levenshtein distance 编辑距离算法
- 字符串相似度算法(编辑距离算法 Levenshtein Distance)
- 字符串相似度算法 levenshtein distance 编辑距离算法
- 字符串相似度算法 -- levenshtein distance 编辑距离算法
- 字符串相似度算法(编辑距离算法 Levenshtein Distance)
- 字符串相似度算法(编辑距离Levenshtein Distance)
- java两字符串相似度计算算法——Levenshtein distance编辑距离算法
- [转]字符串相似度算法(编辑距离算法 Levenshtein Distance)[附c#,asp源码]
- 用C#实现字符串相似度算法(编辑距离算法 Levenshtein Distance)
- 编辑距离算法 Levenshtein Distance
- 编辑距离算法(Levenshtein distance)
- [随笔]初步了解 Levenshtein Distance (Edit Distance) 编辑距离,字符相似度算法
- 字符串相似度算法(Levenshtein Distance)
- java文本相似度计算(Levenshtein Distance算法(中文翻译:编辑距离算法))----代码和详解
- java文本相似度计算(Levenshtein Distance算法(中文翻译:编辑距离算法))----代码和详解
- levenshtein字符串编辑距离算法
- 字符串相似度算法( Levenshtein Distance算法)(zz)
- 字符串相似度算法( Levenshtein Distance算法)
- Hibernate实现Clob和Blob对象的存取
- 拦截器
- MP4系列之--如何获取mp4文件信息
- Android 百度地图开发之地图不刷新问题解决
- 让你学会Linux计划任务
- 字符串相似度算法 levenshtein distance 编辑距离算法
- linux Kill 多个进程
- No Hibernate Session bound to thread, and configuration does not allow 解决办法
- MediaElement SetSource Access Violation
- 文本编程
- mianshi-SQL Server LOCK, ISOLATION LEVEL
- xPath与命名空间与动态加载图片
- Android 和iOS 比较之我见
- Linux内核Makefile文件