面试例题-3 编辑距离

来源:互联网 发布:跳跃网络账号注册 编辑:程序博客网 时间:2024/06/06 17:55

编辑距离(Edit Distance),又称Levenshtein距离,是指两个字串之间,由一个转成另一个所需的最少编辑操作次数。许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。一般来说,编辑距离越小,两个串的相似度越大。
例如将kitten转成sitting:
kitten->sitten (k→s)
sitten->sittin (e→i)
sittin->sitting (插入g)
算法思想:
比如要计算cafe和coffee的编辑距离。cafe→caffe→coffe→coffee
先创建一个6×8的表(cafe长度为4,coffee长度为6,各加2)
  
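|   |   | c | o | f | f | e | e |
|   |   |   |   |   |   |   |   |
| c |   |   |   |   |   |   |   |
| a |   |   |   |   |   |   |   |
| f |   |   |   |   |   |   |   |
| e |   |   |   |   |   |   |   |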
 表1
接着,在如下位置填入数字(表2):
  
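|   |   | c | o | f | f | e | e |
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
| c | 1 |   |   |   |   |   |   |
| a | 2 |   |   |   |   |   |   |
| f | 3 |   |   |   |   |   |   |
| e | 4 |   |   |   |   |   |   |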
表2

从第3行第3列的格子(3,3格)开始,逐格计算。每一格取以下三个值的最小值:
  • 如果最上方的字符等于最左方的字符,则为左上方的数字。否则为左上方的数字+1。(对于3,3来说为0)
  • 左方数字+1(对于3,3格来说为2)
  • 上方数字+1(对于3,3格来说为2)
   
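|   |   | c | o | f | f | e | e |
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
| c | 1 | 0 | 1 | 2 | 3 | 4 | 5 |
| a | 2 | 1 | 1 | 2 | 3 | 4 | 5 |
| f | 3 | 2 | 2 | 1 | 2 | 3 | 4 |
| e | 4 | 3 | 3 | 2 | 2 | 2 | 3 |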

c++

// Levenshtein.cpp : console program that reads pairs of whitespace-separated
// words from standard input and prints the edit distance of each pair.
//
// NOTE: the original listing started with `#include "stdafx.h"`, the MSVC
// precompiled-header stub. Nothing below depends on it, so it is omitted to
// keep the file portable; re-add it as the first include if your project has
// precompiled headers enabled.
#include <stdio.h>
#include <string.h>
#include <vector>

// Input buffers shared by main() and Levenshtein().
// Each holds at most 999 characters plus the terminating NUL.
char s1[1000], s2[1000];

// Returns the smallest of the three arguments.
int min(int a, int b, int c)
{
    int temp = a < b ? a : b;
    return temp < c ? temp : c;
}

// Computes the edit distance between the first len1 characters of the global
// buffer s1 and the first len2 characters of the global buffer s2, and prints
// it to stdout.
//
// d[i][j] holds the distance between s1[0..i) and s2[0..j). Row 0 and column 0
// are the cost of building a prefix from the empty string; every other cell is
// the cheapest of deleting, inserting or substituting one character.
void Levenshtein(int len1, int len2)
{
    // A negative length cannot describe a prefix; bail out instead of
    // requesting a nonsensical allocation.
    if (len1 < 0 || len2 < 0) {
        return;
    }

    // std::vector releases the table automatically, replacing the manual
    // new[]/delete[] bookkeeping.
    std::vector<std::vector<int> > d(len1 + 1, std::vector<int>(len2 + 1, 0));

    for (int i = 0; i <= len1; i++) {
        d[i][0] = i;
    }
    for (int j = 0; j <= len2; j++) {
        d[0][j] = j;
    }

    for (int i = 1; i <= len1; i++) {
        for (int j = 1; j <= len2; j++) {
            int cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;
            int deletion = d[i - 1][j] + 1;            // cell above
            int insertion = d[i][j - 1] + 1;           // cell to the left
            int substitution = d[i - 1][j - 1] + cost; // upper-left cell
            d[i][j] = min(deletion, insertion, substitution);
        }
    }

    printf("距离为:%d\n", d[len1][len2]);
}

int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // %999s bounds each token to the buffer size, and `== 2` stops on a
    // dangling single token instead of comparing it with stale data in s2.
    while (scanf("%999s%999s", s1, s2) == 2) {
        Levenshtein((int)strlen(s1), (int)strlen(s2));
    }
    return 0;
}

java

/**
 * Small demo that computes the Levenshtein (edit) distance between two strings
 * with a dynamic-programming table stored in a flat array.
 */
public class App {

    /**
     * Returns the minimum number of single-character insertions, deletions and
     * substitutions needed to turn {@code str1} into {@code str2}.
     *
     * The table is stored row by row: each row has {@code str1.length() + 1}
     * cells and there are {@code str2.length() + 1} rows. Characters are
     * compared with {@code charAt}, i.e. as UTF-16 code units.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the edit distance between the two strings
     */
    public static int Levenshtein(String str1, String str2) {
        final int stride = str1.length() + 1;
        final int rowCount = str2.length() + 1;
        final int[] table = new int[stride * rowCount];

        // First row: distance from the empty prefix of str2 to each prefix of str1.
        for (int x = 0; x < stride; x++) {
            table[x] = x;
        }
        // First cell of every row: distance from the empty prefix of str1.
        for (int y = 0; y < rowCount; y++) {
            table[y * stride] = y;
        }

        // Each cell depends only on cells above and to the left of it,
        // so the table can be filled row by row.
        for (int y = 1; y < rowCount; y++) {
            final int rowBase = y * stride;
            final int prevRowBase = rowBase - stride;
            final char target = str2.charAt(y - 1);

            for (int x = 1; x < stride; x++) {
                final int cost = (str1.charAt(x - 1) == target) ? 0 : 1;

                final int fromPrevRow = table[prevRowBase + x] + 1;
                final int fromPrevCol = table[rowBase + x - 1] + 1;
                final int fromDiagonal = table[prevRowBase + x - 1] + cost;

                table[rowBase + x] = Math.min(fromPrevRow, Math.min(fromPrevCol, fromDiagonal));
            }
        }

        // The last cell covers both full strings.
        return table[table.length - 1];
    }

    public static void main(String[] args) {
        String str1 = "cafe";
        String str2 = "coffee";
        int distance = Levenshtein(str1, str2);
        System.out.println(distance);
    }
}


python:

#coding=utf-8
from __future__ import division


def normal_leven(str1, str2):
    """Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    The dynamic-programming table is kept in one flat list: the cell for the
    prefixes ``str1[:i]`` and ``str2[:j]`` lives at index
    ``j * (len(str1) + 1) + i``.
    """
    len_str1 = len(str1) + 1
    len_str2 = len(str2) + 1

    matrix = [0] * (len_str1 * len_str2)

    # Prefixes of str1 against the empty prefix of str2.
    for i in range(len_str1):
        matrix[i] = i
    # Prefixes of str2 against the empty prefix of str1.
    for j in range(len_str2):
        matrix[j * len_str1] = j

    for i in range(1, len_str1):
        for j in range(1, len_str2):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            matrix[j * len_str1 + i] = min(
                matrix[(j - 1) * len_str1 + i] + 1,            # insert str2[j - 1]
                matrix[j * len_str1 + (i - 1)] + 1,            # delete str1[i - 1]
                matrix[(j - 1) * len_str1 + (i - 1)] + cost,   # substitute or match
            )

    # The last cell covers both full strings.
    return matrix[-1]


if __name__ == '__main__':
    str1 = u'cafe'
    # 'coffee' matches the worked example in the article and the Java sample;
    # the original listing had 'coffe'.
    str2 = u'coffee'
    distance = normal_leven(str1, str2)
    # Parenthesised single-argument print works on both Python 2 and Python 3.
    print(distance)



   
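|   |   | c | o | f | f | e | e |
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
| c | 1 | 0 | 1 | 2 | 3 | 4 | 5 |
| a | 2 | 1 | 1 | 2 | 3 | 4 | 5 |
| f | 3 | 2 | 2 | 1 | 2 | 3 | 4 |
| e | 4 | 3 | 3 | 2 | 2 | 2 | 3 |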
原创粉丝点击