语音识别之找出字符串的最短距离

来源:互联网 发布:js代码混淆加密工具 编辑:程序博客网 时间:2024/05/24 04:14

             这里用到了传说中的DP和剪枝,我也是醉了!我还以为是多么屌炸天的东西!快哭了


1、DP

                         


2、剪枝

剪枝分为固定阈值和可变阈值(beam)两种:固定阈值丢弃代价超过某个常数的节点,可变阈值丢弃代价超过"当前列最小代价 + beam 宽度"的节点。在寻找最短路径这类问题时,剪枝可以减少需要计算的节点数,从而降低复杂度。


3、代码

代码一:只有距离的

/**
 * Project:     String Comparison
 * Purpose:     Find the minimal edit distance between two strings.
 * Created on:  March 29, 2015
 * Author:      Chen Yu, Gong Qian
 * Description: Computes the edit distance between a template string and a
 *              target string with dynamic programming (DP), using either
 *              fixed-threshold pruning or beam pruning.
 *
 * D[i][j] is the minimal edit distance between the first i characters of s1
 * and the first j characters of s2:
 *
 *   D[0][0] = 0
 *   D[i][j] = min(D[i-1][j] + 1, D[i][j-1] + 1, D[i-1][j-1] + cost)
 *
 * where cost is 0 when s1[i-1] == s2[j-1] and 1 otherwise.
 */
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <stdio.h>
#include <string.h>
using namespace std;

#define N           100        /* capacity of each string buffer, including the '\0' */
#define INF         100000000  /* cost of a cell that no surviving path reaches */
#define PRUNE_WIDTH 3          /* fixed threshold and beam width announced in the menu */

int  dis[N][N];       // dis[i][j]: distance between s1[0..i) and s2[0..j), INF if unreachable
int  flag[N][N];      // 1: cell (i, j) survived pruning, 0: cell was pruned
char s1[N], s2[N];    // s1 is the template, s2 is the target
int  n, m;            // lengths of s1 and s2
static int prune_mode = 1;  // 1: fixed threshold, 2: beam search

int Min_Distance();

/**
 * Reads the pruning strategy from stdin until the user enters 1 or 2.
 *
 * @return 1 or 2, or 0 if the input ended before a valid choice was read.
 */
static int read_strategy()
{
    int chose = 0;
    for (;;) {
        const int got = scanf("%d", &chose);
        if (got == EOF)
            return 0;
        if (got == 1 && (chose == 1 || chose == 2))
            return chose;
        if (got == 0)
            scanf("%*s");  // discard the non-numeric token, otherwise it is re-read forever
        printf("Sorry,error chose!try again\n");
        printf("*********************************************\n\n");
    }
}

int main()
{
    printf("Please input the template and target string:\n");
    printf("*********************************************\n\n");

    // "%99s" keeps each read inside its N-byte buffer (N - 1 characters + '\0').
    while (scanf("%99s%99s", s1, s2) == 2) {
        printf("Please choose the pruning stratege:\n1.fixed threshold=3\n2.beam search with beam=3\n");
        printf("*********************************************\n\n");

        prune_mode = read_strategy();
        if (prune_mode == 0)
            break;

        n = static_cast<int>(strlen(s1));
        m = static_cast<int>(strlen(s2));

        const int distance = Min_Distance();

        // Print the trellis; row i / column j stand for prefixes of length i / j,
        // so row 0 and column 0 are the empty prefixes.
        for (int i = 0; i <= n; i++) {
            for (int j = 0; j <= m; j++) {
                if (flag[i][j])
                    printf("%d ", dis[i][j]);
                else
                    printf("X ");
            }
            printf("\n");
        }

        if (distance >= INF)
            printf("Minimal edit distance is: pruned (no path survived the pruning)\n\n");
        else
            printf("Minimal edit distance is: %d\n\n", distance);
    }
    return 0;
}

/**
 * Fills dis[][] and flag[][] for the global strings s1 (length n) and s2
 * (length m), one column of the target at a time, pruning each column
 * according to prune_mode once it is complete.
 *
 * A pruned cell is never extended into the next column. Cells inside the
 * column being computed are not pruned yet, so the vertical step only
 * requires the cell above to be reachable.
 *
 * @return the edit distance, or INF if the final cell was pruned.
 */
int Min_Distance()
{
    for (int i = 0; i <= n; i++) {
        for (int j = 0; j <= m; j++) {
            dis[i][j] = INF;
            flag[i][j] = false;
        }
    }

    for (int j = 0; j <= m; j++) {
        int column_best = INF;

        for (int i = 0; i <= n; i++) {
            int cost = (i == 0 && j == 0) ? 0 : INF;

            if (i > 0 && j > 0 && flag[i - 1][j - 1]) {  // match / substitute
                const int c = dis[i - 1][j - 1] + (s1[i - 1] != s2[j - 1] ? 1 : 0);
                if (c < cost) cost = c;
            }
            if (j > 0 && flag[i][j - 1]) {               // insert s2[j-1]
                const int c = dis[i][j - 1] + 1;
                if (c < cost) cost = c;
            }
            if (i > 0 && dis[i - 1][j] < INF) {          // delete s1[i-1]
                const int c = dis[i - 1][j] + 1;
                if (c < cost) cost = c;
            }

            dis[i][j] = cost;
            if (cost < column_best) column_best = cost;
        }

        // Fixed threshold: absolute limit. Beam: limit relative to the column's best cell.
        const int limit = (prune_mode == 1) ? PRUNE_WIDTH : column_best + PRUNE_WIDTH;
        for (int i = 0; i <= n; i++)
            flag[i][j] = (dis[i][j] < INF && dis[i][j] <= limit);
    }

    return flag[n][m] ? dis[n][m] : INF;
}


代码二:除了距离还有给出路径

这里我用了一个比较简单的方法来求路径:从终点开始,沿着最优前驱一步一步往回走,直到回到起点。

/**
 * Project:     String Comparison
 * Purpose:     Find the minimal edit distance between two strings and the
 *              alignment path that achieves it.
 * Created on:  March 29, 2015
 * Author:      Chen Yu, Gong Qian
 * Description: Computes the edit distance between a template string and a
 *              target string with dynamic programming (DP), using either
 *              fixed-threshold pruning or beam pruning, then walks the
 *              recorded back-pointers from the end cell to the start cell.
 *
 * D[i][j] is the minimal edit distance between the first i characters of s1
 * and the first j characters of s2:
 *
 *   D[0][0] = 0
 *   D[i][j] = min(D[i-1][j] + 1, D[i][j-1] + 1, D[i-1][j-1] + cost)
 *
 * where cost is 0 when s1[i-1] == s2[j-1] and 1 otherwise.
 */
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <queue>
using namespace std;

#define N           100        /* capacity of each string buffer, including the '\0' */
#define INF         100000000  /* cost of a cell that no surviving path reaches */
#define PRUNE_WIDTH 3          /* fixed threshold and beam width announced in the menu */

int  dis[N][N];       // dis[i][j]: distance between s1[0..i) and s2[0..j), INF if unreachable
int  dir[N][N];       // back-pointer: 1 = left (i, j-1), 2 = diagonal (i-1, j-1), 3 = up (i-1, j), 0 = none
int  flag[N][N];      // 1: cell (i, j) survived pruning, 0: cell was pruned
char s1[N], s2[N];    // s1 is the template, s2 is the target
int  n, m;            // lengths of s1 and s2
static int prune_mode = 1;  // 1: fixed threshold, 2: beam search

int Min_Distance();

/**
 * Reads the pruning strategy from stdin until the user enters 1 or 2.
 *
 * @return 1 or 2, or 0 if the input ended before a valid choice was read.
 */
static int read_strategy()
{
    int chose = 0;
    for (;;) {
        const int got = scanf("%d", &chose);
        if (got == EOF)
            return 0;
        if (got == 1 && (chose == 1 || chose == 2))
            return chose;
        if (got == 0)
            scanf("%*s");  // discard the non-numeric token, otherwise it is re-read forever
        printf("Sorry,error chose!try again\n");
        printf("*********************************************\n\n");
    }
}

/**
 * Prints the best path as "row col" pairs, one per line, starting at the end
 * cell (n, m) and finishing at the start cell (0, 0).
 *
 * Must only be called when the end cell survived pruning: every surviving
 * cell's back-pointer then leads to another reachable cell.
 */
static void print_path()
{
    std::queue<int> q;  // local, so nothing is left over from a previous input
    int i = n;
    int j = m;

    q.push(i);
    q.push(j);
    while (i > 0 || j > 0) {
        const int from = dir[i][j];
        if (from == 2) {
            i--;
            j--;
        } else if (from == 1) {
            j--;
        } else if (from == 3) {
            i--;
        } else {
            break;  // defensive: a cell without a back-pointer ends the walk
        }
        q.push(i);
        q.push(j);
    }

    while (!q.empty()) {
        const int row = q.front();
        q.pop();
        const int col = q.front();
        q.pop();
        printf("%d %d \n", row, col);
    }
}

int main()
{
    printf("Please input the template and target string:\n");
    printf("*********************************************\n\n");

    // "%99s" keeps each read inside its N-byte buffer (N - 1 characters + '\0').
    while (scanf("%99s%99s", s1, s2) == 2) {
        printf("Please choose the pruning stratege:\n1.fixed threshold=3\n2.beam search with beam=3\n");
        printf("*********************************************\n\n");

        prune_mode = read_strategy();
        if (prune_mode == 0)
            break;

        n = static_cast<int>(strlen(s1));
        m = static_cast<int>(strlen(s2));

        const int distance = Min_Distance();

        // Print the trellis; row i / column j stand for prefixes of length i / j,
        // so row 0 and column 0 are the empty prefixes.
        for (int i = 0; i <= n; i++) {
            for (int j = 0; j <= m; j++) {
                if (flag[i][j])
                    printf("%d ", dis[i][j]);
                else
                    printf("X ");
            }
            printf("\n");
        }

        if (distance >= INF) {
            printf("Minimal edit distance is: pruned (no path survived the pruning)\n\n");
        } else {
            print_path();
            printf("Minimal edit distance is: %d\n\n", distance);
        }
    }
    return 0;
}

/**
 * Fills dis[][], dir[][] and flag[][] for the global strings s1 (length n)
 * and s2 (length m), one column of the target at a time, pruning each column
 * according to prune_mode once it is complete.
 *
 * A pruned cell is never extended into the next column. Cells inside the
 * column being computed are not pruned yet, so the vertical step only
 * requires the cell above to be reachable. On ties the diagonal step wins,
 * then the left step, then the up step.
 *
 * @return the edit distance, or INF if the final cell was pruned.
 */
int Min_Distance()
{
    for (int i = 0; i <= n; i++) {
        for (int j = 0; j <= m; j++) {
            dis[i][j] = INF;
            dir[i][j] = 0;
            flag[i][j] = false;
        }
    }

    for (int j = 0; j <= m; j++) {
        int column_best = INF;

        for (int i = 0; i <= n; i++) {
            int cost = (i == 0 && j == 0) ? 0 : INF;
            int from = 0;

            if (i > 0 && j > 0 && flag[i - 1][j - 1]) {  // match / substitute
                const int c = dis[i - 1][j - 1] + (s1[i - 1] != s2[j - 1] ? 1 : 0);
                if (c < cost) { cost = c; from = 2; }
            }
            if (j > 0 && flag[i][j - 1]) {               // insert s2[j-1]
                const int c = dis[i][j - 1] + 1;
                if (c < cost) { cost = c; from = 1; }
            }
            if (i > 0 && dis[i - 1][j] < INF) {          // delete s1[i-1]
                const int c = dis[i - 1][j] + 1;
                if (c < cost) { cost = c; from = 3; }
            }

            dis[i][j] = cost;
            dir[i][j] = from;
            if (cost < column_best) column_best = cost;
        }

        // Fixed threshold: absolute limit. Beam: limit relative to the column's best cell.
        const int limit = (prune_mode == 1) ? PRUNE_WIDTH : column_best + PRUNE_WIDTH;
        for (int i = 0; i <= n; i++)
            flag[i][j] = (dis[i][j] < INF && dis[i][j] <= limit);
    }

    return flag[n][m] ? dis[n][m] : INF;
}


代码三:使用字典对一个 txt 文件进行拼写校正

#include <iostream>#include <stdio.h>#include <String>#include <sstream>#include <fstream>#include <cctype>#include <algorithm>#include <Windows.h>#define N        100#define M        10000#define INF      1000000#define min(a,b) a<b?a:busing namespace std;string story[M];string storychecked[M];string storycorrect[M];string dict[M];string temp;int    n, m;int    dis[M][N];HANDLE hCon;enum Color { DARKBLUE = 1, DARKGREEN, DARKTEAL, DARKRED, DARKPINK, DARKYELLOW, GRAY, DARKGRAY, BLUE, GREEN, TEAL, RED, PINK, YELLOW, WHITE };void SetColor(Color c){if (hCon == NULL)hCon = GetStdHandle(STD_OUTPUT_HANDLE);SetConsoleTextAttribute(hCon, c);}int main(){SetColor(WHITE);string template_,input;string temp;//********************************************************************************************************//********************************************************************************************************// story read    //open the stream of story and store it into story.txtstring filename = "story.txt";ifstream i_file;string out_text;i_file.open(filename);int length_story = 0;if (i_file.is_open()){while (i_file.good()){i_file >> out_text; //将读取的内容存储到变量out_text中int temp_index = 0;temp = out_text;string::iterator pos = out_text.begin();while (pos != out_text.end()){if (ispunct(*pos)){out_text.erase(pos);}else{++pos;}}cout << out_text << endl;transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);story[length_story] = out_text;length_story++;}}elsecout << "打开文件时出错!\n";i_file.close();//********************************************************************************************************//********************************************************************************************************// dict read//printf("Here is open dict\n");//open the stream of dict and store it into groupfilename = "dict.txt";//ifstream i_file_dict;string out_text_c;i_file.open(filename);int length_dict = 0;if (i_file.is_open()){while (i_file.good()){i_file >> out_text_c; 
//将读取的内容存储到变量out_text中if (!out_text_c.empty())transform(out_text_c.begin(), out_text_c.end(), out_text_c.begin(), tolower);dict[length_dict] = out_text_c;length_dict++;}}elsecout << "打开文件时出错!\n";i_file.close();//********************************************************************************************************//********************************************************************************************************// story correct read//string temp;//open the stream of story and store it into story.txtfilename = "storycorrect.txt";length_story = 0;i_file.open(filename);length_story = 0;if (i_file.is_open()){while (i_file.good()){i_file >> out_text; //将读取的内容存储到变量out_text中cout << out_text << endl; //在控制台输出读取的内容。为什么最后一行的内容会出现两次int temp_index = 0;temp = out_text;string::iterator pos = out_text.begin();while (pos != out_text.end()){if (ispunct(*pos)){out_text.erase(pos);}else{++pos;}}cout << out_text << endl;transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);storycorrect[length_story] = out_text;length_story++;}}elsecout << "打开文件时出错!\n";i_file.close();//********************************************************************************************************//********************************************************************************************************//find min distanceint i, j;int min = INF;  //the minimal distance between two stringint index = 0;  //to get which word in dict is suitabelfor (int i_ = 0; i_ < length_story;i_++){ m = story[i_].length();for (int j_ = 0; j_ < length_dict; j_++){n = dict[j_].length();for (i = 0; i <= n + 1; i++)for (j = 0; j <= m + 1; j++)dis[i][j] = INF;if (story[i_][0] != dict[j_][0]) dis[0][0] = 1;else dis[0][0] = 0;for (i = 0; i <= n; i++)for (j = 0; j <= m; j++){if (i>0) dis[i][j] = min(dis[i][j], dis[i - 1][j] + 1); //delete  if (j>0) dis[i][j] = min(dis[i][j], dis[i][j - 1] + 1);//insert  //substitute  if (i>0 && j>0){if (dict[j_][i - 1] != story[i_][j - 1])dis[i][j] = min(dis[i][j], dis[i - 1][j - 1] 
+ 1);elsedis[i][j] = min(dis[i][j], dis[i - 1][j - 1]);}}if (dis[n][m] < min){index = j_;min = dis[n][m];}}min = INF;storychecked[i_] = dict[index];cout << storychecked[i_] << endl;}//********************************************************************************************************//********************************************************************************************************//write data into storychecked into storychecked.txtint delete_num = 0, insert_num = 0, replace_num = 0;ofstream o_file;filename = "storychecked.txt";o_file.open(filename);for (int i = 0; i < length_story; i++){o_file << storychecked[i] << " "; //将内容写入到文本文件中cout << storychecked[i] << endl;}o_file.close();for (int i = 0; i < length_story; i++){cout << storycorrect[i] << "  " << storychecked[i] << endl;if (storychecked[i]!=storycorrect[i]){if (storychecked[i].length()>storycorrect[i].length())insert_num++;else if (storychecked[i].length() < storycorrect[i].length())delete_num++;elsereplace_num++;}}//********************************************************************************************************//********************************************************************************************************//get error numberprintf("****************************************************************\n");printf("The total error is %d\n", insert_num + delete_num + replace_num);printf("replace: %d, delete:%d, insert:%d\n",replace_num,delete_num,insert_num);system("pause");return 0;}

代码四:输入多个模板,找到与目标串最匹配的那个,并画出路径。

/**  Project:String Comparation*  Purpose:To find the minimal distance between two strings*  @Created on: March 29, 2015*  @Author: Chen Yu,Gong Qian*  @Discription: calculate the minimal edit distance between 2 strings*  Method : DP (dynamic programming)*  D[i,j]: the minimal edit distance for s1的前i个字符和 s2的前j个字符*  DP Formulation: D[i,j]=min(D[i-1,j]+1,D[i,j-1]+1,D[i-1,j-1]+flag);//其中if(s1[i]!=s2[j])则flag=2,else flag=0;**/#define _CRT_SECURE_NO_WARNINGS#include <iostream>  #include <stdio.h>  #include <string.h>#include <queue>#include <Windows.h>using namespace std;#define N   100  #define INF 65535#define min(a,b) a<b?a:b  #define max(a,b) a>b?a:bint  dis[N][N];       // THE MATRIX STORES THE INFORMATION OF DISTANCEint  dir[N][N];       // 1 stands for left,2 stands for italics,3 stands for upint  flag[N][N];      // sign whether be searched  1:searchchar s1[N],s2[N],s0[20];char s[N][20];int  n, m;            // length of the two string  HANDLE hCon;enum Color { DARKBLUE = 1, DARKGREEN, DARKTEAL, DARKRED, DARKPINK, DARKYELLOW, GRAY, DARKGRAY, BLUE, GREEN, TEAL, RED, PINK, YELLOW, WHITE };//求出最短距离int Min_Distance();//设置颜色//辅助函数,设置控制台的颜色void SetColor(Color c){if (hCon == NULL)hCon = GetStdHandle(STD_OUTPUT_HANDLE);SetConsoleTextAttribute(hCon, c);}int main(){//SetConsoleTextColor(FOREGROUND_RED | FOREGROUND_INTENSITY);int i, j;int chose;int min = INF;int num = 0;std::queue<int> q;SetColor(GREEN);printf("Please the num of templates:");while (scanf("%d", &num) != EOF){//SetConsoleTextColor(FOREGROUND_GREEN | FOREGROUND_INTENSITY);printf("*********************************************\n\n");printf("Please input %d template strings:\n", num);for (i = 0; i < num; i++)scanf("%s", &s[i]);SetColor(PINK);printf("\n");printf("Please input the target string:");scanf("%s", &s0);//while (scanf("%s%s", &s1, &s2) != EOF)//{printf("\nPlease choose the pruning stratege:\n1.fixed threshold=3\n2.beam search with 
beam=3\n");printf("*********************************************\n\n");while (scanf("%d", &chose) && chose != 1 && chose != 2){printf("Sorry,error chose!try again\n");printf("*********************************************\n\n");}n = 0;m = strlen(s0);int start[20];   //用来记录每个模板的起始点start[0] = 0;for (int i = 0; i < num; i++){n += strlen(s[i]);if (i>0) start[i] = start[i - 1] + strlen(s[i - 1]);}for (i = 0; i <= n + 1; i++)for (j = 0; j <= m + 1; j++){dis[i][j] = INF;flag[i][j] = true;}for (int i = 0; i < num; i++){if (s[i][0] != s0[0]) dis[start[i]][0] = 1;else dis[start[i]][0] = 0;}int k;//calculate distancefor (j = 0; j <= m; j++){min = INF;for (k = 0; k < num; k++){min = INF;for (i = 0; i < strlen(s[k]); i++){int row = start[k] + i;if (flag[row][j]){if (row > start[k]) (s[k][i] == s0[j]) ? (dis[row][j] = min(dis[row][j], dis[row - 1][j])) : (dis[row][j] = min(dis[row][j], dis[row - 1][j] + 1)); //delete  if (j > 0) (s[k][i] == s0[j]) ? (dis[row][j] = min(dis[row][j], dis[row][j - 1])) : (dis[row][j] = min(dis[row][j], dis[row][j - 1] + 1));//insert   //substitute  if (row > start[k] && j > 0){if (s[k][i] != s0[j])dis[row][j] = min(dis[row][j], dis[row - 1][j - 1] + 1);elsedis[row][j] = min(dis[row][j], dis[row - 1][j - 1]);}if (dis[row][j] < min)min = dis[row][j];}}for (i = 0; i < strlen(s[k]); i++){int row = start[k] + i;//采用fixed thresholdif (chose == 1){if (dis[row][j] > 3)flag[row][j] = false;}//采用beam searchelse{if (dis[row][j] > min + 3)flag[row][j] = false;}}}}/*for (int k = 0; k < num; k++){for (int i = 0; i < strlen(s[k]); i++){SetColor(GRAY);printf("%c ", s[k][i]);for (j = 0; j < m; j++){printf("%d ", dis[start[k] + i][j]);}printf("\n");}}*/int cost = INF;int index = 0;      //for (i = 0; i < num; i++){int pos = start[i] + strlen(s[i]) - 1;if (dis[pos][m - 1] < cost){cost = dis[pos][m - 1];index = i;}}// 最短路径的长度就是更长的字符串的长度int path[2][N];int n = strlen(s[index]);int path_len = max(m, n);q.push(m - 1);q.push(n - 1);j = m - 1;  //coli = n - 1;  //rowfor (int 
len = 0; len < path_len; len++){if (j == 0 && i == 0){q.push(0);q.push(0);path[0][len] = 0;path[1][len] = 0;break;}else if (i == 0)j--;else if (j == 0)i--;else if (dis[start[index] + i][j - 1] == dis[start[index] + i - 1][j - 1])j--;else if (dis[start[index] + i - 1][j] == dis[start[index] + i - 1][j - 1])i--;else{i--;j--;}q.push(i);q.push(j);path[0][len] = i;path[1][len] = j;}//patining colorprintf("  ");for (k = 0; k < m; k++)printf("%c ", s0[k]);printf("\n");for (k = 0; k < num; k++){if (k != index){for (i = 0; i < strlen(s[k]); i++){SetColor(GRAY);printf("%c ", s[k][i]);for (j = 0; j < m; j++){if (flag[start[k] + i][j])printf("%d ", dis[start[k] + i][j]);elseprintf("X ");}printf("\n");}}else{bool flag_st = true;for (i = 0; i < n; i++){SetColor(GRAY);printf("%c ", s[k][i]);for (j = 0; j < m; j++){for (int k = 0; k < path_len; k++){if (path[0][k] == i && path[1][k] == j){flag_st = false;break;}}if (flag_st == false || (j == m - 1 && i == n - 1)){SetColor(RED);printf("%d ", dis[start[k] + i][j]);flag_st = true;}else if (flag[i][j] == true){SetColor(YELLOW);printf("%d ", dis[start[k] + i][j]);//SetConsoleTextColor(FOREGROUND_YELLOW );}else{SetColor(GRAY);//SetConsoleTextColor(FOREGROUND_GREEN | FOREGROUND_INTENSITY);printf("X ");dis[start[k] + i][j] = -1;}}printf("\n");}}SetColor(BLUE);printf("--------------------------------------------------------\n");}SetColor(WHITE);printf("\n*******************************************************************\n");printf("Input String %s is most simutaneous to %s.\n", s0, s[index]);printf("And the distance between them is: %d\n\n", cost);SetColor(GREEN);printf("Please the num of templates:");}system("pause");return 0;}



最后效果如下图:




参考教程:http://blog.csdn.net/jie1991liu/article/details/8778893

0 0
原创粉丝点击