    衡量两个字符串的相似度,我们需要找到一种计算两个字符串距离值的表示方法,并且这个距离值与两个字符串的相似度成反比例关系。题目中利用了使两个字符串相同的最少基本操作次数作为两个字符串的距离d,定义字符串相似度为s = 1 / (d + 1)。这恰好符合我们衡量两个字符串相似度的标准。




    设字符串A = “abcdef” 字符串B = “abcef”:

      1.如果先采用删除操作,那么只需把字符串A中的‘d’字符删去即可,字符串相似度s = 1 / 2;

      2.如果先采用增加操作,若在字符串B中的‘c’字符后面增加‘d’字符,字符串相似度s = 1 / 2;若在字符串A中的‘c’字符后面增加‘e’字符,会导致计算出来的字符串距离值增加;

      3.如果先采用修改操作,若把字符串A中的‘d’字符修改为‘e’,那么字符串相似度s小于1 / 2;若把字符串B中的‘e’字符修改为‘d’,那么字符串相似度s也是小于1 / 2。



  基于以上这些考虑,可以利用递归算法(1.需要计算不同字符串基本操作顺序下(即多种情况)的相似度结果 2.只保留相似度最大的结果)解决这个问题。



/**
 * @file count_string_similarity_v1.c
 * @brief count the similarity between two strings by recursion.
 * @author chenxilinsidney
 * @version 1.0
 * @date 2014-12-31
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

// min macro, warning: can not use with '++' and '--' operator
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

typedef int TYPE;

/**
 * @brief calculate string distance by recursion.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and modifications that make the two index ranges equal.
 * Every mismatch spawns three recursive calls and nothing is cached, so the
 * running time grows exponentially with the string lengths.
 *
 * @param[in]      str_a    string a
 * @param[in]      a_begin  begin index of a(included)
 * @param[in]      a_end    end index of a(included)
 * @param[in]      str_b    string b
 * @param[in]      b_begin  begin index of b(included)
 * @param[in]      b_end    end index of b(included)
 *
 * @return distance between string a and b
 */
TYPE calculate_string_distance(const char* str_a, TYPE a_begin, TYPE a_end,
        const char* str_b, TYPE b_begin, TYPE b_end)
{
    /// range of a is empty: every character left in b must be inserted
    /// (0 when the range of b is empty as well).
    if (a_begin > a_end) {
        return (b_begin > b_end) ? 0 : b_end - b_begin + 1;
    }
    /// range of b is empty while a is not: delete what is left of a.
    if (b_begin > b_end) {
        return a_end - a_begin + 1;
    }
    /// equal leading characters cost nothing, skip both of them
    if (str_a[a_begin] == str_b[b_begin]) {
        return calculate_string_distance(str_a, a_begin + 1, a_end,
                str_b, b_begin + 1, b_end);
    }
    /// leading characters differ: spend one operation and keep the cheapest
    /// of the three ways to continue.
    /// d_1: skip the leading character of a only
    TYPE d_1 = calculate_string_distance(str_a, a_begin + 1, a_end,
            str_b, b_begin, b_end);
    /// d_2: skip the leading character of b only
    TYPE d_2 = calculate_string_distance(str_a, a_begin, a_end,
            str_b, b_begin + 1, b_end);
    /// d_3: skip the leading character of both (modify one into the other)
    TYPE d_3 = calculate_string_distance(str_a, a_begin + 1, a_end,
            str_b, b_begin + 1, b_end);
    return MIN(MIN(d_1, d_2), d_3) + 1;
}

/**
 * @brief calculate the distance between two whole strings by recursion.
 *
 * @param[in]      str_a    string a
 * @param[in]      a_length number of characters of a to compare
 * @param[in]      str_b    string b
 * @param[in]      b_length number of characters of b to compare
 *
 * @return distance between string a and b
 */
TYPE calculate_string_distance_by_recursion(const char* str_a, TYPE a_length,
        const char* str_b, TYPE b_length)
{
    /// a length of 0 gives the range [0, -1], which the recursion treats
    /// as empty.
    return calculate_string_distance(str_a, 0, a_length - 1,
            str_b, 0, b_length - 1);
}

int main(void)
{
    const char* string_a = "d";
    const char* string_b = "s";
    /// strlen returns size_t, narrow it explicitly to the index type
    TYPE distance = calculate_string_distance_by_recursion(string_a,
            (TYPE)strlen(string_a),
            string_b,
            (TYPE)strlen(string_b));
    printf("string %s and string %s\ndistance: %d similarity: %f\n",
            string_a, string_b, distance, 1 / ((double)distance + 1));
    return EXIT_SUCCESS;
}




版本二:动态规划——带备忘的自顶向下法(Top-down with memoization)


  下面的实现中,实际存放子问题解的数组是一维数组,并额外传递一个数组宽度用于换算二维下标。(也可以修改为真正的二维数组:TYPE** cache = (TYPE**)malloc(a * sizeof(TYPE*)); 然后为每一行分配空间:cache[i] = (TYPE*)malloc(b * sizeof(TYPE))。)

/** * @file count_string_similarity_by_memoized.c * @brief count the similarity between two strings by dynamic programming * with top-down with memoization method. * @author chenxilinsidney * @version 1.0 * @date 2014-12-31 */#include <stdlib.h>#include <stdio.h>#include <string.h>#include <limits.h>// #define NDEBUG#include <assert.h>#include "memory.h"// #define NDBG_PRINT#include "debug_print.h"// min macro, warning: can not use with '++' and '--' operator#ifndef MIN#define MIN(a, b) ((a) < (b) ? (a) : (b))#endiftypedef int TYPE;/** * @brief calculate string distance by dynamic programming. * * @param[in]      str_a    string a * @param[in]      a_begin  begin index of a(included) * @param[in]      a_end    end index of a(included) * @param[in]      str_b    string b * @param[in]      b_begin  begin index of b(included) * @param[in]      b_end    end index of b(included) * @param[in]      cache    save sub-problem result by memoization. * @param[in]      cache_width width of cache for two dimensional array. 
* * @return distance between string a and b */TYPE calculate_string_distance(char* str_a, TYPE a_begin, TYPE a_end,        char* str_b, TYPE b_begin, TYPE b_end, TYPE* cache, TYPE cache_width){    DEBUG_PRINT_STRING("In Recursion\n");    DEBUG_PRINT_VALUE("%d", a_begin);    DEBUG_PRINT_VALUE("%d", b_begin);    /// try to get distance from cache    TYPE value_temp;    if((value_temp = cache[a_begin * cache_width + b_begin]) > INT_MIN) {        DEBUG_PRINT_STRING("get distance from cache!\n");        DEBUG_PRINT_VALUE("%d", a_begin);        DEBUG_PRINT_VALUE("%d", b_begin);        DEBUG_PRINT_STRING("Out of Recursion\n");        return value_temp;    }    /// check if the begin value by index of a and b are equal    if(str_a[a_begin] == str_b[b_begin]) {        DEBUG_PRINT_STRING("have same word\n");        DEBUG_PRINT_STRING("set distance to cache!\n");        DEBUG_PRINT_VALUE("%d", a_begin);        DEBUG_PRINT_VALUE("%d", b_begin);        DEBUG_PRINT_STRING("Out of Recursion\n");        return cache[a_begin * cache_width + b_begin] =            calculate_string_distance(str_a, a_begin + 1, a_end,                str_b, b_begin + 1, b_end, cache, cache_width);    } else {        /// get the minimum distance for sub-string by three methods.        
TYPE d_1 = calculate_string_distance(str_a, a_begin + 1, a_end,                str_b, b_begin, b_end, cache, cache_width);        TYPE d_2 = calculate_string_distance(str_a, a_begin, a_end,                str_b, b_begin + 1, b_end, cache, cache_width);        TYPE d_3 = calculate_string_distance(str_a, a_begin + 1, a_end,                str_b, b_begin + 1, b_end, cache, cache_width);        /// set distance to cache        DEBUG_PRINT_STRING("set distance to cache!\n");        DEBUG_PRINT_VALUE("%d", a_begin);        DEBUG_PRINT_VALUE("%d", b_begin);        DEBUG_PRINT_STRING("Out of Recursion\n");        return cache[a_begin * cache_width + b_begin] =            MIN(MIN(d_1, d_2), d_3) + 1;    }}TYPE calculate_string_distance_by_memoized(char* str_a, TYPE a_length,        char* str_b, TYPE b_length){    assert(a_length >= 1 && b_length >= 1);    /// initialize cache first    TYPE memoized_length = (a_length + 1) * (b_length + 1);    TYPE* memoized_cache = SMALLOC(memoized_length, TYPE);     TYPE i;    for(i = 0; i < memoized_length; i++)        memoized_cache[i] = INT_MIN;    /// set edge elements first to reduce calculation in program.    for(i = 0; i < b_length + 1; i++)        memoized_cache[a_length * (b_length + 1) + i] = b_length - i;    for(i = 0; i < a_length + 1; i++)        memoized_cache[i * (b_length + 1) + b_length] = a_length - i;    /// get distance by memoized method    TYPE value = calculate_string_distance(str_a, 0, a_length - 1,            str_b, 0, b_length - 1, memoized_cache, b_length + 1);    /// free memory    SFREE(&memoized_cache);    return value;}int main(void){    char* string_a = "ddsag";    char* string_b = "sdsg";    TYPE distance = calculate_string_distance_by_memoized(string_a,            strlen(string_a),            string_b,            strlen(string_b));    printf("string %s and string %s\ndistance: %d similarity: %f\n",            string_a, string_b, distance, 1 / ((double)distance + 1));    return EXIT_SUCCESS;}

版本三:动态规划——自底向上法(bottom-up method)


/** * @file count_string_similarity_by_bottom_up.c * @brief count the similarity between two strings by dynamic programming * with bottom-up method. * @author chenxilinsidney * @version 1.0 * @date 2014-12-31 */#include <stdlib.h>#include <stdio.h>#include <string.h>#include <limits.h>// #define NDEBUG#include <assert.h>#include "memory.h"// #define NDBG_PRINT#include "debug_print.h"// min macro, warning: can not use with '++' and '--' operator#ifndef MIN#define MIN(a, b) ((a) < (b) ? (a) : (b))#endiftypedef int TYPE;TYPE calculate_string_distance_by_bottom_up(char* str_a, TYPE a_length,        char* str_b, TYPE b_length){    assert(a_length >= 1 && b_length >= 1);    /// initialize cache first    TYPE* memoized_cache = SMALLOC((a_length + 1) * (b_length + 1), TYPE);     TYPE* cache_offset;    TYPE i, j;    /// set edge elements first to reduce calculation in program.    for(i = 0; i < b_length + 1; i++)        memoized_cache[i] = i;    for(i = 1; i < a_length + 1; i++)        memoized_cache[i * (b_length + 1)] = i;    /// get distance by bottom-up method    /// i, j to index string from end to begin from the string now    for (i = 0; i < a_length; i++) {        for (j = 0; j < b_length; j++) {            /// save offet pointer            cache_offset = memoized_cache + (b_length + 1) * i + j;            if (str_a[a_length - i - 1] == str_b[b_length - j - 1]) {                /// use old distance if same word                cache_offset[b_length + 1 + 1] =                    cache_offset[0];            } else {                /// use minimum distance if not same word                cache_offset[b_length + 1 + 1] = MIN(                        MIN(cache_offset[b_length + 1],                            cache_offset[1]),                        cache_offset[0]) + 1;            }        }    }    /// get final distance    TYPE distance = memoized_cache[a_length * (b_length + 1) + b_length];    /// free memory    SFREE(&memoized_cache);    return distance;}int main(void){ 
   char* string_a = "ddsag";    char* string_b = "sdsg";    TYPE distance = calculate_string_distance_by_bottom_up(string_a,            strlen(string_a),            string_b,            strlen(string_b));    printf("string %s and string %s\ndistance: %d similarity: %f\n",            string_a, string_b, distance, 1 / ((double)distance + 1));    return EXIT_SUCCESS;}

拓展:我把代码中str_a[a_length - i - 1] == str_b[b_length - j - 1]语句改为str_a[i] == str_b[j]语句后算法的解一致,但这时二维数组表格(由算法一维数组获得)中的内容含义不太一致,实际上一个是从字符串尾部开始计算字符串距离,一个是从字符串头部开始计算字符串距离。






2.全文源码均开源(在UBUNTU + GCC4.8.2下编译并测试通过),可下载或查看:https://github.com/chenxilinsidney/funnycprogram/tree/master/beauty_of_programming/count_string_similarity


