C#:编辑距离计算及更新维基百科

来源:互联网 发布:oracle 表添加字段sql 编辑:程序博客网 时间:2024/05/21 06:20

开发工具:

Visual Studio v2010

.NET Framework 4 Client Profile

维基百科相关主题:

http://en.wikipedia.org/wiki/Levenshtein_distance

http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance(更新)

源代码:

EditDistance.cs

using System;using System.Collections.Generic;namespace Splash{    /// <summary>    /// 编辑距离:莱文斯坦距离 Damerau-Levenshtein Distance    /// </summary>    public static class SpellHelper    {        /// <summary>        /// 莱文斯坦距离(Levenshtein Distance)        /// </summary>        /// <param name="source">源串</param>        /// <param name="target">目标串</param>        /// <param name="similarity">输出:相似度,值在0~1</param>        /// <param name="isCaseSensitive">是否大小写敏感</param>        /// <returns>源串和目标串之间的编辑距离</returns>        /// <remarks>http://en.wikipedia.org/wiki/Levenshtein_distance</remarks>        public static Int32 LevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)        {            if (String.IsNullOrEmpty(source))            {                if (String.IsNullOrEmpty(target))                {                    similarity = 1;                    return 0;                }                else                {                    similarity = 0;                    return target.Length;                }            }            else if (String.IsNullOrEmpty(target))            {                similarity = 0;                return source.Length;            }            String From, To;            if (isCaseSensitive)            {   // 大小写敏感                From = source;                To = target;            }            else            {   // 大小写无关                From = source.ToLower();                To = target.ToLower();            }            // 初始化            Int32 m = From.Length;            Int32 n = To.Length;            Int32[,] H = new Int32[m + 1, n + 1];            for (Int32 i = 0; i <= m; i++) H[i, 0] = i;  // 注意:初始化[0,0]            for (Int32 j = 1; j <= n; j++) H[0, j] = j;            // 迭代            for (Int32 i = 1; i <= m; i++)            {                Char SI = From[i - 1];                for (Int32 j = 1; j <= n; j++)                {   // 删除(deletion) 插入(insertion) 替换(substitution)                    if (SI == To[j - 1])                        H[i, j] = H[i - 1, j - 1];                    else                        H[i, j] = Math.Min(H[i - 1, j - 1], Math.Min(H[i - 1, j], H[i, j - 1])) + 1;                }            }            // 计算相似度(此相似度未必合理)            Int32 MaxLength = Math.Max(m, n);   // 两字符串的最大长度            similarity = ((Double)(MaxLength - H[m, n])) / MaxLength;            return H[m, n];    // 编辑距离        }        /// <summary>        /// 受限的Damerau-Levenshtein Distance(只允许相邻字符交换)        /// </summary>        /// <param name="source">源串</param>        /// <param name="target">目标串</param>        /// <param name="similarity">输出:相似度,值在0~1</param>        /// <param name="isCaseSensitive">是否大小写敏感</param>        /// <returns>源串和目标串之间的编辑距离</returns>        /// <remarks>http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance</remarks>        public static Int32 OptimalStringAlignmentDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)        {            if (String.IsNullOrEmpty(source))            {                if (String.IsNullOrEmpty(target))                {                    similarity = 1;                    return 0;                }                else                {                    similarity = 0;                    return target.Length;                }            }            else if (String.IsNullOrEmpty(target))            {                similarity = 0;                return source.Length;            }            String From, To;            if (isCaseSensitive)            {   // 大小写敏感                From = source;                To = target;            }            else            {   // 大小写无关                From = source.ToLower();                To = target.ToLower();            }            // 初始化            Int32 m = From.Length;            Int32 n = To.Length;            Int32[,] H = new Int32[m + 1, n + 1];            for (Int32 i = 0; i <= m; i++) H[i, 0] = i;  // 注意:初始化[0,0]            for (Int32 j = 1; j <= n; j++) H[0, j] = j;            // 迭代            for (Int32 i = 1; i <= m; i++)            {                Char SI = From[i - 1];                for (Int32 j = 1; j <= n; j++)                {   // 删除(deletion) 插入(insertion) 替换(substitution)                    Char DJ = To[j - 1];                    if (SI == DJ)                        H[i, j] = H[i - 1, j - 1];                    else                        H[i, j] = Math.Min(H[i - 1, j - 1], Math.Min(H[i - 1, j], H[i, j - 1])) + 1;                    if (i > 1 && j > 1)                    {   // 交换相邻字符(transposition of two adjacent characters)                        if (SI == To[j - 2] && DJ == From[i - 2])                        {                            H[i, j] = Math.Min(H[i, j], H[i - 2, j - 2] + 1);                        }                    }                }            }            // 计算相似度(此相似度未必合理)            Int32 MaxLength = Math.Max(m, n);   // 两字符串的最大长度            similarity = ((Double)(MaxLength - H[m, n])) / MaxLength;            return H[m, n];    // 编辑距离        }        /// <summary>        /// 不受限的Damerau-Levenshtein Distance(允许交换字符间的删除插入操作)        /// 我在维基百科中贡献了此段代码的修改版(去掉了相似度和大小写敏感)        /// </summary>        /// <param name="source">源串</param>        /// <param name="target">目标串</param>        /// <param name="similarity">输出:相似度,值在0~1</param>        /// <param name="isCaseSensitive">是否大小写敏感</param>        /// <remarks>http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance</remarks>        public static Int32 DamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)        {            if (String.IsNullOrEmpty(source))            {                if (String.IsNullOrEmpty(target))                {                    similarity = 1;                    return 0;                }                else                {                    similarity = 0;                    return target.Length;                }            }            else if (String.IsNullOrEmpty(target))            {                similarity = 0;                return source.Length;            }            String From, To;            if (isCaseSensitive)            {   // 大小写敏感                From = source;                To = target;            }            else            {   // 大小写无关                From = source.ToLower();                To = target.ToLower();            }            // 初始化            Int32 m = From.Length;            Int32 n = To.Length;            Int32[,] H = new Int32[m + 2, n + 2];            Int32 INF = m + n;            H[0, 0] = INF;            for (Int32 i = 0; i <= m; i++) { H[i + 1, 1] = i; H[i + 1, 0] = INF; }            for (Int32 j = 0; j <= n; j++) { H[1, j + 1] = j; H[0, j + 1] = INF; }            // 对维基百科中给出ActionScript代码优化,去掉参数C,可以更好地适合各国语言            SortedDictionary<Char, Int32> sd = new SortedDictionary<Char, Int32>();            foreach (Char Letter in (From + To))            {                if (!sd.ContainsKey(Letter))                    sd.Add(Letter, 0);            }            // 迭代            for (Int32 i = 1; i <= m; i++)            {                Int32 DB = 0;                for (Int32 j = 1; j <= n; j++)                {                    Int32 i1 = sd[To[j - 1]];   // 定位字符To[j-1]在源串From[0:i-2]中的最后一次索引                    Int32 j1 = DB;              // 定位字符From[i-1]在目标串To[0:j-2]中的最后一次索引                    // 删除(deletion) 插入(insertion) 替换(substitution)                    if (From[i - 1] == To[j - 1])                    {                        H[i + 1, j + 1] = H[i, j];                        DB = j;                    }                    else                    {                        H[i + 1, j + 1] = Math.Min(H[i, j], Math.Min(H[i + 1, j], H[i, j + 1])) + 1;                    }                    // transposition of two adjacent characters                    // 将源串i1-1到i-1内的字符删除,然后交换i1-1和i-1的字符,再加上目标串j1-1到j-1内的字符                      H[i + 1, j + 1] = Math.Min(H[i + 1, j + 1], H[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));                }                sd[From[i - 1]] = i;            }            // 计算相似度(此相似度未必合理)            Int32 MaxLength = Math.Max(m, n);   // 两字符串的最大长度            similarity = ((Double)(MaxLength - H[m + 1, n + 1])) / MaxLength;            return H[m + 1, n + 1];    // 编辑距离        }        /// <summary>        /// 不受限的Damerau-Levenshtein Distance(允许交换字符间的删除插入操作)        /// </summary>        /// <param name="source">源串</param>        /// <param name="target">目标串</param>        /// <param name="similarity">输出:相似度,值在0~1</param>        /// <param name="isCaseSensitive">是否大小写敏感</param>        /// <remarks>更好理解的代码</remarks>        public static Int32 EZDamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)        {            if (String.IsNullOrEmpty(source))            {                if (String.IsNullOrEmpty(target))                {                    similarity = 1;                    return 0;                }                else                {                    similarity = 0;                    return target.Length;                }            }            else if (String.IsNullOrEmpty(target))            {                similarity = 0;                return source.Length;            }            String From, To;            if (isCaseSensitive)            {   // 大小写敏感                From = source;                To = target;            }            else            {   // 大小写无关                From = source.ToLower();                To = target.ToLower();            }            // 初始化            Int32 m = From.Length;            Int32 n = To.Length;            Int32[,] H = new Int32[m + 1, n + 1];            for (Int32 i = 0; i <= m; i++) H[i, 0] = i;  // 注意:初始化[0,0]            for (Int32 j = 1; j <= n; j++) H[0, j] = j;            // 迭代            for (Int32 i = 1; i <= m; i++)            {                Char SI = From[i - 1];                for (Int32 j = 1; j <= n; j++)                {   // 删除(deletion) 插入(insertion) 替换(substitution)                    Char DJ = To[j - 1];                    if (SI == DJ)                        H[i, j] = H[i - 1, j - 1];                    else                        H[i, j] = Math.Min(H[i - 1, j - 1], Math.Min(H[i - 1, j], H[i, j - 1])) + 1;                    if (i > 1 && j > 1)                    {   // 交换相邻字符(transposition of two adjacent characters)                                               Int32 i1 = From.LastIndexOf(DJ, i - 2, i - 1);                        if (i1 != -1)                        {                            Int32 j1 = To.LastIndexOf(SI, j - 2, j - 1);                            if (j1 != -1)                            {   // 将源串i1到i-1内的字符删除,然后交换i1和i-1的字符,再加上目标串j1到j-1内的字符                                                              H[i, j] = Math.Min(H[i, j], H[i1, j1] + (i - i1 - 2) + 1 + (j - j1 - 2));                            }                        }                    }                }            }            // 计算相似度(此相似度未必合理)            Int32 MaxLength = Math.Max(m, n);   // 两字符串的最大长度            similarity = ((Double)(MaxLength - H[m, n])) / MaxLength;            return H[m, n];    // 编辑距离        }    }}
 

     

 

原创粉丝点击