C#:编辑距离计算及更新维基百科
来源:互联网 发布:oracle 表添加字段sql 编辑:程序博客网 时间:2024/05/21 06:20
开发工具:
Visual Studio v2010
.NET Framework 4 Client Profile
维基百科相关主题:
http://en.wikipedia.org/wiki/Levenshtein_distance
http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance(更新)
源代码:
EditDistance.cs
using System;
using System.Collections.Generic;

namespace Splash
{
    /// <summary>
    /// Edit-distance helpers: Levenshtein distance plus the restricted (optimal string
    /// alignment) and unrestricted Damerau-Levenshtein variants.
    /// </summary>
    /// <remarks>
    /// Every public method shares the same contract: a null or empty string is treated as
    /// the empty string, comparison is case-insensitive unless <c>isCaseSensitive</c> is
    /// true (case folding uses the invariant culture so results do not depend on the
    /// current thread culture), and <c>similarity</c> is
    /// <c>(maxLength - distance) / maxLength</c>, a rough score in the range 0..1.
    /// </remarks>
    public static class SpellHelper
    {
        /// <summary>
        /// Computes the Levenshtein distance (insertions, deletions, substitutions).
        /// </summary>
        /// <param name="source">Source string; null is treated as empty.</param>
        /// <param name="target">Target string; null is treated as empty.</param>
        /// <param name="similarity">Output: similarity score in the range 0..1.</param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>http://en.wikipedia.org/wiki/Levenshtein_distance</remarks>
        public static Int32 LevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 distance;
            if (TryGetTrivialDistance(source, target, out distance, out similarity))
            {
                return distance;
            }

            String from = Normalize(source, isCaseSensitive);
            String to = Normalize(target, isCaseSensitive);
            Int32 m = from.Length;
            Int32 n = to.Length;
            Int32[,] h = CreateMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = from[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    h[i, j] = BasicCost(h, i, j, si == to[j - 1]);
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Computes the restricted Damerau-Levenshtein distance (optimal string alignment):
        /// Levenshtein operations plus transposition of two adjacent characters.
        /// </summary>
        /// <param name="source">Source string; null is treated as empty.</param>
        /// <param name="target">Target string; null is treated as empty.</param>
        /// <param name="similarity">Output: similarity score in the range 0..1.</param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance</remarks>
        public static Int32 OptimalStringAlignmentDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 distance;
            if (TryGetTrivialDistance(source, target, out distance, out similarity))
            {
                return distance;
            }

            String from = Normalize(source, isCaseSensitive);
            String to = Normalize(target, isCaseSensitive);
            Int32 m = from.Length;
            Int32 n = to.Length;
            Int32[,] h = CreateMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = from[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    Char tj = to[j - 1];
                    h[i, j] = BasicCost(h, i, j, si == tj);

                    // Transposition of two adjacent characters: "xy" -> "yx".
                    if (i > 1 && j > 1 && si == to[j - 2] && tj == from[i - 2])
                    {
                        h[i, j] = Math.Min(h[i, j], h[i - 2, j - 2] + 1);
                    }
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Computes the unrestricted Damerau-Levenshtein distance, which also allows
        /// deletions and insertions between the two transposed characters.
        /// A modified version of this code (without the similarity output and the
        /// case-sensitivity option) was contributed to Wikipedia by the original author.
        /// </summary>
        /// <param name="source">Source string; null is treated as empty.</param>
        /// <param name="target">Target string; null is treated as empty.</param>
        /// <param name="similarity">Output: similarity score in the range 0..1.</param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance</remarks>
        public static Int32 DamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 distance;
            if (TryGetTrivialDistance(source, target, out distance, out similarity))
            {
                return distance;
            }

            String from = Normalize(source, isCaseSensitive);
            String to = Normalize(target, isCaseSensitive);
            Int32 m = from.Length;
            Int32 n = to.Length;

            // The matrix is shifted by one extra row/column: row 0 and column 0 hold a
            // sentinel larger than any real distance, so a transposition that refers to
            // a character not seen yet (index 0) can never win the Math.Min below.
            Int32[,] h = new Int32[m + 2, n + 2];
            Int32 inf = m + n;
            h[0, 0] = inf;
            for (Int32 i = 0; i <= m; i++)
            {
                h[i + 1, 1] = i;
                h[i + 1, 0] = inf;
            }
            for (Int32 j = 0; j <= n; j++)
            {
                h[1, j + 1] = j;
                h[0, j + 1] = inf;
            }

            // Last (1-based) row in which each character occurred in the source so far.
            // Keying by character instead of a fixed alphabet size (the parameter C of
            // the ActionScript sample on Wikipedia) works for any script. A missing key
            // reads as 0 through TryGetValue, which selects the sentinel row.
            Dictionary<Char, Int32> lastRowOf = new Dictionary<Char, Int32>();

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = from[i - 1];
                Int32 lastMatchColumn = 0; // last column in this row where si matched the target
                for (Int32 j = 1; j <= n; j++)
                {
                    Char tj = to[j - 1];

                    Int32 i1; // last row (before i) where the source contains tj; 0 if none
                    lastRowOf.TryGetValue(tj, out i1);
                    Int32 j1 = lastMatchColumn; // last column (before j) where the target contains si; 0 if none

                    // Deletion, insertion, substitution.
                    if (si == tj)
                    {
                        h[i + 1, j + 1] = h[i, j];
                        lastMatchColumn = j;
                    }
                    else
                    {
                        h[i + 1, j + 1] = Math.Min(h[i, j], Math.Min(h[i + 1, j], h[i, j + 1])) + 1;
                    }

                    // Transposition: delete the source characters between rows i1 and i,
                    // swap the two characters (cost 1), then insert the target characters
                    // between columns j1 and j.
                    h[i + 1, j + 1] = Math.Min(h[i + 1, j + 1], h[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
                }

                lastRowOf[si] = i;
            }

            similarity = ComputeSimilarity(h[m + 1, n + 1], m, n);
            return h[m + 1, n + 1];
        }

        /// <summary>
        /// Computes the unrestricted Damerau-Levenshtein distance using a simpler,
        /// easier-to-follow formulation that searches for the previous occurrences with
        /// <c>LastIndexOf</c> instead of tracking them incrementally.
        /// </summary>
        /// <param name="source">Source string; null is treated as empty.</param>
        /// <param name="target">Target string; null is treated as empty.</param>
        /// <param name="similarity">Output: similarity score in the range 0..1.</param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>Easier-to-understand variant of <c>DamerauLevenshteinDistance</c>.</remarks>
        public static Int32 EZDamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 distance;
            if (TryGetTrivialDistance(source, target, out distance, out similarity))
            {
                return distance;
            }

            String from = Normalize(source, isCaseSensitive);
            String to = Normalize(target, isCaseSensitive);
            Int32 m = from.Length;
            Int32 n = to.Length;
            Int32[,] h = CreateMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = from[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    Char tj = to[j - 1];
                    h[i, j] = BasicCost(h, i, j, si == tj);

                    if (i > 1 && j > 1)
                    {
                        // 0-based index of the last tj in from[0 .. i-2], or -1.
                        Int32 i1 = from.LastIndexOf(tj, i - 2, i - 1);
                        if (i1 != -1)
                        {
                            // 0-based index of the last si in to[0 .. j-2], or -1.
                            Int32 j1 = to.LastIndexOf(si, j - 2, j - 1);
                            if (j1 != -1)
                            {
                                // Delete the source characters strictly between i1 and i-1,
                                // swap from[i1] with from[i-1] (cost 1), then insert the
                                // target characters strictly between j1 and j-1.
                                h[i, j] = Math.Min(h[i, j], h[i1, j1] + (i - i1 - 2) + 1 + (j - j1 - 2));
                            }
                        }
                    }
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Handles the cases where at least one input is null or empty; returns true
        /// (with distance and similarity set) when no matrix computation is needed.
        /// </summary>
        private static Boolean TryGetTrivialDistance(String source, String target, out Int32 distance, out Double similarity)
        {
            Boolean sourceEmpty = String.IsNullOrEmpty(source);
            Boolean targetEmpty = String.IsNullOrEmpty(target);

            if (sourceEmpty && targetEmpty)
            {
                distance = 0;
                similarity = 1;
                return true;
            }

            if (sourceEmpty)
            {
                distance = target.Length;
                similarity = 0;
                return true;
            }

            if (targetEmpty)
            {
                distance = source.Length;
                similarity = 0;
                return true;
            }

            distance = 0;
            similarity = 0;
            return false;
        }

        /// <summary>
        /// Returns the string as-is for case-sensitive comparison, otherwise lower-cased
        /// with the invariant culture so the result is independent of the thread culture.
        /// </summary>
        private static String Normalize(String value, Boolean isCaseSensitive)
        {
            return isCaseSensitive ? value : value.ToLowerInvariant();
        }

        /// <summary>
        /// Allocates an (m+1) x (n+1) matrix whose first column is 0..m and first row is 0..n.
        /// </summary>
        private static Int32[,] CreateMatrix(Int32 m, Int32 n)
        {
            Int32[,] h = new Int32[m + 1, n + 1];
            for (Int32 i = 0; i <= m; i++)
            {
                h[i, 0] = i;
            }
            for (Int32 j = 1; j <= n; j++)
            {
                h[0, j] = j;
            }
            return h;
        }

        /// <summary>
        /// Levenshtein recurrence for cell (i, j): the diagonal value on a match, otherwise
        /// one more than the cheapest of substitution, deletion and insertion.
        /// </summary>
        private static Int32 BasicCost(Int32[,] h, Int32 i, Int32 j, Boolean charactersMatch)
        {
            if (charactersMatch)
            {
                return h[i - 1, j - 1];
            }
            return Math.Min(h[i - 1, j - 1], Math.Min(h[i - 1, j], h[i, j - 1])) + 1;
        }

        /// <summary>
        /// Similarity score: the share of the longer string's length not consumed by edits.
        /// Only called with non-empty inputs, so the divisor is never zero.
        /// </summary>
        private static Double ComputeSimilarity(Int32 distance, Int32 m, Int32 n)
        {
            Int32 maxLength = Math.Max(m, n);
            return ((Double)(maxLength - distance)) / maxLength;
        }
    }
}
- C#:编辑距离计算及更新维基百科
- [算法]计算编辑距离
- 计算字符串编辑距离
- 计算编辑距离
- 计算最小编辑距离
- 维基百科更新使用条款 要求付费编辑公开说明自身身份
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 编辑距离及编辑距离算法
- 今天编辑了 "维基百科" 条目
- 计算字符串距离(编辑距离)
- IncrediBuild工具使用及设置
- 《 Unix环境高级编程 》笔记
- ARM体系结构与编程学习(五)
- JavaScript 显示当前日期和时间,年月日星期和时间
- asp.net 数据直接输出为下载
- C#:编辑距离计算及更新维基百科
- 存在于一个表而不存在于另一个表中的数据
- XSD生成 C# 类,关于decimal 类型在webservice 不能显示值的问题
- android之多线程工作(二)handler messge机制
- Get Post 请求方式的区别
- dom4j
- EditPlus What's New新增功能部分注解
- 40天 620 ~ 730
- 不使用microscale库从siwarex ms读重量值