SimHash+汉明距离的C#实现方法

来源:互联网 发布:python 字符串换行 编辑:程序博客网 时间:2024/06/05 06:37

根据以下 Java 实现方法改编:
http://itindex.net/detail/50448-%E7%9B%B8%E4%BC%BC-%E8%AE%A1%E7%AE%97-google

以下为SimHash+汉明距离的C#实现:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a string and compares fingerprints
    /// by Hamming distance. The text is split into tokens by
    /// <see cref="ChxTokenizer"/>; each token is hashed to a
    /// <c>hashbits</c>-wide value and the per-bit votes of all tokens are
    /// folded into one fingerprint.
    /// </summary>
    public class SimHash
    {
        /// <summary>Fingerprint width used by the single-argument constructor.</summary>
        private const int DefaultHashBits = 128;

        /// <summary>Multiplier of the per-token multiplicative hash.</summary>
        private static readonly BigInteger Multiplier = new BigInteger(1000003);

        private readonly String tokens;
        private readonly int hashbits;

        /// <summary>2^hashbits - 1; keeps intermediate hash values within hashbits bits.</summary>
        private readonly BigInteger hashMask;

        private readonly BigInteger strSimHash;

        /// <summary>The computed fingerprint (non-negative, at most hashbits bits wide).</summary>
        public BigInteger StrSimHash
        {
            get
            {
                return strSimHash;
            }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using
        /// <paramref name="hashbits"/> bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        /// <param name="hashbits">Width of the fingerprint in bits; must be positive.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is zero or negative.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(hashbits), hashbits, "hashbits must be a positive number of bits.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            // The mask must exist before simHash() runs, because hash() uses it.
            this.hashMask = (BigInteger.One << hashbits) - BigInteger.One;
            this.strSimHash = simHash();
        }

        /// <summary>Builds a 128-bit fingerprint of <paramref name="tokens"/>.</summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        public SimHash(String tokens)
            : this(tokens, DefaultHashBits)
        {
        }

        /// <summary>
        /// Hashes every token and lets each hash vote +1 / -1 on every bit
        /// position; bit i of the result is set when the vote total for
        /// position i is not negative.
        /// </summary>
        private BigInteger simHash()
        {
            // Single-bit masks are built once instead of once per token and bit.
            BigInteger[] bitmasks = new BigInteger[this.hashbits];
            for (int i = 0; i < this.hashbits; i++)
            {
                bitmasks[i] = BigInteger.One << i;
            }

            int[] v = new int[this.hashbits];
            ChxTokenizer stringTokens = new ChxTokenizer(this.tokens);
            while (stringTokens.hasMoreTokens())
            {
                BigInteger t = this.hash(stringTokens.nextToken());
                for (int i = 0; i < this.hashbits; i++)
                {
                    if ((t & bitmasks[i]).IsZero)
                    {
                        v[i] -= 1;
                    }
                    else
                    {
                        v[i] += 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int i = 0; i < this.hashbits; i++)
            {
                // A tie (including "no tokens at all") counts as a set bit.
                if (v[i] >= 0)
                {
                    fingerprint |= bitmasks[i];
                }
            }
            return fingerprint;
        }

        /// <summary>
        /// Multiplicative string hash of one token, reduced to hashbits bits
        /// after every step and finally XOR-ed with the token length.
        /// </summary>
        /// <param name="source">Token to hash; null or empty hashes to zero.</param>
        private BigInteger hash(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * Multiplier) ^ new BigInteger((long)item)) & this.hashMask;
            }

            // x is never negative here (it was just masked with a positive
            // mask and the length is non-negative), so no -1 special case is needed.
            return x ^ new BigInteger(source.Length);
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint and
        /// <paramref name="other"/> differ, considering the low hashbits bits
        /// of this instance.
        /// </summary>
        /// <param name="other">Fingerprint to compare with.</param>
        /// <returns>The number of differing bits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException(nameof(other));
            }

            BigInteger x = (this.strSimHash ^ other.strSimHash) & this.hashMask;
            int tot = 0;
            while (!x.IsZero)
            {
                tot += 1;
                // Clears the lowest set bit, so the loop runs once per differing bit.
                x &= x - BigInteger.One;
            }
            return tot;
        }
    }

    /// <summary>
    /// Minimal tokenizer: yields the input one UTF-16 character at a time
    /// (i.e. one Chinese character per token). It can be replaced by any
    /// other word-segmentation strategy.
    /// </summary>
    public class ChxTokenizer
    {
        private readonly string source;
        private readonly int length;
        private int index;

        /// <summary>Creates a tokenizer over <paramref name="source"/>.</summary>
        /// <param name="source">Text to split; null is treated as an empty string.</param>
        public ChxTokenizer(string source)
        {
            this.source = source ?? "";
            this.index = 0;
            this.length = this.source.Length;
        }

        /// <summary>Returns true while at least one token is left.</summary>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>Returns the next single-character token and advances.</summary>
        /// <exception cref="InvalidOperationException">No tokens are left.</exception>
        public string nextToken()
        {
            if (index >= length)
            {
                throw new InvalidOperationException("No more tokens.");
            }

            String s = source.Substring(index, 1);
            index++;
            return s;
        }
    }
}

使用方法示例:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace chx
{
    /// <summary>
    /// Console demo: fingerprints three similar sentences and prints the
    /// fingerprints and their pairwise Hamming distances.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Test();
        }

        /// <summary>
        /// Prints the SimHash of each sample sentence, then the Hamming
        /// distance from the first sentence to each of the other two.
        /// </summary>
        private static void Test()
        {
            var s1 = "中文分词太麻烦了,也有些中文分词组件也不错";
            var hash1 = new SimHash(s1);
            Console.WriteLine("S1.simhash: {0}", hash1.StrSimHash);

            var s2 = "有些中文分词太麻烦了,也有些中文分词组件也不错";
            var hash2 = new SimHash(s2);
            // Print the fingerprint of s2 itself, not hash1 again.
            Console.WriteLine("S2.simhash: {0}", hash2.StrSimHash);

            var s3 = "有些中文分词太麻烦了";
            var hash3 = new SimHash(s3);
            // Print the fingerprint of s3 itself, not hash1 again.
            Console.WriteLine("S3.simhash: {0}", hash3.StrSimHash);

            Console.WriteLine("============================");
            Console.WriteLine("s1与s2的汉明距离:{0}", hash1.HammingDistance(hash2));
            Console.WriteLine("s1与s3的汉明距离:{0}", hash1.HammingDistance(hash3));
        }
    }
}
阅读全文
0 0
原创粉丝点击