数据存在？-‘布隆过滤器’

来源：互联网发布：淘宝c店直播运营编辑：程序博客网时间：2024/05/29 15:08

布隆过滤器是一种能够在大量数据中判断数据是否存在的算法。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法，缺点是有一定的误识别率和删除困难。在介绍‘布隆过滤器’之前，先介绍一下‘位图’的思想：

这里有这样一个问题：给40亿个没有排序、不重复的无符号整数，如何快速的判断一个数据是否在这40亿个数据之中？

--对于40亿个数据，如果我们将这40亿个数据都放入内存中，我们需要多大的存储空间呢？假设每个数据都是char类型的（1字节），大约需要4GB；如果是int类型（4字节），则大约需要16GB；数据更多时开销还会继续增长。不难知道对于大量的数据，直接将数据本身全部放入内存是很不理智的。这里介绍一种方法—‘位图’：用一个bit表示一个数是否存在，覆盖全部32位无符号整数只需要2^32个bit，约512MB。

位图：主要算法思想就是充分的利用bit位，假设数据都是int类型，每个int类型都占32个bit位。将一个int类型数据的32个bit用来表示32个数据是否存在， 0表示不存在，1表示存在（能够极大地缩小空间）。先计算出数据在哪一个int类型的空间中，然后计算在这个int类型的第几个bit位上，然后将此位置更改为1，表明这个位置上存在数据。

下面是‘位图’的实现：

// Bitmap that records which unsigned integers are present, one bit per value.
//
// Layout: every element of _a holds 32 flags in its low 32 bits, so value x is
// stored in word (x / 32) at bit (x % 32). The element type stays size_t to keep
// the protected storage compatible with existing derived classes.
class BitMap
{
public:
    // Allocates zero-initialised storage able to hold at least the values
    // [0, size].
    BitMap(size_t size = 0)
        : _a((size >> kShift) + 1)
        , _size(0)
    {
    }

    // Marks x as present. The storage grows automatically when x lies beyond
    // the current capacity, so the call never writes out of bounds.
    void set(size_t x)
    {
        const size_t index = WordIndex(x);
        if (index >= _a.size())
            _a.resize(index + 1);

        const size_t mask = BitMask(x);
        if ((_a[index] & mask) == 0)   // only count values that were absent
        {
            _a[index] |= mask;
            ++_size;
        }
    }

    // Marks x as absent. Does nothing when x is out of range or not present,
    // so the element count can never underflow.
    void Reset(size_t x)
    {
        const size_t index = WordIndex(x);
        if (index >= _a.size())
            return;

        const size_t mask = BitMask(x);   // bit position comes from x, not from index
        if ((_a[index] & mask) != 0)
        {
            _a[index] &= ~mask;
            --_size;
        }
    }

    // Returns true when x is present; values beyond the capacity are absent.
    bool Test(size_t x) const
    {
        const size_t index = WordIndex(x);
        if (index >= _a.size())
            return false;
        return (_a[index] & BitMask(x)) != 0;
    }

    // Returns the number of values currently present.
    size_t size() const
    {
        return _size;
    }

    // Re-dimensions the storage so that it can hold at least the values
    // [0, size]. When the storage shrinks, the values that fall off the end
    // are removed from the element count.
    void Resize(size_t size)
    {
        const size_t words = (size >> kShift) + 1;
        for (size_t i = words; i < _a.size(); ++i)
            _size -= CountBits(_a[i]);
        _a.resize(words);
    }

protected:
    std::vector<size_t> _a;   // bit storage, 32 flags per element
    size_t _size;             // number of bits currently set

private:
    static const size_t kShift = 5;          // log2(kBitsPerWord)
    static const size_t kBitsPerWord = 32;   // flags stored per element of _a

    // Index of the word that holds the flag for x.
    static size_t WordIndex(size_t x)
    {
        return x >> kShift;
    }

    // Single-bit mask for x inside its word. The shift is done on an unsigned
    // size_t so bit 31 does not involve a signed int.
    static size_t BitMask(size_t x)
    {
        return static_cast<size_t>(1) << (x % kBitsPerWord);
    }

    // Number of set bits in w (clears the lowest set bit each iteration).
    static size_t CountBits(size_t w)
    {
        size_t count = 0;
        while (w != 0)
        {
            w &= w - 1;
            ++count;
        }
        return count;
    }
};

‘布隆过滤器’也是利用位图的思想，它有一个包含m个bit的空间，每一个bit位都初始化为0。通过k种不同的hash函数，每个函数都确定出元素所在的一个位置，将这k个位置的bit位置为1，即把这个元素添加到了m个bit的空间中。当需要对数据进行查找时，将k种hash函数得到的k个位置的bit位进行检查：只要有一个位置为0，则数据一定不存在；若k个位置都为1，则数据很可能存在（不同元素映射的位置可能重叠，因此存在一定的误判率）。布隆过滤器是不允许进行删除数据的，因为那样会将k个位置置为0，可能会影响其他数据的存在性，从而产生错误。

下面是‘布隆过滤器’的实现：

//实现布隆过滤器template <class K>//使用搜索到的5种Hash函数struct _HashFunc1{     size_t DJBHash(const char *str)     {          if (!*str)               return 0;          register size_t hash = 5381;          while (size_t ch = (size_t)*str++)          {               hash += (hash << 5) + ch;          }          return hash;     }          size_t operator()(const K& str)     {          return DJBHash(str.c_str());     }};template <class K>struct _HashFunc2{     size_t SDBMHash(const char *str)     {          register size_t hash = 0;          while (size_t ch = (size_t)*str++)          {               hash = 65599 * hash + ch;          }          return hash;     }          size_t operator()(const K& str)     {          return SDBMHash(str.c_str());     }};template <class K>struct _HashFunc3{     size_t RSHash(const char *str)     {          register size_t hash = 0;          size_t magic = 63689;          while (size_t ch = (size_t)*str++)          {               hash = hash * magic + ch;               magic *= 378551;          }          return hash;     }          size_t operator()(const K& str)     {          return RSHash(str.c_str());     }};template <class K>struct _HashFunc4{     size_t APHash(const char *str)     {          register size_t hash = 0;          size_t ch;          for (long i = 0; ch = (size_t)*str++; i++)          {               if ((i & 1) == 0)               {                    hash ^= ((hash << 7) ^ ch ^ (hash >> 3));               }               else               {                    hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));               }          }          return hash;     }          size_t operator()(const K& str)     {          return APHash(str.c_str());     }};template <class K>struct _HashFunc5{     size_t JSHash(const char *str)     {          if (!*str)               return 0;          register size_t hash = 1315423911;          while (size_t ch = (size_t)*str++)          {               hash ^= ((hash << 5) + ch + (hash >> 2));   
       }          return hash;     }     size_t operator()(const K& str)     {          return JSHash(str.c_str());     }};size_t GetPrimeSize(size_t size) //求大于等于size的最小素数{     static const int _prime = 28;     static const unsigned long _PrimeList[_prime] =     {          53ul, 97ul, 193ul, 389ul, 769ul,          1543ul, 3079ul, 6151ul, 12289ul, 24593ul,          49157ul, 98317ul, 196613ul, 393241ul, 786433ul,          1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,          50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,          1610612741ul, 3221225473ul, 4294967291ul     };          for (size_t i = 0; i < _prime; i++)     {          if (_PrimeList[i] >= size)          {               return _PrimeList[i];          }     }     return _PrimeList[_prime - 1];}template <class K = string,           class HashFunc1 = _HashFunc1<K>,          class HashFunc2 = _HashFunc2<K>,          class HashFunc3 = _HashFunc3<K>,          class HashFunc4 = _HashFunc4<K>,          class HashFunc5 = _HashFunc5<K>>class BloomFilter{public:     BloomFilter(size_t size = 0)    //构造     {          _capacity = GetPrimeSize(size);          _bm.Resize(_capacity);     }          void set(const K& key)     {          size_t index1 = HashFunc1()(key);          size_t index2 = HashFunc2()(key);          size_t index3 = HashFunc3()(key);          size_t index4 = HashFunc4()(key);          size_t index5 = HashFunc5()(key);                    _bm.set((index1) % _capacity);          _bm.set((index2) % _capacity);          _bm.set((index3) % _capacity);          _bm.set((index4) % _capacity);          _bm.set((index5) % _capacity);     }          bool Test(const K& key)    //测试数据是否存在     {          size_t index1 = HashFunc1()(key);          if (!_bm.Test((index1) % _capacity))          {               return false;          }                    size_t index2 = HashFunc2()(key);          if (!_bm.Test((index2) % _capacity))          {               return false;          }   
                 size_t index3 = HashFunc3()(key);          if (!_bm.Test((index3) % _capacity))          {               return false;          }                    size_t index4 = HashFunc4()(key);          if (!_bm.Test((index4) % _capacity))          {               return false;          }                    size_t index5 = HashFunc5()(key);          if (!_bm.Test((index5) % _capacity))          {               return false;          }          return true;     }     protected:     BitMap _bm;     size_t _capacity;};

本文出自 “无心的执着” 博客，转载请与作者联系！

0 0