哈希表的扩展-布隆过滤器

来源：互联网发布：现代主义设计特点,知乎编辑：程序博客网时间：2024/06/05 05:28

布隆过滤器的简介

什么是布隆过滤器？

布隆过滤器(Bloom Filter)是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询效率都远远优于一般的算法，缺点是有一定的误识别率，并且删除元素比较困难。

为什么会出现布隆过滤器？

在日常生活中，包括在设计计算机软件时，我们经常需要判断一个元素是否在一个集合中。比如：要检查一个单词是否拼写正确（即是否在已知的字典中）；判断一个嫌疑人的名字是否已经在嫌疑名单中；在网络爬虫中，判断一个网站是否被访问过等等。最直接的方法就是将集合中全部的元素存在计算机中，遇到一个新元素时，将它和集合中的元素直接对比。计算机中的集合一般用哈希表存储，优点是快速准确，缺点是费存储空间。为了节省空间，我们可以不存储元素本身，而是用哈希函数将集合中的每个元素映射到位图(bitmap)中的一个位上，这样既节省空间又节省查找时间。但是由于哈希冲突的原因，我们有可能会产生误判，即不同的元素经过散列函数之后可能产生同一个地址。布隆过滤器使用多个哈希函数把一个元素映射到多个位上，只有这些位全部为1才认为元素存在，从而降低误判率。

布隆过滤器有哪些应用？
1、Google著名的分布式数据库Bigtable使用布隆过滤器来查找不存在的行或列，以减少磁盘查找IO的次数。
2、Squid网页代理缓存服务器在cache digests中使用了布隆过滤器。
3、Venti文档存储系统也采用布隆过滤器来检测先前存储的数据。
4、SPIN模型检测器使用布隆过滤器在大规模验证问题时跟踪可达状态空间。
5、Google Chrome浏览器使用布隆过滤器加速安全浏览服务。
6、在很多Key-Value系统也使用了布隆过滤器加快查询过程。如：Hbase、Accumulo、Leveldb。

简单的实现布隆过滤器

BitSet.h#pragma once#include <vector>class BitSet{public:    BitSet(size_t range)//构造函数    {        _a.resize((range >> 5) + 1, 0);    }    void Set(size_t num)    {        size_t index = num >> 5;//在哪个数中        size_t pos = num % 32;//在哪个比特位中        _a[index] |= (1 << pos);//将num对应的位置1    }    void ReSet(size_t num)    {        size_t index = num >> 5;        size_t pos = num % 32;        _a[index] &= ~(1 << pos);//将num对应的位置0    }    bool Test(size_t num)    {        size_t index = num >> 5;        size_t pos = num % 32;        return _a[index] & (1 << pos);//如果存在，对应的位是1，&1为1，否则相反    }protected:    vector<int> _a;};BloomFilter.h#pragma oncetemplate <typename K>struct _Func1{    size_t BKDRHash(const char *str)    {        register size_t hash = 0;        while (size_t ch = (size_t)*str++)        {            hash = hash * 131 + ch;   // 也可以乘以31、131、1313、13131、131313..                   }        return hash;    }    size_t operator()(const string& key)    {        return BKDRHash(key.c_str());    }};template <typename K>struct _Func2{    size_t SDBMHash(const char *str)    {        register size_t hash = 0;        while (size_t ch = (size_t)*str++)        {            hash = 65599 * hash + ch;            //hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;          }        return hash;    }    size_t operator()(const string& key)    {        return SDBMHash(key.c_str());    }};template <typename K>struct _Func3{    size_t RSHash(const char *str)    {        register size_t hash = 0;        size_t magic = 63689;        while (size_t ch = (size_t)*str++)        {            hash = hash * magic + ch;            magic *= 378551;        }        return hash;    }    size_t operator()(const string& key)    {        return RSHash(key.c_str());    }};template <typename K>struct _Func4{    size_t APHash(const char *str)    {        register size_t hash = 0;        size_t ch;        for (long i = 0; ch = (size_t)*str++; i++)        {            if ((i & 1) == 0)         
   {                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));            }            else            {                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));            }        }        return hash;    }    size_t operator()(const string& key)    {        return APHash(key.c_str());    }};template <typename K>struct _Func5{    size_t JSHash(const char *str)    {        if (!*str)        // 这是由本人添加，以保证空字符串返回哈希值0              return 0;        register size_t hash = 1315423911;        while (size_t ch = (size_t)*str++)        {            hash ^= ((hash << 5) + ch + (hash >> 2));        }        return hash;    }    size_t operator()(const string& key)    {        return JSHash(key.c_str());    }};template <typename K = string    , typename Func1 = _Func1<K>    , typename Func2 = _Func2<K>    , typename Func3 = _Func3<K>    , typename Func4 = _Func4<K>    , typename Func5 = _Func5<K >>class BloomFilter{public:    BloomFilter(const size_t range)        :_s1(range)        , _size(range)    {}    void Set(const K& key)    {        size_t index1 = Func1()(key.c_str()) % _size;        size_t index2 = Func2()(key.c_str()) % _size;        size_t index3 = Func3()(key.c_str()) % _size;        size_t index4 = Func4()(key.c_str()) % _size;        size_t index5 = Func5()(key.c_str()) % _size;        _s1.Set(index1);        _s1.Set(index2);        _s1.Set(index3);        _s1.Set(index4);        _s1.Set(index5);    }    bool Test(const K& key)    {        size_t index1 = Func1()(key.c_str()) % _size;        _s1.Test(index1);        if (_s1.Test(index1) == 0)            return false;        size_t index2 = Func2()(key.c_str()) % _size;        _s1.Test(index2);        if (_s1.Test(index2) == 0)            return false;        size_t index3 = Func3()(key.c_str()) % _size;        _s1.Test(index3);        if (_s1.Test(index3) == 0)            return false;        size_t index4 = Func4()(key.c_str()) % _size;        _s1.Test(index4);        if (_s1.Test(index4) == 0)            
return false;        size_t index5 = Func1()(key.c_str()) % _size;        _s1.Test(index5);        if (_s1.Test(index5) == 0)            return false;        return true;    }protected:    BitSet _s1;    size_t _size;};void TestBloomFilter(){    BloomFilter<> bf1(1000);    bf1.Set("sort");    bf1.Set("man");    bf1.Set("left");    bf1.Set("123");    bf1.Set("真的");    bf1.Set("https://hao.360.cn/?a1006");    bf1.Set("https://hao.360.cn/?a10061");    bf1.Set("https://hao.360.cn/?a10062");    bf1.Set("https://hao.360.cn/?a10063");    bf1.Set("https://hao.360.cn/?a10064");    cout << "Is True?:" << bf1.Test("sort") << endl;    cout << "Is True?:" << bf1.Test("123") << endl;    cout << "Is True?:" << bf1.Test("left1") << endl;    cout << "Is True?:" << bf1.Test("真的") << endl;    cout << "Is True?:" << bf1.Test("假的") << endl;    cout << "Is True?:" << bf1.Test("https://hao.360.cn/?a1006") << endl;    cout << "Is True?:" << bf1.Test("https://hao.360.cn/?a10064") << endl;    cout << "Is True?:" << bf1.Test("https://hao.360.cn/?a10067") << endl;}Test.cpp#include <iostream>#include <string>#include <cassert>#include <cstdlib>using namespace std;#include "BitSet.h"#include "BloomFilter.h"int main(){    TestBloomFilter();    return 0;}

阅读全文

1 0