BloomFilter简易实现

来源:互联网 发布:java输入输出学生成绩 编辑:程序博客网 时间:2024/05/19 12:29
#ifndef BLOOM_FILTER_HPP
#define BLOOM_FILTER_HPP

#include <cstdlib>
#include <cstring>
#include <vector>

/*
 * Default hash: converts the value itself to unsigned int.
 * Provide a specialization for any type that has no such conversion.
 */
template <typename T>
unsigned int GetHash(const T & value)
{
    return static_cast<unsigned int>(value);
}

/*
 * Multipliers used to derive the i-th bit position from a single hash value.
 * The number of entries is the upper bound on the number of hash functions.
 */
const int prime[] = {
      3,   5,   7,  11,  13,  17,  19,  23,  29,  31,
     37,  41,  43,  47,  53,  59,  61,  67,  71,  73,
     79,  83,  89,  97, 101, 103, 107, 109, 113, 127,
    131, 137, 139, 149, 151, 157, 163, 167, 173, 179
};

/*
 * A simple Bloom filter.
 *
 * The filter keeps k independent bit arrays of `count` bits each. For the
 * i-th array the bit position of a value is (GetHash(value) * prime[i]) % count.
 * set() turns that bit on in every array; test() reports true only when the
 * bit is on in every array, so a value that was set always tests true, while
 * a value that was never set may still test true (false positive).
 *
 * Objects are non-copyable (copy operations are declared private and left
 * undefined).
 */
template <typename T>
class BloomFilter
{
public:
    /*
     * count: number of bits per array; the process is aborted when it is 0.
     * k:     requested number of hash functions; clamped to the range
     *        [2, number of entries in prime[]].
     */
    BloomFilter(unsigned int count, unsigned int k = 10);

    /* Marks `value` as present. */
    void set(const T & value);

    /* Returns false if `value` was definitely never set, true otherwise. */
    bool test(const T & value) const;

private:
    BloomFilter(const BloomFilter &);
    BloomFilter & operator = (const BloomFilter &);

    /* Turns off the bits of `value`; may also erase other values sharing them. */
    void clear(const T & value);

    /* Bit position of `hash` inside the `index`-th bit array. */
    unsigned int position(unsigned int hash, unsigned int index) const;

private:
    typedef std::vector<unsigned char> BitArray;

    static const unsigned int MIN_HASH_COUNT = 2;
    static const unsigned int MAX_HASH_COUNT = sizeof(prime) / sizeof(prime[0]);

    unsigned int          m_k;      /* number of hash functions / bit arrays */
    unsigned int          m_size;   /* bytes per bit array */
    unsigned int          m_count;  /* bits per bit array */
    std::vector<BitArray> m_filter; /* m_k arrays of m_size zero-initialized bytes */
};

template <typename T>
BloomFilter<T>::BloomFilter(unsigned int count, unsigned int k)
    : m_k(k), m_size(0), m_count(count), m_filter()
{
    if (m_count == 0) {
        /* A modulus of zero would make every position computation invalid. */
        std::abort();
    }

    if (m_k > MAX_HASH_COUNT) {
        m_k = MAX_HASH_COUNT;
    }
    else if (m_k < MIN_HASH_COUNT) {
        m_k = MIN_HASH_COUNT;
    }

    /* Positions lie in [0, m_count - 1], so ceil(m_count / 8) bytes suffice. */
    m_size = (m_count - 1) / 8 + 1;

    /* The vectors own the storage, so nothing leaks if an allocation throws. */
    m_filter.assign(m_k, BitArray(m_size));
}

template <typename T>
unsigned int BloomFilter<T>::position(unsigned int hash, unsigned int index) const
{
    return (hash * static_cast<unsigned int>(prime[index])) % m_count;
}

template <typename T>
void BloomFilter<T>::set(const T & value)
{
    const unsigned int hash = GetHash(value);
    for (unsigned int i = 0; i < m_k; ++i) {
        const unsigned int key = position(hash, i);
        m_filter[i][key >> 3] |= static_cast<unsigned char>(1u << (key & 0x07u));
    }
}

template <typename T>
void BloomFilter<T>::clear(const T & value)
{
    const unsigned int hash = GetHash(value);
    for (unsigned int i = 0; i < m_k; ++i) {
        const unsigned int key = position(hash, i);
        m_filter[i][key >> 3] &= static_cast<unsigned char>(~(1u << (key & 0x07u)));
    }
}

template <typename T>
bool BloomFilter<T>::test(const T & value) const
{
    const unsigned int hash = GetHash(value);
    for (unsigned int i = 0; i < m_k; ++i) {
        const unsigned int key = position(hash, i);
        if ((m_filter[i][key >> 3] & (1u << (key & 0x07u))) == 0) {
            return false;
        }
    }
    return true;
}

#endif

#include <vector>#include <string>#include <iostream>using namespace std;#include "BloomFilter.hpp"template <>unsigned int GetHash(const string & value){    unsigned int hash = 0;    typedef string::const_iterator iterator;    for (iterator iter = value.begin(); iter != value.end(); ++iter) {        hash += (*iter) * 5;    }    return(hash);}int main(int argc, char ** argv){    int array[] = { 9, 5, 4, 6, 7, 8, 0, 1, 55, -100 };    const int size = sizeof(array)/sizeof(array[0]);    int min = array[0];    int max = array[0];    cout << "array:   ";    for (int i = 0; i < size; ++i) {        if (array[i] > max) {            max = array[i];        }        else if (array[i] < min) {            min = array[i];        }        cout << array[i] << ' ';    }    cout << endl;        BloomFilter<int> filter1(size);    BloomFilter<int> filter2(5 * size);    BloomFilter<int> filter3(10 * size);    /* must be right, but BitMap can do it */    BloomFilter<int> filter4(max - min);    for (int i = 0; i < size; ++i) {        filter1.set(array[i]);        filter2.set(array[i]);        filter3.set(array[i]);        filter4.set(array[i]);    }    cout << "sorted1: ";    for (int value = min; value <= max; ++value) {        if (filter1.test(value)) {            cout << value << ' ';        }    }    cout << endl;    cout << "sorted2: ";    for (int value = min; value <= max; ++value) {        if (filter2.test(value)) {            cout << value << ' ';        }    }    cout << endl;    cout << "sorted3: ";    for (int value = min; value <= max; ++value) {        if (filter3.test(value)) {            cout << value << ' ';        }    }    cout << endl;    cout << "sorted4: ";    for (int value = min; value <= max; ++value) {        if (filter4.test(value)) {            cout << value << ' ';        }    }    cout << endl;    /* ------------------------------------------ */    const char * const url[] = {        "www.google.com.hk",         "www.bing.com.cn",         "www.baidu.com",     
    "www.manmankan.com",         "www.csdn.net"     };    BloomFilter<string> filter(500);    for (int i = 0; i < sizeof(url)/sizeof(url[0]); ++i) {        filter.set(url[i]);    }    const char * const check[] = {        "www.google.com.hk",         "www.bing.com.cn",         "www.baidu.com",         "www.manmankan.com",         "www.csdn.net",         "www.hao123.com",         "www.sohu.com",         "www.soso.com",         "www.sina.com",         "www.nosuchurl.com"     };    for (int i = 0; i < sizeof(check)/sizeof(check[0]); ++i) {        if (filter.test(check[i])) {            cout << check[i] << "    is exist" << endl;        }        else {            cout << check[i] << "    is not exist" << endl;        }    }    return(0);}

代码中位数组的大小(m_size,由传入的 m_count 决定)、哈希函数的个数(m_k)以及哈希函数的构造方式(GetHash)都不太恰当。