布隆过滤器实现及应用

来源：互联网发布：php接口的用途编辑：程序博客网时间：2024/06/01 16:47

引子：

下午-开会-对需求-通过友好的交流，得知，我们需要实现一个系统，系统需要实现一个API，用于检查一条特定的数据是否在数据库中，数据库中数据量暂且估计3000万条，API调用量每秒2万次，需求可以接纳一小部分失误，遂想方案，加redis？开什么玩笑，这么小一个API，这么大动干戈，不值得，这时候，布隆过滤器就派上用场了。

介绍：

布隆过滤器（Bloom Filter）由 Burton Howard Bloom 于 1970 年提出，它用一个位数组加若干个哈希函数来判断某个元素是否属于一个集合，详细的历史和推导这里就不展开了。

网上介绍布隆过滤器的文章还算多，大家可以自行搜索；有兴趣的话也可以自行推导一下误判率、位数组长度、插入元素数量、哈希函数个数之间的关系。这里只说一下它的优点、缺点和使用场景，并附上源码实现供大家使用。

优点：

在大幅缩小使用空间的条件下，以极高的效率实现判断一个元素是否存在于某个集合中（常数级别）

缺点：

有一定的假正例，但没有假反例：判断元素“存在于集合中”时有一定的误判率，但如果判断元素“不在集合中”，则此元素一定不在集合中。使用前需要确定好预计存入的最大元素数量和能够接受的最大误判率，从而确定所需的位数组位数。

不能删除。

使用场景：

URL去重，

黑名单逻辑(由于有一定误判率，可以在布隆过滤器后面增加一个白名单)

等任何需要判断集合包含关系的场景

源码实现

虽然介绍布隆过滤器原理的很多，但是提供源码实现的很少，或者实现比较简单，今天给大家分享一个生产环境布隆过滤器的实现，有需要的人可直接copy去使用。

/** * 布隆过滤器实现 * congcong.han create on 2016-05-25 */public class BloomFilter {    private static final Logger LOGGER = LoggerFactory.getLogger(BloomFilter.class);    /**     * 分配的byte数量     */    protected long byteSize;    /**     * 使用hash数量     */    protected int hashCount;    /**     * hash函数     */    protected final MurmurHash hash = MurmurHash.getInstance();;    /**     * 布隆过滤器存入key数量     */    protected AtomicInteger keyCount = new AtomicInteger();    /**     * 保证误判率可以放入的最多的key     */    protected int maxKeys;    /**     * byte数组     */    protected ByteBuffer bloom;    protected Lock lock = new ReentrantLock();    public static final double LOG2_SQUARED = Math.log(2) * Math.log(2);    private static final byte[] bitvals = {            (byte) 0x01,            (byte) 0x02,            (byte) 0x04,            (byte) 0x08,            (byte) 0x10,            (byte) 0x20,            (byte) 0x40,            (byte) 0x80    };    /**     * @param maxKeys   最多放入key数量     * @param errorRate 误判率     * @return 需要多少bit位     */    public static long computeBitSize(long maxKeys, double errorRate) {        return (long) Math.ceil(maxKeys * (-Math.log(errorRate) / LOG2_SQUARED));    }    /**     * 计算一个布隆过滤器所能放入的最大值 假设hash数量为最理想情况     *     * @param bitSize     * @param errorRate     * @return     */    public static long idealMaxKeys(long bitSize, double errorRate) {        return (long) (bitSize * (LOG2_SQUARED / -Math.log(errorRate)));    }    /**     * 计算一个布隆过滤器所能放入的最大值     *     * @param bitSize     * @param errorRate     * @param hashCount     * @return     */    public static long computeMaxKeys(long bitSize, double errorRate,                                      int hashCount) {        return (long) (-bitSize * 1.0 / hashCount *                Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));    }    /**     * 计算本过滤器现在情况下的错误率     */    public double actualErrorRate() {        return actualErrorRate(keyCount.get(), byteSize * 8, hashCount);    }    /**     * 
计算错误率     *     * @param maxKeys     * @param bitSize     * @param functionCount     * @return the actual error rate     */    public static double actualErrorRate(long maxKeys, long bitSize,                                         int functionCount) {        return Math.exp(Math.log(1 - Math.exp(-functionCount * maxKeys * 1.0                / bitSize)) * functionCount);    }    /**     * 圆整byte数量，使其可以折叠foldFactor次     *     * @param bitSize     * @param foldFactor     * @return Foldable byte size     */    public static int computeFoldableByteSize(long bitSize, int foldFactor) {        long byteSizeLong = (bitSize + 7) / 8;        int mask = (1 << foldFactor) - 1;        if ((mask & byteSizeLong) != 0) {            byteSizeLong >>= foldFactor;            ++byteSizeLong;            byteSizeLong <<= foldFactor;        }        if (byteSizeLong > Integer.MAX_VALUE) {            throw new IllegalArgumentException("byteSize=" + byteSizeLong + " too "                    + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor);        }        return (int) byteSizeLong;    }    /**     * 最优hash数量     *     * @param maxKeys     * @param bitSize     * @return     */    private static int optimalFunctionCount(int maxKeys, long bitSize) {        long i = bitSize / maxKeys;        double result = Math.ceil(Math.log(2) * i);        if (result > Integer.MAX_VALUE) {            throw new IllegalArgumentException("result too large for integer value.");        }        return (int) result;    }    public BloomFilter(){}    /**     * @param maxKeys    布隆过滤器期望插入KEY     * @param errorRate  期望概率     * @param foldFactor 可以折叠次数     * @throws IllegalArgumentException     */    public BloomFilter(int maxKeys, double errorRate,                           int foldFactor) throws IllegalArgumentException {        long bitSize = computeBitSize(maxKeys, errorRate);        hashCount = optimalFunctionCount(maxKeys, bitSize);        this.maxKeys = maxKeys;        byteSize = 
computeFoldableByteSize(bitSize, foldFactor);        LOGGER.info("create bloom filter ,length {} maxkey {}  errorRate {}",byteSize,maxKeys,errorRate);        sanityCheck();    }    /**     * 建造规定大小布隆过滤器     *     * @param byteSizeHint 大小     * @param errorRate    错误率     * @param foldFactor   折叠率     * @return the new Bloom filter of the desired size     */    public static BloomFilter createBySize(int byteSizeHint,                                               double errorRate, int foldFactor) {        BloomFilter bbf = new BloomFilter();        bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor);        long bitSize = bbf.byteSize * 8;        bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate);        bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize);        bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount);        return bbf;    }    public BloomFilter createAnother() {        BloomFilter bbf = new BloomFilter();        bbf.byteSize = byteSize;        bbf.hashCount = hashCount;        bbf.maxKeys = maxKeys;        return bbf;    }    /**     * 分配空间     */    public void allocBloom() {        if (this.bloom != null) {            throw new IllegalArgumentException("can only create bloom once.");        }        this.bloom = ByteBuffer.allocate((int) this.byteSize);        assert this.bloom.hasArray();    }    void sanityCheck() throws IllegalArgumentException {        if (0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) {            throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);        }        if (this.hashCount <= 0) {            throw new IllegalArgumentException("Hash function count must be > 0");        }        if (this.hash == null) {            throw new IllegalArgumentException("hashType must be known");        }        if (this.keyCount.get() < 0) {            throw new IllegalArgumentException("must have positive keyCount");        }    }    public void add(byte[] buf) {  
      add(buf, 0, buf.length);    }    public void add(byte[] buf, int offset, int len) {        int hash1 = this.hash.hash(buf, offset, len, 0);        int hash2 = this.hash.hash(buf, offset, len, hash1);        for (int i = 0; i < this.hashCount; i++) {            long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));            set(hashLoc);        }        this.keyCount.incrementAndGet();    }    public boolean contains(byte[] buf) {        return contains(buf, 0, buf.length, this.bloom);    }    public boolean contains(byte[] buf, int offset, int length,                            ByteBuffer theBloom) {        if (theBloom.limit() != byteSize) {            throw new IllegalArgumentException("Bloom does not match expected size:"                    + " theBloom.limit()=" + theBloom.limit() + ", byteSize=" + byteSize);        }        return contains(buf, offset, length, theBloom, 0, (int) byteSize, hash, hashCount);    }    public static boolean contains(byte[] buf, int offset, int length,                                   ByteBuffer bloomBuf, int bloomOffset, int bloomSize, MurmurHash hash,                                   int hashCount) {        int hash1 = hash.hash(buf, offset, length, 0);        int hash2 = hash.hash(buf, offset, length, hash1);        int bloomBitSize = bloomSize << 3;        int compositeHash = hash1;        for (int i = 0; i < hashCount; i++) {            int hashLoc = Math.abs(compositeHash % bloomBitSize);            compositeHash += hash2;            if (!get(hashLoc, bloomBuf, bloomOffset)) {                return false;            }        }        return true;    }    //---------------------------------------------------------------------------    /** Private helpers */    void set(long pos) {        int bytePos = (int) (pos / 8);        int bitPos = (int) (pos % 8);        try {            this.lock.lock();            byte curByte = bloom.get(bytePos);            curByte |= bitvals[bitPos];            
bloom.put(bytePos, curByte);        } finally {            this.lock.unlock();        }    }    static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) {        int bytePos = pos >> 3; //pos / 8        int bitPos = pos & 0x7; //pos % 8        // TODO access this via Util API which can do Unsafe access if possible(?)        byte curByte = bloomBuf.get(bloomOffset + bytePos);        curByte &= bitvals[bitPos];        return (curByte != 0);    }    public long getKeyCount() {        return keyCount.get();    }    public long getMaxKeys() {        return maxKeys;    }    public long getByteSize() {        return byteSize;    }    /**     * 保证误判率的情况下，压缩以节省空间（如果可以的话）     */    public void compactBloom() {        if (this.keyCount.get() > 0 && this.bloom.hasArray()) {            int pieces = 1;            int newByteSize = (int) this.byteSize;            int newMaxKeys = this.maxKeys;            while ((newByteSize & 1) == 0 && newMaxKeys > (this.keyCount.get() << 1)) {                pieces <<= 1;                newByteSize >>= 1;                newMaxKeys >>= 1;            }            if (pieces > 1) {                byte[] array = this.bloom.array();                int start = this.bloom.arrayOffset();                int end = start + newByteSize;                int off = end;                for (int p = 1; p < pieces; ++p) {                    for (int pos = start; pos < end; ++pos) {                        array[pos] |= array[off++];                    }                }                this.bloom.rewind();                this.bloom.limit(newByteSize);                this.bloom = this.bloom.slice();                this.byteSize = newByteSize;                this.maxKeys = newMaxKeys;            }        }    }}

/** *  MurmurHash 参考 http://murmurhash.googlepages.com/ */public class MurmurHash {    private static MurmurHash _instance = new MurmurHash();    public static MurmurHash getInstance() {        return _instance;    }    public int hash(byte[] data, int offset, int length, int seed) {        int m = 0x5bd1e995;        int r = 24;        int h = seed ^ length;        int len_4 = length >> 2;        for (int i = 0; i < len_4; i++) {            int i_4 = (i << 2) + offset;            int k = data[i_4 + 3];            k = k << 8;            k = k | (data[i_4 + 2] & 0xff);            k = k << 8;            k = k | (data[i_4 + 1] & 0xff);            k = k << 8;            k = k | (data[i_4 + 0] & 0xff);            k *= m;            k ^= k >>> r;            k *= m;            h *= m;            h ^= k;        }        // avoid calculating modulo        int len_m = len_4 << 2;        int left = length - len_m;        int i_m = len_m + offset;        if (left != 0) {            if (left >= 3) {                h ^= data[i_m + 2] << 16;            }            if (left >= 2) {                h ^= data[i_m + 1] << 8;            }            if (left >= 1) {                h ^= data[i_m];            }            h *= m;        }        h ^= h >>> 13;        h *= m;        h ^= h >>> 15;        return h;    }}

0 0