布隆过滤器实现及应用
来源:互联网 发布:php接口的用途 编辑:程序博客网 时间:2024/06/01 16:47
引子:
下午-开会-对需求-通过友好的交流,得知,我们需要实现一个系统,系统需要实现一个API,用于检查一条特定的数据是否在数据库中,数据库中数据量暂且估计3000万条,API调用量每秒2万次,需求可以接纳一小部分失误,遂想方案,加redis?开什么玩笑,这么小一个API,这么大动干戈,不值得,这时候,布隆过滤器就派上用场了。
介绍:
布隆过滤器(Bloom Filter)由 Burton Howard Bloom 于 1970 年提出,其历史背景这里不再赘述。
网上介绍布隆过滤器的文章还算多,大家可以自行百度;有兴趣的话也可以自行推导一下误判率、位数组位数、插入元素数量、哈希函数个数之间的关系。这里只说一下它的优点、缺点和使用场景,并附上源码实现供大家使用。
优点:
在大幅缩小使用空间的条件下,以极高的效率实现判断一个元素是否存在于某个集合中(常数级别)
缺点:
存在一定的假正例(false positive),但没有假反例(false negative):判断元素存在于集合中时有一定的误判率;但若判断元素不在集合中,则该元素一定不在集合中。使用前需要确定好最多存入的元素数量和能够接受的最大误判率,从而确定所需的位数组位数。
不能删除。
使用场景:
URL去重,
黑名单逻辑(由于有一定误判率,可以在布隆过滤器后面增加一个白名单)
等任何需要判断集合包含关系的场景
源码实现
虽然介绍布隆过滤器原理的文章很多,但是提供源码实现的很少,或者实现比较简单。今天给大家分享一个生产环境中使用的布隆过滤器实现,有需要的人可直接 copy 去使用。注意:创建过滤器之后需要先调用 allocBloom() 分配位数组,然后才能调用 add/contains。
/** * 布隆过滤器实现 * congcong.han create on 2016-05-25 */public class BloomFilter { private static final Logger LOGGER = LoggerFactory.getLogger(BloomFilter.class); /** * 分配的byte数量 */ protected long byteSize; /** * 使用hash数量 */ protected int hashCount; /** * hash函数 */ protected final MurmurHash hash = MurmurHash.getInstance();; /** * 布隆过滤器存入key数量 */ protected AtomicInteger keyCount = new AtomicInteger(); /** * 保证误判率可以放入的最多的key */ protected int maxKeys; /** * byte数组 */ protected ByteBuffer bloom; protected Lock lock = new ReentrantLock(); public static final double LOG2_SQUARED = Math.log(2) * Math.log(2); private static final byte[] bitvals = { (byte) 0x01, (byte) 0x02, (byte) 0x04, (byte) 0x08, (byte) 0x10, (byte) 0x20, (byte) 0x40, (byte) 0x80 }; /** * @param maxKeys 最多放入key数量 * @param errorRate 误判率 * @return 需要多少bit位 */ public static long computeBitSize(long maxKeys, double errorRate) { return (long) Math.ceil(maxKeys * (-Math.log(errorRate) / LOG2_SQUARED)); } /** * 计算一个布隆过滤器所能放入的最大值 假设hash数量为最理想情况 * * @param bitSize * @param errorRate * @return */ public static long idealMaxKeys(long bitSize, double errorRate) { return (long) (bitSize * (LOG2_SQUARED / -Math.log(errorRate))); } /** * 计算一个布隆过滤器所能放入的最大值 * * @param bitSize * @param errorRate * @param hashCount * @return */ public static long computeMaxKeys(long bitSize, double errorRate, int hashCount) { return (long) (-bitSize * 1.0 / hashCount * Math.log(1 - Math.exp(Math.log(errorRate) / hashCount))); } /** * 计算本过滤器现在情况下的错误率 */ public double actualErrorRate() { return actualErrorRate(keyCount.get(), byteSize * 8, hashCount); } /** * 计算错误率 * * @param maxKeys * @param bitSize * @param functionCount * @return the actual error rate */ public static double actualErrorRate(long maxKeys, long bitSize, int functionCount) { return Math.exp(Math.log(1 - Math.exp(-functionCount * maxKeys * 1.0 / bitSize)) * functionCount); } /** * 圆整byte数量,使其可以折叠foldFactor次 * * @param bitSize * @param foldFactor * @return Foldable byte size */ 
public static int computeFoldableByteSize(long bitSize, int foldFactor) { long byteSizeLong = (bitSize + 7) / 8; int mask = (1 << foldFactor) - 1; if ((mask & byteSizeLong) != 0) { byteSizeLong >>= foldFactor; ++byteSizeLong; byteSizeLong <<= foldFactor; } if (byteSizeLong > Integer.MAX_VALUE) { throw new IllegalArgumentException("byteSize=" + byteSizeLong + " too " + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor); } return (int) byteSizeLong; } /** * 最优hash数量 * * @param maxKeys * @param bitSize * @return */ private static int optimalFunctionCount(int maxKeys, long bitSize) { long i = bitSize / maxKeys; double result = Math.ceil(Math.log(2) * i); if (result > Integer.MAX_VALUE) { throw new IllegalArgumentException("result too large for integer value."); } return (int) result; } public BloomFilter(){} /** * @param maxKeys 布隆过滤器期望插入KEY * @param errorRate 期望概率 * @param foldFactor 可以折叠次数 * @throws IllegalArgumentException */ public BloomFilter(int maxKeys, double errorRate, int foldFactor) throws IllegalArgumentException { long bitSize = computeBitSize(maxKeys, errorRate); hashCount = optimalFunctionCount(maxKeys, bitSize); this.maxKeys = maxKeys; byteSize = computeFoldableByteSize(bitSize, foldFactor); LOGGER.info("create bloom filter ,length {} maxkey {} errorRate {}",byteSize,maxKeys,errorRate); sanityCheck(); } /** * 建造规定大小布隆过滤器 * * @param byteSizeHint 大小 * @param errorRate 错误率 * @param foldFactor 折叠率 * @return the new Bloom filter of the desired size */ public static BloomFilter createBySize(int byteSizeHint, double errorRate, int foldFactor) { BloomFilter bbf = new BloomFilter(); bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor); long bitSize = bbf.byteSize * 8; bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate); bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize); bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount); return bbf; } public BloomFilter createAnother() { BloomFilter bbf = new 
BloomFilter(); bbf.byteSize = byteSize; bbf.hashCount = hashCount; bbf.maxKeys = maxKeys; return bbf; } /** * 分配空间 */ public void allocBloom() { if (this.bloom != null) { throw new IllegalArgumentException("can only create bloom once."); } this.bloom = ByteBuffer.allocate((int) this.byteSize); assert this.bloom.hasArray(); } void sanityCheck() throws IllegalArgumentException { if (0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) { throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize); } if (this.hashCount <= 0) { throw new IllegalArgumentException("Hash function count must be > 0"); } if (this.hash == null) { throw new IllegalArgumentException("hashType must be known"); } if (this.keyCount.get() < 0) { throw new IllegalArgumentException("must have positive keyCount"); } } public void add(byte[] buf) { add(buf, 0, buf.length); } public void add(byte[] buf, int offset, int len) { int hash1 = this.hash.hash(buf, offset, len, 0); int hash2 = this.hash.hash(buf, offset, len, hash1); for (int i = 0; i < this.hashCount; i++) { long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); set(hashLoc); } this.keyCount.incrementAndGet(); } public boolean contains(byte[] buf) { return contains(buf, 0, buf.length, this.bloom); } public boolean contains(byte[] buf, int offset, int length, ByteBuffer theBloom) { if (theBloom.limit() != byteSize) { throw new IllegalArgumentException("Bloom does not match expected size:" + " theBloom.limit()=" + theBloom.limit() + ", byteSize=" + byteSize); } return contains(buf, offset, length, theBloom, 0, (int) byteSize, hash, hashCount); } public static boolean contains(byte[] buf, int offset, int length, ByteBuffer bloomBuf, int bloomOffset, int bloomSize, MurmurHash hash, int hashCount) { int hash1 = hash.hash(buf, offset, length, 0); int hash2 = hash.hash(buf, offset, length, hash1); int bloomBitSize = bloomSize << 3; int compositeHash = hash1; for (int i = 0; i < hashCount; i++) { int hashLoc = 
Math.abs(compositeHash % bloomBitSize); compositeHash += hash2; if (!get(hashLoc, bloomBuf, bloomOffset)) { return false; } } return true; } //--------------------------------------------------------------------------- /** Private helpers */ void set(long pos) { int bytePos = (int) (pos / 8); int bitPos = (int) (pos % 8); try { this.lock.lock(); byte curByte = bloom.get(bytePos); curByte |= bitvals[bitPos]; bloom.put(bytePos, curByte); } finally { this.lock.unlock(); } } static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) { int bytePos = pos >> 3; //pos / 8 int bitPos = pos & 0x7; //pos % 8 // TODO access this via Util API which can do Unsafe access if possible(?) byte curByte = bloomBuf.get(bloomOffset + bytePos); curByte &= bitvals[bitPos]; return (curByte != 0); } public long getKeyCount() { return keyCount.get(); } public long getMaxKeys() { return maxKeys; } public long getByteSize() { return byteSize; } /** * 保证误判率的情况下,压缩以节省空间(如果可以的话) */ public void compactBloom() { if (this.keyCount.get() > 0 && this.bloom.hasArray()) { int pieces = 1; int newByteSize = (int) this.byteSize; int newMaxKeys = this.maxKeys; while ((newByteSize & 1) == 0 && newMaxKeys > (this.keyCount.get() << 1)) { pieces <<= 1; newByteSize >>= 1; newMaxKeys >>= 1; } if (pieces > 1) { byte[] array = this.bloom.array(); int start = this.bloom.arrayOffset(); int end = start + newByteSize; int off = end; for (int p = 1; p < pieces; ++p) { for (int pos = start; pos < end; ++pos) { array[pos] |= array[off++]; } } this.bloom.rewind(); this.bloom.limit(newByteSize); this.bloom = this.bloom.slice(); this.byteSize = newByteSize; this.maxKeys = newMaxKeys; } } }}
/** * MurmurHash 参考 http://murmurhash.googlepages.com/ */public class MurmurHash { private static MurmurHash _instance = new MurmurHash(); public static MurmurHash getInstance() { return _instance; } public int hash(byte[] data, int offset, int length, int seed) { int m = 0x5bd1e995; int r = 24; int h = seed ^ length; int len_4 = length >> 2; for (int i = 0; i < len_4; i++) { int i_4 = (i << 2) + offset; int k = data[i_4 + 3]; k = k << 8; k = k | (data[i_4 + 2] & 0xff); k = k << 8; k = k | (data[i_4 + 1] & 0xff); k = k << 8; k = k | (data[i_4 + 0] & 0xff); k *= m; k ^= k >>> r; k *= m; h *= m; h ^= k; } // avoid calculating modulo int len_m = len_4 << 2; int left = length - len_m; int i_m = len_m + offset; if (left != 0) { if (left >= 3) { h ^= data[i_m + 2] << 16; } if (left >= 2) { h ^= data[i_m + 1] << 8; } if (left >= 1) { h ^= data[i_m]; } h *= m; } h ^= h >>> 13; h *= m; h ^= h >>> 15; return h; }}
0 0
- 布隆过滤器实现及应用
- 布隆过滤器及Java实现
- 布隆过滤器(Bloom Filter)原理及python实现
- 实现布隆过滤器
- Python实现布隆过滤器
- 布隆过滤器java实现
- 布隆过滤器:实现代码
- 布隆过滤器及其实现
- 实现一个布隆过滤器
- 大数据下-巧用位数组排序和判重及布隆过滤器的简单应用
- 【布隆过滤器】实现一个简单的布隆过滤器
- 简单布隆过滤器实现、布隆过滤器扩展
- 【布隆过滤器】实现一个简单的布隆过滤器
- 【DS】Hash表及布隆过滤器
- url去重 --布隆过滤器 bloom filter原理及python实现
- 全自动过滤器:精密过滤器技术特点及应用说明
- 全自动过滤器:精密过滤器技术特点及应用说明
- 过滤器:精密过滤器特点及应用范围概述
- Android ---Dialog小例
- Symmetric Tree
- HDU 1280 前m大的数(简单HASH)
- 分享一个链接
- Android listView每个item设置不同
- 布隆过滤器实现及应用
- JavaScript学习中的一些笔记
- 常用Docker命令
- 用PC搭建SVN服务器:[1]局域网SVN服务器
- 今天开始学习swift
- 浅谈Java泛型中的extends和super关键字
- Windows下Maven安装教程
- 近期规划及读《致在大学感到迷茫的你》有感
- 异常简介