本文目录导读:

我来详细介绍Java中实现布隆过滤器的几种方式:
手动实现布隆过滤器
基础版实现
import java.util.BitSet;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
public class BloomFilter<T> {
private BitSet bitSet;
private int bitSetSize;
private int numberOfHashFunctions;
private Charset charset = StandardCharsets.UTF_8;
public BloomFilter(int expectedInsertions, double falsePositiveRate) {
// 计算最优的位数组大小
this.bitSetSize = optimalBitSetSize(expectedInsertions, falsePositiveRate);
// 计算最优的哈希函数个数
this.numberOfHashFunctions = optimalNumberOfHashFunctions(expectedInsertions, bitSetSize);
this.bitSet = new BitSet(bitSetSize);
}
public void add(T element) {
byte[] bytes = element.toString().getBytes(charset);
int[] hashValues = getHashValues(bytes);
for (int hash : hashValues) {
bitSet.set(Math.abs(hash % bitSetSize), true);
}
}
public boolean mightContain(T element) {
byte[] bytes = element.toString().getBytes(charset);
int[] hashValues = getHashValues(bytes);
for (int hash : hashValues) {
if (!bitSet.get(Math.abs(hash % bitSetSize))) {
return false;
}
}
return true;
}
// 使用不同的种子生成多个哈希值
private int[] getHashValues(byte[] bytes) {
int[] hashValues = new int[numberOfHashFunctions];
for (int i = 0; i < numberOfHashFunctions; i++) {
hashValues[i] = hash(bytes, i);
}
return hashValues;
}
private int hash(byte[] bytes, int seed) {
int result = seed;
for (byte b : bytes) {
result = 31 * result + (b & 0xff);
}
return result;
}
// 计算最优位数组大小
private int optimalBitSetSize(int expectedInsertions, double falsePositiveRate) {
return (int) (-expectedInsertions * Math.log(falsePositiveRate) / (Math.log(2) * Math.log(2)));
}
// 计算最优哈希函数个数
private int optimalNumberOfHashFunctions(int expectedInsertions, int bitSetSize) {
return Math.max(1, (int) Math.round((double) bitSetSize / expectedInsertions * Math.log(2)));
}
// 获取当前元素数量估计
public int estimatedElementCount() {
int setBits = bitSet.cardinality();
if (setBits == 0) return 0;
return (int) (-(double) bitSetSize / numberOfHashFunctions *
Math.log(1 - (double) setBits / bitSetSize));
}
}
使用示例
public class BloomFilterDemo {
public static void main(String[] args) {
// 创建布隆过滤器:预计插入10000个元素,误判率0.01
BloomFilter<String> bloomFilter = new BloomFilter<>(10000, 0.01);
// 添加元素
bloomFilter.add("apple");
bloomFilter.add("banana");
bloomFilter.add("orange");
// 检查元素是否存在
System.out.println("Contains 'apple': " + bloomFilter.mightContain("apple")); // true
System.out.println("Contains 'grape': " + bloomFilter.mightContain("grape")); // false(可能误判)
// 统计信息
System.out.println("Estimated count: " + bloomFilter.estimatedElementCount());
}
}
使用Guava库实现(推荐)
<!-- Maven依赖 -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.Charset;
public class GuavaBloomFilterExample {
public static void main(String[] args) {
// 创建布隆过滤器
BloomFilter<String> bloomFilter = BloomFilter.create(
Funnels.stringFunnel(Charset.forName("UTF-8")),
10000, // 预计插入的数据量
0.01 // 期望的误判率
);
// 添加元素
bloomFilter.put("hello");
bloomFilter.put("world");
bloomFilter.put("java");
// 检查元素
System.out.println("Contains 'hello': " + bloomFilter.mightContain("hello")); // true
System.out.println("Contains 'python': " + bloomFilter.mightContain("python")); // false
// 批量添加
String[] urls = {"url1", "url2", "url3"};
for (String url : urls) {
bloomFilter.put(url);
}
// 获取近似元素数量
System.out.println("Approximate count: " + bloomFilter.approximateElementCount());
}
}
Redis实现的布隆过滤器
安装Redis布隆过滤器模块
# 在Redis中安装布隆过滤器模块 redis-cli > MODULE LOAD /path/to/redisbloom.so
Java代码实现
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.params.BloomFilterInsertParams;
public class RedisBloomFilterExample {
private JedisPool jedisPool;
public RedisBloomFilterExample() {
this.jedisPool = new JedisPool("localhost", 6379);
}
// 创建布隆过滤器
public void createBloomFilter(String key, long capacity, double errorRate) {
try (Jedis jedis = jedisPool.getResource()) {
jedis.bfReserve(key, errorRate, capacity);
}
}
// 添加元素
public boolean add(String key, String value) {
try (Jedis jedis = jedisPool.getResource()) {
return jedis.bfAdd(key, value);
}
}
// 批量添加
public boolean[] addMulti(String key, String... values) {
try (Jedis jedis = jedisPool.getResource()) {
BloomFilterInsertParams params = new BloomFilterInsertParams();
return jedis.bfInsert(key, params, values);
}
}
// 检查元素是否存在
public boolean mightContain(String key, String value) {
try (Jedis jedis = jedisPool.getResource()) {
return jedis.bfExists(key, value);
}
}
// 使用示例
public static void main(String[] args) {
RedisBloomFilterExample example = new RedisBloomFilterExample();
String filterKey = "url_filter";
// 创建布隆过滤器
example.createBloomFilter(filterKey, 100000, 0.01);
// 添加URL
example.add(filterKey, "http://example.com");
example.add(filterKey, "http://google.com");
// 检查URL是否已经被收录
System.out.println("Already crawled? " +
example.mightContain(filterKey, "http://example.com")); // true
System.out.println("Already crawled? " +
example.mightContain(filterKey, "http://new-site.com")); // false
}
}
高级应用:使用多个哈希函数
import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import com.google.common.hash.Hashing;
public class AdvancedBloomFilter<T> {
private final BitSet bitSet;
private final int bitSetSize;
private final int numHashFunctions;
public AdvancedBloomFilter(int expectedInsertions, double falsePositiveRate) {
this.bitSetSize = optimalBitSetSize(expectedInsertions, falsePositiveRate);
this.numHashFunctions = optimalNumHashFunctions(expectedInsertions, bitSetSize);
this.bitSet = new BitSet(bitSetSize);
}
public void put(T object) {
byte[] bytes = object.toString().getBytes(StandardCharsets.UTF_8);
long hash64 = Hashing.murmur3_128().hashBytes(bytes).asLong();
int hash1 = (int) hash64;
int hash2 = (int) (hash64 >>> 32);
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = hash1 + (i * hash2);
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
bitSet.set(combinedHash % bitSetSize);
}
}
public boolean mightContain(T object) {
byte[] bytes = object.toString().getBytes(StandardCharsets.UTF_8);
long hash64 = Hashing.murmur3_128().hashBytes(bytes).asLong();
int hash1 = (int) hash64;
int hash2 = (int) (hash64 >>> 32);
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = hash1 + (i * hash2);
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
if (!bitSet.get(combinedHash % bitSetSize)) {
return false;
}
}
return true;
}
private int optimalBitSetSize(int n, double p) {
return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
private int optimalNumHashFunctions(int n, int m) {
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
}
实际应用场景
public class BloomFilterApplication {
// 缓存穿透保护
public static class CacheGuard {
private BloomFilter<String> bloomFilter;
private Map<String, Object> cache;
public CacheGuard(int expectedItems) {
this.bloomFilter = BloomFilter.create(
Funnels.stringFunnel(StandardCharsets.UTF_8),
expectedItems,
0.01
);
this.cache = new HashMap<>();
}
public Object get(String key) {
// 先检查布隆过滤器
if (!bloomFilter.mightContain(key)) {
return null; // 一定不存在,直接返回
}
// 可能存在的,继续检查缓存
return cache.get(key);
}
public void put(String key, Object value) {
bloomFilter.put(key);
cache.put(key, value);
}
}
// 网页爬虫去重
public static class WebCrawler {
private BloomFilter<String> crawledUrls;
public WebCrawler() {
this.crawledUrls = BloomFilter.create(
Funnels.stringFunnel(StandardCharsets.UTF_8),
1000000, // 预计100万URL
0.001 // 千分之一误判率
);
}
public boolean shouldCrawl(String url) {
if (crawledUrls.mightContain(url)) {
return false; // 可能已经爬取过
}
crawledUrls.put(url);
return true;
}
}
}
使用建议
- 小数据量(< 100万):使用Guava库实现,简单可靠
- 大数据量:考虑使用Redis布隆过滤器,支持分布式
- 高性能要求:手动实现,使用高性能哈希函数(如MurmurHash)
- 精确性要求:注意误判率参数的选择,通常在0.001-0.01之间
布隆过滤器非常适合处理大数据量的判重场景,特别是能接受一定误判率的业务。