Java案例如何实现布隆过滤器?

wen java案例 4

本文目录导读:

Java案例如何实现布隆过滤器?

  1. 手动实现布隆过滤器
  2. 使用Guava库实现(推荐)
  3. Redis实现的布隆过滤器
  4. 高级应用:使用多个哈希函数
  5. 实际应用场景
  6. 使用建议

我来详细介绍Java中实现布隆过滤器的几种方式:

手动实现布隆过滤器

基础版实现

import java.util.BitSet;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
public class BloomFilter<T> {
    private BitSet bitSet;
    private int bitSetSize;
    private int numberOfHashFunctions;
    private Charset charset = StandardCharsets.UTF_8;
    public BloomFilter(int expectedInsertions, double falsePositiveRate) {
        // 计算最优的位数组大小
        this.bitSetSize = optimalBitSetSize(expectedInsertions, falsePositiveRate);
        // 计算最优的哈希函数个数
        this.numberOfHashFunctions = optimalNumberOfHashFunctions(expectedInsertions, bitSetSize);
        this.bitSet = new BitSet(bitSetSize);
    }
    public void add(T element) {
        byte[] bytes = element.toString().getBytes(charset);
        int[] hashValues = getHashValues(bytes);
        for (int hash : hashValues) {
            bitSet.set(Math.abs(hash % bitSetSize), true);
        }
    }
    public boolean mightContain(T element) {
        byte[] bytes = element.toString().getBytes(charset);
        int[] hashValues = getHashValues(bytes);
        for (int hash : hashValues) {
            if (!bitSet.get(Math.abs(hash % bitSetSize))) {
                return false;
            }
        }
        return true;
    }
    // 使用不同的种子生成多个哈希值
    private int[] getHashValues(byte[] bytes) {
        int[] hashValues = new int[numberOfHashFunctions];
        for (int i = 0; i < numberOfHashFunctions; i++) {
            hashValues[i] = hash(bytes, i);
        }
        return hashValues;
    }
    private int hash(byte[] bytes, int seed) {
        int result = seed;
        for (byte b : bytes) {
            result = 31 * result + (b & 0xff);
        }
        return result;
    }
    // 计算最优位数组大小
    private int optimalBitSetSize(int expectedInsertions, double falsePositiveRate) {
        return (int) (-expectedInsertions * Math.log(falsePositiveRate) / (Math.log(2) * Math.log(2)));
    }
    // 计算最优哈希函数个数
    private int optimalNumberOfHashFunctions(int expectedInsertions, int bitSetSize) {
        return Math.max(1, (int) Math.round((double) bitSetSize / expectedInsertions * Math.log(2)));
    }
    // 获取当前元素数量估计
    public int estimatedElementCount() {
        int setBits = bitSet.cardinality();
        if (setBits == 0) return 0;
        return (int) (-(double) bitSetSize / numberOfHashFunctions * 
                      Math.log(1 - (double) setBits / bitSetSize));
    }
}

使用示例

public class BloomFilterDemo {
    public static void main(String[] args) {
        // 创建布隆过滤器:预计插入10000个元素,误判率0.01
        BloomFilter<String> bloomFilter = new BloomFilter<>(10000, 0.01);
        // 添加元素
        bloomFilter.add("apple");
        bloomFilter.add("banana");
        bloomFilter.add("orange");
        // 检查元素是否存在
        System.out.println("Contains 'apple': " + bloomFilter.mightContain("apple"));    // true
        System.out.println("Contains 'grape': " + bloomFilter.mightContain("grape"));    // false(可能误判)
        // 统计信息
        System.out.println("Estimated count: " + bloomFilter.estimatedElementCount());
    }
}

使用Guava库实现(推荐)

<!-- Maven依赖 -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>31.1-jre</version>
</dependency>
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.Charset;
public class GuavaBloomFilterExample {
    public static void main(String[] args) {
        // 创建布隆过滤器
        BloomFilter<String> bloomFilter = BloomFilter.create(
            Funnels.stringFunnel(Charset.forName("UTF-8")),
            10000,    // 预计插入的数据量
            0.01      // 期望的误判率
        );
        // 添加元素
        bloomFilter.put("hello");
        bloomFilter.put("world");
        bloomFilter.put("java");
        // 检查元素
        System.out.println("Contains 'hello': " + bloomFilter.mightContain("hello"));  // true
        System.out.println("Contains 'python': " + bloomFilter.mightContain("python")); // false
        // 批量添加
        String[] urls = {"url1", "url2", "url3"};
        for (String url : urls) {
            bloomFilter.put(url);
        }
        // 获取近似元素数量
        System.out.println("Approximate count: " + bloomFilter.approximateElementCount());
    }
}

Redis实现的布隆过滤器

安装Redis布隆过滤器模块

# 在Redis中安装布隆过滤器模块
redis-cli
> MODULE LOAD /path/to/redisbloom.so

Java代码实现

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.params.BloomFilterInsertParams;
public class RedisBloomFilterExample {
    private JedisPool jedisPool;
    public RedisBloomFilterExample() {
        this.jedisPool = new JedisPool("localhost", 6379);
    }
    // 创建布隆过滤器
    public void createBloomFilter(String key, long capacity, double errorRate) {
        try (Jedis jedis = jedisPool.getResource()) {
            jedis.bfReserve(key, errorRate, capacity);
        }
    }
    // 添加元素
    public boolean add(String key, String value) {
        try (Jedis jedis = jedisPool.getResource()) {
            return jedis.bfAdd(key, value);
        }
    }
    // 批量添加
    public boolean[] addMulti(String key, String... values) {
        try (Jedis jedis = jedisPool.getResource()) {
            BloomFilterInsertParams params = new BloomFilterInsertParams();
            return jedis.bfInsert(key, params, values);
        }
    }
    // 检查元素是否存在
    public boolean mightContain(String key, String value) {
        try (Jedis jedis = jedisPool.getResource()) {
            return jedis.bfExists(key, value);
        }
    }
    // 使用示例
    public static void main(String[] args) {
        RedisBloomFilterExample example = new RedisBloomFilterExample();
        String filterKey = "url_filter";
        // 创建布隆过滤器
        example.createBloomFilter(filterKey, 100000, 0.01);
        // 添加URL
        example.add(filterKey, "http://example.com");
        example.add(filterKey, "http://google.com");
        // 检查URL是否已经被收录
        System.out.println("Already crawled? " + 
            example.mightContain(filterKey, "http://example.com"));  // true
        System.out.println("Already crawled? " + 
            example.mightContain(filterKey, "http://new-site.com")); // false
    }
}

高级应用:使用多个哈希函数

import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import com.google.common.hash.Hashing;
public class AdvancedBloomFilter<T> {
    private final BitSet bitSet;
    private final int bitSetSize;
    private final int numHashFunctions;
    public AdvancedBloomFilter(int expectedInsertions, double falsePositiveRate) {
        this.bitSetSize = optimalBitSetSize(expectedInsertions, falsePositiveRate);
        this.numHashFunctions = optimalNumHashFunctions(expectedInsertions, bitSetSize);
        this.bitSet = new BitSet(bitSetSize);
    }
    public void put(T object) {
        byte[] bytes = object.toString().getBytes(StandardCharsets.UTF_8);
        long hash64 = Hashing.murmur3_128().hashBytes(bytes).asLong();
        int hash1 = (int) hash64;
        int hash2 = (int) (hash64 >>> 32);
        for (int i = 1; i <= numHashFunctions; i++) {
            int combinedHash = hash1 + (i * hash2);
            if (combinedHash < 0) {
                combinedHash = ~combinedHash;
            }
            bitSet.set(combinedHash % bitSetSize);
        }
    }
    public boolean mightContain(T object) {
        byte[] bytes = object.toString().getBytes(StandardCharsets.UTF_8);
        long hash64 = Hashing.murmur3_128().hashBytes(bytes).asLong();
        int hash1 = (int) hash64;
        int hash2 = (int) (hash64 >>> 32);
        for (int i = 1; i <= numHashFunctions; i++) {
            int combinedHash = hash1 + (i * hash2);
            if (combinedHash < 0) {
                combinedHash = ~combinedHash;
            }
            if (!bitSet.get(combinedHash % bitSetSize)) {
                return false;
            }
        }
        return true;
    }
    private int optimalBitSetSize(int n, double p) {
        return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
    }
    private int optimalNumHashFunctions(int n, int m) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }
}

实际应用场景

public class BloomFilterApplication {
    // 缓存穿透保护
    public static class CacheGuard {
        private BloomFilter<String> bloomFilter;
        private Map<String, Object> cache;
        public CacheGuard(int expectedItems) {
            this.bloomFilter = BloomFilter.create(
                Funnels.stringFunnel(StandardCharsets.UTF_8), 
                expectedItems, 
                0.01
            );
            this.cache = new HashMap<>();
        }
        public Object get(String key) {
            // 先检查布隆过滤器
            if (!bloomFilter.mightContain(key)) {
                return null;  // 一定不存在,直接返回
            }
            // 可能存在的,继续检查缓存
            return cache.get(key);
        }
        public void put(String key, Object value) {
            bloomFilter.put(key);
            cache.put(key, value);
        }
    }
    // 网页爬虫去重
    public static class WebCrawler {
        private BloomFilter<String> crawledUrls;
        public WebCrawler() {
            this.crawledUrls = BloomFilter.create(
                Funnels.stringFunnel(StandardCharsets.UTF_8),
                1000000,  // 预计100万URL
                0.001     // 千分之一误判率
            );
        }
        public boolean shouldCrawl(String url) {
            if (crawledUrls.mightContain(url)) {
                return false;  // 可能已经爬取过
            }
            crawledUrls.put(url);
            return true;
        }
    }
}

使用建议

  1. 小数据量(< 100万):使用Guava库实现,简单可靠
  2. 大数据量:考虑使用Redis布隆过滤器,支持分布式
  3. 高性能要求:手动实现,使用高性能哈希函数(如MurmurHash)
  4. 精确性要求:注意误判率参数的选择,通常在0.001-0.01之间

布隆过滤器非常适合处理大数据量的判重场景,特别是能接受一定误判率的业务。

抱歉,评论功能暂时关闭!