隐私计算框架PrivacyMagic--密码学工具篇--布隆过滤器

HycBoy777

452人浏览 · 2025-09-23 15:33:20

HycBoy777 · 2025-09-23 15:33:20 发布

布隆过滤器（Bloom Filter）是密码学中绕不过去的密码学组件。在本隐私框架中，我们实现了布隆过滤器的理想功能函数，可以很简单的操作使用。

布隆过滤器是一种空间效率极高的概率型数据结构，专门用于判断一个元素是否可能存在于集合中。
它的核心特点是：

优点：占用空间极小，查询速度极快（插入和查询均为 O (k)，k 为哈希函数数量）。
缺点：存在一定的误判率（False Positive），即可能把不在集合中的元素错误地判断为存在。但绝不会漏判（False
Negative），即真正存在的元素一定会被正确识别。

工作原理

初始化：创建一个长度为 m 的二进制数组（bit array），并将所有位初始化为 0。
插入元素：将元素 x 分别输入到 k 个不同的哈希函数中。每个哈希函数会输出一个在 [0, m-1] 范围内的索引。将数组中这些索引对应的位都设置为 1。
查询元素：同样将待查询元素 y 输入到这 k 个哈希函数中，得到 k 个索引。检查数组中这些索引对应的位：如果所有位都为 1：则元素y 可能存在于集合中（存在误判可能）。如果有任何一位为 0：则元素 y 一定不存在于集合中。

应用场景

由于其高效的空间和时间特性，布隆过滤器非常适合以下场景：

数据库 / 缓存系统：在查询数据库或缓存前，先用布隆过滤器判断 Key 是否存在，可以有效避免大量对不存在 Key的无效查询，减轻后端压力。
垃圾邮件过滤：维护一个已知的垃圾邮件地址集合，用布隆过滤器快速判断一封邮件是否可能是垃圾邮件。
网络爬虫：记录已经爬取过的 URL，避免重复爬取。
分布式系统：在分布式环境中，用于快速查找数据可能存在的节点。

误判率与参数选择

布隆过滤器的误判率（P）主要由以下三个参数决定：

m：位数组的长度（越大，误判率越低）
n：预计要存入的元素数量
k：哈希函数的个数（有一个最优值）

在 m 和 n 确定的情况下，使得误判率最小的最优哈希函数数量 k 约等于 (m/n) * ln(2)。

#ifndef BLOOM_FILTER_HPP
#define BLOOM_FILTER_HPP

#include <vector>
#include <string>
#include <cmath>
#include <cstdint>
#include <stdexcept>
namespace CryptoTools
{

    // 布隆过滤器类模板，支持任意类型元素
    template <typename T>
    class BloomFilter
    {
    private:
        std::vector<bool> bit_array; // 位数组，存储元素存在标记
        size_t bit_size;             // 位数组大小（位）
        size_t hash_count;           // 哈希函数数量
        size_t item_count;           // 已插入元素数量

        // 哈希函数1：基于DJB2算法
        uint64_t hash1(const T &item) const
        {
            std::string str = std::to_string(item); // 对于非字符串类型转换为字符串
            uint64_t hash = 5381;
            for (char c : str)
            {
                hash = ((hash << 5) + hash) + static_cast<uint64_t>(c);
            }
            return hash % bit_size;
        }

        // 哈希函数2：基于SDBM算法
        uint64_t hash2(const T &item) const
        {
            std::string str = std::to_string(item);
            uint64_t hash = 0;
            for (char c : str)
            {
                hash = static_cast<uint64_t>(c) + (hash << 6) + (hash << 16) - hash;
            }
            return hash % bit_size;
        }

        // 哈希函数3：基于FNV-1a算法
        uint64_t hash3(const T &item) const
        {
            std::string str = std::to_string(item);
            uint64_t hash = 14695981039346656037ULL;
            for (char c : str)
            {
                hash ^= static_cast<uint64_t>(c);
                hash *= 1099511628211ULL;
            }
            return hash % bit_size;
        }

        // 计算第i个哈希值（组合基础哈希函数）
        uint64_t get_hash(const T &item, size_t i) const
        {
            return (hash1(item) + i * hash2(item) + i * i * hash3(item)) % bit_size;
        }

    public:
        /**
         * @brief 构造布隆过滤器
         * @param expected_items 预期插入的元素数量
         * @param false_positive_rate 可接受的误判率
         */
        BloomFilter(size_t expected_items, double false_positive_rate, size_t hash_count): hash_count(hash_count)
        {
            if (expected_items == 0)
            {
                throw std::invalid_argument("预期元素数量不能为0");
            }
            if (false_positive_rate <= 0 || false_positive_rate >= 1)
            {
                throw std::invalid_argument("误判率必须在(0, 1)范围内");
            }

            // 计算最佳位数组大小：m = -n * ln(p) / (ln(2)^2)
            bit_size = static_cast<size_t>(-(expected_items * std::log(false_positive_rate)) / (std::log(2) * std::log(2)));
            if (bit_size == 0)
                bit_size = 1; // 确保至少有1位

            // 计算最佳哈希函数数量：k = m/n * ln(2)
            //hash_count = static_cast<size_t>(std::round((bit_size / expected_items) * std::log(2)));
            if (hash_count == 0)
                hash_count = 1; // 确保至少有1个哈希函数
            std::cout <<"hash_count: "<<hash_count<<std::endl;
            // 初始化位数组
            bit_array.resize(bit_size, false);
            item_count = 0;
        }

        /**
         * @brief 插入元素到布隆过滤器
         * @param item 要插入的元素
         */
        void insert(const T &item)
        {
            for (size_t i = 0; i < hash_count; ++i)
            {
                size_t position = get_hash(item, i);
                bit_array[position] = true;
            }
            item_count++;
        }

        /**
         * @brief 检查元素是否可能存在于布隆过滤器中
         * @param item 要检查的元素
         * @return true：可能存在（有一定误判率）；false：一定不存在
         */
        bool contains(const T &item) const
        {
            for (size_t i = 0; i < hash_count; ++i)
            {
                size_t position = get_hash(item, i);
                if (!bit_array[position])
                {
                    return false; // 只要有一位为0，肯定不存在
                }
            }
            return true; // 所有位都为1，可能存在
        }

        /**
         * @brief 获取位数组大小（位）
         */
        size_t get_bit_size() const
        {
            return bit_size;
        }

        /**
         * @brief 获取哈希函数数量
         */
        size_t get_hash_count() const
        {
            return hash_count;
        }

        /**
         * @brief 获取已插入元素数量
         */
        size_t get_item_count() const
        {
            return item_count;
        }

        /**
         * @brief 清空布隆过滤器
         */
        void clear()
        {
            std::fill(bit_array.begin(), bit_array.end(), false);
            item_count = 0;
        }
    };

    // 针对字符串类型的特化，优化哈希计算
    template <>
    inline uint64_t BloomFilter<std::string>::hash1(const std::string &item) const
    {
        uint64_t hash = 5381;
        for (char c : item)
        {
            hash = ((hash << 5) + hash) + static_cast<uint64_t>(c);
        }
        return hash % bit_size;
    }

    template <>
    inline uint64_t BloomFilter<std::string>::hash2(const std::string &item) const
    {
        uint64_t hash = 0;
        for (char c : item)
        {
            hash = static_cast<uint64_t>(c) + (hash << 6) + (hash << 16) - hash;
        }
        return hash % bit_size;
    }

    template <>
    inline uint64_t BloomFilter<std::string>::hash3(const std::string &item) const
    {
        uint64_t hash = 14695981039346656037ULL;
        for (char c : item)
        {
            hash ^= static_cast<uint64_t>(c);
            hash *= 1099511628211ULL;
        }
        return hash % bit_size;
    }
}
#endif // BLOOM_FILTER_HPP

int main(){
// 创建布隆过滤器：预期1000个元素，误判率0.01, hash 函数数量为3
        CryptoTools::BloomFilter<std::string> filter(1000, 0.01, 3);

        // 插入元素
        filter.insert("apple");
        filter.insert("banana");
        filter.insert("cherry");

        // 检查元素
        std::cout << "apple: " << (filter.contains("apple") ? "可能存在" : "不存在") << std::endl;
        std::cout << "banana: " << (filter.contains("banana") ? "可能存在" : "不存在") << std::endl;
        std::cout << "orange: " << (filter.contains("orange") ? "可能存在" : "不存在") << std::endl;

        return 0;
        }

DAMO开发者矩阵

DAMO开发者矩阵，由阿里巴巴达摩院和中国互联网协会联合发起，致力于探讨最前沿的技术趋势与应用成果，搭建高质量的交流与分享平台，推动技术创新与产业应用链接，围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐

14905黄大年茶思屋榜文第149期第5题面向云手机复杂任务的高效可靠 GUI VLA 模型技术

DAMO开发者矩阵

工业机器人离线仿真全链路研究：ROBOGUIDE 课程体系深度剖析、实操痛点与数字化创新方案

本文系统剖析了七阶段ROBOGUIDE工业机器人离线编程课程体系。该课程采用分层递进设计，从软件基础操作到复杂焊接工艺，完整复刻企业仿真工程师工作流程。核心特色包括：虚实融合的实操导向教学，覆盖搬运、分拣、轨迹加工等主流工业场景；独创"虚拟电机法"与"模型替代法"双抓取仿真技术对比；重点突破变位机协同焊接等高阶应用。课程严格遵循"工作站搭建-编程-

DAMO开发者矩阵

图像识别化技术目标检测模型训练数据标注策略

图像识别技术中的目标检测模型近年来在自动驾驶、安防监控、医疗影像等领域展现出巨大潜力，而模型的性能高度依赖于训练数据的标注质量。对于遮挡、重叠目标，应采用分层标注策略，明确标注可见部分。标注策略需结合具体需求，灵活调整标注粒度和标准，确保模型在实际场景中发挥最佳效果。通过分析模型在测试集上的表现，发现漏检或误检案例，针对性补充标注数据。未来，随着自动化标注技术和多模态融合的发展，数据标注将迈向更高