Building a Bloom filter#

A Bloom filter is a compact probabilistic set: inserted values should always be reported as present, while values that were never inserted may occasionally be reported as present. The core structure is just a bit array plus several hash positions per item.

from hashlib import blake2b

from tibs import Mutibs


BIT_COUNT = 128
HASH_COUNT = 4


def positions(value):
    digest = blake2b(
        value.encode("utf-8"),
        digest_size=HASH_COUNT * 2,
        person=b"tibs-demo",
    ).digest()

    for offset in range(0, len(digest), 2):
        yield int.from_bytes(digest[offset:offset + 2], "big") % BIT_COUNT


def build_filter(values):
    bits = Mutibs.from_zeros(BIT_COUNT)
    for value in values:
        bits.set(list(positions(value)))
    return bits.to_tibs()


def maybe_contains(bits, value):
    return all(bits[position] for position in positions(value))


services = ["auth", "billing", "search", "metrics", "uploads"]
service_filter = build_filter(services)

assert 0 < service_filter.count(1) < BIT_COUNT

for service in services:
    assert maybe_contains(service_filter, service)

assert not maybe_contains(service_filter, "checkout")

This is intentionally small and deterministic. A production Bloom filter would size BIT_COUNT and HASH_COUNT from the expected number of values and the acceptable false-positive rate.