Building a Bloom filter#
A Bloom filter is a compact probabilistic set: inserted values should always be reported as present, while values that were never inserted may occasionally be reported as present. The core structure is just a bit array plus several hash positions per item.
from hashlib import blake2b
from tibs import Mutibs
BIT_COUNT = 128
HASH_COUNT = 4
def positions(value):
digest = blake2b(
value.encode("utf-8"),
digest_size=HASH_COUNT * 2,
person=b"tibs-demo",
).digest()
for offset in range(0, len(digest), 2):
yield int.from_bytes(digest[offset:offset + 2], "big") % BIT_COUNT
def build_filter(values):
bits = Mutibs.from_zeros(BIT_COUNT)
for value in values:
bits.set(list(positions(value)))
return bits.to_tibs()
def maybe_contains(bits, value):
return all(bits[position] for position in positions(value))
services = ["auth", "billing", "search", "metrics", "uploads"]
service_filter = build_filter(services)
assert 0 < service_filter.count(1) < BIT_COUNT
for service in services:
assert maybe_contains(service_filter, service)
assert not maybe_contains(service_filter, "checkout")
This is intentionally small and deterministic. A production Bloom filter would
size BIT_COUNT and HASH_COUNT from the expected number of values and the
acceptable false-positive rate.