heck

an implementation of a bloom filter in go
git clone git://git.ter0.net/heck.git
Log | Files | Refs

commit eda793fad7cdcef08e761402bf9121b74d0bbf96
Author: Chris Johns <chris@ter0.net>
Date:   Fri, 25 Jun 2021 23:48:55 +0100

initial commit

Diffstat:
Ago.mod | 3+++
Aheck.go | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aheck_test.go | 17+++++++++++++++++
3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/go.mod b/go.mod @@ -0,0 +1,3 @@ +module git.ter0.net/heck + +go 1.16 diff --git a/heck.go b/heck.go @@ -0,0 +1,69 @@ +package heck + +import ( + "hash/maphash" + "math" + "math/big" +) + +type BloomFilter struct { + b *big.Int + seeds []maphash.Seed + hash *maphash.Hash + m uint64 + k int + ε float64 +} + +func k(m uint64, n uint64) int { + return int(math.Ceil(float64(m/n) * math.Log(2))) +} + +func m(n uint64, ε float64) uint64 { + return uint64(math.Ceil(math.Abs(((float64(n) * math.Log(ε)) / math.Pow(math.Log(2), 2))))) +} + +func NewBloomFilter(n uint64, ε float64) BloomFilter { + m := m(n, ε) + k := k(m, n) + f := BloomFilter{ + b: new(big.Int), + m: m, + k: k, + ε: ε, + seeds: make([]maphash.Seed, k, k), + hash: new(maphash.Hash), + } + + // it would be nicer to just have k initialised hashes we could reuse, + // but reusing them seems harder than the docs imply + for i := range f.seeds { + f.seeds[i] = maphash.MakeSeed() + } + + return f +} + +func (f *BloomFilter) index(s maphash.Seed, b []byte) int { + f.hash.SetSeed(s) + f.hash.Write(b) + defer f.hash.Reset() + return int(f.hash.Sum64() % f.m) +} + +func (f *BloomFilter) Add(b []byte) { + for _, seed := range f.seeds { + index := f.index(seed, b) + f.b.SetBit(f.b, index, 1) + } +} + +func (f *BloomFilter) IsMember(b []byte) bool { + for _, seed := range f.seeds { + index := f.index(seed, b) + if f.b.Bit(index) == 0 { + return false + } + } + return true +} diff --git a/heck_test.go b/heck_test.go @@ -0,0 +1,17 @@ +package heck + +import ( + "testing" +) + +func TestMembership(t *testing.T) { + f := NewBloomFilter(1000, 0.01) + bytes := []byte{'f', 'o', 'o'} + if f.IsMember(bytes) { + t.Errorf("element unexpectedly in bloom filter") + } + f.Add(bytes) + if !f.IsMember(bytes) { + t.Errorf("element unexpectedly not in bloom filter") + } +}