Skip to content
20 changes: 10 additions & 10 deletions bbhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import (
)

// SingleBBHash represents a minimal perfect hash for a set of keys.
type BBHash struct {
// SingleBBHash holds the per-level state of one minimal perfect hash.
type SingleBBHash struct {
	// bit vectors for each level; Find probes them in level order
	bits []bitVector
	// total rank for each level; allocated with one entry per level by computeLevelRanks
	ranks []uint64
	// index -> key (only filled if needed); Key reads it and treats index 0 as not found
	reverseMap []uint64
}

func newBBHash(initialLevels int) BBHash {
return BBHash{
// newBBHash returns a SingleBBHash with no levels yet, whose level slice is
// pre-allocated with capacity for initialLevels bit vectors.
func newBBHash(initialLevels int) SingleBBHash {
	levels := make([]bitVector, 0, initialLevels)
	return SingleBBHash{bits: levels}
}
Expand All @@ -31,7 +31,7 @@ func newBBHash(initialLevels int) BBHash {
// If the key is not in the original key set, two things can happen:
// 1. The return value is 0, representing that the key was not in the original key set.
// 2. The return value is in the expected range [1, len(keys)], but is a false positive.
func (bb BBHash) Find(key uint64) uint64 {
func (bb SingleBBHash) Find(key uint64) uint64 {
for lvl, bv := range bb.bits {
i := fast.Hash(uint64(lvl), key) % bv.size()
if bv.isSet(i) {
Expand All @@ -43,15 +43,15 @@ func (bb BBHash) Find(key uint64) uint64 {

// Key returns the key for the given index.
// The index must be in the range [1, len(keys)], otherwise 0 is returned.
func (bb BBHash) Key(index uint64) uint64 {
// Key returns the key stored in the reverse map for the given index.
// It returns 0 when index is 0 (reserved for not-found), when index is at or
// beyond the end of the reverse map, or when no reverse map was built.
func (bb SingleBBHash) Key(index uint64) uint64 {
	// Compare in uint64 space: converting a very large index to int can wrap
	// to a negative value, slip past the bounds check, and panic on the lookup.
	// A nil reverseMap has length 0, so every index is rejected here as well.
	if index == 0 || index >= uint64(len(bb.reverseMap)) {
		return 0
	}
	return bb.reverseMap[index]
}

// compute computes the minimal perfect hash for the given keys.
func (bb *BBHash) compute(keys []uint64, gamma float64) error {
func (bb *SingleBBHash) compute(keys []uint64, gamma float64) error {
sz := len(keys)
redo := make([]uint64, 0, sz/2) // heuristic: only 1/2 of the keys will collide
// bit vectors for current level : A and C in the paper
Expand Down Expand Up @@ -100,7 +100,7 @@ func (bb *BBHash) compute(keys []uint64, gamma float64) error {
}

// computeWithKeymap is similar to compute(), but in addition returns the reverse keymap.
func (bb *BBHash) computeWithKeymap(keys []uint64, gamma float64) error {
func (bb *SingleBBHash) computeWithKeymap(keys []uint64, gamma float64) error {
sz := len(keys)
redo := make([]uint64, 0, sz/2) // heuristic: only 1/2 of the keys will collide
// bit vectors for current level : A and C in the paper
Expand Down Expand Up @@ -169,7 +169,7 @@ func (bb *BBHash) computeWithKeymap(keys []uint64, gamma float64) error {

// computeLevelRanks computes the total rank of each level.
// The total rank is the rank for all levels up to and including the current level.
func (bb *BBHash) computeLevelRanks() {
func (bb *SingleBBHash) computeLevelRanks() {
// Initializing the rank to 1, since the 0 index is reserved for not-found.
var rank uint64 = 1
bb.ranks = make([]uint64, len(bb.bits))
Expand All @@ -181,6 +181,6 @@ func (bb *BBHash) computeLevelRanks() {

// enforce interface compliance: each blank assignment below only compiles if
// the pointer type on the right-hand side is assignable to the named type
// (bbhash or reverseMap).
var (
_ bbhash = (*BBHash)(nil)
_ reverseMap = (*BBHash)(nil)
_ bbhash = (*SingleBBHash)(nil)
_ reverseMap = (*SingleBBHash)(nil)
)
22 changes: 11 additions & 11 deletions bbhash_fmt.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import (
)

// String returns a string representation of SingleBBHash with overall and per-level statistics.
func (bb BBHash) String() string {
func (bb SingleBBHash) String() string {
var b strings.Builder
b.WriteString(fmt.Sprintf("BBHash(gamma=%3.1f, entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s, false positive rate=%.2f)\n",
b.WriteString(fmt.Sprintf("single(gamma=%3.1f, entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s, false positive rate=%.2f)\n",
bb.gamma(), bb.entries(), bb.Levels(), bb.BitsPerKey(), bb.wireBits(), bb.space(), bb.falsePositiveRate()))
for i, bv := range bb.bits {
sz := readableSize(int(bv.words()) * 8)
Expand All @@ -20,17 +20,17 @@ func (bb BBHash) String() string {
}

// Levels returns the number of Levels in the minimal perfect hash.
func (bb BBHash) Levels() int {
// Levels reports how many levels (bit vectors) the minimal perfect hash has.
func (bb SingleBBHash) Levels() int {
	levelCount := len(bb.bits)
	return levelCount
}

// BitsPerKey returns the number of bits per key in the minimal perfect hash.
func (bb BBHash) BitsPerKey() float64 {
// BitsPerKey reports the on-the-wire size in bits divided by the number of
// entries in the minimal perfect hash.
func (bb SingleBBHash) BitsPerKey() float64 {
	totalBits := float64(bb.wireBits())
	keyCount := float64(bb.entries())
	return totalBits / keyCount
}

// LevelVectors returns a slice representation of the SingleBBHash's per-level bit vectors.
func (bb BBHash) LevelVectors() [][]uint64 {
func (bb SingleBBHash) LevelVectors() [][]uint64 {
m := make([][]uint64, 0, len(bb.bits))
for _, bv := range bb.bits {
m = append(m, bv)
Expand All @@ -40,7 +40,7 @@ func (bb BBHash) LevelVectors() [][]uint64 {

// BitVectors returns a Go slice for SingleBBHash's per-level bit vectors.
// This is intended for testing and debugging; no guarantees are made about the format.
func (bb BBHash) BitVectors(varName string) string {
func (bb SingleBBHash) BitVectors(varName string) string {
var b strings.Builder
b.WriteString(fmt.Sprintf("var %s = [][]uint64{\n", varName))
for lvl, bv := range bb.bits {
Expand Down Expand Up @@ -87,33 +87,33 @@ func readableSize(sizeInBytes int) string {

// gamma returns an estimate of the gamma parameter used to construct the minimal perfect hash.
// It is an estimate because the size of the level 0 bit vector is not necessarily a multiple of 64.
func (bb BBHash) gamma() float64 {
// gamma returns an estimate of the gamma parameter used to construct the
// minimal perfect hash: the size of the level 0 bit vector divided by the
// number of entries. It returns 0 for a hash that has no levels.
func (bb SingleBBHash) gamma() float64 {
	// A hash without any level has no level 0 to measure; indexing bb.bits[0]
	// would panic, so report 0 instead.
	if len(bb.bits) == 0 {
		return 0
	}
	lvl0Size := bb.bits[0].size()
	return float64(lvl0Size) / float64(bb.entries())
}

// entries returns the number of entries in the minimal perfect hash.
func (bb BBHash) entries() (sz uint64) {
// entries reports the number of entries in the minimal perfect hash by
// summing the ones count of every level's bit vector.
func (bb SingleBBHash) entries() (sz uint64) {
	for lvl := range bb.bits {
		sz += bb.bits[lvl].onesCount()
	}
	return sz
}

// wireBits returns the number of on-the-wire bits used to represent the minimal perfect hash.
func (bb BBHash) wireBits() uint64 {
// wireBits reports the marshaled length of the minimal perfect hash in bits.
func (bb SingleBBHash) wireBits() uint64 {
	const bitsPerByte = 8
	byteLen := uint64(bb.marshaledLength())
	return byteLen * bitsPerByte
}

// space returns a human-readable string representing the size of the minimal perfect hash.
func (bb BBHash) space() string {
// space formats the marshaled length of the minimal perfect hash as a
// human-readable size string.
func (bb SingleBBHash) space() string {
	byteLen := bb.marshaledLength()
	return readableSize(byteLen)
}

// falsePositiveRate returns the false positive rate of the minimal perfect hash.
// Note: This may not be accurate if the actual keys overlap with the test keys [0,2N];
// that is, if many of the actual keys are in the range [0,2N], then it will be inaccurate.
func (bb BBHash) falsePositiveRate() float64 {
func (bb SingleBBHash) falsePositiveRate() float64 {
var cnt int
numTestKeys := bb.entries() * 2
for key := uint64(0); key < numTestKeys; key++ {
Expand Down
22 changes: 11 additions & 11 deletions bbhash_fmt2.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strings"
)

func (bb BBHash2) String() string {
func (bb BBHash) String() string {
var b strings.Builder
lvlSz := make([]uint64, 0)
lvlEntries := make([]uint64, 0)
Expand All @@ -22,7 +22,7 @@ func (bb BBHash2) String() string {
}
}
}
b.WriteString(fmt.Sprintf("BBHash2(entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s)\n",
b.WriteString(fmt.Sprintf("BBHash(entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s)\n",
bb.entries(), len(lvlSz), bb.BitsPerKey(), bb.wireBits(), bb.space()))
for lvl := 0; lvl < len(lvlSz); lvl++ {
sz := int(lvlSz[lvl])
Expand All @@ -33,7 +33,7 @@ func (bb BBHash2) String() string {
}

// MaxMinLevels returns the maximum and minimum number of levels across all partitions.
func (bb BBHash2) MaxMinLevels() (max, min int) {
func (bb BBHash) MaxMinLevels() (max, min int) {
max = 0
min = 999
for _, bx := range bb.partitions {
Expand All @@ -48,22 +48,22 @@ func (bb BBHash2) MaxMinLevels() (max, min int) {
}

// BitsPerKey returns the number of bits per key in the minimal perfect hash.
func (bb BBHash2) BitsPerKey() float64 {
// BitsPerKey reports the on-the-wire size in bits divided by the total number
// of entries across all partitions.
func (bb BBHash) BitsPerKey() float64 {
	bits, keys := bb.wireBits(), bb.entries()
	return float64(bits) / float64(keys)
}

// LevelVectors returns a slice representation of BBHash2's per-partition, per-level bit vectors.
func (bb BBHash2) LevelVectors() [][][]uint64 {
// LevelVectors collects, partition by partition, the per-level bit vectors of
// BBHash as plain uint64 slices. The result is nil when there are no partitions.
func (bb BBHash) LevelVectors() [][][]uint64 {
	var perPartition [][][]uint64
	for _, part := range bb.partitions {
		levels := part.LevelVectors()
		perPartition = append(perPartition, levels)
	}
	return perPartition
}

// BitVectors returns a Go slice for BBHash2's per-partition, per-level bit vectors.
// BitVectors returns a Go slice for BBHash's per-partition, per-level bit vectors.
// This is intended for testing and debugging; no guarantees are made about the format.
func (bb BBHash2) BitVectors(varName string) string {
func (bb BBHash) BitVectors(varName string) string {
var b strings.Builder
b.WriteString(fmt.Sprintf("var %s = [][][]uint64{\n", varName))
for partition, bx := range bb.partitions {
Expand All @@ -86,19 +86,19 @@ func (bb BBHash2) BitVectors(varName string) string {
}

// entries returns the number of entries in the minimal perfect hash.
func (bb BBHash2) entries() (sz uint64) {
// entries reports the number of entries in the minimal perfect hash by
// summing the entry counts of all partitions.
func (bb BBHash) entries() (sz uint64) {
	for _, part := range bb.partitions {
		n := part.entries()
		sz += n
	}
	return sz
}

// wireBits returns the number of on-the-wire bits used to represent the minimal perfect hash.
func (bb BBHash2) wireBits() uint64 {
// wireBits reports the marshaled length of the minimal perfect hash in bits.
func (bb BBHash) wireBits() uint64 {
	marshaledBytes := uint64(bb.marshaledLength())
	return marshaledBytes * 8
}

// space returns a human-readable string representing the size of the minimal perfect hash.
func (bb BBHash2) space() string {
// space formats the marshaled length of the minimal perfect hash as a
// human-readable size string.
func (bb BBHash) space() string {
	marshaledBytes := bb.marshaledLength()
	return readableSize(marshaledBytes)
}
4 changes: 2 additions & 2 deletions bbhash_fmt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ func TestString(t *testing.T) {
// See issue #21
return
}
t.Logf("BBHash: %v", bb)
t.Logf("SingleBBHash: %v", bb)
}
t.Logf("BBHash2: %v", bb2)
t.Logf("BBHash: %v", bb2)
})
}
}
Expand Down
11 changes: 11 additions & 0 deletions bbhash_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,15 @@ func Keys(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte]) []uint64 {
return keys
}

func KeysNonce(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte], nonce []byte) []uint64 {
var keys []uint64
for c := range chunks {
c = append(c, nonce...)
keys = append(keys, hashFunc(c))
}
return keys
}

var SHA256HashFunc = func(buf []byte) uint64 {
h := sha256.New()
h.Write(buf)
Expand All @@ -45,3 +54,5 @@ var SHA256HashFunc = func(buf []byte) uint64 {
// FastHashFunc hashes buf with fast.Hash64, passing the fixed value 123 as the
// first argument (presumably the hash seed; confirm against the fast package).
var FastHashFunc = func(buf []byte) uint64 {
	const seed = 123
	return fast.Hash64(seed, buf)
}

//
60 changes: 60 additions & 0 deletions bbhash_iter_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
package bbhash_test

import (
	"bytes"
	_ "embed"
	"encoding/binary"
	"iter"
	"math/rand"
	"slices"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/relab/bbhash"
	"github.com/relab/bbhash/internal/test"
)

// String taken from https://www.lipsum.com/
Expand Down Expand Up @@ -66,3 +70,59 @@ func TestHashKeysFromChunks(t *testing.T) {
})
}
}

// TestKeysNonce checks that KeysNonce hashes each chunk read from the input
// with the nonce appended, for both hash functions and several chunk sizes.
// Each case runs as its own subtest and reports the random nonce on failure
// so that a failing run can be reproduced.
func TestKeysNonce(t *testing.T) {
	tests := []struct {
		name      string
		hashFunc  func([]byte) uint64
		in        string
		chunkSize int
	}{
		{name: "FastHash/4", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4},
		{name: "FastHash/8", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8},
		{name: "SHA256/4", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4},
		{name: "SHA256/8", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8},
		{name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128},
		{name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			nonce := []byte{byte(rand.Intn(256))}
			// Build the expected keys independently of KeysNonce; Concat
			// copies, so the chunks of the input are left untouched.
			wantHashedKeys := CollectFunc(slices.Chunk([]byte(test.in), test.chunkSize), func(v []byte) uint64 {
				return test.hashFunc(slices.Concat(v, nonce))
			})

			r := strings.NewReader(test.in)
			chunks := bbhash.ReadChunks(r, test.chunkSize)
			gotHashedKeys := bbhash.KeysNonce(test.hashFunc, chunks, nonce)

			if diff := cmp.Diff(gotHashedKeys, wantHashedKeys); diff != "" {
				t.Errorf("KeysNonce() with nonce %v: (-got +want) \n%s", nonce, diff)
			}
		})
	}
}

// BenchmarkChunks measures reading a byte stream of generated keys through
// bbhash.ReadChunks for every combination of key count and buffer size.
//
// Each iteration creates a fresh reader and drains the iterator: a reader
// shared across runs would be exhausted after the first pass, and an iterator
// that is never ranged over does no work. gamma only appears in the
// sub-benchmark name; it does not influence ReadChunks.
func BenchmarkChunks(b *testing.B) {
	for _, keySz := range keySizes {
		keys := generateKeys(keySz, 99)
		bKeys := Uin64ToBytes(keys)
		for _, gamma := range gammaValues {
			for _, sz := range bufSizes {
				b.Run(test.Name("New(Chunks)", []string{"gamma", "buffer", "keys"}, gamma, sz, keySz), func(b *testing.B) {
					var total int
					for i := 0; i < b.N; i++ {
						r := bytes.NewReader(bKeys)
						for chunk := range bbhash.ReadChunks(r, sz) {
							total += len(chunk)
						}
					}
					// Keep the accumulated length live so the loop body is
					// not optimized away.
					_ = total
				})
			}
		}
	}
}

// Uin64ToBytes serializes keys into a byte slice, encoding every key as 8
// little-endian bytes in input order, so the result is len(keys)*8 bytes long.
//
// A plain byte(key) conversion would keep only the lowest byte of each key
// and drop the other seven.
func Uin64ToBytes(keys []uint64) []byte {
	buf := make([]byte, 0, len(keys)*8)
	for _, key := range keys {
		buf = binary.LittleEndian.AppendUint64(buf, key)
	}
	return buf
}
Loading