From 36d08410a8cae509fff7efc834bc3f84d389e092 Mon Sep 17 00:00:00 2001 From: Creeger Date: Mon, 10 Mar 2025 18:05:23 +0100 Subject: [PATCH 1/7] First rendition of bbhash benchmarking --- bbhash_iter_test.go | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index 3579cad..c26e3d2 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -1,9 +1,13 @@ package bbhash_test import ( + "bytes" + "context" "crypto/sha256" + _ "embed" "encoding/binary" "iter" + "os" "slices" "strings" "testing" @@ -11,6 +15,9 @@ import ( "github.com/google/go-cmp/cmp" "github.com/relab/bbhash" "github.com/relab/bbhash/internal/fast" + "github.com/relab/bbhash/internal/test" + "github.com/relab/iago" + "github.com/relab/iago/iagotest" ) // String taken from https://www.lipsum.com/ @@ -79,3 +86,72 @@ func TestHashKeysFromChunks(t *testing.T) { }) } } + +func BenchmarkChunks(b *testing.B) { + for _, keySz := range keySizesOneV { + keys := generateKeys(keySz, 99) + bKeys := Uin64ToBytes(keys) + r := bytes.NewReader(bKeys) + for _, gamma := range gammaValuesOneV { + for _, sz := range bufSizes { + b.Run(test.Name("New(Chunks)", []string{"gamma", "buffer", "keys"}, gamma, sz, keySz), func(b *testing.B) { + b.Log("Running ReadChunks") + chunks := bbhash.ReadChunks(r, sz) + _ = chunks + }) + } + } + } +} + +func Uin64ToBytes(keys []uint64) []byte { + buf := make([]byte, 0) + for _, key := range keys { + buf = append(buf, byte(key)) + } + return buf +} + +func BenchmarkBBhash(b *testing.B) { + n := 1 + //Create keys for the client group + //dir := b.TempDir() + wd, err := os.Getwd() + if err != nil { + b.Fatal(err) + } + + group := iagotest.CreateSSHGroup(b, n, false) + + group.ErrorHandler = func(e error) { + b.Fatal(e) + } + + group.Run("Upload a file", func(ctx context.Context, host iago.Host) error { + src, err := iago.NewPath(wd, ".") + if err != nil { + return err + } + dest, err := iago.NewPath(iago.Expand(host, "$HOME"), "bbhash") + if err != nil { + return err + } + return iago.Upload{ + Src: src, + Dest: dest, + }.Apply(ctx, host) + }) + + group.Run("Custom Shell Command", func(ctx context.Context, host iago.Host) error { + var sb strings.Builder + err = iago.Shell{ + Command: "cd bbhash; /usr/local/go/bin/go env | grep GOMODCACHE", + Stdout: &sb, + }.Apply(ctx, host) + b.Log(sb.String()) + if err != nil { + return err + } + return nil + }) +} From 7c7e2010ebcd8f5cb664bbc5f021161c47193c77 Mon Sep 17 00:00:00 2001 From: Creeger Date: Mon, 31 Mar 2025 12:02:44 +0200 Subject: [PATCH 2/7] Refactored BBHash type names --- bbhash.go | 20 ++++++------ bbhash_fmt.go | 22 ++++++------- bbhash_fmt2.go | 22 ++++++------- bbhash_fmt_test.go | 4 +-- bbhash_marshal.go | 36 +++++++++++----------- bbhash_marshal_test.go | 61 ++++++++++++++++++------------------- bbhash_opts.go | 4 +-- bbhash_parallel.go | 2 +- bbhash_partitioned.go | 34 ++++++++++----------- bbhash_reproducible_test.go | 2 +- cmd/bbhashbench/main.go | 6 ++-- 11 files changed, 106 insertions(+), 107 deletions(-) diff --git a/bbhash.go b/bbhash.go index 7cbdfa1..78c5ca5 100644 --- a/bbhash.go +++ b/bbhash.go @@ -8,14 +8,14 @@ import ( ) // BBHash represents a minimal perfect hash for a set of keys. -type BBHash struct { +type SingleBBHash struct { bits []bitVector // bit vectors for each level ranks []uint64 // total rank for each level reverseMap []uint64 // index -> key (only filled if needed) } -func newBBHash(initialLevels int) BBHash { - return BBHash{ +func newBBHash(initialLevels int) SingleBBHash { + return SingleBBHash{ bits: make([]bitVector, 0, initialLevels), } } @@ -31,7 +31,7 @@ func newBBHash(initialLevels int) BBHash { // If the key is not in the original key set, two things can happen: // 1. The return value is 0, representing that the key was not in the original key set. // 2. The return value is in the expected range [1, len(keys)], but is a false positive. -func (bb BBHash) Find(key uint64) uint64 { +func (bb SingleBBHash) Find(key uint64) uint64 { for lvl, bv := range bb.bits { i := fast.Hash(uint64(lvl), key) % bv.size() if bv.isSet(i) { @@ -43,7 +43,7 @@ func (bb BBHash) Find(key uint64) uint64 { // Key returns the key for the given index. // The index must be in the range [1, len(keys)], otherwise 0 is returned. -func (bb BBHash) Key(index uint64) uint64 { +func (bb SingleBBHash) Key(index uint64) uint64 { if bb.reverseMap == nil || index == 0 || int(index) >= len(bb.reverseMap) { return 0 } @@ -51,7 +51,7 @@ func (bb BBHash) Key(index uint64) uint64 { } // compute computes the minimal perfect hash for the given keys. -func (bb *BBHash) compute(keys []uint64, gamma float64) error { +func (bb *SingleBBHash) compute(keys []uint64, gamma float64) error { sz := len(keys) redo := make([]uint64, 0, sz/2) // heuristic: only 1/2 of the keys will collide // bit vectors for current level : A and C in the paper @@ -100,7 +100,7 @@ func (bb *BBHash) compute(keys []uint64, gamma float64) error { } // computeWithKeymap is similar to compute(), but in addition returns the reverse keymap. -func (bb *BBHash) computeWithKeymap(keys []uint64, gamma float64) error { +func (bb *SingleBBHash) computeWithKeymap(keys []uint64, gamma float64) error { sz := len(keys) redo := make([]uint64, 0, sz/2) // heuristic: only 1/2 of the keys will collide // bit vectors for current level : A and C in the paper @@ -169,7 +169,7 @@ func (bb *BBHash) computeWithKeymap(keys []uint64, gamma float64) error { // computeLevelRanks computes the total rank of each level. // The total rank is the rank for all levels up to and including the current level. -func (bb *BBHash) computeLevelRanks() { +func (bb *SingleBBHash) computeLevelRanks() { // Initializing the rank to 1, since the 0 index is reserved for not-found. var rank uint64 = 1 bb.ranks = make([]uint64, len(bb.bits)) @@ -181,6 +181,6 @@ func (bb *BBHash) computeLevelRanks() { // enforce interface compliance var ( - _ bbhash = (*BBHash)(nil) - _ reverseMap = (*BBHash)(nil) + _ bbhash = (*SingleBBHash)(nil) + _ reverseMap = (*SingleBBHash)(nil) ) diff --git a/bbhash_fmt.go b/bbhash_fmt.go index aeff794..fb236cd 100644 --- a/bbhash_fmt.go +++ b/bbhash_fmt.go @@ -7,9 +7,9 @@ import ( ) // String returns a string representation of BBHash with overall and per-level statistics. -func (bb BBHash) String() string { +func (bb SingleBBHash) String() string { var b strings.Builder - b.WriteString(fmt.Sprintf("BBHash(gamma=%3.1f, entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s, false positive rate=%.2f)\n", + b.WriteString(fmt.Sprintf("single(gamma=%3.1f, entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s, false positive rate=%.2f)\n", bb.gamma(), bb.entries(), bb.Levels(), bb.BitsPerKey(), bb.wireBits(), bb.space(), bb.falsePositiveRate())) for i, bv := range bb.bits { sz := readableSize(int(bv.words()) * 8) @@ -20,17 +20,17 @@ func (bb BBHash) String() string { } // Levels returns the number of Levels in the minimal perfect hash. -func (bb BBHash) Levels() int { +func (bb SingleBBHash) Levels() int { return len(bb.bits) } // BitsPerKey returns the number of bits per key in the minimal perfect hash. -func (bb BBHash) BitsPerKey() float64 { +func (bb SingleBBHash) BitsPerKey() float64 { return float64(bb.wireBits()) / float64(bb.entries()) } // LevelVectors returns a slice representation of the BBHash's per-level bit vectors. -func (bb BBHash) LevelVectors() [][]uint64 { +func (bb SingleBBHash) LevelVectors() [][]uint64 { m := make([][]uint64, 0, len(bb.bits)) for _, bv := range bb.bits { m = append(m, bv) @@ -40,7 +40,7 @@ func (bb BBHash) LevelVectors() [][]uint64 { // BitVectors returns a Go slice for BBHash's per-level bit vectors. // This is intended for testing and debugging; no guarantees are made about the format. -func (bb BBHash) BitVectors(varName string) string { +func (bb SingleBBHash) BitVectors(varName string) string { var b strings.Builder b.WriteString(fmt.Sprintf("var %s = [][]uint64{\n", varName)) for lvl, bv := range bb.bits { @@ -87,13 +87,13 @@ func readableSize(sizeInBytes int) string { // gamma returns an estimate of the gamma parameter used to construct the minimal perfect hash. // It is an estimate because the size of the level 0 bit vector is not necessarily a multiple of 64. -func (bb BBHash) gamma() float64 { +func (bb SingleBBHash) gamma() float64 { lvl0Size := bb.bits[0].size() return float64(lvl0Size) / float64(bb.entries()) } // entries returns the number of entries in the minimal perfect hash. -func (bb BBHash) entries() (sz uint64) { +func (bb SingleBBHash) entries() (sz uint64) { for _, bv := range bb.bits { sz += bv.onesCount() } @@ -101,19 +101,19 @@ func (bb BBHash) entries() (sz uint64) { } // wireBits returns the number of on-the-wire bits used to represent the minimal perfect hash. -func (bb BBHash) wireBits() uint64 { +func (bb SingleBBHash) wireBits() uint64 { return uint64(bb.marshaledLength()) * 8 } // space returns a human-readable string representing the size of the minimal perfect hash. -func (bb BBHash) space() string { +func (bb SingleBBHash) space() string { return readableSize(bb.marshaledLength()) } // falsePositiveRate returns the false positive rate of the minimal perfect hash. // Note: This may not be accurate if the actual keys overlap with the test keys [0,2N]; // that is, if many of the actual keys are in the range [0,2N], then it will be inaccurate. -func (bb BBHash) falsePositiveRate() float64 { +func (bb SingleBBHash) falsePositiveRate() float64 { var cnt int numTestKeys := bb.entries() * 2 for key := uint64(0); key < numTestKeys; key++ { diff --git a/bbhash_fmt2.go b/bbhash_fmt2.go index 84ed059..92e0628 100644 --- a/bbhash_fmt2.go +++ b/bbhash_fmt2.go @@ -6,7 +6,7 @@ import ( "strings" ) -func (bb BBHash2) String() string { +func (bb BBHash) String() string { var b strings.Builder lvlSz := make([]uint64, 0) lvlEntries := make([]uint64, 0) @@ -22,7 +22,7 @@ func (bb BBHash2) String() string { } } } - b.WriteString(fmt.Sprintf("BBHash2(entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s)\n", + b.WriteString(fmt.Sprintf("BBHash(entries=%d, levels=%d, bits per key=%3.1f, wire bits=%d, size=%s)\n", bb.entries(), len(lvlSz), bb.BitsPerKey(), bb.wireBits(), bb.space())) for lvl := 0; lvl < len(lvlSz); lvl++ { sz := int(lvlSz[lvl]) @@ -33,7 +33,7 @@ func (bb BBHash2) String() string { } // MaxMinLevels returns the maximum and minimum number of levels across all partitions. -func (bb BBHash2) MaxMinLevels() (max, min int) { +func (bb BBHash) MaxMinLevels() (max, min int) { max = 0 min = 999 for _, bx := range bb.partitions { @@ -48,12 +48,12 @@ func (bb BBHash2) MaxMinLevels() (max, min int) { } // BitsPerKey returns the number of bits per key in the minimal perfect hash. -func (bb BBHash2) BitsPerKey() float64 { +func (bb BBHash) BitsPerKey() float64 { return float64(bb.wireBits()) / float64(bb.entries()) } -// LevelVectors returns a slice representation of BBHash2's per-partition, per-level bit vectors. -func (bb BBHash2) LevelVectors() [][][]uint64 { +// LevelVectors returns a slice representation of BBHash's per-partition, per-level bit vectors. +func (bb BBHash) LevelVectors() [][][]uint64 { var vectors [][][]uint64 for _, bx := range bb.partitions { vectors = append(vectors, bx.LevelVectors()) @@ -61,9 +61,9 @@ func (bb BBHash2) LevelVectors() [][][]uint64 { return vectors } -// BitVectors returns a Go slice for BBHash2's per-partition, per-level bit vectors. +// BitVectors returns a Go slice for BBHash's per-partition, per-level bit vectors. // This is intended for testing and debugging; no guarantees are made about the format. -func (bb BBHash2) BitVectors(varName string) string { +func (bb BBHash) BitVectors(varName string) string { var b strings.Builder b.WriteString(fmt.Sprintf("var %s = [][][]uint64{\n", varName)) for partition, bx := range bb.partitions { @@ -86,7 +86,7 @@ func (bb BBHash2) BitVectors(varName string) string { } // entries returns the number of entries in the minimal perfect hash. -func (bb BBHash2) entries() (sz uint64) { +func (bb BBHash) entries() (sz uint64) { for _, bx := range bb.partitions { sz += bx.entries() } @@ -94,11 +94,11 @@ func (bb BBHash2) entries() (sz uint64) { } // wireBits returns the number of on-the-wire bits used to represent the minimal perfect hash. -func (bb BBHash2) wireBits() uint64 { +func (bb BBHash) wireBits() uint64 { return uint64(bb.marshaledLength()) * 8 } // space returns a human-readable string representing the size of the minimal perfect hash. -func (bb BBHash2) space() string { +func (bb BBHash) space() string { return readableSize(bb.marshaledLength()) } diff --git a/bbhash_fmt_test.go b/bbhash_fmt_test.go index d298de0..e8333bb 100644 --- a/bbhash_fmt_test.go +++ b/bbhash_fmt_test.go @@ -28,9 +28,9 @@ func TestString(t *testing.T) { // See issue #21 return } - t.Logf("BBHash: %v", bb) + t.Logf("SingleBBHash: %v", bb) } - t.Logf("BBHash2: %v", bb2) + t.Logf("BBHash: %v", bb2) }) } } diff --git a/bbhash_marshal.go b/bbhash_marshal.go index 390ecea..0dfbd9f 100644 --- a/bbhash_marshal.go +++ b/bbhash_marshal.go @@ -7,7 +7,7 @@ import ( ) // marshalLength returns the number of bytes needed to marshal the BBHash. -func (bb BBHash) marshaledLength() int { +func (bb SingleBBHash) marshaledLength() int { bbLen := 1 // one byte for header: max 255 levels for _, bv := range bb.bits { bbLen += bv.marshaledLength() @@ -16,7 +16,7 @@ func (bb BBHash) marshaledLength() int { } // AppendBinary implements the [encoding.BinaryAppender] interface. -func (bb BBHash) AppendBinary(buf []byte) (_ []byte, err error) { +func (bb SingleBBHash) AppendBinary(buf []byte) (_ []byte, err error) { numBitVectors := uint8(len(bb.bits)) if numBitVectors == 0 { return nil, errors.New("BBHash.AppendBinary: no data") @@ -40,12 +40,12 @@ func (bb BBHash) AppendBinary(buf []byte) (_ []byte, err error) { } // MarshalBinary implements the [encoding.BinaryMarshaler] interface. -func (bb BBHash) MarshalBinary() ([]byte, error) { +func (bb SingleBBHash) MarshalBinary() ([]byte, error) { return bb.AppendBinary(make([]byte, 0, bb.marshaledLength())) } // UnmarshalBinary implements the [encoding.BinaryUnmarshaler] interface. -func (bb *BBHash) UnmarshalBinary(data []byte) error { +func (bb *SingleBBHash) UnmarshalBinary(data []byte) error { // Make a copy of data, since we will be modifying buf's slice indices buf := data if len(buf) < 1 { @@ -59,7 +59,7 @@ func (bb *BBHash) UnmarshalBinary(data []byte) error { } buf = buf[1:] // move past header - *bb = BBHash{} // modify bb in place + *bb = SingleBBHash{} // modify bb in place bb.bits = make([]bitVector, numBitVectors) // Read bit vectors for each level @@ -80,8 +80,8 @@ func (bb *BBHash) UnmarshalBinary(data []byte) error { return nil } -// marshalLength returns the number of bytes needed to marshal the BBHash2. -func (b2 BBHash2) marshaledLength() int { +// marshalLength returns the number of bytes needed to marshal the BBHash. +func (b2 BBHash) marshaledLength() int { b2Len := 1 // one byte for header: max 255 partitions // length of each partition for _, bb := range b2.partitions { @@ -93,10 +93,10 @@ func (b2 BBHash2) marshaledLength() int { } // AppendBinary implements the [encoding.BinaryAppender] interface. -func (b2 BBHash2) AppendBinary(buf []byte) (_ []byte, err error) { +func (b2 BBHash) AppendBinary(buf []byte) (_ []byte, err error) { numPartitions := uint8(len(b2.partitions)) if numPartitions == 0 { - return nil, errors.New("BBHash2.AppendBinary: no data") + return nil, errors.New("BBHash.AppendBinary: no data") } // append header: the number of partitions buf = append(buf, numPartitions) @@ -117,45 +117,45 @@ func (b2 BBHash2) AppendBinary(buf []byte) (_ []byte, err error) { } // MarshalBinary implements the [encoding.BinaryMarshaler] interface. -func (b2 BBHash2) MarshalBinary() ([]byte, error) { +func (b2 BBHash) MarshalBinary() ([]byte, error) { return b2.AppendBinary(make([]byte, 0, b2.marshaledLength())) } // UnmarshalBinary implements the [encoding.BinaryUnmarshaler] interface. -func (b2 *BBHash2) UnmarshalBinary(data []byte) error { +func (b2 *BBHash) UnmarshalBinary(data []byte) error { // Make a copy of data, since we will be modifying buf's slice indices buf := data if len(buf) < 1 { - return errors.New("BBHash2.UnmarshalBinary: no data") + return errors.New("BBHash.UnmarshalBinary: no data") } // Read header: the number of partitions numPartitions := uint8(buf[0]) if numPartitions == 0 || numPartitions > maxPartitions { - return fmt.Errorf("BBHash2.UnmarshalBinary: invalid number of partitions %d (max %d)", numPartitions, maxPartitions) + return fmt.Errorf("BBHash.UnmarshalBinary: invalid number of partitions %d (max %d)", numPartitions, maxPartitions) } buf = buf[1:] // move past header - *b2 = BBHash2{} // modify b2 in place - b2.partitions = make([]BBHash, numPartitions) + *b2 = BBHash{} // modify b2 in place + b2.partitions = make([]SingleBBHash, numPartitions) // Read BBHash for each partition for i := range numPartitions { - bb := BBHash{} + bb := SingleBBHash{} if err := bb.UnmarshalBinary(buf); err != nil { return err } b2.partitions[i] = bb bbLen := bb.marshaledLength() if len(buf) < bbLen { - return errors.New("BBHash2.UnmarshalBinary: insufficient data for remaining partitions") + return errors.New("BBHash.UnmarshalBinary: insufficient data for remaining partitions") } buf = buf[bbLen:] // move past the current partition } // we skip the first offset since it is always 0, hence numPartitions-1 if len(buf) < int(uint32bytes*(numPartitions-1)) { - return errors.New("BBHash2.UnmarshalBinary: insufficient data for offset vector") + return errors.New("BBHash.UnmarshalBinary: insufficient data for offset vector") } // Read offset vector diff --git a/bbhash_marshal_test.go b/bbhash_marshal_test.go index c11f346..1263134 100644 --- a/bbhash_marshal_test.go +++ b/bbhash_marshal_test.go @@ -1,17 +1,16 @@ -package bbhash_test +package bbhash import ( "testing" - "github.com/relab/bbhash" "github.com/relab/bbhash/internal/test" ) -func TestMarshalUnmarshalBBHash(t *testing.T) { +func TestMarshalUnmarshalSingleBBHash(t *testing.T) { size := 100000 keys := generateKeys(size, 99) - bb2, err := bbhash.New(keys, bbhash.Gamma(2.0)) + bb2, err := New(keys, Gamma(2.0)) if err != nil { t.Fatalf("Failed to create BBHash: %v", err) } @@ -28,7 +27,7 @@ func TestMarshalUnmarshalBBHash(t *testing.T) { t.Fatalf("Failed to marshal BBHash: %v", err) } - newBB := &bbhash.BBHash{} + newBB := &SingleBBHash{} if err = newBB.UnmarshalBinary(data); err != nil { t.Fatalf("Failed to unmarshal BBHash: %v", err) } @@ -43,18 +42,18 @@ func TestMarshalUnmarshalBBHash(t *testing.T) { } func TestMarshalUnmarshalBBHashEmpty(t *testing.T) { - bb := &bbhash.BBHash{} + bb := &SingleBBHash{} data, err := bb.MarshalBinary() if err == nil { t.Errorf("MarshalBinary() should have failed") } - newBB := &bbhash.BBHash{} + newBB := &SingleBBHash{} if err = newBB.UnmarshalBinary(data); err == nil { t.Errorf("UnmarshalBinary() should have failed") } } -func TestMarshalUnmarshalBBHash2(t *testing.T) { +func TestMarshalUnmarshalBBHash(t *testing.T) { testCases := []struct { size int partitions int @@ -76,9 +75,9 @@ func TestMarshalUnmarshalBBHash2(t *testing.T) { t.Run(test.Name("", []string{"keys", "partitions"}, tc.size, tc.partitions), func(t *testing.T) { keys := generateKeys(tc.size, 98) - bb, err := bbhash.New(keys, bbhash.Partitions(tc.partitions)) + bb, err := New(keys, Partitions(tc.partitions)) if err != nil { - t.Fatalf("Failed to create BBHash2: %v", err) + t.Fatalf("Failed to create BBHash: %v", err) } // Store original Find() results @@ -87,19 +86,19 @@ func TestMarshalUnmarshalBBHash2(t *testing.T) { originalHashIndexes[key] = bb.Find(key) } - t.Logf("Original BBHash2: %v", bb) + t.Logf("Original BBHash: %v", bb) data, err := bb.MarshalBinary() if err != nil { - t.Fatalf("Failed to marshal BBHash2: %v", err) + t.Fatalf("Failed to marshal BBHash: %v", err) } - newBB := &bbhash.BBHash2{} + newBB := &BBHash{} if err = newBB.UnmarshalBinary(data); err != nil { - t.Fatalf("Failed to unmarshal BBHash2: %v", err) + t.Fatalf("Failed to unmarshal BBHash: %v", err) } - // Validate that the unmarshalled BBHash2 produces the same Find() results + // Validate that the unmarshalled BBHash produces the same Find() results for _, key := range keys { hashIndex := newBB.Find(key) if hashIndex != originalHashIndexes[key] { @@ -112,12 +111,12 @@ func TestMarshalUnmarshalBBHash2(t *testing.T) { // Run with: // go test -run x -bench BenchmarkBBHashMarshalBinary -benchmem -func BenchmarkBBHashMarshalBinary(b *testing.B) { +func BenchmarkSingleBBHashMarshalBinary(b *testing.B) { for _, size := range keySizes { keys := generateKeys(size, 99) for _, gamma := range gammaValues { b.Run(test.Name("", []string{"gamma", "keys"}, gamma, size), func(b *testing.B) { - bb2, _ := bbhash.New(keys, bbhash.Gamma(gamma)) + bb2, _ := New(keys, Gamma(gamma)) bb := bb2.SinglePartition() bpk := bb.BitsPerKey() @@ -141,12 +140,12 @@ func BenchmarkBBHashMarshalBinary(b *testing.B) { // Run with: // go test -run x -bench BenchmarkBBHashUnmarshalBinary -benchmem -func BenchmarkBBHashUnmarshalBinary(b *testing.B) { +func BenchmarkSingleBBHashUnmarshalBinary(b *testing.B) { for _, size := range keySizes { keys := generateKeys(size, 99) for _, gamma := range gammaValues { b.Run(test.Name("", []string{"gamma", "keys"}, gamma, size), func(b *testing.B) { - bb2, _ := bbhash.New(keys, bbhash.Gamma(gamma)) + bb2, _ := New(keys, Gamma(gamma)) bb := bb2.SinglePartition() bpk := bb.BitsPerKey() @@ -156,7 +155,7 @@ func BenchmarkBBHashUnmarshalBinary(b *testing.B) { } marshaledSize := len(data) - newBB := &bbhash.BBHash{} + newBB := &BBHash{} if err = newBB.UnmarshalBinary(data); err != nil { b.Fatalf("Failed to unmarshal BBHash: %v", err) } @@ -178,14 +177,14 @@ func BenchmarkBBHashUnmarshalBinary(b *testing.B) { } // Run with: -// go test -run x -bench BenchmarkBBHash2MarshalBinary -benchmem -func BenchmarkBBHash2MarshalBinary(b *testing.B) { +// go test -run x -bench BenchmarkBBHashMarshalBinary -benchmem +func BenchmarkBBHashMarshalBinary(b *testing.B) { for _, size := range keySizes { keys := generateKeys(size, 99) for _, gamma := range gammaValues { for _, partitions := range partitionValues { b.Run(test.Name("", []string{"gamma", "partitions", "keys"}, gamma, partitions, size), func(b *testing.B) { - bb, _ := bbhash.New(keys, bbhash.Gamma(gamma), bbhash.Partitions(partitions)) + bb, _ := New(keys, Gamma(gamma), Partitions(partitions)) bpk := bb.BitsPerKey() data, err := bb.MarshalBinary() @@ -208,14 +207,14 @@ func BenchmarkBBHash2MarshalBinary(b *testing.B) { } // Run with: -// go test -run x -bench BenchmarkBBHash2UnmarshalBinary -benchmem -func BenchmarkBBHash2UnmarshalBinary(b *testing.B) { +// go test -run x -bench BenchmarkBBHashUnmarshalBinary -benchmem +func BenchmarkBBHashUnmarshalBinary(b *testing.B) { for _, size := range keySizes { keys := generateKeys(size, 99) for _, gamma := range gammaValues { for _, partitions := range partitionValues { b.Run(test.Name("", []string{"gamma", "partitions", "keys"}, gamma, partitions, size), func(b *testing.B) { - bb, _ := bbhash.New(keys, bbhash.Gamma(gamma), bbhash.Partitions(partitions)) + bb, _ := New(keys, Gamma(gamma), Partitions(partitions)) bpk := bb.BitsPerKey() data, err := bb.MarshalBinary() @@ -224,7 +223,7 @@ func BenchmarkBBHash2UnmarshalBinary(b *testing.B) { } marshaledSize := len(data) - newBB := &bbhash.BBHash2{} + newBB := &BBHash{} if err = newBB.UnmarshalBinary(data); err != nil { b.Fatalf("Failed to unmarshal BBHash: %v", err) } @@ -246,12 +245,12 @@ func BenchmarkBBHash2UnmarshalBinary(b *testing.B) { } } -// This is a fast deterministic benchmark that only measures the message length (BBHash2 length) +// This is a fast deterministic benchmark that only measures the message length (BBHash length) // and number of bits per key for different key sizes, gamma values and number of partitions. // // Run with: -// go test -run x -bench BenchmarkBBHash2BitsPerKey -func BenchmarkBBHash2BitsPerKey(b *testing.B) { +// go test -run x -bench BenchmarkBBHashBitsPerKey +func BenchmarkBBHashBitsPerKey(b *testing.B) { for _, size := range keySizes { keys := generateKeys(size, 99) for _, gamma := range gammaValues { @@ -262,7 +261,7 @@ func BenchmarkBBHash2BitsPerKey(b *testing.B) { // Stop the benchmark timer since we measure only the bits/key calculation b.StopTimer() - bb, _ := bbhash.New(keys, bbhash.Gamma(gamma), bbhash.Partitions(partitions)) + bb, _ := New(keys, Gamma(gamma), Partitions(partitions)) bpk := bb.BitsPerKey() data, _ := bb.MarshalBinary() marshaledSize := len(data) diff --git a/bbhash_opts.go b/bbhash_opts.go index 737d300..2015e9c 100644 --- a/bbhash_opts.go +++ b/bbhash_opts.go @@ -59,9 +59,9 @@ func InitialLevels(levels int) Options { } } -// Partitions sets the number of partitions to use when creating a BBHash2. +// Partitions sets the number of partitions to use when creating a BBHash. // The keys are partitioned into the given the number partitions. -// Setting partitions to less than 2 results in a single BBHash, wrapped in a BBHash2. +// Setting partitions to less than 2 results in a single BBHash, wrapped in a BBHash. func Partitions(partitions int) Options { return func(o *options) { o.partitions = max(min(partitions, maxPartitions), 1) diff --git a/bbhash_parallel.go b/bbhash_parallel.go index 1bda1e4..d39d4ed 100644 --- a/bbhash_parallel.go +++ b/bbhash_parallel.go @@ -9,7 +9,7 @@ import ( ) // computeParallel computes the minimal perfect hash for the given keys in parallel by sharding the keys. -func (bb *BBHash) computeParallel(keys []uint64, gamma float64) error { +func (bb *SingleBBHash) computeParallel(keys []uint64, gamma float64) error { sz := len(keys) wds := words(sz, gamma) redo := make([]uint64, 0, sz/2) // heuristic: only 1/2 of the keys will collide diff --git a/bbhash_partitioned.go b/bbhash_partitioned.go index 456a5b5..e6f8889 100644 --- a/bbhash_partitioned.go +++ b/bbhash_partitioned.go @@ -4,18 +4,18 @@ import ( "golang.org/x/sync/errgroup" ) -// BBHash2 represents a minimal perfect hash for a set of keys. -type BBHash2 struct { - partitions []BBHash +// BBHash represents a minimal perfect hash for a set of keys. +type BBHash struct { + partitions []SingleBBHash offsets []uint32 } -// New creates a new BBHash2 for the given keys. The keys must be unique. +// New creates a new BBHash for the given keys. The keys must be unique. // Creation is configured using the provided options. The default options // are used if none are provided. Available options include: Gamma, // InitialLevels, Partitions, Parallel, and WithReverseMap. // With fewer than 1000 keys, the sequential version is always used. -func New(keys []uint64, opts ...Options) (*BBHash2, error) { +func New(keys []uint64, opts ...Options) (*BBHash, error) { if len(keys) < 1 { panic("bbhash: no keys provided") } @@ -40,8 +40,8 @@ func New(keys []uint64, opts ...Options) (*BBHash2, error) { if err != nil { return nil, err } - return &BBHash2{ - partitions: []BBHash{bb}, + return &BBHash{ + partitions: []SingleBBHash{bb}, offsets: []uint32{0}, }, nil } @@ -49,7 +49,7 @@ func New(keys []uint64, opts ...Options) (*BBHash2, error) { } // newPartitioned partitions the keys and creates multiple BBHashes in parallel. -func newPartitioned(keys []uint64, o *options) (*BBHash2, error) { +func newPartitioned(keys []uint64, o *options) (*BBHash, error) { // Partition the keys into partitions by placing keys with the // same remainder (modulo partitions) into the same partition. // This approach copies the keys into partitions slices, which @@ -59,8 +59,8 @@ func newPartitioned(keys []uint64, o *options) (*BBHash2, error) { i := k % uint64(o.partitions) partitionKeys[i] = append(partitionKeys[i], k) } - bb := &BBHash2{ - partitions: make([]BBHash, o.partitions), + bb := &BBHash{ + partitions: make([]SingleBBHash, o.partitions), offsets: make([]uint32, o.partitions), } grp := &errgroup.Group{} @@ -92,14 +92,14 @@ func newPartitioned(keys []uint64, o *options) (*BBHash2, error) { // If the key is not in the original key set, two things can happen: // 1. The return value is 0, representing that the key was not in the original key set. // 2. The return value is in the expected range [1, len(keys)], but is a false positive. -func (bb BBHash2) Find(key uint64) uint64 { +func (bb BBHash) Find(key uint64) uint64 { i := key % uint64(len(bb.partitions)) return bb.partitions[i].Find(key) + uint64(bb.offsets[i]) } // Key returns the key for the given index. // The index must be in the range [1, len(keys)], otherwise 0 is returned. -func (bb BBHash2) Key(index uint64) uint64 { +func (bb BBHash) Key(index uint64) uint64 { for _, b := range bb.partitions { if index < uint64(len(b.reverseMap)) { return b.reverseMap[index] @@ -109,15 +109,15 @@ func (bb BBHash2) Key(index uint64) uint64 { return 0 } -// Partitions returns the number of partitions in the BBHash2. +// Partitions returns the number of partitions in the BBHash. // This is mainly useful for testing and may be removed in the future. -func (bb BBHash2) Partitions() int { +func (bb BBHash) Partitions() int { return len(bb.partitions) } // SinglePartition returns the underlying BBHash if it contains a single partition. // If there are multiple partitions, it returns nil. -func (bb BBHash2) SinglePartition() *BBHash { +func (bb BBHash) SinglePartition() *SingleBBHash { if len(bb.partitions) == 1 { return &bb.partitions[0] } @@ -126,6 +126,6 @@ func (bb BBHash2) SinglePartition() *BBHash { // enforce interface compliance var ( - _ bbhash = (*BBHash2)(nil) - _ reverseMap = (*BBHash2)(nil) + _ bbhash = (*BBHash)(nil) + _ reverseMap = (*BBHash)(nil) ) diff --git a/bbhash_reproducible_test.go b/bbhash_reproducible_test.go index 6bf5375..b93cad5 100644 --- a/bbhash_reproducible_test.go +++ b/bbhash_reproducible_test.go @@ -119,7 +119,7 @@ func (bvg *bvGenerator) createBitVectorMap(size, partitions int) string { } // createBitVectors creates a Go string representation of the bit vectors for the given BBHash. -func (bvg *bvGenerator) createBitVectors(bitVectorVarName string, bb *bbhash.BBHash2) { +func (bvg *bvGenerator) createBitVectors(bitVectorVarName string, bb *bbhash.BBHash) { if bvg.usedVarNames[bitVectorVarName] { return // Bit vectors already added for this size and partitions } diff --git a/cmd/bbhashbench/main.go b/cmd/bbhashbench/main.go index 6afebbc..6ddfe71 100644 --- a/cmd/bbhashbench/main.go +++ b/cmd/bbhashbench/main.go @@ -104,7 +104,7 @@ func writeCSVFile(filename string, create, find map[int][]time.Duration, levels func runSequential(numKeys int, gamma float64, count int) ([]time.Duration, []time.Duration, int, float64) { keys := generateKeys(numKeys, 99) - var bb *bbhash.BBHash2 + var bb *bbhash.BBHash var err error elapsed := make([]time.Duration, count) for i := 0; i < count; i++ { @@ -123,7 +123,7 @@ func runSequential(numKeys int, gamma float64, count int) ([]time.Duration, []ti func runParallel(numKeys int, gamma float64, count int) ([]time.Duration, []time.Duration, int, float64) { keys := generateKeys(numKeys, 99) - var bb *bbhash.BBHash2 + var bb *bbhash.BBHash var err error elapsed := make([]time.Duration, count) for i := 0; i < count; i++ { @@ -142,7 +142,7 @@ func runParallel(numKeys int, gamma float64, count int) ([]time.Duration, []time func runPartitioned(numKeys, numPartitions int, gamma float64, count int) ([]time.Duration, []time.Duration, int, float64) { keys := generateKeys(numKeys, 99) - var bb *bbhash.BBHash2 + var bb *bbhash.BBHash var err error elapsed := make([]time.Duration, count) for i := 0; i < count; i++ { From b645f2093e0d13aababd06b6cfb5db41b6ab39e6 Mon Sep 17 00:00:00 2001 From: Creeger Date: Tue, 29 Apr 2025 18:58:24 +0200 Subject: [PATCH 3/7] Exported hash functions --- bbhash_iter.go | 14 ++++++++++++++ bbhash_iter_test.go | 31 ++++++++----------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/bbhash_iter.go b/bbhash_iter.go index 145789a..eb5b3a1 100644 --- a/bbhash_iter.go +++ b/bbhash_iter.go @@ -1,8 +1,12 @@ package bbhash import ( + "crypto/sha256" + "encoding/binary" "io" "iter" + + "github.com/relab/bbhash/internal/fast" ) // Find the chunks from slow memory @@ -31,3 +35,13 @@ func Keys(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte]) []uint64 { } return keys } + +var SHA256hashFunc = func(buf []byte) uint64 { + h := sha256.New() + h.Write(buf) + return binary.LittleEndian.Uint64(h.Sum(nil)) +} + +var FastHashFunc = func(buf []byte) uint64 { + return fast.Hash64(uint64(123), buf) +} diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index c26e3d2..c5a29e6 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -2,22 +2,15 @@ package bbhash_test import ( "bytes" - "context" - "crypto/sha256" _ "embed" - "encoding/binary" "iter" - "os" "slices" "strings" "testing" "github.com/google/go-cmp/cmp" "github.com/relab/bbhash" - "github.com/relab/bbhash/internal/fast" "github.com/relab/bbhash/internal/test" - "github.com/relab/iago" - "github.com/relab/iago/iagotest" ) // String taken from https://www.lipsum.com/ @@ -38,16 +31,6 @@ func TestChunks(t *testing.T) { } } -var sha256hashFunc = func(buf []byte) uint64 { - h := sha256.New() - h.Write(buf) - return binary.LittleEndian.Uint64(h.Sum(nil)) -} - -var fastHashFunc = func(buf []byte) uint64 { - return fast.Hash64(uint64(123), buf) -} - func CollectFunc[I, O any](seq iter.Seq[I], f func(I) O) (o []O) { for v := range seq { o = append(o, f(v)) @@ -62,12 +45,12 @@ func TestHashKeysFromChunks(t *testing.T) { in string chunkSize int }{ - {name: "FashHash", hashFunc: fastHashFunc, in: input[:5], chunkSize: 4}, - {name: "FashHash", hashFunc: fastHashFunc, in: input[:5], chunkSize: 8}, - {name: "SHA256", hashFunc: sha256hashFunc, in: input[:5], chunkSize: 4}, - {name: "SHA256", hashFunc: sha256hashFunc, in: input[:5], chunkSize: 8}, - {name: "LongFast", hashFunc: fastHashFunc, in: input, chunkSize: 128}, - {name: "LongSHA", hashFunc: sha256hashFunc, in: input, chunkSize: 128}, + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.SHA256hashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.SHA256hashFunc, in: input[:5], chunkSize: 8}, + {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.SHA256hashFunc, in: input, chunkSize: 128}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -112,6 +95,7 @@ func Uin64ToBytes(keys []uint64) []byte { return buf } +/* Put on ice for now due to difficulties getting it up and running func BenchmarkBBhash(b *testing.B) { n := 1 //Create keys for the client group @@ -155,3 +139,4 @@ func BenchmarkBBhash(b *testing.B) { return nil }) } +*/ From f0641117da486e852d1097ebc38c5e70e64d1be2 Mon Sep 17 00:00:00 2001 From: Creeger Date: Thu, 29 May 2025 18:38:48 +0200 Subject: [PATCH 4/7] Implemented --- bbhash_iter.go | 10 +++++++++- bbhash_iter_test.go | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/bbhash_iter.go b/bbhash_iter.go index eb5b3a1..827b5aa 100644 --- a/bbhash_iter.go +++ b/bbhash_iter.go @@ -35,8 +35,16 @@ func Keys(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte]) []uint64 { } return keys } +func KeysNonce(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte], nonce []byte) []uint64 { + var keys []uint64 + for c := range chunks { + c = append(c, nonce...) + keys = append(keys, hashFunc(c)) + } + return keys +} -var SHA256hashFunc = func(buf []byte) uint64 { +var SHA256HashFunc = func(buf []byte) uint64 { h := sha256.New() h.Write(buf) return binary.LittleEndian.Uint64(h.Sum(nil)) diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index c5a29e6..77ef44d 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -4,6 +4,7 @@ import ( "bytes" _ "embed" "iter" + "math/rand" "slices" "strings" "testing" @@ -47,10 +48,10 @@ func TestHashKeysFromChunks(t *testing.T) { }{ {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, - {name: "SHA256", hashFunc: bbhash.SHA256hashFunc, in: input[:5], chunkSize: 4}, - {name: "SHA256", hashFunc: bbhash.SHA256hashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8}, {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, - {name: "LongSHA", hashFunc: bbhash.SHA256hashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -70,6 +71,37 @@ func TestHashKeysFromChunks(t *testing.T) { } } +func TestKeysNonce(t *testing.T) { + tests := []struct { + name string + hashFunc func([]byte) uint64 + in string + chunkSize int + }{ + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8}, + {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128}, + } + for _, test := range tests { + nonce := byte(rand.Intn(256)) + wantHashedKeys := CollectFunc(slices.Chunk([]byte(test.in), test.chunkSize), func(v []byte) uint64 { + v = append(v, nonce) + return test.hashFunc(v) + }) + + r := strings.NewReader(test.in) + chunks := bbhash.ReadChunks(r, test.chunkSize) + gotHashedKeys := bbhash.KeysNonce(test.hashFunc, chunks, nonce) + + if diff := cmp.Diff(gotHashedKeys, wantHashedKeys); diff != "" { + t.Errorf("Keys(): (-got +want) \n%s", diff) + } + } +} + func BenchmarkChunks(b *testing.B) { for _, keySz := range keySizesOneV { keys := generateKeys(keySz, 99) From d4fb2dacd6177466e272de4a9a64955b9dfd2bb5 Mon Sep 17 00:00:00 2001 From: Creeger Date: Thu, 29 May 2025 18:53:07 +0200 Subject: [PATCH 5/7] Readded bbhash_iter files and related variables --- bbhash_iter.go | 2 ++ bbhash_iter_test.go | 14 ++++++-------- bbhash_test.go | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bbhash_iter.go b/bbhash_iter.go index 737459f..6b7dff1 100644 --- a/bbhash_iter.go +++ b/bbhash_iter.go @@ -45,3 +45,5 @@ var SHA256HashFunc = func(buf []byte) uint64 { var FastHashFunc = func(buf []byte) uint64 { return fast.Hash64(123, buf) } + +// diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index 8b52b8c..f5deadb 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -4,7 +4,6 @@ import ( "bytes" _ "embed" "iter" - "os" "slices" "strings" "testing" @@ -12,7 +11,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/relab/bbhash" "github.com/relab/bbhash/internal/test" - ) // String taken from https://www.lipsum.com/ @@ -49,10 +47,10 @@ func TestHashKeysFromChunks(t *testing.T) { }{ {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, - {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4}, - {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.Sha256HashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.Sha256HashFunc, in: input[:5], chunkSize: 8}, {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, - {name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.Sha256HashFunc, in: input, chunkSize: 128}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -73,11 +71,11 @@ func TestHashKeysFromChunks(t *testing.T) { } func BenchmarkChunks(b *testing.B) { - for _, keySz := range keySizesOneV { + for _, keySz := range keySizes { keys := generateKeys(keySz, 99) bKeys := Uin64ToBytes(keys) r := bytes.NewReader(bKeys) - for _, gamma := range gammaValuesOneV { + for _, gamma := range gammaValues { for _, sz := range bufSizes { b.Run(test.Name("New(Chunks)", []string{"gamma", "buffer", "keys"}, gamma, sz, keySz), func(b *testing.B) { b.Log("Running ReadChunks") @@ -95,4 +93,4 @@ func Uin64ToBytes(keys []uint64) []byte { buf = append(buf, byte(key)) } return buf -} \ No newline at end of file +} diff --git a/bbhash_test.go b/bbhash_test.go index 337553f..9faa51b 100644 --- a/bbhash_test.go +++ b/bbhash_test.go @@ -21,6 +21,7 @@ var ( longKeySizes = []int{10_000_000, 100_000_000, 1_000_000_000} partitionValues = []int{1, 4, 8, 16, 24, 32, 48, 64, 128} gammaValues = []float64{1.0, 1.5, 2.0} + bufSizes = []int{10, 100, 1000} ) // TestMain parses command-line flags to set the key sizes, partition values, and gamma values. From 032295520419cf80c8c1f55191638fc8f5bdb13c Mon Sep 17 00:00:00 2001 From: Creeger Date: Thu, 29 May 2025 18:54:29 +0200 Subject: [PATCH 6/7] Capsulated SHA256 in test cases --- bbhash_iter_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index f5deadb..ff1e51c 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -47,10 +47,10 @@ func TestHashKeysFromChunks(t *testing.T) { }{ {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, - {name: "SHA256", hashFunc: bbhash.Sha256HashFunc, in: input[:5], chunkSize: 4}, - {name: "SHA256", hashFunc: bbhash.Sha256HashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8}, {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, - {name: "LongSHA", hashFunc: bbhash.Sha256HashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { From 67878bd97ec97e6e3dd26d3efd97fe303bbc5878 Mon Sep 17 00:00:00 2001 From: Creeger Date: Thu, 29 May 2025 18:57:52 +0200 Subject: [PATCH 7/7] Added KeysNonce function --- bbhash_iter.go | 9 +++++++++ bbhash_iter_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/bbhash_iter.go b/bbhash_iter.go index 6b7dff1..820f7a1 100644 --- a/bbhash_iter.go +++ b/bbhash_iter.go @@ -36,6 +36,15 @@ func Keys(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte]) []uint64 { return keys } +func KeysNonce(hashFunc func([]byte) uint64, chunks iter.Seq[[]byte], nonce []byte) []uint64 { + var keys []uint64 + for c := range chunks { + c = append(c, nonce...) + keys = append(keys, hashFunc(c)) + } + return keys +} + var SHA256HashFunc = func(buf []byte) uint64 { h := sha256.New() h.Write(buf) diff --git a/bbhash_iter_test.go b/bbhash_iter_test.go index ff1e51c..ed19f3b 100644 --- a/bbhash_iter_test.go +++ b/bbhash_iter_test.go @@ -4,6 +4,7 @@ import ( "bytes" _ "embed" "iter" + "math/rand" "slices" "strings" "testing" @@ -70,6 +71,37 @@ func TestHashKeysFromChunks(t *testing.T) { } } +func TestKeysNonce(t *testing.T) { + tests := []struct { + name string + hashFunc func([]byte) uint64 + in string + chunkSize int + }{ + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 4}, + {name: "FashHash", hashFunc: bbhash.FastHashFunc, in: input[:5], chunkSize: 8}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 4}, + {name: "SHA256", hashFunc: bbhash.SHA256HashFunc, in: input[:5], chunkSize: 8}, + {name: "LongFast", hashFunc: bbhash.FastHashFunc, in: input, chunkSize: 128}, + {name: "LongSHA", hashFunc: bbhash.SHA256HashFunc, in: input, chunkSize: 128}, + } + for _, test := range tests { + nonce := []byte{byte(rand.Intn(256))} + wantHashedKeys := CollectFunc(slices.Chunk([]byte(test.in), test.chunkSize), func(v []byte) uint64 { + v = append(v, nonce...) + return test.hashFunc(v) + }) + + r := strings.NewReader(test.in) + chunks := bbhash.ReadChunks(r, test.chunkSize) + gotHashedKeys := bbhash.KeysNonce(test.hashFunc, chunks, nonce) + + if diff := cmp.Diff(gotHashedKeys, wantHashedKeys); diff != "" { + t.Errorf("Keys(): (-got +want) \n%s", diff) + } + } +} + func BenchmarkChunks(b *testing.B) { for _, keySz := range keySizes { keys := generateKeys(keySz, 99)