From 9f92f8c609d64b019a38728c9de1f34ed2051988 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sat, 11 Jul 2020 09:36:09 +0200 Subject: [PATCH] lib/db: Use SipHash to deal with hash collision in GC (#6826) If the GC finds a key k that it wants to keep, it records that in a Bloom filter. If a key k' can be removed but its hash collides with k, it will be kept. Since the old Bloom filter code was completely deterministic, the next run would encounter the same collision, assuming k must still be kept. A randomized hash function that uses all the SHA-256 bits solves this problem: the second run has a non-zero probability of removing k', as long as the Bloom filter is not completely full. --- go.mod | 1 + go.sum | 2 ++ lib/db/lowlevel.go | 53 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/go.mod b/go.mod index 684655ba4..95459be17 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/certifi/gocertifi v0.0.0-20190905060710-a5e0173ced67 // indirect github.com/chmduquesne/rollinghash v0.0.0-20180912150627-a60f8e7142b5 github.com/d4l3k/messagediff v1.2.1 + github.com/dchest/siphash v1.2.1 github.com/dgraph-io/badger/v2 v2.0.3 github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568 github.com/getsentry/raven-go v0.2.0 diff --git a/go.sum b/go.sum index e476b7011..998b64c9c 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,8 @@ github.com/d4l3k/messagediff v1.2.1/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkE github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dchest/siphash v1.2.1 h1:4cLinnzVJDKxTCl9B01807Yiy+W7ZzVHj/KIroQRvT4= +github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4= github.com/dgraph-io/badger v1.6.1 h1:w9pSFNSdq/JPM1N12Fz/F/bzo993Is1W+Q7HjPzi7yg= github.com/dgraph-io/badger/v2 v2.0.3 h1:inzdf6VF/NZ+tJ8RwwYMjJMvsOALTHYdozn0qSl6XJI= github.com/dgraph-io/badger/v2 v2.0.3/go.mod h1:3KY8+bsP8wI0OEnQJAKpd4wIJW/Mm32yw2j/9FUVnIM= diff --git a/lib/db/lowlevel.go b/lib/db/lowlevel.go index 4b11165e7..838ed22a8 100644 --- a/lib/db/lowlevel.go +++ b/lib/db/lowlevel.go @@ -10,11 +10,14 @@ import ( "bytes" "context" "encoding/binary" + "io" "time" + "github.com/dchest/siphash" "github.com/greatroar/blobloom" "github.com/syncthing/syncthing/lib/db/backend" "github.com/syncthing/syncthing/lib/protocol" + "github.com/syncthing/syncthing/lib/rand" "github.com/syncthing/syncthing/lib/sha256" "github.com/syncthing/syncthing/lib/sync" "github.com/syncthing/syncthing/lib/util" @@ -679,10 +682,10 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error { return err } if len(hashes.BlocksHash) > 0 { - blockFilter.Add(bloomHash(hashes.BlocksHash)) + blockFilter.add(hashes.BlocksHash) } if len(hashes.VersionHash) > 0 { - versionFilter.Add(bloomHash(hashes.VersionHash)) + versionFilter.add(hashes.VersionHash) } } it.Release() @@ -707,7 +710,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error { } key := blockListKey(it.Key()) - if blockFilter.Has(bloomHash(key.Hash())) { + if blockFilter.has(key.Hash()) { matchedBlocks++ continue } @@ -736,7 +739,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error { } key := versionKey(it.Key()) - if versionFilter.Has(bloomHash(key.Hash())) { + if versionFilter.has(key.Hash()) { matchedVersions++ continue } @@ -762,21 +765,39 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error { return db.Compact() } -func newBloomFilter(capacity int) *blobloom.Filter { - return blobloom.NewOptimized(blobloom.Config{ - Capacity: uint64(capacity), - FPRate: indirectGCBloomFalsePositiveRate, - MaxBits: 8 * indirectGCBloomMaxBytes, - }) +func newBloomFilter(capacity int) bloomFilter { + var buf [16]byte + io.ReadFull(rand.Reader, buf[:]) + + return bloomFilter{ + f: blobloom.NewOptimized(blobloom.Config{ + Capacity: uint64(capacity), + FPRate: indirectGCBloomFalsePositiveRate, + MaxBits: 8 * indirectGCBloomMaxBytes, + }), + + k0: binary.LittleEndian.Uint64(buf[:8]), + k1: binary.LittleEndian.Uint64(buf[8:]), + } } -// Hash function for the bloomfilter: first eight bytes of the SHA-256. -// Big or little-endian makes no difference, as long as we're consistent. -func bloomHash(key []byte) uint64 { - if len(key) != sha256.Size { - panic("bug: bloomHash passed something not a SHA256 hash") +type bloomFilter struct { + f *blobloom.Filter + k0, k1 uint64 // Random key for SipHash. +} + +func (b *bloomFilter) add(id []byte) { b.f.Add(b.hash(id)) } +func (b *bloomFilter) has(id []byte) bool { return b.f.Has(b.hash(id)) } + +// Hash function for the bloomfilter: SipHash of the SHA-256. +// +// The randomization in SipHash means we get different collisions across +// runs and colliding keys are not kept indefinitely. +func (b *bloomFilter) hash(id []byte) uint64 { + if len(id) != sha256.Size { + panic("bug: bloomFilter.hash passed something not a SHA256 hash") } - return binary.BigEndian.Uint64(key) + return siphash.Hash(b.k0, b.k1, id) } // CheckRepair checks folder metadata and sequences for miscellaneous errors.