mirror of
https://github.com/octoleo/syncthing.git
synced 2025-01-03 15:17:25 +00:00
lib/db: Use SipHash to deal with hash collision in GC (#6826)
If the GC finds a key k that it wants to keep, it records that in a Bloom filter. If a key k' can be removed but its hash collides with k, it will be kept. Since the old Bloom filter code was completely deterministic, the next run would encounter the same collision, assuming k must still be kept. A randomized hash function that uses all the SHA-256 bits solves this problem: the second run has a non-zero probability of removing k', as long as the Bloom filter is not completely full.
This commit is contained in:
parent
bbda58a29f
commit
9f92f8c609
1
go.mod
1
go.mod
@ -11,6 +11,7 @@ require (
|
||||
github.com/certifi/gocertifi v0.0.0-20190905060710-a5e0173ced67 // indirect
|
||||
github.com/chmduquesne/rollinghash v0.0.0-20180912150627-a60f8e7142b5
|
||||
github.com/d4l3k/messagediff v1.2.1
|
||||
github.com/dchest/siphash v1.2.1
|
||||
github.com/dgraph-io/badger/v2 v2.0.3
|
||||
github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568
|
||||
github.com/getsentry/raven-go v0.2.0
|
||||
|
2
go.sum
2
go.sum
@ -70,6 +70,8 @@ github.com/d4l3k/messagediff v1.2.1/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkE
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dchest/siphash v1.2.1 h1:4cLinnzVJDKxTCl9B01807Yiy+W7ZzVHj/KIroQRvT4=
|
||||
github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
|
||||
github.com/dgraph-io/badger v1.6.1 h1:w9pSFNSdq/JPM1N12Fz/F/bzo993Is1W+Q7HjPzi7yg=
|
||||
github.com/dgraph-io/badger/v2 v2.0.3 h1:inzdf6VF/NZ+tJ8RwwYMjJMvsOALTHYdozn0qSl6XJI=
|
||||
github.com/dgraph-io/badger/v2 v2.0.3/go.mod h1:3KY8+bsP8wI0OEnQJAKpd4wIJW/Mm32yw2j/9FUVnIM=
|
||||
|
@ -10,11 +10,14 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/dchest/siphash"
|
||||
"github.com/greatroar/blobloom"
|
||||
"github.com/syncthing/syncthing/lib/db/backend"
|
||||
"github.com/syncthing/syncthing/lib/protocol"
|
||||
"github.com/syncthing/syncthing/lib/rand"
|
||||
"github.com/syncthing/syncthing/lib/sha256"
|
||||
"github.com/syncthing/syncthing/lib/sync"
|
||||
"github.com/syncthing/syncthing/lib/util"
|
||||
@ -679,10 +682,10 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
|
||||
return err
|
||||
}
|
||||
if len(hashes.BlocksHash) > 0 {
|
||||
blockFilter.Add(bloomHash(hashes.BlocksHash))
|
||||
blockFilter.add(hashes.BlocksHash)
|
||||
}
|
||||
if len(hashes.VersionHash) > 0 {
|
||||
versionFilter.Add(bloomHash(hashes.VersionHash))
|
||||
versionFilter.add(hashes.VersionHash)
|
||||
}
|
||||
}
|
||||
it.Release()
|
||||
@ -707,7 +710,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
|
||||
}
|
||||
|
||||
key := blockListKey(it.Key())
|
||||
if blockFilter.Has(bloomHash(key.Hash())) {
|
||||
if blockFilter.has(key.Hash()) {
|
||||
matchedBlocks++
|
||||
continue
|
||||
}
|
||||
@ -736,7 +739,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
|
||||
}
|
||||
|
||||
key := versionKey(it.Key())
|
||||
if versionFilter.Has(bloomHash(key.Hash())) {
|
||||
if versionFilter.has(key.Hash()) {
|
||||
matchedVersions++
|
||||
continue
|
||||
}
|
||||
@ -762,21 +765,39 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
|
||||
return db.Compact()
|
||||
}
|
||||
|
||||
func newBloomFilter(capacity int) *blobloom.Filter {
|
||||
return blobloom.NewOptimized(blobloom.Config{
|
||||
Capacity: uint64(capacity),
|
||||
FPRate: indirectGCBloomFalsePositiveRate,
|
||||
MaxBits: 8 * indirectGCBloomMaxBytes,
|
||||
})
|
||||
func newBloomFilter(capacity int) bloomFilter {
|
||||
var buf [16]byte
|
||||
io.ReadFull(rand.Reader, buf[:])
|
||||
|
||||
return bloomFilter{
|
||||
f: blobloom.NewOptimized(blobloom.Config{
|
||||
Capacity: uint64(capacity),
|
||||
FPRate: indirectGCBloomFalsePositiveRate,
|
||||
MaxBits: 8 * indirectGCBloomMaxBytes,
|
||||
}),
|
||||
|
||||
k0: binary.LittleEndian.Uint64(buf[:8]),
|
||||
k1: binary.LittleEndian.Uint64(buf[8:]),
|
||||
}
|
||||
}
|
||||
|
||||
// Hash function for the bloomfilter: first eight bytes of the SHA-256.
|
||||
// Big or little-endian makes no difference, as long as we're consistent.
|
||||
func bloomHash(key []byte) uint64 {
|
||||
if len(key) != sha256.Size {
|
||||
panic("bug: bloomHash passed something not a SHA256 hash")
|
||||
type bloomFilter struct {
|
||||
f *blobloom.Filter
|
||||
k0, k1 uint64 // Random key for SipHash.
|
||||
}
|
||||
|
||||
func (b *bloomFilter) add(id []byte) { b.f.Add(b.hash(id)) }
|
||||
func (b *bloomFilter) has(id []byte) bool { return b.f.Has(b.hash(id)) }
|
||||
|
||||
// Hash function for the bloomfilter: SipHash of the SHA-256.
|
||||
//
|
||||
// The randomization in SipHash means we get different collisions across
|
||||
// runs and colliding keys are not kept indefinitely.
|
||||
func (b *bloomFilter) hash(id []byte) uint64 {
|
||||
if len(id) != sha256.Size {
|
||||
panic("bug: bloomFilter.hash passed something not a SHA256 hash")
|
||||
}
|
||||
return binary.BigEndian.Uint64(key)
|
||||
return siphash.Hash(b.k0, b.k1, id)
|
||||
}
|
||||
|
||||
// CheckRepair checks folder metadata and sequences for miscellaneous errors.
|
||||
|
Loading…
Reference in New Issue
Block a user