From d77a326bb0e7034bf53617cf41fd427ee05a8dfa Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Mon, 8 Jan 2018 20:54:53 +0100 Subject: [PATCH 1/2] Add benchmark for Index.Has() --- internal/repository/index_test.go | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/internal/repository/index_test.go b/internal/repository/index_test.go index f0ca4e19b..38c3f57bf 100644 --- a/internal/repository/index_test.go +++ b/internal/repository/index_test.go @@ -2,6 +2,7 @@ package repository_test import ( "bytes" + "math/rand" "testing" "github.com/restic/restic/internal/repository" @@ -379,3 +380,57 @@ func TestIndexPacks(t *testing.T) { idxPacks := idx.Packs() rtest.Assert(t, packs.Equals(idxPacks), "packs in index do not match packs added to index") } + +const maxPackSize = 16 * 1024 * 1024 + +func createRandomIndex() (idx *repository.Index, lookupID restic.ID) { + idx = repository.NewIndex() + + // create index with 200k pack files + for i := 0; i < 200000; i++ { + packID := restic.NewRandomID() + offset := 0 + for offset < maxPackSize { + size := 2000 + rand.Intn(4*1024*1024) + id := restic.NewRandomID() + idx.Store(restic.PackedBlob{ + PackID: packID, + Blob: restic.Blob{ + Type: restic.DataBlob, + ID: id, + Length: uint(size), + Offset: uint(offset), + }, + }) + + offset += size + + if rand.Float32() < 0.001 && lookupID.IsNull() { + lookupID = id + } + } + } + + return idx, lookupID +} + +func BenchmarkIndexHasUnknown(b *testing.B) { + idx, _ := createRandomIndex() + lookupID := restic.NewRandomID() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + idx.Has(lookupID, restic.DataBlob) + } +} + +func BenchmarkIndexHasKnown(b *testing.B) { + idx, lookupID := createRandomIndex() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + idx.Has(lookupID, restic.DataBlob) + } +} From 539599d1f108170911d330a5ff9f3a1163311637 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Mon, 8 Jan 2018 13:38:21 -0500 Subject: [PATCH 2/2] repository/index: Optimize index.Has() When backing up several million files (>14M tested here) with few changes, a large amount of time is spent failing to find an id in an index and creating an error to signify this. Since this is checked using the Has method, which doesn't use this error, this time creating the error is wasted. Instead, directly check if the given id and type are present in the index. This also avoids reporting all the packs containing this blob, further reducing cpu usage. --- internal/repository/index.go | 11 +++---- internal/repository/index_test.go | 49 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/internal/repository/index.go b/internal/repository/index.go index 70664fad0..b42ea6baf 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -169,12 +169,13 @@ func (idx *Index) ListPack(id restic.ID) (list []restic.PackedBlob) { // Has returns true iff the id is listed in the index. func (idx *Index) Has(id restic.ID, tpe restic.BlobType) bool { - _, err := idx.Lookup(id, tpe) - if err == nil { - return true - } + idx.m.Lock() + defer idx.m.Unlock() - return false + h := restic.BlobHandle{ID: id, Type: tpe} + + _, ok := idx.pack[h] + return ok } // LookupSize returns the length of the plaintext content of the blob with the diff --git a/internal/repository/index_test.go b/internal/repository/index_test.go index 38c3f57bf..f2e0746c0 100644 --- a/internal/repository/index_test.go +++ b/internal/repository/index_test.go @@ -434,3 +434,52 @@ func BenchmarkIndexHasKnown(b *testing.B) { idx.Has(lookupID, restic.DataBlob) } } + +func TestIndexHas(t *testing.T) { + type testEntry struct { + id restic.ID + pack restic.ID + tpe restic.BlobType + offset, length uint + } + tests := []testEntry{} + + idx := repository.NewIndex() + + // create 50 packs with 20 blobs each + for i := 0; i < 50; i++ { + packID := restic.NewRandomID() + + pos := uint(0) + for j := 0; j < 20; j++ { + id := restic.NewRandomID() + length := uint(i*100 + j) + idx.Store(restic.PackedBlob{ + Blob: restic.Blob{ + Type: restic.DataBlob, + ID: id, + Offset: pos, + Length: length, + }, + PackID: packID, + }) + + tests = append(tests, testEntry{ + id: id, + pack: packID, + tpe: restic.DataBlob, + offset: pos, + length: length, + }) + + pos += length + } + } + + for _, testBlob := range tests { + rtest.Assert(t, idx.Has(testBlob.id, testBlob.tpe), "Index reports not having data blob added to it") + } + + rtest.Assert(t, !idx.Has(restic.NewRandomID(), restic.DataBlob), "Index reports having a data blob not added to it") + rtest.Assert(t, !idx.Has(tests[0].id, restic.TreeBlob), "Index reports having a tree blob added to it with the same id as a data blob") +}