From b21241ec1ceca4221d22593cab59ecc535540fa9 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 Aug 2022 12:09:28 +0200 Subject: [PATCH 1/5] restic: Add CountedBlobSet type This allows maintaining a usage counter for each blob. --- internal/restic/counted_blob_set.go | 59 ++++++++++++++++++++++++ internal/restic/counted_blob_set_test.go | 37 +++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 internal/restic/counted_blob_set.go create mode 100644 internal/restic/counted_blob_set_test.go diff --git a/internal/restic/counted_blob_set.go b/internal/restic/counted_blob_set.go new file mode 100644 index 000000000..29644215b --- /dev/null +++ b/internal/restic/counted_blob_set.go @@ -0,0 +1,59 @@ +package restic + +import "sort" + +// CountedBlobSet is a set of blobs. For each blob it also stores a uint8 value +// which can be used to track some information. The CountedBlobSet does not use +// that value in any way. New entries are created with value 0. +type CountedBlobSet map[BlobHandle]uint8 + +// NewCountedBlobSet returns a new CountedBlobSet, populated with ids. +func NewCountedBlobSet(handles ...BlobHandle) CountedBlobSet { + m := make(CountedBlobSet) + for _, h := range handles { + m[h] = 0 + } + + return m +} + +// Has returns true iff id is contained in the set. +func (s CountedBlobSet) Has(h BlobHandle) bool { + _, ok := s[h] + return ok +} + +// Insert adds id to the set. +func (s CountedBlobSet) Insert(h BlobHandle) { + s[h] = 0 +} + +// Delete removes id from the set. +func (s CountedBlobSet) Delete(h BlobHandle) { + delete(s, h) +} + +func (s CountedBlobSet) Len() int { + return len(s) +} + +// List returns a sorted slice of all BlobHandle in the set. +func (s CountedBlobSet) List() BlobHandles { + list := make(BlobHandles, 0, len(s)) + for h := range s { + list = append(list, h) + } + + sort.Sort(list) + + return list +} + +func (s CountedBlobSet) String() string { + str := s.List().String() + if len(str) < 2 { + return "{}" + } + + return "{" + str[1:len(str)-1] + "}" +} diff --git a/internal/restic/counted_blob_set_test.go b/internal/restic/counted_blob_set_test.go new file mode 100644 index 000000000..c5c321503 --- /dev/null +++ b/internal/restic/counted_blob_set_test.go @@ -0,0 +1,37 @@ +package restic_test + +import ( + "testing" + + "github.com/restic/restic/internal/restic" + "github.com/restic/restic/internal/test" +) + +func TestCountedBlobSet(t *testing.T) { + bs := restic.NewCountedBlobSet() + test.Equals(t, bs.Len(), 0) + test.Equals(t, bs.List(), restic.BlobHandles{}) + + bh := restic.NewRandomBlobHandle() + // check non existant + test.Equals(t, bs.Has(bh), false) + + // test insert + bs.Insert(bh) + test.Equals(t, bs.Has(bh), true) + test.Equals(t, bs.Len(), 1) + test.Equals(t, bs.List(), restic.BlobHandles{bh}) + + // test remove + bs.Delete(bh) + test.Equals(t, bs.Len(), 0) + test.Equals(t, bs.Has(bh), false) + test.Equals(t, bs.List(), restic.BlobHandles{}) + + bs = restic.NewCountedBlobSet(bh) + test.Equals(t, bs.Len(), 1) + test.Equals(t, bs.List(), restic.BlobHandles{bh}) + + s := bs.String() + test.Assert(t, len(s) > 10, "invalid string: %v", s) +} From c4fc5c97f90db67e81faddf160573068ca4e371b Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 Aug 2022 12:17:20 +0200 Subject: [PATCH 2/5] prune: Use a single CountedBlobSet to track blobs The set covers necessary, existing and duplicate blobs. This removes the duplicate sets used to track whether all necessary blobs also exist. This reduces the memory usage of prune by about 20-30%. --- cmd/restic/cmd_prune.go | 118 ++++++++++++++++++++-------------- internal/repository/repack.go | 12 +++- internal/restic/blob_set.go | 4 ++ internal/restic/find.go | 7 +- 4 files changed, 90 insertions(+), 51 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 63e46e31a..cfd22b1cd 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -233,11 +233,11 @@ type pruneStats struct { } type prunePlan struct { - removePacksFirst restic.IDSet // packs to remove first (unreferenced packs) - repackPacks restic.IDSet // packs to repack - keepBlobs restic.BlobSet // blobs to keep during repacking - removePacks restic.IDSet // packs to remove - ignorePacks restic.IDSet // packs to ignore when rebuilding the index + removePacksFirst restic.IDSet // packs to remove first (unreferenced packs) + repackPacks restic.IDSet // packs to repack + keepBlobs restic.CountedBlobSet // blobs to keep during repacking + removePacks restic.IDSet // packs to remove + ignorePacks restic.IDSet // packs to ignore when rebuilding the index } type packInfo struct { @@ -294,46 +294,53 @@ func planPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo return plan, stats, nil } -func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.BlobSet, stats *pruneStats) (restic.BlobSet, map[restic.ID]packInfo, error) { - keepBlobs := restic.NewBlobSet() - duplicateBlobs := make(map[restic.BlobHandle]uint8) - +func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *pruneStats) (restic.CountedBlobSet, map[restic.ID]packInfo, error) { // iterate over all blobs in index to find out which blobs are duplicates + // The counter in usedBlobs describes how many instances of the blob exist in the repository index + // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist idx.Each(ctx, func(blob restic.PackedBlob) { bh := blob.BlobHandle size := uint64(blob.Length) - switch { - case usedBlobs.Has(bh): // used blob, move to keepBlobs - usedBlobs.Delete(bh) - keepBlobs.Insert(bh) - stats.size.used += size - stats.blobs.used++ - case keepBlobs.Has(bh): // duplicate blob - count, ok := duplicateBlobs[bh] - if !ok { - count = 2 // this one is already the second blob! - } else if count < math.MaxUint8 { + count, ok := usedBlobs[bh] + if ok { + if count < math.MaxUint8 { // don't overflow, but saturate count at 255 // this can lead to a non-optimal pack selection, but won't cause // problems otherwise count++ } - duplicateBlobs[bh] = count - stats.size.duplicate += size - stats.blobs.duplicate++ - default: + + if count == 1 { + stats.size.used += size + stats.blobs.used++ + } else { + // duplicate if counted more than once + stats.size.duplicate += size + stats.blobs.duplicate++ + } + + usedBlobs[bh] = count + } else { stats.size.unused += size stats.blobs.unused++ } }) // Check if all used blobs have been found in index - if len(usedBlobs) != 0 { + missingBlobs := restic.NewBlobSet() + for bh, count := range usedBlobs { + if count == 0 { + // blob does not exist in any pack files + missingBlobs.Insert(bh) + } + } + + if len(missingBlobs) != 0 { Warnf("%v not found in the index\n\n"+ "Integrity check failed: Data seems to be missing.\n"+ "Will not start prune to prevent (additional) data loss!\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose\n", usedBlobs) + "https://github.com/restic/restic/issues/new/choose\n", missingBlobs) return nil, nil, errorIndexIncomplete } @@ -345,6 +352,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)} } + hasDuplicates := false // iterate over all blobs in index to generate packInfo idx.Each(ctx, func(blob restic.PackedBlob) { ip := indexPack[blob.PackID] @@ -361,10 +369,14 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re bh := blob.BlobHandle size := uint64(blob.Length) - _, isDuplicate := duplicateBlobs[bh] + dupCount := usedBlobs[bh] switch { - case isDuplicate: // duplicate blobs will be handled later - case keepBlobs.Has(bh): // used blob, not duplicate + case dupCount >= 2: + hasDuplicates = true + // mark as unused for now, we will later on select one copy + ip.unusedSize += size + ip.unusedBlobs++ + case dupCount == 1: // used blob, not duplicate ip.usedSize += size ip.usedBlobs++ default: // unused blob @@ -382,40 +394,52 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re // - mark only one occurence of duplicate blobs as used // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" // - if there are no used blobs in a pack, possibly mark duplicates as "unused" - if len(duplicateBlobs) > 0 { + if hasDuplicates { // iterate again over all blobs in index (this is pretty cheap, all in-mem) idx.Each(ctx, func(blob restic.PackedBlob) { bh := blob.BlobHandle - count, isDuplicate := duplicateBlobs[bh] - if !isDuplicate { + count, ok := usedBlobs[bh] + // skip non-duplicate, aka. normal blobs + // count == 0 is used to mark that this was a duplicate blob with only a single occurence remaining + if !ok || count == 1 { return } ip := indexPack[blob.PackID] size := uint64(blob.Length) switch { - case count == 0: - // used duplicate exists -> mark as unused - ip.unusedSize += size - ip.unusedBlobs++ - case ip.usedBlobs > 0, count == 1: - // other used blobs in pack or "last" occurency -> mark as used + case ip.usedBlobs > 0, count == 0: + // other used blobs in pack or "last" occurence -> transition to used ip.usedSize += size ip.usedBlobs++ - // let other occurences be marked as unused - duplicateBlobs[bh] = 0 + ip.unusedSize -= size + ip.unusedBlobs-- + // let other occurences remain marked as unused + usedBlobs[bh] = 1 default: - // mark as unused and decrease counter - ip.unusedSize += size - ip.unusedBlobs++ - duplicateBlobs[bh] = count - 1 + // remain unused and decrease counter + count-- + if count == 1 { + // setting count to 1 would lead to forgetting that this blob had duplicates + // thus use the special value zero. This will select the last instance of the blob for keeping. + count = 0 + } + usedBlobs[bh] = count } // update indexPack indexPack[blob.PackID] = ip }) } - return keepBlobs, indexPack, nil + // Sanity check. If no duplicates exist, all blobs have value 1. After handling + // duplicates, this also applies to duplicates. + for _, count := range usedBlobs { + if count != 1 { + panic("internal error during blob selection") + } + } + + return usedBlobs, indexPack, nil } func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *pruneStats) (prunePlan, error) { @@ -747,7 +771,7 @@ func rebuildIndexFiles(ctx context.Context, gopts GlobalOptions, repo restic.Rep return DeleteFilesChecked(ctx, gopts, repo, obsoleteIndexes, restic.IndexFile) } -func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Repository, ignoreSnapshots restic.IDSet) (usedBlobs restic.BlobSet, err error) { +func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Repository, ignoreSnapshots restic.IDSet) (usedBlobs restic.CountedBlobSet, err error) { var snapshotTrees restic.IDs Verbosef("loading all snapshots...\n") err = restic.ForAllSnapshots(ctx, repo.Backend(), repo, ignoreSnapshots, @@ -766,7 +790,7 @@ func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Reposito Verbosef("finding data that is still in use for %d snapshots\n", len(snapshotTrees)) - usedBlobs = restic.NewBlobSet() + usedBlobs = restic.NewCountedBlobSet() bar := newProgressMax(!gopts.Quiet, uint64(len(snapshotTrees)), "snapshots") defer bar.Done() diff --git a/internal/repository/repack.go b/internal/repository/repack.go index 587721fc7..6adff69f4 100644 --- a/internal/repository/repack.go +++ b/internal/repository/repack.go @@ -12,6 +12,12 @@ import ( "golang.org/x/sync/errgroup" ) +type repackBlobSet interface { + Has(bh restic.BlobHandle) bool + Delete(bh restic.BlobHandle) + Len() int +} + // Repack takes a list of packs together with a list of blobs contained in // these packs. Each pack is loaded and the blobs listed in keepBlobs is saved // into a new pack. Returned is the list of obsolete packs which can then @@ -19,8 +25,8 @@ import ( // // The map keepBlobs is modified by Repack, it is used to keep track of which // blobs have been processed. -func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) { - debug.Log("repacking %d packs while keeping %d blobs", len(packs), len(keepBlobs)) +func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs repackBlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) { + debug.Log("repacking %d packs while keeping %d blobs", len(packs), keepBlobs.Len()) if repo == dstRepo && dstRepo.Connections() < 2 { return nil, errors.Fatal("repack step requires a backend connection limit of at least two") @@ -41,7 +47,7 @@ func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Reposito return obsoletePacks, nil } -func repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) { +func repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs repackBlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) { wg, wgCtx := errgroup.WithContext(ctx) var keepMutex sync.Mutex diff --git a/internal/restic/blob_set.go b/internal/restic/blob_set.go index 07e88fed0..acacd57d4 100644 --- a/internal/restic/blob_set.go +++ b/internal/restic/blob_set.go @@ -31,6 +31,10 @@ func (s BlobSet) Delete(h BlobHandle) { delete(s, h) } +func (s BlobSet) Len() int { + return len(s) +} + // Equals returns true iff s equals other. func (s BlobSet) Equals(other BlobSet) bool { if len(s) != len(other) { diff --git a/internal/restic/find.go b/internal/restic/find.go index 6544f2b3d..08670a49f 100644 --- a/internal/restic/find.go +++ b/internal/restic/find.go @@ -15,9 +15,14 @@ type Loader interface { Connections() uint } +type findBlobSet interface { + Has(bh BlobHandle) bool + Insert(bh BlobHandle) +} + // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data // blobs) to the set blobs. Already seen tree blobs will not be visited again. -func FindUsedBlobs(ctx context.Context, repo Loader, treeIDs IDs, blobs BlobSet, p *progress.Counter) error { +func FindUsedBlobs(ctx context.Context, repo Loader, treeIDs IDs, blobs findBlobSet, p *progress.Counter) error { var lock sync.Mutex wg, ctx := errgroup.WithContext(ctx) From 68c9cb9c6a8893930ed8e4a124416a08bfcfe037 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 Aug 2022 12:24:30 +0200 Subject: [PATCH 3/5] prune: Shrink keepBlobs set if possible As long as only a small fraction of the data in a repository is rewritten, the keepBlobs set will be rather small after cleaning it up. As golang maps do not shrink their memory usage, just copy the contents over to a new map. However, only copy the map if the cleanup removed at least half the entries. --- cmd/restic/cmd_prune.go | 6 ++++++ internal/restic/counted_blob_set.go | 9 +++++++++ internal/restic/counted_blob_set_test.go | 8 ++++++++ 3 files changed, 23 insertions(+) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index cfd22b1cd..46d62460b 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -277,6 +277,7 @@ func planPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo } if len(plan.repackPacks) != 0 { + blobCount := keepBlobs.Len() // when repacking, we do not want to keep blobs which are // already contained in kept packs, so delete them from keepBlobs repo.Index().Each(ctx, func(blob restic.PackedBlob) { @@ -285,6 +286,11 @@ func planPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo } keepBlobs.Delete(blob.BlobHandle) }) + + if keepBlobs.Len() < blobCount/2 { + // replace with copy to shrink map to necessary size if there's a chance to benefit + keepBlobs = keepBlobs.Copy() + } } else { // keepBlobs is only needed if packs are repacked keepBlobs = nil diff --git a/internal/restic/counted_blob_set.go b/internal/restic/counted_blob_set.go index 29644215b..f965d3129 100644 --- a/internal/restic/counted_blob_set.go +++ b/internal/restic/counted_blob_set.go @@ -57,3 +57,12 @@ func (s CountedBlobSet) String() string { return "{" + str[1:len(str)-1] + "}" } + +// Copy returns a copy of the CountedBlobSet. +func (s CountedBlobSet) Copy() CountedBlobSet { + cp := make(CountedBlobSet, len(s)) + for k, v := range s { + cp[k] = v + } + return cp +} diff --git a/internal/restic/counted_blob_set_test.go b/internal/restic/counted_blob_set_test.go index c5c321503..681751e91 100644 --- a/internal/restic/counted_blob_set_test.go +++ b/internal/restic/counted_blob_set_test.go @@ -35,3 +35,11 @@ func TestCountedBlobSet(t *testing.T) { s := bs.String() test.Assert(t, len(s) > 10, "invalid string: %v", s) } + +func TestCountedBlobSetCopy(t *testing.T) { + bs := restic.NewCountedBlobSet(restic.NewRandomBlobHandle(), restic.NewRandomBlobHandle(), restic.NewRandomBlobHandle()) + test.Equals(t, bs.Len(), 3) + cp := bs.Copy() + test.Equals(t, cp.Len(), 3) + test.Equals(t, bs.List(), cp.List()) +} From 1e2794fa55e87a7112bffb93cfc58d641cab000c Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 Aug 2022 12:41:22 +0200 Subject: [PATCH 4/5] add prune memory optimization changelog --- changelog/unreleased/pull-3899 | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelog/unreleased/pull-3899 diff --git a/changelog/unreleased/pull-3899 b/changelog/unreleased/pull-3899 new file mode 100644 index 000000000..ed26ca591 --- /dev/null +++ b/changelog/unreleased/pull-3899 @@ -0,0 +1,5 @@ +Enhancement: Optimize prune memory usage + +The prune command requires large amounts of memory to determine what to keep and what to remove. That step has been optimized to use up to 30% less memory. + +https://github.com/restic/restic/pull/3899 From d966c527077005c17571f9778d77fdc876a6d397 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Oct 2022 18:10:17 +0200 Subject: [PATCH 5/5] prune: allow gc of set of repacked blobs before index rebuild --- cmd/restic/cmd_prune.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 46d62460b..7918b2294 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -720,6 +720,9 @@ func doPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo r "https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs) return errors.Fatal("internal error: blobs were not repacked") } + + // allow GC of the blob set + plan.keepBlobs = nil } if len(plan.ignorePacks) == 0 {