prune: Use a single CountedBlobSet to track blobs

The set covers necessary, existing and duplicate blobs. This removes the duplicate sets used to track whether all necessary blobs also exist. This reduces the memory usage of prune by about 20-30%.
2024-11-29 08:14:03 +00:00 · 2022-08-28 12:17:20 +02:00 · 2022-08-28 12:17:20 +02:00 · c4fc5c97f9
commit c4fc5c97f9
parent b21241ec1c
4 changed files with 90 additions and 51 deletions
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@@ -233,11 +233,11 @@ type pruneStats struct {
 }

 type prunePlan struct {
-	removePacksFirst restic.IDSet   // packs to remove first (unreferenced packs)
-	repackPacks      restic.IDSet   // packs to repack
-	keepBlobs        restic.BlobSet // blobs to keep during repacking
-	removePacks      restic.IDSet   // packs to remove
-	ignorePacks      restic.IDSet   // packs to ignore when rebuilding the index
+	removePacksFirst restic.IDSet          // packs to remove first (unreferenced packs)
+	repackPacks      restic.IDSet          // packs to repack
+	keepBlobs        restic.CountedBlobSet // blobs to keep during repacking
+	removePacks      restic.IDSet          // packs to remove
+	ignorePacks      restic.IDSet          // packs to ignore when rebuilding the index
 }

 type packInfo struct {
@@ -294,46 +294,53 @@ func planPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo
 	return plan, stats, nil
 }

-func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.BlobSet, stats *pruneStats) (restic.BlobSet, map[restic.ID]packInfo, error) {
-	keepBlobs := restic.NewBlobSet()
-	duplicateBlobs := make(map[restic.BlobHandle]uint8)
-
+func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *pruneStats) (restic.CountedBlobSet, map[restic.ID]packInfo, error) {
 	// iterate over all blobs in index to find out which blobs are duplicates
+	// The counter in usedBlobs describes how many instances of the blob exist in the repository index
+	// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
 	idx.Each(ctx, func(blob restic.PackedBlob) {
 		bh := blob.BlobHandle
 		size := uint64(blob.Length)
-		switch {
-		case usedBlobs.Has(bh): // used blob, move to keepBlobs
-			usedBlobs.Delete(bh)
-			keepBlobs.Insert(bh)
-			stats.size.used += size
-			stats.blobs.used++
-		case keepBlobs.Has(bh): // duplicate blob
-			count, ok := duplicateBlobs[bh]
-			if !ok {
-				count = 2 // this one is already the second blob!
-			} else if count < math.MaxUint8 {
+		count, ok := usedBlobs[bh]
+		if ok {
+			if count < math.MaxUint8 {
 				// don't overflow, but saturate count at 255
 				// this can lead to a non-optimal pack selection, but won't cause
 				// problems otherwise
 				count++
 			}
-			duplicateBlobs[bh] = count
-			stats.size.duplicate += size
-			stats.blobs.duplicate++
-		default:
+
+			if count == 1 {
+				stats.size.used += size
+				stats.blobs.used++
+			} else {
+				// duplicate if counted more than once
+				stats.size.duplicate += size
+				stats.blobs.duplicate++
+			}
+
+			usedBlobs[bh] = count
+		} else {
 			stats.size.unused += size
 			stats.blobs.unused++
 		}
 	})

 	// Check if all used blobs have been found in index
-	if len(usedBlobs) != 0 {
+	missingBlobs := restic.NewBlobSet()
+	for bh, count := range usedBlobs {
+		if count == 0 {
+			// blob does not exist in any pack files
+			missingBlobs.Insert(bh)
+		}
+	}
+
+	if len(missingBlobs) != 0 {
 		Warnf("%v not found in the index\n\n"+
 			"Integrity check failed: Data seems to be missing.\n"+
 			"Will not start prune to prevent (additional) data loss!\n"+
 			"Please report this error (along with the output of the 'prune' run) at\n"+
-			"https://github.com/restic/restic/issues/new/choose\n", usedBlobs)
+			"https://github.com/restic/restic/issues/new/choose\n", missingBlobs)
 		return nil, nil, errorIndexIncomplete
 	}

@@ -345,6 +352,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
 		indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
 	}

+	hasDuplicates := false
 	// iterate over all blobs in index to generate packInfo
 	idx.Each(ctx, func(blob restic.PackedBlob) {
 		ip := indexPack[blob.PackID]
@@ -361,10 +369,14 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re

 		bh := blob.BlobHandle
 		size := uint64(blob.Length)
-		_, isDuplicate := duplicateBlobs[bh]
+		dupCount := usedBlobs[bh]
 		switch {
-		case isDuplicate: // duplicate blobs will be handled later
-		case keepBlobs.Has(bh): // used blob, not duplicate
+		case dupCount >= 2:
+			hasDuplicates = true
+			// mark as unused for now, we will later on select one copy
+			ip.unusedSize += size
+			ip.unusedBlobs++
+		case dupCount == 1: // used blob, not duplicate
 			ip.usedSize += size
 			ip.usedBlobs++
 		default: // unused blob
@@ -382,40 +394,52 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
 	// - mark only one occurence of duplicate blobs as used
 	// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
 	// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
-	if len(duplicateBlobs) > 0 {
+	if hasDuplicates {
 		// iterate again over all blobs in index (this is pretty cheap, all in-mem)
 		idx.Each(ctx, func(blob restic.PackedBlob) {
 			bh := blob.BlobHandle
-			count, isDuplicate := duplicateBlobs[bh]
-			if !isDuplicate {
+			count, ok := usedBlobs[bh]
+			// skip non-duplicate, aka. normal blobs
+			// count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining
+			if !ok || count == 1 {
 				return
 			}

 			ip := indexPack[blob.PackID]
 			size := uint64(blob.Length)
 			switch {
-			case count == 0:
-				// used duplicate exists ->  mark as unused
-				ip.unusedSize += size
-				ip.unusedBlobs++
-			case ip.usedBlobs > 0, count == 1:
-				// other used blobs in pack or "last" occurency ->  mark as used
+			case ip.usedBlobs > 0, count == 0:
+				// other used blobs in pack or "last" occurrence ->  transition to used
 				ip.usedSize += size
 				ip.usedBlobs++
-				// let other occurences be marked as unused
-				duplicateBlobs[bh] = 0
+				ip.unusedSize -= size
+				ip.unusedBlobs--
+				// let other occurrences remain marked as unused
+				usedBlobs[bh] = 1
 			default:
-				// mark as unused and decrease counter
-				ip.unusedSize += size
-				ip.unusedBlobs++
-				duplicateBlobs[bh] = count - 1
+				// remain unused and decrease counter
+				count--
+				if count == 1 {
+					// setting count to 1 would lead to forgetting that this blob had duplicates
+					// thus use the special value zero. This will select the last instance of the blob for keeping.
+					count = 0
+				}
+				usedBlobs[bh] = count
 			}
 			// update indexPack
 			indexPack[blob.PackID] = ip
 		})
 	}

-	return keepBlobs, indexPack, nil
+	// Sanity check. If no duplicates exist, all blobs have value 1. After handling
+	// duplicates, this also applies to duplicates.
+	for _, count := range usedBlobs {
+		if count != 1 {
+			panic("internal error during blob selection")
+		}
+	}
+
+	return usedBlobs, indexPack, nil
 }

 func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *pruneStats) (prunePlan, error) {
@@ -747,7 +771,7 @@ func rebuildIndexFiles(ctx context.Context, gopts GlobalOptions, repo restic.Rep
 	return DeleteFilesChecked(ctx, gopts, repo, obsoleteIndexes, restic.IndexFile)
 }

-func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Repository, ignoreSnapshots restic.IDSet) (usedBlobs restic.BlobSet, err error) {
+func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Repository, ignoreSnapshots restic.IDSet) (usedBlobs restic.CountedBlobSet, err error) {
 	var snapshotTrees restic.IDs
 	Verbosef("loading all snapshots...\n")
 	err = restic.ForAllSnapshots(ctx, repo.Backend(), repo, ignoreSnapshots,
@@ -766,7 +790,7 @@ func getUsedBlobs(ctx context.Context, gopts GlobalOptions, repo restic.Reposito

 	Verbosef("finding data that is still in use for %d snapshots\n", len(snapshotTrees))

-	usedBlobs = restic.NewBlobSet()
+	usedBlobs = restic.NewCountedBlobSet()

 	bar := newProgressMax(!gopts.Quiet, uint64(len(snapshotTrees)), "snapshots")
 	defer bar.Done()
--- a/internal/repository/repack.go
+++ b/internal/repository/repack.go
@@ -12,6 +12,12 @@ import (
 	"golang.org/x/sync/errgroup"
 )

+type repackBlobSet interface {
+	Has(bh restic.BlobHandle) bool
+	Delete(bh restic.BlobHandle)
+	Len() int
+}
+
 // Repack takes a list of packs together with a list of blobs contained in
 // these packs. Each pack is loaded and the blobs listed in keepBlobs is saved
 // into a new pack. Returned is the list of obsolete packs which can then
@@ -19,8 +25,8 @@ import (
 //
 // The map keepBlobs is modified by Repack, it is used to keep track of which
 // blobs have been processed.
-func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) {
-	debug.Log("repacking %d packs while keeping %d blobs", len(packs), len(keepBlobs))
+func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs repackBlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) {
+	debug.Log("repacking %d packs while keeping %d blobs", len(packs), keepBlobs.Len())

 	if repo == dstRepo && dstRepo.Connections() < 2 {
 		return nil, errors.Fatal("repack step requires a backend connection limit of at least two")
@@ -41,7 +47,7 @@ func Repack(ctx context.Context, repo restic.Repository, dstRepo restic.Reposito
 	return obsoletePacks, nil
 }

-func repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) {
+func repack(ctx context.Context, repo restic.Repository, dstRepo restic.Repository, packs restic.IDSet, keepBlobs repackBlobSet, p *progress.Counter) (obsoletePacks restic.IDSet, err error) {
 	wg, wgCtx := errgroup.WithContext(ctx)

 	var keepMutex sync.Mutex
--- a/internal/restic/blob_set.go
+++ b/internal/restic/blob_set.go
@@ -31,6 +31,10 @@ func (s BlobSet) Delete(h BlobHandle) {
 	delete(s, h)
 }

+func (s BlobSet) Len() int {
+	return len(s)
+}
+
 // Equals returns true iff s equals other.
 func (s BlobSet) Equals(other BlobSet) bool {
 	if len(s) != len(other) {
--- a/internal/restic/find.go
+++ b/internal/restic/find.go
@@ -15,9 +15,14 @@ type Loader interface {
 	Connections() uint
 }

+type findBlobSet interface {
+	Has(bh BlobHandle) bool
+	Insert(bh BlobHandle)
+}
+
 // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data
 // blobs) to the set blobs. Already seen tree blobs will not be visited again.
-func FindUsedBlobs(ctx context.Context, repo Loader, treeIDs IDs, blobs BlobSet, p *progress.Counter) error {
+func FindUsedBlobs(ctx context.Context, repo Loader, treeIDs IDs, blobs findBlobSet, p *progress.Counter) error {
 	var lock sync.Mutex

 	wg, ctx := errgroup.WithContext(ctx)