FindUsedBlobs: merge seen into blobs BlobSet

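The seen BlobSet only ever contained a subset of the entries in blobs: every tree handle inserted into seen was immediately inserted into blobs as well by the recursive FindUsedBlobs call. Use blobs for the already-visited check instead and avoid the memory overhead of the second set.

Suggested-by: Alexander Weiss <alex@weissfam.de>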
2024-11-22 21:05:10 +00:00 · 2020-02-01 21:09:52 +01:00 · 2020-02-01 21:09:52 +01:00 · 184103647a
commit 184103647a
parent 48f97f3567
4 changed files with 11 additions and 17 deletions
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@@ -189,14 +189,13 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
 	Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots)

 	usedBlobs := restic.NewBlobSet()
-	seenBlobs := restic.NewBlobSet()

 	bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots")
 	bar.Start()
 	for _, sn := range snapshots {
 		debug.Log("process snapshot %v", sn.ID())

-		err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs, seenBlobs)
+		err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs)
 		if err != nil {
 			if repo.Backend().IsNotExist(err) {
 				return errors.Fatal("unable to load a tree from the repo: " + err.Error())
--- a/cmd/restic/cmd_stats.go
+++ b/cmd/restic/cmd_stats.go
@@ -93,7 +93,6 @@ func runStats(gopts GlobalOptions, args []string) error {
 		uniqueInodes: make(map[uint64]struct{}),
 		fileBlobs:    make(map[string]restic.IDSet),
 		blobs:        restic.NewBlobSet(),
-		blobsSeen:    restic.NewBlobSet(),
 	}

 	if snapshotIDString != "" {
@@ -183,7 +182,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
 	if countMode == countModeRawData {
 		// count just the sizes of unique blobs; we don't need to walk the tree
 		// ourselves in this case, since a nifty function does it for us
-		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
+		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs)
 	}

 	err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats))
@@ -318,9 +317,9 @@ type statsContainer struct {
 	// blobs that have been seen as a part of the file
 	fileBlobs map[string]restic.IDSet

-	// blobs and blobsSeen are used to count individual
-	// unique blobs, independent of references to files
-	blobs, blobsSeen restic.BlobSet
+	// blobs is used to count individual unique blobs,
+	// independent of references to files
+	blobs restic.BlobSet
 }

 // fileID is a 256-bit hash that distinguishes unique files.
--- a/internal/restic/find.go
+++ b/internal/restic/find.go
@@ -3,9 +3,8 @@ package restic
 import "context"

 // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data
-// blobs) to the set blobs. The tree blobs in the `seen` BlobSet will not be visited
-// again.
-func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet, seen BlobSet) error {
+// blobs) to the set blobs. Already seen tree blobs will not be visited again.
+func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet) error {
 	blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob})

 	tree, err := repo.LoadTree(ctx, treeID)
@@ -22,13 +21,11 @@ func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSe
 		case "dir":
 			subtreeID := *node.Subtree
 			h := BlobHandle{ID: subtreeID, Type: TreeBlob}
-			if seen.Has(h) {
+			if blobs.Has(h) {
 				continue
 			}

-			seen.Insert(h)
-
-			err := FindUsedBlobs(ctx, repo, subtreeID, blobs, seen)
+			err := FindUsedBlobs(ctx, repo, subtreeID, blobs)
 			if err != nil {
 				return err
 			}
--- a/internal/restic/find_test.go
+++ b/internal/restic/find_test.go
@@ -93,7 +93,7 @@ func TestFindUsedBlobs(t *testing.T) {

 	for i, sn := range snapshots {
 		usedBlobs := restic.NewBlobSet()
-		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs, restic.NewBlobSet())
+		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs)
 		if err != nil {
 			t.Errorf("FindUsedBlobs returned error: %v", err)
 			continue
@@ -127,9 +127,8 @@ func BenchmarkFindUsedBlobs(b *testing.B) {
 	b.ResetTimer()

 	for i := 0; i < b.N; i++ {
-		seen := restic.NewBlobSet()
 		blobs := restic.NewBlobSet()
-		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs, seen)
+		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs)
 		if err != nil {
 			b.Error(err)
 		}