Implement four counting modes

Matthew Holt 2018-04-22 15:27:33 -06:00 committed by Alexander Neumann
parent 925b542eb0
commit a7b95d716a
1 changed file with 215 additions and 36 deletions


@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"path/filepath"
 
 	"github.com/restic/restic/internal/restic"
 	"github.com/spf13/cobra"
@@ -15,9 +16,10 @@ var cmdStats = &cobra.Command{
 	Use:   "stats",
 	Short: "Scan the repository and show basic statistics",
 	Long: `
-The "stats" command walks all snapshots in a repository and accumulates
-statistics about the data stored therein. It reports on the number of
-unique files and their sizes.
+The "stats" command walks one or all snapshots in a repository and
+accumulates statistics about the data stored therein. It reports on
+the number of unique files and their sizes, according to one of
+the counting modes as given by a flag.
 `,
 	DisableAutoGenTag: true,
 	RunE: func(cmd *cobra.Command, args []string) error {
@@ -25,11 +27,25 @@ unique files and their sizes.
 	},
 }
 
+var countModeFlag []string
+
 func init() {
 	cmdRoot.AddCommand(cmdStats)
+
+	f := cmdStats.Flags()
+	f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)")
+	f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents")
+	f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename")
+	f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them")
+	f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
 }
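
Taken together, these flags select one of four mutually exclusive counting modes (verifyStatsInput, further down in this diff, enforces that and defaults to restore-size counting). For illustration, invocations this commit enables look like the following; the snapshot ID and hostname are made up:

    restic stats                                         # default: --count-restore-size
    restic stats --count-files-by-contents
    restic stats 0a1b2c3d --count-blobs-per-file
    restic stats latest --host myhost --count-raw-data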
 
 func runStats(gopts GlobalOptions, args []string) error {
+	err := verifyStatsInput(gopts, args)
+	if err != nil {
+		return err
+	}
+
 	ctx, cancel := context.WithCancel(gopts.ctx)
 	defer cancel()
@@ -50,27 +66,62 @@ func runStats(gopts GlobalOptions, args []string) error {
 		}
 	}
 
-	// create a container for the stats, and other state
-	// needed while walking the trees
-	stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)}
+	// create a container for the stats (and other needed state)
+	stats := &statsContainer{
+		uniqueFiles: make(map[fileID]struct{}),
+		idSet:       make(restic.IDSet),
+		fileBlobs:   make(map[string]restic.IDSet),
+		blobs:       restic.NewBlobSet(),
+		blobsSeen:   restic.NewBlobSet(),
+	}
 
-	// iterate every snapshot in the repo
-	err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
-		snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
+	if snapshotIDString != "" {
+		// scan just a single snapshot
+		var sID restic.ID
+		if snapshotIDString == "latest" {
+			sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
+			if err != nil {
+				Exitf(1, "latest snapshot for criteria not found: %v", err)
+			}
+		} else {
+			sID, err = restic.FindSnapshot(repo, snapshotIDString)
+			if err != nil {
+				return err
+			}
+		}
+
+		snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
 		if err != nil {
-			return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
+			return err
 		}
 
-		if snapshot.Tree == nil {
-			return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
-		}
-
-		err = walkTree(ctx, repo, *snapshot.Tree, stats)
-		if err != nil {
-			return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
-		}
-		return nil
-	})
+		err = statsWalkSnapshot(ctx, snapshot, repo, stats)
+	} else {
+		// iterate every snapshot in the repo
+		err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
+			snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
+			if err != nil {
+				return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
+			}
+			return statsWalkSnapshot(ctx, snapshot, repo, stats)
+		})
+	}
+
 	if err != nil {
 		return err
 	}
 
+	if countModeRawData {
+		// the blob handles have been collected, but not yet counted
+		for blobHandle := range stats.blobs {
+			blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
+			if !found {
+				return fmt.Errorf("blob %v not found", blobHandle)
+			}
+			stats.TotalSize += uint64(blobSize)
+			stats.TotalBlobCount++
+		}
+	}
+
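
The raw-data branch above sums blob sizes only after the walk over all snapshots has finished, so a blob referenced by many files or snapshots contributes exactly once. A minimal standalone sketch of that collect-then-sum pattern, with simplified stand-ins for restic.BlobHandle and the repository index:

    package main

    import "fmt"

    // handle stands in for restic.BlobHandle: a blob ID plus a blob type.
    type handle struct{ id, typ string }

    func main() {
        // index plays the role of repo.LookupBlobSize in the commit.
        index := map[handle]uint64{
            {"b1", "data"}: 4096,
            {"b2", "data"}: 1024,
        }

        // The walk inserts every referenced blob into a set, so duplicate
        // references collapse before any size is looked up.
        blobs := map[handle]struct{}{
            {"b1", "data"}: {},
            {"b2", "data"}: {},
        }

        var totalSize, totalBlobCount uint64
        for h := range blobs {
            size, found := index[h]
            if !found {
                fmt.Printf("blob %v not found\n", h)
                return
            }
            totalSize += size
            totalBlobCount++
        }
        fmt.Println(totalSize, totalBlobCount) // 5120 2
    }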
 	if gopts.JSON {
 		err = json.NewEncoder(os.Stdout).Encode(stats)
@@ -80,12 +131,37 @@ func runStats(gopts GlobalOptions, args []string) error {
 		return nil
 	}
 
-	Printf(" Cumulative Original Size: %-5s\n", formatBytes(stats.TotalOriginalSize))
-	Printf(" Total Original File Count: %d\n", stats.TotalCount)
+	if stats.TotalBlobCount > 0 {
+		Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
+	}
+	if stats.TotalFileCount > 0 {
+		Printf(" Total File Count: %d\n", stats.TotalFileCount)
+	}
+	Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
 
 	return nil
 }
 
-func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer) error {
+func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
+	if snapshot.Tree == nil {
+		return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
+	}
+
+	if countModeRawData {
+		// count just the sizes of unique blobs; we don't need to walk the tree
+		// ourselves in this case, since a nifty function does it for us
+		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
+	}
+
+	err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator))
+	if err != nil {
+		return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
+	}
+	return nil
+}
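
In raw-data mode, statsWalkSnapshot hands the whole traversal to restic.FindUsedBlobs, which records every blob reachable from the snapshot's tree into stats.blobs, with stats.blobsSeen preventing re-walks of shared subtrees. A toy sketch of that style of traversal, using hypothetical simplified types rather than restic's repository and BlobSet:

    package main

    import "fmt"

    // tree is a hypothetical stand-in for a restic tree: an ID,
    // some data blob IDs, and subtrees.
    type tree struct {
        id    string
        blobs []string
        subs  []*tree
    }

    // findUsedBlobs mimics the shape of restic.FindUsedBlobs: collect every
    // reachable blob into blobs, using seen to skip trees already walked.
    func findUsedBlobs(t *tree, blobs, seen map[string]struct{}) {
        if _, ok := seen[t.id]; ok {
            return
        }
        seen[t.id] = struct{}{}
        for _, b := range t.blobs {
            blobs[b] = struct{}{}
        }
        for _, s := range t.subs {
            findUsedBlobs(s, blobs, seen)
        }
    }

    func main() {
        shared := &tree{id: "t2", blobs: []string{"b2"}}
        root := &tree{id: "t1", blobs: []string{"b1"}, subs: []*tree{shared, shared}}

        blobs := make(map[string]struct{})
        findUsedBlobs(root, blobs, make(map[string]struct{}))
        fmt.Println(len(blobs)) // 2: b1 and b2, the shared subtree walked once
    }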
+
+func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error {
 	// don't visit a tree we've already walked
 	if stats.idSet.Has(treeID) {
 		return nil
 	}
@@ -97,20 +173,59 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
 	}
 
 	for _, node := range tree.Nodes {
-		// only count this file if we haven't visited it before
-		fid := makeFileID(node)
-		if _, ok := stats.uniqueFiles[fid]; !ok {
-			// mark the file as visited
-			stats.uniqueFiles[fid] = struct{}{}
-
-			// update our stats to account for this node
-			stats.TotalOriginalSize += node.Size
-			stats.TotalCount++
+		if countModeUniqueFilesByContent || countModeBlobsPerFile {
+			// only count this file if we haven't visited it before
+			fid := makeFileIDByContents(node)
+			if _, ok := stats.uniqueFiles[fid]; !ok {
+				// mark the file as visited
+				stats.uniqueFiles[fid] = struct{}{}
+
+				if countModeUniqueFilesByContent {
+					// simply count the size of each unique file (unique by contents only)
+					stats.TotalSize += node.Size
+					stats.TotalFileCount++
+				}
+				if countModeBlobsPerFile {
+					// count the size of each unique blob reference, which is
+					// by unique file (unique by contents and file path)
+					for _, blobID := range node.Content {
+						// ensure we have this file (by path) in our map; in this
+						// mode, a file is unique by both contents and path
+						if _, ok := stats.fileBlobs[fpath]; !ok {
+							stats.fileBlobs[fpath] = restic.NewIDSet()
+							stats.TotalFileCount++
+						}
+						if _, ok := stats.fileBlobs[fpath][blobID]; !ok {
+							// TODO: Is the blob type always 'data' in this case?
+							blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
+							if !found {
+								return fmt.Errorf("blob %s not found for tree %s", blobID, treeID)
+							}
+
+							// count the blob's size, then add this blob by this
+							// file (path) so we don't double-count it
+							stats.TotalSize += uint64(blobSize)
+							stats.fileBlobs[fpath].Insert(blobID)
+							// this mode also counts total unique blob _references_ per file
+							stats.TotalBlobCount++
+						}
+					}
+				}
+			}
 		}
 
+		if countModeRestoreSize {
+			// as this is a file in the snapshot, we can simply count its
+			// size without worrying about uniqueness, since duplicate files
+			// will still be restored
+			stats.TotalSize += node.Size
+			stats.TotalFileCount++
+		}
+
 		// visit subtrees (i.e. directory contents)
 		if node.Subtree != nil {
-			err = walkTree(ctx, repo, *node.Subtree, stats)
+			err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name))
 			if err != nil {
 				return err
 			}
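
The blobs-per-file branch boils down to a two-level deduplication: a blob's size is added at most once per file path, but a blob shared by two paths is counted for each of them. A small self-contained sketch of that accounting, with sizes hard-coded where the commit calls repo.LookupBlobSize:

    package main

    import "fmt"

    func main() {
        // Two files whose contents reference data blobs; b1 is shared.
        files := map[string][]string{
            "/a.txt": {"b1", "b2"},
            "/b.txt": {"b1"},
        }
        // Blob sizes, standing in for lookups via repo.LookupBlobSize.
        sizes := map[string]uint64{"b1": 4096, "b2": 1024}

        fileBlobs := make(map[string]map[string]struct{})
        var totalSize, totalFileCount, totalBlobCount uint64
        for path, blobs := range files {
            for _, id := range blobs {
                if _, ok := fileBlobs[path]; !ok {
                    fileBlobs[path] = make(map[string]struct{})
                    totalFileCount++
                }
                if _, ok := fileBlobs[path][id]; ok {
                    continue // this blob was already counted for this path
                }
                // b1 is counted once for /a.txt and once for /b.txt: this mode
                // counts unique blob references per file, not unique blobs.
                totalSize += sizes[id]
                totalBlobCount++
                fileBlobs[path][id] = struct{}{}
            }
        }
        fmt.Println(totalSize, totalFileCount, totalBlobCount) // 9216 2 3
    }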
@@ -120,7 +235,9 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
 	return nil
 }
 
-func makeFileID(node *restic.Node) fileID {
+// makeFileIDByContents returns a hash of the blob IDs of the
+// node's Content in sequence.
+func makeFileIDByContents(node *restic.Node) fileID {
 	var bb []byte
 	for _, c := range node.Content {
 		bb = append(bb, []byte(c[:])...)
@@ -128,14 +245,76 @@ func makeFileID(node *restic.Node) fileID {
 	return sha256.Sum256(bb)
 }
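
Under this scheme, two files hash to the same fileID exactly when their contents chunk into the same blob sequence, regardless of path or metadata. A self-contained sketch of the same hashing, with a stand-in for restic.ID:

    package main

    import (
        "crypto/sha256"
        "fmt"
    )

    // id is a stand-in for restic.ID, the SHA-256 of a blob's content.
    type id [32]byte

    // fileIDByContents mirrors makeFileIDByContents above: hash the file's
    // content blob IDs, in order, into one 256-bit identifier.
    func fileIDByContents(content []id) [32]byte {
        var bb []byte
        for _, c := range content {
            bb = append(bb, c[:]...)
        }
        return sha256.Sum256(bb)
    }

    func main() {
        a := id(sha256.Sum256([]byte("chunk-1")))
        b := id(sha256.Sum256([]byte("chunk-2")))

        // The same blob sequence gives the same fileID; order matters.
        fmt.Println(fileIDByContents([]id{a, b}) == fileIDByContents([]id{a, b})) // true
        fmt.Println(fileIDByContents([]id{a, b}) == fileIDByContents([]id{b, a})) // false
    }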
+func verifyStatsInput(gopts GlobalOptions, args []string) error {
+	// ensure only one counting mode was specified, for clarity
+	var countModes int
+	if countModeRestoreSize {
+		countModes++
+	}
+	if countModeUniqueFilesByContent {
+		countModes++
+	}
+	if countModeBlobsPerFile {
+		countModes++
+	}
+	if countModeRawData {
+		countModes++
+	}
+	if countModes > 1 {
+		return fmt.Errorf("only one counting mode may be used")
+	}
+
+	// set a default count mode if none were specified
+	if countModes == 0 {
+		countModeRestoreSize = true
+	}
+
+	// ensure one or none snapshots were specified
+	if len(args) > 1 {
+		return fmt.Errorf("only one snapshot may be specified")
+	}
+
+	// set the snapshot to scan, if one was specified
+	if len(args) == 1 {
+		snapshotIDString = args[0]
+	}
+
+	return nil
+}
+
 // statsContainer holds information during a walk of a repository
 // to collect information about it, as well as state needed
 // for a successful and efficient walk.
 type statsContainer struct {
-	TotalCount        uint64 `json:"total_count"`
-	TotalOriginalSize uint64 `json:"total_original_size"`
-	idSet             restic.IDSet
-	uniqueFiles       map[fileID]struct{}
+	TotalSize      uint64 `json:"total_size"`
+	TotalFileCount uint64 `json:"total_file_count"`
+	TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
+
+	// idSet marks visited trees, to avoid repeated walks
+	idSet restic.IDSet
+
+	// uniqueFiles marks visited files according to their
+	// contents (hashed sequence of content blob IDs)
+	uniqueFiles map[fileID]struct{}
+
+	// fileBlobs maps a file name (path) to the set of
+	// blobs that have been seen as a part of the file
+	fileBlobs map[string]restic.IDSet
+
+	// blobs and blobsSeen are used to count individual
+	// unique blobs, independent of references to files
+	blobs, blobsSeen restic.BlobSet
 }
 
 // fileID is a 256-bit hash that distinguishes unique files.
 type fileID [32]byte
+
+var (
+	countModeRestoreSize          bool
+	countModeUniqueFilesByContent bool
+	countModeBlobsPerFile         bool
+	countModeRawData              bool
+
+	// the snapshot to scan, as given by the user
+	snapshotIDString string
+
+	// snapshotByHost is the host to filter latest
+	// snapshot by, if given by user
+	snapshotByHost string
+)
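
Given the struct tags above, running restic stats with --json after this commit emits a single object along these lines (values illustrative; total_blob_count is omitted when zero, and which counters are populated depends on the counting mode):

    {"total_size":10485760,"total_file_count":1234,"total_blob_count":5678}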