From a7b95d716a88de768598ad60341cacc9056ef062 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Sun, 22 Apr 2018 15:27:33 -0600 Subject: [PATCH] Implement four counting modes --- cmd/restic/cmd_stats.go | 251 ++++++++++++++++++++++++++++++++++------ 1 file changed, 215 insertions(+), 36 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 364a4827f..4b14235dd 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "os" + "path/filepath" "github.com/restic/restic/internal/restic" "github.com/spf13/cobra" @@ -15,9 +16,10 @@ var cmdStats = &cobra.Command{ Use: "stats", Short: "Scan the repository and show basic statistics", Long: ` -The "stats" command walks all snapshots in a repository and accumulates -statistics about the data stored therein. It reports on the number of -unique files and their sizes. +The "stats" command walks one or all snapshots in a repository and +accumulates statistics about the data stored therein. It reports on +the number of unique files and their sizes, according to one of +the counting modes as given by a flag. `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { @@ -25,11 +27,25 @@ unique files and their sizes. 
}, } +var countModeFlag []string + func init() { cmdRoot.AddCommand(cmdStats) + + f := cmdStats.Flags() + f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)") + f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents") + f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename") + f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them") + f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") } func runStats(gopts GlobalOptions, args []string) error { + err := verifyStatsInput(gopts, args) + if err != nil { + return err + } + ctx, cancel := context.WithCancel(gopts.ctx) defer cancel() @@ -50,27 +66,62 @@ func runStats(gopts GlobalOptions, args []string) error { } } - // create a container for the stats, and other state - // needed while walking the trees - stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)} + // create a container for the stats (and other needed state) + stats := &statsContainer{ + uniqueFiles: make(map[fileID]struct{}), + idSet: make(restic.IDSet), + fileBlobs: make(map[string]restic.IDSet), + blobs: restic.NewBlobSet(), + blobsSeen: restic.NewBlobSet(), + } - // iterate every snapshot in the repo - err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { - snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) + if snapshotIDString != "" { + // scan just a single snapshot + + var sID restic.ID + if snapshotIDString == "latest" { + sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost) + if err != nil { + Exitf(1, "latest snapshot for criteria not found: %v", err) + } + } else { + sID, err = restic.FindSnapshot(repo, snapshotIDString) + if err != nil { + return err + } + } 
+ + snapshot, err := restic.LoadSnapshot(ctx, repo, sID) if err != nil { - return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) - } - if snapshot.Tree == nil { - return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) + return err } - err = walkTree(ctx, repo, *snapshot.Tree, stats) - if err != nil { - return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) - } + err = statsWalkSnapshot(ctx, snapshot, repo, stats) + } else { + // iterate every snapshot in the repo + err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { + snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) + if err != nil { + return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) + } + return statsWalkSnapshot(ctx, snapshot, repo, stats) + }) + } + if err != nil { + return err + } - return nil - }) + if countModeRawData { + // the blob handles have been collected, but not yet counted + for blobHandle := range stats.blobs { + blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type) + if !found { + return fmt.Errorf("blob %v not found", blobHandle) + } + stats.TotalSize += uint64(blobSize) + stats.TotalBlobCount++ + } + } if gopts.JSON { err = json.NewEncoder(os.Stdout).Encode(stats) @@ -80,12 +131,37 @@ func runStats(gopts GlobalOptions, args []string) error { return nil } - Printf(" Cumulative Original Size: %-5s\n", formatBytes(stats.TotalOriginalSize)) - Printf(" Total Original File Count: %d\n", stats.TotalCount) + if stats.TotalBlobCount > 0 { + Printf(" Total Blob Count: %d\n", stats.TotalBlobCount) + } + if stats.TotalFileCount > 0 { + Printf(" Total File Count: %d\n", stats.TotalFileCount) + } + Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize)) + return nil } -func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer) error { +func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats 
*statsContainer) error { + if snapshot.Tree == nil { + return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) + } + + if countModeRawData { + // count just the sizes of unique blobs; we don't need to walk the tree + // ourselves in this case, since a nifty function does it for us + return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) + } + + err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator)) + if err != nil { + return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) + } + return nil +} + +func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error { + // don't visit a tree we've already walked if stats.idSet.Has(treeID) { return nil } @@ -97,20 +173,59 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta } for _, node := range tree.Nodes { - // only count this file if we haven't visited it before - fid := makeFileID(node) - if _, ok := stats.uniqueFiles[fid]; !ok { - // mark the file as visited - stats.uniqueFiles[fid] = struct{}{} + if countModeUniqueFilesByContent || countModeBlobsPerFile { + // only count this file if we haven't visited it before + fid := makeFileIDByContents(node) + if _, ok := stats.uniqueFiles[fid]; !ok { + // mark the file as visited + stats.uniqueFiles[fid] = struct{}{} - // update our stats to account for this node - stats.TotalOriginalSize += node.Size - stats.TotalCount++ + if countModeUniqueFilesByContent { + // simply count the size of each unique file (unique by contents only) + stats.TotalSize += node.Size + stats.TotalFileCount++ + } + if countModeBlobsPerFile { + // count the size of each unique blob reference, which is + // by unique file (unique by contents and file path) + for _, blobID := range node.Content { + // ensure we have this file (by path) in our map; in this + // mode, a file is unique by both contents and path + if _, ok := 
stats.fileBlobs[fpath]; !ok { + stats.fileBlobs[fpath] = restic.NewIDSet() + stats.TotalFileCount++ + } + if _, ok := stats.fileBlobs[fpath][blobID]; !ok { + // TODO: Is the blob type always 'data' in this case? + blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) + if !found { + return fmt.Errorf("blob %s not found for tree %s", blobID, treeID) + } + + // count the blob's size, then add this blob by this + // file (path) so we don't double-count it + stats.TotalSize += uint64(blobSize) + stats.fileBlobs[fpath].Insert(blobID) + + // this mode also counts total unique blob _references_ per file + stats.TotalBlobCount++ + } + } + } + } + } + + if countModeRestoreSize { + // as this is a file in the snapshot, we can simply count its + // size without worrying about uniqueness, since duplicate files + // will still be restored + stats.TotalSize += node.Size + stats.TotalFileCount++ } // visit subtrees (i.e. directory contents) if node.Subtree != nil { - err = walkTree(ctx, repo, *node.Subtree, stats) + err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name)) if err != nil { return err } @@ -120,7 +235,9 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta return nil } -func makeFileID(node *restic.Node) fileID { +// makeFileIDByContents returns a hash of the blob IDs of the +// node's Content in sequence. +func makeFileIDByContents(node *restic.Node) fileID { var bb []byte for _, c := range node.Content { bb = append(bb, []byte(c[:])...) 
@@ -128,14 +245,76 @@ func makeFileID(node *restic.Node) fileID { return sha256.Sum256(bb) } +func verifyStatsInput(gopts GlobalOptions, args []string) error { + // ensure only one counting mode was specified, for clarity + var countModes int + if countModeRestoreSize { + countModes++ + } + if countModeUniqueFilesByContent { + countModes++ + } + if countModeBlobsPerFile { + countModes++ + } + if countModeRawData { + countModes++ + } + if countModes > 1 { + return fmt.Errorf("only one counting mode may be used") + } + // set a default count mode if none were specified + if countModes == 0 { + countModeRestoreSize = true + } + // ensure at most one snapshot was specified + if len(args) > 1 { + return fmt.Errorf("only one snapshot may be specified") + } + // set the snapshot to scan, if one was specified + if len(args) == 1 { + snapshotIDString = args[0] + } + return nil +} + // statsContainer holds information during a walk of a repository // to collect information about it, as well as state needed // for a successful and efficient walk. type statsContainer struct { - TotalCount uint64 `json:"total_count"` - TotalOriginalSize uint64 `json:"total_original_size"` - idSet restic.IDSet - uniqueFiles map[fileID]struct{} + TotalSize uint64 `json:"total_size"` + TotalFileCount uint64 `json:"total_file_count"` + TotalBlobCount uint64 `json:"total_blob_count,omitempty"` + + // idSet marks visited trees, to avoid repeated walks + idSet restic.IDSet + + // uniqueFiles marks visited files according to their + // contents (hashed sequence of content blob IDs) + uniqueFiles map[fileID]struct{} + + // fileBlobs maps a file name (path) to the set of + // blobs that have been seen as a part of the file + fileBlobs map[string]restic.IDSet + + // blobs and blobsSeen are used to count individual + // unique blobs, independent of references to files + blobs, blobsSeen restic.BlobSet } +// fileID is a 256-bit hash that distinguishes unique files. 
type fileID [32]byte + +var ( + countModeRestoreSize bool + countModeUniqueFilesByContent bool + countModeBlobsPerFile bool + countModeRawData bool + + // the snapshot to scan, as given by the user + snapshotIDString string + + // snapshotByHost is the host to filter latest + // snapshot by, if given by user + snapshotByHost string +)