From f7659bd8b024f5f088eff8340d3bfb264c43e9ff Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Fri, 20 Apr 2018 08:44:14 -0600 Subject: [PATCH 1/8] stats: Initial implementation of stats command --- changelog/unreleased/pull-1729 | 4 ++ cmd/restic/cmd_stats.go | 121 +++++++++++++++++++++++++++++++++ doc/bash-completion.sh | 1 + 3 files changed, 126 insertions(+) create mode 100644 changelog/unreleased/pull-1729 create mode 100644 cmd/restic/cmd_stats.go diff --git a/changelog/unreleased/pull-1729 b/changelog/unreleased/pull-1729 new file mode 100644 index 000000000..62e982e95 --- /dev/null +++ b/changelog/unreleased/pull-1729 @@ -0,0 +1,4 @@ +Enhancement: Add stats command to get information about a repository + +https://github.com/restic/restic/issues/874 +https://github.com/restic/restic/pull/1729 diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go new file mode 100644 index 000000000..463337d81 --- /dev/null +++ b/cmd/restic/cmd_stats.go @@ -0,0 +1,121 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + + "github.com/restic/restic/internal/restic" + "github.com/spf13/cobra" +) + +var cmdStats = &cobra.Command{ + Use: "stats", + Short: "Scan the repository and show basic statistics", + Long: ` +The "stats" command walks all snapshots in a repository and accumulates +statistics about the data stored therein. It reports on the number of +unique files and their sizes. 
+`, + DisableAutoGenTag: true, + RunE: func(cmd *cobra.Command, args []string) error { + return runStats(globalOptions, args) + }, +} + +func init() { + cmdRoot.AddCommand(cmdStats) +} + +func runStats(gopts GlobalOptions, args []string) error { + ctx, cancel := context.WithCancel(gopts.ctx) + defer cancel() + + repo, err := OpenRepository(gopts) + if err != nil { + return err + } + + if err = repo.LoadIndex(ctx); err != nil { + return err + } + + if !gopts.NoLock { + lock, err := lockRepo(repo) + defer unlockRepo(lock) + if err != nil { + return err + } + } + + // create a container for the stats, and other state + // needed while walking the trees + stats := &statsContainer{idSet: restic.NewIDSet()} + + // iterate every snapshot in the repo + err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { + snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) + if err != nil { + return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) + } + if snapshot.Tree == nil { + return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) + } + + err = walkTree(ctx, repo, *snapshot.Tree, stats) + if err != nil { + return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) + } + + return nil + }) + + if gopts.JSON { + err = json.NewEncoder(os.Stdout).Encode(stats) + if err != nil { + return fmt.Errorf("encoding output: %v", err) + } + return nil + } + + Printf(" Cumulative Original Size: %-5s\n", formatBytes(stats.TotalOriginalSize)) + Printf(" Total Original File Count: %d\n", stats.TotalCount) + return nil +} + +func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer) error { + if stats.idSet.Has(treeID) { + return nil + } + stats.idSet.Insert(treeID) + + tree, err := repo.LoadTree(ctx, treeID) + if err != nil { + return fmt.Errorf("loading tree: %v", err) + } + + for _, node := range tree.Nodes { + // update our stats to account for this node + stats.TotalOriginalSize += 
node.Size + stats.TotalCount++ + + if node.Subtree != nil { + err = walkTree(ctx, repo, *node.Subtree, stats) + if err != nil { + return err + } + } + } + + return nil +} + +// statsContainer holds information during a walk of a repository +// to collect information about it, as well as state needed +// for a successful and efficient walk. +type statsContainer struct { + TotalCount uint64 `json:"total_count"` + TotalOriginalSize uint64 `json:"total_original_size"` + idSet restic.IDSet +} diff --git a/doc/bash-completion.sh b/doc/bash-completion.sh index 5203f5368..d1685e566 100644 --- a/doc/bash-completion.sh +++ b/doc/bash-completion.sh @@ -1310,6 +1310,7 @@ _restic_root_command() commands+=("rebuild-index") commands+=("restore") commands+=("snapshots") + commands+=("stats") commands+=("tag") commands+=("unlock") commands+=("version") From 925b542eb0268e18a540a58c2218a0dcfb3c0fe8 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Sat, 21 Apr 2018 16:33:18 -0600 Subject: [PATCH 2/8] Count unique files by blob sequence rather than tree ID --- cmd/restic/cmd_stats.go | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 463337d81..364a4827f 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -2,6 +2,7 @@ package main import ( "context" + "crypto/sha256" "encoding/json" "fmt" "os" @@ -51,7 +52,7 @@ func runStats(gopts GlobalOptions, args []string) error { // create a container for the stats, and other state // needed while walking the trees - stats := &statsContainer{idSet: restic.NewIDSet()} + stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)} // iterate every snapshot in the repo err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { @@ -96,10 +97,18 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta } for _, node := range tree.Nodes { - // update 
our stats to account for this node - stats.TotalOriginalSize += node.Size - stats.TotalCount++ + // only count this file if we haven't visited it before + fid := makeFileID(node) + if _, ok := stats.uniqueFiles[fid]; !ok { + // mark the file as visited + stats.uniqueFiles[fid] = struct{}{} + // update our stats to account for this node + stats.TotalOriginalSize += node.Size + stats.TotalCount++ + } + + // visit subtrees (i.e. directory contents) if node.Subtree != nil { err = walkTree(ctx, repo, *node.Subtree, stats) if err != nil { @@ -111,6 +120,14 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta return nil } +func makeFileID(node *restic.Node) fileID { + var bb []byte + for _, c := range node.Content { + bb = append(bb, []byte(c[:])...) + } + return sha256.Sum256(bb) +} + // statsContainer holds information during a walk of a repository // to collect information about it, as well as state needed // for a successful and efficient walk. @@ -118,4 +135,7 @@ type statsContainer struct { TotalCount uint64 `json:"total_count"` TotalOriginalSize uint64 `json:"total_original_size"` idSet restic.IDSet + uniqueFiles map[fileID]struct{} } + +type fileID [32]byte From a7b95d716a88de768598ad60341cacc9056ef062 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Sun, 22 Apr 2018 15:27:33 -0600 Subject: [PATCH 3/8] Implement four counting modes --- cmd/restic/cmd_stats.go | 251 ++++++++++++++++++++++++++++++++++------ 1 file changed, 215 insertions(+), 36 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 364a4827f..4b14235dd 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "os" + "path/filepath" "github.com/restic/restic/internal/restic" "github.com/spf13/cobra" @@ -15,9 +16,10 @@ var cmdStats = &cobra.Command{ Use: "stats", Short: "Scan the repository and show basic statistics", Long: ` -The "stats" command walks all snapshots in a repository and 
accumulates -statistics about the data stored therein. It reports on the number of -unique files and their sizes. +The "stats" command walks one or all snapshots in a repository and +accumulates statistics about the data stored therein. It reports on +the number of unique files and their sizes, according to one of +the counting modes as given by a flag. `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { @@ -25,11 +27,25 @@ unique files and their sizes. }, } +var countModeFlag []string + func init() { cmdRoot.AddCommand(cmdStats) + + f := cmdStats.Flags() + f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)") + f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents") + f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename") + f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them") + f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") } func runStats(gopts GlobalOptions, args []string) error { + err := verifyStatsInput(gopts, args) + if err != nil { + return err + } + ctx, cancel := context.WithCancel(gopts.ctx) defer cancel() @@ -50,27 +66,62 @@ func runStats(gopts GlobalOptions, args []string) error { } } - // create a container for the stats, and other state - // needed while walking the trees - stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)} + // create a container for the stats (and other needed state) + stats := &statsContainer{ + uniqueFiles: make(map[fileID]struct{}), + idSet: make(restic.IDSet), + fileBlobs: make(map[string]restic.IDSet), + blobs: restic.NewBlobSet(), + blobsSeen: restic.NewBlobSet(), + } - // iterate every snapshot in the repo - err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, 
size int64) error { - snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) + if snapshotIDString != "" { + // scan just a single snapshot + + var sID restic.ID + if snapshotIDString == "latest" { + sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost) + if err != nil { + Exitf(1, "latest snapshot for criteria not found: %v", err) + } + } else { + sID, err = restic.FindSnapshot(repo, snapshotIDString) + if err != nil { + return err + } + } + + snapshot, err := restic.LoadSnapshot(ctx, repo, sID) if err != nil { - return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) - } - if snapshot.Tree == nil { - return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) + return err } - err = walkTree(ctx, repo, *snapshot.Tree, stats) - if err != nil { - return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) - } + err = statsWalkSnapshot(ctx, snapshot, repo, stats) + } else { + // iterate every snapshot in the repo + err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { + snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) + if err != nil { + return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) + } + return statsWalkSnapshot(ctx, snapshot, repo, stats) + }) + } + if err != nil { + return err + } - return nil - }) + if countModeRawData { + // the blob handles have been collected, but not yet counted + for blobHandle := range stats.blobs { + blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type) + if !found { + return fmt.Errorf("blob %v not found", blobHandle) + } + stats.TotalSize += uint64(blobSize) + stats.TotalBlobCount++ + } + } if gopts.JSON { err = json.NewEncoder(os.Stdout).Encode(stats) @@ -80,12 +131,37 @@ func runStats(gopts GlobalOptions, args []string) error { return nil } - Printf(" Cumulative Original Size: %-5s\n", formatBytes(stats.TotalOriginalSize)) - Printf(" Total Original File Count: %d\n", 
stats.TotalCount) + if stats.TotalBlobCount > 0 { + Printf(" Total Blob Count: %d\n", stats.TotalBlobCount) + } + if stats.TotalFileCount > 0 { + Printf(" Total File Count: %d\n", stats.TotalFileCount) + } + Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize)) + return nil } -func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer) error { +func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error { + if snapshot.Tree == nil { + return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) + } + + if countModeRawData { + // count just the sizes of unique blobs; we don't need to walk the tree + // ourselves in this case, since a nifty function does it for us + return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) + } + + err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator)) + if err != nil { + return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) + } + return nil +} + +func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error { + // don't visit a tree we've already walked if stats.idSet.Has(treeID) { return nil } @@ -97,20 +173,59 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta } for _, node := range tree.Nodes { - // only count this file if we haven't visited it before - fid := makeFileID(node) - if _, ok := stats.uniqueFiles[fid]; !ok { - // mark the file as visited - stats.uniqueFiles[fid] = struct{}{} + if countModeUniqueFilesByContent || countModeBlobsPerFile { + // only count this file if we haven't visited it before + fid := makeFileIDByContents(node) + if _, ok := stats.uniqueFiles[fid]; !ok { + // mark the file as visited + stats.uniqueFiles[fid] = struct{}{} - // update our stats to account for this node - stats.TotalOriginalSize += node.Size - stats.TotalCount++ + if 
countModeUniqueFilesByContent { + // simply count the size of each unique file (unique by contents only) + stats.TotalSize += node.Size + stats.TotalFileCount++ + } + if countModeBlobsPerFile { + // count the size of each unique blob reference, which is + // by unique file (unique by contents and file path) + for _, blobID := range node.Content { + // ensure we have this file (by path) in our map; in this + // mode, a file is unique by both contents and path + if _, ok := stats.fileBlobs[fpath]; !ok { + stats.fileBlobs[fpath] = restic.NewIDSet() + stats.TotalFileCount++ + } + if _, ok := stats.fileBlobs[fpath][blobID]; !ok { + // TODO: Is the blob type always 'data' in this case? + blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) + if !found { + return fmt.Errorf("blob %s not found for tree %s", blobID, treeID) + } + + // count the blob's size, then add this blob by this + // file (path) so we don't double-count it + stats.TotalSize += uint64(blobSize) + stats.fileBlobs[fpath].Insert(blobID) + + // this mode also counts total unique blob _references_ per file + stats.TotalBlobCount++ + } + } + } + } + } + + if countModeRestoreSize { + // as this is a file in the snapshot, we can simply count its + // size without worrying about uniqueness, since duplicate files + // will still be restored + stats.TotalSize += node.Size + stats.TotalFileCount++ } // visit subtrees (i.e. directory contents) if node.Subtree != nil { - err = walkTree(ctx, repo, *node.Subtree, stats) + err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name)) if err != nil { return err } @@ -120,7 +235,9 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta return nil } -func makeFileID(node *restic.Node) fileID { +// makeFileIDByContents returns a hash of the blob IDs of the +// node's Content in sequence. 
+func makeFileIDByContents(node *restic.Node) fileID { var bb []byte for _, c := range node.Content { bb = append(bb, []byte(c[:])...) @@ -128,14 +245,76 @@ func makeFileID(node *restic.Node) fileID { return sha256.Sum256(bb) } +func verifyStatsInput(gopts GlobalOptions, args []string) error { + // ensure only one counting mode was specified, for clarity + var countModes int + if countModeRestoreSize { + countModes++ + } + if countModeUniqueFilesByContent { + countModes++ + } + if countModeBlobsPerFile { + countModes++ + } + if countModeRawData { + countModes++ + } + if countModes > 1 { + return fmt.Errorf("only one counting mode may be used") + } + // set a default count mode if none were specified + if countModes == 0 { + countModeRestoreSize = true + } + // ensure one or none snapshots were specified + if len(args) > 1 { + return fmt.Errorf("only one snapshot may be specified") + } + // set the snapshot to scan, if one was specified + if len(args) == 1 { + snapshotIDString = args[0] + } + return nil +} + // statsContainer holds information during a walk of a repository // to collect information about it, as well as state needed // for a successful and efficient walk. 
type statsContainer struct { - TotalCount uint64 `json:"total_count"` - TotalOriginalSize uint64 `json:"total_original_size"` - idSet restic.IDSet - uniqueFiles map[fileID]struct{} + TotalSize uint64 `json:"total_size"` + TotalFileCount uint64 `json:"total_file_count"` + TotalBlobCount uint64 `json:"total_blob_count,omitempty"` + + // idSet marks visited trees, to avoid repeated walks + idSet restic.IDSet + + // uniqueFiles marks visited files according to their + // contents (hashed sequence of content blob IDs) + uniqueFiles map[fileID]struct{} + + // fileBlobs maps a file name (path) to the set of + // blobs that have been seen as a part of the file + fileBlobs map[string]restic.IDSet + + // blobs and blobsSeen are used to count individual + // unique blobs, independent of references to files + blobs, blobsSeen restic.BlobSet } +// fileID is a 256-bit hash that distinguishes unique files. type fileID [32]byte + +var ( + countModeRestoreSize bool + countModeUniqueFilesByContent bool + countModeBlobsPerFile bool + countModeRawData bool + + // the snapshot to scan, as given by the user + snapshotIDString string + + // snapshotByHost is the host to filter latest + // snapshot by, if given by user + snapshotByHost string +) From acb05e78551cc22277ffa859730f9e11ce91477f Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Sun, 22 Apr 2018 23:34:28 -0600 Subject: [PATCH 4/8] Fix filepath uniqueness bug for blobs-per-file mode --- cmd/restic/cmd_stats.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 4b14235dd..2241be90c 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -191,11 +191,12 @@ func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID for _, blobID := range node.Content { // ensure we have this file (by path) in our map; in this // mode, a file is unique by both contents and path - if _, ok := stats.fileBlobs[fpath]; !ok { - 
stats.fileBlobs[fpath] = restic.NewIDSet() + nodePath := filepath.Join(fpath, node.Name) + if _, ok := stats.fileBlobs[nodePath]; !ok { + stats.fileBlobs[nodePath] = restic.NewIDSet() stats.TotalFileCount++ } - if _, ok := stats.fileBlobs[fpath][blobID]; !ok { + if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { // TODO: Is the blob type always 'data' in this case? blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) if !found { @@ -205,7 +206,7 @@ func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID // count the blob's size, then add this blob by this // file (path) so we don't double-count it stats.TotalSize += uint64(blobSize) - stats.fileBlobs[fpath].Insert(blobID) + stats.fileBlobs[nodePath].Insert(blobID) // this mode also counts total unique blob _references_ per file stats.TotalBlobCount++ From 930602a444defeb2ca8697620bc99cbffdc413cd Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Wed, 25 Apr 2018 09:39:16 -0600 Subject: [PATCH 5/8] Update comment now that question was answered --- cmd/restic/cmd_stats.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 2241be90c..75b1757d6 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -197,7 +197,7 @@ func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID stats.TotalFileCount++ } if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { - // TODO: Is the blob type always 'data' in this case? 
+ // is always a data blob since we're accessing it via a file's Content array blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) if !found { return fmt.Errorf("blob %s not found for tree %s", blobID, treeID) From daca9d68153e9dbe7d77fe2628b2dbe8d765a87b Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Sat, 23 Jun 2018 12:01:13 -0600 Subject: [PATCH 6/8] Consolidate mode flags; use new Walk function --- cmd/restic/cmd_stats.go | 181 +++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 103 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 75b1757d6..54a27334b 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -9,17 +9,18 @@ import ( "path/filepath" "github.com/restic/restic/internal/restic" + "github.com/restic/restic/internal/walker" "github.com/spf13/cobra" ) var cmdStats = &cobra.Command{ - Use: "stats", + Use: "stats [flags] [snapshot-ID]", Short: "Scan the repository and show basic statistics", Long: ` The "stats" command walks one or all snapshots in a repository and accumulates statistics about the data stored therein. It reports on the number of unique files and their sizes, according to one of -the counting modes as given by a flag. +the counting modes as given by the --mode flag. `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { @@ -27,16 +28,10 @@ the counting modes as given by a flag. 
}, } -var countModeFlag []string - func init() { cmdRoot.AddCommand(cmdStats) - f := cmdStats.Flags() - f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)") - f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents") - f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename") - f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them") + f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-content, blobs-per-file, or raw-data") f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") } @@ -69,7 +64,6 @@ func runStats(gopts GlobalOptions, args []string) error { // create a container for the stats (and other needed state) stats := &statsContainer{ uniqueFiles: make(map[fileID]struct{}), - idSet: make(restic.IDSet), fileBlobs: make(map[string]restic.IDSet), blobs: restic.NewBlobSet(), blobsSeen: restic.NewBlobSet(), @@ -111,7 +105,7 @@ func runStats(gopts GlobalOptions, args []string) error { return err } - if countModeRawData { + if countMode == countModeRawData { // the blob handles have been collected, but not yet counted for blobHandle := range stats.blobs { blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type) @@ -147,93 +141,81 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) } - if countModeRawData { + if countMode == countModeRawData { // count just the sizes of unique blobs; we don't need to walk the tree // ourselves in this case, since a nifty function does it for us return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) } - err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator)) 
+ err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), func(path string, node *restic.Node, nodeErr error) (bool, error) { + return statsWalkTree(path, node, nodeErr, repo, stats) + }) if err != nil { return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) } return nil } -func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error { - // don't visit a tree we've already walked - if stats.idSet.Has(treeID) { - return nil +func statsWalkTree(npath string, node *restic.Node, nodeErr error, repo restic.Repository, stats *statsContainer) (ignore bool, err error) { + if nodeErr != nil { + return true, nodeErr } - stats.idSet.Insert(treeID) - - tree, err := repo.LoadTree(ctx, treeID) - if err != nil { - return fmt.Errorf("loading tree: %v", err) + if node == nil { + return true, nil } - for _, node := range tree.Nodes { - if countModeUniqueFilesByContent || countModeBlobsPerFile { - // only count this file if we haven't visited it before - fid := makeFileIDByContents(node) - if _, ok := stats.uniqueFiles[fid]; !ok { - // mark the file as visited - stats.uniqueFiles[fid] = struct{}{} + if countMode == countModeUniqueFilesByContent || countMode == countModeBlobsPerFile { + // only count this file if we haven't visited it before + fid := makeFileIDByContents(node) + if _, ok := stats.uniqueFiles[fid]; !ok { + // mark the file as visited + stats.uniqueFiles[fid] = struct{}{} - if countModeUniqueFilesByContent { - // simply count the size of each unique file (unique by contents only) - stats.TotalSize += node.Size - stats.TotalFileCount++ - } - if countModeBlobsPerFile { - // count the size of each unique blob reference, which is - // by unique file (unique by contents and file path) - for _, blobID := range node.Content { - // ensure we have this file (by path) in our map; in this - // mode, a file is unique by both contents and path - nodePath := filepath.Join(fpath, node.Name) - if _, ok := 
stats.fileBlobs[nodePath]; !ok { - stats.fileBlobs[nodePath] = restic.NewIDSet() - stats.TotalFileCount++ + if countMode == countModeUniqueFilesByContent { + // simply count the size of each unique file (unique by contents only) + stats.TotalSize += node.Size + stats.TotalFileCount++ + } + if countMode == countModeBlobsPerFile { + // count the size of each unique blob reference, which is + // by unique file (unique by contents and file path) + for _, blobID := range node.Content { + // ensure we have this file (by path) in our map; in this + // mode, a file is unique by both contents and path + nodePath := filepath.Join(npath, node.Name) + if _, ok := stats.fileBlobs[nodePath]; !ok { + stats.fileBlobs[nodePath] = restic.NewIDSet() + stats.TotalFileCount++ + } + if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { + // is always a data blob since we're accessing it via a file's Content array + blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) + if !found { + return true, fmt.Errorf("blob %s not found for tree %s", blobID, *node.Subtree) } - if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { - // is always a data blob since we're accessing it via a file's Content array - blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) - if !found { - return fmt.Errorf("blob %s not found for tree %s", blobID, treeID) - } - // count the blob's size, then add this blob by this - // file (path) so we don't double-count it - stats.TotalSize += uint64(blobSize) - stats.fileBlobs[nodePath].Insert(blobID) + // count the blob's size, then add this blob by this + // file (path) so we don't double-count it + stats.TotalSize += uint64(blobSize) + stats.fileBlobs[nodePath].Insert(blobID) - // this mode also counts total unique blob _references_ per file - stats.TotalBlobCount++ - } + // this mode also counts total unique blob _references_ per file + stats.TotalBlobCount++ } } } } - - if countModeRestoreSize { - // as this is a file in the snapshot, we can simply 
count its - // size without worrying about uniqueness, since duplicate files - // will still be restored - stats.TotalSize += node.Size - stats.TotalFileCount++ - } - - // visit subtrees (i.e. directory contents) - if node.Subtree != nil { - err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name)) - if err != nil { - return err - } - } } - return nil + if countMode == countModeRestoreSize { + // as this is a file in the snapshot, we can simply count its + // size without worrying about uniqueness, since duplicate files + // will still be restored + stats.TotalSize += node.Size + stats.TotalFileCount++ + } + + return true, nil } // makeFileIDByContents returns a hash of the blob IDs of the @@ -247,35 +229,26 @@ func makeFileIDByContents(node *restic.Node) fileID { } func verifyStatsInput(gopts GlobalOptions, args []string) error { - // ensure only one counting mode was specified, for clarity - var countModes int - if countModeRestoreSize { - countModes++ + // require a recognized counting mode + switch countMode { + case countModeRestoreSize: + case countModeUniqueFilesByContent: + case countModeBlobsPerFile: + case countModeRawData: + default: + return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode) } - if countModeUniqueFilesByContent { - countModes++ - } - if countModeBlobsPerFile { - countModes++ - } - if countModeRawData { - countModes++ - } - if countModes > 1 { - return fmt.Errorf("only one counting mode may be used") - } - // set a default count mode if none were specified - if countModes == 0 { - countModeRestoreSize = true - } - // ensure one or none snapshots were specified + + // ensure at most one snapshot was specified if len(args) > 1 { return fmt.Errorf("only one snapshot may be specified") } - // set the snapshot to scan, if one was specified + + // if a snapshot was specified, mark it as the one to scan if len(args) == 1 { snapshotIDString = args[0] } + return nil } 
@@ -287,9 +260,6 @@ type statsContainer struct { TotalFileCount uint64 `json:"total_file_count"` TotalBlobCount uint64 `json:"total_blob_count,omitempty"` - // idSet marks visited trees, to avoid repeated walks - idSet restic.IDSet - // uniqueFiles marks visited files according to their // contents (hashed sequence of content blob IDs) uniqueFiles map[fileID]struct{} @@ -307,10 +277,8 @@ type statsContainer struct { type fileID [32]byte var ( - countModeRestoreSize bool - countModeUniqueFilesByContent bool - countModeBlobsPerFile bool - countModeRawData bool + // the mode of counting to perform + countMode string // the snapshot to scan, as given by the user snapshotIDString string @@ -319,3 +287,10 @@ var ( // snapshot by, if given by user snapshotByHost string ) + +const ( + countModeRestoreSize = "restore-size" + countModeUniqueFilesByContent = "files-by-content" + countModeBlobsPerFile = "blobs-per-file" + countModeRawData = "raw-data" +) From 12c797700e7bf2968a3cad73b75ac99b5a5fe1aa Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Fri, 27 Jul 2018 21:44:59 +0200 Subject: [PATCH 7/8] make statsWalkSnapshot return a function --- cmd/restic/cmd_stats.go | 104 ++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 54a27334b..621ff0880 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -147,75 +147,75 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) } - err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), func(path string, node *restic.Node, nodeErr error) (bool, error) { - return statsWalkTree(path, node, nodeErr, repo, stats) - }) + err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats)) if err != nil { return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) } 
return nil } -func statsWalkTree(npath string, node *restic.Node, nodeErr error, repo restic.Repository, stats *statsContainer) (ignore bool, err error) { - if nodeErr != nil { - return true, nodeErr - } - if node == nil { - return true, nil - } +func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFunc { + return func(npath string, node *restic.Node, nodeErr error) (bool, error) { + if nodeErr != nil { + return true, nodeErr + } + if node == nil { + return true, nil + } - if countMode == countModeUniqueFilesByContent || countMode == countModeBlobsPerFile { - // only count this file if we haven't visited it before - fid := makeFileIDByContents(node) - if _, ok := stats.uniqueFiles[fid]; !ok { - // mark the file as visited - stats.uniqueFiles[fid] = struct{}{} + if countMode == countModeUniqueFilesByContent || countMode == countModeBlobsPerFile { + // only count this file if we haven't visited it before + fid := makeFileIDByContents(node) + if _, ok := stats.uniqueFiles[fid]; !ok { + // mark the file as visited + stats.uniqueFiles[fid] = struct{}{} - if countMode == countModeUniqueFilesByContent { - // simply count the size of each unique file (unique by contents only) - stats.TotalSize += node.Size - stats.TotalFileCount++ - } - if countMode == countModeBlobsPerFile { - // count the size of each unique blob reference, which is - // by unique file (unique by contents and file path) - for _, blobID := range node.Content { - // ensure we have this file (by path) in our map; in this - // mode, a file is unique by both contents and path - nodePath := filepath.Join(npath, node.Name) - if _, ok := stats.fileBlobs[nodePath]; !ok { - stats.fileBlobs[nodePath] = restic.NewIDSet() - stats.TotalFileCount++ - } - if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { - // is always a data blob since we're accessing it via a file's Content array - blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) - if !found { - return true, fmt.Errorf("blob 
%s not found for tree %s", blobID, *node.Subtree) + if countMode == countModeUniqueFilesByContent { + // simply count the size of each unique file (unique by contents only) + stats.TotalSize += node.Size + stats.TotalFileCount++ + } + if countMode == countModeBlobsPerFile { + // count the size of each unique blob reference, which is + // by unique file (unique by contents and file path) + for _, blobID := range node.Content { + // ensure we have this file (by path) in our map; in this + // mode, a file is unique by both contents and path + nodePath := filepath.Join(npath, node.Name) + if _, ok := stats.fileBlobs[nodePath]; !ok { + stats.fileBlobs[nodePath] = restic.NewIDSet() + stats.TotalFileCount++ } + if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { + // is always a data blob since we're accessing it via a file's Content array + blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) + if !found { + return true, fmt.Errorf("blob %s not found for tree %s", blobID, *node.Subtree) + } - // count the blob's size, then add this blob by this - // file (path) so we don't double-count it - stats.TotalSize += uint64(blobSize) - stats.fileBlobs[nodePath].Insert(blobID) + // count the blob's size, then add this blob by this + // file (path) so we don't double-count it + stats.TotalSize += uint64(blobSize) + stats.fileBlobs[nodePath].Insert(blobID) - // this mode also counts total unique blob _references_ per file - stats.TotalBlobCount++ + // this mode also counts total unique blob _references_ per file + stats.TotalBlobCount++ + } } } } } - } - if countMode == countModeRestoreSize { - // as this is a file in the snapshot, we can simply count its - // size without worrying about uniqueness, since duplicate files - // will still be restored - stats.TotalSize += node.Size - stats.TotalFileCount++ - } + if countMode == countModeRestoreSize { + // as this is a file in the snapshot, we can simply count its + // size without worrying about uniqueness, since duplicate 
files + // will still be restored + stats.TotalSize += node.Size + stats.TotalFileCount++ + } - return true, nil + return true, nil + } } // makeFileIDByContents returns a hash of the blob IDs of the From f6b2731aa580759893d52330f85317237f18511e Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Fri, 27 Jul 2018 15:34:37 -0600 Subject: [PATCH 8/8] stats: Add manual doc, improve -h doc Also rename files-by-content to files-by-contents, once and for all --- cmd/restic/cmd_stats.go | 36 +++++++++++++++------ doc/manual_rest.rst | 71 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 621ff0880..970ee1291 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -21,6 +21,25 @@ The "stats" command walks one or all snapshots in a repository and accumulates statistics about the data stored therein. It reports on the number of unique files and their sizes, according to one of the counting modes as given by the --mode flag. + +If no snapshot is specified, all snapshots will be considered. Some +modes make more sense over just a single snapshot, while others +are useful across all snapshots, depending on what you are trying +to calculate. + +The modes are: + + restore-size: (default) Counts the size of the restored files. + + files-by-contents: Counts total size of files, where a file is + considered unique if it has unique contents. + + raw-data: Counts the size of blobs in the repository, regardless + of how many files reference them. + + blobs-per-file: A combination of files-by-contents and raw-data. + +Refer to the online manual for more details about each mode. `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { @@ -31,7 +50,7 @@ the counting modes as given by the --mode flag. 
func init() { cmdRoot.AddCommand(cmdStats) f := cmdStats.Flags() - f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-content, blobs-per-file, or raw-data") + f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data") f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") } @@ -163,14 +182,14 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun return true, nil } - if countMode == countModeUniqueFilesByContent || countMode == countModeBlobsPerFile { + if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile { // only count this file if we haven't visited it before fid := makeFileIDByContents(node) if _, ok := stats.uniqueFiles[fid]; !ok { // mark the file as visited stats.uniqueFiles[fid] = struct{}{} - if countMode == countModeUniqueFilesByContent { + if countMode == countModeUniqueFilesByContents { // simply count the size of each unique file (unique by contents only) stats.TotalSize += node.Size stats.TotalFileCount++ @@ -197,7 +216,6 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun // file (path) so we don't double-count it stats.TotalSize += uint64(blobSize) stats.fileBlobs[nodePath].Insert(blobID) - // this mode also counts total unique blob _references_ per file stats.TotalBlobCount++ } @@ -232,7 +250,7 @@ func verifyStatsInput(gopts GlobalOptions, args []string) error { // require a recognized counting mode switch countMode { case countModeRestoreSize: - case countModeUniqueFilesByContent: + case countModeUniqueFilesByContents: case countModeBlobsPerFile: case countModeRawData: default: @@ -289,8 +307,8 @@ var ( ) const ( - countModeRestoreSize = "restore-size" - countModeUniqueFilesByContent = "files-by-content" - countModeBlobsPerFile = "blobs-per-file" - countModeRawData = "raw-data" + 
countModeRestoreSize = "restore-size" + countModeUniqueFilesByContents = "files-by-contents" + countModeBlobsPerFile = "blobs-per-file" + countModeRawData = "raw-data" ) diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst index 40540e84f..94173644b 100644 --- a/doc/manual_rest.rst +++ b/doc/manual_rest.rst @@ -36,6 +36,7 @@ Usage help is available: rebuild-index Build a new index file restore Extract the data from a snapshot snapshots List all snapshots + stats Count up sizes and show information about repository data tag Modify tags on snapshots unlock Remove locks other processes created version Print version information @@ -236,6 +237,76 @@ The following metadata is handled by restic: - Subtree - ExtendedAttributes + +Getting information about repository data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the ``stats`` command to count up stats about the data in the repository. +There are different counting modes available using the ``--mode`` flag, +depending on what you want to calculate. The default is the restore size, or +the size required to restore the files: + +- ``restore-size`` (default) counts the size of the restored files. +- ``files-by-contents`` counts the total size of unique files as given by their + contents. This can be useful since a file is considered unique only if it has + unique contents. Keep in mind that a small change to a large file (even when the + file name/path hasn't changed) will cause them to look like different files, thus + essentially causing the whole size of the file to be counted twice. +- ``raw-data`` counts the size of the blobs in the repository, regardless of how many + files reference them. This tells you how much restic has reduced all your original + data down to (either for a single snapshot or across all your backups), and compared + to the size given by the restore-size mode, can tell you how much deduplication is + helping you. 
+- ``blobs-per-file`` is kind of a mix between files-by-contents and raw-data modes; + it is useful for knowing how much value your backup is providing you in terms of unique + data stored by file. Like files-by-contents, it is resilient to file renames/moves. + Unlike files-by-contents, it does not balloon to high values when large files have + small edits, as long as the file path stayed the same. Unlike raw-data, this mode + DOES consider how many files point to each blob such that the more files a blob is + referenced by, the more it counts toward the size. + +For example, to calculate how much space would be +required to restore the latest snapshot (from any host that made it): + +.. code-block:: console + + $ restic stats latest + password is correct + Total File Count: 10538 + Total Size: 37.824 GiB + +If multiple hosts are backing up to the repository, the latest snapshot may not +be the one you want. You can specify the latest snapshot from only a specific +host by using the ``--host`` flag: + +.. code-block:: console + + $ restic stats --host myserver latest + password is correct + Total File Count: 21766 + Total Size: 481.783 GiB + +There we see that it would take 482 GiB of disk space to restore the latest +snapshot from "myserver". + +But how much space does that snapshot take on disk? In other words, how much +has restic's deduplication helped? We can check: + +.. code-block:: console + + $ restic stats --host myserver --mode raw-data latest + password is correct + Total Blob Count: 340847 + Total Size: 458.663 GiB + +Comparing this size to the previous command, we see that restic has saved +about 23 GiB of space with deduplication. + +Which mode you use depends on your exact use case. Some modes are more useful +across all snapshots, while others make more sense on just a single snapshot, +depending on what you're trying to calculate. + + Scripting ---------