diff --git a/changelog/unreleased/pull-1729 b/changelog/unreleased/pull-1729
new file mode 100644
index 000000000..62e982e95
--- /dev/null
+++ b/changelog/unreleased/pull-1729
@@ -0,0 +1,4 @@
+Enhancement: Add stats command to get information about a repository
+
+https://github.com/restic/restic/issues/874
+https://github.com/restic/restic/pull/1729
diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go
new file mode 100644
index 000000000..970ee1291
--- /dev/null
+++ b/cmd/restic/cmd_stats.go
@@ -0,0 +1,346 @@
+package main
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/restic/restic/internal/restic"
+	"github.com/restic/restic/internal/walker"
+	"github.com/spf13/cobra"
+)
+
+// cmdStats defines the "stats" subcommand; the actual work is done by runStats.
+var cmdStats = &cobra.Command{
+	Use:   "stats [flags] [snapshot-ID]",
+	Short: "Scan the repository and show basic statistics",
+	Long: `
+The "stats" command walks one or all snapshots in a repository and
+accumulates statistics about the data stored therein. It reports on
+the number of unique files and their sizes, according to one of
+the counting modes as given by the --mode flag.
+
+If no snapshot is specified, all snapshots will be considered. Some
+modes make more sense over just a single snapshot, while others
+are useful across all snapshots, depending on what you are trying
+to calculate.
+
+The modes are:
+
+  restore-size: (default) Counts the size of the restored files.
+
+  files-by-contents: Counts total size of files, where a file is
+     considered unique if it has unique contents.
+
+  raw-data: Counts the size of blobs in the repository, regardless
+     of how many files reference them.
+
+  blobs-per-file: A combination of files-by-contents and raw-data.
+
+Refer to the online manual for more details about each mode.
+`,
+	DisableAutoGenTag: true,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		return runStats(globalOptions, args)
+	},
+}
+
+// init registers the stats command with the root command and declares its
+// --mode and --host flags.
+func init() {
+	cmdRoot.AddCommand(cmdStats)
+	f := cmdStats.Flags()
+	f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data")
+	f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
+}
+
+// runStats implements the stats command. It validates the input, opens the
+// repository and loads its index, walks either the requested snapshot or
+// every snapshot in the repository, and finally prints the accumulated
+// totals as text, or as JSON when gopts.JSON is set.
+func runStats(gopts GlobalOptions, args []string) error {
+	err := verifyStatsInput(gopts, args)
+	if err != nil {
+		return err
+	}
+
+	ctx, cancel := context.WithCancel(gopts.ctx)
+	defer cancel()
+
+	repo, err := OpenRepository(gopts)
+	if err != nil {
+		return err
+	}
+
+	if err = repo.LoadIndex(ctx); err != nil {
+		return err
+	}
+
+	if !gopts.NoLock {
+		// NOTE(review): the unlock is deferred before the error check, which
+		// presumes unlockRepo tolerates the lock of a failed lockRepo call.
+		lock, err := lockRepo(repo)
+		defer unlockRepo(lock)
+		if err != nil {
+			return err
+		}
+	}
+
+	// create a container for the stats (and other needed state)
+	stats := &statsContainer{
+		uniqueFiles: make(map[fileID]struct{}),
+		fileBlobs:   make(map[string]restic.IDSet),
+		blobs:       restic.NewBlobSet(),
+		blobsSeen:   restic.NewBlobSet(),
+	}
+
+	if snapshotIDString != "" {
+		// scan just a single snapshot
+
+		var sID restic.ID
+		if snapshotIDString == "latest" {
+			sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
+			if err != nil {
+				Exitf(1, "latest snapshot for criteria not found: %v", err)
+			}
+		} else {
+			sID, err = restic.FindSnapshot(repo, snapshotIDString)
+			if err != nil {
+				return err
+			}
+		}
+
+		snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
+		if err != nil {
+			return err
+		}
+
+		// The ":=" above declares an err local to this block, so a walk
+		// error has to be returned right here: the check after the
+		// if/else only sees the function-level err.
+		if err = statsWalkSnapshot(ctx, snapshot, repo, stats); err != nil {
+			return err
+		}
+	} else {
+		// iterate every snapshot in the repo
+		err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
+			snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
+			if err != nil {
+				return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
+			}
+			return statsWalkSnapshot(ctx, snapshot, repo, stats)
+		})
+	}
+	if err != nil {
+		return err
+	}
+
+	if countMode == countModeRawData {
+		// the blob handles have been collected, but not yet counted
+		for blobHandle := range stats.blobs {
+			blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
+			if !found {
+				return fmt.Errorf("blob %v not found", blobHandle)
+			}
+			stats.TotalSize += uint64(blobSize)
+			stats.TotalBlobCount++
+		}
+	}
+
+	if gopts.JSON {
+		err = json.NewEncoder(os.Stdout).Encode(stats)
+		if err != nil {
+			return fmt.Errorf("encoding output: %v", err)
+		}
+		return nil
+	}
+
+	if stats.TotalBlobCount > 0 {
+		Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
+	}
+	if stats.TotalFileCount > 0 {
+		Printf(" Total File Count: %d\n", stats.TotalFileCount)
+	}
+	Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
+
+	return nil
+}
+
+// statsWalkSnapshot adds the contents of one snapshot to stats. In raw-data
+// mode it only collects the blobs the snapshot uses; in every other mode it
+// walks the snapshot's tree and counts nodes with statsWalkTree.
+func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
+	if snapshot.Tree == nil {
+		return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
+	}
+
+	if countMode == countModeRawData {
+		// count just the sizes of unique blobs; we don't need to walk the tree
+		// ourselves in this case, since a nifty function does it for us
+		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
+	}
+
+	err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats))
+	if err != nil {
+		return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
+	}
+	return nil
+}
+
+// statsWalkTree returns the walker callback that accounts for a single node
+// in stats according to the selected counting mode. A node error is returned
+// to the walker unchanged and a nil node is not counted.
+func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFunc {
+	return func(npath string, node *restic.Node, nodeErr error) (bool, error) {
+		if nodeErr != nil {
+			return true, nodeErr
+		}
+		if node == nil {
+			return true, nil
+		}
+
+		if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile {
+			// only count this file if we haven't visited it before
+			fid := makeFileIDByContents(node)
+			if _, ok := stats.uniqueFiles[fid]; !ok {
+				// mark the file as visited
+				stats.uniqueFiles[fid] = struct{}{}
+
+				if countMode == countModeUniqueFilesByContents {
+					// simply count the size of each unique file (unique by contents only)
+					stats.TotalSize += node.Size
+					stats.TotalFileCount++
+				}
+				if countMode == countModeBlobsPerFile {
+					// count the size of each unique blob reference, which is
+					// by unique file (unique by contents and file path)
+					nodePath := filepath.Join(npath, node.Name)
+					for _, blobID := range node.Content {
+						// ensure we have this file (by path) in our map; in this
+						// mode, a file is unique by both contents and path
+						if _, ok := stats.fileBlobs[nodePath]; !ok {
+							stats.fileBlobs[nodePath] = restic.NewIDSet()
+							stats.TotalFileCount++
+						}
+						if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
+							// is always a data blob since we're accessing it via a file's Content array
+							blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
+							if !found {
+								// report the file path: node.Subtree is a
+								// pointer that is presumably nil for nodes that
+								// have Content, so dereferencing it could panic
+								return true, fmt.Errorf("blob %s not found for file %s", blobID, nodePath)
+							}
+
+							// count the blob's size, then add this blob by this
+							// file (path) so we don't double-count it
+							stats.TotalSize += uint64(blobSize)
+							stats.fileBlobs[nodePath].Insert(blobID)
+							// this mode also counts total unique blob _references_ per file
+							stats.TotalBlobCount++
+						}
+					}
+				}
+			}
+		}
+
+		if countMode == countModeRestoreSize {
+			// as this is a file in the snapshot, we can simply count its
+			// size without worrying about uniqueness, since duplicate files
+			// will still be restored
+			stats.TotalSize += node.Size
+			stats.TotalFileCount++
+		}
+
+		return true, nil
+	}
+}
+
+// makeFileIDByContents returns a hash of the blob IDs of the
+// node's Content in sequence.
+func makeFileIDByContents(node *restic.Node) fileID {
+	// feed the IDs to the hash one at a time instead of first
+	// concatenating them into a temporary buffer
+	h := sha256.New()
+	for _, c := range node.Content {
+		// hash.Hash documents that Write never returns an error
+		_, _ = h.Write(c[:])
+	}
+
+	var fid fileID
+	copy(fid[:], h.Sum(nil))
+	return fid
+}
+
+// verifyStatsInput checks that countMode names a supported counting mode and
+// that at most one snapshot was given. If one was given, it is stored in
+// snapshotIDString for runStats to pick up.
+func verifyStatsInput(gopts GlobalOptions, args []string) error {
+	// require a recognized counting mode
+	switch countMode {
+	case countModeRestoreSize:
+	case countModeUniqueFilesByContents:
+	case countModeBlobsPerFile:
+	case countModeRawData:
+	default:
+		return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
+	}
+
+	// ensure at most one snapshot was specified
+	if len(args) > 1 {
+		return fmt.Errorf("only one snapshot may be specified")
+	}
+
+	// if a snapshot was specified, mark it as the one to scan
+	if len(args) == 1 {
+		snapshotIDString = args[0]
+	}
+
+	return nil
+}
+
+// statsContainer holds information during a walk of a repository
+// to collect information about it, as well as state needed
+// for a successful and efficient walk.
+type statsContainer struct {
+	TotalSize      uint64 `json:"total_size"`
+	TotalFileCount uint64 `json:"total_file_count"`
+	TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
+
+	// uniqueFiles marks visited files according to their
+	// contents (hashed sequence of content blob IDs)
+	uniqueFiles map[fileID]struct{}
+
+	// fileBlobs maps a file name (path) to the set of
+	// blobs that have been seen as a part of the file
+	fileBlobs map[string]restic.IDSet
+
+	// blobs and blobsSeen are used to count individual
+	// unique blobs, independent of references to files
+	blobs, blobsSeen restic.BlobSet
+}
+
+// fileID is a 256-bit hash that distinguishes unique files.
+type fileID [32]byte
+
+var (
+	// the mode of counting to perform
+	countMode string
+
+	// the snapshot to scan, as given by the user
+	snapshotIDString string
+
+	// snapshotByHost is the host to filter latest
+	// snapshot by, if given by user
+	snapshotByHost string
+)
+
+const (
+	countModeRestoreSize           = "restore-size"
+	countModeUniqueFilesByContents = "files-by-contents"
+	countModeBlobsPerFile          = "blobs-per-file"
+	countModeRawData               = "raw-data"
+)
diff --git a/doc/bash-completion.sh b/doc/bash-completion.sh
index 5203f5368..d1685e566 100644
--- a/doc/bash-completion.sh
+++ b/doc/bash-completion.sh
@@ -1310,6 +1310,7 @@ _restic_root_command()
     commands+=("rebuild-index")
     commands+=("restore")
     commands+=("snapshots")
+    commands+=("stats")
     commands+=("tag")
     commands+=("unlock")
     commands+=("version")
diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst
index 40540e84f..94173644b 100644
--- a/doc/manual_rest.rst
+++ b/doc/manual_rest.rst
@@ -36,6 +36,7 @@ Usage help is available:
       rebuild-index Build a new index file
       restore       Extract the data from a snapshot
       snapshots     List all snapshots
+      stats         Scan the repository and show basic statistics
       tag           Modify tags on snapshots
       unlock        Remove locks other processes created
       version       Print version information
@@ -236,6 +237,76 @@ The following metadata is handled by restic:
 - Subtree
 - ExtendedAttributes
 
+
+Getting information about repository data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the ``stats`` command to count up stats about the data in the repository.
+There are different counting modes available using the ``--mode`` flag,
+depending on what you want to calculate. The default is the restore size, or
+the size required to restore the files:
+
+- ``restore-size`` (default) counts the size of the restored files.
+- ``files-by-contents`` counts the total size of unique files as given by their
+  contents. This can be useful since a file is considered unique only if it has unique contents.
+  Keep in mind that a small change to a large file (even when the
+  file name/path hasn't changed) will make the two versions look like different files, thus
+  essentially causing the whole size of the file to be counted twice.
+- ``raw-data`` counts the size of the blobs in the repository, regardless of how many
+  files reference them. This tells you the size that restic has reduced all your original
+  data down to (either for a single snapshot or across all your backups), and compared
+  to the size given by the restore-size mode, can tell you how much deduplication is
+  helping you.
+- ``blobs-per-file`` is a mix of the files-by-contents and raw-data modes;
+  it is useful for knowing how much value your backup is providing you in terms of unique
+  data stored by file. Like files-by-contents, it is resilient to file renames/moves.
+  Unlike files-by-contents, it does not balloon to high values when large files have
+  small edits, as long as the file path stayed the same. Unlike raw-data, this mode
+  DOES consider how many files point to each blob such that the more files a blob is
+  referenced by, the more it counts toward the size.
+
+For example, to calculate how much space would be
+required to restore the latest snapshot (regardless of which host made it):
+
+.. code-block:: console
+
+    $ restic stats latest
+    password is correct
+    Total File Count: 10538
+    Total Size: 37.824 GiB
+
+If multiple hosts are backing up to the repository, the latest snapshot may not
+be the one you want. You can specify the latest snapshot from only a specific
+host by using the ``--host`` flag:
+
+.. code-block:: console
+
+    $ restic stats --host myserver latest
+    password is correct
+    Total File Count: 21766
+    Total Size: 481.783 GiB
+
+Here we see that it would take 482 GiB of disk space to restore the latest
+snapshot from "myserver".
+
+But how much space does that snapshot take on disk? In other words, how much
+has restic's deduplication helped? We can check:
+
+.. code-block:: console
+
+    $ restic stats --host myserver --mode raw-data latest
+    password is correct
+    Total Blob Count: 340847
+    Total Size: 458.663 GiB
+
+Comparing this size to that of the previous command, we see that restic has saved
+about 23 GiB of space with deduplication.
+
+Which mode you use depends on your exact use case. Some modes are more useful
+across all snapshots, while others make more sense on just a single snapshot,
+depending on what you're trying to calculate.
+
+
 Scripting
 ---------
 