Merge pull request #1729 from mholt/stats

Implement `restic stats` command to get more info about a repository
2025-01-22 14:48:24 +00:00 · 2018-07-31 23:24:36 +02:00 · 2018-07-31 23:24:36 +02:00 · 3422c1ca83
commit 3422c1ca83
parent 01aacf41b5 f6b2731aa5
4 changed files with 390 additions and 0 deletions
--- a/changelog/unreleased/pull-1729
+++ b/changelog/unreleased/pull-1729
@ -0,0 +1,4 @@
+Enhancement: Add stats command to get information about a repository
+
+https://github.com/restic/restic/issues/874
+https://github.com/restic/restic/pull/1729
--- a/cmd/restic/cmd_stats.go
+++ b/cmd/restic/cmd_stats.go
@ -0,0 +1,314 @@
+package main
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/restic/restic/internal/restic"
+	"github.com/restic/restic/internal/walker"
+	"github.com/spf13/cobra"
+)
+
+// cmdStats is the cobra command definition for "restic stats". It accepts
+// at most one positional argument (a snapshot ID, or "latest") and delegates
+// all work to runStats using the global options.
+var cmdStats = &cobra.Command{
+	Use:   "stats [flags] [snapshot-ID]",
+	Short: "Scan the repository and show basic statistics",
+	// NOTE: continuation lines inside the mode list are aligned with
+	// spaces only; tabs render at unpredictable widths in a terminal.
+	Long: `
+The "stats" command walks one or all snapshots in a repository and
+accumulates statistics about the data stored therein. It reports on
+the number of unique files and their sizes, according to one of
+the counting modes as given by the --mode flag.
+
+If no snapshot is specified, all snapshots will be considered. Some
+modes make more sense over just a single snapshot, while others
+are useful across all snapshots, depending on what you are trying
+to calculate.
+
+The modes are:
+
+  restore-size: (default) Counts the size of the restored files.
+
+  files-by-contents: Counts total size of files, where a file is
+                     considered unique if it has unique contents.
+
+  raw-data: Counts the size of blobs in the repository, regardless
+            of how many files reference them.
+
+  blobs-per-file: A combination of files-by-contents and raw-data.
+
+Refer to the online manual for more details about each mode.
+`,
+	DisableAutoGenTag: true,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		return runStats(globalOptions, args)
+	},
+}
+
+// init declares the flags of the stats command (--mode and --host) and
+// attaches the command to the root command.
+func init() {
+	flags := cmdStats.Flags()
+	flags.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data")
+	flags.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
+
+	cmdRoot.AddCommand(cmdStats)
+}
+
+// runStats implements the "stats" command.
+//
+// It validates the input, opens the repository, loads its index and (unless
+// gopts.NoLock is set) locks it. It then either walks the single snapshot
+// named by the positional argument ("latest" is resolved with the optional
+// --host filter) or, when no snapshot was given, walks every snapshot in the
+// repository. The accumulated totals are written as JSON when gopts.JSON is
+// set, otherwise as a short human-readable summary.
+//
+// Any error from validation, repository access, snapshot loading or the walk
+// itself is returned to the caller.
+func runStats(gopts GlobalOptions, args []string) error {
+	err := verifyStatsInput(gopts, args)
+	if err != nil {
+		return err
+	}
+
+	ctx, cancel := context.WithCancel(gopts.ctx)
+	defer cancel()
+
+	repo, err := OpenRepository(gopts)
+	if err != nil {
+		return err
+	}
+
+	if err = repo.LoadIndex(ctx); err != nil {
+		return err
+	}
+
+	if !gopts.NoLock {
+		lock, err := lockRepo(repo)
+		defer unlockRepo(lock)
+		if err != nil {
+			return err
+		}
+	}
+
+	// create a container for the stats (and other needed state)
+	stats := &statsContainer{
+		uniqueFiles: make(map[fileID]struct{}),
+		fileBlobs:   make(map[string]restic.IDSet),
+		blobs:       restic.NewBlobSet(),
+		blobsSeen:   restic.NewBlobSet(),
+	}
+
+	if snapshotIDString != "" {
+		// scan just a single snapshot
+
+		var sID restic.ID
+		if snapshotIDString == "latest" {
+			sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
+			if err != nil {
+				// NOTE(review): Exitf presumably terminates the process here,
+				// which would skip the deferred cancel/unlock above - confirm
+				// that the exit path releases the repository lock.
+				Exitf(1, "latest snapshot for criteria not found: %v", err)
+			}
+		} else {
+			sID, err = restic.FindSnapshot(repo, snapshotIDString)
+			if err != nil {
+				return err
+			}
+		}
+
+		// Use a distinct name for the load error: declaring "err" with :=
+		// here would shadow the function-level err, and the result of the
+		// walk below would then never reach the check after this if/else.
+		snapshot, loadErr := restic.LoadSnapshot(ctx, repo, sID)
+		if loadErr != nil {
+			return loadErr
+		}
+
+		err = statsWalkSnapshot(ctx, snapshot, repo, stats)
+	} else {
+		// iterate every snapshot in the repo
+		err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
+			snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
+			if err != nil {
+				return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
+			}
+			return statsWalkSnapshot(ctx, snapshot, repo, stats)
+		})
+	}
+	if err != nil {
+		return err
+	}
+
+	if countMode == countModeRawData {
+		// the blob handles have been collected, but not yet counted
+		for blobHandle := range stats.blobs {
+			blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
+			if !found {
+				return fmt.Errorf("blob %v not found", blobHandle)
+			}
+			stats.TotalSize += uint64(blobSize)
+			stats.TotalBlobCount++
+		}
+	}
+
+	if gopts.JSON {
+		err = json.NewEncoder(os.Stdout).Encode(stats)
+		if err != nil {
+			return fmt.Errorf("encoding output: %v", err)
+		}
+		return nil
+	}
+
+	// counts that stayed at zero are not relevant to the chosen mode
+	// and are left out of the summary
+	if stats.TotalBlobCount > 0 {
+		Printf("  Total Blob Count:   %d\n", stats.TotalBlobCount)
+	}
+	if stats.TotalFileCount > 0 {
+		Printf("  Total File Count:   %d\n", stats.TotalFileCount)
+	}
+	Printf("        Total Size:   %-5s\n", formatBytes(stats.TotalSize))
+
+	return nil
+}
+
+// statsWalkSnapshot accumulates the statistics of a single snapshot into
+// stats, according to the package-level countMode.
+//
+// A snapshot without a tree is rejected with an error. In raw-data mode the
+// blob handles are gathered by restic.FindUsedBlobs into stats.blobs and
+// stats.blobsSeen; in every other mode the snapshot's tree is traversed with
+// walker.Walk using the callback built by statsWalkTree.
+func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
+	root := snapshot.Tree
+	if root == nil {
+		return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
+	}
+
+	// raw-data only needs the set of referenced blobs, so no tree walk
+	// of our own is required
+	if countMode == countModeRawData {
+		return restic.FindUsedBlobs(ctx, repo, *root, stats.blobs, stats.blobsSeen)
+	}
+
+	if walkErr := walker.Walk(ctx, repo, *root, restic.NewIDSet(), statsWalkTree(repo, stats)); walkErr != nil {
+		return fmt.Errorf("walking tree %s: %v", *root, walkErr)
+	}
+	return nil
+}
+
+// statsWalkTree returns the walker callback that updates stats for every
+// node visited, according to the package-level countMode:
+//
+//   - restore-size: every node adds its Size to TotalSize and increments
+//     TotalFileCount.
+//   - files-by-contents: a node is counted (Size and file count) only the
+//     first time its content hash (see makeFileIDByContents) is seen.
+//   - blobs-per-file: for a node whose content hash has not been seen yet,
+//     every blob not already recorded for that node's path adds its size to
+//     TotalSize and increments TotalBlobCount; the path is counted as a file
+//     the first time it is recorded.
+//
+// A non-nil nodeErr is passed straight back to the walker, and a nil node is
+// ignored. A content blob whose size cannot be looked up in repo is reported
+// as an error.
+func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFunc {
+	return func(npath string, node *restic.Node, nodeErr error) (bool, error) {
+		if nodeErr != nil {
+			return true, nodeErr
+		}
+		if node == nil {
+			return true, nil
+		}
+
+		if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile {
+			// only count this file if we haven't visited it before
+			fid := makeFileIDByContents(node)
+			if _, ok := stats.uniqueFiles[fid]; !ok {
+				// mark the file as visited
+				stats.uniqueFiles[fid] = struct{}{}
+
+				if countMode == countModeUniqueFilesByContents {
+					// simply count the size of each unique file (unique by contents only)
+					stats.TotalSize += node.Size
+					stats.TotalFileCount++
+				}
+				if countMode == countModeBlobsPerFile {
+					// in this mode, a file is unique by both contents and
+					// path; the path does not change per blob, so build it once
+					nodePath := filepath.Join(npath, node.Name)
+
+					// count the size of each unique blob reference, which is
+					// by unique file (unique by contents and file path)
+					for _, blobID := range node.Content {
+						// ensure we have this file (by path) in our map; this
+						// stays inside the loop so that a node without any
+						// content blobs is not counted as a file
+						if _, ok := stats.fileBlobs[nodePath]; !ok {
+							stats.fileBlobs[nodePath] = restic.NewIDSet()
+							stats.TotalFileCount++
+						}
+						if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
+							// is always a data blob since we're accessing it via a file's Content array
+							blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
+							if !found {
+								// report the file path; dereferencing
+								// node.Subtree here would panic whenever
+								// the node has no subtree
+								return true, fmt.Errorf("blob %s not found for file %s", blobID, nodePath)
+							}
+
+							// count the blob's size, then add this blob by this
+							// file (path) so we don't double-count it
+							stats.TotalSize += uint64(blobSize)
+							stats.fileBlobs[nodePath].Insert(blobID)
+							// this mode also counts total unique blob _references_ per file
+							stats.TotalBlobCount++
+						}
+					}
+				}
+			}
+		}
+
+		if countMode == countModeRestoreSize {
+			// as this is a file in the snapshot, we can simply count its
+			// size without worrying about uniqueness, since duplicate files
+			// will still be restored
+			stats.TotalSize += node.Size
+			stats.TotalFileCount++
+		}
+
+		return true, nil
+	}
+}
+
+// makeFileIDByContents derives a fileID for node by hashing (SHA-256) the
+// IDs listed in node.Content, in order. Nodes with the same sequence of
+// content IDs therefore get the same fileID; a node with no content gets
+// the hash of the empty input.
+func makeFileIDByContents(node *restic.Node) fileID {
+	hasher := sha256.New()
+	for i := range node.Content {
+		// feeding the IDs one by one hashes the same byte stream as
+		// hashing their concatenation
+		hasher.Write(node.Content[i][:])
+	}
+
+	var id fileID
+	copy(id[:], hasher.Sum(nil))
+	return id
+}
+
+// verifyStatsInput validates the user's input for the stats command.
+//
+// It returns an error when countMode is not one of the known counting modes
+// or when more than one positional argument is given. When exactly one
+// argument is given, it is stored in snapshotIDString as the snapshot to
+// scan. The mode is checked before the argument count.
+func verifyStatsInput(gopts GlobalOptions, args []string) error {
+	// require a recognized counting mode
+	switch countMode {
+	case countModeRestoreSize,
+		countModeUniqueFilesByContents,
+		countModeBlobsPerFile,
+		countModeRawData:
+		// supported
+	default:
+		return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
+	}
+
+	switch len(args) {
+	case 0:
+		// no snapshot given; nothing to record
+	case 1:
+		// a snapshot was specified, mark it as the one to scan
+		snapshotIDString = args[0]
+	default:
+		// at most one snapshot is allowed
+		return fmt.Errorf("only one snapshot may be specified")
+	}
+
+	return nil
+}
+
+// statsContainer carries the totals reported by the stats command
+// together with the bookkeeping state needed while snapshots are
+// being walked. Only the exported totals are encoded as JSON.
+type statsContainer struct {
+	// TotalSize is the accumulated size in bytes for the chosen mode.
+	TotalSize uint64 `json:"total_size"`
+	// TotalFileCount is the number of files counted for the chosen mode.
+	TotalFileCount uint64 `json:"total_file_count"`
+	// TotalBlobCount is the number of blobs counted; it is omitted
+	// from the JSON output when zero.
+	TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
+
+	// uniqueFiles records which files have been visited, keyed by
+	// the hash of their sequence of content blob IDs.
+	uniqueFiles map[fileID]struct{}
+
+	// fileBlobs maps a file path to the set of blobs already
+	// counted as part of that file.
+	fileBlobs map[string]restic.IDSet
+
+	// blobs collects the individual blob handles that are counted
+	// in raw-data mode, independent of which files reference them.
+	blobs restic.BlobSet
+
+	// blobsSeen is handed to restic.FindUsedBlobs alongside blobs.
+	blobsSeen restic.BlobSet
+}
+
+// fileID is a 256-bit hash that distinguishes unique files. It is
+// produced by makeFileIDByContents from a node's content blob IDs and
+// used as the key of statsContainer.uniqueFiles.
+type fileID [32]byte
+
+// Package-level state of the stats command, filled in from the command
+// line before runStats does any work.
+var (
+	// countMode is the mode of counting to perform, as set by the
+	// --mode flag; verifyStatsInput rejects any value that is not
+	// one of the countMode* constants.
+	countMode string
+
+	// snapshotIDString is the snapshot to scan, as given by the user
+	// as the positional argument (set by verifyStatsInput). When it
+	// is empty, every snapshot in the repository is scanned.
+	snapshotIDString string
+
+	// snapshotByHost is the host to filter latest
+	// snapshot by, if given by user (--host flag); it is only
+	// consulted when the snapshot argument is "latest".
+	snapshotByHost string
+)
+
+// Accepted values of the --mode flag.
+const (
+	// countModeRestoreSize adds up the size of every node walked,
+	// without regard to uniqueness; this is the flag's default.
+	countModeRestoreSize           = "restore-size"
+	// countModeUniqueFilesByContents counts each file once per
+	// distinct sequence of content blobs.
+	countModeUniqueFilesByContents = "files-by-contents"
+	// countModeBlobsPerFile counts each blob once per file path for
+	// files whose contents have not been seen before.
+	countModeBlobsPerFile          = "blobs-per-file"
+	// countModeRawData counts the blobs referenced by the scanned
+	// snapshots, each blob once.
+	countModeRawData               = "raw-data"
+)
--- a/doc/bash-completion.sh
+++ b/doc/bash-completion.sh
@ -1310,6 +1310,7 @@ _restic_root_command()
    commands+=("rebuild-index")
    commands+=("restore")
    commands+=("snapshots")
+    commands+=("stats")
    commands+=("tag")
    commands+=("unlock")
    commands+=("version")
--- a/doc/manual_rest.rst
+++ b/doc/manual_rest.rst
@ -36,6 +36,7 @@ Usage help is available:
      rebuild-index Build a new index file
      restore       Extract the data from a snapshot
      snapshots     List all snapshots
+      stats         Scan the repository and show basic statistics
      tag           Modify tags on snapshots
      unlock        Remove locks other processes created
      version       Print version information
@ -236,6 +237,76 @@ The following metadata is handled by restic:
 - Subtree
 - ExtendedAttributes

+
+Getting information about repository data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the ``stats`` command to count up stats about the data in the repository.
+There are different counting modes available using the ``--mode`` flag,
+depending on what you want to calculate. The default is the restore size, or
+the size required to restore the files:
+
+-  ``restore-size`` (default) counts the size of the restored files.
+-  ``files-by-contents`` counts the total size of unique files as given by their
+   contents. This can be useful since a file is considered unique only if it has
+   unique contents. Keep in mind that a small change to a large file (even when the
+   file name/path hasn't changed) will cause them to look like different files, thus
+   essentially causing the whole size of the file to be counted twice.
+-  ``raw-data`` counts the size of the blobs in the repository, regardless of how many
+   files reference them. This tells you the size your original data has been reduced
+   to (either for a single snapshot or across all your backups). Comparing it with
+   the size given by the restore-size mode shows how much deduplication is
+   helping you.
+-  ``blobs-per-file`` is kind of a mix between files-by-contents and raw-data modes;
+   it is useful for knowing how much value your backup is providing you in terms of unique
+   data stored by file. Like files-by-contents, it is resilient to file renames/moves.
+   Unlike files-by-contents, it does not balloon to high values when large files have
+   small edits, as long as the file path stayed the same. Unlike raw-data, this mode
+   DOES consider how many files point to each blob such that the more files a blob is
+   referenced by, the more it counts toward the size.
+
+For example, to calculate how much space would be
+required to restore the latest snapshot (from any host that made it):
+
+.. code-block:: console
+
+    $ restic stats latest
+    password is correct
+    Total File Count:   10538
+          Total Size:   37.824 GiB
+
+If multiple hosts are backing up to the repository, the latest snapshot may not
+be the one you want. You can specify the latest snapshot from only a specific
+host by using the ``--host`` flag:
+
+.. code-block:: console
+
+    $ restic stats --host myserver latest
+    password is correct
+    Total File Count:   21766
+          Total Size:   481.783 GiB
+
+There we see that it would take 482 GiB of disk space to restore the latest
+snapshot from "myserver".
+
+But how much space does that snapshot take on disk? In other words, how much
+has restic's deduplication helped? We can check:
+
+.. code-block:: console
+
+    $ restic stats --host myserver --mode raw-data latest
+    password is correct
+    Total Blob Count:   340847
+          Total Size:   458.663 GiB
+
+Comparing this size to the previous command, we see that restic has saved
+about 23 GiB of space with deduplication.
+
+Which mode you use depends on your exact use case. Some modes are more useful
+across all snapshots, while others make more sense on just a single snapshot,
+depending on what you're trying to calculate.
+
+
 Scripting
 ---------