restic/cmd/restic/cmd_stats.go

package main

import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"

	"github.com/restic/restic/internal/restic"
	"github.com/spf13/cobra"
)

var cmdStats = &cobra.Command{
	Use:   "stats",
	Short: "Scan the repository and show basic statistics",
	Long: `
The "stats" command walks one or all snapshots in a repository and
accumulates statistics about the data stored therein. It reports on
the number of unique files and their sizes, according to one of
the counting modes as given by a flag.
`,
	DisableAutoGenTag: true,
	RunE: func(cmd *cobra.Command, args []string) error {
		return runStats(globalOptions, args)
	},
}

var countModeFlag []string

func init() {
	cmdRoot.AddCommand(cmdStats)

	f := cmdStats.Flags()
	f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)")
	f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents")
	f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename")
	f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them")
	f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
}

func runStats(gopts GlobalOptions, args []string) error {
	err := verifyStatsInput(gopts, args)
	if err != nil {
		return err
	}

	ctx, cancel := context.WithCancel(gopts.ctx)
	defer cancel()

	repo, err := OpenRepository(gopts)
	if err != nil {
		return err
	}

	if err = repo.LoadIndex(ctx); err != nil {
		return err
	}

	if !gopts.NoLock {
		lock, err := lockRepo(repo)
		defer unlockRepo(lock)
		if err != nil {
			return err
		}
	}

	// create a container for the stats (and other needed state)
	stats := &statsContainer{
		uniqueFiles: make(map[fileID]struct{}),
		idSet:       make(restic.IDSet),
		fileBlobs:   make(map[string]restic.IDSet),
		blobs:       restic.NewBlobSet(),
		blobsSeen:   restic.NewBlobSet(),
	}

	if snapshotIDString != "" {
		// scan just a single snapshot

		var sID restic.ID
		if snapshotIDString == "latest" {
			sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
			if err != nil {
				Exitf(1, "latest snapshot for criteria not found: %v", err)
			}
		} else {
			sID, err = restic.FindSnapshot(repo, snapshotIDString)
			if err != nil {
				return err
			}
		}

		snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
		if err != nil {
			return err
		}

		err = statsWalkSnapshot(ctx, snapshot, repo, stats)
	} else {
		// iterate every snapshot in the repo
		err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
			snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
			if err != nil {
				return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
			}
			return statsWalkSnapshot(ctx, snapshot, repo, stats)
		})
	}
	if err != nil {
		return err
	}

	if countModeRawData {
		// the blob handles have been collected, but not yet counted
		for blobHandle := range stats.blobs {
			blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
			if !found {
				return fmt.Errorf("blob %v not found", blobHandle)
			}
			stats.TotalSize += uint64(blobSize)
			stats.TotalBlobCount++
		}
	}

	if gopts.JSON {
		err = json.NewEncoder(os.Stdout).Encode(stats)
		if err != nil {
			return fmt.Errorf("encoding output: %v", err)
		}
		return nil
	}

	if stats.TotalBlobCount > 0 {
		Printf("  Total Blob Count:   %d\n", stats.TotalBlobCount)
	}
	if stats.TotalFileCount > 0 {
		Printf("  Total File Count:   %d\n", stats.TotalFileCount)
	}
	Printf("        Total Size:   %-5s\n", formatBytes(stats.TotalSize))

	return nil
}

func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
	if snapshot.Tree == nil {
		return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
	}

	if countModeRawData {
		// count just the sizes of unique blobs; we don't need to walk the tree
		// ourselves in this case, since a nifty function does it for us
		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
	}

	err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator))
	if err != nil {
		return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
	}
	return nil
}

func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error {
	// don't visit a tree we've already walked
	if stats.idSet.Has(treeID) {
		return nil
	}
	stats.idSet.Insert(treeID)

	tree, err := repo.LoadTree(ctx, treeID)
	if err != nil {
		return fmt.Errorf("loading tree: %v", err)
	}

	for _, node := range tree.Nodes {
		if countModeUniqueFilesByContent || countModeBlobsPerFile {
			// only count this file if we haven't visited it before
			fid := makeFileIDByContents(node)
			if _, ok := stats.uniqueFiles[fid]; !ok {
				// mark the file as visited
				stats.uniqueFiles[fid] = struct{}{}

				if countModeUniqueFilesByContent {
					// simply count the size of each unique file (unique by contents only)
					stats.TotalSize += node.Size
					stats.TotalFileCount++
				}
				if countModeBlobsPerFile {
					// count the size of each unique blob reference, which is
					// by unique file (unique by contents and file path)
					for _, blobID := range node.Content {
						// ensure we have this file (by path) in our map; in this
						// mode, a file is unique by both contents and path
						nodePath := filepath.Join(fpath, node.Name)
						if _, ok := stats.fileBlobs[nodePath]; !ok {
							stats.fileBlobs[nodePath] = restic.NewIDSet()
							stats.TotalFileCount++
						}
						if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
							// is always a data blob since we're accessing it via a file's Content array
							blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
							if !found {
								return fmt.Errorf("blob %s not found for tree %s", blobID, treeID)
							}

							// count the blob's size, then add this blob by this
							// file (path) so we don't double-count it
							stats.TotalSize += uint64(blobSize)
							stats.fileBlobs[nodePath].Insert(blobID)

							// this mode also counts total unique blob _references_ per file
							stats.TotalBlobCount++
						}
					}
				}
			}
		}

		if countModeRestoreSize {
			// as this is a file in the snapshot, we can simply count its
			// size without worrying about uniqueness, since duplicate files
			// will still be restored
			stats.TotalSize += node.Size
			stats.TotalFileCount++
		}

		// visit subtrees (i.e. directory contents)
		if node.Subtree != nil {
			err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name))
			if err != nil {
				return err
			}
		}
	}

	return nil
}

// makeFileIDByContents returns a hash of the blob IDs of the
// node's Content in sequence.
func makeFileIDByContents(node *restic.Node) fileID {
	var bb []byte
	for _, c := range node.Content {
		bb = append(bb, []byte(c[:])...)
	}
	return sha256.Sum256(bb)
}

func verifyStatsInput(gopts GlobalOptions, args []string) error {
	// ensure only one counting mode was specified, for clarity
	var countModes int
	if countModeRestoreSize {
		countModes++
	}
	if countModeUniqueFilesByContent {
		countModes++
	}
	if countModeBlobsPerFile {
		countModes++
	}
	if countModeRawData {
		countModes++
	}
	if countModes > 1 {
		return fmt.Errorf("only one counting mode may be used")
	}
	// set a default count mode if none were specified
	if countModes == 0 {
		countModeRestoreSize = true
	}
	// ensure one or none snapshots were specified
	if len(args) > 1 {
		return fmt.Errorf("only one snapshot may be specified")
	}
	// set the snapshot to scan, if one was specified
	if len(args) == 1 {
		snapshotIDString = args[0]
	}
	return nil
}

// statsContainer holds information during a walk of a repository
// to collect information about it, as well as state needed
// for a successful and efficient walk.
type statsContainer struct {
	TotalSize      uint64 `json:"total_size"`
	TotalFileCount uint64 `json:"total_file_count"`
	TotalBlobCount uint64 `json:"total_blob_count,omitempty"`

	// idSet marks visited trees, to avoid repeated walks
	idSet restic.IDSet

	// uniqueFiles marks visited files according to their
	// contents (hashed sequence of content blob IDs)
	uniqueFiles map[fileID]struct{}

	// fileBlobs maps a file name (path) to the set of
	// blobs that have been seen as a part of the file
	fileBlobs map[string]restic.IDSet

	// blobs and blobsSeen are used to count indiviudal
	// unique blobs, independent of references to files
	blobs, blobsSeen restic.BlobSet
}

// fileID is a 256-bit hash that distinguishes unique files.
type fileID [32]byte

var (
	countModeRestoreSize          bool
	countModeUniqueFilesByContent bool
	countModeBlobsPerFile         bool
	countModeRawData              bool

	// the snapshot to scan, as given by the user
	snapshotIDString string

	// snapshotByHost is the host to filter latest
	// snapshot by, if given by user
	snapshotByHost string
)
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`package main`

			`import (`
			`"context"`
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00			`"crypto/sha256"`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`"encoding/json"`
			`"fmt"`
			`"os"`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`"path/filepath"`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00
			`"github.com/restic/restic/internal/restic"`
			`"github.com/spf13/cobra"`
			`)`

			`var cmdStats = &cobra.Command{`
			`Use: "stats",`
			`Short: "Scan the repository and show basic statistics",`
			Long: `
Implement four counting modes 2018-04-22 21:27:33 +00:00			`The "stats" command walks one or all snapshots in a repository and`
			`accumulates statistics about the data stored therein. It reports on`
			`the number of unique files and their sizes, according to one of`
			`the counting modes as given by a flag.`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`,
			`DisableAutoGenTag: true,`
			`RunE: func(cmd *cobra.Command, args []string) error {`
			`return runStats(globalOptions, args)`
			`},`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`var countModeFlag []string`

stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`func init() {`
			`cmdRoot.AddCommand(cmdStats)`
Implement four counting modes 2018-04-22 21:27:33 +00:00
			`f := cmdStats.Flags()`
			`f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)")`
			`f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents")`
			`f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename")`
			`f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them")`
			`f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`}`

			`func runStats(gopts GlobalOptions, args []string) error {`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`err := verifyStatsInput(gopts, args)`
			`if err != nil {`
			`return err`
			`}`

stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`ctx, cancel := context.WithCancel(gopts.ctx)`
			`defer cancel()`

			`repo, err := OpenRepository(gopts)`
			`if err != nil {`
			`return err`
			`}`

			`if err = repo.LoadIndex(ctx); err != nil {`
			`return err`
			`}`

			`if !gopts.NoLock {`
			`lock, err := lockRepo(repo)`
			`defer unlockRepo(lock)`
			`if err != nil {`
			`return err`
			`}`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`// create a container for the stats (and other needed state)`
			`stats := &statsContainer{`
			`uniqueFiles: make(map[fileID]struct{}),`
			`idSet: make(restic.IDSet),`
			`fileBlobs: make(map[string]restic.IDSet),`
			`blobs: restic.NewBlobSet(),`
			`blobsSeen: restic.NewBlobSet(),`
			`}`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00
Implement four counting modes 2018-04-22 21:27:33 +00:00			`if snapshotIDString != "" {`
			`// scan just a single snapshot`

			`var sID restic.ID`
			`if snapshotIDString == "latest" {`
			`sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)`
			`if err != nil {`
			`Exitf(1, "latest snapshot for criteria not found: %v", err)`
			`}`
			`} else {`
			`sID, err = restic.FindSnapshot(repo, snapshotIDString)`
			`if err != nil {`
			`return err`
			`}`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`snapshot, err := restic.LoadSnapshot(ctx, repo, sID)`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`if err != nil {`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`return err`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`err = statsWalkSnapshot(ctx, snapshot, repo, stats)`
			`} else {`
			`// iterate every snapshot in the repo`
			`err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {`
			`snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)`
			`if err != nil {`
			`return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)`
			`}`
			`return statsWalkSnapshot(ctx, snapshot, repo, stats)`
			`})`
			`}`
			`if err != nil {`
			`return err`
			`}`

			`if countModeRawData {`
			`// the blob handles have been collected, but not yet counted`
			`for blobHandle := range stats.blobs {`
			`blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)`
			`if !found {`
			`return fmt.Errorf("blob %v not found", blobHandle)`
			`}`
			`stats.TotalSize += uint64(blobSize)`
			`stats.TotalBlobCount++`
			`}`
			`}`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00
			`if gopts.JSON {`
			`err = json.NewEncoder(os.Stdout).Encode(stats)`
			`if err != nil {`
			`return fmt.Errorf("encoding output: %v", err)`
			`}`
			`return nil`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`if stats.TotalBlobCount > 0 {`
			`Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)`
			`}`
			`if stats.TotalFileCount > 0 {`
			`Printf(" Total File Count: %d\n", stats.TotalFileCount)`
			`}`
			`Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))`

			`return nil`
			`}`

			`func statsWalkSnapshot(ctx context.Context, snapshot restic.Snapshot, repo restic.Repository, stats statsContainer) error {`
			`if snapshot.Tree == nil {`
			`return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())`
			`}`

			`if countModeRawData {`
			`// count just the sizes of unique blobs; we don't need to walk the tree`
			`// ourselves in this case, since a nifty function does it for us`
			`return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)`
			`}`

			`err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator))`
			`if err != nil {`
			`return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)`
			`}`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`return nil`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error {`
			`// don't visit a tree we've already walked`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`if stats.idSet.Has(treeID) {`
			`return nil`
			`}`
			`stats.idSet.Insert(treeID)`

			`tree, err := repo.LoadTree(ctx, treeID)`
			`if err != nil {`
			`return fmt.Errorf("loading tree: %v", err)`
			`}`

			`for _, node := range tree.Nodes {`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`if countModeUniqueFilesByContent \|\| countModeBlobsPerFile {`
			`// only count this file if we haven't visited it before`
			`fid := makeFileIDByContents(node)`
			`if _, ok := stats.uniqueFiles[fid]; !ok {`
			`// mark the file as visited`
			`stats.uniqueFiles[fid] = struct{}{}`

			`if countModeUniqueFilesByContent {`
			`// simply count the size of each unique file (unique by contents only)`
			`stats.TotalSize += node.Size`
			`stats.TotalFileCount++`
			`}`
			`if countModeBlobsPerFile {`
			`// count the size of each unique blob reference, which is`
			`// by unique file (unique by contents and file path)`
			`for _, blobID := range node.Content {`
			`// ensure we have this file (by path) in our map; in this`
			`// mode, a file is unique by both contents and path`
Fix filepath uniqueness bug for blobs-per-file mode 2018-04-23 05:34:28 +00:00			`nodePath := filepath.Join(fpath, node.Name)`
			`if _, ok := stats.fileBlobs[nodePath]; !ok {`
			`stats.fileBlobs[nodePath] = restic.NewIDSet()`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`stats.TotalFileCount++`
			`}`
Fix filepath uniqueness bug for blobs-per-file mode 2018-04-23 05:34:28 +00:00			`if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {`
Update comment now that question was answered 2018-04-25 15:39:16 +00:00			`// is always a data blob since we're accessing it via a file's Content array`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)`
			`if !found {`
			`return fmt.Errorf("blob %s not found for tree %s", blobID, treeID)`
			`}`

			`// count the blob's size, then add this blob by this`
			`// file (path) so we don't double-count it`
			`stats.TotalSize += uint64(blobSize)`
Fix filepath uniqueness bug for blobs-per-file mode 2018-04-23 05:34:28 +00:00			`stats.fileBlobs[nodePath].Insert(blobID)`
Implement four counting modes 2018-04-22 21:27:33 +00:00
			`// this mode also counts total unique blob _references_ per file`
			`stats.TotalBlobCount++`
			`}`
			`}`
			`}`
			`}`
			`}`

			`if countModeRestoreSize {`
			`// as this is a file in the snapshot, we can simply count its`
			`// size without worrying about uniqueness, since duplicate files`
			`// will still be restored`
			`stats.TotalSize += node.Size`
			`stats.TotalFileCount++`
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00			`}`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00			`// visit subtrees (i.e. directory contents)`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`if node.Subtree != nil {`
Implement four counting modes 2018-04-22 21:27:33 +00:00			`err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name))`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`if err != nil {`
			`return err`
			`}`
			`}`
			`}`

			`return nil`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`// makeFileIDByContents returns a hash of the blob IDs of the`
			`// node's Content in sequence.`
			`func makeFileIDByContents(node *restic.Node) fileID {`
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00			`var bb []byte`
			`for _, c := range node.Content {`
			`bb = append(bb, []byte(c[:])...)`
			`}`
			`return sha256.Sum256(bb)`
			`}`

Implement four counting modes 2018-04-22 21:27:33 +00:00			`func verifyStatsInput(gopts GlobalOptions, args []string) error {`
			`// ensure only one counting mode was specified, for clarity`
			`var countModes int`
			`if countModeRestoreSize {`
			`countModes++`
			`}`
			`if countModeUniqueFilesByContent {`
			`countModes++`
			`}`
			`if countModeBlobsPerFile {`
			`countModes++`
			`}`
			`if countModeRawData {`
			`countModes++`
			`}`
			`if countModes > 1 {`
			`return fmt.Errorf("only one counting mode may be used")`
			`}`
			`// set a default count mode if none were specified`
			`if countModes == 0 {`
			`countModeRestoreSize = true`
			`}`
			`// ensure one or none snapshots were specified`
			`if len(args) > 1 {`
			`return fmt.Errorf("only one snapshot may be specified")`
			`}`
			`// set the snapshot to scan, if one was specified`
			`if len(args) == 1 {`
			`snapshotIDString = args[0]`
			`}`
			`return nil`
			`}`

stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`// statsContainer holds information during a walk of a repository`
			`// to collect information about it, as well as state needed`
			`// for a successful and efficient walk.`
			`type statsContainer struct {`
Implement four counting modes 2018-04-22 21:27:33 +00:00			TotalSize uint64 `json:"total_size"`
			TotalFileCount uint64 `json:"total_file_count"`
			TotalBlobCount uint64 `json:"total_blob_count,omitempty"`

			`// idSet marks visited trees, to avoid repeated walks`
			`idSet restic.IDSet`

			`// uniqueFiles marks visited files according to their`
			`// contents (hashed sequence of content blob IDs)`
			`uniqueFiles map[fileID]struct{}`

			`// fileBlobs maps a file name (path) to the set of`
			`// blobs that have been seen as a part of the file`
			`fileBlobs map[string]restic.IDSet`

			`// blobs and blobsSeen are used to count indiviudal`
			`// unique blobs, independent of references to files`
			`blobs, blobsSeen restic.BlobSet`
stats: Initial implementation of stats command 2018-04-20 14:44:14 +00:00			`}`
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00
Implement four counting modes 2018-04-22 21:27:33 +00:00			`// fileID is a 256-bit hash that distinguishes unique files.`
Count unique files by blob sequence rather than tree ID 2018-04-21 22:33:18 +00:00			`type fileID [32]byte`
Implement four counting modes 2018-04-22 21:27:33 +00:00
			`var (`
			`countModeRestoreSize bool`
			`countModeUniqueFilesByContent bool`
			`countModeBlobsPerFile bool`
			`countModeRawData bool`

			`// the snapshot to scan, as given by the user`
			`snapshotIDString string`

			`// snapshotByHost is the host to filter latest`
			`// snapshot by, if given by user`
			`snapshotByHost string`
			`)`