Merge pull request #2859 from buschjost/stats-filter-by-tag-and-path

Add filter by tag and path to stats command
This commit is contained in:
MichaelEischer 2020-08-31 22:11:01 +02:00 committed by GitHub
commit 55071ee367
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 67 additions and 88 deletions

View File

@ -0,0 +1,9 @@
Enhancement: Support filtering snapshots by tag and path in the stats command
We've added filtering snapshots by `--tag tagList` and by `--path path` to
the `stats` command. This includes filtering of only 'latest' snapshots or
all snapshots in a repository.
https://github.com/restic/restic/issues/2858
https://github.com/restic/restic/pull/2859
https://forum.restic.net/t/stats-for-a-host-and-filtered-snapshots/3020

View File

@ -6,7 +6,6 @@ import (
"fmt"
"path/filepath"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/walker"
@ -15,18 +14,19 @@ import (
)
var cmdStats = &cobra.Command{
Use: "stats [flags] [snapshot-ID]",
Use: "stats [flags] [snapshot ID] [...]",
Short: "Scan the repository and show basic statistics",
Long: `
The "stats" command walks one or all snapshots in a repository and
accumulates statistics about the data stored therein. It reports on
the number of unique files and their sizes, according to one of
The "stats" command walks one or multiple snapshots in a repository
and accumulates statistics about the data stored therein. It reports
on the number of unique files and their sizes, according to one of
the counting modes as given by the --mode flag.
If no snapshot is specified, all snapshots will be considered. Some
modes make more sense over just a single snapshot, while others
are useful across all snapshots, depending on what you are trying
to calculate.
It operates on all snapshots matching the selection criteria or all
snapshots if nothing is specified. The special snapshot ID "latest"
is also supported. Some modes make more sense over
just a single snapshot, while others are useful across all snapshots,
depending on what you are trying to calculate.
The modes are:
@ -50,11 +50,26 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
},
}
// StatsOptions collects all options for the stats command.
type StatsOptions struct {
// the mode of counting to perform (see consts for available modes)
countMode string
// filter snapshots by, if given by user
Hosts []string
Tags restic.TagLists
Paths []string
}
var statsOptions StatsOptions
func init() {
cmdRoot.AddCommand(cmdStats)
f := cmdStats.Flags()
f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data")
f.StringArrayVarP(&snapshotByHosts, "host", "H", nil, "filter latest snapshot by this hostname (can be specified multiple times)")
f.StringVar(&statsOptions.countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file or raw-data")
f.StringArrayVarP(&statsOptions.Hosts, "host", "H", nil, "only consider snapshots with the given `host` (can be specified multiple times)")
f.Var(&statsOptions.Tags, "tag", "only consider snapshots which include this `taglist` in the format `tag[,tag,...]` (can be specified multiple times)")
f.StringArrayVar(&statsOptions.Paths, "path", nil, "only consider snapshots which include this (absolute) `path` (can be specified multiple times)")
}
func runStats(gopts GlobalOptions, args []string) error {
@ -89,52 +104,25 @@ func runStats(gopts GlobalOptions, args []string) error {
// create a container for the stats (and other needed state)
stats := &statsContainer{
uniqueFiles: make(map[fileID]struct{}),
uniqueInodes: make(map[uint64]struct{}),
fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(),
uniqueFiles: make(map[fileID]struct{}),
uniqueInodes: make(map[uint64]struct{}),
fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(),
snapshotsCount: 0,
}
if snapshotIDString != "" {
// scan just a single snapshot
var sID restic.ID
if snapshotIDString == "latest" {
sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHosts)
if err != nil {
return errors.Fatalf("latest snapshot for criteria not found: %v", err)
}
} else {
sID, err = restic.FindSnapshot(repo, snapshotIDString)
if err != nil {
return errors.Fatalf("error loading snapshot: %v", err)
}
}
snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
if err != nil {
return errors.Fatalf("error loading snapshot from repo: %v", err)
}
err = statsWalkSnapshot(ctx, snapshot, repo, stats)
for sn := range FindFilteredSnapshots(ctx, repo, statsOptions.Hosts, statsOptions.Tags, statsOptions.Paths, args) {
err = statsWalkSnapshot(ctx, sn, repo, stats)
if err != nil {
return fmt.Errorf("error walking snapshot: %v", err)
}
} else {
// iterate every snapshot in the repo
err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
if err != nil {
return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
}
return statsWalkSnapshot(ctx, snapshot, repo, stats)
})
}
if err != nil {
return err
}
if countMode == countModeRawData {
if statsOptions.countMode == countModeRawData {
// the blob handles have been collected, but not yet counted
for blobHandle := range stats.blobs {
blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
@ -154,22 +142,16 @@ func runStats(gopts GlobalOptions, args []string) error {
return nil
}
// inform the user what was scanned and how it was scanned
snapshotsScanned := snapshotIDString
if snapshotsScanned == "latest" {
snapshotsScanned = "the latest snapshot"
} else if snapshotsScanned == "" {
snapshotsScanned = "all snapshots"
}
Printf("Stats for %s in %s mode:\n", snapshotsScanned, countMode)
Printf("Stats in %s mode:\n", statsOptions.countMode)
Printf("Snapshots processed: %d\n", stats.snapshotsCount)
if stats.TotalBlobCount > 0 {
Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
}
if stats.TotalFileCount > 0 {
Printf(" Total File Count: %d\n", stats.TotalFileCount)
Printf(" Total File Count: %d\n", stats.TotalFileCount)
}
Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
return nil
}
@ -179,7 +161,9 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
}
if countMode == countModeRawData {
stats.snapshotsCount++
if statsOptions.countMode == countModeRawData {
// count just the sizes of unique blobs; we don't need to walk the tree
// ourselves in this case, since a nifty function does it for us
return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs)
@ -189,6 +173,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
if err != nil {
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
}
return nil
}
@ -201,19 +186,19 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun
return true, nil
}
if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile {
if statsOptions.countMode == countModeUniqueFilesByContents || statsOptions.countMode == countModeBlobsPerFile {
// only count this file if we haven't visited it before
fid := makeFileIDByContents(node)
if _, ok := stats.uniqueFiles[fid]; !ok {
// mark the file as visited
stats.uniqueFiles[fid] = struct{}{}
if countMode == countModeUniqueFilesByContents {
if statsOptions.countMode == countModeUniqueFilesByContents {
// simply count the size of each unique file (unique by contents only)
stats.TotalSize += node.Size
stats.TotalFileCount++
}
if countMode == countModeBlobsPerFile {
if statsOptions.countMode == countModeBlobsPerFile {
// count the size of each unique blob reference, which is
// by unique file (unique by contents and file path)
for _, blobID := range node.Content {
@ -243,7 +228,7 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun
}
}
if countMode == countModeRestoreSize {
if statsOptions.countMode == countModeRestoreSize {
// as this is a file in the snapshot, we can simply count its
// size without worrying about uniqueness, since duplicate files
// will still be restored
@ -275,23 +260,13 @@ func makeFileIDByContents(node *restic.Node) fileID {
func verifyStatsInput(gopts GlobalOptions, args []string) error {
// require a recognized counting mode
switch countMode {
switch statsOptions.countMode {
case countModeRestoreSize:
case countModeUniqueFilesByContents:
case countModeBlobsPerFile:
case countModeRawData:
default:
return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
}
// ensure at most one snapshot was specified
if len(args) > 1 {
return fmt.Errorf("only one snapshot may be specified")
}
// if a snapshot was specified, mark it as the one to scan
if len(args) == 1 {
snapshotIDString = args[0]
return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", statsOptions.countMode)
}
return nil
@ -320,23 +295,14 @@ type statsContainer struct {
// blobs is used to count individual unique blobs,
// independent of references to files
blobs restic.BlobSet
// holds count of all considered snapshots
snapshotsCount int
}
// fileID is a 256-bit hash that distinguishes unique files.
type fileID [32]byte
var (
// the mode of counting to perform
countMode string
// the snapshot to scan, as given by the user
snapshotIDString string
// snapshotByHost is the host to filter latest
// snapshot by, if given by user
snapshotByHosts []string
)
const (
countModeRestoreSize = "restore-size"
countModeUniqueFilesByContents = "files-by-contents"

View File

@ -22,10 +22,10 @@ func FindFilteredSnapshots(ctx context.Context, repo *repository.Repository, hos
// Process all snapshot IDs given as arguments.
for _, s := range snapshotIDs {
if s == "latest" {
usedFilter = true
id, err = restic.FindLatestSnapshot(ctx, repo, paths, tags, hosts)
if err != nil {
Warnf("Ignoring %q, no snapshot matched given filter (Paths:%v Tags:%v Hosts:%v)\n", s, paths, tags, hosts)
usedFilter = true
continue
}
} else {

View File

@ -306,6 +306,10 @@ host by using the ``--host`` flag:
There we see that it would take 482 GiB of disk space to restore the latest
snapshot from "myserver".
In case you have multiple backups running from the same host so can also use
``--tag`` and ``--path`` to be more specific about which snapshots you
are looking for.
But how much space does that snapshot take on disk? In other words, how much
has restic's deduplication helped? We can check: