mirror of
https://github.com/octoleo/restic.git
synced 2024-12-26 04:17:29 +00:00
e44e4b00a6
Ls ncdu
493 lines
14 KiB
Go
493 lines
14 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/restic/chunker"
|
|
"github.com/restic/restic/internal/crypto"
|
|
"github.com/restic/restic/internal/repository"
|
|
"github.com/restic/restic/internal/restic"
|
|
"github.com/restic/restic/internal/restorer"
|
|
"github.com/restic/restic/internal/ui"
|
|
"github.com/restic/restic/internal/ui/table"
|
|
"github.com/restic/restic/internal/walker"
|
|
|
|
"github.com/minio/sha256-simd"
|
|
"github.com/spf13/cobra"
|
|
)
|
|
|
|
var cmdStats = &cobra.Command{
|
|
Use: "stats [flags] [snapshot ID] [...]",
|
|
Short: "Scan the repository and show basic statistics",
|
|
Long: `
|
|
The "stats" command walks one or multiple snapshots in a repository
|
|
and accumulates statistics about the data stored therein. It reports
|
|
on the number of unique files and their sizes, according to one of
|
|
the counting modes as given by the --mode flag.
|
|
|
|
It operates on all snapshots matching the selection criteria or all
|
|
snapshots if nothing is specified. The special snapshot ID "latest"
|
|
is also supported. Some modes make more sense over
|
|
just a single snapshot, while others are useful across all snapshots,
|
|
depending on what you are trying to calculate.
|
|
|
|
The modes are:
|
|
|
|
* restore-size: (default) Counts the size of the restored files.
|
|
* files-by-contents: Counts total size of files, where a file is
|
|
considered unique if it has unique contents.
|
|
* raw-data: Counts the size of blobs in the repository, regardless of
|
|
how many files reference them.
|
|
* blobs-per-file: A combination of files-by-contents and raw-data.
|
|
|
|
Refer to the online manual for more details about each mode.
|
|
|
|
EXIT STATUS
|
|
===========
|
|
|
|
Exit status is 0 if the command was successful, and non-zero if there was any error.
|
|
`,
|
|
DisableAutoGenTag: true,
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
return runStats(cmd.Context(), statsOptions, globalOptions, args)
|
|
},
|
|
}
|
|
|
|
// StatsOptions collects all options for the stats command.
type StatsOptions struct {
	// the mode of counting to perform (see consts for available modes);
	// bound to the --mode flag in init and validated by verifyStatsInput
	countMode string

	// embedded snapshot selection criteria; populated by the flags that
	// initMultiSnapshotFilter registers in init
	restic.SnapshotFilter
}

// statsOptions holds the flag values of the stats command; cmdStats passes
// it to runStats.
var statsOptions StatsOptions
|
|
|
|
func init() {
|
|
cmdRoot.AddCommand(cmdStats)
|
|
f := cmdStats.Flags()
|
|
f.StringVar(&statsOptions.countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file or raw-data")
|
|
initMultiSnapshotFilter(f, &statsOptions.SnapshotFilter, true)
|
|
}
|
|
|
|
func runStats(ctx context.Context, opts StatsOptions, gopts GlobalOptions, args []string) error {
|
|
err := verifyStatsInput(opts)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
repo, err := OpenRepository(ctx, gopts)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !gopts.NoLock {
|
|
var lock *restic.Lock
|
|
lock, ctx, err = lockRepo(ctx, repo, gopts.RetryLock, gopts.JSON)
|
|
defer unlockRepo(lock)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
snapshotLister, err := restic.MemorizeList(ctx, repo, restic.SnapshotFile)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
bar := newIndexProgress(gopts.Quiet, gopts.JSON)
|
|
if err = repo.LoadIndex(ctx, bar); err != nil {
|
|
return err
|
|
}
|
|
|
|
if opts.countMode == countModeDebug {
|
|
return statsDebug(ctx, repo)
|
|
}
|
|
|
|
if !gopts.JSON {
|
|
Printf("scanning...\n")
|
|
}
|
|
|
|
// create a container for the stats (and other needed state)
|
|
stats := &statsContainer{
|
|
uniqueFiles: make(map[fileID]struct{}),
|
|
fileBlobs: make(map[string]restic.IDSet),
|
|
blobs: restic.NewBlobSet(),
|
|
SnapshotsCount: 0,
|
|
}
|
|
|
|
for sn := range FindFilteredSnapshots(ctx, snapshotLister, repo, &opts.SnapshotFilter, args) {
|
|
err = statsWalkSnapshot(ctx, sn, repo, opts, stats)
|
|
if err != nil {
|
|
return fmt.Errorf("error walking snapshot: %v", err)
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if opts.countMode == countModeRawData {
|
|
// the blob handles have been collected, but not yet counted
|
|
for blobHandle := range stats.blobs {
|
|
pbs := repo.Index().Lookup(blobHandle)
|
|
if len(pbs) == 0 {
|
|
return fmt.Errorf("blob %v not found", blobHandle)
|
|
}
|
|
stats.TotalSize += uint64(pbs[0].Length)
|
|
if repo.Config().Version >= 2 {
|
|
stats.TotalUncompressedSize += uint64(crypto.CiphertextLength(int(pbs[0].DataLength())))
|
|
if pbs[0].IsCompressed() {
|
|
stats.TotalCompressedBlobsSize += uint64(pbs[0].Length)
|
|
stats.TotalCompressedBlobsUncompressedSize += uint64(crypto.CiphertextLength(int(pbs[0].DataLength())))
|
|
}
|
|
}
|
|
stats.TotalBlobCount++
|
|
}
|
|
if stats.TotalCompressedBlobsSize > 0 {
|
|
stats.CompressionRatio = float64(stats.TotalCompressedBlobsUncompressedSize) / float64(stats.TotalCompressedBlobsSize)
|
|
}
|
|
if stats.TotalUncompressedSize > 0 {
|
|
stats.CompressionProgress = float64(stats.TotalCompressedBlobsUncompressedSize) / float64(stats.TotalUncompressedSize) * 100
|
|
stats.CompressionSpaceSaving = (1 - float64(stats.TotalSize)/float64(stats.TotalUncompressedSize)) * 100
|
|
}
|
|
}
|
|
|
|
if gopts.JSON {
|
|
err = json.NewEncoder(globalOptions.stdout).Encode(stats)
|
|
if err != nil {
|
|
return fmt.Errorf("encoding output: %v", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
Printf("Stats in %s mode:\n", opts.countMode)
|
|
Printf(" Snapshots processed: %d\n", stats.SnapshotsCount)
|
|
if stats.TotalBlobCount > 0 {
|
|
Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
|
|
}
|
|
if stats.TotalFileCount > 0 {
|
|
Printf(" Total File Count: %d\n", stats.TotalFileCount)
|
|
}
|
|
if stats.TotalUncompressedSize > 0 {
|
|
Printf(" Total Uncompressed Size: %-5s\n", ui.FormatBytes(stats.TotalUncompressedSize))
|
|
}
|
|
Printf(" Total Size: %-5s\n", ui.FormatBytes(stats.TotalSize))
|
|
if stats.CompressionProgress > 0 {
|
|
Printf(" Compression Progress: %.2f%%\n", stats.CompressionProgress)
|
|
}
|
|
if stats.CompressionRatio > 0 {
|
|
Printf(" Compression Ratio: %.2fx\n", stats.CompressionRatio)
|
|
}
|
|
if stats.CompressionSpaceSaving > 0 {
|
|
Printf("Compression Space Saving: %.2f%%\n", stats.CompressionSpaceSaving)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Loader, opts StatsOptions, stats *statsContainer) error {
|
|
if snapshot.Tree == nil {
|
|
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
|
|
}
|
|
|
|
stats.SnapshotsCount++
|
|
|
|
if opts.countMode == countModeRawData {
|
|
// count just the sizes of unique blobs; we don't need to walk the tree
|
|
// ourselves in this case, since a nifty function does it for us
|
|
return restic.FindUsedBlobs(ctx, repo, restic.IDs{*snapshot.Tree}, stats.blobs, nil)
|
|
}
|
|
|
|
hardLinkIndex := restorer.NewHardlinkIndex[struct{}]()
|
|
err := walker.Walk(ctx, repo, *snapshot.Tree, walker.WalkVisitor{
|
|
ProcessNode: statsWalkTree(repo, opts, stats, hardLinkIndex),
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func statsWalkTree(repo restic.Loader, opts StatsOptions, stats *statsContainer, hardLinkIndex *restorer.HardlinkIndex[struct{}]) walker.WalkFunc {
|
|
return func(parentTreeID restic.ID, npath string, node *restic.Node, nodeErr error) error {
|
|
if nodeErr != nil {
|
|
return nodeErr
|
|
}
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
|
|
if opts.countMode == countModeUniqueFilesByContents || opts.countMode == countModeBlobsPerFile {
|
|
// only count this file if we haven't visited it before
|
|
fid := makeFileIDByContents(node)
|
|
if _, ok := stats.uniqueFiles[fid]; !ok {
|
|
// mark the file as visited
|
|
stats.uniqueFiles[fid] = struct{}{}
|
|
|
|
if opts.countMode == countModeUniqueFilesByContents {
|
|
// simply count the size of each unique file (unique by contents only)
|
|
stats.TotalSize += node.Size
|
|
stats.TotalFileCount++
|
|
}
|
|
if opts.countMode == countModeBlobsPerFile {
|
|
// count the size of each unique blob reference, which is
|
|
// by unique file (unique by contents and file path)
|
|
for _, blobID := range node.Content {
|
|
// ensure we have this file (by path) in our map; in this
|
|
// mode, a file is unique by both contents and path
|
|
nodePath := filepath.Join(npath, node.Name)
|
|
if _, ok := stats.fileBlobs[nodePath]; !ok {
|
|
stats.fileBlobs[nodePath] = restic.NewIDSet()
|
|
stats.TotalFileCount++
|
|
}
|
|
if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
|
|
// is always a data blob since we're accessing it via a file's Content array
|
|
blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
|
|
if !found {
|
|
return fmt.Errorf("blob %s not found for tree %s", blobID, parentTreeID)
|
|
}
|
|
|
|
// count the blob's size, then add this blob by this
|
|
// file (path) so we don't double-count it
|
|
stats.TotalSize += uint64(blobSize)
|
|
stats.fileBlobs[nodePath].Insert(blobID)
|
|
// this mode also counts total unique blob _references_ per file
|
|
stats.TotalBlobCount++
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if opts.countMode == countModeRestoreSize {
|
|
// as this is a file in the snapshot, we can simply count its
|
|
// size without worrying about uniqueness, since duplicate files
|
|
// will still be restored
|
|
stats.TotalFileCount++
|
|
|
|
// if inodes are present, only count each inode once
|
|
// (hard links do not increase restore size)
|
|
if !hardLinkIndex.Has(node.Inode, node.DeviceID) || node.Inode == 0 {
|
|
hardLinkIndex.Add(node.Inode, node.DeviceID, struct{}{})
|
|
stats.TotalSize += node.Size
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// makeFileIDByContents returns a hash of the blob IDs of the
|
|
// node's Content in sequence.
|
|
func makeFileIDByContents(node *restic.Node) fileID {
|
|
var bb []byte
|
|
for _, c := range node.Content {
|
|
bb = append(bb, []byte(c[:])...)
|
|
}
|
|
return sha256.Sum256(bb)
|
|
}
|
|
|
|
func verifyStatsInput(opts StatsOptions) error {
|
|
// require a recognized counting mode
|
|
switch opts.countMode {
|
|
case countModeRestoreSize:
|
|
case countModeUniqueFilesByContents:
|
|
case countModeBlobsPerFile:
|
|
case countModeRawData:
|
|
case countModeDebug:
|
|
default:
|
|
return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", opts.countMode)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// statsContainer holds information during a walk of a repository
// to collect information about it, as well as state needed
// for a successful and efficient walk.
//
// The exported fields form the result; runStats encodes the container as
// JSON when JSON output is requested. The unexported fields are bookkeeping
// used while walking and are not part of the output.
type statsContainer struct {
	// TotalSize is the accumulated size in bytes for the selected mode
	TotalSize uint64 `json:"total_size"`
	// TotalUncompressedSize is only filled in raw-data mode for repository
	// format version 2 and newer
	TotalUncompressedSize uint64 `json:"total_uncompressed_size,omitempty"`
	// stored size of the compressed blobs only (raw-data mode, not in JSON)
	TotalCompressedBlobsSize uint64 `json:"-"`
	// uncompressed size of the compressed blobs only (raw-data mode, not in JSON)
	TotalCompressedBlobsUncompressedSize uint64 `json:"-"`
	// CompressionRatio is TotalCompressedBlobsUncompressedSize divided by
	// TotalCompressedBlobsSize
	CompressionRatio float64 `json:"compression_ratio,omitempty"`
	// CompressionProgress is the percentage of TotalUncompressedSize that
	// belongs to compressed blobs
	CompressionProgress float64 `json:"compression_progress,omitempty"`
	// CompressionSpaceSaving is the percentage by which TotalSize is smaller
	// than TotalUncompressedSize
	CompressionSpaceSaving float64 `json:"compression_space_saving,omitempty"`
	// TotalFileCount is the number of files counted in the selected mode
	TotalFileCount uint64 `json:"total_file_count,omitempty"`
	// TotalBlobCount is the number of blobs (raw-data) or blob references
	// per file (blobs-per-file) counted
	TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
	// holds count of all considered snapshots
	SnapshotsCount int `json:"snapshots_count"`

	// uniqueFiles marks visited files according to their
	// contents (hashed sequence of content blob IDs)
	uniqueFiles map[fileID]struct{}

	// fileBlobs maps a file name (path) to the set of
	// blobs that have been seen as a part of the file
	fileBlobs map[string]restic.IDSet

	// blobs is used to count individual unique blobs,
	// independent of references to files
	blobs restic.BlobSet
}
|
|
|
|
// fileID is a 256-bit hash that distinguishes unique files.
// makeFileIDByContents produces it as the SHA-256 sum over a node's content
// blob IDs; it serves as the key of statsContainer.uniqueFiles.
type fileID [32]byte
|
|
|
|
// Counting modes accepted by the --mode flag of the stats command.
const (
	// counts the size of the restored files (the flag's default)
	countModeRestoreSize = "restore-size"
	// counts the total size of files, where a file is unique by its contents
	countModeUniqueFilesByContents = "files-by-contents"
	// counts unique blob references per file (unique by contents and path)
	countModeBlobsPerFile = "blobs-per-file"
	// counts the size of blobs in the repository, regardless of how many
	// files reference them
	countModeRawData = "raw-data"
	// prints size histograms via statsDebug; accepted by verifyStatsInput
	// but not listed in the help text of the --mode flag
	countModeDebug = "debug"
)
|
|
|
|
func statsDebug(ctx context.Context, repo restic.Repository) error {
|
|
Warnf("Collecting size statistics\n\n")
|
|
for _, t := range []restic.FileType{restic.KeyFile, restic.LockFile, restic.IndexFile, restic.PackFile} {
|
|
hist, err := statsDebugFileType(ctx, repo, t)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
Warnf("File Type: %v\n%v\n", t, hist)
|
|
}
|
|
|
|
hist := statsDebugBlobs(ctx, repo)
|
|
for _, t := range []restic.BlobType{restic.DataBlob, restic.TreeBlob} {
|
|
Warnf("Blob Type: %v\n%v\n\n", t, hist[t])
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func statsDebugFileType(ctx context.Context, repo restic.Lister, tpe restic.FileType) (*sizeHistogram, error) {
|
|
hist := newSizeHistogram(2 * repository.MaxPackSize)
|
|
err := repo.List(ctx, tpe, func(id restic.ID, size int64) error {
|
|
hist.Add(uint64(size))
|
|
return nil
|
|
})
|
|
|
|
return hist, err
|
|
}
|
|
|
|
func statsDebugBlobs(ctx context.Context, repo restic.Repository) [restic.NumBlobTypes]*sizeHistogram {
|
|
var hist [restic.NumBlobTypes]*sizeHistogram
|
|
for i := 0; i < len(hist); i++ {
|
|
hist[i] = newSizeHistogram(2 * chunker.MaxSize)
|
|
}
|
|
|
|
repo.Index().Each(ctx, func(pb restic.PackedBlob) {
|
|
hist[pb.Type].Add(uint64(pb.Length))
|
|
})
|
|
|
|
return hist
|
|
}
|
|
|
|
// sizeClass is a single histogram bucket. It counts the values that fall
// into the inclusive range [lower, upper].
type sizeClass struct {
	lower, upper uint64
	count        int64
}

// sizeHistogram counts sizes in buckets that each span one decimal order of
// magnitude. Values that fit no bucket are kept individually in oversized.
type sizeHistogram struct {
	// number of values added so far
	count int64
	// sum of all values added so far
	totalSize uint64
	// buckets in ascending order; the first one only holds the value 0
	buckets []sizeClass
	// values larger than the upper bound of the last bucket
	oversized []uint64
}

// newSizeHistogram creates an empty histogram for sizes up to and including
// sizeLimit.
//
// The buckets are [0,0], [1,9], [10,99], ... with the upper bound of the
// last bucket clamped to sizeLimit. sizeLimit itself always belongs to the
// last bucket, also when it is an exact power of ten. A sizeLimit of 0
// yields only the [0,0] bucket.
func newSizeHistogram(sizeLimit uint64) *sizeHistogram {
	const growthFactor = uint64(10)

	h := &sizeHistogram{
		buckets: []sizeClass{{lower: 0, upper: 0}},
	}

	for lowerBound := uint64(1); lowerBound <= sizeLimit; lowerBound *= growthFactor {
		// The next lower bound would exceed the limit (or overflow uint64):
		// this is the last bucket, clamp it to sizeLimit and stop.
		if lowerBound > sizeLimit/growthFactor {
			h.buckets = append(h.buckets, sizeClass{lower: lowerBound, upper: sizeLimit})
			break
		}
		// lowerBound*growthFactor <= sizeLimit here, so it cannot overflow
		h.buckets = append(h.buckets, sizeClass{lower: lowerBound, upper: lowerBound*growthFactor - 1})
	}

	return h
}

// Add records size in the histogram: it updates the total count and size
// and increments the bucket whose range contains size. A size that fits no
// bucket is appended to the list of oversized values instead.
func (s *sizeHistogram) Add(size uint64) {
	s.count++
	s.totalSize += size

	for i := range s.buckets {
		bucket := &s.buckets[i]
		if size >= bucket.lower && size <= bucket.upper {
			bucket.count++
			return
		}
	}

	s.oversized = append(s.oversized, size)
}
|
|
|
|
func (s sizeHistogram) String() string {
|
|
var out strings.Builder
|
|
|
|
out.WriteString(fmt.Sprintf("Count: %d\n", s.count))
|
|
out.WriteString(fmt.Sprintf("Total Size: %s\n", ui.FormatBytes(s.totalSize)))
|
|
|
|
t := table.New()
|
|
t.AddColumn("Size", "{{.SizeRange}}")
|
|
t.AddColumn("Count", "{{.Count}}")
|
|
type line struct {
|
|
SizeRange string
|
|
Count int64
|
|
}
|
|
|
|
// only print up to the highest used bucket size
|
|
lastFilledIdx := 0
|
|
for i := 0; i < len(s.buckets); i++ {
|
|
if s.buckets[i].count != 0 {
|
|
lastFilledIdx = i
|
|
}
|
|
}
|
|
|
|
var lines []line
|
|
hasStarted := false
|
|
for i, b := range s.buckets {
|
|
if i > lastFilledIdx {
|
|
break
|
|
}
|
|
|
|
if b.count > 0 {
|
|
hasStarted = true
|
|
}
|
|
if hasStarted {
|
|
lines = append(lines, line{
|
|
SizeRange: fmt.Sprintf("%d - %d Byte", b.lower, b.upper),
|
|
Count: b.count,
|
|
})
|
|
}
|
|
}
|
|
longestRange := 0
|
|
for _, l := range lines {
|
|
if longestRange < len(l.SizeRange) {
|
|
longestRange = len(l.SizeRange)
|
|
}
|
|
}
|
|
for i := range lines {
|
|
lines[i].SizeRange = strings.Repeat(" ", longestRange-len(lines[i].SizeRange)) + lines[i].SizeRange
|
|
t.AddRow(lines[i])
|
|
}
|
|
|
|
_ = t.Write(&out)
|
|
|
|
if len(s.oversized) > 0 {
|
|
out.WriteString(fmt.Sprintf("Oversized: %v\n", s.oversized))
|
|
}
|
|
return out.String()
|
|
}
|