From 581d90cf910077a0e24ab75d71524106dd347cc0 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 1 May 2020 22:56:34 +0200 Subject: [PATCH 01/13] Make some pack parameters public --- internal/pack/pack.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/internal/pack/pack.go b/internal/pack/pack.go index fbe0522dc..d4f064476 100644 --- a/internal/pack/pack.go +++ b/internal/pack/pack.go @@ -161,13 +161,16 @@ func (p *Packer) String() string { } var ( - // size of the header-length field at the end of the file - headerLengthSize = binary.Size(uint32(0)) // we require at least one entry in the header, and one blob for a pack file minFileSize = entrySize + crypto.Extension + uint(headerLengthSize) ) const ( + // size of the header-length field at the end of the file; it is a uint32 + headerLengthSize = 4 + // constant overhead of the header independent of #entries + HeaderSize = headerLengthSize + crypto.Extension + maxHeaderSize = 16 * 1024 * 1024 // number of header enries to download as part of header-length request eagerEntries = 15 @@ -315,3 +318,8 @@ func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, err return entries, nil } + +// PackedSizeOfBlob returns the size a blob actually uses when saved in a pack +func PackedSizeOfBlob(blobLength uint) uint { + return blobLength + entrySize +} From ce7d6137496d5971462b12865a475e751fe0e4dd Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 17 Jul 2020 22:13:23 +0200 Subject: [PATCH 02/13] Add Blob.Handle() --- internal/restic/blob.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/restic/blob.go b/internal/restic/blob.go index a3a6c8630..b6c5a47cf 100644 --- a/internal/restic/blob.go +++ b/internal/restic/blob.go @@ -19,6 +19,10 @@ func (b Blob) String() string { b.Type, b.ID.Str(), b.Offset, b.Length) } +func (b Blob) Handle() BlobHandle { + return BlobHandle{ID: b.ID, Type: b.Type} +} + // PackedBlob is a blob stored within a file. 
type PackedBlob struct { Blob From 3b591ed9878a643b9ac7c84d2f0b37f44b036515 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Thu, 8 Oct 2020 23:01:24 +0200 Subject: [PATCH 03/13] Add Verboseff --- cmd/restic/global.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmd/restic/global.go b/cmd/restic/global.go index 33d222eea..0c95f9d52 100644 --- a/cmd/restic/global.go +++ b/cmd/restic/global.go @@ -231,6 +231,13 @@ func Verbosef(format string, args ...interface{}) { } } +// Verboseff calls Printf to write the message when the verbosity is >= 2 +func Verboseff(format string, args ...interface{}) { + if globalOptions.verbosity >= 2 { + Printf(format, args...) + } +} + // PrintProgress wraps fmt.Printf to handle the difference in writing progress // information to terminals and non-terminal stdout func PrintProgress(format string, args ...interface{}) { From 7f9a0a5907b44c543cd08c0e169103975abac335 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sun, 19 Jul 2020 07:55:14 +0200 Subject: [PATCH 04/13] Reimplementation of prune --- changelog/unreleased/pull-2718 | 22 ++ cmd/restic/cmd_forget.go | 13 +- cmd/restic/cmd_prune.go | 524 +++++++++++++++++++++++---------- cmd/restic/integration_test.go | 79 +++-- doc/060_forget.rst | 126 ++++++-- 5 files changed, 553 insertions(+), 211 deletions(-) create mode 100644 changelog/unreleased/pull-2718 diff --git a/changelog/unreleased/pull-2718 b/changelog/unreleased/pull-2718 new file mode 100644 index 000000000..b2c32cc45 --- /dev/null +++ b/changelog/unreleased/pull-2718 @@ -0,0 +1,22 @@ +Enhancement: Improve pruning performance and make pruning more customizable + +The prune command is now much faster. This is especially the case for remote +repositories or repositories with little data to prune. +The memory usage of the prune command is also reduced. + +By default, the prune command no longer removes all unused blobs.
This +behavior can be fine-tuned by new options, like tolerated unused space or +maximum size of packs to repack. For more details, see +https://restic.readthedocs.io/en/stable/060_forget.html + +Moreover, prune now accepts the dry-run option and forget --dry-run --prune +also shows what prune would do. + +Fixes several open issues, e.g.: +https://github.com/restic/restic/issues/1140 +https://github.com/restic/restic/issues/1985 +https://github.com/restic/restic/issues/2112 +https://github.com/restic/restic/issues/2227 +https://github.com/restic/restic/issues/2305 + +https://github.com/restic/restic/pull/2718 diff --git a/cmd/restic/cmd_forget.go b/cmd/restic/cmd_forget.go index 3edaa76e9..fa9739c0b 100644 --- a/cmd/restic/cmd_forget.go +++ b/cmd/restic/cmd_forget.go @@ -80,9 +80,15 @@ func init() { f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed") f.SortFlags = false + addPruneOptions(cmdForget) } func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error { + err := verifyPruneOptions(&pruneOptions) + if err != nil { + return err + } + repo, err := OpenRepository(gopts) if err != nil { return err @@ -205,7 +211,12 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error { } if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun { - return pruneRepository(gopts, repo) + if !gopts.JSON { + Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs)) + } + + pruneOptions.DryRun = opts.DryRun + return runPruneWithRepo(pruneOptions, gopts, repo) } return nil diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 1bb1a51e5..9a93c600c 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -1,15 +1,22 @@ package main import ( + "sort" + "strconv" + "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" - "github.com/restic/restic/internal/index" + "github.com/restic/restic/internal/pack" 
"github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" "github.com/spf13/cobra" ) +var errorIndexIncomplete = errors.Fatal("index is not complete") +var errorPacksMissing = errors.Fatal("packs from index missing in repo") +var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index") + var cmdPrune = &cobra.Command{ Use: "prune [flags]", Short: "Remove unneeded data from the repository", @@ -24,12 +31,72 @@ Exit status is 0 if the command was successful, and non-zero if there was any er `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { - return runPrune(globalOptions) + return runPrune(pruneOptions, globalOptions) }, } +// PruneOptions collects all options for the cleanup command. +type PruneOptions struct { + DryRun bool + MaxUnused string + MaxUnusedPercent float64 + MaxUnusedBytes uint64 + MaxRepackSize string + MaxRepackBytes uint64 + // RepackSmall bool <- This option may be added later + RepackCachableOnly bool +} + +var pruneOptions PruneOptions + func init() { cmdRoot.AddCommand(cmdPrune) + f := cmdPrune.Flags() + f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done") + addPruneOptions(cmdPrune) +} + +func addPruneOptions(c *cobra.Command) { + f := c.Flags() + f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)") + f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") + f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") + // f.BoolVar(&pruneOptions.RepackSmall, "repack-small", false, "also repack small packs") +} + +func verifyPruneOptions(opts *PruneOptions) error { + if len(opts.MaxRepackSize) > 0 { + size, err := parseSizeStr(opts.MaxRepackSize) + if err != 
nil { + return err + } + opts.MaxRepackBytes = uint64(size) + } + + length := len(opts.MaxUnused) + if length == 0 { + return nil + } + + var err error + if opts.MaxUnused[length-1] == '%' { + opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64) + opts.MaxUnusedBytes = ^uint64(0) + } else { + var size int64 + size, err = parseSizeStr(opts.MaxUnused) + opts.MaxUnusedPercent = 100.0 + opts.MaxUnusedBytes = uint64(size) + } + if err != nil { + return err + } + + if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 { + return errors.Fatalf("--max-unused-percent should be between 0 and 100. Given value: %f", opts.MaxUnusedPercent) + } + + return nil } func shortenStatus(maxLength int, s string) string { @@ -44,7 +111,12 @@ func shortenStatus(maxLength int, s string) string { return s[:maxLength-3] + "..." } -func runPrune(gopts GlobalOptions) error { +func runPrune(opts PruneOptions, gopts GlobalOptions) error { + err := verifyPruneOptions(&opts) + if err != nil { + return err + } + repo, err := OpenRepository(gopts) if err != nil { return err @@ -56,203 +128,345 @@ func runPrune(gopts GlobalOptions) error { return err } + return runPruneWithRepo(opts, gopts, repo) +} + +func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository) error { // we do not need index updates while pruning! 
repo.DisableAutoIndexUpdate() - return pruneRepository(gopts, repo) -} - -func mixedBlobs(list []restic.Blob) bool { - var tree, data bool - - for _, pb := range list { - switch pb.Type { - case restic.TreeBlob: - tree = true - case restic.DataBlob: - data = true - } - - if tree && data { - return true - } - } - - return false -} - -func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { - ctx := gopts.ctx - - err := repo.LoadIndex(ctx) + Verbosef("loading all snapshots...\n") + snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo) if err != nil { return err } - var stats struct { - blobs int - packs int - snapshots int - bytes int64 - } - - Verbosef("counting files in repo\n") - err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error { - stats.packs++ - return nil - }) + Verbosef("loading indexes...\n") + err = repo.LoadIndex(gopts.ctx) if err != nil { return err } - Verbosef("building new index for repo\n") - - bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs") - idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar) - if err != nil { - return err - } - - for _, id := range invalidFiles { - Warnf("incomplete pack file (will be removed): %v\n", id) - } - - blobs := 0 - for _, pack := range idx.Packs { - stats.bytes += pack.Size - blobs += len(pack.Entries) - } - Verbosef("repository contains %v packs (%v blobs) with %v\n", - len(idx.Packs), blobs, formatBytes(uint64(stats.bytes))) - - blobCount := make(map[restic.BlobHandle]int) - var duplicateBlobs uint64 - var duplicateBytes uint64 - - // find duplicate blobs - for _, p := range idx.Packs { - for _, entry := range p.Entries { - stats.blobs++ - h := restic.BlobHandle{ID: entry.ID, Type: entry.Type} - blobCount[h]++ - - if blobCount[h] > 1 { - duplicateBlobs++ - duplicateBytes += uint64(entry.Length) - } - } - } - - Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n", - stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes))) - 
Verbosef("load all snapshots\n") - - // find referenced blobs - snapshots, err := restic.LoadAllSnapshots(ctx, repo) - if err != nil { - return err - } - - stats.snapshots = len(snapshots) - usedBlobs, err := getUsedBlobs(gopts, repo, snapshots) if err != nil { return err } - var missingBlobs []restic.BlobHandle - for h := range usedBlobs { - if _, ok := blobCount[h]; !ok { - missingBlobs = append(missingBlobs, h) + return prune(opts, gopts, repo, usedBlobs) +} + +type packInfo struct { + usedBlobs uint + unusedBlobs uint + duplicateBlobs uint + usedSize uint64 + unusedSize uint64 + tpe restic.BlobType +} + +type packInfoWithID struct { + ID restic.ID + packInfo +} + +func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error { + ctx := gopts.ctx + + var stats struct { + blobs struct { + used uint + duplicate uint + unused uint + remove uint + repack uint + repackrm uint + } + size struct { + used uint64 + duplicate uint64 + unused uint64 + remove uint64 + repack uint64 + repackrm uint64 + unref uint64 + } + packs struct { + used uint + unused uint + partlyUsed uint + keep uint } } - if len(missingBlobs) > 0 { - return errors.Fatalf("%v not found in the new index\n"+ + + Verbosef("searching used packs...\n") + + keepBlobs := restic.NewBlobSet() + duplicateBlobs := restic.NewBlobSet() + + // iterate over all blobs in index to find out which blobs are duplicates + for blob := range repo.Index().Each(ctx) { + bh := blob.Handle() + switch { + case usedBlobs.Has(bh): // used blob, move to keepBlobs + usedBlobs.Delete(bh) + keepBlobs.Insert(bh) + case keepBlobs.Has(bh): // duplicate blob + duplicateBlobs.Insert(bh) + } + } + + // Check if all used blobs have been found in index + if len(usedBlobs) != 0 { + Warnf("%v not found in the new index\n"+ "Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - 
"https://github.com/restic/restic/issues/new/choose", missingBlobs) + "https://github.com/restic/restic/issues/new/choose", usedBlobs) + return errorIndexIncomplete } - Verbosef("found %d of %d data blobs still in use, removing %d blobs\n", - len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs)) + indexPack := make(map[restic.ID]packInfo) - // find packs that need a rewrite - rewritePacks := restic.NewIDSet() - for _, pack := range idx.Packs { - if mixedBlobs(pack.Entries) { - rewritePacks.Insert(pack.ID) - continue + // iterate over all blobs in index to generate packInfo + for blob := range repo.Index().Each(ctx) { + ip, ok := indexPack[blob.PackID] + if !ok { + ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize} + } + // mark mixed packs with "Invalid blob type" + if ip.tpe != blob.Type { + ip.tpe = restic.InvalidBlob } - for _, blob := range pack.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if !usedBlobs.Has(h) { - rewritePacks.Insert(pack.ID) - continue - } - - if blobCount[h] > 1 { - rewritePacks.Insert(pack.ID) - } + bh := blob.Handle() + size := uint64(pack.PackedSizeOfBlob(blob.Length)) + switch { + case duplicateBlobs.Has(bh): // duplicate blob + ip.usedSize += size + ip.duplicateBlobs++ + stats.size.duplicate += size + stats.blobs.duplicate++ + case keepBlobs.Has(bh): // used blob, not duplicate + ip.usedSize += size + ip.usedBlobs++ + stats.size.used += size + stats.blobs.used++ + default: // unused blob + ip.unusedSize += size + ip.unusedBlobs++ + stats.size.unused += size + stats.blobs.unused++ } + // update indexPack + indexPack[blob.PackID] = ip } - removeBytes := duplicateBytes - - // find packs that are unneeded + Verbosef("collecting packs for deletion and repacking\n") + removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() + repackPacks := restic.NewIDSet() - Verbosef("will remove %d invalid files\n", len(invalidFiles)) - for _, id := range invalidFiles { - removePacks.Insert(id) + var 
repackCandidates []packInfoWithID + + repack := func(id restic.ID, p packInfo) { + repackPacks.Insert(id) + stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs + stats.size.repack += p.unusedSize + p.usedSize + stats.blobs.repackrm += p.unusedBlobs + stats.size.repackrm += p.unusedSize } - for packID, p := range idx.Packs { - - hasActiveBlob := false - for _, blob := range p.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if usedBlobs.Has(h) { - hasActiveBlob = true - continue - } - - removeBytes += uint64(blob.Length) + // loop over all packs and decide what to do + bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed") + bar.Start() + err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error { + p, ok := indexPack[id] + if !ok { + // Pack was not referenced in index and is not used => immediately remove! + Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str()) + removePacksFirst.Insert(id) + stats.size.unref += uint64(packSize) + return nil } - if hasActiveBlob { - continue + if p.unusedSize+p.usedSize != uint64(packSize) { + Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.", + id.Str(), p.unusedSize+p.usedSize, packSize) + return errorSizeNotMatching } - removePacks.Insert(packID) - - if !rewritePacks.Has(packID) { - return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str()) + // statistics + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + stats.packs.unused++ + case p.unusedBlobs == 0: + stats.packs.used++ + default: + stats.packs.partlyUsed++ } - rewritePacks.Delete(packID) - } + // decide what to do + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + // All blobs in pack are no longer used => remove pack! 
+ removePacks.Insert(id) + stats.blobs.remove += p.unusedBlobs + stats.size.remove += p.unusedSize - Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n", - len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes))) + case opts.RepackCachableOnly && p.tpe == restic.DataBlob, + // if this is a data pack and --repack-cacheable-only is set => keep pack! + p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // && (!opts.RepackSmall || packSize >= repository.MinPackSize) + // All blobs in pack are used and not duplicates/mixed => keep pack! + stats.packs.keep++ - var obsoletePacks restic.IDSet - if len(rewritePacks) != 0 { - bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten") - obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar) - if err != nil { - return err + default: + // all other packs are candidates for repacking + repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p}) } - } - removePacks.Merge(obsoletePacks) - - if err = rebuildIndex(ctx, repo, removePacks); err != nil { + delete(indexPack, id) + bar.Report(restic.Stat{Blobs: 1}) + return nil + }) + bar.Done() + if err != nil { return err } + if len(indexPack) != 0 { + Warnf("The index references pack files which are missing from the repository: %v\n", indexPack) + return errorPacksMissing + } + + repackAllPacksWithDuplicates := true + + maxUnusedSizeAfter := opts.MaxUnusedBytes + if opts.MaxUnusedPercent < 100.0 { + maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used)) + if maxUnusedSizePercent < maxUnusedSizeAfter { + maxUnusedSizeAfter = maxUnusedSizePercent + } + } + + // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. + // This is equivalent to sorting by unused / total space. 
+ // Instead of unused[i] / used[i] > unused[j] / used[j] we use + // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 + // Moreover, duplicates and mixed packs are sorted to the beginning + sort.Slice(repackCandidates, func(i, j int) bool { + pi := repackCandidates[i].packInfo + pj := repackCandidates[j].packInfo + switch { + case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0: + return true + case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0: + return false + case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob: + return true + case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob: + return false + //case opts.RepackSmall && pi.unusedSize+pi.usedSize < repository.MinPackSize && pj.unusedSize+pj.usedSize >= repository.MinPackSize: + // return true + //case opts.RepackSmall && pj.unusedSize+pj.usedSize < repository.MinPackSize && pi.unusedSize+pi.usedSize >= repository.MinPackSize: + // return false + } + return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize + }) + + for _, p := range repackCandidates { + reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter) + reachedRepackSize := (len(opts.MaxRepackSize) > 0 && stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes) + switch { + case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob): + // repacking duplicates/mixed is only limited by repackSize + repack(p.ID, p.packInfo) + + case reachedUnusedSizeAfter, reachedRepackSize: + // for all other packs stop repacking if tolerated unused size is reached.
+ stats.packs.keep++ + if p.duplicateBlobs > 0 { + repackAllPacksWithDuplicates = false + } + + default: + repack(p.ID, p.packInfo) + } + } + + // if all duplicates are repacked, print out correct statistics + if repackAllPacksWithDuplicates { + stats.blobs.repackrm += stats.blobs.duplicate + stats.size.repackrm += stats.size.duplicate + } + + Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used)) + if stats.blobs.duplicate > 0 { + Verboseff("duplicates: %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate)) + } + Verboseff("unused: %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused)) + if stats.size.unref > 0 { + Verboseff("unreferenced: %s\n", formatBytes(stats.size.unref)) + } + totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate + totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref + Verboseff("total: %10d blobs / %s\n", totalBlobs, formatBytes(totalSize)) + Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize)) + + Verbosef("\nto repack: %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack)) + Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm)) + Verbosef("to delete: %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref)) + totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref + Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize)) + Verbosef("remaining: %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize)) + unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm + Verbosef("unused size after prune: %s (%s of remaining size)\n", + formatBytes(unusedAfter), formatPercent(unusedAfter, totalSize-totalPruneSize)) + Verbosef("\n") + 
Verboseff("totally used packs: %10d\n", stats.packs.used) + Verboseff("partly used packs: %10d\n", stats.packs.partlyUsed) + Verboseff("unused packs: %10d\n\n", stats.packs.unused) + + Verboseff("to keep: %10d packs\n", stats.packs.keep) + Verboseff("to repack: %10d packs\n", len(repackPacks)) + Verboseff("to delete: %10d packs\n", len(removePacks)) + if len(removePacksFirst) > 0 { + Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst)) + } + + if opts.DryRun { + if !gopts.JSON && gopts.verbosity >= 2 { + if len(removePacksFirst) > 0 { + Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst) + } + Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks) + Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks) + } + // Always quit here if DryRun was set! + return nil + } + + // unreferenced packs can be safely deleted first + if len(removePacksFirst) != 0 { + Verbosef("deleting unreferenced packs\n") + DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile) + } + + if len(repackPacks) != 0 { + Verbosef("repacking packs\n") + bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked") + _, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar) + if err != nil { + return err + } + // Also remove repacked packs + removePacks.Merge(repackPacks) + } + if len(removePacks) != 0 { - Verbosef("remove %d old packs\n", len(removePacks)) + if err = rebuildIndex(ctx, repo, removePacks); err != nil { + return err + } + + Verbosef("removing %d old packs\n", len(removePacks)) DeleteFiles(gopts, repo, removePacks, restic.PackFile) } @@ -263,7 +477,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) { ctx := gopts.ctx - Verbosef("find data that is still in use for %d snapshots\n", 
len(snapshots)) + Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots)) usedBlobs = restic.NewBlobSet() diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 9c01939ec..10af8701a 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) { "Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove)) } -func testRunPrune(t testing.TB, gopts GlobalOptions) { - rtest.OK(t, runPrune(gopts)) +func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) { + rtest.OK(t, runPrune(opts, gopts)) } func testSetupBackupData(t testing.TB, env *testEnvironment) string { @@ -1386,6 +1386,41 @@ func TestCheckRestoreNoLock(t *testing.T) { } func TestPrune(t *testing.T) { + t.Run("0", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 0.0} + checkOpts := CheckOptions{ReadData: true, CheckUnused: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("50", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 50.0} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("100", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 100.0} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("CachableOnly", func(t *testing.T) { + opts := PruneOptions{RepackCachableOnly: true} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + /* repack-small option will come in future + t.Run("Small", func(t *testing.T) { + opts = PruneOptions{MaxUnusedPercent: 100.0, RepackSmall: true} + // The test case only produces small files; hence no unused blobs should remain. 
+ checkOpts = CheckOptions{ReadData: true, CheckUnused: true} + testPrune(t, opts, checkOpts) + }) + */ +} + +func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -1406,10 +1441,12 @@ func TestPrune(t *testing.T) { testRunForgetJSON(t, env.gopts) testRunForget(t, env.gopts, firstSnapshot[0].String()) - testRunPrune(t, env.gopts) - testRunCheck(t, env.gopts) + testRunPrune(t, env.gopts, pruneOpts) + rtest.OK(t, runCheck(checkOpts, env.gopts, nil)) } +var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5} + func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet { r, err := OpenRepository(gopts) rtest.OK(t, err) @@ -1452,14 +1489,8 @@ func TestPruneWithDamagedRepository(t *testing.T) { "expected one snapshot, got %v", snapshotIDs) // prune should fail - err := runPrune(env.gopts) - if err == nil { - t.Fatalf("expected prune to fail") - } - if !strings.Contains(err.Error(), "blobs seem to be missing") { - t.Fatalf("did not find hint for missing blobs") - } - t.Log(err) + rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing, + "prune should have reported index not complete error") } // Test repos for edge cases @@ -1469,37 +1500,37 @@ func TestEdgeCaseRepos(t *testing.T) { // repo where index is completely missing // => check and prune should fail t.Run("no-index", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false) + testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where an existing and used blob is missing from the index - // => check should fail, prune should heal this + // => check and prune should fail t.Run("index-missing-blob", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true) + testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where a blob is missing 
// => check and prune should fail t.Run("no-data", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false) + testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where data exists that is not referenced // => check and prune should fully work t.Run("unreferenced-data", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo where an obsolete index still exists // => check and prune should fully work t.Run("obsolete-index", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo which contains mixed (data/tree) packs // => check and prune should fully work t.Run("mixed-packs", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo which contains duplicate blobs @@ -1510,11 +1541,11 @@ func TestEdgeCaseRepos(t *testing.T) { CheckUnused: true, } t.Run("duplicates", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true) + testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true) }) } -func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) { +func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -1524,15 +1555,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkO if checkOK { testRunCheck(t, env.gopts) } else { - rtest.Assert(t, runCheck(options, env.gopts, nil) != nil, + rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != 
nil, "check should have reported an error") } if pruneOK { - testRunPrune(t, env.gopts) + testRunPrune(t, env.gopts, optionsPrune) testRunCheck(t, env.gopts) } else { - rtest.Assert(t, runPrune(env.gopts) != nil, + rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil, "prune should have reported an error") } } diff --git a/doc/060_forget.rst b/doc/060_forget.rst index 99dec6c51..2df82af8b 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can be automated with the ``--prune`` option of the ``forget`` command, which runs ``prune`` automatically if snapshots have been removed. -.. Warning:: - - Pruning snapshots can be a very time-consuming process, taking nearly - as long as backups themselves. During a prune operation, the index is - locked and backups cannot be completed. Performance improvements are - planned for this feature. +Pruning snapshots can be a time-consuming process, depending on the +amount of snapshots and data to process. During a prune operation, the +repository is locked and backups cannot be completed. Please plan your +pruning so that there's time to complete it and it doesn't interfere with +regular backup runs. It is advisable to run ``restic check`` after pruning, to make sure you are alerted, should the internal data structures of the repository @@ -82,20 +81,32 @@ command must be run: $ restic -r /srv/restic-repo prune enter password for repository: - + repository 33002c5e opened successfully, password is correct + loading all snapshots... + loading indexes... + finding data that is still in use for 4 snapshots + [0:00] 100.00% 4 / 4 snapshots + searching used packs... 
+ collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked counting files in repo - building new index for repo - [0:00] 100.00% 22 / 22 files - repository contains 22 packs (8512 blobs) with 100.092 MiB bytes - processed 8512 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots - [0:00] 100.00% 1 / 1 snapshots - found 8433 of 8512 data blobs still in use - will rewrite 3 packs - creating new index - [0:00] 86.36% 19 / 22 files - saved new index as 544a5084 + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Afterwards the repository is smaller. @@ -119,19 +130,31 @@ to ``forget``: 8c02b94b 2017-02-21 10:48:33 mopped /home/user/work 1 snapshots have been removed, running prune - counting files in repo - building new index for repo - [0:00] 100.00% 37 / 37 packs - repository contains 37 packs (5521 blobs) with 151.012 MiB bytes - processed 5521 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots + loading all snapshots... + loading indexes... + finding data that is still in use for 1 snapshots [0:00] 100.00% 1 / 1 snapshots - found 5323 of 5521 data blobs still in use, removing 198 blobs - will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB - creating new index - [0:00] 100.00% 30 / 30 packs - saved new index as b49f3e68 + searching used packs... 
+ collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked + counting files in repo + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Removing snapshots according to a policy @@ -282,3 +305,44 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month). And finally 75 last-day-of-the-year snapshots. All other snapshots are removed. +Customize pruning +***************** + +To understand the custom options, we first explain how the pruning process works: + +- First all snapshots and directories within snapshots are scanned to determine + which data is still in use. +- Then for all pack files ``prune`` finds out if the file is fully used, partly + used or completely unused. +- Completely unused packs are marked for deletion. Fully used packs are kept. + A partially used pack is either kept or marked for repacking depending on user + options. + Note that for repacking, restic must download the file from the repository + storage and reupload the needed data in the repository. This can be very + time-consuming for remote repositories. +- After deciding what to do, ``prune`` will actually perform the repack, modify + the index according to the changes and delete the obsolete files. + +The ``prune`` command accepts the following options: + +- ``--max-unused limit`` allow unused data up to the specified limit within the repository. + This allows restic to keep partly used packs instead of repacking them. + The limit can be specified as size, e.g. 
"200M" or in percentage with respect to the total + repository size, e.g. "0.5%". + ``prune`` tries to repack as little data as possible while still ensuring this + limit for unused data. + If you want to minimize the space used by your repository, use a value of 0%. + If you want to minimize the time and bandwidth used by the ``prune`` command, use a + high value. A value of 100% will not require any pack file to be repacked. + The default value is 5%. +- ``--max-repack-size size`` if set limits the total size of packs to repack. + As ``prune`` first stores all repacked packs and deletes the obsolete packs at the end, + this option might be handy if you expect many packs to be repacked and fear to run low + on storage. +- ``--repack-cacheable-only`` if set to true only pack files which are cacheable are repacked. + Other pack files are not repacked, if this option is set. + This allows a very fast repacking using only cached data. It can, however, imply that the + unused data in your repository exceeds the value given by ``--max-unused-percent``. + The default value is false. +- ``--dry-run`` only show what ``prune`` would do. +- ``--verbose`` increased verbosity shows additional statistics for ``prune``. 
From b2f5381737d73f509c40437ae42c5dd20ec9d23f Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sun, 19 Jul 2020 07:13:41 +0200 Subject: [PATCH 05/13] Make realistic forget --prune --dryrun --- cmd/restic/cmd_forget.go | 3 +-- cmd/restic/cmd_prune.go | 6 +++--- internal/restic/snapshot.go | 6 +++++- internal/restic/testing_test.go | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cmd/restic/cmd_forget.go b/cmd/restic/cmd_forget.go index fa9739c0b..596c7c550 100644 --- a/cmd/restic/cmd_forget.go +++ b/cmd/restic/cmd_forget.go @@ -214,9 +214,8 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error { if !gopts.JSON { Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs)) } - pruneOptions.DryRun = opts.DryRun - return runPruneWithRepo(pruneOptions, gopts, repo) + return runPruneWithRepo(pruneOptions, gopts, repo, removeSnIDs) } return nil diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 9a93c600c..605f6258e 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -128,15 +128,15 @@ func runPrune(opts PruneOptions, gopts GlobalOptions) error { return err } - return runPruneWithRepo(opts, gopts, repo) + return runPruneWithRepo(opts, gopts, repo, restic.NewIDSet()) } -func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository) error { +func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository, ignoreSnapshots restic.IDSet) error { // we do not need index updates while pruning! 
repo.DisableAutoIndexUpdate() Verbosef("loading all snapshots...\n") - snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo) + snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo, ignoreSnapshots) if err != nil { return err } diff --git a/internal/restic/snapshot.go b/internal/restic/snapshot.go index dc0dd5949..86e98e234 100644 --- a/internal/restic/snapshot.go +++ b/internal/restic/snapshot.go @@ -67,8 +67,12 @@ func LoadSnapshot(ctx context.Context, repo Repository, id ID) (*Snapshot, error } // LoadAllSnapshots returns a list of all snapshots in the repo. -func LoadAllSnapshots(ctx context.Context, repo Repository) (snapshots []*Snapshot, err error) { +// If a snapshot ID is in excludeIDs, it will not be included in the result. +func LoadAllSnapshots(ctx context.Context, repo Repository, excludeIDs IDSet) (snapshots []*Snapshot, err error) { err = repo.List(ctx, SnapshotFile, func(id ID, size int64) error { + if excludeIDs.Has(id) { + return nil + } sn, err := LoadSnapshot(ctx, repo, id) if err != nil { return err diff --git a/internal/restic/testing_test.go b/internal/restic/testing_test.go index 0386fb76a..c3989f55f 100644 --- a/internal/restic/testing_test.go +++ b/internal/restic/testing_test.go @@ -25,7 +25,7 @@ func TestCreateSnapshot(t *testing.T) { restic.TestCreateSnapshot(t, repo, testSnapshotTime.Add(time.Duration(i)*time.Second), testDepth, 0) } - snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo) + snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo, restic.NewIDSet()) if err != nil { t.Fatal(err) } From 1dd9fdce74818984438173587447035c66e722e0 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 10:40:34 +0100 Subject: [PATCH 06/13] Reword changelog slightly --- changelog/unreleased/pull-2718 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/changelog/unreleased/pull-2718 b/changelog/unreleased/pull-2718 index b2c32cc45..56af35d71 100644 --- 
a/changelog/unreleased/pull-2718 +++ b/changelog/unreleased/pull-2718 @@ -1,16 +1,16 @@ Enhancement: Improve pruning performance and make pruning more customizable -The prune command is now much faster. This is especially the case for remote -repositories or repositories with not much data to prune. -Also the memory usage of the prune command is now reduced. +The `prune` command is now much faster. This is especially the case for remote +repositories or repositories with not much data to remove. +Also the memory usage of the `prune` command is now reduced. -By default the prune command now no longer removes all unused blobs. This -behavior can be fine-tuned by new options, like tolerated unused space or -maximum size of packs to repack. For more details, see +By default, the `prune` command no longer removes all unused data. This +behavior can be fine-tuned by new options, like the acceptable amount of unused space or +the maximum size of data to reorganize. For more details, see https://restic.readthedocs.io/en/stable/060_forget.html -Moreover, prune now accepts the dry-run option and forget --dry-run --prune -also shows what prune would do. +Moreover, `prune` now accepts the `--dry-run` option and `forget --dry-run --prune` +also shows what `prune` would do. 
Fixes several open issues, e.g.: https://github.com/restic/restic/issues/1140 From 095155d9ce346f3955bc6a50ff9d8b39d5ec8978 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 12:20:46 +0100 Subject: [PATCH 07/13] Remove RepackSmall --- cmd/restic/cmd_prune.go | 20 +++++++------------- cmd/restic/integration_test.go | 9 --------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 605f6258e..f9f42d720 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -37,13 +37,12 @@ Exit status is 0 if the command was successful, and non-zero if there was any er // PruneOptions collects all options for the cleanup command. type PruneOptions struct { - DryRun bool - MaxUnused string - MaxUnusedPercent float64 - MaxUnusedBytes uint64 - MaxRepackSize string - MaxRepackBytes uint64 - // RepackSmall bool <- This option may be added later + DryRun bool + MaxUnused string + MaxUnusedPercent float64 + MaxUnusedBytes uint64 + MaxRepackSize string + MaxRepackBytes uint64 RepackCachableOnly bool } @@ -61,7 +60,6 @@ func addPruneOptions(c *cobra.Command) { f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)") f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") - // f.BoolVar(&pruneOptions.RepackSmall, "repack-small", false, "also repack small packs") } func verifyPruneOptions(opts *PruneOptions) error { @@ -314,7 +312,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB case opts.RepackCachableOnly && p.tpe == restic.DataBlob, // if this is a data pack and --repack-cacheable-only is set => keep pack! 
- p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // && (!opts.RepackSmall || packSize >= repository.MinPackSize) + p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // All blobs in pack are used and not duplicates/mixed => keep pack! stats.packs.keep++ @@ -364,10 +362,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB return true case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob: return false - //case opts.RepackSmall && pi.unusedSize+pi.usedSize < repository.MinPackSize && pj.unusedSize+pj.usedSize >= repository.MinPackSize: - // return true - //case opts.RepackSmall && pj.unusedSize+pj.usedSize < repository.MinPackSize && pi.unusedSize+pi.usedSize >= repository.MinPackSize: - // return false } return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize }) diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 10af8701a..6aeeab634 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -1409,15 +1409,6 @@ func TestPrune(t *testing.T) { checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) - - /* repack-small option will come in future - t.Run("Small", func(t *testing.T) { - opts = PruneOptions{MaxUnusedPercent: 100.0, RepackSmall: true} - // The test case only produces small files; hence no unused blobs should remain. 
- checkOpts = CheckOptions{ReadData: true, CheckUnused: true} - testPrune(t, opts, checkOpts) - }) - */ } func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { From aff1e220f540f3bd2690d555d32c11355e96f969 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 11:14:53 +0100 Subject: [PATCH 08/13] Split struct members, add comments --- cmd/restic/cmd_prune.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index f9f42d720..e42eb598f 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -37,12 +37,15 @@ Exit status is 0 if the command was successful, and non-zero if there was any er // PruneOptions collects all options for the cleanup command. type PruneOptions struct { - DryRun bool - MaxUnused string - MaxUnusedPercent float64 - MaxUnusedBytes uint64 - MaxRepackSize string - MaxRepackBytes uint64 + DryRun bool + + MaxUnused string + MaxUnusedPercent float64 // set if MaxUnused is a percentage + MaxUnusedBytes uint64 // set if MaxUnused is an absolute number of bytes + + MaxRepackSize string + MaxRepackBytes uint64 + RepackCachableOnly bool } From a5b80452fe99e0d7c04b33b21e3bfaa9b6bd7f49 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 12:50:33 +0100 Subject: [PATCH 09/13] Add comment that usedBlobs is modified --- cmd/restic/cmd_prune.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index e42eb598f..90b702e61 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -170,6 +170,8 @@ type packInfoWithID struct { packInfo } +// prune selects which files to rewrite and then does that. The map usedBlobs is +// modified in the process. 
func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error { ctx := gopts.ctx From f8c4dd7b1a938067152443ef5f1884f1b40c8cb1 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 12:50:47 +0100 Subject: [PATCH 10/13] Split pack rewrite logic into two case branches The comma is too subtle, let's split this into two separate branches. --- cmd/restic/cmd_prune.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 90b702e61..4f83a9874 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -315,9 +315,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB stats.blobs.remove += p.unusedBlobs stats.size.remove += p.unusedSize - case opts.RepackCachableOnly && p.tpe == restic.DataBlob, + case opts.RepackCachableOnly && p.tpe == restic.DataBlob: // if this is a data pack and --repack-cacheable-only is set => keep pack! - p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: + stats.packs.keep++ + + case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // All blobs in pack are used and not duplicates/mixed => keep pack! stats.packs.keep++ From c1a3de4a6e7e373f80e65f218c109b5a09cab8a5 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 10:53:38 +0100 Subject: [PATCH 11/13] Refactor max-unused calculation, add `unlimited` option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a callback to the PruneOptions struct which calculates the number of bytes allowed to be unused after prune is done. This way, the logic is closer to the option parsing code. Also, add an explicit option `unlimited` for the use case when storage does not matter but bandwidth and time do. Internally, this sets the maximum number of unused bytes to MaxUint64. 
Rework the documentation slightly so that no more "packs" are mentioned and it talks about "files" instead. Make it clear in the documentation that the percentage given to `--max-unused` is relative to the whole repository size after pruning is done. If specified, it must be below 100%, otherwise the repository would contain 100% of unused data, which is pointless. I had a hard time coming up with the correct formula to calculate the maximum number of unused bytes based on the number of used bytes. For a fraction `p` (0 ≤ p < 1), a repo with `u` bytes used, and the number of unused bytes `x` the following holds: x ≤ p * (u+x) ⇔ x ≤ p*u + p*x ⇔ x - p*x ≤ p*u ⇔ x * (1-p) ≤ p*u ⇔ x ≤ p/(1-p) * u --- cmd/restic/cmd_prune.go | 73 ++++++++++++++++++++-------------- cmd/restic/integration_test.go | 12 +++--- doc/060_forget.rst | 73 ++++++++++++++++++++-------------- 3 files changed, 94 insertions(+), 64 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 4f83a9874..0ff4600b8 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -1,8 +1,10 @@ package main import ( + "math" "sort" "strconv" + "strings" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" @@ -39,9 +41,8 @@ Exit status is 0 if the command was successful, and non-zero if there was any er type PruneOptions struct { DryRun bool - MaxUnused string - MaxUnusedPercent float64 // set if MaxUnused is a percentage - MaxUnusedBytes uint64 // set if MaxUnused is an absolute number of bytes + MaxUnused string + maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused MaxRepackSize string MaxRepackBytes uint64 @@ -60,7 +61,7 @@ func init() { func addPruneOptions(c *cobra.Command) { f := c.Flags() - f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)") + 
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')") f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") } @@ -74,27 +75,46 @@ func verifyPruneOptions(opts *PruneOptions) error { opts.MaxRepackBytes = uint64(size) } - length := len(opts.MaxUnused) - if length == 0 { - return nil + maxUnused := strings.TrimSpace(opts.MaxUnused) + if maxUnused == "" { + return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused) } - var err error - if opts.MaxUnused[length-1] == '%' { - opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64) - opts.MaxUnusedBytes = ^uint64(0) - } else { - var size int64 - size, err = parseSizeStr(opts.MaxUnused) - opts.MaxUnusedPercent = 100.0 - opts.MaxUnusedBytes = uint64(size) - } - if err != nil { - return err - } + // parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes + switch { + case maxUnused == "unlimited": + opts.maxUnusedBytes = func(used uint64) uint64 { + return math.MaxUint64 + } - if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 { - return errors.Fatalf("--max-unused-percent should be between 0 and 100. 
Given value: %f", opts.MaxUnusedPercent) + case strings.HasSuffix(maxUnused, "%"): + maxUnused = strings.TrimSuffix(maxUnused, "%") + p, err := strconv.ParseFloat(maxUnused, 64) + if err != nil { + return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err) + } + + if p < 0 { + return errors.Fatal("percentage for --max-unused must be positive") + } + + if p >= 100 { + return errors.Fatal("percentage for --max-unused must be below 100%") + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(p / (100 - p) * float64(used)) + } + + default: + size, err := parseSizeStr(maxUnused) + if err != nil { + return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err) + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(size) + } } return nil @@ -344,13 +364,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB repackAllPacksWithDuplicates := true - maxUnusedSizeAfter := opts.MaxUnusedBytes - if opts.MaxUnusedPercent < 100.0 { - maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used)) - if maxUnusedSizePercent < maxUnusedSizeAfter { - maxUnusedSizeAfter = maxUnusedSizePercent - } - } + // calculate limit for number of unused bytes in the repo after repacking + maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used) // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. // This is equivalent to sorting by unused / total space. 
diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 6aeeab634..789240e0c 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -1387,25 +1387,25 @@ func TestCheckRestoreNoLock(t *testing.T) { func TestPrune(t *testing.T) { t.Run("0", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 0.0} + opts := PruneOptions{MaxUnused: "0%"} checkOpts := CheckOptions{ReadData: true, CheckUnused: true} testPrune(t, opts, checkOpts) }) t.Run("50", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 50.0} + opts := PruneOptions{MaxUnused: "50%"} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) - t.Run("100", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 100.0} + t.Run("unlimited", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "unlimited"} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) t.Run("CachableOnly", func(t *testing.T) { - opts := PruneOptions{RepackCachableOnly: true} + opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) @@ -1436,7 +1436,7 @@ func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { rtest.OK(t, runCheck(checkOpts, env.gopts, nil)) } -var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5} +var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"} func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet { r, err := OpenRepository(gopts) diff --git a/doc/060_forget.rst b/doc/060_forget.rst index 2df82af8b..08381f180 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -310,39 +310,54 @@ Customize pruning To understand the custom options, we first explain how the pruning process works: -- First all snapshots and directories within snapshots are scanned to determine - which data is still in use. 
-- Then for all pack files ``prune`` finds out if the file is fully used, partly - used or completely unused. -- Completely unused packs are marked for deletion. Fully used packs are kept. - A partially used pack is either kept or marked for repacking depending on user - options. - Note that for repacking, restic must download the file from the repository - storage and reupload the needed data in the repository. This can be very - time-consuming for remote repositories. -- After deciding what to do, ``prune`` will actually perform the repack, modify - the index according to the changes and delete the obsolete files. +1. All snapshots and directories within snapshots are scanned to determine + which data is still in use. +2. For all files in the repository, restic finds out if the file is fully + used, partly used or completely unused. +3. Completely unused files are marked for deletion. Fully used files are kept. + A partially used file is either kept or marked for repacking depending on user + options. + + Note that for repacking, restic must download the file from the repository + storage and re-upload the needed data in the repository. This can be very + time-consuming for remote repositories. +4. After deciding what to do, ``prune`` will actually perform the repack, modify + the index according to the changes and delete the obsolete files. The ``prune`` command accepts the following options: - ``--max-unused limit`` allow unused data up to the specified limit within the repository. - This allows restic to keep partly used packs instead of repacking them. - The limit can be specified as size, e.g. "200M" or in percentage with respect to the total - repository size, e.g. "0.5%". - ``prune`` tries to repack as little data as possible while still ensuring this + This allows restic to keep partly used files instead of repacking them. + + The limit can be specified in several ways: + + * As an absolute size (e.g. ``200M``). 
If you want to minimize the space + used by your repository, pass ``0`` to this option. + * As a size relative to the total repo size (e.g. ``10%``). This means that + after prune, at most ``10%`` of the total data stored in the repo may be + unused data. If the repo after prune has a size of 500MB, then at most + 50MB may be unused. + * If the string ``unlimited`` is passed, there is no limit for partly + unused files. This means that as long as some data is still used within + a file stored in the repo, restic will just leave it there. Use this if + you want to minimize the time and bandwidth used by the ``prune`` + operation. + + Restic tries to repack as little data as possible while still ensuring this limit for unused data. - If you want to minimize the space used by your repository, use a value of 0%. - If you want to minimize the time and bandwidth used by the ``prune`` command, use a - high value. A value of 100% will not require any pack file to be repacked. - The default value is 5%. -- ``--max-repack-size size`` if set limits the total size of packs to repack. - As ``prune`` first stores all repacked packs and deletes the obsolete packs at the end, - this option might be handy if you expect many packs to be repacked and fear to run low - on storage. -- ``--repack-cacheable-only`` if set to true only pack files which are cacheable are repacked. - Other pack files are not repacked, if this option is set. - This allows a very fast repacking using only cached data. It can, however, imply that the - unused data in your repository exceeds the value given by ``--max-unused-percent``. - The default value is false. + +- ``--max-repack-size size`` if set limits the total size of files to repack. + As ``prune`` first stores all repacked files and deletes the obsolete files at the end, + this option might be handy if you expect many files to be repacked and fear to run low + on storage. 
+ +- ``--repack-cacheable-only`` if set to true only files which contain + metadata and would be stored in the cache are repacked. Other pack files are + not repacked if this option is set. This allows a very fast repacking + using only cached data. It can, however, imply that the unused data in + your repository exceeds the value given by ``--max-unused``. + The default value is false. + - ``--dry-run`` only show what ``prune`` would do. + - ``--verbose`` increased verbosity shows additional statistics for ``prune``. From 7f86eb4ec03ca72e6814dc5770271995e07e6049 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 13:27:53 +0100 Subject: [PATCH 12/13] Move helper function --- cmd/restic/cmd_prune.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 0ff4600b8..3b9f4f505 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -290,14 +290,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB var repackCandidates []packInfoWithID - repack := func(id restic.ID, p packInfo) { - repackPacks.Insert(id) - stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs - stats.size.repack += p.unusedSize + p.usedSize - stats.blobs.repackrm += p.unusedBlobs - stats.size.repackrm += p.unusedSize - } - // loop over all packs and decide what to do bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed") bar.Start() @@ -388,6 +380,14 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize }) + repack := func(id restic.ID, p packInfo) { + repackPacks.Insert(id) + stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs + stats.size.repack += p.unusedSize + p.usedSize + stats.blobs.repackrm += p.unusedBlobs + stats.size.repackrm += p.unusedSize + } + for _, p := range repackCandidates { 
reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter) reachedRepackSize := (len(opts.MaxRepackSize) > 0 && stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes) From 1ca60bccfba1d1d5929b64d05df0a5654d81e548 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 13:28:21 +0100 Subject: [PATCH 13/13] Refactor condition for MaxRepackBytes Don't depend on the string (opts.MaxRepackSize) for the condition, instead check if there's a (positive) limit configured. --- cmd/restic/cmd_prune.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 3b9f4f505..4fbc5fab7 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -390,7 +390,12 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB for _, p := range repackCandidates { reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter) - reachedRepackSize := (len(opts.MaxRepackSize) > 0 && stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes) + + reachedRepackSize := false + if opts.MaxRepackBytes > 0 { + reachedRepackSize = stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes + } + switch { case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob): // repacking duplicates/mixed is only limited by repackSize