From 7f9a0a5907b44c543cd08c0e169103975abac335 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sun, 19 Jul 2020 07:55:14 +0200 Subject: [PATCH] Reimplementation of prune --- changelog/unreleased/pull-2718 | 22 ++ cmd/restic/cmd_forget.go | 13 +- cmd/restic/cmd_prune.go | 524 +++++++++++++++++++++++---------- cmd/restic/integration_test.go | 79 +++-- doc/060_forget.rst | 126 ++++++-- 5 files changed, 553 insertions(+), 211 deletions(-) create mode 100644 changelog/unreleased/pull-2718 diff --git a/changelog/unreleased/pull-2718 b/changelog/unreleased/pull-2718 new file mode 100644 index 000000000..b2c32cc45 --- /dev/null +++ b/changelog/unreleased/pull-2718 @@ -0,0 +1,22 @@ +Enhancement: Improve pruning performance and make pruning more customizable + +The prune command is now much faster. This is especially the case for remote +repositories or repositories with little data to prune. +The memory usage of the prune command is also reduced. + +By default, the prune command no longer removes all unused blobs. This +behavior can be fine-tuned with new options, such as the tolerated unused space or +the maximum size of packs to repack. For more details, see +https://restic.readthedocs.io/en/stable/060_forget.html + +Moreover, prune now accepts the --dry-run option, and forget --dry-run --prune +also shows what prune would do.
+ +Fixes several open issues, e.g.: +https://github.com/restic/restic/issues/1140 +https://github.com/restic/restic/issues/1985 +https://github.com/restic/restic/issues/2112 +https://github.com/restic/restic/issues/2227 +https://github.com/restic/restic/issues/2305 + +https://github.com/restic/restic/pull/2718 diff --git a/cmd/restic/cmd_forget.go b/cmd/restic/cmd_forget.go index 3edaa76e9..fa9739c0b 100644 --- a/cmd/restic/cmd_forget.go +++ b/cmd/restic/cmd_forget.go @@ -80,9 +80,15 @@ func init() { f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed") f.SortFlags = false + addPruneOptions(cmdForget) } func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error { + err := verifyPruneOptions(&pruneOptions) + if err != nil { + return err + } + repo, err := OpenRepository(gopts) if err != nil { return err @@ -205,7 +211,12 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error { } if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun { - return pruneRepository(gopts, repo) + if !gopts.JSON { + Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs)) + } + + pruneOptions.DryRun = opts.DryRun + return runPruneWithRepo(pruneOptions, gopts, repo) } return nil diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 1bb1a51e5..9a93c600c 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -1,15 +1,22 @@ package main import ( + "sort" + "strconv" + "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" - "github.com/restic/restic/internal/index" + "github.com/restic/restic/internal/pack" "github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" "github.com/spf13/cobra" ) +var errorIndexIncomplete = errors.Fatal("index is not complete") +var errorPacksMissing = errors.Fatal("packs from index missing in repo") +var errorSizeNotMatching = errors.Fatal("pack 
size does not match calculated size from index") + var cmdPrune = &cobra.Command{ Use: "prune [flags]", Short: "Remove unneeded data from the repository", @@ -24,12 +31,72 @@ Exit status is 0 if the command was successful, and non-zero if there was any er `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { - return runPrune(globalOptions) + return runPrune(pruneOptions, globalOptions) }, } +// PruneOptions collects all options for the cleanup command. +type PruneOptions struct { + DryRun bool + MaxUnused string + MaxUnusedPercent float64 + MaxUnusedBytes uint64 + MaxRepackSize string + MaxRepackBytes uint64 + // RepackSmall bool <- This option may be added later + RepackCachableOnly bool +} + +var pruneOptions PruneOptions + func init() { cmdRoot.AddCommand(cmdPrune) + f := cmdPrune.Flags() + f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done") + addPruneOptions(cmdPrune) +} + +func addPruneOptions(c *cobra.Command) { + f := c.Flags() + f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)") + f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") + f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") + // f.BoolVar(&pruneOptions.RepackSmall, "repack-small", false, "also repack small packs") +} + +func verifyPruneOptions(opts *PruneOptions) error { + if len(opts.MaxRepackSize) > 0 { + size, err := parseSizeStr(opts.MaxRepackSize) + if err != nil { + return err + } + opts.MaxRepackBytes = uint64(size) + } + + length := len(opts.MaxUnused) + if length == 0 { + return nil + } + + var err error + if opts.MaxUnused[length-1] == '%' { + opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64) + opts.MaxUnusedBytes = 
^uint64(0) + } else { + var size int64 + size, err = parseSizeStr(opts.MaxUnused) + opts.MaxUnusedPercent = 100.0 + opts.MaxUnusedBytes = uint64(size) + } + if err != nil { + return err + } + + if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 { + return errors.Fatalf("--max-unused-percent should be between 0 and 100. Given value: %f", opts.MaxUnusedPercent) + } + + return nil } func shortenStatus(maxLength int, s string) string { @@ -44,7 +111,12 @@ func shortenStatus(maxLength int, s string) string { return s[:maxLength-3] + "..." } -func runPrune(gopts GlobalOptions) error { +func runPrune(opts PruneOptions, gopts GlobalOptions) error { + err := verifyPruneOptions(&opts) + if err != nil { + return err + } + repo, err := OpenRepository(gopts) if err != nil { return err @@ -56,203 +128,345 @@ func runPrune(gopts GlobalOptions) error { return err } + return runPruneWithRepo(opts, gopts, repo) +} + +func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository) error { // we do not need index updates while pruning! 
repo.DisableAutoIndexUpdate() - return pruneRepository(gopts, repo) -} - -func mixedBlobs(list []restic.Blob) bool { - var tree, data bool - - for _, pb := range list { - switch pb.Type { - case restic.TreeBlob: - tree = true - case restic.DataBlob: - data = true - } - - if tree && data { - return true - } - } - - return false -} - -func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { - ctx := gopts.ctx - - err := repo.LoadIndex(ctx) + Verbosef("loading all snapshots...\n") + snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo) if err != nil { return err } - var stats struct { - blobs int - packs int - snapshots int - bytes int64 - } - - Verbosef("counting files in repo\n") - err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error { - stats.packs++ - return nil - }) + Verbosef("loading indexes...\n") + err = repo.LoadIndex(gopts.ctx) if err != nil { return err } - Verbosef("building new index for repo\n") - - bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs") - idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar) - if err != nil { - return err - } - - for _, id := range invalidFiles { - Warnf("incomplete pack file (will be removed): %v\n", id) - } - - blobs := 0 - for _, pack := range idx.Packs { - stats.bytes += pack.Size - blobs += len(pack.Entries) - } - Verbosef("repository contains %v packs (%v blobs) with %v\n", - len(idx.Packs), blobs, formatBytes(uint64(stats.bytes))) - - blobCount := make(map[restic.BlobHandle]int) - var duplicateBlobs uint64 - var duplicateBytes uint64 - - // find duplicate blobs - for _, p := range idx.Packs { - for _, entry := range p.Entries { - stats.blobs++ - h := restic.BlobHandle{ID: entry.ID, Type: entry.Type} - blobCount[h]++ - - if blobCount[h] > 1 { - duplicateBlobs++ - duplicateBytes += uint64(entry.Length) - } - } - } - - Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n", - stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes))) - 
Verbosef("load all snapshots\n") - - // find referenced blobs - snapshots, err := restic.LoadAllSnapshots(ctx, repo) - if err != nil { - return err - } - - stats.snapshots = len(snapshots) - usedBlobs, err := getUsedBlobs(gopts, repo, snapshots) if err != nil { return err } - var missingBlobs []restic.BlobHandle - for h := range usedBlobs { - if _, ok := blobCount[h]; !ok { - missingBlobs = append(missingBlobs, h) + return prune(opts, gopts, repo, usedBlobs) +} + +type packInfo struct { + usedBlobs uint + unusedBlobs uint + duplicateBlobs uint + usedSize uint64 + unusedSize uint64 + tpe restic.BlobType +} + +type packInfoWithID struct { + ID restic.ID + packInfo +} + +func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error { + ctx := gopts.ctx + + var stats struct { + blobs struct { + used uint + duplicate uint + unused uint + remove uint + repack uint + repackrm uint + } + size struct { + used uint64 + duplicate uint64 + unused uint64 + remove uint64 + repack uint64 + repackrm uint64 + unref uint64 + } + packs struct { + used uint + unused uint + partlyUsed uint + keep uint } } - if len(missingBlobs) > 0 { - return errors.Fatalf("%v not found in the new index\n"+ + + Verbosef("searching used packs...\n") + + keepBlobs := restic.NewBlobSet() + duplicateBlobs := restic.NewBlobSet() + + // iterate over all blobs in index to find out which blobs are duplicates + for blob := range repo.Index().Each(ctx) { + bh := blob.Handle() + switch { + case usedBlobs.Has(bh): // used blob, move to keepBlobs + usedBlobs.Delete(bh) + keepBlobs.Insert(bh) + case keepBlobs.Has(bh): // duplicate blob + duplicateBlobs.Insert(bh) + } + } + + // Check if all used blobs have been found in index + if len(usedBlobs) != 0 { + Warnf("%v not found in the new index\n"+ "Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - 
"https://github.com/restic/restic/issues/new/choose", missingBlobs) + "https://github.com/restic/restic/issues/new/choose", usedBlobs) + return errorIndexIncomplete } - Verbosef("found %d of %d data blobs still in use, removing %d blobs\n", - len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs)) + indexPack := make(map[restic.ID]packInfo) - // find packs that need a rewrite - rewritePacks := restic.NewIDSet() - for _, pack := range idx.Packs { - if mixedBlobs(pack.Entries) { - rewritePacks.Insert(pack.ID) - continue + // iterate over all blobs in index to generate packInfo + for blob := range repo.Index().Each(ctx) { + ip, ok := indexPack[blob.PackID] + if !ok { + ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize} + } + // mark mixed packs with "Invalid blob type" + if ip.tpe != blob.Type { + ip.tpe = restic.InvalidBlob } - for _, blob := range pack.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if !usedBlobs.Has(h) { - rewritePacks.Insert(pack.ID) - continue - } - - if blobCount[h] > 1 { - rewritePacks.Insert(pack.ID) - } + bh := blob.Handle() + size := uint64(pack.PackedSizeOfBlob(blob.Length)) + switch { + case duplicateBlobs.Has(bh): // duplicate blob + ip.usedSize += size + ip.duplicateBlobs++ + stats.size.duplicate += size + stats.blobs.duplicate++ + case keepBlobs.Has(bh): // used blob, not duplicate + ip.usedSize += size + ip.usedBlobs++ + stats.size.used += size + stats.blobs.used++ + default: // unused blob + ip.unusedSize += size + ip.unusedBlobs++ + stats.size.unused += size + stats.blobs.unused++ } + // update indexPack + indexPack[blob.PackID] = ip } - removeBytes := duplicateBytes - - // find packs that are unneeded + Verbosef("collecting packs for deletion and repacking\n") + removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() + repackPacks := restic.NewIDSet() - Verbosef("will remove %d invalid files\n", len(invalidFiles)) - for _, id := range invalidFiles { - removePacks.Insert(id) + var 
repackCandidates []packInfoWithID + + repack := func(id restic.ID, p packInfo) { + repackPacks.Insert(id) + stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs + stats.size.repack += p.unusedSize + p.usedSize + stats.blobs.repackrm += p.unusedBlobs + stats.size.repackrm += p.unusedSize } - for packID, p := range idx.Packs { - - hasActiveBlob := false - for _, blob := range p.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if usedBlobs.Has(h) { - hasActiveBlob = true - continue - } - - removeBytes += uint64(blob.Length) + // loop over all packs and decide what to do + bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed") + bar.Start() + err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error { + p, ok := indexPack[id] + if !ok { + // Pack was not referenced in index and is not used => immediately remove! + Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str()) + removePacksFirst.Insert(id) + stats.size.unref += uint64(packSize) + return nil } - if hasActiveBlob { - continue + if p.unusedSize+p.usedSize != uint64(packSize) { + Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.", + id.Str(), p.unusedSize+p.usedSize, packSize) + return errorSizeNotMatching } - removePacks.Insert(packID) - - if !rewritePacks.Has(packID) { - return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str()) + // statistics + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + stats.packs.unused++ + case p.unusedBlobs == 0: + stats.packs.used++ + default: + stats.packs.partlyUsed++ } - rewritePacks.Delete(packID) - } + // decide what to do + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + // All blobs in pack are no longer used => remove pack! 
+ removePacks.Insert(id) + stats.blobs.remove += p.unusedBlobs + stats.size.remove += p.unusedSize - Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n", - len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes))) + case opts.RepackCachableOnly && p.tpe == restic.DataBlob, + // if this is a data pack and --repack-cacheable-only is set => keep pack! + p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // && (!opts.RepackSmall || packSize >= repository.MinPackSize) + // All blobs in pack are used and not duplicates/mixed => keep pack! + stats.packs.keep++ - var obsoletePacks restic.IDSet - if len(rewritePacks) != 0 { - bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten") - obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar) - if err != nil { - return err + default: + // all other packs are candidates for repacking + repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p}) } - } - removePacks.Merge(obsoletePacks) - - if err = rebuildIndex(ctx, repo, removePacks); err != nil { + delete(indexPack, id) + bar.Report(restic.Stat{Blobs: 1}) + return nil + }) + bar.Done() + if err != nil { return err } + if len(indexPack) != 0 { + Warnf("The index references pack files which are missing from the repository: %v\n", indexPack) + return errorPacksMissing + } + + repackAllPacksWithDuplicates := true + + maxUnusedSizeAfter := opts.MaxUnusedBytes + if opts.MaxUnusedPercent < 100.0 { + maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used)) + if maxUnusedSizePercent < maxUnusedSizeAfter { + maxUnusedSizeAfter = maxUnusedSizePercent + } + } + + // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. + // This is equivalent to sorting by unused / total space. 
+ // Instead of unused[i] / used[i] > unused[j] / used[j] we use + // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 + // Morover duplicates and mixed are sorted to the beginning + sort.Slice(repackCandidates, func(i, j int) bool { + pi := repackCandidates[i].packInfo + pj := repackCandidates[j].packInfo + switch { + case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0: + return true + case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0: + return false + case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob: + return true + case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob: + return false + //case opts.RepackSmall && pi.unusedSize+pi.usedSize < repository.MinPackSize && pj.unusedSize+pj.usedSize >= repository.MinPackSize: + // return true + //case opts.RepackSmall && pj.unusedSize+pj.usedSize < repository.MinPackSize && pi.unusedSize+pi.usedSize >= repository.MinPackSize: + // return false + } + return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize + }) + + for _, p := range repackCandidates { + reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter) + reachedRepackSize := (len(opts.MaxRepackSize) > 0 && stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes) + switch { + case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob): + // repacking duplicates/mixed is only limited by repackSize + repack(p.ID, p.packInfo) + + case reachedUnusedSizeAfter, reachedRepackSize: + // for all other packs stop repacking if tolerated unused size is reached. 
+ stats.packs.keep++ + if p.duplicateBlobs > 0 { + repackAllPacksWithDuplicates = false + } + + default: + repack(p.ID, p.packInfo) + } + } + + // if all duplicates are repacked, print out correct statistics + if repackAllPacksWithDuplicates { + stats.blobs.repackrm += stats.blobs.duplicate + stats.size.repackrm += stats.size.duplicate + } + + Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used)) + if stats.blobs.duplicate > 0 { + Verboseff("duplicates: %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate)) + } + Verboseff("unused: %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused)) + if stats.size.unref > 0 { + Verboseff("unreferenced: %s\n", formatBytes(stats.size.unref)) + } + totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate + totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref + Verboseff("total: %10d blobs / %s\n", totalBlobs, formatBytes(totalSize)) + Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize)) + + Verbosef("\nto repack: %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack)) + Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm)) + Verbosef("to delete: %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref)) + totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref + Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize)) + Verbosef("remaining: %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize)) + unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm + Verbosef("unused size after prune: %s (%s of remaining size)\n", + formatBytes(unusedAfter), formatPercent(unusedAfter, totalSize-totalPruneSize)) + Verbosef("\n") + 
Verboseff("totally used packs: %10d\n", stats.packs.used) + Verboseff("partly used packs: %10d\n", stats.packs.partlyUsed) + Verboseff("unused packs: %10d\n\n", stats.packs.unused) + + Verboseff("to keep: %10d packs\n", stats.packs.keep) + Verboseff("to repack: %10d packs\n", len(repackPacks)) + Verboseff("to delete: %10d packs\n", len(removePacks)) + if len(removePacksFirst) > 0 { + Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst)) + } + + if opts.DryRun { + if !gopts.JSON && gopts.verbosity >= 2 { + if len(removePacksFirst) > 0 { + Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst) + } + Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks) + Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks) + } + // Always quit here if DryRun was set! + return nil + } + + // unreferenced packs can be safely deleted first + if len(removePacksFirst) != 0 { + Verbosef("deleting unreferenced packs\n") + DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile) + } + + if len(repackPacks) != 0 { + Verbosef("repacking packs\n") + bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked") + _, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar) + if err != nil { + return err + } + // Also remove repacked packs + removePacks.Merge(repackPacks) + } + if len(removePacks) != 0 { - Verbosef("remove %d old packs\n", len(removePacks)) + if err = rebuildIndex(ctx, repo, removePacks); err != nil { + return err + } + + Verbosef("removing %d old packs\n", len(removePacks)) DeleteFiles(gopts, repo, removePacks, restic.PackFile) } @@ -263,7 +477,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) { ctx := gopts.ctx - Verbosef("find data that is still in use for %d snapshots\n", 
len(snapshots)) + Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots)) usedBlobs = restic.NewBlobSet() diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 9c01939ec..10af8701a 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) { "Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove)) } -func testRunPrune(t testing.TB, gopts GlobalOptions) { - rtest.OK(t, runPrune(gopts)) +func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) { + rtest.OK(t, runPrune(opts, gopts)) } func testSetupBackupData(t testing.TB, env *testEnvironment) string { @@ -1386,6 +1386,41 @@ func TestCheckRestoreNoLock(t *testing.T) { } func TestPrune(t *testing.T) { + t.Run("0", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 0.0} + checkOpts := CheckOptions{ReadData: true, CheckUnused: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("50", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 50.0} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("100", func(t *testing.T) { + opts := PruneOptions{MaxUnusedPercent: 100.0} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("CachableOnly", func(t *testing.T) { + opts := PruneOptions{RepackCachableOnly: true} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + /* repack-small option will come in future + t.Run("Small", func(t *testing.T) { + opts = PruneOptions{MaxUnusedPercent: 100.0, RepackSmall: true} + // The test case only produces small files; hence no unused blobs should remain. 
+ checkOpts = CheckOptions{ReadData: true, CheckUnused: true} + testPrune(t, opts, checkOpts) + }) + */ +} + +func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -1406,10 +1441,12 @@ func TestPrune(t *testing.T) { testRunForgetJSON(t, env.gopts) testRunForget(t, env.gopts, firstSnapshot[0].String()) - testRunPrune(t, env.gopts) - testRunCheck(t, env.gopts) + testRunPrune(t, env.gopts, pruneOpts) + rtest.OK(t, runCheck(checkOpts, env.gopts, nil)) } +var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5} + func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet { r, err := OpenRepository(gopts) rtest.OK(t, err) @@ -1452,14 +1489,8 @@ func TestPruneWithDamagedRepository(t *testing.T) { "expected one snapshot, got %v", snapshotIDs) // prune should fail - err := runPrune(env.gopts) - if err == nil { - t.Fatalf("expected prune to fail") - } - if !strings.Contains(err.Error(), "blobs seem to be missing") { - t.Fatalf("did not find hint for missing blobs") - } - t.Log(err) + rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing, + "prune should have reported index not complete error") } // Test repos for edge cases @@ -1469,37 +1500,37 @@ func TestEdgeCaseRepos(t *testing.T) { // repo where index is completely missing // => check and prune should fail t.Run("no-index", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false) + testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where an existing and used blob is missing from the index - // => check should fail, prune should heal this + // => check and prune should fail t.Run("index-missing-blob", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true) + testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where a blob is missing 
// => check and prune should fail t.Run("no-data", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false) + testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where data exists that is not referenced // => check and prune should fully work t.Run("unreferenced-data", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo where an obsolete index still exists // => check and prune should fully work t.Run("obsolete-index", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo which contains mixed (data/tree) packs // => check and prune should fully work t.Run("mixed-packs", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true) + testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true) }) // repo which contains duplicate blobs @@ -1510,11 +1541,11 @@ func TestEdgeCaseRepos(t *testing.T) { CheckUnused: true, } t.Run("duplicates", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true) + testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true) }) } -func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) { +func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -1524,15 +1555,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkO if checkOK { testRunCheck(t, env.gopts) } else { - rtest.Assert(t, runCheck(options, env.gopts, nil) != nil, + rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != 
nil, "check should have reported an error") } if pruneOK { - testRunPrune(t, env.gopts) + testRunPrune(t, env.gopts, optionsPrune) testRunCheck(t, env.gopts) } else { - rtest.Assert(t, runPrune(env.gopts) != nil, + rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil, "prune should have reported an error") } } diff --git a/doc/060_forget.rst b/doc/060_forget.rst index 99dec6c51..2df82af8b 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can be automated with the ``--prune`` option of the ``forget`` command, which runs ``prune`` automatically if snapshots have been removed. -.. Warning:: - - Pruning snapshots can be a very time-consuming process, taking nearly - as long as backups themselves. During a prune operation, the index is - locked and backups cannot be completed. Performance improvements are - planned for this feature. +Pruning snapshots can be a time-consuming process, depending on the +amount of snapshots and data to process. During a prune operation, the +repository is locked and backups cannot be completed. Please plan your +pruning so that there's time to complete it and it doesn't interfere with +regular backup runs. It is advisable to run ``restic check`` after pruning, to make sure you are alerted, should the internal data structures of the repository @@ -82,20 +81,32 @@ command must be run: $ restic -r /srv/restic-repo prune enter password for repository: - + repository 33002c5e opened successfully, password is correct + loading all snapshots... + loading indexes... + finding data that is still in use for 4 snapshots + [0:00] 100.00% 4 / 4 snapshots + searching used packs... 
+ collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked counting files in repo - building new index for repo - [0:00] 100.00% 22 / 22 files - repository contains 22 packs (8512 blobs) with 100.092 MiB bytes - processed 8512 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots - [0:00] 100.00% 1 / 1 snapshots - found 8433 of 8512 data blobs still in use - will rewrite 3 packs - creating new index - [0:00] 86.36% 19 / 22 files - saved new index as 544a5084 + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Afterwards the repository is smaller. @@ -119,19 +130,31 @@ to ``forget``: 8c02b94b 2017-02-21 10:48:33 mopped /home/user/work 1 snapshots have been removed, running prune - counting files in repo - building new index for repo - [0:00] 100.00% 37 / 37 packs - repository contains 37 packs (5521 blobs) with 151.012 MiB bytes - processed 5521 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots + loading all snapshots... + loading indexes... + finding data that is still in use for 1 snapshots [0:00] 100.00% 1 / 1 snapshots - found 5323 of 5521 data blobs still in use, removing 198 blobs - will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB - creating new index - [0:00] 100.00% 30 / 30 packs - saved new index as b49f3e68 + searching used packs... 
+ collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked + counting files in repo + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Removing snapshots according to a policy @@ -282,3 +305,44 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month). And finally 75 last-day-of-the-year snapshots. All other snapshots are removed. +Customize pruning +***************** + +To understand the custom options, we first explain how the pruning process works: + +- First all snapshots and directories within snapshots are scanned to determine + which data is still in use. +- Then for all pack files ``prune`` finds out if the file is fully used, partly + used or completely unused. +- Completely unused packs are marked for deletion. Fully used packs are kept. + A partially used pack is either kept or marked for repacking depending on user + options. + Note that for repacking, restic must download the file from the repository + storage and reupload the needed data in the repository. This can be very + time-consuming for remote repositories. +- After deciding what to do, ``prune`` will actually perform the repack, modify + the index according to the changes and delete the obsolete files. + +The ``prune`` command accepts the following options: + +- ``--max-unused limit`` allow unused data up to the specified limit within the repository. + This allows restic to keep partly used packs instead of repacking them. + The limit can be specified as size, e.g. 
"200M" or in percentage with respect to the total + repository size, e.g. "0.5%". + ``prune`` tries to repack as little data as possible while still ensuring this + limit for unused data. + If you want to minimize the space used by your repository, use a value of 0%. + If you want to minimize the time and bandwidth used by the ``prune`` command, use a + high value. A value of 100% will not require any pack file to be repacked. + The default value is 5%. +- ``--max-repack-size size`` if set, limits the total size of packs to repack. + As ``prune`` first stores all repacked packs and deletes the obsolete packs at the end, + this option might be handy if you expect many packs to be repacked and are concerned about + running low on storage. +- ``--repack-cacheable-only`` if set to true, only pack files which are cacheable are repacked. + Other pack files are not repacked if this option is set. + This allows very fast repacking using only cached data. It can, however, imply that the + unused data in your repository exceeds the value given by ``--max-unused``. + The default value is false. +- ``--dry-run`` only show what ``prune`` would do. +- ``--verbose`` increased verbosity shows additional statistics for ``prune``.