diff --git a/changelog/unreleased/pull-2718 b/changelog/unreleased/pull-2718
new file mode 100644
index 000000000..56af35d71
--- /dev/null
+++ b/changelog/unreleased/pull-2718
@@ -0,0 +1,22 @@
+Enhancement: Improve pruning performance and make pruning more customizable
+
+The `prune` command is now much faster. This is especially noticeable for
+remote repositories and for repositories with little data to remove.
+The memory usage of the `prune` command has also been reduced.
+
+By default, the `prune` command no longer removes all unused data. This
+behavior can be fine-tuned using new options, such as the acceptable amount
+of unused space and the maximum size of data to reorganize. For more details,
+see https://restic.readthedocs.io/en/stable/060_forget.html
+
+Moreover, `prune` now accepts the `--dry-run` option, and `forget --dry-run --prune`
+also shows what `prune` would do.
+
+Fixes several open issues, e.g.:
+https://github.com/restic/restic/issues/1140
+https://github.com/restic/restic/issues/1985
+https://github.com/restic/restic/issues/2112
+https://github.com/restic/restic/issues/2227
+https://github.com/restic/restic/issues/2305
+
+https://github.com/restic/restic/pull/2718
diff --git a/cmd/restic/cmd_forget.go b/cmd/restic/cmd_forget.go
index 3edaa76e9..596c7c550 100644
--- a/cmd/restic/cmd_forget.go
+++ b/cmd/restic/cmd_forget.go
@@ -80,9 +80,15 @@ func init() {
 	f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed")
 
 	f.SortFlags = false
+	addPruneOptions(cmdForget)
 }
 
 func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
+	err := verifyPruneOptions(&pruneOptions)
+	if err != nil {
+		return err
+	}
+
 	repo, err := OpenRepository(gopts)
 	if err != nil {
 		return err
@@ -205,7 +211,11 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
 	}
 
 	if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun {
-		return pruneRepository(gopts, repo)
+		if !gopts.JSON {
+			Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs))
+		}
+		pruneOptions.DryRun = opts.DryRun
+		return runPruneWithRepo(pruneOptions, gopts, repo, removeSnIDs)
 	}
 
 	return nil
diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go
index 1bb1a51e5..4fbc5fab7 100644
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@@ -1,15 +1,24 @@
 package main
 
 import (
+	"math"
+	"sort"
+	"strconv"
+	"strings"
+
 	"github.com/restic/restic/internal/debug"
 	"github.com/restic/restic/internal/errors"
-	"github.com/restic/restic/internal/index"
+	"github.com/restic/restic/internal/pack"
 	"github.com/restic/restic/internal/repository"
 	"github.com/restic/restic/internal/restic"
 
 	"github.com/spf13/cobra"
 )
 
+var errorIndexIncomplete = errors.Fatal("index is not complete")
+var errorPacksMissing = errors.Fatal("packs from index missing in repo")
+var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
+
 var cmdPrune = &cobra.Command{
 	Use:   "prune [flags]",
 	Short: "Remove unneeded data from the repository",
@@ -24,12 +33,91 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
 `,
 	DisableAutoGenTag: true,
 	RunE: func(cmd *cobra.Command, args []string) error {
-		return runPrune(globalOptions)
+		return runPrune(pruneOptions, globalOptions)
	},
 }
 
+// PruneOptions collects all options for the cleanup command.
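+// The MaxUnused and MaxRepackSize strings hold the raw flag values; they are
+// parsed and validated by verifyPruneOptions, which derives maxUnusedBytes
+// and MaxRepackBytes from them before the options are used.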
+type PruneOptions struct { + DryRun bool + + MaxUnused string + maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused + + MaxRepackSize string + MaxRepackBytes uint64 + + RepackCachableOnly bool +} + +var pruneOptions PruneOptions + func init() { cmdRoot.AddCommand(cmdPrune) + f := cmdPrune.Flags() + f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done") + addPruneOptions(cmdPrune) +} + +func addPruneOptions(c *cobra.Command) { + f := c.Flags() + f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')") + f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") + f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") +} + +func verifyPruneOptions(opts *PruneOptions) error { + if len(opts.MaxRepackSize) > 0 { + size, err := parseSizeStr(opts.MaxRepackSize) + if err != nil { + return err + } + opts.MaxRepackBytes = uint64(size) + } + + maxUnused := strings.TrimSpace(opts.MaxUnused) + if maxUnused == "" { + return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused) + } + + // parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes + switch { + case maxUnused == "unlimited": + opts.maxUnusedBytes = func(used uint64) uint64 { + return math.MaxUint64 + } + + case strings.HasSuffix(maxUnused, "%"): + maxUnused = strings.TrimSuffix(maxUnused, "%") + p, err := strconv.ParseFloat(maxUnused, 64) + if err != nil { + return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err) + } + + if p < 0 { + return errors.Fatal("percentage for --max-unused must be positive") + } + + if p >= 100 { + return errors.Fatal("percentage for --max-unused must be below 100%") + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(p / (100 - p) * float64(used)) + } + + default: + size, err := parseSizeStr(maxUnused) + if err != nil { + return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err) + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(size) + } + } + + return nil } func shortenStatus(maxLength int, s string) string { @@ -44,7 +132,12 @@ func shortenStatus(maxLength int, s string) string { return s[:maxLength-3] + "..." } -func runPrune(gopts GlobalOptions) error { +func runPrune(opts PruneOptions, gopts GlobalOptions) error { + err := verifyPruneOptions(&opts) + if err != nil { + return err + } + repo, err := OpenRepository(gopts) if err != nil { return err @@ -56,203 +149,345 @@ func runPrune(gopts GlobalOptions) error { return err } + return runPruneWithRepo(opts, gopts, repo, restic.NewIDSet()) +} + +func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository, ignoreSnapshots restic.IDSet) error { // we do not need index updates while pruning! 
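+	// (prune rebuilds the index from the remaining packs at the end of the
+	// run, so writing intermediate index updates here would be wasted work)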
repo.DisableAutoIndexUpdate() - return pruneRepository(gopts, repo) -} - -func mixedBlobs(list []restic.Blob) bool { - var tree, data bool - - for _, pb := range list { - switch pb.Type { - case restic.TreeBlob: - tree = true - case restic.DataBlob: - data = true - } - - if tree && data { - return true - } - } - - return false -} - -func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { - ctx := gopts.ctx - - err := repo.LoadIndex(ctx) + Verbosef("loading all snapshots...\n") + snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo, ignoreSnapshots) if err != nil { return err } - var stats struct { - blobs int - packs int - snapshots int - bytes int64 - } - - Verbosef("counting files in repo\n") - err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error { - stats.packs++ - return nil - }) + Verbosef("loading indexes...\n") + err = repo.LoadIndex(gopts.ctx) if err != nil { return err } - Verbosef("building new index for repo\n") - - bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs") - idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar) - if err != nil { - return err - } - - for _, id := range invalidFiles { - Warnf("incomplete pack file (will be removed): %v\n", id) - } - - blobs := 0 - for _, pack := range idx.Packs { - stats.bytes += pack.Size - blobs += len(pack.Entries) - } - Verbosef("repository contains %v packs (%v blobs) with %v\n", - len(idx.Packs), blobs, formatBytes(uint64(stats.bytes))) - - blobCount := make(map[restic.BlobHandle]int) - var duplicateBlobs uint64 - var duplicateBytes uint64 - - // find duplicate blobs - for _, p := range idx.Packs { - for _, entry := range p.Entries { - stats.blobs++ - h := restic.BlobHandle{ID: entry.ID, Type: entry.Type} - blobCount[h]++ - - if blobCount[h] > 1 { - duplicateBlobs++ - duplicateBytes += uint64(entry.Length) - } - } - } - - Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n", - stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes))) - Verbosef("load all snapshots\n") - - // find referenced blobs - snapshots, err := restic.LoadAllSnapshots(ctx, repo) - if err != nil { - return err - } - - stats.snapshots = len(snapshots) - usedBlobs, err := getUsedBlobs(gopts, repo, snapshots) if err != nil { return err } - var missingBlobs []restic.BlobHandle - for h := range usedBlobs { - if _, ok := blobCount[h]; !ok { - missingBlobs = append(missingBlobs, h) + return prune(opts, gopts, repo, usedBlobs) +} + +type packInfo struct { + usedBlobs uint + unusedBlobs uint + duplicateBlobs uint + usedSize uint64 + unusedSize uint64 + tpe restic.BlobType +} + +type packInfoWithID struct { + ID restic.ID + packInfo +} + +// prune selects which files to rewrite and then does that. The map usedBlobs is +// modified in the process. 
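+// It proceeds in three phases: classify every blob in the index as used,
+// duplicate or unused; decide for each pack file whether to keep, repack or
+// delete it; and finally repack, rebuild the index and remove the obsolete
+// pack files.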
+func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error { + ctx := gopts.ctx + + var stats struct { + blobs struct { + used uint + duplicate uint + unused uint + remove uint + repack uint + repackrm uint + } + size struct { + used uint64 + duplicate uint64 + unused uint64 + remove uint64 + repack uint64 + repackrm uint64 + unref uint64 + } + packs struct { + used uint + unused uint + partlyUsed uint + keep uint } } - if len(missingBlobs) > 0 { - return errors.Fatalf("%v not found in the new index\n"+ + + Verbosef("searching used packs...\n") + + keepBlobs := restic.NewBlobSet() + duplicateBlobs := restic.NewBlobSet() + + // iterate over all blobs in index to find out which blobs are duplicates + for blob := range repo.Index().Each(ctx) { + bh := blob.Handle() + switch { + case usedBlobs.Has(bh): // used blob, move to keepBlobs + usedBlobs.Delete(bh) + keepBlobs.Insert(bh) + case keepBlobs.Has(bh): // duplicate blob + duplicateBlobs.Insert(bh) + } + } + + // Check if all used blobs have been found in index + if len(usedBlobs) != 0 { + Warnf("%v not found in the new index\n"+ "Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose", missingBlobs) + "https://github.com/restic/restic/issues/new/choose", usedBlobs) + return errorIndexIncomplete } - Verbosef("found %d of %d data blobs still in use, removing %d blobs\n", - len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs)) + indexPack := make(map[restic.ID]packInfo) - // find packs that need a rewrite - rewritePacks := restic.NewIDSet() - for _, pack := range idx.Packs { - if mixedBlobs(pack.Entries) { - rewritePacks.Insert(pack.ID) - continue + // iterate over all blobs in index to generate packInfo + for blob := range repo.Index().Each(ctx) { + ip, ok := indexPack[blob.PackID] + if !ok { + ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize} + } + // mark mixed packs with "Invalid blob type" + if ip.tpe != blob.Type { + ip.tpe = restic.InvalidBlob } - for _, blob := range pack.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if !usedBlobs.Has(h) { - rewritePacks.Insert(pack.ID) - continue - } - - if blobCount[h] > 1 { - rewritePacks.Insert(pack.ID) - } + bh := blob.Handle() + size := uint64(pack.PackedSizeOfBlob(blob.Length)) + switch { + case duplicateBlobs.Has(bh): // duplicate blob + ip.usedSize += size + ip.duplicateBlobs++ + stats.size.duplicate += size + stats.blobs.duplicate++ + case keepBlobs.Has(bh): // used blob, not duplicate + ip.usedSize += size + ip.usedBlobs++ + stats.size.used += size + stats.blobs.used++ + default: // unused blob + ip.unusedSize += size + ip.unusedBlobs++ + stats.size.unused += size + stats.blobs.unused++ } + // update indexPack + indexPack[blob.PackID] = ip } - removeBytes := duplicateBytes - - // find packs that are unneeded + Verbosef("collecting packs for deletion and repacking\n") + removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() + repackPacks := restic.NewIDSet() - Verbosef("will remove %d invalid files\n", len(invalidFiles)) - for _, id := range invalidFiles { - removePacks.Insert(id) - } + var repackCandidates []packInfoWithID - for packID, p := range idx.Packs { - - hasActiveBlob := false - for _, blob := range p.Entries { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - if usedBlobs.Has(h) { - hasActiveBlob = true - continue - } - - 
removeBytes += uint64(blob.Length) + // loop over all packs and decide what to do + bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed") + bar.Start() + err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error { + p, ok := indexPack[id] + if !ok { + // Pack was not referenced in index and is not used => immediately remove! + Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str()) + removePacksFirst.Insert(id) + stats.size.unref += uint64(packSize) + return nil } - if hasActiveBlob { - continue + if p.unusedSize+p.usedSize != uint64(packSize) { + Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.", + id.Str(), p.unusedSize+p.usedSize, packSize) + return errorSizeNotMatching } - removePacks.Insert(packID) - - if !rewritePacks.Has(packID) { - return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str()) + // statistics + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + stats.packs.unused++ + case p.unusedBlobs == 0: + stats.packs.used++ + default: + stats.packs.partlyUsed++ } - rewritePacks.Delete(packID) - } + // decide what to do + switch { + case p.usedBlobs == 0 && p.duplicateBlobs == 0: + // All blobs in pack are no longer used => remove pack! + removePacks.Insert(id) + stats.blobs.remove += p.unusedBlobs + stats.size.remove += p.unusedSize - Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n", - len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes))) + case opts.RepackCachableOnly && p.tpe == restic.DataBlob: + // if this is a data pack and --repack-cacheable-only is set => keep pack! + stats.packs.keep++ - var obsoletePacks restic.IDSet - if len(rewritePacks) != 0 { - bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten") - obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar) - if err != nil { - return err + case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: + // All blobs in pack are used and not duplicates/mixed => keep pack! + stats.packs.keep++ + + default: + // all other packs are candidates for repacking + repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p}) } - } - removePacks.Merge(obsoletePacks) - - if err = rebuildIndex(ctx, repo, removePacks); err != nil { + delete(indexPack, id) + bar.Report(restic.Stat{Blobs: 1}) + return nil + }) + bar.Done() + if err != nil { return err } + if len(indexPack) != 0 { + Warnf("The index references pack files which are missing from the repository: %v\n", indexPack) + return errorPacksMissing + } + + repackAllPacksWithDuplicates := true + + // calculate limit for number of unused bytes in the repo after repacking + maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used) + + // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. + // This is equivalent to sorting by unused / total space. 
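+	// Repacking the candidates with the most unused space first frees the
+	// most space per repacked byte, which matters because --max-repack-size
+	// and the unused-space limit can cut the repacking short.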
+	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
+	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
+	// Moreover, duplicates and mixed packs are sorted to the beginning
+	sort.Slice(repackCandidates, func(i, j int) bool {
+		pi := repackCandidates[i].packInfo
+		pj := repackCandidates[j].packInfo
+		switch {
+		case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
+			return true
+		case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
+			return false
+		case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob:
+			return true
+		case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob:
+			return false
+		}
+		return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
+	})
+
+	repack := func(id restic.ID, p packInfo) {
+		repackPacks.Insert(id)
+		stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
+		stats.size.repack += p.unusedSize + p.usedSize
+		stats.blobs.repackrm += p.unusedBlobs
+		stats.size.repackrm += p.unusedSize
+	}
+
+	for _, p := range repackCandidates {
+		reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter)
+
+		reachedRepackSize := false
+		if opts.MaxRepackBytes > 0 {
+			reachedRepackSize = stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes
+		}
+
+		switch {
+		case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob):
+			// repacking duplicates/mixed is only limited by repackSize
+			repack(p.ID, p.packInfo)
+
+		case reachedUnusedSizeAfter, reachedRepackSize:
+			// for all other packs stop repacking once the tolerated unused size is reached.
+			stats.packs.keep++
+			if p.duplicateBlobs > 0 {
+				repackAllPacksWithDuplicates = false
+			}
+
+		default:
+			repack(p.ID, p.packInfo)
+		}
+	}
+
+	// if all duplicates are repacked, print out correct statistics
+	if repackAllPacksWithDuplicates {
+		stats.blobs.repackrm += stats.blobs.duplicate
+		stats.size.repackrm += stats.size.duplicate
+	}
+
+	Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
+	if stats.blobs.duplicate > 0 {
+		Verboseff("duplicates: %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate))
+	}
+	Verboseff("unused: %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused))
+	if stats.size.unref > 0 {
+		Verboseff("unreferenced: %s\n", formatBytes(stats.size.unref))
+	}
+	totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate
+	totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref
+	Verboseff("total: %10d blobs / %s\n", totalBlobs, formatBytes(totalSize))
+	Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize))
+
+	Verbosef("\nto repack: %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack))
+	Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm))
+	Verbosef("to delete: %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref))
+	totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref
+	Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize))
+	Verbosef("remaining: %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize))
+	unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm
+	Verbosef("unused size after prune: %s (%s of remaining size)\n",
+		formatBytes(unusedAfter), formatPercent(unusedAfter,
totalSize-totalPruneSize)) + Verbosef("\n") + Verboseff("totally used packs: %10d\n", stats.packs.used) + Verboseff("partly used packs: %10d\n", stats.packs.partlyUsed) + Verboseff("unused packs: %10d\n\n", stats.packs.unused) + + Verboseff("to keep: %10d packs\n", stats.packs.keep) + Verboseff("to repack: %10d packs\n", len(repackPacks)) + Verboseff("to delete: %10d packs\n", len(removePacks)) + if len(removePacksFirst) > 0 { + Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst)) + } + + if opts.DryRun { + if !gopts.JSON && gopts.verbosity >= 2 { + if len(removePacksFirst) > 0 { + Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst) + } + Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks) + Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks) + } + // Always quit here if DryRun was set! + return nil + } + + // unreferenced packs can be safely deleted first + if len(removePacksFirst) != 0 { + Verbosef("deleting unreferenced packs\n") + DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile) + } + + if len(repackPacks) != 0 { + Verbosef("repacking packs\n") + bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked") + _, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar) + if err != nil { + return err + } + // Also remove repacked packs + removePacks.Merge(repackPacks) + } + if len(removePacks) != 0 { - Verbosef("remove %d old packs\n", len(removePacks)) + if err = rebuildIndex(ctx, repo, removePacks); err != nil { + return err + } + + Verbosef("removing %d old packs\n", len(removePacks)) DeleteFiles(gopts, repo, removePacks, restic.PackFile) } @@ -263,7 +498,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) { ctx := gopts.ctx - Verbosef("find data that is still in use for %d snapshots\n", len(snapshots)) + Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots)) usedBlobs = restic.NewBlobSet() diff --git a/cmd/restic/global.go b/cmd/restic/global.go index fe42da4a7..6ac07c6db 100644 --- a/cmd/restic/global.go +++ b/cmd/restic/global.go @@ -231,6 +231,13 @@ func Verbosef(format string, args ...interface{}) { } } +// Verboseff calls Printf to write the message when the verbosity is >= 2 +func Verboseff(format string, args ...interface{}) { + if globalOptions.verbosity >= 2 { + Printf(format, args...) 
+ } +} + // PrintProgress wraps fmt.Printf to handle the difference in writing progress // information to terminals and non-terminal stdout func PrintProgress(format string, args ...interface{}) { diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 66a129598..0faf07cb5 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) { "Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove)) } -func testRunPrune(t testing.TB, gopts GlobalOptions) { - rtest.OK(t, runPrune(gopts)) +func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) { + rtest.OK(t, runPrune(opts, gopts)) } func testSetupBackupData(t testing.TB, env *testEnvironment) string { @@ -1386,6 +1386,32 @@ func TestCheckRestoreNoLock(t *testing.T) { } func TestPrune(t *testing.T) { + t.Run("0", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "0%"} + checkOpts := CheckOptions{ReadData: true, CheckUnused: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("50", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "50%"} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("unlimited", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "unlimited"} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) + + t.Run("CachableOnly", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true} + checkOpts := CheckOptions{ReadData: true} + testPrune(t, opts, checkOpts) + }) +} + +func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -1406,10 +1432,12 @@ func TestPrune(t *testing.T) { testRunForgetJSON(t, env.gopts) testRunForget(t, env.gopts, firstSnapshot[0].String()) - testRunPrune(t, env.gopts) - testRunCheck(t, env.gopts) + testRunPrune(t, env.gopts, pruneOpts) + rtest.OK(t, runCheck(checkOpts, env.gopts, nil)) } +var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"} + func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet { r, err := OpenRepository(gopts) rtest.OK(t, err) @@ -1452,14 +1480,8 @@ func TestPruneWithDamagedRepository(t *testing.T) { "expected one snapshot, got %v", snapshotIDs) // prune should fail - err := runPrune(env.gopts) - if err == nil { - t.Fatalf("expected prune to fail") - } - if !strings.Contains(err.Error(), "blobs seem to be missing") { - t.Fatalf("did not find hint for missing blobs") - } - t.Log(err) + rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing, + "prune should have reported index not complete error") } // Test repos for edge cases @@ -1469,37 +1491,37 @@ func TestEdgeCaseRepos(t *testing.T) { // repo where index is completely missing // => check and prune should fail t.Run("no-index", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false) + testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where an existing and used blob is missing from the index - // => check should fail, prune should heal this + // => check and prune should fail t.Run("index-missing-blob", func(t *testing.T) { - testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true) + testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false) }) // repo where a blob is missing // => check and prune should fail 
	t.Run("no-data", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false)
+		testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false)
 	})
 
 	// repo where data exists that is not referenced
 	// => check and prune should fully work
 	t.Run("unreferenced-data", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true)
 	})
 
 	// repo where an obsolete index still exists
 	// => check and prune should fully work
 	t.Run("obsolete-index", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true)
 	})
 
 	// repo which contains mixed (data/tree) packs
 	// => check and prune should fully work
 	t.Run("mixed-packs", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true)
 	})
 
 	// repo which contains duplicate blobs
@@ -1510,11 +1532,11 @@ func TestEdgeCaseRepos(t *testing.T) {
 		CheckUnused: true,
 	}
 	t.Run("duplicates", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true)
+		testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true)
 	})
 }
 
-func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) {
+func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) {
 	env, cleanup := withTestEnvironment(t)
 	defer cleanup()
 
@@ -1524,15 +1546,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkO
 	if checkOK {
 		testRunCheck(t, env.gopts)
 	} else {
-		rtest.Assert(t, runCheck(options, env.gopts, nil) != nil,
+		rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != nil,
 			"check should have reported an error")
 	}
 
 	if pruneOK {
-		testRunPrune(t, env.gopts)
+		testRunPrune(t, env.gopts, optionsPrune)
 		testRunCheck(t, env.gopts)
 	} else {
-		rtest.Assert(t, runPrune(env.gopts) != nil,
+		rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil,
 			"prune should have reported an error")
 	}
 }
diff --git a/doc/060_forget.rst b/doc/060_forget.rst
index 99dec6c51..08381f180 100644
--- a/doc/060_forget.rst
+++ b/doc/060_forget.rst
@@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can be
 automated with the ``--prune`` option of the ``forget`` command, which runs
 ``prune`` automatically if snapshots have been removed.
 
-.. Warning::
-
-   Pruning snapshots can be a very time-consuming process, taking nearly
-   as long as backups themselves. During a prune operation, the index is
-   locked and backups cannot be completed. Performance improvements are
-   planned for this feature.
+Pruning snapshots can be a time-consuming process, depending on the
+number of snapshots and the amount of data to process. During a prune
+operation, the repository is locked and backups cannot be completed.
+Please plan your pruning so that there is enough time for it to complete
+without interfering with regular backup runs.
 
 It is advisable to run ``restic check`` after pruning, to make sure
 you are alerted, should the internal data structures of the repository
@@ -82,20 +81,32 @@ command must be run:
 
     $ restic -r /srv/restic-repo prune
    enter password for repository:
-
+    repository 33002c5e opened successfully, password is correct
+    loading all snapshots...
+ loading indexes... + finding data that is still in use for 4 snapshots + [0:00] 100.00% 4 / 4 snapshots + searching used packs... + collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked counting files in repo - building new index for repo - [0:00] 100.00% 22 / 22 files - repository contains 22 packs (8512 blobs) with 100.092 MiB bytes - processed 8512 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots - [0:00] 100.00% 1 / 1 snapshots - found 8433 of 8512 data blobs still in use - will rewrite 3 packs - creating new index - [0:00] 86.36% 19 / 22 files - saved new index as 544a5084 + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Afterwards the repository is smaller. @@ -119,19 +130,31 @@ to ``forget``: 8c02b94b 2017-02-21 10:48:33 mopped /home/user/work 1 snapshots have been removed, running prune - counting files in repo - building new index for repo - [0:00] 100.00% 37 / 37 packs - repository contains 37 packs (5521 blobs) with 151.012 MiB bytes - processed 5521 blobs: 0 duplicate blobs, 0B duplicate - load all snapshots - find data that is still in use for 1 snapshots + loading all snapshots... + loading indexes... + finding data that is still in use for 1 snapshots [0:00] 100.00% 1 / 1 snapshots - found 5323 of 5521 data blobs still in use, removing 198 blobs - will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB - creating new index - [0:00] 100.00% 30 / 30 packs - saved new index as b49f3e68 + searching used packs... + collecting packs for deletion and repacking + [0:00] 100.00% 5 / 5 packs processed + + to repack: 69 blobs / 1.078 MiB + this removes 67 blobs / 1.047 MiB + to delete: 7 blobs / 25.726 KiB + total prune: 74 blobs / 1.072 MiB + remaining: 16 blobs / 38.003 KiB + unused size after prune: 0 B (0.00% of remaining size) + + repacking packs + [0:00] 100.00% 2 / 2 packs repacked + counting files in repo + [0:00] 100.00% 3 / 3 packs + finding old index files + saved new indexes as [59270b3a] + remove 4 old index files + [0:00] 100.00% 4 / 4 files deleted + removing 3 old packs + [0:00] 100.00% 3 / 3 files deleted done Removing snapshots according to a policy @@ -282,3 +305,59 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month). And finally 75 last-day-of-the-year snapshots. All other snapshots are removed. +Customize pruning +***************** + +To understand the custom options, we first explain how the pruning process works: + +1. All snapshots and directories within snapshots are scanned to determine + which data is still in use. +2. For all files in the repository, restic finds out if the file is fully + used, partly used or completely unused. +3. Completely unused files are marked for deletion. Fully used files are kept. + A partially used file is either kept or marked for repacking depending on user + options. + + Note that for repacking, restic must download the file from the repository + storage and re-upload the needed data in the repository. 
This can be very
+   time-consuming for remote repositories.
+4. After deciding what to do, ``prune`` performs the repack, updates the
+   index according to the changes, and deletes the obsolete files.
+
+The ``prune`` command accepts the following options:
+
+- ``--max-unused limit`` allows unused data up to the specified limit within the repository.
+  This allows restic to keep partly used files instead of repacking them.
+
+  The limit can be specified in several ways:
+
+  * As an absolute size (e.g. ``200M``). If you want to minimize the space
+    used by your repository, pass ``0`` to this option.
+  * As a size relative to the total repo size (e.g. ``10%``). This means that
+    after prune, at most ``10%`` of the total data stored in the repo may be
+    unused data. If the repo after prune has a size of 500MB, then at most
+    50MB may be unused.
+  * If the string ``unlimited`` is passed, there is no limit for partly
+    unused files. This means that as long as some data is still used within
+    a file stored in the repo, restic will just leave it there. Use this if
+    you want to minimize the time and bandwidth used by the ``prune``
+    operation.
+
+  Restic tries to repack as little data as possible while still ensuring this
+  limit for unused data.
+
+- ``--max-repack-size size`` if set, limits the total size of files to repack.
+  As ``prune`` first stores all repacked files and only deletes the obsolete
+  files at the end, this option can be handy if you expect many files to be
+  repacked and are worried about running low on storage.
+
+- ``--repack-cacheable-only`` if set to true, only files which contain
+  metadata and would be stored in the cache are repacked. Other pack files are
+  not repacked if this option is set. This allows a very fast repacking
+  using only cached data. It can, however, imply that the unused data in
+  your repository exceeds the value given by ``--max-unused``.
+  The default value is false.
+
+- ``--dry-run`` only shows what ``prune`` would do.
+
+- ``--verbose`` increased verbosity shows additional statistics for ``prune``.
diff --git a/internal/pack/pack.go b/internal/pack/pack.go
index fbe0522dc..d4f064476 100644
--- a/internal/pack/pack.go
+++ b/internal/pack/pack.go
@@ -161,13 +161,16 @@ func (p *Packer) String() string {
 }
 
 var (
-	// size of the header-length field at the end of the file
-	headerLengthSize = binary.Size(uint32(0))
 	// we require at least one entry in the header, and one blob for a pack file
 	minFileSize = entrySize + crypto.Extension + uint(headerLengthSize)
 )
 
 const (
+	// size of the header-length field at the end of the file; it is a uint32
+	headerLengthSize = 4
+	// constant overhead of the header independent of #entries
+	HeaderSize = headerLengthSize + crypto.Extension
+
 	maxHeaderSize = 16 * 1024 * 1024
 	// number of header entries to download as part of header-length request
 	eagerEntries = 15
@@ -315,3 +318,8 @@ func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, err
 
 	return entries, nil
 }
+
+// PackedSizeOfBlob returns the size a blob actually uses when saved in a pack
+func PackedSizeOfBlob(blobLength uint) uint {
+	return blobLength + entrySize
+}
diff --git a/internal/restic/blob.go b/internal/restic/blob.go
index a3a6c8630..b6c5a47cf 100644
--- a/internal/restic/blob.go
+++ b/internal/restic/blob.go
@@ -19,6 +19,10 @@ func (b Blob) String() string {
 		b.Type, b.ID.Str(), b.Offset, b.Length)
 }
 
+func (b Blob) Handle() BlobHandle {
+	return BlobHandle{ID: b.ID, Type: b.Type}
+}
+
 // PackedBlob is a blob stored within a file.
type PackedBlob struct { Blob diff --git a/internal/restic/snapshot.go b/internal/restic/snapshot.go index dc0dd5949..86e98e234 100644 --- a/internal/restic/snapshot.go +++ b/internal/restic/snapshot.go @@ -67,8 +67,12 @@ func LoadSnapshot(ctx context.Context, repo Repository, id ID) (*Snapshot, error } // LoadAllSnapshots returns a list of all snapshots in the repo. -func LoadAllSnapshots(ctx context.Context, repo Repository) (snapshots []*Snapshot, err error) { +// If a snapshot ID is in excludeIDs, it will not be included in the result. +func LoadAllSnapshots(ctx context.Context, repo Repository, excludeIDs IDSet) (snapshots []*Snapshot, err error) { err = repo.List(ctx, SnapshotFile, func(id ID, size int64) error { + if excludeIDs.Has(id) { + return nil + } sn, err := LoadSnapshot(ctx, repo, id) if err != nil { return err diff --git a/internal/restic/testing_test.go b/internal/restic/testing_test.go index 0386fb76a..c3989f55f 100644 --- a/internal/restic/testing_test.go +++ b/internal/restic/testing_test.go @@ -25,7 +25,7 @@ func TestCreateSnapshot(t *testing.T) { restic.TestCreateSnapshot(t, repo, testSnapshotTime.Add(time.Duration(i)*time.Second), testDepth, 0) } - snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo) + snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo, restic.NewIDSet()) if err != nil { t.Fatal(err) }
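
A note on the ``--max-unused`` percentage handling in ``verifyPruneOptions``
above: the limit is expressed relative to the total repository size after
pruning, but at the point where the limit is computed only the size of the
still-used data is known, so a percentage p is converted into a byte limit
via p/(100-p). The standalone sketch below (not part of the patch; the helper
simply mirrors the percentage branch shown above) illustrates why this
conversion yields the behavior documented in doc/060_forget.rst:

    package main

    import "fmt"

    // maxUnusedBytes mirrors the percentage branch of verifyPruneOptions:
    // if at most p% of the repository may be unused after prune, then
    // unused <= p/100 * (used + unused), which rearranges to
    // unused <= p/(100-p) * used.
    func maxUnusedBytes(p float64, used uint64) uint64 {
    	return uint64(p / (100 - p) * float64(used))
    }

    func main() {
    	used := uint64(450 * 1024 * 1024) // 450 MiB of used data remain

    	// --max-unused 10%: the limit works out to ~50 MiB (modulo float
    	// rounding), and 50 MiB unused of a 500 MiB total is exactly 10%,
    	// matching the 500MB/50MB example in the documentation.
    	limit := maxUnusedBytes(10, used)
    	total := used + limit
    	fmt.Printf("limit: %d bytes = %.2f%% of %d bytes total\n",
    		limit, float64(limit)/float64(total)*100, total)
    }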