Merge pull request #2718 from aawsome/new-cleanup-command

Reimplementation of prune
2024-12-22 02:48:55 +00:00 · 2020-11-05 10:12:19 +01:00 · 2020-11-05 10:12:19 +01:00 · 5144141321
commit 5144141321
parent d35d279455 1ca60bccfb
10 changed files with 607 additions and 216 deletions
--- a/changelog/unreleased/pull-2718
+++ b/changelog/unreleased/pull-2718
@ -0,0 +1,22 @@
+Enhancement: Improve pruning performance and make pruning more customizable
+
+The `prune` command is now much faster. This is especially the case for remote
+repositories or repositories with little data to remove.
+The memory usage of the `prune` command has also been reduced.
+
+By default, the `prune` command no longer removes all unused data. This
+behavior can be fine-tuned by new options, like the acceptable amount of unused space or
+the maximum size of data to reorganize. For more details, see
+https://restic.readthedocs.io/en/stable/060_forget.html
+
+Moreover, `prune` now accepts the `--dry-run` option and `forget --dry-run --prune`
+also shows what `prune` would do.
+
+Fixes several open issues, e.g.:
+https://github.com/restic/restic/issues/1140
+https://github.com/restic/restic/issues/1985
+https://github.com/restic/restic/issues/2112
+https://github.com/restic/restic/issues/2227
+https://github.com/restic/restic/issues/2305
+
+https://github.com/restic/restic/pull/2718
--- a/cmd/restic/cmd_forget.go
+++ b/cmd/restic/cmd_forget.go
@ -80,9 +80,15 @@ func init() {
 	f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed")

 	f.SortFlags = false
+	addPruneOptions(cmdForget)
 }

 func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
+	err := verifyPruneOptions(&pruneOptions)
+	if err != nil {
+		return err
+	}
+
 	repo, err := OpenRepository(gopts)
 	if err != nil {
 		return err
@ -205,7 +211,11 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
 	}

 	if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun {
-		return pruneRepository(gopts, repo)
+		if !gopts.JSON {
+			Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs))
+		}
+		pruneOptions.DryRun = opts.DryRun
+		return runPruneWithRepo(pruneOptions, gopts, repo, removeSnIDs)
 	}

 	return nil
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -1,15 +1,24 @@
 package main

 import (
+	"math"
+	"sort"
+	"strconv"
+	"strings"
+
 	"github.com/restic/restic/internal/debug"
 	"github.com/restic/restic/internal/errors"
-	"github.com/restic/restic/internal/index"
+	"github.com/restic/restic/internal/pack"
 	"github.com/restic/restic/internal/repository"
 	"github.com/restic/restic/internal/restic"

 	"github.com/spf13/cobra"
 )

+var errorIndexIncomplete = errors.Fatal("index is not complete")
+var errorPacksMissing = errors.Fatal("packs from index missing in repo")
+var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
+
 var cmdPrune = &cobra.Command{
 	Use:   "prune [flags]",
 	Short: "Remove unneeded data from the repository",
@ -24,12 +33,91 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
 `,
 	DisableAutoGenTag: true,
 	RunE: func(cmd *cobra.Command, args []string) error {
-		return runPrune(globalOptions)
+		return runPrune(pruneOptions, globalOptions)
 	},
 }

+// PruneOptions collects all options for the prune command.
+type PruneOptions struct {
+	// DryRun makes prune only print what would be done, without modifying
+	// the repository.
+	DryRun bool
+
+	// MaxUnused is the raw --max-unused value: an absolute size in bytes
+	// (optionally with a k/m/g/t suffix), a percentage such as "5%", or the
+	// word "unlimited". verifyPruneOptions turns it into maxUnusedBytes.
+	MaxUnused      string
+	maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused
+
+	// MaxRepackSize is the raw --max-repack-size value; verifyPruneOptions
+	// parses it into MaxRepackBytes. A MaxRepackBytes of 0 means that the
+	// amount of data to repack is not limited.
+	MaxRepackSize  string
+	MaxRepackBytes uint64
+
+	// RepackCachableOnly (--repack-cacheable-only) keeps data packs that
+	// still contain used blobs as they are instead of repacking them.
+	RepackCachableOnly bool
+}
+
+var pruneOptions PruneOptions
+
 func init() {
 	cmdRoot.AddCommand(cmdPrune)
+	// --dry-run is registered on the prune command itself; the tuning flags
+	// shared with the forget command are added by addPruneOptions.
+	f := cmdPrune.Flags()
+	f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done")
+	addPruneOptions(cmdPrune)
+}
+
+// addPruneOptions registers the prune tuning flags (--max-unused,
+// --max-repack-size and --repack-cacheable-only) on the given command. The
+// parsed values are stored in the package-level pruneOptions.
+func addPruneOptions(c *cobra.Command) {
+	flags := c.Flags()
+	flags.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')")
+	flags.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)")
+	flags.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
+}
+
+// verifyPruneOptions validates the string-valued prune options and fills in
+// the derived fields of opts: MaxRepackBytes (parsed from MaxRepackSize) and
+// maxUnusedBytes (built from MaxUnused). It returns a fatal error when a
+// value cannot be parsed or a percentage is outside the range [0, 100).
+func verifyPruneOptions(opts *PruneOptions) error {
+	if len(opts.MaxRepackSize) > 0 {
+		size, err := parseSizeStr(opts.MaxRepackSize)
+		if err != nil {
+			return err
+		}
+		opts.MaxRepackBytes = uint64(size)
+	}
+
+	maxUnused := strings.TrimSpace(opts.MaxUnused)
+	if maxUnused == "" {
+		return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused)
+	}
+
+	// parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes
+	switch {
+	case maxUnused == "unlimited":
+		opts.maxUnusedBytes = func(used uint64) uint64 {
+			return math.MaxUint64
+		}
+
+	case strings.HasSuffix(maxUnused, "%"):
+		maxUnused = strings.TrimSuffix(maxUnused, "%")
+		p, err := strconv.ParseFloat(maxUnused, 64)
+		if err != nil {
+			return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err)
+		}
+
+		// ParseFloat accepts "NaN" without an error, and NaN compares false
+		// against both bounds below, so it has to be rejected explicitly.
+		if math.IsNaN(p) {
+			return errors.Fatalf("invalid percentage %q passed for --max-unused", opts.MaxUnused)
+		}
+
+		if p < 0 {
+			return errors.Fatal("percentage for --max-unused must be positive")
+		}
+
+		if p >= 100 {
+			return errors.Fatal("percentage for --max-unused must be below 100%")
+		}
+
+		// p is the tolerated share of unused data in the repository after
+		// pruning, i.e. unused / (used + unused); solved for unused this
+		// gives p / (100 - p) * used.
+		opts.maxUnusedBytes = func(used uint64) uint64 {
+			return uint64(p / (100 - p) * float64(used))
+		}
+
+	default:
+		size, err := parseSizeStr(maxUnused)
+		if err != nil {
+			return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err)
+		}
+
+		opts.maxUnusedBytes = func(used uint64) uint64 {
+			return uint64(size)
+		}
+	}
+
+	return nil
 }

 func shortenStatus(maxLength int, s string) string {
@ -44,7 +132,12 @@ func shortenStatus(maxLength int, s string) string {
 	return s[:maxLength-3] + "..."
 }

-func runPrune(gopts GlobalOptions) error {
+func runPrune(opts PruneOptions, gopts GlobalOptions) error {
+	err := verifyPruneOptions(&opts)
+	if err != nil {
+		return err
+	}
+
 	repo, err := OpenRepository(gopts)
 	if err != nil {
 		return err
@ -56,203 +149,345 @@ func runPrune(gopts GlobalOptions) error {
 		return err
 	}

+	return runPruneWithRepo(opts, gopts, repo, restic.NewIDSet())
+}
+
+func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository, ignoreSnapshots restic.IDSet) error {
 	// we do not need index updates while pruning!
 	repo.DisableAutoIndexUpdate()

-	return pruneRepository(gopts, repo)
-}
-
-func mixedBlobs(list []restic.Blob) bool {
-	var tree, data bool
-
-	for _, pb := range list {
-		switch pb.Type {
-		case restic.TreeBlob:
-			tree = true
-		case restic.DataBlob:
-			data = true
-		}
-
-		if tree && data {
-			return true
-		}
-	}
-
-	return false
-}
-
-func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
-	ctx := gopts.ctx
-
-	err := repo.LoadIndex(ctx)
+	Verbosef("loading all snapshots...\n")
+	snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo, ignoreSnapshots)
 	if err != nil {
 		return err
 	}

-	var stats struct {
-		blobs     int
-		packs     int
-		snapshots int
-		bytes     int64
-	}
-
-	Verbosef("counting files in repo\n")
-	err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error {
-		stats.packs++
-		return nil
-	})
+	Verbosef("loading indexes...\n")
+	err = repo.LoadIndex(gopts.ctx)
 	if err != nil {
 		return err
 	}

-	Verbosef("building new index for repo\n")
-
-	bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
-	idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar)
-	if err != nil {
-		return err
-	}
-
-	for _, id := range invalidFiles {
-		Warnf("incomplete pack file (will be removed): %v\n", id)
-	}
-
-	blobs := 0
-	for _, pack := range idx.Packs {
-		stats.bytes += pack.Size
-		blobs += len(pack.Entries)
-	}
-	Verbosef("repository contains %v packs (%v blobs) with %v\n",
-		len(idx.Packs), blobs, formatBytes(uint64(stats.bytes)))
-
-	blobCount := make(map[restic.BlobHandle]int)
-	var duplicateBlobs uint64
-	var duplicateBytes uint64
-
-	// find duplicate blobs
-	for _, p := range idx.Packs {
-		for _, entry := range p.Entries {
-			stats.blobs++
-			h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
-			blobCount[h]++
-
-			if blobCount[h] > 1 {
-				duplicateBlobs++
-				duplicateBytes += uint64(entry.Length)
-			}
-		}
-	}
-
-	Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n",
-		stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes)))
-	Verbosef("load all snapshots\n")
-
-	// find referenced blobs
-	snapshots, err := restic.LoadAllSnapshots(ctx, repo)
-	if err != nil {
-		return err
-	}
-
-	stats.snapshots = len(snapshots)
-
 	usedBlobs, err := getUsedBlobs(gopts, repo, snapshots)
 	if err != nil {
 		return err
 	}

-	var missingBlobs []restic.BlobHandle
-	for h := range usedBlobs {
-		if _, ok := blobCount[h]; !ok {
-			missingBlobs = append(missingBlobs, h)
+	return prune(opts, gopts, repo, usedBlobs)
+}
+
+// packInfo summarizes what the index lists for a single pack file. It is
+// filled in by prune while iterating over all blobs in the index.
+type packInfo struct {
+	// usedBlobs counts blobs that are still in use and not duplicated.
+	usedBlobs      uint
+	// unusedBlobs counts blobs that are no longer in use.
+	unusedBlobs    uint
+	// duplicateBlobs counts blobs whose handle occurs more than once in the index.
+	duplicateBlobs uint
+	// usedSize starts at pack.HeaderSize and adds the packed size of all
+	// used and duplicate blobs.
+	usedSize       uint64
+	// unusedSize is the packed size of all unused blobs.
+	unusedSize     uint64
+	// tpe is the blob type stored in the pack; restic.InvalidBlob marks a
+	// pack that mixes blob types.
+	tpe            restic.BlobType
+}
+
+// packInfoWithID pairs a packInfo with the ID of its pack, so that repack
+// candidates can be collected in a slice and sorted.
+type packInfoWithID struct {
+	ID restic.ID
+	packInfo
+}
+
+// prune selects which files to rewrite and then does that. The map usedBlobs is
+// modified in the process.
+func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error {
+	ctx := gopts.ctx
+
+	var stats struct {
+		blobs struct {
+			used      uint
+			duplicate uint
+			unused    uint
+			remove    uint
+			repack    uint
+			repackrm  uint
+		}
+		size struct {
+			used      uint64
+			duplicate uint64
+			unused    uint64
+			remove    uint64
+			repack    uint64
+			repackrm  uint64
+			unref     uint64
+		}
+		packs struct {
+			used       uint
+			unused     uint
+			partlyUsed uint
+			keep       uint
 		}
 	}
-	if len(missingBlobs) > 0 {
-		return errors.Fatalf("%v not found in the new index\n"+
+
+	Verbosef("searching used packs...\n")
+
+	keepBlobs := restic.NewBlobSet()
+	duplicateBlobs := restic.NewBlobSet()
+
+	// iterate over all blobs in index to find out which blobs are duplicates
+	for blob := range repo.Index().Each(ctx) {
+		bh := blob.Handle()
+		switch {
+		case usedBlobs.Has(bh): // used blob, move to keepBlobs
+			usedBlobs.Delete(bh)
+			keepBlobs.Insert(bh)
+		case keepBlobs.Has(bh): // duplicate blob
+			duplicateBlobs.Insert(bh)
+		}
+	}
+
+	// Check if all used blobs have been found in index
+	if len(usedBlobs) != 0 {
+		Warnf("%v not found in the new index\n"+
 			"Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+
 			"Please report this error (along with the output of the 'prune' run) at\n"+
-			"https://github.com/restic/restic/issues/new/choose", missingBlobs)
+			"https://github.com/restic/restic/issues/new/choose", usedBlobs)
+		return errorIndexIncomplete
 	}

-	Verbosef("found %d of %d data blobs still in use, removing %d blobs\n",
-		len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs))
+	indexPack := make(map[restic.ID]packInfo)

-	// find packs that need a rewrite
-	rewritePacks := restic.NewIDSet()
-	for _, pack := range idx.Packs {
-		if mixedBlobs(pack.Entries) {
-			rewritePacks.Insert(pack.ID)
-			continue
+	// iterate over all blobs in index to generate packInfo
+	for blob := range repo.Index().Each(ctx) {
+		ip, ok := indexPack[blob.PackID]
+		if !ok {
+			ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize}
+		}
+		// mark mixed packs with "Invalid blob type"
+		if ip.tpe != blob.Type {
+			ip.tpe = restic.InvalidBlob
 		}

-		for _, blob := range pack.Entries {
-			h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
-			if !usedBlobs.Has(h) {
-				rewritePacks.Insert(pack.ID)
-				continue
-			}
-
-			if blobCount[h] > 1 {
-				rewritePacks.Insert(pack.ID)
-			}
+		bh := blob.Handle()
+		size := uint64(pack.PackedSizeOfBlob(blob.Length))
+		switch {
+		case duplicateBlobs.Has(bh): // duplicate blob
+			ip.usedSize += size
+			ip.duplicateBlobs++
+			stats.size.duplicate += size
+			stats.blobs.duplicate++
+		case keepBlobs.Has(bh): // used blob, not duplicate
+			ip.usedSize += size
+			ip.usedBlobs++
+			stats.size.used += size
+			stats.blobs.used++
+		default: // unused blob
+			ip.unusedSize += size
+			ip.unusedBlobs++
+			stats.size.unused += size
+			stats.blobs.unused++
 		}
+		// update indexPack
+		indexPack[blob.PackID] = ip
 	}

-	removeBytes := duplicateBytes
-
-	// find packs that are unneeded
+	Verbosef("collecting packs for deletion and repacking\n")
+	removePacksFirst := restic.NewIDSet()
 	removePacks := restic.NewIDSet()
+	repackPacks := restic.NewIDSet()

-	Verbosef("will remove %d invalid files\n", len(invalidFiles))
-	for _, id := range invalidFiles {
-		removePacks.Insert(id)
-	}
+	var repackCandidates []packInfoWithID

-	for packID, p := range idx.Packs {
-
-		hasActiveBlob := false
-		for _, blob := range p.Entries {
-			h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
-			if usedBlobs.Has(h) {
-				hasActiveBlob = true
-				continue
-			}
-
-			removeBytes += uint64(blob.Length)
+	// loop over all packs and decide what to do
+	bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed")
+	bar.Start()
+	err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
+		p, ok := indexPack[id]
+		if !ok {
+			// Pack was not referenced in index and is not used  => immediately remove!
+			Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str())
+			removePacksFirst.Insert(id)
+			stats.size.unref += uint64(packSize)
+			return nil
 		}

-		if hasActiveBlob {
-			continue
+		if p.unusedSize+p.usedSize != uint64(packSize) {
+			Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.",
+				id.Str(), p.unusedSize+p.usedSize, packSize)
+			return errorSizeNotMatching
 		}

-		removePacks.Insert(packID)
-
-		if !rewritePacks.Has(packID) {
-			return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str())
+		// statistics
+		switch {
+		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+			stats.packs.unused++
+		case p.unusedBlobs == 0:
+			stats.packs.used++
+		default:
+			stats.packs.partlyUsed++
 		}

-		rewritePacks.Delete(packID)
-	}
+		// decide what to do
+		switch {
+		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+			// All blobs in pack are no longer used => remove pack!
+			removePacks.Insert(id)
+			stats.blobs.remove += p.unusedBlobs
+			stats.size.remove += p.unusedSize

-	Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n",
-		len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes)))
+		case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
+			// if this is a data pack and --repack-cacheable-only is set => keep pack!
+			stats.packs.keep++

-	var obsoletePacks restic.IDSet
-	if len(rewritePacks) != 0 {
-		bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten")
-		obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar)
-		if err != nil {
-			return err
+		case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob:
+			// All blobs in pack are used and not duplicates/mixed => keep pack!
+			stats.packs.keep++
+
+		default:
+			// all other packs are candidates for repacking
+			repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p})
 		}
-	}

-	removePacks.Merge(obsoletePacks)
-
-	if err = rebuildIndex(ctx, repo, removePacks); err != nil {
+		delete(indexPack, id)
+		bar.Report(restic.Stat{Blobs: 1})
+		return nil
+	})
+	bar.Done()
+	if err != nil {
 		return err
 	}

+	if len(indexPack) != 0 {
+		Warnf("The index references pack files which are missing from the repository: %v\n", indexPack)
+		return errorPacksMissing
+	}
+
+	repackAllPacksWithDuplicates := true
+
+	// calculate limit for number of unused bytes in the repo after repacking
+	maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used)
+
+	// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
+	// This is equivalent to sorting by unused / total space.
+	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
+	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
+	// Moreover, packs with duplicates and mixed packs are sorted to the beginning
+	sort.Slice(repackCandidates, func(i, j int) bool {
+		pi := repackCandidates[i].packInfo
+		pj := repackCandidates[j].packInfo
+		switch {
+		case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
+			return true
+		case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
+			return false
+		case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob:
+			return true
+		case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob:
+			return false
+		}
+		return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
+	})
+
+	repack := func(id restic.ID, p packInfo) {
+		repackPacks.Insert(id)
+		stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
+		stats.size.repack += p.unusedSize + p.usedSize
+		stats.blobs.repackrm += p.unusedBlobs
+		stats.size.repackrm += p.unusedSize
+	}
+
+	for _, p := range repackCandidates {
+		reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter)
+
+		reachedRepackSize := false
+		if opts.MaxRepackBytes > 0 {
+			reachedRepackSize = stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes
+		}
+
+		switch {
+		case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob):
+			// repacking duplicates/mixed is only limited by repackSize
+			repack(p.ID, p.packInfo)
+
+		case reachedUnusedSizeAfter, reachedRepackSize:
+			// for all other packs stop repacking if tolerated unused size is reached.
+			stats.packs.keep++
+			if p.duplicateBlobs > 0 {
+				repackAllPacksWithDuplicates = false
+			}
+
+		default:
+			repack(p.ID, p.packInfo)
+		}
+	}
+
+	// if all duplicates are repacked, print out correct statistics
+	if repackAllPacksWithDuplicates {
+		stats.blobs.repackrm += stats.blobs.duplicate
+		stats.size.repackrm += stats.size.duplicate
+	}
+
+	Verboseff("\nused:        %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
+	if stats.blobs.duplicate > 0 {
+		Verboseff("duplicates:  %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate))
+	}
+	Verboseff("unused:      %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused))
+	if stats.size.unref > 0 {
+		Verboseff("unreferenced:                   %s\n", formatBytes(stats.size.unref))
+	}
+	totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate
+	totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref
+	Verboseff("total:       %10d blobs / %s\n", totalBlobs, formatBytes(totalSize))
+	Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize))
+
+	Verbosef("\nto repack:   %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack))
+	Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm))
+	Verbosef("to delete:   %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref))
+	totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref
+	Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize))
+	Verbosef("remaining:   %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize))
+	unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm
+	Verbosef("unused size after prune: %s (%s of remaining size)\n",
+		formatBytes(unusedAfter), formatPercent(unusedAfter, totalSize-totalPruneSize))
+	Verbosef("\n")
+	Verboseff("totally used packs: %10d\n", stats.packs.used)
+	Verboseff("partly used packs:  %10d\n", stats.packs.partlyUsed)
+	Verboseff("unused packs:       %10d\n\n", stats.packs.unused)
+
+	Verboseff("to keep:   %10d packs\n", stats.packs.keep)
+	Verboseff("to repack: %10d packs\n", len(repackPacks))
+	Verboseff("to delete: %10d packs\n", len(removePacks))
+	if len(removePacksFirst) > 0 {
+		Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst))
+	}
+
+	if opts.DryRun {
+		if !gopts.JSON && gopts.verbosity >= 2 {
+			if len(removePacksFirst) > 0 {
+				Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst)
+			}
+			Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks)
+			Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks)
+		}
+		// Always quit here if DryRun was set!
+		return nil
+	}
+
+	// unreferenced packs can be safely deleted first
+	if len(removePacksFirst) != 0 {
+		Verbosef("deleting unreferenced packs\n")
+		DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile)
+	}
+
+	if len(repackPacks) != 0 {
+		Verbosef("repacking packs\n")
+		bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked")
+		_, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar)
+		if err != nil {
+			return err
+		}
+		// Also remove repacked packs
+		removePacks.Merge(repackPacks)
+	}
+
 	if len(removePacks) != 0 {
-		Verbosef("remove %d old packs\n", len(removePacks))
+		if err = rebuildIndex(ctx, repo, removePacks); err != nil {
+			return err
+		}
+
+		Verbosef("removing %d old packs\n", len(removePacks))
 		DeleteFiles(gopts, repo, removePacks, restic.PackFile)
 	}

@ -263,7 +498,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
 func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) {
 	ctx := gopts.ctx

-	Verbosef("find data that is still in use for %d snapshots\n", len(snapshots))
+	Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots))

 	usedBlobs = restic.NewBlobSet()

--- a/cmd/restic/global.go
+++ b/cmd/restic/global.go
@ -231,6 +231,13 @@ func Verbosef(format string, args ...interface{}) {
 	}
 }

+// Verboseff prints the formatted message via Printf, but only when the
+// verbosity is at least 2; at lower verbosity it does nothing.
+func Verboseff(format string, args ...interface{}) {
+	if globalOptions.verbosity < 2 {
+		return
+	}
+	Printf(format, args...)
+}
+
 // PrintProgress wraps fmt.Printf to handle the difference in writing progress
 // information to terminals and non-terminal stdout
 func PrintProgress(format string, args ...interface{}) {
--- a/cmd/restic/integration_test.go
+++ b/cmd/restic/integration_test.go
@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) {
 		"Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove))
 }

-func testRunPrune(t testing.TB, gopts GlobalOptions) {
-	rtest.OK(t, runPrune(gopts))
+func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) {
+	rtest.OK(t, runPrune(opts, gopts))
 }

 func testSetupBackupData(t testing.TB, env *testEnvironment) string {
@ -1386,6 +1386,32 @@ func TestCheckRestoreNoLock(t *testing.T) {
 }

 func TestPrune(t *testing.T) {
+	t.Run("0", func(t *testing.T) {
+		opts := PruneOptions{MaxUnused: "0%"}
+		checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
+		testPrune(t, opts, checkOpts)
+	})
+
+	t.Run("50", func(t *testing.T) {
+		opts := PruneOptions{MaxUnused: "50%"}
+		checkOpts := CheckOptions{ReadData: true}
+		testPrune(t, opts, checkOpts)
+	})
+
+	t.Run("unlimited", func(t *testing.T) {
+		opts := PruneOptions{MaxUnused: "unlimited"}
+		checkOpts := CheckOptions{ReadData: true}
+		testPrune(t, opts, checkOpts)
+	})
+
+	t.Run("CachableOnly", func(t *testing.T) {
+		opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true}
+		checkOpts := CheckOptions{ReadData: true}
+		testPrune(t, opts, checkOpts)
+	})
+}
+
+func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) {
 	env, cleanup := withTestEnvironment(t)
 	defer cleanup()

@ -1406,10 +1432,12 @@ func TestPrune(t *testing.T) {

 	testRunForgetJSON(t, env.gopts)
 	testRunForget(t, env.gopts, firstSnapshot[0].String())
-	testRunPrune(t, env.gopts)
-	testRunCheck(t, env.gopts)
+	testRunPrune(t, env.gopts, pruneOpts)
+	rtest.OK(t, runCheck(checkOpts, env.gopts, nil))
 }

+var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"}
+
 func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet {
 	r, err := OpenRepository(gopts)
 	rtest.OK(t, err)
@ -1452,14 +1480,8 @@ func TestPruneWithDamagedRepository(t *testing.T) {
 		"expected one snapshot, got %v", snapshotIDs)

 	// prune should fail
-	err := runPrune(env.gopts)
-	if err == nil {
-		t.Fatalf("expected prune to fail")
-	}
-	if !strings.Contains(err.Error(), "blobs seem to be missing") {
-		t.Fatalf("did not find hint for missing blobs")
-	}
-	t.Log(err)
+	rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing,
+		"prune should have reported index not complete error")
 }

 // Test repos for edge cases
@ -1469,37 +1491,37 @@ func TestEdgeCaseRepos(t *testing.T) {
 	// repo where index is completely missing
 	// => check and prune should fail
 	t.Run("no-index", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false)
+		testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false)
 	})

 	// repo where an existing and used blob is missing from the index
-	// => check should fail, prune should heal this
+	// => check and prune should fail
 	t.Run("index-missing-blob", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true)
+		testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false)
 	})

 	// repo where a blob is missing
 	// => check and prune should fail
 	t.Run("no-data", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false)
+		testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false)
 	})

 	// repo where data exists that is not referenced
 	// => check and prune should fully work
 	t.Run("unreferenced-data", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true)
 	})

 	// repo where an obsolete index still exists
 	// => check and prune should fully work
 	t.Run("obsolete-index", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true)
 	})

 	// repo which contains mixed (data/tree) packs
 	// => check and prune should fully work
 	t.Run("mixed-packs", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true)
+		testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true)
 	})

 	// repo which contains duplicate blobs
@ -1510,11 +1532,11 @@ func TestEdgeCaseRepos(t *testing.T) {
 		CheckUnused: true,
 	}
 	t.Run("duplicates", func(t *testing.T) {
-		testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true)
+		testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true)
 	})
 }

-func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) {
+func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) {
 	env, cleanup := withTestEnvironment(t)
 	defer cleanup()

@ -1524,15 +1546,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkO
 	if checkOK {
 		testRunCheck(t, env.gopts)
 	} else {
-		rtest.Assert(t, runCheck(options, env.gopts, nil) != nil,
+		rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != nil,
 			"check should have reported an error")
 	}

 	if pruneOK {
-		testRunPrune(t, env.gopts)
+		testRunPrune(t, env.gopts, optionsPrune)
 		testRunCheck(t, env.gopts)
 	} else {
-		rtest.Assert(t, runPrune(env.gopts) != nil,
+		rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil,
 			"prune should have reported an error")
 	}
 }
--- a/doc/060_forget.rst
+++ b/doc/060_forget.rst
@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can
 be automated with the ``--prune`` option of the ``forget`` command,
 which runs ``prune`` automatically if snapshots have been removed.

-.. Warning::
-
-   Pruning snapshots can be a very time-consuming process, taking nearly
-   as long as backups themselves. During a prune operation, the index is
-   locked and backups cannot be completed. Performance improvements are 
-   planned for this feature.
+Pruning snapshots can be a time-consuming process, depending on the
+number of snapshots and the amount of data to process. During a prune operation, the
+repository is locked and backups cannot be completed. Please plan your
+pruning so that there's time to complete it and it doesn't interfere with
+regular backup runs.

 It is advisable to run ``restic check`` after pruning, to make sure
 you are alerted, should the internal data structures of the repository
@ -82,20 +81,32 @@ command must be run:

    $ restic -r /srv/restic-repo prune
    enter password for repository:
-
+    repository 33002c5e opened successfully, password is correct
+    loading all snapshots...
+    loading indexes...
+    finding data that is still in use for 4 snapshots
+    [0:00] 100.00%  4 / 4 snapshots
+    searching used packs...
+    collecting packs for deletion and repacking
+    [0:00] 100.00%  5 / 5 packs processed
+    
+    to repack:           69 blobs / 1.078 MiB
+    this removes         67 blobs / 1.047 MiB
+    to delete:            7 blobs / 25.726 KiB
+    total prune:         74 blobs / 1.072 MiB
+    remaining:           16 blobs / 38.003 KiB
+    unused size after prune: 0 B (0.00% of remaining size)
+    
+    repacking packs
+    [0:00] 100.00%  2 / 2 packs repacked
    counting files in repo
-    building new index for repo
-    [0:00] 100.00%  22 / 22 files
-    repository contains 22 packs (8512 blobs) with 100.092 MiB bytes
-    processed 8512 blobs: 0 duplicate blobs, 0B duplicate
-    load all snapshots
-    find data that is still in use for 1 snapshots
-    [0:00] 100.00%  1 / 1 snapshots
-    found 8433 of 8512 data blobs still in use
-    will rewrite 3 packs
-    creating new index
-    [0:00] 86.36%  19 / 22 files
-    saved new index as 544a5084
+    [0:00] 100.00%  3 / 3 packs
+    finding old index files
+    saved new indexes as [59270b3a]
+    remove 4 old index files
+    [0:00] 100.00%  4 / 4 files deleted
+    removing 3 old packs
+    [0:00] 100.00%  3 / 3 files deleted
    done

 Afterwards the repository is smaller.
@ -119,19 +130,31 @@ to ``forget``:
    8c02b94b  2017-02-21 10:48:33  mopped                  /home/user/work

    1 snapshots have been removed, running prune
-    counting files in repo
-    building new index for repo
-    [0:00] 100.00%  37 / 37 packs
-    repository contains 37 packs (5521 blobs) with 151.012 MiB bytes
-    processed 5521 blobs: 0 duplicate blobs, 0B duplicate
-    load all snapshots
-    find data that is still in use for 1 snapshots
+    loading all snapshots...
+    loading indexes...
+    finding data that is still in use for 1 snapshots
    [0:00] 100.00%  1 / 1 snapshots
-    found 5323 of 5521 data blobs still in use, removing 198 blobs
-    will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB
-    creating new index
-    [0:00] 100.00%  30 / 30 packs
-    saved new index as b49f3e68
+    searching used packs...
+    collecting packs for deletion and repacking
+    [0:00] 100.00%  5 / 5 packs processed
+    
+    to repack:           69 blobs / 1.078 MiB
+    this removes         67 blobs / 1.047 MiB
+    to delete:            7 blobs / 25.726 KiB
+    total prune:         74 blobs / 1.072 MiB
+    remaining:           16 blobs / 38.003 KiB
+    unused size after prune: 0 B (0.00% of remaining size)
+    
+    repacking packs
+    [0:00] 100.00%  2 / 2 packs repacked
+    counting files in repo
+    [0:00] 100.00%  3 / 3 packs
+    finding old index files
+    saved new indexes as [59270b3a]
+    remove 4 old index files
+    [0:00] 100.00%  4 / 4 files deleted
+    removing 3 old packs
+    [0:00] 100.00%  3 / 3 files deleted
    done

 Removing snapshots according to a policy
@ -282,3 +305,59 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month).
 And finally 75 last-day-of-the-year snapshots. All other snapshots are
 removed.

+Customize pruning
+*****************
+
+To understand the custom options, we first explain how the pruning process works:
+
+1. All snapshots and directories within snapshots are scanned to determine
+   which data is still in use.
+2. For all files in the repository, restic finds out if the file is fully
+   used, partly used or completely unused.
+3. Completely unused files are marked for deletion. Fully used files are kept.
+   A partially used file is either kept or marked for repacking depending on user
+   options.
+
+   Note that for repacking, restic must download the file from the repository
+   storage and re-upload the needed data to the repository. This can be very
+   time-consuming for remote repositories.
+4. After deciding what to do, ``prune`` will actually perform the repack, modify
+   the index according to the changes and delete the obsolete files.
+
+The ``prune`` command accepts the following options:
+
+-  ``--max-unused limit`` allow unused data up to the specified limit within the repository.
+   This allows restic to keep partly used files instead of repacking them.
+
+   The limit can be specified in several ways:
+
+    * As an absolute size (e.g. ``200M``). If you want to minimize the space
+      used by your repository, pass ``0`` to this option.
+    * As a size relative to the total repo size (e.g. ``10%``). This means that
+      after prune, at most ``10%`` of the total data stored in the repo may be
+      unused data. If the repo after prune has a size of 500MB, then at most
+      50MB may be unused.
+    * If the string ``unlimited`` is passed, there is no limit for partly
+      unused files. This means that as long as some data is still used within
+      a file stored in the repo, restic will just leave it there. Use this if
+      you want to minimize the time and bandwidth used by the ``prune``
+      operation.
+
+   Restic tries to repack as little data as possible while still staying
+   within this limit for unused data.
+
+- ``--max-repack-size size`` if set, limits the total size of files to repack.
+  As ``prune`` first stores all repacked files and only deletes the obsolete files at the end,
+  this option is useful if you expect many files to be repacked and are concerned about
+  running low on storage.
+
+- ``--repack-cacheable-only`` if set to true only files which contain
+  metadata and would be stored in the cache are repacked. Other pack files are
+  not repacked if this option is set. This allows a very fast repacking
+  using only cached data. It can, however, imply that the unused data in
+  your repository exceeds the value given by ``--max-unused``.
+  The default value is false.
+
+-  ``--dry-run`` only show what ``prune`` would do.
+
+-  ``--verbose`` show additional statistics for ``prune``.
--- a/internal/pack/pack.go
+++ b/internal/pack/pack.go
@ -161,13 +161,16 @@ func (p *Packer) String() string {
 }

 var (
-	// size of the header-length field at the end of the file
-	headerLengthSize = binary.Size(uint32(0))
 	// we require at least one entry in the header, and one blob for a pack file
 	minFileSize = entrySize + crypto.Extension + uint(headerLengthSize)
 )

 const (
+	// size of the header-length field at the end of the file; it is a uint32
+	headerLengthSize = 4
+	// HeaderSize is the constant overhead of the header, independent of the number of entries
+	HeaderSize = headerLengthSize + crypto.Extension
+
 	maxHeaderSize = 16 * 1024 * 1024
 	// number of header enries to download as part of header-length request
 	eagerEntries = 15
@ -315,3 +318,8 @@ func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, err

 	return entries, nil
 }
+
+// PackedSizeOfBlob returns the size a blob actually uses when saved in a pack
+func PackedSizeOfBlob(blobLength uint) uint {
+	return blobLength + entrySize
+}
--- a/internal/restic/blob.go
+++ b/internal/restic/blob.go
@ -19,6 +19,10 @@ func (b Blob) String() string {
 		b.Type, b.ID.Str(), b.Offset, b.Length)
 }

+func (b Blob) Handle() BlobHandle {
+	return BlobHandle{ID: b.ID, Type: b.Type}
+}
+
 // PackedBlob is a blob stored within a file.
 type PackedBlob struct {
 	Blob
--- a/internal/restic/snapshot.go
+++ b/internal/restic/snapshot.go
@ -67,8 +67,12 @@ func LoadSnapshot(ctx context.Context, repo Repository, id ID) (*Snapshot, error
 }

 // LoadAllSnapshots returns a list of all snapshots in the repo.
-func LoadAllSnapshots(ctx context.Context, repo Repository) (snapshots []*Snapshot, err error) {
+// If a snapshot ID is in excludeIDs, it will not be included in the result.
+func LoadAllSnapshots(ctx context.Context, repo Repository, excludeIDs IDSet) (snapshots []*Snapshot, err error) {
 	err = repo.List(ctx, SnapshotFile, func(id ID, size int64) error {
+		if excludeIDs.Has(id) {
+			return nil
+		}
 		sn, err := LoadSnapshot(ctx, repo, id)
 		if err != nil {
 			return err
--- a/internal/restic/testing_test.go
+++ b/internal/restic/testing_test.go
@ -25,7 +25,7 @@ func TestCreateSnapshot(t *testing.T) {
 		restic.TestCreateSnapshot(t, repo, testSnapshotTime.Add(time.Duration(i)*time.Second), testDepth, 0)
 	}

-	snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo)
+	snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo, restic.NewIDSet())
 	if err != nil {
 		t.Fatal(err)
 	}