Merge pull request #2842 from aawsome/rebuild-index-inmem

Rebuild index in prune by using in-memory index
2024-11-27 07:16:40 +00:00 · 2020-11-06 20:51:20 +01:00 · 2020-11-06 20:51:20 +01:00 · 4707bdb204
commit 4707bdb204
parent 7f6f31c34b 47277c4b4c
9 changed files with 280 additions and 48 deletions
--- a/changelog/unreleased/pull-2718
+++ b/changelog/unreleased/pull-2718
@ -3,6 +3,11 @@ Enhancement: Improve pruning performance and make pruning more customizable
 The `prune` command is now much faster. This is especially the case for remote
 repositories or repositories with not much data to remove.
 Also the memory usage of the `prune` command is now reduced.
 Restic used to rebuild the index from scratch after pruning. This could lead
 to missing packs in the index in some cases for eventually consistent 
 backends, like e.g. AWS S3.
 This behavior is now changed and the index rebuilding uses the information
 already known by `prune`.
 By default, the `prune` command no longer removes all unused data. This
 behavior can be fine-tuned by new options, like the acceptable amount of unused space or
@ -14,9 +19,11 @@ also shows what `prune` would do.
 Fixes several open issues, e.g.:
 https://github.com/restic/restic/issues/1140
 https://github.com/restic/restic/issues/1599
 https://github.com/restic/restic/issues/1985
 https://github.com/restic/restic/issues/2112
 https://github.com/restic/restic/issues/2227
 https://github.com/restic/restic/issues/2305
 https://github.com/restic/restic/pull/2718
 https://github.com/restic/restic/pull/2842
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -471,19 +471,31 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile)
 	}
 	packsAddedByRepack := 0
 	if len(repackPacks) != 0 {
 		// Remember the number of unique packs before repacking
 		packsBeforeRepacking := len(repo.Index().Packs())
 		Verbosef("repacking packs\n")
 		bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked")
 		_, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar)
 		if err != nil {
 			return err
 		}
 		// Since repacking will only add new packs, we can calculate the number
 		// of packs like this:
 		packsAddedByRepack = len(repo.Index().Packs()) - packsBeforeRepacking
 		// Also remove repacked packs
 		removePacks.Merge(repackPacks)
 	}
 	if len(removePacks) != 0 {
-		if err = rebuildIndex(ctx, repo, removePacks); err != nil {
+		totalpacks := int(stats.packs.used+stats.packs.partlyUsed+stats.packs.unused) -
 			len(removePacks) + packsAddedByRepack
 		err = rebuildIndexFiles(gopts, repo, removePacks, uint64(totalpacks))
 		if err != nil {
 			return err
 		}
@ -495,6 +507,20 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	return nil
 }
 func rebuildIndexFiles(gopts GlobalOptions, repo restic.Repository, removePacks restic.IDSet, packcount uint64) error {
 	Verbosef("rebuilding index\n")
 	bar := newProgressMax(!gopts.Quiet, packcount, "packs processed")
 	obsoleteIndexes, err := (repo.Index()).(*repository.MasterIndex).
 		Save(gopts.ctx, repo, removePacks, bar)
 	if err != nil {
 		return err
 	}
 	Verbosef("deleting obsolete index files\n")
 	return DeleteFilesChecked(gopts, repo, obsoleteIndexes, restic.IndexFile)
 }
 func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) {
 	ctx := gopts.ctx
--- a/cmd/restic/integration_test.go
+++ b/cmd/restic/integration_test.go
@ -1559,6 +1559,62 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, o
 	}
 }
 // a listOnceBackend only allows listing once per filetype
 // listing filetypes more than once may cause problems with eventually consistent
 // backends (like e.g. AWS S3) as the second listing may be inconsistent to what
 // is expected by the first listing + some operations.
 type listOnceBackend struct {
 	restic.Backend
 	listedFileType map[restic.FileType]bool
 }
 func newListOnceBackend(be restic.Backend) *listOnceBackend {
 	return &listOnceBackend{
 		Backend:        be,
 		listedFileType: make(map[restic.FileType]bool),
 	}
 }
 func (be *listOnceBackend) List(ctx context.Context, t restic.FileType, fn func(restic.FileInfo) error) error {
 	if t != restic.LockFile && be.listedFileType[t] {
 		return errors.Errorf("tried listing type %v the second time", t)
 	}
 	be.listedFileType[t] = true
 	return be.Backend.List(ctx, t, fn)
 }
 func TestPruneListOnce(t *testing.T) {
 	env, cleanup := withTestEnvironment(t)
 	defer cleanup()
 	env.gopts.backendTestHook = func(r restic.Backend) (restic.Backend, error) {
 		return newListOnceBackend(r), nil
 	}
 	pruneOpts := PruneOptions{MaxUnused: "0"}
 	checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
 	testSetupBackupData(t, env)
 	opts := BackupOptions{}
 	testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9")}, opts, env.gopts)
 	firstSnapshot := testRunList(t, "snapshots", env.gopts)
 	rtest.Assert(t, len(firstSnapshot) == 1,
 		"expected one snapshot, got %v", firstSnapshot)
 	testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9", "2")}, opts, env.gopts)
 	testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9", "3")}, opts, env.gopts)
 	snapshotIDs := testRunList(t, "snapshots", env.gopts)
 	rtest.Assert(t, len(snapshotIDs) == 3,
 		"expected 3 snapshot, got %v", snapshotIDs)
 	testRunForgetJSON(t, env.gopts)
 	testRunForget(t, env.gopts, firstSnapshot[0].String())
 	testRunPrune(t, env.gopts, pruneOpts)
 	rtest.OK(t, runCheck(checkOpts, env.gopts, nil))
 }
 func TestHardLink(t *testing.T) {
 	// this test assumes a test set with a single directory containing hard linked files
 	env, cleanup := withTestEnvironment(t)
--- a/doc/060_forget.rst
+++ b/doc/060_forget.rst
@ -99,12 +99,10 @@ command must be run:
    repacking packs
    [0:00] 100.00%  2 / 2 packs repacked
-    counting files in repo
+    rebuilding index
-    [0:00] 100.00%  3 / 3 packs
+    [0:00] 100.00%  3 / 3 packs processed
-    finding old index files
+    deleting obsolete index files
-    saved new indexes as [59270b3a]
+    [0:00] 100.00%  3 / 3 files deleted
    remove 4 old index files
    [0:00] 100.00%  4 / 4 files deleted
    removing 3 old packs
    [0:00] 100.00%  3 / 3 files deleted
    done
@ -147,12 +145,10 @@ to ``forget``:
    repacking packs
    [0:00] 100.00%  2 / 2 packs repacked
-    counting files in repo
+    rebuilding index
-    [0:00] 100.00%  3 / 3 packs
+    [0:00] 100.00%  3 / 3 packs processed
-    finding old index files
+    deleting obsolete index files
-    saved new indexes as [59270b3a]
+    [0:00] 100.00%  3 / 3 files deleted
    remove 4 old index files
    [0:00] 100.00%  4 / 4 files deleted
    removing 3 old packs
    [0:00] 100.00%  3 / 3 files deleted
    done
--- a/internal/repository/index.go
+++ b/internal/repository/index.go
@ -275,6 +275,55 @@ func (idx *Index) Each(ctx context.Context) <-chan restic.PackedBlob {
 	return ch
 }
 type EachByPackResult struct {
 	packID restic.ID
 	blobs  []restic.Blob
 }
 // EachByPack returns a channel that yields all blobs known to the index
 // grouped by packID but ignoring blobs with a packID in packPlacklist.
 // When the  context is cancelled, the background goroutine
 // terminates. This blocks any modification of the index.
 func (idx *Index) EachByPack(ctx context.Context, packBlacklist restic.IDSet) <-chan EachByPackResult {
 	idx.m.Lock()
 	ch := make(chan EachByPackResult)
 	go func() {
 		defer idx.m.Unlock()
 		defer func() {
 			close(ch)
 		}()
 		for typ := range idx.byType {
 			byPack := make(map[restic.ID][]*indexEntry)
 			m := &idx.byType[typ]
 			m.foreach(func(e *indexEntry) bool {
 				packID := idx.packs[e.packIndex]
 				if !packBlacklist.Has(packID) {
 					byPack[packID] = append(byPack[packID], e)
 				}
 				return true
 			})
 			for packID, pack := range byPack {
 				var result EachByPackResult
 				result.packID = packID
 				for _, e := range pack {
 					result.blobs = append(result.blobs, idx.toPackedBlob(e, restic.BlobType(typ)).Blob)
 				}
 				select {
 				case <-ctx.Done():
 					return
 				case ch <- result:
 				}
 			}
 		}
 	}()
 	return ch
 }
 // Packs returns all packs in this index
 func (idx *Index) Packs() restic.IDSet {
 	idx.m.Lock()
--- a/internal/repository/master_index.go
+++ b/internal/repository/master_index.go
@ -97,6 +97,19 @@ func (mi *MasterIndex) Has(id restic.ID, tpe restic.BlobType) bool {
 	return false
 }
 // Packs returns all packs that are covered by the index.
 func (mi *MasterIndex) Packs() restic.IDSet {
 	mi.idxMutex.RLock()
 	defer mi.idxMutex.RUnlock()
 	packs := restic.NewIDSet()
 	for _, idx := range mi.idx {
 		packs.Merge(idx.Packs())
 	}
 	return packs
 }
 // Count returns the number of blobs of type t in the index.
 func (mi *MasterIndex) Count(t restic.BlobType) (n uint) {
 	mi.idxMutex.RLock()
@ -248,36 +261,34 @@ func (mi *MasterIndex) MergeFinalIndexes() {
 	mi.idx = newIdx
 }
-// RebuildIndex combines all known indexes to a new index, leaving out any
+// Save saves all known indexes to index files, leaving out any
 // packs whose ID is contained in packBlacklist. The new index contains the IDs
-// of all known indexes in the "supersedes" field.
+// of all known indexes in the "supersedes" field. The IDs are also returned in
-func (mi *MasterIndex) RebuildIndex(ctx context.Context, packBlacklist restic.IDSet) (*Index, error) {
+// the IDSet obsolete
 // After calling this function, you should remove the obsolete index files.
 func (mi *MasterIndex) Save(ctx context.Context, repo restic.Repository, packBlacklist restic.IDSet, p *restic.Progress) (obsolete restic.IDSet, err error) {
 	p.Start()
 	defer p.Done()
 	mi.idxMutex.Lock()
 	defer mi.idxMutex.Unlock()
 	debug.Log("start rebuilding index of %d indexes, pack blacklist: %v", len(mi.idx), packBlacklist)
 	newIndex := NewIndex()
 	obsolete = restic.NewIDSet()
-	ctx, cancel := context.WithCancel(ctx)
+	finalize := func() error {
-	defer cancel()
+		newIndex.Finalize()
 		if _, err := SaveIndex(ctx, repo, newIndex); err != nil {
 			return err
 		}
 		newIndex = NewIndex()
 		return nil
 	}
 	for i, idx := range mi.idx {
-		debug.Log("adding index %d", i)
+		if idx.Final() {
 		for pb := range idx.Each(ctx) {
 			if packBlacklist.Has(pb.PackID) {
 				continue
 			}
 			newIndex.Store(pb)
 		}
 		if !idx.Final() {
 			debug.Log("index %d isn't final, don't add to supersedes field", i)
 			continue
 		}
 			ids, err := idx.IDs()
 			if err != nil {
 				debug.Log("index %d does not have an ID: %v", err)
@ -290,7 +301,26 @@ func (mi *MasterIndex) RebuildIndex(ctx context.Context, packBlacklist restic.ID
 			if err != nil {
 				return nil, err
 			}
 			obsolete.Merge(restic.NewIDSet(ids...))
 		} else {
 			debug.Log("index %d isn't final, don't add to supersedes field", i)
 		}
-	return newIndex, nil
+		debug.Log("adding index %d", i)
 		for pbs := range idx.EachByPack(ctx, packBlacklist) {
 			newIndex.StorePack(pbs.packID, pbs.blobs)
 			p.Report(restic.Stat{Blobs: 1})
 			if IndexFull(newIndex) {
 				if err := finalize(); err != nil {
 					return nil, err
 				}
 			}
 		}
 	}
 	if err := finalize(); err != nil {
 		return nil, err
 	}
 	return
 }
--- a/internal/repository/master_index_test.go
+++ b/internal/repository/master_index_test.go
@ -5,7 +5,9 @@ import (
 	"fmt"
 	"math/rand"
 	"testing"
 	"time"
 	"github.com/restic/restic/internal/checker"
 	"github.com/restic/restic/internal/repository"
 	"github.com/restic/restic/internal/restic"
 	rtest "github.com/restic/restic/internal/test"
@ -322,3 +324,65 @@ func BenchmarkMasterIndexLookupBlobSize(b *testing.B) {
 		mIdx.LookupSize(lookupID, restic.DataBlob)
 	}
 }
 var (
 	snapshotTime = time.Unix(1470492820, 207401672)
 	depth        = 3
 )
 func createFilledRepo(t testing.TB, snapshots int, dup float32) (restic.Repository, func()) {
 	repo, cleanup := repository.TestRepository(t)
 	for i := 0; i < 3; i++ {
 		restic.TestCreateSnapshot(t, repo, snapshotTime.Add(time.Duration(i)*time.Second), depth, dup)
 	}
 	return repo, cleanup
 }
 func TestIndexSave(t *testing.T) {
 	repo, cleanup := createFilledRepo(t, 3, 0)
 	defer cleanup()
 	repo.LoadIndex(context.TODO())
 	obsoletes, err := repo.Index().(*repository.MasterIndex).Save(context.TODO(), repo, nil, nil)
 	if err != nil {
 		t.Fatalf("unable to save new index: %v", err)
 	}
 	for id := range obsoletes {
 		t.Logf("remove index %v", id.Str())
 		h := restic.Handle{Type: restic.IndexFile, Name: id.String()}
 		err = repo.Backend().Remove(context.TODO(), h)
 		if err != nil {
 			t.Errorf("error removing index %v: %v", id, err)
 		}
 	}
 	checker := checker.New(repo)
 	hints, errs := checker.LoadIndex(context.TODO())
 	for _, h := range hints {
 		t.Logf("hint: %v\n", h)
 	}
 	for _, err := range errs {
 		t.Errorf("checker found error: %v", err)
 	}
 	ctx, cancel := context.WithCancel(context.TODO())
 	defer cancel()
 	errCh := make(chan error)
 	go checker.Structure(ctx, errCh)
 	i := 0
 	for err := range errCh {
 		t.Errorf("checker returned error: %v", err)
 		i++
 		if i == 10 {
 			t.Errorf("more than 10 errors returned, skipping the rest")
 			cancel()
 			break
 		}
 	}
 }
--- a/internal/repository/repository.go
+++ b/internal/repository/repository.go
@ -322,7 +322,10 @@ func (r *Repository) Flush(ctx context.Context) error {
 		return err
 	}
-	// Save index after flushing
+	// Save index after flushing only if noAutoIndexUpdate is not set
 	if r.noAutoIndexUpdate {
 		return nil
 	}
 	return r.SaveIndex(ctx)
 }
--- a/internal/restic/repository.go
+++ b/internal/restic/repository.go
@ -62,6 +62,7 @@ type MasterIndex interface {
 	Has(ID, BlobType) bool
 	Lookup(ID, BlobType) []PackedBlob
 	Count(BlobType) uint
 	Packs() IDSet
 	// Each returns a channel that yields all blobs known to the index. When
 	// the context is cancelled, the background goroutine terminates. This