checker: Optimize checker.Packs()

Use result of single repository.List() to find both missing and orphaned data packs. For 500GB repository this eliminates ~100K repository.Test() calls and improves check time by >30M in my environment (~45min before this change and ~7min after). Signed-off-by: Igor Fedorenko <igor@ifedorenko.com>
2024-11-26 06:46:34 +00:00 · 2018-01-11 21:00:48 -05:00 · 2018-01-11 21:00:48 -05:00 · 231076fa4a
commit 231076fa4a
parent 5723636b35
2 changed files with 32 additions and 61 deletions
--- a/changelog/0.8.2/issue-1541
+++ b/changelog/0.8.2/issue-1541
@ -0,0 +1,6 @@
 Enhancement: Reduce number of remote requests during repository check
 This change eliminates redundant remote repository calls and significantly
 improves repository check time.
 https://github.com/restic/restic/issues/1541
--- a/internal/checker/checker.go
+++ b/internal/checker/checker.go
@ -30,8 +30,7 @@ type Checker struct {
 		sync.Mutex
 		M map[restic.ID]uint
 	}
-	indexes       map[restic.ID]*repository.Index
+	indexes map[restic.ID]*repository.Index
 	orphanedPacks restic.IDs
 	masterIndex *repository.MasterIndex
@ -183,38 +182,6 @@ func (e PackError) Error() string {
 	return "pack " + e.ID.String() + ": " + e.Err.Error()
 }
 func packIDTester(ctx context.Context, repo restic.Repository, inChan <-chan restic.ID, errChan chan<- error, wg *sync.WaitGroup) {
 	debug.Log("worker start")
 	defer debug.Log("worker done")
 	defer wg.Done()
 	for id := range inChan {
 		h := restic.Handle{Type: restic.DataFile, Name: id.String()}
 		ok, err := repo.Backend().Test(ctx, h)
 		if err != nil {
 			err = PackError{ID: id, Err: err}
 		} else {
 			if !ok {
 				err = PackError{ID: id, Err: errors.New("does not exist")}
 			}
 		}
 		if err != nil {
 			debug.Log("error checking for pack %s: %v", id.Str(), err)
 			select {
 			case <-ctx.Done():
 				return
 			case errChan <- err:
 			}
 			continue
 		}
 		debug.Log("pack %s exists", id.Str())
 	}
 }
 // Packs checks that all packs referenced in the index are still available and
 // there are no packs that aren't in an index. errChan is closed after all
 // packs have been checked.
@ -222,35 +189,33 @@ func (c *Checker) Packs(ctx context.Context, errChan chan<- error) {
 	defer close(errChan)
 	debug.Log("checking for %d packs", len(c.packs))
 	seenPacks := restic.NewIDSet()
 	var workerWG sync.WaitGroup
 	IDChan := make(chan restic.ID)
 	for i := 0; i < defaultParallelism; i++ {
 		workerWG.Add(1)
 		go packIDTester(ctx, c.repo, IDChan, errChan, &workerWG)
 	}
 	for id := range c.packs {
 		seenPacks.Insert(id)
 		IDChan <- id
 	}
 	close(IDChan)
 	debug.Log("waiting for %d workers to terminate", defaultParallelism)
 	workerWG.Wait()
 	debug.Log("workers terminated")
 	debug.Log("listing repository packs")
 	repoPacks := restic.NewIDSet()
 	for id := range c.repo.List(ctx, restic.DataFile) {
-		debug.Log("check data blob %v", id.Str())
+		select {
-		if !seenPacks.Has(id) {
+		case <-ctx.Done():
-			c.orphanedPacks = append(c.orphanedPacks, id)
+			return
-			select {
+		default:
-			case <-ctx.Done():
+		}
-				return
+		repoPacks.Insert(id)
-			case errChan <- PackError{ID: id, Orphaned: true, Err: errors.New("not referenced in any index")}:
+	}
-			}
+
 	// orphaned: present in the repo but not in c.packs
 	for orphanID := range repoPacks.Sub(c.packs) {
 		select {
 		case <-ctx.Done():
 			return
 		case errChan <- PackError{ID: orphanID, Orphaned: true, Err: errors.New("not referenced in any index")}:
 		}
 	}
 	// missing: present in c.packs but not in the repo
 	for missingID := range c.packs.Sub(repoPacks) {
 		select {
 		case <-ctx.Done():
 			return
 		case errChan <- PackError{ID: missingID, Err: errors.New("does not exist")}:
 		}
 	}
 }