2
2
mirror of https://github.com/octoleo/restic.git synced 2024-06-06 11:00:48 +00:00

Merge pull request #3082 from aawsome/check-sizes

Check: check sizes of packs from index and packheader
This commit is contained in:
MichaelEischer 2020-11-14 22:37:42 +01:00 committed by GitHub
commit 333c5a19d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 91 additions and 43 deletions

View File

@ -1,9 +1,17 @@
Enhancement: Check now checks index when reading packs
Enhancement: Check now has more checks for consistency of index and pack files
Restic used to only verfiy the pack file content when calling `check --read-data`
Restic used to only verify the pack file content when calling `check --read-data` or
`check --read-data-subset` but did not check if the blobs within the pack are
correctly contained in the index.
This check is now added and may give an "Blob ID is not contained in index" error.
This check is now added and may give a "Blob ID is not contained in index or position
is incorrect" error.
Also a new test is added which compares pack file sizes computed from the index and the
pack header with the actual file size. This test is able to detect truncated pack files.
If the index is not correct, it can be rebuilt by using the `rebuild-index` command.
Having added these tests, `restic check` is now able to detect non-existing blobs which
are wrongly referenced in the index. This situation could have led to missing data.
https://github.com/restic/restic/pull/3048
https://github.com/restic/restic/pull/3082

View File

@ -234,12 +234,12 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
}
doReadData := func(bucket, totalBuckets uint) {
packs := restic.IDSet{}
for pack := range chkr.GetPacks() {
packs := make(map[restic.ID]int64)
for pack, size := range chkr.GetPacks() {
// If we ever check more than the first byte
// of pack, update totalBucketsMax.
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
packs.Insert(pack)
packs[pack] = size
}
}
packCount := uint64(len(packs))

View File

@ -22,7 +22,7 @@ import (
// A Checker only tests for internal errors within the data structures of the
// repository (e.g. missing blobs), and needs a valid Repository to work on.
type Checker struct {
packs restic.IDSet
packs map[restic.ID]int64
blobRefs struct {
sync.Mutex
// see flags below
@ -44,7 +44,7 @@ const (
// New returns a new checker which runs on repo.
func New(repo restic.Repository) *Checker {
c := &Checker{
packs: restic.NewIDSet(),
packs: make(map[restic.ID]int64),
masterIndex: repository.NewMasterIndex(),
repo: repo,
}
@ -82,7 +82,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
// track spawned goroutines using wg, create a new context which is
// cancelled as soon as an error occurs.
wg, ctx := errgroup.WithContext(ctx)
wg, wgCtx := errgroup.WithContext(ctx)
type FileInfo struct {
restic.ID
@ -101,9 +101,9 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
// send list of index files through ch, which is closed afterwards
wg.Go(func() error {
defer close(ch)
return c.repo.List(ctx, restic.IndexFile, func(id restic.ID, size int64) error {
return c.repo.List(wgCtx, restic.IndexFile, func(id restic.ID, size int64) error {
select {
case <-ctx.Done():
case <-wgCtx.Done():
return nil
case ch <- FileInfo{id, size}:
}
@ -120,7 +120,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
var idx *repository.Index
oldFormat := false
buf, err = c.repo.LoadAndDecrypt(ctx, buf[:0], restic.IndexFile, fi.ID)
buf, err = c.repo.LoadAndDecrypt(wgCtx, buf[:0], restic.IndexFile, fi.ID)
if err == nil {
idx, oldFormat, err = repository.DecodeIndex(buf, fi.ID)
}
@ -134,7 +134,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
select {
case resultCh <- Result{idx, fi.ID, err}:
case <-ctx.Done():
case <-wgCtx.Done():
}
}
return nil
@ -161,8 +161,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
debug.Log("process blobs")
cnt := 0
for blob := range res.Index.Each(ctx) {
c.packs.Insert(blob.PackID)
for blob := range res.Index.Each(wgCtx) {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
c.blobRefs.M[h] = blobStatusExists
cnt++
@ -183,6 +182,18 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
errs = append(errs, err)
}
// Merge index before computing pack sizes, as this needs removed duplicates
c.masterIndex.MergeFinalIndexes()
// compute pack size using index entries
for blob := range c.masterIndex.Each(ctx) {
size, ok := c.packs[blob.PackID]
if !ok {
size = pack.HeaderSize
}
c.packs[blob.PackID] = size + int64(pack.PackedSizeOfBlob(blob.Length))
}
debug.Log("checking for duplicate packs")
for packID := range c.packs {
debug.Log(" check pack %v: contained in %d indexes", packID, len(packToIndex[packID]))
@ -194,8 +205,6 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
}
}
c.masterIndex.MergeFinalIndexes()
err = c.repo.SetIndex(c.masterIndex)
if err != nil {
debug.Log("SetIndex returned error: %v", err)
@ -235,10 +244,10 @@ func (c *Checker) Packs(ctx context.Context, errChan chan<- error) {
debug.Log("checking for %d packs", len(c.packs))
debug.Log("listing repository packs")
repoPacks := restic.NewIDSet()
repoPacks := make(map[restic.ID]int64)
err := c.repo.List(ctx, restic.PackFile, func(id restic.ID, size int64) error {
repoPacks.Insert(id)
repoPacks[id] = size
return nil
})
@ -246,23 +255,39 @@ func (c *Checker) Packs(ctx context.Context, errChan chan<- error) {
errChan <- err
}
for id, size := range c.packs {
reposize, ok := repoPacks[id]
// remove from repoPacks so we can find orphaned packs
delete(repoPacks, id)
// missing: present in c.packs but not in the repo
if !ok {
select {
case <-ctx.Done():
return
case errChan <- PackError{ID: id, Err: errors.New("does not exist")}:
}
continue
}
// size not matching: present in c.packs and in the repo, but sizes do not match
if size != reposize {
select {
case <-ctx.Done():
return
case errChan <- PackError{ID: id, Err: errors.Errorf("unexpected file size: got %d, expected %d", reposize, size)}:
}
}
}
// orphaned: present in the repo but not in c.packs
for orphanID := range repoPacks.Sub(c.packs) {
for orphanID := range repoPacks {
select {
case <-ctx.Done():
return
case errChan <- PackError{ID: orphanID, Orphaned: true, Err: errors.New("not referenced in any index")}:
}
}
// missing: present in c.packs but not in the repo
for missingID := range c.packs.Sub(repoPacks) {
select {
case <-ctx.Done():
return
case errChan <- PackError{ID: missingID, Err: errors.New("does not exist")}:
}
}
}
// Error is an error that occurred while checking a repository.
@ -695,16 +720,16 @@ func (c *Checker) CountPacks() uint64 {
}
// GetPacks returns IDSet of packs in the repository
func (c *Checker) GetPacks() restic.IDSet {
func (c *Checker) GetPacks() map[restic.ID]int64 {
return c.packs
}
// checkPack reads a pack and checks the integrity of all blobs.
func checkPack(ctx context.Context, r restic.Repository, id restic.ID) error {
func checkPack(ctx context.Context, r restic.Repository, id restic.ID, size int64) error {
debug.Log("checking pack %v", id)
h := restic.Handle{Type: restic.PackFile, Name: id.String()}
packfile, hash, size, err := repository.DownloadAndHash(ctx, r.Backend(), h)
packfile, hash, realSize, err := repository.DownloadAndHash(ctx, r.Backend(), h)
if err != nil {
return errors.Wrap(err, "checkPack")
}
@ -721,6 +746,11 @@ func checkPack(ctx context.Context, r restic.Repository, id restic.ID) error {
return errors.Errorf("Pack ID does not match, want %v, got %v", id.Str(), hash.Str())
}
if realSize != size {
debug.Log("Pack size does not match, want %v, got %v", size, realSize)
return errors.Errorf("Pack size does not match, want %v, got %v", size, realSize)
}
blobs, err := pack.List(r.Key(), packfile, size)
if err != nil {
return err
@ -728,8 +758,10 @@ func checkPack(ctx context.Context, r restic.Repository, id restic.ID) error {
var errs []error
var buf []byte
sizeFromBlobs := int64(pack.HeaderSize) // pack size computed only from blob information
idx := r.Index()
for i, blob := range blobs {
sizeFromBlobs += int64(pack.PackedSizeOfBlob(blob.Length))
debug.Log(" check blob %d: %v", i, blob)
buf = buf[:cap(buf)]
@ -765,20 +797,25 @@ func checkPack(ctx context.Context, r restic.Repository, id restic.ID) error {
continue
}
// Check if blob is contained in index
// Check if blob is contained in index and position is correct
idxHas := false
for _, pb := range idx.Lookup(blob.ID, blob.Type) {
if pb.PackID == id {
if pb.PackID == id && pb.Offset == blob.Offset && pb.Length == blob.Length {
idxHas = true
break
}
}
if !idxHas {
errs = append(errs, errors.Errorf("Blob ID %v is not contained in index", blob.ID.Str()))
errs = append(errs, errors.Errorf("Blob %v is not contained in index or position is incorrect", blob.ID.Str()))
continue
}
}
if sizeFromBlobs != size {
debug.Log("Pack size does not match, want %v, got %v", size, sizeFromBlobs)
errs = append(errs, errors.Errorf("Pack size does not match, want %v, got %v", size, sizeFromBlobs))
}
if len(errs) > 0 {
return errors.Errorf("pack %v contains %v errors: %v", id.Str(), len(errs), errs)
}
@ -792,29 +829,32 @@ func (c *Checker) ReadData(ctx context.Context, errChan chan<- error) {
}
// ReadPacks loads data from specified packs and checks the integrity.
func (c *Checker) ReadPacks(ctx context.Context, packs restic.IDSet, p *progress.Counter, errChan chan<- error) {
func (c *Checker) ReadPacks(ctx context.Context, packs map[restic.ID]int64, p *progress.Counter, errChan chan<- error) {
defer close(errChan)
g, ctx := errgroup.WithContext(ctx)
ch := make(chan restic.ID)
type packsize struct {
id restic.ID
size int64
}
ch := make(chan packsize)
// run workers
for i := 0; i < defaultParallelism; i++ {
g.Go(func() error {
for {
var id restic.ID
var ps packsize
var ok bool
select {
case <-ctx.Done():
return nil
case id, ok = <-ch:
case ps, ok = <-ch:
if !ok {
return nil
}
}
err := checkPack(ctx, c.repo, id)
err := checkPack(ctx, c.repo, ps.id, ps.size)
p.Add(1)
if err == nil {
continue
@ -830,9 +870,9 @@ func (c *Checker) ReadPacks(ctx context.Context, packs restic.IDSet, p *progress
}
// push packs to ch
for pack := range packs {
for pack, size := range packs {
select {
case ch <- pack:
case ch <- packsize{id: pack, size: size}:
case <-ctx.Done():
}
}