
Merge pull request #4626 from MichaelEischer/reliable-large-restores

Improve reliability of large restores
Michael Eischer 2024-01-09 18:23:09 +01:00 committed by GitHub
commit c31e9418ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 150 additions and 77 deletions

changelog/unreleased (new file)

@@ -0,0 +1,11 @@
+Bugfix: Improve reliability of restoring large files
+
+In some cases restic failed to restore large files that frequently contain the
+same file chunk. In combination with certain backends, this could result in
+network connection timeouts that caused incomplete restores.
+
+Restic now includes special handling for such file chunks to ensure reliable
+restores.
+
+https://github.com/restic/restic/pull/4626
+https://forum.restic.net/t/errors-restoring-with-restic-on-windows-server-s3/6943
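For context, the gist of the fix below: restic counts how many times each blob must be written during the restore and streams heavily repeated blobs in their own pass, keeping each backend request short. A minimal standalone sketch of that idea; blobRefs, splitFrequent, and the threshold value are illustrative stand-ins, not restic's actual API:

package main

import "fmt"

// blobRefs maps a blob ID to the number of times that blob must be
// written during a restore (summed over all files and offsets).
// Sketch only: restic keys by restic.ID and tracks per-file offsets.
type blobRefs map[string]int

// splitFrequent separates blobs referenced more than maxOccurrences times,
// so they can be streamed one by one before the bulk download.
func splitFrequent(refs blobRefs, maxOccurrences int) (frequent, normal blobRefs) {
	frequent = make(blobRefs)
	normal = make(blobRefs)
	for id, count := range refs {
		if count > maxOccurrences {
			frequent[id] = count
		} else {
			normal[id] = count
		}
	}
	return frequent, normal
}

func main() {
	refs := blobRefs{"a": 10000, "b": 3, "c": 1}
	frequent, normal := splitFrequent(refs, 100)
	fmt.Println(frequent) // map[a:10000] -> restored in its own pass
	fmt.Println(normal)   // map[b:3 c:1] -> restored in one bulk pass
}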

internal/restorer/filerestorer.go

@@ -197,19 +197,20 @@ func (r *fileRestorer) restoreFiles(ctx context.Context) error {
     return wg.Wait()
 }
 
-func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
-    // calculate blob->[]files->[]offsets mappings
-    blobs := make(map[restic.ID]struct {
-        files map[*fileInfo][]int64 // file -> offsets (plural!) of the blob in the file
-    })
-    var blobList []restic.Blob
+type blobToFileOffsetsMapping map[restic.ID]struct {
+    files map[*fileInfo][]int64 // file -> offsets (plural!) of the blob in the file
+    blob  restic.Blob
+}
+
+func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
+    // calculate blob->[]files->[]offsets mappings
+    blobs := make(blobToFileOffsetsMapping)
     for file := range pack.files {
         addBlob := func(blob restic.Blob, fileOffset int64) {
             blobInfo, ok := blobs[blob.ID]
             if !ok {
                 blobInfo.files = make(map[*fileInfo][]int64)
-                blobList = append(blobList, blob)
+                blobInfo.blob = blob
                 blobs[blob.ID] = blobInfo
             }
             blobInfo.files[file] = append(blobInfo.files[file], fileOffset)
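The mapping introduced in the hunk above records every offset at which a blob occurs, so a blob repeated within one file accumulates several offsets under the same file key. A tiny standalone sketch of that accumulation, using plain strings and maps rather than restic's types:

package main

import "fmt"

func main() {
	// blob ID -> file name -> all offsets of that blob in the file
	blobs := map[string]map[string][]int64{}
	addBlob := func(id, file string, offset int64) {
		if blobs[id] == nil {
			blobs[id] = map[string][]int64{}
		}
		blobs[id][file] = append(blobs[id][file], offset)
	}
	addBlob("a", "file1", 0)
	addBlob("a", "file1", 4096)      // same blob appears again later in the file
	fmt.Println(blobs["a"]["file1"]) // prints [0 4096]
}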
@@ -239,21 +240,83 @@ func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
         }
     }
 
-    sanitizeError := func(file *fileInfo, err error) error {
+    // track already processed blobs for precise error reporting
+    processedBlobs := restic.NewBlobSet()
+    for _, entry := range blobs {
+        occurrences := 0
+        for _, offsets := range entry.files {
+            occurrences += len(offsets)
+        }
+        // With a maximum blob size of 8MB, the normal blob streaming has to write
+        // at most 800MB for a single blob. This should be short enough to avoid
+        // network connection timeouts. Based on a quick test, a limit of 100 only
+        // selects a very small number of blobs (the number of references per blob,
+        // a.k.a. `count`, seems to follow an exponential distribution)
+        if occurrences > 100 {
+            // process frequently referenced blobs first as these can take a long time to write
+            // which can cause backend connections to time out
+            delete(blobs, entry.blob.ID)
+            partialBlobs := blobToFileOffsetsMapping{entry.blob.ID: entry}
+            err := r.downloadBlobs(ctx, pack.id, partialBlobs, processedBlobs)
+            if err := r.reportError(blobs, processedBlobs, err); err != nil {
+                return err
+            }
+        }
+    }
+
+    if len(blobs) == 0 {
+        return nil
+    }
+
+    err := r.downloadBlobs(ctx, pack.id, blobs, processedBlobs)
+    return r.reportError(blobs, processedBlobs, err)
+}
+
+func (r *fileRestorer) sanitizeError(file *fileInfo, err error) error {
     if err != nil {
         err = r.Error(file.location, err)
     }
     return err
 }
 
+func (r *fileRestorer) reportError(blobs blobToFileOffsetsMapping, processedBlobs restic.BlobSet, err error) error {
+    if err == nil {
+        return nil
+    }
+
-    // track already processed blobs for precise error reporting
-    processedBlobs := restic.NewBlobSet()
-    err := repository.StreamPack(ctx, r.packLoader, r.key, pack.id, blobList, func(h restic.BlobHandle, blobData []byte, err error) error {
+    // only report error for not yet processed blobs
+    affectedFiles := make(map[*fileInfo]struct{})
+    for _, entry := range blobs {
+        if processedBlobs.Has(entry.blob.BlobHandle) {
+            continue
+        }
+        for file := range entry.files {
+            affectedFiles[file] = struct{}{}
+        }
+    }
+
+    for file := range affectedFiles {
+        if errFile := r.sanitizeError(file, err); errFile != nil {
+            return errFile
+        }
+    }
+    return nil
+}
+
+func (r *fileRestorer) downloadBlobs(ctx context.Context, packID restic.ID,
+    blobs blobToFileOffsetsMapping, processedBlobs restic.BlobSet) error {
+
+    blobList := make([]restic.Blob, 0, len(blobs))
+    for _, entry := range blobs {
+        blobList = append(blobList, entry.blob)
+    }
+    return repository.StreamPack(ctx, r.packLoader, r.key, packID, blobList,
+        func(h restic.BlobHandle, blobData []byte, err error) error {
             processedBlobs.Insert(h)
             blob := blobs[h.ID]
             if err != nil {
                 for file := range blob.files {
-                    if errFile := sanitizeError(file, err); errFile != nil {
+                    if errFile := r.sanitizeError(file, err); errFile != nil {
                         return errFile
                     }
                 }
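To put numbers on the comment in the hunk above: restic's blobs are at most 8 MB, so any blob that stays in the bulk pass (at most 100 occurrences) forces at most 8 MB × 100 = 800 MB of sequential writes per streamed blob; blobs referenced more often are split into their own download so a long-running write cannot time out the connection that the remaining blobs are streamed over.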
@@ -285,7 +348,7 @@ func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
                     return writeErr
                 }
-                err := sanitizeError(file, writeToFile())
+                err := r.sanitizeError(file, writeToFile())
                 if err != nil {
                     return err
                 }
@@ -293,26 +356,4 @@ func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
             }
             return nil
         })
-
-    if err != nil {
-        // only report error for not yet processed blobs
-        affectedFiles := make(map[*fileInfo]struct{})
-        for _, blob := range blobList {
-            if processedBlobs.Has(blob.BlobHandle) {
-                continue
-            }
-            blob := blobs[blob.ID]
-            for file := range blob.files {
-                affectedFiles[file] = struct{}{}
-            }
-        }
-
-        for file := range affectedFiles {
-            if errFile := sanitizeError(file, err); errFile != nil {
-                return errFile
-            }
-        }
-    }
-    return nil
 }
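Worth noting in the refactor above: processedBlobs is threaded through every downloadBlobs call so that a failure is attributed only to files that still depend on a blob that had not been streamed when the error occurred. A rough standalone sketch of that attribution pattern; the string-keyed maps and reportUnprocessed are illustrative, not restic's types:

package main

import (
	"errors"
	"fmt"
)

// reportUnprocessed attributes err only to files that still depend on a
// blob that had not been streamed when the failure occurred.
func reportUnprocessed(blobFiles map[string][]string, processed map[string]bool, err error) {
	if err == nil {
		return
	}
	affected := map[string]bool{}
	for blob, files := range blobFiles {
		if processed[blob] {
			continue // this blob was fully restored; it no longer implicates its files
		}
		for _, f := range files {
			affected[f] = true
		}
	}
	for f := range affected {
		fmt.Printf("restoring %s failed: %v\n", f, err)
	}
}

func main() {
	blobFiles := map[string][]string{
		"a": {"file1"},
		"b": {"file1", "file2"},
	}
	processed := map[string]bool{"a": true} // "a" was streamed before the failure
	reportUnprocessed(blobFiles, processed, errors.New("connection timed out"))
	// reports file1 and file2 once each, since both still need blob "b"
}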

internal/restorer/filerestorer_test.go

@@ -248,6 +248,27 @@ func TestFileRestorerPackSkip(t *testing.T) {
     }
 }
 
+func TestFileRestorerFrequentBlob(t *testing.T) {
+    tempdir := rtest.TempDir(t)
+
+    for _, sparse := range []bool{false, true} {
+        blobs := []TestBlob{
+            {"data1-1", "pack1-1"},
+        }
+        for i := 0; i < 10000; i++ {
+            blobs = append(blobs, TestBlob{"a", "pack1-1"})
+        }
+        blobs = append(blobs, TestBlob{"end", "pack1-1"})
+
+        restoreAndVerify(t, tempdir, []TestFile{
+            {
+                name:  "file1",
+                blobs: blobs,
+            },
+        }, nil, sparse)
+    }
+}
+
 func TestErrorRestoreFiles(t *testing.T) {
     tempdir := rtest.TempDir(t)
     content := []TestFile{
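Assuming a checkout of the restic repository, the new test can be exercised on its own with the standard Go tooling (the -run flag takes a regular expression matched against test names):

go test ./internal/restorer -run TestFileRestorerFrequentBlob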