From 93210614f413bc76219a6bb3c3a3cce33124ffca Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Sat, 24 Feb 2018 13:23:42 +0100 Subject: [PATCH 1/2] backend/retry: return worker function error and abort This is a bug fix: Before, when the worker function fn in List() of the RetryBackend returned an error, the operation is retried with the next file. This is not consistent with the documentation, the intention was that when fn returns an error, this is passed on to the caller and the List() operation is aborted. Only errors happening on the underlying backend are retried. The error leads to restic ignoring exclusive locks that are present in the repo, so it may happen that a new backup is written which references data that is going to be removed by a concurrently running `prune` operation. The bug was reported by a user here: https://forum.restic.net/t/restic-backup-returns-0-exit-code-when-already-locked/484 --- internal/backend/backend_retry.go | 31 ++++++-- internal/backend/backend_retry_test.go | 98 ++++++++++++++++++++++++++ internal/restic/lock.go | 6 +- 3 files changed, 130 insertions(+), 5 deletions(-) diff --git a/internal/backend/backend_retry.go b/internal/backend/backend_retry.go index 6e00c086f..00274e43e 100644 --- a/internal/backend/backend_retry.go +++ b/internal/backend/backend_retry.go @@ -125,16 +125,39 @@ func (be *RetryBackend) Test(ctx context.Context, h restic.Handle) (exists bool, return exists, err } -// List runs fn for each file in the backend which has the type t. +// List runs fn for each file in the backend which has the type t. When an +// error is returned by the underlying backend, the request is retried. When fn +// returns an error, the operation is aborted and the error is returned to the +// caller. func (be *RetryBackend) List(ctx context.Context, t restic.FileType, fn func(restic.FileInfo) error) error { - listed := make(map[string]struct{}) - return be.retry(ctx, fmt.Sprintf("List(%v)", t), func() error { + // create a new context that we can cancel when fn returns an error, so + // that listing is aborted + listCtx, cancel := context.WithCancel(ctx) + defer cancel() + + listed := make(map[string]struct{}) // remember for which files we already ran fn + var innerErr error // remember when fn returned an error, so we can return that to the caller + + err := be.retry(listCtx, fmt.Sprintf("List(%v)", t), func() error { return be.Backend.List(ctx, t, func(fi restic.FileInfo) error { if _, ok := listed[fi.Name]; ok { return nil } listed[fi.Name] = struct{}{} - return fn(fi) + + innerErr = fn(fi) + if innerErr != nil { + // if fn returned an error, listing is aborted, so we cancel the context + cancel() + } + return innerErr }) }) + + // the error fn returned takes precedence + if innerErr != nil { + return innerErr + } + + return err } diff --git a/internal/backend/backend_retry_test.go b/internal/backend/backend_retry_test.go index 6abd4cba2..832547c77 100644 --- a/internal/backend/backend_retry_test.go +++ b/internal/backend/backend_retry_test.go @@ -124,6 +124,104 @@ func TestBackendListRetry(t *testing.T) { test.Equals(t, []string{ID1, ID2}, listed) // assert no duplicate files } +func TestBackendListRetryErrorFn(t *testing.T) { + var names = []string{"id1", "id2", "foo", "bar"} + + be := &mock.Backend{ + ListFn: func(ctx context.Context, tpe restic.FileType, fn func(restic.FileInfo) error) error { + t.Logf("List called for %v", tpe) + for _, name := range names { + err := fn(restic.FileInfo{Name: name}) + if err != nil { + return err + } + } + + return nil + }, + } + + retryBackend := RetryBackend{ + Backend: be, + } + + var ErrTest = errors.New("test error") + + var listed []string + run := 0 + err := retryBackend.List(context.TODO(), restic.DataFile, func(fi restic.FileInfo) error { + t.Logf("fn called for %v", fi.Name) + run++ + // return an error for the third item in the list + if run == 3 { + t.Log("returning an error") + return ErrTest + } + listed = append(listed, fi.Name) + return nil + }) + + if err != ErrTest { + t.Fatalf("wrong error returned, want %v, got %v", ErrTest, err) + } + + // processing should stop after the error was returned, so run should be 3 + if run != 3 { + t.Fatalf("function was called %d times, wanted %v", run, 3) + } + + test.Equals(t, []string{"id1", "id2"}, listed) +} + +func TestBackendListRetryErrorBackend(t *testing.T) { + var names = []string{"id1", "id2", "foo", "bar"} + + var ErrBackendTest = errors.New("test error") + + retries := 0 + be := &mock.Backend{ + ListFn: func(ctx context.Context, tpe restic.FileType, fn func(restic.FileInfo) error) error { + t.Logf("List called for %v, retries %v", tpe, retries) + retries++ + for i, name := range names { + if i == 2 { + return ErrBackendTest + } + + err := fn(restic.FileInfo{Name: name}) + if err != nil { + return err + } + } + + return nil + }, + } + + const maxRetries = 2 + retryBackend := RetryBackend{ + MaxTries: maxRetries, + Backend: be, + } + + var listed []string + err := retryBackend.List(context.TODO(), restic.DataFile, func(fi restic.FileInfo) error { + t.Logf("fn called for %v", fi.Name) + listed = append(listed, fi.Name) + return nil + }) + + if err != ErrBackendTest { + t.Fatalf("wrong error returned, want %v, got %v", ErrBackendTest, err) + } + + if retries != maxRetries+1 { + t.Fatalf("List was called %d times, wanted %v", retries, maxRetries+1) + } + + test.Equals(t, names[:2], listed) +} + // failingReader returns an error after reading limit number of bytes type failingReader struct { data []byte diff --git a/internal/restic/lock.go b/internal/restic/lock.go index e5acfb4fa..d3e5edc57 100644 --- a/internal/restic/lock.go +++ b/internal/restic/lock.go @@ -44,7 +44,11 @@ type ErrAlreadyLocked struct { } func (e ErrAlreadyLocked) Error() string { - return fmt.Sprintf("repository is already locked by %v", e.otherLock) + s := "" + if e.otherLock.Exclusive { + s = "exclusively " + } + return fmt.Sprintf("repository is already locked %sby %v", s, e.otherLock) } // IsAlreadyLocked returns true iff err is an instance of ErrAlreadyLocked. From 35a816e8abcb7106fa05c650e086658faadc6725 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Sat, 24 Feb 2018 13:34:42 +0100 Subject: [PATCH 2/2] Add entry to changelog --- changelog/0.8.3/pull-1638 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 changelog/0.8.3/pull-1638 diff --git a/changelog/0.8.3/pull-1638 b/changelog/0.8.3/pull-1638 new file mode 100644 index 000000000..d2697b3f1 --- /dev/null +++ b/changelog/0.8.3/pull-1638 @@ -0,0 +1,16 @@ +Bugfix: Handle errors listing files in the backend + +A user reported in the forum that restic completes a backup although a +concurrent `prune` operation was running. A few error messages were printed, +but the backup was attempted and completed successfully. No error code was +returned. + +This should not happen: The repository is exclusively locked during `prune`, so +when `restic backup` is run in parallel, it should abort and return an error +code instead. + +It was found that the bug was in the code introduced only recently, which +retries a List() operation on the backend should that fail. It is now corrected. + +https://github.com/restic/restic/pull/1638 +https://forum.restic.net/t/restic-backup-returns-0-exit-code-when-already-locked/484