2
2
mirror of https://github.com/octoleo/restic.git synced 2025-01-26 16:48:29 +00:00

Merge pull request #2941 from MichaelEischer/parallel-repack

prune: Parallelize repack step
This commit is contained in:
Alexander Neumann 2020-11-05 11:00:41 +01:00 committed by GitHub
commit 636b2f2e94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 157 additions and 70 deletions

View File

@ -0,0 +1,8 @@
Enhancement: Speed up repacking step of prune command
The repack step of the prune command, which moves still-used file parts into
new pack files so that the old ones can be garbage-collected later on, now
processes multiple pack files in parallel. This is especially beneficial for
high-latency backends or when using a fast network connection.
https://github.com/restic/restic/pull/2941

View File

@ -2,18 +2,26 @@ package repository
import ( import (
"context" "context"
"os"
"sync"
"github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/fs" "github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/pack" "github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/restic"
"golang.org/x/sync/errgroup"
) )
const numRepackWorkers = 8
// Repack takes a list of packs together with a list of blobs contained in // Repack takes a list of packs together with a list of blobs contained in
// these packs. Each pack is loaded and the blobs listed in keepBlobs are saved // these packs. Each pack is loaded and the blobs listed in keepBlobs are saved
// into a new pack. Returned is the list of obsolete packs which can then // into a new pack. Returned is the list of obsolete packs which can then
// be removed. // be removed.
//
// The map keepBlobs is modified by Repack; it is used to keep track of which
// blobs have been processed.
func Repack(ctx context.Context, repo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *restic.Progress) (obsoletePacks restic.IDSet, err error) { func Repack(ctx context.Context, repo restic.Repository, packs restic.IDSet, keepBlobs restic.BlobSet, p *restic.Progress) (obsoletePacks restic.IDSet, err error) {
if p != nil { if p != nil {
p.Start() p.Start()
@ -22,36 +30,86 @@ func Repack(ctx context.Context, repo restic.Repository, packs restic.IDSet, kee
debug.Log("repacking %d packs while keeping %d blobs", len(packs), len(keepBlobs)) debug.Log("repacking %d packs while keeping %d blobs", len(packs), len(keepBlobs))
wg, ctx := errgroup.WithContext(ctx)
downloadQueue := make(chan restic.ID)
wg.Go(func() error {
defer close(downloadQueue)
for packID := range packs { for packID := range packs {
select {
case downloadQueue <- packID:
case <-ctx.Done():
return ctx.Err()
}
}
return nil
})
type repackJob struct {
tempfile *os.File
hash restic.ID
packLength int64
}
processQueue := make(chan repackJob)
// used to close processQueue once all downloaders have finished
var downloadWG sync.WaitGroup
downloader := func() error {
defer downloadWG.Done()
for packID := range downloadQueue {
// load the complete pack into a temp file // load the complete pack into a temp file
h := restic.Handle{Type: restic.PackFile, Name: packID.String()} h := restic.Handle{Type: restic.PackFile, Name: packID.String()}
tempfile, hash, packLength, err := DownloadAndHash(ctx, repo.Backend(), h) tempfile, hash, packLength, err := DownloadAndHash(ctx, repo.Backend(), h)
if err != nil { if err != nil {
return nil, errors.Wrap(err, "Repack") return errors.Wrap(err, "Repack")
} }
debug.Log("pack %v loaded (%d bytes), hash %v", packID, packLength, hash) debug.Log("pack %v loaded (%d bytes), hash %v", packID, packLength, hash)
if !packID.Equal(hash) { if !packID.Equal(hash) {
return nil, errors.Errorf("hash does not match id: want %v, got %v", packID, hash) return errors.Errorf("hash does not match id: want %v, got %v", packID, hash)
} }
_, err = tempfile.Seek(0, 0) select {
if err != nil { case processQueue <- repackJob{tempfile, hash, packLength}:
return nil, errors.Wrap(err, "Seek") case <-ctx.Done():
return ctx.Err()
} }
}
return nil
}
downloadWG.Add(numRepackWorkers)
for i := 0; i < numRepackWorkers; i++ {
wg.Go(downloader)
}
wg.Go(func() error {
downloadWG.Wait()
close(processQueue)
return nil
})
var keepMutex sync.Mutex
worker := func() error {
for job := range processQueue {
tempfile, packID, packLength := job.tempfile, job.hash, job.packLength
blobs, err := pack.List(repo.Key(), tempfile, packLength) blobs, err := pack.List(repo.Key(), tempfile, packLength)
if err != nil { if err != nil {
return nil, err return err
} }
debug.Log("processing pack %v, blobs: %v", packID, len(blobs)) debug.Log("processing pack %v, blobs: %v", packID, len(blobs))
var buf []byte var buf []byte
for _, entry := range blobs { for _, entry := range blobs {
h := restic.BlobHandle{ID: entry.ID, Type: entry.Type} h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
if !keepBlobs.Has(h) {
keepMutex.Lock()
shouldKeep := keepBlobs.Has(h)
keepMutex.Unlock()
if !shouldKeep {
continue continue
} }
@ -64,50 +122,70 @@ func Repack(ctx context.Context, repo restic.Repository, packs restic.IDSet, kee
n, err := tempfile.ReadAt(buf, int64(entry.Offset)) n, err := tempfile.ReadAt(buf, int64(entry.Offset))
if err != nil { if err != nil {
return nil, errors.Wrap(err, "ReadAt") return errors.Wrap(err, "ReadAt")
} }
if n != len(buf) { if n != len(buf) {
return nil, errors.Errorf("read blob %v from %v: not enough bytes read, want %v, got %v", return errors.Errorf("read blob %v from %v: not enough bytes read, want %v, got %v",
h, tempfile.Name(), len(buf), n) h, tempfile.Name(), len(buf), n)
} }
nonce, ciphertext := buf[:repo.Key().NonceSize()], buf[repo.Key().NonceSize():] nonce, ciphertext := buf[:repo.Key().NonceSize()], buf[repo.Key().NonceSize():]
plaintext, err := repo.Key().Open(ciphertext[:0], nonce, ciphertext, nil) plaintext, err := repo.Key().Open(ciphertext[:0], nonce, ciphertext, nil)
if err != nil { if err != nil {
return nil, err return err
} }
id := restic.Hash(plaintext) id := restic.Hash(plaintext)
if !id.Equal(entry.ID) { if !id.Equal(entry.ID) {
debug.Log("read blob %v/%v from %v: wrong data returned, hash is %v", debug.Log("read blob %v/%v from %v: wrong data returned, hash is %v",
h.Type, h.ID, tempfile.Name(), id) h.Type, h.ID, tempfile.Name(), id)
return nil, errors.Errorf("read blob %v from %v: wrong data returned, hash is %v", return errors.Errorf("read blob %v from %v: wrong data returned, hash is %v",
h, tempfile.Name(), id) h, tempfile.Name(), id)
} }
keepMutex.Lock()
// recheck whether some other worker was faster
shouldKeep = keepBlobs.Has(h)
if shouldKeep {
keepBlobs.Delete(h)
}
keepMutex.Unlock()
if !shouldKeep {
continue
}
// We do want to save already saved blobs! // We do want to save already saved blobs!
_, _, err = repo.SaveBlob(ctx, entry.Type, plaintext, entry.ID, true) _, _, err = repo.SaveBlob(ctx, entry.Type, plaintext, entry.ID, true)
if err != nil { if err != nil {
return nil, err return err
} }
debug.Log(" saved blob %v", entry.ID) debug.Log(" saved blob %v", entry.ID)
keepBlobs.Delete(h)
} }
if err = tempfile.Close(); err != nil { if err = tempfile.Close(); err != nil {
return nil, errors.Wrap(err, "Close") return errors.Wrap(err, "Close")
} }
if err = fs.RemoveIfExists(tempfile.Name()); err != nil { if err = fs.RemoveIfExists(tempfile.Name()); err != nil {
return nil, errors.Wrap(err, "Remove") return errors.Wrap(err, "Remove")
} }
if p != nil { if p != nil {
p.Report(restic.Stat{Blobs: 1}) p.Report(restic.Stat{Blobs: 1})
} }
} }
return nil
}
for i := 0; i < numRepackWorkers; i++ {
wg.Go(worker)
}
if err := wg.Wait(); err != nil {
return nil, err
}
if err := repo.Flush(ctx); err != nil { if err := repo.Flush(ctx); err != nil {
return nil, err return nil, err

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"math/rand" "math/rand"
"testing" "testing"
"time"
"github.com/restic/restic/internal/index" "github.com/restic/restic/internal/index"
"github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/repository"
@ -195,7 +196,7 @@ func TestRepack(t *testing.T) {
repo, cleanup := repository.TestRepository(t) repo, cleanup := repository.TestRepository(t)
defer cleanup() defer cleanup()
seed := rand.Int63() seed := time.Now().UnixNano()
rand.Seed(seed) rand.Seed(seed)
t.Logf("rand seed is %v", seed) t.Logf("rand seed is %v", seed)
@ -262,7 +263,7 @@ func TestRepackWrongBlob(t *testing.T) {
repo, cleanup := repository.TestRepository(t) repo, cleanup := repository.TestRepository(t)
defer cleanup() defer cleanup()
seed := rand.Int63() seed := time.Now().UnixNano()
rand.Seed(seed) rand.Seed(seed)
t.Logf("rand seed is %v", seed) t.Logf("rand seed is %v", seed)
@ -277,5 +278,5 @@ func TestRepackWrongBlob(t *testing.T) {
if err == nil { if err == nil {
t.Fatal("expected repack to fail but got no error") t.Fatal("expected repack to fail but got no error")
} }
t.Log(err) t.Logf("found expected error: %v", err)
} }

View File

@ -758,7 +758,7 @@ type Loader interface {
// DownloadAndHash is all-in-one helper to download content of the file at h to a temporary filesystem location // DownloadAndHash is all-in-one helper to download content of the file at h to a temporary filesystem location
// and calculate ID of the contents. Returned (temporary) file is positioned at the beginning of the file; // and calculate ID of the contents. Returned (temporary) file is positioned at the beginning of the file;
// it is responsibility of the caller to close and delete the file. // it is the responsibility of the caller to close and delete the file.
func DownloadAndHash(ctx context.Context, be Loader, h restic.Handle) (tmpfile *os.File, hash restic.ID, size int64, err error) { func DownloadAndHash(ctx context.Context, be Loader, h restic.Handle) (tmpfile *os.File, hash restic.ID, size int64, err error) {
tmpfile, err = fs.TempFile("", "restic-temp-") tmpfile, err = fs.TempFile("", "restic-temp-")
if err != nil { if err != nil {