From c147422ba5a4247b8303990f9273e20291b9b317 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 4 Sep 2022 10:49:16 +0200 Subject: [PATCH] repository: special case SaveBlob for all zero chunks Sparse files contain large regions containing only zero bytes. Checking that a blob only contains zeros is possible with over 100GB/s for modern x86 CPUs. Calculating sha256 hashes is only possible with 500MB/s (or 2GB/s using hardware acceleration). Thus we can speed up the hash calculation for all zero blobs (which always have length chunker.MinSize) by checking for zero bytes and then using the precomputed hash. The all zeros check is only performed for blobs with the minimal chunk size, and thus should add no overhead most of the time. For chunks which are not all zero but have the minimal chunk size, the overhead will be below 2% based on the above performance numbers. This allows reading sparse sections of files as fast as the kernel can return data to us. On my system using BTRFS this resulted in about 4GB/s. --- internal/repository/repository.go | 20 +++++++++++++++++++- internal/restorer/filerestorer.go | 3 +-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/internal/repository/repository.go b/internal/repository/repository.go index 625ad9b16..f41ce38a5 100644 --- a/internal/repository/repository.go +++ b/internal/repository/repository.go @@ -813,7 +813,14 @@ func (r *Repository) SaveBlob(ctx context.Context, t restic.BlobType, buf []byte // compute plaintext hash if not already set if id.IsNull() { - newID = restic.Hash(buf) + // Special case the hash calculation for all zero chunks. This is especially + // useful for sparse files containing large all zero regions. For these we can + // process chunks as fast as we can read them from disk.
+ if len(buf) == chunker.MinSize && restic.ZeroPrefixLen(buf) == chunker.MinSize { + newID = ZeroChunk() + } else { + newID = restic.Hash(buf) + } } else { newID = id } @@ -967,3 +974,14 @@ func streamPackPart(ctx context.Context, beLoad BackendLoadFn, key *crypto.Key, }) return errors.Wrap(err, "StreamPack") } + +var zeroChunkOnce sync.Once +var zeroChunkID restic.ID + +// ZeroChunk computes and returns (cached) the ID of an all-zero chunk with size chunker.MinSize. +func ZeroChunk() restic.ID { + zeroChunkOnce.Do(func() { + zeroChunkID = restic.Hash(make([]byte, chunker.MinSize)) + }) + return zeroChunkID +} diff --git a/internal/restorer/filerestorer.go b/internal/restorer/filerestorer.go index 659458cd8..2deef1cd2 100644 --- a/internal/restorer/filerestorer.go +++ b/internal/restorer/filerestorer.go @@ -7,7 +7,6 @@ import ( "golang.org/x/sync/errgroup" - "github.com/restic/chunker" "github.com/restic/restic/internal/crypto" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" @@ -76,7 +75,7 @@ func newFileRestorer(dst string, idx: idx, packLoader: packLoader, filesWriter: newFilesWriter(workerCount), - zeroChunk: restic.Hash(make([]byte, chunker.MinSize)), + zeroChunk: repository.ZeroChunk(), sparse: sparse, workerCount: workerCount, dst: dst,