From c147422ba5a4247b8303990f9273e20291b9b317 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 4 Sep 2022 10:49:16 +0200 Subject: [PATCH] repository: special case SaveBlob for all zero chunks Sparse files contain large regions containing only zero bytes. Checking that a blob only contains zeros is possible with over 100GB/s for modern x86 CPUs. Calculating sha256 hashes is only possible with 500MB/s (or 2GB/s using hardware acceleration). Thus we can speed up the hash calculation for all zero blobs (which always have length chunker.MinSize) by checking for zero bytes and then using the precomputed hash. The all zeros check is only performed for blobs with the minimal chunk size, and thus should add no overhead most of the time. For chunks which are not all zero but have the minimal chunk size, the overhead will be below 2% based on the above performance numbers. This allows reading sparse sections of files as fast as the kernel can return data to us. On my system using BTRFS this resulted in about 4GB/s. --- internal/repository/repository.go | 20 +++++++++++++++++++- internal/restorer/filerestorer.go | 3 +-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/internal/repository/repository.go b/internal/repository/repository.go index 625ad9b16..f41ce38a5 100644 --- a/internal/repository/repository.go +++ b/internal/repository/repository.go @@ -813,7 +813,14 @@ func (r *Repository) SaveBlob(ctx context.Context, t restic.BlobType, buf []byte // compute plaintext hash if not already set if id.IsNull() { - newID = restic.Hash(buf) + // Special case the hash calculation for all zero chunks. This is especially + // useful for sparse files containing large all zero regions. For these we can + // process chunks as fast as we can read them from disk.
+ if len(buf) == chunker.MinSize && restic.ZeroPrefixLen(buf) == chunker.MinSize { + newID = ZeroChunk() + } else { + newID = restic.Hash(buf) + } } else { newID = id } @@ -967,3 +974,14 @@ func streamPackPart(ctx context.Context, beLoad BackendLoadFn, key *crypto.Key, }) return errors.Wrap(err, "StreamPack") } + +var zeroChunkOnce sync.Once +var zeroChunkID restic.ID + +// ZeroChunk computes and returns (cached) the ID of an all-zero chunk with size chunker.MinSize. +func ZeroChunk() restic.ID { + zeroChunkOnce.Do(func() { + zeroChunkID = restic.Hash(make([]byte, chunker.MinSize)) + }) + return zeroChunkID +} diff --git a/internal/restorer/filerestorer.go b/internal/restorer/filerestorer.go index 659458cd8..2deef1cd2 100644 --- a/internal/restorer/filerestorer.go +++ b/internal/restorer/filerestorer.go @@ -7,7 +7,6 @@ import ( "golang.org/x/sync/errgroup" - "github.com/restic/chunker" "github.com/restic/restic/internal/crypto" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" @@ -76,7 +75,7 @@ func newFileRestorer(dst string, idx: idx, packLoader: packLoader, filesWriter: newFilesWriter(workerCount), - zeroChunk: restic.Hash(make([]byte, chunker.MinSize)), + zeroChunk: repository.ZeroChunk(), sparse: sparse, workerCount: workerCount, dst: dst,