Chaining hash table for repository.Index

These are faster to construct but slower to access. The allocation rate is
halved and the peak memory usage is almost halved, compared to the standard
Go map.

Benchmark results on linux/amd64, -benchtime=3s -count=20:

name                                            old time/op    new time/op    delta
PackerManager-8                                 178ms ± 0%     178ms ± 0%     ~         (p=0.231 n=20+20)
DecodeIndex-8                                   4.54s ± 0%     4.30s ± 0%     -5.20%    (p=0.000 n=18+17)
DecodeIndexParallel-8                           4.54s ± 0%     4.30s ± 0%     -5.22%    (p=0.000 n=19+18)
IndexHasUnknown-8                               44.4ns ± 5%    50.5ns ±11%    +13.82%   (p=0.000 n=19+17)
IndexHasKnown-8                                 48.3ns ± 0%    51.5ns ±12%    +6.68%    (p=0.001 n=16+20)
IndexAlloc-8                                    758ms ± 1%     616ms ± 1%     -18.69%   (p=0.000 n=19+19)
IndexAllocParallel-8                            234ms ± 3%     204ms ± 2%     -12.60%   (p=0.000 n=20+18)
MasterIndexLookupSingleIndex-8                  122ns ± 0%     145ns ± 9%     +18.44%   (p=0.000 n=14+20)
MasterIndexLookupMultipleIndex-8                369ns ± 2%     429ns ± 8%     +16.27%   (p=0.000 n=20+20)
MasterIndexLookupSingleIndexUnknown-8           68.4ns ± 5%    74.9ns ±13%    +9.47%    (p=0.000 n=20+20)
MasterIndexLookupMultipleIndexUnknown-8         315ns ± 3%     369ns ±11%     +17.14%   (p=0.000 n=20+20)
MasterIndexLookupParallel/known,indices=5-8     743ns ± 1%     816ns ± 2%     +9.87%    (p=0.000 n=17+17)
MasterIndexLookupParallel/unknown,indices=5-8   238ns ± 1%     260ns ± 2%     +9.14%    (p=0.000 n=19+20)
MasterIndexLookupParallel/known,indices=10-8    1.01µs ± 3%    1.11µs ± 2%    +9.79%    (p=0.000 n=19+20)
MasterIndexLookupParallel/unknown,indices=10-8  222ns ± 0%     269ns ± 2%     +20.83%   (p=0.000 n=16+20)
MasterIndexLookupParallel/known,indices=20-8    1.06µs ± 2%    1.19µs ± 2%    +12.95%   (p=0.000 n=19+18)
MasterIndexLookupParallel/unknown,indices=20-8  413ns ± 1%     530ns ± 1%     +28.19%   (p=0.000 n=18+20)
SaveAndEncrypt-8                                30.2ms ± 1%    30.4ms ± 0%    +0.71%    (p=0.000 n=19+19)
LoadTree-8                                      540µs ± 1%     576µs ± 1%     +6.73%    (p=0.000 n=20+20)
LoadBlob-8                                      5.64ms ± 0%    5.64ms ± 0%    ~         (p=0.883 n=18+17)
LoadAndDecrypt-8                                5.93ms ± 0%    5.95ms ± 1%    ~         (p=0.247 n=20+19)
LoadIndex-8                                     25.1ms ± 0%    24.5ms ± 1%    -2.54%    (p=0.000 n=18+17)

name                                            old speed      new speed      delta
PackerManager-8                                 296MB/s ± 0%   296MB/s ± 0%   ~         (p=0.229 n=20+20)
SaveAndEncrypt-8                                139MB/s ± 1%   138MB/s ± 0%   -0.71%    (p=0.000 n=19+19)
LoadBlob-8                                      177MB/s ± 0%   177MB/s ± 0%   ~         (p=0.890 n=18+17)
LoadAndDecrypt-8                                169MB/s ± 0%   168MB/s ± 1%   ~         (p=0.227 n=20+19)

name                                            old alloc/op   new alloc/op   delta
PackerManager-8                                 91.8kB ± 0%    91.8kB ± 0%    ~         (p=0.772 n=12+19)
IndexAlloc-8                                    786MB ± 0%     400MB ± 0%     -49.04%   (p=0.000 n=20+18)
IndexAllocParallel-8                            786MB ± 0%     401MB ± 0%     -49.04%   (p=0.000 n=19+15)
SaveAndEncrypt-8                                21.0MB ± 0%    21.0MB ± 0%    +0.00%    (p=0.000 n=19+19)

name                                            old allocs/op  new allocs/op  delta
PackerManager-8                                 1.41k ± 0%     1.41k ± 0%     ~         (all equal)
IndexAlloc-8                                    977k ± 0%      907k ± 0%      -7.18%    (p=0.000 n=20+20)
IndexAllocParallel-8                            977k ± 0%      907k ± 0%      -7.17%    (p=0.000 n=19+15)
SaveAndEncrypt-8                                73.0 ± 0%      73.0 ± 0%      ~         (all equal)
2024-11-22 12:55:18 +00:00 · 2020-06-23 22:13:25 +02:00 · 2020-06-23 22:13:25 +02:00 · 7bda28f31f
commit 7bda28f31f
parent 255ba83c4b
7 changed files with 395 additions and 118 deletions
--- a/changelog/unreleased/pull-2781
+++ b/changelog/unreleased/pull-2781
@@ -1,6 +1,8 @@
 Enhancement: Reduce memory consumption of in-memory index

 We've improved how the index is stored in memory.
-This change reduces memory usage for large repositories by about 30-40%.
+This change can reduce memory usage for large repositories by up to 50%
+(depending on the operation).

 https://github.com/restic/restic/pull/2781
+https://github.com/restic/restic/pull/2812
--- a/go.mod
+++ b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/cenkalti/backoff v2.1.1+incompatible
 	github.com/cespare/xxhash v1.1.0
 	github.com/cpuguy83/go-md2man v1.0.10 // indirect
+	github.com/dchest/siphash v1.2.1
 	github.com/dnaeon/go-vcr v1.0.1 // indirect
 	github.com/elithrar/simple-scrypt v1.3.0
 	github.com/golang/protobuf v1.3.1 // indirect
--- a/go.sum
+++ b/go.sum
@@ -38,6 +38,8 @@ github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwc
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dchest/siphash v1.2.1 h1:4cLinnzVJDKxTCl9B01807Yiy+W7ZzVHj/KIroQRvT4=
+github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
 github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
 github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
 github.com/dnaeon/go-vcr v1.0.1 h1:r8L/HqC0Hje5AXMu1ooW8oyQyOFv4GxqpL0nRP7SLLY=
--- a/internal/repository/index.go
+++ b/internal/repository/index.go
@@ -19,40 +19,33 @@ import (
 // Hence the index data structure defined here is one of the main contributions
 // to the total memory requirements of restic.
 //
-// We use two maps to store each index entry.
-// The first map stores the first entry of a blobtype/blobID
-// The key of the map is a BlobHandle
-// The entries are the actual index entries.
-// In the second map we store duplicate index entries, i.e. entries with same
-// blobtype/blobID
+// We store the index entries in indexMaps. In these maps, entries take 56
+// bytes each, plus 8/4 = 2 bytes of unused pointers on average, not counting
+// malloc and header struct overhead and ignoring duplicates (those are only
+// present in edge cases and are also removed by prune runs).
+//
 // In the index entries, we need to reference the packID. As one pack may
 // contain many blobs the packIDs are saved in a separate array and only the index
 // within this array is saved in the indexEntry
 //
-// To compute the needed amount of memory, we need some assumptions.
-// Maps need an overhead of allocated but not needed elements.
-// For computations, we assume an overhead of 50% and use OF=1.5 (overhead factor)
-// As duplicates are only present in edge cases and are also removed by prune runs,
-// we assume that there are no significant duplicates and omit them in the calculations.
-// Moreover we asssume on average a minimum of 8 blobs per pack; BP=8
+// We assume on average a minimum of 8 blobs per pack; BP=8.
 // (Note that for large files there should be 3 blobs per pack as the average chunk
 // size is 1.5 MB and the minimum pack size is 4 MB)
 //
 // We have the following sizes:
-// key: 32 + 1 = 33 bytes
-// indexEntry:  8 + 4 + 4 = 16 bytes
+// indexEntry:  56 bytes  (on amd64)
 // each packID: 32 bytes
 //
 // To save N index entries, we therefore need:
-// N * OF * (33 + 16) bytes + N * 32 bytes / BP = N * 78 bytes
+// N * (56 + 2) bytes + N * 32 bytes / BP = N * 62 bytes,
+// i.e., fewer than 64 bytes per blob in an index.

 // Index holds lookup tables for id -> pack.
 type Index struct {
-	m          sync.Mutex
-	blob       map[restic.BlobHandle]indexEntry
-	duplicates map[restic.BlobHandle][]indexEntry
-	packs      restic.IDs
-	treePacks  restic.IDs
+	m         sync.Mutex
+	byType    [restic.NumBlobTypes]indexMap
+	packs     restic.IDs
+	treePacks restic.IDs
 	// only used by Store, StorePacks does not check for already saved packIDs
 	packIDToIndex map[restic.ID]int

@@ -62,36 +55,14 @@ type Index struct {
 	created    time.Time
 }

-type indexEntry struct {
-	// only save index do packs; i.e. packs[packindex] yields the packID
-	packIndex int
-	offset    uint32
-	length    uint32
-}
-
 // NewIndex returns a new index.
 func NewIndex() *Index {
 	return &Index{
-		blob:          make(map[restic.BlobHandle]indexEntry),
-		duplicates:    make(map[restic.BlobHandle][]indexEntry),
 		packIDToIndex: make(map[restic.ID]int),
 		created:       time.Now(),
 	}
 }

-// withDuplicates returns the list of all entries for the given blob handle
-func (idx *Index) withDuplicates(h restic.BlobHandle, entry indexEntry) []indexEntry {
-	entries, ok := idx.duplicates[h]
-	if ok {
-		all := make([]indexEntry, len(entries)+1)
-		all[0] = entry
-		copy(all[1:], entries)
-		return all
-	}
-
-	return []indexEntry{entry}
-}
-
 // addToPacks saves the given pack ID and return the index.
 // This procedere allows to use pack IDs which can be easily garbage collected after.
 func (idx *Index) addToPacks(id restic.ID) int {
@@ -106,17 +77,9 @@ func (idx *Index) store(packIndex int, blob restic.Blob) {
 	if blob.Offset > maxuint32 || blob.Length > maxuint32 {
 		panic("offset or length does not fit in uint32. You have packs > 4GB!")
 	}
-	newEntry := indexEntry{
-		packIndex: packIndex,
-		offset:    uint32(blob.Offset),
-		length:    uint32(blob.Length),
-	}
-	h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
-	if _, ok := idx.blob[h]; ok {
-		idx.duplicates[h] = append(idx.duplicates[h], newEntry)
-	} else {
-		idx.blob[h] = newEntry
-	}
+
+	m := &idx.byType[blob.Type]
+	m.add(blob.ID, packIndex, uint32(blob.Offset), uint32(blob.Length))
 }

 // Final returns true iff the index is already written to the repository, it is
@@ -140,7 +103,10 @@ var IndexFull = func(idx *Index) bool {

 	debug.Log("checking whether index %p is full", idx)

-	blobs := len(idx.blob)
+	var blobs uint
+	for typ := range idx.byType {
+		blobs += idx.byType[typ].len()
+	}
 	age := time.Now().Sub(idx.created)

 	switch {
@@ -196,16 +162,15 @@ func (idx *Index) StorePack(id restic.ID, blobs []restic.Blob) {
 	}
 }

-// ListPack returns a list of blobs contained in a pack.
-func (idx *Index) indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.PackedBlob {
+func (idx *Index) toPackedBlob(e *indexEntry, typ restic.BlobType) restic.PackedBlob {
 	return restic.PackedBlob{
 		Blob: restic.Blob{
-			ID:     h.ID,
-			Type:   h.Type,
-			Length: uint(entry.length),
-			Offset: uint(entry.offset),
+			ID:     e.id,
+			Type:   typ,
+			Length: uint(e.length),
+			Offset: uint(e.offset),
 		},
-		PackID: idx.packs[entry.packIndex],
+		PackID: idx.packs[e.packIndex],
 	}
 }

@@ -214,21 +179,11 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	h := restic.BlobHandle{ID: id, Type: tpe}
+	idx.byType[tpe].foreachWithID(id, func(e *indexEntry) {
+		blobs = append(blobs, idx.toPackedBlob(e, tpe))
+	})

-	blob, ok := idx.blob[h]
-	if ok {
-		blobList := idx.withDuplicates(h, blob)
-		blobs = make([]restic.PackedBlob, 0, len(blobList))
-
-		for _, p := range blobList {
-			blobs = append(blobs, idx.indexEntryToPackedBlob(h, p))
-		}
-
-		return blobs, true
-	}
-
-	return nil, false
+	return blobs, len(blobs) > 0
 }

 // ListPack returns a list of blobs contained in a pack.
@@ -236,12 +191,14 @@ func (idx *Index) ListPack(id restic.ID) (list []restic.PackedBlob) {
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	for h, entry := range idx.blob {
-		for _, blob := range idx.withDuplicates(h, entry) {
-			if idx.packs[blob.packIndex] == id {
-				list = append(list, idx.indexEntryToPackedBlob(h, blob))
+	for typ := range idx.byType {
+		m := &idx.byType[typ]
+		m.foreach(func(e *indexEntry) bool {
+			if idx.packs[e.packIndex] == id {
+				list = append(list, idx.toPackedBlob(e, restic.BlobType(typ)))
 			}
-		}
+			return true
+		})
 	}

 	return list
@@ -252,21 +209,20 @@ func (idx *Index) Has(id restic.ID, tpe restic.BlobType) bool {
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	h := restic.BlobHandle{ID: id, Type: tpe}
-
-	_, ok := idx.blob[h]
-	return ok
+	return idx.byType[tpe].get(id) != nil
 }

 // LookupSize returns the length of the plaintext content of the blob with the
 // given id.
 func (idx *Index) LookupSize(id restic.ID, tpe restic.BlobType) (plaintextLength uint, found bool) {
-	blobs, found := idx.Lookup(id, tpe)
-	if !found {
-		return 0, found
-	}
+	idx.m.Lock()
+	defer idx.m.Unlock()

-	return uint(restic.PlaintextLength(int(blobs[0].Length))), true
+	e := idx.byType[tpe].get(id)
+	if e == nil {
+		return 0, false
+	}
+	return uint(restic.PlaintextLength(int(e.length))), true
 }

 // Supersedes returns the list of indexes this index supersedes, if any.
@@ -302,14 +258,16 @@ func (idx *Index) Each(ctx context.Context) <-chan restic.PackedBlob {
 			close(ch)
 		}()

-		for h, entry := range idx.blob {
-			for _, blob := range idx.withDuplicates(h, entry) {
+		for typ := range idx.byType {
+			m := &idx.byType[typ]
+			m.foreach(func(e *indexEntry) bool {
 				select {
 				case <-ctx.Done():
-					return
-				case ch <- idx.indexEntryToPackedBlob(h, blob):
+					return false
+				case ch <- idx.toPackedBlob(e, restic.BlobType(typ)):
+					return true
 				}
-			}
+			})
 		}
 	}()

@@ -335,20 +293,7 @@ func (idx *Index) Count(t restic.BlobType) (n uint) {
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	for h := range idx.blob {
-		if h.Type != t {
-			continue
-		}
-		n++
-	}
-	for h, dups := range idx.duplicates {
-		if h.Type != t {
-			continue
-		}
-		n += uint(len(dups))
-	}
-
-	return
+	return idx.byType[t].len()
 }

 type packJSON struct {
@@ -368,14 +313,15 @@ func (idx *Index) generatePackList() ([]*packJSON, error) {
 	list := []*packJSON{}
 	packs := make(map[restic.ID]*packJSON)

-	for h, entry := range idx.blob {
-		for _, blob := range idx.withDuplicates(h, entry) {
-			packID := idx.packs[blob.packIndex]
+	for typ := range idx.byType {
+		m := &idx.byType[typ]
+		m.foreach(func(e *indexEntry) bool {
+			packID := idx.packs[e.packIndex]
 			if packID.IsNull() {
 				panic("null pack id")
 			}

-			debug.Log("handle blob %v", h)
+			debug.Log("handle blob %v", e.id)

 			// see if pack is already in map
 			p, ok := packs[packID]
@@ -390,12 +336,14 @@ func (idx *Index) generatePackList() ([]*packJSON, error) {

 			// add blob
 			p.Blobs = append(p.Blobs, blobJSON{
-				ID:     h.ID,
-				Type:   h.Type,
-				Offset: uint(blob.offset),
-				Length: uint(blob.length),
+				ID:     e.id,
+				Type:   restic.BlobType(typ),
+				Offset: uint(e.offset),
+				Length: uint(e.length),
 			})
-		}
+
+			return true
+		})
 	}

 	debug.Log("done")
--- a/internal/repository/indexmap.go
+++ b/internal/repository/indexmap.go
@@ -0,0 +1,168 @@
+package repository
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+
+	"github.com/restic/restic/internal/restic"
+
+	"github.com/dchest/siphash"
+)
+
+// An indexMap is a chained hash table that maps blob IDs to indexEntries.
+// It allows storing multiple entries with the same key.
+//
+// indexMap uses some optimizations that are not compatible with supporting
+// deletions.
+//
+// The buckets in this hash table contain only pointers, rather than inlined
+// key-value pairs like the standard Go map. This way, only a pointer array
+// needs to be resized when the table grows, preventing memory usage spikes.
+type indexMap struct {
+	// Empty until the first add; afterwards its length is always a power of two.
+	buckets    []*indexEntry
+	numentries uint
+
+	key0, key1 uint64 // Key for hash randomization.
+
+	free *indexEntry // Free list.
+}
+
+const (
+	growthFactor = 2 // Must be a power of 2.
+	maxLoad      = 4 // Max. average number of entries per bucket.
+)
+
+// add inserts an indexEntry for the given arguments into the map,
+// using id as the key.
+func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32) {
+	switch {
+	case m.numentries == 0: // Lazy initialization.
+		m.init()
+	case m.numentries >= maxLoad*uint(len(m.buckets)):
+		m.grow()
+	}
+
+	h := m.hash(id)
+	e := m.newEntry()
+	e.id = id
+	e.next = m.buckets[h] // Prepend to existing chain.
+	e.packIndex = packIdx
+	e.offset = offset
+	e.length = length
+
+	m.buckets[h] = e
+	m.numentries++
+}
+
+// foreach calls fn for all entries in the map, until fn returns false.
+func (m *indexMap) foreach(fn func(*indexEntry) bool) {
+	for _, e := range m.buckets {
+		for e != nil {
+			if !fn(e) {
+				return
+			}
+			e = e.next
+		}
+	}
+}
+
+// foreachWithID calls fn for all entries with the given id.
+func (m *indexMap) foreachWithID(id restic.ID, fn func(*indexEntry)) {
+	if len(m.buckets) == 0 {
+		return
+	}
+
+	h := m.hash(id)
+	for e := m.buckets[h]; e != nil; e = e.next {
+		if e.id != id {
+			continue
+		}
+		fn(e)
+	}
+}
+
+// get returns the first entry for the given id.
+func (m *indexMap) get(id restic.ID) *indexEntry {
+	if len(m.buckets) == 0 {
+		return nil
+	}
+
+	h := m.hash(id)
+	for e := m.buckets[h]; e != nil; e = e.next {
+		if e.id == id {
+			return e
+		}
+	}
+	return nil
+}
+
+func (m *indexMap) grow() {
+	old := m.buckets
+	m.buckets = make([]*indexEntry, growthFactor*len(m.buckets))
+
+	for _, e := range old {
+		for e != nil {
+			h := m.hash(e.id)
+			next := e.next
+			e.next = m.buckets[h]
+			m.buckets[h] = e
+			e = next
+		}
+	}
+}
+
+func (m *indexMap) hash(id restic.ID) uint {
+	// We use siphash with a randomly generated 128-bit key, to prevent
+	// backups of specially crafted inputs from degrading performance.
+	// While SHA-256 should be collision-resistant, for hash table indices
+	// we use only a few bits of it and finding collisions for those is
+	// much easier than breaking the whole algorithm.
+	h := uint(siphash.Hash(m.key0, m.key1, id[:]))
+	return h & uint(len(m.buckets)-1)
+}
+
+func (m *indexMap) init() {
+	const initialBuckets = 64
+	m.buckets = make([]*indexEntry, initialBuckets)
+
+	var buf [16]byte
+	if _, err := rand.Read(buf[:]); err != nil {
+		panic(err) // Very little we can do here.
+	}
+	m.key0 = binary.LittleEndian.Uint64(buf[:8])
+	m.key1 = binary.LittleEndian.Uint64(buf[8:])
+}
+
+func (m *indexMap) len() uint { return m.numentries }
+
+func (m *indexMap) newEntry() *indexEntry {
+	// Allocating in batches means that we get closer to optimal space usage,
+	// as Go's malloc will overallocate for structures of size 56 (indexEntry
+	// on amd64).
+	//
+	// 256*56 and 256*48 both have minimal malloc overhead among reasonable sizes.
+	// See src/runtime/sizeclasses.go in the standard library.
+	const entryAllocBatch = 256
+
+	if m.free == nil {
+		free := new([entryAllocBatch]indexEntry)
+		for i := range free[:len(free)-1] {
+			free[i].next = &free[i+1]
+		}
+		m.free = &free[0]
+	}
+
+	e := m.free
+	m.free = m.free.next
+
+	return e
+}
+
+type indexEntry struct {
+	id        restic.ID
+	next      *indexEntry
+	packIndex int // Position in containing Index's packs field.
+	offset    uint32
+	length    uint32
+}
--- a/internal/repository/indexmap_test.go
+++ b/internal/repository/indexmap_test.go
@@ -0,0 +1,155 @@
+package repository
+
+import (
+	"math/rand"
+	"testing"
+	"time"
+
+	"github.com/restic/restic/internal/restic"
+	rtest "github.com/restic/restic/internal/test"
+)
+
+func TestIndexMapBasic(t *testing.T) {
+	t.Parallel()
+
+	var (
+		id restic.ID
+		m  indexMap
+		r  = rand.New(rand.NewSource(98765))
+	)
+
+	for i := 1; i <= 400; i++ {
+		r.Read(id[:])
+		rtest.Assert(t, m.get(id) == nil, "%v retrieved but not added", id)
+
+		m.add(id, 0, 0, 0)
+		rtest.Assert(t, m.get(id) != nil, "%v added but not retrieved", id)
+		rtest.Equals(t, uint(i), m.len())
+	}
+}
+
+func TestIndexMapForeach(t *testing.T) {
+	t.Parallel()
+
+	const N = 10
+
+	var m indexMap
+
+	// Don't crash on empty map.
+	m.foreach(func(*indexEntry) bool { return true })
+
+	for i := 0; i < N; i++ {
+		var id restic.ID
+		id[0] = byte(i)
+		m.add(id, i, uint32(i), uint32(i))
+	}
+
+	seen := make(map[int]struct{})
+	m.foreach(func(e *indexEntry) bool {
+		i := int(e.id[0])
+		rtest.Assert(t, i < N, "unknown id %v in indexMap", e.id)
+		rtest.Equals(t, i, e.packIndex)
+		rtest.Equals(t, i, int(e.length))
+		rtest.Equals(t, i, int(e.offset))
+
+		seen[i] = struct{}{}
+		return true
+	})
+
+	rtest.Equals(t, N, len(seen))
+
+	ncalls := 0
+	m.foreach(func(*indexEntry) bool {
+		ncalls++
+		return false
+	})
+	rtest.Equals(t, 1, ncalls)
+}
+
+func TestIndexMapForeachWithID(t *testing.T) {
+	t.Parallel()
+
+	const ndups = 3
+
+	var (
+		id restic.ID
+		m  indexMap
+		r  = rand.New(rand.NewSource(1234321))
+	)
+	r.Read(id[:])
+
+	// No result (and no crash) for empty map.
+	n := 0
+	m.foreachWithID(id, func(*indexEntry) { n++ })
+	rtest.Equals(t, 0, n)
+
+	// Test insertion and retrieval of duplicates.
+	for i := 0; i < ndups; i++ {
+		m.add(id, i, 0, 0)
+	}
+
+	for i := 0; i < 100; i++ {
+		var otherid restic.ID
+		r.Read(otherid[:])
+		m.add(otherid, -1, 0, 0)
+	}
+
+	n = 0
+	var packs [ndups]bool
+	m.foreachWithID(id, func(e *indexEntry) {
+		packs[e.packIndex] = true
+		n++
+	})
+	rtest.Equals(t, ndups, n)
+
+	for i := range packs {
+		rtest.Assert(t, packs[i], "duplicate from pack %d not retrieved", i)
+	}
+}
+
+func TestIndexMapHash(t *testing.T) {
+	t.Parallel()
+
+	var m1, m2 indexMap
+
+	id := restic.NewRandomID()
+	// Add to both maps to initialize them.
+	m1.add(id, 0, 0, 0)
+	m2.add(id, 0, 0, 0)
+
+	h1 := m1.hash(id)
+	h2 := m2.hash(id)
+
+	rtest.Equals(t, len(m1.buckets), len(m2.buckets)) // just to be sure
+
+	if h1 == h2 {
+		// The probability of the zero key should be 2^(-128).
+		if m1.key0 == 0 && m1.key1 == 0 {
+			t.Error("siphash key not set for m1")
+		}
+		if m2.key0 == 0 && m2.key1 == 0 {
+			t.Error("siphash key not set for m2")
+		}
+	}
+}
+
+func BenchmarkIndexMapHash(b *testing.B) {
+	var m indexMap
+	m.add(restic.ID{}, 0, 0, 0) // Trigger lazy initialization.
+
+	ids := make([]restic.ID, 128) // 4 KiB.
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	for i := range ids {
+		r.Read(ids[i][:])
+	}
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(restic.ID{}) * len(ids)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		for _, id := range ids {
+			m.hash(id)
+		}
+	}
+}
--- a/internal/restic/blob.go
+++ b/internal/restic/blob.go
@@ -43,6 +43,7 @@ const (
 	InvalidBlob BlobType = iota
 	DataBlob
 	TreeBlob
+	NumBlobTypes // Number of types. Must be last in this enumeration.
 )

 func (t BlobType) String() string {