From fed33295c38197cec49d164725e744c1c9786690 Mon Sep 17 00:00:00 2001
From: Michael Eischer <michael.eischer@fau.de>
Date: Sun, 28 May 2023 23:42:47 +0200
Subject: [PATCH] index: store indexEntries in hashed array tree

This data structure reduces the wasted memory to O(sqrt(n)). The
top-layer of the hashed array tree (HAT) also has a size of O(sqrt(n)),
which makes it cache efficient. The top-layer should be small enough to
easily fit into the CPU cache and thus only adds little overhead
compared to directly accessing an index entry via a pointer.
---
 internal/index/indexmap.go | 94 ++++++++++++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 9 deletions(-)

diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go
index 60ab11ff7..811d20903 100644
--- a/internal/index/indexmap.go
+++ b/internal/index/indexmap.go
@@ -1,6 +1,7 @@
 package index
 
 import (
+	"fmt"
 	"hash/maphash"
 
 	"github.com/restic/restic/internal/restic"
@@ -22,7 +23,7 @@ type indexMap struct {
 
 	mh maphash.Hash
 
-	blockList []indexEntry
+	blockList hashedArrayTree
 }
 
 const (
@@ -134,22 +135,18 @@ func (m *indexMap) init() {
 	const initialBuckets = 64
 	m.buckets = make([]uint, initialBuckets)
 	// first entry in blockList serves as null byte
-	m.blockList = make([]indexEntry, 1)
+	m.blockList = *newHAT()
+	m.newEntry()
 }
 
 func (m *indexMap) len() uint { return m.numentries }
 
 func (m *indexMap) newEntry() (*indexEntry, uint) {
-	m.blockList = append(m.blockList, indexEntry{})
-
-	idx := uint(len(m.blockList) - 1)
-	e := &m.blockList[idx]
-
-	return e, idx
+	return m.blockList.Alloc()
 }
 
 func (m *indexMap) resolve(idx uint) *indexEntry {
-	return &m.blockList[idx]
+	return m.blockList.Ref(idx)
 }
 
 type indexEntry struct {
@@ -160,3 +157,82 @@ type indexEntry struct {
 	length             uint32
 	uncompressedLength uint32
 }
+
+type hashedArrayTree struct {
+	mask      uint
+	maskShift uint
+	blockSize uint
+
+	size      uint
+	blockList [][]indexEntry
+}
+
+func newHAT() *hashedArrayTree {
+	// start with a small block size
+	blockSizePower := uint(2)
+	blockSize := uint(1 << blockSizePower)
+
+	return &hashedArrayTree{
+		mask:      blockSize - 1,
+		maskShift: blockSizePower,
+		blockSize: blockSize,
+		size:      0,
+		blockList: make([][]indexEntry, blockSize),
+	}
+}
+
+func (h *hashedArrayTree) Alloc() (*indexEntry, uint) {
+	h.grow()
+	size := h.size
+	idx, subIdx := h.index(size)
+	h.size++
+	return &h.blockList[idx][subIdx], size
+}
+
+func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) {
+	subIdx = pos & h.mask
+	idx = pos >> h.maskShift
+	return
+}
+
+func (h *hashedArrayTree) Ref(pos uint) *indexEntry {
+	if pos >= h.size {
+		panic(fmt.Sprintf("array index %d out of bounds %d", pos, h.size))
+	}
+
+	idx, subIdx := h.index(pos)
+	return &h.blockList[idx][subIdx]
+}
+
+func (h *hashedArrayTree) Size() uint {
+	return h.size
+}
+
+func (h *hashedArrayTree) grow() {
+	idx, subIdx := h.index(h.size)
+	if int(idx) == len(h.blockList) {
+		// blockList is too small -> double list and block size
+		oldBlocks := h.blockList
+		h.blockList = make([][]indexEntry, h.blockSize)
+
+		h.blockSize *= 2
+		h.mask = h.mask*2 + 1
+		h.maskShift++
+		idx = idx / 2
+
+		// pairwise merging of blocks
+		for i := 0; i < len(oldBlocks); i += 2 {
+			block := make([]indexEntry, 0, h.blockSize)
+			block = append(block, oldBlocks[i]...)
+			block = append(block, oldBlocks[i+1]...)
+			h.blockList[i/2] = block
+			// allow GC
+			oldBlocks[i] = nil
+			oldBlocks[i+1] = nil
+		}
+	}
+	if subIdx == 0 {
+		// new index entry batch
+		h.blockList[idx] = make([]indexEntry, h.blockSize)
+	}
+}