From fed33295c38197cec49d164725e744c1c9786690 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 May 2023 23:42:47 +0200 Subject: [PATCH] index: store indexEntries in hashed array tree This data structure reduces the wasted memory to O(sqrt(n)). The top-layer of the hashed array tree (HAT) also has a size of O(sqrt(n)), which makes it cache efficient. The top-layer should be small enough to easily fit into the CPU cache and thus only adds little overhead compared to directly accessing an index entry via a pointer. --- internal/index/indexmap.go | 94 ++++++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 9 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index 60ab11ff7..811d20903 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -1,6 +1,7 @@ package index import ( + "fmt" "hash/maphash" "github.com/restic/restic/internal/restic" @@ -22,7 +23,7 @@ type indexMap struct { mh maphash.Hash - blockList []indexEntry + blockList hashedArrayTree } const ( @@ -134,22 +135,18 @@ func (m *indexMap) init() { const initialBuckets = 64 m.buckets = make([]uint, initialBuckets) // first entry in blockList serves as null byte - m.blockList = make([]indexEntry, 1) + m.blockList = *newHAT() + m.newEntry() } func (m *indexMap) len() uint { return m.numentries } func (m *indexMap) newEntry() (*indexEntry, uint) { - m.blockList = append(m.blockList, indexEntry{}) - - idx := uint(len(m.blockList) - 1) - e := &m.blockList[idx] - - return e, idx + return m.blockList.Alloc() } func (m *indexMap) resolve(idx uint) *indexEntry { - return &m.blockList[idx] + return m.blockList.Ref(idx) } type indexEntry struct { @@ -160,3 +157,82 @@ type indexEntry struct { length uint32 uncompressedLength uint32 } + +type hashedArrayTree struct { + mask uint + maskShift uint + blockSize uint + + size uint + blockList [][]indexEntry +} + +func newHAT() *hashedArrayTree { + // start with a small block size + blockSizePower := uint(2) + blockSize := uint(1 << blockSizePower) + + return &hashedArrayTree{ + mask: blockSize - 1, + maskShift: blockSizePower, + blockSize: blockSize, + size: 0, + blockList: make([][]indexEntry, blockSize), + } +} + +func (h *hashedArrayTree) Alloc() (*indexEntry, uint) { + h.grow() + size := h.size + idx, subIdx := h.index(size) + h.size++ + return &h.blockList[idx][subIdx], size +} + +func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) { + subIdx = pos & h.mask + idx = pos >> h.maskShift + return +} + +func (h *hashedArrayTree) Ref(pos uint) *indexEntry { + if pos >= h.size { + panic(fmt.Sprintf("array index %d out of bounds %d", pos, h.size)) + } + + idx, subIdx := h.index(pos) + return &h.blockList[idx][subIdx] +} + +func (h *hashedArrayTree) Size() uint { + return h.size +} + +func (h *hashedArrayTree) grow() { + idx, subIdx := h.index(h.size) + if int(idx) == len(h.blockList) { + // blockList is too small -> double list and block size + oldBlocks := h.blockList + h.blockList = make([][]indexEntry, h.blockSize) + + h.blockSize *= 2 + h.mask = h.mask*2 + 1 + h.maskShift++ + idx = idx / 2 + + // pairwise merging of blocks + for i := 0; i < len(oldBlocks); i += 2 { + block := make([]indexEntry, 0, h.blockSize) + block = append(block, oldBlocks[i]...) + block = append(block, oldBlocks[i+1]...) + h.blockList[i/2] = block + // allow GC + oldBlocks[i] = nil + oldBlocks[i+1] = nil + } + } + if subIdx == 0 { + // new index entry batch + h.blockList[idx] = make([]indexEntry, h.blockSize) + } +}