From 5141228e0cb79b696a9ea6f4fa815a61011c63dd Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Wed, 11 May 2022 20:53:21 +0200
Subject: [PATCH] repository: Re-tune indexmap allocation strategy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines.

While allocating entries one at a time cuts the overallocation down
dramatically for many small indexes, it also seems to slow allocation
down (Go 1.18, Linux, amd64, -benchtime=2s):

name                   old time/op    new time/op    delta
DecodeIndex-8            4.67s ± 5%     4.60s ± 1%      ~     (p=0.953 n=10+5)
DecodeIndexParallel-8    4.67s ± 3%     4.60s ± 1%      ~     (p=0.953 n=10+5)
IndexHasUnknown-8       37.8ns ± 8%    36.5ns ±14%      ~     (p=0.841 n=5+5)
IndexHasKnown-8         38.5ns ±12%    37.7ns ±10%      ~     (p=0.968 n=5+5)
IndexAlloc-8             615ms ±18%     607ms ± 1%      ~     (p=1.000 n=10+5)
IndexAllocParallel-8     245ms ±11%     285ms ± 6%   +16.40%  (p=0.001 n=10+5)
MasterIndexAlloc-8       286ms ± 9%     275ms ± 2%      ~     (p=1.000 n=10+5)
LoadIndex/v1-8          27.0ms ± 4%    26.8ms ± 1%      ~     (p=0.690 n=5+5)
LoadIndex/v2-8          22.4ms ± 1%    22.8ms ± 2%    +1.48%  (p=0.016 n=5+5)

name                   old alloc/op   new alloc/op   delta
IndexAlloc-8             446MB ± 0%     446MB ± 0%    -0.00%  (p=0.000 n=8+4)
IndexAllocParallel-8     446MB ± 0%     446MB ± 0%    -0.00%  (p=0.008 n=8+5)
MasterIndexAlloc-8       213MB ± 0%     159MB ± 0%   -25.47%  (p=0.000 n=10+5)

name                   old allocs/op  new allocs/op  delta
IndexAlloc-8              913k ± 0%     2632k ± 0%  +188.19%  (p=0.008 n=5+5)
IndexAllocParallel-8      913k ± 0%     2632k ± 0%  +188.21%  (p=0.008 n=5+5)
MasterIndexAlloc-8        318k ± 0%     1172k ± 0%  +267.86%  (p=0.008 n=5+5)

Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:

name                   old time/op    new time/op    delta
DecodeIndex-8            4.67s ± 5%     4.83s ± 9%      ~     (p=0.315 n=10+10)
DecodeIndexParallel-8    4.67s ± 3%     4.68s ± 4%      ~     (p=0.315 n=10+10)
IndexHasUnknown-8       37.8ns ± 8%    44.5ns ±19%      ~     (p=0.095 n=5+5)
IndexHasKnown-8         38.5ns ±12%    36.9ns ± 8%      ~     (p=0.690 n=5+5)
IndexAlloc-8             615ms ±18%     628ms ±18%      ~     (p=0.218 n=10+10)
IndexAllocParallel-8     245ms ±11%     262ms ± 9%    +7.02%  (p=0.043 n=10+10)
MasterIndexAlloc-8       286ms ± 9%     287ms ±13%      ~     (p=1.000 n=10+10)
LoadIndex/v1-8          27.0ms ± 4%    26.8ms ± 0%      ~     (p=1.000 n=5+5)
LoadIndex/v2-8          22.4ms ± 1%    22.5ms ± 0%      ~     (p=0.056 n=5+5)

name                   old alloc/op   new alloc/op   delta
IndexAlloc-8             446MB ± 0%     446MB ± 0%      ~     (p=1.000 n=8+10)
IndexAllocParallel-8     446MB ± 0%     446MB ± 0%    -0.00%  (p=0.000 n=8+8)
MasterIndexAlloc-8       213MB ± 0%     160MB ± 0%   -25.02%  (p=0.000 n=10+9)

name                   old allocs/op  new allocs/op  delta
IndexAlloc-8              913k ± 0%     1333k ± 0%   +45.94%  (p=0.000 n=8+10)
IndexAllocParallel-8      913k ± 0%     1333k ± 0%   +45.94%  (p=0.000 n=8+8)
MasterIndexAlloc-8        318k ± 0%      525k ± 0%   +64.99%  (p=0.000 n=10+10)

The allocation method indexMap.newEntry has also been rewritten in a
form that is a few instructions shorter.
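For reference, the 64-byte figure is easy to check with unsafe.Sizeof.
The struct below is only a stand-in whose field sizes add up to 60 bytes
on amd64, roughly mirroring indexEntry (the real type lives in the
repository package and its exact layout is not reproduced here); the
point is only that 60 bytes of fields get padded up to 64:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// entry mimics the field sizes of an indexEntry-like struct on a
	// 64-bit platform: a 32-byte ID, a pointer, an int and three
	// uint32s sum to 60 bytes of fields.
	type entry struct {
		id     [32]byte
		next   *entry
		pack   int
		offset uint32
		length uint32
		uncLen uint32
	}

	func main() {
		// The struct contains a pointer, so its alignment is 8 and its
		// size is rounded up to a multiple of 8: this prints 64.
		fmt.Println(unsafe.Sizeof(entry{}))
	}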
---
 internal/repository/indexmap.go | 38 ++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/internal/repository/indexmap.go b/internal/repository/indexmap.go
index 6a8e86aad..99c3fd331 100644
--- a/internal/repository/indexmap.go
+++ b/internal/repository/indexmap.go
@@ -130,24 +130,32 @@ func (m *indexMap) init() {
 func (m *indexMap) len() uint { return m.numentries }
 
 func (m *indexMap) newEntry() *indexEntry {
-	// Allocating in batches means that we get closer to optimal space usage,
-	// as Go's malloc will overallocate for structures of size 60 (indexEntry
-	// on amd64).
+	// We keep a free list of objects to speed up allocation and GC.
+	// There's an obvious trade-off here: allocating in larger batches
+	// means we allocate faster and the GC has to keep fewer bits to track
+	// what we have in use, but it means we waste some space.
 	//
-	// 128*60 and 128*60 both have low malloc overhead among reasonable sizes.
-	// See src/runtime/sizeclasses.go in the standard library.
-	const entryAllocBatch = 128
-
-	if m.free == nil {
-		free := new([entryAllocBatch]indexEntry)
-		for i := range free[:len(free)-1] {
-			free[i].next = &free[i+1]
-		}
-		m.free = &free[0]
-	}
+	// Then again, allocating each indexEntry separately also wastes space
+	// on 32-bit platforms, because the Go malloc has no size class for
+	// exactly 52 bytes, so it puts the indexEntry in a 64-byte slot instead.
+	// See src/runtime/sizeclasses.go in the Go source repo.
+	//
+	// The batch size of 4 means we hit the size classes for 4×64=256 bytes
+	// (64-bit) and 4×52=208 bytes (32-bit), wasting nothing in malloc on
+	// 64-bit and relatively little on 32-bit.
+	const entryAllocBatch = 4
 
 	e := m.free
-	m.free = m.free.next
+	if e != nil {
+		m.free = e.next
+	} else {
+		free := new([entryAllocBatch]indexEntry)
+		e = &free[0]
+		for i := 1; i < len(free)-1; i++ {
+			free[i].next = &free[i+1]
+		}
+		m.free = &free[1]
+	}
 	return e
 }
 
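To see the effect of the batch size in isolation, here is a
self-contained sketch of the same free-list pattern with a simplified
entry type; the names pool and entry are made up for this example and
are not the types from the repository package. With a batch of 4, only
every fourth call should reach malloc, so testing.AllocsPerRun reports
roughly 0.25 allocations per call:

	package main

	import (
		"fmt"
		"testing"
	)

	type entry struct {
		payload [56]byte // stand-in for the real fields
		next    *entry
	}

	type pool struct{ free *entry }

	const entryAllocBatch = 4

	func (p *pool) newEntry() *entry {
		e := p.free
		if e != nil {
			// Pop the head of the free list.
			p.free = e.next
		} else {
			// Free list is empty: allocate a batch of 4, hand out the
			// first entry and chain the remaining 3 onto the free list.
			free := new([entryAllocBatch]entry)
			e = &free[0]
			for i := 1; i < len(free)-1; i++ {
				free[i].next = &free[i+1]
			}
			p.free = &free[1]
		}
		return e
	}

	func main() {
		var p pool
		allocs := testing.AllocsPerRun(1000, func() { p.newEntry() })
		fmt.Println("allocations per newEntry call:", allocs) // ~0.25
	}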