From 5141228e0cb79b696a9ea6f4fa815a61011c63dd Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Wed, 11 May 2022 20:53:21 +0200
Subject: [PATCH] repository: Re-tune indexmap allocation strategy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines.

While allocating entries one at a time cuts the overallocation down
dramatically for many small indexes, it also seems to slow allocation
down (Go 1.18, Linux, amd64, -benchtime=2s):

name                   old time/op    new time/op    delta
DecodeIndex-8            4.67s ± 5%     4.60s ± 1%      ~     (p=0.953 n=10+5)
DecodeIndexParallel-8    4.67s ± 3%     4.60s ± 1%      ~     (p=0.953 n=10+5)
IndexHasUnknown-8       37.8ns ± 8%    36.5ns ±14%      ~     (p=0.841 n=5+5)
IndexHasKnown-8         38.5ns ±12%    37.7ns ±10%      ~     (p=0.968 n=5+5)
IndexAlloc-8             615ms ±18%     607ms ± 1%      ~     (p=1.000 n=10+5)
IndexAllocParallel-8     245ms ±11%     285ms ± 6%   +16.40%  (p=0.001 n=10+5)
MasterIndexAlloc-8       286ms ± 9%     275ms ± 2%      ~     (p=1.000 n=10+5)
LoadIndex/v1-8          27.0ms ± 4%    26.8ms ± 1%      ~     (p=0.690 n=5+5)
LoadIndex/v2-8          22.4ms ± 1%    22.8ms ± 2%    +1.48%  (p=0.016 n=5+5)

name                   old alloc/op   new alloc/op   delta
IndexAlloc-8             446MB ± 0%     446MB ± 0%    -0.00%  (p=0.000 n=8+4)
IndexAllocParallel-8     446MB ± 0%     446MB ± 0%    -0.00%  (p=0.008 n=8+5)
MasterIndexAlloc-8       213MB ± 0%     159MB ± 0%   -25.47%  (p=0.000 n=10+5)

name                   old allocs/op  new allocs/op  delta
IndexAlloc-8              913k ± 0%     2632k ± 0%  +188.19%  (p=0.008 n=5+5)
IndexAllocParallel-8      913k ± 0%     2632k ± 0%  +188.21%  (p=0.008 n=5+5)
MasterIndexAlloc-8        318k ± 0%     1172k ± 0%  +267.86%  (p=0.008 n=5+5)

Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:

name                   old time/op    new time/op    delta
DecodeIndex-8            4.67s ± 5%     4.83s ± 9%      ~     (p=0.315 n=10+10)
DecodeIndexParallel-8    4.67s ± 3%     4.68s ± 4%      ~     (p=0.315 n=10+10)
IndexHasUnknown-8       37.8ns ± 8%    44.5ns ±19%      ~     (p=0.095 n=5+5)
IndexHasKnown-8         38.5ns ±12%    36.9ns ± 8%      ~     (p=0.690 n=5+5)
IndexAlloc-8             615ms ±18%     628ms ±18%      ~     (p=0.218 n=10+10)
IndexAllocParallel-8     245ms ±11%     262ms ± 9%    +7.02%  (p=0.043 n=10+10)
MasterIndexAlloc-8       286ms ± 9%     287ms ±13%      ~     (p=1.000 n=10+10)
LoadIndex/v1-8          27.0ms ± 4%    26.8ms ± 0%      ~     (p=1.000 n=5+5)
LoadIndex/v2-8          22.4ms ± 1%    22.5ms ± 0%      ~     (p=0.056 n=5+5)

name                   old alloc/op   new alloc/op   delta
IndexAlloc-8             446MB ± 0%     446MB ± 0%      ~     (p=1.000 n=8+10)
IndexAllocParallel-8     446MB ± 0%     446MB ± 0%    -0.00%  (p=0.000 n=8+8)
MasterIndexAlloc-8       213MB ± 0%     160MB ± 0%   -25.02%  (p=0.000 n=10+9)

name                   old allocs/op  new allocs/op  delta
IndexAlloc-8              913k ± 0%     1333k ± 0%   +45.94%  (p=0.000 n=8+10)
IndexAllocParallel-8      913k ± 0%     1333k ± 0%   +45.94%  (p=0.000 n=8+8)
MasterIndexAlloc-8        318k ± 0%      525k ± 0%   +64.99%  (p=0.000 n=10+10)

The allocation method indexMap.newEntry has also been rewritten in a
form that is a few instructions shorter.
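For reference, the 64-byte figure is easy to check with unsafe.Sizeof.
The struct below is only a stand-in whose field sizes add up to 60 bytes
on amd64, roughly mirroring indexEntry (the real type lives in the
repository package and its exact layout is not reproduced here); the
point is only that 60 bytes of fields get padded up to 64:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// entry mimics the field sizes of an indexEntry-like struct on a
	// 64-bit platform: a 32-byte ID, a pointer, an int and three
	// uint32s sum to 60 bytes of fields.
	type entry struct {
		id     [32]byte
		next   *entry
		pack   int
		offset uint32
		length uint32
		uncLen uint32
	}

	func main() {
		// The struct contains a pointer, so its alignment is 8 and its
		// size is rounded up to a multiple of 8: this prints 64.
		fmt.Println(unsafe.Sizeof(entry{}))
	}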
---
 internal/repository/indexmap.go | 38 ++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/internal/repository/indexmap.go b/internal/repository/indexmap.go
index 6a8e86aad..99c3fd331 100644
--- a/internal/repository/indexmap.go
+++ b/internal/repository/indexmap.go
@@ -130,24 +130,32 @@ func (m *indexMap) init() {
 func (m *indexMap) len() uint { return m.numentries }
 
 func (m *indexMap) newEntry() *indexEntry {
-	// Allocating in batches means that we get closer to optimal space usage,
-	// as Go's malloc will overallocate for structures of size 60 (indexEntry
-	// on amd64).
+	// We keep a free list of objects to speed up allocation and GC.
+	// There's an obvious trade-off here: allocating in larger batches
+	// means we allocate faster and the GC has to keep fewer bits to track
+	// what we have in use, but it means we waste some space.
 	//
-	// 128*60 and 128*60 both have low malloc overhead among reasonable sizes.
-	// See src/runtime/sizeclasses.go in the standard library.
-	const entryAllocBatch = 128
-
-	if m.free == nil {
-		free := new([entryAllocBatch]indexEntry)
-		for i := range free[:len(free)-1] {
-			free[i].next = &free[i+1]
-		}
-		m.free = &free[0]
-	}
+	// Then again, allocating each indexEntry separately also wastes space
+	// on 32-bit platforms, because the Go malloc has no size class for
+	// exactly 52 bytes, so it puts the indexEntry in a 64-byte slot instead.
+	// See src/runtime/sizeclasses.go in the Go source repo.
+	//
+	// The batch size of 4 means we hit the size classes for 4×64=256 bytes
+	// (64-bit) and 4×52=208 bytes (32-bit), wasting nothing in malloc on
+	// 64-bit and relatively little on 32-bit.
+	const entryAllocBatch = 4
 
 	e := m.free
-	m.free = m.free.next
+	if e != nil {
+		m.free = e.next
+	} else {
+		free := new([entryAllocBatch]indexEntry)
+		e = &free[0]
+		for i := 1; i < len(free)-1; i++ {
+			free[i].next = &free[i+1]
+		}
+		m.free = &free[1]
+	}
 	return e
 }
 
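To see the effect of the batch size in isolation, here is a
self-contained sketch of the same free-list pattern with a simplified
entry type; the names pool and entry are made up for this example and
are not the types from the repository package. With a batch of 4, only
every fourth call should reach malloc, so testing.AllocsPerRun reports
roughly 0.25 allocations per call:

	package main

	import (
		"fmt"
		"testing"
	)

	type entry struct {
		payload [56]byte // stand-in for the real fields
		next    *entry
	}

	type pool struct{ free *entry }

	const entryAllocBatch = 4

	func (p *pool) newEntry() *entry {
		e := p.free
		if e != nil {
			// Pop the head of the free list.
			p.free = e.next
		} else {
			// Free list is empty: allocate a batch of 4, hand out the
			// first entry and chain the remaining 3 onto the free list.
			free := new([entryAllocBatch]entry)
			e = &free[0]
			for i := 1; i < len(free)-1; i++ {
				free[i].next = &free[i+1]
			}
			p.free = &free[1]
		}
		return e
	}

	func main() {
		var p pool
		allocs := testing.AllocsPerRun(1000, func() { p.newEntry() })
		fmt.Println("allocations per newEntry call:", allocs) // ~0.25
	}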