From 9bbb9c871ed00726396fbfc92c0a322bf7ed8667 Mon Sep 17 00:00:00 2001 From: Achille Date: Mon, 15 May 2023 18:41:52 -0700 Subject: [PATCH] compiler: compress code offsets (#1467) Signed-off-by: Achille Roussel --- internal/bitpack/offset_array.go | 224 ++++++++++++++++++++++++ internal/bitpack/offset_array_test.go | 34 ++++ internal/engine/compiler/RATIONALE.md | 19 ++ internal/engine/compiler/engine.go | 64 +++++-- internal/engine/compiler/engine_test.go | 25 +-- 5 files changed, 335 insertions(+), 31 deletions(-) create mode 100644 internal/bitpack/offset_array.go create mode 100644 internal/bitpack/offset_array_test.go diff --git a/internal/bitpack/offset_array.go b/internal/bitpack/offset_array.go new file mode 100644 index 00000000..8cafd188 --- /dev/null +++ b/internal/bitpack/offset_array.go @@ -0,0 +1,224 @@ +package bitpack + +import ( + "math" +) + +// OffsetArray is an interface representing read-only views of arrays of 64 bits +// offsets. +type OffsetArray interface { + // Returns the value at index i. + // + // The method complexity may be anywhere between O(1) and O(N). + Index(i int) uint64 + // Returns the number of offsets in the array. + // + // The method complexity must be O(1). + Len() int +} + +// OffsetArrayLen is a helper function to access the length of an offset array. +// It is similar to calling Len on the array but handles the special case where +// the array is nil, in which case it returns zero. +func OffsetArrayLen(array OffsetArray) int { + if array != nil { + return array.Len() + } + return 0 +} + +// NewOffsetArray constructs a new array of offsets from the slice of values +// passed as argument. The slice is not retained, the returned array always +// holds a copy of the values. +// +// The underlying implementation of the offset array applies a compression +// mechanism derived from Frame-of-Reference and Delta Encoding to minimize +// the memory footprint of the array. 
This compression model works best when +// the input is made of ordered values, otherwise the deltas between values +// are likely to be too large to benefit from delta encoding. +// +// See https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/ +func NewOffsetArray(values []uint64) OffsetArray { + if len(values) == 0 { + return emptyOffsetArray{} + } + if len(values) <= smallOffsetArrayCapacity { + return newSmallOffsetArray(values) + } + + maxDelta := uint64(0) + lastValue := values[0] + // TODO: the pre-processing we perform here can be optimized using SIMD + // instructions. + for _, value := range values[1:] { + if delta := value - lastValue; delta > maxDelta { + maxDelta = delta + } + lastValue = value + } + + switch { + case maxDelta > math.MaxUint32: + return newOffsetArray(values) + case maxDelta > math.MaxUint16: + return newDeltaArray[uint32](values) + case maxDelta > math.MaxUint8: + return newDeltaArray[uint16](values) + case maxDelta > 15: + return newDeltaArray[uint8](values) + default: + return newDeltaArrayUint4(values) + } +} + +type offsetArray struct { + values []uint64 +} + +func newOffsetArray(values []uint64) *offsetArray { + a := &offsetArray{ + values: make([]uint64, len(values)), + } + copy(a.values, values) + return a +} + +func (a *offsetArray) Index(i int) uint64 { + return a.values[i] +} + +func (a *offsetArray) Len() int { + return len(a.values) +} + +type emptyOffsetArray struct{} + +func (emptyOffsetArray) Index(int) uint64 { + panic("index out of bounds") +} + +func (emptyOffsetArray) Len() int { + return 0 +} + +const smallOffsetArrayCapacity = 7 + +type smallOffsetArray struct { + length int + values [smallOffsetArrayCapacity]uint64 +} + +func newSmallOffsetArray(values []uint64) *smallOffsetArray { + a := &smallOffsetArray{length: len(values)} + copy(a.values[:], values) + return a +} + +func (a *smallOffsetArray) Index(i int) uint64 { + if i < 0 || i >= a.length { + panic("index out 
of bounds") + } + return a.values[i] +} + +func (a *smallOffsetArray) Len() int { + return a.length +} + +type uintType interface { + uint8 | uint16 | uint32 | uint64 +} + +type deltaArray[T uintType] struct { + deltas []T + firstValue uint64 +} + +func newDeltaArray[T uintType](values []uint64) *deltaArray[T] { + a := &deltaArray[T]{ + deltas: make([]T, len(values)-1), + firstValue: values[0], + } + lastValue := values[0] + for i, value := range values[1:] { + a.deltas[i] = T(value - lastValue) + lastValue = value + } + return a +} + +func (a *deltaArray[T]) Index(i int) uint64 { + if i < 0 || i >= a.Len() { + panic("index out of bounds") + } + value := a.firstValue + // TODO: computing the prefix sum can be vectorized; + // see https://en.algorithmica.org/hpc/algorithms/prefix/ + for _, delta := range a.deltas[:i] { + value += uint64(delta) + } + return value +} + +func (a *deltaArray[T]) Len() int { + return len(a.deltas) + 1 +} + +// deltaArrayUint4 is a specialization of deltaArray which packs 4 bits integers +// to hold deltas between 0 and 15; based on the analysis of compiling Python, +// it appeared that most source offset deltas were under 16, so using this +// data structure cuts by 50% the memory needed compared to deltaArray[uint8]. +// +// Here is the distribution of source offset deltas for Python 3.13: +// +// - <=15 : 10240 +// - <=255 : 9565 +// - <=65535 : 1163 +// +// Memory profiles showed that using deltaArrayUint4 (compared to deltaArray[T]) +// dropped the memory footprint of source mappings for Python from 6MB to 4.5MB. 
+type deltaArrayUint4 struct { + deltas []byte + numValues int + firstValue uint64 +} + +func newDeltaArrayUint4(values []uint64) *deltaArrayUint4 { + a := &deltaArrayUint4{ + deltas: make([]byte, len(values)/2+1), + numValues: len(values), + firstValue: values[0], + } + lastValue := values[0] + for i, value := range values[1:] { + a.assign(i, value-lastValue) + lastValue = value + } + return a +} + +func (a *deltaArrayUint4) assign(i int, v uint64) { + index, shift := uint(i)>>1, 4*(uint(i)&1) + a.deltas[index] &= ^(0xF << shift) + a.deltas[index] |= byte(v) << shift +} + +func (a *deltaArrayUint4) index(i int) uint64 { + index, shift := uint(i)>>1, 4*(uint(i)&1) + return uint64((a.deltas[index] >> shift) & 0xF) +} + +func (a *deltaArrayUint4) Index(i int) uint64 { + if i < 0 || i >= a.Len() { + panic("index out of bounds") + } + value := a.firstValue + for j := 0; j < i; j++ { + value += a.index(j) + } + return value +} + +func (a *deltaArrayUint4) Len() int { + return a.numValues +} diff --git a/internal/bitpack/offset_array_test.go b/internal/bitpack/offset_array_test.go new file mode 100644 index 00000000..f1d187e3 --- /dev/null +++ b/internal/bitpack/offset_array_test.go @@ -0,0 +1,34 @@ +package bitpack_test + +import ( + "fmt" + "math" + "testing" + + "github.com/tetratelabs/wazero/internal/bitpack" + "github.com/tetratelabs/wazero/internal/testing/require" +) + +func TestOffsetArray(t *testing.T) { + tests := [][]uint64{ + {}, + {0}, + {1, 2, 3, 4, 5, 6, 7, 8, 9}, + {16: 1}, + {17: math.MaxUint16 + 1}, + {21: 10, 22: math.MaxUint16}, + {0: 42, 100: math.MaxUint64}, + {0: 42, 1: math.MaxUint32, 101: math.MaxUint64}, + } + + for _, test := range tests { + t.Run(fmt.Sprintf("len=%d", len(test)), func(t *testing.T) { + array := bitpack.NewOffsetArray(test) + require.Equal(t, len(test), array.Len()) + + for i, v := range test { + require.Equal(t, v, array.Index(i)) + } + }) + } +} diff --git a/internal/engine/compiler/RATIONALE.md 
b/internal/engine/compiler/RATIONALE.md index 2db59e72..9cb7513e 100644 --- a/internal/engine/compiler/RATIONALE.md +++ b/internal/engine/compiler/RATIONALE.md @@ -91,3 +91,22 @@ cancellation from taking place. [checkexitcode_loop]: https://github.com/tetratelabs/wazero/blob/86444c67a37dbf9e693ae5b365901f64968d9025/internal/wazeroir/compiler.go#L467-L476 [native_check]: https://github.com/tetratelabs/wazero/issues/1409 + +## Source Offset Mapping + +When translating code from WebAssembly to the wazero IR, and compiling to native +binary, wazero keeps track of two indexes to correlate native program counters +to the original source offset that they were generated from. + +Source offset maps are useful for debugging, but holding indexes in memory for +all instructions can have a significant overhead. To reduce the memory footprint +of the compiled modules, wazero uses data structures inspired by +[frame-of-reference and delta encoding][FOR]. + +Because wazero does not reorder instructions, the source offsets are naturally +sorted during compilation, and the distance between two consecutive offsets is +usually small. Encoding deltas instead of the absolute values allows most of +the indexes to store offsets with an overhead of 8 bits per instruction, instead +of recording 64 bits integers for absolute code positions. 
+ +[FOR]: https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/ diff --git a/internal/engine/compiler/engine.go b/internal/engine/compiler/engine.go index 62d48084..5f43ff22 100644 --- a/internal/engine/compiler/engine.go +++ b/internal/engine/compiler/engine.go @@ -13,6 +13,7 @@ import ( "github.com/tetratelabs/wazero/api" "github.com/tetratelabs/wazero/experimental" "github.com/tetratelabs/wazero/internal/asm" + "github.com/tetratelabs/wazero/internal/bitpack" "github.com/tetratelabs/wazero/internal/filecache" "github.com/tetratelabs/wazero/internal/internalapi" "github.com/tetratelabs/wazero/internal/platform" @@ -285,17 +286,27 @@ type ( sourceOffsetMap sourceOffsetMap } - // sourceOffsetMap holds the information to retrieve the original offset in the Wasm binary from the - // offset in the native binary. + // sourceOffsetMap holds the information to retrieve the original offset in + // the Wasm binary from the offset in the native binary. + // + // The fields are implemented as bit-packed arrays of 64 bits integers to + // reduce the memory footprint. Indexing into such arrays is not as fast as + // indexing into a simple slice, but the source offset map is intended to be + // used for debugging, lookups into the arrays should not appear on code + // paths that are critical to the application performance. + // + // The bitpack.OffsetArray fields may be nil, use bitpack.OffsetArrayLen to + // determine whether they are empty prior to indexing into the arrays to + // avoid panics caused by accessing nil pointers. sourceOffsetMap struct { // See note at top of file before modifying this struct. // irOperationOffsetsInNativeBinary is index-correlated with irOperationSourceOffsetsInWasmBinary, // and maps each index (corresponding to each IR Operation) to the offset in the compiled native code. 
- irOperationOffsetsInNativeBinary []uint64 + irOperationOffsetsInNativeBinary bitpack.OffsetArray // irOperationSourceOffsetsInWasmBinary is index-correlated with irOperationOffsetsInNativeBinary. // See wazeroir.CompilationResult irOperationOffsetsInNativeBinary. - irOperationSourceOffsetsInWasmBinary []uint64 + irOperationSourceOffsetsInWasmBinary bitpack.OffsetArray } // functionListenerInvocation captures arguments needed to perform function @@ -525,6 +536,7 @@ func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners ln := len(listeners) cmp := newCompiler() asmNodes := new(asmNodes) + offsets := new(offsets) // The executable code is allocated in memory mappings of executableLength, // and grown on demand when we exhaust the memory mapping capacity. @@ -573,7 +585,7 @@ func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners } cmp.Init(typ, ir, lsn != nil) - body, compiledFn.stackPointerCeil, compiledFn.sourceOffsetMap, err = compileWasmFunction(cmp, ir, asmNodes) + body, compiledFn.stackPointerCeil, compiledFn.sourceOffsetMap, err = compileWasmFunction(cmp, ir, asmNodes, offsets) if err != nil { def := module.FunctionDefinition(funcIndex + importedFuncs) return fmt.Errorf("error compiling wasm func[%s]: %w", def.DebugName(), err) @@ -851,7 +863,7 @@ func (ce *callEngine) deferredOnCall(ctx context.Context, m *wasm.ModuleInstance // It is not empty only when the DWARF is enabled. var sources []string if p := fn.parent; p.parent.executable != nil { - if len(fn.parent.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) != 0 { + if fn.parent.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary != nil { offset := fn.getSourceOffsetInWasmBinary(pc) sources = p.parent.source.DWARFLines.Line(offset) } @@ -895,31 +907,41 @@ func (ce *callEngine) deferredOnCall(ctx context.Context, m *wasm.ModuleInstance // If needPreviousInstr equals true, this returns the previous instruction's offset for the given pc. 
func (f *function) getSourceOffsetInWasmBinary(pc uint64) uint64 { srcMap := &f.parent.sourceOffsetMap - n := len(srcMap.irOperationOffsetsInNativeBinary) + 1 + n := bitpack.OffsetArrayLen(srcMap.irOperationOffsetsInNativeBinary) + 1 // Calculate the offset in the compiled native binary. pcOffsetInNativeBinary := pc - uint64(f.codeInitialAddress) - // Then, do the binary search on the list of offsets in the native binary for all the IR operations. - // This returns the index of the *next* IR operation of the one corresponding to the origin of this pc. + // Then, do the binary search on the list of offsets in the native binary + // for all the IR operations. This returns the index of the *next* IR + // operation of the one corresponding to the origin of this pc. // See sort.Search. + // + // TODO: the underlying implementation of irOperationOffsetsInNativeBinary + // uses delta encoding, and calls to the Index method might require an + // O(N) scan of the underlying array, turning binary search into an + // O(N*log(N)) operation. If this code path ends up being a bottleneck, + // we could add a Search method on the bitpack.OffsetArray types to delegate + // the lookup to the underlying data structure, allowing for the selection + // of a more optimized version of the algorithm. If you do so, please add a + // benchmark to verify the impact on compute time. index := sort.Search(n, func(i int) bool { if i == n-1 { return true } - return srcMap.irOperationOffsetsInNativeBinary[i] >= pcOffsetInNativeBinary + return srcMap.irOperationOffsetsInNativeBinary.Index(i) >= pcOffsetInNativeBinary }) - if index == 0 && len(srcMap.irOperationSourceOffsetsInWasmBinary) > 0 { + if index == 0 && bitpack.OffsetArrayLen(srcMap.irOperationSourceOffsetsInWasmBinary) > 0 { // When pc is the beginning of the function, the next IR // operation (returned by sort.Search) is the first of the // offset map. 
- return srcMap.irOperationSourceOffsetsInWasmBinary[0] + return srcMap.irOperationSourceOffsetsInWasmBinary.Index(0) } if index == n || index == 0 { // This case, somehow pc is not found in the source offset map. return 0 } else { - return srcMap.irOperationSourceOffsetsInWasmBinary[index-1] + return srcMap.irOperationSourceOffsetsInWasmBinary.Index(index - 1) } } @@ -1198,7 +1220,7 @@ func (f internalFunction) Definition() api.FunctionDefinition { // SourceOffsetForPC implements the same method as documented on experimental.InternalFunction. func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { p := f.parent - if len(p.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) == 0 { + if bitpack.OffsetArrayLen(p.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) == 0 { return 0 // source not available } return f.getSourceOffsetInWasmBinary(uint64(pc)) @@ -1232,7 +1254,11 @@ type asmNodes struct { nodes []asm.Node } -func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes *asmNodes) (body []byte, spCeil uint64, sm sourceOffsetMap, err error) { +type offsets struct { + values []uint64 +} + +func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes *asmNodes, offsets *offsets) (body []byte, spCeil uint64, sm sourceOffsetMap, err error) { if err = cmp.compilePreamble(); err != nil { err = fmt.Errorf("failed to emit preamble: %w", err) return @@ -1563,13 +1589,13 @@ func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes } if needSourceOffsets { - offsetInNativeBin := make([]uint64, len(irOpBegins)) + offsetInNativeBin := append(offsets.values[:0], make([]uint64, len(irOpBegins))...) 
+ offsets.values = offsetInNativeBin for i, nop := range irOpBegins { offsetInNativeBin[i] = nop.OffsetInBinary() } - sm.irOperationOffsetsInNativeBinary = offsetInNativeBin - sm.irOperationSourceOffsetsInWasmBinary = make([]uint64, len(ir.IROperationSourceOffsetsInWasmBinary)) - copy(sm.irOperationSourceOffsetsInWasmBinary, ir.IROperationSourceOffsetsInWasmBinary) + sm.irOperationOffsetsInNativeBinary = bitpack.NewOffsetArray(offsetInNativeBin) + sm.irOperationSourceOffsetsInWasmBinary = bitpack.NewOffsetArray(ir.IROperationSourceOffsetsInWasmBinary) } return } diff --git a/internal/engine/compiler/engine_test.go b/internal/engine/compiler/engine_test.go index 302bb1a0..b77bc22b 100644 --- a/internal/engine/compiler/engine_test.go +++ b/internal/engine/compiler/engine_test.go @@ -12,6 +12,7 @@ import ( "github.com/tetratelabs/wazero/api" "github.com/tetratelabs/wazero/experimental" "github.com/tetratelabs/wazero/experimental/logging" + "github.com/tetratelabs/wazero/internal/bitpack" "github.com/tetratelabs/wazero/internal/platform" "github.com/tetratelabs/wazero/internal/testing/enginetest" "github.com/tetratelabs/wazero/internal/testing/require" @@ -671,12 +672,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) { pc: 4000, codeInitialAddress: 3999, srcMap: sourceOffsetMap{ - irOperationOffsetsInNativeBinary: []uint64{ + irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{ 0 /*4000-3999=1 exists here*/, 5, 8, 15, - }, - irOperationSourceOffsetsInWasmBinary: []uint64{ + }), + irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{ 10, 100, 800, 12344, - }, + }), }, exp: 10, }, @@ -685,12 +686,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) { pc: 100, codeInitialAddress: 90, srcMap: sourceOffsetMap{ - irOperationOffsetsInNativeBinary: []uint64{ + irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{ 0, 5, 8 /*100-90=10 exists here*/, 15, - }, - irOperationSourceOffsetsInWasmBinary: []uint64{ 
+ }), + irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{ 10, 100, 800, 12344, - }, + }), }, exp: 800, }, @@ -699,12 +700,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) { pc: 9999, codeInitialAddress: 8999, srcMap: sourceOffsetMap{ - irOperationOffsetsInNativeBinary: []uint64{ + irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{ 0, 5, 8, 15, /*9999-8999=1000 exists here*/ - }, - irOperationSourceOffsetsInWasmBinary: []uint64{ + }), + irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{ 10, 100, 800, 12344, - }, + }), }, exp: 12344, },