compiler: compress code offsets (#1467)

Signed-off-by: Achille Roussel <achille.roussel@gmail.com>
This commit is contained in:
Achille
2023-05-15 18:41:52 -07:00
committed by GitHub
parent 2f31894f61
commit 9bbb9c871e
5 changed files with 335 additions and 31 deletions

View File

@@ -0,0 +1,224 @@
package bitpack
import (
"math"
)
// OffsetArray is a read-only view over a sequence of 64-bit offsets.
//
// Implementations in this package either keep the values verbatim or store
// them in a compressed form, which is why the cost of Index is not fixed.
type OffsetArray interface {
// Index returns the value at position i, where 0 <= i < Len().
//
// The implementations in this package panic when i is out of range.
//
// The method complexity may be anywhere between O(1) and O(N).
Index(i int) uint64
// Len returns the number of offsets in the array.
//
// The method complexity must be O(1).
Len() int
}
// OffsetArrayLen reports how many offsets the array holds.
//
// It behaves like array.Len() except that a nil interface value is accepted
// and counted as an empty array, so callers do not need their own nil check
// before asking for the length.
func OffsetArrayLen(array OffsetArray) int {
	if array == nil {
		return 0
	}
	return array.Len()
}
// NewOffsetArray builds an OffsetArray holding a copy of values; the input
// slice is never retained by the returned array.
//
// The representation is chosen from the input:
//
//   - no values: a zero-size empty array;
//   - up to smallOffsetArrayCapacity values: a fixed-size inline array;
//   - otherwise: a delta-encoded array whose element width (4, 8, 16 or 32
//     bits) is the narrowest one able to hold the largest difference between
//     two consecutive values, or a plain copy of the 64-bit values when that
//     difference does not fit in 32 bits.
//
// This scheme is derived from Frame-of-Reference and Delta Encoding and works
// best on ordered input. Differences are computed with unsigned arithmetic, so
// a value smaller than its predecessor produces a wrapped-around difference,
// which is usually too wide to benefit from delta encoding.
//
// See https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/
func NewOffsetArray(values []uint64) OffsetArray {
	switch n := len(values); {
	case n == 0:
		return emptyOffsetArray{}
	case n <= smallOffsetArrayCapacity:
		return newSmallOffsetArray(values)
	}

	// Find the widest gap between neighbours; it dictates the delta width.
	//
	// TODO: this pre-processing pass can be optimized using SIMD instructions.
	var widest uint64
	for i := 1; i < len(values); i++ {
		if gap := values[i] - values[i-1]; gap > widest {
			widest = gap
		}
	}

	// Pick the narrowest encoding that can represent the widest gap.
	if widest <= 15 {
		return newDeltaArrayUint4(values)
	}
	if widest <= math.MaxUint8 {
		return newDeltaArray[uint8](values)
	}
	if widest <= math.MaxUint16 {
		return newDeltaArray[uint16](values)
	}
	if widest <= math.MaxUint32 {
		return newDeltaArray[uint32](values)
	}
	return newOffsetArray(values)
}
// offsetArray is the uncompressed representation: it keeps every 64-bit value
// verbatim in a slice.
type offsetArray struct {
	values []uint64
}

// newOffsetArray returns an offsetArray backed by a private copy of values, so
// later writes to the caller's slice are not visible through the array.
func newOffsetArray(values []uint64) *offsetArray {
	dup := make([]uint64, len(values))
	copy(dup, values)
	return &offsetArray{values: dup}
}

// Index returns the i-th value. An out-of-range i triggers the runtime's
// slice bounds panic.
func (a *offsetArray) Index(i int) uint64 { return a.values[i] }

// Len returns the number of stored values.
func (a *offsetArray) Len() int { return len(a.values) }
// emptyOffsetArray is the zero-size array that holds no offsets at all.
type emptyOffsetArray struct{}

// Index always panics: no index is valid for an empty array.
func (emptyOffsetArray) Index(_ int) uint64 { panic("index out of bounds") }

// Len always returns zero.
func (emptyOffsetArray) Len() int { return 0 }
// smallOffsetArrayCapacity is the maximum number of values a smallOffsetArray
// can hold.
const smallOffsetArrayCapacity = 7

// smallOffsetArray stores a handful of values inline, without a separate
// slice allocation. It consists of one int and seven uint64 values, which adds
// up to 64 bytes on 64-bit platforms.
type smallOffsetArray struct {
	length int
	values [smallOffsetArrayCapacity]uint64
}

// newSmallOffsetArray copies values into a new smallOffsetArray. The caller is
// expected to pass at most smallOffsetArrayCapacity values.
func newSmallOffsetArray(values []uint64) *smallOffsetArray {
	small := new(smallOffsetArray)
	small.length = len(values)
	copy(small.values[:], values)
	return small
}

// Index returns the i-th value, panicking when i is outside [0, Len()).
func (a *smallOffsetArray) Index(i int) uint64 {
	if i >= 0 && i < a.length {
		return a.values[i]
	}
	panic("index out of bounds")
}

// Len returns the number of values that were stored at construction.
func (a *smallOffsetArray) Len() int {
	return a.length
}
// uintType lists the unsigned integer widths usable as delta storage.
type uintType interface {
	uint8 | uint16 | uint32 | uint64
}

// deltaArray is a delta-encoded sequence: it records the first value in full
// and, for every following value, only its difference with the previous one,
// stored as a T.
type deltaArray[T uintType] struct {
	deltas     []T
	firstValue uint64
}

// newDeltaArray encodes values as a deltaArray[T]. values must not be empty,
// and the caller is responsible for picking a T wide enough for every
// consecutive difference; wider differences are truncated by the conversion.
func newDeltaArray[T uintType](values []uint64) *deltaArray[T] {
	gaps := make([]T, len(values)-1)
	for i := range gaps {
		gaps[i] = T(values[i+1] - values[i])
	}
	return &deltaArray[T]{deltas: gaps, firstValue: values[0]}
}

// Index rebuilds the i-th value by adding the first i deltas to the first
// value, so its cost grows linearly with i. It panics when i is outside
// [0, Len()).
func (a *deltaArray[T]) Index(i int) uint64 {
	if i < 0 || i >= a.Len() {
		panic("index out of bounds")
	}
	// TODO: computing the prefix sum can be vectorized;
	// see https://en.algorithmica.org/hpc/algorithms/prefix/
	sum := a.firstValue
	for k := 0; k < i; k++ {
		sum += uint64(a.deltas[k])
	}
	return sum
}

// Len returns the number of values, which is one more than the number of
// stored deltas.
func (a *deltaArray[T]) Len() int {
	return len(a.deltas) + 1
}
// deltaArrayUint4 is a specialization of deltaArray which packs 4 bits integers
// to hold deltas between 0 and 15; based on the analysis of compiling Python,
// it appeared that most source offset deltas were under 16, so using this
// data structure cuts by 50% the memory needed compared to deltaArray[uint8].
//
// Here is the distribution of source offset deltas for Python 3.13:
//
// - <=15 : 10240
// - <=255 : 9565
// - <=65535 : 1163
//
// Memory profiles showed that using deltaArrayUint4 (compared to deltaArray[T])
// dropped the memory footprint of source mappings for Python from 6MB to 4.5MB.
//
// Layout: delta number i lives in byte i/2 of deltas, in the low nibble when i
// is even and in the high nibble when i is odd. numValues is kept separately
// because the byte count alone cannot tell whether the last high nibble is in
// use.
type deltaArrayUint4 struct {
	deltas     []byte
	numValues  int
	firstValue uint64
}

// newDeltaArrayUint4 encodes values, which must not be empty and whose
// consecutive differences must all be in [0, 15].
func newDeltaArrayUint4(values []uint64) *deltaArrayUint4 {
	// There is one delta per value after the first, and two deltas fit in
	// each byte, so round the delta count up to a whole number of bytes.
	numDeltas := len(values) - 1
	a := &deltaArrayUint4{
		deltas:     make([]byte, (numDeltas+1)/2),
		numValues:  len(values),
		firstValue: values[0],
	}
	lastValue := values[0]
	for i, value := range values[1:] {
		a.assign(i, value-lastValue)
		lastValue = value
	}
	return a
}

// assign stores the low 4 bits of v as delta number i, leaving the other
// nibble of the same byte untouched.
func (a *deltaArrayUint4) assign(i int, v uint64) {
	index, shift := uint(i)>>1, 4*(uint(i)&1)
	a.deltas[index] &= ^(0xF << shift)
	// Mask before shifting so that an oversized v cannot leak into the
	// neighbouring nibble.
	a.deltas[index] |= byte(v&0xF) << shift
}

// index returns delta number i.
func (a *deltaArrayUint4) index(i int) uint64 {
	index, shift := uint(i)>>1, 4*(uint(i)&1)
	return uint64((a.deltas[index] >> shift) & 0xF)
}

// Index rebuilds the i-th value by adding the first i deltas to the first
// value, so its cost grows linearly with i. It panics when i is outside
// [0, Len()).
func (a *deltaArrayUint4) Index(i int) uint64 {
	if i < 0 || i >= a.Len() {
		panic("index out of bounds")
	}
	value := a.firstValue
	for j := 0; j < i; j++ {
		value += a.index(j)
	}
	return value
}

// Len returns the number of values in the array.
func (a *deltaArrayUint4) Len() int {
	return a.numValues
}

View File

@@ -0,0 +1,34 @@
package bitpack_test
import (
"fmt"
"math"
"testing"
"github.com/tetratelabs/wazero/internal/bitpack"
"github.com/tetratelabs/wazero/internal/testing/require"
)
// TestOffsetArray checks that arrays built by bitpack.NewOffsetArray report
// the same length and the same values as the slice they were built from.
//
// The inputs are chosen so that every internal representation is exercised.
// Keyed composite literals are used to get a given length with zeros
// everywhere except at the listed positions. Sub-tests are named after the
// input length, so each case must have a distinct length.
func TestOffsetArray(t *testing.T) {
	tests := [][]uint64{
		// Empty array.
		{},
		// Small inline array.
		{0},
		// 4-bit deltas.
		{1, 2, 3, 4, 5, 6, 7, 8, 9},
		{16: 1},
		// 8-bit deltas (largest delta in 16..255).
		{18: math.MaxUint8},
		// 32-bit deltas.
		{17: math.MaxUint16 + 1},
		// 16-bit deltas.
		{21: 10, 22: math.MaxUint16},
		// Uncompressed: sorted input with a delta that does not fit in 32 bits.
		{24: math.MaxUint32 + 1},
		// Uncompressed: unsorted input, the delta wraps around.
		{0: 42, 100: math.MaxUint64},
		{0: 42, 1: math.MaxUint32, 101: math.MaxUint64},
	}
	for _, test := range tests {
		t.Run(fmt.Sprintf("len=%d", len(test)), func(t *testing.T) {
			array := bitpack.NewOffsetArray(test)
			require.Equal(t, len(test), array.Len())
			for i, v := range test {
				require.Equal(t, v, array.Index(i))
			}
		})
	}
}

View File

@@ -91,3 +91,22 @@ cancellation from taking place.
[checkexitcode_loop]: https://github.com/tetratelabs/wazero/blob/86444c67a37dbf9e693ae5b365901f64968d9025/internal/wazeroir/compiler.go#L467-L476 [checkexitcode_loop]: https://github.com/tetratelabs/wazero/blob/86444c67a37dbf9e693ae5b365901f64968d9025/internal/wazeroir/compiler.go#L467-L476
[native_check]: https://github.com/tetratelabs/wazero/issues/1409 [native_check]: https://github.com/tetratelabs/wazero/issues/1409
## Source Offset Mapping
When translating code from WebAssembly to the wazero IR, and compiling to native
binary, wazero keeps track of two indexes to correlate native program counters
to the original source offset that they were generated from.
Source offset maps are useful for debugging, but holding indexes in memory for
all instructions can have a significant overhead. To reduce the memory footprint
of the compiled modules, wazero uses data structures inspired by
[frame-of-reference and delta encoding][FOR].
Because wazero does not reorder instructions, the source offsets are naturally
sorted during compilation, and the distance between two consecutive offsets is
usually small. Encoding deltas instead of the absolute values allows most of
the indexes to store offsets with an overhead of 8 bits per instruction, instead
of recording 64-bit integers for absolute code positions.
[FOR]: https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/

View File

@@ -13,6 +13,7 @@ import (
"github.com/tetratelabs/wazero/api" "github.com/tetratelabs/wazero/api"
"github.com/tetratelabs/wazero/experimental" "github.com/tetratelabs/wazero/experimental"
"github.com/tetratelabs/wazero/internal/asm" "github.com/tetratelabs/wazero/internal/asm"
"github.com/tetratelabs/wazero/internal/bitpack"
"github.com/tetratelabs/wazero/internal/filecache" "github.com/tetratelabs/wazero/internal/filecache"
"github.com/tetratelabs/wazero/internal/internalapi" "github.com/tetratelabs/wazero/internal/internalapi"
"github.com/tetratelabs/wazero/internal/platform" "github.com/tetratelabs/wazero/internal/platform"
@@ -285,17 +286,27 @@ type (
sourceOffsetMap sourceOffsetMap sourceOffsetMap sourceOffsetMap
} }
// sourceOffsetMap holds the information to retrieve the original offset in the Wasm binary from the // sourceOffsetMap holds the information to retrieve the original offset in
// offset in the native binary. // the Wasm binary from the offset in the native binary.
//
// The fields are implemented as bit-packed arrays of 64 bits integers to
// reduce the memory footprint. Indexing into such arrays is not as fast as
// indexing into a simple slice, but the source offset map is intended to be
// used for debugging, lookups into the arrays should not appear on code
// paths that are critical to the application performance.
//
// The bitpack.OffsetArray fields may be nil, use bitpack.OffsetArrayLen to
// determine whether they are empty prior to indexing into the arrays to
// avoid panics caused by accessing nil pointers.
sourceOffsetMap struct { sourceOffsetMap struct {
// See note at top of file before modifying this struct. // See note at top of file before modifying this struct.
// irOperationOffsetsInNativeBinary is index-correlated with irOperationSourceOffsetsInWasmBinary, // irOperationOffsetsInNativeBinary is index-correlated with irOperationSourceOffsetsInWasmBinary,
// and maps each index (corresponding to each IR Operation) to the offset in the compiled native code. // and maps each index (corresponding to each IR Operation) to the offset in the compiled native code.
irOperationOffsetsInNativeBinary []uint64 irOperationOffsetsInNativeBinary bitpack.OffsetArray
// irOperationSourceOffsetsInWasmBinary is index-correlated with irOperationOffsetsInNativeBinary. // irOperationSourceOffsetsInWasmBinary is index-correlated with irOperationOffsetsInNativeBinary.
// See wazeroir.CompilationResult irOperationOffsetsInNativeBinary. // See wazeroir.CompilationResult irOperationOffsetsInNativeBinary.
irOperationSourceOffsetsInWasmBinary []uint64 irOperationSourceOffsetsInWasmBinary bitpack.OffsetArray
} }
// functionListenerInvocation captures arguments needed to perform function // functionListenerInvocation captures arguments needed to perform function
@@ -525,6 +536,7 @@ func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners
ln := len(listeners) ln := len(listeners)
cmp := newCompiler() cmp := newCompiler()
asmNodes := new(asmNodes) asmNodes := new(asmNodes)
offsets := new(offsets)
// The executable code is allocated in memory mappings of executableLength, // The executable code is allocated in memory mappings of executableLength,
// and grown on demand when we exhaust the memory mapping capacity. // and grown on demand when we exhaust the memory mapping capacity.
@@ -573,7 +585,7 @@ func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners
} }
cmp.Init(typ, ir, lsn != nil) cmp.Init(typ, ir, lsn != nil)
body, compiledFn.stackPointerCeil, compiledFn.sourceOffsetMap, err = compileWasmFunction(cmp, ir, asmNodes) body, compiledFn.stackPointerCeil, compiledFn.sourceOffsetMap, err = compileWasmFunction(cmp, ir, asmNodes, offsets)
if err != nil { if err != nil {
def := module.FunctionDefinition(funcIndex + importedFuncs) def := module.FunctionDefinition(funcIndex + importedFuncs)
return fmt.Errorf("error compiling wasm func[%s]: %w", def.DebugName(), err) return fmt.Errorf("error compiling wasm func[%s]: %w", def.DebugName(), err)
@@ -851,7 +863,7 @@ func (ce *callEngine) deferredOnCall(ctx context.Context, m *wasm.ModuleInstance
// It is not empty only when the DWARF is enabled. // It is not empty only when the DWARF is enabled.
var sources []string var sources []string
if p := fn.parent; p.parent.executable != nil { if p := fn.parent; p.parent.executable != nil {
if len(fn.parent.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) != 0 { if fn.parent.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary != nil {
offset := fn.getSourceOffsetInWasmBinary(pc) offset := fn.getSourceOffsetInWasmBinary(pc)
sources = p.parent.source.DWARFLines.Line(offset) sources = p.parent.source.DWARFLines.Line(offset)
} }
@@ -895,31 +907,41 @@ func (ce *callEngine) deferredOnCall(ctx context.Context, m *wasm.ModuleInstance
// If needPreviousInstr equals true, this returns the previous instruction's offset for the given pc. // If needPreviousInstr equals true, this returns the previous instruction's offset for the given pc.
func (f *function) getSourceOffsetInWasmBinary(pc uint64) uint64 { func (f *function) getSourceOffsetInWasmBinary(pc uint64) uint64 {
srcMap := &f.parent.sourceOffsetMap srcMap := &f.parent.sourceOffsetMap
n := len(srcMap.irOperationOffsetsInNativeBinary) + 1 n := bitpack.OffsetArrayLen(srcMap.irOperationOffsetsInNativeBinary) + 1
// Calculate the offset in the compiled native binary. // Calculate the offset in the compiled native binary.
pcOffsetInNativeBinary := pc - uint64(f.codeInitialAddress) pcOffsetInNativeBinary := pc - uint64(f.codeInitialAddress)
// Then, do the binary search on the list of offsets in the native binary for all the IR operations. // Then, do the binary search on the list of offsets in the native binary
// This returns the index of the *next* IR operation of the one corresponding to the origin of this pc. // for all the IR operations. This returns the index of the *next* IR
// operation of the one corresponding to the origin of this pc.
// See sort.Search. // See sort.Search.
//
// TODO: the underlying implementation of irOperationOffsetsInNativeBinary
// uses delta encoding and calls to the Index method might require an
// O(N) scan of the underlying array, turning binary search into a
// O(N*log(N)) operation. If this code path ends up being a bottleneck,
// we could add a Search method on the bitpack.OffsetArray types to delegate
// the lookup to the underlying data structure, allowing for the selection
// of a more optimized version of the algorithm. If you do so, please add a
// benchmark to verify the impact on compute time.
index := sort.Search(n, func(i int) bool { index := sort.Search(n, func(i int) bool {
if i == n-1 { if i == n-1 {
return true return true
} }
return srcMap.irOperationOffsetsInNativeBinary[i] >= pcOffsetInNativeBinary return srcMap.irOperationOffsetsInNativeBinary.Index(i) >= pcOffsetInNativeBinary
}) })
if index == 0 && len(srcMap.irOperationSourceOffsetsInWasmBinary) > 0 { if index == 0 && bitpack.OffsetArrayLen(srcMap.irOperationSourceOffsetsInWasmBinary) > 0 {
// When pc is the beginning of the function, the next IR // When pc is the beginning of the function, the next IR
// operation (returned by sort.Search) is the first of the // operation (returned by sort.Search) is the first of the
// offset map. // offset map.
return srcMap.irOperationSourceOffsetsInWasmBinary[0] return srcMap.irOperationSourceOffsetsInWasmBinary.Index(0)
} }
if index == n || index == 0 { // This case, somehow pc is not found in the source offset map. if index == n || index == 0 { // This case, somehow pc is not found in the source offset map.
return 0 return 0
} else { } else {
return srcMap.irOperationSourceOffsetsInWasmBinary[index-1] return srcMap.irOperationSourceOffsetsInWasmBinary.Index(index - 1)
} }
} }
@@ -1198,7 +1220,7 @@ func (f internalFunction) Definition() api.FunctionDefinition {
// SourceOffsetForPC implements the same method as documented on experimental.InternalFunction. // SourceOffsetForPC implements the same method as documented on experimental.InternalFunction.
func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 {
p := f.parent p := f.parent
if len(p.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) == 0 { if bitpack.OffsetArrayLen(p.sourceOffsetMap.irOperationSourceOffsetsInWasmBinary) == 0 {
return 0 // source not available return 0 // source not available
} }
return f.getSourceOffsetInWasmBinary(uint64(pc)) return f.getSourceOffsetInWasmBinary(uint64(pc))
@@ -1232,7 +1254,11 @@ type asmNodes struct {
nodes []asm.Node nodes []asm.Node
} }
func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes *asmNodes) (body []byte, spCeil uint64, sm sourceOffsetMap, err error) { type offsets struct {
values []uint64
}
func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes *asmNodes, offsets *offsets) (body []byte, spCeil uint64, sm sourceOffsetMap, err error) {
if err = cmp.compilePreamble(); err != nil { if err = cmp.compilePreamble(); err != nil {
err = fmt.Errorf("failed to emit preamble: %w", err) err = fmt.Errorf("failed to emit preamble: %w", err)
return return
@@ -1563,13 +1589,13 @@ func compileWasmFunction(cmp compiler, ir *wazeroir.CompilationResult, asmNodes
} }
if needSourceOffsets { if needSourceOffsets {
offsetInNativeBin := make([]uint64, len(irOpBegins)) offsetInNativeBin := append(offsets.values[:0], make([]uint64, len(irOpBegins))...)
offsets.values = offsetInNativeBin
for i, nop := range irOpBegins { for i, nop := range irOpBegins {
offsetInNativeBin[i] = nop.OffsetInBinary() offsetInNativeBin[i] = nop.OffsetInBinary()
} }
sm.irOperationOffsetsInNativeBinary = offsetInNativeBin sm.irOperationOffsetsInNativeBinary = bitpack.NewOffsetArray(offsetInNativeBin)
sm.irOperationSourceOffsetsInWasmBinary = make([]uint64, len(ir.IROperationSourceOffsetsInWasmBinary)) sm.irOperationSourceOffsetsInWasmBinary = bitpack.NewOffsetArray(ir.IROperationSourceOffsetsInWasmBinary)
copy(sm.irOperationSourceOffsetsInWasmBinary, ir.IROperationSourceOffsetsInWasmBinary)
} }
return return
} }

View File

@@ -12,6 +12,7 @@ import (
"github.com/tetratelabs/wazero/api" "github.com/tetratelabs/wazero/api"
"github.com/tetratelabs/wazero/experimental" "github.com/tetratelabs/wazero/experimental"
"github.com/tetratelabs/wazero/experimental/logging" "github.com/tetratelabs/wazero/experimental/logging"
"github.com/tetratelabs/wazero/internal/bitpack"
"github.com/tetratelabs/wazero/internal/platform" "github.com/tetratelabs/wazero/internal/platform"
"github.com/tetratelabs/wazero/internal/testing/enginetest" "github.com/tetratelabs/wazero/internal/testing/enginetest"
"github.com/tetratelabs/wazero/internal/testing/require" "github.com/tetratelabs/wazero/internal/testing/require"
@@ -671,12 +672,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) {
pc: 4000, pc: 4000,
codeInitialAddress: 3999, codeInitialAddress: 3999,
srcMap: sourceOffsetMap{ srcMap: sourceOffsetMap{
irOperationOffsetsInNativeBinary: []uint64{ irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{
0 /*4000-3999=1 exists here*/, 5, 8, 15, 0 /*4000-3999=1 exists here*/, 5, 8, 15,
}, }),
irOperationSourceOffsetsInWasmBinary: []uint64{ irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{
10, 100, 800, 12344, 10, 100, 800, 12344,
}, }),
}, },
exp: 10, exp: 10,
}, },
@@ -685,12 +686,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) {
pc: 100, pc: 100,
codeInitialAddress: 90, codeInitialAddress: 90,
srcMap: sourceOffsetMap{ srcMap: sourceOffsetMap{
irOperationOffsetsInNativeBinary: []uint64{ irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{
0, 5, 8 /*100-90=10 exists here*/, 15, 0, 5, 8 /*100-90=10 exists here*/, 15,
}, }),
irOperationSourceOffsetsInWasmBinary: []uint64{ irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{
10, 100, 800, 12344, 10, 100, 800, 12344,
}, }),
}, },
exp: 800, exp: 800,
}, },
@@ -699,12 +700,12 @@ func TestFunction_getSourceOffsetInWasmBinary(t *testing.T) {
pc: 9999, pc: 9999,
codeInitialAddress: 8999, codeInitialAddress: 8999,
srcMap: sourceOffsetMap{ srcMap: sourceOffsetMap{
irOperationOffsetsInNativeBinary: []uint64{ irOperationOffsetsInNativeBinary: bitpack.NewOffsetArray([]uint64{
0, 5, 8, 15, /*9999-8999=1000 exists here*/ 0, 5, 8, 15, /*9999-8999=1000 exists here*/
}, }),
irOperationSourceOffsetsInWasmBinary: []uint64{ irOperationSourceOffsetsInWasmBinary: bitpack.NewOffsetArray([]uint64{
10, 100, 800, 12344, 10, 100, 800, 12344,
}, }),
}, },
exp: 12344, exp: 12344,
}, },