Files
wazero/internal/engine/compiler/compiler_test.go
Takeshi Yoneda 867459d7d5 compiler: mmap per module instead of per function (#1377)
This changes the mmap strategy used in the compiler backend.
Previously, we used mmap syscall once per function and allocated the 
executable pages each time. Basically, mmap can only allocate the 
boundary of the page size of the underlying os. Even if the requested 
executable is smaller than the page size, the entire page is marked as 
executable and won't be reused by Go runtime. Therefore, we wasted 
roughly `(len(body)%osPageSize)*function`.

Even though we still need to align each function on 16 bytes boundary
when mmaping per module, the wasted space is much smaller than before.

The following benchmark results shows that this improves the overall 
compilation performance while showing the heap usage increased. 
However, the increased heap usage is totally offset by the hidden wasted
memory page which is not measured by Go's -benchmem.
Actually, when I did the experiments, I observed that roughly 20~30mb are
wasted on arm64 previously which is larger than the increased heap usage
in this result. More importantly, this increased heap usage is a target of GC
and should be ignorable in the long-running program vs the wasted page 
is persistent until the CompiledModule is closed.

Not only the actual compilation time, the result indicates that this could 
improve the overall Go runtime's performance maybe thanks to not abusing
runtime.Finalizer since you can see this improves the subsequent interpreter 
benchmark results.

```
goos: darwin
goarch: arm64
pkg: github.com/tetratelabs/wazero/internal/integration_test/bench
                                   │   old.txt   │              new.txt              │
                                   │   sec/op    │   sec/op     vs base              │
Compilation_sqlite3/compiler-10      183.4m ± 0%   175.9m ± 2%  -4.10% (p=0.001 n=7)
Compilation_sqlite3/interpreter-10   61.59m ± 0%   59.57m ± 0%  -3.29% (p=0.001 n=7)
geomean                              106.3m        102.4m       -3.69%

                                   │   old.txt    │               new.txt               │
                                   │     B/op     │     B/op      vs base               │
Compilation_sqlite3/compiler-10      42.93Mi ± 0%   54.33Mi ± 0%  +26.56% (p=0.001 n=7)
Compilation_sqlite3/interpreter-10   51.75Mi ± 0%   51.75Mi ± 0%   -0.01% (p=0.001 n=7)
geomean                              47.13Mi        53.02Mi       +12.49%

                                   │   old.txt   │              new.txt              │
                                   │  allocs/op  │  allocs/op   vs base              │
Compilation_sqlite3/compiler-10      26.07k ± 0%   26.06k ± 0%       ~ (p=0.149 n=7)
Compilation_sqlite3/interpreter-10   13.90k ± 0%   13.90k ± 0%       ~ (p=0.421 n=7)
geomean                              19.03k        19.03k       -0.02%


goos: linux
goarch: amd64
pkg: github.com/tetratelabs/wazero/internal/integration_test/bench
cpu: AMD Ryzen 9 3950X 16-Core Processor
                                   │   old.txt   │              new.txt               │
                                   │   sec/op    │   sec/op     vs base               │
Compilation_sqlite3/compiler-32      384.4m ± 2%   373.0m ± 4%   -2.97% (p=0.001 n=7)
Compilation_sqlite3/interpreter-32   86.09m ± 4%   65.05m ± 2%  -24.44% (p=0.001 n=7)
geomean                              181.9m        155.8m       -14.38%

                                   │   old.txt    │               new.txt               │
                                   │     B/op     │     B/op      vs base               │
Compilation_sqlite3/compiler-32      49.40Mi ± 0%   59.91Mi ± 0%  +21.29% (p=0.001 n=7)
Compilation_sqlite3/interpreter-32   51.77Mi ± 0%   51.76Mi ± 0%   -0.02% (p=0.001 n=7)
geomean                              50.57Mi        55.69Mi       +10.12%

                                   │   old.txt   │              new.txt              │
                                   │  allocs/op  │  allocs/op   vs base              │
Compilation_sqlite3/compiler-32      28.70k ± 0%   28.70k ± 0%       ~ (p=0.925 n=7)
Compilation_sqlite3/interpreter-32   14.00k ± 0%   14.00k ± 0%  -0.04% (p=0.010 n=7)
geomean                              20.05k        20.04k       -0.02%
```

resolves #1060 

Signed-off-by: Takeshi Yoneda <takeshi@tetrate.io>
2023-04-26 14:11:37 +09:00

309 lines
12 KiB
Go

package compiler
import (
"fmt"
"math"
"os"
"runtime"
"testing"
"unsafe"
"github.com/tetratelabs/wazero/internal/platform"
"github.com/tetratelabs/wazero/internal/testing/require"
"github.com/tetratelabs/wazero/internal/wasm"
"github.com/tetratelabs/wazero/internal/wazeroir"
)
func TestMain(m *testing.M) {
if !platform.CompilerSupported() {
os.Exit(0)
}
os.Exit(m.Run())
}
// Ensures that the offset consts do not drift when we manipulate the target
// structs.
//
// Note: This is a package initializer as many tests could fail if these
// constants are misaligned, hiding the root cause.
func init() {
var me moduleEngine
requireEqual := func(expected, actual int, name string) {
if expected != actual {
panic(fmt.Sprintf("%s: expected %d, but was %d", name, expected, actual))
}
}
requireEqual(int(unsafe.Offsetof(me.functions)), moduleEngineFunctionsOffset, "moduleEngineFunctionsOffset")
var ce callEngine
// Offsets for callEngine.moduleContext.
requireEqual(int(unsafe.Offsetof(ce.fn)), callEngineModuleContextFnOffset, "callEngineModuleContextFnOffset")
requireEqual(int(unsafe.Offsetof(ce.moduleInstance)), callEngineModuleContextModuleInstanceOffset, "callEngineModuleContextModuleInstanceOffset")
requireEqual(int(unsafe.Offsetof(ce.globalElement0Address)), callEngineModuleContextGlobalElement0AddressOffset, "callEngineModuleContextGlobalElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.memoryElement0Address)), callEngineModuleContextMemoryElement0AddressOffset, "callEngineModuleContextMemoryElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.memorySliceLen)), callEngineModuleContextMemorySliceLenOffset, "callEngineModuleContextMemorySliceLenOffset")
requireEqual(int(unsafe.Offsetof(ce.memoryInstance)), callEngineModuleContextMemoryInstanceOffset, "callEngineModuleContextMemoryInstanceOffset")
requireEqual(int(unsafe.Offsetof(ce.tablesElement0Address)), callEngineModuleContextTablesElement0AddressOffset, "callEngineModuleContextTablesElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.functionsElement0Address)), callEngineModuleContextFunctionsElement0AddressOffset, "callEngineModuleContextFunctionsElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.typeIDsElement0Address)), callEngineModuleContextTypeIDsElement0AddressOffset, "callEngineModuleContextTypeIDsElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.dataInstancesElement0Address)), callEngineModuleContextDataInstancesElement0AddressOffset, "callEngineModuleContextDataInstancesElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.elementInstancesElement0Address)), callEngineModuleContextElementInstancesElement0AddressOffset, "callEngineModuleContextElementInstancesElement0AddressOffset")
// Offsets for callEngine.stackContext
requireEqual(int(unsafe.Offsetof(ce.stackPointer)), callEngineStackContextStackPointerOffset, "callEngineStackContextStackPointerOffset")
requireEqual(int(unsafe.Offsetof(ce.stackBasePointerInBytes)), callEngineStackContextStackBasePointerInBytesOffset, "callEngineStackContextStackBasePointerInBytesOffset")
requireEqual(int(unsafe.Offsetof(ce.stackElement0Address)), callEngineStackContextStackElement0AddressOffset, "callEngineStackContextStackElement0AddressOffset")
requireEqual(int(unsafe.Offsetof(ce.stackLenInBytes)), callEngineStackContextStackLenInBytesOffset, "callEngineStackContextStackLenInBytesOffset")
// Offsets for callEngine.exitContext.
requireEqual(int(unsafe.Offsetof(ce.statusCode)), callEngineExitContextNativeCallStatusCodeOffset, "callEngineExitContextNativeCallStatusCodeOffset")
requireEqual(int(unsafe.Offsetof(ce.builtinFunctionCallIndex)), callEngineExitContextBuiltinFunctionCallIndexOffset, "callEngineExitContextBuiltinFunctionCallIndexOffset")
requireEqual(int(unsafe.Offsetof(ce.returnAddress)), callEngineExitContextReturnAddressOffset, "callEngineExitContextReturnAddressOffset")
requireEqual(int(unsafe.Offsetof(ce.callerModuleInstance)), callEngineExitContextCallerModuleInstanceOffset, "callEngineExitContextCallerModuleInstanceOffset")
// Size and offsets for callFrame.
var frame callFrame
requireEqual(int(unsafe.Sizeof(frame))/8, callFrameDataSizeInUint64, "callFrameDataSize")
// Offsets for code.
var f function
requireEqual(int(unsafe.Offsetof(f.codeInitialAddress)), functionCodeInitialAddressOffset, "functionCodeInitialAddressOffset")
requireEqual(int(unsafe.Offsetof(f.moduleInstance)), functionModuleInstanceOffset, "functionModuleInstanceOffset")
requireEqual(int(unsafe.Offsetof(f.typeID)), functionTypeIDOffset, "functionTypeIDOffset")
requireEqual(int(unsafe.Sizeof(f)), functionSize, "functionModuleInstanceOffset")
// Offsets for wasm.ModuleInstance.
var moduleInstance wasm.ModuleInstance
requireEqual(int(unsafe.Offsetof(moduleInstance.Globals)), moduleInstanceGlobalsOffset, "moduleInstanceGlobalsOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.MemoryInstance)), moduleInstanceMemoryOffset, "moduleInstanceMemoryOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.Tables)), moduleInstanceTablesOffset, "moduleInstanceTablesOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.Engine)), moduleInstanceEngineOffset, "moduleInstanceEngineOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.TypeIDs)), moduleInstanceTypeIDsOffset, "moduleInstanceTypeIDsOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.DataInstances)), moduleInstanceDataInstancesOffset, "moduleInstanceDataInstancesOffset")
requireEqual(int(unsafe.Offsetof(moduleInstance.ElementInstances)), moduleInstanceElementInstancesOffset, "moduleInstanceElementInstancesOffset")
// Offsets for wasm.Table.
var tableInstance wasm.TableInstance
requireEqual(int(unsafe.Offsetof(tableInstance.References)), tableInstanceTableOffset, "tableInstanceTableOffset")
// We add "+8" to get the length of Tables[0].Table
// since the slice header is laid out as {Data uintptr, Len int64, Cap int64} on memory.
requireEqual(int(unsafe.Offsetof(tableInstance.References)+8), tableInstanceTableLenOffset, "tableInstanceTableLenOffset")
// Offsets for wasm.Memory
var memoryInstance wasm.MemoryInstance
requireEqual(int(unsafe.Offsetof(memoryInstance.Buffer)), memoryInstanceBufferOffset, "memoryInstanceBufferOffset")
// "+8" because the slice header is laid out as {Data uintptr, Len int64, Cap int64} on memory.
requireEqual(int(unsafe.Offsetof(memoryInstance.Buffer)+8), memoryInstanceBufferLenOffset, "memoryInstanceBufferLenOffset")
// Offsets for wasm.GlobalInstance
var globalInstance wasm.GlobalInstance
requireEqual(int(unsafe.Offsetof(globalInstance.Val)), globalInstanceValueOffset, "globalInstanceValueOffset")
var dataInstance wasm.DataInstance
requireEqual(int(unsafe.Sizeof(dataInstance)), dataInstanceStructSize, "dataInstanceStructSize")
var elementInstance wasm.ElementInstance
requireEqual(int(unsafe.Sizeof(elementInstance)), elementInstanceStructSize, "elementInstanceStructSize")
var pointer uintptr
requireEqual(int(unsafe.Sizeof(pointer)), 1<<pointerSizeLog2, "pointerSizeLog2")
}
type compilerEnv struct {
me *moduleEngine
ce *callEngine
moduleInstance *wasm.ModuleInstance
}
func (j *compilerEnv) stackTopAsUint32() uint32 {
return uint32(j.stack()[j.ce.stackContext.stackPointer-1])
}
func (j *compilerEnv) stackTopAsInt32() int32 {
return int32(j.stack()[j.ce.stackContext.stackPointer-1])
}
func (j *compilerEnv) stackTopAsUint64() uint64 {
return j.stack()[j.ce.stackContext.stackPointer-1]
}
func (j *compilerEnv) stackTopAsInt64() int64 {
return int64(j.stack()[j.ce.stackContext.stackPointer-1])
}
func (j *compilerEnv) stackTopAsFloat32() float32 {
return math.Float32frombits(uint32(j.stack()[j.ce.stackContext.stackPointer-1]))
}
func (j *compilerEnv) stackTopAsFloat64() float64 {
return math.Float64frombits(j.stack()[j.ce.stackContext.stackPointer-1])
}
func (j *compilerEnv) stackTopAsV128() (lo uint64, hi uint64) {
st := j.stack()
return st[j.ce.stackContext.stackPointer-2], st[j.ce.stackContext.stackPointer-1]
}
func (j *compilerEnv) memory() []byte {
return j.moduleInstance.MemoryInstance.Buffer
}
func (j *compilerEnv) stack() []uint64 {
return j.ce.stack
}
func (j *compilerEnv) compilerStatus() nativeCallStatusCode {
return j.ce.exitContext.statusCode
}
func (j *compilerEnv) builtinFunctionCallAddress() wasm.Index {
return j.ce.exitContext.builtinFunctionCallIndex
}
// stackPointer returns the stack pointer minus the call frame.
func (j *compilerEnv) stackPointer() uint64 {
return j.ce.stackContext.stackPointer - callFrameDataSizeInUint64
}
func (j *compilerEnv) stackBasePointer() uint64 {
return j.ce.stackContext.stackBasePointerInBytes >> 3
}
func (j *compilerEnv) setStackPointer(sp uint64) {
j.ce.stackContext.stackPointer = sp
}
func (j *compilerEnv) addGlobals(g ...*wasm.GlobalInstance) {
j.moduleInstance.Globals = append(j.moduleInstance.Globals, g...)
}
func (j *compilerEnv) globals() []*wasm.GlobalInstance {
return j.moduleInstance.Globals
}
func (j *compilerEnv) addTable(table *wasm.TableInstance) {
j.moduleInstance.Tables = append(j.moduleInstance.Tables, table)
}
func (j *compilerEnv) setStackBasePointer(sp uint64) {
j.ce.stackContext.stackBasePointerInBytes = sp << 3
}
func (j *compilerEnv) module() *wasm.ModuleInstance {
return j.moduleInstance
}
func (j *compilerEnv) moduleEngine() *moduleEngine {
return j.me
}
func (j *compilerEnv) callEngine() *callEngine {
return j.ce
}
func (j *compilerEnv) exec(machineCode []byte) {
executable := requireExecutable(machineCode)
f := &function{
parent: &compiledFunction{parent: &compiledModule{executable: executable}},
codeInitialAddress: uintptr(unsafe.Pointer(&executable[0])),
moduleInstance: j.moduleInstance,
}
j.ce.initialFn = f
j.ce.fn = f
nativecall(
uintptr(unsafe.Pointer(&executable[0])),
uintptr(unsafe.Pointer(j.ce)),
j.moduleInstance,
)
}
func (j *compilerEnv) requireNewCompiler(t *testing.T, functionType *wasm.FunctionType, fn func() compiler, ir *wazeroir.CompilationResult) compilerImpl {
requireSupportedOSArch(t)
if ir == nil {
ir = &wazeroir.CompilationResult{
LabelCallers: map[wazeroir.Label]uint32{},
}
}
c := fn()
c.Init(functionType, ir, false)
ret, ok := c.(compilerImpl)
require.True(t, ok)
return ret
}
// compilerImpl is the interface used for architecture-independent unit tests in this pkg.
// This is currently implemented by amd64 and arm64.
type compilerImpl interface {
compiler
compileExitFromNativeCode(nativeCallStatusCode)
compileMaybeGrowStack() error
compileReturnFunction() error
assignStackPointerCeil(uint64)
setStackPointerCeil(uint64)
compileReleaseRegisterToStack(loc *runtimeValueLocation)
setRuntimeValueLocationStack(*runtimeValueLocationStack)
compileEnsureOnRegister(loc *runtimeValueLocation) error
compileModuleContextInitialization() error
}
const defaultMemoryPageNumInTest = 1
func newCompilerEnvironment() *compilerEnv {
me := &moduleEngine{}
return &compilerEnv{
me: me,
moduleInstance: &wasm.ModuleInstance{
MemoryInstance: &wasm.MemoryInstance{Buffer: make([]byte, wasm.MemoryPageSize*defaultMemoryPageNumInTest)},
Tables: []*wasm.TableInstance{},
Globals: []*wasm.GlobalInstance{},
Engine: me,
},
ce: me.newCallEngine(initialStackSize, &function{parent: &compiledFunction{parent: &compiledModule{}}}),
}
}
// requireRuntimeLocationStackPointerEqual ensures that the compiler's runtimeValueLocationStack has
// the expected stack pointer value relative to the call frame.
func requireRuntimeLocationStackPointerEqual(t *testing.T, expSP uint64, c compiler) {
require.Equal(t, expSP, c.runtimeValueLocationStack().sp-callFrameDataSizeInUint64)
}
// TestCompileI32WrapFromI64 is the regression test for https://github.com/tetratelabs/wazero/issues/1008
func TestCompileI32WrapFromI64(t *testing.T) {
c := newCompiler()
c.Init(&wasm.FunctionType{}, nil, false)
// Push the original i64 value.
loc := c.runtimeValueLocationStack().pushRuntimeValueLocationOnStack()
loc.valueType = runtimeValueTypeI64
// Wrap it as the i32, and this should result in having runtimeValueTypeI32 on top of the stack.
err := c.compileI32WrapFromI64()
require.NoError(t, err)
require.Equal(t, runtimeValueTypeI32, loc.valueType)
}
func operationPtr(operation wazeroir.UnionOperation) *wazeroir.UnionOperation {
return &operation
}
func requireExecutable(original []byte) (executable []byte) {
executable, err := platform.MmapCodeSegment(len(original))
if err != nil {
panic(err)
}
copy(executable, original)
if runtime.GOARCH == "arm64" {
err = platform.MprotectRX(executable)
if err != nil {
panic(err)
}
}
return executable
}