diff --git a/internal/engine/compiler/compiler_drop_test.go b/internal/engine/compiler/compiler_drop_test.go
index cb24c2c2..8ebbc5e9 100644
--- a/internal/engine/compiler/compiler_drop_test.go
+++ b/internal/engine/compiler/compiler_drop_test.go
@@ -6,6 +6,7 @@ import (
"github.com/tetratelabs/wazero/internal/asm"
"github.com/tetratelabs/wazero/internal/testing/require"
+ "github.com/tetratelabs/wazero/internal/wasm"
"github.com/tetratelabs/wazero/internal/wazeroir"
)
@@ -19,6 +20,7 @@ func Test_compileDropRange(t *testing.T) {
t.Run("start at the top", func(t *testing.T) {
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
// Use up all unreserved registers.
for _, reg := range unreservedGeneralPurposeRegisters {
@@ -93,6 +95,7 @@ func Test_getTemporariesForStackedLiveValues(t *testing.T) {
t.Run("no stacked values", func(t *testing.T) {
liveValues := []runtimeValueLocation{{register: 1}, {register: 2}}
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
gpTmp, vecTmp, err := getTemporariesForStackedLiveValues(c, liveValues)
require.NoError(t, err)
@@ -111,6 +114,7 @@ func Test_getTemporariesForStackedLiveValues(t *testing.T) {
{valueType: runtimeValueTypeI64},
}
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
if !freeRegisterExists {
// Use up all the unreserved gp registers.
@@ -151,6 +155,7 @@ func Test_getTemporariesForStackedLiveValues(t *testing.T) {
{valueType: runtimeValueTypeV128Hi},
}
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
if !freeRegisterExists {
// Use up all the unreserved gp registers.
@@ -185,6 +190,7 @@ func Test_migrateLiveValue(t *testing.T) {
t.Run("already on register", func(t *testing.T) {
// In this case, we don't use tmp registers.
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
// Push the dummy values.
for i := 0; i < 10; i++ {
diff --git a/internal/engine/compiler/compiler_stack_test.go b/internal/engine/compiler/compiler_stack_test.go
index 4cbf5ddd..61144b44 100644
--- a/internal/engine/compiler/compiler_stack_test.go
+++ b/internal/engine/compiler/compiler_stack_test.go
@@ -34,7 +34,7 @@ func TestCompiler_releaseRegisterToStack(t *testing.T) {
require.NoError(t, err)
// Set up the location stack so that we push the const on the specified height.
- s := runtimeValueLocationStack{
+ s := &runtimeValueLocationStack{
sp: tc.stackPointer,
stack: make([]runtimeValueLocation, tc.stackPointer),
unreservedVectorRegisters: unreservedVectorRegisters,
@@ -527,6 +527,11 @@ func TestCompiler_compileSelect(t *testing.T) {
t.Run(fmt.Sprintf("x1=0x%x,x2=0x%x", vals[0], vals[1]), func(t *testing.T) {
env := newCompilerEnvironment()
compiler := env.requireNewCompiler(t, &wasm.FunctionType{}, newCompiler, nil)
+
+ // To make the assertion below stable, we preallocate the underlying stack,
+ // so that the pointer to the entry will remain stable.
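// Aside: the preallocation added below matters because a pointer taken into a
// slice is only stable while the backing array is not reallocated; append
// beyond capacity moves the data. A minimal standalone sketch of that behavior
// (plain Go, no wazero types; all names here are illustrative):
package main

import "fmt"

type entry struct{ v int }

func main() {
	s := make([]entry, 0, 1)
	s = append(s, entry{1})
	p := &s[0]
	s = append(s, entry{2}) // exceeds capacity: the backing array is reallocated
	fmt.Println(p == &s[0]) // false: p points into the old array

	big := make([]entry, 0, 100) // preallocated, like the test below
	big = append(big, entry{1})
	q := &big[0]
	big = append(big, entry{2}) // no growth, no reallocation
	fmt.Println(q == &big[0])   // true: the pointer stays stable
}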
+ compiler.runtimeValueLocationStack().stack = make([]runtimeValueLocation, 100)
+
err := compiler.compilePreamble()
require.NoError(t, err)
diff --git a/internal/engine/compiler/compiler_test.go b/internal/engine/compiler/compiler_test.go
index 2cc096e9..03ae13f4 100644
--- a/internal/engine/compiler/compiler_test.go
+++ b/internal/engine/compiler/compiler_test.go
@@ -247,7 +247,7 @@ type compilerImpl interface {
assignStackPointerCeil(uint64)
setStackPointerCeil(uint64)
compileReleaseRegisterToStack(loc *runtimeValueLocation)
- setRuntimeValueLocationStack(runtimeValueLocationStack)
+ setRuntimeValueLocationStack(*runtimeValueLocationStack)
compileEnsureOnRegister(loc *runtimeValueLocation) error
compileModuleContextInitialization() error
}
@@ -277,6 +277,8 @@ func requireRuntimeLocationStackPointerEqual(t *testing.T, expSP uint64, c compi
// TestCompileI32WrapFromI64 is the regression test for https://github.com/tetratelabs/wazero/issues/1008
func TestCompileI32WrapFromI64(t *testing.T) {
c := newCompiler()
+ c.Init(&wasm.FunctionType{}, nil, false)
+
// Push the original i64 value.
loc := c.runtimeValueLocationStack().pushRuntimeValueLocationOnStack()
loc.valueType = runtimeValueTypeI64
diff --git a/internal/engine/compiler/compiler_value_location.go b/internal/engine/compiler/compiler_value_location.go
index f893542d..30036fed 100644
--- a/internal/engine/compiler/compiler_value_location.go
+++ b/internal/engine/compiler/compiler_value_location.go
@@ -111,7 +111,6 @@ func (v *runtimeValueLocation) String() string {
func newRuntimeValueLocationStack() runtimeValueLocationStack {
return runtimeValueLocationStack{
- stack: make([]runtimeValueLocation, 10),
unreservedVectorRegisters: unreservedVectorRegisters,
unreservedGeneralPurposeRegisters: unreservedGeneralPurposeRegisters,
}
@@ -141,14 +140,13 @@ type runtimeValueLocationStack struct {
unreservedGeneralPurposeRegisters, unreservedVectorRegisters []asm.Register
}
-func (v *runtimeValueLocationStack) initialized() bool {
- return len(v.unreservedGeneralPurposeRegisters) > 0
-}
-
func (v *runtimeValueLocationStack) reset() {
- v.stackPointerCeil, v.sp = 0, 0
- v.stack = v.stack[:0]
- v.usedRegisters = usedRegistersMask(0)
+ stack := v.stack[:0]
+ *v = runtimeValueLocationStack{
+ unreservedVectorRegisters: unreservedVectorRegisters,
+ unreservedGeneralPurposeRegisters: unreservedGeneralPurposeRegisters,
+ stack: stack,
+ }
}
func (v *runtimeValueLocationStack) String() string {
@@ -160,16 +158,19 @@ func (v *runtimeValueLocationStack) String() string {
return fmt.Sprintf("sp=%d, stack=[%s], used_registers=[%s]", v.sp, strings.Join(stackStr, ","), strings.Join(usedRegisters, ","))
}
-func (v *runtimeValueLocationStack) clone() runtimeValueLocationStack {
- ret := runtimeValueLocationStack{}
- ret.sp = v.sp
- ret.usedRegisters = v.usedRegisters
- ret.stack = make([]runtimeValueLocation, len(v.stack))
- copy(ret.stack, v.stack)
- ret.stackPointerCeil = v.stackPointerCeil
- ret.unreservedGeneralPurposeRegisters = v.unreservedGeneralPurposeRegisters
- ret.unreservedVectorRegisters = v.unreservedVectorRegisters
- return ret
+// cloneFrom clones the values of `from` into the receiver, except for the underlying .stack slice.
+// The contents of .stack are copied from `from` into the receiver's existing slice,
+// growing the underlying slice only if necessary.
+func (v *runtimeValueLocationStack) cloneFrom(from runtimeValueLocationStack) {
+ // Assign the same values to all fields except for the stack, whose allocation we want to reuse.
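// Aside: in isolation, the reuse-then-grow copy that the body below performs
// looks like this with a generic element type (an illustrative sketch, not
// part of the change; requires Go 1.18+ for generics):
func copyReusing[T any](dst, src []T) []T {
	dst = dst[:cap(dst)] // expose the full capacity so the shortfall is minimal
	if diff := len(src) - len(dst); diff > 0 {
		dst = append(dst, make([]T, diff)...)
	}
	copy(dst, src)
	return dst[:len(src)]
}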
+ prev := v.stack
+ *v = from
+ v.stack = prev[:cap(prev)] // Expand the length to the capacity so that we can minimize the "diff" below.
+ // Copy the contents of the stack.
+ if diff := int(from.sp) - len(v.stack); diff > 0 {
+ v.stack = append(v.stack, make([]runtimeValueLocation, diff)...)
+ }
+ copy(v.stack, from.stack[:from.sp])
}
// pushRuntimeValueLocationOnRegister creates a new runtimeValueLocation with a given register and pushes onto
diff --git a/internal/engine/compiler/compiler_value_location_test.go b/internal/engine/compiler/compiler_value_location_test.go
index d381db60..64710467 100644
--- a/internal/engine/compiler/compiler_value_location_test.go
+++ b/internal/engine/compiler/compiler_value_location_test.go
@@ -2,7 +2,6 @@ package compiler
import (
"testing"
- "unsafe"
"github.com/tetratelabs/wazero/internal/asm"
"github.com/tetratelabs/wazero/internal/testing/require"
@@ -42,17 +41,6 @@ func TestRuntimeValueLocationStack_basic(t *testing.T) {
s.releaseRegister(loc)
require.False(t, s.usedRegisters.exist(loc.register))
require.Equal(t, asm.NilRegister, loc.register)
- // Clone.
- cloned := s.clone()
- require.Equal(t, s.usedRegisters, cloned.usedRegisters)
- require.Equal(t, s.unreservedGeneralPurposeRegisters, cloned.unreservedGeneralPurposeRegisters)
- require.Equal(t, s.unreservedVectorRegisters, cloned.unreservedVectorRegisters)
- require.Equal(t, len(s.stack), len(cloned.stack))
- require.Equal(t, s.sp, cloned.sp)
- for i := 0; i < int(s.sp); i++ {
- actual, exp := &s.stack[i], &cloned.stack[i]
- require.NotEqual(t, uintptr(unsafe.Pointer(exp)), uintptr(unsafe.Pointer(actual)))
- }
// Check the max stack pointer.
for i := 0; i < 1000; i++ {
s.pushRuntimeValueLocationOnStack()
@@ -207,7 +195,9 @@ func TestRuntimeValueLocation_pushCallFrame(t *testing.T) {
t.Run(sig.String(), func(t *testing.T) {
s := newRuntimeValueLocationStack()
// pushCallFrame assumes that the parameters are already pushed.
- s.sp += uint64(sig.ParamNumInUint64)
+ for i := 0; i < sig.ParamNumInUint64; i++ {
+ _ = s.pushRuntimeValueLocationOnStack()
+ }
retAddr, stackBasePointer, fn := s.pushCallFrame(sig)
@@ -230,3 +220,50 @@ func Test_usedRegistersMask(t *testing.T) {
require.False(t, mask.exist(r))
}
}
+
+func TestRuntimeValueLocation_cloneFrom(t *testing.T) {
+ t.Run("sp>cap", func(t *testing.T) {
+ v := runtimeValueLocationStack{stack: make([]runtimeValueLocation, 0, 3)}
+ orig := v.stack[:cap(v.stack)]
+ v.cloneFrom(runtimeValueLocationStack{sp: 5, usedRegisters: 0xffff, stack: []runtimeValueLocation{
+ {register: 5}, {register: 4}, {register: 3}, {register: 2}, {register: 1},
+ }})
+ require.Equal(t, uint64(5), v.sp)
+ require.Equal(t, usedRegistersMask(0xffff), v.usedRegisters)
+ // The underlying stack should have changed since sp=5 > cap(v.stack).
+ require.NotEqual(t, &orig[0], &v.stack[0])
+ require.Equal(t, v.stack[0].register, asm.Register(5))
+ require.Equal(t, v.stack[1].register, asm.Register(4))
+ require.Equal(t, v.stack[2].register, asm.Register(3))
+ require.Equal(t, v.stack[3].register, asm.Register(2))
+ require.Equal(t, v.stack[4].register, asm.Register(1))
+ })
+}
diff --git a/internal/engine/compiler/impl_amd64.go b/internal/engine/compiler/impl_amd64.go
index 60640caa..9926072a 100644
--- a/internal/engine/compiler/impl_amd64.go
+++ b/internal/engine/compiler/impl_amd64.go
@@ -87,7 +87,7 @@ type amd64Compiler struct {
cpuFeatures platform.CpuFeatureFlags
// locationStack holds the state of the wazeroir virtual stack,
// where each item is either placed in a register or on the actual memory stack.
- locationStack runtimeValueLocationStack
+ locationStack *runtimeValueLocationStack
// labels hold per wazeroir label specific information in this function.
labels [wazeroir.LabelKindNum][]amd64LabelInfo
// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
@@ -97,47 +97,71 @@ type amd64Compiler struct {
withListener bool
typ *wasm.FunctionType
br *bytes.Reader
+ // locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
+ // we cache it here, then reset it and assign it to .locationStack in the Init method.
+ locationStackForEntrypoint runtimeValueLocationStack
+ // frameIDMax tracks the maximum frame id seen per function.
+ frameIDMax int
+ brTableTmp []runtimeValueLocation
}
func newAmd64Compiler() compiler {
c := &amd64Compiler{
- assembler: amd64.NewAssembler(),
- locationStack: newRuntimeValueLocationStack(),
- cpuFeatures: platform.CpuFeatures,
- br: bytes.NewReader(nil),
+ assembler: amd64.NewAssembler(),
+ locationStackForEntrypoint: newRuntimeValueLocationStack(),
+ cpuFeatures: platform.CpuFeatures,
+ br: bytes.NewReader(nil),
}
return c
}
// Init implements compiler.Init.
func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
- assembler, locationStack := c.assembler, c.locationStack
- assembler.Reset()
- locationStack.reset()
- for i := range c.labels {
- c.labels[i] = c.labels[i][:0]
- }
+ c.assembler.Reset()
+ c.locationStackForEntrypoint.reset()
+ c.resetLabels()
*c = amd64Compiler{
- ir: ir,
- assembler: assembler,
- locationStack: locationStack,
- cpuFeatures: c.cpuFeatures,
- withListener: withListener,
- labels: c.labels,
- typ: typ,
- br: c.br,
+ ir: ir,
+ withListener: withListener,
+ typ: typ,
+ assembler: c.assembler,
+ cpuFeatures: c.cpuFeatures,
+ labels: c.labels,
+ br: c.br,
+ locationStackForEntrypoint: c.locationStackForEntrypoint,
+ brTableTmp: c.brTableTmp,
+ }
+
+ // Reuses the initial location stack for the compilation of subsequent functions.
+ c.locationStack = &c.locationStackForEntrypoint
+}
+
+// resetLabels resets the existing content in amd64Compiler.labels so that
+// we can reuse the allocated slices and stacks in subsequent compilations.
+func (c *amd64Compiler) resetLabels() {
+ for i := range c.labels {
+ for j := range c.labels[i] {
+ if j > c.frameIDMax {
+ // We only need to reset up to the maximum frame id, which makes compilation faster for large binaries.
+ break
+ }
+ l := &c.labels[i][j]
+ l.initialInstruction = nil
+ l.stackInitialized = false
+ l.initialStack.reset()
+ }
+ }
}
// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
- return &c.locationStack
+ return c.locationStack
}
// setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
// while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
// This is called when we branch into a different block.
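// Aside: the Init above follows a reset-and-reuse pattern, overwriting the
// struct wholesale so no per-function state leaks while threading the
// allocation-heavy members through. A reduced model of that pattern
// (illustrative field names only, not the real wazero fields):
type miniCompiler struct {
	perFunc int   // per-function state that must not leak across compilations
	scratch []int // allocation we want to keep warm across compilations
}

func (c *miniCompiler) init() {
	scratch := c.scratch[:0] // keep capacity, drop contents
	*c = miniCompiler{scratch: scratch}
}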
-func (c *amd64Compiler) setLocationStack(newStack runtimeValueLocationStack) {
+func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
if c.stackPointerCeil < c.locationStack.stackPointerCeil {
c.stackPointerCeil = c.locationStack.stackPointerCeil
}
@@ -163,17 +187,23 @@ type amd64LabelInfo struct {
// initialInstruction is the initial instruction for this label so other blocks can jump into it.
initialInstruction asm.Node
// initialStack is the initial value location stack from which we start compiling this label.
- initialStack runtimeValueLocationStack
+ initialStack runtimeValueLocationStack
+ stackInitialized bool
}
func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
kind := label.Kind()
frames := c.labels[kind]
frameID := label.FrameID()
+ if c.frameIDMax < frameID {
+ c.frameIDMax = frameID
+ }
// If the frameID is not allocated yet, expand the slice
// so that we can reduce allocations in subsequent compilations.
if diff := frameID - len(frames) + 1; diff > 0 {
- frames = append(frames, make([]amd64LabelInfo, diff*2)...)
+ for i := 0; i < diff; i++ {
+ frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
+ }
c.labels[kind] = frames
}
return &frames[frameID]
@@ -444,10 +474,9 @@ func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
// with the appropriate value locations. Note we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
targetLabel := c.label(target)
- if !targetLabel.initialStack.initialized() {
- // It seems unnecessary to clone as branchInto is always the tail of the current block.
- // TODO: verify ^^.
- targetLabel.initialStack = c.locationStack.clone()
+ if !targetLabel.stackInitialized {
+ targetLabel.initialStack.cloneFrom(*c.locationStack)
+ targetLabel.stackInitialized = true
}
jmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(target, jmp)
@@ -522,38 +551,24 @@ func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
// Note that .Else branch doesn't have ToDrop as .Else in reality
// corresponds to either If's Else block or Br_if's else block in Wasm.
- // Emit for else branches
- saved := c.locationStack
- c.setLocationStack(saved.clone())
+ // Emit the else branch.
if elseTarget.IsReturnTarget() {
if err := c.compileReturnFunction(); err != nil {
return err
}
} else {
- elseLabel := elseTarget
- if c.ir.LabelCallers[elseLabel] > 1 {
- // We can only re-use register state if when there's a single call-site.
- // Release existing values on registers to the stack if there's multiple ones to have
- // the consistent value location state at the beginning of label.
- if err := c.compileReleaseAllRegistersToStack(); err != nil {
- return err
- }
- }
- // Set the initial stack of the target label, so we can start compiling the label
- // with the appropriate value locations. Note we clone the stack here as we maybe
- // manipulate the stack before compiler reaches the label.
- labelInfo := c.label(elseLabel)
- if !labelInfo.initialStack.initialized() {
- labelInfo.initialStack = c.locationStack
+ labelInfo := c.label(elseTarget)
+ if !labelInfo.stackInitialized {
+ labelInfo.initialStack.cloneFrom(*c.locationStack)
+ labelInfo.stackInitialized = true
}
elseJmp := c.assembler.CompileJump(amd64.JMP)
- c.assignJumpTarget(elseLabel, elseJmp)
+ c.assignJumpTarget(elseTarget, elseJmp)
}
// Handle the then branch.
c.assembler.SetJumpTargetOnNext(jmpWithCond)
- c.setLocationStack(saved)
if err := compileDropRange(c, thenToDrop); err != nil {
return err
}
@@ -573,8 +588,9 @@ func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
// with the appropriate value locations. Note we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
labelInfo := c.label(thenLabel)
- if !labelInfo.initialStack.initialized() {
- labelInfo.initialStack = c.locationStack
+ if !labelInfo.stackInitialized {
+ labelInfo.initialStack.cloneFrom(*c.locationStack)
+ labelInfo.stackInitialized = true
}
thenJmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(thenLabel, thenJmp)
@@ -670,7 +686,12 @@ func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
// [Emit the code for each target and the default branch]
labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
- saved := c.locationStack
+
+ // Since we might end up with a different stack state in each branch,
+ // we save the initial stack state here and use the same initial state
+ // for each iteration.
+ initialLocationStack := c.getSavedTemporaryLocationStack()
+
for i := range labelInitialInstructions {
// Emit the initial instruction of each target.
// We use NOP as we don't yet know the next instruction in each label.
@@ -679,27 +700,31 @@ func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
targetLabel := wazeroir.Label(o.Us[i*2])
targetToDrop := o.Us[i*2+1]
- if i < len(labelInitialInstructions)-1 {
- // Clone the location stack so the branch-specific code doesn't
- // affect others.
- c.setLocationStack(saved.clone())
- } else {
- // If this is the default branch, we use the original one
- // as this is the last code in this block.
- c.setLocationStack(saved)
- }
if err = compileDropRange(c, targetToDrop); err != nil {
return err
}
if err = c.branchInto(targetLabel); err != nil {
return err
}
+ // After each iteration, reset the stack's state to initialLocationStack.
+ c.locationStack.cloneFrom(initialLocationStack)
}
c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
return nil
}
+func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
+ initialLocationStack := *c.locationStack // Take a copy!
+ // Use c.brTableTmp as the underlying stack so that we can reduce allocations.
+ if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
+ c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
+ }
+ copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
+ initialLocationStack.stack = c.brTableTmp
+ return initialLocationStack
+}
+
func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
jmpTargetLabel := c.label(label)
targetInst := jmpTargetLabel.initialInstruction
@@ -717,7 +742,7 @@ func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool
labelInfo := c.label(label)
// If initialStack is not set, that means this label has never been reached.
- if !labelInfo.initialStack.initialized() {
+ if !labelInfo.stackInitialized {
skipLabel = true
return
}
@@ -732,7 +757,7 @@ func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool
}
// Set the initial stack.
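// Aside: the br_table flow above snapshots the location stack once into the
// reusable brTableTmp buffer and re-clones it before emitting each arm, so no
// arm observes another arm's mutations. A reduced model (illustrative types
// only, not the real wazero ones):
type snapStack struct{ vals []int }

// emitArms restores the saved snapshot after each arm so that every arm starts
// from the same state without allocating a fresh clone per arm.
func emitArms(cur *snapStack, saved []int, arms int, emit func(*snapStack)) {
	for i := 0; i < arms; i++ {
		emit(cur)                                 // may push or drop values on cur
		cur.vals = append(cur.vals[:0], saved...) // restore for the next arm
	}
}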
- c.setLocationStack(labelInfo.initialStack)
+ c.setLocationStack(&labelInfo.initialStack)
return
}
diff --git a/internal/engine/compiler/impl_amd64_test.go b/internal/engine/compiler/impl_amd64_test.go
index 26a3b52e..2956c6f5 100644
--- a/internal/engine/compiler/impl_amd64_test.go
+++ b/internal/engine/compiler/impl_amd64_test.go
@@ -135,6 +135,11 @@ func TestAmd64Compiler_compile_Mul_Div_Rem(t *testing.T) {
const dxValue uint64 = 111111
compiler := env.requireNewCompiler(t, &wasm.FunctionType{}, newAmd64Compiler, nil).(*amd64Compiler)
+
+ // To make the assertion below stable, we preallocate the underlying stack,
+ // so that the pointer to the entry will remain stable.
+ compiler.runtimeValueLocationStack().stack = make([]runtimeValueLocation, 100)
+
err := compiler.compilePreamble()
require.NoError(t, err)
@@ -261,6 +266,11 @@ func TestAmd64Compiler_compile_Mul_Div_Rem(t *testing.T) {
env := newCompilerEnvironment()
compiler := env.requireNewCompiler(t, &wasm.FunctionType{}, newAmd64Compiler, nil).(*amd64Compiler)
+
+ // To make the assertion below stable, we preallocate the underlying stack,
+ // so that the pointer to the entry will remain stable.
+ compiler.runtimeValueLocationStack().stack = make([]runtimeValueLocation, 100)
+
err := compiler.compilePreamble()
require.NoError(t, err)
@@ -583,6 +593,130 @@ func (c *amd64Compiler) setStackPointerCeil(v uint64) {
}
// setRuntimeValueLocationStack implements compilerImpl.setRuntimeValueLocationStack for the amd64 architecture.
-func (c *amd64Compiler) setRuntimeValueLocationStack(s runtimeValueLocationStack) {
+func (c *amd64Compiler) setRuntimeValueLocationStack(s *runtimeValueLocationStack) {
c.locationStack = s
}
+
+func TestAmd64Compiler_label(t *testing.T) {
+ c := &amd64Compiler{}
+ c.label(wazeroir.NewLabel(wazeroir.LabelKindContinuation, 100))
+ require.Equal(t, 100, c.frameIDMax)
+ require.Equal(t, 101, len(c.labels[wazeroir.LabelKindContinuation]))
+
+ // frameIDMax is shared across all LabelKinds, so this shouldn't change frameIDMax.
+ c.label(wazeroir.NewLabel(wazeroir.LabelKindHeader, 2))
+ require.Equal(t, 100, c.frameIDMax)
+ require.Equal(t, 3, len(c.labels[wazeroir.LabelKindHeader]))
+}
+
+func TestAmd64Compiler_Init(t *testing.T) {
+ c := &amd64Compiler{
+ locationStackForEntrypoint: newRuntimeValueLocationStack(),
+ assembler: amd64.NewAssembler(),
+ }
+ const stackCap = 12345
+ c.locationStackForEntrypoint.stack = make([]runtimeValueLocation, stackCap)
+ c.locationStackForEntrypoint.sp = 5555
+
+ c.Init(&wasm.FunctionType{}, nil, false)
+
+ // locationStack is the pointer to locationStackForEntrypoint after Init.
+ require.Equal(t, c.locationStack, &c.locationStackForEntrypoint)
+ // And the underlying stack must be reused (the capacity preserved).
+ require.Equal(t, stackCap, cap(c.locationStack.stack))
+ require.Equal(t, stackCap, cap(c.locationStackForEntrypoint.stack))
+}
+
+func TestAmd64Compiler_resetLabels(t *testing.T) {
+ c := newAmd64Compiler().(*amd64Compiler)
+ nop := c.compileNOP()
+
+ const (
+ frameIDMax = 50
+ capacity = 12345
+ )
+ c.frameIDMax = frameIDMax
+ for i := range c.labels {
+ ifs := make([]amd64LabelInfo, frameIDMax*2)
+ c.labels[i] = ifs
+ for j := 0; j <= frameIDMax; j++ {
+ ifs[j].stackInitialized = true
+ ifs[j].initialInstruction = nop
+ ifs[j].initialStack = newRuntimeValueLocationStack()
+ ifs[j].initialStack.sp = 5555 // should be cleared via runtimeValueLocationStack.reset().
+ ifs[j].initialStack.stack = make([]runtimeValueLocation, 0, capacity)
+ }
+ }
+ c.resetLabels()
+ for i := range c.labels {
+ for j := 0; j < len(c.labels[i]); j++ {
+ l := &c.labels[i][j]
+ require.False(t, l.stackInitialized)
+ require.Nil(t, l.initialInstruction)
+ require.Equal(t, 0, len(l.initialStack.stack))
+ if j > frameIDMax {
+ require.Equal(t, 0, cap(l.initialStack.stack))
+ } else {
+ require.Equal(t, capacity, cap(l.initialStack.stack))
+ }
+ require.Equal(t, uint64(0), l.initialStack.sp)
+ }
+ }
+}
+
+func TestAmd64Compiler_getSavedTemporaryLocationStack(t *testing.T) {
+ t.Run("len(brTableTmp)>len(current)", func(t *testing.T) {
+ const temporarySliceSize = 100
+ st := newRuntimeValueLocationStack()
+ c := &amd64Compiler{locationStack: &st, brTableTmp: make([]runtimeValueLocation, temporarySliceSize)}
+
+ c.locationStack.sp = 3
+ c.locationStack.stack = []runtimeValueLocation{
+ {stackPointer: 150},
+ {stackPointer: 200},
+ {stackPointer: 300},
+ {},
+ {},
+ {},
+ {},
+ {stackPointer: 1231455}, // Entries here shouldn't be copied as they are above sp.
+ }
+
+ actual := c.getSavedTemporaryLocationStack()
+ require.Equal(t, uint64(3), actual.sp)
+ require.Equal(t, temporarySliceSize, len(actual.stack))
+ require.Equal(t, c.locationStack.stack[:3], actual.stack[:3])
+ for i := int(actual.sp); i < len(actual.stack); i++ {
+ // Above the stack pointer, the values must not be copied.
+ require.Zero(t, actual.stack[i].stackPointer)
+ }
+ })
+}
diff --git a/internal/engine/compiler/impl_arm64.go b/internal/engine/compiler/impl_arm64.go
index a89e6865..7e4782b4 100644
--- a/internal/engine/compiler/impl_arm64.go
+++ b/internal/engine/compiler/impl_arm64.go
@@ -21,7 +21,7 @@ type arm64Compiler struct {
ir *wazeroir.CompilationResult
// locationStack holds the state of the wazeroir virtual stack,
// where each item is either placed in a register or on the actual memory stack.
- locationStack runtimeValueLocationStack
+ locationStack *runtimeValueLocationStack
// labels maps a label (e.g. ".L1_then") to *arm64LabelInfo.
labels [wazeroir.LabelKindNum][]arm64LabelInfo
// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
@@ -31,29 +31,57 @@ type arm64Compiler struct {
withListener bool
typ *wasm.FunctionType
br *bytes.Reader
+ // locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
+ // we cache it here, then reset it and assign it to .locationStack in the Init method.
+ locationStackForEntrypoint runtimeValueLocationStack
+ // frameIDMax tracks the maximum frame id seen per function.
+ frameIDMax int
+ brTableTmp []runtimeValueLocation
}
func newArm64Compiler() compiler {
return &arm64Compiler{
- assembler: arm64.NewAssembler(arm64ReservedRegisterForTemporary),
- locationStack: newRuntimeValueLocationStack(),
- br: bytes.NewReader(nil),
+ assembler: arm64.NewAssembler(arm64ReservedRegisterForTemporary),
+ locationStackForEntrypoint: newRuntimeValueLocationStack(),
+ br: bytes.NewReader(nil),
}
}
// Init implements compiler.Init.
func (c *arm64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
- assembler, locationStack := c.assembler, c.locationStack
- assembler.Reset()
- locationStack.reset()
- for i := range c.labels {
- c.labels[i] = c.labels[i][:0]
- }
+ c.assembler.Reset()
+ c.locationStackForEntrypoint.reset()
+ c.resetLabels()
+
*c = arm64Compiler{
- assembler: assembler, locationStack: locationStack,
- ir: ir, withListener: withListener, labels: c.labels,
- typ: typ,
- br: c.br,
+ ir: ir,
+ withListener: withListener,
+ typ: typ,
+ assembler: c.assembler,
+ labels: c.labels,
+ br: c.br,
+ brTableTmp: c.brTableTmp,
+ locationStackForEntrypoint: c.locationStackForEntrypoint,
+ }
+
+ // Reuses the initial location stack for the compilation of subsequent functions.
+ c.locationStack = &c.locationStackForEntrypoint
+}
+
+// resetLabels resets the existing content in arm64Compiler.labels so that
+// we can reuse the allocated slices and stacks in subsequent compilations.
+func (c *arm64Compiler) resetLabels() {
+ for i := range c.labels {
+ for j := range c.labels[i] {
+ if j > c.frameIDMax {
+ // We only need to reset up to the maximum frame id, which makes compilation faster for large binaries.
+ break
+ }
+ l := &c.labels[i][j]
+ l.initialInstruction = nil
+ l.stackInitialized = false
+ l.initialStack.reset()
+ }
+ }
}
@@ -142,7 +170,8 @@ type arm64LabelInfo struct {
// initialInstruction is the initial instruction for this label so other blocks can branch into it.
initialInstruction asm.Node
// initialStack is the initial value location stack from which we start compiling this label.
- initialStack runtimeValueLocationStack
+ initialStack runtimeValueLocationStack
+ stackInitialized bool
}
// assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the arm64 architecture.
@@ -156,10 +185,15 @@ func (c *arm64Compiler) label(label wazeroir.Label) *arm64LabelInfo {
kind := label.Kind()
frames := c.labels[kind]
frameID := label.FrameID()
+ if c.frameIDMax < frameID {
+ c.frameIDMax = frameID
+ }
// If the frameID is not allocated yet, expand the slice
// so that we can reduce allocations in subsequent compilations.
if diff := frameID - len(frames) + 1; diff > 0 {
- frames = append(frames, make([]arm64LabelInfo, diff*2)...)
+ for i := 0; i < diff; i++ {
+ frames = append(frames, arm64LabelInfo{initialStack: newRuntimeValueLocationStack()})
+ }
c.labels[kind] = frames
}
return &frames[frameID]
@@ -167,7 +201,7 @@ func (c *arm64Compiler) label(label wazeroir.Label) *arm64LabelInfo {
// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the arm64 architecture.
func (c *arm64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
- return &c.locationStack
+ return c.locationStack
}
// pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for arm64.
@@ -455,7 +489,7 @@ func (c *arm64Compiler) compileGoDefinedHostFunction() error {
// setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
// while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
// This is called when we branch into a different block.
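// Aside: the frameIDMax bookkeeping above keeps resetLabels proportional to
// the labels actually touched in the previous function rather than to the
// capacity retained from earlier, larger functions. A reduced model of the
// high-water-mark reset (illustrative types, not the real label pool):
type pool struct {
	slots []int // assume indices passed to use() are in range
	max   int   // high-water mark of indices touched this compilation
}

func (p *pool) use(i int) {
	if i > p.max {
		p.max = i
	}
	p.slots[i]++ // stand-in for mutating label state
}

func (p *pool) reset() {
	for i := 0; i <= p.max && i < len(p.slots); i++ {
		p.slots[i] = 0 // slots beyond max were never touched and stay zero
	}
	p.max = 0
}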
-func (c *arm64Compiler) setLocationStack(newStack runtimeValueLocationStack) {
+func (c *arm64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
if c.stackPointerCeil < c.locationStack.stackPointerCeil {
c.stackPointerCeil = c.locationStack.stackPointerCeil
}
@@ -480,7 +514,7 @@ func (c *arm64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipThisLabel
labelInfo := c.label(labelKey)
// If initialStack is not set, that means this label has never been reached.
- if !labelInfo.initialStack.initialized() {
+ if !labelInfo.stackInitialized {
skipThisLabel = true
return
}
@@ -494,7 +528,7 @@ func (c *arm64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipThisLabel
}
// Set the initial stack.
- c.setLocationStack(labelInfo.initialStack)
+ c.setLocationStack(&labelInfo.initialStack)
return false
}
@@ -753,18 +787,10 @@ func (c *arm64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
}
// Emit the code for branching into the else branch.
- // We save and clone the location stack because we might end up modifying it inside of branchInto,
- // and we have to avoid affecting the code generation for Then branch afterwards.
- saved := c.locationStack
- c.setLocationStack(saved.clone())
elseTarget := wazeroir.Label(o.U2)
if err := c.compileBranchInto(elseTarget); err != nil {
return err
}
-
- // Now ready to emit the code for branching into then branch.
- // Retrieve the original value location stack so that the code below won't be affected by the Else branch ^^.
- c.setLocationStack(saved)
// We branch into here from the original conditional BR (conditionalBR).
c.assembler.SetJumpTargetOnNext(conditionalBR)
thenTarget := wazeroir.Label(o.U1)
@@ -790,8 +816,9 @@ func (c *arm64Compiler) compileBranchInto(target wazeroir.Label) error {
// with the appropriate value locations. Note we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
targetLabel := c.label(target)
- if !targetLabel.initialStack.initialized() {
- targetLabel.initialStack = c.locationStack.clone()
+ if !targetLabel.stackInitialized {
+ targetLabel.initialStack.cloneFrom(*c.locationStack)
+ targetLabel.stackInitialized = true
}
br := c.assembler.CompileJump(arm64.B)
@@ -910,38 +937,45 @@ func (c *arm64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
// [Emit the code for each target and the default branch]
labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
- saved := c.locationStack
+
+ // Since we might end up with a different stack state in each branch,
+ // we save the initial stack state here and use the same initial state
+ // for each iteration.
+ initialLocationStack := c.getSavedTemporaryLocationStack()
+
for i := range labelInitialInstructions {
// Emit the initial instruction of each target where
// we use NOP as we don't yet know the next instruction in each label.
init := c.assembler.CompileStandAlone(arm64.NOP)
labelInitialInstructions[i] = init
- var locationStack runtimeValueLocationStack
targetLabel := wazeroir.Label(o.Us[i*2])
targetToDrop := o.Us[i*2+1]
- if i < len(labelInitialInstructions)-1 {
- // Clone the location stack so the branch-specific code doesn't
- // affect others.
- locationStack = saved.clone()
- } else {
- // If this is the default branch, we use the original one
- // as this is the last code in this block.
- locationStack = saved
- }
- c.setLocationStack(locationStack)
if err = compileDropRange(c, targetToDrop); err != nil {
return err
}
if err = c.compileBranchInto(targetLabel); err != nil {
return err
}
+ // After each iteration, reset the stack's state to initialLocationStack.
+ c.locationStack.cloneFrom(initialLocationStack)
}
c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
return nil
}
+func (c *arm64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
+ initialLocationStack := *c.locationStack // Take a copy!
+ // Use c.brTableTmp as the underlying stack so that we can reduce allocations.
+ if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
+ c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
+ }
+ copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
+ initialLocationStack.stack = c.brTableTmp
+ return initialLocationStack
+}
+
// compileCall implements compiler.compileCall for the arm64 architecture.
func (c *arm64Compiler) compileCall(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
diff --git a/internal/engine/compiler/impl_arm64_test.go b/internal/engine/compiler/impl_arm64_test.go
index f385fbd6..bad4f783 100644
--- a/internal/engine/compiler/impl_arm64_test.go
+++ b/internal/engine/compiler/impl_arm64_test.go
@@ -100,12 +100,136 @@ func TestArm64Compiler_readInstructionAddress(t *testing.T) {
require.Equal(t, nativeCallStatusCodeReturned, env.compilerStatus())
}
+func TestArm64Compiler_label(t *testing.T) {
+ c := &arm64Compiler{}
+ c.label(wazeroir.NewLabel(wazeroir.LabelKindContinuation, 100))
+ require.Equal(t, 100, c.frameIDMax)
+ require.Equal(t, 101, len(c.labels[wazeroir.LabelKindContinuation]))
+
+ // frameIDMax is shared across all LabelKinds, so this shouldn't change frameIDMax.
+ c.label(wazeroir.NewLabel(wazeroir.LabelKindHeader, 2))
+ require.Equal(t, 100, c.frameIDMax)
+ require.Equal(t, 3, len(c.labels[wazeroir.LabelKindHeader]))
+}
+
+func TestArm64Compiler_Init(t *testing.T) {
+ c := &arm64Compiler{
+ locationStackForEntrypoint: newRuntimeValueLocationStack(),
+ assembler: arm64.NewAssembler(0),
+ }
+ const stackCap = 12345
+ c.locationStackForEntrypoint.stack = make([]runtimeValueLocation, stackCap)
+ c.locationStackForEntrypoint.sp = 5555
+
+ c.Init(&wasm.FunctionType{}, nil, false)
+
+ // locationStack is the pointer to locationStackForEntrypoint after Init.
+ require.Equal(t, c.locationStack, &c.locationStackForEntrypoint)
+ // And the underlying stack must be reused (the capacity preserved).
+ require.Equal(t, stackCap, cap(c.locationStack.stack))
+ require.Equal(t, stackCap, cap(c.locationStackForEntrypoint.stack))
+}
+
+func TestArm64Compiler_resetLabels(t *testing.T) {
+ c := newArm64Compiler().(*arm64Compiler)
+ nop := c.compileNOP()
+
+ const (
+ frameIDMax = 50
+ capacity = 12345
+ )
+ c.frameIDMax = frameIDMax
+ for i := range c.labels {
+ ifs := make([]arm64LabelInfo, frameIDMax*2)
+ c.labels[i] = ifs
+ for j := 0; j <= frameIDMax; j++ {
+ ifs[j].stackInitialized = true
+ ifs[j].initialInstruction = nop
+ ifs[j].initialStack = newRuntimeValueLocationStack()
+ ifs[j].initialStack.sp = 5555 // should be cleared via runtimeValueLocationStack.reset().
+ ifs[j].initialStack.stack = make([]runtimeValueLocation, 0, capacity)
+ }
+ }
+ c.resetLabels()
+ for i := range c.labels {
+ for j := 0; j < len(c.labels[i]); j++ {
+ l := &c.labels[i][j]
+ require.False(t, l.stackInitialized)
+ require.Nil(t, l.initialInstruction)
+ require.Equal(t, 0, len(l.initialStack.stack))
+ if j > frameIDMax {
+ require.Equal(t, 0, cap(l.initialStack.stack))
+ } else {
+ require.Equal(t, capacity, cap(l.initialStack.stack))
+ }
+ require.Equal(t, uint64(0), l.initialStack.sp)
+ }
+ }
+}
+
+func TestArm64Compiler_getSavedTemporaryLocationStack(t *testing.T) {
+ t.Run("len(brTableTmp)>len(current)", func(t *testing.T) {
+ const temporarySliceSize = 100
+ st := newRuntimeValueLocationStack()
+ c := &arm64Compiler{locationStack: &st, brTableTmp: make([]runtimeValueLocation, temporarySliceSize)}
+
+ c.locationStack.sp = 3
+ c.locationStack.stack = []runtimeValueLocation{
+ {stackPointer: 150},
+ {stackPointer: 200},
+ {stackPointer: 300},
+ {},
+ {},
+ {},
+ {},
+ {stackPointer: 1231455}, // Entries here shouldn't be copied as they are above sp.
+ }
+
+ actual := c.getSavedTemporaryLocationStack()
+ require.Equal(t, uint64(3), actual.sp)
+ require.Equal(t, temporarySliceSize, len(actual.stack))
+ require.Equal(t, c.locationStack.stack[:3], actual.stack[:3])
+ for i := int(actual.sp); i < len(actual.stack); i++ {
+ // Above the stack pointer, the values must not be copied.
+ require.Zero(t, actual.stack[i].stackPointer)
+ }
+ })
+}
+
// setStackPointerCeil implements compilerImpl.setStackPointerCeil for the arm64 architecture.
func (c *arm64Compiler) setStackPointerCeil(v uint64) {
c.stackPointerCeil = v
}
// setRuntimeValueLocationStack implements compilerImpl.setRuntimeValueLocationStack for the arm64 architecture.
-func (c *arm64Compiler) setRuntimeValueLocationStack(s runtimeValueLocationStack) {
+func (c *arm64Compiler) setRuntimeValueLocationStack(s *runtimeValueLocationStack) {
c.locationStack = s
}
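// Aside: taken together, the change replaces per-branch clone() allocations
// with label-owned storage: the first branch into a label copies the current
// stack into that label's runtimeValueLocationStack via cloneFrom, and
// compiling the label later adopts a pointer to that storage. A reduced model
// of the handshake (illustrative names, not the real wazero types):
type miniStack struct{ vals []int }

func (s *miniStack) cloneFrom(from miniStack) {
	s.vals = append(s.vals[:0], from.vals...) // reuse s's backing array
}

type miniLabel struct {
	initial          miniStack
	stackInitialized bool
}

// branchInto records the stack state on first use without disturbing current.
func branchInto(l *miniLabel, current *miniStack) {
	if !l.stackInitialized {
		l.initial.cloneFrom(*current)
		l.stackInitialized = true
	}
}

// compileLabel adopts the label's own stack, or reports an unreachable label.
func compileLabel(l *miniLabel) (working *miniStack, skip bool) {
	if !l.stackInitialized {
		return nil, true
	}
	return &l.initial, false
}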