package compiler
// This file implements the compiler for amd64/x86_64 target.
// Please refer to https://www.felixcloutier.com/x86/index.html
// if unfamiliar with amd64 instructions used here.
import (
"fmt"
"math"
"github.com/tetratelabs/wazero/internal/asm"
"github.com/tetratelabs/wazero/internal/asm/amd64"
"github.com/tetratelabs/wazero/internal/platform"
"github.com/tetratelabs/wazero/internal/u32"
"github.com/tetratelabs/wazero/internal/u64"
"github.com/tetratelabs/wazero/internal/wasm"
"github.com/tetratelabs/wazero/internal/wazeroir"
)
var (
_minimum32BitSignedInt int32 = math.MinInt32
_maximum32BitSignedInt int32 = math.MaxInt32
_maximum32BitUnsignedInt uint32 = math.MaxUint32
_minimum64BitSignedInt int64 = math.MinInt64
_maximum64BitSignedInt int64 = math.MaxInt64
_maximum64BitUnsignedInt uint64 = math.MaxUint64
_float32SignBitMask uint32 = 1 << 31
_float32RestBitMask = ^_float32SignBitMask
_float64SignBitMask uint64 = 1 << 63
_float64RestBitMask = ^_float64SignBitMask
_float32ForMinimumSigned32bitInteger = uint32(0xCF00_0000)
_float64ForMinimumSigned32bitInteger = uint64(0xC1E0_0000_0020_0000)
_float32ForMinimumSigned64bitInteger = uint32(0xDF00_0000)
_float64ForMinimumSigned64bitInteger = uint64(0xC3E0_0000_0000_0000)
_float32ForMaximumSigned32bitIntPlusOne = uint32(0x4F00_0000)
_float64ForMaximumSigned32bitIntPlusOne = uint64(0x41E0_0000_0000_0000)
_float32ForMaximumSigned64bitIntPlusOne = uint32(0x5F00_0000)
_float64ForMaximumSigned64bitIntPlusOne = uint64(0x43E0_0000_0000_0000)
)
var (
// amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr)
amd64ReservedRegisterForCallEngine = amd64.RegR13
// amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call.
amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14
// amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
amd64ReservedRegisterForMemory = amd64.RegR15
)
var (
amd64UnreservedVectorRegisters = []asm.Register{ //nolint
amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3,
amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7,
amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11,
amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15,
}
// Note that we never invoke the "call" instruction,
// so we don't need to care about the calling convention.
// TODO: Maybe it is safe to just save rbp and rsp somewhere
// in Go-allocated variables, reuse these registers
// in compiled functions, and write them back before returning.
amd64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX,
amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9,
amd64.RegR10, amd64.RegR11, amd64.RegR12,
}
)
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds *wasm.ModuleInstance of the
// next executing function instance. The value is set and used when making function calls
// or function returns in the ModuleContextInitialization. See compileModuleContextInitialization.
var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12
func (c *amd64Compiler) String() string {
return c.locationStack.String()
}
// compileNOP implements compiler.compileNOP for the amd64 architecture.
func (c *amd64Compiler) compileNOP() asm.Node {
return c.assembler.CompileStandAlone(amd64.NOP)
}
type amd64Compiler struct {
assembler amd64.Assembler
ir *wazeroir.CompilationResult
cpuFeatures platform.CpuFeatureFlags
// locationStack holds the state of the wazeroir virtual stack,
// where each item is either placed in a register or on the actual memory stack.
locationStack *runtimeValueLocationStack
// labels hold per wazeroir label specific information in this function.
labels [wazeroir.LabelKindNum][]amd64LabelInfo
// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
stackPointerCeil uint64
// assignStackPointerCeilNeeded holds an asm.Node whose AssignDestinationConstant must be called with the determined stack pointer ceiling.
assignStackPointerCeilNeeded asm.Node
compiledTrapTargets [nativeCallStatusModuleClosed]asm.Node
withListener bool
typ *wasm.FunctionType
// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
// we cache it here, and reset and set to .locationStack in the Init method.
locationStackForEntrypoint runtimeValueLocationStack
// frameIDMax tracks the maximum value of frame id per function.
frameIDMax int
brTableTmp []runtimeValueLocation
fourZeros,
eightZeros,
minimum32BitSignedInt,
maximum32BitSignedInt,
maximum32BitUnsignedInt,
minimum64BitSignedInt,
maximum64BitSignedInt,
maximum64BitUnsignedInt,
float32SignBitMask,
float32RestBitMask,
float64SignBitMask,
float64RestBitMask,
float32ForMinimumSigned32bitInteger,
float64ForMinimumSigned32bitInteger,
float32ForMinimumSigned64bitInteger,
float64ForMinimumSigned64bitInteger,
float32ForMaximumSigned32bitIntPlusOne,
float64ForMaximumSigned32bitIntPlusOne,
float32ForMaximumSigned64bitIntPlusOne,
float64ForMaximumSigned64bitIntPlusOne *asm.StaticConst
}
func newAmd64Compiler() compiler {
c := &amd64Compiler{
assembler: amd64.NewAssembler(),
locationStackForEntrypoint: newRuntimeValueLocationStack(),
cpuFeatures: platform.CpuFeatures,
}
c.fourZeros = asm.NewStaticConst([]byte{0, 0, 0, 0})
c.eightZeros = asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0})
c.minimum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_minimum32BitSignedInt)))
c.maximum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_maximum32BitSignedInt)))
c.maximum32BitUnsignedInt = asm.NewStaticConst(u32.LeBytes(_maximum32BitUnsignedInt))
c.minimum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_minimum64BitSignedInt)))
c.maximum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_maximum64BitSignedInt)))
c.maximum64BitUnsignedInt = asm.NewStaticConst(u64.LeBytes(_maximum64BitUnsignedInt))
c.float32SignBitMask = asm.NewStaticConst(u32.LeBytes(_float32SignBitMask))
c.float32RestBitMask = asm.NewStaticConst(u32.LeBytes(_float32RestBitMask))
c.float64SignBitMask = asm.NewStaticConst(u64.LeBytes(_float64SignBitMask))
c.float64RestBitMask = asm.NewStaticConst(u64.LeBytes(_float64RestBitMask))
c.float32ForMinimumSigned32bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned32bitInteger))
c.float64ForMinimumSigned32bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned32bitInteger))
c.float32ForMinimumSigned64bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned64bitInteger))
c.float64ForMinimumSigned64bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned64bitInteger))
c.float32ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned32bitIntPlusOne))
c.float64ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned32bitIntPlusOne))
c.float32ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned64bitIntPlusOne))
c.float64ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned64bitIntPlusOne))
return c
}
// Init implements compiler.Init.
func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
c.assembler.Reset()
c.locationStackForEntrypoint.reset()
c.resetLabels()
*c = amd64Compiler{
ir: ir,
withListener: withListener,
typ: typ,
assembler: c.assembler,
cpuFeatures: c.cpuFeatures,
labels: c.labels,
locationStackForEntrypoint: c.locationStackForEntrypoint,
brTableTmp: c.brTableTmp,
fourZeros: c.fourZeros,
eightZeros: c.eightZeros,
minimum32BitSignedInt: c.minimum32BitSignedInt,
maximum32BitSignedInt: c.maximum32BitSignedInt,
maximum32BitUnsignedInt: c.maximum32BitUnsignedInt,
minimum64BitSignedInt: c.minimum64BitSignedInt,
maximum64BitSignedInt: c.maximum64BitSignedInt,
maximum64BitUnsignedInt: c.maximum64BitUnsignedInt,
float32SignBitMask: c.float32SignBitMask,
float32RestBitMask: c.float32RestBitMask,
float64SignBitMask: c.float64SignBitMask,
float64RestBitMask: c.float64RestBitMask,
float32ForMinimumSigned32bitInteger: c.float32ForMinimumSigned32bitInteger,
float64ForMinimumSigned32bitInteger: c.float64ForMinimumSigned32bitInteger,
float32ForMinimumSigned64bitInteger: c.float32ForMinimumSigned64bitInteger,
float64ForMinimumSigned64bitInteger: c.float64ForMinimumSigned64bitInteger,
float32ForMaximumSigned32bitIntPlusOne: c.float32ForMaximumSigned32bitIntPlusOne,
float64ForMaximumSigned32bitIntPlusOne: c.float64ForMaximumSigned32bitIntPlusOne,
float32ForMaximumSigned64bitIntPlusOne: c.float32ForMaximumSigned64bitIntPlusOne,
float64ForMaximumSigned64bitIntPlusOne: c.float64ForMaximumSigned64bitIntPlusOne,
}
// Reuses the initial location stack for the compilation of subsequent functions.
c.locationStack = &c.locationStackForEntrypoint
}
// resetLabels resets the existing content in amd64Compiler.labels so that
// we can reuse the allocated slices and stacks in subsequent compilations.
func (c *amd64Compiler) resetLabels() {
for i := range c.labels {
for j := range c.labels[i] {
if j > c.frameIDMax {
// Only need to reset up to the maximum frame id. This makes compilation faster for large binaries.
break
}
l := &c.labels[i][j]
l.initialInstruction = nil
l.stackInitialized = false
l.initialStack.reset()
}
}
}
// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
return c.locationStack
}
// setLocationStack sets the given runtimeValueLocationStack to .locationStack field,
// while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
// This is called when we branch into a different block.
func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
if c.stackPointerCeil < c.locationStack.stackPointerCeil {
c.stackPointerCeil = c.locationStack.stackPointerCeil
}
c.locationStack = newStack
}
// pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
c.locationStack.markRegisterUsed(reg)
return
}
// pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
c.locationStack.markRegisterUsed(reg)
return
}
type amd64LabelInfo struct {
// initialInstruction is the initial instruction for this label so other blocks can jump into it.
initialInstruction asm.Node
// initialStack is the initial value location stack from which we start compiling this label.
initialStack runtimeValueLocationStack
stackInitialized bool
}
func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
kind := label.Kind()
frames := c.labels[kind]
frameID := label.FrameID()
if c.frameIDMax < frameID {
c.frameIDMax = frameID
}
// If the frameID is not allocated yet, expand the slice to cover it,
// so that the allocated entries can be reused in subsequent compilations.
if diff := frameID - len(frames) + 1; diff > 0 {
for i := 0; i < diff; i++ {
frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
}
c.labels[kind] = frames
}
return &frames[frameID]
}
// compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the amd64 architecture.
func (c *amd64Compiler) compileBuiltinFunctionCheckExitCode() error {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexCheckExitCode); err != nil {
return err
}
// After the function call, we have to initialize the stack base pointer and memory reserved registers.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
// and return to the caller.
func (c *amd64Compiler) compileGoDefinedHostFunction() error {
// First we must update the location stack to reflect the number of host function inputs.
c.locationStack.init(c.typ)
if c.withListener {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
return err
}
}
// A host function needs access to the caller's module instance, and the caller's information is stored on the stack
// (as described in the doc of callEngine.stack). Here, we get the caller's *function from the stack, load its
// *wasm.ModuleInstance, and save it in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
// without sacrificing performance.
c.compileReservedStackBasePointerInitialization()
// Alias for readability.
tmp := amd64.RegAX
// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
// Load the value into the tmp register: tmp = &function{..}
callerFunction.setRegister(tmp)
c.compileLoadValueOnStackToRegister(callerFunction)
// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, functionModuleInstanceOffset, tmp)
// Store it into callEngine.exitContext.callerModuleInstance.
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmp,
amd64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
c.locationStack.releaseRegister(callerFunction)
if err := c.compileCallGoHostFunction(); err != nil {
return err
}
// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
c.compileReservedStackBasePointerInitialization()
// A Go function can change the module state in arbitrary ways, so we have to force
// the callEngine.moduleContext initialization on the function return. To do so,
// we zero out callEngine.moduleContext.moduleInstance.
c.assembler.CompileConstToMemory(amd64.MOVQ,
0, amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
return c.compileReturnFunction()
}
// compile implements compiler.compile for the amd64 architecture.
func (c *amd64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
// used for all labels (via setLocationStack), excluding the current one.
// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
stackPointerCeil = c.stackPointerCeil
if stackPointerCeil < c.locationStack.stackPointerCeil {
stackPointerCeil = c.locationStack.stackPointerCeil
}
// Now that the max stack pointer is determined, we are invoking the callback.
// Note this MUST be called before Assemble() below.
c.assignStackPointerCeil(stackPointerCeil)
err = c.assembler.Assemble(buf)
return
}
// compileUnreachable implements compiler.compileUnreachable for the amd64 architecture.
func (c *amd64Compiler) compileUnreachable() error {
c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
return nil
}
// assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the amd64 architecture.
func (c *amd64Compiler) assignStackPointerCeil(ceil uint64) {
if c.assignStackPointerCeilNeeded != nil {
c.assignStackPointerCeilNeeded.AssignDestinationConstant(int64(ceil) << 3)
}
}
// compileSet implements compiler.compileSet for the amd64 architecture.
func (c *amd64Compiler) compileSet(o *wazeroir.UnionOperation) error {
depth := int(o.U1)
isTargetVector := o.B3
setTargetIndex := int(c.locationStack.sp) - 1 - depth
if isTargetVector {
_ = c.locationStack.pop() // ignore the higher 64-bits.
}
v := c.locationStack.pop()
if err := c.compileEnsureOnRegister(v); err != nil {
return err
}
targetLocation := &c.locationStack.stack[setTargetIndex]
if targetLocation.onRegister() {
// We no longer need the register previously used by the target location.
c.locationStack.markRegisterUnused(targetLocation.register)
}
reg := v.register
targetLocation.setRegister(reg)
targetLocation.valueType = v.valueType
if isTargetVector {
hi := &c.locationStack.stack[setTargetIndex+1]
hi.setRegister(reg)
}
return nil
}
// compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture.
func (c *amd64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the pointer to the global slice into the allocated register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
index := o.U1
// Now, move the location of the global instance into the register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
// When an integer, reuse the pointer register for the value. Otherwise, allocate a float register for it.
valueReg := intReg
var vt runtimeValueType
var inst asm.Instruction
switch c.ir.Globals[index].ValType {
case wasm.ValueTypeI32:
inst = amd64.MOVL
vt = runtimeValueTypeI32
case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
inst = amd64.MOVQ
vt = runtimeValueTypeI64
case wasm.ValueTypeF32:
inst = amd64.MOVL
vt = runtimeValueTypeF32
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
case wasm.ValueTypeF64:
inst = amd64.MOVQ
vt = runtimeValueTypeF64
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
case wasm.ValueTypeV128:
inst = amd64.MOVDQU
vt = runtimeValueTypeV128Lo
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
default:
panic("BUG: unknown runtime value type")
}
// Using the register holding the pointer to the target instance, move its value into a register.
c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg)
// Record that the retrieved global value on the top of the stack is now in a register.
if vt == runtimeValueTypeV128Lo {
c.pushVectorRuntimeValueLocationOnRegister(valueReg)
} else {
c.pushRuntimeValueLocationOnRegister(valueReg, vt)
}
return nil
}
// compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture.
func (c *amd64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
index := o.U1
wasmValueType := c.ir.Globals[index].ValType
isV128 := wasmValueType == wasm.ValueTypeV128
// First, move the value to set into a temporary register.
val := c.locationStack.pop()
if isV128 {
// The previously popped val is the higher 64 bits; we have to use the lower 64 bits' runtimeValueLocation for allocation, etc.
val = c.locationStack.pop()
}
if err := c.compileEnsureOnRegister(val); err != nil {
return err
}
// Allocate a register to hold the memory location of the target global instance.
intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the pointer to the global slice into the allocated register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
// Now, move the location of the global instance into the register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
// Now ready to write the value to the global instance location.
var inst asm.Instruction
if isV128 {
inst = amd64.MOVDQU
} else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset)
// Since the value is now written to memory, release the value register.
c.locationStack.releaseRegister(val)
return nil
}
// compileBr implements compiler.compileBr for the amd64 architecture.
func (c *amd64Compiler) compileBr(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
return c.branchInto(wazeroir.Label(o.U1))
}
// branchInto adds instruction necessary to jump into the given branch target.
func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
if target.IsReturnTarget() {
return c.compileReturnFunction()
} else {
if c.ir.LabelCallers[target] > 1 {
// We can only re-use the register state when there's a single call-site.
// If there are multiple call-sites, release the existing values on registers to the stack
// so that the value location state is consistent at the beginning of the label.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
}
// Set the initial stack of the target label, so we can start compiling the label
// with the appropriate value locations. Note that we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
targetLabel := c.label(target)
if !targetLabel.stackInitialized {
targetLabel.initialStack.cloneFrom(*c.locationStack)
targetLabel.stackInitialized = true
}
jmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(target, jmp)
}
return nil
}
// compileBrIf implements compiler.compileBrIf for the amd64 architecture.
func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
cond := c.locationStack.pop()
var jmpWithCond asm.Node
if cond.onConditionalRegister() {
var inst asm.Instruction
switch cond.conditionalRegister {
case amd64.ConditionalRegisterStateE:
inst = amd64.JEQ
case amd64.ConditionalRegisterStateNE:
inst = amd64.JNE
case amd64.ConditionalRegisterStateS:
inst = amd64.JMI
case amd64.ConditionalRegisterStateNS:
inst = amd64.JPL
case amd64.ConditionalRegisterStateG:
inst = amd64.JGT
case amd64.ConditionalRegisterStateGE:
inst = amd64.JGE
case amd64.ConditionalRegisterStateL:
inst = amd64.JLT
case amd64.ConditionalRegisterStateLE:
inst = amd64.JLE
case amd64.ConditionalRegisterStateA:
inst = amd64.JHI
case amd64.ConditionalRegisterStateAE:
inst = amd64.JCC
case amd64.ConditionalRegisterStateB:
inst = amd64.JCS
case amd64.ConditionalRegisterStateBE:
inst = amd64.JLS
}
jmpWithCond = c.assembler.CompileJump(inst)
} else {
// Usually the comparison operand for br_if is on the conditional register,
// but in some cases, it is on the stack or in a general-purpose register.
// For example, the following code
// i64.const 1
// local.get 1
// i64.add
// br_if ....
// will try to use the result of i64.add, which resides on the (virtual) stack,
// as the operand for br_if instruction.
if err := c.compileEnsureOnRegister(cond); err != nil {
return err
}
// Check if the value does not equal zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, cond.register, 0)
// Emit the jump instruction which jumps when the value does not equal zero.
jmpWithCond = c.assembler.CompileJump(amd64.JNE)
c.locationStack.markRegisterUnused(cond.register)
}
// Make sure that the next coming label is the else jump target.
thenTarget := wazeroir.Label(o.U1)
elseTarget := wazeroir.Label(o.U2)
thenToDrop := o.U3
// Here's the diagram of how we organize the instructions necessary for the br_if operation.
//
// jmp_with_cond -> jmp (.Else) -> Then operations...
// |---------(satisfied)------------^^^
//
// Note that the .Else branch doesn't have ToDrop as .Else in reality
// corresponds to either If's Else block or Br_if's else block in Wasm.
// Emit the else branch.
if elseTarget.IsReturnTarget() {
if err := c.compileReturnFunction(); err != nil {
return err
}
} else {
labelInfo := c.label(elseTarget)
if !labelInfo.stackInitialized {
labelInfo.initialStack.cloneFrom(*c.locationStack)
labelInfo.stackInitialized = true
}
elseJmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(elseTarget, elseJmp)
}
// Handle then branch.
c.assembler.SetJumpTargetOnNext(jmpWithCond)
if err := compileDropRange(c, thenToDrop); err != nil {
return err
}
if thenTarget.IsReturnTarget() {
return c.compileReturnFunction()
} else {
thenLabel := thenTarget
if c.ir.LabelCallers[thenLabel] > 1 {
// We can only re-use the register state when there's a single call-site.
// If there are multiple call-sites, release the existing values on registers to the stack
// so that the value location state is consistent at the beginning of the label.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
}
// Set the initial stack of the target label, so we can start compiling the label
// with the appropriate value locations. Note that we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
labelInfo := c.label(thenLabel)
if !labelInfo.stackInitialized {
labelInfo.initialStack.cloneFrom(*c.locationStack)
labelInfo.stackInitialized = true
}
thenJmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(thenLabel, thenJmp)
return nil
}
}
// compileBrTable implements compiler.compileBrTable for the amd64 architecture.
func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
index := c.locationStack.pop()
// If the operation only consists of the default target, we branch into it and return early.
if len(o.Us) == 2 {
c.locationStack.releaseRegister(index)
if err := compileDropRange(c, o.Us[1]); err != nil {
return err
}
return c.branchInto(wazeroir.Label(o.Us[0]))
}
// Otherwise, we jump into the selected branch.
if err := c.compileEnsureOnRegister(index); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we move the length of target list into the tmp register.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Us)/2-1), tmp)
// Then, we compare the value with the length of targets.
c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)
// If the value is larger than the length,
// we round the index to the length as the spec states that
// if the index is larger than or equal to the length of the list,
// branch into the default branch.
c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)
// We prepare the static data which holds the offset of
// each target's first instruction (incl. default)
// relative to the beginning of label tables.
//
// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
// we emit the code like this at [Emit the code for each targets and default branch] below.
//
// L0:
// 0x123001: XXXX, ...
// .....
// L1:
// 0x123005: YYY, ...
// .....
// L_DEFAULT:
// 0x123009: ZZZ, ...
//
// then offsetData becomes like [0x0, 0x5, 0x8].
// By using this offset list, we could jump into the label for the index by
// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
// instruction.
//
// Note: We store each offset as a 32-bit unsigned integer, i.e. 4 consecutive bytes. So more precisely,
// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
//
// Note: this is similar to how GCC implements Switch statements in C.
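//
// For illustration, the dispatch emitted below behaves roughly like the following Go sketch,
// where offsetData, base and index are stand-ins for the generated artifacts rather than real identifiers:
//
//	off := offsetData[index] // 32-bit offset (in bytes) of the target label, relative to base.
//	// jump to base + uintptr(off), where base is the address of the first label (0x123001 above).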
offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
// Load the offsetData's address into tmp.
if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
return err
}
// Now we have the address of first byte of offsetData in tmp register.
// So the target offset's first byte is at tmp+index*4 as we store
// each offset as 4 bytes for a 32-bit integer.
// Here, we store the offset into the index.register.
c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)
// Now we read the address of the beginning of the jump table.
// In the above example, this corresponds to reading the address of 0x123001.
c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)
// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
// So we could achieve the br_table jump by adding them and jump into the resulting address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)
c.assembler.CompileJumpToRegister(amd64.JMP, tmp)
// We no longer need the index's register, so mark it unused.
c.locationStack.markRegisterUnused(index.register)
// [Emit the code for each targets and default branch]
labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
// Since we might end up having the different stack state in each branch,
// we need to save the initial stack state here, and use the same initial state
// for each iteration.
initialLocationStack := c.getSavedTemporaryLocationStack()
for i := range labelInitialInstructions {
// Emit the initial instruction of each target.
// We use NOP as we don't yet know the next instruction in each label.
// Assembler would optimize out this NOP during code generation, so this is harmless.
labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)
targetLabel := wazeroir.Label(o.Us[i*2])
targetToDrop := o.Us[i*2+1]
if err = compileDropRange(c, targetToDrop); err != nil {
return err
}
if err = c.branchInto(targetLabel); err != nil {
return err
}
// After the iteration, reset the stack's state with initialLocationStack.
c.locationStack.cloneFrom(initialLocationStack)
}
c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
return nil
}
func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
initialLocationStack := *c.locationStack // Take copy!
// Use c.brTableTmp for the underlying stack so that we could reduce the allocations.
if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
}
copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
initialLocationStack.stack = c.brTableTmp
return initialLocationStack
}
func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
jmpTargetLabel := c.label(label)
targetInst := jmpTargetLabel.initialInstruction
if targetInst == nil {
// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
targetInst = c.assembler.AllocateNOP()
jmpTargetLabel.initialInstruction = targetInst
}
jmpInstruction.AssignJumpTarget(targetInst)
}
// compileLabel implements compiler.compileLabel for the amd64 architecture.
func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool) {
label := wazeroir.Label(o.U1)
labelInfo := c.label(label)
// If initialStack is not set, that means this label has never been reached.
if !labelInfo.stackInitialized {
skipLabel = true
return
}
// We use NOP as the beginning of instructions in a label.
// This should eventually be optimized out by the assembler.
if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
labelInfo.initialInstruction = c.assembler.CompileStandAlone(amd64.NOP)
} else {
c.assembler.Add(labelBegin)
}
// Set the initial stack.
c.setLocationStack(&labelInfo.initialStack)
return
}
// compileCall implements compiler.compileCall for the amd64 architecture.
func (c *amd64Compiler) compileCall(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
functionIndex := o.U1
target := c.ir.Functions[functionIndex]
targetType := &c.ir.Types[target]
targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the byte offset of the target function within callEngine.functions (functionIndex * functionSize) into the target register.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(functionIndex)*functionSize, targetAddressRegister)
// Next, we add the address of the first item of callEngine.functions slice (= &callEngine.functions[0])
// to the target register.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine,
callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)
if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
return err
}
return nil
}
// compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
func (c *amd64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) error {
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
typeIndex := o.U1
tableIndex := o.U2
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp2)
// Load the address of the target table: tmp = &module.Tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
// tmp = &module.Tables[0] + tableIndex*8 = &module.Tables[0] + sizeOf(*TableInstance)*tableIndex = module.Tables[tableIndex].
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
// Then, we need to trap if the offset exceeds the length of table.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// Next, we check if the target's type matches the operation's type.
// In order to get the target's address within the table, we have to multiply the offset
// by 8, as the table is a Go "[]uintptr" and the size of uintptr equals 8 bytes (== 2^3).
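//
// In Go terms, the next few instructions behave roughly like this sketch
// (table is a stand-in for the []uintptr backing the TableInstance):
//
//	functionPtr := table[offset] // i.e. dereference &table[0] + offset*8.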
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)
// Adds the address of wasm.Table[0] stored as callEngine.tableElement0Address to the offset.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
tmp, tableInstanceTableOffset, offset.register)
// "offset = (*offset) (== table[offset] == *code type)"
c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)
// At this point offset.register holds the address of *code (as uintptr) at wasm.Table[offset].
//
// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
c.assembler.CompileRegisterToConst(amd64.CMPQ, offset.register, 0)
// Skipped if the target is initialized.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusCodeInvalidTableAccess)
// Next, we need to check the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
//
// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
tmp2)
c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(typeIndex)*4, tmp2)
c.assembler.CompileMemoryToRegister(amd64.CMPL, offset.register, functionTypeIDOffset, tmp2)
// Skipped if the type matches.
c.compileTrapFromNativeCode(amd64.JEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
targetFunctionType := &c.ir.Types[typeIndex]
if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
return err
}
// The offset register should be marked as unused as we consumed it in the function call.
c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
return nil
}
// compileDrop implements compiler.compileDrop for the amd64 architecture.
func (c *amd64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
return compileDropRange(c, o.U1)
}
// compileSelectV128Impl implements compileSelect for vector values.
func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
x2 := c.locationStack.popV128()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.popV128()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Compare the conditional value with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, selectorReg, 0)
// Set the jump if the top value is not zero.
jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
// In this branch, we select the value of x2, so we move the value into x1.register so that
// we can have the result in x1.register regardless of the selection.
c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)
// Else, we don't need to adjust value, just need to jump to the next instruction.
c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
// As noted, the result exists in x1.register regardless of the selector.
c.pushVectorRuntimeValueLocationOnRegister(x1.register)
// Plus, x2.register is no longer used.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(selectorReg)
return nil
}
// compileSelect implements compiler.compileSelect for the amd64 architecture.
//
// The emitted native code depends on whether the values are on
// physical registers, the memory stack, or a conditional register.
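//
// In Go terms, the emitted code behaves roughly like this sketch, regardless of where
// cv, x1 and x2 currently live:
//
//	if cv == 0 {
//		x1 = x2 // selector is zero: take x2.
//	}
//	// x1 is left as the result; x2 and cv are discarded.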
func (c *amd64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
cv := c.locationStack.pop()
if err := c.compileEnsureOnRegister(cv); err != nil {
return err
}
isTargetVector := o.B3
if isTargetVector {
return c.compileSelectV128Impl(cv.register)
}
x2 := c.locationStack.pop()
// We do not consume x1 here, but modify the value according to
// the conditional value "cv" above.
peekedX1 := c.locationStack.peek()
// Compare the conditional value with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, cv.register, 0)
// Now we can use cv.register as a temporary location.
// We alias it here for readability.
tmpRegister := cv.register
// Set the jump if the top value is not zero.
jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we must place the value of x2 onto the stack position of x1.
// First we copy the value of x2 to the temporary register if x2 is not currently on a register.
if x2.onStack() {
x2.register = tmpRegister
c.compileLoadValueOnStackToRegister(x2)
}
//
// At this point x2's value is always on a register.
//
// Then release the value in the x2's register to the x1's stack position.
if peekedX1.onRegister() {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register)
} else {
peekedX1.register = x2.register
c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused!
}
// Else, we don't need to adjust value, just need to jump to the next instruction.
c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
// In any case, we don't need x2 and c anymore!
c.locationStack.releaseRegister(x2)
c.locationStack.releaseRegister(cv)
return nil
}
// compilePick implements compiler.compilePick for the amd64 architecture.
func (c *amd64Compiler) compilePick(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
depth := o.U1
isTargetVector := o.B3
// TODO: if we track the type of values on the stack,
// we could optimize the instruction according to the bit size of the value.
// For now, we just move the entire register i.e. as a quad word (8 bytes).
pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
reg, err := c.allocateRegister(pickTarget.getRegisterType())
if err != nil {
return err
}
if pickTarget.onRegister() {
var inst asm.Instruction
if isTargetVector {
inst = amd64.MOVDQU
} else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers.
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg)
} else if pickTarget.onStack() {
// Copy the value from the stack.
var inst asm.Instruction
if isTargetVector {
inst = amd64.MOVDQU
} else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress,
int64(pickTarget.stackPointer)*8, reg)
}
// Now we already placed the picked value on the register,
// so push the location onto the stack.
if isTargetVector {
c.pushVectorRuntimeValueLocationOnRegister(reg)
} else {
c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType)
}
return nil
}
// compileAdd implements compiler.compileAdd for the amd64 architecture.
func (c *amd64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
// TODO: if the previous instruction is const, then
// this can be optimized. Same goes for other arithmetic instructions.
var instruction asm.Instruction
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
instruction = amd64.ADDL
case wazeroir.UnsignedTypeI64:
instruction = amd64.ADDQ
case wazeroir.UnsignedTypeF32:
instruction = amd64.ADDSS
case wazeroir.UnsignedTypeF64:
instruction = amd64.ADDSD
}
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek, not pop!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 += x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need x2 register after ADD operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileSub implements compiler.compileSub for the amd64 architecture.
func (c *amd64Compiler) compileSub(o *wazeroir.UnionOperation) error {
// TODO: if the previous instruction is const, then
// this can be optimized. Same goes for other arithmetic instructions.
var instruction asm.Instruction
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
instruction = amd64.SUBL
case wazeroir.UnsignedTypeI64:
instruction = amd64.SUBQ
case wazeroir.UnsignedTypeF32:
instruction = amd64.SUBSS
case wazeroir.UnsignedTypeF64:
instruction = amd64.SUBSD
}
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek, not pop!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 -= x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need the x2 register after the SUB operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileMul implements compiler.compileMul for the amd64 architecture.
func (c *amd64Compiler) compileMul(o *wazeroir.UnionOperation) (err error) {
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
err = c.compileMulForInts(true, amd64.MULL)
case wazeroir.UnsignedTypeI64:
err = c.compileMulForInts(false, amd64.MULQ)
case wazeroir.UnsignedTypeF32:
err = c.compileMulForFloats(amd64.MULSS)
case wazeroir.UnsignedTypeF64:
err = c.compileMulForFloats(amd64.MULSD)
}
return
}
// compileMulForInts emits instructions to perform integer multiplication for
// top two values on the stack. If unfamiliar with the convention for integer
// multiplication on x86, see https://www.felixcloutier.com/x86/mul.
//
// In summary, one of the values must be on the AX register,
// and the mul instruction stores the overflow info in the DX register, which we don't use.
// By "the overflow info", we mean the upper half of the result (bits 65 and above in the 64-bit case).
//
// So, we have to ensure that
// 1. The value previously located on DX must be saved to the memory stack, because
// the existing value will be overwritten by the mul execution.
// 2. One of the operands (x1 or x2) must be on AX register.
//
// See https://www.felixcloutier.com/x86/mul#description for detail semantics.
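//
// In Go terms, the emitted mul behaves roughly like this sketch for the 64-bit case
// (bits.Mul64 is from math/bits):
//
//	hi, lo := bits.Mul64(uint64(x1), uint64(x2)) // DX = hi (discarded), AX = lo
//	result = lo // the Wasm i64.mul result is the low half of the product.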
func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
const (
resultRegister = amd64.RegAX
reservedRegister = amd64.RegDX
)
x2 := c.locationStack.pop()
x1 := c.locationStack.pop()
var valueOnAX *runtimeValueLocation
if x1.register == resultRegister {
valueOnAX = x1
} else if x2.register == resultRegister {
valueOnAX = x2
} else {
valueOnAX = x2
// In this case, we move x2 to the AX register.
c.onValueReleaseRegisterToStack(resultRegister)
if x2.onConditionalRegister() {
c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister)
} else if x2.onStack() {
x2.setRegister(resultRegister)
c.compileLoadValueOnStackToRegister(x2)
c.locationStack.markRegisterUsed(resultRegister)
} else {
var inst asm.Instruction
if is32Bit {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister)
// We no longer use the previous register of x2.
c.locationStack.releaseRegister(x2)
x2.setRegister(resultRegister)
c.locationStack.markRegisterUsed(resultRegister)
}
}
// We have to make sure that at this point both operands are on registers.
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// We have to save the existing value on DX.
// If the DX register is used by either x1 or x2, we don't need to
// save the value because it is consumed by mul anyway.
if x1.register != reservedRegister && x2.register != reservedRegister {
c.onValueReleaseRegisterToStack(reservedRegister)
}
// Now ready to emit the mul instruction.
if x1 == valueOnAX {
c.assembler.CompileRegisterToNone(mulInstruction, x2.register)
} else {
c.assembler.CompileRegisterToNone(mulInstruction, x1.register)
}
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
// Now we have the result in the AX register,
// so we record it.
c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType)
return nil
}
func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 *= x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need x2 register after MUL operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileClz implements compiler.compileClz for the amd64 architecture.
func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register)
}
} else {
// On processors that do not support LZCNT, we combine BSR (calculating
// most significant set bit) with XOR. This logic is described in
// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
// First, we have to check if the target is non-zero as BSR is undefined
// on zero. See https://www.felixcloutier.com/x86/bsr.
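//
// In Go terms, the fallback below behaves roughly like this sketch for the 32-bit case
// (bits.Len32 is from math/bits):
//
//	if x == 0 {
//		result = 32
//	} else {
//		result = 31 ^ (bits.Len32(x) - 1) // bits.Len32(x)-1 is what BSR computes: the MSB index.
//	}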
c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we just push the const value.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
} else {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
}
// Emit the jmp instruction to jump to the position right after
// the non-zero case.
jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
// Start emitting non-zero case.
c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
// First, we calculate the most significant set bit.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register)
}
// Now we XOR the value with the bit length minus one.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register)
} else {
c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register)
}
// Finally, the end jump instruction of the zero case must target
// the next instruction.
c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compileCtz implements compiler.compileCtz for the amd64 architecture.
func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
}
} else {
// On processors that do not support TZCNT, the BSF instruction is
// executed instead. The key difference between TZCNT and BSF
// instruction is that if source operand is zero, the content of
// destination operand is undefined.
// https://www.felixcloutier.com/x86/tzcnt.html
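//
// In Go terms, the fallback below behaves roughly like this sketch for the 32-bit case
// (bits.TrailingZeros32 is from math/bits):
//
//	if x == 0 {
//		result = 32
//	} else {
//		result = bits.TrailingZeros32(x) // what TZCNT/BSF computes for a non-zero input.
//	}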
// First we compare the target with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we just push the const value.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
} else {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
}
// Emit the jmp instruction to jump to the position right after
// the non-zero case.
jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
// Otherwise, emit the TZCNT.
c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
}
// Finally, the end jump instruction of the zero case must target
// the next instruction.
c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compilePopcnt implements compiler.compilePopcnt for the amd64 architecture.
func (c *amd64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compileDiv implements compiler.compileDiv for the amd64 architecture.
func (c *amd64Compiler) compileDiv(o *wazeroir.UnionOperation) (err error) {
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeUint32:
err = c.compileDivForInts(true, false)
case wazeroir.SignedTypeUint64:
err = c.compileDivForInts(false, false)
case wazeroir.SignedTypeInt32:
err = c.compileDivForInts(true, true)
case wazeroir.SignedTypeInt64:
err = c.compileDivForInts(false, true)
case wazeroir.SignedTypeFloat32:
err = c.compileDivForFloats(true)
case wazeroir.SignedTypeFloat64:
err = c.compileDivForFloats(false)
}
return
}
// compileDivForInts emits the instructions to perform division on the top
// two values of integer type on the stack and puts the quotient of the result
// onto the stack. For example, stack [..., 10, 3] results in [..., 3] where
// the remainder is discarded.
func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error {
if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil {
return err
}
// Now we have the quotient of the division result in the AX register,
// so we record it.
if is32Bit {
c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32)
} else {
c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64)
}
return nil
}
// compileRem implements compiler.compileRem for the amd64 architecture.
func (c *amd64Compiler) compileRem(o *wazeroir.UnionOperation) (err error) {
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
err = c.performDivisionOnInts(true, true, true)
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
err = c.performDivisionOnInts(true, false, true)
vt = runtimeValueTypeI64
case wazeroir.SignedUint32:
err = c.performDivisionOnInts(true, true, false)
vt = runtimeValueTypeI32
case wazeroir.SignedUint64:
err = c.performDivisionOnInts(true, false, false)
vt = runtimeValueTypeI64
}
if err != nil {
return err
}
// Now we have the remainder of the division result in the DX register,
// so we record it.
c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt)
return
}
// performDivisionOnInts emits the instructions to do divisions on top two integers on the stack
// via DIV (unsigned div) and IDIV (signed div) instructions.
// See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
//
// >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and
// >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of
// >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the
// >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). For
// >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of
// >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b
// >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip).
//
// tl;dr: after the instructions emitted by this function, the division result is placed in the AX and DX registers,
// where AX holds the quotient and DX the remainder.
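//
// In Go terms, the emitted sequence behaves roughly like this sketch for the signed 64-bit case:
//
//	quotient := x1 / x2  // ends up in AX
//	remainder := x1 % x2 // ends up in DX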
func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error {
const (
quotientRegister = amd64.RegAX
remainderRegister = amd64.RegDX
)
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
// Ensures that previous values on these registers are saved to memory.
c.onValueReleaseRegisterToStack(quotientRegister)
c.onValueReleaseRegisterToStack(remainderRegister)
// In order to ensure that x2 is placed on a temporary register other than AX and DX,
// we mark these registers as used here.
c.locationStack.markRegisterUsed(quotientRegister)
c.locationStack.markRegisterUsed(remainderRegister)
// Ensure that x2 is placed on a register which is not either AX or DX.
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
// Now that x2 is successfully placed on a temporary register, we no longer need
// to keep AX and DX marked as used.
c.locationStack.markRegisterUnused(quotientRegister)
c.locationStack.markRegisterUnused(remainderRegister)
// Check if the x2 equals zero.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, 0)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, 0)
}
// Skipped if the divisor is nonzero.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusIntegerDivisionByZero)
// Next, we ensure that x1 is placed on AX.
x1 := c.locationStack.pop()
if x1.onRegister() && x1.register != quotientRegister {
// Move x1 to quotientRegister.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister)
}
c.locationStack.markRegisterUnused(x1.register)
x1.setRegister(quotientRegister)
} else if x1.onStack() {
x1.setRegister(quotientRegister)
c.compileLoadValueOnStackToRegister(x1)
}
// Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX.
isSignedRem := isRem && signed
isSignedDiv := !isRem && signed
var signedRemMinusOneDivisorJmp asm.Node
if isSignedRem {
// If this is for getting the remainder of a signed division,
// we have to handle the special case where the divisor equals -1.
// For example, in the 32-bit case, (-2^31) / -1 would produce quotient=2^31 and remainder=0,
// and the quotient doesn't fit in the signed 32-bit range whose maximum is 2^31-1.
// x86 raises a floating point exception in this case, but according to the Wasm spec
// the remainder must be zero (not undefined and not a trap), unlike signed division where
// (-2^31) / -1 must trap (handled in the isSignedDiv branch below), so these special
// branches are only needed for the remainder case.
// For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception
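// For example, per the Wasm spec: (i32.rem_s (i32.const -2147483648) (i32.const -1)) must evaluate to 0,
// whereas (i32.div_s (i32.const -2147483648) (i32.const -1)) must trap with integer overflow.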
// First we compare the divisor with -1.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
}
// If it doesn't equal minus one, we jump to the normal case.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Otherwise, we store zero into the remainder result register (DX).
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister)
}
// Emit the exit jump instruction for the divisor -1 case so
// we skip the normal case.
signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JMP)
// Set the normal case's jump target.
c.assembler.SetJumpTargetOnNext(okJmp)
} else if isSignedDiv {
// For signed division, we need branches for the "math.MinInt{32,64} / -1"
// case, which raises a floating point exception (division error) because
// the resulting value exceeds the maximum of the signed integer type.
// First we compare the divisor with -1.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
}
// If it doesn't equal minus one, we jump to the normal case.
nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE)
// Next we check if the dividend (x1) is the most negative value of the signed integer type,
// i.e. whether we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
if is32Bit {
if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register, c.minimum32BitSignedInt); err != nil {
return err
}
} else {
if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register, c.minimum64BitSignedInt); err != nil {
return err
}
}
// Trap if we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1),
// as the division overflows: the result would be 2^31 (or 2^63), which is larger than
// the maximum of the signed 32-bit (or 64-bit) integer.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusIntegerOverflow)
// Set the normal case's jump target.
c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp)
}
// Now ready to emit the div instruction.
// Since the div instructions take a 2n-byte dividend placed in the DX:AX register pair...
// * signed case - we need to sign-extend the dividend into the DX register via CDQ (32-bit) or CQO (64-bit).
// * unsigned case - we need to zero the DX register via "XOR DX DX"
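// For example, for the signed 32-bit case, CDQ copies bit 31 of EAX into every bit of EDX, so
// DX:AX holds the sign-extended 64-bit dividend; CQO does the same for RAX/RDX. For the unsigned
// cases, DX simply has to be zero.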
if is32Bit && signed {
// Emit sign-extension to have 64 bit dividend over DX and AX registers.
c.assembler.CompileStandAlone(amd64.CDQ)
c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register)
} else if is32Bit && !signed {
// Zeros DX register to have 64 bit dividend over DX and AX registers.
c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register)
} else if !is32Bit && signed {
// Emits sign-extension to have 128 bit dividend over DX and AX registers.
c.assembler.CompileStandAlone(amd64.CQO)
c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register)
} else if !is32Bit && !signed {
// Zeros DX register to have 128 bit dividend over DX and AX registers.
c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register)
}
// If this is a signed rem instruction, we must point the exit jump
// of the divisor -1 case at the next instruction.
if signedRemMinusOneDivisorJmp != nil {
c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp)
}
// We mark them as unused so that we can push one of them onto the location stack at call sites.
c.locationStack.markRegisterUnused(remainderRegister)
c.locationStack.markRegisterUnused(quotientRegister)
c.locationStack.markRegisterUnused(x2.register)
return nil
}
// compileDivForFloats emits the instructions to perform division
// on the top two values of float type on the stack, placing the result back onto the stack.
// For example, stack [..., 1.0, 4.0] results in [..., 0.25].
func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error {
if is32Bit {
return c.compileSimpleBinaryOp(amd64.DIVSS)
} else {
return c.compileSimpleBinaryOp(amd64.DIVSD)
}
}
// compileAnd implements compiler.compileAnd for the amd64 architecture.
func (c *amd64Compiler) compileAnd(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.ANDL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.ANDQ)
}
return
}
// compileOr implements compiler.compileOr for the amd64 architecture.
func (c *amd64Compiler) compileOr(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.ORL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.ORQ)
}
return
}
// compileXor implements compiler.compileXor for the amd64 architecture.
func (c *amd64Compiler) compileXor(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.XORL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.XORQ)
}
return
}
// compileSimpleBinaryOp emits instructions to pop two values from the stack,
// perform the given instruction on these two values, and push the result
// onto the stack.
func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We consumed x2 register after the operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
// We already stored the result in the register used by x1
// so we record it.
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileShl implements compiler.compileShl for the amd64 architecture.
func (c *amd64Compiler) compileShl(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.SHLL, false)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.SHLQ, true)
}
return
}
// compileShr implements compiler.compileShr for the amd64 architecture.
func (c *amd64Compiler) compileShr(o *wazeroir.UnionOperation) (err error) {
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
err = c.compileShiftOp(amd64.SARL, true)
case wazeroir.SignedInt64:
err = c.compileShiftOp(amd64.SARQ, false)
case wazeroir.SignedUint32:
err = c.compileShiftOp(amd64.SHRL, true)
case wazeroir.SignedUint64:
err = c.compileShiftOp(amd64.SHRQ, false)
}
return
}
// compileRotl implements compiler.compileRotl for the amd64 architecture.
func (c *amd64Compiler) compileRotl(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.ROLL, true)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.ROLQ, false)
}
return
}
// compileRotr implements compiler.compileRotr for the amd64 architecture.
func (c *amd64Compiler) compileRotr(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.RORL, true)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.RORQ, false)
}
return
}
// compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL)
// where we have to place the second value (the shift count) in the CX register.
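//
// This mirrors the hardware constraint that variable-count shifts and rotates on x86 take their count
// only from CL, so a 32-bit shl is emitted roughly as (hedged sketch, not the literal output):
//
//	MOVL <count>, CX
//	SHLL CX, <target>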
func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
x2 := c.locationStack.pop()
// Ensures that x2 (holding shift counts) is placed on the CX register.
const shiftCountRegister = amd64.RegCX
if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() {
// If another value lives on the CX register, we release it to the stack.
c.onValueReleaseRegisterToStack(shiftCountRegister)
if x2.onRegister() {
x2r := x2.register
// If x2 lives on a register, we move the value to CX.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.MOVL, x2r, shiftCountRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2r, shiftCountRegister)
}
// We no longer place any value on the original register, so we record it.
c.locationStack.markRegisterUnused(x2r)
} else {
// If it is on stack, we just move the memory allocated value to the CX register.
x2.setRegister(shiftCountRegister)
c.compileLoadValueOnStackToRegister(x2)
}
c.locationStack.markRegisterUsed(shiftCountRegister)
}
x1 := c.locationStack.peek() // Note this is peek!
x1r := x1.register
if x1.onRegister() {
c.assembler.CompileRegisterToRegister(instruction, shiftCountRegister, x1r)
} else {
// Shift target can be placed on a memory location.
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
c.assembler.CompileRegisterToMemory(instruction, shiftCountRegister, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8)
}
// We consumed x2 register after the operation here,
// so we release it.
c.locationStack.markRegisterUnused(shiftCountRegister)
return nil
}
// compileAbs implements compiler.compileAbs for the amd64 architecture.
//
// See the following discussions for how we could take the abs of floats on x86 assembly.
// https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471
// https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation
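//
// A minimal Go sketch of the same bit trick (hypothetical helper, float32 case only): abs(x) can be computed as
// math.Float32frombits(math.Float32bits(x) &^ (1 << 31)); the PSLLD/PSRLD pair below clears the same sign bit
// using shifts instead of loading a mask constant.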
func (c *amd64Compiler) compileAbs(o *wazeroir.UnionOperation) (err error) {
target := c.locationStack.peek() // Note this is peek!
if err = c.compileEnsureOnRegister(target); err != nil {
return err
}
// First shift left by one to drop the sign bit, and then logically shift right by one so the top bit becomes zero.
if wazeroir.Float(o.B1) == wazeroir.Float32 {
c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register)
c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register)
} else {
c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register)
c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register)
}
return nil
}
// compileNeg implements compiler.compileNeg for the amd64 architecture.
func (c *amd64Compiler) compileNeg(o *wazeroir.UnionOperation) (err error) {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
tmpReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
// First we move the sign-bit mask (placed in memory) to the tmp register,
// since we cannot XOR a float register with a constant directly.
// Then we negate the value by XORing it with the sign-bit mask.
if wazeroir.Float(o.B1) == wazeroir.Float32 {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register)
}
return nil
}
// compileCeil implements compiler.compileCeil for the amd64 architecture.
func (c *amd64Compiler) compileCeil(o *wazeroir.UnionOperation) (err error) {
// Internally, ceil can be performed via ROUND instruction with 0x02 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x02)
}
// compileFloor implements compiler.compileFloor for the amd64 architecture.
func (c *amd64Compiler) compileFloor(o *wazeroir.UnionOperation) (err error) {
// Internally, floor can be performed via ROUND instruction with 0x01 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x01)
}
// compileTrunc implements compiler.compileTrunc for the amd64 architecture.
func (c *amd64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
// Internally, trunc can be performed via ROUND instruction with 0x03 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x03)
}
// compileNearest implements compiler.compileNearest for the amd64 architecture.
func (c *amd64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
// Nearest can be performed via ROUND instruction with 0x00 mode.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x00)
}
func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
if is32Bit {
c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode))
} else {
c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode))
}
return nil
}
// compileMin implements compiler.compileMin for the amd64 architecture.
func (c *amd64Compiler) compileMin(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
if is32Bit {
return c.compileMinOrMax(is32Bit, true, amd64.MINSS)
} else {
return c.compileMinOrMax(is32Bit, true, amd64.MINSD)
}
}
// compileMax implements compiler.compileMax for the amd64 architecture.
func (c *amd64Compiler) compileMax(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
if is32Bit {
return c.compileMinOrMax(is32Bit, false, amd64.MAXSS)
} else {
return c.compileMinOrMax(is32Bit, false, amd64.MAXSD)
}
}
// compileMinOrMax adds instructions to pop two values from the stack, and push back either the minimum or
// the maximum of these two values onto the stack according to the minOrMaxInstruction argument.
// minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD.
// Note: These native min/max instructions are almost compatible with min/max in the Wasm specification,
// but they differ slightly with respect to NaN handling.
// When either operand is NaN, the native min/max instructions simply return the second (source) operand,
// so the result depends on operand order, whereas WebAssembly specifies that min/max must return NaN
// whenever either operand is NaN.
// Therefore, in this function, we have to add conditional jumps to check if one of the values is NaN before
// the native min/max, which is why we cannot simply emit a native min/max instruction here.
//
// For the semantics, see wazeroir.Min and wazeroir.Max for detail.
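//
// For example, per the Wasm spec: f32.min(5.0, NaN) must be NaN, f32.min(+0.0, -0.0) must be -0.0, and
// f32.max(+0.0, -0.0) must be +0.0; the branches below exist to reproduce exactly these cases on top of
// the native instructions.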
func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Check whether either x1 or x2 is NaN, or whether x1 equals x2.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
}
// At this point, we have the three cases of conditional flags below
// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
//
// 1) The two values are NaN-free and different: all flags are cleared.
// 2) The two values are NaN-free and equal: only the ZF flag is set.
// 3) One of the two values is NaN: the ZF, PF and CF flags are set.
// Jump to handle the 1) case by checking the ZF flag,
// as ZF is only set for the 2) and 3) cases.
nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE)
// Start handling 2) and 3).
// Jump if one of two values is NaN by checking the parity flag (PF).
includeNaNJmp := c.assembler.CompileJump(amd64.JPS)
// Start handling 2).
// Before we exit this case, we have to ensure that positive zero (or negative zero for the min instruction) is
// returned when the two values are positive and negative zeros.
var inst asm.Instruction
switch {
case is32Bit && isMin:
inst = amd64.ORPS
case !is32Bit && isMin:
inst = amd64.ORPD
case is32Bit && !isMin:
inst = amd64.ANDPS
case !is32Bit && !isMin:
inst = amd64.ANDPD
}
c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
sameExitJmp := c.assembler.CompileJump(amd64.JMP)
// start handling 3).
c.assembler.SetJumpTargetOnNext(includeNaNJmp)
// We emit the ADD instruction to produce the NaN in x1.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register)
}
// Exit from the NaN case branch.
nanExitJmp := c.assembler.CompileJump(amd64.JMP)
// Start handling 1).
c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump)
// Now handle the NaN-free and different values case.
c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register)
// Set the exit jump targets of the 2) and 3) cases to the next instruction after the 1) case.
c.assembler.SetJumpTargetOnNext(nanExitJmp)
c.assembler.SetJumpTargetOnNext(sameExitJmp)
// Record that we consumed the x2 and placed the minOrMax result in the x1's register.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileCopysign implements compiler.compileCopysign for the amd64 architecture.
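//
// A hedged Go sketch of the underlying bit formula implemented below (float64 case):
//
//	sign := uint64(1) << 63
//	result := math.Float64frombits((math.Float64bits(x1) &^ sign) | (math.Float64bits(x2) & sign))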
func (c *amd64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
tmpReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
// Move the rest bit mask to the temp register.
if is32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32RestBitMask, tmpReg)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64RestBitMask, tmpReg)
}
if err != nil {
return err
}
// Clear the sign bit of x1 via AND with the mask.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register)
}
// Move the sign bit mask to the temp register.
if is32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
}
if err != nil {
return err
}
// Clear the non-sign bits of x2 via AND with the mask.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register)
}
// Finally, copy the sign bit of x2 to x1.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register)
}
// Record that we consumed the x2 and placed the copysign result in the x1's register.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileSqrt implements compiler.compileSqrt for the amd64 architecture.
func (c *amd64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
if wazeroir.Float(o.B1) == wazeroir.Float32 {
c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register)
}
return nil
}
// compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture.
func (c *amd64Compiler) compileI32WrapFromI64() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register)
target.valueType = runtimeValueTypeI32
return nil
}
// compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture.
//
// Note: in the following implementation, we use CVTTSS2SI and CVTTSD2SI (truncating conversions) to convert floats to signed integers.
// According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds the representable range
// of the target signed integer, then the instruction returns the "integer indefinite" value, whose bit pattern equals
// float32SignBitMask (or float64SignBitMask for the 64-bit case).
// [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual"
//
// https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html
//
// [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html
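//
// For example, per the Wasm spec: i32.trunc_f32_s(3.9) == 3 and i32.trunc_f32_s(NaN) traps, while the
// saturating (non-trapping) variant i32.trunc_sat_f32_s yields 0 for NaN, math.MaxInt32 for +Inf and
// math.MinInt32 for -Inf; the nonTrapping flag below selects between these two behaviors.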
func (c *amd64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) (err error) {
inputType := wazeroir.Float(o.B1)
outputType := wazeroir.SignedInt(o.B2)
nonTrapping := o.B3
if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt32 {
err = c.emitSignedI32TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt64 {
err = c.emitSignedI64TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt32 {
err = c.emitSignedI32TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt64 {
err = c.emitSignedI64TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint32 {
err = c.emitUnsignedI32TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint64 {
err = c.emitUnsignedI64TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint32 {
err = c.emitUnsignedI32TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint64 {
err = c.emitUnsignedI64TruncFromFloat(false, nonTrapping)
}
return
}
// emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer.
func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we check whether the source float value is greater than or equal to math.MaxInt32+1.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nonTrappingNaNJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Jump if the source float value is greater than or equal to math.MaxInt32+1.
jmpAboveOrEqualMaxInt32PlusOne := c.assembler.CompileJump(amd64.JCC)
// Next we convert the value to a signed integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// Then, if the result is negative, the conversion was invalid, i.e. from a negative float (incl. -Inf).
c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
var nonTrappingMinusJump asm.Node
if nonTrapping {
jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
// In the non-trapping case, the negative value is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, the value is valid.
okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
// Now, start handling the case where the original float value is above or equal math.MaxInt32+1.
//
// First, we subtract the math.MaxInt32+1 from the original value so it can fit in signed 32-bit integer.
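// Hedged arithmetic illustration (float32 source): for source = 3000000000.0, we compute
// int32(3000000000.0 - 2147483648.0) = 852516352, then the ADDL below adds 2147483648 back,
// yielding 3000000000 (0xB2D05E00) as the final unsigned 32-bit result.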
c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt32PlusOne)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Then, convert the subtracted value as a signed 32-bit integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// Next, we have to check if the value came from NaN or +Inf.
// NaN or +Inf cases result in 0x8000_0000 according to the semantics of the conversion,
// so we check whether the resulting integer value is negative or not.
c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
// If the result is minus, the conversion is invalid (from NaN or +Inf)
var nonTrappingAboveOrEqualMaxInt32PlusOne asm.Node
if nonTrapping {
jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitUnsignedInt, result)
if err != nil {
return err
}
nonTrappingAboveOrEqualMaxInt32PlusOne = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, we successfully converted (the source float minus math.MaxInt32+1) to an integer.
// So, we recover the final unsigned result by adding math.MaxInt32+1 back (whose bit pattern equals float32SignBitMask).
if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL, c.float32SignBitMask, result); err != nil {
return err
}
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne)
if nonTrapping {
c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt32PlusOne)
c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// emitUnsignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer.
func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we check whether the source float value is greater than or equal to math.MaxInt64+1.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nonTrappingNaNJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Jump if the source float value is greater than or equal to math.MaxInt64+1.
jmpAboveOrEqualMaxInt64PlusOne := c.assembler.CompileJump(amd64.JCC)
// Next we convert the value to a signed integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// Then, if the result is negative, the conversion was invalid, i.e. from a negative float (incl. -Inf).
c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
var nonTrappingMinusJump asm.Node
if nonTrapping {
jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
// In the non-trapping case, the negative value is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, the value is valid.
okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
// Now, start handling the case where the original float value is greater than or equal to math.MaxInt64+1.
//
// First, we subtract math.MaxInt64+1 from the original value so it can fit in a signed 64-bit integer.
c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt64PlusOne)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Then, convert the subtracted value as a signed 64-bit integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// Next, we have to check if the value came from NaN or +Inf.
// NaN or +Inf cases result in 0x8000_0000_0000_0000 according to the semantics of the conversion,
// so we check whether the resulting integer value is negative or not.
c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
// If the result is minus, the conversion is invalid (from NaN or +Inf)
var nonTrappingAboveOrEqualMaxInt64PlusOne asm.Node
if nonTrapping {
jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitUnsignedInt, result)
if err != nil {
return err
}
nonTrappingAboveOrEqualMaxInt64PlusOne = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, we successfully converted (the source float minus math.MaxInt64+1) to an integer.
// So, we recover the final unsigned result by adding math.MaxInt64+1 back (whose bit pattern equals float64SignBitMask).
if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ, c.float64SignBitMask, result); err != nil {
return err
}
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne)
if nonTrapping {
c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt64PlusOne)
c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
return nil
}
// emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer.
func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// We compare the conversion result with the sign bit mask to check whether either
// 1) the source float value is +-Inf or NaN, or it exceeds the representable range of 32-bit signed integers, or
// 2) the source equals the minimum signed 32-bit integer (=-2147483648.000000), whose bit pattern is held in float32ForMinimumSigned32bitInteger for 32-bit floats
// or float64ForMinimumSigned32bitInteger for 64-bit floats.
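// Note (hedged): CVTT* produces the same 0x8000_0000 "integer indefinite" bit pattern both for the
// legitimate input -2147483648.0 and for out-of-range/NaN inputs, so hitting it alone is not enough to
// decide validity; the checks below disambiguate these cases.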
err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.float32SignBitMask, result)
if err != nil {
return err
}
// Otherwise, jump to exit as the result is valid.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Start handling the case of 1) and 2).
// First, check if the value is NaN.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nontrappingNanJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
// If the value is NaN, we return the function with nativeCallStatusCodeInvalidFloatToIntConversion.
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Compare the value against the minimum 32-bit signed integer value (as a float constant)
// to check whether it falls below the lower bound of the 32-bit signed integer range.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned32bitInteger, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned32bitInteger, source.register)
}
if err != nil {
return err
}
if !nonTrapping {
// Trap if the value falls below the lower bound of the 32-bit signed integer range.
if isFloat32Bit {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
} else {
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusIntegerOverflow)
}
// At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
// So, check if the value equals the minimum signed 32-bit int.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
// Trap if the value is not negative; a negative value here can only be the minimum signed 32-bit int, which is valid.
c.compileTrapFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmp)
} else {
// Jump if the value does not fall below the lower bound.
var jmpIfNotExceedsLowerBound asm.Node
if isFloat32Bit {
jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC)
} else {
jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI)
}
// If the value exceeds the lower bound, we "saturate" it to the minimum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.minimum32BitSignedInt, result); err != nil {
return err
}
nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
// Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
// If the value exceeds signed 32-bit maximum, we saturate it to the maximum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitSignedInt, result); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(okJmp)
c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer.
func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// We compare the conversion result with the sign bit mask to check whether either
// 1) the source float value is +-Inf or NaN, or it exceeds the representable range of 64-bit signed integers, or
// 2) the source equals the minimum signed 64-bit integer (=-9223372036854775808.0), whose bit pattern is held in float32ForMinimumSigned64bitInteger for 32-bit floats
// or float64ForMinimumSigned64bitInteger for 64-bit floats.
err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.float64SignBitMask, result)
if err != nil {
return err
}
// Otherwise, we simply jump to exit as the result is valid.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Start handling the case of 1) and 2).
// First, check if the value is NaN.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nontrappingNanJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Compare the value against the minimum 64-bit signed integer value (as a float constant)
// to check whether it falls below the lower bound of the 64-bit signed integer range.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned64bitInteger, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned64bitInteger, source.register)
}
if err != nil {
return err
}
if !nonTrapping {
// Trap if the value falls below the lower bound (e.g. -Inf or any float smaller than math.MinInt64).
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
// At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
// So, check if the value equals the minimum signed 64-bit int.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
// Trap if the value is not negative; a negative value here can only be the minimum signed 64-bit int, which is valid.
c.compileTrapFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmp)
} else {
// Jump if the value does not fall below the lower bound.
jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC)
// If the value exceeds the lower bound, we "saturate" it to the minimum.
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.minimum64BitSignedInt, result)
if err != nil {
return err
}
nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
// Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
// So, check if the value equals the minimum signed 64-bit int.
c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 64-bit int).
// If the value exceeds signed 64-bit maximum, we saturate it to the maximum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitSignedInt, result); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(okJmp)
c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
return nil
}
// compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture.
func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) (err error) {
inputType := wazeroir.SignedInt(o.B1)
outputType := wazeroir.Float(o.B2)
if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
//
// Here's the summary:
// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
// >> which allows CVTSI2SS to be used after all.
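// A minimal Go sketch of the same idea (hypothetical helper): float32(int64(v)) for a uint32 v is
// equivalent to a true unsigned conversion, because widening to int64 never loses bits and keeps the
// value non-negative.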
err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int.
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
// For the same reason above, we use 64bit conversion for unsigned 32bit.
err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int.
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
err = c.emitUnsignedInt64ToFloatConversion(true)
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
err = c.emitUnsignedInt64ToFloatConversion(false)
}
return
}
// emitUnsignedInt64ToFloatConversion handles the case of unsigned 64-bit integers
// in compileFConvertFromI.
func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error {
// The logic here is exactly the same as GCC emits for the following code:
//
// float convert(int num) {
// float foo;
// uint64_t ptr1 = 100;
// foo = (float)(ptr1);
// return foo;
// }
//
// which is compiled by GCC as
//
// convert:
// push rbp
// mov rbp, rsp
// mov DWORD PTR [rbp-20], edi
// mov DWORD PTR [rbp-4], 100
// mov eax, DWORD PTR [rbp-4]
// test rax, rax
// js .handle_sign_bit_case
// cvtsi2ss xmm0, rax
// jmp .exit
// .handle_sign_bit_case:
// mov rdx, rax
// shr rdx
// and eax, 1
// or rdx, rax
// cvtsi2ss xmm0, rdx
// addsd xmm0, xmm0
// .exit: ...
//
// tl;dr is that we have a branch depending on whether or not the sign bit is set.
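// Hedged Go sketch of that sign-bit-set branch (x is the uint64 input, float64 case):
//
//	half := (x >> 1) | (x & 1) // keep the dropped low bit as a "sticky" bit so rounding stays correct
//	result := float64(int64(half)) * 2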
origin := c.locationStack.pop()
if err := c.compileEnsureOnRegister(origin); err != nil {
return err
}
dest, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(dest)
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// Check if the most significant bit (sign bit) is set.
c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register)
// Jump if the sign bit is set.
jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI)
// Otherwise, the unsigned int fits in the non-negative signed 64-bit range.
// So, we convert it to float32 (or float64) directly and emit a jump instruction to exit from this branch.
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest)
}
exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP)
// Now handling the case where sign-bit is set.
// We emit the following sequences:
// mov tmpReg, origin
// shr tmpReg, 1
// and origin, 1
// or tmpReg, origin
// cvtsi2ss xmm0, tmpReg
// addsd xmm0, xmm0
c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg)
c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg)
c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register)
c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg)
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest)
}
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest)
}
// Now we have finished the sign-bit-set branch.
// We point the exit jump of the sign-bit-unset branch
// at the next instruction.
c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet)
// We consumed the origin's register and placed the conversion result
// in the dest register.
c.locationStack.markRegisterUnused(origin.register)
if isFloat32bit {
c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32)
} else {
c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64)
}
return nil
}
// compileSimpleConversion pops a value from the stack, applies the
// given instruction to it, and pushes the result onto a register of the given type.
func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction,
destinationRegisterType registerType, destinationValueType runtimeValueType,
) error {
origin := c.locationStack.pop()
if err := c.compileEnsureOnRegister(origin); err != nil {
return err
}
dest, err := c.allocateRegister(destinationRegisterType)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest)
c.locationStack.markRegisterUnused(origin.register)
c.pushRuntimeValueLocationOnRegister(dest, destinationValueType)
return nil
}
// compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture.
func (c *amd64Compiler) compileF32DemoteFromF64() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register)
target.valueType = runtimeValueTypeF32
return nil
}
// compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture.
func (c *amd64Compiler) compileF64PromoteFromF32() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register)
target.valueType = runtimeValueTypeF64
return nil
}
// compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture.
func (c *amd64Compiler) compileI32ReinterpretFromF32() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeI32
return nil
}
return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32)
}
// compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture.
func (c *amd64Compiler) compileI64ReinterpretFromF64() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeI64
return nil
}
return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64)
}
// compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture.
func (c *amd64Compiler) compileF32ReinterpretFromI32() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeF32
return nil
}
return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32)
}
// compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture.
func (c *amd64Compiler) compileF64ReinterpretFromI64() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeF64
return nil
}
return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64)
}
// compileExtend implements compiler.compileExtend for the amd64 architecture.
func (c *amd64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
var inst asm.Instruction
signed := o.B1 != 0
if signed {
inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd
} else {
inst = amd64.MOVL
}
return c.compileExtendImpl(inst, runtimeValueTypeI64)
}
// compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend32From8() error {
return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32)
}
// compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend32From16() error {
return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32)
}
// compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From8() error {
return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64)
}
// compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From16() error {
return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64)
}
// compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From32() error {
return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64)
}
func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(inst, target.register, target.register)
target.valueType = destinationType
return nil
}
// compileEq implements compiler.compileEq for the amd64 architecture.
func (c *amd64Compiler) compileEq(o *wazeroir.UnionOperation) error {
return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), true)
}
// compileNe implements compiler.compileNe for the amd64 architecture.
func (c *amd64Compiler) compileNe(o *wazeroir.UnionOperation) error {
return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), false)
}
func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
x1r, x2r := x1.register, x2.register
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
switch t {
case wazeroir.UnsignedTypeI32:
err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPL, shouldEqual)
case wazeroir.UnsignedTypeI64:
err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPQ, shouldEqual)
case wazeroir.UnsignedTypeF32:
err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISS, shouldEqual)
case wazeroir.UnsignedTypeF64:
err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISD, shouldEqual)
}
if err != nil {
return
}
return
}
func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction,
shouldEqual bool,
) error {
c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
// Record that the result is on the conditional register.
var condReg asm.ConditionalRegisterState
if shouldEqual {
condReg = amd64.ConditionalRegisterStateE
} else {
condReg = amd64.ConditionalRegisterStateNE
}
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg)
loc.valueType = runtimeValueTypeI32
return nil
}
// For float EQ and NE, we have to take NaN values into account.
// Notably, the Wasm specification states that if either operand is NaN,
// the result must be zero for EQ and one for NE.
func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error {
// Before we allocate the result, we have to reserve two int registers.
nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(nanFragReg)
cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// Then, execute the comparison.
c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
// First, we get the parity flag, which indicates whether either of the values was NaN.
if shouldEqual {
// Set 1 if neither value is NaN.
c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg)
} else {
// Set 1 if either value is NaN.
c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg)
}
// Next, we get the usual comparison flag.
if shouldEqual {
// Set 1 if equal.
c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg)
} else {
// Set 1 if not equal.
c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg)
}
// Do "and" or "or" operations on these two flags to get the actual result.
if shouldEqual {
c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg)
} else {
c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg)
}
// Clear the unnecessary bits by zero-extending the first byte.
// This is necessary because SET* writes only the lowest byte and leaves the upper bits (bits 8 to 31) unchanged.
c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg)
// Now we have the result in cmpResultReg register, so we record it.
c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32)
// Also, we no longer need nanFragRegister.
c.locationStack.markRegisterUnused(nanFragReg)
return nil
}
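// A minimal Go sketch of the flag combination above (illustrative only; a and b are stand-ins
// for the two float operands):
//
//	eq := !math.IsNaN(a) && !math.IsNaN(b) && a == b // SETPC (no parity) AND SETEQ
//	ne := math.IsNaN(a) || math.IsNaN(b) || a != b   // SETPS (parity) OR SETNE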
// compileEqz implements compiler.compileEqz for the amd64 architecture.
func (c *amd64Compiler) compileEqz(o *wazeroir.UnionOperation) (err error) {
v := c.locationStack.pop()
if err = c.compileEnsureOnRegister(v); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.fourZeros, v.register)
case wazeroir.UnsignedInt64:
err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.eightZeros, v.register)
}
if err != nil {
return err
}
// v is consumed by the cmp operation so release it.
c.locationStack.releaseRegister(v)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLt implements compiler.compileLt for the amd64 architecture.
func (c *amd64Compiler) compileLt(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
var inst asm.Instruction
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateL
inst = amd64.CMPL
case wazeroir.SignedTypeUint32:
resultConditionState = amd64.ConditionalRegisterStateB
inst = amd64.CMPL
case wazeroir.SignedTypeInt64:
inst = amd64.CMPQ
resultConditionState = amd64.ConditionalRegisterStateL
case wazeroir.SignedTypeUint64:
resultConditionState = amd64.ConditionalRegisterStateB
inst = amd64.CMPQ
case wazeroir.SignedTypeFloat32:
resultConditionState = amd64.ConditionalRegisterStateA
inst = amd64.COMISS
case wazeroir.SignedTypeFloat64:
resultConditionState = amd64.ConditionalRegisterStateA
inst = amd64.COMISD
}
c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileGt implements compiler.compileGt for the amd64 architecture.
func (c *amd64Compiler) compileGt(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateG
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
case wazeroir.SignedTypeUint32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeInt64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateG
case wazeroir.SignedTypeUint64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeFloat32:
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeFloat64:
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateA
}
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLe implements compiler.compileLe for the amd64 architecture.
func (c *amd64Compiler) compileLe(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var inst asm.Instruction
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateLE
inst = amd64.CMPL
case wazeroir.SignedTypeUint32:
resultConditionState = amd64.ConditionalRegisterStateBE
inst = amd64.CMPL
case wazeroir.SignedTypeInt64:
resultConditionState = amd64.ConditionalRegisterStateLE
inst = amd64.CMPQ
case wazeroir.SignedTypeUint64:
resultConditionState = amd64.ConditionalRegisterStateBE
inst = amd64.CMPQ
case wazeroir.SignedTypeFloat32:
resultConditionState = amd64.ConditionalRegisterStateAE
inst = amd64.UCOMISS
case wazeroir.SignedTypeFloat64:
resultConditionState = amd64.ConditionalRegisterStateAE
inst = amd64.UCOMISD
}
c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileGe implements compiler.compileGe for the amd64 architecture.
func (c *amd64Compiler) compileGe(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateGE
case wazeroir.SignedTypeUint32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeInt64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateGE
case wazeroir.SignedTypeUint64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeFloat32:
c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeFloat64:
c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateAE
}
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLoad implements compiler.compileLoad for the amd64 architecture.
func (c *amd64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
var (
isIntType bool
movInst asm.Instruction
targetSizeInBytes int64
vt runtimeValueType
)
unsignedType := wazeroir.UnsignedType(o.B1)
offset := uint32(o.U2)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
isIntType = true
movInst = amd64.MOVL
targetSizeInBytes = 32 / 8
vt = runtimeValueTypeI32
case wazeroir.UnsignedTypeI64:
isIntType = true
movInst = amd64.MOVQ
targetSizeInBytes = 64 / 8
vt = runtimeValueTypeI64
case wazeroir.UnsignedTypeF32:
isIntType = false
movInst = amd64.MOVL
targetSizeInBytes = 32 / 8
vt = runtimeValueTypeF32
case wazeroir.UnsignedTypeF64:
isIntType = false
movInst = amd64.MOVQ
targetSizeInBytes = 64 / 8
vt = runtimeValueTypeF64
}
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
if isIntType {
// For integer types, read the corresponding bytes from the offset to the memory
// and store the value to the int register.
c.assembler.CompileMemoryWithIndexToRegister(movInst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
} else {
// For float types, we read the value to the float register.
floatReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.assembler.CompileMemoryWithIndexToRegister(movInst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
floatReg)
c.pushRuntimeValueLocationOnRegister(floatReg, vt)
// We no longer need the int register so mark it unused.
c.locationStack.markRegisterUnused(reg)
}
return nil
}
// compileLoad8 implements compiler.compileLoad8 for the amd64 architecture.
func (c *amd64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 1
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move a byte at the offset to the register.
// Note that Load8 is only for integer types.
var inst asm.Instruction
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
inst = amd64.MOVBLSX
vt = runtimeValueTypeI32
case wazeroir.SignedUint32:
inst = amd64.MOVBLZX
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
inst = amd64.MOVBQSX
vt = runtimeValueTypeI64
case wazeroir.SignedUint64:
inst = amd64.MOVBQZX
vt = runtimeValueTypeI64
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
return nil
}
// compileLoad16 implements compiler.compileLoad16 for the amd64 architecture.
func (c *amd64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 16 / 8
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move 2 bytes at the offset to the register.
// Note that Load16 is only for integer types.
var inst asm.Instruction
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
inst = amd64.MOVWLSX
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
inst = amd64.MOVWQSX
vt = runtimeValueTypeI64
case wazeroir.SignedUint32:
inst = amd64.MOVWLZX
vt = runtimeValueTypeI32
case wazeroir.SignedUint64:
inst = amd64.MOVWQZX
vt = runtimeValueTypeI64
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
return nil
}
// compileLoad32 implements compiler.compileLoad32 for the amd64 architecture.
func (c *amd64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 32 / 8
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move 4 bytes at the offset to the register.
var inst asm.Instruction
signed := o.B1 == 1
if signed {
inst = amd64.MOVLQSX
} else {
inst = amd64.MOVLQZX
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
return nil
}
// compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
// into a register, and returns the stored register. We call the result "ceil" because we access the memory
// as memory.Buffer[ceil-targetSizeInBytes: ceil].
//
// Note: this also emits the instructions to check the out-of-bounds memory access.
// In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
base := c.locationStack.pop()
if err := c.compileEnsureOnRegister(base); err != nil {
return asm.NilRegister, err
}
result := base.register
if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
} else if offsetConst <= math.MaxUint32 {
// Note: in practice, this branch rarely happens, as it means the wasm binary knows the
// memory is larger than 2 GiB, or at least tries to access a region beyond 2 GiB.
//
// In this case, we cannot directly add the offset to a register with the ADDQ(const) instruction.
// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we would end up
// treating offsetConst as a negative number, which is wrong.
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return asm.NilRegister, err
}
c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
} else {
// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
return result, nil
}
// Now we compare the value with the memory length which is held by callEngine.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, result)
// Trap if the value exceeds the memory length (out-of-bounds access).
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
c.locationStack.markRegisterUnused(result)
return result, nil
}
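// A rough Go equivalent of the ceil computation and bounds check emitted above, assuming mem is
// the linear memory byte slice, base is the popped operand, and trap is a stand-in for exiting
// with nativeCallStatusCodeMemoryOutOfBounds:
//
//	ceil := uint64(base) + uint64(offsetArg) + uint64(targetSizeInBytes)
//	if ceil > uint64(len(mem)) {
//		trap()
//	}
//	_ = mem[ceil-uint64(targetSizeInBytes) : ceil] // the region actually accessed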
// compileStore implements compiler.compileStore for the amd64 architecture.
func (c *amd64Compiler) compileStore(o *wazeroir.UnionOperation) error {
var movInst asm.Instruction
var targetSizeInByte int64
unsignedType := wazeroir.UnsignedType(o.B1)
offset := uint32(o.U2)
switch unsignedType {
case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
movInst = amd64.MOVL
targetSizeInByte = 32 / 8
case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
movInst = amd64.MOVQ
targetSizeInByte = 64 / 8
}
return c.compileStoreImpl(offset, movInst, targetSizeInByte)
}
// compileStore8 implements compiler.compileStore8 for the amd64 architecture.
func (c *amd64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVB, 1)
}
// compileStore16 implements compiler.compileStore16 for the amd64 architecture.
func (c *amd64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVW, 16/8)
}
// compileStore32 implements compiler.compileStore32 for the amd64 architecture.
func (c *amd64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVL, 32/8)
}
func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
val := c.locationStack.pop()
if err := c.compileEnsureOnRegister(val); err != nil {
return err
}
reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
if err != nil {
return err
}
c.assembler.CompileRegisterToMemoryWithIndex(
inst, val.register,
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
)
// We no longer need both the value and base registers.
c.locationStack.releaseRegister(val)
c.locationStack.markRegisterUnused(reg)
return nil
}
// compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
func (c *amd64Compiler) compileMemoryGrow() error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
return err
}
// After the function call, we have to initialize the stack base pointer and memory reserved registers.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
func (c *amd64Compiler) compileMemorySize() error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, loc.register)
// WebAssembly's memory.size returns the current memory size in pages, where one page is 65536 bytes.
// That is equivalent to dividing the length of the memory slice by 65536,
// which can be computed as a right shift by 16 bits since 65536 = 2^16.
c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
return nil
}
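// The generated code is roughly equivalent to the following Go, assuming mem is the linear
// memory byte slice (65536 = 2^16 bytes per page):
//
//	sizeInPages := uint32(len(mem) >> 16)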
// compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
func (c *amd64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
dataIndex := uint32(o.U1)
return c.compileInitImpl(false, dataIndex, 0)
}
// compileInitImpl implements compileTableInit and compileMemoryInit.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
if isTable {
outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
}
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(instanceAddr)
if isTable {
c.compileLoadElemInstanceAddress(index, instanceAddr)
} else {
c.compileLoadDataInstanceAddress(index, instanceAddr)
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// Check instance bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
instanceAddr, 8, // Both DataInstance and ElementInstance store their length at offset 8.
sourceOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
if isTable {
// Load the target table's address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
// Compare length.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
} else {
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
destinationOffset.register)
}
c.compileTrapFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
// Otherwise, we are ready to copy the value from source to destination.
//
// If the copy size equals zero, we skip all the instructions below.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
var scale int16
var memToReg, regToMem asm.Instruction
if isTable {
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
tmp, tableInstanceTableOffset, destinationOffset.register)
// sourceOffset += data buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
instanceAddr, 0, sourceOffset.register)
// For tables, we move 8 bytes at once.
memToReg = amd64.MOVQ
regToMem = memToReg
scale = 8
} else {
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// sourceOffset += data buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register)
// Move one byte at a time.
memToReg = amd64.MOVBQZX
regToMem = amd64.MOVB
scale = 1
}
// Negate the counter.
c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register)
beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
c.assembler.CompileMemoryWithIndexToRegister(memToReg,
sourceOffset.register, 0, copySize.register, scale,
tmp)
// [destinationOffset + (size.register)] = tmp.
c.assembler.CompileRegisterToMemoryWithIndex(regToMem,
tmp,
destinationOffset.register, 0, copySize.register, scale,
)
// Increment the negated counter; the loop continues while it is still negative.
c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register)
c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, instanceAddr, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
return nil
}
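// For reference, memory.init (isTable == false) behaves roughly like the following Go, where
// mem, data, d, s and n are stand-ins for the memory, the data instance, and the three popped
// operands, and trap exits with the out-of-bounds status:
//
//	if uint64(s)+uint64(n) > uint64(len(data)) || uint64(d)+uint64(n) > uint64(len(mem)) {
//		trap()
//	}
//	copy(mem[d:d+n], data[s:s+n])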
// compileDataDrop implements compiler.compileDataDrop for the amd64 architecture.
func (c *amd64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
dataIndex := uint32(o.U1)
c.compileLoadDataInstanceAddress(dataIndex, tmp)
// Clear the slice header (data pointer, len, cap) of DataInstances[dataIndex] ([]byte).
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
return nil
}
func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
// dst = dataIndex * dataInstanceStructSize.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst)
// dst = &moduleInstance.DataInstances[0] + dst
// = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
// = &moduleInstance.DataInstances[dataIndex]
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
dst,
)
}
// compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions.
func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) {
// Skip if nothing to copy.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
restoreCrossing := c.compilePreventCrossedTargetRegisters(
[]*runtimeValueLocation{destinationOffset, sourceOffset, copySize},
[]asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX})
// Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
// Point at the first byte of the first quadword to copy.
if backwards {
c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI)
c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI)
// Set REP prefix direction backwards.
c.assembler.CompileStandAlone(amd64.STD)
}
c.assembler.CompileStandAlone(amd64.REPMOVSQ)
if backwards {
// Reset direction.
c.assembler.CompileStandAlone(amd64.CLD)
}
// Restore registers.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
restoreCrossing()
c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
c.assembler.CompileStandAlone(amd64.NOP)
}
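// REP MOVSQ behaves roughly like the following Go loop over quadwords (dst, src and n are
// stand-ins; with the direction flag set by STD, the copy runs from the end instead):
//
//	for i := 0; i < n; i++ {
//		dst[i] = src[i] // one uint64 per iteration
//	}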
// compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check.
func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
// Point at the first byte to be copied, depending on direction.
if backwards {
c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
}
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// sourceOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register)
// Copy copySize % 8 bytes in a loop so the remainder can be copied in 8-byte groups afterward.
beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
// Check copySize % 8 == 0.
c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register)
breakLoop := c.assembler.CompileJump(amd64.JEQ)
c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp)
c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0)
if backwards {
c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
} else {
c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
}
c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop)
c.assembler.SetJumpTargetOnNext(breakLoop)
// compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7)
}
// compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture.
//
// This uses efficient `REP MOVSQ` instructions to copy in quadword (8 bytes) batches. The remaining bytes
// are copied with a simple `MOV` loop. It uses backward copying for overlapped segments.
func (c *amd64Compiler) compileMemoryCopy() error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// tmp = max(sourceOffset, destinationOffset).
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, destinationOffset.register)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, sourceOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, destinationOffset.register, tmp)
// Check maximum bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, tmp)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
// Skip zero size.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// If dest < source, we can copy forwards
c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
// If source + size < dest, we can copy forwards
c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
// Copy backwards.
c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
endJump := c.assembler.CompileJump(amd64.JMP)
// Copy forwards.
c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
c.assembler.SetJumpTargetOnNext(endJump)
return nil
}
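// The direction choice above mirrors the following Go, where d, s and n are stand-ins for the
// destination offset, source offset and size:
//
//	if d <= s || s+n <= d {
//		// Regions don't overlap, or dest is below source: a forward copy is safe.
//	} else {
//		// dest lies inside [s, s+n): copy backwards so bytes are not overwritten before being read.
//	}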
// compileFillLoopImpl implements a REP STOSQ fill loop.
func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
// Skip if nothing to fill.
c.assembler.CompileRegisterToConst(amd64.CMPQ, fillSize.register, 0)
emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
if replicateByte {
// Truncate value.register to a single byte
c.assembler.CompileConstToRegister(amd64.ANDQ, 0xff, value.register)
// Replicate single byte onto full 8-byte register.
c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
}
// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
restoreCrossing := c.compilePreventCrossedTargetRegisters(
[]*runtimeValueLocation{destinationOffset, value, fillSize},
[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})
// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
c.assembler.CompileStandAlone(amd64.REPSTOSQ)
// Restore registers.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
restoreCrossing()
c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
}
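// The byte replication above is equivalent to the following Go, spreading a single byte b
// (a stand-in for the masked value) into all eight bytes of a quadword so REP STOSQ can store
// it eight bytes at a time:
//
//	v := uint64(b) * 0x0101010101010101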
// compileFillImpl implements compileMemoryFill and compileTableFill for the amd64 architecture.
//
// For memory, this uses efficient `REP STOSQ` instructions to fill in quadword (8 bytes) batches
// when the size is above 15 bytes. For smaller sizes, a simple MOVB loop is the better option.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
value := c.locationStack.pop()
if err := c.compileEnsureOnRegister(value); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
if isTable {
// tmp = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
tmp)
// tmp = [tmp + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
tmp, tableInstanceTableLenOffset,
destinationOffset.register)
} else {
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
destinationOffset.register)
}
if isTable {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
} else {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
}
// Otherwise, we are ready to fill the destination with the value.
//
// If the fill size equals zero, we skip all the instructions below.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// destinationOffset -= size.
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
if isTable {
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
} else {
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// Fill the first copySize % 16 bytes one at a time with a simple MOVB loop.
beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register)
breakLoop := c.assembler.CompileJump(amd64.JEQ)
c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0)
c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop)
c.assembler.SetJumpTargetOnNext(breakLoop)
// compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
}
c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable)
c.locationStack.markRegisterUnused(copySize.register, value.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
return nil
}
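// For reference, memory.fill (isTable == false) behaves roughly like the following Go, where
// mem, d, v and n are stand-ins for the memory and the three popped operands, and trap exits
// with the out-of-bounds status:
//
//	if uint64(d)+uint64(n) > uint64(len(mem)) {
//		trap()
//	}
//	for i := uint32(0); i < n; i++ {
//		mem[d+i] = byte(v)
//	}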
// compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileMemoryFill() error {
return c.compileFillImpl(false, 0)
}
// compileTableInit implements compiler.compileTableInit for the amd64 architecture.
func (c *amd64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
elemIndex := uint32(o.U1)
tableIndex := uint32(o.U2)
return c.compileInitImpl(true, elemIndex, tableIndex)
}
// compileTableCopyLoopImpl is used for directly copying after bounds/direction check.
func (c *amd64Compiler) compileTableCopyLoopImpl(srcTableIndex, dstTableIndex uint32, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
// Point at the first byte to be copied.
if !backwards {
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
}
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
// sourceOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register)
c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8)
}
// compileTableCopy implements compiler.compileTableCopy for the amd64 architecture.
//
// It uses efficient `REP MOVSQ` instructions to copy table elements in quadword (8 bytes) batches.
// It uses backward copying for overlapped segments.
func (c *amd64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
srcTableIndex := uint32(o.U1)
dstTableIndex := uint32(o.U2)
// Check source bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
// Skip zero size.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// If dest < source, we can copy forwards.
c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
// If source + size < dest, we can copy forwards.
c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
// Copy backwards.
c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, true)
endJump := c.assembler.CompileJump(amd64.JMP)
// Copy forwards.
c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, false)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
c.assembler.SetJumpTargetOnNext(endJump)
return nil
}
// compileElemDrop implements compiler.compileElemDrop for the amd64 architecture.
func (c *amd64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
elemIndex := uint32(o.U1)
c.compileLoadElemInstanceAddress(elemIndex, tmp)
// Clear the slice header (data pointer, len, cap) of ElementInstances[elemIndex].References ([]uintptr).
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
return nil
}
func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
// dst = elemIndex * elementInstanceStructSize
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst)
// dst = &moduleInstance.ElementInstances[0] + dst
// = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
// = &moduleInstance.ElementInstances[elemIndex]
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
dst,
)
}
// compileTableGet implements compiler.compileTableGet for the amd64 architecture.
func (c *amd64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
ref, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(ref)
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
// ref = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
ref)
// ref = [ref + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableIndex*8, ref)
// Out of bounds check.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref)
// ref = [ref + 0 + offset.register * 8]
// = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset]
// = [&tables[TableIndex].References[offset]]
// = tables[TableIndex].References[offset]
c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref,
0, offset.register, 8, ref,
)
c.locationStack.markRegisterUnused(offset.register)
c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
return nil
}
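// The emitted code corresponds roughly to the following Go, where table and offset are
// stand-ins for tables[TableIndex] and the popped operand, and trap exits with the invalid
// table access status:
//
//	if offset >= uint32(len(table.References)) {
//		trap()
//	}
//	ref := table.References[offset]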
// compileTableSet implements compiler.compileTableSet for the amd64 architecture.
func (c *amd64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
ref := c.locationStack.pop()
if err := c.compileEnsureOnRegister(ref); err != nil {
return err
}
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// tmp = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
tmp)
// ref = [ref + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableIndex*8, tmp)
// Out of bounds check.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp)
// [tmp + 0 + offset.register * 8] = ref
// [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref
// [&tables[TableIndex].References[offset]] = ref
// tables[TableIndex].References[offset] = ref
c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ,
ref.register,
tmp, 0, offset.register, 8)
c.locationStack.markRegisterUnused(offset.register, ref.register)
return nil
}
// compileTableGrow implements compiler.compileTableGrow for the amd64 architecture.
func (c *amd64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
// Pushes the table index.
tableIndex := uint32(o.U1)
if err := c.compileConstI32Impl(tableIndex); err != nil {
return err
}
// Table grow, like memory grow, cannot be done in assembly as it involves allocation in Go.
// Therefore, we call out to the builtin function for this purpose.
if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
return err
}
// TableGrow consumes three values (table index, number of items, initial value).
for i := 0; i < 3; i++ {
c.locationStack.pop()
}
// Then, the previous length was pushed as the result.
loc := c.locationStack.pushRuntimeValueLocationOnStack()
loc.valueType = runtimeValueTypeI32
// After return, we re-initialize reserved registers just like preamble of functions.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileTableSize implements compiler.compileTableSize for the amd64 architecture.
func (c *amd64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// result = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
result)
// result = [result + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableIndex*8, result)
// result = [result + tableInstanceTableLenOffset]
// = [tables[TableIndex] + tableInstanceTableLenOffset]
// = len(tables[TableIndex])
c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// compileTableFill implements compiler.compileTableFill for the amd64 architecture.
func (c *amd64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
tableIndex := uint32(o.U1)
return c.compileFillImpl(true, tableIndex)
}
// compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
func (c *amd64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
ref, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
functionIndex := int64(o.U1)
c.assembler.CompileConstToRegister(amd64.MOVQ, functionIndex*functionSize, ref)
// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset + int64(o.FunctionIndex)*functionSize]
// = &moduleEngine.functions[index]
c.assembler.CompileMemoryToRegister(
amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
ref,
)
c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
return nil
}
// compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
func (c *amd64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
return c.compileConstI32Impl(uint32(o.U1))
}
func (c *amd64Compiler) compileConstI32Impl(v uint32) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
c.assembler.CompileConstToRegister(amd64.MOVL, int64(v), reg)
return nil
}
// compileConstI64 implements compiler.compileConstI64 for the amd64 architecture.
func (c *amd64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1), reg)
return nil
}
// compileConstF32 implements compiler.compileConstF32 for the amd64 architecture.
func (c *amd64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32)
// We cannot load the immediate directly into a float register,
// so we place it in an int register first and then move it over.
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.U1) /*math.Float32bits(o.Value)*/, tmpReg)
c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg)
return nil
}
// compileConstF64 implements compiler.compileConstF64 for the amd64 architecture.
func (c *amd64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64)
// We cannot load the immediate directly into a float register,
// so we place it in an int register first and then move it over.
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1) /* math.Float64bits(o.Value) */, tmpReg)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg)
return nil
}
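// Illustrative sketch of the constant materialization above (names and the value are stand-ins):
// the 64-bit pattern travels through a general purpose register before landing in the XMM register.
//
//	bits := math.Float64bits(1.5) // encoded as the immediate of the first MOVQ
//	// MOVQ $bits, tmpGP
//	// MOVQ tmpGP, xmm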
// compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64.
func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
var inst asm.Instruction
switch loc.valueType {
case runtimeValueTypeV128Lo:
inst = amd64.MOVDQU
case runtimeValueTypeV128Hi:
panic("BUG: V128Hi must be be loaded to a register along with V128Lo")
case runtimeValueTypeI32, runtimeValueTypeF32:
inst = amd64.MOVL
case runtimeValueTypeI64, runtimeValueTypeF64:
inst = amd64.MOVQ
default:
panic("BUG: unknown runtime value type")
}
// Copy the value from the stack.
c.assembler.CompileMemoryToRegister(inst,
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
loc.register)
if loc.valueType == runtimeValueTypeV128Lo {
// Higher 64-bits are loaded as well ^^.
hi := &c.locationStack.stack[loc.stackPointer+1]
hi.setRegister(loc.register)
}
}
// maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
// to a general purpose register if the value is located on a conditional register.
//
// This is usually called at the beginning of methods on the compiler interface that might
// compile instructions without saving the conditional register value.
// The compileXXX functions that do not call this function instead save the conditional
// value to the stack or a register by invoking compileEnsureOnRegister on the top value.
func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
if c.locationStack.sp > 0 {
if loc := c.locationStack.peek(); loc.onConditionalRegister() {
if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
return err
}
}
}
return
}
// compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
// to a newly allocated general purpose register.
func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.compileMoveConditionalToGeneralPurposeRegister(loc, reg)
return nil
}
func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) {
// Set the flag bit to the destination. See
// - https://c9x.me/x86/html/file_module_x86_id_288.html
// - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468
// to translate conditionalRegisterState* to amd64.SET*
var inst asm.Instruction
switch loc.conditionalRegister {
case amd64.ConditionalRegisterStateE:
inst = amd64.SETEQ
case amd64.ConditionalRegisterStateNE:
inst = amd64.SETNE
case amd64.ConditionalRegisterStateS:
inst = amd64.SETMI
case amd64.ConditionalRegisterStateNS:
inst = amd64.SETPL
case amd64.ConditionalRegisterStateG:
inst = amd64.SETGT
case amd64.ConditionalRegisterStateGE:
inst = amd64.SETGE
case amd64.ConditionalRegisterStateL:
inst = amd64.SETLT
case amd64.ConditionalRegisterStateLE:
inst = amd64.SETLE
case amd64.ConditionalRegisterStateA:
inst = amd64.SETHI
case amd64.ConditionalRegisterStateAE:
inst = amd64.SETCC
case amd64.ConditionalRegisterStateB:
inst = amd64.SETCS
case amd64.ConditionalRegisterStateBE:
inst = amd64.SETLS
}
c.assembler.CompileNoneToRegister(inst, reg)
// SETcc only writes the lowest byte of the destination, so mask off the (possibly garbage) upper bits.
c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg)
// Mark that the value now lives on the register.
loc.setRegister(reg)
c.locationStack.markRegisterUsed(reg)
}
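
// For example, materializing ConditionalRegisterStateE (the "equal"/zero flag) into AX emits,
// roughly (illustration only):
//
//	SETEQ AL       ; AL = 1 if ZF is set, 0 otherwise
//	ANDQ  $1, RAX  ; clear the undefined upper bits of the register
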
// allocateRegister implements compiler.allocateRegister for amd64.
func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
var ok bool
// Try to get the unused register.
reg, ok = c.locationStack.takeFreeRegister(t)
if ok {
return
}
// If not found, we have to steal a register that is currently in use.
stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
if !ok {
err = fmt.Errorf("cannot steal register")
return
}
// Release the steal target's value onto its stack location, which frees the register.
reg = stealTarget.register
c.compileReleaseRegisterToStack(stealTarget)
return
}
// compileCallFunctionImpl adds instructions to call the function whose *function address is held in functionAddressRegister.
//
// Note: this is the counterpart of compileReturnFunction; see the comments there as well
// to understand how function calls are achieved.
func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
c.locationStack.markRegisterUsed(functionAddressRegister)
// Obtain a temporary register to be used in the following steps.
tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except functionAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
// The stack should look like:
//
// reserved slots for results (if len(results) > len(args))
// | |
// ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
// | | |
// | callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
// |
// nextStackBasePointerOffset
//
// where callFrame is used to return to this currently executed function.
nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
tmpRegister)
callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
// Compute the next function's stack base pointer (in bytes) by adding nextStackBasePointerOffset*8 to the current value still held in tmpRegister.
c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister)
// Write the calculated value to callEngine.stackContext.stackBasePointer.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
tmpRegister)
callFrameFunctionLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameFunctionLoc)
// Set callEngine.moduleContext.fn to the next *function.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
// Write the return address into callFrameReturnAddressLoc: CompileReadInstructionAddress materializes the
// address of the instruction right after the JMP below, which is where the callee jumps back to.
c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP)
callFrameReturnAddressLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister {
// In this case, we must move the value in functionAddressRegister to another register; otherwise
// the jump target address below would be overwritten and result in a segfault.
// See #526.
c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister)
functionAddressRegister = tmpRegister
}
// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, functionAddressRegister, functionModuleInstanceOffset,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// And jump into the initial address of the target function.
c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)
// All the registers used are temporary, so we mark them unused.
c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)
// On the function return, we have to initialize the state.
if err := c.compileModuleContextInitialization(); err != nil {
return err
}
// Due to the change to callEngine.stackContext.stackBasePointer.
c.compileReservedStackBasePointerInitialization()
// Due to the change to callEngine.moduleContext.moduleInstance as that might result in
// the memory instance manipulation.
c.compileReservedMemoryPointerInitialization()
// We consumed the function parameters, the call frame stack and reserved slots during the call.
c.locationStack.sp = uint64(nextStackBasePointerOffset)
// Now the function results are pushed by the call.
for _, t := range functype.Results {
loc := c.locationStack.pushRuntimeValueLocationOnStack()
switch t {
case wasm.ValueTypeI32:
loc.valueType = runtimeValueTypeI32
case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
loc.valueType = runtimeValueTypeI64
case wasm.ValueTypeF32:
loc.valueType = runtimeValueTypeF32
case wasm.ValueTypeF64:
loc.valueType = runtimeValueTypeF64
case wasm.ValueTypeV128:
loc.valueType = runtimeValueTypeV128Lo
hi := c.locationStack.pushRuntimeValueLocationOnStack()
hi.valueType = runtimeValueTypeV128Hi
default:
panic("BUG: invalid type: " + wasm.ValueTypeName(t))
}
}
return nil
}
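
// As a concrete example of the stack arithmetic above (illustration only): calling a function of type
// (param i32 i32) (result i64) while c.locationStack.sp == 5 gives nextStackBasePointerOffset = 5 - 2 = 3,
// so the callee's stack base starts right at arg0 and the two arguments become the callee's bottom stack
// slots. After the call, sp is reset to 3 and the i64 result location is pushed there, leaving sp == 4.
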
// compileReturnFunction adds instructions to return from the current call frame back to the caller's frame.
// If the current frame is the bottom of the call stack (i.e. the origin of execution), we return to
// callEngine.execWasmFunction with the Returned status. Otherwise, we jump to the caller's return address
// stored in callFrame.returnAddress while making all the necessary changes to the callEngine's state.
//
// Note: this is the counterpart of compileCallFunctionImpl; see the comments there as well
// to understand how function calls are achieved.
func (c *amd64Compiler) compileReturnFunction() error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
if c.withListener {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
return err
}
// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
c.compileReservedStackBasePointerInitialization()
}
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
// so mark it used so that it won't be used as a free register.
c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Obtain a temporary register to be used in the following.
returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
}
returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
// A zero return address means this is the bottom call frame, i.e. we should exit the execution rather than return to a caller.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
returnAddressRegister,
)
c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
jmpIfNotReturn := c.assembler.CompileJump(amd64.JNE)
c.compileExitFromNativeCode(nativeCallStatusCodeReturned)
// Otherwise, we return to the caller.
c.assembler.SetJumpTargetOnNext(jmpIfNotReturn)
// Alias for readability.
tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister
// First, restore callEngine.stackContext.stackBasePointerInBytes from callerStackBasePointerInBytes.
callerStackBasePointerInBytes.setRegister(tmpRegister)
c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
// Next, restore moduleContext.fn from callerFunction.
callerFunction.setRegister(tmpRegister)
c.compileLoadValueOnStackToRegister(callerFunction)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
// Also, we have to put the caller function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
tmpRegister, functionModuleInstanceOffset,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Then, jump into the return address!
c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
return nil
}
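
// Roughly, the generated epilogue looks like the following pseudo assembly (illustration only):
//
//	MOVQ  [stackBase + returnAddress.stackPointer*8], reg
//	TESTQ reg, reg
//	JNE   notBottomFrame
//	; exit to Go with nativeCallStatusCodeReturned
//	notBottomFrame:
//	; restore the caller's stackBasePointerInBytes and moduleContext.fn, then
//	JMP   reg
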
func (c *amd64Compiler) compileCallGoHostFunction() error {
return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
}
func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
// Set the builtin function index to callEngine.exitContext.builtinFunctionCallIndex.
c.assembler.CompileConstToMemory(amd64.MOVL, int64(index), amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
}
func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
c.compileExitFromNativeCode(compilerStatus)
return nil
}
// compileReleaseAllRegistersToStack adds the instructions to release all the live values
// currently held on (conditional) registers onto their stack memory locations.
func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
for i := uint64(0); i < c.locationStack.sp; i++ {
if loc := &c.locationStack.stack[i]; loc.onRegister() {
c.compileReleaseRegisterToStack(loc)
} else if loc.onConditionalRegister() {
if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
return
}
c.compileReleaseRegisterToStack(loc)
}
}
return
}
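
// onValueReleaseRegisterToStack releases the value (if any) currently held on the given register
// onto its stack memory location so that the register can be reused.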
func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
for i := uint64(0); i < c.locationStack.sp; i++ {
prevValue := &c.locationStack.stack[i]
if prevValue.register == reg {
c.compileReleaseRegisterToStack(prevValue)
break
}
}
}
// compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
var inst asm.Instruction
switch loc.valueType {
case runtimeValueTypeV128Lo:
inst = amd64.MOVDQU
case runtimeValueTypeV128Hi:
panic("BUG: V128Hi must be released to the stack along with V128Lo")
case runtimeValueTypeI32, runtimeValueTypeF32:
inst = amd64.MOVL
case runtimeValueTypeI64, runtimeValueTypeF64:
inst = amd64.MOVQ
default:
panic("BUG: unknown runtime value type")
}
c.assembler.CompileRegisterToMemory(inst, loc.register,
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
// Mark that the register is now free.
c.locationStack.releaseRegister(loc)
if loc.valueType == runtimeValueTypeV128Lo {
// The higher 64 bits were stored by the MOVDQU above as well, so release the hi part too.
hi := &c.locationStack.stack[loc.stackPointer+1]
c.locationStack.releaseRegister(hi)
}
}
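
// compileTrapFromNativeCode adds a conditional trap with the given status: skipCondition is the jump
// condition under which execution must NOT trap. The first time a status is seen, the exit sequence is
// emitted inline and remembered in c.compiledTrapTargets; on later occurrences, the condition is inverted
// and a jump to the already-compiled exit sequence is emitted instead. As a rough illustration, a bounds
// check that traps when the carry flag is set can pass amd64.JCC here and get either
//
//	JCC skip
//	; exit with the trap status
//	skip:
//
// the first time, or simply
//
//	JCS alreadyCompiledTrapTarget
//
// afterwards.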
func (c *amd64Compiler) compileTrapFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
if target := c.compiledTrapTargets[status]; target == nil {
skip := c.assembler.CompileJump(skipCondition)
// Save the trap target for future reference.
c.compiledTrapTargets[status] = c.compileNOP()
c.compileExitFromNativeCode(status)
c.assembler.SetJumpTargetOnNext(skip)
} else {
// We've already compiled this.
// Invert the condition to jump into the appropriate target.
var trapCondition asm.Instruction
switch skipCondition {
case amd64.JHI:
trapCondition = amd64.JLS
case amd64.JLS:
trapCondition = amd64.JHI
case amd64.JNE:
trapCondition = amd64.JEQ
case amd64.JEQ:
trapCondition = amd64.JNE
case amd64.JCC:
trapCondition = amd64.JCS
case amd64.JCS:
trapCondition = amd64.JCC
case amd64.JPC:
trapCondition = amd64.JPS
case amd64.JPS:
trapCondition = amd64.JPC
case amd64.JPL:
trapCondition = amd64.JMI
case amd64.JMI:
trapCondition = amd64.JPL
default:
panic("BUG: couldn't invert condition")
}
c.assembler.CompileJump(trapCondition).AssignJumpTarget(target)
}
}
func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
// Write back the cached stack pointer value to callEngine.stackContext.stackPointer.
c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)
switch status {
case nativeCallStatusCodeReturned:
case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
// Read the return address, and write it to callEngine.exitContext.returnAddress.
returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !ok {
panic("BUG: cannot take free register")
}
c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
default:
// In this case, the execution traps, so store the current instruction's address into
// callEngine.exitContext.returnAddress so that the stack trace can contain the top frame's source position.
// R15 (the reserved memory register) is safe to clobber here because native code never resumes after a trap.
tmpReg := amd64.RegR15
c.assembler.CompileReadInstructionAddress(tmpReg, amd64.MOVQ)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
}
c.assembler.CompileStandAlone(amd64.RET)
}
func (c *amd64Compiler) compilePreamble() (err error) {
// We assume all function parameters are already pushed onto the stack by
// the caller.
c.locationStack.init(c.typ)
if err := c.compileModuleContextInitialization(); err != nil {
return err
}
// Check if it's necessary to grow the value stack by using max stack pointer.
if err = c.compileMaybeGrowStack(); err != nil {
return err
}
if c.withListener {
if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
return err
}
}
c.compileReservedStackBasePointerInitialization()
// Finally, we initialize the reserved memory register based on the module context.
c.compileReservedMemoryPointerInitialization()
return
}
func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
// First, make amd64ReservedRegisterForStackBasePointerAddress point to the beginning of the stack slice's backing array.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
amd64ReservedRegisterForStackBasePointerAddress)
// Next, add callEngine.stackContext.stackBasePointerInBytes so that the register points at the stack base of the current frame.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
amd64ReservedRegisterForStackBasePointerAddress,
)
}
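
// compileReservedMemoryPointerInitialization loads the address of the memory buffer's first byte
// (callEngine.moduleContext.memoryElement0Address) into amd64ReservedRegisterForMemory, but only when
// the module has or may use a memory; otherwise the reserved register is left untouched.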
func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
if c.ir.HasMemory || c.ir.UsesMemory {
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
amd64ReservedRegisterForMemory,
)
}
}
// compileMaybeGrowStack adds instructions to check whether the value stack needs to grow,
// and if so, to call the builtin function that grows it. These instructions are emitted
// in the function's preamble.
func (c *amd64Compiler) compileMaybeGrowStack() error {
tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !ok {
panic("BUG: cannot take free register")
}
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
c.assembler.CompileMemoryToRegister(amd64.SUBQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)
// At this point tmpRegister holds stackLen - stackBasePointer (the remaining stack space).
// If it is smaller than the maximum stack pointer this function may reach, we need to grow the stack.
// The constant 0 below is only a placeholder: it is patched with the actual stack pointer ceiling
// once it is known (see assignStackPointerCeilNeeded).
cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
c.assignStackPointerCeilNeeded = cmpWithStackPointerCeil
// Jump if we have no need to grow.
jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)
// Otherwise, we have to make the builtin function call to grow the call stack.
if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
return nil
}
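
// For example (illustration only, assuming the patched ceiling is expressed in bytes like the other
// operands): with stackLenInBytes == 160 and stackBasePointerInBytes == 128 the remaining space is
// 160 - 128 = 32 bytes, so a function that may use up to 48 bytes of stack takes the
// builtinFunctionIndexGrowStack call, while one that needs at most 32 bytes skips it via the JCC above.
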
// compileModuleContextInitialization adds instructions to initialize callEngine.ModuleContext's fields based on
// callEngine.ModuleContext.ModuleInstanceAddress.
// This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
func (c *amd64Compiler) compileModuleContextInitialization() error {
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
// so mark it used so that it won't be used as a free register until the module context initialization finishes.
c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Obtain the temporary registers to be used in the following steps.
tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
c.locationStack.markRegisterUsed(tmpRegister)
tmpRegister2, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
c.locationStack.markRegisterUsed(tmpRegister2)
// If the module instance address stays the same, we can skip the entire code below.
// The rationale is that, in almost all use cases, users instantiate a single Wasm binary
// and run functions from it, rather than importing/exporting across multiple binaries.
// As a result, the cmp-and-jmp sequence below should be easy for the x64 CPU to
// branch-predict, since the jump is taken nearly 100% of the time across function calls.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)
// If callEngine.moduleContext.moduleInstance is not equal to the value in amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
// we have to put the new value there.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
// Also, we have to update the following fields:
// * callEngine.moduleContext.globalElement0Address
// * callEngine.moduleContext.tablesElement0Address
// * callEngine.moduleContext.memoryInstance
// * callEngine.moduleContext.memoryElement0Address
// * callEngine.moduleContext.memorySliceLen
// * callEngine.moduleContext.functionsElement0Address
// * callEngine.moduleContext.typeIDsElement0Address
// * callEngine.moduleContext.dataInstancesElement0Address
// * callEngine.moduleContext.elementInstancesElement0Address
// Update globalElement0Address.
//
// Note: if there's global.get or set instruction in the function, the existence of the globals
// is ensured by function validation at module instantiation phase, and that's why it is ok to
// skip the initialization if the module's globals slice is empty.
if len(c.ir.Globals) > 0 {
// Since ModuleInstance.Globals is a slice ([]*GlobalInstance), the first word of the slice
// header at moduleInstanceGlobalsOffset is its data pointer, i.e. &Globals[0].
// See https://go.dev/blog/slices-intro if unfamiliar with the slice header layout.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset, tmpRegister)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
}
// Update tableElement0Address.
//
// Note: if there's table instruction in the function, the existence of the table
// is ensured by function validation at module instantiation phase, and that's
// why it is ok to skip the initialization if the module's table doesn't exist.
if c.ir.HasTable {
// First, read the data pointer of ModuleInstance.Tables (i.e. &Tables[0]) into tmpRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset, tmpRegister)
// At this point, tmpRegister holds &ModuleInstance.Tables[0] (the slice's data pointer),
// so we store it into callEngine.moduleContext.tablesElement0Address.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)
// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
}
// Update memoryElement0Address and memorySliceLen.
//
// Note: if there's memory instruction in the function, memory instance must be non-nil.
// That is ensured by function validation at module instantiation phase, and that's
// why it is ok to skip the initialization if the module's memory instance is nil.
if c.ir.HasMemory {
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
tmpRegister)
// Set memory instance.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)
// Set length.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferLenOffset, tmpRegister2)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset)
// Set element zero address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
}
// Update moduleContext.functionsElement0Address.
{
// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
//
// A Go interface value is laid out in memory as two quad words, struct{ tab, data uintptr },
// where tab points to the interface's type/method table and data points to the actual
// implementation. Here we extract the "data" pointer, which is the *moduleEngine.
// See the following references for detail:
// * https://research.swtch.com/interfaces
// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset, tmpRegister)
// "tmpRegister = [tmpRegister + moduleEnginecodesOffset] (== &moduleEngine.codes[0])"
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)
// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
callEngineModuleContextFunctionsElement0AddressOffset)
}
// Update dataInstancesElement0Address.
if c.ir.HasDataInstances {
// "tmpRegister = &moduleInstance.DataInstances[0]"
c.assembler.CompileMemoryToRegister(
amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
tmpRegister,
)
// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(
amd64.MOVQ,
tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
)
}
// Update callEngine.moduleContext.elementInstancesElement0Address
if c.ir.HasElementInstances {
// "tmpRegister = &moduleInstance.ElementInstnaces[0]"
c.assembler.CompileMemoryToRegister(
amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
tmpRegister,
)
// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(
amd64.MOVQ,
tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
)
}
c.locationStack.markRegisterUnused(tmpRegister, tmpRegister2)
// Set the jump target towards the next instruction for the case where module instance address hasn't changed.
c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
return nil
}
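
// In the common case where the module instance has not changed between calls, the code generated above
// reduces to the comparison plus a taken jump, roughly (illustration only):
//
//	CMPQ [amd64ReservedRegisterForCallEngine + callEngineModuleContextModuleInstanceOffset], R12
//	JEQ  done
//
// where R12 is amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
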
// compileEnsureOnRegister ensures that the given value is located on a
// register of the appropriate type (general purpose or vector).
func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
if loc.onStack() {
// Allocate the register.
reg, err := c.allocateRegister(loc.getRegisterType())
if err != nil {
return err
}
// Mark that the value now lives on the register.
loc.setRegister(reg)
c.locationStack.markRegisterUsed(reg)
c.compileLoadValueOnStackToRegister(loc)
} else if loc.onConditionalRegister() {
err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
}
return
}
// compileMaybeSwapRegisters swaps two registers if they're not equal.
func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
if reg1 != reg2 {
c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
}
}
// compilePreventCrossedTargetRegisters swaps registers so that, for each runtimeValueLocation in locs,
// its target register (the register at the same index in targets) is not occupied by another
// runtimeValueLocation from locs. It returns a closure that restores the original register placement.
//
// This makes it possible to safely exchange one set of registers with another, where a register might be
// in both sets: after the swaps, each register corresponds either to itself or to a register not present
// in its own set.
//
// For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
// to make locs = [BX, CX, AX].
func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
type swap struct{ srcIndex, dstIndex int }
var swaps []swap
for i := range locs {
targetLocation := -1 // -1 means not found.
for j := range locs {
if locs[j].register == targets[i] {
targetLocation = j
break
}
}
if targetLocation != -1 && targetLocation != i {
c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
swaps = append(swaps, swap{i, targetLocation})
}
}
return func() {
// Restore in reverse order because a register can be moved multiple times.
for i := len(swaps) - 1; i >= 0; i -= 1 {
r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
}
}
}
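
// Tracing the example above step by step: i=0 finds BX at index 1 and swaps, giving [BX, AX, CX];
// i=1 finds no location holding SI, so nothing happens; i=2 finds AX at index 1 and swaps, giving
// [BX, CX, AX]. The returned closure replays those two swaps in reverse order to restore [AX, BX, CX].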