package compiler
// This file implements the compiler for amd64/x86_64 target.
// Please refer to https://www.felixcloutier.com/x86/index.html
// if unfamiliar with amd64 instructions used here.
import (
"fmt"
"math"
"github.com/tetratelabs/wazero/internal/asm"
"github.com/tetratelabs/wazero/internal/asm/amd64"
"github.com/tetratelabs/wazero/internal/platform"
"github.com/tetratelabs/wazero/internal/u32"
"github.com/tetratelabs/wazero/internal/u64"
"github.com/tetratelabs/wazero/internal/wasm"
"github.com/tetratelabs/wazero/internal/wazeroir"
)
var (
_minimum32BitSignedInt int32 = math.MinInt32
_maximum32BitSignedInt int32 = math.MaxInt32
_maximum32BitUnsignedInt uint32 = math.MaxUint32
_minimum64BitSignedInt int64 = math.MinInt64
_maximum64BitSignedInt int64 = math.MaxInt64
_maximum64BitUnsignedInt uint64 = math.MaxUint64
_float32SignBitMask uint32 = 1 << 31
_float32RestBitMask = ^_float32SignBitMask
_float64SignBitMask uint64 = 1 << 63
_float64RestBitMask = ^_float64SignBitMask
_float32ForMinimumSigned32bitInteger = uint32(0xCF00_0000)
_float64ForMinimumSigned32bitInteger = uint64(0xC1E0_0000_0020_0000)
_float32ForMinimumSigned64bitInteger = uint32(0xDF00_0000)
_float64ForMinimumSigned64bitInteger = uint64(0xC3E0_0000_0000_0000)
_float32ForMaximumSigned32bitIntPlusOne = uint32(0x4F00_0000)
_float64ForMaximumSigned32bitIntPlusOne = uint64(0x41E0_0000_0000_0000)
_float32ForMaximumSigned64bitIntPlusOne = uint32(0x5F00_0000)
_float64ForMaximumSigned64bitIntPlusOne = uint64(0x43E0_0000_0000_0000)
)
var (
// amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr)
amd64ReservedRegisterForCallEngine = amd64.RegR13
// amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call.
amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14
// amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
amd64ReservedRegisterForMemory = amd64.RegR15
)
var (
amd64UnreservedVectorRegisters = []asm.Register{ //nolint
amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3,
amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7,
amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11,
amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15,
}
// Note that we never invoke the "call" instruction,
// so we don't need to care about the calling convention.
// TODO: Maybe it is safe to just save rbp and rsp somewhere
// in Go-allocated variables, reuse these registers
// in compiled functions, and write them back before returning.
amd64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX,
amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9,
amd64.RegR10, amd64.RegR11, amd64.RegR12,
}
)
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds *wasm.ModuleInstance of the
// next executing function instance. The value is set and used when making function calls
// or function returns in the ModuleContextInitialization. See compileModuleContextInitialization.
var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12
func (c *amd64Compiler) String() string {
return c.locationStack.String()
}
// compileNOP implements compiler.compileNOP for the amd64 architecture.
func (c *amd64Compiler) compileNOP() asm.Node {
return c.assembler.CompileStandAlone(amd64.NOP)
}
type amd64Compiler struct {
assembler amd64.Assembler
ir *wazeroir.CompilationResult
cpuFeatures platform.CpuFeatureFlags
// locationStack holds the state of the wazeroir virtual stack,
// where each item is either placed in a register or on the actual memory stack.
locationStack *runtimeValueLocationStack
// labels hold per wazeroir label specific information in this function.
labels [wazeroir.LabelKindNum][]amd64LabelInfo
// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
stackPointerCeil uint64
// assignStackPointerCeilNeeded holds an asm.Node whose AssignDestinationConstant must be called with the determined stack pointer ceiling.
assignStackPointerCeilNeeded asm.Node
compiledTrapTargets [nativeCallStatusModuleClosed]asm.Node
withListener bool
typ *wasm.FunctionType
// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
// we cache it here, and reset and set to .locationStack in the Init method.
locationStackForEntrypoint runtimeValueLocationStack
// frameIDMax tracks the maximum value of frame id per function.
frameIDMax int
brTableTmp []runtimeValueLocation
fourZeros,
eightZeros,
minimum32BitSignedInt,
maximum32BitSignedInt,
maximum32BitUnsignedInt,
minimum64BitSignedInt,
maximum64BitSignedInt,
maximum64BitUnsignedInt,
float32SignBitMask,
float32RestBitMask,
float64SignBitMask,
float64RestBitMask,
float32ForMinimumSigned32bitInteger,
float64ForMinimumSigned32bitInteger,
float32ForMinimumSigned64bitInteger,
float64ForMinimumSigned64bitInteger,
float32ForMaximumSigned32bitIntPlusOne,
float64ForMaximumSigned32bitIntPlusOne,
float32ForMaximumSigned64bitIntPlusOne,
float64ForMaximumSigned64bitIntPlusOne *asm.StaticConst
}
func newAmd64Compiler() compiler {
c := &amd64Compiler{
assembler: amd64.NewAssembler(),
locationStackForEntrypoint: newRuntimeValueLocationStack(),
cpuFeatures: platform.CpuFeatures,
}
c.fourZeros = asm.NewStaticConst([]byte{0, 0, 0, 0})
c.eightZeros = asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0})
c.minimum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_minimum32BitSignedInt)))
c.maximum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_maximum32BitSignedInt)))
c.maximum32BitUnsignedInt = asm.NewStaticConst(u32.LeBytes(_maximum32BitUnsignedInt))
c.minimum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_minimum64BitSignedInt)))
c.maximum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_maximum64BitSignedInt)))
c.maximum64BitUnsignedInt = asm.NewStaticConst(u64.LeBytes(_maximum64BitUnsignedInt))
c.float32SignBitMask = asm.NewStaticConst(u32.LeBytes(_float32SignBitMask))
c.float32RestBitMask = asm.NewStaticConst(u32.LeBytes(_float32RestBitMask))
c.float64SignBitMask = asm.NewStaticConst(u64.LeBytes(_float64SignBitMask))
c.float64RestBitMask = asm.NewStaticConst(u64.LeBytes(_float64RestBitMask))
c.float32ForMinimumSigned32bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned32bitInteger))
c.float64ForMinimumSigned32bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned32bitInteger))
c.float32ForMinimumSigned64bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned64bitInteger))
c.float64ForMinimumSigned64bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned64bitInteger))
c.float32ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned32bitIntPlusOne))
c.float64ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned32bitIntPlusOne))
c.float32ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned64bitIntPlusOne))
c.float64ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned64bitIntPlusOne))
return c
}
// Init implements compiler.Init.
func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
c.assembler.Reset()
c.locationStackForEntrypoint.reset()
c.resetLabels()
*c = amd64Compiler{
ir: ir,
withListener: withListener,
typ: typ,
assembler: c.assembler,
cpuFeatures: c.cpuFeatures,
labels: c.labels,
locationStackForEntrypoint: c.locationStackForEntrypoint,
brTableTmp: c.brTableTmp,
fourZeros: c.fourZeros,
eightZeros: c.eightZeros,
minimum32BitSignedInt: c.minimum32BitSignedInt,
maximum32BitSignedInt: c.maximum32BitSignedInt,
maximum32BitUnsignedInt: c.maximum32BitUnsignedInt,
minimum64BitSignedInt: c.minimum64BitSignedInt,
maximum64BitSignedInt: c.maximum64BitSignedInt,
maximum64BitUnsignedInt: c.maximum64BitUnsignedInt,
float32SignBitMask: c.float32SignBitMask,
float32RestBitMask: c.float32RestBitMask,
float64SignBitMask: c.float64SignBitMask,
float64RestBitMask: c.float64RestBitMask,
float32ForMinimumSigned32bitInteger: c.float32ForMinimumSigned32bitInteger,
float64ForMinimumSigned32bitInteger: c.float64ForMinimumSigned32bitInteger,
float32ForMinimumSigned64bitInteger: c.float32ForMinimumSigned64bitInteger,
float64ForMinimumSigned64bitInteger: c.float64ForMinimumSigned64bitInteger,
float32ForMaximumSigned32bitIntPlusOne: c.float32ForMaximumSigned32bitIntPlusOne,
float64ForMaximumSigned32bitIntPlusOne: c.float64ForMaximumSigned32bitIntPlusOne,
float32ForMaximumSigned64bitIntPlusOne: c.float32ForMaximumSigned64bitIntPlusOne,
float64ForMaximumSigned64bitIntPlusOne: c.float64ForMaximumSigned64bitIntPlusOne,
}
// Reuses the initial location stack for the compilation of subsequent functions.
c.locationStack = &c.locationStackForEntrypoint
}
// resetLabels resets the existing content in amd64Compiler.labels so that
// we can reuse the allocated slices and stacks in subsequent compilations.
func (c *amd64Compiler) resetLabels() {
for i := range c.labels {
for j := range c.labels[i] {
if j > c.frameIDMax {
// Only need to reset up to the maximum frame id. This makes compilation faster for large binaries.
break
}
l := &c.labels[i][j]
l.initialInstruction = nil
l.stackInitialized = false
l.initialStack.reset()
}
}
}
// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
return c.locationStack
}
// setLocationStack sets the given runtimeValueLocationStack to .locationStack field,
// while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
// This is called when we branch into a different block.
func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
if c.stackPointerCeil < c.locationStack.stackPointerCeil {
c.stackPointerCeil = c.locationStack.stackPointerCeil
}
c.locationStack = newStack
}
// pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
c.locationStack.markRegisterUsed(reg)
return
}
// pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
c.locationStack.markRegisterUsed(reg)
return
}
type amd64LabelInfo struct {
// initialInstruction is the initial instruction for this label so other blocks can jump into it.
initialInstruction asm.Node
// initialStack is the initial value location stack from which we start compiling this label.
initialStack runtimeValueLocationStack
stackInitialized bool
}
func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
kind := label.Kind()
frames := c.labels[kind]
frameID := label.FrameID()
if c.frameIDMax < frameID {
c.frameIDMax = frameID
}
// If the frameID is not allocated yet, expand the slice to cover it,
// so that the allocated entries can be reused in subsequent compilations.
if diff := frameID - len(frames) + 1; diff > 0 {
for i := 0; i < diff; i++ {
frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
}
c.labels[kind] = frames
}
return &frames[frameID]
}
// compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the amd64 architecture.
func (c *amd64Compiler) compileBuiltinFunctionCheckExitCode() error {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexCheckExitCode); err != nil {
return err
}
// After the function call, we have to initialize the stack base pointer and memory reserved registers.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
// and return to the caller.
func (c *amd64Compiler) compileGoDefinedHostFunction() error {
// First we must update the location stack to reflect the number of host function inputs.
c.locationStack.init(c.typ)
if c.withListener {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
return err
}
}
// A host function needs access to the caller's module instance, and the caller's information is stored on the stack
// (as described in the doc of callEngine.stack). Here, we get the caller's *function from the stack, load its
// *wasm.ModuleInstance, and save it in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
// without sacrificing performance.
c.compileReservedStackBasePointerInitialization()
// Alias for readability.
tmp := amd64.RegAX
// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
// Load the value into the tmp register: tmp = &function{..}
callerFunction.setRegister(tmp)
c.compileLoadValueOnStackToRegister(callerFunction)
// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, functionModuleInstanceOffset, tmp)
// Store it into callEngine.exitContext.callerModuleInstance.
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmp,
amd64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
c.locationStack.releaseRegister(callerFunction)
if err := c.compileCallGoHostFunction(); err != nil {
return err
}
// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
c.compileReservedStackBasePointerInitialization()
// A Go function can change the module state in arbitrary ways, so we have to force
// the callEngine.moduleContext initialization on the function return. To do so,
// we zero out callEngine.moduleContext.moduleInstance.
c.assembler.CompileConstToMemory(amd64.MOVQ,
0, amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
return c.compileReturnFunction()
}
// compile implements compiler.compile for the amd64 architecture.
func (c *amd64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
// used for all labels (via setLocationStack), excluding the current one.
// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
stackPointerCeil = c.stackPointerCeil
if stackPointerCeil < c.locationStack.stackPointerCeil {
stackPointerCeil = c.locationStack.stackPointerCeil
}
// Now that the max stack pointer is determined, we are invoking the callback.
// Note this MUST be called before Assemble() below.
c.assignStackPointerCeil(stackPointerCeil)
err = c.assembler.Assemble(buf)
return
}
// compileUnreachable implements compiler.compileUnreachable for the amd64 architecture.
func (c *amd64Compiler) compileUnreachable() error {
c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
return nil
}
// assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the amd64 architecture.
func (c *amd64Compiler) assignStackPointerCeil(ceil uint64) {
if c.assignStackPointerCeilNeeded != nil {
c.assignStackPointerCeilNeeded.AssignDestinationConstant(int64(ceil) << 3)
}
}
// compileSet implements compiler.compileSet for the amd64 architecture.
func (c *amd64Compiler) compileSet(o *wazeroir.UnionOperation) error {
depth := int(o.U1)
isTargetVector := o.B3
setTargetIndex := int(c.locationStack.sp) - 1 - depth
if isTargetVector {
_ = c.locationStack.pop() // ignore the higher 64-bits.
}
v := c.locationStack.pop()
if err := c.compileEnsureOnRegister(v); err != nil {
return err
}
targetLocation := &c.locationStack.stack[setTargetIndex]
if targetLocation.onRegister() {
// We no longer need the register previously used by the target location.
c.locationStack.markRegisterUnused(targetLocation.register)
}
reg := v.register
targetLocation.setRegister(reg)
targetLocation.valueType = v.valueType
if isTargetVector {
hi := &c.locationStack.stack[setTargetIndex+1]
hi.setRegister(reg)
}
return nil
}
// compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture.
func (c *amd64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the pointer to the global slice into the allocated register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
index := o.U1
// Now, move the location of the global instance into the register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
// When an integer, reuse the pointer register for the value. Otherwise, allocate a float register for it.
valueReg := intReg
var vt runtimeValueType
var inst asm.Instruction
switch c.ir.Globals[index].ValType {
case wasm.ValueTypeI32:
inst = amd64.MOVL
vt = runtimeValueTypeI32
case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
inst = amd64.MOVQ
vt = runtimeValueTypeI64
case wasm.ValueTypeF32:
inst = amd64.MOVL
vt = runtimeValueTypeF32
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
case wasm.ValueTypeF64:
inst = amd64.MOVQ
vt = runtimeValueTypeF64
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
case wasm.ValueTypeV128:
inst = amd64.MOVDQU
vt = runtimeValueTypeV128Lo
valueReg, err = c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
default:
panic("BUG: unknown runtime value type")
}
// Using the register holding the pointer to the target instance, move its value into a register.
c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg)
// Record that the retrieved global value on the top of the stack is now in a register.
if vt == runtimeValueTypeV128Lo {
c.pushVectorRuntimeValueLocationOnRegister(valueReg)
} else {
c.pushRuntimeValueLocationOnRegister(valueReg, vt)
}
return nil
}
// compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture.
func (c *amd64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
index := o.U1
wasmValueType := c.ir.Globals[index].ValType
isV128 := wasmValueType == wasm.ValueTypeV128
// First, move the value to set into a temporary register.
val := c.locationStack.pop()
if isV128 {
// The previously popped val is the higher 64 bits; we have to use the lower 64 bits' runtimeValueLocation for allocation, etc.
val = c.locationStack.pop()
}
if err := c.compileEnsureOnRegister(val); err != nil {
return err
}
// Allocate a register to hold the memory location of the target global instance.
intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the pointer to the global slice into the allocated register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
// Now, move the location of the global instance into the register.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
// Now ready to write the value to the global instance location.
var inst asm.Instruction
if isV128 {
inst = amd64.MOVDQU
} else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset)
// Since the value is now written to memory, release the value register.
c.locationStack.releaseRegister(val)
return nil
}
// compileBr implements compiler.compileBr for the amd64 architecture.
func (c *amd64Compiler) compileBr(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
return c.branchInto(wazeroir.Label(o.U1))
}
// branchInto adds instruction necessary to jump into the given branch target.
func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
if target.IsReturnTarget() {
return c.compileReturnFunction()
} else {
if c.ir.LabelCallers[target] > 1 {
// We can only re-use the register state when there's a single call-site.
// If there are multiple call-sites, release the existing values on registers to the stack
// so that the value location state is consistent at the beginning of the label.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
}
// Set the initial stack of the target label, so we can start compiling the label
// with the appropriate value locations. Note that we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
targetLabel := c.label(target)
if !targetLabel.stackInitialized {
targetLabel.initialStack.cloneFrom(*c.locationStack)
targetLabel.stackInitialized = true
}
jmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(target, jmp)
}
return nil
}
// compileBrIf implements compiler.compileBrIf for the amd64 architecture.
func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
cond := c.locationStack.pop()
var jmpWithCond asm.Node
if cond.onConditionalRegister() {
var inst asm.Instruction
switch cond.conditionalRegister {
case amd64.ConditionalRegisterStateE:
inst = amd64.JEQ
case amd64.ConditionalRegisterStateNE:
inst = amd64.JNE
case amd64.ConditionalRegisterStateS:
inst = amd64.JMI
case amd64.ConditionalRegisterStateNS:
inst = amd64.JPL
case amd64.ConditionalRegisterStateG:
inst = amd64.JGT
case amd64.ConditionalRegisterStateGE:
inst = amd64.JGE
case amd64.ConditionalRegisterStateL:
inst = amd64.JLT
case amd64.ConditionalRegisterStateLE:
inst = amd64.JLE
case amd64.ConditionalRegisterStateA:
inst = amd64.JHI
case amd64.ConditionalRegisterStateAE:
inst = amd64.JCC
case amd64.ConditionalRegisterStateB:
inst = amd64.JCS
case amd64.ConditionalRegisterStateBE:
inst = amd64.JLS
}
jmpWithCond = c.assembler.CompileJump(inst)
} else {
// Usually the comparison operand for br_if is on the conditional register,
// but in some cases, it is on the stack or in a general-purpose register.
// For example, the following code
// i64.const 1
// local.get 1
// i64.add
// br_if ....
// will try to use the result of i64.add, which resides on the (virtual) stack,
// as the operand for br_if instruction.
if err := c.compileEnsureOnRegister(cond); err != nil {
return err
}
// Check if the value does not equal zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, cond.register, 0)
// Emit the jump instruction which jumps when the value does not equal zero.
jmpWithCond = c.assembler.CompileJump(amd64.JNE)
c.locationStack.markRegisterUnused(cond.register)
}
// Make sure that the next coming label is the else jump target.
thenTarget := wazeroir.Label(o.U1)
elseTarget := wazeroir.Label(o.U2)
thenToDrop := o.U3
// Here's the diagram of how we organize the instructions necessary for the br_if operation.
//
// jmp_with_cond -> jmp (.Else) -> Then operations...
// |---------(satisfied)------------^^^
//
// Note that the .Else branch doesn't have ToDrop as .Else in reality
// corresponds to either If's Else block or Br_if's else block in Wasm.
// Emit the else branch.
if elseTarget.IsReturnTarget() {
if err := c.compileReturnFunction(); err != nil {
return err
}
} else {
labelInfo := c.label(elseTarget)
if !labelInfo.stackInitialized {
labelInfo.initialStack.cloneFrom(*c.locationStack)
labelInfo.stackInitialized = true
}
elseJmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(elseTarget, elseJmp)
}
// Handle then branch.
c.assembler.SetJumpTargetOnNext(jmpWithCond)
if err := compileDropRange(c, thenToDrop); err != nil {
return err
}
if thenTarget.IsReturnTarget() {
return c.compileReturnFunction()
} else {
thenLabel := thenTarget
if c.ir.LabelCallers[thenLabel] > 1 {
// We can only re-use the register state when there's a single call-site.
// If there are multiple call-sites, release the existing values on registers to the stack
// so that the value location state is consistent at the beginning of the label.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
}
// Set the initial stack of the target label, so we can start compiling the label
// with the appropriate value locations. Note that we clone the stack here as we may
// manipulate the stack before the compiler reaches the label.
labelInfo := c.label(thenLabel)
if !labelInfo.stackInitialized {
labelInfo.initialStack.cloneFrom(*c.locationStack)
labelInfo.stackInitialized = true
}
thenJmp := c.assembler.CompileJump(amd64.JMP)
c.assignJumpTarget(thenLabel, thenJmp)
return nil
}
}
// compileBrTable implements compiler.compileBrTable for the amd64 architecture.
func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
index := c.locationStack.pop()
// If the operation only consists of the default target, we branch into it and return early.
if len(o.Us) == 2 {
c.locationStack.releaseRegister(index)
if err := compileDropRange(c, o.Us[1]); err != nil {
return err
}
return c.branchInto(wazeroir.Label(o.Us[0]))
}
// Otherwise, we jump into the selected branch.
if err := c.compileEnsureOnRegister(index); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we move the length of target list into the tmp register.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Us)/2-1), tmp)
// Then, we compare the value with the length of targets.
c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)
// If the value is larger than the length,
// we round the index to the length as the spec states that
// if the index is larger than or equal to the length of the list,
// branch into the default branch.
c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)
// We prepare the static data which holds the offset of
// each target's first instruction (incl. default)
// relative to the beginning of label tables.
//
// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
// we emit the code like this at [Emit the code for each targets and default branch] below.
//
// L0:
// 0x123001: XXXX, ...
// .....
// L1:
// 0x123005: YYY, ...
// .....
// L_DEFAULT:
// 0x123009: ZZZ, ...
//
// then offsetData becomes like [0x0, 0x5, 0x8].
// By using this offset list, we could jump into the label for the index by
// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
// instruction.
//
// Note: We store each offset as a 32-bit unsigned integer, i.e. 4 consecutive bytes. So more precisely,
// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
//
// Note: this is similar to how GCC implements Switch statements in C.
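//
// For illustration, the dispatch emitted below behaves roughly like the following Go sketch,
// where offsetData, base and index are stand-ins for the generated artifacts rather than real identifiers:
//
//	off := offsetData[index] // 32-bit offset (in bytes) of the target label, relative to base.
//	// jump to base + uintptr(off), where base is the address of the first label (0x123001 above).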
offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
// Load the offsetData's address into tmp.
if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
return err
}
// Now we have the address of first byte of offsetData in tmp register.
// So the target offset's first byte is at tmp+index*4 as we store
// each offset as 4 bytes for a 32-bit integer.
// Here, we store the offset into the index.register.
c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)
// Now we read the address of the beginning of the jump table.
// In the above example, this corresponds to reading the address of 0x123001.
c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)
// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
// So we could achieve the br_table jump by adding them and jump into the resulting address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)
c.assembler.CompileJumpToRegister(amd64.JMP, tmp)
// We no longer need the index's register, so mark it unused.
c.locationStack.markRegisterUnused(index.register)
// [Emit the code for each targets and default branch]
labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
// Since we might end up having the different stack state in each branch,
// we need to save the initial stack state here, and use the same initial state
// for each iteration.
initialLocationStack := c.getSavedTemporaryLocationStack()
for i := range labelInitialInstructions {
// Emit the initial instruction of each target.
// We use NOP as we don't yet know the next instruction in each label.
// Assembler would optimize out this NOP during code generation, so this is harmless.
labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)
targetLabel := wazeroir.Label(o.Us[i*2])
targetToDrop := o.Us[i*2+1]
if err = compileDropRange(c, targetToDrop); err != nil {
return err
}
if err = c.branchInto(targetLabel); err != nil {
return err
}
// After the iteration, reset the stack's state with initialLocationStack.
c.locationStack.cloneFrom(initialLocationStack)
}
c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
return nil
}
func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
initialLocationStack := *c.locationStack // Take copy!
// Use c.brTableTmp for the underlying stack so that we could reduce the allocations.
if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
}
copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
initialLocationStack.stack = c.brTableTmp
return initialLocationStack
}
func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
jmpTargetLabel := c.label(label)
targetInst := jmpTargetLabel.initialInstruction
if targetInst == nil {
// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
targetInst = c.assembler.AllocateNOP()
jmpTargetLabel.initialInstruction = targetInst
}
jmpInstruction.AssignJumpTarget(targetInst)
}
// compileLabel implements compiler.compileLabel for the amd64 architecture.
func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool) {
label := wazeroir.Label(o.U1)
labelInfo := c.label(label)
// If initialStack is not set, that means this label has never been reached.
if !labelInfo.stackInitialized {
skipLabel = true
return
}
// We use NOP as the beginning of instructions in a label.
// This should eventually be optimized out by the assembler.
if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
labelInfo.initialInstruction = c.assembler.CompileStandAlone(amd64.NOP)
} else {
c.assembler.Add(labelBegin)
}
// Set the initial stack.
c.setLocationStack(&labelInfo.initialStack)
return
}
// compileCall implements compiler.compileCall for the amd64 architecture.
func (c *amd64Compiler) compileCall(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
functionIndex := o.U1
target := c.ir.Functions[functionIndex]
targetType := &c.ir.Types[target]
targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, move the byte offset of the target function within callEngine.functions (functionIndex * functionSize) into the target register.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(functionIndex)*functionSize, targetAddressRegister)
// Next, we add the address of the first item of callEngine.functions slice (= &callEngine.functions[0])
// to the target register.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine,
callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)
if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
return err
}
return nil
}
// compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
func (c *amd64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) error {
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
typeIndex := o.U1
tableIndex := o.U2
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp2)
// Load the address of the target table: tmp = &module.Tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
// tmp = &module.Tables[0] + tableIndex*8 = &module.Tables[0] + sizeOf(*TableInstance)*tableIndex = module.Tables[tableIndex].
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
// Then, we need to trap if the offset exceeds the length of table.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// Next, we check if the target's type matches the operation's type.
// In order to get the target's address within the table, we have to multiply the offset
// by 8, as the table is a Go "[]uintptr" and the size of uintptr equals 8 bytes (== 2^3).
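//
// In Go terms, the next few instructions behave roughly like this sketch
// (table is a stand-in for the []uintptr backing the TableInstance):
//
//	functionPtr := table[offset] // i.e. dereference &table[0] + offset*8.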
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)
// Adds the address of wasm.Table[0] stored as callEngine.tableElement0Address to the offset.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
tmp, tableInstanceTableOffset, offset.register)
// "offset = (*offset) (== table[offset] == *code type)"
c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)
// At this point offset.register holds the address of *code (as uintptr) at wasm.Table[offset].
//
// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
c.assembler.CompileRegisterToConst(amd64.CMPQ, offset.register, 0)
// Skipped if the target is initialized.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusCodeInvalidTableAccess)
// Next, we need to check the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
//
// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
tmp2)
c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(typeIndex)*4, tmp2)
c.assembler.CompileMemoryToRegister(amd64.CMPL, offset.register, functionTypeIDOffset, tmp2)
// Skipped if the type matches.
c.compileTrapFromNativeCode(amd64.JEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
targetFunctionType := &c.ir.Types[typeIndex]
if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
return err
}
// The offset register should be marked as unused as we consumed it in the function call.
c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
return nil
}
// compileDrop implements compiler.compileDrop for the amd64 architecture.
func (c *amd64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
return compileDropRange(c, o.U1)
}
// compileSelectV128Impl implements compileSelect for vector values.
func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
x2 := c.locationStack.popV128()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.popV128()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Compare the conditional value with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, selectorReg, 0)
// Set the jump if the top value is not zero.
jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
// In this branch, we select the value of x2, so we move the value into x1.register so that
// we can have the result in x1.register regardless of the selection.
c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)
// Else, we don't need to adjust value, just need to jump to the next instruction.
c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
// As noted, the result exists in x1.register regardless of the selector.
c.pushVectorRuntimeValueLocationOnRegister(x1.register)
// Plus, x2.register is no longer used.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(selectorReg)
return nil
}
// compileSelect implements compiler.compileSelect for the amd64 architecture.
//
// The emitted native code depends on whether the values are on
// physical registers, the memory stack, or a conditional register.
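//
// In Go terms, the emitted code behaves roughly like this sketch, regardless of where
// cv, x1 and x2 currently live:
//
//	if cv == 0 {
//		x1 = x2 // selector is zero: take x2.
//	}
//	// x1 is left as the result; x2 and cv are discarded.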
func (c *amd64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
cv := c.locationStack.pop()
if err := c.compileEnsureOnRegister(cv); err != nil {
return err
}
isTargetVector := o.B3
if isTargetVector {
return c.compileSelectV128Impl(cv.register)
}
x2 := c.locationStack.pop()
// We do not consume x1 here, but modify the value according to
// the conditional value "cv" above.
peekedX1 := c.locationStack.peek()
// Compare the conditional value with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, cv.register, 0)
// Now we can use cv.register as a temporary location.
// We alias it here for readability.
tmpRegister := cv.register
// Set the jump if the top value is not zero.
jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we must place the value of x2 onto the stack position of x1.
// First we copy the value of x2 to the temporary register if x2 is not currently on a register.
if x2.onStack() {
x2.register = tmpRegister
c.compileLoadValueOnStackToRegister(x2)
}
//
// At this point x2's value is always on a register.
//
// Then release the value in the x2's register to the x1's stack position.
if peekedX1.onRegister() {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register)
} else {
peekedX1.register = x2.register
c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused!
}
// Else, we don't need to adjust value, just need to jump to the next instruction.
c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
// In any case, we don't need x2 and c anymore!
c.locationStack.releaseRegister(x2)
c.locationStack.releaseRegister(cv)
return nil
}
// compilePick implements compiler.compilePick for the amd64 architecture.
func (c *amd64Compiler) compilePick(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
depth := o.U1
isTargetVector := o.B3
// TODO: if we track the type of values on the stack,
// we could optimize the instruction according to the bit size of the value.
// For now, we just move the entire register i.e. as a quad word (8 bytes).
pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
reg, err := c.allocateRegister(pickTarget.getRegisterType())
if err != nil {
return err
}
if pickTarget.onRegister() {
var inst asm.Instruction
if isTargetVector {
inst = amd64.MOVDQU
} else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers.
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg)
} else if pickTarget.onStack() {
// Copy the value from the stack.
var inst asm.Instruction
if isTargetVector {
inst = amd64.MOVDQU
} else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress,
int64(pickTarget.stackPointer)*8, reg)
}
// Now we already placed the picked value on the register,
// so push the location onto the stack.
if isTargetVector {
c.pushVectorRuntimeValueLocationOnRegister(reg)
} else {
c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType)
}
return nil
}
// compileAdd implements compiler.compileAdd for the amd64 architecture.
func (c *amd64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
// TODO: if the previous instruction is const, then
// this can be optimized. Same goes for other arithmetic instructions.
var instruction asm.Instruction
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
instruction = amd64.ADDL
case wazeroir.UnsignedTypeI64:
instruction = amd64.ADDQ
case wazeroir.UnsignedTypeF32:
instruction = amd64.ADDSS
case wazeroir.UnsignedTypeF64:
instruction = amd64.ADDSD
}
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek, not pop!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 += x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need x2 register after ADD operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileSub implements compiler.compileSub for the amd64 architecture.
func (c *amd64Compiler) compileSub(o *wazeroir.UnionOperation) error {
// TODO: if the previous instruction is const, then
// this can be optimized. Same goes for other arithmetic instructions.
var instruction asm.Instruction
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
instruction = amd64.SUBL
case wazeroir.UnsignedTypeI64:
instruction = amd64.SUBQ
case wazeroir.UnsignedTypeF32:
instruction = amd64.SUBSS
case wazeroir.UnsignedTypeF64:
instruction = amd64.SUBSD
}
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek, not pop!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 -= x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need the x2 register after the SUB operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileMul implements compiler.compileMul for the amd64 architecture.
func (c *amd64Compiler) compileMul(o *wazeroir.UnionOperation) (err error) {
unsignedType := wazeroir.UnsignedType(o.B1)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
err = c.compileMulForInts(true, amd64.MULL)
case wazeroir.UnsignedTypeI64:
err = c.compileMulForInts(false, amd64.MULQ)
case wazeroir.UnsignedTypeF32:
err = c.compileMulForFloats(amd64.MULSS)
case wazeroir.UnsignedTypeF64:
err = c.compileMulForFloats(amd64.MULSD)
}
return
}
// compileMulForInts emits instructions to perform integer multiplication for
// top two values on the stack. If unfamiliar with the convention for integer
// multiplication on x86, see https://www.felixcloutier.com/x86/mul.
//
// In summary, one of the values must be on the AX register,
// and the mul instruction stores the overflow info in the DX register, which we don't use.
// By "the overflow info", we mean the upper half of the result (bits 65 and above in the 64-bit case).
//
// So, we have to ensure that
// 1. The value previously located on DX must be saved to the memory stack, because
// the existing value will be overwritten by the mul execution.
// 2. One of the operands (x1 or x2) must be on AX register.
//
// See https://www.felixcloutier.com/x86/mul#description for detail semantics.
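//
// In Go terms, the emitted mul behaves roughly like this sketch for the 64-bit case
// (bits.Mul64 is from math/bits):
//
//	hi, lo := bits.Mul64(uint64(x1), uint64(x2)) // DX = hi (discarded), AX = lo
//	result = lo // the Wasm i64.mul result is the low half of the product.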
func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
const (
resultRegister = amd64.RegAX
reservedRegister = amd64.RegDX
)
x2 := c.locationStack.pop()
x1 := c.locationStack.pop()
var valueOnAX *runtimeValueLocation
if x1.register == resultRegister {
valueOnAX = x1
} else if x2.register == resultRegister {
valueOnAX = x2
} else {
valueOnAX = x2
// In this case, we move x2 to the AX register.
c.onValueReleaseRegisterToStack(resultRegister)
if x2.onConditionalRegister() {
c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister)
} else if x2.onStack() {
x2.setRegister(resultRegister)
c.compileLoadValueOnStackToRegister(x2)
c.locationStack.markRegisterUsed(resultRegister)
} else {
var inst asm.Instruction
if is32Bit {
inst = amd64.MOVL
} else {
inst = amd64.MOVQ
}
c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister)
// We no longer use the previous register of x2.
c.locationStack.releaseRegister(x2)
x2.setRegister(resultRegister)
c.locationStack.markRegisterUsed(resultRegister)
}
}
// We have to make sure that at this point both operands are on registers.
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// We have to save the existing value on DX.
// If the DX register is used by either x1 or x2, we don't need to
// save the value because it is consumed by mul anyway.
if x1.register != reservedRegister && x2.register != reservedRegister {
c.onValueReleaseRegisterToStack(reservedRegister)
}
// Now ready to emit the mul instruction.
if x1 == valueOnAX {
c.assembler.CompileRegisterToNone(mulInstruction, x2.register)
} else {
c.assembler.CompileRegisterToNone(mulInstruction, x1.register)
}
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
// Now we have the result in the AX register,
// so we record it.
c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType)
return nil
}
func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// x1 *= x2.
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We no longer need x2 register after MUL operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
return nil
}
// compileClz implements compiler.compileClz for the amd64 architecture.
func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register)
}
} else {
// On processors that do not support LZCNT, we combine BSR (calculating
// most significant set bit) with XOR. This logic is described in
// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
// First, we have to check if the target is non-zero as BSR is undefined
// on zero. See https://www.felixcloutier.com/x86/bsr.
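//
// In Go terms, the fallback below behaves roughly like this sketch for the 32-bit case
// (bits.Len32 is from math/bits):
//
//	if x == 0 {
//		result = 32
//	} else {
//		result = 31 ^ (bits.Len32(x) - 1) // bits.Len32(x)-1 is what BSR computes: the MSB index.
//	}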
c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we just push the const value.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
} else {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
}
// Emit the jmp instruction to jump to the position right after
// the non-zero case.
jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
// Start emitting non-zero case.
c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
// First, we calculate the most significant set bit.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register)
}
// Now we XOR the value with the bit length minus one.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register)
} else {
c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register)
}
// Finally, the end jump instruction of the zero case must target
// the next instruction.
c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compileCtz implements compiler.compileCtz for the amd64 architecture.
func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
}
} else {
// On processors that do not support TZCNT, the BSF instruction is
// executed instead. The key difference between TZCNT and BSF
// instruction is that if source operand is zero, the content of
// destination operand is undefined.
// https://www.felixcloutier.com/x86/tzcnt.html
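//
// In Go terms, the fallback below behaves roughly like this sketch for the 32-bit case
// (bits.TrailingZeros32 is from math/bits):
//
//	if x == 0 {
//		result = 32
//	} else {
//		result = bits.TrailingZeros32(x) // what TZCNT/BSF computes for a non-zero input.
//	}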
// First we compare the target with zero.
c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
// If the value is zero, we just push the const value.
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
} else {
c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
}
// Emit the jmp instruction to jump to the position right after
// the non-zero case.
jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
// Otherwise, emit the TZCNT.
c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
}
// Finally, the end jump instruction of the zero case must target
// the next instruction.
c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compilePopcnt implements compiler.compilePopcnt for the amd64 architecture.
func (c *amd64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
target := c.locationStack.pop()
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register)
}
// We reused the same register of target for the result.
c.locationStack.markRegisterUnused(target.register)
c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
return nil
}
// compileDiv implements compiler.compileDiv for the amd64 architecture.
func (c *amd64Compiler) compileDiv(o *wazeroir.UnionOperation) (err error) {
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeUint32:
err = c.compileDivForInts(true, false)
case wazeroir.SignedTypeUint64:
err = c.compileDivForInts(false, false)
case wazeroir.SignedTypeInt32:
err = c.compileDivForInts(true, true)
case wazeroir.SignedTypeInt64:
err = c.compileDivForInts(false, true)
case wazeroir.SignedTypeFloat32:
err = c.compileDivForFloats(true)
case wazeroir.SignedTypeFloat64:
err = c.compileDivForFloats(false)
}
return
}
// compileDivForInts emits the instructions to perform division on the top
// two values of integer type on the stack and puts the quotient of the result
// onto the stack. For example, stack [..., 10, 3] results in [..., 3] where
// the remainder is discarded.
func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error {
if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil {
return err
}
// Now we have the quotient of the division result in the AX register,
// so we record it.
if is32Bit {
c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32)
} else {
c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64)
}
return nil
}
// compileRem implements compiler.compileRem for the amd64 architecture.
func (c *amd64Compiler) compileRem(o *wazeroir.UnionOperation) (err error) {
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
err = c.performDivisionOnInts(true, true, true)
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
err = c.performDivisionOnInts(true, false, true)
vt = runtimeValueTypeI64
case wazeroir.SignedUint32:
err = c.performDivisionOnInts(true, true, false)
vt = runtimeValueTypeI32
case wazeroir.SignedUint64:
err = c.performDivisionOnInts(true, false, false)
vt = runtimeValueTypeI64
}
if err != nil {
return err
}
// Now we have the remainder of the division result in the DX register,
// so we record it.
c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt)
return
}
// performDivisionOnInts emits the instructions to do divisions on top two integers on the stack
// via DIV (unsigned div) and IDIV (signed div) instructions.
// See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
//
// >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and
// >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of
// >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the
// >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). For
// >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of
// >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b
// >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip).
//
// tl;dr: after the instructions emitted by this function, the division result is placed in the AX and DX registers,
// where AX holds the quotient and DX the remainder.
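//
// In Go terms, the emitted sequence behaves roughly like this sketch for the signed 64-bit case:
//
//	quotient := x1 / x2  // ends up in AX
//	remainder := x1 % x2 // ends up in DX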
func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error {
const (
quotientRegister = amd64.RegAX
remainderRegister = amd64.RegDX
)
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
// Ensures that previous values on these registers are saved to memory.
c.onValueReleaseRegisterToStack(quotientRegister)
c.onValueReleaseRegisterToStack(remainderRegister)
// In order to ensure that x2 is placed on a temporary register other than AX and DX,
// we mark these registers as used here.
c.locationStack.markRegisterUsed(quotientRegister)
c.locationStack.markRegisterUsed(remainderRegister)
// Ensure that x2 is placed on a register which is not either AX or DX.
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
// Now that x2 is successfully placed on a temporary register, we no longer need
// to keep AX and DX marked as used.
c.locationStack.markRegisterUnused(quotientRegister)
c.locationStack.markRegisterUnused(remainderRegister)
// Check if the x2 equals zero.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, 0)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, 0)
}
// Skipped if the divisor is nonzero.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusIntegerDivisionByZero)
// Next, we ensure that x1 is placed on AX.
x1 := c.locationStack.pop()
if x1.onRegister() && x1.register != quotientRegister {
// Move x1 to quotientRegister.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister)
}
c.locationStack.markRegisterUnused(x1.register)
x1.setRegister(quotientRegister)
} else if x1.onStack() {
x1.setRegister(quotientRegister)
c.compileLoadValueOnStackToRegister(x1)
}
// Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX.
isSignedRem := isRem && signed
isSignedDiv := !isRem && signed
var signedRemMinusOneDivisorJmp asm.Node
if isSignedRem {
// If this is for getting the remainder of a signed division,
// we have to handle the special case where the divisor equals -1.
// For example, in the 32-bit case, (-2^31) / -1 would produce quotient=2^31 and remainder=0,
// and the quotient doesn't fit in the signed 32-bit range whose maximum is 2^31-1.
// x86 raises a floating point exception in this case, but according to the Wasm spec
// the remainder must be zero (not undefined and not a trap), unlike signed division where
// (-2^31) / -1 must trap (handled in the isSignedDiv branch below), so these special
// branches are only needed for the remainder case.
// For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception
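// For example, per the Wasm spec: (i32.rem_s (i32.const -2147483648) (i32.const -1)) must evaluate to 0,
// whereas (i32.div_s (i32.const -2147483648) (i32.const -1)) must trap with integer overflow.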
// First we compare the divisor with -1.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
}
// If it doesn't equal minus one, we jump to the normal case.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Otherwise, we store zero into the remainder result register (DX).
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister)
}
// Emit the exit jump instruction for the divisor -1 case so
// we skip the normal case.
signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JMP)
// Set the normal case's jump target.
c.assembler.SetJumpTargetOnNext(okJmp)
} else if isSignedDiv {
// For signed division, we need branches for the "math.MinInt{32,64} / -1"
// case, which raises a floating point exception (division error) because
// the resulting value exceeds the maximum of the signed integer type.
// First we compare the divisor with -1.
if is32Bit {
c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
} else {
c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
}
// If it doesn't equal minus one, we jump to the normal case.
nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE)
// Next we check if the dividend (x1) is the most negative value of the signed integer type,
// i.e. whether we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
if is32Bit {
if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register, c.minimum32BitSignedInt); err != nil {
return err
}
} else {
if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register, c.minimum64BitSignedInt); err != nil {
return err
}
}
// Trap if we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1),
// as the division overflows: the result would be 2^31 (or 2^63), which is larger than
// the maximum of the signed 32-bit (or 64-bit) integer.
c.compileTrapFromNativeCode(amd64.JNE, nativeCallStatusIntegerOverflow)
// Set the normal case's jump target.
c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp)
}
// Now ready to emit the div instruction.
// Since the div instructions take a 2n-byte dividend placed in the DX:AX register pair...
// * signed case - we need to sign-extend the dividend into the DX register via CDQ (32-bit) or CQO (64-bit).
// * unsigned case - we need to zero the DX register via "XOR DX DX"
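// For example, for the signed 32-bit case, CDQ copies bit 31 of EAX into every bit of EDX, so
// DX:AX holds the sign-extended 64-bit dividend; CQO does the same for RAX/RDX. For the unsigned
// cases, DX simply has to be zero.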
if is32Bit && signed {
// Emit sign-extension to have 64 bit dividend over DX and AX registers.
c.assembler.CompileStandAlone(amd64.CDQ)
c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register)
} else if is32Bit && !signed {
// Zeros DX register to have 64 bit dividend over DX and AX registers.
c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register)
} else if !is32Bit && signed {
// Emits sign-extension to have 128 bit dividend over DX and AX registers.
c.assembler.CompileStandAlone(amd64.CQO)
c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register)
} else if !is32Bit && !signed {
// Zeros DX register to have 128 bit dividend over DX and AX registers.
c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register)
}
// If this is a signed rem instruction, we must point the exit jump
// of the divisor -1 case at the next instruction.
if signedRemMinusOneDivisorJmp != nil {
c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp)
}
// We mark them as unused so that we can push one of them onto the location stack at call sites.
c.locationStack.markRegisterUnused(remainderRegister)
c.locationStack.markRegisterUnused(quotientRegister)
c.locationStack.markRegisterUnused(x2.register)
return nil
}
// compileDivForFloats emits the instructions to perform division
// on the top two values of float type on the stack, placing the result back onto the stack.
// For example, stack [..., 1.0, 4.0] results in [..., 0.25].
func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error {
if is32Bit {
return c.compileSimpleBinaryOp(amd64.DIVSS)
} else {
return c.compileSimpleBinaryOp(amd64.DIVSD)
}
}
// compileAnd implements compiler.compileAnd for the amd64 architecture.
func (c *amd64Compiler) compileAnd(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.ANDL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.ANDQ)
}
return
}
// compileOr implements compiler.compileOr for the amd64 architecture.
func (c *amd64Compiler) compileOr(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.ORL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.ORQ)
}
return
}
// compileXor implements compiler.compileXor for the amd64 architecture.
func (c *amd64Compiler) compileXor(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileSimpleBinaryOp(amd64.XORL)
case wazeroir.UnsignedInt64:
err = c.compileSimpleBinaryOp(amd64.XORQ)
}
return
}
// compileSimpleBinaryOp emits instructions to pop two values from the stack,
// perform the given instruction on these two values, and push the result
// onto the stack.
func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
// We consumed x2 register after the operation here,
// so we release it.
c.locationStack.releaseRegister(x2)
// We already stored the result in the register used by x1
// so we record it.
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileShl implements compiler.compileShl for the amd64 architecture.
func (c *amd64Compiler) compileShl(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.SHLL, false)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.SHLQ, true)
}
return
}
// compileShr implements compiler.compileShr for the amd64 architecture.
func (c *amd64Compiler) compileShr(o *wazeroir.UnionOperation) (err error) {
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
err = c.compileShiftOp(amd64.SARL, true)
case wazeroir.SignedInt64:
err = c.compileShiftOp(amd64.SARQ, false)
case wazeroir.SignedUint32:
err = c.compileShiftOp(amd64.SHRL, true)
case wazeroir.SignedUint64:
err = c.compileShiftOp(amd64.SHRQ, false)
}
return
}
// compileRotl implements compiler.compileRotl for the amd64 architecture.
func (c *amd64Compiler) compileRotl(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.ROLL, true)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.ROLQ, false)
}
return
}
// compileRotr implements compiler.compileRotr for the amd64 architecture.
func (c *amd64Compiler) compileRotr(o *wazeroir.UnionOperation) (err error) {
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.compileShiftOp(amd64.RORL, true)
case wazeroir.UnsignedInt64:
err = c.compileShiftOp(amd64.RORQ, false)
}
return
}
// compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL)
// where we have to place the second value (the shift count) in the CX register.
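//
// This mirrors the hardware constraint that variable-count shifts and rotates on x86 take their count
// only from CL, so a 32-bit shl is emitted roughly as (hedged sketch, not the literal output):
//
//	MOVL <count>, CX
//	SHLL CX, <target>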
func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
x2 := c.locationStack.pop()
// Ensures that x2 (holding shift counts) is placed on the CX register.
const shiftCountRegister = amd64.RegCX
if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() {
// If another value lives on the CX register, we release it to the stack.
c.onValueReleaseRegisterToStack(shiftCountRegister)
if x2.onRegister() {
x2r := x2.register
// If x2 lives on a register, we move the value to CX.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.MOVL, x2r, shiftCountRegister)
} else {
c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2r, shiftCountRegister)
}
// We no longer place any value on the original register, so we record it.
c.locationStack.markRegisterUnused(x2r)
} else {
// If it is on stack, we just move the memory allocated value to the CX register.
x2.setRegister(shiftCountRegister)
c.compileLoadValueOnStackToRegister(x2)
}
c.locationStack.markRegisterUsed(shiftCountRegister)
}
x1 := c.locationStack.peek() // Note this is peek!
x1r := x1.register
if x1.onRegister() {
c.assembler.CompileRegisterToRegister(instruction, shiftCountRegister, x1r)
} else {
// Shift target can be placed on a memory location.
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
c.assembler.CompileRegisterToMemory(instruction, shiftCountRegister, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8)
}
// We consumed x2 register after the operation here,
// so we release it.
c.locationStack.markRegisterUnused(shiftCountRegister)
return nil
}
// compileAbs implements compiler.compileAbs for the amd64 architecture.
//
// See the following discussions for how we could take the abs of floats on x86 assembly.
// https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471
// https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation
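//
// A minimal Go sketch of the same bit trick (hypothetical helper, float32 case only): abs(x) can be computed as
// math.Float32frombits(math.Float32bits(x) &^ (1 << 31)); the PSLLD/PSRLD pair below clears the same sign bit
// using shifts instead of loading a mask constant.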
func (c *amd64Compiler) compileAbs(o *wazeroir.UnionOperation) (err error) {
target := c.locationStack.peek() // Note this is peek!
if err = c.compileEnsureOnRegister(target); err != nil {
return err
}
// First shift left by one to drop the sign bit, and then logically shift right by one so the top bit becomes zero.
if wazeroir.Float(o.B1) == wazeroir.Float32 {
c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register)
c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register)
} else {
c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register)
c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register)
}
return nil
}
// compileNeg implements compiler.compileNeg for the amd64 architecture.
func (c *amd64Compiler) compileNeg(o *wazeroir.UnionOperation) (err error) {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
tmpReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
// First we move the sign-bit mask (placed in memory) to the tmp register,
// since we cannot XOR a float register with a constant directly.
// Then we negate the value by XORing it with the sign-bit mask.
if wazeroir.Float(o.B1) == wazeroir.Float32 {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register)
}
return nil
}
// compileCeil implements compiler.compileCeil for the amd64 architecture.
func (c *amd64Compiler) compileCeil(o *wazeroir.UnionOperation) (err error) {
// Internally, ceil can be performed via ROUND instruction with 0x02 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x02)
}
// compileFloor implements compiler.compileFloor for the amd64 architecture.
func (c *amd64Compiler) compileFloor(o *wazeroir.UnionOperation) (err error) {
// Internally, floor can be performed via ROUND instruction with 0x01 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x01)
}
// compileTrunc implements compiler.compileTrunc for the amd64 architecture.
func (c *amd64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
// Internally, trunc can be performed via ROUND instruction with 0x03 mode.
// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x03)
}
// compileNearest implements compiler.compileNearest for the amd64 architecture.
func (c *amd64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
// Nearest can be performed via ROUND instruction with 0x00 mode.
return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x00)
}
func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
if is32Bit {
c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode))
} else {
c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode))
}
return nil
}
// compileMin implements compiler.compileMin for the amd64 architecture.
func (c *amd64Compiler) compileMin(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
if is32Bit {
return c.compileMinOrMax(is32Bit, true, amd64.MINSS)
} else {
return c.compileMinOrMax(is32Bit, true, amd64.MINSD)
}
}
// compileMax implements compiler.compileMax for the amd64 architecture.
func (c *amd64Compiler) compileMax(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
if is32Bit {
return c.compileMinOrMax(is32Bit, false, amd64.MAXSS)
} else {
return c.compileMinOrMax(is32Bit, false, amd64.MAXSD)
}
}
// compileMinOrMax adds instructions to pop two values from the stack, and push back either the minimum or
// the maximum of these two values onto the stack according to the minOrMaxInstruction argument.
// minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD.
// Note: These native min/max instructions are almost compatible with min/max in the Wasm specification,
// but they differ slightly with respect to NaN handling.
// When either operand is NaN, the native min/max instructions simply return the second (source) operand,
// so the result depends on operand order, whereas WebAssembly specifies that min/max must return NaN
// whenever either operand is NaN.
// Therefore, in this function, we have to add conditional jumps to check if one of the values is NaN before
// the native min/max, which is why we cannot simply emit a native min/max instruction here.
//
// For the semantics, see wazeroir.Min and wazeroir.Max for detail.
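//
// For example, per the Wasm spec: f32.min(5.0, NaN) must be NaN, f32.min(+0.0, -0.0) must be -0.0, and
// f32.max(+0.0, -0.0) must be +0.0; the branches below exist to reproduce exactly these cases on top of
// the native instructions.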
func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Check whether either x1 or x2 is NaN, or whether x1 equals x2.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
}
// At this point, we have the three cases of conditional flags below
// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
//
// 1) The two values are NaN-free and different: all flags are cleared.
// 2) The two values are NaN-free and equal: only the ZF flag is set.
// 3) One of the two values is NaN: the ZF, PF and CF flags are set.
// Jump to handle the 1) case by checking the ZF flag,
// as ZF is only set for the 2) and 3) cases.
nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE)
// Start handling 2) and 3).
// Jump if one of two values is NaN by checking the parity flag (PF).
includeNaNJmp := c.assembler.CompileJump(amd64.JPS)
// Start handling 2).
// Before we exit this case, we have to ensure that positive zero (or negative zero for the min instruction) is
// returned when the two values are positive and negative zeros.
var inst asm.Instruction
switch {
case is32Bit && isMin:
inst = amd64.ORPS
case !is32Bit && isMin:
inst = amd64.ORPD
case is32Bit && !isMin:
inst = amd64.ANDPS
case !is32Bit && !isMin:
inst = amd64.ANDPD
}
c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
sameExitJmp := c.assembler.CompileJump(amd64.JMP)
// start handling 3).
c.assembler.SetJumpTargetOnNext(includeNaNJmp)
// We emit the ADD instruction to produce the NaN in x1.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register)
}
// Exit from the NaN case branch.
nanExitJmp := c.assembler.CompileJump(amd64.JMP)
// Start handling 1).
c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump)
// Now handle the NaN-free and different values case.
c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register)
// Set the exit jump targets of the 2) and 3) cases to the next instruction after the 1) case.
c.assembler.SetJumpTargetOnNext(nanExitJmp)
c.assembler.SetJumpTargetOnNext(sameExitJmp)
// Record that we consumed the x2 and placed the minOrMax result in the x1's register.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileCopysign implements compiler.compileCopysign for the amd64 architecture.
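//
// A hedged Go sketch of the underlying bit formula implemented below (float64 case):
//
//	sign := uint64(1) << 63
//	result := math.Float64frombits((math.Float64bits(x1) &^ sign) | (math.Float64bits(x2) & sign))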
func (c *amd64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
tmpReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
// Move the rest bit mask to the temp register.
if is32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32RestBitMask, tmpReg)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64RestBitMask, tmpReg)
}
if err != nil {
return err
}
// Clear the sign bit of x1 via AND with the mask.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register)
}
// Move the sign bit mask to the temp register.
if is32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
}
if err != nil {
return err
}
// Clear the non-sign bits of x2 via AND with the mask.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register)
}
// Finally, copy the sign bit of x2 to x1.
if is32Bit {
c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register)
}
// Record that we consumed the x2 and placed the copysign result in the x1's register.
c.locationStack.markRegisterUnused(x2.register)
c.locationStack.markRegisterUnused(x1.register)
c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
return nil
}
// compileSqrt implements compiler.compileSqrt for the amd64 architecture.
func (c *amd64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
if wazeroir.Float(o.B1) == wazeroir.Float32 {
c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register)
}
return nil
}
// compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture.
func (c *amd64Compiler) compileI32WrapFromI64() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register)
target.valueType = runtimeValueTypeI32
return nil
}
// compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture.
//
// Note: in the following implementation, we use CVTTSS2SI and CVTTSD2SI (truncating conversions) to convert floats to signed integers.
// According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds the representable range
// of the target signed integer, then the instruction returns the "integer indefinite" value, whose bit pattern equals
// float32SignBitMask (or float64SignBitMask for the 64-bit case).
// [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual"
//
// https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html
//
// [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html
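//
// For example, per the Wasm spec: i32.trunc_f32_s(3.9) == 3 and i32.trunc_f32_s(NaN) traps, while the
// saturating (non-trapping) variant i32.trunc_sat_f32_s yields 0 for NaN, math.MaxInt32 for +Inf and
// math.MinInt32 for -Inf; the nonTrapping flag below selects between these two behaviors.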
func (c *amd64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) (err error) {
inputType := wazeroir.Float(o.B1)
outputType := wazeroir.SignedInt(o.B2)
nonTrapping := o.B3
if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt32 {
err = c.emitSignedI32TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt64 {
err = c.emitSignedI64TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt32 {
err = c.emitSignedI32TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt64 {
err = c.emitSignedI64TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint32 {
err = c.emitUnsignedI32TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint64 {
err = c.emitUnsignedI64TruncFromFloat(true, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint32 {
err = c.emitUnsignedI32TruncFromFloat(false, nonTrapping)
} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint64 {
err = c.emitUnsignedI64TruncFromFloat(false, nonTrapping)
}
return
}
// emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer.
func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we check whether the source float value is greater than or equal to math.MaxInt32+1.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nonTrappingNaNJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Jump if the source float value is greater than or equal to math.MaxInt32+1.
jmpAboveOrEqualMaxInt32PlusOne := c.assembler.CompileJump(amd64.JCC)
// Next we convert the value to a signed integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// Then, if the result is negative, the conversion was invalid, i.e. from a negative float (incl. -Inf).
c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
var nonTrappingMinusJump asm.Node
if nonTrapping {
jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
// In the non-trapping case, the negative value is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, the value is valid.
okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
// Now, start handling the case where the original float value is above or equal math.MaxInt32+1.
//
// First, we subtract the math.MaxInt32+1 from the original value so it can fit in signed 32-bit integer.
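// Hedged arithmetic illustration (float32 source): for source = 3000000000.0, we compute
// int32(3000000000.0 - 2147483648.0) = 852516352, then the ADDL below adds 2147483648 back,
// yielding 3000000000 (0xB2D05E00) as the final unsigned 32-bit result.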
c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt32PlusOne)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Then, convert the subtracted value as a signed 32-bit integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// Next, we have to check if the value came from NaN or +Inf.
// NaN or +Inf cases result in 0x8000_0000 according to the semantics of the conversion,
// so we check whether the resulting integer value is negative or not.
c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
// If the result is minus, the conversion is invalid (from NaN or +Inf)
var nonTrappingAboveOrEqualMaxInt32PlusOne asm.Node
if nonTrapping {
jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitUnsignedInt, result)
if err != nil {
return err
}
nonTrappingAboveOrEqualMaxInt32PlusOne = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, we successfully converted (the source float minus math.MaxInt32+1) to an integer.
// So, we recover the final unsigned result by adding math.MaxInt32+1 back (whose bit pattern equals float32SignBitMask).
if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL, c.float32SignBitMask, result); err != nil {
return err
}
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne)
if nonTrapping {
c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt32PlusOne)
c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// emitUnsignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer.
func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First, we check whether the source float value is greater than or equal to math.MaxInt64+1.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nonTrappingNaNJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Jump if the source float value is greater than or equal to math.MaxInt64+1.
jmpAboveOrEqualMaxInt64PlusOne := c.assembler.CompileJump(amd64.JCC)
// Next we convert the value to a signed integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// Then, if the result is negative, the conversion was invalid, i.e. from a negative float (incl. -Inf).
c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
var nonTrappingMinusJump asm.Node
if nonTrapping {
jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
// In the non-trapping case, the negative value is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, the value is valid.
okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
// Now, start handling the case where the original float value is greater than or equal to math.MaxInt64+1.
//
// First, we subtract math.MaxInt64+1 from the original value so it can fit in a signed 64-bit integer.
c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt64PlusOne)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
}
if err != nil {
return err
}
// Then, convert the subtracted value as a signed 64-bit integer.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// Next, we have to check if the value came from NaN or +Inf.
// NaN or +Inf cases result in 0x8000_0000_0000_0000 according to the semantics of the conversion,
// so we check whether the resulting integer value is negative or not.
c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
// If the result is minus, the conversion is invalid (from NaN or +Inf)
var nonTrappingAboveOrEqualMaxInt64PlusOne asm.Node
if nonTrapping {
jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitUnsignedInt, result)
if err != nil {
return err
}
nonTrappingAboveOrEqualMaxInt64PlusOne = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
} else {
c.compileTrapFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
}
// Otherwise, we successfully converted (the source float minus math.MaxInt64+1) to an integer.
// So, we recover the final unsigned result by adding math.MaxInt64+1 back (whose bit pattern equals float64SignBitMask).
if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ, c.float64SignBitMask, result); err != nil {
return err
}
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne)
if nonTrapping {
c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt64PlusOne)
c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
return nil
}
// emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer.
func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
}
// We compare the conversion result with the sign bit mask to check whether either
// 1) the source float value is +-Inf or NaN, or it exceeds the representable range of 32-bit signed integers, or
// 2) the source equals the minimum signed 32-bit integer (=-2147483648.000000), whose bit pattern is held in float32ForMinimumSigned32bitInteger for 32-bit floats
// or float64ForMinimumSigned32bitInteger for 64-bit floats.
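// Note (hedged): CVTT* produces the same 0x8000_0000 "integer indefinite" bit pattern both for the
// legitimate input -2147483648.0 and for out-of-range/NaN inputs, so hitting it alone is not enough to
// decide validity; the checks below disambiguate these cases.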
err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.float32SignBitMask, result)
if err != nil {
return err
}
// Otherwise, jump to exit as the result is valid.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Start handling the case of 1) and 2).
// First, check if the value is NaN.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nontrappingNanJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
// If the value is NaN, we return the function with nativeCallStatusCodeInvalidFloatToIntConversion.
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Compare the value against the minimum 32-bit signed integer value (as a float constant)
// to check whether it falls below the lower bound of the 32-bit signed integer range.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned32bitInteger, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned32bitInteger, source.register)
}
if err != nil {
return err
}
if !nonTrapping {
// Trap if the value falls below the lower bound of the 32-bit signed integer range.
if isFloat32Bit {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
} else {
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusIntegerOverflow)
}
// At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
// So, check if the value equals the minimum signed 32-bit int.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
// Trap if the value is not negative; a negative value here can only be the minimum signed 32-bit int, which is valid.
c.compileTrapFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmp)
} else {
// Jump if the value does not fall below the lower bound.
var jmpIfNotExceedsLowerBound asm.Node
if isFloat32Bit {
jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC)
} else {
jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI)
}
// If the value exceeds the lower bound, we "saturate" it to the minimum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.minimum32BitSignedInt, result); err != nil {
return err
}
nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
// Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
// If the value exceeds signed 32-bit maximum, we saturate it to the maximum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitSignedInt, result); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(okJmp)
c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer.
func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
source := c.locationStack.pop()
if err := c.compileEnsureOnRegister(source); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
}
// We compare the conversion result with the sign bit mask to check whether either
// 1) the source float value is +-Inf or NaN, or it exceeds the representable range of 64-bit signed integers, or
// 2) the source equals the minimum signed 64-bit integer (=-9223372036854775808.0), whose bit pattern is held in float32ForMinimumSigned64bitInteger for 32-bit floats
// or float64ForMinimumSigned64bitInteger for 64-bit floats.
err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.float64SignBitMask, result)
if err != nil {
return err
}
// Otherwise, we simply jump to exit as the result is valid.
okJmp := c.assembler.CompileJump(amd64.JNE)
// Start handling the case of 1) and 2).
// First, check if the value is NaN.
if isFloat32Bit {
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
}
// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
var nontrappingNanJump asm.Node
if nonTrapping {
jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
// In the non-trapping case, NaN is converted to zero.
// Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
} else {
c.compileTrapFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
}
// Compare the value against the minimum 64-bit signed integer value (as a float constant)
// to check whether it falls below the lower bound of the 64-bit signed integer range.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned64bitInteger, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned64bitInteger, source.register)
}
if err != nil {
return err
}
if !nonTrapping {
// Trap if the value falls below the lower bound (e.g. -Inf or any float smaller than math.MinInt64).
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
// At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
// So, check if the value equals the minimum signed 64-bit int.
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
// Trap if the value is not negative; a negative value here can only be the minimum signed 64-bit int, which is valid.
c.compileTrapFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
// We jump to the next instructions for valid cases.
c.assembler.SetJumpTargetOnNext(okJmp)
} else {
// Jump if the value does not fall below the lower bound.
jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC)
// If the value exceeds the lower bound, we "saturate" it to the minimum.
err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.minimum64BitSignedInt, result)
if err != nil {
return err
}
nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
// Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
// So, check if the value equals the minimum signed 64-bit int.
c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
if isFloat32Bit {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
} else {
err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
}
if err != nil {
return err
}
jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 64-bit int).
// If the value exceeds signed 64-bit maximum, we saturate it to the maximum.
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitSignedInt, result); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(okJmp)
c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
}
// We consumed the source's register and placed the conversion result
// in the result register.
c.locationStack.markRegisterUnused(source.register)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
return nil
}
// compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture.
func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) (err error) {
inputType := wazeroir.SignedInt(o.B1)
outputType := wazeroir.Float(o.B2)
if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
//
// Here's the summary:
// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
// >> which allows CVTSI2SS to be used after all.
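// A minimal Go sketch of the same idea (hypothetical helper): float32(int64(v)) for a uint32 v is
// equivalent to a true unsigned conversion, because widening to int64 never loses bits and keeps the
// value non-negative.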
err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int.
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
// For the same reason above, we use 64bit conversion for unsigned 32bit.
err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int.
} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
err = c.emitUnsignedInt64ToFloatConversion(true)
} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
err = c.emitUnsignedInt64ToFloatConversion(false)
}
return
}
// emitUnsignedInt64ToFloatConversion handles the case of unsigned 64-bit integers
// in compileFConvertFromI.
func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error {
// The logic here is exactly the same as GCC emits for the following code:
//
// float convert(int num) {
// float foo;
// uint64_t ptr1 = 100;
// foo = (float)(ptr1);
// return foo;
// }
//
// which is compiled by GCC as
//
// convert:
// push rbp
// mov rbp, rsp
// mov DWORD PTR [rbp-20], edi
// mov DWORD PTR [rbp-4], 100
// mov eax, DWORD PTR [rbp-4]
// test rax, rax
// js .handle_sign_bit_case
// cvtsi2ss xmm0, rax
// jmp .exit
// .handle_sign_bit_case:
// mov rdx, rax
// shr rdx
// and eax, 1
// or rdx, rax
// cvtsi2ss xmm0, rdx
// addsd xmm0, xmm0
// .exit: ...
//
// tl;dr is that we have a branch depending on whether or not the sign bit is set.
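// Hedged Go sketch of that sign-bit-set branch (x is the uint64 input, float64 case):
//
//	half := (x >> 1) | (x & 1) // keep the dropped low bit as a "sticky" bit so rounding stays correct
//	result := float64(int64(half)) * 2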
origin := c.locationStack.pop()
if err := c.compileEnsureOnRegister(origin); err != nil {
return err
}
dest, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(dest)
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// Check if the most significant bit (sign bit) is set.
c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register)
// Jump if the sign bit is set.
jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI)
// Otherwise, the unsigned int fits in the non-negative signed 64-bit range.
// So, we convert it to float32 (or float64) directly and emit a jump instruction to exit from this branch.
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest)
}
exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP)
// Now handling the case where sign-bit is set.
// We emit the following sequences:
// mov tmpReg, origin
// shr tmpReg, 1
// and origin, 1
// or tmpReg, origin
// cvtsi2ss xmm0, tmpReg
// addsd xmm0, xmm0
c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg)
c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg)
c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register)
c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg)
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest)
}
if isFloat32bit {
c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest)
} else {
c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest)
}
// Now we have finished the sign-bit-set branch.
// We point the exit jump of the sign-bit-unset branch
// at the next instruction.
c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet)
// We consumed the origin's register and placed the conversion result
// in the dest register.
c.locationStack.markRegisterUnused(origin.register)
if isFloat32bit {
c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32)
} else {
c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64)
}
return nil
}
// compileSimpleConversion pops a value from the stack, applies the
// given instruction to it, and pushes the result onto a register of the given type.
func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction,
destinationRegisterType registerType, destinationValueType runtimeValueType,
) error {
origin := c.locationStack.pop()
if err := c.compileEnsureOnRegister(origin); err != nil {
return err
}
dest, err := c.allocateRegister(destinationRegisterType)
if err != nil {
return err
}
c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest)
c.locationStack.markRegisterUnused(origin.register)
c.pushRuntimeValueLocationOnRegister(dest, destinationValueType)
return nil
}
// compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture.
func (c *amd64Compiler) compileF32DemoteFromF64() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register)
target.valueType = runtimeValueTypeF32
return nil
}
// compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture.
func (c *amd64Compiler) compileF64PromoteFromF32() error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register)
target.valueType = runtimeValueTypeF64
return nil
}
// compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture.
func (c *amd64Compiler) compileI32ReinterpretFromF32() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeI32
return nil
}
return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32)
}
// compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture.
func (c *amd64Compiler) compileI64ReinterpretFromF64() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeI64
return nil
}
return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64)
}
// compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture.
func (c *amd64Compiler) compileF32ReinterpretFromI32() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeF32
return nil
}
return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32)
}
// compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture.
func (c *amd64Compiler) compileF64ReinterpretFromI64() error {
if peek := c.locationStack.peek(); peek.onStack() {
// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
peek.valueType = runtimeValueTypeF64
return nil
}
return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64)
}
// compileExtend implements compiler.compileExtend for the amd64 architecture.
func (c *amd64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
var inst asm.Instruction
signed := o.B1 != 0
if signed {
inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd
} else {
inst = amd64.MOVL
}
return c.compileExtendImpl(inst, runtimeValueTypeI64)
}
// compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend32From8() error {
return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32)
}
// compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend32From16() error {
return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32)
}
// compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From8() error {
return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64)
}
// compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From16() error {
return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64)
}
// compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture.
func (c *amd64Compiler) compileSignExtend64From32() error {
return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64)
}
func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error {
target := c.locationStack.peek() // Note this is peek!
if err := c.compileEnsureOnRegister(target); err != nil {
return err
}
c.assembler.CompileRegisterToRegister(inst, target.register, target.register)
target.valueType = destinationType
return nil
}
// compileEq implements compiler.compileEq for the amd64 architecture.
func (c *amd64Compiler) compileEq(o *wazeroir.UnionOperation) error {
return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), true)
}
// compileNe implements compiler.compileNe for the amd64 architecture.
func (c *amd64Compiler) compileNe(o *wazeroir.UnionOperation) error {
return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), false)
}
func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
x1r, x2r := x1.register, x2.register
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
switch t {
case wazeroir.UnsignedTypeI32:
err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPL, shouldEqual)
case wazeroir.UnsignedTypeI64:
err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPQ, shouldEqual)
case wazeroir.UnsignedTypeF32:
err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISS, shouldEqual)
case wazeroir.UnsignedTypeF64:
err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISD, shouldEqual)
}
if err != nil {
return
}
return
}
func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction,
shouldEqual bool,
) error {
c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
// Record that the result is on the conditional register.
var condReg asm.ConditionalRegisterState
if shouldEqual {
condReg = amd64.ConditionalRegisterStateE
} else {
condReg = amd64.ConditionalRegisterStateNE
}
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg)
loc.valueType = runtimeValueTypeI32
return nil
}
// For float EQ and NE, we have to take NaN values into account.
// Notably, the Wasm specification states that if either operand is NaN,
// the result must be zero for EQ and one for NE.
func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error {
// Before we allocate the result, we have to reserve two int registers.
nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(nanFragReg)
cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// Then, execute the comparison.
c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
// First, we get the parity flag, which indicates whether either of the values was NaN.
if shouldEqual {
// Set 1 if neither value is NaN.
c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg)
} else {
// Set 1 if either value is NaN.
c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg)
}
// Next, we get the usual comparison flag.
if shouldEqual {
// Set 1 if equal.
c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg)
} else {
// Set 1 if not equal.
c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg)
}
// Do "and" or "or" operations on these two flags to get the actual result.
if shouldEqual {
c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg)
} else {
c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg)
}
// Clear the unnecessary bits by zero-extending the first byte.
// This is necessary because SET* writes only the lowest byte and leaves the upper bits (bits 8 to 31) unchanged.
c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg)
// Now we have the result in cmpResultReg register, so we record it.
c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32)
// Also, we no longer need nanFragRegister.
c.locationStack.markRegisterUnused(nanFragReg)
return nil
}
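// A minimal Go sketch of the flag combination above (illustrative only; a and b are stand-ins
// for the two float operands):
//
//	eq := !math.IsNaN(a) && !math.IsNaN(b) && a == b // SETPC (no parity) AND SETEQ
//	ne := math.IsNaN(a) || math.IsNaN(b) || a != b   // SETPS (parity) OR SETNE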
// compileEqz implements compiler.compileEqz for the amd64 architecture.
func (c *amd64Compiler) compileEqz(o *wazeroir.UnionOperation) (err error) {
v := c.locationStack.pop()
if err = c.compileEnsureOnRegister(v); err != nil {
return err
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
switch unsignedInt {
case wazeroir.UnsignedInt32:
err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.fourZeros, v.register)
case wazeroir.UnsignedInt64:
err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.eightZeros, v.register)
}
if err != nil {
return err
}
// v is consumed by the cmp operation so release it.
c.locationStack.releaseRegister(v)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLt implements compiler.compileLt for the amd64 architecture.
func (c *amd64Compiler) compileLt(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
var inst asm.Instruction
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateL
inst = amd64.CMPL
case wazeroir.SignedTypeUint32:
resultConditionState = amd64.ConditionalRegisterStateB
inst = amd64.CMPL
case wazeroir.SignedTypeInt64:
inst = amd64.CMPQ
resultConditionState = amd64.ConditionalRegisterStateL
case wazeroir.SignedTypeUint64:
resultConditionState = amd64.ConditionalRegisterStateB
inst = amd64.CMPQ
case wazeroir.SignedTypeFloat32:
resultConditionState = amd64.ConditionalRegisterStateA
inst = amd64.COMISS
case wazeroir.SignedTypeFloat64:
resultConditionState = amd64.ConditionalRegisterStateA
inst = amd64.COMISD
}
c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileGt implements compiler.compileGt for the amd64 architecture.
func (c *amd64Compiler) compileGt(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateG
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
case wazeroir.SignedTypeUint32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeInt64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateG
case wazeroir.SignedTypeUint64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeFloat32:
c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateA
case wazeroir.SignedTypeFloat64:
c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateA
}
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLe implements compiler.compileLe for the amd64 architecture.
func (c *amd64Compiler) compileLe(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var inst asm.Instruction
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
resultConditionState = amd64.ConditionalRegisterStateLE
inst = amd64.CMPL
case wazeroir.SignedTypeUint32:
resultConditionState = amd64.ConditionalRegisterStateBE
inst = amd64.CMPL
case wazeroir.SignedTypeInt64:
resultConditionState = amd64.ConditionalRegisterStateLE
inst = amd64.CMPQ
case wazeroir.SignedTypeUint64:
resultConditionState = amd64.ConditionalRegisterStateBE
inst = amd64.CMPQ
case wazeroir.SignedTypeFloat32:
resultConditionState = amd64.ConditionalRegisterStateAE
inst = amd64.UCOMISS
case wazeroir.SignedTypeFloat64:
resultConditionState = amd64.ConditionalRegisterStateAE
inst = amd64.UCOMISD
}
c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileGe implements compiler.compileGe for the amd64 architecture.
func (c *amd64Compiler) compileGe(o *wazeroir.UnionOperation) error {
x2 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x2); err != nil {
return err
}
x1 := c.locationStack.pop()
if err := c.compileEnsureOnRegister(x1); err != nil {
return err
}
// Emit the compare instruction.
var resultConditionState asm.ConditionalRegisterState
signedType := wazeroir.SignedType(o.B1)
switch signedType {
case wazeroir.SignedTypeInt32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateGE
case wazeroir.SignedTypeUint32:
c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeInt64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateGE
case wazeroir.SignedTypeUint64:
c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeFloat32:
c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateAE
case wazeroir.SignedTypeFloat64:
c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register)
resultConditionState = amd64.ConditionalRegisterStateAE
}
// x1 and x2 are temporary registers only used for the cmp operation. Release them.
c.locationStack.releaseRegister(x1)
c.locationStack.releaseRegister(x2)
// Finally, record that the result is on the conditional register.
loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
loc.valueType = runtimeValueTypeI32
return nil
}
// compileLoad implements compiler.compileLoad for the amd64 architecture.
func (c *amd64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
var (
isIntType bool
movInst asm.Instruction
targetSizeInBytes int64
vt runtimeValueType
)
unsignedType := wazeroir.UnsignedType(o.B1)
offset := uint32(o.U2)
switch unsignedType {
case wazeroir.UnsignedTypeI32:
isIntType = true
movInst = amd64.MOVL
targetSizeInBytes = 32 / 8
vt = runtimeValueTypeI32
case wazeroir.UnsignedTypeI64:
isIntType = true
movInst = amd64.MOVQ
targetSizeInBytes = 64 / 8
vt = runtimeValueTypeI64
case wazeroir.UnsignedTypeF32:
isIntType = false
movInst = amd64.MOVL
targetSizeInBytes = 32 / 8
vt = runtimeValueTypeF32
case wazeroir.UnsignedTypeF64:
isIntType = false
movInst = amd64.MOVQ
targetSizeInBytes = 64 / 8
vt = runtimeValueTypeF64
}
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
if isIntType {
// For integer types, read the corresponding bytes from the offset to the memory
// and store the value to the int register.
c.assembler.CompileMemoryWithIndexToRegister(movInst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
} else {
// For float types, we read the value to the float register.
floatReg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.assembler.CompileMemoryWithIndexToRegister(movInst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
floatReg)
c.pushRuntimeValueLocationOnRegister(floatReg, vt)
// We no longer need the int register so mark it unused.
c.locationStack.markRegisterUnused(reg)
}
return nil
}
// compileLoad8 implements compiler.compileLoad8 for the amd64 architecture.
func (c *amd64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 1
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move a byte at the offset to the register.
// Note that Load8 is only for integer types.
var inst asm.Instruction
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
inst = amd64.MOVBLSX
vt = runtimeValueTypeI32
case wazeroir.SignedUint32:
inst = amd64.MOVBLZX
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
inst = amd64.MOVBQSX
vt = runtimeValueTypeI64
case wazeroir.SignedUint64:
inst = amd64.MOVBQZX
vt = runtimeValueTypeI64
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
return nil
}
// compileLoad16 implements compiler.compileLoad16 for the amd64 architecture.
func (c *amd64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 16 / 8
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move 2 bytes at the offset to the register.
// Note that Load16 is only for integer types.
var inst asm.Instruction
var vt runtimeValueType
signedInt := wazeroir.SignedInt(o.B1)
switch signedInt {
case wazeroir.SignedInt32:
inst = amd64.MOVWLSX
vt = runtimeValueTypeI32
case wazeroir.SignedInt64:
inst = amd64.MOVWQSX
vt = runtimeValueTypeI64
case wazeroir.SignedUint32:
inst = amd64.MOVWLZX
vt = runtimeValueTypeI32
case wazeroir.SignedUint64:
inst = amd64.MOVWQZX
vt = runtimeValueTypeI64
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, vt)
return nil
}
// compileLoad32 implements compiler.compileLoad32 for the amd64 architecture.
func (c *amd64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
const targetSizeInBytes = 32 / 8
offset := uint32(o.U2)
reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
if err != nil {
return err
}
// Then move 4 bytes at the offset to the register.
var inst asm.Instruction
signed := o.B1 == 1
if signed {
inst = amd64.MOVLQSX
} else {
inst = amd64.MOVLQZX
}
c.assembler.CompileMemoryWithIndexToRegister(inst,
// We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
reg)
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
return nil
}
// compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
// into a register, and returns the stored register. We call the result "ceil" because we access the memory
// as memory.Buffer[ceil-targetSizeInBytes: ceil].
//
// Note: this also emits the instructions to check the out-of-bounds memory access.
// In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
base := c.locationStack.pop()
if err := c.compileEnsureOnRegister(base); err != nil {
return asm.NilRegister, err
}
result := base.register
if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
} else if offsetConst <= math.MaxUint32 {
// Note: in practice, this branch rarely happens, as it means the wasm binary knows the
// memory is larger than 2 GiB, or at least tries to access a region beyond 2 GiB.
//
// In this case, we cannot directly add the offset to a register with the ADDQ(const) instruction.
// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we would end up
// treating offsetConst as a negative number, which is wrong.
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return asm.NilRegister, err
}
c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
} else {
// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
return result, nil
}
// Now we compare the value with the memory length which is held by callEngine.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, result)
// Trap if the value exceeds the memory length (out-of-bounds access).
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
c.locationStack.markRegisterUnused(result)
return result, nil
}
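// A rough Go equivalent of the ceil computation and bounds check emitted above, assuming mem is
// the linear memory byte slice, base is the popped operand, and trap is a stand-in for exiting
// with nativeCallStatusCodeMemoryOutOfBounds:
//
//	ceil := uint64(base) + uint64(offsetArg) + uint64(targetSizeInBytes)
//	if ceil > uint64(len(mem)) {
//		trap()
//	}
//	_ = mem[ceil-uint64(targetSizeInBytes) : ceil] // the region actually accessed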
// compileStore implements compiler.compileStore for the amd64 architecture.
func (c *amd64Compiler) compileStore(o *wazeroir.UnionOperation) error {
var movInst asm.Instruction
var targetSizeInByte int64
unsignedType := wazeroir.UnsignedType(o.B1)
offset := uint32(o.U2)
switch unsignedType {
case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
movInst = amd64.MOVL
targetSizeInByte = 32 / 8
case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
movInst = amd64.MOVQ
targetSizeInByte = 64 / 8
}
return c.compileStoreImpl(offset, movInst, targetSizeInByte)
}
// compileStore8 implements compiler.compileStore8 for the amd64 architecture.
func (c *amd64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVB, 1)
}
// compileStore16 implements compiler.compileStore16 for the amd64 architecture.
func (c *amd64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVW, 16/8)
}
// compileStore32 implements compiler.compileStore32 for the amd64 architecture.
func (c *amd64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
return c.compileStoreImpl(uint32(o.U2), amd64.MOVL, 32/8)
}
func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
val := c.locationStack.pop()
if err := c.compileEnsureOnRegister(val); err != nil {
return err
}
reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
if err != nil {
return err
}
c.assembler.CompileRegisterToMemoryWithIndex(
inst, val.register,
amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
)
// We no longer need both the value and base registers.
c.locationStack.releaseRegister(val)
c.locationStack.markRegisterUnused(reg)
return nil
}
// compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
func (c *amd64Compiler) compileMemoryGrow() error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
return err
}
// After the function call, we have to initialize the stack base pointer and memory reserved registers.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
func (c *amd64Compiler) compileMemorySize() error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, loc.register)
// WebAssembly's memory.size returns the current memory size in pages, where one page is 65536 bytes.
// That is equivalent to dividing the length of the memory slice by 65536,
// which can be computed as a right shift by 16 bits since 65536 = 2^16.
c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
return nil
}
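// The generated code is roughly equivalent to the following Go, assuming mem is the linear
// memory byte slice (65536 = 2^16 bytes per page):
//
//	sizeInPages := uint32(len(mem) >> 16)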
// compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
func (c *amd64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
dataIndex := uint32(o.U1)
return c.compileInitImpl(false, dataIndex, 0)
}
// compileInitImpl implements compileTableInit and compileMemoryInit.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
if isTable {
outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
}
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(instanceAddr)
if isTable {
c.compileLoadElemInstanceAddress(index, instanceAddr)
} else {
c.compileLoadDataInstanceAddress(index, instanceAddr)
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// Check instance bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
instanceAddr, 8, // Both DataInstance and ElementInstance store their length at offset 8.
sourceOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
if isTable {
// Load the target table's address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
// Compare length.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
} else {
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
destinationOffset.register)
}
c.compileTrapFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
// Otherwise, we are ready to copy the value from source to destination.
//
// If the copy size equals zero, we skip all the instructions below.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
var scale int16
var memToReg, regToMem asm.Instruction
if isTable {
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
tmp, tableInstanceTableOffset, destinationOffset.register)
// sourceOffset += data buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
instanceAddr, 0, sourceOffset.register)
// For tables, we move 8 bytes at once.
memToReg = amd64.MOVQ
regToMem = memToReg
scale = 8
} else {
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// sourceOffset += data buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register)
// Move one byte at a time.
memToReg = amd64.MOVBQZX
regToMem = amd64.MOVB
scale = 1
}
// Negate the counter.
c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register)
beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
c.assembler.CompileMemoryWithIndexToRegister(memToReg,
sourceOffset.register, 0, copySize.register, scale,
tmp)
// [destinationOffset + (size.register)] = tmp.
c.assembler.CompileRegisterToMemoryWithIndex(regToMem,
tmp,
destinationOffset.register, 0, copySize.register, scale,
)
// Increment the negated counter; the loop continues while it is still negative.
c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register)
c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, instanceAddr, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
return nil
}
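// For reference, memory.init (isTable == false) behaves roughly like the following Go, where
// mem, data, d, s and n are stand-ins for the memory, the data instance, and the three popped
// operands, and trap exits with the out-of-bounds status:
//
//	if uint64(s)+uint64(n) > uint64(len(data)) || uint64(d)+uint64(n) > uint64(len(mem)) {
//		trap()
//	}
//	copy(mem[d:d+n], data[s:s+n])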
// compileDataDrop implements compiler.compileDataDrop for the amd64 architecture.
func (c *amd64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
dataIndex := uint32(o.U1)
c.compileLoadDataInstanceAddress(dataIndex, tmp)
// Clear the slice header (data pointer, len, cap) of DataInstances[dataIndex] ([]byte).
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
return nil
}
func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
// dst = dataIndex * dataInstanceStructSize.
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst)
// dst = &moduleInstance.DataInstances[0] + dst
// = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
// = &moduleInstance.DataInstances[dataIndex]
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
dst,
)
}
// compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions.
func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) {
// Skip if nothing to copy.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
restoreCrossing := c.compilePreventCrossedTargetRegisters(
[]*runtimeValueLocation{destinationOffset, sourceOffset, copySize},
[]asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX})
// Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
// Point at the first byte of the first quadword to copy.
if backwards {
c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI)
c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI)
// Set REP prefix direction backwards.
c.assembler.CompileStandAlone(amd64.STD)
}
c.assembler.CompileStandAlone(amd64.REPMOVSQ)
if backwards {
// Reset direction.
c.assembler.CompileStandAlone(amd64.CLD)
}
// Restore registers.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
restoreCrossing()
c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
c.assembler.CompileStandAlone(amd64.NOP)
}
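// REP MOVSQ behaves roughly like the following Go loop over quadwords (dst, src and n are
// stand-ins; with the direction flag set by STD, the copy runs from the end instead):
//
//	for i := 0; i < n; i++ {
//		dst[i] = src[i] // one uint64 per iteration
//	}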
// compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check.
func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
// Point at the first byte to be copied, depending on direction.
if backwards {
c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
} else {
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
}
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// sourceOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register)
// Copy copySize % 8 bytes in a loop so the remainder can be copied in 8-byte groups afterward.
beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
// Check copySize % 8 == 0.
c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register)
breakLoop := c.assembler.CompileJump(amd64.JEQ)
c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp)
c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0)
if backwards {
c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
} else {
c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register)
c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
}
c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop)
c.assembler.SetJumpTargetOnNext(breakLoop)
// compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7)
}
// compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture.
//
// This uses efficient `REP MOVSQ` instructions to copy in quadword (8 bytes) batches. The remaining bytes
// are copied with a simple `MOV` loop. It uses backward copying for overlapped segments.
func (c *amd64Compiler) compileMemoryCopy() error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// tmp = max(sourceOffset, destinationOffset).
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, destinationOffset.register)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, sourceOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, destinationOffset.register, tmp)
// Check maximum bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, tmp)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
// Skip zero size.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// If dest < source, we can copy forwards
c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
// If source + size < dest, we can copy forwards
c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
// Copy backwards.
c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
endJump := c.assembler.CompileJump(amd64.JMP)
// Copy forwards.
c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
c.assembler.SetJumpTargetOnNext(endJump)
return nil
}
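// The direction choice above mirrors the following Go, where d, s and n are stand-ins for the
// destination offset, source offset and size:
//
//	if d <= s || s+n <= d {
//		// Regions don't overlap, or dest is below source: a forward copy is safe.
//	} else {
//		// dest lies inside [s, s+n): copy backwards so bytes are not overwritten before being read.
//	}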
// compileFillLoopImpl implements a REP STOSQ fill loop.
func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
// Skip if nothing to fill.
c.assembler.CompileRegisterToConst(amd64.CMPQ, fillSize.register, 0)
emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
if replicateByte {
// Truncate value.register to a single byte
c.assembler.CompileConstToRegister(amd64.ANDQ, 0xff, value.register)
// Replicate single byte onto full 8-byte register.
c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
}
// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
restoreCrossing := c.compilePreventCrossedTargetRegisters(
[]*runtimeValueLocation{destinationOffset, value, fillSize},
[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})
// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
c.assembler.CompileStandAlone(amd64.REPSTOSQ)
// Restore registers.
c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
restoreCrossing()
c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
}
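// The byte replication above is equivalent to the following Go, spreading a single byte b
// (a stand-in for the masked value) into all eight bytes of a quadword so REP STOSQ can store
// it eight bytes at a time:
//
//	v := uint64(b) * 0x0101010101010101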
// compileFillImpl implements compileMemoryFill and compileTableFill for the amd64 architecture.
//
// For memory, this uses efficient `REP STOSQ` instructions to fill in quadword (8 bytes) batches
// when the size is above 15 bytes. For smaller sizes, a simple MOVB loop is the better option.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
value := c.locationStack.pop()
if err := c.compileEnsureOnRegister(value); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(tmp)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
if isTable {
// tmp = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
tmp)
// tmp = [tmp + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
tmp, tableInstanceTableLenOffset,
destinationOffset.register)
} else {
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
destinationOffset.register)
}
if isTable {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
} else {
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
}
// Otherwise, we are ready to fill the destination with the value.
//
// If the fill size equals zero, we skip all the instructions below.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// destinationOffset -= size.
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
if isTable {
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
} else {
// destinationOffset += memory buffer's absolute address.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
// Fill the first copySize % 16 bytes one at a time with a simple MOVB loop.
beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register)
breakLoop := c.assembler.CompileJump(amd64.JEQ)
c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0)
c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop)
c.assembler.SetJumpTargetOnNext(breakLoop)
// compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
}
c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable)
c.locationStack.markRegisterUnused(copySize.register, value.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
return nil
}
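// For reference, memory.fill (isTable == false) behaves roughly like the following Go, where
// mem, d, v and n are stand-ins for the memory and the three popped operands, and trap exits
// with the out-of-bounds status:
//
//	if uint64(d)+uint64(n) > uint64(len(mem)) {
//		trap()
//	}
//	for i := uint32(0); i < n; i++ {
//		mem[d+i] = byte(v)
//	}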
// compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileMemoryFill() error {
return c.compileFillImpl(false, 0)
}
// compileTableInit implements compiler.compileTableInit for the amd64 architecture.
func (c *amd64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
elemIndex := uint32(o.U1)
tableIndex := uint32(o.U2)
return c.compileInitImpl(true, elemIndex, tableIndex)
}
// compileTableCopyLoopImpl is used for directly copying after bounds/direction check.
func (c *amd64Compiler) compileTableCopyLoopImpl(srcTableIndex, dstTableIndex uint32, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
// Point at the first byte to be copied.
if !backwards {
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
}
// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
// destinationOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
// sourceOffset += table buffer's absolute address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register)
c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8)
}
// compileTableCopy implements compiler.compileTableCopy for the amd64 architecture.
//
// It uses efficient `REP MOVSQ` instructions to copy table elements in quadword (8 bytes) batches.
// It uses backward copying for overlapped segments.
func (c *amd64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
copySize := c.locationStack.pop()
if err := c.compileEnsureOnRegister(copySize); err != nil {
return err
}
sourceOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
return err
}
destinationOffset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// sourceOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
// destinationOffset += size.
c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
srcTableIndex := uint32(o.U1)
dstTableIndex := uint32(o.U2)
// Check source bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
// Check destination bounds and if exceeds the length, exit with out of bounds error.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
c.compileTrapFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
// Skip zero size.
c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
skipJump := c.assembler.CompileJump(amd64.JEQ)
// If dest < source, we can copy forwards.
c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
// If source + size < dest, we can copy forwards.
c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
// Copy backwards.
c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, true)
endJump := c.assembler.CompileJump(amd64.JMP)
// Copy forwards.
c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, false)
c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
destinationOffset.register, tmp)
c.assembler.SetJumpTargetOnNext(skipJump)
c.assembler.SetJumpTargetOnNext(endJump)
return nil
}
// compileElemDrop implements compiler.compileElemDrop for the amd64 architecture.
func (c *amd64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
elemIndex := uint32(o.U1)
c.compileLoadElemInstanceAddress(elemIndex, tmp)
// Clear the slice header (data pointer, len, cap) of ElementInstances[elemIndex].References ([]uintptr).
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
return nil
}
func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
// dst = elemIndex * elementInstanceStructSize
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst)
// dst = &moduleInstance.ElementInstances[0] + dst
// = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
// = &moduleInstance.ElementInstances[elemIndex]
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
dst,
)
}
// compileTableGet implements compiler.compileTableGet for the amd64 architecture.
func (c *amd64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
ref, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.locationStack.markRegisterUsed(ref)
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
// ref = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
ref)
// ref = [ref + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableIndex*8, ref)
// Out of bounds check.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref)
// ref = [ref + 0 + offset.register * 8]
// = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset]
// = [&tables[TableIndex].References[offset]]
// = tables[TableIndex].References[offset]
c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref,
0, offset.register, 8, ref,
)
c.locationStack.markRegisterUnused(offset.register)
c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
return nil
}
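// The emitted code corresponds roughly to the following Go, where table and offset are
// stand-ins for tables[TableIndex] and the popped operand, and trap exits with the invalid
// table access status:
//
//	if offset >= uint32(len(table.References)) {
//		trap()
//	}
//	ref := table.References[offset]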
// compileTableSet implements compiler.compileTableSet for the amd64 architecture.
func (c *amd64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
ref := c.locationStack.pop()
if err := c.compileEnsureOnRegister(ref); err != nil {
return err
}
offset := c.locationStack.pop()
if err := c.compileEnsureOnRegister(offset); err != nil {
return err
}
tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// tmp = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
tmp)
// ref = [ref + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableIndex*8, tmp)
// Out of bounds check.
c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
c.compileTrapFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp)
// [tmp + 0 + offset.register * 8] = ref
// [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref
// [&tables[TableIndex].References[offset]] = ref
// tables[TableIndex].References[offset] = ref
c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ,
ref.register,
tmp, 0, offset.register, 8)
c.locationStack.markRegisterUnused(offset.register, ref.register)
return nil
}
// compileTableGrow implements compiler.compileTableGrow for the amd64 architecture.
func (c *amd64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
// Pushes the table index.
tableIndex := uint32(o.U1)
if err := c.compileConstI32Impl(tableIndex); err != nil {
return err
}
// Table grow, like memory grow, cannot be done in assembly as it involves allocation in Go.
// Therefore, we call out to the builtin function for this purpose.
if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
return err
}
// TableGrow consumes three values (table index, number of items, initial value).
for i := 0; i < 3; i++ {
c.locationStack.pop()
}
// Then, the previous length was pushed as the result.
loc := c.locationStack.pushRuntimeValueLocationOnStack()
loc.valueType = runtimeValueTypeI32
// After return, we re-initialize reserved registers just like preamble of functions.
c.compileReservedStackBasePointerInitialization()
c.compileReservedMemoryPointerInitialization()
return nil
}
// compileTableSize implements compiler.compileTableSize for the amd64 architecture.
func (c *amd64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
result, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
// result = &tables[0]
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
result)
// result = [result + TableIndex*8]
// = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
// = [&tables[TableIndex]] = tables[TableIndex].
tableIndex := int64(o.U1)
c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableIndex*8, result)
// result = [result + tableInstanceTableLenOffset]
// = [tables[TableIndex] + tableInstanceTableLenOffset]
// = len(tables[TableIndex])
c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)
c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
return nil
}
// compileTableFill implements compiler.compileTableFill for the amd64 architecture.
func (c *amd64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
tableIndex := uint32(o.U1)
return c.compileFillImpl(true, tableIndex)
}
// compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
func (c *amd64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
ref, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
functionIndex := int64(o.U1)
c.assembler.CompileConstToRegister(amd64.MOVQ, functionIndex*functionSize, ref)
// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset + int64(o.FunctionIndex)*functionSize]
// = &moduleEngine.functions[index]
c.assembler.CompileMemoryToRegister(
amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
ref,
)
c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
return nil
}
// compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
func (c *amd64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
return c.compileConstI32Impl(uint32(o.U1))
}
func (c *amd64Compiler) compileConstI32Impl(v uint32) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
c.assembler.CompileConstToRegister(amd64.MOVL, int64(v), reg)
return nil
}
// compileConstI64 implements compiler.compileConstI64 for the amd64 architecture.
func (c *amd64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1), reg)
return nil
}
// compileConstF32 implements compiler.compileConstF32 for the amd64 architecture.
func (c *amd64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32)
// We cannot load the immediate directly into a float register,
// so we place it in an int register first and then move it over.
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.U1) /*math.Float32bits(o.Value)*/, tmpReg)
c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg)
return nil
}
// compileConstF64 implements compiler.compileConstF64 for the amd64 architecture.
func (c *amd64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
return err
}
reg, err := c.allocateRegister(registerTypeVector)
if err != nil {
return err
}
c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64)
// We cannot load the immediate directly into a float register,
// so we place it in an int register first and then move it over.
tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1) /* math.Float64bits(o.Value) */, tmpReg)
c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg)
return nil
}
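// Illustrative sketch of the constant materialization above (names and the value are stand-ins):
// the 64-bit pattern travels through a general purpose register before landing in the XMM register.
//
//	bits := math.Float64bits(1.5) // encoded as the immediate of the first MOVQ
//	// MOVQ $bits, tmpGP
//	// MOVQ tmpGP, xmm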
// compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64.
func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
var inst asm.Instruction
switch loc.valueType {
case runtimeValueTypeV128Lo:
inst = amd64.MOVDQU
case runtimeValueTypeV128Hi:
panic("BUG: V128Hi must be be loaded to a register along with V128Lo")
case runtimeValueTypeI32, runtimeValueTypeF32:
inst = amd64.MOVL
case runtimeValueTypeI64, runtimeValueTypeF64:
inst = amd64.MOVQ
default:
panic("BUG: unknown runtime value type")
}
// Copy the value from the stack.
c.assembler.CompileMemoryToRegister(inst,
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
loc.register)
if loc.valueType == runtimeValueTypeV128Lo {
// Higher 64-bits are loaded as well ^^.
hi := &c.locationStack.stack[loc.stackPointer+1]
hi.setRegister(loc.register)
}
}
// maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
// to a general purpose register if the value is located on a conditional register.
//
// This is usually called at the beginning of methods on the compiler interface that might
// compile instructions without saving the conditional register value.
// The compileXXX functions that do not call this function instead save the conditional
// value to the stack or a register by invoking compileEnsureOnRegister on the top value.
func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
if c.locationStack.sp > 0 {
if loc := c.locationStack.peek(); loc.onConditionalRegister() {
if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
return err
}
}
}
return
}
// compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
// to a newly allocated general purpose register.
func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
reg, err := c.allocateRegister(registerTypeGeneralPurpose)
if err != nil {
return err
}
c.compileMoveConditionalToGeneralPurposeRegister(loc, reg)
return nil
}
func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) {
// Set the flag bit to the destination. See
// - https://c9x.me/x86/html/file_module_x86_id_288.html
// - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468
// to translate conditionalRegisterState* to amd64.SET*
var inst asm.Instruction
switch loc.conditionalRegister {
case amd64.ConditionalRegisterStateE:
inst = amd64.SETEQ
case amd64.ConditionalRegisterStateNE:
inst = amd64.SETNE
case amd64.ConditionalRegisterStateS:
inst = amd64.SETMI
case amd64.ConditionalRegisterStateNS:
inst = amd64.SETPL
case amd64.ConditionalRegisterStateG:
inst = amd64.SETGT
case amd64.ConditionalRegisterStateGE:
inst = amd64.SETGE
case amd64.ConditionalRegisterStateL:
inst = amd64.SETLT
case amd64.ConditionalRegisterStateLE:
inst = amd64.SETLE
case amd64.ConditionalRegisterStateA:
inst = amd64.SETHI
case amd64.ConditionalRegisterStateAE:
inst = amd64.SETCC
case amd64.ConditionalRegisterStateB:
inst = amd64.SETCS
case amd64.ConditionalRegisterStateBE:
inst = amd64.SETLS
}
c.assembler.CompileNoneToRegister(inst, reg)
// SETcc only writes the lowest byte of the destination, so mask off the (possibly garbage) upper bits.
c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg)
// Mark that the value now lives on the register.
loc.setRegister(reg)
c.locationStack.markRegisterUsed(reg)
}
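
// For example, materializing ConditionalRegisterStateE (the "equal"/zero flag) into AX emits,
// roughly (illustration only):
//
//	SETEQ AL       ; AL = 1 if ZF is set, 0 otherwise
//	ANDQ  $1, RAX  ; clear the undefined upper bits of the register
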
// allocateRegister implements compiler.allocateRegister for amd64.
func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
var ok bool
// Try to get the unused register.
reg, ok = c.locationStack.takeFreeRegister(t)
if ok {
return
}
// If not found, we have to steal a register that is currently in use.
stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
if !ok {
err = fmt.Errorf("cannot steal register")
return
}
// Release the steal target's value onto its stack location, which frees the register.
reg = stealTarget.register
c.compileReleaseRegisterToStack(stealTarget)
return
}
// compileCallFunctionImpl adds instructions to call the function whose *function address is held in functionAddressRegister.
//
// Note: this is the counterpart of compileReturnFunction; see the comments there as well
// to understand how function calls are achieved.
func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
c.locationStack.markRegisterUsed(functionAddressRegister)
// Obtain a temporary register to be used in the following steps.
tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except functionAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
// The stack should look like:
//
// reserved slots for results (if len(results) > len(args))
// | |
// ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
// | | |
// | callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
// |
// nextStackBasePointerOffset
//
// where callFrame is used to return to this currently executed function.
nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
tmpRegister)
callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
// Compute the next function's stack base pointer (in bytes) by adding nextStackBasePointerOffset*8 to the current value still held in tmpRegister.
c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister)
// Write the calculated value to callEngine.stackContext.stackBasePointer.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
tmpRegister)
callFrameFunctionLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameFunctionLoc)
// Set callEngine.moduleContext.fn to the next *function.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
// Write the return address into callFrameReturnAddressLoc: CompileReadInstructionAddress materializes the
// address of the instruction right after the JMP below, which is where the callee jumps back to.
c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP)
callFrameReturnAddressLoc.setRegister(tmpRegister)
c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister {
// In this case, we must move the value in functionAddressRegister to another register; otherwise
// the jump target address below would be overwritten and result in a segfault.
// See #526.
c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister)
functionAddressRegister = tmpRegister
}
// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, functionAddressRegister, functionModuleInstanceOffset,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// And jump into the initial address of the target function.
c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)
// All the registers used are temporary, so we mark them unused.
c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)
// On the function return, we have to initialize the state.
if err := c.compileModuleContextInitialization(); err != nil {
return err
}
// Due to the change to callEngine.stackContext.stackBasePointer.
c.compileReservedStackBasePointerInitialization()
// Due to the change to callEngine.moduleContext.moduleInstance as that might result in
// the memory instance manipulation.
c.compileReservedMemoryPointerInitialization()
// We consumed the function parameters, the call frame stack and reserved slots during the call.
c.locationStack.sp = uint64(nextStackBasePointerOffset)
// Now the function results are pushed by the call.
for _, t := range functype.Results {
loc := c.locationStack.pushRuntimeValueLocationOnStack()
switch t {
case wasm.ValueTypeI32:
loc.valueType = runtimeValueTypeI32
case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
loc.valueType = runtimeValueTypeI64
case wasm.ValueTypeF32:
loc.valueType = runtimeValueTypeF32
case wasm.ValueTypeF64:
loc.valueType = runtimeValueTypeF64
case wasm.ValueTypeV128:
loc.valueType = runtimeValueTypeV128Lo
hi := c.locationStack.pushRuntimeValueLocationOnStack()
hi.valueType = runtimeValueTypeV128Hi
default:
panic("BUG: invalid type: " + wasm.ValueTypeName(t))
}
}
return nil
}
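
// As a concrete example of the stack arithmetic above (illustration only): calling a function of type
// (param i32 i32) (result i64) while c.locationStack.sp == 5 gives nextStackBasePointerOffset = 5 - 2 = 3,
// so the callee's stack base starts right at arg0 and the two arguments become the callee's bottom stack
// slots. After the call, sp is reset to 3 and the i64 result location is pushed there, leaving sp == 4.
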
// compileReturnFunction adds instructions to return from the current call frame back to the caller's frame.
// If the current frame is the bottom of the call stack (i.e. the origin of execution), we return to
// callEngine.execWasmFunction with the Returned status. Otherwise, we jump to the caller's return address
// stored in callFrame.returnAddress while making all the necessary changes to the callEngine's state.
//
// Note: this is the counterpart of compileCallFunctionImpl; see the comments there as well
// to understand how function calls are achieved.
func (c *amd64Compiler) compileReturnFunction() error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
if c.withListener {
if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
return err
}
// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
c.compileReservedStackBasePointerInitialization()
}
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
// so mark it used so that it won't be used as a free register.
c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Obtain a temporary register to be used in the following.
returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
}
returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
// A zero return address means this is the bottom call frame, i.e. we should exit the execution rather than return to a caller.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
returnAddressRegister,
)
c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
jmpIfNotReturn := c.assembler.CompileJump(amd64.JNE)
c.compileExitFromNativeCode(nativeCallStatusCodeReturned)
// Otherwise, we return to the caller.
c.assembler.SetJumpTargetOnNext(jmpIfNotReturn)
// Alias for readability.
tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister
// First, restore callEngine.stackContext.stackBasePointerInBytes from callerStackBasePointerInBytes.
callerStackBasePointerInBytes.setRegister(tmpRegister)
c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
// Next, restore moduleContext.fn from callerFunction.
callerFunction.setRegister(tmpRegister)
c.compileLoadValueOnStackToRegister(callerFunction)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
// Also, we have to put the caller function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
tmpRegister, functionModuleInstanceOffset,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Then, jump into the return address!
c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
return nil
}
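
// Roughly, the generated epilogue looks like the following pseudo assembly (illustration only):
//
//	MOVQ  [stackBase + returnAddress.stackPointer*8], reg
//	TESTQ reg, reg
//	JNE   notBottomFrame
//	; exit to Go with nativeCallStatusCodeReturned
//	notBottomFrame:
//	; restore the caller's stackBasePointerInBytes and moduleContext.fn, then
//	JMP   reg
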
func (c *amd64Compiler) compileCallGoHostFunction() error {
return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
}
func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
// Set the builtin function index to callEngine.exitContext.builtinFunctionCallIndex.
c.assembler.CompileConstToMemory(amd64.MOVL, int64(index), amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
}
func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
// Release all the registers as our calling convention requires the caller-save.
if err := c.compileReleaseAllRegistersToStack(); err != nil {
return err
}
c.compileExitFromNativeCode(compilerStatus)
return nil
}
// compileReleaseAllRegistersToStack adds the instructions to release all the live values
// currently held on (conditional) registers onto their stack memory locations.
func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
for i := uint64(0); i < c.locationStack.sp; i++ {
if loc := &c.locationStack.stack[i]; loc.onRegister() {
c.compileReleaseRegisterToStack(loc)
} else if loc.onConditionalRegister() {
if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
return
}
c.compileReleaseRegisterToStack(loc)
}
}
return
}
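
// onValueReleaseRegisterToStack releases the value (if any) currently held on the given register
// onto its stack memory location so that the register can be reused.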
func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
for i := uint64(0); i < c.locationStack.sp; i++ {
prevValue := &c.locationStack.stack[i]
if prevValue.register == reg {
c.compileReleaseRegisterToStack(prevValue)
break
}
}
}
// compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
var inst asm.Instruction
switch loc.valueType {
case runtimeValueTypeV128Lo:
inst = amd64.MOVDQU
case runtimeValueTypeV128Hi:
panic("BUG: V128Hi must be released to the stack along with V128Lo")
case runtimeValueTypeI32, runtimeValueTypeF32:
inst = amd64.MOVL
case runtimeValueTypeI64, runtimeValueTypeF64:
inst = amd64.MOVQ
default:
panic("BUG: unknown runtime value type")
}
c.assembler.CompileRegisterToMemory(inst, loc.register,
// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
// Mark that the register is now free.
c.locationStack.releaseRegister(loc)
if loc.valueType == runtimeValueTypeV128Lo {
// The higher 64 bits were stored by the MOVDQU above as well, so release the hi part too.
hi := &c.locationStack.stack[loc.stackPointer+1]
c.locationStack.releaseRegister(hi)
}
}
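
// compileTrapFromNativeCode adds a conditional trap with the given status: skipCondition is the jump
// condition under which execution must NOT trap. The first time a status is seen, the exit sequence is
// emitted inline and remembered in c.compiledTrapTargets; on later occurrences, the condition is inverted
// and a jump to the already-compiled exit sequence is emitted instead. As a rough illustration, a bounds
// check that traps when the carry flag is set can pass amd64.JCC here and get either
//
//	JCC skip
//	; exit with the trap status
//	skip:
//
// the first time, or simply
//
//	JCS alreadyCompiledTrapTarget
//
// afterwards.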
func (c *amd64Compiler) compileTrapFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
if target := c.compiledTrapTargets[status]; target == nil {
skip := c.assembler.CompileJump(skipCondition)
// Save the trap target for future reference.
c.compiledTrapTargets[status] = c.compileNOP()
c.compileExitFromNativeCode(status)
c.assembler.SetJumpTargetOnNext(skip)
} else {
// We've already compiled this.
// Invert the condition to jump into the appropriate target.
var trapCondition asm.Instruction
switch skipCondition {
case amd64.JHI:
trapCondition = amd64.JLS
case amd64.JLS:
trapCondition = amd64.JHI
case amd64.JNE:
trapCondition = amd64.JEQ
case amd64.JEQ:
trapCondition = amd64.JNE
case amd64.JCC:
trapCondition = amd64.JCS
case amd64.JCS:
trapCondition = amd64.JCC
case amd64.JPC:
trapCondition = amd64.JPS
case amd64.JPS:
trapCondition = amd64.JPC
case amd64.JPL:
trapCondition = amd64.JMI
case amd64.JMI:
trapCondition = amd64.JPL
default:
panic("BUG: couldn't invert condition")
}
c.assembler.CompileJump(trapCondition).AssignJumpTarget(target)
}
}
func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
// Write back the cached stack pointer value to callEngine.stackContext.stackPointer.
c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)
switch status {
case nativeCallStatusCodeReturned:
case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
// Read the return address, and write it to callEngine.exitContext.returnAddress.
returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !ok {
panic("BUG: cannot take free register")
}
c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
default:
// In this case, the execution traps, so store the current instruction's address into
// callEngine.exitContext.returnAddress so that the stack trace can contain the top frame's source position.
// R15 (the reserved memory register) is safe to clobber here because native code never resumes after a trap.
tmpReg := amd64.RegR15
c.assembler.CompileReadInstructionAddress(tmpReg, amd64.MOVQ)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
}
c.assembler.CompileStandAlone(amd64.RET)
}
func (c *amd64Compiler) compilePreamble() (err error) {
// We assume all function parameters are already pushed onto the stack by
// the caller.
c.locationStack.init(c.typ)
if err := c.compileModuleContextInitialization(); err != nil {
return err
}
// Check if it's necessary to grow the value stack by using max stack pointer.
if err = c.compileMaybeGrowStack(); err != nil {
return err
}
if c.withListener {
if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
return err
}
}
c.compileReservedStackBasePointerInitialization()
// Finally, we initialize the reserved memory register based on the module context.
c.compileReservedMemoryPointerInitialization()
return
}
func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
// First, make amd64ReservedRegisterForStackBasePointerAddress point to the beginning of the stack slice's backing array.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
amd64ReservedRegisterForStackBasePointerAddress)
// Next, add callEngine.stackContext.stackBasePointerInBytes so that the register points at the stack base of the current frame.
c.assembler.CompileMemoryToRegister(amd64.ADDQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
amd64ReservedRegisterForStackBasePointerAddress,
)
}
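
// compileReservedMemoryPointerInitialization loads the address of the memory buffer's first byte
// (callEngine.moduleContext.memoryElement0Address) into amd64ReservedRegisterForMemory, but only when
// the module has or may use a memory; otherwise the reserved register is left untouched.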
func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
if c.ir.HasMemory || c.ir.UsesMemory {
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
amd64ReservedRegisterForMemory,
)
}
}
// compileMaybeGrowStack adds instructions to check whether the value stack needs to grow,
// and if so, to call the builtin function that grows it. These instructions are emitted
// in the function's preamble.
func (c *amd64Compiler) compileMaybeGrowStack() error {
tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !ok {
panic("BUG: cannot take free register")
}
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
c.assembler.CompileMemoryToRegister(amd64.SUBQ,
amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)
// At this point tmpRegister holds stackLen - stackBasePointer (the remaining stack space).
// If it is smaller than the maximum stack pointer this function may reach, we need to grow the stack.
// The constant 0 below is only a placeholder: it is patched with the actual stack pointer ceiling
// once it is known (see assignStackPointerCeilNeeded).
cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
c.assignStackPointerCeilNeeded = cmpWithStackPointerCeil
// Jump if we have no need to grow.
jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)
// Otherwise, we have to make the builtin function call to grow the call stack.
if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
return err
}
c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
return nil
}
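
// For example (illustration only, assuming the patched ceiling is expressed in bytes like the other
// operands): with stackLenInBytes == 160 and stackBasePointerInBytes == 128 the remaining space is
// 160 - 128 = 32 bytes, so a function that may use up to 48 bytes of stack takes the
// builtinFunctionIndexGrowStack call, while one that needs at most 32 bytes skips it via the JCC above.
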
// compileModuleContextInitialization adds instructions to initialize callEngine.ModuleContext's fields based on
// callEngine.ModuleContext.ModuleInstanceAddress.
// This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
func (c *amd64Compiler) compileModuleContextInitialization() error {
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
// so mark it used so that it won't be used as a free register until the module context initialization finishes.
c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
// Obtain the temporary registers to be used in the following steps.
tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
c.locationStack.markRegisterUsed(tmpRegister)
tmpRegister2, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
if !found {
// This should never happen in theory, since all registers must be free except amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
return fmt.Errorf("could not find enough free registers")
}
c.locationStack.markRegisterUsed(tmpRegister2)
// If the module instance address stays the same, we can skip the entire code below.
// The rationale is that, in almost all use cases, users instantiate a single Wasm binary
// and run functions from it, rather than importing/exporting across multiple binaries.
// As a result, the cmp-and-jmp sequence below should be easy for the x64 CPU to
// branch-predict, since the jump is taken nearly 100% of the time across function calls.
c.assembler.CompileMemoryToRegister(amd64.CMPQ,
amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)
// If callEngine.moduleContext.moduleInstance is not equal to the value in amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
// we have to put the new value there.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
// Also, we have to update the following fields:
// * callEngine.moduleContext.globalElement0Address
// * callEngine.moduleContext.tablesElement0Address
// * callEngine.moduleContext.memoryInstance
// * callEngine.moduleContext.memoryElement0Address
// * callEngine.moduleContext.memorySliceLen
// * callEngine.moduleContext.functionsElement0Address
// * callEngine.moduleContext.typeIDsElement0Address
// * callEngine.moduleContext.dataInstancesElement0Address
// * callEngine.moduleContext.elementInstancesElement0Address
// Update globalElement0Address.
//
// Note: if there's global.get or set instruction in the function, the existence of the globals
// is ensured by function validation at module instantiation phase, and that's why it is ok to
// skip the initialization if the module's globals slice is empty.
if len(c.ir.Globals) > 0 {
// Since ModuleInstance.Globals is a slice ([]*GlobalInstance), the first word of the slice
// header at moduleInstanceGlobalsOffset is its data pointer, i.e. &Globals[0].
// See https://go.dev/blog/slices-intro if unfamiliar with the slice header layout.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset, tmpRegister)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
}
// Update tableElement0Address.
//
// Note: if there's table instruction in the function, the existence of the table
// is ensured by function validation at module instantiation phase, and that's
// why it is ok to skip the initialization if the module's table doesn't exist.
if c.ir.HasTable {
// First, read the data pointer of ModuleInstance.Tables (i.e. &Tables[0]) into tmpRegister.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset, tmpRegister)
// At this point, tmpRegister holds &ModuleInstance.Tables[0] (the slice's data pointer),
// so we store it into callEngine.moduleContext.tablesElement0Address.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)
// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
c.assembler.CompileRegisterToMemory(amd64.MOVQ,
tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
}
// Update memoryElement0Address and memorySliceLen.
//
// Note: if there's memory instruction in the function, memory instance must be non-nil.
// That is ensured by function validation at module instantiation phase, and that's
// why it is ok to skip the initialization if the module's memory instance is nil.
if c.ir.HasMemory {
c.assembler.CompileMemoryToRegister(amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
tmpRegister)
// Set memory instance.
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)
// Set length.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferLenOffset, tmpRegister2)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset)
// Set element zero address.
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
}
// Update moduleContext.functionsElement0Address.
{
// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
//
// A Go interface value is laid out in memory as two quad words, struct{ tab, data uintptr },
// where tab points to the interface's type/method table and data points to the actual
// implementation. Here we extract the "data" pointer, which is the *moduleEngine.
// See the following references for detail:
// * https://research.swtch.com/interfaces
// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset, tmpRegister)
// "tmpRegister = [tmpRegister + moduleEnginecodesOffset] (== &moduleEngine.codes[0])"
c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)
// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
callEngineModuleContextFunctionsElement0AddressOffset)
}
// Update dataInstancesElement0Address.
if c.ir.HasDataInstances {
// "tmpRegister = &moduleInstance.DataInstances[0]"
c.assembler.CompileMemoryToRegister(
amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
tmpRegister,
)
// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(
amd64.MOVQ,
tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
)
}
// Update callEngine.moduleContext.elementInstancesElement0Address
if c.ir.HasElementInstances {
// "tmpRegister = &moduleInstance.ElementInstnaces[0]"
c.assembler.CompileMemoryToRegister(
amd64.MOVQ,
amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
tmpRegister,
)
// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
c.assembler.CompileRegisterToMemory(
amd64.MOVQ,
tmpRegister,
amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
)
}
c.locationStack.markRegisterUnused(tmpRegister, tmpRegister2)
// Set the jump target towards the next instruction for the case where module instance address hasn't changed.
c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
return nil
}
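
// In the common case where the module instance has not changed between calls, the code generated above
// reduces to the comparison plus a taken jump, roughly (illustration only):
//
//	CMPQ [amd64ReservedRegisterForCallEngine + callEngineModuleContextModuleInstanceOffset], R12
//	JEQ  done
//
// where R12 is amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
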
// compileEnsureOnRegister ensures that the given value is located on a
// register of the appropriate type (general purpose or vector).
func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
if loc.onStack() {
// Allocate the register.
reg, err := c.allocateRegister(loc.getRegisterType())
if err != nil {
return err
}
// Mark that the value now lives on the register.
loc.setRegister(reg)
c.locationStack.markRegisterUsed(reg)
c.compileLoadValueOnStackToRegister(loc)
} else if loc.onConditionalRegister() {
err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
}
return
}
// compileMaybeSwapRegisters swaps two registers if they're not equal.
func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
if reg1 != reg2 {
c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
}
}
// compilePreventCrossedTargetRegisters swaps registers so that, for each runtimeValueLocation in locs,
// its target register (the register at the same index in targets) is not occupied by another
// runtimeValueLocation from locs. It returns a closure that restores the original register placement.
//
// This makes it possible to safely exchange one set of registers with another, where a register might be
// in both sets: after the swaps, each register corresponds either to itself or to a register not present
// in its own set.
//
// For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
// to make locs = [BX, CX, AX].
func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
type swap struct{ srcIndex, dstIndex int }
var swaps []swap
for i := range locs {
targetLocation := -1 // -1 means not found.
for j := range locs {
if locs[j].register == targets[i] {
targetLocation = j
break
}
}
if targetLocation != -1 && targetLocation != i {
c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
swaps = append(swaps, swap{i, targetLocation})
}
}
return func() {
// Restore in reverse order because a register can be moved multiple times.
for i := len(swaps) - 1; i >= 0; i -= 1 {
r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
}
}
}
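
// Tracing the example above step by step: i=0 finds BX at index 1 and swaps, giving [BX, AX, CX];
// i=1 finds no location holding SI, so nothing happens; i=2 finds AX at index 1 and swaps, giving
// [BX, CX, AX]. The returned closure replays those two swaps in reverse order to restore [AX, BX, CX].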