package amd64

import (
	"context"
	"encoding/binary"
	"fmt"
	"math"
	"strings"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
	"github.com/tetratelabs/wazero/internal/platform"
)

// NewBackend returns a new backend for amd64.
func NewBackend() backend.Machine {
	ectx := backend.NewExecutableContextT[instruction](
		resetInstruction,
		setNext,
		setPrev,
		asNop,
	)
	return &machine{
		ectx:                                ectx,
		cpuFeatures:                         platform.CpuFeatures,
		regAlloc:                            regalloc.NewAllocator(regInfo),
		spillSlots:                          map[regalloc.VRegID]int64{},
		amodePool:                           wazevoapi.NewPool[amode](nil),
		constSwizzleMaskConstIndex:          -1,
		constSqmulRoundSatIndex:             -1,
		constI8x16SHLMaskTableIndex:         -1,
		constI8x16LogicalSHRMaskTableIndex:  -1,
		constF64x2CvtFromIMaskIndex:         -1,
		constTwop52Index:                    -1,
		constI32sMaxOnF64x2Index:            -1,
		constI32uMaxOnF64x2Index:            -1,
		constAllOnesI8x16Index:              -1,
		constAllOnesI16x8Index:              -1,
		constExtAddPairwiseI16x8uMask1Index: -1,
		constExtAddPairwiseI16x8uMask2Index: -1,
	}
}

type (
	// machine implements backend.Machine for amd64.
	machine struct {
		c                        backend.Compiler
		ectx                     *backend.ExecutableContextT[instruction]
		stackBoundsCheckDisabled bool

		amodePool wazevoapi.Pool[amode]

		cpuFeatures platform.CpuFeatureFlags

		regAlloc        regalloc.Allocator
		regAllocFn      *backend.RegAllocFunction[*instruction, *machine]
		regAllocStarted bool

		spillSlotSize int64
		spillSlots    map[regalloc.VRegID]int64
		currentABI    *backend.FunctionABI
		clobberedRegs []regalloc.VReg

		maxRequiredStackSizeForCalls int64

		labelResolutionPends []labelResolutionPend

		jmpTableTargets [][]uint32
		consts          []_const

		constSwizzleMaskConstIndex, constSqmulRoundSatIndex,
		constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex,
		constF64x2CvtFromIMaskIndex, constTwop52Index,
		constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index,
		constAllOnesI8x16Index, constAllOnesI16x8Index,
		constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int
	}

	_const struct {
		lo, hi uint64
		_var   []byte
		label  *labelPosition
	}

	labelResolutionPend struct {
		instr       *instruction
		instrOffset int64
		// imm32Offset is the offset of the last 4 bytes of the instruction.
		imm32Offset int64
	}

	labelPosition = backend.LabelPosition[instruction]
)

func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label {
	index := *i
	if index == -1 {
		label := m.allocateLabel()
		index = len(m.consts)
		m.consts = append(m.consts, _const{
			_var:  _var,
			label: label,
		})
		*i = index
	}
	return m.consts[index].label.L
}

// Reset implements backend.Machine.
func (m *machine) Reset() { m.consts = m.consts[:0] m.clobberedRegs = m.clobberedRegs[:0] for key := range m.spillSlots { m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) } for _, key := range m.clobberedRegs { delete(m.spillSlots, regalloc.VRegID(key)) } m.stackBoundsCheckDisabled = false m.ectx.Reset() m.regAllocFn.Reset() m.regAlloc.Reset() m.regAllocStarted = false m.clobberedRegs = m.clobberedRegs[:0] m.spillSlotSize = 0 m.maxRequiredStackSizeForCalls = 0 m.amodePool.Reset() m.jmpTableTargets = m.jmpTableTargets[:0] m.constSwizzleMaskConstIndex = -1 m.constSqmulRoundSatIndex = -1 m.constI8x16SHLMaskTableIndex = -1 m.constI8x16LogicalSHRMaskTableIndex = -1 m.constF64x2CvtFromIMaskIndex = -1 m.constTwop52Index = -1 m.constI32sMaxOnF64x2Index = -1 m.constI32uMaxOnF64x2Index = -1 m.constAllOnesI8x16Index = -1 m.constAllOnesI16x8Index = -1 m.constExtAddPairwiseI16x8uMask1Index = -1 m.constExtAddPairwiseI16x8uMask2Index = -1 } // ExecutableContext implements backend.Machine. func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } // DisableStackCheck implements backend.Machine. func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } // SetCompiler implements backend.Machine. func (m *machine) SetCompiler(c backend.Compiler) { m.c = c m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) } // SetCurrentABI implements backend.Machine. func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { m.currentABI = abi } // RegAlloc implements backend.Machine. func (m *machine) RegAlloc() { rf := m.regAllocFn for _, pos := range m.ectx.OrderedBlockLabels { rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) } m.regAllocStarted = true m.regAlloc.DoAllocation(rf) // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 } // InsertReturn implements backend.Machine. func (m *machine) InsertReturn() { i := m.allocateInstr().asRet() m.insert(i) } // LowerSingleBranch implements backend.Machine. func (m *machine) LowerSingleBranch(b *ssa.Instruction) { ectx := m.ectx switch b.Opcode() { case ssa.OpcodeJump: _, _, targetBlk := b.BranchData() if b.IsFallthroughJump() { return } jmp := m.allocateInstr() target := ectx.GetOrAllocateSSABlockLabel(targetBlk) if target == backend.LabelReturn { jmp.asRet() } else { jmp.asJmp(newOperandLabel(target)) } m.insert(jmp) case ssa.OpcodeBrTable: index, target := b.BrTableData() m.lowerBrTable(index, target) default: panic("BUG: unexpected branch opcode" + b.Opcode().String()) } } func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { // TODO: reuse the slice! labels := make([]uint32, len(targets)) for j, target := range targets { labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) } index = len(m.jmpTableTargets) m.jmpTableTargets = append(m.jmpTableTargets, labels) return } var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { _v := m.getOperand_Reg(m.c.ValueDefinition(index)) v := m.copyToTmp(_v.reg()) // First, we need to do the bounds check. maxIndex := m.c.AllocateVReg(ssa.TypeI32) m.lowerIconst(maxIndex, uint64(len(targets)-1), false) cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) m.insert(cmp) // Then do the conditional move maxIndex to v if v > maxIndex. 
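// For reference, the full sequence this function builds is roughly as follows
// (register and label names are illustrative, not the actual allocation):
//
//	cmp    maxIndex, v
//	cmovnb maxIndex, v          ; clamp: v = min(v, maxIndex)
//	lea    jmpTableBegin(%rip), addr
//	add    (addr, v, 8), addr   ; addr += table[v] (8-byte offsets from the table head)
//	jmp    *addr
//	jmpTableBegin:              ; jump table island emitted right after the jmp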
cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) m.insert(cmov) // Now that v has the correct index. Load the address of the jump table into the addr. addr := m.c.AllocateVReg(ssa.TypeI64) leaJmpTableAddr := m.allocateInstr() m.insert(leaJmpTableAddr) // Then add the target's offset into jmpTableAddr. loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, // Shift by 3 because each entry is 8 bytes. newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) m.insert(loadTargetOffsetFromJmpTable) // Now ready to jump. jmp := m.allocateInstr().asJmp(newOperandReg(addr)) m.insert(jmp) jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() m.insert(jmpTableBegin) leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) jmpTable := m.allocateInstr() targetSliceIndex := m.addJmpTableTarget(targets) jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) m.insert(jmpTable) } // LowerConditionalBranch implements backend.Machine. func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { exctx := m.ectx cval, args, targetBlk := b.BranchData() if len(args) > 0 { panic(fmt.Sprintf( "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", exctx.CurrentSSABlk, targetBlk, )) } target := exctx.GetOrAllocateSSABlockLabel(targetBlk) cvalDef := m.c.ValueDefinition(cval) switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { case ssa.OpcodeIcmp: cvalInstr := cvalDef.Instr x, y, c := cvalInstr.IcmpData() cc := condFromSSAIntCmpCond(c) if b.Opcode() == ssa.OpcodeBrz { cc = cc.invert() } // First, perform the comparison and set the flag. xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) if !m.tryLowerBandToFlag(xd, yd) { m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) } // Then perform the conditional branch. m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) cvalDef.Instr.MarkLowered() case ssa.OpcodeFcmp: cvalInstr := cvalDef.Instr f1, f2, and := m.lowerFcmpToFlags(cvalInstr) isBrz := b.Opcode() == ssa.OpcodeBrz if isBrz { f1 = f1.invert() } if f2 == condInvalid { m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) } else { if isBrz { f2 = f2.invert() and = !and } jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() m.insert(jmp1) m.insert(jmp2) notTaken, notTakenLabel := m.allocateBrTarget() m.insert(notTaken) if and { jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) jmp2.asJmpIf(f2, newOperandLabel(target)) } else { jmp1.asJmpIf(f1, newOperandLabel(target)) jmp2.asJmpIf(f2, newOperandLabel(target)) } } cvalDef.Instr.MarkLowered() default: v := m.getOperand_Reg(cvalDef) var cc cond if b.Opcode() == ssa.OpcodeBrz { cc = condZ } else { cc = condNZ } // Perform test %v, %v to set the flag. cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) m.insert(cmp) m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) } } // LowerInstr implements backend.Machine. func (m *machine) LowerInstr(instr *ssa.Instruction) { if l := instr.SourceOffset(); l.Valid() { info := m.allocateInstr().asEmitSourceOffsetInfo(l) m.insert(info) } switch op := instr.Opcode(); op { case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: panic("BUG: branching instructions are handled by LowerBranches") case ssa.OpcodeReturn: panic("BUG: return must be handled by backend.Compiler") case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
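// Nothing is emitted at the definition site: the constant is materialized lazily at each
// use (see e.g. InsertLoadConstant in callerGenVRegToFunctionArg, or the getOperand_* helpers).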
case ssa.OpcodeCall, ssa.OpcodeCallIndirect: m.lowerCall(instr) case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: m.lowerStore(instr) case ssa.OpcodeIadd: m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) case ssa.OpcodeIsub: m.lowerAluRmiROp(instr, aluRmiROpcodeSub) case ssa.OpcodeImul: m.lowerAluRmiROp(instr, aluRmiROpcodeMul) case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem m.lowerIDivRem(instr, isDiv, isSigned) case ssa.OpcodeBand: m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) case ssa.OpcodeBor: m.lowerAluRmiROp(instr, aluRmiROpcodeOr) case ssa.OpcodeBxor: m.lowerAluRmiROp(instr, aluRmiROpcodeXor) case ssa.OpcodeIshl: m.lowerShiftR(instr, shiftROpShiftLeft) case ssa.OpcodeSshr: m.lowerShiftR(instr, shiftROpShiftRightArithmetic) case ssa.OpcodeUshr: m.lowerShiftR(instr, shiftROpShiftRightLogical) case ssa.OpcodeRotl: m.lowerShiftR(instr, shiftROpRotateLeft) case ssa.OpcodeRotr: m.lowerShiftR(instr, shiftROpRotateRight) case ssa.OpcodeClz: m.lowerClz(instr) case ssa.OpcodeCtz: m.lowerCtz(instr) case ssa.OpcodePopcnt: m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: m.lowerXmmRmR(instr) case ssa.OpcodeFabs: m.lowerFabsFneg(instr) case ssa.OpcodeFneg: m.lowerFabsFneg(instr) case ssa.OpcodeCeil: m.lowerRound(instr, roundingModeUp) case ssa.OpcodeFloor: m.lowerRound(instr, roundingModeDown) case ssa.OpcodeTrunc: m.lowerRound(instr, roundingModeZero) case ssa.OpcodeNearest: m.lowerRound(instr, roundingModeNearest) case ssa.OpcodeFmin, ssa.OpcodeFmax: m.lowerFminFmax(instr) case ssa.OpcodeFcopysign: m.lowerFcopysign(instr) case ssa.OpcodeBitcast: m.lowerBitcast(instr) case ssa.OpcodeSqrt: m.lowerSqrt(instr) case ssa.OpcodeFpromote: v := instr.Arg() rn := m.getOperand_Reg(m.c.ValueDefinition(v)) rd := m.c.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) m.insert(cnt) case ssa.OpcodeFdemote: v := instr.Arg() rn := m.getOperand_Reg(m.c.ValueDefinition(v)) rd := m.c.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) m.insert(cnt) case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: x, ctx := instr.Arg2() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) ctxVReg := m.c.VRegOf(ctx) m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: x, ctx := instr.Arg2() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) ctxVReg := m.c.VRegOf(ctx) m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) case ssa.OpcodeFcvtFromSint: x := instr.Arg() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := newOperandReg(m.c.VRegOf(instr.Return())) m.lowerFcvtFromSint(rn, rd, x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) case ssa.OpcodeFcvtFromUint: x := instr.Arg() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := newOperandReg(m.c.VRegOf(instr.Return())) m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) case ssa.OpcodeVanyTrue: m.lowerVanyTrue(instr) case ssa.OpcodeVallTrue: m.lowerVallTrue(instr) case ssa.OpcodeVhighBits: m.lowerVhighBits(instr) case ssa.OpcodeVbnot: m.lowerVbnot(instr) case 
ssa.OpcodeVband: x, y := instr.Arg2() m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) case ssa.OpcodeVbor: x, y := instr.Arg2() m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) case ssa.OpcodeVbxor: x, y := instr.Arg2() m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) case ssa.OpcodeVbandnot: m.lowerVbandnot(instr, sseOpcodePandn) case ssa.OpcodeVbitselect: m.lowerVbitselect(instr) case ssa.OpcodeVIadd: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePaddb case ssa.VecLaneI16x8: vecOp = sseOpcodePaddw case ssa.VecLaneI32x4: vecOp = sseOpcodePaddd case ssa.VecLaneI64x2: vecOp = sseOpcodePaddq } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVSaddSat: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePaddsb case ssa.VecLaneI16x8: vecOp = sseOpcodePaddsw } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVUaddSat: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePaddusb case ssa.VecLaneI16x8: vecOp = sseOpcodePaddusw } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVIsub: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePsubb case ssa.VecLaneI16x8: vecOp = sseOpcodePsubw case ssa.VecLaneI32x4: vecOp = sseOpcodePsubd case ssa.VecLaneI64x2: vecOp = sseOpcodePsubq } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVSsubSat: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePsubsb case ssa.VecLaneI16x8: vecOp = sseOpcodePsubsw } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVUsubSat: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePsubusb case ssa.VecLaneI16x8: vecOp = sseOpcodePsubusw } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVImul: m.lowerVImul(instr) case ssa.OpcodeVIneg: x, lane := instr.ArgWithLane() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePsubb case ssa.VecLaneI16x8: vecOp = sseOpcodePsubw case ssa.VecLaneI32x4: vecOp = sseOpcodePsubd case ssa.VecLaneI64x2: vecOp = sseOpcodePsubq default: panic("BUG") } tmp := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asZeros(tmp)) i := m.allocateInstr() i.asXmmRmR(vecOp, rn, tmp) m.insert(i) m.copyTo(tmp, rd) case ssa.OpcodeVFadd: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeAddps case ssa.VecLaneF64x2: vecOp = sseOpcodeAddpd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVFsub: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeSubps case ssa.VecLaneF64x2: vecOp = sseOpcodeSubpd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVFdiv: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeDivps case ssa.VecLaneF64x2: vecOp = sseOpcodeDivpd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVFmul: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeMulps case ssa.VecLaneF64x2: vecOp = sseOpcodeMulpd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVFneg: x, lane := instr.ArgWithLane() rn := 
m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) tmp := m.c.AllocateVReg(ssa.TypeV128) var shiftOp, xorOp sseOpcode var shiftAmt uint32 switch lane { case ssa.VecLaneF32x4: shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps case ssa.VecLaneF64x2: shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd } zero := m.allocateInstr() zero.asZeros(tmp) m.insert(zero) // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). // See https://www.felixcloutier.com/x86/cmpps // // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane // if the lane is NaN. cmp := m.allocateInstr() cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) m.insert(cmp) // Do the left shift on each lane to set only the most significant bit in each. i := m.allocateInstr() i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) m.insert(i) // Get the negated result by XOR on each lane with tmp. i = m.allocateInstr() i.asXmmRmR(xorOp, rn, tmp) m.insert(i) m.copyTo(tmp, rd) case ssa.OpcodeVSqrt: x, lane := instr.ArgWithLane() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeSqrtps case ssa.VecLaneF64x2: vecOp = sseOpcodeSqrtpd } i := m.allocateInstr() i.asXmmUnaryRmR(vecOp, rn, rd) m.insert(i) case ssa.OpcodeVImin: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePminsb case ssa.VecLaneI16x8: vecOp = sseOpcodePminsw case ssa.VecLaneI32x4: vecOp = sseOpcodePminsd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVUmin: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePminub case ssa.VecLaneI16x8: vecOp = sseOpcodePminuw case ssa.VecLaneI32x4: vecOp = sseOpcodePminud } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVImax: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePmaxsb case ssa.VecLaneI16x8: vecOp = sseOpcodePmaxsw case ssa.VecLaneI32x4: vecOp = sseOpcodePmaxsd } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVUmax: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePmaxub case ssa.VecLaneI16x8: vecOp = sseOpcodePmaxuw case ssa.VecLaneI32x4: vecOp = sseOpcodePmaxud } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVAvgRound: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneI8x16: vecOp = sseOpcodePavgb case ssa.VecLaneI16x8: vecOp = sseOpcodePavgw } m.lowerVbBinOp(vecOp, x, y, instr.Return()) case ssa.OpcodeVIcmp: x, y, c, lane := instr.VIcmpData() m.lowerVIcmp(x, y, c, instr.Return(), lane) case ssa.OpcodeVFcmp: x, y, c, lane := instr.VFcmpData() m.lowerVFcmp(x, y, c, instr.Return(), lane) case ssa.OpcodeExtractlane: x, index, signed, lane := instr.ExtractlaneData() m.lowerExtractLane(x, index, signed, instr.Return(), lane) case ssa.OpcodeInsertlane: x, y, index, lane := instr.InsertlaneData() m.lowerInsertLane(x, y, index, instr.Return(), lane) case ssa.OpcodeSwizzle: x, y, _ := instr.Arg2WithLane() m.lowerSwizzle(x, y, instr.Return()) case ssa.OpcodeShuffle: x, y, lo, hi := instr.ShuffleData() m.lowerShuffle(x, y, lo, hi, instr.Return()) case ssa.OpcodeSplat: x, lane := instr.ArgWithLane() m.lowerSplat(x, instr.Return(), lane) case ssa.OpcodeSqmulRoundSat: x, y := 
instr.Arg2() m.lowerSqmulRoundSat(x, y, instr.Return()) case ssa.OpcodeVZeroExtLoad: ptr, offset, typ := instr.VZeroExtLoadData() var sseOp sseOpcode // Both movss and movsd clears the higher bits of the destination register upt 128 bits. // https://www.felixcloutier.com/x86/movss // https://www.felixcloutier.com/x86/movsd if typ == ssa.TypeF32 { sseOp = sseOpcodeMovss } else { sseOp = sseOpcodeMovsd } mem := m.lowerToAddressMode(ptr, offset) dst := m.c.VRegOf(instr.Return()) m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) case ssa.OpcodeVMinPseudo: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeMinps case ssa.VecLaneF64x2: vecOp = sseOpcodeMinpd default: panic("BUG: unexpected lane type") } m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) case ssa.OpcodeVMaxPseudo: x, y, lane := instr.Arg2WithLane() var vecOp sseOpcode switch lane { case ssa.VecLaneF32x4: vecOp = sseOpcodeMaxps case ssa.VecLaneF64x2: vecOp = sseOpcodeMaxpd default: panic("BUG: unexpected lane type") } m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) case ssa.OpcodeVIshl: x, y, lane := instr.Arg2WithLane() m.lowerVIshl(x, y, instr.Return(), lane) case ssa.OpcodeVSshr: x, y, lane := instr.Arg2WithLane() m.lowerVSshr(x, y, instr.Return(), lane) case ssa.OpcodeVUshr: x, y, lane := instr.Arg2WithLane() m.lowerVUshr(x, y, instr.Return(), lane) case ssa.OpcodeVCeil: x, lane := instr.ArgWithLane() m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) case ssa.OpcodeVFloor: x, lane := instr.ArgWithLane() m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) case ssa.OpcodeVTrunc: x, lane := instr.ArgWithLane() m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) case ssa.OpcodeVNearest: x, lane := instr.ArgWithLane() m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) case ssa.OpcodeExtIaddPairwise: x, lane, signed := instr.ExtIaddPairwiseData() m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: x, lane := instr.ArgWithLane() m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: x, lane := instr.ArgWithLane() m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) case ssa.OpcodeLoadSplat: ptr, offset, lane := instr.LoadSplatData() m.lowerLoadSplat(ptr, offset, instr.Return(), lane) case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: x, lane := instr.ArgWithLane() m.lowerVFcvtFromInt(x, instr.Return(), lane, op == ssa.OpcodeVFcvtFromSint) case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: x, lane := instr.ArgWithLane() m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: x, y, lane := instr.Arg2WithLane() m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) case ssa.OpcodeFvpromoteLow: x := instr.Arg() src := m.getOperand_Reg(m.c.ValueDefinition(x)) dst := m.c.VRegOf(instr.Return()) m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) case ssa.OpcodeFvdemote: x := instr.Arg() src := m.getOperand_Reg(m.c.ValueDefinition(x)) dst := m.c.VRegOf(instr.Return()) m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) case ssa.OpcodeWideningPairwiseDotProductS: x, y := instr.Arg2() m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) case ssa.OpcodeVIabs: m.lowerVIabs(instr) case ssa.OpcodeVIpopcnt: m.lowerVIpopcnt(instr) case ssa.OpcodeVFmin: 
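// Note: SSE minps/minpd (and maxps/maxpd) return the second operand whenever either
// input is NaN, which does not match Wasm's NaN-propagating semantics; hence the
// dedicated lowerVFmin/lowerVFmax helpers instead of the generic binop path.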
m.lowerVFmin(instr) case ssa.OpcodeVFmax: m.lowerVFmax(instr) case ssa.OpcodeVFabs: m.lowerVFabs(instr) case ssa.OpcodeUndefined: m.insert(m.allocateInstr().asUD2()) case ssa.OpcodeExitWithCode: execCtx, code := instr.ExitWithCodeData() m.lowerExitWithCode(m.c.VRegOf(execCtx), code) case ssa.OpcodeExitIfTrueWithCode: execCtx, c, code := instr.ExitIfTrueWithCodeData() m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) case ssa.OpcodeLoad: ptr, offset, typ := instr.LoadData() dst := m.c.VRegOf(instr.Return()) m.lowerLoad(ptr, offset, typ, dst) case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: ptr, offset, _ := instr.LoadData() ret := m.c.VRegOf(instr.Return()) m.lowerExtLoad(op, ptr, offset, ret) case ssa.OpcodeVconst: result := m.c.VRegOf(instr.Return()) lo, hi := instr.VconstData() m.lowerVconst(result, lo, hi) case ssa.OpcodeSExtend, ssa.OpcodeUExtend: from, to, signed := instr.ExtendData() m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) case ssa.OpcodeIcmp: m.lowerIcmp(instr) case ssa.OpcodeFcmp: m.lowerFcmp(instr) case ssa.OpcodeSelect: cval, x, y := instr.SelectData() m.lowerSelect(x, y, cval, instr.Return()) case ssa.OpcodeIreduce: rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) retVal := instr.Return() rd := m.c.VRegOf(retVal) if retVal.Type() != ssa.TypeI32 { panic("TODO?: Ireduce to non-i32") } m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) default: panic("TODO: lowering " + op.String()) } } func (m *machine) lowerFcmp(instr *ssa.Instruction) { f1, f2, and := m.lowerFcmpToFlags(instr) rd := m.c.VRegOf(instr.Return()) if f2 == condInvalid { tmp := m.c.AllocateVReg(ssa.TypeI32) m.insert(m.allocateInstr().asSetcc(f1, tmp)) // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match // the semantics of Icmp that sets either 0 or 1. m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) } else { tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) m.insert(m.allocateInstr().asSetcc(f1, tmp1)) m.insert(m.allocateInstr().asSetcc(f2, tmp2)) var op aluRmiROpcode if and { op = aluRmiROpcodeAnd } else { op = aluRmiROpcodeOr } m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) } } func (m *machine) lowerIcmp(instr *ssa.Instruction) { x, y, c := instr.IcmpData() m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) rd := m.c.VRegOf(instr.Return()) tmp := m.c.AllocateVReg(ssa.TypeI32) m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match // the semantics of Icmp that sets either 0 or 1. m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) } func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) rd := m.c.VRegOf(ret) var cond cond cvalDef := m.c.ValueDefinition(cval) switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { case ssa.OpcodeIcmp: icmp := cvalDef.Instr xc, yc, cc := icmp.IcmpData() m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) cond = condFromSSAIntCmpCond(cc) icmp.Lowered() default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. 
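// Fallback: materialize the condition value into a register and test it against
// itself; any non-zero value then selects x via condNZ below.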
cv := m.getOperand_Reg(cvalDef) test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) m.insert(test) cond = condNZ } if typ := x.Type(); typ.IsInt() { _64 := typ.Bits() == 64 mov := m.allocateInstr() tmp := m.c.AllocateVReg(typ) switch yo.kind { case operandKindReg: mov.asMovRR(yo.reg(), tmp, _64) case operandKindMem: if _64 { mov.asMov64MR(yo, tmp) } else { mov.asMovzxRmR(extModeLQ, yo, tmp) } default: panic("BUG") } m.insert(mov) cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) m.insert(cmov) m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) } else { mov := m.allocateInstr() tmp := m.c.AllocateVReg(typ) switch typ { case ssa.TypeF32: mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) case ssa.TypeF64: mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) case ssa.TypeV128: mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) default: panic("BUG") } m.insert(mov) cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) m.insert(cmov) m.copyTo(tmp, rd) } } func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { x := i.op1 rd := i.op2.reg() cond := cond(i.u1) jcc := m.allocateInstr() m.insert(jcc) mov := m.allocateInstr() switch i.u2 { case 4: mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) case 8: mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) case 16: mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) default: panic("BUG") } m.insert(mov) nop, end := m.allocateBrTarget() m.insert(nop) jcc.asJmpIf(cond.invert(), newOperandLabel(end)) } func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { rd0 := m.c.VRegOf(ret) arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) rd := m.c.AllocateVReg(ret.Type()) ext := m.allocateInstr() switch { case from == 8 && to == 16 && signed: ext.asMovsxRmR(extModeBQ, arg, rd) case from == 8 && to == 16 && !signed: ext.asMovzxRmR(extModeBL, arg, rd) case from == 8 && to == 32 && signed: ext.asMovsxRmR(extModeBL, arg, rd) case from == 8 && to == 32 && !signed: ext.asMovzxRmR(extModeBQ, arg, rd) case from == 8 && to == 64 && signed: ext.asMovsxRmR(extModeBQ, arg, rd) case from == 8 && to == 64 && !signed: ext.asMovzxRmR(extModeBQ, arg, rd) case from == 16 && to == 32 && signed: ext.asMovsxRmR(extModeWL, arg, rd) case from == 16 && to == 32 && !signed: ext.asMovzxRmR(extModeWL, arg, rd) case from == 16 && to == 64 && signed: ext.asMovsxRmR(extModeWQ, arg, rd) case from == 16 && to == 64 && !signed: ext.asMovzxRmR(extModeWQ, arg, rd) case from == 32 && to == 64 && signed: ext.asMovsxRmR(extModeLQ, arg, rd) case from == 32 && to == 64 && !signed: ext.asMovzxRmR(extModeLQ, arg, rd) default: panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) } m.insert(ext) m.copyTo(rd, rd0) } func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { if lo == 0 && hi == 0 { m.insert(m.allocateInstr().asZeros(dst)) return } load := m.allocateInstr() constLabel := m.allocateLabel() m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) m.insert(load) } func (m *machine) lowerCtz(instr *ssa.Instruction) { if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) } else { // On processors that do not support TZCNT, the BSF instruction is // executed instead. The key difference between TZCNT and BSF // instruction is that if source operand is zero, the content of // destination operand is undefined. 
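// To keep the Wasm semantics (ctz(0) == operand bit width), the fallback below
// therefore emits an explicit zero check, roughly:
//
//	test %x, %x
//	jnz  nonzero
//	mov  $bitWidth, %tmp   ; zero case
//	jmp  end
//	nonzero:
//	bsf  %x, %tmp
//	end: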
// https://www.felixcloutier.com/x86/tzcnt.html x := instr.Arg() if !x.Type().IsInt() { panic("BUG?") } _64 := x.Type().Bits() == 64 xDef := m.c.ValueDefinition(x) tmp := m.c.AllocateVReg(x.Type()) rm := m.getOperand_Reg(xDef) // First, we have to check if the target is non-zero. test := m.allocateInstr() test.asCmpRmiR(false, rm, rm.reg(), _64) m.insert(test) jmpNz := m.allocateInstr() m.insert(jmpNz) // If the value is zero, we just push the const value. m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) // Now jump right after the non-zero case. jmpAtEnd := m.allocateInstr() m.insert(jmpAtEnd) // jmpNz target label is set here. nop, nz := m.allocateBrTarget() jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) m.insert(nop) // Emit the non-zero case. bsr := m.allocateInstr() bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) m.insert(bsr) // jmpAtEnd target label is set here. nopEnd, end := m.allocateBrTarget() jmpAtEnd.asJmp(newOperandLabel(end)) m.insert(nopEnd) m.copyTo(tmp, m.c.VRegOf(instr.Return())) } } func (m *machine) lowerClz(instr *ssa.Instruction) { if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) } else { // On processors that do not support LZCNT, we combine BSR (calculating // most significant set bit) with XOR. This logic is described in // "Replace Raw Assembly Code with Builtin Intrinsics" section in: // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. x := instr.Arg() if !x.Type().IsInt() { panic("BUG?") } _64 := x.Type().Bits() == 64 xDef := m.c.ValueDefinition(x) rm := m.getOperand_Reg(xDef) tmp := m.c.AllocateVReg(x.Type()) // First, we have to check if the rm is non-zero as BSR is undefined // on zero. See https://www.felixcloutier.com/x86/bsr. test := m.allocateInstr() test.asCmpRmiR(false, rm, rm.reg(), _64) m.insert(test) jmpNz := m.allocateInstr() m.insert(jmpNz) // If the value is zero, we just push the const value. m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) // Now jump right after the non-zero case. jmpAtEnd := m.allocateInstr() m.insert(jmpAtEnd) // jmpNz target label is set here. nop, nz := m.allocateBrTarget() jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) m.insert(nop) // Emit the non-zero case. bsr := m.allocateInstr() bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) m.insert(bsr) // Now we XOR the value with the bit length minus one. xor := m.allocateInstr() xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) m.insert(xor) // jmpAtEnd target label is set here. 
nopEnd, end := m.allocateBrTarget() jmpAtEnd.asJmp(newOperandLabel(end)) m.insert(nopEnd) m.copyTo(tmp, m.c.VRegOf(instr.Return())) } } func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { x := si.Arg() if !x.Type().IsInt() { panic("BUG?") } _64 := x.Type().Bits() == 64 xDef := m.c.ValueDefinition(x) rm := m.getOperand_Mem_Reg(xDef) rd := m.c.VRegOf(si.Return()) instr := m.allocateInstr() instr.asUnaryRmR(op, rm, rd, _64) m.insert(instr) } func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) load := m.allocateInstr() switch typ { case ssa.TypeI32: load.asMovzxRmR(extModeLQ, mem, dst) case ssa.TypeI64: load.asMov64MR(mem, dst) case ssa.TypeF32: load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) case ssa.TypeF64: load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) case ssa.TypeV128: load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) default: panic("BUG") } m.insert(load) } func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) load := m.allocateInstr() switch op { case ssa.OpcodeUload8: load.asMovzxRmR(extModeBQ, mem, dst) case ssa.OpcodeUload16: load.asMovzxRmR(extModeWQ, mem, dst) case ssa.OpcodeUload32: load.asMovzxRmR(extModeLQ, mem, dst) case ssa.OpcodeSload8: load.asMovsxRmR(extModeBQ, mem, dst) case ssa.OpcodeSload16: load.asMovsxRmR(extModeWQ, mem, dst) case ssa.OpcodeSload32: load.asMovsxRmR(extModeLQ, mem, dst) default: panic("BUG") } m.insert(load) } func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { condDef := m.c.ValueDefinition(cond) if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) } cvalInstr := condDef.Instr cvalInstr.MarkLowered() // We need to copy the execution context to a temp register, because if it's spilled, // it might end up being reloaded inside the exiting branch. 
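// The overall shape of this lowering is roughly:
//
//	<icmp/test sets the flags>
//	j<inverted cc> continue   ; skip the exit when the condition is false
//	<save RSP/RBP/exit code and return address, then the exit sequence>
//	continue: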
execCtxTmp := m.copyToTmp(execCtx) x, y, c := cvalInstr.IcmpData() xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) if !m.tryLowerBandToFlag(xx, yy) { m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) } jmpIf := m.allocateInstr() m.insert(jmpIf) l := m.lowerExitWithCode(execCtxTmp, code) jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) } func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { var target *backend.SSAValueDefinition if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { if m.c.MatchInstr(y, ssa.OpcodeBand) { target = y } } if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { if m.c.MatchInstr(x, ssa.OpcodeBand) { target = x } } if target == nil { return false } bandInstr := target.Instr bandX, bandY := bandInstr.Arg2() xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) m.insert(test) bandInstr.MarkLowered() return true } func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { saveRsp = m.allocateInstr().asMovRM( rspVReg, newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), 8, ) saveRbp = m.allocateInstr().asMovRM( rbpVReg, newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), 8, ) setExitCode = m.allocateInstr().asMovRM( exitCodeReg, newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)), 4, ) return } func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { exitCodeReg := rbpVReg saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) // Set save RSP, RBP, and write exit code. m.insert(saveRsp) m.insert(saveRbp) m.lowerIconst(exitCodeReg, uint64(code), false) m.insert(setExitCode) // Next is to save the return address. readRip := m.allocateInstr() m.insert(readRip) ripReg := rbpVReg saveRip := m.allocateInstr().asMovRM( ripReg, newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), 8, ) m.insert(saveRip) // Finally exit. exitSq := m.allocateExitSeq(execCtx) m.insert(exitSq) // Insert the label for the return address. nop, l := m.allocateBrTarget() readRip.asLEA(newOperandLabel(l), ripReg) m.insert(nop) return l } func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { x, y := si.Arg2() if !x.Type().IsInt() { panic("BUG?") } _64 := x.Type().Bits() == 64 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) // TODO: commutative args can be swapped if one of them is an immediate. rn := m.getOperand_Reg(xDef) rm := m.getOperand_Mem_Imm32_Reg(yDef) rd := m.c.VRegOf(si.Return()) // rn is being overwritten, so we first copy its value to a temp register, // in case it is referenced again later. tmp := m.copyToTmp(rn.reg()) alu := m.allocateInstr() alu.asAluRmiR(op, rm, tmp, _64) m.insert(alu) // tmp now contains the result, we copy it to the dest register. 
m.copyTo(tmp, rd) } func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { x, amt := si.Arg2() if !x.Type().IsInt() { panic("BUG?") } _64 := x.Type().Bits() == 64 xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) opAmt := m.getOperand_Imm32_Reg(amtDef) rx := m.getOperand_Reg(xDef) rd := m.c.VRegOf(si.Return()) // rx is being overwritten, so we first copy its value to a temp register, // in case it is referenced again later. tmpDst := m.copyToTmp(rx.reg()) if opAmt.kind == operandKindReg { // If opAmt is a register we must copy its value to rcx, // because shiftR encoding mandates that the shift amount is in rcx. m.copyTo(opAmt.reg(), rcxVReg) alu := m.allocateInstr() alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) m.insert(alu) } else { alu := m.allocateInstr() alu.asShiftR(op, opAmt, tmpDst, _64) m.insert(alu) } // tmp now contains the result, we copy it to the dest register. m.copyTo(tmpDst, rd) } func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { x, y := instr.Arg2() if !x.Type().IsFloat() { panic("BUG?") } _64 := x.Type().Bits() == 64 var op sseOpcode if _64 { switch instr.Opcode() { case ssa.OpcodeFadd: op = sseOpcodeAddsd case ssa.OpcodeFsub: op = sseOpcodeSubsd case ssa.OpcodeFmul: op = sseOpcodeMulsd case ssa.OpcodeFdiv: op = sseOpcodeDivsd default: panic("BUG") } } else { switch instr.Opcode() { case ssa.OpcodeFadd: op = sseOpcodeAddss case ssa.OpcodeFsub: op = sseOpcodeSubss case ssa.OpcodeFmul: op = sseOpcodeMulss case ssa.OpcodeFdiv: op = sseOpcodeDivss default: panic("BUG") } } xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) rn := m.getOperand_Reg(yDef) rm := m.getOperand_Reg(xDef) rd := m.c.VRegOf(instr.Return()) // rm is being overwritten, so we first copy its value to a temp register, // in case it is referenced again later. 
tmp := m.copyToTmp(rm.reg()) xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) m.insert(xmm) m.copyTo(tmp, rd) } func (m *machine) lowerSqrt(instr *ssa.Instruction) { x := instr.Arg() if !x.Type().IsFloat() { panic("BUG") } _64 := x.Type().Bits() == 64 var op sseOpcode if _64 { op = sseOpcodeSqrtsd } else { op = sseOpcodeSqrtss } xDef := m.c.ValueDefinition(x) rm := m.getOperand_Mem_Reg(xDef) rd := m.c.VRegOf(instr.Return()) xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) m.insert(xmm) } func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { x := instr.Arg() if !x.Type().IsFloat() { panic("BUG") } _64 := x.Type().Bits() == 64 var op sseOpcode var mask uint64 if _64 { switch instr.Opcode() { case ssa.OpcodeFabs: mask, op = 0x7fffffffffffffff, sseOpcodeAndpd case ssa.OpcodeFneg: mask, op = 0x8000000000000000, sseOpcodeXorpd } } else { switch instr.Opcode() { case ssa.OpcodeFabs: mask, op = 0x7fffffff, sseOpcodeAndps case ssa.OpcodeFneg: mask, op = 0x80000000, sseOpcodeXorps } } tmp := m.c.AllocateVReg(x.Type()) xDef := m.c.ValueDefinition(x) rm := m.getOperand_Reg(xDef) rd := m.c.VRegOf(instr.Return()) m.lowerFconst(tmp, mask, _64) xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) m.insert(xmm) m.copyTo(tmp, rd) } func (m *machine) lowerStore(si *ssa.Instruction) { value, ptr, offset, storeSizeInBits := si.StoreData() rm := m.getOperand_Reg(m.c.ValueDefinition(value)) mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) store := m.allocateInstr() switch value.Type() { case ssa.TypeI32: store.asMovRM(rm.reg(), mem, storeSizeInBits/8) case ssa.TypeI64: store.asMovRM(rm.reg(), mem, storeSizeInBits/8) case ssa.TypeF32: store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) case ssa.TypeF64: store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) case ssa.TypeV128: store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) default: panic("BUG") } m.insert(store) } func (m *machine) lowerCall(si *ssa.Instruction) { isDirectCall := si.Opcode() == ssa.OpcodeCall var indirectCalleePtr ssa.Value var directCallee ssa.FuncRef var sigID ssa.SignatureID var args []ssa.Value if isDirectCall { directCallee, sigID, args = si.CallData() } else { indirectCalleePtr, sigID, args = si.CallIndirectData() } calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. } // Note: See machine.SetupPrologue for the stack layout. // The stack pointer decrease/increase will be inserted later in the compilation. for i, arg := range args { reg := m.c.VRegOf(arg) def := m.c.ValueDefinition(arg) m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) } if isDirectCall { call := m.allocateInstr().asCall(directCallee, calleeABI) m.insert(call) } else { ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) m.insert(callInd) } var index int r1, rs := si.Returns() if r1.Valid() { m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) index++ } for _, r := range rs { m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) index++ } } // callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the // caller side of the function call. 
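// Depending on the ABI slot, the argument is either moved into its assigned register or
// stored into the outgoing argument area addressed off RSP; the offsets are biased by
// -stackSlotSize because the stack pointer has not been decreased yet at this point
// (see machine.SetupPrologue for the final stack layout).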
func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { arg := &a.Args[argIndex] if def != nil && def.IsFromInstr() { // Constant instructions are inlined. if inst := def.Instr; inst.Constant() { m.InsertLoadConstant(inst, reg) } } if arg.Kind == backend.ABIArgKindReg { m.InsertMove(arg.Reg, reg, arg.Type) } else { store := m.allocateInstr() mem := newOperandMem(m.newAmodeImmReg( // -stackSlotSize because the stack pointer is not yet decreased. uint32(arg.Offset-stackSlotSize), rspVReg)) switch arg.Type { case ssa.TypeI32: store.asMovRM(reg, mem, 4) case ssa.TypeI64: store.asMovRM(reg, mem, 8) case ssa.TypeF32: store.asXmmMovRM(sseOpcodeMovss, reg, mem) case ssa.TypeF64: store.asXmmMovRM(sseOpcodeMovsd, reg, mem) case ssa.TypeV128: store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) default: panic("BUG") } m.insert(store) } } func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { r := &a.Rets[retIndex] if r.Kind == backend.ABIArgKindReg { m.InsertMove(reg, r.Reg, r.Type) } else { load := m.allocateInstr() mem := newOperandMem(m.newAmodeImmReg( // -stackSlotSize because the stack pointer is not yet decreased. uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) switch r.Type { case ssa.TypeI32: load.asMovzxRmR(extModeLQ, mem, reg) case ssa.TypeI64: load.asMov64MR(mem, reg) case ssa.TypeF32: load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) case ssa.TypeF64: load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) case ssa.TypeV128: load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) default: panic("BUG") } m.insert(load) } } // InsertMove implements backend.Machine. func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { switch typ { case ssa.TypeI32, ssa.TypeI64: i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) m.insert(i) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: var op sseOpcode switch typ { case ssa.TypeF32: op = sseOpcodeMovss case ssa.TypeF64: op = sseOpcodeMovsd case ssa.TypeV128: op = sseOpcodeMovdqa } i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) m.insert(i) default: panic("BUG") } } // Format implements backend.Machine. 
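// The output lists each label (annotated with its SSA block when known), the lowered
// instructions, and finally the constant pool entries; it is primarily useful for
// debugging and tests.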
func (m *machine) Format() string { ectx := m.ectx begins := map[*instruction]backend.Label{} for l, pos := range ectx.LabelPositions { begins[pos.Begin] = l } irBlocks := map[backend.Label]ssa.BasicBlockID{} for i, l := range ectx.SsaBlockIDToLabels { irBlocks[l] = ssa.BasicBlockID(i) } var lines []string for cur := ectx.RootInstr; cur != nil; cur = cur.next { if l, ok := begins[cur]; ok { var labelStr string if blkID, ok := irBlocks[l]; ok { labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) } else { labelStr = fmt.Sprintf("%s:", l) } lines = append(lines, labelStr) } if cur.kind == nop0 { continue } lines = append(lines, "\t"+cur.String()) } for _, vc := range m.consts { if vc._var == nil { lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) } else { lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) } } return "\n" + strings.Join(lines, "\n") + "\n" } func (m *machine) encodeWithoutSSA(root *instruction) { m.labelResolutionPends = m.labelResolutionPends[:0] ectx := m.ectx bufPtr := m.c.BufPtr() for cur := root; cur != nil; cur = cur.next { offset := int64(len(*bufPtr)) if cur.kind == nop0 { l := cur.nop0Label() if pos, ok := ectx.LabelPositions[l]; ok { pos.BinaryOffset = offset } } needLabelResolution := cur.encode(m.c) if needLabelResolution { m.labelResolutionPends = append(m.labelResolutionPends, labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, ) } } for i := range m.labelResolutionPends { p := &m.labelResolutionPends[i] switch p.instr.kind { case jmp, jmpIf, lea: target := p.instr.jmpLabel() targetOffset := ectx.LabelPositions[target].BinaryOffset imm32Offset := p.imm32Offset jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) default: panic("BUG") } } } // Encode implements backend.Machine Encode. func (m *machine) Encode(context.Context) { ectx := m.ectx bufPtr := m.c.BufPtr() m.labelResolutionPends = m.labelResolutionPends[:0] for _, pos := range ectx.OrderedBlockLabels { offset := int64(len(*bufPtr)) pos.BinaryOffset = offset for cur := pos.Begin; cur != pos.End.next; cur = cur.next { offset := int64(len(*bufPtr)) switch cur.kind { case nop0: l := cur.nop0Label() if pos, ok := ectx.LabelPositions[l]; ok { pos.BinaryOffset = offset } case sourceOffsetInfo: m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) } needLabelResolution := cur.encode(m.c) if needLabelResolution { m.labelResolutionPends = append(m.labelResolutionPends, labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, ) } } } for i := range m.consts { offset := int64(len(*bufPtr)) vc := &m.consts[i] vc.label.BinaryOffset = offset if vc._var == nil { lo, hi := vc.lo, vc.hi m.c.Emit8Bytes(lo) m.c.Emit8Bytes(hi) } else { for _, b := range vc._var { m.c.EmitByte(b) } } } buf := *bufPtr for i := range m.labelResolutionPends { p := &m.labelResolutionPends[i] switch p.instr.kind { case jmp, jmpIf, lea, xmmUnaryRmR: target := p.instr.jmpLabel() targetOffset := ectx.LabelPositions[target].BinaryOffset imm32Offset := p.imm32Offset jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) case jmpTableIsland: tableBegin := p.instrOffset // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. 
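// At run time the dispatch built in lowerBrTable computes the final target as
// tableBegin + table[index], so storing label offsets relative to tableBegin here
// is all that is needed.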
targets := m.jmpTableTargets[p.instr.u1] for i, l := range targets { targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset jmpOffset := targetOffset - tableBegin binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) } default: panic("BUG") } } } // ResolveRelocations implements backend.Machine. func (m *machine) ResolveRelocations(refToBinaryOffset map[ssa.FuncRef]int, binary []byte, relocations []backend.RelocationInfo) { for _, r := range relocations { offset := r.Offset calleeFnOffset := refToBinaryOffset[r.FuncRef] // offset is the offset of the last 4 bytes of the call instruction. callInstrOffsetBytes := binary[offset : offset+4] diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). callInstrOffsetBytes[0] = byte(diff) callInstrOffsetBytes[1] = byte(diff >> 8) callInstrOffsetBytes[2] = byte(diff >> 16) callInstrOffsetBytes[3] = byte(diff >> 24) } } func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { x := m.getOperand_Reg(xd) y := m.getOperand_Mem_Imm32_Reg(yd) cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) m.insert(cmp) } func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { x, y, c := instr.FcmpData() switch c { case ssa.FloatCmpCondEqual: f1, f2 = condNP, condZ and = true case ssa.FloatCmpCondNotEqual: f1, f2 = condP, condNZ case ssa.FloatCmpCondLessThan: f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) f2 = condInvalid x, y = y, x case ssa.FloatCmpCondLessThanOrEqual: f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) f2 = condInvalid x, y = y, x default: f1 = condFromSSAFloatCmpCond(c) f2 = condInvalid } var opc sseOpcode if x.Type() == ssa.TypeF32 { opc = sseOpcodeUcomiss } else { opc = sseOpcodeUcomisd } xr := m.getOperand_Reg(m.c.ValueDefinition(x)) yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) return } // allocateInstr allocates an instruction. 
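// Instructions allocated before register allocation has started are tagged with
// addedBeforeRegAlloc so they can be told apart from those inserted later.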
func (m *machine) allocateInstr() *instruction { instr := m.ectx.InstructionPool.Allocate() if !m.regAllocStarted { instr.addedBeforeRegAlloc = true } return instr } func (m *machine) allocateNop() *instruction { instr := m.allocateInstr() instr.kind = nop0 return instr } func (m *machine) insert(i *instruction) { ectx := m.ectx ectx.PendingInstructions = append(ectx.PendingInstructions, i) } func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint pos := m.allocateLabel() l = pos.L nop = m.allocateInstr() nop.asNop0WithLabel(l) pos.Begin, pos.End = nop, nop return } func (m *machine) allocateLabel() *labelPosition { ectx := m.ectx l := ectx.AllocateLabel() pos := ectx.AllocateLabelPosition(l) ectx.LabelPositions[l] = pos return pos } func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { offset, ok := m.spillSlots[id] if !ok { offset = m.spillSlotSize m.spillSlots[id] = offset m.spillSlotSize += int64(size) } return offset } func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { typ := m.c.TypeOf(src) mov := m.allocateInstr() if typ.IsInt() { mov.asMovRR(src, dst, true) } else { mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) } m.insert(mov) } func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { typ := m.c.TypeOf(v) tmp := m.c.AllocateVReg(typ) m.copyTo(v, tmp) return tmp } func (m *machine) requiredStackSize() int64 { return m.maxRequiredStackSizeForCalls + m.frameSize() + 16 + // Need for stack checking. 16 // return address and the caller RBP. } func (m *machine) frameSize() int64 { s := m.clobberedRegSlotSize() + m.spillSlotSize if s&0xf != 0 { panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) } return s } func (m *machine) clobberedRegSlotSize() int64 { return int64(len(m.clobberedRegs) * 16) } func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { x, y, execCtx := si.Arg3() dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) ctxVReg := m.c.VRegOf(execCtx) tmpGp := m.c.AllocateVReg(si.Return().Type()) m.copyTo(dividend.reg(), raxVReg) m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) m.insert(seq) rd := m.c.VRegOf(si.Return()) if isDiv { m.copyTo(raxVReg, rd) } else { m.copyTo(rdxVReg, rd) } } func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() dividend := raxVReg // Ensure yr is not zero. test := m.allocateInstr() test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) m.insert(test) jnz := m.allocateInstr() m.insert(jnz) nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) // If not zero, we can proceed with the division. jnz.asJmpIf(condNZ, newOperandLabel(nz)) var ifRemNeg1 *instruction if signed { var neg1 uint64 if _64 { neg1 = 0xffffffffffffffff } else { neg1 = 0xffffffff } m.lowerIconst(tmpGp, neg1, _64) if isDiv { // For signed division, we have to have branches for "math.MinInt{32,64} / -1" // case which results in the floating point exception via division error as // the resulting value exceeds the maximum of signed int. // First, we check if the divisor is -1. 
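// The guard sequence emitted below is roughly (labels illustrative; -1 is already in tmpGp):
//
//	cmp  $-1, divisor
//	jne  do_div
//	cmp  $INT_MIN, dividend   ; dividend is in RAX
//	jne  do_div
//	<exit with ExitCodeIntegerOverflow>
//	do_div: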
cmp := m.allocateInstr() cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) m.insert(cmp) ifNotNeg1 := m.allocateInstr() m.insert(ifNotNeg1) var minInt uint64 if _64 { minInt = 0x8000000000000000 } else { minInt = 0x80000000 } m.lowerIconst(tmpGp, minInt, _64) // Next we check if the quotient is the most negative value for the signed integer, i.e. // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. cmp2 := m.allocateInstr() cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) m.insert(cmp2) ifNotMinInt := m.allocateInstr() m.insert(ifNotMinInt) // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), // as that is the overflow in division as the result becomes 2^31 which is larger than // the maximum of signed 32-bit int (2^31-1). end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) } else { // If it is remainder, zeros DX register and compare the divisor to -1. xor := m.allocateInstr().asZeros(rdxVReg) m.insert(xor) // We check if the divisor is -1. cmp := m.allocateInstr() cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) m.insert(cmp) ifRemNeg1 = m.allocateInstr() m.insert(ifRemNeg1) } // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. sed := m.allocateInstr() sed.asSignExtendData(_64) m.insert(sed) } else { // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. zeros := m.allocateInstr().asZeros(rdxVReg) m.insert(zeros) } div := m.allocateInstr() div.asDiv(newOperandReg(divisor), signed, _64) m.insert(div) nop, end := m.allocateBrTarget() m.insert(nop) // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. if ifRemNeg1 != nil { ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) } } func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { x := instr.Arg() if !x.Type().IsFloat() { panic("BUG?") } var op sseOpcode if x.Type().Bits() == 64 { op = sseOpcodeRoundsd } else { op = sseOpcodeRoundss } xDef := m.c.ValueDefinition(x) rm := m.getOperand_Mem_Reg(xDef) rd := m.c.VRegOf(instr.Return()) xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) m.insert(xmm) } func (m *machine) lowerFminFmax(instr *ssa.Instruction) { x, y := instr.Arg2() if !x.Type().IsFloat() { panic("BUG?") } _64 := x.Type().Bits() == 64 isMin := instr.Opcode() == ssa.OpcodeFmin var minMaxOp sseOpcode switch { case _64 && isMin: minMaxOp = sseOpcodeMinpd case _64 && !isMin: minMaxOp = sseOpcodeMaxpd case !_64 && isMin: minMaxOp = sseOpcodeMinps case !_64 && !isMin: minMaxOp = sseOpcodeMaxps } xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) rm := m.getOperand_Reg(xDef) // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. rn := m.getOperand_Reg(yDef) rd := m.c.VRegOf(instr.Return()) tmp := m.copyToTmp(rm.reg()) // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. cmp := m.allocateInstr() if _64 { cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) } else { cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) } m.insert(cmp) // At this point, we have the three cases of conditional flags below // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) // // 1) Two values are NaN-free and different: All flags are cleared. // 2) Two values are NaN-free and equal: Only ZF flags is set. // 3) One of Two values is NaN: ZF, PF and CF flags are set. 
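// The three cases are then dispatched below with two conditional jumps, roughly:
// ZF == 0 (case 1) -> the plain min/max instruction,
// PF == 1 (case 3) -> addss/addsd to produce a NaN,
// otherwise (case 2) -> orps/orpd (min) or andps/andpd (max) to pick the right signed zero.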
// Jump instruction to handle 1) case by checking the ZF flag // as ZF is only set for 2) and 3) cases. nanFreeOrDiffJump := m.allocateInstr() m.insert(nanFreeOrDiffJump) // Start handling 2) and 3). // Jump if one of two values is NaN by checking the parity flag (PF). ifIsNan := m.allocateInstr() m.insert(ifIsNan) // Start handling 2) NaN-free and equal. // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is // returned if two values are positive and negative zeros. var op sseOpcode switch { case !_64 && isMin: op = sseOpcodeOrps case _64 && isMin: op = sseOpcodeOrpd case !_64 && !isMin: op = sseOpcodeAndps case _64 && !isMin: op = sseOpcodeAndpd } orAnd := m.allocateInstr() orAnd.asXmmRmR(op, rn, tmp) m.insert(orAnd) // Done, jump to end. sameExitJump := m.allocateInstr() m.insert(sameExitJump) // Start handling 3) either is NaN. isNanTarget, isNan := m.allocateBrTarget() m.insert(isNanTarget) ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) // We emit the ADD instruction to produce the NaN in tmp. add := m.allocateInstr() if _64 { add.asXmmRmR(sseOpcodeAddsd, rn, tmp) } else { add.asXmmRmR(sseOpcodeAddss, rn, tmp) } m.insert(add) // Exit from the NaN case branch. nanExitJmp := m.allocateInstr() m.insert(nanExitJmp) // Start handling 1). doMinMaxTarget, doMinMax := m.allocateBrTarget() m.insert(doMinMaxTarget) nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) // Now handle the NaN-free and different values case. minMax := m.allocateInstr() minMax.asXmmRmR(minMaxOp, rn, tmp) m.insert(minMax) endNop, end := m.allocateBrTarget() m.insert(endNop) nanExitJmp.asJmp(newOperandLabel(end)) sameExitJump.asJmp(newOperandLabel(end)) m.copyTo(tmp, rd) } func (m *machine) lowerFcopysign(instr *ssa.Instruction) { x, y := instr.Arg2() if !x.Type().IsFloat() { panic("BUG") } _64 := x.Type().Bits() == 64 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) rm := m.getOperand_Reg(xDef) rn := m.getOperand_Reg(yDef) rd := m.c.VRegOf(instr.Return()) // Clear the non-sign bits of src via AND with the mask. var opAnd, opOr sseOpcode var signMask uint64 if _64 { signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd } else { signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps } signBitReg := m.c.AllocateVReg(x.Type()) m.lowerFconst(signBitReg, signMask, _64) nonSignBitReg := m.c.AllocateVReg(x.Type()) m.lowerFconst(nonSignBitReg, ^signMask, _64) // Extract the sign bits of rn. and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) m.insert(and) // Clear the sign bit of dst via AND with the non-sign bit mask. xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) m.insert(xor) // Copy the sign bits of src to dst via OR. 
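	// For example, with 32-bit floats copysign(1.5, -2.0) works out as
	// (0x3FC00000 & 0x7FFFFFFF) | (0xC0000000 & 0x80000000) = 0x3FC00000 | 0x80000000
	// = 0xBFC00000, i.e. -1.5.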
or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) m.insert(or) m.copyTo(nonSignBitReg, rd) } func (m *machine) lowerBitcast(instr *ssa.Instruction) { x, dstTyp := instr.BitcastData() srcTyp := x.Type() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) switch { case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) m.insert(cvt) case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) m.insert(cvt) case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) m.insert(cvt) case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) m.insert(cvt) default: panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) } } func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { var tmpXmm regalloc.VReg if dst64 { tmpXmm = m.c.AllocateVReg(ssa.TypeF64) } else { tmpXmm = m.c.AllocateVReg(ssa.TypeF32) } m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) m.copyTo(tmpGp, rd) } func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() var cmpOp, truncOp sseOpcode if src64 { cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si } else { cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si } trunc := m.allocateInstr() trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) m.insert(trunc) // Check if the dst operand was INT_MIN, by checking it against 1. cmp1 := m.allocateInstr() cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) m.insert(cmp1) // If no overflow, then we are done. doneTarget, done := m.allocateBrTarget() ifNoOverflow := m.allocateInstr() ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) m.insert(ifNoOverflow) // Now, check for NaN. cmpNan := m.allocateInstr() cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) m.insert(cmpNan) // We allocate the "non-nan target" here, but we will insert it later. notNanTarget, notNaN := m.allocateBrTarget() ifNotNan := m.allocateInstr() ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) m.insert(ifNotNan) if sat { // If NaN and saturating, return 0. zeroDst := m.allocateInstr().asZeros(tmpGp) m.insert(zeroDst) jmpEnd := m.allocateInstr() jmpEnd.asJmp(newOperandLabel(done)) m.insert(jmpEnd) // Otherwise: m.insert(notNanTarget) // Zero-out the tmp register. zero := m.allocateInstr().asZeros(tmpXmm) m.insert(zero) cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) m.insert(cmpXmm) // if >= jump to end. jmpEnd2 := m.allocateInstr() jmpEnd2.asJmpIf(condB, newOperandLabel(done)) m.insert(jmpEnd2) // Otherwise, saturate to INT_MAX. if dst64 { m.lowerIconst(tmpGp, math.MaxInt64, dst64) } else { m.lowerIconst(tmpGp, math.MaxInt32, dst64) } } else { // If non-sat, NaN, trap. m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) // Otherwise, we will jump here. 
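	// The threshold constants loaded below are raw IEEE-754 bit patterns for the lower
	// bound of the in-range inputs, expressed in the source float format:
	// 0xc3e0000000000000 is float64(math.MinInt64), 0xC1E0_0000_0020_0000 is
	// float64(math.MinInt32) - 1 (hence the strict condNBE for that case),
	// 0xDF00_0000 is float32(math.MinInt64), and 0xCF00_0000 is float32(math.MinInt32).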
m.insert(notNanTarget) // jump over trap if src larger than threshold condAboveThreshold := condNB // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. var minInt uint64 switch { case src64 && dst64: minInt = 0xc3e0000000000000 case src64 && !dst64: condAboveThreshold = condNBE minInt = 0xC1E0_0000_0020_0000 case !src64 && dst64: minInt = 0xDF00_0000 case !src64 && !dst64: minInt = 0xCF00_0000 } loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) m.insert(loadToGP) movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) m.insert(movToXmm) cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) m.insert(cmpXmm) jmpIfLarger := m.allocateInstr() checkPositiveTarget, checkPositive := m.allocateBrTarget() jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) m.insert(jmpIfLarger) m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) // If positive, it was a real overflow. m.insert(checkPositiveTarget) // Zero out the temp register. xorpd := m.allocateInstr() xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) m.insert(xorpd) pos := m.allocateInstr() pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) m.insert(pos) // If >= jump to end. jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) m.insert(jmp) m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) } m.insert(doneTarget) } func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) m.insert(m.allocateFcvtToUintSequence( ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, )) m.copyTo(tmpGp, rd) } func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() var subOp, cmpOp, truncOp sseOpcode if src64 { subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si } else { subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si } doneTarget, done := m.allocateBrTarget() switch { case src64 && dst64: loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) m.insert(loadToGP) movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) m.insert(movToXmm) case src64 && !dst64: loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) m.insert(loadToGP) movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) m.insert(movToXmm) case !src64 && dst64: loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) m.insert(loadToGP) movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) m.insert(movToXmm) case !src64 && !dst64: loadToGP := m.allocateInstr().asImm(tmpGp, 0x4f000000, false) m.insert(loadToGP) movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) m.insert(movToXmm) } cmp := m.allocateInstr() cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) m.insert(cmp) // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` 
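	// At this point tmpXmm holds float(2^63) (or float(2^31) for 32-bit destinations).
	// Inputs below that threshold fit after a plain signed truncation; inputs at or above
	// it take the ifAboveThreshold path, which subtracts the threshold from the input,
	// truncates, and then adds 0x8000000000000000 (resp. 0x80000000) back to the integer
	// result at the end of the sequence.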
ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() jmpIfAboveThreshold := m.allocateInstr() jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) m.insert(jmpIfAboveThreshold) ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() jmpIfNotNaN := m.allocateInstr() jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) m.insert(jmpIfNotNaN) // If NaN, handle the error condition. if sat { // On NaN, saturating, we just return 0. zeros := m.allocateInstr().asZeros(tmpGp) m.insert(zeros) jmpEnd := m.allocateInstr() jmpEnd.asJmp(newOperandLabel(done)) m.insert(jmpEnd) } else { // On NaN, non-saturating, we trap. m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) } // If not NaN, land here. m.insert(ifNotNaNTarget) // Truncation happens here. trunc := m.allocateInstr() trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) m.insert(trunc) // Check if the result is negative. cmpNeg := m.allocateInstr() cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) m.insert(cmpNeg) // If non-neg, jump to end. jmpIfNonNeg := m.allocateInstr() jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) m.insert(jmpIfNonNeg) if sat { // If the input was "small" (< 2**(width -1)), the only way to get an integer // overflow is because the input was too small: saturate to the min value, i.e. 0. zeros := m.allocateInstr().asZeros(tmpGp) m.insert(zeros) jmpEnd := m.allocateInstr() jmpEnd.asJmp(newOperandLabel(done)) m.insert(jmpEnd) } else { // If not saturating, trap. m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) } // If above the threshold, land here. m.insert(ifAboveThresholdTarget) // tmpDiff := threshold - rn. copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) m.insert(copySrc) sub := m.allocateInstr() sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 m.insert(sub) trunc2 := m.allocateInstr() trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) m.insert(trunc2) // Check if the result is negative. cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) m.insert(cmpNeg2) ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() jmpIfNextLarge := m.allocateInstr() jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) m.insert(jmpIfNextLarge) if sat { // The input was "large" (>= maxInt), so the only way to get an integer // overflow is because the input was too large: saturate to the max value. var maxInt uint64 if dst64 { maxInt = math.MaxUint64 } else { maxInt = math.MaxUint32 } m.lowerIconst(tmpGp, maxInt, dst64) jmpToEnd := m.allocateInstr() jmpToEnd.asJmp(newOperandLabel(done)) m.insert(jmpToEnd) } else { // If not saturating, trap. m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) } m.insert(ifNextLargeTarget) var op operand if dst64 { m.lowerIconst(tmpGp2, 0x8000000000000000, true) op = newOperandReg(tmpGp2) } else { op = newOperandImm32(0x80000000) } add := m.allocateInstr() add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) m.insert(add) m.insert(doneTarget) } func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { var op sseOpcode if dst64 { op = sseOpcodeCvtsi2sd } else { op = sseOpcodeCvtsi2ss } trunc := m.allocateInstr() trunc.asGprToXmm(op, rn, rd.reg(), src64) m.insert(trunc) } func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { var op sseOpcode if dst64 { op = sseOpcodeCvtsi2sd } else { op = sseOpcodeCvtsi2ss } // Src is 32 bit, then we just perform the conversion with 64 bit width. 
// // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. // // Here's the summary: // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, // >> which allows CVTSI2SS to be used after all. // if !src64 { cvt := m.allocateInstr() cvt.asGprToXmm(op, rn, rd.reg(), true) m.insert(cvt) return } // If uint64, we have to do a bit more work. endTarget, end := m.allocateBrTarget() var tmpXmm regalloc.VReg if dst64 { tmpXmm = m.c.AllocateVReg(ssa.TypeF64) } else { tmpXmm = m.c.AllocateVReg(ssa.TypeF32) } // Check if the most significant bit (sign bit) is set. test := m.allocateInstr() test.asCmpRmiR(false, rn, rn.reg(), src64) m.insert(test) // Jump if the sign bit is set. ifSignTarget, ifSign := m.allocateBrTarget() jmpIfNeg := m.allocateInstr() jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) m.insert(jmpIfNeg) // If the sign bit is not set, we could fit the unsigned int into float32/float64. // So, we convert it to float and emit jump instruction to exit from this branch. cvt := m.allocateInstr() cvt.asGprToXmm(op, rn, tmpXmm, src64) m.insert(cvt) // We are done, jump to end. jmpEnd := m.allocateInstr() jmpEnd.asJmp(newOperandLabel(end)) m.insert(jmpEnd) // Now handling the case where sign-bit is set. // We emit the following sequences: // mov %rn, %tmp // shr 1, %tmp // mov %rn, %tmp2 // and 1, %tmp2 // or %tmp2, %tmp // cvtsi2ss %tmp, %xmm0 // addsd %xmm0, %xmm0 m.insert(ifSignTarget) tmp := m.copyToTmp(rn.reg()) shr := m.allocateInstr() shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) m.insert(shr) tmp2 := m.copyToTmp(rn.reg()) and := m.allocateInstr() and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) m.insert(and) or := m.allocateInstr() or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) m.insert(or) cvt2 := m.allocateInstr() cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) m.insert(cvt2) addsd := m.allocateInstr() if dst64 { addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) } else { addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) } m.insert(addsd) m.insert(endTarget) m.copyTo(tmpXmm, rd.reg()) } func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { x := instr.Arg() rm := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) tmp := m.c.AllocateVReg(ssa.TypeI32) cmp := m.allocateInstr() cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) m.insert(cmp) setcc := m.allocateInstr() setcc.asSetcc(condNZ, tmp) m.insert(setcc) // Clear the irrelevant bits. 
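	// PTEST sets ZF only when the AND of its operands is all zeros, so with the same
	// register on both sides condNZ means "some bit in the vector is set". SETcc writes
	// only the low byte of tmp, which is why the AND with 1 below masks off the upper
	// bits before the result is copied out.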
and := m.allocateInstr() and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) m.insert(and) m.copyTo(tmp, rd) } func (m *machine) lowerVallTrue(instr *ssa.Instruction) { x, lane := instr.ArgWithLane() var op sseOpcode switch lane { case ssa.VecLaneI8x16: op = sseOpcodePcmpeqb case ssa.VecLaneI16x8: op = sseOpcodePcmpeqw case ssa.VecLaneI32x4: op = sseOpcodePcmpeqd case ssa.VecLaneI64x2: op = sseOpcodePcmpeqq } rm := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) tmp := m.c.AllocateVReg(ssa.TypeV128) zeros := m.allocateInstr() zeros.asZeros(tmp) m.insert(zeros) pcmp := m.allocateInstr() pcmp.asXmmRmR(op, rm, tmp) m.insert(pcmp) test := m.allocateInstr() test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) m.insert(test) tmp2 := m.c.AllocateVReg(ssa.TypeI32) setcc := m.allocateInstr() setcc.asSetcc(condZ, tmp2) m.insert(setcc) // Clear the irrelevant bits. and := m.allocateInstr() and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) m.insert(and) m.copyTo(tmp2, rd) } func (m *machine) lowerVhighBits(instr *ssa.Instruction) { x, lane := instr.ArgWithLane() rm := m.getOperand_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) switch lane { case ssa.VecLaneI8x16: mov := m.allocateInstr() mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) m.insert(mov) case ssa.VecLaneI16x8: // When we have: // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] // where RX(wn) is n-th signed word (16-bit) of RX register, // // "PACKSSWB R1, R2" produces // R1 = [ // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), // ] // where R1 is the destination register, and // byte_sat(w) = int8(w) if w fits as signed 8-bit, // 0x80 if w is less than 0x80 // 0x7F if w is greater than 0x7f // // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. // // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). tmp := m.copyToTmp(rm.reg()) res := m.c.AllocateVReg(ssa.TypeI32) pak := m.allocateInstr() pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) m.insert(pak) mov := m.allocateInstr() mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) m.insert(mov) // Clear the higher bits than 8. shr := m.allocateInstr() shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) m.insert(shr) m.copyTo(res, rd) case ssa.VecLaneI32x4: mov := m.allocateInstr() mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) m.insert(mov) case ssa.VecLaneI64x2: mov := m.allocateInstr() mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) m.insert(mov) } } func (m *machine) lowerVbnot(instr *ssa.Instruction) { x := instr.Arg() xDef := m.c.ValueDefinition(x) rm := m.getOperand_Reg(xDef) rd := m.c.VRegOf(instr.Return()) tmp := m.copyToTmp(rm.reg()) tmp2 := m.c.AllocateVReg(ssa.TypeV128) // Ensure tmp2 is considered defined by regalloc. m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) // Set all bits on tmp register. pak := m.allocateInstr() pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) m.insert(pak) // Then XOR with tmp to reverse all bits on v.register. 
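	// PCMPEQD of a register with itself makes every lane compare equal, so tmp2 is now
	// all ones, and x ^ 0xFFFF... flips every bit of x; this is how NOT is expressed
	// without a dedicated vector-NOT instruction.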
xor := m.allocateInstr() xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) m.insert(xor) m.copyTo(tmp, rd) } func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { tmpDst := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) switch lane { case ssa.VecLaneI8x16: tmp := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) case ssa.VecLaneI16x8: xx := m.getOperand_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) case ssa.VecLaneI32x4: xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) case ssa.VecLaneI64x2: xx := m.getOperand_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) case ssa.VecLaneF32x4: xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) case ssa.VecLaneF64x2: xx := m.getOperand_Reg(m.c.ValueDefinition(x)) m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) default: panic(fmt.Sprintf("invalid lane type: %s", lane)) } m.copyTo(tmpDst, m.c.VRegOf(ret)) } func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { var xMask, yMask [2]uint64 for i := 0; i < 8; i++ { loLane := byte(lo >> (i * 8)) if loLane < 16 { xMask[0] |= uint64(loLane) << (i * 8) yMask[0] |= uint64(0x80) << (i * 8) } else { xMask[0] |= uint64(0x80) << (i * 8) yMask[0] |= uint64(loLane-16) << (i * 8) } hiLane := byte(hi >> (i * 8)) if hiLane < 16 { xMask[1] |= uint64(hiLane) << (i * 8) yMask[1] |= uint64(0x80) << (i * 8) } else { xMask[1] |= uint64(0x80) << (i * 8) yMask[1] |= uint64(hiLane-16) << (i * 8) } } xmaskLabel := m.allocateLabel() m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) ymaskLabel := m.allocateLabel() m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) // Apply mask to X. tmp := m.c.AllocateVReg(ssa.TypeV128) loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) m.insert(loadMaskLo) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) // Apply mask to Y. loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) m.insert(loadMaskHi) m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) // Combine the results. 
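	// PSHUFB zeroes a destination byte whenever bit 7 of its control byte is set and
	// otherwise selects the source byte indexed by the low four bits. Lanes taken from x
	// therefore keep their index in xMask and get 0x80 in yMask (and vice versa, with 16
	// subtracted for y's indices), so after the two shuffles each register holds only its
	// own lanes with zeros elsewhere, and a single OR merges them.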
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) m.copyTo(tmpY, m.c.VRegOf(ret)) } func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rm := m.getOperand_Reg(m.c.ValueDefinition(y)) rd := m.c.VRegOf(ret) tmp := m.copyToTmp(rn.reg()) binOp := m.allocateInstr() binOp.asXmmRmR(op, rm, tmp) m.insert(binOp) m.copyTo(tmp, rd) } func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) rd := m.c.VRegOf(ret) tmp := m.copyToTmp(rn.reg()) binOp := m.allocateInstr() binOp.asXmmRmR(op, rm, tmp) m.insert(binOp) m.copyTo(tmp, rd) } func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { var cmpOp sseOpcode switch lane { case ssa.VecLaneF32x4: cmpOp = sseOpcodeCmpps case ssa.VecLaneF64x2: cmpOp = sseOpcodeCmppd default: panic(fmt.Sprintf("invalid lane type: %s", lane)) } xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) var cmpImm cmpPred switch c { case ssa.FloatCmpCondGreaterThan: yy, xx = xx, yy cmpImm = cmpPredLT_OS case ssa.FloatCmpCondGreaterThanOrEqual: yy, xx = xx, yy cmpImm = cmpPredLE_OS case ssa.FloatCmpCondEqual: cmpImm = cmpPredEQ_OQ case ssa.FloatCmpCondNotEqual: cmpImm = cmpPredNEQ_UQ case ssa.FloatCmpCondLessThan: cmpImm = cmpPredLT_OS case ssa.FloatCmpCondLessThanOrEqual: cmpImm = cmpPredLE_OS default: panic(fmt.Sprintf("invalid float comparison condition: %s", c)) } tmp := m.c.AllocateVReg(ssa.TypeV128) xxx := m.getOperand_Mem_Reg(xx) m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) rm := m.getOperand_Mem_Reg(yy) m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp)) m.copyTo(tmp, m.c.VRegOf(ret)) } func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { var eq, gt, maxu, minu, mins sseOpcode switch lane { case ssa.VecLaneI8x16: eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb case ssa.VecLaneI16x8: eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw case ssa.VecLaneI32x4: eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd case ssa.VecLaneI64x2: eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq default: panic(fmt.Sprintf("invalid lane type: %s", lane)) } tmp := m.c.AllocateVReg(ssa.TypeV128) var op operand switch c { case ssa.IntegerCmpCondSignedLessThanOrEqual: if lane == ssa.VecLaneI64x2 { x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) // Copy x to tmp. m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) } else { y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) // Copy y to tmp. m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) } case ssa.IntegerCmpCondSignedGreaterThanOrEqual: if lane == ssa.VecLaneI64x2 { y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) // Copy y to tmp. m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) } else { x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) // Copy x to tmp. 
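	// (Which value is copied into tmp decides which side of the identity gets computed
	// later: x >=s y <=> mins(x, y) == y, x <=s y <=> mins(x, y) == x, and similarly with
	// minu/maxu for the unsigned variants; I64x2 has no packed min/max, so it uses
	// pcmpgtq and flips the bits of the result instead.)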
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) } case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) // Copy y to tmp. m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) default: x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) // Copy x to tmp. m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) } switch c { case ssa.IntegerCmpCondEqual: m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) case ssa.IntegerCmpCondNotEqual: // First we compare for equality. m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) // Then flip the bits. To do so, we set all bits on tmp2. tmp2 := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) // And then xor with tmp. m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: if lane == ssa.VecLaneI64x2 { m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) // Then flip the bits. To do so, we set all bits on tmp2. tmp2 := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) // And then xor with tmp. m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) } else { // First take min of x and y. m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) // Then compare for equality. m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) } case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: // First maxu of x and y. m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) // Then compare for equality. m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) // Then flip the bits. To do so, we set all bits on tmp2. tmp2 := m.c.AllocateVReg(ssa.TypeV128) m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) // And then xor with tmp. m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) default: panic("BUG") } m.copyTo(tmp, m.c.VRegOf(ret)) } func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { x, y := instr.Arg2() xDef := m.c.ValueDefinition(x) yDef := m.c.ValueDefinition(y) rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) rd := m.c.VRegOf(instr.Return()) tmp := m.copyToTmp(rn.reg()) // pandn between rn, rm. 
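	// PANDN computes dst = ^dst & src, so with the copy of y in tmp and x in rm this
	// yields x &^ y, matching the band-not semantics; that is why y, not x, is the
	// operand copied into tmp.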
	pand := m.allocateInstr()
	pand.asXmmRmR(sseOpcodePandn, rm, tmp)
	m.insert(pand)
	m.copyTo(tmp, rd)
}

func (m *machine) lowerVbitselect(instr *ssa.Instruction) {
	c, x, y := instr.SelectData()
	xDef := m.c.ValueDefinition(x)
	yDef := m.c.ValueDefinition(y)
	rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef)
	creg := m.getOperand_Reg(m.c.ValueDefinition(c))
	rd := m.c.VRegOf(instr.Return())

	tmpC := m.copyToTmp(creg.reg())
	tmpX := m.copyToTmp(rm.reg())

	// And between c, x (overwrites x).
	pand := m.allocateInstr()
	pand.asXmmRmR(sseOpcodePand, creg, tmpX)
	m.insert(pand)

	// Andn between y, c (overwrites c).
	pandn := m.allocateInstr()
	pandn.asXmmRmR(sseOpcodePandn, rn, tmpC)
	m.insert(pandn)

	por := m.allocateInstr()
	por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX)
	m.insert(por)

	m.copyTo(tmpX, rd)
}

func (m *machine) lowerVFmin(instr *ssa.Instruction) {
	x, y, lane := instr.Arg2WithLane()
	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
	rm := m.getOperand_Reg(m.c.ValueDefinition(y))
	rd := m.c.VRegOf(instr.Return())

	var min, cmp, andn, or, srl /* shift right logical */ sseOpcode
	var shiftNumToInverseNaN uint32
	if lane == ssa.VecLaneF32x4 {
		min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa
	} else {
		min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd
	}

	tmp1 := m.copyToTmp(rn.reg())
	tmp2 := m.copyToTmp(rm.reg())

	// tmp2 = min(rm, rn): ties and NaNs resolve to the second operand (rn).
	minIns1 := m.allocateInstr()
	minIns1.asXmmRmR(min, rn, tmp2)
	m.insert(minIns1)

	// tmp1 = min(rn, rm): ties and NaNs resolve to the second operand (rm).
	minIns2 := m.allocateInstr()
	minIns2.asXmmRmR(min, rm, tmp1)
	m.insert(minIns2)

	// tmp3 := tmp1 = min(rn, rm)
	tmp3 := m.copyToTmp(tmp1)

	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm != NaN
	//        NaN         if rn == NaN || rm == NaN
	//        min(rn, rm) otherwise
	orIns := m.allocateInstr()
	orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1)
	m.insert(orIns)

	// tmp3 is originally min(rn,rm).
// tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN // 0 otherwise cmpIns := m.allocateInstr() cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) m.insert(cmpIns) // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN // ^0 if rn == NaN || rm == NaN // min(v1, v2) otherwise orIns2 := m.allocateInstr() orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) m.insert(orIns2) // tmp3 = set all bits on the mantissa bits // 0 otherwise shift := m.allocateInstr() shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) m.insert(shift) // tmp3 = tmp1 and !tmp3 // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN // min(rn, rm) otherwise andnIns := m.allocateInstr() andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) m.insert(andnIns) m.copyTo(tmp3, rd) } func (m *machine) lowerVFmax(instr *ssa.Instruction) { x, y, lane := instr.Arg2WithLane() rn := m.getOperand_Reg(m.c.ValueDefinition(x)) rm := m.getOperand_Reg(m.c.ValueDefinition(y)) rd := m.c.VRegOf(instr.Return()) var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode var shiftNumToInverseNaN uint32 if lane == ssa.VecLaneF32x4 { max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa } else { max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd } tmp0 := m.copyToTmp(rm.reg()) tmp1 := m.copyToTmp(rn.reg()) // tmp0=max(rn, rm) maxIns1 := m.allocateInstr() maxIns1.asXmmRmR(max, rn, tmp0) m.insert(maxIns1) // tmp1=max(rm, rn) maxIns2 := m.allocateInstr() maxIns2.asXmmRmR(max, rm, tmp1) m.insert(maxIns2) // tmp2=max(rm, rn) tmp2 := m.copyToTmp(tmp1) // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) // 0 if (rn == 0 && rm == 0) // -0 if (rn == -0 && rm == -0) // v1^v2 if rn == NaN || rm == NaN // 0 otherwise xorInstr := m.allocateInstr() xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) m.insert(xorInstr) // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) // 0 if (rn == 0 && rm == 0) // -0 if (rn == -0 && rm == -0) // NaN if rn == NaN || rm == NaN // max(v1, v2) otherwise orInstr := m.allocateInstr() orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) m.insert(orInstr) tmp3 := m.copyToTmp(tmp1) // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) // -0 if (rn == -0 && rm == -0) // NaN if rn == NaN || rm == NaN // max(v1, v2) otherwise // // Note: -0 - (-0) = 0 (!= -0) in floating point operation. subIns := m.allocateInstr() subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) m.insert(subIns) // tmp1 = 0^ if rn == NaN || rm == NaN cmpIns := m.allocateInstr() cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) m.insert(cmpIns) // tmp1 = set all bits on the mantissa bits // 0 otherwise shift := m.allocateInstr() shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) m.insert(shift) andnIns := m.allocateInstr() andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) m.insert(andnIns) m.copyTo(tmp1, rd) } func (m *machine) lowerVFabs(instr *ssa.Instruction) { x, lane := instr.ArgWithLane() rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) rd := m.c.VRegOf(instr.Return()) tmp := m.c.AllocateVReg(ssa.TypeV128) def := m.allocateInstr() def.asDefineUninitializedReg(tmp) m.insert(def) // Set all bits on tmp. 
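	// All ones shifted right by one per lane yields 0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF, i.e.
	// a mask with only the sign bit clear; ANDing the input with it computes |x| lane-wise.
	// For example, float32 -3.0 is 0xC0400000, and 0xC0400000 & 0x7FFFFFFF = 0x40400000 = 3.0.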
	pcmp := m.allocateInstr()
	pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)
	m.insert(pcmp)

	switch lane {
	case ssa.VecLaneF32x4:
		// Shift right packed single-precision floats by 1 to clear the sign bits.
		shift := m.allocateInstr()
		shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp)
		m.insert(shift)
		// Clear the sign bit of rm.
		andp := m.allocateInstr()
		andp.asXmmRmR(sseOpcodeAndpd, rm, tmp)
		m.insert(andp)
	case ssa.VecLaneF64x2:
		// Shift right packed double-precision floats by 1 to clear the sign bits.
		shift := m.allocateInstr()
		shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp)
		m.insert(shift)
		// Clear the sign bit of rm.
		andp := m.allocateInstr()
		andp.asXmmRmR(sseOpcodeAndps, rm, tmp)
		m.insert(andp)
	}

	m.copyTo(tmp, rd)
}
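
// The sign-bit-set path in lowerFcvtFromUint above is easier to follow as scalar code.
// The helper below is an illustrative sketch only: it is not referenced by the backend
// (and Go can convert uint64 to float64 directly); it just mirrors the emitted sequence:
// halve the value while folding the dropped bit back in so rounding still sees it,
// convert as a signed integer, then double the float. Round-to-nearest-even still
// produces the correctly rounded result because of that sticky low bit.
func uint64ToFloat64Sketch(v uint64) float64 {
	if v>>63 == 0 {
		// The value fits in a signed 64-bit integer, so CVTSI2SD alone is enough.
		return float64(int64(v))
	}
	half := v>>1 | v&1        // shr 1 / and 1 / or: keep the low bit "sticky" for rounding.
	f := float64(int64(half)) // cvtsi2sd: half is non-negative as a signed integer.
	return f + f              // addsd: undo the halving.
}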