package arm64
import (
"fmt"
"math"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
// instruction represents either a real instruction in arm64, or the meta instructions
// that are convenient for code generation. For example, inline constants are also treated
// as instructions.
//
// Each instruction knows how to encode itself into machine code; hence, the final output of compilation
// can be considered equivalent to a sequence of these instructions.
//
// Each field is interpreted depending on the kind.
//
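// For example, the lowering of a 64-bit integer add constructs an instruction roughly as
//
//	i.asALU(aluOpAdd, rd, rn, rm, true)
//
// after which it can be printed via String and measured via size. (This is a sketch of
// typical usage; rd, rn, and rm are operands wrapping registers.)
//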
// TODO: optimize the layout later once the impl settles.
instruction struct {
kind instructionKind
prev, next *instruction
u1, u2, u3 uint64
rd, rm, rn, ra operand
amode addressMode
abi *abiImpl
addedAfterLowering bool
}
// instructionKind represents the kind of instruction.
// This controls how the instruction struct is interpreted.
instructionKind int
)
type defKind byte
const (
defKindNone defKind = iota + 1
defKindRD
defKindCall
)
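// defKinds maps each instructionKind to its defKind, i.e. how an instruction of that
// kind defines (writes) registers; it drives defs and assignDef below.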
var defKinds = [numInstructionKinds]defKind{
aluRRR: defKindRD,
aluRRRR: defKindRD,
aluRRImm12: defKindRD,
aluRRBitmaskImm: defKindRD,
aluRRRShift: defKindRD,
aluRRImmShift: defKindRD,
aluRRRExtend: defKindRD,
movZ: defKindRD,
movN: defKindRD,
mov32: defKindRD,
mov64: defKindRD,
fpuMov64: defKindRD,
fpuMov128: defKindRD,
fpuRRR: defKindRD,
nop0: defKindNone,
call: defKindCall,
callInd: defKindCall,
ret: defKindNone,
store32: defKindNone,
store64: defKindNone,
exitSequence: defKindNone,
condBr: defKindNone,
br: defKindNone,
cSet: defKindRD,
extend: defKindRD,
fpuCmp: defKindNone,
uLoad8: defKindRD,
uLoad16: defKindRD,
uLoad32: defKindRD,
sLoad8: defKindRD,
sLoad16: defKindRD,
sLoad32: defKindRD,
uLoad64: defKindRD,
fpuLoad32: defKindRD,
fpuLoad64: defKindRD,
fpuLoad128: defKindRD,
loadFpuConst32: defKindRD,
loadFpuConst64: defKindRD,
fpuStore32: defKindNone,
fpuStore64: defKindNone,
fpuStore128: defKindNone,
udf: defKindNone,
}
// defs returns the list of regalloc.VReg that are defined by the instruction.
// To reduce allocations, the caller can pass a slice to which the results are appended.
func (i *instruction) defs(regs []regalloc.VReg) []regalloc.VReg {
switch defKinds[i.kind] {
case defKindNone:
case defKindRD:
regs = append(regs, i.rd.nr())
case defKindCall:
regs = append(regs, i.abi.retRealRegs...)
default:
panic(fmt.Sprintf("defKind for %v not defined", i))
}
return regs
}
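// assignDef assigns the given register to the destination (def) operand of this instruction.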
func (i *instruction) assignDef(reg regalloc.VReg) {
switch defKinds[i.kind] {
case defKindNone:
case defKindRD:
i.rd = i.rd.assignReg(reg)
case defKindCall:
panic("BUG: call instructions shouldn't be assigned")
default:
panic(fmt.Sprintf("defKind for %v not defined", i))
}
}
type useKind byte
const (
useKindNone useKind = iota + 1
useKindRN
useKindRNRM
useKindRNRMRA
useKindRet
useKindCall
useKindCallInd
useKindAMode
useKindRNAMode
useKindCond
)
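// useKinds maps each instructionKind to its useKind, i.e. which operands an instruction
// of that kind reads; it drives uses and assignUses below.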
var useKinds = [numInstructionKinds]useKind{
udf: useKindNone,
aluRRR: useKindRNRM,
aluRRRR: useKindRNRMRA,
aluRRImm12: useKindRN,
aluRRBitmaskImm: useKindRN,
aluRRRShift: useKindRNRM,
aluRRImmShift: useKindRN,
aluRRRExtend: useKindRNRM,
movZ: useKindNone,
movN: useKindNone,
mov32: useKindRN,
mov64: useKindRN,
fpuMov64: useKindRN,
fpuMov128: useKindRN,
fpuRRR: useKindRNRM,
nop0: useKindNone,
call: useKindCall,
callInd: useKindCallInd,
ret: useKindRet,
store32: useKindRNAMode,
store64: useKindRNAMode,
exitSequence: useKindRN,
condBr: useKindCond,
br: useKindNone,
cSet: useKindNone,
extend: useKindRN,
fpuCmp: useKindRNRM,
uLoad8: useKindAMode,
uLoad16: useKindAMode,
uLoad32: useKindAMode,
sLoad8: useKindAMode,
sLoad16: useKindAMode,
sLoad32: useKindAMode,
uLoad64: useKindAMode,
fpuLoad32: useKindAMode,
fpuLoad64: useKindAMode,
fpuLoad128: useKindAMode,
fpuStore32: useKindRNAMode,
fpuStore64: useKindRNAMode,
fpuStore128: useKindRNAMode,
loadFpuConst32: useKindNone,
loadFpuConst64: useKindNone,
}
// uses returns the list of regalloc.VReg that are used by the instruction.
// To reduce allocations, the caller can pass a slice to which the results are appended.
func (i *instruction) uses(regs []regalloc.VReg) []regalloc.VReg {
switch useKinds[i.kind] {
case useKindNone:
case useKindRN:
if rn := i.rn.reg(); rn.Valid() {
regs = append(regs, rn)
}
case useKindRNRM:
if rn := i.rn.reg(); rn.Valid() {
regs = append(regs, rn)
}
if rm := i.rm.reg(); rm.Valid() {
regs = append(regs, rm)
}
case useKindRNRMRA:
if rn := i.rn.reg(); rn.Valid() {
regs = append(regs, rn)
}
if rm := i.rm.reg(); rm.Valid() {
regs = append(regs, rm)
}
if ra := i.ra.reg(); ra.Valid() {
regs = append(regs, ra)
}
case useKindRet:
regs = append(regs, i.abi.retRealRegs...)
case useKindAMode:
if amodeRN := i.amode.rn; amodeRN.Valid() {
regs = append(regs, amodeRN)
}
if amodeRM := i.amode.rm; amodeRM.Valid() {
regs = append(regs, amodeRM)
}
case useKindRNAMode:
regs = append(regs, i.rn.reg())
if amodeRN := i.amode.rn; amodeRN.Valid() {
regs = append(regs, amodeRN)
}
if amodeRM := i.amode.rm; amodeRM.Valid() {
regs = append(regs, amodeRM)
}
case useKindCond:
cnd := cond(i.u1)
if cnd.kind() != condKindCondFlagSet {
regs = append(regs, cnd.register())
}
case useKindCall:
regs = append(regs, i.abi.argRealRegs...)
case useKindCallInd:
regs = append(regs, i.rn.nr())
regs = append(regs, i.abi.argRealRegs...)
default:
panic(fmt.Sprintf("useKind for %v not defined", i))
}
return regs
}
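// assignUses assigns the given registers to the use operands of this instruction,
// in the same order as uses returns them. A sketch of the expected flow:
//
//	vs := i.uses(nil) // virtual registers used by i
//	// ... the register allocator maps vs to real registers rs ...
//	i.assignUses(rs)  // patch i to refer to the real registers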
func (i *instruction) assignUses(regs []regalloc.VReg) {
switch useKinds[i.kind] {
case useKindNone:
case useKindRN:
if rn := i.rn.reg(); rn.Valid() {
i.rn = i.rn.assignReg(regs[0])
}
case useKindRNRM:
if rn := i.rn.reg(); rn.Valid() {
i.rn = i.rn.assignReg(regs[0])
}
if rm := i.rm.reg(); rm.Valid() {
i.rm = i.rm.assignReg(regs[1])
}
case useKindRNRMRA:
if rn := i.rn.reg(); rn.Valid() {
i.rn = i.rn.assignReg(regs[0])
}
if rm := i.rm.reg(); rm.Valid() {
i.rm = i.rm.assignReg(regs[1])
}
if ra := i.ra.reg(); ra.Valid() {
i.ra = i.ra.assignReg(regs[2])
}
case useKindRet:
panic("BUG: ret instructions shouldn't be assigned")
case useKindAMode:
if amodeRN := i.amode.rn; amodeRN.Valid() {
i.amode.rn = regs[0]
}
if amodeRM := i.amode.rm; amodeRM.Valid() {
i.amode.rm = regs[1]
}
case useKindRNAMode:
i.rn = i.rn.assignReg(regs[0])
if amodeRN := i.amode.rn; amodeRN.Valid() {
i.amode.rn = regs[1]
}
if amodeRM := i.amode.rm; amodeRM.Valid() {
i.amode.rm = regs[2]
}
case useKindCond:
c := cond(i.u1)
switch c.kind() {
case condKindRegisterZero:
i.u1 = uint64(registerAsRegZeroCond(regs[0]))
case condKindRegisterNotZero:
i.u1 = uint64(registerAsRegNotZeroCond(regs[0]))
}
case useKindCall:
panic("BUG: call instructions shouldn't be assigned")
case useKindCallInd:
i.rn = i.rn.assignReg(regs[0])
default:
panic(fmt.Sprintf("useKind for %v not defined", i))
}
}
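// asCall encodes this instruction as a direct call to the function identified by ref;
// abi supplies the argument/return registers.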
func (i *instruction) asCall(ref ssa.FuncRef, abi *abiImpl) {
i.kind = call
i.u1 = uint64(ref)
i.abi = abi
}
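// asCallImm encodes this instruction as a direct call to the given immediate offset,
// stored in u2 (see the call case in String).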
func (i *instruction) asCallImm(imm int64) {
i.kind = call
i.u2 = uint64(imm)
}
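// asCallIndirect encodes this instruction as an indirect call through the function
// pointer held in ptr; abi supplies the argument/return registers.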
func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *abiImpl) {
i.kind = callInd
i.rn = operandNR(ptr)
i.abi = abi
}
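// callFuncRef returns the ssa.FuncRef of the call target of this instruction.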
func (i *instruction) callFuncRef() ssa.FuncRef {
return ssa.FuncRef(i.u1)
}
// shift is the shift amount divided by 16 (i.e. in units of 16 bits); it must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false).
func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
i.kind = movZ
i.rd = operandNR(dst)
i.u1 = imm
i.u2 = shift
if dst64bit {
i.u3 = 1
}
}
// shift is the shift amount divided by 16 (i.e. in units of 16 bits); it must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false).
func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
i.kind = movK
i.rd = operandNR(dst)
i.u1 = imm
i.u2 = shift
if dst64bit {
i.u3 = 1
}
}
// shift is the shift amount divided by 16 (i.e. in units of 16 bits); it must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false).
func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
i.kind = movN
i.rd = operandNR(dst)
i.u1 = imm
i.u2 = shift
if dst64bit {
i.u3 = 1
}
}
func (i *instruction) asNop0() {
i.kind = nop0
}
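// asRet encodes this instruction as a return; abi supplies the return registers
// that are live at this point.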
func (i *instruction) asRet(abi *abiImpl) {
i.kind = ret
i.abi = abi
}
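// asStorePair64 encodes this instruction as a 64-bit store-pair (stp) of src1 and src2
// to the address computed by amode.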
func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
i.kind = storeP64
i.rn = operandNR(src1)
i.rm = operandNR(src2)
i.amode = amode
}
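// asLoadPair64 encodes this instruction as a 64-bit load-pair (ldp); despite the
// parameter names, src1 and src2 are the destination registers.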
func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
i.kind = loadP64
i.rn = operandNR(src1)
i.rm = operandNR(src2)
i.amode = amode
}
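// asStore encodes this instruction as a store of src to the address computed by amode;
// sizeInBits selects the width, and the register type of src selects between integer
// and FP stores.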
func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = store8
case 16:
i.kind = store16
case 32:
if src.reg().RegType() == regalloc.RegTypeInt {
i.kind = store32
} else {
i.kind = fpuStore32
}
case 64:
if src.reg().RegType() == regalloc.RegTypeInt {
i.kind = store64
} else {
i.kind = fpuStore64
}
case 128:
i.kind = fpuStore128
}
i.rn = src
i.amode = amode
}
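// asSLoad encodes this instruction as a sign-extending load of sizeInBits (8, 16, or 32) into dst.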
func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = sLoad8
case 16:
i.kind = sLoad16
case 32:
i.kind = sLoad32
default:
panic("BUG")
}
i.rd = dst
i.amode = amode
}
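// asULoad encodes this instruction as a zero-extending load of sizeInBits (8, 16, 32, or 64) into dst.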
func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = uLoad8
case 16:
i.kind = uLoad16
case 32:
i.kind = uLoad32
case 64:
i.kind = uLoad64
}
i.rd = dst
i.amode = amode
}
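// asFpuLoad encodes this instruction as an FP/vector load of sizeInBits (32, 64, or 128) into dst.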
func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
switch sizeInBits {
case 32:
i.kind = fpuLoad32
case 64:
i.kind = fpuLoad64
case 128:
i.kind = fpuLoad128
}
i.rd = dst
i.amode = amode
}
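// asCSet encodes this instruction as a conditional set (cset) of rd on the condition flag c.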
func (i *instruction) asCSet(rd regalloc.VReg, c condFlag) {
i.kind = cSet
i.rd = operandNR(rd)
i.u1 = uint64(c)
}
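// asBr encodes this instruction as an unconditional branch to target.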
func (i *instruction) asBr(target label) {
if target == returnLabel {
panic("BUG: call site should special case for returnLabel")
}
i.kind = br
i.u1 = uint64(target)
}
func (i *instruction) brLabel() label {
return label(i.u1)
}
// brOffsetResolved is called when the target label is resolved.
func (i *instruction) brOffsetResolved(offset int64) {
i.u2 = uint64(offset)
i.u3 = 1 // indicate that the offset is resolved, for debugging.
}
func (i *instruction) brOffset() int64 {
return int64(i.u2)
}
// asCondBr encodes a conditional branch instruction. is64bit is only relevant when cond is a register condition (cbz/cbnz) rather than a flag condition.
func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
i.kind = condBr
i.u1 = c.asUint64()
i.u2 = uint64(target)
if is64bit {
i.u3 = 1
}
}
func (i *instruction) condBrLabel() label {
return label(i.u2)
}
// condBrOffsetResolve is called when the target label is resolved.
func (i *instruction) condBrOffsetResolve(offset int64) {
i.rd.data = uint64(offset)
i.rd.data2 = 1 // indicate that the offset is resolved, for debugging.
}
// condBrOffsetResolved returns true if condBrOffsetResolve is already called.
func (i *instruction) condBrOffsetResolved() bool {
return i.rd.data2 == 1
}
func (i *instruction) condBrOffset() int64 {
return int64(i.rd.data)
}
func (i *instruction) condBrCond() cond {
return cond(i.u1)
}
func (i *instruction) condBr64bit() bool {
return i.u3 == 1
}
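// asLoadFpuConst32 encodes this instruction as a load of the given raw 32 bits into rd,
// materialized as a PC-relative literal (see the String and size implementations).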
func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) {
i.kind = loadFpuConst32
i.u1 = raw
i.rd = operandNR(rd)
}
func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) {
i.kind = loadFpuConst64
i.u1 = raw
i.rd = operandNR(rd)
}
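// asFpuCmp encodes this instruction as a floating-point compare (fcmp) of rn and rm.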
func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) {
i.kind = fpuCmp
i.rn, i.rm = rn, rm
if is64bit {
i.u1 = 1
}
}
// asALU sets up a basic ALU instruction, choosing the exact kind from the operand kind of rm.
func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
switch rm.kind {
case operandKindNR:
i.kind = aluRRR
case operandKindSR:
i.kind = aluRRRShift
case operandKindER:
i.kind = aluRRRExtend
case operandKindImm12:
i.kind = aluRRImm12
default:
panic("BUG")
}
i.u1 = uint64(aluOp)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
i.u3 = 1
}
}
// asALURRRR sets up an ALU instruction with three register sources and a register destination.
func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) {
i.kind = aluRRRR
i.u1 = uint64(aluOp)
i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra
if dst64bit {
i.u3 = 1
}
}
// asALUShift sets up a shift-based ALU instruction.
func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
switch rm.kind {
case operandKindNR:
i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands.
case operandKindShiftImm:
i.kind = aluRRImmShift
default:
panic("BUG")
}
i.u1 = uint64(aluOp)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
i.u3 = 1
}
}
func (i *instruction) asALUBitmaskImm(aluOp aluOp, rn, rd regalloc.VReg, imm uint64, dst64bit bool) {
i.kind = aluRRBitmaskImm
i.u1 = uint64(aluOp)
i.rn, i.rd = operandNR(rn), operandNR(rd)
i.u2 = imm
if dst64bit {
i.u3 = 1
}
}
func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) {
i.kind = fpuRRR
i.u1 = uint64(op)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
i.u3 = 1
}
}
func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) {
i.kind = extend
i.rn, i.rd = operandNR(rn), operandNR(rd)
i.u1 = uint64(fromBits)
i.u2 = uint64(toBits)
if signed {
i.u3 = 1
}
}
func (i *instruction) asMove32(rd, rn regalloc.VReg) {
i.kind = mov32
i.rn, i.rd = operandNR(rn), operandNR(rd)
}
func (i *instruction) asMove64(rd, rn regalloc.VReg) {
i.kind = mov64
i.rn, i.rd = operandNR(rn), operandNR(rd)
}
func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) {
i.kind = fpuMov64
i.rn, i.rd = operandNR(rn), operandNR(rd)
}
func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) {
i.kind = fpuMov128
i.rn, i.rd = operandNR(rn), operandNR(rd)
}
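// isCopy returns true if this instruction is a plain register-to-register move.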
func (i *instruction) isCopy() bool {
op := i.kind
return op == mov64 || op == mov32 || op == fpuMov64 || op == fpuMov128
}
// String implements fmt.Stringer.
func (i *instruction) String() (str string) {
is64SizeBitToSize := func(u3 uint64) byte {
if u3 == 0 {
return 32
}
return 64
}
switch i.kind {
case nop0:
str = "nop0"
case nop4:
panic("TODO")
case aluRRR:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
case aluRRRR:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size))
case aluRRImm12:
size := is64SizeBitToSize(i.u3)
v, shiftBit := i.rm.imm12()
if shiftBit == 1 {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), uint64(v)<<12)
} else {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), v)
}
case aluRRBitmaskImm:
size := is64SizeBitToSize(i.u3)
rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size)
if size == 32 {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2))
} else {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2)
}
case aluRRImmShift:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %#x",
aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size),
formatVRegSized(i.rn.nr(), size),
i.rm.shiftImm(),
)
case aluRRRShift:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %s",
aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size),
formatVRegSized(i.rn.nr(), size),
formatVRegSized(i.rm.nr(), size),
)
case aluRRRExtend:
rm, e, _ := i.rm.er()
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %s %s", aluOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size),
formatVRegSized(i.rn.nr(), size),
formatVRegSized(rm, e.srcBits()),
e,
)
case bitRR:
panic("TODO")
case uLoad8:
str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case sLoad8:
str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case uLoad16:
str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case sLoad16:
str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case uLoad32:
str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case sLoad32:
str = fmt.Sprintf("ldrsw %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(32))
case uLoad64:
str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
case store8:
panic("TODO")
case store16:
panic("TODO")
case store32:
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32))
case store64:
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
case storeP64:
str = fmt.Sprintf("stp %s, %s, %s",
formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
case loadP64:
str = fmt.Sprintf("ldp %s, %s, %s",
formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
case mov64:
str = fmt.Sprintf("mov %s, %s",
formatVRegSized(i.rd.nr(), 64),
formatVRegSized(i.rn.nr(), 64))
case mov32:
str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32))
case movZ:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("movz %s, #%#x, LSL %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
case movN:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("movn %s, #%#x, LSL %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
case movK:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("movk %s, #%#x, LSL %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
case extend:
fromBits, toBits := byte(i.u1), byte(i.u2)
var signedStr string
if i.u3 == 1 {
signedStr = "s"
} else {
signedStr = "u"
}
var fromStr string
switch fromBits {
case 8:
fromStr = "b"
case 16:
fromStr = "h"
case 32:
fromStr = "w"
}
str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32))
case cSel:
panic("TODO")
case cSet:
str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
case cCmpImm:
panic("TODO")
case fpuMov64:
str = fmt.Sprintf("mov %s.8b, %s.8b", formatVRegSized(i.rd.nr(), 128), formatVRegSized(i.rn.nr(), 128))
case fpuMov128:
str = fmt.Sprintf("mov %s.16b, %s.16b", formatVRegSized(i.rd.nr(), 128), formatVRegSized(i.rn.nr(), 128))
case fpuMovFromVec:
panic("TODO")
case fpuRR:
panic("TODO")
case fpuRRR:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(),
formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
case fpuRRI:
panic("TODO")
case fpuRRRR:
panic("TODO")
case fpuCmp:
size := is64SizeBitToSize(i.u3)
str = fmt.Sprintf("fcmp %s, %s",
formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
case fpuLoad32:
str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
case fpuStore32:
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64))
case fpuLoad64:
str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
case fpuStore64:
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
case fpuLoad128:
str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64))
case fpuStore128:
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64))
case loadFpuConst32:
str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1)))
case loadFpuConst64:
str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1))
case loadFpuConst128:
panic("TODO")
case fpuToInt:
panic("TODO")
case intToFpu:
panic("TODO")
case fpuCSel32:
panic("TODO")
case fpuCSel64:
panic("TODO")
case fpuRound:
panic("TODO")
case movToFpu:
panic("TODO")
case movToVec:
panic("TODO")
case movFromVec:
panic("TODO")
case movFromVecSigned:
panic("TODO")
case vecDup:
panic("TODO")
case vecDupFromFpu:
panic("TODO")
case vecExtend:
panic("TODO")
case vecMovElement:
panic("TODO")
case vecMiscNarrow:
panic("TODO")
case vecRRR:
panic("TODO")
case vecMisc:
panic("TODO")
case vecLanes:
panic("TODO")
case vecTbl:
panic("TODO")
case vecTbl2:
panic("TODO")
case movToNZCV:
panic("TODO")
case movFromNZCV:
panic("TODO")
case call:
if i.u2 > 0 {
str = fmt.Sprintf("bl #%#x", i.u2)
} else {
str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1))
}
case callInd:
str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 32))
case ret:
str = "ret"
case epiloguePlaceholder:
panic("TODO")
case br:
target := label(i.u1)
if i.u3 != 0 {
str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String())
} else {
str = fmt.Sprintf("b %s", target.String())
}
case condBr:
size := is64SizeBitToSize(i.u3)
c := cond(i.u1)
target := label(i.u2)
switch c.kind() {
case condKindRegisterZero:
if !i.condBrOffsetResolved() {
str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String())
} else {
str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String())
}
case condKindRegisterNotZero:
if offset := i.condBrOffset(); offset != 0 {
str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String())
} else {
str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String())
}
case condKindCondFlagSet:
if offset := i.condBrOffset(); offset != 0 {
if target == invalidLabel {
str = fmt.Sprintf("b.%s #%#x", c.flag(), offset)
} else {
str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String())
}
} else {
str = fmt.Sprintf("b.%s %s", c.flag(), target.String())
}
}
case indirectBr:
panic("TODO")
case adr:
str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1))
case word4:
panic("TODO")
case word8:
panic("TODO")
case jtSequence:
panic("TODO")
case loadAddr:
panic("TODO")
case exitSequence:
str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64))
case udf:
str = "udf"
default:
panic(i.kind)
}
return
}
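// asAdr encodes this instruction as an adr that computes the PC-relative address at the given offset into rd.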
func (i *instruction) asAdr(rd regalloc.VReg, offset int64) {
i.kind = adr
i.rd = operandNR(rd)
i.u1 = uint64(offset)
}
// TODO: delete unnecessary things.
const (
// nop0 represents a no-op of zero size.
nop0 instructionKind = iota + 1
// nop4 represents a no-op that is one instruction large.
nop4
// aluRRR represents an ALU operation with two register sources and a register destination.
aluRRR
// aluRRRR represents an ALU operation with three register sources and a register destination.
aluRRRR
// aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination.
aluRRImm12
// aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination.
aluRRBitmaskImm
// aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination.
aluRRImmShift
// aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination.
aluRRRShift
// aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination.
aluRRRExtend
// bitRR represents a bit op instruction with a single register source.
bitRR
// uLoad8 represents an unsigned 8-bit load.
uLoad8
// sLoad8 represents a signed 8-bit load into a 64-bit register.
sLoad8
// uLoad16 represents an unsigned 16-bit load into a 64-bit register.
uLoad16
// sLoad16 represents a signed 16-bit load into a 64-bit register.
sLoad16
// uLoad32 represents an unsigned 32-bit load into a 64-bit register.
uLoad32
// sLoad32 represents a signed 32-bit load into a 64-bit register.
sLoad32
// uLoad64 represents a 64-bit load.
uLoad64
// store8 represents an 8-bit store.
store8
// store16 represents a 16-bit store.
store16
// store32 represents a 32-bit store.
store32
// store64 represents a 64-bit store.
store64
// storeP64 represents a store of a pair of registers.
storeP64
// loadP64 represents a load of a pair of registers.
loadP64
// mov64 represents a MOV instruction. These are encoded as ORRs, but we keep them separate for better handling.
mov64
// mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination.
mov32
// movZ represents a MOVZ with a 16-bit immediate.
movZ
// movN represents a MOVN with a 16-bit immediate.
movN
// movK represents a MOVK with a 16-bit immediate.
movK
// extend represents a sign- or zero-extend operation.
extend
// cSel represents a conditional-select operation.
cSel
// cSet represents a conditional-set operation.
cSet
// cCmpImm represents a conditional comparison with an immediate.
cCmpImm
// fpuMov64 represents an FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster.
fpuMov64
// fpuMov128 represents a vector register move.
fpuMov128
// fpuMovFromVec represents a move to scalar from a vector element.
fpuMovFromVec
// fpuRR represents a 1-op FPU instruction.
fpuRR
// fpuRRR represents a 2-op FPU instruction.
fpuRRR
// fpuRRI represents a 2-op FPU instruction with immediate value.
fpuRRI
// fpuRRRR represents a 3-op FPU instruction.
fpuRRRR
// fpuCmp represents a FPU comparison, either 32 or 64 bit.
fpuCmp
// fpuLoad32 represents a floating-point load, single-precision (32 bit).
fpuLoad32
// fpuStore32 represents a floating-point store, single-precision (32 bit).
fpuStore32
// fpuLoad64 represents a floating-point load, double-precision (64 bit).
fpuLoad64
// fpuStore64 represents a floating-point store, double-precision (64 bit).
fpuStore64
// fpuLoad128 represents a floating-point/vector load, 128 bit.
fpuLoad128
// fpuStore128 represents a floating-point/vector store, 128 bit.
fpuStore128
// loadFpuConst32 represents a load of a 32-bit floating-point constant.
loadFpuConst32
// loadFpuConst64 represents a load of a 64-bit floating-point constant.
loadFpuConst64
// loadFpuConst128 represents a load of a 128-bit floating-point constant.
loadFpuConst128
// fpuToInt represents a conversion from FP to integer.
fpuToInt
// intToFpu represents a conversion from integer to FP.
intToFpu
// fpuCSel32 represents a 32-bit FP conditional select.
fpuCSel32
// fpuCSel64 represents a 64-bit FP conditional select.
fpuCSel64
// fpuRound represents a rounding to integer operation.
fpuRound
// movToFpu represents a move from a GPR to a scalar FP register.
movToFpu
// movToVec represents a move to a vector element from a GPR.
movToVec
// movFromVec represents an unsigned move from a vector element to a GPR.
movFromVec
// movFromVecSigned represents a signed move from a vector element to a GPR.
movFromVecSigned
// vecDup represents a duplication of general-purpose register to vector.
vecDup
// vecDupFromFpu represents a duplication of scalar to vector.
vecDupFromFpu
// vecExtend represents a vector extension operation.
vecExtend
// vecMovElement represents a move vector element to another vector element operation.
vecMovElement
// vecMiscNarrow represents a vector narrowing operation.
vecMiscNarrow
// vecRRR represents a vector ALU operation.
vecRRR
// vecMisc represents a vector two register miscellaneous instruction.
vecMisc
// vecLanes represents a vector instruction across lanes.
vecLanes
// vecTbl represents a table vector lookup - single register table.
vecTbl
// vecTbl2 represents a table vector lookup - two register table.
vecTbl2
// movToNZCV represents a move to the NZCV flags.
movToNZCV
// movFromNZCV represents a move from the NZCV flags.
movFromNZCV
// call represents a machine call instruction.
call
// callInd represents a machine indirect-call instruction.
callInd
// ret represents a machine return instruction.
ret
// epiloguePlaceholder is a placeholder instruction, generating no code, meaning that a function epilogue must be
// inserted there.
epiloguePlaceholder
// br represents an unconditional branch.
br
// condBr represents a conditional branch.
condBr
// indirectBr represents an indirect branch through a register.
indirectBr
// adr represents an instruction that computes the address of a memory location, using a PC-relative offset.
adr
// word4 represents a raw 32-bit word.
word4
// word8 represents a raw 64-bit word.
word8
// jtSequence represents a jump-table sequence.
jtSequence
// loadAddr represents a load address instruction.
loadAddr
// exitSequence consists of multiple instructions, and exits the execution immediately.
// See encodeExitSequence.
exitSequence
// udf represents the undefined instruction (UDF). For debugging only.
udf
// ------------------- do not define below this line -------------------
numInstructionKinds
)
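// asUDF encodes this instruction as udf, which raises an undefined-instruction exception when executed.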
func (i *instruction) asUDF() {
i.kind = udf
}
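// asExitSequence encodes this instruction as the multi-instruction exit sequence;
// ctx is the register holding the execution context pointer. See encodeExitSequence.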
func (i *instruction) asExitSequence(ctx regalloc.VReg) {
i.kind = exitSequence
i.rn = operandNR(ctx)
}
// aluOp determines the type of ALU operation. Instructions whose kind is one of
// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift, and aluRRRExtend
// use this type.
type aluOp int
func (a aluOp) String() string {
switch a {
case aluOpAdd:
return "add"
case aluOpSub:
return "sub"
case aluOpOrr:
return "orr"
case aluOpAnd:
return "and"
case aluOpBic:
return "bic"
case aluOpEor:
return "eor"
case aluOpAddS:
return "adds"
case aluOpSubS:
return "subs"
case aluOpSMulH:
return "sMulH"
case aluOpUMulH:
return "uMulH"
case aluOpSDiv64:
return "sDiv64"
case aluOpUDiv64:
return "uDiv64"
case aluOpRotR:
return "rotR"
case aluOpLsr:
return "lsr"
case aluOpAsr:
return "asr"
case aluOpLsl:
return "lsl"
case aluOpMAdd:
return "madd"
case aluOpMSub:
return "msub"
}
panic(int(a))
}
const (
// 32/64-bit Add.
aluOpAdd aluOp = iota
// 32/64-bit Subtract.
aluOpSub
// 32/64-bit Bitwise OR.
aluOpOrr
// 32/64-bit Bitwise AND.
aluOpAnd
// 32/64-bit Bitwise AND NOT.
aluOpBic
// 32/64-bit Bitwise XOR (Exclusive OR).
aluOpEor
// 32/64-bit Add setting flags.
aluOpAddS
// 32/64-bit Subtract setting flags.
aluOpSubS
// Signed multiply, high-word result.
aluOpSMulH
// Unsigned multiply, high-word result.
aluOpUMulH
// 64-bit Signed divide.
aluOpSDiv64
// 64-bit Unsigned divide.
aluOpUDiv64
// 32/64-bit Rotate right.
aluOpRotR
// 32/64-bit Logical shift right.
aluOpLsr
// 32/64-bit Arithmetic shift right.
aluOpAsr
// 32/64-bit Logical shift left.
aluOpLsl
// Multiply-add. MAdd and MSub are only applicable for aluRRRR.
aluOpMAdd
// Multiply-subtract.
aluOpMSub
)
// fpuBinOp represents a binary floating-point unit (FPU) operation.
type fpuBinOp byte
const (
fpuBinOpAdd = iota
fpuBinOpSub
fpuBinOpMul
fpuBinOpDiv
fpuBinOpMax
fpuBinOpMin
)
// String implements fmt.Stringer.
func (f fpuBinOp) String() string {
switch f {
case fpuBinOpAdd:
return "fadd"
case fpuBinOpSub:
return "fsub"
case fpuBinOpMul:
return "fmul"
case fpuBinOpDiv:
return "fdiv"
case fpuBinOpMax:
return "fmax"
case fpuBinOpMin:
return "fmin"
}
panic(int(f))
}
// extMode represents the mode of a register operand extension.
// For example, aluRRRExtend instructions need this info to determine the extensions.
type extMode byte
const (
extModeNone extMode = iota
// extModeZeroExtend32 suggests a zero-extension to 32 bits if the original bit size is less than 32.
extModeZeroExtend32
// extModeSignExtend32 stands for a sign-extension to 32 bits if the original bit size is less than 32.
extModeSignExtend32
// extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64.
extModeZeroExtend64
// extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64.
extModeSignExtend64
)
func (e extMode) bits() byte {
switch e {
case extModeZeroExtend32, extModeSignExtend32:
return 32
case extModeZeroExtend64, extModeSignExtend64:
return 64
default:
return 0
}
}
func (e extMode) signed() bool {
switch e {
case extModeSignExtend32, extModeSignExtend64:
return true
default:
return false
}
}
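// extModeOf returns the extMode used to extend a value of the given ssa.Type with the given signedness.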
func extModeOf(t ssa.Type, signed bool) extMode {
switch t.Bits() {
case 32:
if signed {
return extModeSignExtend32
}
return extModeZeroExtend32
case 64:
if signed {
return extModeSignExtend64
}
return extModeZeroExtend64
default:
panic("TODO? do we need narrower than 32 bits?")
}
}
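// extendOp represents the kind of extension in extended-register operands; the constant
// values match the AArch64 "option" field encoding.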
type extendOp byte
const (
extendOpUXTB extendOp = 0b000
extendOpUXTH extendOp = 0b001
extendOpUXTW extendOp = 0b010
// extendOpUXTX does nothing, but is a convenient symbol that officially exists. See:
// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
extendOpUXTX extendOp = 0b011
extendOpSXTB extendOp = 0b100
extendOpSXTH extendOp = 0b101
extendOpSXTW extendOp = 0b110
// extendOpSXTX does nothing, but is a convenient symbol that officially exists. See:
// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
extendOpSXTX extendOp = 0b111
extendOpNone extendOp = 0xff
)
func (e extendOp) srcBits() byte {
switch e {
case extendOpUXTB, extendOpSXTB:
return 8
case extendOpUXTH, extendOpSXTH:
return 16
case extendOpUXTW, extendOpSXTW:
return 32
case extendOpUXTX, extendOpSXTX:
return 64
}
panic(int(e))
}
func (e extendOp) String() string {
switch e {
case extendOpUXTB:
return "UXTB"
case extendOpUXTH:
return "UXTH"
case extendOpUXTW:
return "UXTW"
case extendOpUXTX:
return "UXTX"
case extendOpSXTB:
return "SXTB"
case extendOpSXTH:
return "SXTH"
case extendOpSXTW:
return "SXTW"
case extendOpSXTX:
return "SXTX"
}
panic(int(e))
}
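// extendOpFrom returns the extendOp that extends a from-bits-wide value with the given signedness.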
func extendOpFrom(signed bool, from byte) extendOp {
switch from {
case 8:
if signed {
return extendOpSXTB
}
return extendOpUXTB
case 16:
if signed {
return extendOpSXTH
}
return extendOpUXTH
case 32:
if signed {
return extendOpSXTW
}
return extendOpUXTW
case 64:
if signed {
return extendOpSXTX
}
return extendOpUXTX
}
panic("invalid extendOpFrom")
}
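// shiftOp represents the kind of shift in shifted-register operands.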
type shiftOp byte
const (
shiftOpLSL shiftOp = 0b00
shiftOpLSR shiftOp = 0b01
shiftOpASR shiftOp = 0b10
shiftOpROR shiftOp = 0b11
)
func (s shiftOp) String() string {
switch s {
case shiftOpLSL:
return "LSL"
case shiftOpLSR:
return "LSR"
case shiftOpASR:
return "ASR"
case shiftOpROR:
return "ROR"
}
panic(int(s))
}
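// binarySize returns the total size in bytes of the encoded instructions from begin
// through end, inclusive, walking the linked list via next.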
func binarySize(begin, end *instruction) (size int64) {
for cur := begin; ; cur = cur.next {
size += cur.size()
if cur == end {
break
}
}
return size
}
const exitSequenceSize = 5 * 4 // 5 instructions as in encodeExitSequence.
// size returns the size of the instruction in encoded bytes.
func (i *instruction) size() int64 {
switch i.kind {
case exitSequence:
return exitSequenceSize
case nop0:
return 0
case loadFpuConst32:
return 4 + 4 + 4
case loadFpuConst64:
return 4 + 4 + 8
case loadFpuConst128:
return 4 + 4 + 12
default:
return 4
}
}