package arm64

import (
	"encoding/binary"
	"errors"
	"fmt"

	"github.com/tetratelabs/wazero/internal/asm"
)

type nodeImpl struct {
	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl
	// next holds the next node from this node in the assembled linked list.
	next *nodeImpl
	staticConst *asm.StaticConst

	instruction asm.Instruction
	types operandTypes
	srcReg, srcReg2, dstReg, dstReg2 asm.Register
	srcConst, dstConst asm.ConstantValue

	offsetInBinary asm.NodeOffsetInBinary

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// the read-instruction-address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction

	vectorArrangement VectorArrangement
	srcVectorIndex, dstVectorIndex VectorIndex
}

// AssignJumpTarget implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
	n.jumpTarget = target.(*nodeImpl)
}

// AssignDestinationConstant implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}

// AssignSourceConstant implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}

// OffsetInBinary implements the same method as documented on asm.Node.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinary
}

// String implements fmt.Stringer.
//
// This is for debugging purposes, and the format is similar to the AT&T assembly syntax,
// meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
// might be enclosed in '[]' to represent a memory location, and multiple operands
// are enclosed in '()'.
func (n *nodeImpl) String() (ret string) {
	instName := InstructionName(n.instruction)
	switch n.types {
	case operandTypesNoneToNone:
		ret = instName
	case operandTypesNoneToRegister:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
	case operandTypesNoneToBranch:
		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
	case operandTypesRegisterToRegister:
		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
	case operandTypesLeftShiftedRegisterToRegister:
		ret = fmt.Sprintf("%s (%s, %s << %d), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), n.srcConst, RegisterName(n.dstReg))
	case operandTypesTwoRegistersToRegister:
		ret = fmt.Sprintf("%s (%s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
	case operandTypesThreeRegistersToRegister:
		ret = fmt.Sprintf("%s (%s, %s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), RegisterName(n.dstReg2))
	case operandTypesTwoRegistersToNone:
		ret = fmt.Sprintf("%s (%s, %s)", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2))
	case operandTypesRegisterAndConstToNone:
		ret = fmt.Sprintf("%s (%s, 0x%x)", instName, RegisterName(n.srcReg), n.srcConst)
	case operandTypesRegisterAndConstToRegister:
		ret = fmt.Sprintf("%s (%s, 0x%x), %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
	case operandTypesRegisterToMemory:
		if n.dstReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s %s, [%s + %s]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), RegisterName(n.dstReg2))
		} else {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesMemoryToRegister:
		if n.srcReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %s], %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
		}
	case operandTypesConstToRegister:
		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
	case operandTypesRegisterToVectorRegister:
		ret = fmt.Sprintf("%s %s, %s.%s[%d]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement, n.dstVectorIndex)
	case operandTypesVectorRegisterToRegister:
		ret = fmt.Sprintf("%s %s.%s[%d], %s", instName, RegisterName(n.srcReg), n.vectorArrangement, n.srcVectorIndex, RegisterName(n.dstReg))
	case operandTypesVectorRegisterToMemory:
		if n.dstReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s %s.%s, [%s + %s]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), RegisterName(n.dstReg2))
		} else {
			ret = fmt.Sprintf("%s %s.%s, [%s + 0x%x]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesMemoryToVectorRegister:
		ret = fmt.Sprintf("%s [%s], %s.%s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesVectorRegisterToVectorRegister:
		ret = fmt.Sprintf("%s %[2]s.%[4]s, %[3]s.%[4]s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesStaticConstToVectorRegister:
		ret = fmt.Sprintf("%s $%#x %s.%s", instName, n.staticConst.Raw, RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesTwoVectorRegistersToVectorRegister:
		ret = fmt.Sprintf("%s (%s.%[5]s, %[3]s.%[5]s), %[4]s.%[5]s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), n.vectorArrangement)
	}
	return
}

// operandTypes represents types of operands of a node.
type operandTypes byte

const (
	operandTypesNoneToNone operandTypes = iota
	operandTypesNoneToRegister
	operandTypesNoneToBranch
	operandTypesRegisterToRegister
	operandTypesLeftShiftedRegisterToRegister
	operandTypesTwoRegistersToRegister
	operandTypesThreeRegistersToRegister
	operandTypesTwoRegistersToNone
	operandTypesRegisterAndConstToNone
	operandTypesRegisterAndConstToRegister
	operandTypesRegisterToMemory
	operandTypesMemoryToRegister
	operandTypesConstToRegister
	operandTypesRegisterToVectorRegister
	operandTypesVectorRegisterToRegister
	operandTypesMemoryToVectorRegister
	operandTypesVectorRegisterToMemory
	operandTypesVectorRegisterToVectorRegister
	operandTypesTwoVectorRegistersToVectorRegister
	operandTypesStaticConstToVectorRegister
)

// String implements fmt.Stringer
func (o operandTypes) String() (ret string) {
	switch o {
	case operandTypesNoneToNone:
		ret = "NoneToNone"
	case operandTypesNoneToRegister:
		ret = "NoneToRegister"
	case operandTypesNoneToBranch:
		ret = "NoneToBranch"
	case operandTypesRegisterToRegister:
		ret = "RegisterToRegister"
	case operandTypesLeftShiftedRegisterToRegister:
		ret = "LeftShiftedRegisterToRegister"
	case operandTypesTwoRegistersToRegister:
		ret = "TwoRegistersToRegister"
	case operandTypesThreeRegistersToRegister:
		ret = "ThreeRegistersToRegister"
	case operandTypesTwoRegistersToNone:
		ret = "TwoRegistersToNone"
	case operandTypesRegisterAndConstToNone:
		ret = "RegisterAndConstToNone"
	case operandTypesRegisterAndConstToRegister:
		ret = "RegisterAndConstToRegister"
	case operandTypesRegisterToMemory:
		ret = "RegisterToMemory"
	case operandTypesMemoryToRegister:
		ret = "MemoryToRegister"
	case operandTypesConstToRegister:
		ret = "ConstToRegister"
	case operandTypesRegisterToVectorRegister:
		ret = "RegisterToVectorRegister"
	case operandTypesVectorRegisterToRegister:
		ret = "VectorRegisterToRegister"
	case operandTypesMemoryToVectorRegister:
		ret = "MemoryToVectorRegister"
	case operandTypesVectorRegisterToMemory:
		ret = "VectorRegisterToMemory"
	case operandTypesVectorRegisterToVectorRegister:
		ret = "VectorRegisterToVectorRegister"
	case operandTypesTwoVectorRegistersToVectorRegister:
		ret = "TwoVectorRegistersToVectorRegister"
	case operandTypesStaticConstToVectorRegister:
		ret = "StaticConstToVectorRegister"
	}
	return
}

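// The branch-immediate limits below: an unconditional B carries a signed 26-bit
// word offset, and a conditional B.cond carries a signed 19-bit word offset.
// They are enforced when relative branches are finalized (see relativeBranchFinalize).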
const (
	maxSignedInt26 int64 = 1<<25 - 1
	minSignedInt26 int64 = -(1 << 25)

	maxSignedInt19 int64 = 1<<18 - 1
	minSignedInt19 int64 = -(1 << 18)
)

// AssemblerImpl implements Assembler.
type AssemblerImpl struct {
	root    *nodeImpl
	current *nodeImpl
	asm.BaseAssemblerImpl
	relativeJumpNodes   []*nodeImpl
	adrInstructionNodes []*nodeImpl
	nodePool            nodePool
	pool                asm.StaticConstPool
	nodeCount           int

	// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstPool,
	// but we keep it as a field here for testability.
	MaxDisplacementForConstantPool int

	temporaryRegister asm.Register
}

const nodePageSize = 128

type nodePage = [nodePageSize]nodeImpl

// nodePool is the central allocation pool for the nodeImpl instances used by a single AssemblerImpl.
// This reduces allocations across compilations when an AssemblerImpl is reused.
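//
// A minimal usage sketch of the pool on its own (NewAssembler performs the same setup):
//
//	var pool nodePool
//	pool.index = nodePageSize // force the first allocNode to allocate a page.
//	n := pool.allocNode()     // zeroed *nodeImpl, valid until the next reset.
//	_ = n
//	pool.reset()              // zero and recycle all pages for the next compilation.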
type nodePool struct {
	pages []*nodePage
	index int
}

// allocNode allocates a new nodeImpl for use from the pool.
// This expands the pool if there is no space left for it.
func (n *nodePool) allocNode() *nodeImpl {
	if n.index == nodePageSize {
		if len(n.pages) == cap(n.pages) {
			n.pages = append(n.pages, new(nodePage))
		} else {
			i := len(n.pages)
			n.pages = n.pages[:i+1]
			if n.pages[i] == nil {
				n.pages[i] = new(nodePage)
			}
		}
		n.index = 0
	}
	ret := &n.pages[len(n.pages)-1][n.index]
	n.index++
	return ret
}

func (n *nodePool) reset() {
	for _, ns := range n.pages {
		page := ns[:]
		for i := range page {
			page[i] = nodeImpl{}
		}
	}
	n.pages = n.pages[:0]
	n.index = nodePageSize
}

func NewAssembler(temporaryRegister asm.Register) *AssemblerImpl {
	return &AssemblerImpl{
		nodePool:                       nodePool{index: nodePageSize},
		temporaryRegister:              temporaryRegister,
		pool:                           asm.NewStaticConstPool(),
		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstPool,
	}
}

// AllocateNOP implements asm.AssemblerBase.
func (a *AssemblerImpl) AllocateNOP() asm.Node {
	n := a.nodePool.allocNode()
	n.instruction = NOP
	n.types = operandTypesNoneToNone
	return n
}

// Add implements asm.AssemblerBase.
func (a *AssemblerImpl) Add(n asm.Node) {
	a.addNode(n.(*nodeImpl))
}

// Reset implements asm.AssemblerBase.
func (a *AssemblerImpl) Reset() {
	pool := a.pool
	pool.Reset()
	*a = AssemblerImpl{
		nodePool:            a.nodePool,
		pool:                pool,
		temporaryRegister:   a.temporaryRegister,
		adrInstructionNodes: a.adrInstructionNodes[:0],
		relativeJumpNodes:   a.relativeJumpNodes[:0],
		BaseAssemblerImpl: asm.BaseAssemblerImpl{
			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
			JumpTableEntries:           a.JumpTableEntries[:0],
		},
	}
	a.nodePool.reset()
}

// newNode creates a new Node and appends it into the linked list.
func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
	n := a.nodePool.allocNode()
	n.instruction = instruction
	n.types = types

	a.addNode(n)
	return n
}

// addNode appends the new node into the linked list.
func (a *AssemblerImpl) addNode(node *nodeImpl) {
	a.nodeCount++

	if a.root == nil {
		a.root = node
		a.current = node
	} else {
		parent := a.current
		parent.next = node
		a.current = node
	}

	for _, o := range a.SetBranchTargetOnNextNodes {
		origin := o.(*nodeImpl)
		origin.jumpTarget = node
	}
	// Reuse the underlying slice to avoid re-allocations.
	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
}

// Assemble implements asm.AssemblerBase
func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
	// arm64 has fixed-length 32-bit (4-byte) instructions, but note that some
	// nodes are encoded as multiple instructions, so we reserve 8 bytes per node
	// as a heuristic; the resulting binary may still differ in size.
	buf.Grow(a.nodeCount * 8)

	for n := a.root; n != nil; n = n.next {
		n.offsetInBinary = uint64(buf.Len())
		if err := a.encodeNode(buf, n); err != nil {
			return err
		}
		a.maybeFlushConstPool(buf, n.next == nil)
	}

	code := buf.Bytes()

	if err := a.FinalizeJumpTableEntry(code); err != nil {
		return err
	}

	for _, rel := range a.relativeJumpNodes {
		if err := a.relativeBranchFinalize(code, rel); err != nil {
			return err
		}
	}

	for _, adr := range a.adrInstructionNodes {
		if err := a.finalizeADRInstructionNode(code, adr); err != nil {
			return err
		}
	}
	return nil
}

const defaultMaxDisplacementForConstPool = (1 << 20) - 1 - 4 // -4 for the unconditional branch to skip the constants.

// maybeFlushConstPool flushes the constant pool if endOfBinary or a boundary condition was met.
func (a *AssemblerImpl) maybeFlushConstPool(buf asm.Buffer, endOfBinary bool) {
	if a.pool.Empty() {
		return
	}

	// If endOfBinary = true, we no longer need to emit the instructions, therefore
	// flush all the constants.
	if endOfBinary ||
		// Also, if the offset between the first usage of the constant pool and
		// the first constant would exceed 2^20-1 (= 1MiB-1), which is the maximum offset
		// for an LDR(literal)/ADR instruction, flush all the constants in the pool.
		(buf.Len()+a.pool.PoolSizeInBytes-int(a.pool.FirstUseOffsetInBinary)) >= a.MaxDisplacementForConstantPool {

		// Before emitting the consts, we have to add a branch instruction to skip the const pool.
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1123-L1129
		skipOffset := a.pool.PoolSizeInBytes/4 + 1
		if a.pool.PoolSizeInBytes%4 != 0 {
			skipOffset++
		}
		if endOfBinary {
			// If this is the end of the binary, the branch is never executed,
			// so the offset can be zero (which is the behavior of Go's assembler).
			skipOffset = 0
		}

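		// For example, a 12-byte pool yields skipOffset = 12/4 + 1 = 4: the B instruction
		// below (opcode 0b000101, hence the 0x14 in its most significant byte) jumps
		// 4*4 = 16 bytes forward, over itself and the three constant words.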
		buf.Append4Bytes(
			byte(skipOffset),
			byte(skipOffset>>8),
			byte(skipOffset>>16),
			0x14,
		)

		// Then add the consts into the binary.
		for _, c := range a.pool.Consts {
			c.SetOffsetInBinary(uint64(buf.Len()))
			buf.AppendBytes(c.Raw)
		}

		// arm64 instructions are 4-byte (32-bit) aligned, so we must pad with zero bytes here.
		if pad := buf.Len() % 4; pad != 0 {
			buf.AppendBytes(make([]byte, 4-pad))
		}

		// After the flush, reset the constant pool.
		a.pool.Reset()
	}
}

// encodeNode encodes the given node into buf.
func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(buf, n)
	case operandTypesNoneToRegister:
		err = a.encodeJumpToRegister(buf, n)
	case operandTypesNoneToBranch:
		err = a.encodeRelativeBranch(buf, n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(buf, n)
	case operandTypesLeftShiftedRegisterToRegister:
		err = a.encodeLeftShiftedRegisterToRegister(buf, n)
	case operandTypesTwoRegistersToRegister:
		err = a.encodeTwoRegistersToRegister(buf, n)
	case operandTypesThreeRegistersToRegister:
		err = a.encodeThreeRegistersToRegister(buf, n)
	case operandTypesTwoRegistersToNone:
		err = a.encodeTwoRegistersToNone(buf, n)
	case operandTypesRegisterAndConstToNone:
		err = a.encodeRegisterAndConstToNone(buf, n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(buf, n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(buf, n)
	case operandTypesRegisterAndConstToRegister, operandTypesConstToRegister:
		err = a.encodeConstToRegister(buf, n)
	case operandTypesRegisterToVectorRegister:
		err = a.encodeRegisterToVectorRegister(buf, n)
	case operandTypesVectorRegisterToRegister:
		err = a.encodeVectorRegisterToRegister(buf, n)
	case operandTypesMemoryToVectorRegister:
		err = a.encodeMemoryToVectorRegister(buf, n)
	case operandTypesVectorRegisterToMemory:
		err = a.encodeVectorRegisterToMemory(buf, n)
	case operandTypesVectorRegisterToVectorRegister:
		err = a.encodeVectorRegisterToVectorRegister(buf, n)
	case operandTypesStaticConstToVectorRegister:
		err = a.encodeStaticConstToVectorRegister(buf, n)
	case operandTypesTwoVectorRegistersToVectorRegister:
		err = a.encodeTwoVectorRegistersToVectorRegister(buf, n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}

// CompileStandAlone implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
	return a.newNode(instruction, operandTypesNoneToNone)
}

// CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileConstToRegister(
	instruction asm.Instruction,
	value asm.ConstantValue,
	destinationReg asm.Register,
) (inst asm.Node) {
	n := a.newNode(instruction, operandTypesConstToRegister)
	n.srcConst = value
	n.dstReg = destinationReg
	return n
}

// CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
	n := a.newNode(instruction, operandTypesRegisterToRegister)
	n.srcReg = from
	n.dstReg = to
}

// CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileMemoryToRegister(
	instruction asm.Instruction,
	sourceBaseReg asm.Register,
	sourceOffsetConst asm.ConstantValue,
	destinationReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.srcReg = sourceBaseReg
	n.srcConst = sourceOffsetConst
	n.dstReg = destinationReg
}

// CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileRegisterToMemory(
	instruction asm.Instruction,
	sourceRegister, destinationBaseRegister asm.Register,
	destinationOffsetConst asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesRegisterToMemory)
	n.srcReg = sourceRegister
	n.dstReg = destinationBaseRegister
	n.dstConst = destinationOffsetConst
}

// CompileJump implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
}

// CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
	n.dstReg = reg
}

// CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileReadInstructionAddress(
	destinationRegister asm.Register,
	beforeAcquisitionTargetInstruction asm.Instruction,
) {
	n := a.newNode(ADR, operandTypesMemoryToRegister)
	n.dstReg = destinationRegister
	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
}

// CompileMemoryWithRegisterOffsetToRegister implements Assembler.CompileMemoryWithRegisterOffsetToRegister
func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToRegister(
	instruction asm.Instruction,
	srcBaseReg, srcOffsetReg, dstReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.dstReg = dstReg
	n.srcReg = srcBaseReg
	n.srcReg2 = srcOffsetReg
}

// CompileRegisterToMemoryWithRegisterOffset implements Assembler.CompileRegisterToMemoryWithRegisterOffset
func (a *AssemblerImpl) CompileRegisterToMemoryWithRegisterOffset(
	instruction asm.Instruction,
	srcReg, dstBaseReg, dstOffsetReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstReg2 = dstOffsetReg
}

// CompileTwoRegistersToRegister implements Assembler.CompileTwoRegistersToRegister
func (a *AssemblerImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, dst asm.Register) {
	n := a.newNode(instruction, operandTypesTwoRegistersToRegister)
	n.srcReg = src1
	n.srcReg2 = src2
	n.dstReg = dst
}

// CompileThreeRegistersToRegister implements Assembler.CompileThreeRegistersToRegister
func (a *AssemblerImpl) CompileThreeRegistersToRegister(
	instruction asm.Instruction,
	src1, src2, src3, dst asm.Register,
) {
	n := a.newNode(instruction, operandTypesThreeRegistersToRegister)
	n.srcReg = src1
	n.srcReg2 = src2
	n.dstReg = src3 // To minimize the size of nodeImpl struct, we reuse dstReg for the third source operand.
	n.dstReg2 = dst
}

// CompileTwoRegistersToNone implements Assembler.CompileTwoRegistersToNone
func (a *AssemblerImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) {
	n := a.newNode(instruction, operandTypesTwoRegistersToNone)
	n.srcReg = src1
	n.srcReg2 = src2
}

// CompileRegisterAndConstToNone implements Assembler.CompileRegisterAndConstToNone
func (a *AssemblerImpl) CompileRegisterAndConstToNone(
	instruction asm.Instruction,
	src asm.Register,
	srcConst asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesRegisterAndConstToNone)
	n.srcReg = src
	n.srcConst = srcConst
}

// CompileRegisterAndConstToRegister implements Assembler.CompileRegisterAndConstToRegister
func (a *AssemblerImpl) CompileRegisterAndConstToRegister(
	instruction asm.Instruction,
	src asm.Register,
	srcConst asm.ConstantValue,
	dst asm.Register,
) {
	n := a.newNode(instruction, operandTypesRegisterAndConstToRegister)
	n.srcReg = src
	n.srcConst = srcConst
	n.dstReg = dst
}

// CompileLeftShiftedRegisterToRegister implements Assembler.CompileLeftShiftedRegisterToRegister
func (a *AssemblerImpl) CompileLeftShiftedRegisterToRegister(
	instruction asm.Instruction,
	shiftedSourceReg asm.Register,
	shiftNum asm.ConstantValue,
	srcReg, dstReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesLeftShiftedRegisterToRegister)
	n.srcReg = srcReg
	n.srcReg2 = shiftedSourceReg
	n.srcConst = shiftNum
	n.dstReg = dstReg
}

// CompileConditionalRegisterSet implements Assembler.CompileConditionalRegisterSet
func (a *AssemblerImpl) CompileConditionalRegisterSet(cond asm.ConditionalRegisterState, dstReg asm.Register) {
	n := a.newNode(CSET, operandTypesRegisterToRegister)
	n.srcReg = conditionalRegisterStateToRegister(cond)
	n.dstReg = dstReg
}

// CompileMemoryToVectorRegister implements Assembler.CompileMemoryToVectorRegister
func (a *AssemblerImpl) CompileMemoryToVectorRegister(
	instruction asm.Instruction, srcBaseReg asm.Register, dstOffset asm.ConstantValue, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
	n.srcReg = srcBaseReg
	n.srcConst = dstOffset
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileMemoryWithRegisterOffsetToVectorRegister implements Assembler.CompileMemoryWithRegisterOffsetToVectorRegister
func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToVectorRegister(instruction asm.Instruction,
	srcBaseReg, srcOffsetRegister asm.Register, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
	n.srcReg = srcBaseReg
	n.srcReg2 = srcOffsetRegister
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileVectorRegisterToMemory implements Assembler.CompileVectorRegisterToMemory
func (a *AssemblerImpl) CompileVectorRegisterToMemory(
	instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffset asm.ConstantValue, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstConst = dstOffset
	n.vectorArrangement = arrangement
}

// CompileVectorRegisterToMemoryWithRegisterOffset implements Assembler.CompileVectorRegisterToMemoryWithRegisterOffset
func (a *AssemblerImpl) CompileVectorRegisterToMemoryWithRegisterOffset(instruction asm.Instruction,
	srcReg, dstBaseReg, dstOffsetRegister asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstReg2 = dstOffsetRegister
	n.vectorArrangement = arrangement
}

// CompileRegisterToVectorRegister implements Assembler.CompileRegisterToVectorRegister
func (a *AssemblerImpl) CompileRegisterToVectorRegister(
	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, index VectorIndex,
) {
	n := a.newNode(instruction, operandTypesRegisterToVectorRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.dstVectorIndex = index
}

// CompileVectorRegisterToRegister implements Assembler.CompileVectorRegisterToRegister
func (a *AssemblerImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register,
	arrangement VectorArrangement, index VectorIndex,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.srcVectorIndex = index
}

// CompileVectorRegisterToVectorRegister implements Assembler.CompileVectorRegisterToVectorRegister
func (a *AssemblerImpl) CompileVectorRegisterToVectorRegister(
	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, srcIndex, dstIndex VectorIndex,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.srcVectorIndex = srcIndex
	n.dstVectorIndex = dstIndex
}

// CompileVectorRegisterToVectorRegisterWithConst implements Assembler.CompileVectorRegisterToVectorRegisterWithConst
func (a *AssemblerImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction,
	srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
	n.srcReg = srcReg
	n.srcConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileStaticConstToRegister implements Assembler.CompileStaticConstToRegister
func (a *AssemblerImpl) CompileStaticConstToRegister(instruction asm.Instruction, c *asm.StaticConst, dstReg asm.Register) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.staticConst = c
	n.dstReg = dstReg
}

// CompileStaticConstToVectorRegister implements Assembler.CompileStaticConstToVectorRegister
func (a *AssemblerImpl) CompileStaticConstToVectorRegister(instruction asm.Instruction,
	c *asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesStaticConstToVectorRegister)
	n.staticConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister.
func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register,
	arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
	n.srcReg = srcReg
	n.srcReg2 = srcReg2
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst.
func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction,
	srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
	n.srcReg = srcReg
	n.srcReg2 = srcReg2
	n.srcConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

func errorEncodingUnsupported(n *nodeImpl) error {
	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
}

func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case UDF:
		buf.Append4Bytes(0, 0, 0, 0)
		return nil
	case NOP:
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeJumpToRegister(buf asm.Buffer, n *nodeImpl) error {
	// "Unconditional branch (register)" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions
	var opc byte
	switch n.instruction {
	case RET:
		opc = 0b0010
	case B:
		opc = 0b0000
	default:
		return errorEncodingUnsupported(n)
	}

	regBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return fmt.Errorf("invalid destination register: %w", err)
	}

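	// For example, RET with the link register x30 (register bits 30 = 0b11110) emits
	// 0xc0, 0x03, 0x5f, 0xd6 in little-endian order, i.e. the 32-bit word 0xd65f03c0.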
	buf.Append4Bytes(
		0x00|(regBits<<5),
		0x00|(regBits>>3),
		0b000_11111|(opc<<5),
		0b1101011_0|(opc>>3),
	)
	return nil
}

func (a *AssemblerImpl) relativeBranchFinalize(code []byte, n *nodeImpl) error {
	var condBits byte
	const condBitsUnconditional = 0xff // Indicates that this is not a conditional jump.

	// https://developer.arm.com/documentation/den0024/a/CHDEEABE
	switch n.instruction {
	case B:
		condBits = condBitsUnconditional
	case BCONDEQ:
		condBits = 0b0000
	case BCONDGE:
		condBits = 0b1010
	case BCONDGT:
		condBits = 0b1100
	case BCONDHI:
		condBits = 0b1000
	case BCONDHS:
		condBits = 0b0010
	case BCONDLE:
		condBits = 0b1101
	case BCONDLO:
		condBits = 0b0011
	case BCONDLS:
		condBits = 0b1001
	case BCONDLT:
		condBits = 0b1011
	case BCONDMI:
		condBits = 0b0100
	case BCONDPL:
		condBits = 0b0101
	case BCONDNE:
		condBits = 0b0001
	case BCONDVS:
		condBits = 0b0110
	case BCONDVC:
		condBits = 0b0111
	}

	branchInstOffset := int64(n.OffsetInBinary())
	offset := int64(n.jumpTarget.OffsetInBinary()) - branchInstOffset
	if offset%4 != 0 {
		return errors.New("BUG: relative jump offset must be 4 bytes aligned")
	}

	branchInst := code[branchInstOffset : branchInstOffset+4]
	if condBits == condBitsUnconditional {
		imm26 := offset >> 2 // divide by 4.
		if imm26 < minSignedInt26 || imm26 > maxSignedInt26 {
			// In theory this could happen if a Wasm binary has a huge single label (more than 128MiB
			// for a single block); in that case we would load the offset into a register and do a
			// register jump, but to avoid that complexity we impose this limit for now, as it is
			// unlikely to happen in practice.
			return fmt.Errorf("relative jump offset %d/4 must be within %d and %d", offset, minSignedInt26, maxSignedInt26)
		}
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-?lang=en
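		// For example, a branch 2MiB forward (offset = 0x200000) gives imm26 = 0x80000,
		// patching the word to 0x00, 0x00, 0x08, 0x14, i.e. the B instruction 0x14080000.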
		branchInst[0] = byte(imm26)
		branchInst[1] = byte(imm26 >> 8)
		branchInst[2] = byte(imm26 >> 16)
		branchInst[3] = (byte(imm26 >> 24 & 0b000000_11)) | 0b000101_00
	} else {
		imm19 := offset >> 2 // divide by 4.
		if imm19 < minSignedInt19 || imm19 > maxSignedInt19 {
			// This would be a bug in our compiler, as conditional jumps are only used for small
			// offsets (~a few bytes); if it ever happens, the compiler can be fixed.
			return fmt.Errorf("BUG: relative jump offset %d/4(=%d) must be within %d and %d", offset, imm19, minSignedInt19, maxSignedInt19)
		}
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally-?lang=en
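		// For example, B.EQ to the next instruction (offset = 4) gives imm19 = 1, so the
		// word becomes 0x20, 0x00, 0x00, 0x54, i.e. the B.EQ instruction 0x54000020.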
		branchInst[0] = (byte(imm19<<5) & 0b111_0_0000) | condBits
		branchInst[1] = byte(imm19 >> 3)
		branchInst[2] = byte(imm19 >> 11)
		branchInst[3] = 0b01010100
	}
	return nil
}

func (a *AssemblerImpl) encodeRelativeBranch(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDVC, BCONDPL:
	default:
		return errorEncodingUnsupported(n)
	}

	if n.jumpTarget == nil {
		return fmt.Errorf("branch target must be set for %s", InstructionName(n.instruction))
	}

	// At this point, we don't yet know the target's offset, so emit a placeholder (4 bytes)
	// that relativeBranchFinalize patches once all offsets are known.
	buf.Append4Bytes(0, 0, 0, 0)
	a.relativeJumpNodes = append(a.relativeJumpNodes, n)
	return nil
}

func checkRegisterToRegisterType(src, dst asm.Register, requireSrcInt, requireDstInt bool) (err error) {
	isSrcInt, isDstInt := isIntRegister(src), isIntRegister(dst)
	if isSrcInt && !requireSrcInt {
		err = fmt.Errorf("src requires float register but got %s", RegisterName(src))
	} else if !isSrcInt && requireSrcInt {
		err = fmt.Errorf("src requires int register but got %s", RegisterName(src))
	} else if isDstInt && !requireDstInt {
		err = fmt.Errorf("dst requires float register but got %s", RegisterName(dst))
	} else if !isDstInt && requireDstInt {
		err = fmt.Errorf("dst requires int register but got %s", RegisterName(dst))
	}
	return
}

func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	switch inst := n.instruction; inst {
	case ADD, ADDW, SUB:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		var sfops byte
		switch inst {
		case ADD:
			sfops = 0b100
		case ADDW:
		case SUB:
			sfops = 0b110
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			dstRegBits>>3,
			srcRegBits,
			(sfops<<5)|0b01011,
		)
	case CLZ, CLZW, RBIT, RBITW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		var sf, opcode byte
		switch inst {
		case CLZ:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
			sf, opcode = 0b1, 0b000_100
		case CLZW:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
			sf, opcode = 0b0, 0b000_100
		case RBIT:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
			sf, opcode = 0b1, 0b000_000
		case RBITW:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
			sf, opcode = 0b0, 0b000_000
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			opcode<<2|(srcRegBits>>3),
			0b110_00000,
			(sf<<7)|0b0_1011010,
		)
	case CSET:
		if !isConditionalRegister(n.srcReg) {
			return fmt.Errorf("CSET requires a conditional register but got %s", RegisterName(n.srcReg))
		}

		dstRegBits, err := intRegisterBits(n.dstReg)
		if err != nil {
			return err
		}

		// CSET encodes the conditional bits with its least significant bit inverted.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
		//
		// https://developer.arm.com/documentation/den0024/a/CHDEEABE
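		// For example, RegCondEQ (EQ = cond 0b0000) is mapped below to 0b0001 (NE),
		// because CSET Wd, EQ is an alias of CSINC Wd, WZR, WZR, NE.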
		var conditionalBits byte
		switch n.srcReg {
		case RegCondEQ:
			conditionalBits = 0b0001
		case RegCondNE:
			conditionalBits = 0b0000
		case RegCondHS:
			conditionalBits = 0b0011
		case RegCondLO:
			conditionalBits = 0b0010
		case RegCondMI:
			conditionalBits = 0b0101
		case RegCondPL:
			conditionalBits = 0b0100
		case RegCondVS:
			conditionalBits = 0b0111
		case RegCondVC:
			conditionalBits = 0b0110
		case RegCondHI:
			conditionalBits = 0b1001
		case RegCondLS:
			conditionalBits = 0b1000
		case RegCondGE:
			conditionalBits = 0b1011
		case RegCondLT:
			conditionalBits = 0b1010
		case RegCondGT:
			conditionalBits = 0b1101
		case RegCondLE:
			conditionalBits = 0b1100
		case RegCondAL:
			conditionalBits = 0b1111
		case RegCondNV:
			conditionalBits = 0b1110
		}

		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
		buf.Append4Bytes(
			0b111_00000|dstRegBits,
			(conditionalBits<<4)|0b0000_0111,
			0b100_11111,
			0b10011010,
		)

	case FABSD, FABSS, FNEGD, FNEGS, FSQRTD, FSQRTS, FCVTSD, FCVTDS, FRINTMD, FRINTMS,
		FRINTND, FRINTNS, FRINTPD, FRINTPS, FRINTZD, FRINTZS:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var tp, opcode byte
		switch inst {
		case FABSD:
			opcode, tp = 0b000001, 0b01
		case FABSS:
			opcode, tp = 0b000001, 0b00
		case FNEGD:
			opcode, tp = 0b000010, 0b01
		case FNEGS:
			opcode, tp = 0b000010, 0b00
		case FSQRTD:
			opcode, tp = 0b000011, 0b01
		case FSQRTS:
			opcode, tp = 0b000011, 0b00
		case FCVTSD:
			opcode, tp = 0b000101, 0b00
		case FCVTDS:
			opcode, tp = 0b000100, 0b01
		case FRINTMD:
			opcode, tp = 0b001010, 0b01
		case FRINTMS:
			opcode, tp = 0b001010, 0b00
		case FRINTND:
			opcode, tp = 0b001000, 0b01
		case FRINTNS:
			opcode, tp = 0b001000, 0b00
		case FRINTPD:
			opcode, tp = 0b001001, 0b01
		case FRINTPS:
			opcode, tp = 0b001001, 0b00
		case FRINTZD:
			opcode, tp = 0b001011, 0b01
		case FRINTZS:
			opcode, tp = 0b001011, 0b00
		}
		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			(opcode<<7)|0b0_10000_00|(srcRegBits>>3),
			tp<<6|0b00_1_00000|opcode>>1,
			0b0_00_11110,
		)

	case FADDD, FADDS, FDIVS, FDIVD, FMAXD, FMAXS, FMIND, FMINS, FMULS, FMULD:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// "Floating-point data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var tp, opcode byte
		switch inst {
		case FADDD:
			opcode, tp = 0b0010, 0b01
		case FADDS:
			opcode, tp = 0b0010, 0b00
		case FDIVD:
			opcode, tp = 0b0001, 0b01
		case FDIVS:
			opcode, tp = 0b0001, 0b00
		case FMAXD:
			opcode, tp = 0b0100, 0b01
		case FMAXS:
			opcode, tp = 0b0100, 0b00
		case FMIND:
			opcode, tp = 0b0101, 0b01
		case FMINS:
			opcode, tp = 0b0101, 0b00
		case FMULS:
			opcode, tp = 0b0000, 0b00
		case FMULD:
			opcode, tp = 0b0000, 0b01
		}

		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			opcode<<4|0b0000_10_00|(dstRegBits>>3),
			tp<<6|0b00_1_00000|srcRegBits,
			0b0001_1110,
		)

	case FCVTZSD, FCVTZSDW, FCVTZSS, FCVTZSSW, FCVTZUD, FCVTZUDW, FCVTZUS, FCVTZUSW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, true); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// "Conversion between floating-point and integer" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var sf, tp, opcode byte
		switch inst {
		case FCVTZSD: // Double to signed 64-bit.
			sf, tp, opcode = 0b1, 0b01, 0b000
		case FCVTZSDW: // Double to signed 32-bit.
			sf, tp, opcode = 0b0, 0b01, 0b000
		case FCVTZSS: // Single to signed 64-bit.
			sf, tp, opcode = 0b1, 0b00, 0b000
		case FCVTZSSW: // Single to signed 32-bit.
			sf, tp, opcode = 0b0, 0b00, 0b000
		case FCVTZUD: // Double to unsigned 64-bit.
			sf, tp, opcode = 0b1, 0b01, 0b001
		case FCVTZUDW: // Double to unsigned 32-bit.
			sf, tp, opcode = 0b0, 0b01, 0b001
		case FCVTZUS: // Single to unsigned 64-bit.
			sf, tp, opcode = 0b1, 0b00, 0b001
		case FCVTZUSW: // Single to unsigned 32-bit.
			sf, tp, opcode = 0b0, 0b00, 0b001
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			0|(srcRegBits>>3),
			tp<<6|0b00_1_11_000|opcode,
			sf<<7|0b0_0_0_11110,
		)

	case FMOVD, FMOVS:
		isSrcInt, isDstInt := isIntRegister(n.srcReg), isIntRegister(n.dstReg)
		if isSrcInt && isDstInt {
			return errors.New("FMOV requires at least one of the operands to be a float register")
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMOV--register---Floating-point-Move-register-without-conversion-?lang=en
		if !isSrcInt && !isDstInt { // Float to float.
			var tp byte
			if inst == FMOVD {
				tp = 0b01
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				0b0_10000_00|(srcRegBits>>3),
				tp<<6|0b00_1_00000,
				0b000_11110,
			)
		} else if isSrcInt && !isDstInt { // Int to float.
			var tp, sf byte
			if inst == FMOVD {
				tp, sf = 0b01, 0b1
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				tp<<6|0b00_1_00_111,
				sf<<7|0b0_00_11110,
			)
		} else { // Float to int.
			var tp, sf byte
			if inst == FMOVD {
				tp, sf = 0b01, 0b1
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				tp<<6|0b00_1_00_110,
				sf<<7|0b0_00_11110,
			)
		}

	case MOVD, MOVW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if n.srcReg == RegSP || n.dstReg == RegSP {
			// Moving between a register and the stack pointer.
			// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate--
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				0x0,
				0b1001_0001,
			)
			return
		}

		if n.srcReg == RegRZR && inst == MOVD {
			// If this is a 64-bit mov from the zero register, then we encode this as MOVK.
			// See "Move wide (immediate)" in
			// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Immediate
			buf.Append4Bytes(
				dstRegBits,
				0x0,
				0b1000_0000,
				0b1_10_10010,
			)
		} else {
			// MOV can be encoded as ORR (shifted register): "ORR Wd, WZR, Wm".
			// https://developer.arm.com/documentation/100069/0609/A64-General-Instructions/MOV--register-
			var sf byte
			if inst == MOVD {
				sf = 0b1
			}
			buf.Append4Bytes(
				(zeroRegisterBits<<5)|dstRegBits,
				zeroRegisterBits>>3,
				0b000_00000|srcRegBits,
				sf<<7|0b0_01_01010,
			)
		}

	case MRS:
		if n.srcReg != RegFPSR {
			return fmt.Errorf("MRS only supports the FPSR register as the source but got %s", RegisterName(n.srcReg))
		}

		// For how to specify the FPSR register, see "Accessing FPSR" in:
		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
		dstRegBits := registerBits(n.dstReg)
		buf.Append4Bytes(
			0b001<<5|dstRegBits,
			0b0100<<4|0b0100,
			0b0011_0000|0b11<<3|0b011,
			0b1101_0101,
		)

	case MSR:
		if n.dstReg != RegFPSR {
			return fmt.Errorf("MSR only supports the FPSR register as the destination but got %s", RegisterName(n.dstReg))
		}

		// For how to specify the FPSR register, see "Accessing FPSR" in:
		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
		srcRegBits := registerBits(n.srcReg)
		buf.Append4Bytes(
			0b001<<5|srcRegBits,
			0b0100<<4|0b0100,
			0b0001_0000|0b11<<3|0b011,
			0b1101_0101,
		)

	case MUL, MULW:
		// Multiplications are encoded as MADD with the zero register: dst = XZR + (src * dst) = src * dst.
		// See "Data-processing (3 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
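		// For example, with dst = X1 and src = X2 this emits 0x21, 0x7c, 0x02, 0x9b
		// (the word 0x9b027c21, i.e. MADD X1, X1, X2, XZR).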
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		var sf byte
		if inst == MUL {
			sf = 0b1
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		buf.Append4Bytes(
			dstRegBits<<5|dstRegBits,
			zeroRegisterBits<<2|dstRegBits>>3,
			srcRegBits,
			sf<<7|0b11011,
		)

	case NEG, NEGW:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// NEG is encoded as "SUB dst, XZR, src" = "dst = 0 - src".
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		var sf byte
		if inst == NEG {
			sf = 0b1
		}

		buf.Append4Bytes(
			(zeroRegisterBits<<5)|dstRegBits,
			zeroRegisterBits>>3,
			srcRegBits,
			sf<<7|0b0_10_00000|0b0_00_01011,
		)

	case SDIV, SDIVW, UDIV, UDIVW:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf, opcode byte
		switch inst {
		case SDIV:
			sf, opcode = 0b1, 0b000011
		case SDIVW:
			sf, opcode = 0b0, 0b000011
		case UDIV:
			sf, opcode = 0b1, 0b000010
		case UDIVW:
			sf, opcode = 0b0, 0b000010
		}

		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			opcode<<2|(dstRegBits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)

	case SCVTFD, SCVTFWD, SCVTFS, SCVTFWS, UCVTFD, UCVTFS, UCVTFWD, UCVTFWS:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, false); err != nil {
			return
		}

		// "Conversion between floating-point and integer" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var sf, tp, opcode byte
		switch inst {
		case SCVTFD: // Signed 64-bit integer to double.
			sf, tp, opcode = 0b1, 0b01, 0b010
		case SCVTFWD: // Signed 32-bit integer to double.
			sf, tp, opcode = 0b0, 0b01, 0b010
		case SCVTFS: // Signed 64-bit integer to single.
			sf, tp, opcode = 0b1, 0b00, 0b010
		case SCVTFWS: // Signed 32-bit integer to single.
			sf, tp, opcode = 0b0, 0b00, 0b010
		case UCVTFD: // Unsigned 64-bit integer to double.
			sf, tp, opcode = 0b1, 0b01, 0b011
		case UCVTFWD: // Unsigned 32-bit integer to double.
			sf, tp, opcode = 0b0, 0b01, 0b011
		case UCVTFS: // Unsigned 64-bit integer to single.
			sf, tp, opcode = 0b1, 0b00, 0b011
		case UCVTFWS: // Unsigned 32-bit integer to single.
			sf, tp, opcode = 0b0, 0b00, 0b011
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			srcRegBits>>3,
			tp<<6|0b00_1_00_000|opcode,
			sf<<7|0b0_0_0_11110,
		)

	case SXTB, SXTBW, SXTH, SXTHW, SXTW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		if n.srcReg == RegRZR {
			// If the source is zero register, we encode as MOV dst, zero.
			var sf byte
			if inst == MOVD {
				sf = 0b1
			}
			buf.Append4Bytes(
				(zeroRegisterBits<<5)|dstRegBits,
				zeroRegisterBits>>3,
				0b000_00000|srcRegBits,
				sf<<7|0b0_01_01010,
			)
			return
		}

		// SXTB is encoded as "SBFM Wd, Wn, #0, #7"
		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTB
		// SXTH is encoded as "SBFM Wd, Wn, #0, #15"
		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTH
		// SXTW is encoded as "SBFM Xd, Xn, #0, #31"
		// https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/SXTW

		var n, sf, imms, opc byte
		switch inst {
		case SXTB:
			n, sf, imms = 0b1, 0b1, 0x7
		case SXTBW:
			n, sf, imms = 0b0, 0b0, 0x7
		case SXTH:
			n, sf, imms = 0b1, 0b1, 0xf
		case SXTHW:
			n, sf, imms = 0b0, 0b0, 0xf
		case SXTW:
			n, sf, imms = 0b1, 0b1, 0x1f
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			imms<<2|(srcRegBits>>3),
			n<<6,
			sf<<7|opc<<5|0b10011,
		)
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func (a *AssemblerImpl) encodeLeftShiftedRegisterToRegister(buf asm.Buffer, n *nodeImpl) error {
	baseRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}
	shiftTargetRegBits, err := intRegisterBits(n.srcReg2)
	if err != nil {
		return err
	}
	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	switch n.instruction {
	case ADD:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		const logicalLeftShiftBits = 0b00
		if n.srcConst < 0 || n.srcConst > 64 {
			return fmt.Errorf("shift amount must fit in unsigned 6-bit integer (0-64) but got %d", n.srcConst)
		}
		shiftByte := byte(n.srcConst)
		buf.Append4Bytes(
			(baseRegBits<<5)|dstRegBits,
			(shiftByte<<2)|(baseRegBits>>3),
			(logicalLeftShiftBits<<6)|shiftTargetRegBits,
			0b1000_1011,
		)
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeTwoRegistersToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	switch inst := n.instruction; inst {
	case AND, ANDW, ORR, ORRW, EOR, EORW:
		// See "Logical (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
		var sf, opc byte
		switch inst {
		case AND:
			sf, opc = 0b1, 0b00
		case ANDW:
			sf, opc = 0b0, 0b00
		case ORR:
			sf, opc = 0b1, 0b01
		case ORRW:
			sf, opc = 0b0, 0b01
		case EOR:
			sf, opc = 0b1, 0b10
		case EORW:
			sf, opc = 0b0, 0b10
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			srcReg2Bits>>3,
			srcRegBits,
			sf<<7|opc<<5|0b01010,
		)
	case ASR, ASRW, LSL, LSLW, LSR, LSRW, ROR, RORW:
		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		var sf, opcode byte
		switch inst {
		case ASR:
			sf, opcode = 0b1, 0b001010
		case ASRW:
			sf, opcode = 0b0, 0b001010
		case LSL:
			sf, opcode = 0b1, 0b001000
		case LSLW:
			sf, opcode = 0b0, 0b001000
		case LSR:
			sf, opcode = 0b1, 0b001001
		case LSRW:
			sf, opcode = 0b0, 0b001001
		case ROR:
			sf, opcode = 0b1, 0b001011
		case RORW:
			sf, opcode = 0b0, 0b001011
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			opcode<<2|(srcReg2Bits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)
	case SDIV, SDIVW, UDIV, UDIVW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf, opcode byte
		switch inst {
		case SDIV:
			sf, opcode = 0b1, 0b000011
		case SDIVW:
			sf, opcode = 0b0, 0b000011
		case UDIV:
			sf, opcode = 0b1, 0b000010
		case UDIVW:
			sf, opcode = 0b0, 0b000010
		}

		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			opcode<<2|(srcReg2Bits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)
	case SUB, SUBW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Add/subtract (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf byte
		if inst == SUB {
			sf = 0b1
		}

		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			srcReg2Bits>>3,
			srcRegBits,
			sf<<7|0b0_10_01011,
		)
	case FSUBD, FSUBS:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Floating-point data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		var tp byte
		if inst == FSUBD {
			tp = 0b01
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			0b0011_10_00|(srcReg2Bits>>3),
			tp<<6|0b00_1_00000|srcRegBits,
			0b0_00_11110,
		)
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func (a *AssemblerImpl) encodeThreeRegistersToRegister(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case MSUB, MSUBW:
		// Dst = Src2 - (Src1 * Src3)
		// "Data-processing (3 source)" in:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}
		src3RegBits, err := intRegisterBits(n.dstReg)
		if err != nil {
			return err
		}
		dstRegBits, err := intRegisterBits(n.dstReg2)
		if err != nil {
			return err
		}

		var sf byte // is zero for MSUBW (32-bit MSUB).
		if n.instruction == MSUB {
			sf = 0b1
		}

		buf.Append4Bytes(
			(src3RegBits<<5)|dstRegBits,
			0b1_0000000|(src2RegBits<<2)|(src3RegBits>>3),
			src1RegBits,
			sf<<7|0b00_11011,
		)
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeTwoRegistersToNone(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case CMPW, CMP:
		// Comparing two registers is an alias of SUBS with the zero register as the destination,
		// which can be encoded as SUBS (shifted register) with zero shift.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}

		var op byte
		if n.instruction == CMP {
			op = 0b111
		} else {
			op = 0b011
		}

buf.Append4Bytes(
|
|
(src2RegBits<<5)|zeroRegisterBits,
|
|
src2RegBits>>3,
|
|
src1RegBits,
|
|
0b01011|(op<<5),
|
|
)
|
|
return nil
|
|
case FCMPS, FCMPD:
|
|
// "Floating-point compare" section in:
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
|
|
src1RegBits, err := vectorRegisterBits(n.srcReg)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
src2RegBits, err := vectorRegisterBits(n.srcReg2)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var ftype byte // is zero for FCMPS (single precision float compare).
|
|
if n.instruction == FCMPD {
|
|
ftype = 0b01
|
|
}
|
|
buf.Append4Bytes(
|
|
src2RegBits<<5,
|
|
0b001000_00|(src2RegBits>>3),
|
|
ftype<<6|0b1_00000|src1RegBits,
|
|
0b000_11110,
|
|
)
|
|
return nil
|
|
default:
|
|
return errorEncodingUnsupported(n)
|
|
}
|
|
}
|
|
|
|
func (a *AssemblerImpl) encodeRegisterAndConstToNone(buf asm.Buffer, n *nodeImpl) error {
	if n.instruction != CMP {
		return errorEncodingUnsupported(n)
	}

	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en
	if n.srcConst < 0 || n.srcConst > 4095 {
		return fmt.Errorf("immediate for CMP must fit in 0 to 4095 but got %d", n.srcConst)
	} else if n.srcReg == RegRZR {
		return errors.New("zero register is not supported for CMP (immediate)")
	}

	srcRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	buf.Append4Bytes(
		(srcRegBits<<5)|zeroRegisterBits,
		(byte(n.srcConst)<<2)|(srcRegBits>>3),
		byte(n.srcConst>>6),
		0b111_10001,
	)
	return nil
}

func fitInSigned9Bits(v int64) bool {
	return v >= -256 && v <= 255
}

func (a *AssemblerImpl) encodeLoadOrStoreWithRegisterOffset(
	buf asm.Buffer, baseRegBits, offsetRegBits, targetRegBits byte, opcode, size, v byte,
) {
	// See "Load/store register (register offset)".
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
	buf.Append4Bytes(
		(baseRegBits<<5)|targetRegBits,
		0b011_010_00|(baseRegBits>>3),
		opcode<<6|0b00_1_00000|offsetRegBits,
		size<<6|v<<2|0b00_111_0_00,
	)
}

// validateMemoryOffset validates that the given memory offset can be encoded by this assembler.
// In theory, the offset could be arbitrary, but for the simplicity of our homemade assembler,
// we limit it to the range that is sufficient to support the compiler.
func validateMemoryOffset(offset int64) error {
	if offset > 255 && offset%4 != 0 {
		// This is because we only have large offsets for load/store with the Wasm value stack or reading type IDs, and the offset
		// is always multiplied by 4 or 8 (== the size of uint32 or uint64 == the type of wasm.FunctionTypeID or value stack in Go).
		return fmt.Errorf("large memory offset (>255) must be a multiple of 4 but got %d", offset)
	} else if offset < -256 { // 9-bit signed integer's minimum = -2^8.
		return fmt.Errorf("negative memory offset must be larger than or equal to -256 but got %d", offset)
	} else if offset > 1<<31-1 {
		return fmt.Errorf("large memory offset must be less than %d but got %d", 1<<31-1, offset)
	} else {
		return nil
	}
}

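// exampleValidateMemoryOffset demonstrates the accepted offset ranges above.
// Illustrative sketch only; not used by the assembler itself.
func exampleValidateMemoryOffset() {
	fmt.Println(validateMemoryOffset(100))  // nil: small offsets need no alignment.
	fmt.Println(validateMemoryOffset(1000)) // nil: large, but a multiple of 4.
	fmt.Println(validateMemoryOffset(1001)) // error: large offsets must be multiples of 4.
	fmt.Println(validateMemoryOffset(-300)) // error: below the signed 9-bit minimum of -256.
}
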
// encodeLoadOrStoreWithConstOffset encodes load/store instructions with the constant offset.
//
// Note: Encoding strategy intentionally matches the Go assembler: https://go.dev/doc/asm
func (a *AssemblerImpl) encodeLoadOrStoreWithConstOffset(
	buf asm.Buffer,
	baseRegBits, targetRegBits byte,
	offset int64,
	opcode, size, v byte,
	datasize, datasizeLog2 int64,
) (err error) {
	if err = validateMemoryOffset(offset); err != nil {
		return
	}

	if fitInSigned9Bits(offset) {
		// See "LDAPR/STLR (unscaled immediate)"
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldapstl_unscaled
		if offset < 0 || offset%datasize != 0 {
			// This case is encoded as one "unscaled signed store".
			buf.Append4Bytes(
				(baseRegBits<<5)|targetRegBits,
				byte(offset<<4)|(baseRegBits>>3),
				opcode<<6|(0b00_00_11111&byte(offset>>4)),
				size<<6|v<<2|0b00_1_11_0_00,
			)
			return
		}
	}

	// At this point, we know the offset is positive.
	// If it is also a multiple of datasize, then it can be encoded as a single "unsigned immediate".
	if offset%datasize == 0 &&
		offset < (1<<12)<<datasizeLog2 {
		m := offset / datasize
		buf.Append4Bytes(
			(baseRegBits<<5)|targetRegBits,
			(byte(m<<2))|(baseRegBits>>3),
			opcode<<6|0b00_111111&byte(m>>6),
			size<<6|v<<2|0b00_1_11_0_01,
		)
		return
	}

	// Otherwise, we need multiple instructions.
	tmpRegBits := registerBits(a.temporaryRegister)
	offset32 := int32(offset)

	// Go's assembler adds a const into the const pool at this point,
	// regardless of its usage; e.g. if we enter the then block of the following if statement,
	// the const is not used but it is added into the const pool.
	c := asm.NewStaticConst(make([]byte, 4))
	binary.LittleEndian.PutUint32(c.Raw, uint32(offset))
	a.pool.AddConst(c, uint64(buf.Len()))

	// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3529-L3532
	// If the offset is within 24-bits, we can load it with two ADD instructions.
	hi := offset32 - (offset32 & (0xfff << uint(datasizeLog2)))
	if hi&^0xfff000 == 0 {
		var sfops byte = 0b100
		m := ((offset32 - hi) >> datasizeLog2) & 0xfff
		hi >>= 12

		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3534-L3535
		buf.Append4Bytes(
			(baseRegBits<<5)|tmpRegBits,
			(byte(hi)<<2)|(baseRegBits>>3),
			0b01<<6 /* shift by 12 */ |byte(hi>>6),
			sfops<<5|0b10001,
		)

		buf.Append4Bytes(
			(tmpRegBits<<5)|targetRegBits,
			(byte(m<<2))|(tmpRegBits>>3),
			opcode<<6|0b00_111111&byte(m>>6),
			size<<6|v<<2|0b00_1_11_0_01,
		)
	} else {
		// In this case, we load the const via ldr(literal) into the temporary register,
		// and the target const is placed after this instruction below.
		loadLiteralOffsetInBinary := uint64(buf.Len())

		// First we emit the ldr(literal) with offset zero as we don't yet know the const's placement in the binary.
		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--literal---Load-Register--literal--
		buf.Append4Bytes(tmpRegBits, 0x0, 0x0, 0b00_011_0_00)

		// Set the callback for the constant so that we can set the proper offset once it is finalized.
		c.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
			// ldr(literal) encodes the offset divided by 4.
			offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
			bin := buf.Bytes()
			bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
			bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
			bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
		})

		// Then, load the constant with the register offset.
		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--register---Load-Register--register--
		buf.Append4Bytes(
			(baseRegBits<<5)|targetRegBits,
			0b011_010_00|(baseRegBits>>3),
			opcode<<6|0b00_1_00000|tmpRegBits,
			size<<6|v<<2|0b00_111_0_00,
		)
	}
	return
}

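// classifyConstOffset is an illustrative sketch (not used by the assembler) that
// mirrors the branch structure of encodeLoadOrStoreWithConstOffset. For example,
// with datasize=8: offset 17 takes the unscaled path, 16 the scaled-immediate
// path, and 1<<20 the multi-instruction path.
func classifyConstOffset(offset, datasize, datasizeLog2 int64) string {
	if fitInSigned9Bits(offset) && (offset < 0 || offset%datasize != 0) {
		return "unscaled signed 9-bit immediate"
	}
	if offset%datasize == 0 && offset < (1<<12)<<datasizeLog2 {
		return "scaled unsigned 12-bit immediate"
	}
	return "multi-instruction (two ADDs or LDR literal)"
}
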
func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
	var (
		size, v                byte
		datasize, datasizeLog2 int64
		isTargetFloat          bool
	)
	switch n.instruction {
	case STRD:
		size, v, datasize, datasizeLog2 = 0b11, 0x0, 8, 3
	case STRW:
		size, v, datasize, datasizeLog2 = 0b10, 0x0, 4, 2
	case STRH:
		size, v, datasize, datasizeLog2 = 0b01, 0x0, 2, 1
	case STRB:
		size, v, datasize, datasizeLog2 = 0b00, 0x0, 1, 0
	case FSTRD:
		size, v, datasize, datasizeLog2, isTargetFloat = 0b11, 0x1, 8, 3, true
	case FSTRS:
		size, v, datasize, datasizeLog2, isTargetFloat = 0b10, 0x1, 4, 2, true
	default:
		return errorEncodingUnsupported(n)
	}

	var srcRegBits byte
	if isTargetFloat {
		srcRegBits, err = vectorRegisterBits(n.srcReg)
	} else {
		srcRegBits, err = intRegisterBits(n.srcReg)
	}
	if err != nil {
		return
	}

	baseRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	const opcode = 0x00 // opcode for store instructions.
	if n.dstReg2 != asm.NilRegister {
		offsetRegBits, err := intRegisterBits(n.dstReg2)
		if err != nil {
			return err
		}
		a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, srcRegBits, opcode, size, v)
	} else {
		err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, srcRegBits, n.dstConst, opcode, size, v, datasize, datasizeLog2)
	}
	return
}

func (a *AssemblerImpl) encodeADR(buf asm.Buffer, n *nodeImpl) (err error) {
	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	adrInstructionOffsetInBinary := uint64(buf.Len())

	// At this point, we don't yet know the target offset to read from,
	// so we emit the ADR instruction with 0 offset, and replace it later in the callback.
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
	buf.Append4Bytes(dstRegBits, 0x0, 0x0, 0b10000)

	// In this case, the ADR's target offset is the staticConst's initial address.
	if sc := n.staticConst; sc != nil {
		a.pool.AddConst(sc, adrInstructionOffsetInBinary)
		sc.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
			adrInstructionBytes := buf.Bytes()[adrInstructionOffsetInBinary : adrInstructionOffsetInBinary+4]
			offset := int(offsetOfConst) - int(adrInstructionOffsetInBinary)

			// See https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
			adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
			offset >>= 2
			adrInstructionBytes[0] |= byte(offset << 5)
			offset >>= 3
			adrInstructionBytes[1] |= byte(offset)
			offset >>= 8
			adrInstructionBytes[2] |= byte(offset)
		})
		return
	} else {
		a.adrInstructionNodes = append(a.adrInstructionNodes, n)
	}
	return
}

func (a *AssemblerImpl) finalizeADRInstructionNode(code []byte, n *nodeImpl) (err error) {
	// Find the target instruction node.
	targetNode := n
	for ; targetNode != nil; targetNode = targetNode.next {
		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
			targetNode = targetNode.next
			break
		}
	}

	if targetNode == nil {
		return fmt.Errorf("BUG: target instruction %s not found for ADR", InstructionName(n.readInstructionAddressBeforeTargetInstruction))
	}

	offset := targetNode.OffsetInBinary() - n.OffsetInBinary()
	if i64 := int64(offset); i64 >= 1<<20 || i64 < -1<<20 {
		// We could support offsets over the 20-bit range by special-casing them here,
		// but the 20-bit range should be enough for our implementation. If the necessity comes up,
		// we could add the special casing here to support arbitrarily large offsets.
		return fmt.Errorf("BUG: too large offset for ADR: %#x", offset)
	}

	adrInstructionBytes := code[n.OffsetInBinary() : n.OffsetInBinary()+4]
	// According to the binary format of the ADR instruction:
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
	adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
	offset >>= 2
	adrInstructionBytes[0] |= byte(offset << 5)
	offset >>= 3
	adrInstructionBytes[1] |= byte(offset)
	offset >>= 8
	adrInstructionBytes[2] |= byte(offset)
	return nil
}

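// exampleSplitADROffset is an illustrative sketch (not used by the assembler) of
// the bit layout patched by the two routines above: ADR carries a 21-bit byte
// offset split into immlo (instruction bits 30:29) and immhi (bits 23:5).
func exampleSplitADROffset(offset int32) (immlo, immhi int32) {
	immlo = offset & 0b11 // lowest two bits.
	immhi = offset >> 2   // remaining 19 bits.
	return
}
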
func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
	var (
		size, v, opcode        byte
		datasize, datasizeLog2 int64
		isTargetFloat          bool
	)
	switch n.instruction {
	case ADR:
		return a.encodeADR(buf, n)
	case FLDRD:
		size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b11, 0x1, 8, 3, 0b01, true
	case FLDRS:
		size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b10, 0x1, 4, 2, 0b01, true
	case LDRD:
		size, v, datasize, datasizeLog2, opcode = 0b11, 0x0, 8, 3, 0b01
	case LDRW:
		size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b01
	case LDRSHD:
		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b10
	case LDRSHW:
		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b11
	case LDRH:
		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b01
	case LDRSBD:
		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b10
	case LDRSBW:
		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b11
	case LDRB:
		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b01
	case LDRSW:
		size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b10
	default:
		return errorEncodingUnsupported(n)
	}

	var dstRegBits byte
	if isTargetFloat {
		dstRegBits, err = vectorRegisterBits(n.dstReg)
	} else {
		dstRegBits, err = intRegisterBits(n.dstReg)
	}
	if err != nil {
		return
	}
	baseRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	if n.srcReg2 != asm.NilRegister {
		offsetRegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}
		a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, dstRegBits, opcode,
			size, v)
	} else {
		err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, dstRegBits, n.srcConst, opcode,
			size, v, datasize, datasizeLog2)
	}
	return
}

// const16bitAligned checks if the value fits within a single 16-bit-aligned chunk of a 64-bit word.
// If so, it returns the shift amount divided by 16; otherwise, -1.
func const16bitAligned(v int64) (ret int) {
	ret = -1
	for s := 0; s < 64; s += 16 {
		if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
			ret = s / 16
			break
		}
	}
	return
}

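// exampleConst16bitAligned demonstrates the helper above on a few constants.
// Illustrative sketch only; not used by the assembler.
func exampleConst16bitAligned() {
	fmt.Println(const16bitAligned(0xffff))     // 0: fits in bits [0, 16).
	fmt.Println(const16bitAligned(0xffff0000)) // 1: fits in bits [16, 32).
	fmt.Println(const16bitAligned(0x12345))    // -1: spans two 16-bit chunks.
}
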
// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
//
// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
//
// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
func isBitMaskImmediate(x uint64) bool {
	// All zeros and all ones are not "bitmask immediate" by definition.
	if x == 0 || x == 0xffff_ffff_ffff_ffff {
		return false
	}

	switch {
	case x != x>>32|x<<32:
		// e = 64
	case x != x>>16|x<<48:
		// e = 32 (x == x>>32|x<<32).
		// e.g. 0x00ff_ff00_00ff_ff00
		x = uint64(int32(x))
	case x != x>>8|x<<56:
		// e = 16 (x == x>>16|x<<48).
		// e.g. 0x00ff_00ff_00ff_00ff
		x = uint64(int16(x))
	case x != x>>4|x<<60:
		// e = 8 (x == x>>8|x<<56).
		// e.g. 0x0f0f_0f0f_0f0f_0f0f
		x = uint64(int8(x))
	default:
		// e = 4 or 2.
		return true
	}
	return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
}

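// exampleIsBitMaskImmediate demonstrates a few representative inputs.
// Illustrative sketch only; not used by the assembler.
func exampleIsBitMaskImmediate() {
	fmt.Println(isBitMaskImmediate(0xff))               // true: one run of 8 ones (e = 64).
	fmt.Println(isBitMaskImmediate(0x0f0f0f0f0f0f0f0f)) // true: repeating 8-bit element 0b00001111.
	fmt.Println(isBitMaskImmediate(0b10100001))         // false: the set bits are not one contiguous (rotated) run.
	fmt.Println(isBitMaskImmediate(0))                  // false by definition.
}
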
// sequenceOfSetbits returns true if the number's binary representation is a single contiguous sequence of set bits (1s).
// For example: 0b1110 -> true, 0b1010 -> false
func sequenceOfSetbits(x uint64) bool {
	y := getLowestBit(x)
	// If x is a sequence of set bits, this should result in a number
	// with only one set bit (i.e. a power of two).
	y += x
	return (y-1)&y == 0
}

func getLowestBit(x uint64) uint64 {
	// See https://stackoverflow.com/questions/12247186/find-the-lowest-set-bit
	return x & (^x + 1)
}

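// exampleSequenceOfSetbits walks through the helper pair above.
// Illustrative sketch only; not used by the assembler.
func exampleSequenceOfSetbits() {
	fmt.Println(getLowestBit(0b00111000))      // 8 (0b1000): the lowest set bit.
	fmt.Println(sequenceOfSetbits(0b00111000)) // true: 0b111000 + 0b1000 = 0b1000000, a power of two.
	fmt.Println(sequenceOfSetbits(0b00101000)) // false: 0b101000 + 0b1000 = 0b110000, two bits set.
}
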
func (a *AssemblerImpl) addOrSub64BitRegisters(buf asm.Buffer, sfops byte, sp bool, dstRegBits, src1RegBits, src2RegBits byte) {
	// dstReg = src1Reg +/- src2Reg
	if sp {
		// With the stack pointer involved, the extended-register form is used:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--extended-register---Add--extended-register--?lang=en
		buf.Append4Bytes(
			(src1RegBits<<5)|dstRegBits,
			0b011<<5|src1RegBits>>3,
			1<<5|src2RegBits,
			sfops<<5|0b01011,
		)
	} else {
		buf.Append4Bytes(
			(src1RegBits<<5)|dstRegBits,
			src1RegBits>>3,
			src2RegBits,
			sfops<<5|0b01011,
		)
	}
}

// bitmaskImmediate encodes the bitmask immediate c into the (immr, imms, N) fields of
// "Logical (immediate)" instructions. The value must already satisfy isBitMaskImmediate.
func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) {
	var size uint32
	switch {
	case c != c>>32|c<<32:
		size = 64
	case c != c>>16|c<<48:
		size = 32
		c = uint64(int32(c))
	case c != c>>8|c<<56:
		size = 16
		c = uint64(int16(c))
	case c != c>>4|c<<60:
		size = 8
		c = uint64(int8(c))
	case c != c>>2|c<<62:
		size = 4
		c = uint64(int64(c<<60) >> 60)
	default:
		size = 2
		c = uint64(int64(c<<62) >> 62)
	}

	neg := false
	if int64(c) < 0 {
		c = ^c
		neg = true
	}

	onesSize, nonZeroPos := getOnesSequenceSize(c)
	if neg {
		nonZeroPos = onesSize + nonZeroPos
		onesSize = size - onesSize
	}

	// N is set only for a 64-bit element pattern (as in loadConstViaBitMaskImmediate below).
	var mode byte = 32
	if is64bit && size == 64 {
		N, mode = 0b1, 64
	}

	immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
	imms = byte((onesSize - 1) | 63&^(size<<1-1))
	return
}

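// exampleBitmaskImmediate works through one encoding: 0xff, viewed as a 64-bit
// element with a run of eight ones starting at bit 0, yields N=1, immr=0,
// imms=0b000111. Illustrative sketch only; not used by the assembler.
func exampleBitmaskImmediate() {
	immr, imms, n := bitmaskImmediate(0xff, true)
	fmt.Printf("N=%d immr=%d imms=%d\n", n, immr, imms) // N=1 immr=0 imms=7 (0b000111)
}
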
func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	// Alias for readability.
	c := n.srcConst

	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	// See "Logical (immediate)" in
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate
	switch n.instruction {
	case ANDIMM32:
		var sf, opc byte = 0b0, 0b00
		if !isBitMaskImmediate(uint64(c)) {
			err = fmt.Errorf("const %d must be valid bitmask immediate for %s", c, InstructionName(ANDIMM32))
			return
		}
		immr, imms, N := bitmaskImmediate(uint64(c), false)
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			imms<<2|dstRegBits>>3,
			N<<6|immr,
			sf<<7|opc<<5|0b10010,
		)
		return
	case ANDIMM64:
		var sf, opc byte = 0b1, 0b00
		if !isBitMaskImmediate(uint64(c)) {
			err = fmt.Errorf("const %d must be valid bitmask immediate for %s", c, InstructionName(ANDIMM64))
			return
		}
		immr, imms, N := bitmaskImmediate(uint64(c), true)
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			imms<<2|dstRegBits>>3,
			N<<6|immr,
			sf<<7|opc<<5|0b10010,
		)
		return
	}

	switch inst := n.instruction; inst {
	case ADD, ADDS, SUB, SUBS:
		srcRegBits := dstRegBits
		if n.srcReg != asm.NilRegister {
			srcRegBits, err = intRegisterBits(n.srcReg)
			if err != nil {
				return err
			}
		}

		var sfops byte
		if inst == ADD {
			sfops = 0b100
		} else if inst == ADDS {
			sfops = 0b101
		} else if inst == SUB {
			sfops = 0b110
		} else if inst == SUBS {
			sfops = 0b111
		}

		isSP := n.srcReg == RegSP || n.dstReg == RegSP
		if c == 0 {
			// If the constant equals zero, we encode it as ADD (register) with the zero register.
			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, zeroRegisterBits)
			return
		}

		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
			// If the const can be represented as "imm12" or "imm12 << 12": one instruction
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L2992

			if c <= 0xfff {
				buf.Append4Bytes(
					(srcRegBits<<5)|dstRegBits,
					(byte(c)<<2)|(srcRegBits>>3),
					byte(c>>6),
					sfops<<5|0b10001,
				)
			} else {
				c >>= 12
				buf.Append4Bytes(
					(srcRegBits<<5)|dstRegBits,
					(byte(c)<<2)|(srcRegBits>>3),
					0b01<<6 /* shift by 12 */ |byte(c>>6),
					sfops<<5|0b10001,
				)
			}
			return
		}

		if t := const16bitAligned(c); t >= 0 {
			// If the const fits within a single 16-bit-aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
			// we can load it into the temporary register with a single MOVZ.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
			tmpRegBits := registerBits(a.temporaryRegister)

			// MOVZ $c, tmpReg with shifting.
			a.load16bitAlignedConst(buf, c>>(16*t), byte(t), tmpRegBits, false, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
			return
		} else if t := const16bitAligned(^c); t >= 0 {
			// Also, if the inverse of the const fits within a single 16-bit-aligned chunk, do the same with MOVN.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
			tmpRegBits := registerBits(a.temporaryRegister)

			// MOVN $c, tmpReg with shifting.
			a.load16bitAlignedConst(buf, ^c>>(16*t), byte(t), tmpRegBits, true, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
			return
		}

		if uc := uint64(c); isBitMaskImmediate(uc) {
			// If the const can be represented as "bitmask immediate", we load it via ORR into the temp register.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6570-L6583
			tmpRegBits := registerBits(a.temporaryRegister)
			// ORR $c, tmpReg
			a.loadConstViaBitMaskImmediate(buf, uc, tmpRegBits, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
			return
		}

		// If the value fits within 24 bits, then we emit two ADD/SUB instructions.
		if 0 <= c && c <= 0xffffff && inst != SUBS && inst != ADDS {
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3849-L3862
			buf.Append4Bytes(
				(dstRegBits<<5)|dstRegBits,
				(byte(c)<<2)|(dstRegBits>>3),
				byte(c&0xfff>>6),
				sfops<<5|0b10001,
			)
			c = c >> 12
			buf.Append4Bytes(
				(dstRegBits<<5)|dstRegBits,
				(byte(c)<<2)|(dstRegBits>>3),
				0b01_000000 /* shift by 12 */ |byte(c>>6),
				sfops<<5|0b10001,
			)
			return
		}

		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3163-L3203
		// Otherwise, we use a MOVZ/MOVN plus MOVKs sequence to load the const into the temporary register.
		tmpRegBits := registerBits(a.temporaryRegister)
		a.load64bitConst(buf, c, tmpRegBits)
		a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
	case MOVW:
		if c == 0 {
			// MOVW $0 is encoded as "MOV dst, WZR", i.e. ORR (shifted register) with the zero register.
			buf.Append4Bytes(
				(zeroRegisterBits<<5)|dstRegBits,
				zeroRegisterBits>>3,
				0b000_00000|zeroRegisterBits,
				0b0_01_01010,
			)
			return
		}

		// Following the logic here:
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
		c32 := uint32(c)
		ic := int64(c32)
		if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
			if isBitMaskImmediate(uint64(c)) {
				a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false)
				return
			}
		}

		if t := const16bitAligned(int64(c32)); t >= 0 {
			// If the const fits within a single 16-bit-aligned chunk, for example, 0xffff or 0xffff_0000,
			// we can load it into the destination register with a single MOVZ.
			a.load16bitAlignedConst(buf, int64(c32)>>(16*t), byte(t), dstRegBits, false, false)
		} else if t := const16bitAligned(int64(^c32)); t >= 0 {
			// Also, if the inverse of the const fits within a single 16-bit-aligned chunk, do the same with MOVN.
			a.load16bitAlignedConst(buf, int64(^c32)>>(16*t), byte(t), dstRegBits, true, false)
		} else if isBitMaskImmediate(uint64(c)) {
			a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false)
		} else {
			// Otherwise, we use MOVZ and MOVK to load it.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6623-L6630
			c16 := uint16(c32)
			// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
			buf.Append4Bytes(
				(byte(c16)<<5)|dstRegBits,
				byte(c16>>3),
				1<<7|byte(c16>>11),
				0b0_10_10010,
			)
			// MOVK: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
			c16 = uint16(c32 >> 16)
			if c16 != 0 {
				buf.Append4Bytes(
					(byte(c16)<<5)|dstRegBits,
					byte(c16>>3),
					1<<7|0b0_01_00000 /* shift by 16 */ |byte(c16>>11),
					0b0_11_10010,
				)
			}
		}
	case MOVD:
		// Following the logic here:
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
			if isBitMaskImmediate(uint64(c)) {
				a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true)
				return
			}
		}

		if t := const16bitAligned(c); t >= 0 {
			// If the const fits within a single 16-bit-aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
			// we can load it into the destination register with a single MOVZ.
			a.load16bitAlignedConst(buf, c>>(16*t), byte(t), dstRegBits, false, true)
		} else if t := const16bitAligned(^c); t >= 0 {
			// Also, if the inverse of the const fits within a single 16-bit-aligned chunk, do the same with MOVN.
			a.load16bitAlignedConst(buf, (^c)>>(16*t), byte(t), dstRegBits, true, true)
		} else if isBitMaskImmediate(uint64(c)) {
			a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true)
		} else {
			a.load64bitConst(buf, c, dstRegBits)
		}
	case LSR:
		if c == 0 {
			err = errors.New("LSR with zero constant should be optimized out")
			return
		} else if c < 0 || c > 63 {
			err = fmt.Errorf("LSR requires immediate to be within 0 to 63, but got %d", c)
			return
		}

		// LSR(immediate) is an alias of UBFM.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			0b111111_00|dstRegBits>>3,
			0b01_000000|byte(c),
			0b110_10011,
		)
	case LSL:
		if c == 0 {
			err = errors.New("LSL with zero constant should be optimized out")
			return
		} else if c < 0 || c > 63 {
			err = fmt.Errorf("LSL requires immediate to be within 0 to 63, but got %d", c)
			return
		}

		// LSL(immediate) is an alias of UBFM.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-
		cb := byte(c)
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			(0b111111-cb)<<2|dstRegBits>>3,
			0b01_000000|(64-cb),
			0b110_10011,
		)

	default:
		return errorEncodingUnsupported(n)
	}
	return
}

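// exampleAddConstStrategies sketches the strategy order used above for ADD with
// a constant (illustrative only; it ignores the ADDS/SUBS restriction on the
// two-instruction path and the stack-pointer special case).
func exampleAddConstStrategies(c int64) string {
	switch {
	case c == 0:
		return "ADD (register) with the zero register"
	case c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && uint64(c>>12) <= 0xfff):
		return "single ADD (immediate), optionally shifted by 12"
	case const16bitAligned(c) >= 0 || const16bitAligned(^c) >= 0:
		return "MOVZ/MOVN into the temporary register, then ADD (register)"
	case isBitMaskImmediate(uint64(c)):
		return "ORR (bitmask immediate) into the temporary register, then ADD (register)"
	case 0 <= c && c <= 0xffffff:
		return "two ADD (immediate) instructions"
	default:
		return "MOVZ+MOVK sequence into the temporary register, then ADD (register)"
	}
}
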
func (a *AssemblerImpl) movk(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
	buf.Append4Bytes(
		(byte(v)<<5)|dstRegBits,
		byte(v>>3),
		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
		0b1_11_10010,
	)
}

func (a *AssemblerImpl) movz(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
	buf.Append4Bytes(
		(byte(v)<<5)|dstRegBits,
		byte(v>>3),
		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
		0b1_10_10010,
	)
}

func (a *AssemblerImpl) movn(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
	buf.Append4Bytes(
		(byte(v)<<5)|dstRegBits,
		byte(v>>3),
		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
		0b1_00_10010,
	)
}

// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
// consts as in the Go assembler.
//
// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
func (a *AssemblerImpl) load64bitConst(buf asm.Buffer, c int64, dstRegBits byte) {
	var bits [4]uint64
	var zeros, negs int
	for i := 0; i < 4; i++ {
		bits[i] = uint64((c >> uint(i*16)) & 0xffff)
		if v := bits[i]; v == 0 {
			zeros++
		} else if v == 0xffff {
			negs++
		}
	}

	if zeros == 3 {
		// one MOVZ instruction.
		for i, v := range bits {
			if v != 0 {
				a.movz(buf, v, i, dstRegBits)
			}
		}
	} else if negs == 3 {
		// one MOVN instruction.
		for i, v := range bits {
			if v != 0xffff {
				v = ^v
				a.movn(buf, v, i, dstRegBits)
			}
		}
	} else if zeros == 2 {
		// one MOVZ then one MOVK.
		var movz bool
		for i, v := range bits {
			if !movz && v != 0 { // MOVZ.
				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
				a.movz(buf, v, i, dstRegBits)
				movz = true
			} else if v != 0 {
				a.movk(buf, v, i, dstRegBits)
			}
		}
	} else if negs == 2 {
		// one MOVN then one or two MOVK.
		var movn bool
		for i, v := range bits { // Emit MOVN.
			if !movn && v != 0xffff {
				v = ^v
				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
				a.movn(buf, v, i, dstRegBits)
				movn = true
			} else if v != 0xffff {
				a.movk(buf, v, i, dstRegBits)
			}
		}
	} else if zeros == 1 {
		// one MOVZ then two MOVK.
		var movz bool
		for i, v := range bits {
			if !movz && v != 0 { // MOVZ.
				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
				a.movz(buf, v, i, dstRegBits)
				movz = true
			} else if v != 0 {
				a.movk(buf, v, i, dstRegBits)
			}
		}
	} else if negs == 1 {
		// one MOVN then two MOVK.
		var movn bool
		for i, v := range bits { // Emit MOVN.
			if !movn && v != 0xffff {
				v = ^v
				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
				a.movn(buf, v, i, dstRegBits)
				movn = true
			} else if v != 0xffff {
				a.movk(buf, v, i, dstRegBits)
			}
		}
	} else {
		// one MOVZ then three MOVK.
		var movz bool
		for i, v := range bits {
			if !movz && v != 0 { // MOVZ.
				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
				a.movz(buf, v, i, dstRegBits)
				movz = true
			} else if v != 0 {
				a.movk(buf, v, i, dstRegBits)
			}
		}
	}
}

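// exampleLoad64bitConstChunks shows how load64bitConst decomposes a constant:
// for 0x0000_1234_0000_5678, chunks 1 and 3 are zero, so one MOVZ (chunk 0)
// plus one MOVK (chunk 2, shift 32) suffice. Illustrative sketch only.
func exampleLoad64bitConstChunks() {
	const c int64 = 0x0000_1234_0000_5678
	for i := 0; i < 4; i++ {
		fmt.Printf("chunk %d: %#x\n", i, uint64(c>>(i*16))&0xffff)
	}
	// chunk 0: 0x5678 -> MOVZ
	// chunk 1: 0x0
	// chunk 2: 0x1234 -> MOVK, LSL #32
	// chunk 3: 0x0
}
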
func (a *AssemblerImpl) load16bitAlignedConst(buf asm.Buffer, c int64, shiftNum byte, regBits byte, reverse bool, dst64bit bool) {
	var lastByte byte
	if reverse {
		// MOVN: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
		lastByte = 0b0_00_10010
	} else {
		// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
		lastByte = 0b0_10_10010
	}
	if dst64bit {
		lastByte |= 0b1 << 7
	}
	buf.Append4Bytes(
		(byte(c)<<5)|regBits,
		byte(c>>3),
		1<<7|(shiftNum<<5)|byte(c>>11),
		lastByte,
	)
}

// loadConstViaBitMaskImmediate loads the constant with ORR (bitmask immediate).
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--?lang=en
func (a *AssemblerImpl) loadConstViaBitMaskImmediate(buf asm.Buffer, c uint64, regBits byte, dst64bit bool) {
	var size uint32
	switch {
	case c != c>>32|c<<32:
		size = 64
	case c != c>>16|c<<48:
		size = 32
		c = uint64(int32(c))
	case c != c>>8|c<<56:
		size = 16
		c = uint64(int16(c))
	case c != c>>4|c<<60:
		size = 8
		c = uint64(int8(c))
	case c != c>>2|c<<62:
		size = 4
		c = uint64(int64(c<<60) >> 60)
	default:
		size = 2
		c = uint64(int64(c<<62) >> 62)
	}

	neg := false
	if int64(c) < 0 {
		c = ^c
		neg = true
	}

	onesSize, nonZeroPos := getOnesSequenceSize(c)
	if neg {
		nonZeroPos = onesSize + nonZeroPos
		onesSize = size - onesSize
	}

	// See the following article for understanding the encoding.
	// https://dinfuehr.github.io/blog/encoding-of-immediate-values-on-aarch64/
	var n byte
	mode := 32
	if dst64bit && size == 64 {
		n = 0b1
		mode = 64
	}

	r := byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
	s := byte((onesSize - 1) | 63&^(size<<1-1))

	var sf byte
	if dst64bit {
		sf = 0b1
	}
	buf.Append4Bytes(
		(zeroRegisterBits<<5)|regBits,
		s<<2|(zeroRegisterBits>>3),
		n<<6|r,
		sf<<7|0b0_01_10010,
	)
}

func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) {
	// Take 0b00111000 for example:
	y := getLowestBit(x)               // = 0b00001000
	nonZeroPos = setBitPos(y)          // = 3
	size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b01000000) - 3 = 6 - 3 = 3
	return
}

// setBitPos returns the position of the highest set bit of x;
// for a power of two, this is log2(x).
func setBitPos(x uint64) (ret uint32) {
	for ; ; ret++ {
		if x == 0b1 {
			break
		}
		x = x >> 1
	}
	return
}

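// exampleGetOnesSequenceSize confirms the worked example in getOnesSequenceSize:
// 0b00111000 is a run of three ones starting at bit position 3.
// Illustrative sketch only; not used by the assembler.
func exampleGetOnesSequenceSize() {
	size, pos := getOnesSequenceSize(0b00111000)
	fmt.Println(size, pos) // 3 3
}
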
func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err error) {
	if arr == VectorArrangementNone {
		return nil
	}
	var valid bool
	switch arr {
	case VectorArrangement8B:
		valid = index < 8
	case VectorArrangement16B:
		valid = index < 16
	case VectorArrangement4H:
		valid = index < 4
	case VectorArrangement8H:
		valid = index < 8
	case VectorArrangement2S:
		valid = index < 2
	case VectorArrangement4S:
		valid = index < 4
	case VectorArrangement1D:
		valid = index < 1
	case VectorArrangement2D:
		valid = index < 2
	case VectorArrangementB:
		valid = index < 16
	case VectorArrangementH:
		valid = index < 8
	case VectorArrangementS:
		valid = index < 4
	case VectorArrangementD:
		valid = index < 2
	}
	if !valid {
		err = fmt.Errorf("invalid arrangement and index pair: %s[%d]", arr, index)
	}
	return
}

func (a *AssemblerImpl) encodeMemoryToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	srcBaseRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	switch n.instruction {
	case VMOV: // translated as LDR(immediate,SIMD&FP)
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset--?lang=en
		var size, opcode byte
		var dataSize, dataSizeLog2 int64
		switch n.vectorArrangement {
		case VectorArrangementB:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b01, 1, 0
		case VectorArrangementH:
			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b01, 2, 1
		case VectorArrangementS:
			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b01, 4, 2
		case VectorArrangementD:
			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b01, 8, 3
		case VectorArrangementQ:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b11, 16, 4
		}
		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos
		if n.srcReg2 != asm.NilRegister {
			offsetRegBits, err := intRegisterBits(n.srcReg2)
			if err != nil {
				return err
			}
			a.encodeLoadOrStoreWithRegisterOffset(buf, srcBaseRegBits, offsetRegBits, dstVectorRegBits, opcode, size, v)
		} else {
			err = a.encodeLoadOrStoreWithConstOffset(buf, srcBaseRegBits, dstVectorRegBits,
				n.srcConst, opcode, size, v, dataSize, dataSizeLog2)
		}
	case LD1R:
		if n.srcReg2 != asm.NilRegister || n.srcConst != 0 {
			return fmt.Errorf("offset for %s is not implemented", InstructionName(LD1R))
		}

		var size, q byte
		switch n.vectorArrangement {
		case VectorArrangement8B:
			size, q = 0b00, 0b0
		case VectorArrangement16B:
			size, q = 0b00, 0b1
		case VectorArrangement4H:
			size, q = 0b01, 0b0
		case VectorArrangement8H:
			size, q = 0b01, 0b1
		case VectorArrangement2S:
			size, q = 0b10, 0b0
		case VectorArrangement4S:
			size, q = 0b10, 0b1
		case VectorArrangement1D:
			size, q = 0b11, 0b0
		case VectorArrangement2D:
			size, q = 0b11, 0b1
		}

		// No-offset encoding.
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#iclass_as_post_index
		buf.Append4Bytes(
			(srcBaseRegBits<<5)|dstVectorRegBits,
			0b11_000000|size<<2|srcBaseRegBits>>3,
			0b01_000000,
			q<<6|0b1101,
		)
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func arrangementSizeQ(arr VectorArrangement) (size, q byte) {
	switch arr {
	case VectorArrangement8B:
		size, q = 0b00, 0
	case VectorArrangement16B:
		size, q = 0b00, 1
	case VectorArrangement4H:
		size, q = 0b01, 0
	case VectorArrangement8H:
		size, q = 0b01, 1
	case VectorArrangement2S:
		size, q = 0b10, 0
	case VectorArrangement4S:
		size, q = 0b10, 1
	case VectorArrangement1D:
		size, q = 0b11, 0
	case VectorArrangement2D:
		size, q = 0b11, 1
	}
	return
}

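// exampleArrangementSizeQ demonstrates the mapping above: e.g. 4S (four 32-bit
// lanes in a 128-bit register) encodes as size=0b10 with Q=1.
// Illustrative sketch only; not used by the assembler.
func exampleArrangementSizeQ() {
	size, q := arrangementSizeQ(VectorArrangement4S)
	fmt.Println(size, q) // 2 1
}
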
func (a *AssemblerImpl) encodeVectorRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	srcVectorRegBits, err := vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstBaseRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	switch n.instruction {
	case VMOV: // translated as STR(immediate,SIMD&FP)
		// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset--
		var size, opcode byte
		var dataSize, dataSizeLog2 int64
		switch n.vectorArrangement {
		case VectorArrangementB:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b00, 1, 0
		case VectorArrangementH:
			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b00, 2, 1
		case VectorArrangementS:
			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b00, 4, 2
		case VectorArrangementD:
			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b00, 8, 3
		case VectorArrangementQ:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b10, 16, 4
		}
		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos

		if n.dstReg2 != asm.NilRegister {
			offsetRegBits, err := intRegisterBits(n.dstReg2)
			if err != nil {
				return err
			}
			a.encodeLoadOrStoreWithRegisterOffset(buf, dstBaseRegBits, offsetRegBits, srcVectorRegBits, opcode, size, v)
		} else {
			err = a.encodeLoadOrStoreWithConstOffset(buf, dstBaseRegBits, srcVectorRegBits,
				n.dstConst, opcode, size, v, dataSize, dataSizeLog2)
		}
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func (a *AssemblerImpl) encodeStaticConstToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.instruction != VMOV {
		return errorEncodingUnsupported(n)
	}

	dstRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	// LDR (literal, SIMD&FP)
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--
	var opc byte
	var constLength int
	switch n.vectorArrangement {
	case VectorArrangementS:
		opc, constLength = 0b00, 4
	case VectorArrangementD:
		opc, constLength = 0b01, 8
	case VectorArrangementQ:
		opc, constLength = 0b10, 16
	}

	loadLiteralOffsetInBinary := uint64(buf.Len())
	a.pool.AddConst(n.staticConst, loadLiteralOffsetInBinary)

	if len(n.staticConst.Raw) != constLength {
		return fmt.Errorf("invalid const length for %s: want %d but was %d",
			n.vectorArrangement, constLength, len(n.staticConst.Raw))
	}

	buf.Append4Bytes(dstRegBits, 0x0, 0x0, opc<<6|0b11100)
	n.staticConst.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
		// LDR (literal, SIMD&FP) encodes the offset divided by 4.
		offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
		bin := buf.Bytes()
		bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
		bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
		bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
	})
	return
}

// advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct {
	qAndSize  map[VectorArrangement]qAndSize
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en
	NOT: {
		u: 0b1, opcode: 0b00101,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en
	VFNEG: {
		u: 0b1, opcode: 0b01111,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en
	VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en
	VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en
	VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b01, q: 0b1},
		VectorArrangement4S: {size: 0b00, q: 0b1},
		VectorArrangement2S: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en
	VFRINTN: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b01, q: 0b1},
		VectorArrangement4S: {size: 0b00, q: 0b1},
		VectorArrangement2S: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en
	VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en
	VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en
	VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B:  {size: 0b00, q: 0b0},
		VectorArrangement16B: {size: 0b00, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en
	VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en
	VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en
	REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en
	XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0, size: 0b10},
		VectorArrangement4S: {q: 0, size: 0b01},
		VectorArrangement8H: {q: 0, size: 0b00},
	}},
	SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b00, size: 0b00},
		VectorArrangement4H: {q: 0b00, size: 0b01},
		VectorArrangement2S: {q: 0b00, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en
	CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en
	SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en
	UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en
	VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
		VectorArrangement2D: {size: 0b11, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en
	VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
		VectorArrangement2D: {size: 0b11, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
	SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
	SQXTN2: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en
	UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
	SQXTUN: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
	SQXTUN2: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-Convert-to-Floating-point--vector--?lang=en
	VSCVTF: {u: 0b0, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0b1, size: 0b01},
		VectorArrangement4S: {q: 0b1, size: 0b00},
		VectorArrangement2S: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-Convert-to-Floating-point--vector--?lang=en
	VUCVTF: {u: 0b1, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0b1, size: 0b01},
		VectorArrangement4S: {q: 0b1, size: 0b00},
		VectorArrangement2S: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTL--FCVTL2--Floating-point-Convert-to-higher-precision-Long--vector--?lang=en
	FCVTL: {u: 0b0, opcode: 0b10111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {size: 0b01, q: 0b0},
		VectorArrangement4H: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTN--FCVTN2--Floating-point-Convert-to-lower-precision-Narrow--vector--?lang=en
	FCVTN: {u: 0b0, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {size: 0b01, q: 0b0},
		VectorArrangement4H: {size: 0b00, q: 0b0},
	}},
}

// advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDThreeDifferent = map[asm.Instruction]struct {
	qAndSize  map[VectorArrangement]qAndSize
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en
	VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {q: 0b0, size: 0b10},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement8B: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
	SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
	SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
	UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
	UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
}

// advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDThreeSame = map[asm.Instruction]struct {
	qAndSize  map[VectorArrangement]qAndSize
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en
	VAND: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en
	BSL: {
		u: 0b1, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b01, q: 0b1},
			VectorArrangement8B:  {size: 0b01, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en
	EOR: {
		u: 0b1, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en
	VORR: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b10, q: 0b1},
			VectorArrangement8B:  {size: 0b10, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en
	BIC: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b01, q: 0b1},
			VectorArrangement8B:  {size: 0b01, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
	VFADDS: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
	VFADDD: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
	VFSUBS: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
	VFSUBD: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en
	UMAXP: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en
	CMEQ: {u: 0b1, opcode: 0b10001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector-
	VADDP: {u: 0b0, opcode: 0b10111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en
	VADD: {u: 0, opcode: 0b10000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en
	VSUB: {u: 1, opcode: 0b10000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
	SSHL: {u: 0, opcode: 0b01000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/USHL--Unsigned-Shift-Left--register--?lang=en
	USHL: {u: 0b1, opcode: 0b01000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGT--register---Compare-signed-Greater-than--vector--?lang=en
	CMGT: {u: 0b0, opcode: 0b00110, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHI--register---Compare-unsigned-Higher--vector--?lang=en
	CMHI: {u: 0b1, opcode: 0b00110, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGE--register---Compare-signed-Greater-than-or-Equal--vector--?lang=en
	CMGE: {u: 0b0, opcode: 0b00111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHS--register---Compare-unsigned-Higher-or-Same--vector--?lang=en
	CMHS: {u: 0b1, opcode: 0b00111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMEQ--register---Floating-point-Compare-Equal--vector--?lang=en
	FCMEQ: {
		u: 0b0, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGT--register---Floating-point-Compare-Greater-than--vector--?lang=en
	FCMGT: {
		u: 0b1, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGE--register---Floating-point-Compare-Greater-than-or-Equal--vector--?lang=en
	FCMGE: {
		u: 0b1, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMIN--vector---Floating-point-minimum--vector--?lang=en
	VFMIN: {
		u: 0b0, opcode: 0b11110,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMAX--vector---Floating-point-Maximum--vector--?lang=en
	VFMAX: {
		u: 0b0, opcode: 0b11110,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
|
|
VectorArrangement2D: {size: 0b01, q: 0b1},
|
|
},
|
|
},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMUL--vector---Floating-point-Multiply--vector--?lang=en
|
|
VFMUL: {
|
|
u: 0b1, opcode: 0b11011,
|
|
qAndSize: map[VectorArrangement]qAndSize{
|
|
VectorArrangement4S: {size: 0b00, q: 0b1},
|
|
VectorArrangement2S: {size: 0b00, q: 0b0},
|
|
VectorArrangement2D: {size: 0b01, q: 0b1},
|
|
},
|
|
},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FDIV--vector---Floating-point-Divide--vector--?lang=en
|
|
VFDIV: {
|
|
u: 0b1, opcode: 0b11111,
|
|
qAndSize: map[VectorArrangement]qAndSize{
|
|
VectorArrangement4S: {size: 0b00, q: 0b1},
|
|
VectorArrangement2S: {size: 0b00, q: 0b0},
|
|
VectorArrangement2D: {size: 0b01, q: 0b1},
|
|
},
|
|
},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MUL--vector---Multiply--vector--?lang=en
|
|
VMUL: {u: 0b0, opcode: 0b10011, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQADD--Signed-saturating-Add-?lang=en
|
|
VSQADD: {u: 0b0, opcode: 0b00001, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQADD--Unsigned-saturating-Add-?lang=en
|
|
VUQADD: {u: 0b1, opcode: 0b00001, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMIN--Signed-Minimum--vector--?lang=en
|
|
SMIN: {u: 0b0, opcode: 0b01101, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMAX--Signed-Maximum--vector--?lang=en
|
|
SMAX: {u: 0b0, opcode: 0b01100, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMIN--Unsigned-Minimum--vector--?lang=en
|
|
UMIN: {u: 0b1, opcode: 0b01101, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAX--Unsigned-Maximum--vector--?lang=en
|
|
UMAX: {u: 0b1, opcode: 0b01100, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/URHADD--Unsigned-Rounding-Halving-Add-?lang=en
|
|
URHADD: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQSUB--Signed-saturating-Subtract-?lang=en
|
|
VSQSUB: {u: 0b0, opcode: 0b00101, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQSUB--Unsigned-saturating-Subtract-?lang=en
|
|
VUQSUB: {u: 0b1, opcode: 0b00101, qAndSize: defaultQAndSize},
|
|
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIT--Bitwise-Insert-if-True-?lang=en
|
|
VBIT: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
|
|
VectorArrangement8B: {q: 0b0, size: 0b10},
|
|
VectorArrangement16B: {q: 0b1, size: 0b10},
|
|
}},
|
|
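	// SQRDMULH is Signed saturating Rounding Doubling Multiply returning High half (vector);
	// see the "Advanced SIMD three same" encodings in the Arm ARM (DDI 0596).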
	SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement8H: {q: 0b1, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
		VectorArrangement4S: {q: 0b1, size: 0b10},
	}},
}

// qAndSize is a pair of "Q" and "size" that appear in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
type qAndSize struct{ q, size byte }

// defaultQAndSize maps a vector arrangement to the default Q and size values shared by many instructions.
var defaultQAndSize = map[VectorArrangement]qAndSize{
	VectorArrangement8B:  {size: 0b00, q: 0b0},
	VectorArrangement16B: {size: 0b00, q: 0b1},
	VectorArrangement4H:  {size: 0b01, q: 0b0},
	VectorArrangement8H:  {size: 0b01, q: 0b1},
	VectorArrangement2S:  {size: 0b10, q: 0b0},
	VectorArrangement4S:  {size: 0b10, q: 0b1},
	VectorArrangement1D:  {size: 0b11, q: 0b0},
	VectorArrangement2D:  {size: 0b11, q: 0b1},
}

// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDAcrossLanes = map[asm.Instruction]struct {
	qAndSize  map[VectorArrangement]qAndSize
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en
	ADDV: {
		u: 0b0, opcode: 0b11011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en
	UMINV: {
		u: 0b1, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
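	// UADDLV is Unsigned sum Long across Vector; see "Advanced SIMD across lanes" in the Arm ARM (DDI 0596).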
	UADDLV: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {size: 0b00, q: 0b1},
		VectorArrangement8B:  {size: 0b00, q: 0b0},
		VectorArrangement8H:  {size: 0b01, q: 0b1},
		VectorArrangement4H:  {size: 0b01, q: 0b0},
		VectorArrangement4S:  {size: 0b10, q: 0b1},
	}},
}

// advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDScalarPairwise = map[asm.Instruction]struct {
	size      map[VectorArrangement]byte
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en
	ADDP: {u: 0b0, opcode: 0b11011, size: map[VectorArrangement]byte{VectorArrangement2D: 0b11}},
}

// advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDCopy = map[asm.Instruction]struct {
	// TODO: extract common implementation of resolver.
	resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error)
	op       byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en
	DUPELEM: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0000
		q = 0b1

		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(srcIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(srcIndex) << 4
		default:
			err = fmt.Errorf("unsupported arrangement for DUPELEM: %s", arr)
		}

		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en
	DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0001
		switch arr {
		case VectorArrangement8B:
			imm5 = 0b1
		case VectorArrangement16B:
			imm5 = 0b1
			q = 0b1
		case VectorArrangement4H:
			imm5 = 0b10
		case VectorArrangement8H:
			imm5 = 0b10
			q = 0b1
		case VectorArrangement2S:
			imm5 = 0b100
		case VectorArrangement4S:
			imm5 = 0b100
			q = 0b1
		case VectorArrangement2D:
			imm5 = 0b1000
			q = 0b1
		default:
			err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en
	INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4, q = 0b0011, 0b1
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(dstIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(dstIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(dstIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(dstIndex) << 4
		default:
			err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
	UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0111
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(srcIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(srcIndex) << 4
			q = 0b1
		default:
			err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en
	SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0101
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		default:
			err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en
	INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		q = 0b1
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(dstIndex) << 1
			imm4 = byte(srcIndex)
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(dstIndex) << 2
			imm4 = byte(srcIndex) << 1
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(dstIndex) << 3
			imm4 = byte(srcIndex) << 2
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(dstIndex) << 4
			imm4 = byte(srcIndex) << 3
		default:
			err = fmt.Errorf("unsupported arrangement for INSELEM: %s", arr)
		}
		return
	}},
}

// advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDTableLookup = map[asm.Instruction]struct {
	q            map[VectorArrangement]byte
	op, op2, Len byte
}{
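	// TBL1 and TBL2 are TBL (Table vector Lookup) using one and two table registers respectively;
	// Len encodes the number of table registers minus one.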
	TBL1: {op: 0b0, op2: 0b0, Len: 0b00, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
	TBL2: {op: 0b0, op2: 0b0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
}

// advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDShiftByImmediate = map[asm.Instruction]struct {
	q           map[VectorArrangement]byte
	immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error)
	U, opcode   byte
}{
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDShiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL2: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDShiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDShiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL2: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDShiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en
	SSHR: {
		U: 0b0, opcode: 0b00000,
		q: map[VectorArrangement]byte{
			VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1,
			VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0,
		},
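		// For right shifts, the Arm ARM defines shift = (2 * esize) - UInt(immh:immb), so the
		// resolver below stores (2*esize - shiftAmount) into the immh:immb fields.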
		immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
			switch arr {
			case VectorArrangement16B, VectorArrangement8B:
				immh = 0b0001
				immb = 8 - byte(shiftAmount&0b111)
			case VectorArrangement8H, VectorArrangement4H:
				v := 16 - byte(shiftAmount&0b1111)
				immb = v & 0b111
				immh = 0b0010 | (v >> 3)
			case VectorArrangement4S, VectorArrangement2S:
				v := 32 - byte(shiftAmount&0b11111)
				immb = v & 0b111
				immh = 0b0100 | (v >> 3)
			case VectorArrangement2D:
				v := 64 - byte(shiftAmount&0b111111)
				immb = v & 0b111
				immh = 0b1000 | (v >> 3)
			default:
				err = fmt.Errorf("unsupported arrangement %s", arr)
			}
			return
		},
	},
}

// advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDPermute = map[asm.Instruction]struct {
	opcode byte
}{
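	// ZIP1 is Zip vectors (primary), interleaving the lower halves of the two source vectors.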
	ZIP1: {opcode: 0b011},
}

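// immResolverForSIMDShiftLeftByImmediate encodes a left-shift amount into the immh:immb fields.
// For left shifts, the Arm ARM defines shift = UInt(immh:immb) - esize, so immh both selects the
// element size and carries the high bits of the shift amount.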
func immResolverForSIMDShiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
	switch arr {
	case VectorArrangement16B, VectorArrangement8B:
		immb = byte(shiftAmount)
		immh = 0b0001
	case VectorArrangement8H, VectorArrangement4H:
		immb = byte(shiftAmount) & 0b111
		immh = 0b0010 | byte(shiftAmount>>3)
	case VectorArrangement4S, VectorArrangement2S:
		immb = byte(shiftAmount) & 0b111
		immh = 0b0100 | byte(shiftAmount>>3)
	default:
		err = fmt.Errorf("unsupported arrangement %s", arr)
	}
	return
}

// encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDCopy(buf asm.Buffer, srcRegBits, dstRegBits, op, imm5, imm4, q byte) {
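	// The 32-bit word is appended in little-endian byte order: the first byte holds bits 7:0
	// (Rd and the low bits of Rn), and the last byte holds bits 31:24 (0|Q|op|01110).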
	buf.Append4Bytes(
		(srcRegBits<<5)|dstRegBits,
		imm4<<3|0b1<<2|srcRegBits>>3,
		imm5,
		q<<6|op<<5|0b1110,
	)
}

// encodeAdvancedSIMDThreeSame encodes instruction as "Advanced SIMD three same" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
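	// src1 is encoded into the Rm field and src2 into Rn, so the emitted word computes dst = src2 <op> src1.
	// For reference, ADD V0.16B, V1.16B, V2.16B (q=1, u=0, size=0b00, opcode=0b10000) assembles to 0x4e228420.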
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<3|1<<2|src2>>3,
		size<<6|0b1<<5|src1,
		q<<6|u<<5|0b1110,
	)
}

// encodeAdvancedSIMDThreeDifferent encodes instruction as "Advanced SIMD three different" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDThreeDifferent(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
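	// Same register layout as "three same", except the 4-bit opcode occupies bits 15:12 with bits 11:10 fixed to zero.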
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<4|src2>>3,
		size<<6|0b1<<5|src1,
		q<<6|u<<5|0b1110,
	)
}

// encodeAdvancedSIMDPermute encodes instruction as "Advanced SIMD permute" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDPermute(buf asm.Buffer, src1, src2, dst, opcode, size, q byte) {
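	// The 3-bit opcode lands in bits 14:12 with bit 11 fixed to one; unlike the other groups there is no U bit.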
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<4|0b1<<3|src2>>3,
		size<<6|src1,
		q<<6|0b1110,
	)
}

func (a *AssemblerImpl) encodeVectorRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
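	// Dispatch by probing each Advanced SIMD encoding table in turn; the first table that
	// contains n.instruction determines the encoding group used below.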
	var srcVectorRegBits byte
	if n.srcReg != RegRZR {
		srcVectorRegBits, err = vectorRegisterBits(n.srcReg)
	} else if n.instruction == CMEQZERO {
		// CMEQZERO takes RegRZR as its source: the comparison is against zero, so encode the
		// destination register into the source field as well.
		srcVectorRegBits, err = vectorRegisterBits(n.dstReg)
	}

	if err != nil {
		return err
	}

	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}

	if scalarPairwise, ok := advancedSIMDScalarPairwise[n.instruction]; ok {
		// See "Advanced SIMD scalar pairwise" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		size, ok := scalarPairwise.size[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			scalarPairwise.opcode<<4|1<<3|srcVectorRegBits>>3,
			size<<6|0b11<<4|scalarPairwise.opcode>>4,
			0b1<<6|scalarPairwise.u<<5|0b11110,
		)
		return
	}

	if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.instruction]; ok {
		// See "Advanced SIMD two-register miscellaneous" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		qs, ok := twoRegMisc.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			twoRegMisc.opcode<<4|0b1<<3|srcVectorRegBits>>3,
			qs.size<<6|0b1<<5|twoRegMisc.opcode>>4,
			qs.q<<6|twoRegMisc.u<<5|0b01110,
		)
		return nil
	}

	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
		qs, ok := threeSame.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeSame(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
		return nil
	}

	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeDifferent(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
		return nil
	}

	if acrossLanes, ok := advancedSIMDAcrossLanes[n.instruction]; ok {
		// See "Advanced SIMD across lanes" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		qs, ok := acrossLanes.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			acrossLanes.opcode<<4|0b1<<3|srcVectorRegBits>>3,
			qs.size<<6|0b11000<<1|acrossLanes.opcode>>4,
			qs.q<<6|acrossLanes.u<<5|0b01110,
		)
		return nil
	}

	if lookup, ok := advancedSIMDTableLookup[n.instruction]; ok {
		q, ok := lookup.q[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			lookup.Len<<5|lookup.op<<4|srcVectorRegBits>>3,
			lookup.op2<<6|dstVectorRegBits,
			q<<6|0b1110,
		)
		return
	}

	if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.instruction]; ok {
		immh, immb, err := shiftByImmediate.immResolver(n.srcConst, n.vectorArrangement)
		if err != nil {
			return err
		}

		q, ok := shiftByImmediate.q[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}

		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			shiftByImmediate.opcode<<3|0b1<<2|srcVectorRegBits>>3,
			immh<<3|immb,
			q<<6|shiftByImmediate.U<<5|0b1111,
		)
		return nil
	}

	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
		size, q := arrangementSizeQ(n.vectorArrangement)
		a.encodeAdvancedSIMDPermute(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q)
		return
	}
	return errorEncodingUnsupported(n)
}

func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	var srcRegBits, srcRegBits2, dstRegBits byte
	srcRegBits, err = vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	srcRegBits2, err = vectorRegisterBits(n.srcReg2)
	if err != nil {
		return err
	}

	dstRegBits, err = vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
		qs, ok := threeSame.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeSame(buf, srcRegBits, srcRegBits2, dstRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
		return nil
	}

	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeDifferent(buf, srcRegBits, srcRegBits2, dstRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
		return nil
	}

	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
		size, q := arrangementSizeQ(n.vectorArrangement)
		a.encodeAdvancedSIMDPermute(buf, srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q)
		return
	}

	if n.instruction == EXT {
		// EXT is the only instruction in "Advanced SIMD extract", so inline the encoding here.
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en
		var q, imm4 byte
		switch n.vectorArrangement {
		case VectorArrangement16B:
			imm4 = 0b1111 & byte(n.srcConst)
			q = 0b1
		case VectorArrangement8B:
			imm4 = 0b111 & byte(n.srcConst)
		default:
			return fmt.Errorf("invalid arrangement %s for EXT", n.vectorArrangement)
		}
		buf.Append4Bytes(
			(srcRegBits2<<5)|dstRegBits,
			imm4<<3|srcRegBits2>>3,
			srcRegBits,
			q<<6|0b101110,
		)
		return
	}
	return errorEncodingUnsupported(n)
}

func (a *AssemblerImpl) encodeVectorRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if err = checkArrangementIndexPair(n.vectorArrangement, n.srcVectorIndex); err != nil {
		return
	}

	srcVecRegBits, err := vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}
	return errorEncodingUnsupported(n)
}

func (a *AssemblerImpl) encodeRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	srcRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}
	return errorEncodingUnsupported(n)
}

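// zeroRegisterBits is the 5-bit register field value (0b11111) shared by the zero register RZR and SP.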
var zeroRegisterBits byte = 0b11111

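// isIntRegister reports whether r is a general-purpose register (RegR0 through RegSP, which also covers RegRZR).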
func isIntRegister(r asm.Register) bool {
	return RegR0 <= r && r <= RegSP
}

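// isVectorRegister reports whether r is one of the SIMD&FP vector registers V0 through V31.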
func isVectorRegister(r asm.Register) bool {
	return RegV0 <= r && r <= RegV31
}

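// isConditionalRegister reports whether r is one of the pseudo registers representing condition codes (EQ through NV).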
func isConditionalRegister(r asm.Register) bool {
	return RegCondEQ <= r && r <= RegCondNV
}

func intRegisterBits(r asm.Register) (ret byte, err error) {
	if !isIntRegister(r) {
		err = fmt.Errorf("%s is not integer", RegisterName(r))
	} else if r == RegSP {
		// SP shares the same bit representation as RegRZR.
		r = RegRZR
	}
	ret = byte(r - RegR0)
	return
}

func vectorRegisterBits(r asm.Register) (ret byte, err error) {
	if !isVectorRegister(r) {
		err = fmt.Errorf("%s is not vector", RegisterName(r))
	} else {
		ret = byte(r - RegV0)
	}
	return
}

func registerBits(r asm.Register) (ret byte) {
	if isIntRegister(r) {
		if r == RegSP {
			// SP shares the same bit representation as RegRZR.
			r = RegRZR
		}
		ret = byte(r - RegR0)
	} else {
		ret = byte(r - RegV0)
	}
	return
}