wazevo(arm64): reuses queue in addr lowering (#1835)
Signed-off-by: Takeshi Yoneda <t.y.mathetake@gmail.com>
@@ -307,37 +307,35 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte
 // During the construction, this might emit additional instructions.
 //
 // Extracted as a separate function for easy testing.
-func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc.VReg, size byte, offset int64) (amode addressMode) {
-    switch a64sExist, a32sExist := len(a64s) > 0, len(a32s) > 0; {
+func (m *machine) lowerToAddressModeFromAddends(a32s *queue[addend32], a64s *queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
+    switch a64sExist, a32sExist := !a64s.empty(), !a32s.empty(); {
     case a64sExist && a32sExist:
         var base regalloc.VReg
-        base, a64s = dequeue(a64s)
+        base = a64s.dequeue()
         var a32 addend32
-        a32, a32s = dequeue(a32s)
+        a32 = a32s.dequeue()
         amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
     case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
         var base regalloc.VReg
-        base, a64s = dequeue(a64s)
+        base = a64s.dequeue()
         amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
         offset = 0
     case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
         var base regalloc.VReg
-        base, a64s = dequeue(a64s)
+        base = a64s.dequeue()
         amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
         offset = 0
     case a64sExist:
         var base regalloc.VReg
-        base, a64s = dequeue(a64s)
-        if len(a64s) > 0 {
-            var index regalloc.VReg
-            index, a64s = dequeue(a64s)
+        base = a64s.dequeue()
+        if !a64s.empty() {
+            index := a64s.dequeue()
             amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
         } else {
             amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
         }
     case a32sExist:
-        var base32 addend32
-        base32, a32s = dequeue(a32s)
+        base32 := a32s.dequeue()

         // First we need 64-bit base.
         base := m.compiler.AllocateVReg(ssa.TypeI64)
@@ -349,9 +347,8 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc
         baseExt.asExtend(base, base32.r, 32, 64, signed)
         m.insert(baseExt)

-        if len(a32s) > 0 {
-            var index addend32
-            index, a32s = dequeue(a32s)
+        if !a32s.empty() {
+            index := a32s.dequeue()
             amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
         } else {
             amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
@@ -368,11 +365,13 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc
         baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
     }

-    for _, a64 := range a64s {
+    for !a64s.empty() {
+        a64 := a64s.dequeue()
         baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
     }

-    for _, a32 := range a32s {
+    for !a32s.empty() {
+        a32 := a32s.dequeue()
         baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
     }
     amode.rn = baseReg
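
For orientation, the case order above can be restated as a small standalone sketch. The predicate names below are simplified stand-ins for the helpers in this diff, and the only-static-offset case, which this diff does not show, is reduced to a hedged string; whatever the chosen form does not consume is summed into a base register afterwards, as the loops above do.

package main

import "fmt"

// pickMode mirrors the case order of lowerToAddressModeFromAddends shown above:
// the first matching addressing form wins.
func pickMode(has64, has32, offsetFitsUImm12, offsetFitsSImm9 bool) string {
	switch {
	case has64 && has32:
		return "64-bit base reg + extended 32-bit index reg"
	case has64 && offsetFitsUImm12:
		return "64-bit base reg + unsigned imm12 (offset consumed)"
	case has64 && offsetFitsSImm9:
		return "64-bit base reg + signed imm9 (offset consumed)"
	case has64:
		return "64-bit base reg + 64-bit index reg, or + #0"
	case has32:
		return "widened 32-bit base + extended 32-bit index, or + #0"
	}
	return "only a static offset (case elided in this diff)"
}

func main() {
	fmt.Println(pickMode(true, false, true, false))
	// 64-bit base reg + unsigned imm12 (offset consumed)
}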
@@ -381,21 +380,22 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc

 var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}

-func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64 []regalloc.VReg, offset int64) {
-    m.addends64 = m.addends64[:0]
-    m.addends32 = m.addends32[:0]
-    m.addendsWorkQueue = append(m.addendsWorkQueue[:0], ptr)
+func (m *machine) collectAddends(ptr ssa.Value) (addends32 *queue[addend32], addends64 *queue[regalloc.VReg], offset int64) {
+    m.addendsWorkQueue.reset()
+    m.addends32.reset()
+    m.addends64.reset()
+    m.addendsWorkQueue.enqueue(ptr)

-    for len(m.addendsWorkQueue) > 0 {
-        var v ssa.Value
-        v, m.addendsWorkQueue = dequeue(m.addendsWorkQueue)
+    for !m.addendsWorkQueue.empty() {
+        v := m.addendsWorkQueue.dequeue()

         def := m.compiler.ValueDefinition(v)
         switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
         case ssa.OpcodeIadd:
             // If the addend is an add, we recursively collect its operands.
             x, y := def.Instr.Arg2()
-            m.addendsWorkQueue = append(m.addendsWorkQueue, x, y)
+            m.addendsWorkQueue.enqueue(x)
+            m.addendsWorkQueue.enqueue(y)
             def.Instr.MarkLowered()
         case ssa.OpcodeIconst:
             // If the addend is constant, we just statically merge it into the offset.
@@ -411,7 +411,7 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
             switch input := def.Instr.Arg(); input.Type().Bits() {
             case 64:
                 // If the input is already 64-bit, this extend is a no-op. TODO: shouldn't this be optimized out at much earlier stage? no?
-                m.addends64 = append(m.addends64, m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
+                m.addends64.enqueue(m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
                 def.Instr.MarkLowered()
                 continue
             case 32:
@@ -432,7 +432,7 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
                     // Sign-extension of a 32-bit constant can be merged into the offset.
                     offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
                 default:
-                    m.addends32 = append(m.addends32, addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
+                    m.addends32.enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
                 }
                 def.Instr.MarkLowered()
                 continue
@@ -443,10 +443,10 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
             panic("TODO: add tests")
         default:
             // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
-            m.addends64 = append(m.addends64, m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
+            m.addends64.enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
         }
     }
-    return m.addends32, m.addends64, offset
+    return &m.addends32, &m.addends64, offset
 }

 func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
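
As a rough, self-contained illustration of what collectAddends computes — flattening a tree of adds into 64-bit addends, 32-bit extended addends, and a static offset — here is a toy model with hypothetical stand-in types; the real code walks wazero's SSA value definitions instead.

package main

import "fmt"

// expr is a toy stand-in for an SSA value that feeds an address computation.
type expr interface{}

type (
	iadd   struct{ x, y expr }    // an add: collect both operands
	iconst struct{ v int64 }      // a constant: fold into the offset
	uext32 struct{ reg string }   // a 32-bit value used with extension
	reg64  struct{ name string }  // an ordinary 64-bit value
)

func collect(e expr, a32s, a64s *[]string, offset *int64) {
	switch t := e.(type) {
	case iadd:
		collect(t.x, a32s, a64s, offset)
		collect(t.y, a32s, a64s, offset)
	case iconst:
		*offset += t.v
	case uext32:
		*a32s = append(*a32s, t.reg+" (UXTW)")
	case reg64:
		*a64s = append(*a64s, t.name)
	}
}

func main() {
	// base + zero_extend(idx32) + 16
	e := iadd{iadd{reg64{"base"}, uext32{"idx32"}}, iconst{16}}
	var a32s, a64s []string
	var offset int64
	collect(e, &a32s, &a64s, &offset)
	fmt.Println(a64s, a32s, offset) // [base] [idx32 (UXTW)] 16
}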
@@ -481,7 +481,27 @@ func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regal
     return
 }

-func dequeue[T any](q []T) (ret T, popped []T) {
-    ret, popped = (q)[0], (q)[1:]
+// queue is the resettable queue where the underlying slice is reused.
+type queue[T any] struct {
+    index int
+    data  []T
+}
+
+func (q *queue[T]) enqueue(v T) {
+    q.data = append(q.data, v)
+}
+
+func (q *queue[T]) dequeue() (ret T) {
+    ret = q.data[q.index]
+    q.index++
     return
 }
+
+func (q *queue[T]) empty() bool {
+    return q.index >= len(q.data)
+}
+
+func (q *queue[T]) reset() {
+    q.index = 0
+    q.data = q.data[:0]
+}
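
A self-contained usage sketch of the resettable queue introduced above (the type is reproduced as-is; main is illustrative only): dequeue advances a read index instead of re-slicing, and reset rewinds the index and truncates the length so later rounds reuse the same backing array.

package main

import "fmt"

// queue is the resettable queue where the underlying slice is reused.
type queue[T any] struct {
	index int
	data  []T
}

func (q *queue[T]) enqueue(v T) { q.data = append(q.data, v) }

func (q *queue[T]) dequeue() (ret T) {
	ret = q.data[q.index]
	q.index++
	return
}

func (q *queue[T]) empty() bool { return q.index >= len(q.data) }

func (q *queue[T]) reset() {
	q.index = 0
	q.data = q.data[:0]
}

func main() {
	var q queue[int]
	for _, v := range []int{1, 2, 3} {
		q.enqueue(v)
	}
	for !q.empty() {
		fmt.Print(q.dequeue(), " ") // 1 2 3
	}
	fmt.Println()

	// After reset, enqueue writes into the same backing array from the start,
	// so repeated address-lowering passes avoid reallocating.
	q.reset()
	q.enqueue(10)
	fmt.Println(q.dequeue(), cap(q.data) >= 3) // 10 true
}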
@@ -472,9 +472,9 @@ func TestMachine_collectAddends(t *testing.T) {
         t.Run(tc.name, func(t *testing.T) {
             ctx, b, m := newSetupWithMockContext()
             ptr, verify := tc.setup(ctx, b, m)
-            actual32s, actual64s, actualOffset := m.collectAddends(ptr)
-            require.Equal(t, tc.exp32s, actual32s)
-            require.Equal(t, tc.exp64s, actual64s)
+            actual32sQ, actual64sQ, actualOffset := m.collectAddends(ptr)
+            require.Equal(t, tc.exp32s, actual32sQ.data)
+            require.Equal(t, tc.exp64s, actual64sQ.data)
             require.Equal(t, tc.offset, actualOffset)
             verify(t)
         })
@@ -567,18 +567,6 @@ func TestMachine_addRegToReg64Ext(t *testing.T) {
     }
 }

-func Test_dequeue(t *testing.T) {
-    ints := []int{1, 2, 3}
-    one, intPopped := dequeue(ints)
-    require.Equal(t, 1, one)
-    require.Equal(t, []int{2, 3}, intPopped)
-
-    strs := []string{"a", "b", "c"}
-    a, strPopped := dequeue(strs)
-    require.Equal(t, "a", a)
-    require.Equal(t, []string{"b", "c"}, strPopped)
-}
-
 func TestMachine_lowerToAddressModeFromAddends(t *testing.T) {
     x1, x2, x3 := regalloc.FromRealReg(x1, regalloc.RegTypeInt), regalloc.FromRealReg(x2, regalloc.RegTypeInt), regalloc.FromRealReg(x3, regalloc.RegTypeInt)
     x4, x5, x6 := regalloc.FromRealReg(x4, regalloc.RegTypeInt), regalloc.FromRealReg(x5, regalloc.RegTypeInt), regalloc.FromRealReg(x6, regalloc.RegTypeInt)
@@ -823,7 +811,16 @@ func TestMachine_lowerToAddressModeFromAddends(t *testing.T) {
         t.Run(tc.name, func(t *testing.T) {
             ctx, _, m := newSetupWithMockContext()
             ctx.vRegCounter = int(nextVReg.ID()) - 1
-            actual := m.lowerToAddressModeFromAddends(tc.a32s, tc.a64s, tc.dstSizeInBits, tc.offset)
+
+            var a32s queue[addend32]
+            var a64s queue[regalloc.VReg]
+            for _, a32 := range tc.a32s {
+                a32s.enqueue(a32)
+            }
+            for _, a64 := range tc.a64s {
+                a64s.enqueue(a64)
+            }
+            actual := m.lowerToAddressModeFromAddends(&a32s, &a64s, tc.dstSizeInBits, tc.offset)
             require.Equal(t, strings.Join(tc.insts, "\n"), formatEmittedInstructionsInCurrentBlock(m))
             require.Equal(t, tc.exp, actual, actual.format(tc.dstSizeInBits))
         })
@@ -37,10 +37,10 @@ type (
         labelPositionPool wazevoapi.Pool[labelPosition]

         // addendsWorkQueue is used during address lowering, defined here for reuse.
-        addendsWorkQueue []ssa.Value
-        addends32        []addend32
+        addendsWorkQueue queue[ssa.Value]
+        addends32        queue[addend32]
         // addends64 is used during address lowering, defined here for reuse.
-        addends64 []regalloc.VReg
+        addends64 queue[regalloc.VReg]
         unresolvedAddressModes []*instruction

         // spillSlotSize is the size of the stack slot in bytes used for spilling registers.