diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
index 07280227..a2484929 100644
--- a/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
+++ b/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
@@ -307,37 +307,35 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte
 // During the construction, this might emit additional instructions.
 //
 // Extracted as a separate function for easy testing.
-func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc.VReg, size byte, offset int64) (amode addressMode) {
-	switch a64sExist, a32sExist := len(a64s) > 0, len(a32s) > 0; {
+func (m *machine) lowerToAddressModeFromAddends(a32s *queue[addend32], a64s *queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
+	switch a64sExist, a32sExist := !a64s.empty(), !a32s.empty(); {
 	case a64sExist && a32sExist:
 		var base regalloc.VReg
-		base, a64s = dequeue(a64s)
+		base = a64s.dequeue()
 		var a32 addend32
-		a32, a32s = dequeue(a32s)
+		a32 = a32s.dequeue()
 		amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
 	case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
 		var base regalloc.VReg
-		base, a64s = dequeue(a64s)
+		base = a64s.dequeue()
 		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
 		offset = 0
 	case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
 		var base regalloc.VReg
-		base, a64s = dequeue(a64s)
+		base = a64s.dequeue()
 		amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
 		offset = 0
 	case a64sExist:
 		var base regalloc.VReg
-		base, a64s = dequeue(a64s)
-		if len(a64s) > 0 {
-			var index regalloc.VReg
-			index, a64s = dequeue(a64s)
+		base = a64s.dequeue()
+		if !a64s.empty() {
+			index := a64s.dequeue()
 			amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
 		} else {
 			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
 		}
 	case a32sExist:
-		var base32 addend32
-		base32, a32s = dequeue(a32s)
+		base32 := a32s.dequeue()
 
 		// First we need 64-bit base.
 		base := m.compiler.AllocateVReg(ssa.TypeI64)
@@ -349,9 +347,8 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc
 		baseExt.asExtend(base, base32.r, 32, 64, signed)
 		m.insert(baseExt)
 
-		if len(a32s) > 0 {
-			var index addend32
-			index, a32s = dequeue(a32s)
+		if !a32s.empty() {
+			index := a32s.dequeue()
 			amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
 		} else {
 			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
@@ -368,11 +365,13 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc
 		baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
 	}
 
-	for _, a64 := range a64s {
+	for !a64s.empty() {
+		a64 := a64s.dequeue()
 		baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
 	}
 
-	for _, a32 := range a32s {
+	for !a32s.empty() {
+		a32 := a32s.dequeue()
 		baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
 	}
 	amode.rn = baseReg
@@ -381,21 +380,22 @@ func (m *machine) lowerToAddressModeFromAddends(a32s []addend32, a64s []regalloc
 
 var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
 
-func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64 []regalloc.VReg, offset int64) {
-	m.addends64 = m.addends64[:0]
-	m.addends32 = m.addends32[:0]
-	m.addendsWorkQueue = append(m.addendsWorkQueue[:0], ptr)
+func (m *machine) collectAddends(ptr ssa.Value) (addends32 *queue[addend32], addends64 *queue[regalloc.VReg], offset int64) {
+	m.addendsWorkQueue.reset()
+	m.addends32.reset()
+	m.addends64.reset()
+	m.addendsWorkQueue.enqueue(ptr)
 
-	for len(m.addendsWorkQueue) > 0 {
-		var v ssa.Value
-		v, m.addendsWorkQueue = dequeue(m.addendsWorkQueue)
+	for !m.addendsWorkQueue.empty() {
+		v := m.addendsWorkQueue.dequeue()
 		def := m.compiler.ValueDefinition(v)
 		switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
 		case ssa.OpcodeIadd:
 			// If the addend is an add, we recursively collect its operands.
 			x, y := def.Instr.Arg2()
-			m.addendsWorkQueue = append(m.addendsWorkQueue, x, y)
+			m.addendsWorkQueue.enqueue(x)
+			m.addendsWorkQueue.enqueue(y)
 			def.Instr.MarkLowered()
 		case ssa.OpcodeIconst:
 			// If the addend is constant, we just statically merge it into the offset.
@@ -411,7 +411,7 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
 			switch input := def.Instr.Arg(); input.Type().Bits() {
 			case 64:
 				// If the input is already 64-bit, this extend is a no-op. TODO: shouldn't this be optimized out at much earlier stage? no?
-				m.addends64 = append(m.addends64, m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
+				m.addends64.enqueue(m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
 				def.Instr.MarkLowered()
 				continue
 			case 32:
@@ -432,7 +432,7 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
 					// Sign-extension of a 32-bit constant can be merged into the offset.
 					offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
 				default:
-					m.addends32 = append(m.addends32, addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
+					m.addends32.enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
 				}
 				def.Instr.MarkLowered()
 				continue
@@ -443,10 +443,10 @@ func (m *machine) collectAddends(ptr ssa.Value) (addends32 []addend32, addends64
 			panic("TODO: add tests")
 		default:
 			// If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
-			m.addends64 = append(m.addends64, m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
+			m.addends64.enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
 		}
 	}
-	return m.addends32, m.addends64, offset
+	return &m.addends32, &m.addends64, offset
 }
 
 func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
@@ -481,7 +481,27 @@ func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regal
 	return
 }
 
-func dequeue[T any](q []T) (ret T, popped []T) {
-	ret, popped = (q)[0], (q)[1:]
+// queue is a resettable queue whose underlying slice is reused.
+type queue[T any] struct {
+	index int
+	data  []T
+}
+
+func (q *queue[T]) enqueue(v T) {
+	q.data = append(q.data, v)
+}
+
+func (q *queue[T]) dequeue() (ret T) {
+	ret = q.data[q.index]
+	q.index++
 	return
 }
+
+func (q *queue[T]) empty() bool {
+	return q.index >= len(q.data)
+}
+
+func (q *queue[T]) reset() {
+	q.index = 0
+	q.data = q.data[:0]
+}
diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go b/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go
index e3c5a192..14a2b55f 100644
--- a/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go
+++ b/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go
@@ -472,9 +472,9 @@ func TestMachine_collectAddends(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			ctx, b, m := newSetupWithMockContext()
 			ptr, verify := tc.setup(ctx, b, m)
-			actual32s, actual64s, actualOffset := m.collectAddends(ptr)
-			require.Equal(t, tc.exp32s, actual32s)
-			require.Equal(t, tc.exp64s, actual64s)
+			actual32sQ, actual64sQ, actualOffset := m.collectAddends(ptr)
+			require.Equal(t, tc.exp32s, actual32sQ.data)
+			require.Equal(t, tc.exp64s, actual64sQ.data)
 			require.Equal(t, tc.offset, actualOffset)
 			verify(t)
 		})
@@ -567,18 +567,6 @@ func TestMachine_addRegToReg64Ext(t *testing.T) {
 	}
 }
 
-func Test_dequeue(t *testing.T) {
-	ints := []int{1, 2, 3}
-	one, intPopped := dequeue(ints)
-	require.Equal(t, 1, one)
-	require.Equal(t, []int{2, 3}, intPopped)
-
-	strs := []string{"a", "b", "c"}
-	a, strPopped := dequeue(strs)
-	require.Equal(t, "a", a)
-	require.Equal(t, []string{"b", "c"}, strPopped)
-}
-
 func TestMachine_lowerToAddressModeFromAddends(t *testing.T) {
 	x1, x2, x3 := regalloc.FromRealReg(x1, regalloc.RegTypeInt), regalloc.FromRealReg(x2, regalloc.RegTypeInt), regalloc.FromRealReg(x3, regalloc.RegTypeInt)
 	x4, x5, x6 := regalloc.FromRealReg(x4, regalloc.RegTypeInt), regalloc.FromRealReg(x5, regalloc.RegTypeInt), regalloc.FromRealReg(x6, regalloc.RegTypeInt)
@@ -823,7 +811,16 @@ func TestMachine_lowerToAddressModeFromAddends(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			ctx, _, m := newSetupWithMockContext()
 			ctx.vRegCounter = int(nextVReg.ID()) - 1
-			actual := m.lowerToAddressModeFromAddends(tc.a32s, tc.a64s, tc.dstSizeInBits, tc.offset)
+
+			var a32s queue[addend32]
+			var a64s queue[regalloc.VReg]
+			for _, a32 := range tc.a32s {
+				a32s.enqueue(a32)
+			}
+			for _, a64 := range tc.a64s {
+				a64s.enqueue(a64)
+			}
+			actual := m.lowerToAddressModeFromAddends(&a32s, &a64s, tc.dstSizeInBits, tc.offset)
 			require.Equal(t, strings.Join(tc.insts, "\n"), formatEmittedInstructionsInCurrentBlock(m))
 			require.Equal(t, tc.exp, actual, actual.format(tc.dstSizeInBits))
 		})
diff --git a/internal/engine/wazevo/backend/isa/arm64/machine.go b/internal/engine/wazevo/backend/isa/arm64/machine.go
index abf3b8f0..3663feb3 100644
--- a/internal/engine/wazevo/backend/isa/arm64/machine.go
+++ b/internal/engine/wazevo/backend/isa/arm64/machine.go
@@ -37,10 +37,10 @@ type (
 		labelPositionPool wazevoapi.Pool[labelPosition]
 
 		// addendsWorkQueue is used during address lowering, defined here for reuse.
-		addendsWorkQueue []ssa.Value
-		addends32        []addend32
+		addendsWorkQueue queue[ssa.Value]
+		addends32        queue[addend32]
 		// addends64 is used during address lowering, defined here for reuse.
-		addends64 []regalloc.VReg
+		addends64 queue[regalloc.VReg]
 		unresolvedAddressModes []*instruction
 
 		// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
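The patch above replaces the slice-re-slicing dequeue helper with an index-based, resettable queue[T]. The following standalone sketch is not part of the patch; it mirrors the queue type introduced in lower_mem.go (the package main wrapper and fmt output are illustrative only, and it assumes a Go 1.18+ toolchain for generics) to show the FIFO semantics and how reset lets one backing slice be reused across lowering passes:

package main

import "fmt"

// queue mirrors the resettable queue added in the patch: dequeue advances an
// index instead of re-slicing, and reset rewinds so the slice can be reused.
type queue[T any] struct {
	index int
	data  []T
}

func (q *queue[T]) enqueue(v T) { q.data = append(q.data, v) }

func (q *queue[T]) dequeue() (ret T) {
	ret = q.data[q.index]
	q.index++
	return
}

func (q *queue[T]) empty() bool { return q.index >= len(q.data) }

func (q *queue[T]) reset() {
	q.index = 0
	q.data = q.data[:0]
}

func main() {
	var q queue[int]
	for _, v := range []int{1, 2, 3} {
		q.enqueue(v)
	}
	for !q.empty() {
		fmt.Println(q.dequeue()) // prints 1, 2, 3 in FIFO order
	}
	q.reset() // truncates to data[:0], keeping the allocated capacity
	q.enqueue(4)
	fmt.Println(q.dequeue()) // prints 4
}

Tracking a read index instead of re-slicing on every dequeue is what makes reset cheap: the machine-level queues (addendsWorkQueue, addends32, addends64) keep their backing arrays between collectAddends calls rather than reallocating each time.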