diff --git a/internal/engine/wazevo/backend/isa/arm64/instr.go b/internal/engine/wazevo/backend/isa/arm64/instr.go index ebd158c9..e945870f 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -88,6 +88,7 @@ var defKinds = [numInstructionKinds]defKind{ fpuLoad32: defKindRD, fpuLoad64: defKindRD, fpuLoad128: defKindRD, + vecLoad1R: defKindRD, loadFpuConst32: defKindRD, loadFpuConst64: defKindRD, loadFpuConst128: defKindRD, @@ -212,6 +213,7 @@ var useKinds = [numInstructionKinds]useKind{ loadFpuConst32: useKindNone, loadFpuConst64: useKindNone, loadFpuConst128: useKindNone, + vecLoad1R: useKindRN, cSel: useKindRNRM, fpuCSel: useKindRNRM, movToVec: useKindRN, @@ -543,6 +545,13 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) i.amode = amode } +func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { + i.kind = vecLoad1R + i.rd = rd + i.rn = rn + i.u1 = uint64(arr) +} + func (i *instruction) asCSet(rd regalloc.VReg, c condFlag) { i.kind = cSet i.rd = operandNR(rd) @@ -1474,6 +1483,8 @@ const ( loadFpuConst64 // loadFpuConst128 represents a load of a 128-bit floating-point constant. loadFpuConst128 + // vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector. + vecLoad1R // fpuToInt represents a conversion from FP to integer. fpuToInt // intToFpu represents a conversion from integer to FP. diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go index 4d2daa0e..76f0d3c6 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -46,6 +46,11 @@ func (i *instruction) encode(c backend.Compiler) { c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + case vecLoad1R: + c.Emit4Bytes(encodeVecLoad1Rrt( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1))) case condBr: imm19 := i.condBrOffset() if imm19%4 != 0 { @@ -1293,6 +1298,13 @@ func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint3 } } +// encodeVecLoad1Rrt encodes as Load one single-element structure and Replicate to all lanes (of one register) in +// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm +func encodeVecLoad1Rrt(rt, rn uint32, arr vecArrangement) uint32 { + size, q := arrToSizeQEncoded(arr) + return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt +} + // encodeAluBitmaskImmediate encodes as Logical (immediate) in // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 { diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go index 1c7ee1c4..f13c9cb9 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go @@ -1589,6 +1589,7 @@ func TestInstruction_encode(t *testing.T) { 
i.asFpuRR(fpuUniOpRoundNearest, operandNR(v1VReg), operandNR(v2VReg), true) }}, {want: "4140611e", setup: func(i *instruction) { i.asFpuRR(fpuUniOpNeg, operandNR(v1VReg), operandNR(v2VReg), true) }}, + {want: "41c4404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) }}, {want: "4201231e4201631e4201239e4201639e4201221e4201621e4201229e4201629e", setup: func(i *instruction) { i.asNop0() cur := i diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go index ad26b82e..0bf45322 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -674,6 +674,24 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } m.insert(dup) + case ssa.OpcodeLoadSplat: + x, offset, lane := instr.LoadSplatData() + rd := operandNR(m.compiler.VRegOf(instr.Return())) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + tmpReg := m.compiler.AllocateVReg(ssa.TypeI32) + + // Our encoding for vecLoad1R does not support all the addressing modes yet, + // we use the no-offset addressing mode and add the offset to a temp register. + add := m.allocateInstr() + add.asALU(aluOpAdd, operandNR(tmpReg), rn, operandImm12(uint16(offset), 0), true) + m.insert(add) + + ld1r := m.allocateInstr() + ld1r.asVecLoad1R(rd, operandNR(tmpReg), arr) + m.insert(ld1r) + default: panic("TODO: lowering " + op.String()) } diff --git a/internal/engine/wazevo/e2e_test.go b/internal/engine/wazevo/e2e_test.go index 738154cf..6189233b 100644 --- a/internal/engine/wazevo/e2e_test.go +++ b/internal/engine/wazevo/e2e_test.go @@ -199,9 +199,22 @@ func TestSpectestV2(t *testing.T) { {"simd_i16x8_extadd_pairwise_i8x16"}, {"simd_i32x4_extadd_pairwise_i16x8"}, {"simd_int_to_int_extend"}, + {"simd_load"}, + {"simd_load_extend"}, + {"simd_load_splat"}, + {"simd_load_zero"}, + {"simd_load8_lane"}, + {"simd_load16_lane"}, + {"simd_load32_lane"}, + {"simd_load64_lane"}, {"simd_lane"}, {"simd_linking"}, {"simd_splat"}, + {"simd_store"}, + {"simd_store8_lane"}, + {"simd_store16_lane"}, + {"simd_store32_lane"}, + {"simd_store64_lane"}, } { t.Run(tc.name, func(t *testing.T) { t.Run("normal", func(t *testing.T) { diff --git a/internal/engine/wazevo/frontend/lower.go b/internal/engine/wazevo/frontend/lower.go index 3de795bf..595ca466 100644 --- a/internal/engine/wazevo/frontend/lower.go +++ b/internal/engine/wazevo/frontend/lower.go @@ -1705,6 +1705,135 @@ func (c *Compiler) lowerCurrentOpcode() { load.AsLoad(addr, offset, ssa.TypeV128) builder.InsertInstruction(load) state.push(load.Return()) + case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, wasm.OpcodeVecV128Load32Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + var loadOp ssa.Opcode + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Lane: + loadOp, lane, opSize = ssa.OpcodeUload8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Lane: + loadOp, lane, opSize = ssa.OpcodeUload16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Lane: + loadOp, lane, opSize = ssa.OpcodeUload32, ssa.VecLaneI32x4, 4 + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + load := builder.AllocateInstruction(). + AsExtLoad(loadOp, addr, offset, false). 
+ Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, lane). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeI64). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, ssa.VecLaneI64x2). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load32zero: + _, offset := c.readMemArg() + if state.unreachable { + break + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 4) + ret := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeF32). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load64zero: + _, offset := c.readMemArg() + if state.unreachable { + break + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + ret := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeF64). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load8x8u, wasm.OpcodeVecV128Load8x8s, + wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load16x4s, + wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load32x2s: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var signed bool + switch vecOp { + case wasm.OpcodeVecV128Load8x8s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load8x8u: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecV128Load16x4s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load16x4u: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecV128Load32x2s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load32x2u: + lane = ssa.VecLaneI32x4 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeV128). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsWiden(load, lane, signed, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat, + wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Splat: + lane, opSize = ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Splat: + lane, opSize = ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Splat: + lane, opSize = ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Load64Splat: + lane, opSize = ssa.VecLaneI64x2, 8 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + ret := builder.AllocateInstruction(). + AsLoadSplat(addr, offset, lane). + Insert(builder).Return() + state.push(ret) case wasm.OpcodeVecV128Store: _, offset := c.readMemArg() if state.unreachable { @@ -1716,7 +1845,36 @@ func (c *Compiler) lowerCurrentOpcode() { builder.AllocateInstruction(). AsStore(ssa.OpcodeStore, value, addr, offset). 
Insert(builder) - + case wasm.OpcodeVecV128Store8Lane, wasm.OpcodeVecV128Store16Lane, + wasm.OpcodeVecV128Store32Lane, wasm.OpcodeVecV128Store64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + var storeOp ssa.Opcode + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Store8Lane: + storeOp, lane, opSize = ssa.OpcodeIstore8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Store16Lane: + storeOp, lane, opSize = ssa.OpcodeIstore16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Store32Lane: + storeOp, lane, opSize = ssa.OpcodeIstore32, ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Store64Lane: + storeOp, lane, opSize = ssa.OpcodeStore, ssa.VecLaneI64x2, 8 + } + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + value := builder.AllocateInstruction(). + AsExtractlane(vector, laneIndex, lane, false). + Insert(builder).Return() + builder.AllocateInstruction(). + AsStore(storeOp, value, addr, offset). + Insert(builder) case wasm.OpcodeVecV128Not: if state.unreachable { break diff --git a/internal/engine/wazevo/ssa/instructions.go b/internal/engine/wazevo/ssa/instructions.go index 8a8777df..3bc63633 100644 --- a/internal/engine/wazevo/ssa/instructions.go +++ b/internal/engine/wazevo/ssa/instructions.go @@ -308,6 +308,9 @@ const ( // `v = sload32x2 MemFlags, p, Offset`. OpcodeSload32x2 + // OpcodeLoadSplat represents a load that replicates the loaded value to all lanes `v = LoadSplat.lane MemFlags, p, Offset`. + OpcodeLoadSplat + // OpcodeIconst represents the integer const. OpcodeIconst @@ -712,10 +715,6 @@ const ( // OpcodeBitcast is a bitcast operation: `v = bitcast MemFlags, x`. OpcodeBitcast - // OpcodeScalarToVector ... - // `v = scalar_to_vector s`. - OpcodeScalarToVector - // OpcodeBmask ... // `v = bmask x`. OpcodeBmask @@ -881,6 +880,7 @@ var instructionSideEffects = [opcodeEnd]sideEffect{ OpcodeCtz: sideEffectNone, OpcodePopcnt: sideEffectNone, OpcodeLoad: sideEffectNone, + OpcodeLoadSplat: sideEffectNone, OpcodeUload8: sideEffectNone, OpcodeUload16: sideEffectNone, OpcodeUload32: sideEffectNone, @@ -1106,6 +1106,7 @@ var instructionReturnTypes = [opcodeEnd]returnTypesFn{ return }, OpcodeLoad: returnTypesFnSingle, + OpcodeLoadSplat: returnTypesFnV128, OpcodeIadd: returnTypesFnSingle, OpcodeIsub: returnTypesFnSingle, OpcodeImul: returnTypesFnSingle, @@ -1182,7 +1183,7 @@ func (i *Instruction) AsLoad(ptr Value, offset uint32, typ Type) *Instruction { } // AsExtLoad initializes this instruction as a store instruction with OpcodeLoad. -func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) { +func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) *Instruction { i.opcode = op i.v = ptr i.u1 = uint64(offset) @@ -1191,14 +1192,17 @@ func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bo } else { i.typ = TypeI32 } + return i } -// AsSimdLoad initializes this instruction as a load instruction with OpcodeLoad 128 bit. -func (i *Instruction) AsSimdLoad(op Opcode, ptr Value, offset uint32) { - i.opcode = op +// AsLoadSplat initializes this instruction as a store instruction with OpcodeLoadSplat. +func (i *Instruction) AsLoadSplat(ptr Value, offset uint32, lane VecLane) *Instruction { + i.opcode = OpcodeLoadSplat i.v = ptr i.u1 = uint64(offset) + i.u2 = uint64(lane) i.typ = TypeV128 + return i } // LoadData returns the operands for a load instruction. 
@@ -1206,6 +1210,11 @@ func (i *Instruction) LoadData() (ptr Value, offset uint32, typ Type) { return i.v, uint32(i.u1), i.typ } +// LoadSplatData returns the operands for a load splat instruction. +func (i *Instruction) LoadSplatData() (ptr Value, offset uint32, lane VecLane) { + return i.v, uint32(i.u1), VecLane(i.u2) +} + // AsStore initializes this instruction as a store instruction with OpcodeStore. func (i *Instruction) AsStore(storeOp Opcode, value, ptr Value, offset uint32) *Instruction { i.opcode = storeOp @@ -2512,6 +2521,8 @@ func (i *Instruction) Format(b Builder) string { instSuffix = fmt.Sprintf(" %s, %s, %#x", i.v.Format(b), i.v2.Format(b), int32(i.u1)) case OpcodeLoad: instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) + case OpcodeLoadSplat: + instSuffix = fmt.Sprintf(".%s %s, %#x", VecLane(i.u2), i.v.Format(b), int32(i.u1)) case OpcodeUload8, OpcodeUload16, OpcodeUload32, OpcodeSload8, OpcodeSload16, OpcodeSload32: instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) case OpcodeSelect, OpcodeVbitselect: @@ -2720,6 +2731,8 @@ func (o Opcode) String() (ret string) { return "SsubSat" case OpcodeLoad: return "Load" + case OpcodeLoadSplat: + return "LoadSplat" case OpcodeStore: return "Store" case OpcodeUload8: @@ -2906,8 +2919,6 @@ func (o Opcode) String() (ret string) { return "Nearest" case OpcodeBitcast: return "Bitcast" - case OpcodeScalarToVector: - return "ScalarToVector" case OpcodeBmask: return "Bmask" case OpcodeIreduce: diff --git a/internal/integration_test/fuzzcases/fuzzcases_test.go b/internal/integration_test/fuzzcases/fuzzcases_test.go index b0f9aa65..4ab5d089 100644 --- a/internal/integration_test/fuzzcases/fuzzcases_test.go +++ b/internal/integration_test/fuzzcases/fuzzcases_test.go @@ -5,7 +5,6 @@ import ( "embed" "fmt" "runtime" - "strings" "testing" "github.com/tetratelabs/wazero" @@ -49,12 +48,6 @@ func runWithInterpreter(t *testing.T, runner func(t *testing.T, r wazero.Runtime func runWithWazevo(t *testing.T, runner func(t *testing.T, r wazero.Runtime)) { t.Run("wazevo", func(t *testing.T) { - name := t.Name() - for _, skipTarget := range []string{"695", "701", "718"} { - if strings.Contains(name, skipTarget) { - t.Skip("TODO: skipping for wazevo until SIMD is completed") - } - } config := wazero.NewRuntimeConfigInterpreter() wazevo.ConfigureWazevo(config) r := wazero.NewRuntimeWithConfig(ctx, config)
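The new LD1R encoding above packs Q into bit 30, the fixed opcode bits into bits 29..12, the element size into bits 11..10, and the register numbers into the usual Rn/Rt slots. The sketch below is not part of the diff; it only re-derives the byte sequence the new encoding test expects (41c4404d for an 8H replicate from register 2 into register 1), assuming arrToSizeQEncoded maps vecArrangement8H to size=0b01 with Q=1.

package main

import (
	"encoding/binary"
	"fmt"
)

// ld1rNoOffset mirrors the bit layout emitted by encodeVecLoad1Rrt for the
// no-post-index form of LD1R {Vt.<T>}, [Xn]:
//   0 | Q | 0011010 | 1 | 0 | 00000 | 110 | 0 | size | Rn | Rt
func ld1rNoOffset(rt, rn, size, q uint32) uint32 {
	return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt
}

func main() {
	word := ld1rNoOffset(1, 2, 0b01, 1) // rt=1, rn=2, 16-bit elements, 128-bit Q form
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], word)
	fmt.Printf("%x\n", buf) // 41c4404d, matching the expected bytes in the new test case
}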
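For the whole-vector loads, the frontend distinguishes two shapes: vN_splat becomes the new OpcodeLoadSplat, which the arm64 backend lowers to a 64-bit add of the static offset into a temporary register followed by the no-offset LD1R, while loadN_zero is lowered as a scalar TypeF32/TypeF64 load (on AArch64 a scalar FP load clears the remaining bits of the destination vector register, which yields the zero-filled upper lanes). The following is a minimal semantic model of both, not code from the diff; the helper names are made up for illustration.

package main

import (
	"encoding/binary"
	"fmt"
)

// load32Splat models `LoadSplat.i32x4 p, offset`: read 32 bits at p+offset and
// replicate them into all four lanes, which is what the add + ld1r pair produces.
func load32Splat(mem []byte, base, offset uint32) (v [16]byte) {
	x := binary.LittleEndian.Uint32(mem[base+offset:])
	for lane := 0; lane < 4; lane++ {
		binary.LittleEndian.PutUint32(v[lane*4:], x)
	}
	return
}

// load64Zero models v128.load64_zero: the low 64 bits come from memory and the
// upper 64 bits stay zero.
func load64Zero(mem []byte, addr uint32) (v [16]byte) {
	copy(v[:8], mem[addr:])
	return
}

func main() {
	mem := make([]byte, 32)
	binary.LittleEndian.PutUint32(mem[8:], 0xdeadbeef)
	fmt.Printf("%x\n", load32Splat(mem, 4, 4)) // efbeadde in every lane
	fmt.Printf("%x\n", load64Zero(mem, 8))     // efbeadde00000000 followed by eight zero bytes
}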
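The per-lane memory ops are lowered as pairs: v128.loadN_lane is an extending scalar load (Uload8/16/32, or a plain 64-bit load) followed by Insertlane, and v128.storeN_lane is Extractlane followed by a narrow store (Istore8/16/32, or Store for the 64-bit lane). Below is a small model of the intended semantics for two of those cases, assuming the little-endian lane layout of the Wasm SIMD spec; it is illustrative only.

package main

import (
	"encoding/binary"
	"fmt"
)

// load16Lane models v128.load16_lane: an unsigned 16-bit load whose result is
// inserted into the given lane of the existing vector.
func load16Lane(mem []byte, addr uint32, v [16]byte, lane int) [16]byte {
	x := binary.LittleEndian.Uint16(mem[addr:])
	binary.LittleEndian.PutUint16(v[lane*2:], x)
	return v
}

// store32Lane models v128.store32_lane: the given 32-bit lane is extracted and
// stored to memory.
func store32Lane(mem []byte, addr uint32, v [16]byte, lane int) {
	x := binary.LittleEndian.Uint32(v[lane*4:])
	binary.LittleEndian.PutUint32(mem[addr:], x)
}

func main() {
	mem := make([]byte, 64)
	binary.LittleEndian.PutUint16(mem[8:], 0xbeef)
	var v [16]byte
	v = load16Lane(mem, 8, v, 3)              // i16x8 lane 3 now holds 0xbeef
	store32Lane(mem, 16, v, 1)                // i32x4 lane 1 (bytes 4..7 of v) written to mem[16:20]
	fmt.Printf("%x %x\n", v[6:8], mem[16:20]) // efbe 0000efbe
}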
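The extending loads (v128.load8x8_s/u, load16x4_s/u, load32x2_s/u) are built from a load followed by AsWiden over the low half of the lanes, with the signedness chosen per opcode. A sketch of the signed 32x2 case, again just a model of the expected result rather than the lowering itself:

package main

import (
	"encoding/binary"
	"fmt"
)

// load32x2Signed models v128.load32x2_s: read two consecutive 32-bit values
// and sign-extend each into a 64-bit lane of the result.
func load32x2Signed(mem []byte, addr uint32) (v [16]byte) {
	for lane := 0; lane < 2; lane++ {
		x := int32(binary.LittleEndian.Uint32(mem[addr+uint32(lane)*4:]))
		binary.LittleEndian.PutUint64(v[lane*8:], uint64(int64(x)))
	}
	return
}

func main() {
	mem := make([]byte, 16)
	binary.LittleEndian.PutUint32(mem[0:], 0xffffffff) // -1 as i32
	binary.LittleEndian.PutUint32(mem[4:], 7)
	fmt.Printf("%x\n", load32x2Signed(mem, 0)) // lane 0 = ffffffffffffffff, lane 1 = 0700000000000000
}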