wazevo: passes simd load/store spec tests (#1766)

Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com>
Author: Edoardo Vacchi
Date:   2023-10-11 01:18:07 +02:00 (committed by GitHub)
Parent: fc8419346a
Commit: e3d83bbc7a
8 changed files with 235 additions and 18 deletions


@@ -88,6 +88,7 @@ var defKinds = [numInstructionKinds]defKind{
fpuLoad32: defKindRD,
fpuLoad64: defKindRD,
fpuLoad128: defKindRD,
vecLoad1R: defKindRD,
loadFpuConst32: defKindRD,
loadFpuConst64: defKindRD,
loadFpuConst128: defKindRD,
@@ -212,6 +213,7 @@ var useKinds = [numInstructionKinds]useKind{
loadFpuConst32: useKindNone,
loadFpuConst64: useKindNone,
loadFpuConst128: useKindNone,
vecLoad1R: useKindRN,
cSel: useKindRNRM,
fpuCSel: useKindRNRM,
movToVec: useKindRN,
@@ -543,6 +545,13 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte)
i.amode = amode
}
func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
i.kind = vecLoad1R
i.rd = rd
i.rn = rn
i.u1 = uint64(arr)
}
func (i *instruction) asCSet(rd regalloc.VReg, c condFlag) {
i.kind = cSet
i.rd = operandNR(rd)
@@ -1474,6 +1483,8 @@ const (
loadFpuConst64
// loadFpuConst128 represents a load of a 128-bit floating-point constant.
loadFpuConst128
// vecLoad1R represents a load of one single-element structure that is replicated to all lanes of a vector.
vecLoad1R
// fpuToInt represents a conversion from FP to integer.
fpuToInt
// intToFpu represents a conversion from integer to FP.
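Note (not part of the diff): for register allocation, these table entries make vecLoad1R behave like the other loads: defKindRD marks the destination register as the instruction's only def, and useKindRN marks the base-address register as its only use.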


@@ -46,6 +46,11 @@ func (i *instruction) encode(c backend.Compiler) {
c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode))
case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128:
c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode))
case vecLoad1R:
c.Emit4Bytes(encodeVecLoad1Rrt(
regNumberInEncoding[i.rd.realReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(i.u1)))
case condBr:
imm19 := i.condBrOffset()
if imm19%4 != 0 {
@@ -1293,6 +1298,13 @@ func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint3
}
}
// encodeVecLoad1Rrt encodes as Load one single-element structure and Replicate to all lanes (of one register) in
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm
func encodeVecLoad1Rrt(rt, rn uint32, arr vecArrangement) uint32 {
size, q := arrToSizeQEncoded(arr)
return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt
}
// encodeAluBitmaskImmediate encodes as Logical (immediate) in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 {
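As a cross-check of the bit layout in encodeVecLoad1Rrt, here is a minimal standalone sketch (not part of the diff), assuming the conventional arm64 size/Q mapping in which the 8H arrangement encodes as size=0b01 and Q=1 (what arrToSizeQEncoded is expected to return for vecArrangement8H). It reproduces the "41c4404d" test vector added to the encoder test below.

package main

import "fmt"

// Bit layout of LD1R (no offset): bit 31 = 0, bit 30 = Q,
// bits 29:12 fixed at 0b001101010000001100, bits 11:10 = size,
// bits 9:5 = Rn, bits 4:0 = Rt.
func encodeLD1R(rt, rn, size, q uint32) uint32 {
	return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt
}

func main() {
	// ld1r {v1.8h}, [x2]: rt=1, rn=2, size=0b01, q=1.
	fmt.Printf("%08x\n", encodeLD1R(1, 2, 0b01, 1)) // prints 4d40c441
	// Emitted as little-endian bytes, this is "41c4404d".
}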


@@ -1589,6 +1589,7 @@ func TestInstruction_encode(t *testing.T) {
i.asFpuRR(fpuUniOpRoundNearest, operandNR(v1VReg), operandNR(v2VReg), true)
}},
{want: "4140611e", setup: func(i *instruction) { i.asFpuRR(fpuUniOpNeg, operandNR(v1VReg), operandNR(v2VReg), true) }},
{want: "41c4404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) }},
{want: "4201231e4201631e4201239e4201639e4201221e4201621e4201229e4201629e", setup: func(i *instruction) {
i.asNop0()
cur := i


@@ -674,6 +674,24 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
}
m.insert(dup)
case ssa.OpcodeLoadSplat:
x, offset, lane := instr.LoadSplatData()
rd := operandNR(m.compiler.VRegOf(instr.Return()))
arr := ssaLaneToArrangement(lane)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
tmpReg := m.compiler.AllocateVReg(ssa.TypeI32)
// Our encoding for vecLoad1R does not yet support all addressing modes,
// so we use the no-offset addressing mode and add the offset into a temporary register.
add := m.allocateInstr()
add.asALU(aluOpAdd, operandNR(tmpReg), rn, operandImm12(uint16(offset), 0), true)
m.insert(add)
ld1r := m.allocateInstr()
ld1r.asVecLoad1R(rd, operandNR(tmpReg), arr)
m.insert(ld1r)
default:
panic("TODO: lowering " + op.String())
}
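With this lowering, each LoadSplat becomes two machine instructions, roughly an `add tmp, base, #imm` to materialize the address followed by `ld1r {vd.<arr>}, [tmp]` (register names illustrative); folding the offset directly into the load is deferred until vecLoad1R supports more addressing modes.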


@@ -199,9 +199,22 @@ func TestSpectestV2(t *testing.T) {
{"simd_i16x8_extadd_pairwise_i8x16"},
{"simd_i32x4_extadd_pairwise_i16x8"},
{"simd_int_to_int_extend"},
{"simd_load"},
{"simd_load_extend"},
{"simd_load_splat"},
{"simd_load_zero"},
{"simd_load8_lane"},
{"simd_load16_lane"},
{"simd_load32_lane"},
{"simd_load64_lane"},
{"simd_lane"},
{"simd_linking"},
{"simd_splat"},
{"simd_store"},
{"simd_store8_lane"},
{"simd_store16_lane"},
{"simd_store32_lane"},
{"simd_store64_lane"},
} {
t.Run(tc.name, func(t *testing.T) {
t.Run("normal", func(t *testing.T) {


@@ -1705,6 +1705,135 @@ func (c *Compiler) lowerCurrentOpcode() {
load.AsLoad(addr, offset, ssa.TypeV128)
builder.InsertInstruction(load)
state.push(load.Return())
case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, wasm.OpcodeVecV128Load32Lane:
_, offset := c.readMemArg()
state.pc++
if state.unreachable {
break
}
var lane ssa.VecLane
var loadOp ssa.Opcode
var opSize uint64
switch vecOp {
case wasm.OpcodeVecV128Load8Lane:
loadOp, lane, opSize = ssa.OpcodeUload8, ssa.VecLaneI8x16, 1
case wasm.OpcodeVecV128Load16Lane:
loadOp, lane, opSize = ssa.OpcodeUload16, ssa.VecLaneI16x8, 2
case wasm.OpcodeVecV128Load32Lane:
loadOp, lane, opSize = ssa.OpcodeUload32, ssa.VecLaneI32x4, 4
}
laneIndex := c.wasmFunctionBody[state.pc]
vector := state.pop()
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), opSize)
load := builder.AllocateInstruction().
AsExtLoad(loadOp, addr, offset, false).
Insert(builder).Return()
ret := builder.AllocateInstruction().
AsInsertlane(vector, load, laneIndex, lane).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Load64Lane:
_, offset := c.readMemArg()
state.pc++
if state.unreachable {
break
}
laneIndex := c.wasmFunctionBody[state.pc]
vector := state.pop()
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), 8)
load := builder.AllocateInstruction().
AsLoad(addr, offset, ssa.TypeI64).
Insert(builder).Return()
ret := builder.AllocateInstruction().
AsInsertlane(vector, load, laneIndex, ssa.VecLaneI64x2).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Load32zero:
_, offset := c.readMemArg()
if state.unreachable {
break
}
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), 4)
ret := builder.AllocateInstruction().
AsLoad(addr, offset, ssa.TypeF32).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Load64zero:
_, offset := c.readMemArg()
if state.unreachable {
break
}
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), 8)
ret := builder.AllocateInstruction().
AsLoad(addr, offset, ssa.TypeF64).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Load8x8u, wasm.OpcodeVecV128Load8x8s,
wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load16x4s,
wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load32x2s:
_, offset := c.readMemArg()
if state.unreachable {
break
}
var lane ssa.VecLane
var signed bool
switch vecOp {
case wasm.OpcodeVecV128Load8x8s:
signed = true
fallthrough
case wasm.OpcodeVecV128Load8x8u:
lane = ssa.VecLaneI8x16
case wasm.OpcodeVecV128Load16x4s:
signed = true
fallthrough
case wasm.OpcodeVecV128Load16x4u:
lane = ssa.VecLaneI16x8
case wasm.OpcodeVecV128Load32x2s:
signed = true
fallthrough
case wasm.OpcodeVecV128Load32x2u:
lane = ssa.VecLaneI32x4
}
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), 8)
load := builder.AllocateInstruction().
AsLoad(addr, offset, ssa.TypeV128).
Insert(builder).Return()
ret := builder.AllocateInstruction().
AsWiden(load, lane, signed, true).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat,
wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat:
_, offset := c.readMemArg()
if state.unreachable {
break
}
var lane ssa.VecLane
var opSize uint64
switch vecOp {
case wasm.OpcodeVecV128Load8Splat:
lane, opSize = ssa.VecLaneI8x16, 1
case wasm.OpcodeVecV128Load16Splat:
lane, opSize = ssa.VecLaneI16x8, 2
case wasm.OpcodeVecV128Load32Splat:
lane, opSize = ssa.VecLaneI32x4, 4
case wasm.OpcodeVecV128Load64Splat:
lane, opSize = ssa.VecLaneI64x2, 8
}
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), opSize)
ret := builder.AllocateInstruction().
AsLoadSplat(addr, offset, lane).
Insert(builder).Return()
state.push(ret)
case wasm.OpcodeVecV128Store:
_, offset := c.readMemArg()
if state.unreachable {
@@ -1716,7 +1845,36 @@ func (c *Compiler) lowerCurrentOpcode() {
builder.AllocateInstruction().
AsStore(ssa.OpcodeStore, value, addr, offset).
Insert(builder)
case wasm.OpcodeVecV128Store8Lane, wasm.OpcodeVecV128Store16Lane,
wasm.OpcodeVecV128Store32Lane, wasm.OpcodeVecV128Store64Lane:
_, offset := c.readMemArg()
state.pc++
if state.unreachable {
break
}
laneIndex := c.wasmFunctionBody[state.pc]
var storeOp ssa.Opcode
var lane ssa.VecLane
var opSize uint64
switch vecOp {
case wasm.OpcodeVecV128Store8Lane:
storeOp, lane, opSize = ssa.OpcodeIstore8, ssa.VecLaneI8x16, 1
case wasm.OpcodeVecV128Store16Lane:
storeOp, lane, opSize = ssa.OpcodeIstore16, ssa.VecLaneI16x8, 2
case wasm.OpcodeVecV128Store32Lane:
storeOp, lane, opSize = ssa.OpcodeIstore32, ssa.VecLaneI32x4, 4
case wasm.OpcodeVecV128Store64Lane:
storeOp, lane, opSize = ssa.OpcodeStore, ssa.VecLaneI64x2, 8
}
vector := state.pop()
baseAddr := state.pop()
addr := c.memOpSetup(baseAddr, uint64(offset), opSize)
value := builder.AllocateInstruction().
AsExtractlane(vector, laneIndex, lane, false).
Insert(builder).Return()
builder.AllocateInstruction().
AsStore(storeOp, value, addr, offset).
Insert(builder)
case wasm.OpcodeVecV128Not:
if state.unreachable {
break
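Taken together, the new frontend cases reuse existing scalar SSA ops wherever possible: each loadN_lane lowers to a scalar load feeding Insertlane, each loadNxM_{s,u} to a 128-bit load feeding a low-half Widen, load32_zero/load64_zero to plain scalar FP loads, and only loadN_splat needs the new LoadSplat opcode; symmetrically, each storeN_lane lowers to an Extractlane feeding a scalar store.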


@@ -308,6 +308,9 @@ const (
// `v = sload32x2 MemFlags, p, Offset`.
OpcodeSload32x2
// OpcodeLoadSplat represents a load that replicates the loaded value to all lanes:
// `v = LoadSplat.lane MemFlags, p, Offset`.
OpcodeLoadSplat
// OpcodeIconst represents the integer const.
OpcodeIconst
@@ -712,10 +715,6 @@ const (
// OpcodeBitcast is a bitcast operation: `v = bitcast MemFlags, x`.
OpcodeBitcast
// OpcodeScalarToVector ...
// `v = scalar_to_vector s`.
OpcodeScalarToVector
// OpcodeBmask ...
// `v = bmask x`.
OpcodeBmask
@@ -881,6 +880,7 @@ var instructionSideEffects = [opcodeEnd]sideEffect{
OpcodeCtz: sideEffectNone,
OpcodePopcnt: sideEffectNone,
OpcodeLoad: sideEffectNone,
OpcodeLoadSplat: sideEffectNone,
OpcodeUload8: sideEffectNone,
OpcodeUload16: sideEffectNone,
OpcodeUload32: sideEffectNone,
@@ -1106,6 +1106,7 @@ var instructionReturnTypes = [opcodeEnd]returnTypesFn{
return
},
OpcodeLoad: returnTypesFnSingle,
OpcodeLoadSplat: returnTypesFnV128,
OpcodeIadd: returnTypesFnSingle,
OpcodeIsub: returnTypesFnSingle,
OpcodeImul: returnTypesFnSingle,
@@ -1182,7 +1183,7 @@ func (i *Instruction) AsLoad(ptr Value, offset uint32, typ Type) *Instruction {
}
// AsExtLoad initializes this instruction as an extending-load instruction with the given opcode.
func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) {
func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) *Instruction {
i.opcode = op
i.v = ptr
i.u1 = uint64(offset)
@@ -1191,14 +1192,17 @@ func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bo
} else {
i.typ = TypeI32
}
return i
}
// AsSimdLoad initializes this instruction as a load instruction with OpcodeLoad 128 bit.
func (i *Instruction) AsSimdLoad(op Opcode, ptr Value, offset uint32) {
i.opcode = op
// AsLoadSplat initializes this instruction as a load instruction with OpcodeLoadSplat.
func (i *Instruction) AsLoadSplat(ptr Value, offset uint32, lane VecLane) *Instruction {
i.opcode = OpcodeLoadSplat
i.v = ptr
i.u1 = uint64(offset)
i.u2 = uint64(lane)
i.typ = TypeV128
return i
}
// LoadData returns the operands for a load instruction.
@@ -1206,6 +1210,11 @@ func (i *Instruction) LoadData() (ptr Value, offset uint32, typ Type) {
return i.v, uint32(i.u1), i.typ
}
// LoadSplatData returns the operands for a load splat instruction.
func (i *Instruction) LoadSplatData() (ptr Value, offset uint32, lane VecLane) {
return i.v, uint32(i.u1), VecLane(i.u2)
}
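The offset and lane travel in the instruction's generic u1/u2 payload slots; a minimal standalone round-trip sketch (simplified types, not part of the diff, assuming VecLane is an integer enum as in this package):

package main

import "fmt"

type VecLane uint64

type instruction struct {
	u1, u2 uint64 // generic payload slots, used here for offset and lane
}

func (i *instruction) asLoadSplat(offset uint32, lane VecLane) {
	i.u1, i.u2 = uint64(offset), uint64(lane)
}

func (i *instruction) loadSplatData() (offset uint32, lane VecLane) {
	return uint32(i.u1), VecLane(i.u2)
}

func main() {
	var i instruction
	i.asLoadSplat(16, VecLane(3))
	offset, lane := i.loadSplatData()
	fmt.Println(offset, lane) // 16 3
}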
// AsStore initializes this instruction as a store instruction with OpcodeStore.
func (i *Instruction) AsStore(storeOp Opcode, value, ptr Value, offset uint32) *Instruction {
i.opcode = storeOp
@@ -2512,6 +2521,8 @@ func (i *Instruction) Format(b Builder) string {
instSuffix = fmt.Sprintf(" %s, %s, %#x", i.v.Format(b), i.v2.Format(b), int32(i.u1))
case OpcodeLoad:
instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1))
case OpcodeLoadSplat:
instSuffix = fmt.Sprintf(".%s %s, %#x", VecLane(i.u2), i.v.Format(b), int32(i.u1))
case OpcodeUload8, OpcodeUload16, OpcodeUload32, OpcodeSload8, OpcodeSload16, OpcodeSload32:
instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1))
case OpcodeSelect, OpcodeVbitselect:
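For illustration (assuming VecLane's String form is the lane name, e.g. i16x8): with pointer v2 and offset 16, the new case renders the suffix ".i16x8 v2, 0x10"; the assigned-value prefix comes from the surrounding Format logic.

fmt.Sprintf(".%s %s, %#x", "i16x8", "v2", int32(16)) // ".i16x8 v2, 0x10"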
@@ -2720,6 +2731,8 @@ func (o Opcode) String() (ret string) {
return "SsubSat"
case OpcodeLoad:
return "Load"
case OpcodeLoadSplat:
return "LoadSplat"
case OpcodeStore:
return "Store"
case OpcodeUload8:
@@ -2906,8 +2919,6 @@ func (o Opcode) String() (ret string) {
return "Nearest"
case OpcodeBitcast:
return "Bitcast"
case OpcodeScalarToVector:
return "ScalarToVector"
case OpcodeBmask:
return "Bmask"
case OpcodeIreduce:


@@ -5,7 +5,6 @@ import (
"embed"
"fmt"
"runtime"
"strings"
"testing"
"github.com/tetratelabs/wazero"
@@ -49,12 +48,6 @@ func runWithInterpreter(t *testing.T, runner func(t *testing.T, r wazero.Runtime
func runWithWazevo(t *testing.T, runner func(t *testing.T, r wazero.Runtime)) {
t.Run("wazevo", func(t *testing.T) {
name := t.Name()
for _, skipTarget := range []string{"695", "701", "718"} {
if strings.Contains(name, skipTarget) {
t.Skip("TODO: skipping for wazevo until SIMD is completed")
}
}
config := wazero.NewRuntimeConfigInterpreter()
wazevo.ConfigureWazevo(config)
r := wazero.NewRuntimeWithConfig(ctx, config)