wazevo(amd64): lowering for popcnt, ctz, clz + cpuid refactor (#1959)

Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com>
Edoardo Vacchi
2024-01-25 18:19:37 +01:00
committed by GitHub
parent d6ab95b142
commit 16a6ffb129
11 changed files with 368 additions and 45 deletions

View File

@@ -1268,7 +1268,7 @@ func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error {
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
} else {
@@ -1331,7 +1331,7 @@ func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
}
unsignedInt := wazeroir.UnsignedInt(o.B1)
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
if unsignedInt == wazeroir.UnsignedInt32 {
c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
} else {

View File

@@ -470,17 +470,17 @@ func TestAmd64Compiler_preventCrossedTargetdRegisters(t *testing.T) {
// mockCpuFlags implements platform.CpuFeatureFlags
type mockCpuFlags struct {
flags uint64
extraFlags uint64
flags platform.CpuFeature
extraFlags platform.CpuFeature
}
// Has implements the method of the same name in platform.CpuFeatureFlags
func (f *mockCpuFlags) Has(flag uint64) bool {
func (f *mockCpuFlags) Has(flag platform.CpuFeature) bool {
return (f.flags & flag) != 0
}
// HasExtra implements the method of the same name in platform.CpuFeatureFlags
func (f *mockCpuFlags) HasExtra(flag uint64) bool {
func (f *mockCpuFlags) HasExtra(flag platform.CpuFeature) bool {
return (f.extraFlags & flag) != 0
}
@@ -498,7 +498,7 @@ func TestAmd64Compiler_ensureClz_ABM(t *testing.T) {
expectedCode: "b80a000000f3480fbdc0",
cpuFeatures: &mockCpuFlags{
flags: 0,
extraFlags: platform.CpuExtraFeatureABM,
extraFlags: platform.CpuExtraFeatureAmd64ABM,
},
},
{
@@ -556,7 +556,7 @@ func TestAmd64Compiler_ensureCtz_ABM(t *testing.T) {
expectedCode: "b80a000000f3480fbcc0",
cpuFeatures: &mockCpuFlags{
flags: 0,
extraFlags: platform.CpuExtraFeatureABM,
extraFlags: platform.CpuExtraFeatureAmd64ABM,
},
},
{

View File

@@ -1574,6 +1574,7 @@ var defKinds = [instrMax]defKind{
aluRmiR: defKindNone,
shiftR: defKindNone,
imm: defKindOp2,
unaryRmR: defKindOp2,
xmmUnaryRmR: defKindOp2,
mov64MR: defKindOp2,
movsxRmR: defKindOp2,
@@ -1626,6 +1627,7 @@ var useKinds = [instrMax]useKind{
aluRmiR: useKindOp1Op2Reg,
shiftR: useKindOp1Op2Reg,
imm: useKindNone,
unaryRmR: useKindOp1,
xmmUnaryRmR: useKindOp1,
mov64MR: useKindOp1,
movzxRmR: useKindOp1,

View File

@@ -10,6 +10,7 @@ import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
"github.com/tetratelabs/wazero/internal/platform"
)
// NewBackend returns a new backend for arm64.
@@ -21,9 +22,10 @@ func NewBackend() backend.Machine {
asNop,
)
return &machine{
ectx: ectx,
regAlloc: regalloc.NewAllocator(regInfo),
spillSlots: map[regalloc.VRegID]int64{},
ectx: ectx,
cpuFeatures: platform.CpuFeatures,
regAlloc: regalloc.NewAllocator(regInfo),
spillSlots: map[regalloc.VRegID]int64{},
}
}
@@ -34,6 +36,8 @@ type (
ectx *backend.ExecutableContextT[instruction]
stackBoundsCheckDisabled bool
cpuFeatures platform.CpuFeatureFlags
regAlloc regalloc.Allocator
regAllocFn *backend.RegAllocFunction[*instruction, *machine]
regAllocStarted bool
@@ -218,12 +222,18 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
m.lowerShiftR(instr, shiftROpShiftLeft)
case ssa.OpcodeSshr:
m.lowerShiftR(instr, shiftROpShiftRightArithmetic)
case ssa.OpcodeUshr:
m.lowerShiftR(instr, shiftROpShiftRightLogical)
case ssa.OpcodeRotl:
m.lowerShiftR(instr, shiftROpRotateLeft)
case ssa.OpcodeRotr:
m.lowerShiftR(instr, shiftROpRotateRight)
case ssa.OpcodeUshr:
m.lowerShiftR(instr, shiftROpShiftRightLogical)
case ssa.OpcodeClz:
m.lowerClz(instr)
case ssa.OpcodeCtz:
m.lowerCtz(instr)
case ssa.OpcodePopcnt:
m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt)
case ssa.OpcodeUndefined:
m.insert(m.allocateInstr().asUD2())
case ssa.OpcodeExitWithCode:
@@ -320,6 +330,132 @@ func (m *machine) lowerVconst(res ssa.Value, lo, hi uint64) {
jmp.asJmp(newOperandLabel(afterLoadLabel))
}
func (m *machine) lowerCtz(instr *ssa.Instruction) {
if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt)
} else {
// On processors that do not support TZCNT, the BSF instruction is
// executed instead. The key difference between TZCNT and BSF is that
// BSF leaves the destination operand undefined when the source
// operand is zero.
// https://www.felixcloutier.com/x86/tzcnt.html
x := instr.Arg()
if !x.Type().IsInt() {
panic("BUG?")
}
_64 := x.Type().Bits() == 64
xDef := m.c.ValueDefinition(x)
rm := m.getOperand_Reg(xDef)
rd := m.c.VRegOf(instr.Return())
// First, we have to check if the input is non-zero: BSF is undefined
// on zero, while Wasm requires the bit width in that case.
test := m.allocateInstr()
test.asCmpRmiR(false, rm, rm.r, _64)
m.insert(test)
jmpNz := m.allocateInstr() // Will backpatch the operands later.
m.insert(jmpNz)
// If the value is zero, the result is the operand's bit width, so load that constant.
m.lowerIconst(rd, uint64(x.Type().Bits()), _64)
// Now jump right after the non-zero case.
jmpAtEnd := m.allocateInstr() // Will backpatch later.
m.insert(jmpAtEnd)
// jmpNz target label is set here.
nop, nz := m.allocateBrTarget()
jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
m.insert(nop)
// Emit the non-zero case.
bsr := m.allocateInstr()
bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, rd, _64)
m.insert(bsr)
// jmpAtEnd target label is set here.
nopEnd, end := m.allocateBrTarget()
jmpAtEnd.asJmp(newOperandLabel(end))
m.insert(nopEnd)
}
}
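
The fallback branch structure above is what the lowered machine code has to encode. As a sanity check, here is a minimal pure-Go model of the semantics it reproduces (illustrative names; math/bits stands in for the BSF instruction, which it only matches for non-zero inputs):

package main

import (
	"fmt"
	"math/bits"
)

// ctz64 models the lowered sequence: a zero input takes the constant
// branch (the operand's bit width), a non-zero input takes the BSF
// branch, which yields the index of the lowest set bit.
func ctz64(x uint64) uint64 {
	if x == 0 {
		return 64 // lowerIconst(rd, 64) in the zero branch
	}
	return uint64(bits.TrailingZeros64(x)) // what BSF computes for non-zero x
}

func main() {
	fmt.Println(ctz64(0)) // 64
	fmt.Println(ctz64(8)) // 3
}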
func (m *machine) lowerClz(instr *ssa.Instruction) {
if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt)
} else {
// On processors that do not support LZCNT, we combine BSR (which
// calculates the index of the most significant set bit) with XOR.
// This logic is described in the "Replace Raw Assembly Code with
// Builtin Intrinsics" section of:
// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
x := instr.Arg()
if !x.Type().IsInt() {
panic("BUG?")
}
_64 := x.Type().Bits() == 64
xDef := m.c.ValueDefinition(x)
rm := m.getOperand_Reg(xDef)
rd := m.c.VRegOf(instr.Return())
// First, we have to check if the rm is non-zero as BSR is undefined
// on zero. See https://www.felixcloutier.com/x86/bsr.
test := m.allocateInstr()
test.asCmpRmiR(false, rm, rm.r, _64)
m.insert(test)
jmpNz := m.allocateInstr() // Will backpatch later.
m.insert(jmpNz)
// If the value is zero, the result is the operand's bit width, so load that constant.
m.lowerIconst(rd, uint64(x.Type().Bits()), _64)
// Now jump right after the non-zero case.
jmpAtEnd := m.allocateInstr() // Will backpatch later.
m.insert(jmpAtEnd)
// jmpNz target label is set here.
nop, nz := m.allocateBrTarget()
jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
m.insert(nop)
// Emit the non-zero case; tmp is the same vreg as rd.
tmp := m.c.VRegOf(instr.Return())
bsr := m.allocateInstr()
bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64)
m.insert(bsr)
// Now we XOR the value with the bit length minus one.
xor := m.allocateInstr()
xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64)
m.insert(xor)
// jmpAtEnd target label is set here.
nopEnd, end := m.allocateBrTarget()
jmpAtEnd.asJmp(newOperandLabel(end))
m.insert(nopEnd)
}
}
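
The XOR step works because BSR returns the index i of the highest set bit, and for 0 <= i <= 63, 63 ^ i == 63 - i (subtracting from an all-ones value borrows nothing), which is exactly the leading-zero count. A pure-Go model under the same illustrative naming:

package main

import (
	"fmt"
	"math/bits"
)

// clz64 models the BSR+XOR fallback: bits.Len64(x)-1 is the index of
// the most significant set bit (what BSR computes for non-zero x),
// and XOR with 63 turns it into the leading-zero count.
func clz64(x uint64) uint64 {
	if x == 0 {
		return 64 // zero branch, as in the lowering
	}
	bsr := uint64(bits.Len64(x) - 1)
	return bsr ^ 63
}

func main() {
	fmt.Println(clz64(1))       // 63
	fmt.Println(clz64(1 << 40)) // 23
}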
func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) {
x := si.Arg()
if !x.Type().IsInt() {
panic("BUG?")
}
_64 := x.Type().Bits() == 64
xDef := m.c.ValueDefinition(x)
rm := m.getOperand_Imm32_Reg(xDef)
rd := m.c.VRegOf(si.Return())
instr := m.allocateInstr()
instr.asUnaryRmR(op, rm, rd, _64)
m.insert(instr)
}
func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) {
mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
load := m.allocateInstr()

View File

@@ -9,6 +9,7 @@ import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
"github.com/tetratelabs/wazero/internal/platform"
"github.com/tetratelabs/wazero/internal/testing/require"
)
@@ -270,3 +271,161 @@ L1:
ud2
`, m.Format())
}
func Test_machine_lowerClz(t *testing.T) {
for _, tc := range []struct {
name string
setup func(*mockCompiler, ssa.Builder, *machine) *backend.SSAValueDefinition
cpuFlags platform.CpuFeatureFlags
tpe ssa.Type
exp string
}{
{
name: "no extra flags (64)",
cpuFlags: &mockCpuFlags{},
tpe: ssa.TypeI64,
exp: `
testq %rax, %rax
jnz L1
movabsq $64, %rcx
jmp L2
L1:
bsrq %rax, %rcx
xor $63, %rcx
L2:
`,
},
{
name: "ABM (64)",
cpuFlags: &mockCpuFlags{extraFlags: platform.CpuExtraFeatureAmd64ABM},
tpe: ssa.TypeI64,
exp: `
lzcntq %rax, %rcx
`,
},
{
name: "no extra flags (32)",
cpuFlags: &mockCpuFlags{},
tpe: ssa.TypeI32,
exp: `
testl %eax, %eax
jnz L1
movl $32, %ecx
jmp L2
L1:
bsrl %eax, %ecx
xor $31, %ecx
L2:
`,
},
{
name: "ABM (32)",
cpuFlags: &mockCpuFlags{extraFlags: platform.CpuExtraFeatureAmd64ABM},
tpe: ssa.TypeI32,
exp: `
lzcntl %eax, %ecx
`,
},
} {
t.Run(tc.name, func(t *testing.T) {
ctx, b, m := newSetupWithMockContext()
p := b.CurrentBlock().AddParam(b, tc.tpe)
m.cpuFeatures = tc.cpuFlags
ctx.definitions[p] = &backend.SSAValueDefinition{BlockParamValue: p, BlkParamVReg: raxVReg}
ctx.vRegMap[0] = rcxVReg
instr := &ssa.Instruction{}
instr.AsClz(p)
m.lowerClz(instr)
m.ectx.FlushPendingInstructions()
m.ectx.RootInstr = m.ectx.PerBlockHead
require.Equal(t, tc.exp, m.Format())
})
}
}
func Test_machine_lowerCtz(t *testing.T) {
for _, tc := range []struct {
name string
setup func(*mockCompiler, ssa.Builder, *machine) *backend.SSAValueDefinition
cpuFlags platform.CpuFeatureFlags
tpe ssa.Type
exp string
}{
{
name: "no extra flags (64)",
cpuFlags: &mockCpuFlags{},
tpe: ssa.TypeI64,
exp: `
testq %rax, %rax
jnz L1
movabsq $64, %rcx
jmp L2
L1:
bsfq %rax, %rcx
L2:
`,
},
{
name: "ABM (64)",
cpuFlags: &mockCpuFlags{extraFlags: platform.CpuExtraFeatureAmd64ABM},
tpe: ssa.TypeI64,
exp: `
tzcntq %rax, %rcx
`,
},
{
name: "no extra flags (32)",
cpuFlags: &mockCpuFlags{},
tpe: ssa.TypeI32,
exp: `
testl %eax, %eax
jnz L1
movl $32, %ecx
jmp L2
L1:
bsfl %eax, %ecx
L2:
`,
},
{
name: "ABM (32)",
cpuFlags: &mockCpuFlags{extraFlags: platform.CpuExtraFeatureAmd64ABM},
tpe: ssa.TypeI32,
exp: `
tzcntl %eax, %ecx
`,
},
} {
t.Run(tc.name, func(t *testing.T) {
ctx, b, m := newSetupWithMockContext()
p := b.CurrentBlock().AddParam(b, tc.tpe)
m.cpuFeatures = tc.cpuFlags
ctx.definitions[p] = &backend.SSAValueDefinition{BlockParamValue: p, BlkParamVReg: raxVReg}
ctx.vRegMap[0] = rcxVReg
instr := &ssa.Instruction{}
instr.AsCtz(p)
m.lowerCtz(instr)
m.ectx.FlushPendingInstructions()
m.ectx.RootInstr = m.ectx.PerBlockHead
require.Equal(t, tc.exp, m.Format())
})
}
}
// mockCpuFlags implements platform.CpuFeatureFlags
type mockCpuFlags struct {
flags platform.CpuFeature
extraFlags platform.CpuFeature
}
// Has implements the method of the same name in platform.CpuFeatureFlags
func (f *mockCpuFlags) Has(flag platform.CpuFeature) bool {
return (f.flags & flag) != 0
}
// HasExtra implements the method of the same name in platform.CpuFeatureFlags
func (f *mockCpuFlags) HasExtra(flag platform.CpuFeature) bool {
return (f.extraFlags & flag) != 0
}

View File

@@ -122,6 +122,15 @@ func TestE2E(t *testing.T) {
},
},
},
{
name: "integer bit counts", m: testcases.IntegerBitCounts.Module,
calls: []callCase{{
params: []uint64{10, 100},
expResults: []uint64{
28, 1, 2, 57, 2, 3,
},
}},
},
{
name: "many_params_many_results",
m: testcases.ManyParamsManyResults.Module,
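
The six expected results in the "integer bit counts" case above line up with clz/ctz/popcnt applied to the i32 parameter 10 and then to the i64 parameter 100 (the op-to-param mapping here is inferred from the values, not spelled out in the diff); math/bits reproduces them:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	a, b := uint32(10), uint64(100) // 0b1010 and 0b1100100
	fmt.Println(bits.LeadingZeros32(a))  // 28 (32 - 4 significant bits)
	fmt.Println(bits.TrailingZeros32(a)) // 1
	fmt.Println(bits.OnesCount32(a))     // 2
	fmt.Println(bits.LeadingZeros64(b))  // 57 (64 - 7 significant bits)
	fmt.Println(bits.TrailingZeros64(b)) // 2
	fmt.Println(bits.OnesCount64(b))     // 3
}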

View File

@@ -0,0 +1,25 @@
package platform
// CpuFeatureFlags exposes methods for querying CPU capabilities
type CpuFeatureFlags interface {
// Has returns true when the specified CpuFeature flag is supported
Has(cpuFeature CpuFeature) bool
// HasExtra returns true when the specified extra CpuFeature flag is supported
HasExtra(cpuFeature CpuFeature) bool
}
// CpuFeature represents a CPU capability as a bit in a uint64 mask.
type CpuFeature uint64
const (
// CpuFeatureAmd64SSE3 is the flag to query CpuFeatureFlags.Has for SSEv3 capabilities on amd64
CpuFeatureAmd64SSE3 CpuFeature = 1
// CpuFeatureAmd64SSE4_1 is the flag to query CpuFeatureFlags.Has for SSEv4.1 capabilities on amd64
CpuFeatureAmd64SSE4_1 CpuFeature = 1 << 19
// CpuFeatureAmd64SSE4_2 is the flag to query CpuFeatureFlags.Has for SSEv4.2 capabilities on amd64
CpuFeatureAmd64SSE4_2 CpuFeature = 1 << 20
)
const (
// CpuExtraFeatureAmd64ABM is the flag to query CpuFeatureFlags.HasExtra for Advanced Bit Manipulation capabilities (e.g. LZCNT) on amd64
CpuExtraFeatureAmd64ABM CpuFeature = 1 << 5
)
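
The bit positions mirror the CPUID result registers: leaf 1 ECX bit 0 (SSE3), bit 19 (SSE4.1), bit 20 (SSE4.2), and extended leaf 0x80000001 ECX bit 5 (ABM/LZCNT). A self-contained sketch of querying such a mask — stand-in types, since internal/platform is not importable from outside the wazero module:

package main

import "fmt"

// CpuFeature stands in for platform.CpuFeature: a uint64 bit mask
// whose positions mirror the corresponding CPUID result registers.
type CpuFeature uint64

const (
	SSE3   CpuFeature = 1       // CPUID leaf 1, ECX bit 0
	SSE4_1 CpuFeature = 1 << 19 // CPUID leaf 1, ECX bit 19
	ABM    CpuFeature = 1 << 5  // CPUID leaf 0x80000001, ECX bit 5 (extra bank)
)

func has(flags, f CpuFeature) bool { return flags&f != 0 }

func main() {
	flags := SSE3 | SSE4_1 // pretend CPUID reported these
	fmt.Println(has(flags, SSE3)) // true
	fmt.Println(has(flags, ABM))  // false: ABM lives in the extra-flags bank
}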

View File

@@ -1,30 +1,8 @@
package platform
const (
// CpuFeatureSSE3 is the flag to query CpuFeatureFlags.Has for SSEv3 capabilities
CpuFeatureSSE3 = uint64(1)
// CpuFeatureSSE4_1 is the flag to query CpuFeatureFlags.Has for SSEv4.1 capabilities
CpuFeatureSSE4_1 = uint64(1) << 19
// CpuFeatureSSE4_2 is the flag to query CpuFeatureFlags.Has for SSEv4.2 capabilities
CpuFeatureSSE4_2 = uint64(1) << 20
)
const (
// CpuExtraFeatureABM is the flag to query CpuFeatureFlags.HasExtra for Advanced Bit Manipulation capabilities (e.g. LZCNT)
CpuExtraFeatureABM = uint64(1) << 5
)
// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods
var CpuFeatures CpuFeatureFlags = loadCpuFeatureFlags()
// CpuFeatureFlags exposes methods for querying CPU capabilities
type CpuFeatureFlags interface {
// Has returns true when the specified flag (represented as uint64) is supported
Has(cpuFeature uint64) bool
// HasExtra returns true when the specified extraFlag (represented as uint64) is supported
HasExtra(cpuFeature uint64) bool
}
// cpuFeatureFlags implements CpuFeatureFlags interface
type cpuFeatureFlags struct {
flags uint64
@@ -69,11 +47,11 @@ func loadCpuFeatureFlags() CpuFeatureFlags {
}
// Has implements the same method on the CpuFeatureFlags interface
func (f *cpuFeatureFlags) Has(cpuFeature uint64) bool {
return (f.flags & cpuFeature) != 0
func (f *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool {
return (f.flags & uint64(cpuFeature)) != 0
}
// HasExtra implements the same method on the CpuFeatureFlags interface
func (f *cpuFeatureFlags) HasExtra(cpuFeature uint64) bool {
return (f.extraFlags & cpuFeature) != 0
func (f *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool {
return (f.extraFlags & uint64(cpuFeature)) != 0
}

View File

@@ -8,11 +8,11 @@ import (
func TestAmd64CpuId_cpuHasFeature(t *testing.T) {
flags := cpuFeatureFlags{
flags: CpuFeatureSSE3,
extraFlags: CpuExtraFeatureABM,
flags: uint64(CpuFeatureAmd64SSE3),
extraFlags: uint64(CpuExtraFeatureAmd64ABM),
}
require.True(t, flags.Has(CpuFeatureSSE3))
require.False(t, flags.Has(CpuFeatureSSE4_2))
require.True(t, flags.HasExtra(CpuExtraFeatureABM))
require.True(t, flags.Has(CpuFeatureAmd64SSE3))
require.False(t, flags.Has(CpuFeatureAmd64SSE4_2))
require.True(t, flags.HasExtra(CpuExtraFeatureAmd64ABM))
require.False(t, flags.HasExtra(1<<6)) // some other value
}

View File

@@ -0,0 +1,14 @@
//go:build !amd64
package platform
var CpuFeatures CpuFeatureFlags = &cpuFeatureFlags{}
// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms
type cpuFeatureFlags struct{}
// Has implements the same method on the CpuFeatureFlags interface
func (c *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool { return false }
// HasExtra implements the same method on the CpuFeatureFlags interface
func (c *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool { return false }

View File

@@ -3,5 +3,5 @@ package platform
// init verifies that the current CPU supports the required AMD64 instructions
func init() {
// Ensure SSE4.1 is supported.
archRequirementsVerified = CpuFeatures.Has(CpuFeatureSSE4_1)
archRequirementsVerified = CpuFeatures.Has(CpuFeatureAmd64SSE4_1)
}