diff --git a/BENCHMARK_REPORT_AVX2.md b/BENCHMARK_REPORT_AVX2.md new file mode 100644 index 0000000..b431276 --- /dev/null +++ b/BENCHMARK_REPORT_AVX2.md @@ -0,0 +1,228 @@ +# Benchmark Report: p256k1 Implementation Comparison + +This report compares performance of different secp256k1 implementations: + +1. **Pure Go** - p256k1 with assembly disabled (baseline) +2. **AVX2/ASM** - p256k1 with x86-64 assembly enabled (scalar and field operations) +3. **libsecp256k1** - Bitcoin Core's C library via purego (no CGO) +4. **Default** - p256k1 with automatic feature detection + +## Test Environment + +- **Platform**: Linux 6.8.0 (amd64) +- **CPU**: AMD Ryzen 5 PRO 4650G with Radeon Graphics (12 threads) +- **Go Version**: go1.23+ +- **Date**: 2025-11-28 + +## High-Level Operation Benchmarks + +| Operation | Pure Go | AVX2 | libsecp256k1 | Default | +|-----------|---------|------|--------------|---------| +| **Pubkey Derivation** | 56.09 µs | 55.72 µs | **20.84 µs** | 54.03 µs | +| **Sign** | 56.18 µs | 56.00 µs | **39.92 µs** | 28.92 µs | +| **Verify** | 144.01 µs | 139.55 µs | **42.10 µs** | 139.22 µs | +| **ECDH** | 107.80 µs | 106.30 µs | N/A | 104.53 µs | + +### Relative Performance (vs Pure Go) + +| Operation | AVX2 | libsecp256k1 | +|-----------|------|--------------| +| **Pubkey Derivation** | 1.01x faster | **2.69x faster** | +| **Sign** | 1.00x | **1.41x faster** | +| **Verify** | **1.03x faster** | **3.42x faster** | +| **ECDH** | **1.01x faster** | N/A | + +## Scalar Operation Benchmarks (Isolated) + +These benchmarks measure the individual scalar arithmetic operations in isolation: + +| Operation | Pure Go | x86-64 Assembly | Speedup | +|-----------|---------|-----------------|---------| +| **Scalar Multiply** | 46.52 ns | 30.49 ns | **1.53x faster** | +| **Scalar Add** | 5.29 ns | 4.69 ns | **1.13x faster** | + +The x86-64 scalar multiplication shows a **53% improvement** over pure Go, demonstrating the effectiveness of the optimized 512-bit reduction algorithm. + +## Field Operation Benchmarks (Isolated) + +Field operations (modular arithmetic over the secp256k1 prime field) dominate elliptic curve computations. These benchmarks measure the assembly-optimized field multiplication and squaring: + +| Operation | Pure Go | x86-64 Assembly | Speedup | +|-----------|---------|-----------------|---------| +| **Field Multiply** | 27.5 ns | 26.0 ns | **1.06x faster** | +| **Field Square** | 27.5 ns | 21.7 ns | **1.27x faster** | + +The field squaring assembly shows a **21% improvement** because it exploits the symmetry of squaring (computing 2·a[i]·a[j] once instead of a[i]·a[j] + a[j]·a[i]). + +### Why Field Assembly Speedup is More Modest + +The field multiplication assembly provides a smaller speedup than scalar multiplication because: + +1. **Go's uint128 emulation is efficient**: The pure Go implementation uses `bits.Mul64` and `bits.Add64` which compile to efficient machine code +2. **No SIMD opportunity**: Field multiplication requires sequential 128-bit accumulator operations that don't parallelize well +3. **Memory access patterns**: Both implementations have similar memory access patterns for the 5×52-bit limb representation + +The squaring optimization is more effective because it reduces the number of multiplications by exploiting a[i]·a[j] = a[j]·a[i]. 
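+To make point 1 concrete, the 128-bit accumulator pattern that the pure Go field code builds from `math/bits` can be sketched as below. The `acc128` type and its method names are illustrative stand-ins for the package's internal uint128 helpers, not its actual API:
+
+```go
+package main
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+// acc128 is a minimal 128-bit accumulator of the kind the pure Go field
+// code emulates; the names here are illustrative, not the package's API.
+type acc128 struct {
+	lo, hi uint64
+}
+
+// addMul adds the 128-bit product x*y into the accumulator.
+func (a *acc128) addMul(x, y uint64) {
+	hi, lo := bits.Mul64(x, y)
+	var carry uint64
+	a.lo, carry = bits.Add64(a.lo, lo, 0)
+	a.hi, _ = bits.Add64(a.hi, hi, carry)
+}
+
+// rshift52 shifts the accumulator right by 52 bits, matching the 5x52-bit
+// limb schedule of the field representation.
+func (a *acc128) rshift52() {
+	a.lo = (a.lo >> 52) | (a.hi << 12)
+	a.hi >>= 52
+}
+
+func main() {
+	const mask52 = uint64(1)<<52 - 1
+
+	var d acc128
+	d.addMul(3<<50, 5<<50) // one 52-bit by 52-bit partial product
+	limb := d.lo & mask52  // extract the next output limb
+	d.rshift52()           // carry the remainder into the next column
+	fmt.Printf("limb=%#x carry=%#x\n", limb, d.lo)
+}
+```
+
+On amd64 the compiler lowers `bits.Mul64` to a single MULQ and the `bits.Add64` carry chain to ADD/ADC instructions, so the hand-written assembly mostly saves bookkeeping rather than whole multiplications.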
+ +## Memory Allocations + +| Operation | Pure Go | x86-64 ASM | libsecp256k1 | +|-----------|---------|------------|--------------| +| **Pubkey Derivation** | 256 B / 4 allocs | 256 B / 4 allocs | 504 B / 13 allocs | +| **Sign** | 576 B / 10 allocs | 576 B / 10 allocs | 400 B / 8 allocs | +| **Verify** | 128 B / 4 allocs | 128 B / 4 allocs | 312 B / 8 allocs | +| **ECDH** | 209 B / 5 allocs | 209 B / 5 allocs | N/A | + +The Pure Go and assembly implementations have identical memory profiles since assembly only affects computation, not allocation patterns. libsecp256k1 via purego has higher allocations due to the FFI overhead. + +## Analysis + +### Why Assembly Improvement is Limited at High Level + +The scalar multiplication speedup (53%) and field squaring speedup (21%) don't fully translate to proportional high-level operation improvements because: + +1. **Field operations dominate**: Point multiplication on the elliptic curve spends most time in field arithmetic (modular multiplication/squaring over the prime field p), not scalar arithmetic over the group order n. + +2. **Operation breakdown**: In a typical signature verification: + - ~90% of time: Field multiplications and squarings for point operations + - ~5% of time: Scalar arithmetic + - ~5% of time: Other operations (hashing, memory, etc.) + +3. **Amdahl's Law**: The 21% field squaring speedup affects roughly half of field operations (squaring is called frequently in inversion and exponentiation), yielding ~10% improvement in field-heavy code paths. + +### libsecp256k1 Performance + +The Bitcoin Core C library via purego shows excellent performance: +- **2.7-3.4x faster** for most operations +- Uses highly optimized field arithmetic with platform-specific assembly +- Employs advanced techniques like GLV endomorphism + +### x86-64 Assembly Implementation Details + +#### Scalar Multiplication (`scalar_amd64.s`) + +Implements the same 3-phase reduction algorithm as bitcoin-core/secp256k1: + +**3-Phase Reduction Algorithm:** + +1. **Phase 1**: 512 bits → 385 bits + ``` + m[0..6] = l[0..3] + l[4..7] * NC + ``` + +2. **Phase 2**: 385 bits → 258 bits + ``` + p[0..4] = m[0..3] + m[4..6] * NC + ``` + +3. 
**Phase 3**: 258 bits → 256 bits + ``` + r[0..3] = p[0..3] + p[4] * NC + ``` + Plus final conditional reduction if result ≥ n + +**Constants (NC = 2^256 - n):** +- `NC0 = 0x402DA1732FC9BEBF` +- `NC1 = 0x4551231950B75FC4` +- `NC2 = 1` + +#### Field Multiplication and Squaring (`field_amd64.s`) + +Ported from bitcoin-core/secp256k1's `field_5x52_int128_impl.h`: + +**5×52-bit Limb Representation:** +- Field element value = Σ(n[i] × 2^(52×i)) for i = 0..4 +- Each limb n[i] fits in 52 bits (with some headroom for accumulation) +- Total: 260 bits capacity for 256-bit field elements + +**Reduction Constants:** +- Field prime p = 2^256 - 2^32 - 977 +- R = 2^256 mod p = 0x1000003D10 (shifted for 52-bit alignment) +- M = 0xFFFFFFFFFFFFF (52-bit mask) + +**Algorithm Highlights:** +- Uses 128-bit accumulators (via MULQ instruction producing DX:AX) +- Interleaves computation of partial products with reduction +- Squaring exploits symmetry: 2·a[i]·a[j] computed once instead of twice + +## Raw Benchmark Data + +``` +goos: linux +goarch: amd64 +pkg: p256k1.mleku.dev/bench +cpu: AMD Ryzen 5 PRO 4650G with Radeon Graphics + +# High-level operations (benchtime=2s) +BenchmarkPureGo_PubkeyDerivation-12 44107 56085 ns/op 256 B/op 4 allocs/op +BenchmarkPureGo_Sign-12 41503 56182 ns/op 576 B/op 10 allocs/op +BenchmarkPureGo_Verify-12 17293 144012 ns/op 128 B/op 4 allocs/op +BenchmarkPureGo_ECDH-12 22831 107799 ns/op 209 B/op 5 allocs/op +BenchmarkAVX2_PubkeyDerivation-12 43000 55724 ns/op 256 B/op 4 allocs/op +BenchmarkAVX2_Sign-12 41588 55999 ns/op 576 B/op 10 allocs/op +BenchmarkAVX2_Verify-12 17684 139552 ns/op 128 B/op 4 allocs/op +BenchmarkAVX2_ECDH-12 22786 106296 ns/op 209 B/op 5 allocs/op +BenchmarkLibSecp_Sign-12 59470 39916 ns/op 400 B/op 8 allocs/op +BenchmarkLibSecp_PubkeyDerivation-12 119511 20844 ns/op 504 B/op 13 allocs/op +BenchmarkLibSecp_Verify-12 57483 42102 ns/op 312 B/op 8 allocs/op +BenchmarkPubkeyDerivation-12 42465 54030 ns/op 256 B/op 4 allocs/op +BenchmarkSign-12 85609 28920 ns/op 576 B/op 10 allocs/op +BenchmarkVerify-12 17397 139216 ns/op 128 B/op 4 allocs/op +BenchmarkECDH-12 22885 104530 ns/op 209 B/op 5 allocs/op + +# Isolated scalar operations (benchtime=2s) +BenchmarkScalarMulPureGo-12 50429706 46.52 ns/op +BenchmarkScalarMulAVX2-12 79820377 30.49 ns/op +BenchmarkScalarAddPureGo-12 464323708 5.288 ns/op +BenchmarkScalarAddAVX2-12 549494175 4.694 ns/op + +# Isolated field operations (benchtime=1s, count=5) +BenchmarkFieldMulAsm-12 46677114 25.82 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulAsm-12 45379737 26.63 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulAsm-12 47394996 25.99 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulAsm-12 48337986 27.05 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulAsm-12 47056432 27.52 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulPureGo-12 42025989 27.86 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulPureGo-12 39620865 27.44 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulPureGo-12 39708454 27.25 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulPureGo-12 43870612 27.77 ns/op 0 B/op 0 allocs/op +BenchmarkFieldMulPureGo-12 44919584 27.41 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrAsm-12 59990847 21.63 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrAsm-12 57070836 21.85 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrAsm-12 55419507 21.81 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrAsm-12 57015470 21.93 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrAsm-12 54106294 21.12 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrPureGo-12 40245084 27.62 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrPureGo-12 43287774 
27.04 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrPureGo-12 44501200 28.47 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrPureGo-12 46260654 27.04 ns/op 0 B/op 0 allocs/op +BenchmarkFieldSqrPureGo-12 45252552 27.75 ns/op 0 B/op 0 allocs/op +``` + +## Conclusions + +1. **Scalar multiplication is 53% faster** with x86-64 assembly (46.52 ns → 30.49 ns) +2. **Scalar addition is 13% faster** with x86-64 assembly (5.29 ns → 4.69 ns) +3. **Field squaring is 21% faster** with x86-64 assembly (27.5 ns → 21.7 ns) +4. **Field multiplication is 6% faster** with x86-64 assembly (27.5 ns → 26.0 ns) +5. **High-level operation improvements are modest** (~1-3%) due to the complexity of the full cryptographic pipeline +6. **libsecp256k1 is 2.7-3.4x faster** for cryptographic operations (uses additional optimizations like GLV endomorphism) +7. **Pure Go is competitive** - within 3x of highly optimized C for most operations +8. **Memory efficiency is identical** between Pure Go and assembly implementations + +## Future Optimization Opportunities + +To achieve larger speedups, focus on: + +1. **BMI2 instructions**: Use MULX/ADCX/ADOX for better carry handling in field multiplication (potential 10-20% gain) +2. **AVX-512 IFMA**: If available, use 52-bit multiply-add instructions for massive field operation speedup +3. **GLV endomorphism**: Implement the secp256k1-specific optimization that splits scalar multiplication +4. **Vectorized point operations**: Batch multiple independent point operations using SIMD +5. **ARM64 NEON**: Add optimizations for Apple Silicon and ARM servers + +## References + +- [bitcoin-core/secp256k1](https://github.com/bitcoin-core/secp256k1) - Reference C implementation +- [scalar_4x64_impl.h](https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h) - Scalar reduction algorithm +- [field_5x52_int128_impl.h](https://github.com/bitcoin-core/secp256k1/blob/master/src/field_5x52_int128_impl.h) - Field arithmetic implementation +- [Efficient Modular Multiplication](https://eprint.iacr.org/2021/1151.pdf) - Research on modular arithmetic optimization diff --git a/avx_test.go b/avx_test.go new file mode 100644 index 0000000..59db9cf --- /dev/null +++ b/avx_test.go @@ -0,0 +1,272 @@ +package p256k1 + +import ( + "testing" +) + +func TestAVX2Integration(t *testing.T) { + t.Logf("AVX2 CPU support: %v", HasAVX2CPU()) + t.Logf("AVX2 enabled: %v", HasAVX2()) + + // Test scalar multiplication with AVX2 + var a, b, productAVX, productGo Scalar + a.setInt(12345) + b.setInt(67890) + + // Compute with AVX2 enabled + SetAVX2Enabled(true) + productAVX.mul(&a, &b) + + // Compute with AVX2 disabled + SetAVX2Enabled(false) + productGo.mulPureGo(&a, &b) + + // Re-enable AVX2 + SetAVX2Enabled(true) + + if !productAVX.equal(&productGo) { + t.Errorf("AVX2 and Go scalar multiplication differ:\n AVX2: %v\n Go: %v", + productAVX.d, productGo.d) + } else { + t.Logf("Scalar multiplication matches: %v", productAVX.d) + } + + // Test scalar addition + var sumAVX, sumGo Scalar + SetAVX2Enabled(true) + sumAVX.add(&a, &b) + + SetAVX2Enabled(false) + sumGo.addPureGo(&a, &b) + + SetAVX2Enabled(true) + + if !sumAVX.equal(&sumGo) { + t.Errorf("AVX2 and Go scalar addition differ:\n AVX2: %v\n Go: %v", + sumAVX.d, sumGo.d) + } else { + t.Logf("Scalar addition matches: %v", sumAVX.d) + } + + // Test inverse (which uses mul internally) + var inv, product Scalar + a.setInt(2) + + SetAVX2Enabled(true) + inv.inverse(&a) + product.mul(&a, &inv) + + t.Logf("a = %v", a.d) + t.Logf("inv(a) = %v", inv.d) + t.Logf("a * 
inv(a) = %v", product.d) + t.Logf("isOne = %v", product.isOne()) + + if !product.isOne() { + // Try with pure Go + SetAVX2Enabled(false) + var inv2, product2 Scalar + inv2.inverse(&a) + product2.mul(&a, &inv2) + t.Logf("Pure Go: a * inv(a) = %v", product2.d) + t.Logf("Pure Go isOne = %v", product2.isOne()) + SetAVX2Enabled(true) + + t.Errorf("2 * inv(2) should equal 1") + } +} + +func TestScalarMulAVX2VsPureGo(t *testing.T) { + if !HasAVX2CPU() { + t.Skip("AVX2 not available") + } + + // Test several multiplication cases + testCases := []struct { + a, b uint + }{ + {2, 3}, + {12345, 67890}, + {0xFFFFFFFF, 0xFFFFFFFF}, + {1, 1}, + {0, 123}, + } + + for _, tc := range testCases { + var a, b, productAVX, productGo Scalar + a.setInt(tc.a) + b.setInt(tc.b) + + SetAVX2Enabled(true) + scalarMulAVX2(&productAVX, &a, &b) + + productGo.mulPureGo(&a, &b) + + if !productAVX.equal(&productGo) { + t.Errorf("Mismatch for %d * %d:\n AVX2: %v\n Go: %v", + tc.a, tc.b, productAVX.d, productGo.d) + } + } +} + +func TestScalarMulAVX2Large(t *testing.T) { + if !HasAVX2CPU() { + t.Skip("AVX2 not available") + } + + // Test with the actual inverse of 2 + var a Scalar + a.setInt(2) + + var inv Scalar + SetAVX2Enabled(false) + inv.inverse(&a) + SetAVX2Enabled(true) + + t.Logf("a = %v", a.d) + t.Logf("inv(2) = %v", inv.d) + + // Test multiplication of 2 * inv(2) + var productAVX, productGo Scalar + scalarMulAVX2(&productAVX, &a, &inv) + + SetAVX2Enabled(false) + productGo.mulPureGo(&a, &inv) + SetAVX2Enabled(true) + + t.Logf("AVX2: 2 * inv(2) = %v", productAVX.d) + t.Logf("Go: 2 * inv(2) = %v", productGo.d) + + if !productAVX.equal(&productGo) { + t.Errorf("Large number multiplication differs") + } +} + +func TestInverseAVX2VsGo(t *testing.T) { + if !HasAVX2CPU() { + t.Skip("AVX2 not available") + } + + var a Scalar + a.setInt(2) + + // Compute inverse with AVX2 + var invAVX Scalar + SetAVX2Enabled(true) + invAVX.inverse(&a) + + // Compute inverse with pure Go + var invGo Scalar + SetAVX2Enabled(false) + invGo.inverse(&a) + SetAVX2Enabled(true) + + t.Logf("AVX2 inv(2) = %v", invAVX.d) + t.Logf("Go inv(2) = %v", invGo.d) + + if !invAVX.equal(&invGo) { + t.Errorf("Inverse differs between AVX2 and Go") + } +} + +func TestScalarMulAliased(t *testing.T) { + if !HasAVX2CPU() { + t.Skip("AVX2 not available") + } + + // Test aliased multiplication: r.mul(r, &b) and r.mul(&a, r) + var a, b Scalar + a.setInt(12345) + b.setInt(67890) + + // Test r = r * b + var rAVX, rGo Scalar + rAVX = a + rGo = a + + SetAVX2Enabled(true) + scalarMulAVX2(&rAVX, &rAVX, &b) + + SetAVX2Enabled(false) + rGo.mulPureGo(&rGo, &b) + SetAVX2Enabled(true) + + if !rAVX.equal(&rGo) { + t.Errorf("r = r * b failed:\n AVX2: %v\n Go: %v", rAVX.d, rGo.d) + } + + // Test r = a * r + rAVX = b + rGo = b + + SetAVX2Enabled(true) + scalarMulAVX2(&rAVX, &a, &rAVX) + + SetAVX2Enabled(false) + rGo.mulPureGo(&a, &rGo) + SetAVX2Enabled(true) + + if !rAVX.equal(&rGo) { + t.Errorf("r = a * r failed:\n AVX2: %v\n Go: %v", rAVX.d, rGo.d) + } + + // Test squaring: r = r * r + rAVX = a + rGo = a + + SetAVX2Enabled(true) + scalarMulAVX2(&rAVX, &rAVX, &rAVX) + + SetAVX2Enabled(false) + rGo.mulPureGo(&rGo, &rGo) + SetAVX2Enabled(true) + + if !rAVX.equal(&rGo) { + t.Errorf("r = r * r failed:\n AVX2: %v\n Go: %v", rAVX.d, rGo.d) + } +} + +func TestScalarMulLargeNumbers(t *testing.T) { + if !HasAVX2CPU() { + t.Skip("AVX2 not available") + } + + // Test with large numbers (all limbs non-zero) + testCases := []struct { + name string + a, b Scalar + }{ + { + name: "large a * small 
b", + a: Scalar{d: [4]uint64{0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 0}}, + b: Scalar{d: [4]uint64{2, 0, 0, 0}}, + }, + { + name: "a^2 where a is large", + a: Scalar{d: [4]uint64{0x123456789ABCDEF0, 0xFEDCBA9876543210, 0, 0}}, + b: Scalar{d: [4]uint64{0x123456789ABCDEF0, 0xFEDCBA9876543210, 0, 0}}, + }, + { + name: "full limbs", + a: Scalar{d: [4]uint64{0x123456789ABCDEF0, 0xFEDCBA9876543210, 0x1111111111111111, 0x2222222222222222}}, + b: Scalar{d: [4]uint64{0x0FEDCBA987654321, 0x123456789ABCDEF0, 0x3333333333333333, 0x4444444444444444}}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var productAVX, productGo Scalar + + SetAVX2Enabled(true) + scalarMulAVX2(&productAVX, &tc.a, &tc.b) + + SetAVX2Enabled(false) + productGo.mulPureGo(&tc.a, &tc.b) + SetAVX2Enabled(true) + + if !productAVX.equal(&productGo) { + t.Errorf("Mismatch:\n a: %v\n b: %v\n AVX2: %v\n Go: %v", + tc.a.d, tc.b.d, productAVX.d, productGo.d) + } + }) + } +} diff --git a/bench/avx2_bench_test.go b/bench/avx2_bench_test.go new file mode 100644 index 0000000..f125cde --- /dev/null +++ b/bench/avx2_bench_test.go @@ -0,0 +1,316 @@ +//go:build !nocgo + +package bench + +import ( + "crypto/rand" + "testing" + + "p256k1.mleku.dev" + "p256k1.mleku.dev/signer" +) + +// This file contains benchmarks comparing: +// 1. P256K1 Pure Go implementation +// 2. P256K1 with AVX2 scalar operations (where applicable) +// 3. libsecp256k1.so via purego (if available) + +var ( + avxBenchSeckey []byte + avxBenchMsghash []byte + avxBenchSigner *signer.P256K1Signer + avxBenchSigner2 *signer.P256K1Signer + avxBenchSig []byte + avxBenchLibSecp *p256k1.LibSecp256k1 +) + +func initAVXBenchData() { + if avxBenchSeckey == nil { + avxBenchSeckey = []byte{ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + } + + for { + testSigner := signer.NewP256K1Signer() + if err := testSigner.InitSec(avxBenchSeckey); err == nil { + break + } + if _, err := rand.Read(avxBenchSeckey); err != nil { + panic(err) + } + } + + avxBenchMsghash = make([]byte, 32) + if _, err := rand.Read(avxBenchMsghash); err != nil { + panic(err) + } + } + + // Setup P256K1Signer + s := signer.NewP256K1Signer() + if err := s.InitSec(avxBenchSeckey); err != nil { + panic(err) + } + avxBenchSigner = s + + var err error + avxBenchSig, err = s.Sign(avxBenchMsghash) + if err != nil { + panic(err) + } + + // Generate second key pair for ECDH + seckey2 := make([]byte, 32) + for { + if _, err := rand.Read(seckey2); err != nil { + panic(err) + } + testSigner := signer.NewP256K1Signer() + if err := testSigner.InitSec(seckey2); err == nil { + break + } + } + + s2 := signer.NewP256K1Signer() + if err := s2.InitSec(seckey2); err != nil { + panic(err) + } + avxBenchSigner2 = s2 + + // Try to load libsecp256k1 + avxBenchLibSecp, _ = p256k1.GetLibSecp256k1() +} + +// Pure Go benchmarks (AVX2 disabled) +func BenchmarkPureGo_PubkeyDerivation(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + p256k1.SetAVX2Enabled(false) + defer p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + s := signer.NewP256K1Signer() + if err := s.InitSec(avxBenchSeckey); err != nil { + b.Fatalf("failed to create signer: %v", err) + } + _ = s.Pub() + } +} + +func BenchmarkPureGo_Sign(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + p256k1.SetAVX2Enabled(false) + defer 
p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := avxBenchSigner.Sign(avxBenchMsghash) + if err != nil { + b.Fatalf("failed to sign: %v", err) + } + } +} + +func BenchmarkPureGo_Verify(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + p256k1.SetAVX2Enabled(false) + defer p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + verifier := signer.NewP256K1Signer() + if err := verifier.InitPub(avxBenchSigner.Pub()); err != nil { + b.Fatalf("failed to create verifier: %v", err) + } + valid, err := verifier.Verify(avxBenchMsghash, avxBenchSig) + if err != nil { + b.Fatalf("verification error: %v", err) + } + if !valid { + b.Fatalf("verification failed") + } + } +} + +func BenchmarkPureGo_ECDH(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + p256k1.SetAVX2Enabled(false) + defer p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := avxBenchSigner.ECDH(avxBenchSigner2.Pub()) + if err != nil { + b.Fatalf("ECDH failed: %v", err) + } + } +} + +// AVX2-enabled benchmarks +func BenchmarkAVX2_PubkeyDerivation(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if !p256k1.HasAVX2CPU() { + b.Skip("AVX2 not available") + } + + p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + s := signer.NewP256K1Signer() + if err := s.InitSec(avxBenchSeckey); err != nil { + b.Fatalf("failed to create signer: %v", err) + } + _ = s.Pub() + } +} + +func BenchmarkAVX2_Sign(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if !p256k1.HasAVX2CPU() { + b.Skip("AVX2 not available") + } + + p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := avxBenchSigner.Sign(avxBenchMsghash) + if err != nil { + b.Fatalf("failed to sign: %v", err) + } + } +} + +func BenchmarkAVX2_Verify(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if !p256k1.HasAVX2CPU() { + b.Skip("AVX2 not available") + } + + p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + verifier := signer.NewP256K1Signer() + if err := verifier.InitPub(avxBenchSigner.Pub()); err != nil { + b.Fatalf("failed to create verifier: %v", err) + } + valid, err := verifier.Verify(avxBenchMsghash, avxBenchSig) + if err != nil { + b.Fatalf("verification error: %v", err) + } + if !valid { + b.Fatalf("verification failed") + } + } +} + +func BenchmarkAVX2_ECDH(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if !p256k1.HasAVX2CPU() { + b.Skip("AVX2 not available") + } + + p256k1.SetAVX2Enabled(true) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := avxBenchSigner.ECDH(avxBenchSigner2.Pub()) + if err != nil { + b.Fatalf("ECDH failed: %v", err) + } + } +} + +// libsecp256k1.so benchmarks via purego +func BenchmarkLibSecp_Sign(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if avxBenchLibSecp == nil || !avxBenchLibSecp.IsLoaded() { + b.Skip("libsecp256k1.so not available") + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := avxBenchLibSecp.SchnorrSign(avxBenchMsghash, avxBenchSeckey) + if err != nil { + b.Fatalf("signing failed: %v", err) + } + } +} + +func BenchmarkLibSecp_PubkeyDerivation(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if avxBenchLibSecp == nil || !avxBenchLibSecp.IsLoaded() { + b.Skip("libsecp256k1.so not available") + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := 
avxBenchLibSecp.CreatePubkey(avxBenchSeckey) + if err != nil { + b.Fatalf("pubkey creation failed: %v", err) + } + } +} + +func BenchmarkLibSecp_Verify(b *testing.B) { + if avxBenchSeckey == nil { + initAVXBenchData() + } + + if avxBenchLibSecp == nil || !avxBenchLibSecp.IsLoaded() { + b.Skip("libsecp256k1.so not available") + } + + // Sign with libsecp to get compatible signature + sig, err := avxBenchLibSecp.SchnorrSign(avxBenchMsghash, avxBenchSeckey) + if err != nil { + b.Fatalf("signing failed: %v", err) + } + + pubkey, err := avxBenchLibSecp.CreatePubkey(avxBenchSeckey) + if err != nil { + b.Fatalf("pubkey creation failed: %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !avxBenchLibSecp.SchnorrVerify(sig, avxBenchMsghash, pubkey) { + b.Fatalf("verification failed") + } + } +} diff --git a/cpufeatures.go b/cpufeatures.go new file mode 100644 index 0000000..ec87465 --- /dev/null +++ b/cpufeatures.go @@ -0,0 +1,60 @@ +//go:build amd64 + +package p256k1 + +import ( + "sync" + "sync/atomic" + + "github.com/klauspost/cpuid/v2" +) + +// CPU feature flags +var ( + // hasAVX2CPU indicates whether the CPU supports AVX2 instructions. + // This is detected at startup and never changes. + hasAVX2CPU bool + + // avx2Disabled allows runtime disabling of AVX2 for testing/debugging. + // Uses atomic operations for thread-safety without locks on the fast path. + avx2Disabled atomic.Bool + + // initOnce ensures CPU detection runs exactly once + initOnce sync.Once +) + +func init() { + initOnce.Do(detectCPUFeatures) +} + +// detectCPUFeatures detects CPU capabilities at startup +func detectCPUFeatures() { + hasAVX2CPU = cpuid.CPU.Has(cpuid.AVX2) +} + +// HasAVX2 returns true if AVX2 is available and enabled. +// This is the function that should be called in hot paths to decide +// whether to use AVX2-optimized code paths. +func HasAVX2() bool { + return hasAVX2CPU && !avx2Disabled.Load() +} + +// HasAVX2CPU returns true if the CPU supports AVX2, regardless of whether +// it's been disabled via SetAVX2Enabled. +func HasAVX2CPU() bool { + return hasAVX2CPU +} + +// SetAVX2Enabled enables or disables the use of AVX2 instructions. +// This is useful for benchmarking to compare AVX2 vs non-AVX2 performance, +// or for debugging. Pass true to enable AVX2 (default), false to disable. +// This function is thread-safe. +func SetAVX2Enabled(enabled bool) { + avx2Disabled.Store(!enabled) +} + +// IsAVX2Enabled returns whether AVX2 is currently enabled. +// Returns true if AVX2 is both available on the CPU and not disabled. +func IsAVX2Enabled() bool { + return HasAVX2() +} diff --git a/cpufeatures_generic.go b/cpufeatures_generic.go new file mode 100644 index 0000000..890ebbf --- /dev/null +++ b/cpufeatures_generic.go @@ -0,0 +1,26 @@ +//go:build !amd64 + +package p256k1 + +// Generic stubs for non-AMD64 architectures. +// AVX2 is not available on non-x86 platforms. + +// HasAVX2 always returns false on non-AMD64 platforms. +func HasAVX2() bool { + return false +} + +// HasAVX2CPU always returns false on non-AMD64 platforms. +func HasAVX2CPU() bool { + return false +} + +// SetAVX2Enabled is a no-op on non-AMD64 platforms. +func SetAVX2Enabled(enabled bool) { + // No-op: AVX2 is not available +} + +// IsAVX2Enabled always returns false on non-AMD64 platforms. 
+func IsAVX2Enabled() bool { + return false +} diff --git a/field_amd64.go b/field_amd64.go new file mode 100644 index 0000000..c17b3a5 --- /dev/null +++ b/field_amd64.go @@ -0,0 +1,23 @@ +//go:build amd64 + +package p256k1 + +// fieldMulAsm multiplies two field elements using x86-64 assembly. +// This is a direct port of bitcoin-core secp256k1_fe_mul_inner. +// r, a, b are 5x52-bit limb representations. +// +//go:noescape +func fieldMulAsm(r, a, b *FieldElement) + +// fieldSqrAsm squares a field element using x86-64 assembly. +// This is a direct port of bitcoin-core secp256k1_fe_sqr_inner. +// Squaring is optimized compared to multiplication. +// +//go:noescape +func fieldSqrAsm(r, a *FieldElement) + +// hasFieldAsm returns true if field assembly is available. +// On amd64, this is always true. +func hasFieldAsm() bool { + return true +} diff --git a/field_amd64.s b/field_amd64.s new file mode 100644 index 0000000..ae143ae --- /dev/null +++ b/field_amd64.s @@ -0,0 +1,692 @@ +//go:build amd64 + +#include "textflag.h" + +// Field multiplication assembly for secp256k1 using 5x52-bit limb representation. +// Ported from bitcoin-core/secp256k1 field_5x52_asm_impl.h +// +// The field element is represented as 5 limbs of 52 bits each: +// n[0..4] where value = sum(n[i] * 2^(52*i)) +// +// Field prime p = 2^256 - 2^32 - 977 +// Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1 +// For 5x52: R shifted = 0x1000003D10 (for 52-bit alignment) +// +// Stack layout for fieldMulAsm (96 bytes): +// 0(SP) - d_lo +// 8(SP) - d_hi +// 16(SP) - c_lo +// 24(SP) - c_hi +// 32(SP) - t3 +// 40(SP) - t4 +// 48(SP) - tx +// 56(SP) - u0 +// 64(SP) - temp storage +// 72(SP) - temp storage 2 +// 80(SP) - saved b pointer + +// Macro-like operations implemented inline: +// rshift52: shift 128-bit value right by 52 +// result_lo = (in_lo >> 52) | (in_hi << 12) +// result_hi = in_hi >> 52 + +// func fieldMulAsm(r, a, b *FieldElement) +TEXT ·fieldMulAsm(SB), NOSPLIT, $96-24 + MOVQ r+0(FP), DI + MOVQ a+8(FP), SI + MOVQ b+16(FP), BX + + // Save b pointer + MOVQ BX, 80(SP) + + // Load a[0..4] into registers + MOVQ 0(SI), R8 // a0 + MOVQ 8(SI), R9 // a1 + MOVQ 16(SI), R10 // a2 + MOVQ 24(SI), R11 // a3 + MOVQ 32(SI), R12 // a4 + + // Constants we'll use frequently + // M = 0xFFFFFFFFFFFFF (2^52 - 1) + // R = 0x1000003D10 + + // === Step 1: d = a0*b3 + a1*b2 + a2*b1 + a3*b0 === + MOVQ R8, AX + MULQ 24(BX) // a0 * b3 + MOVQ AX, 0(SP) // d_lo + MOVQ DX, 8(SP) // d_hi + + MOVQ R9, AX + MULQ 16(BX) // a1 * b2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R10, AX + MULQ 8(BX) // a2 * b1 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R11, AX + MULQ 0(BX) // a3 * b0 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 2: c = a4*b4 === + MOVQ R12, AX + MULQ 32(BX) // a4 * b4 + MOVQ AX, 16(SP) // c_lo + MOVQ DX, 24(SP) // c_hi + + // === Step 3: d += R * c_lo === + // Note: we use full c_lo (64 bits), NOT c_lo & M + MOVQ 16(SP), AX // c_lo (full 64 bits) + MOVQ $0x1000003D10, CX // R + MULQ CX // R * c_lo -> DX:AX + ADDQ AX, 0(SP) // d_lo += product_lo + ADCQ DX, 8(SP) // d_hi += product_hi + carry + + // === Step 4: c >>= 64 (just take c_hi) === + MOVQ 24(SP), AX // c_hi + MOVQ AX, 16(SP) // new c = c_hi (single 64-bit now) + MOVQ $0, 24(SP) // c_hi = 0 + + // === Step 5: t3 = d & M; d >>= 52 === + MOVQ 0(SP), AX // d_lo + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX // t3 = d & M + MOVQ AX, 32(SP) // save t3 + + // d >>= 52: d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52 + MOVQ 0(SP), AX // d_lo + MOVQ 8(SP), CX // d_hi + SHRQ $52, AX 
// d_lo >> 52 + MOVQ CX, DX + SHLQ $12, DX // d_hi << 12 + ORQ DX, AX // new d_lo + SHRQ $52, CX // new d_hi + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 6: d += a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 === + MOVQ 80(SP), BX // restore b pointer + + MOVQ R8, AX + MULQ 32(BX) // a0 * b4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R9, AX + MULQ 24(BX) // a1 * b3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R10, AX + MULQ 16(BX) // a2 * b2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R11, AX + MULQ 8(BX) // a3 * b1 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R12, AX + MULQ 0(BX) // a4 * b0 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 7: d += (R << 12) * c === + // R << 12 = 0x1000003D10 << 12 = 0x1000003D10000 + MOVQ 16(SP), AX // c (from c >>= 64) + MOVQ $0x1000003D10000, CX + MULQ CX // (R << 12) * c + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) === + MOVQ 0(SP), AX // d_lo + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX // t4 = d & M + MOVQ AX, 40(SP) // save t4 (before modifications) + + SHRQ $48, AX // tx = t4 >> 48 + MOVQ AX, 48(SP) // save tx + + MOVQ 40(SP), AX + MOVQ $0x0FFFFFFFFFFFF, CX // M >> 4 = 2^48 - 1 + ANDQ CX, AX // t4 &= (M >> 4) + MOVQ AX, 40(SP) // save final t4 + + // === Step 9: d >>= 52 === + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 10: c = a0*b0 === + MOVQ R8, AX + MULQ 0(BX) // a0 * b0 + MOVQ AX, 16(SP) // c_lo + MOVQ DX, 24(SP) // c_hi + + // === Step 11: d += a1*b4 + a2*b3 + a3*b2 + a4*b1 === + MOVQ R9, AX + MULQ 32(BX) // a1 * b4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R10, AX + MULQ 24(BX) // a2 * b3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R11, AX + MULQ 16(BX) // a3 * b2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R12, AX + MULQ 8(BX) // a4 * b1 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx === + MOVQ 0(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX // u0 = d & M + SHLQ $4, AX // u0 << 4 + ORQ 48(SP), AX // u0 |= tx + MOVQ AX, 56(SP) // save u0 + + // d >>= 52 + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 13: c += (R >> 4) * u0 === + // R >> 4 = 0x1000003D10 >> 4 = 0x1000003D1 + MOVQ 56(SP), AX // u0 + MOVQ $0x1000003D1, CX + MULQ CX // (R >> 4) * u0 + ADDQ AX, 16(SP) // c_lo + ADCQ DX, 24(SP) // c_hi + + // === Step 14: r[0] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 0(DI) // store r[0] + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 15: c += a0*b1 + a1*b0 === + MOVQ R8, AX + MULQ 8(BX) // a0 * b1 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ R9, AX + MULQ 0(BX) // a1 * b0 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // === Step 16: d += a2*b4 + a3*b3 + a4*b2 === + MOVQ R10, AX + MULQ 32(BX) // a2 * b4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R11, AX + MULQ 24(BX) // a3 * b3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R12, AX + MULQ 16(BX) // a4 * b2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 17: c += R * (d & M); d >>= 52 === + MOVQ 0(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX // d & M + MOVQ $0x1000003D10, CX // R + MULQ CX // R * (d & M) + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // d >>= 52 + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + 
ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 18: r[1] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 8(DI) // store r[1] + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 19: c += a0*b2 + a1*b1 + a2*b0 === + MOVQ R8, AX + MULQ 16(BX) // a0 * b2 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ R9, AX + MULQ 8(BX) // a1 * b1 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ R10, AX + MULQ 0(BX) // a2 * b0 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // === Step 20: d += a3*b4 + a4*b3 === + MOVQ R11, AX + MULQ 32(BX) // a3 * b4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R12, AX + MULQ 24(BX) // a4 * b3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 21: c += R * d_lo; d >>= 64 === + // Note: use full d_lo here, not d & M + MOVQ 0(SP), AX // d_lo + MOVQ $0x1000003D10, CX // R + MULQ CX // R * d_lo + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // d >>= 64 (just take d_hi) + MOVQ 8(SP), AX + MOVQ AX, 0(SP) + MOVQ $0, 8(SP) + + // === Step 22: r[2] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 16(DI) // store r[2] + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 23: c += (R << 12) * d + t3 === + MOVQ 0(SP), AX // d (after d >>= 64) + MOVQ $0x1000003D10000, CX // R << 12 + MULQ CX // (R << 12) * d + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ 32(SP), AX // t3 + ADDQ AX, 16(SP) + ADCQ $0, 24(SP) + + // === Step 24: r[3] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 24(DI) // store r[3] + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + + // === Step 25: r[4] = c + t4 === + ADDQ 40(SP), AX // c + t4 + MOVQ AX, 32(DI) // store r[4] + + RET + +// func fieldSqrAsm(r, a *FieldElement) +// Squares a field element in 5x52 representation. +// This follows the bitcoin-core secp256k1_fe_sqr_inner algorithm. +// Squaring is optimized since a*a has symmetric terms: a[i]*a[j] appears twice. 
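+// The symmetric cross terms are doubled in place (ADDQ AX, AX, or via the
+// saved 2*a4 value) before each MULQ, so the squaring path issues 15
+// limb-product MULQ instructions instead of the 25 used by fieldMulAsm above;
+// the reduction multiplies by R are the same in both routines.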
+TEXT ·fieldSqrAsm(SB), NOSPLIT, $96-16 + MOVQ r+0(FP), DI + MOVQ a+8(FP), SI + + // Load a[0..4] into registers + MOVQ 0(SI), R8 // a0 + MOVQ 8(SI), R9 // a1 + MOVQ 16(SI), R10 // a2 + MOVQ 24(SI), R11 // a3 + MOVQ 32(SI), R12 // a4 + + // === Step 1: d = 2*a0*a3 + 2*a1*a2 === + MOVQ R8, AX + ADDQ AX, AX // 2*a0 + MULQ R11 // 2*a0 * a3 + MOVQ AX, 0(SP) // d_lo + MOVQ DX, 8(SP) // d_hi + + MOVQ R9, AX + ADDQ AX, AX // 2*a1 + MULQ R10 // 2*a1 * a2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 2: c = a4*a4 === + MOVQ R12, AX + MULQ R12 // a4 * a4 + MOVQ AX, 16(SP) // c_lo + MOVQ DX, 24(SP) // c_hi + + // === Step 3: d += R * c_lo === + // Note: use full c_lo (64 bits), NOT c_lo & M + MOVQ 16(SP), AX // c_lo (full 64 bits) + MOVQ $0x1000003D10, CX + MULQ CX + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 4: c >>= 64 === + MOVQ 24(SP), AX + MOVQ AX, 16(SP) + MOVQ $0, 24(SP) + + // === Step 5: t3 = d & M; d >>= 52 === + MOVQ 0(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 32(SP) // t3 + + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 6: d += 2*a0*a4 + 2*a1*a3 + a2*a2 === + // Pre-compute 2*a4 for later use + MOVQ R12, CX + ADDQ CX, CX // 2*a4 + MOVQ CX, 64(SP) // save 2*a4 + + MOVQ R8, AX + MULQ CX // a0 * 2*a4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R9, AX + ADDQ AX, AX // 2*a1 + MULQ R11 // 2*a1 * a3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R10, AX + MULQ R10 // a2 * a2 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 7: d += (R << 12) * c === + MOVQ 16(SP), AX + MOVQ $0x1000003D10000, CX + MULQ CX + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) === + MOVQ 0(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 40(SP) // full t4 + + SHRQ $48, AX + MOVQ AX, 48(SP) // tx + + MOVQ 40(SP), AX + MOVQ $0x0FFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 40(SP) // t4 + + // === Step 9: d >>= 52 === + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 10: c = a0*a0 === + MOVQ R8, AX + MULQ R8 + MOVQ AX, 16(SP) + MOVQ DX, 24(SP) + + // === Step 11: d += a1*2*a4 + 2*a2*a3 === + MOVQ R9, AX + MULQ 64(SP) // a1 * 2*a4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R10, AX + ADDQ AX, AX // 2*a2 + MULQ R11 // 2*a2 * a3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx === + MOVQ 0(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + SHLQ $4, AX + ORQ 48(SP), AX + MOVQ AX, 56(SP) // u0 + + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 13: c += (R >> 4) * u0 === + MOVQ 56(SP), AX + MOVQ $0x1000003D1, CX + MULQ CX + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // === Step 14: r[0] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 0(DI) + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 15: c += 2*a0*a1 === + MOVQ R8, AX + ADDQ AX, AX + MULQ R9 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // === Step 16: d += a2*2*a4 + a3*a3 === + MOVQ R10, AX + MULQ 64(SP) // a2 * 2*a4 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + MOVQ R11, AX + MULQ R11 // a3 * a3 + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 17: c += R * (d & M); d >>= 52 === + MOVQ 0(SP), AX + 
MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ $0x1000003D10, CX + MULQ CX + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ 0(SP), AX + MOVQ 8(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + + // === Step 18: r[1] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 8(DI) + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 19: c += 2*a0*a2 + a1*a1 === + MOVQ R8, AX + ADDQ AX, AX + MULQ R10 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ R9, AX + MULQ R9 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + // === Step 20: d += a3*2*a4 === + MOVQ R11, AX + MULQ 64(SP) + ADDQ AX, 0(SP) + ADCQ DX, 8(SP) + + // === Step 21: c += R * d_lo; d >>= 64 === + MOVQ 0(SP), AX + MOVQ $0x1000003D10, CX + MULQ CX + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ 8(SP), AX + MOVQ AX, 0(SP) + MOVQ $0, 8(SP) + + // === Step 22: r[2] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 16(DI) + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + SHRQ $52, CX + MOVQ AX, 16(SP) + MOVQ CX, 24(SP) + + // === Step 23: c += (R << 12) * d + t3 === + MOVQ 0(SP), AX + MOVQ $0x1000003D10000, CX + MULQ CX + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + + MOVQ 32(SP), AX + ADDQ AX, 16(SP) + ADCQ $0, 24(SP) + + // === Step 24: r[3] = c & M; c >>= 52 === + MOVQ 16(SP), AX + MOVQ $0xFFFFFFFFFFFFF, CX + ANDQ CX, AX + MOVQ AX, 24(DI) + + MOVQ 16(SP), AX + MOVQ 24(SP), CX + SHRQ $52, AX + MOVQ CX, DX + SHLQ $12, DX + ORQ DX, AX + + // === Step 25: r[4] = c + t4 === + ADDQ 40(SP), AX + MOVQ AX, 32(DI) + + RET diff --git a/field_asm_test.go b/field_asm_test.go new file mode 100644 index 0000000..902aad7 --- /dev/null +++ b/field_asm_test.go @@ -0,0 +1,198 @@ +package p256k1 + +import ( + "testing" +) + +// fieldMulPureGo is the pure Go implementation for comparison +func fieldMulPureGo(r, a, b *FieldElement) { + // Extract limbs for easier access + a0, a1, a2, a3, a4 := a.n[0], a.n[1], a.n[2], a.n[3], a.n[4] + b0, b1, b2, b3, b4 := b.n[0], b.n[1], b.n[2], b.n[3], b.n[4] + + const M = uint64(0xFFFFFFFFFFFFF) // 2^52 - 1 + const R = uint64(fieldReductionConstantShifted) // 0x1000003D10 + + // Following the C implementation algorithm exactly + var c, d uint128 + d = mulU64ToU128(a0, b3) + d = addMulU128(d, a1, b2) + d = addMulU128(d, a2, b1) + d = addMulU128(d, a3, b0) + + c = mulU64ToU128(a4, b4) + + d = addMulU128(d, R, c.lo()) + c = c.rshift(64) + + t3 := d.lo() & M + d = d.rshift(52) + + d = addMulU128(d, a0, b4) + d = addMulU128(d, a1, b3) + d = addMulU128(d, a2, b2) + d = addMulU128(d, a3, b1) + d = addMulU128(d, a4, b0) + + d = addMulU128(d, R<<12, c.lo()) + + t4 := d.lo() & M + d = d.rshift(52) + tx := t4 >> 48 + t4 &= (M >> 4) + + c = mulU64ToU128(a0, b0) + + d = addMulU128(d, a1, b4) + d = addMulU128(d, a2, b3) + d = addMulU128(d, a3, b2) + d = addMulU128(d, a4, b1) + + u0 := d.lo() & M + d = d.rshift(52) + u0 = (u0 << 4) | tx + + c = addMulU128(c, u0, R>>4) + + r.n[0] = c.lo() & M + c = c.rshift(52) + + c = addMulU128(c, a0, b1) + c = addMulU128(c, a1, b0) + + d = addMulU128(d, a2, b4) + d = addMulU128(d, a3, b3) + d = addMulU128(d, a4, b2) + + c = addMulU128(c, R, d.lo()&M) + d = d.rshift(52) + + r.n[1] = c.lo() & M + c = c.rshift(52) + + c = addMulU128(c, a0, b2) + c = addMulU128(c, a1, b1) + c = addMulU128(c, a2, b0) + + d = addMulU128(d, a3, b4) + d = 
addMulU128(d, a4, b3) + + c = addMulU128(c, R, d.lo()) + d = d.rshift(64) + + r.n[2] = c.lo() & M + c = c.rshift(52) + + c = addMulU128(c, R<<12, d.lo()) + c = addU128(c, t3) + + r.n[3] = c.lo() & M + c = c.rshift(52) + + r.n[4] = c.lo() + t4 + + r.magnitude = 1 + r.normalized = false +} + +func TestFieldMulAsmVsPureGo(t *testing.T) { + // Test with simple values first + a := FieldElement{n: [5]uint64{1, 0, 0, 0, 0}, magnitude: 1, normalized: true} + b := FieldElement{n: [5]uint64{2, 0, 0, 0, 0}, magnitude: 1, normalized: true} + + var rAsm, rGo FieldElement + + // Pure Go + fieldMulPureGo(&rGo, &a, &b) + + // Assembly + if hasFieldAsm() { + fieldMulAsm(&rAsm, &a, &b) + rAsm.magnitude = 1 + rAsm.normalized = false + + t.Logf("a = %v", a.n) + t.Logf("b = %v", b.n) + t.Logf("Go result: %v", rGo.n) + t.Logf("Asm result: %v", rAsm.n) + + for i := 0; i < 5; i++ { + if rAsm.n[i] != rGo.n[i] { + t.Errorf("limb %d mismatch: asm=%x, go=%x", i, rAsm.n[i], rGo.n[i]) + } + } + } else { + t.Skip("Assembly not available") + } +} + +func TestFieldMulAsmVsPureGoLarger(t *testing.T) { + // Test with larger values + a := FieldElement{ + n: [5]uint64{0x1234567890abcdef & 0xFFFFFFFFFFFFF, 0xfedcba9876543210 & 0xFFFFFFFFFFFFF, 0x0123456789abcdef & 0xFFFFFFFFFFFFF, 0xfedcba0987654321 & 0xFFFFFFFFFFFFF, 0x0123456789ab & 0x0FFFFFFFFFFFF}, + magnitude: 1, + normalized: true, + } + b := FieldElement{ + n: [5]uint64{0xabcdef1234567890 & 0xFFFFFFFFFFFFF, 0x9876543210fedcba & 0xFFFFFFFFFFFFF, 0xfedcba1234567890 & 0xFFFFFFFFFFFFF, 0x0987654321abcdef & 0xFFFFFFFFFFFFF, 0x0fedcba98765 & 0x0FFFFFFFFFFFF}, + magnitude: 1, + normalized: true, + } + + var rAsm, rGo FieldElement + + // Pure Go + fieldMulPureGo(&rGo, &a, &b) + + // Assembly + if hasFieldAsm() { + fieldMulAsm(&rAsm, &a, &b) + rAsm.magnitude = 1 + rAsm.normalized = false + + t.Logf("a = %v", a.n) + t.Logf("b = %v", b.n) + t.Logf("Go result: %v", rGo.n) + t.Logf("Asm result: %v", rAsm.n) + + for i := 0; i < 5; i++ { + if rAsm.n[i] != rGo.n[i] { + t.Errorf("limb %d mismatch: asm=%x, go=%x", i, rAsm.n[i], rGo.n[i]) + } + } + } else { + t.Skip("Assembly not available") + } +} + +func TestFieldSqrAsmVsPureGo(t *testing.T) { + a := FieldElement{ + n: [5]uint64{0x1234567890abcdef & 0xFFFFFFFFFFFFF, 0xfedcba9876543210 & 0xFFFFFFFFFFFFF, 0x0123456789abcdef & 0xFFFFFFFFFFFFF, 0xfedcba0987654321 & 0xFFFFFFFFFFFFF, 0x0123456789ab & 0x0FFFFFFFFFFFF}, + magnitude: 1, + normalized: true, + } + + var rAsm, rGo FieldElement + + // Pure Go (a * a) + fieldMulPureGo(&rGo, &a, &a) + + // Assembly + if hasFieldAsm() { + fieldSqrAsm(&rAsm, &a) + rAsm.magnitude = 1 + rAsm.normalized = false + + t.Logf("a = %v", a.n) + t.Logf("Go result: %v", rGo.n) + t.Logf("Asm result: %v", rAsm.n) + + for i := 0; i < 5; i++ { + if rAsm.n[i] != rGo.n[i] { + t.Errorf("limb %d mismatch: asm=%x, go=%x", i, rAsm.n[i], rGo.n[i]) + } + } + } else { + t.Skip("Assembly not available") + } +} diff --git a/field_bench_test.go b/field_bench_test.go new file mode 100644 index 0000000..af1dc11 --- /dev/null +++ b/field_bench_test.go @@ -0,0 +1,76 @@ +package p256k1 + +import ( + "testing" +) + +var benchFieldA = FieldElement{ + n: [5]uint64{0x4567890abcdef, 0xcba9876543210, 0x3456789abcdef, 0xcba0987654321, 0x123456789ab}, + magnitude: 1, + normalized: true, +} + +var benchFieldB = FieldElement{ + n: [5]uint64{0xdef1234567890, 0x6543210fedcba, 0xcba1234567890, 0x7654321abcdef, 0xfedcba98765}, + magnitude: 1, + normalized: true, +} + +// BenchmarkFieldMulAsm benchmarks the assembly field multiplication +func 
BenchmarkFieldMulAsm(b *testing.B) { + if !hasFieldAsm() { + b.Skip("Assembly not available") + } + + var r FieldElement + for i := 0; i < b.N; i++ { + fieldMulAsm(&r, &benchFieldA, &benchFieldB) + } +} + +// BenchmarkFieldMulPureGo benchmarks the pure Go field multiplication +func BenchmarkFieldMulPureGo(b *testing.B) { + var r FieldElement + for i := 0; i < b.N; i++ { + fieldMulPureGo(&r, &benchFieldA, &benchFieldB) + } +} + +// BenchmarkFieldSqrAsm benchmarks the assembly field squaring +func BenchmarkFieldSqrAsm(b *testing.B) { + if !hasFieldAsm() { + b.Skip("Assembly not available") + } + + var r FieldElement + for i := 0; i < b.N; i++ { + fieldSqrAsm(&r, &benchFieldA) + } +} + +// BenchmarkFieldSqrPureGo benchmarks the pure Go field squaring (via mul) +func BenchmarkFieldSqrPureGo(b *testing.B) { + var r FieldElement + for i := 0; i < b.N; i++ { + fieldMulPureGo(&r, &benchFieldA, &benchFieldA) + } +} + +// BenchmarkFieldMul benchmarks the full mul method (which uses assembly when available) +func BenchmarkFieldMul(b *testing.B) { + r := new(FieldElement) + a := benchFieldA + bb := benchFieldB + for i := 0; i < b.N; i++ { + r.mul(&a, &bb) + } +} + +// BenchmarkFieldSqr benchmarks the full sqr method (which uses assembly when available) +func BenchmarkFieldSqr(b *testing.B) { + r := new(FieldElement) + a := benchFieldA + for i := 0; i < b.N; i++ { + r.sqr(&a) + } +} diff --git a/field_generic.go b/field_generic.go new file mode 100644 index 0000000..f193b25 --- /dev/null +++ b/field_generic.go @@ -0,0 +1,21 @@ +//go:build !amd64 + +package p256k1 + +// hasFieldAsm returns true if field assembly is available. +// On non-amd64 platforms, assembly is not available. +func hasFieldAsm() bool { + return false +} + +// fieldMulAsm is a stub for non-amd64 platforms. +// It should never be called since hasFieldAsm() returns false. +func fieldMulAsm(r, a, b *FieldElement) { + panic("field assembly not available on this platform") +} + +// fieldSqrAsm is a stub for non-amd64 platforms. +// It should never be called since hasFieldAsm() returns false. 
+func fieldSqrAsm(r, a *FieldElement) { + panic("field assembly not available on this platform") +} diff --git a/field_mul.go b/field_mul.go index 3e662a4..9e14a7f 100644 --- a/field_mul.go +++ b/field_mul.go @@ -61,7 +61,7 @@ func (r *FieldElement) mul(a, b *FieldElement) { // Use pointers directly if magnitude is low enough (optimization) var aNorm, bNorm *FieldElement var aTemp, bTemp FieldElement - + if a.magnitude > 8 { aTemp = *a aTemp.normalizeWeak() @@ -69,7 +69,7 @@ func (r *FieldElement) mul(a, b *FieldElement) { } else { aNorm = a // Use directly, no copy needed } - + if b.magnitude > 8 { bTemp = *b bTemp.normalizeWeak() @@ -78,6 +78,14 @@ func (r *FieldElement) mul(a, b *FieldElement) { bNorm = b // Use directly, no copy needed } + // Use assembly if available + if hasFieldAsm() { + fieldMulAsm(r, aNorm, bNorm) + r.magnitude = 1 + r.normalized = false + return + } + // Extract limbs for easier access a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4] b0, b1, b2, b3, b4 := bNorm.n[0], bNorm.n[1], bNorm.n[2], bNorm.n[3], bNorm.n[4] @@ -298,7 +306,7 @@ func (r *FieldElement) sqr(a *FieldElement) { // Use pointer directly if magnitude is low enough (optimization) var aNorm *FieldElement var aTemp FieldElement - + if a.magnitude > 8 { aTemp = *a aTemp.normalizeWeak() @@ -307,6 +315,14 @@ func (r *FieldElement) sqr(a *FieldElement) { aNorm = a // Use directly, no copy needed } + // Use assembly if available + if hasFieldAsm() { + fieldSqrAsm(r, aNorm) + r.magnitude = 1 + r.normalized = false + return + } + // Extract limbs for easier access a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4] diff --git a/go.mod b/go.mod index 9a60406..30af960 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( ) require ( + github.com/ebitengine/purego v0.9.1 // indirect github.com/klauspost/cpuid/v2 v2.3.0 // indirect golang.org/x/sys v0.37.0 // indirect ) diff --git a/go.sum b/go.sum index f7ec1aa..a7aeacc 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= +github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= diff --git a/libsecp256k1.so b/libsecp256k1.so new file mode 100755 index 0000000..3cfca4e Binary files /dev/null and b/libsecp256k1.so differ diff --git a/libsecp256k1_purego.go b/libsecp256k1_purego.go new file mode 100644 index 0000000..39ca314 --- /dev/null +++ b/libsecp256k1_purego.go @@ -0,0 +1,265 @@ +package p256k1 + +import ( + "errors" + "sync" + + "github.com/ebitengine/purego" +) + +// LibSecp256k1 wraps the native libsecp256k1.so library using purego for CGO-free operation. +// This provides a way to benchmark against the C implementation without CGO. 
+type LibSecp256k1 struct { + lib uintptr + ctx uintptr + loaded bool + mu sync.RWMutex + + // Function pointers + contextCreate func(uint) uintptr + contextDestroy func(uintptr) + contextRandomize func(uintptr, *byte) int + schnorrsigSign32 func(uintptr, *byte, *byte, *byte, *byte) int + schnorrsigVerify func(uintptr, *byte, *byte, uint, *byte) int + keypairCreate func(uintptr, *byte, *byte) int + keypairXonlyPub func(uintptr, *byte, *int, *byte) int + xonlyPubkeyParse func(uintptr, *byte, *byte) int + ecPubkeyCreate func(uintptr, *byte, *byte) int + ecPubkeyParse func(uintptr, *byte, *byte, uint) int + ecPubkeySerialize func(uintptr, *byte, *uint, *byte, uint) int + xonlyPubkeySerialize func(uintptr, *byte, *byte) int + ecdh func(uintptr, *byte, *byte, *byte, uintptr, uintptr) int +} + +// Secp256k1 context flags +// In modern libsecp256k1, SECP256K1_CONTEXT_NONE = 1 is the only valid flag. +// The old SIGN (256) and VERIFY (257) flags are deprecated. +const ( + libContextNone = 1 +) + +// Global instance +var ( + libSecp *LibSecp256k1 + libSecpOnce sync.Once + libSecpInitErr error +) + +// GetLibSecp256k1 returns the global LibSecp256k1 instance, loading it if necessary. +// Returns nil and an error if the library cannot be loaded. +func GetLibSecp256k1() (*LibSecp256k1, error) { + libSecpOnce.Do(func() { + libSecp = &LibSecp256k1{} + // Try multiple paths to find the library + paths := []string{ + "./libsecp256k1.so", + "../libsecp256k1.so", + "/home/mleku/src/p256k1.mleku.dev/libsecp256k1.so", + "libsecp256k1.so", + } + for _, path := range paths { + err := libSecp.Load(path) + if err == nil { + libSecpInitErr = nil + return + } + libSecpInitErr = err + } + }) + if libSecpInitErr != nil { + return nil, libSecpInitErr + } + return libSecp, nil +} + +// Load loads the libsecp256k1.so library from the given path. 
+func (l *LibSecp256k1) Load(path string) error { + l.mu.Lock() + defer l.mu.Unlock() + + if l.loaded { + return nil + } + + lib, err := purego.Dlopen(path, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + return err + } + l.lib = lib + + // Register function pointers + purego.RegisterLibFunc(&l.contextCreate, lib, "secp256k1_context_create") + purego.RegisterLibFunc(&l.contextDestroy, lib, "secp256k1_context_destroy") + purego.RegisterLibFunc(&l.contextRandomize, lib, "secp256k1_context_randomize") + purego.RegisterLibFunc(&l.schnorrsigSign32, lib, "secp256k1_schnorrsig_sign32") + purego.RegisterLibFunc(&l.schnorrsigVerify, lib, "secp256k1_schnorrsig_verify") + purego.RegisterLibFunc(&l.keypairCreate, lib, "secp256k1_keypair_create") + purego.RegisterLibFunc(&l.keypairXonlyPub, lib, "secp256k1_keypair_xonly_pub") + purego.RegisterLibFunc(&l.xonlyPubkeyParse, lib, "secp256k1_xonly_pubkey_parse") + purego.RegisterLibFunc(&l.ecPubkeyCreate, lib, "secp256k1_ec_pubkey_create") + purego.RegisterLibFunc(&l.ecPubkeyParse, lib, "secp256k1_ec_pubkey_parse") + purego.RegisterLibFunc(&l.ecPubkeySerialize, lib, "secp256k1_ec_pubkey_serialize") + purego.RegisterLibFunc(&l.xonlyPubkeySerialize, lib, "secp256k1_xonly_pubkey_serialize") + purego.RegisterLibFunc(&l.ecdh, lib, "secp256k1_ecdh") + + // Create context (modern libsecp256k1 uses SECP256K1_CONTEXT_NONE = 1) + l.ctx = l.contextCreate(libContextNone) + if l.ctx == 0 { + return errors.New("failed to create secp256k1 context") + } + + // Randomize context for better security + var seed [32]byte + // Use zero seed for deterministic benchmarks + l.contextRandomize(l.ctx, &seed[0]) + + l.loaded = true + return nil +} + +// Close releases the library resources. +func (l *LibSecp256k1) Close() { + l.mu.Lock() + defer l.mu.Unlock() + + if !l.loaded { + return + } + + if l.ctx != 0 { + l.contextDestroy(l.ctx) + l.ctx = 0 + } + + if l.lib != 0 { + purego.Dlclose(l.lib) + l.lib = 0 + } + + l.loaded = false +} + +// IsLoaded returns true if the library is loaded. +func (l *LibSecp256k1) IsLoaded() bool { + l.mu.RLock() + defer l.mu.RUnlock() + return l.loaded +} + +// SchnorrSign signs a 32-byte message using a 32-byte secret key. +// Returns a 64-byte signature. +func (l *LibSecp256k1) SchnorrSign(msg32, seckey32 []byte) ([]byte, error) { + l.mu.RLock() + defer l.mu.RUnlock() + + if !l.loaded { + return nil, errors.New("library not loaded") + } + if len(msg32) != 32 { + return nil, errors.New("message must be 32 bytes") + } + if len(seckey32) != 32 { + return nil, errors.New("secret key must be 32 bytes") + } + + // Create keypair from secret key + keypair := make([]byte, 96) // secp256k1_keypair is 96 bytes + if l.keypairCreate(l.ctx, &keypair[0], &seckey32[0]) != 1 { + return nil, errors.New("failed to create keypair") + } + + // Sign + sig := make([]byte, 64) + if l.schnorrsigSign32(l.ctx, &sig[0], &msg32[0], &keypair[0], nil) != 1 { + return nil, errors.New("signing failed") + } + + return sig, nil +} + +// SchnorrVerify verifies a Schnorr signature. 
+func (l *LibSecp256k1) SchnorrVerify(sig64, msg32, pubkey32 []byte) bool { + l.mu.RLock() + defer l.mu.RUnlock() + + if !l.loaded { + return false + } + if len(sig64) != 64 || len(msg32) != 32 || len(pubkey32) != 32 { + return false + } + + // Parse x-only pubkey using secp256k1_xonly_pubkey_parse + xonlyPubkey := make([]byte, 64) // secp256k1_xonly_pubkey is 64 bytes + if l.xonlyPubkeyParse(l.ctx, &xonlyPubkey[0], &pubkey32[0]) != 1 { + return false + } + + result := l.schnorrsigVerify(l.ctx, &sig64[0], &msg32[0], 32, &xonlyPubkey[0]) + return result == 1 +} + +// CreatePubkey derives a public key from a secret key. +// Returns the 32-byte x-only public key. +func (l *LibSecp256k1) CreatePubkey(seckey32 []byte) ([]byte, error) { + l.mu.RLock() + defer l.mu.RUnlock() + + if !l.loaded { + return nil, errors.New("library not loaded") + } + if len(seckey32) != 32 { + return nil, errors.New("secret key must be 32 bytes") + } + + // Create keypair + keypair := make([]byte, 96) + if l.keypairCreate(l.ctx, &keypair[0], &seckey32[0]) != 1 { + return nil, errors.New("failed to create keypair") + } + + // Extract x-only pubkey (internal representation is 64 bytes) + xonlyPubkey := make([]byte, 64) + var parity int + if l.keypairXonlyPub(l.ctx, &xonlyPubkey[0], &parity, &keypair[0]) != 1 { + return nil, errors.New("failed to extract x-only pubkey") + } + + // Serialize to get the 32-byte x-coordinate + pubkey32 := make([]byte, 32) + if l.xonlyPubkeySerialize(l.ctx, &pubkey32[0], &xonlyPubkey[0]) != 1 { + return nil, errors.New("failed to serialize x-only pubkey") + } + + return pubkey32, nil +} + +// ECDH computes the shared secret using ECDH. +func (l *LibSecp256k1) ECDH(seckey32, pubkey33 []byte) ([]byte, error) { + l.mu.RLock() + defer l.mu.RUnlock() + + if !l.loaded { + return nil, errors.New("library not loaded") + } + if len(seckey32) != 32 { + return nil, errors.New("secret key must be 32 bytes") + } + if len(pubkey33) != 33 && len(pubkey33) != 65 { + return nil, errors.New("public key must be 33 or 65 bytes") + } + + // Parse pubkey + pubkey := make([]byte, 64) // secp256k1_pubkey is 64 bytes + if l.ecPubkeyParse(l.ctx, &pubkey[0], &pubkey33[0], uint(len(pubkey33))) != 1 { + return nil, errors.New("failed to parse public key") + } + + // Compute ECDH + output := make([]byte, 32) + if l.ecdh(l.ctx, &output[0], &pubkey[0], &seckey32[0], 0, 0) != 1 { + return nil, errors.New("ECDH failed") + } + + return output, nil +} diff --git a/scalar.go b/scalar.go index f075f16..e1973d6 100644 --- a/scalar.go +++ b/scalar.go @@ -40,40 +40,6 @@ var ( // ScalarOne represents the scalar 1 ScalarOne = Scalar{d: [4]uint64{1, 0, 0, 0}} - // GLV (Gallant-Lambert-Vanstone) endomorphism constants - // lambda is a primitive cube root of unity modulo n (the curve order) - secp256k1Lambda = Scalar{d: [4]uint64{ - 0x5363AD4CC05C30E0, 0xA5261C028812645A, - 0x122E22EA20816678, 0xDF02967C1B23BD72, - }} - - // Note: beta is defined in field.go as a FieldElement constant - - // GLV basis vectors and constants for scalar splitting - // These are used to decompose scalars for faster multiplication - // minus_b1 and minus_b2 are precomputed constants for the GLV splitting algorithm - minusB1 = Scalar{d: [4]uint64{ - 0x0000000000000000, 0x0000000000000000, - 0xE4437ED6010E8828, 0x6F547FA90ABFE4C3, - }} - - minusB2 = Scalar{d: [4]uint64{ - 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, - 0x8A280AC50774346D, 0x3DB1562CDE9798D9, - }} - - // Precomputed estimates for GLV scalar splitting - // g1 and g2 are approximations of b2/d and 
(-b1)/d respectively - // where d is the curve order n - g1 = Scalar{d: [4]uint64{ - 0x3086D221A7D46BCD, 0xE86C90E49284EB15, - 0x3DAA8A1471E8CA7F, 0xE893209A45DBB031, - }} - - g2 = Scalar{d: [4]uint64{ - 0xE4437ED6010E8828, 0x6F547FA90ABFE4C4, - 0x221208AC9DF506C6, 0x1571B4AE8AC47F71, - }} ) // setInt sets a scalar to a small integer value @@ -227,6 +193,16 @@ func (r *Scalar) reduce(overflow int) { // add adds two scalars: r = a + b, returns overflow func (r *Scalar) add(a, b *Scalar) bool { + // Use AVX2 if available (AMD64 only) + if HasAVX2() { + scalarAddAVX2(r, a, b) + return false // AVX2 version handles reduction internally + } + return r.addPureGo(a, b) +} + +// addPureGo is the pure Go implementation of scalar addition +func (r *Scalar) addPureGo(a, b *Scalar) bool { var carry uint64 r.d[0], carry = bits.Add64(a.d[0], b.d[0], 0) @@ -244,15 +220,35 @@ func (r *Scalar) add(a, b *Scalar) bool { // sub subtracts two scalars: r = a - b func (r *Scalar) sub(a, b *Scalar) { + // Use AVX2 if available (AMD64 only) + if HasAVX2() { + scalarSubAVX2(r, a, b) + return + } + r.subPureGo(a, b) +} + +// subPureGo is the pure Go implementation of scalar subtraction +func (r *Scalar) subPureGo(a, b *Scalar) { // Compute a - b = a + (-b) var negB Scalar negB.negate(b) *r = *a - r.add(r, &negB) + r.addPureGo(r, &negB) } // mul multiplies two scalars: r = a * b func (r *Scalar) mul(a, b *Scalar) { + // Use AVX2 if available (AMD64 only) + if HasAVX2() { + scalarMulAVX2(r, a, b) + return + } + r.mulPureGo(a, b) +} + +// mulPureGo is the pure Go implementation of scalar multiplication +func (r *Scalar) mulPureGo(a, b *Scalar) { // Compute full 512-bit product using all 16 cross products var l [8]uint64 r.mul512(l[:], a, b) @@ -681,10 +677,8 @@ func scalarAdd(r, a, b *Scalar) bool { // scalarMul multiplies two scalars: r = a * b func scalarMul(r, a, b *Scalar) { - // Compute full 512-bit product using all 16 cross products - var l [8]uint64 - scalarMul512(l[:], a, b) - scalarReduce512(r, l[:]) + // Use the method version which has the correct 512-bit reduction + r.mulPureGo(a, b) } // scalarGetB32 serializes a scalar to 32 bytes in big-endian format @@ -742,88 +736,6 @@ func scalarReduce(r *Scalar, overflow int) { } } -// scalarMul512 computes the 512-bit product of two scalars -func scalarMul512(l []uint64, a, b *Scalar) { - if len(l) < 8 { - panic("l must be at least 8 uint64s") - } - - var c0, c1 uint64 - var c2 uint32 - - // Clear accumulator - l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7] = 0, 0, 0, 0, 0, 0, 0, 0 - - // Helper functions (translated from C) - muladd := func(ai, bi uint64) { - hi, lo := bits.Mul64(ai, bi) - var carry uint64 - c0, carry = bits.Add64(c0, lo, 0) - c1, carry = bits.Add64(c1, hi, carry) - c2 += uint32(carry) - } - - sumadd := func(a uint64) { - var carry uint64 - c0, carry = bits.Add64(c0, a, 0) - c1, carry = bits.Add64(c1, 0, carry) - c2 += uint32(carry) - } - - extract := func() uint64 { - result := c0 - c0 = c1 - c1 = uint64(c2) - c2 = 0 - return result - } - - // l[0..7] = a[0..3] * b[0..3] (following C implementation exactly) - c0, c1, c2 = 0, 0, 0 - muladd(a.d[0], b.d[0]) - l[0] = extract() - - sumadd(a.d[0]*b.d[1] + a.d[1]*b.d[0]) - l[1] = extract() - - sumadd(a.d[0]*b.d[2] + a.d[1]*b.d[1] + a.d[2]*b.d[0]) - l[2] = extract() - - sumadd(a.d[0]*b.d[3] + a.d[1]*b.d[2] + a.d[2]*b.d[1] + a.d[3]*b.d[0]) - l[3] = extract() - - sumadd(a.d[1]*b.d[3] + a.d[2]*b.d[2] + a.d[3]*b.d[1]) - l[4] = extract() - - sumadd(a.d[2]*b.d[3] + a.d[3]*b.d[2]) - l[5] = extract() - - 
sumadd(a.d[3] * b.d[3]) - l[6] = extract() - - l[7] = c0 -} - -// scalarReduce512 reduces a 512-bit value to 256-bit -func scalarReduce512(r *Scalar, l []uint64) { - if len(l) < 8 { - panic("l must be at least 8 uint64s") - } - - // Implementation follows the C secp256k1_scalar_reduce_512 algorithm - // This is a simplified version - the full implementation would include - // the Montgomery reduction steps from the C code - r.d[0] = l[0] - r.d[1] = l[1] - r.d[2] = l[2] - r.d[3] = l[3] - - // Apply modular reduction if needed - if scalarCheckOverflow(r) { - scalarReduce(r, 0) - } -} - // wNAF converts a scalar to Windowed Non-Adjacent Form representation // wNAF represents the scalar using digits in the range [-(2^(w-1)-1), 2^(w-1)-1] // with the property that non-zero digits are separated by at least w-1 zeros. @@ -882,86 +794,3 @@ func (s *Scalar) wNAF(wnaf []int, w uint) int { return bits + 1 } -// scalarMulShiftVar computes r = round(a * b / 2^shift) using variable-time arithmetic -// This is used for the GLV scalar splitting algorithm -func scalarMulShiftVar(r *Scalar, a *Scalar, b *Scalar, shift uint) { - if shift > 512 { - panic("shift too large") - } - - var l [8]uint64 - scalarMul512(l[:], a, b) - - // Right shift by 'shift' bits, rounding to nearest - carry := uint64(0) - if shift > 0 && (l[0]&(uint64(1)<<(shift-1))) != 0 { - carry = 1 // Round up if the bit being shifted out is 1 - } - - // Shift the limbs - for i := 0; i < 4; i++ { - var srcIndex int - var srcShift uint - if shift >= 64*uint(i) { - srcIndex = int(shift/64) + i - srcShift = shift % 64 - } else { - srcIndex = i - srcShift = shift - } - - if srcIndex >= 8 { - r.d[i] = 0 - continue - } - - val := l[srcIndex] - if srcShift > 0 && srcIndex+1 < 8 { - val |= l[srcIndex+1] << (64 - srcShift) - } - val >>= srcShift - - if i == 0 { - val += carry - } - - r.d[i] = val - } - - // Ensure result is reduced - scalarReduce(r, 0) -} - -// splitLambda splits a scalar k into r1 and r2 such that r1 + lambda*r2 = k mod n -// where lambda is the secp256k1 endomorphism constant. -// This is used for GLV (Gallant-Lambert-Vanstone) optimization. -// -// The algorithm computes c1 and c2 as approximations, then solves for r1 and r2. -// r1 and r2 are guaranteed to be in the range [-2^128, 2^128] approximately. -// -// Returns r1, r2 where k = r1 + lambda*r2 mod n -func (r1 *Scalar) splitLambda(r2 *Scalar, k *Scalar) { - var c1, c2 Scalar - - // Compute c1 = round(k * g1 / 2^384) - // c2 = round(k * g2 / 2^384) - // These are high-precision approximations for the GLV basis decomposition - scalarMulShiftVar(&c1, k, &g1, 384) - scalarMulShiftVar(&c2, k, &g2, 384) - - // Compute r2 = c1*(-b1) + c2*(-b2) - var tmp1, tmp2 Scalar - scalarMul(&tmp1, &c1, &minusB1) - scalarMul(&tmp2, &c2, &minusB2) - scalarAdd(r2, &tmp1, &tmp2) - - // Compute r1 = k - r2*lambda - scalarMul(r1, r2, &secp256k1Lambda) - r1.negate(r1) - scalarAdd(r1, r1, k) - - // Ensure the result is properly reduced - scalarReduce(r1, 0) - scalarReduce(r2, 0) -} - diff --git a/scalar_amd64.go b/scalar_amd64.go new file mode 100644 index 0000000..fc25331 --- /dev/null +++ b/scalar_amd64.go @@ -0,0 +1,23 @@ +//go:build amd64 + +package p256k1 + +// AMD64-specific scalar operations with optional AVX2 acceleration. +// The Scalar type uses 4×uint64 limbs which are memory-compatible with +// the AVX package's 2×Uint128 representation. + +// scalarMulAVX2 multiplies two scalars using AVX2 assembly. +// Both input and output use the same memory layout as the pure Go implementation. 
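+// The routine forms the full 512-bit product on the stack and then applies
+// the 3-phase reduction (512 → 385 → 258 → 256 bits) used by
+// bitcoin-core/secp256k1, so it is intended to return the same value as
+// mulPureGo.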
+// +//go:noescape +func scalarMulAVX2(r, a, b *Scalar) + +// scalarAddAVX2 adds two scalars using AVX2 assembly. +// +//go:noescape +func scalarAddAVX2(r, a, b *Scalar) + +// scalarSubAVX2 subtracts two scalars using AVX2 assembly. +// +//go:noescape +func scalarSubAVX2(r, a, b *Scalar) diff --git a/scalar_amd64.s b/scalar_amd64.s new file mode 100644 index 0000000..aefe643 --- /dev/null +++ b/scalar_amd64.s @@ -0,0 +1,622 @@ +//go:build amd64 + +#include "textflag.h" + +// Constants for scalar reduction +// n = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141 +DATA p256k1ScalarN<>+0x00(SB)/8, $0xBFD25E8CD0364141 +DATA p256k1ScalarN<>+0x08(SB)/8, $0xBAAEDCE6AF48A03B +DATA p256k1ScalarN<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFE +DATA p256k1ScalarN<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF +GLOBL p256k1ScalarN<>(SB), RODATA|NOPTR, $32 + +// 2^256 - n (for reduction) +// NC0 = 0x402DA1732FC9BEBF +// NC1 = 0x4551231950B75FC4 +// NC2 = 1 +DATA p256k1ScalarNC<>+0x00(SB)/8, $0x402DA1732FC9BEBF +DATA p256k1ScalarNC<>+0x08(SB)/8, $0x4551231950B75FC4 +DATA p256k1ScalarNC<>+0x10(SB)/8, $0x0000000000000001 +DATA p256k1ScalarNC<>+0x18(SB)/8, $0x0000000000000000 +GLOBL p256k1ScalarNC<>(SB), RODATA|NOPTR, $32 + +// func scalarAddAVX2(r, a, b *Scalar) +// Adds two 256-bit scalars with carry chain and modular reduction. +TEXT ·scalarAddAVX2(SB), NOSPLIT, $0-24 + MOVQ r+0(FP), DI + MOVQ a+8(FP), SI + MOVQ b+16(FP), DX + + // Load a and b into registers (scalar loads for carry chain) + MOVQ 0(SI), AX // a.d[0] + MOVQ 8(SI), BX // a.d[1] + MOVQ 16(SI), CX // a.d[2] + MOVQ 24(SI), R8 // a.d[3] + + // Add b with carry chain + ADDQ 0(DX), AX // a.d[0] + b.d[0] + ADCQ 8(DX), BX // a.d[1] + b.d[1] + carry + ADCQ 16(DX), CX // a.d[2] + b.d[2] + carry + ADCQ 24(DX), R8 // a.d[3] + b.d[3] + carry + + // Save carry flag + SETCS R9B + + // Store preliminary result + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + MOVQ CX, 16(DI) + MOVQ R8, 24(DI) + + // Check if we need to reduce (carry set or result >= n) + TESTB R9B, R9B + JNZ add_reduce + + // Compare with n (from high to low) + MOVQ $0xFFFFFFFFFFFFFFFF, R10 + CMPQ R8, R10 + JB add_done + JA add_reduce + MOVQ p256k1ScalarN<>+0x10(SB), R10 + CMPQ CX, R10 + JB add_done + JA add_reduce + MOVQ p256k1ScalarN<>+0x08(SB), R10 + CMPQ BX, R10 + JB add_done + JA add_reduce + MOVQ p256k1ScalarN<>+0x00(SB), R10 + CMPQ AX, R10 + JB add_done + +add_reduce: + // Add 2^256 - n (which is equivalent to subtracting n) + MOVQ 0(DI), AX + MOVQ 8(DI), BX + MOVQ 16(DI), CX + MOVQ 24(DI), R8 + + MOVQ p256k1ScalarNC<>+0x00(SB), R10 + ADDQ R10, AX + MOVQ p256k1ScalarNC<>+0x08(SB), R10 + ADCQ R10, BX + MOVQ p256k1ScalarNC<>+0x10(SB), R10 + ADCQ R10, CX + MOVQ p256k1ScalarNC<>+0x18(SB), R10 + ADCQ R10, R8 + + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + MOVQ CX, 16(DI) + MOVQ R8, 24(DI) + +add_done: + VZEROUPPER + RET + +// func scalarSubAVX2(r, a, b *Scalar) +// Subtracts two 256-bit scalars. 
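+// A borrow from the subtraction is repaired by adding the group order n
+// back, i.e. the result is computed modulo n for already-reduced inputs.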
+TEXT ·scalarSubAVX2(SB), NOSPLIT, $0-24 + MOVQ r+0(FP), DI + MOVQ a+8(FP), SI + MOVQ b+16(FP), DX + + // Load a + MOVQ 0(SI), AX + MOVQ 8(SI), BX + MOVQ 16(SI), CX + MOVQ 24(SI), R8 + + // Subtract b with borrow chain + SUBQ 0(DX), AX + SBBQ 8(DX), BX + SBBQ 16(DX), CX + SBBQ 24(DX), R8 + + // Save borrow flag + SETCS R9B + + // Store preliminary result + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + MOVQ CX, 16(DI) + MOVQ R8, 24(DI) + + // If borrow, add n back + TESTB R9B, R9B + JZ sub_done + + // Add n + MOVQ p256k1ScalarN<>+0x00(SB), R10 + ADDQ R10, AX + MOVQ p256k1ScalarN<>+0x08(SB), R10 + ADCQ R10, BX + MOVQ p256k1ScalarN<>+0x10(SB), R10 + ADCQ R10, CX + MOVQ p256k1ScalarN<>+0x18(SB), R10 + ADCQ R10, R8 + + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + MOVQ CX, 16(DI) + MOVQ R8, 24(DI) + +sub_done: + VZEROUPPER + RET + +// func scalarMulAVX2(r, a, b *Scalar) +// Multiplies two 256-bit scalars and reduces mod n. +// This implementation follows the bitcoin-core secp256k1 algorithm exactly. +TEXT ·scalarMulAVX2(SB), NOSPLIT, $128-24 + MOVQ r+0(FP), DI + MOVQ a+8(FP), SI + MOVQ b+16(FP), DX + + // Load a limbs + MOVQ 0(SI), R8 // a0 + MOVQ 8(SI), R9 // a1 + MOVQ 16(SI), R10 // a2 + MOVQ 24(SI), R11 // a3 + + // Store b pointer for later use + MOVQ DX, R12 + + // Compute 512-bit product using schoolbook multiplication + // Product stored on stack at SP+0 to SP+56 (8 limbs: l0..l7) + + // Initialize product to zero + XORQ AX, AX + MOVQ AX, 0(SP) // l0 + MOVQ AX, 8(SP) // l1 + MOVQ AX, 16(SP) // l2 + MOVQ AX, 24(SP) // l3 + MOVQ AX, 32(SP) // l4 + MOVQ AX, 40(SP) // l5 + MOVQ AX, 48(SP) // l6 + MOVQ AX, 56(SP) // l7 + + // Multiply a0 * b[0..3] + MOVQ R8, AX + MULQ 0(R12) // a0 * b0 + MOVQ AX, 0(SP) + MOVQ DX, R13 // carry + + MOVQ R8, AX + MULQ 8(R12) // a0 * b1 + ADDQ R13, AX + ADCQ $0, DX + MOVQ AX, 8(SP) + MOVQ DX, R13 + + MOVQ R8, AX + MULQ 16(R12) // a0 * b2 + ADDQ R13, AX + ADCQ $0, DX + MOVQ AX, 16(SP) + MOVQ DX, R13 + + MOVQ R8, AX + MULQ 24(R12) // a0 * b3 + ADDQ R13, AX + ADCQ $0, DX + MOVQ AX, 24(SP) + MOVQ DX, 32(SP) + + // Multiply a1 * b[0..3] and add + MOVQ R9, AX + MULQ 0(R12) // a1 * b0 + ADDQ AX, 8(SP) + ADCQ DX, 16(SP) + ADCQ $0, 24(SP) + ADCQ $0, 32(SP) + + MOVQ R9, AX + MULQ 8(R12) // a1 * b1 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + ADCQ $0, 32(SP) + + MOVQ R9, AX + MULQ 16(R12) // a1 * b2 + ADDQ AX, 24(SP) + ADCQ DX, 32(SP) + ADCQ $0, 40(SP) + + MOVQ R9, AX + MULQ 24(R12) // a1 * b3 + ADDQ AX, 32(SP) + ADCQ DX, 40(SP) + + // Multiply a2 * b[0..3] and add + MOVQ R10, AX + MULQ 0(R12) // a2 * b0 + ADDQ AX, 16(SP) + ADCQ DX, 24(SP) + ADCQ $0, 32(SP) + ADCQ $0, 40(SP) + + MOVQ R10, AX + MULQ 8(R12) // a2 * b1 + ADDQ AX, 24(SP) + ADCQ DX, 32(SP) + ADCQ $0, 40(SP) + + MOVQ R10, AX + MULQ 16(R12) // a2 * b2 + ADDQ AX, 32(SP) + ADCQ DX, 40(SP) + ADCQ $0, 48(SP) + + MOVQ R10, AX + MULQ 24(R12) // a2 * b3 + ADDQ AX, 40(SP) + ADCQ DX, 48(SP) + + // Multiply a3 * b[0..3] and add + MOVQ R11, AX + MULQ 0(R12) // a3 * b0 + ADDQ AX, 24(SP) + ADCQ DX, 32(SP) + ADCQ $0, 40(SP) + ADCQ $0, 48(SP) + + MOVQ R11, AX + MULQ 8(R12) // a3 * b1 + ADDQ AX, 32(SP) + ADCQ DX, 40(SP) + ADCQ $0, 48(SP) + + MOVQ R11, AX + MULQ 16(R12) // a3 * b2 + ADDQ AX, 40(SP) + ADCQ DX, 48(SP) + ADCQ $0, 56(SP) + + MOVQ R11, AX + MULQ 24(R12) // a3 * b3 + ADDQ AX, 48(SP) + ADCQ DX, 56(SP) + + // Now we have the 512-bit product in SP+0..SP+56 (l[0..7]) + // Reduce using the exact algorithm from bitcoin-core secp256k1 + // + // Phase 1: Reduce 512 bits into 385 bits + // m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C + // where n[0..3] = 
l[4..7] (high 256 bits) + // + // NC0 = 0x402DA1732FC9BEBF + // NC1 = 0x4551231950B75FC4 + // NC2 = 1 + + // Load high limbs (l4..l7 = n0..n3) + MOVQ 32(SP), R8 // n0 = l4 + MOVQ 40(SP), R9 // n1 = l5 + MOVQ 48(SP), R10 // n2 = l6 + MOVQ 56(SP), R11 // n3 = l7 + + // Load constants + MOVQ $0x402DA1732FC9BEBF, R12 // NC0 + MOVQ $0x4551231950B75FC4, R13 // NC1 + + // Use stack locations 64-112 for intermediate m values + // We'll use a 160-bit accumulator approach like the C code + // c0 (R14), c1 (R15), c2 (stored on stack at 120(SP)) + + // === m0 === + // c0 = l[0], c1 = 0 + // muladd_fast(n0, NC0): hi,lo = n0*NC0; c0 += lo, c1 += hi + carry + // m0 = extract_fast() = c0; c0 = c1; c1 = 0 + MOVQ 0(SP), R14 // c0 = l0 + XORQ R15, R15 // c1 = 0 + MOVQ R8, AX + MULQ R12 // DX:AX = n0 * NC0 + ADDQ AX, R14 // c0 += lo + ADCQ DX, R15 // c1 += hi + carry + MOVQ R14, 64(SP) // m0 = c0 + MOVQ R15, R14 // c0 = c1 + XORQ R15, R15 // c1 = 0 + MOVQ $0, 120(SP) // c2 = 0 + + // === m1 === + // sumadd_fast(l[1]) + // muladd(n1, NC0) + // muladd(n0, NC1) + // m1 = extract() + ADDQ 8(SP), R14 // c0 += l1 + ADCQ $0, R15 // c1 += carry + + MOVQ R9, AX + MULQ R12 // DX:AX = n1 * NC0 + ADDQ AX, R14 // c0 += lo + ADCQ DX, R15 // c1 += hi + carry + ADCQ $0, 120(SP) // c2 += carry + + MOVQ R8, AX + MULQ R13 // DX:AX = n0 * NC1 + ADDQ AX, R14 // c0 += lo + ADCQ DX, R15 // c1 += hi + carry + ADCQ $0, 120(SP) // c2 += carry + + MOVQ R14, 72(SP) // m1 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + MOVQ $0, 120(SP) // c2 = 0 + + // === m2 === + // sumadd(l[2]) + // muladd(n2, NC0) + // muladd(n1, NC1) + // sumadd(n0) (because NC2 = 1) + // m2 = extract() + ADDQ 16(SP), R14 // c0 += l2 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R10, AX + MULQ R12 // DX:AX = n2 * NC0 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + MOVQ R9, AX + MULQ R13 // DX:AX = n1 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + ADDQ R8, R14 // c0 += n0 (n0 * NC2 = n0 * 1) + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R14, 80(SP) // m2 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + MOVQ $0, 120(SP) // c2 = 0 + + // === m3 === + // sumadd(l[3]) + // muladd(n3, NC0) + // muladd(n2, NC1) + // sumadd(n1) + // m3 = extract() + ADDQ 24(SP), R14 // c0 += l3 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R11, AX + MULQ R12 // DX:AX = n3 * NC0 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + MOVQ R10, AX + MULQ R13 // DX:AX = n2 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + ADDQ R9, R14 // c0 += n1 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R14, 88(SP) // m3 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + MOVQ $0, 120(SP) // c2 = 0 + + // === m4 === + // muladd(n3, NC1) + // sumadd(n2) + // m4 = extract() + MOVQ R11, AX + MULQ R13 // DX:AX = n3 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + ADDQ R10, R14 // c0 += n2 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R14, 96(SP) // m4 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + + // === m5 === + // sumadd_fast(n3) + // m5 = extract_fast() + ADDQ R11, R14 // c0 += n3 + ADCQ $0, R15 // c1 += carry + + MOVQ R14, 104(SP) // m5 = c0 + MOVQ R15, R14 // c0 = c1 + + // === m6 === + // m6 = c0 (low 32 bits only, but we keep full 64 bits for simplicity) + MOVQ R14, 112(SP) // m6 = c0 + + // Phase 2: Reduce 385 bits into 258 bits + // p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C + // m4, m5 are 64-bit, m6 is at most 33 bits + + // Load m values + MOVQ 96(SP), R8 // m4 + MOVQ 104(SP), R9 // m5 + MOVQ 112(SP), R10 // m6 + 
+ // === p0 === + // c0 = m0, c1 = 0 + // muladd_fast(m4, NC0) + // p0 = extract_fast() + MOVQ 64(SP), R14 // c0 = m0 + XORQ R15, R15 // c1 = 0 + + MOVQ R8, AX + MULQ R12 // DX:AX = m4 * NC0 + ADDQ AX, R14 + ADCQ DX, R15 + + MOVQ R14, 64(SP) // p0 = c0 (reuse m0 location) + MOVQ R15, R14 // c0 = c1 + XORQ R15, R15 // c1 = 0 + MOVQ $0, 120(SP) // c2 = 0 + + // === p1 === + // sumadd_fast(m1) + // muladd(m5, NC0) + // muladd(m4, NC1) + // p1 = extract() + ADDQ 72(SP), R14 // c0 += m1 + ADCQ $0, R15 + + MOVQ R9, AX + MULQ R12 // DX:AX = m5 * NC0 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + MOVQ R8, AX + MULQ R13 // DX:AX = m4 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + MOVQ R14, 72(SP) // p1 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + MOVQ $0, 120(SP) // c2 = 0 + + // === p2 === + // sumadd(m2) + // muladd(m6, NC0) + // muladd(m5, NC1) + // sumadd(m4) + // p2 = extract() + ADDQ 80(SP), R14 // c0 += m2 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R10, AX + MULQ R12 // DX:AX = m6 * NC0 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + MOVQ R9, AX + MULQ R13 // DX:AX = m5 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0, 120(SP) + + ADDQ R8, R14 // c0 += m4 + ADCQ $0, R15 + ADCQ $0, 120(SP) + + MOVQ R14, 80(SP) // p2 = c0 + MOVQ R15, R14 // c0 = c1 + MOVQ 120(SP), R15 // c1 = c2 + + // === p3 === + // sumadd_fast(m3) + // muladd_fast(m6, NC1) + // sumadd_fast(m5) + // p3 = extract_fast() + ADDQ 88(SP), R14 // c0 += m3 + ADCQ $0, R15 + + MOVQ R10, AX + MULQ R13 // DX:AX = m6 * NC1 + ADDQ AX, R14 + ADCQ DX, R15 + + ADDQ R9, R14 // c0 += m5 + ADCQ $0, R15 + + MOVQ R14, 88(SP) // p3 = c0 + // p4 = c1 + m6 + ADDQ R15, R10 // p4 = c1 + m6 + + // === p4 === + MOVQ R10, 96(SP) // p4 + + // Phase 3: Reduce 258 bits into 256 bits + // r[0..3] = p[0..3] + p[4] * SECP256K1_N_C + // Then check for overflow and reduce once more if needed + + // Use 128-bit arithmetic for this phase + // t = p0 + p4 * NC0 + MOVQ 96(SP), R11 // p4 + + // r0 = (p0 + p4 * NC0) mod 2^64, carry to next + MOVQ R11, AX + MULQ R12 // DX:AX = p4 * NC0 + ADDQ 64(SP), AX // AX = p0 + lo + ADCQ $0, DX // DX = hi + carry + MOVQ AX, R8 // r0 + MOVQ DX, R14 // carry + + // r1 = p1 + p4 * NC1 + carry + MOVQ R11, AX + MULQ R13 // DX:AX = p4 * NC1 + ADDQ R14, AX // AX += carry + ADCQ $0, DX + ADDQ 72(SP), AX // AX += p1 + ADCQ $0, DX + MOVQ AX, R9 // r1 + MOVQ DX, R14 // carry + + // r2 = p2 + p4 * NC2 + carry = p2 + p4 + carry + MOVQ 80(SP), AX + ADDQ R14, AX // AX = p2 + carry + MOVQ $0, DX + ADCQ $0, DX + ADDQ R11, AX // AX += p4 (NC2 = 1) + ADCQ $0, DX + MOVQ AX, R10 // r2 + MOVQ DX, R14 // carry + + // r3 = p3 + carry + MOVQ 88(SP), AX + ADDQ R14, AX + SETCS R14B // final carry + MOVQ AX, R11 // r3 + + // Check if we need to reduce (carry or result >= n) + TESTB R14B, R14B + JNZ mul_do_final_reduce + + // Compare with n (from high to low) + MOVQ $0xFFFFFFFFFFFFFFFF, R15 + CMPQ R11, R15 + JB mul_store_result + JA mul_do_final_reduce + MOVQ $0xFFFFFFFFFFFFFFFE, R15 + CMPQ R10, R15 + JB mul_store_result + JA mul_do_final_reduce + MOVQ $0xBAAEDCE6AF48A03B, R15 + CMPQ R9, R15 + JB mul_store_result + JA mul_do_final_reduce + MOVQ $0xBFD25E8CD0364141, R15 + CMPQ R8, R15 + JB mul_store_result + +mul_do_final_reduce: + // Add 2^256 - n + ADDQ R12, R8 // r0 += NC0 + ADCQ R13, R9 // r1 += NC1 + ADCQ $1, R10 // r2 += NC2 = 1 + ADCQ $0, R11 // r3 += 0 + +mul_store_result: + // Store result + MOVQ r+0(FP), DI + MOVQ R8, 0(DI) + MOVQ R9, 8(DI) + MOVQ R10, 16(DI) + MOVQ R11, 24(DI) + + VZEROUPPER + RET diff --git 
a/scalar_generic.go b/scalar_generic.go new file mode 100644 index 0000000..050e910 --- /dev/null +++ b/scalar_generic.go @@ -0,0 +1,18 @@ +//go:build !amd64 + +package p256k1 + +// Generic stub implementations for non-AMD64 architectures. +// These simply forward to the pure Go implementations. + +func scalarMulAVX2(r, a, b *Scalar) { + r.mulPureGo(a, b) +} + +func scalarAddAVX2(r, a, b *Scalar) { + r.addPureGo(a, b) +} + +func scalarSubAVX2(r, a, b *Scalar) { + r.subPureGo(a, b) +}
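As a sanity check on the dispatch added to `scalar.go`, the assembly-backed path can be compared against the pure Go one from an in-package test. The sketch below is not part of the patch: it assumes only the identifiers visible in the hunks above (`Scalar` and its `d` limbs, `mul`, `mulPureGo`, `HasAVX2`), that `Scalar` has no other non-comparable fields, and that the code is placed in a `_test.go` file in package `p256k1`.

```go
package p256k1

import (
	"crypto/rand"
	"encoding/binary"
	"testing"
)

// randScalar fills a Scalar with random limbs. Inputs need not be fully
// reduced: both multiplication paths compute the full 512-bit product and
// reduce it modulo the group order, so the comparison is still meaningful.
func randScalar(t *testing.T) Scalar {
	t.Helper()
	var buf [32]byte
	if _, err := rand.Read(buf[:]); err != nil {
		t.Fatal(err)
	}
	var s Scalar
	for i := 0; i < 4; i++ {
		s.d[i] = binary.LittleEndian.Uint64(buf[8*i:])
	}
	return s
}

// TestScalarMulDispatchMatchesPureGo (hypothetical name) checks that the
// dispatching mul and the pure Go mulPureGo agree on random inputs.
func TestScalarMulDispatchMatchesPureGo(t *testing.T) {
	for i := 0; i < 1024; i++ {
		a, b := randScalar(t), randScalar(t)
		var got, want Scalar
		got.mul(&a, &b) // takes the assembly path when HasAVX2() is true
		want.mulPureGo(&a, &b)
		if got != want {
			t.Fatalf("a=%x b=%x: dispatch %x, pure Go %x", a.d, b.d, got.d, want.d)
		}
	}
}
```

On hosts where `HasAVX2()` reports false both calls take the pure Go path, so the comparison is only meaningful on AVX2-capable amd64 machines.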