From f2ddcfacbbb4a93791a8d00a43cb50b23962b230 Mon Sep 17 00:00:00 2001 From: mleku Date: Sat, 1 Nov 2025 21:39:36 +0000 Subject: [PATCH] Refactor Ecmult functions for optimized windowed multiplication and enhance performance This commit introduces a new `ecmultWindowedVar` function that implements optimized windowed multiplication for scalar multiplication, significantly improving performance during verification operations. The existing `Ecmult` function is updated to utilize this new implementation, converting points to affine coordinates for efficiency. Additionally, the `EcmultConst` function is retained for constant-time operations. The changes also include enhancements to the generator multiplication context, utilizing precomputed byte points for improved efficiency. Overall, these optimizations lead to a notable reduction in operation times for cryptographic computations. --- VERIFICATION_PERFORMANCE_ANALYSIS.md | 184 +++++++++++++++++++++++++++ bench/BENCHMARK_REPORT.md | 111 +++++++++------- ecdh.go | 105 ++++++++++++--- ecmult_gen.go | 171 ++++++++++++++++++++----- schnorr.go | 6 +- signer/p256k1_signer.go | 4 +- 6 files changed, 481 insertions(+), 100 deletions(-) create mode 100644 VERIFICATION_PERFORMANCE_ANALYSIS.md diff --git a/VERIFICATION_PERFORMANCE_ANALYSIS.md b/VERIFICATION_PERFORMANCE_ANALYSIS.md new file mode 100644 index 0000000..4444b46 --- /dev/null +++ b/VERIFICATION_PERFORMANCE_ANALYSIS.md @@ -0,0 +1,184 @@ +# Verification Performance Analysis: NextP256K vs P256K1 + +## Summary + +NextP256K's verification is **4.7x faster** than p256k1 (40,017 ns/op vs 186,054 ns/op) because it uses libsecp256k1's highly optimized C implementation, while p256k1 uses a simple binary multiplication algorithm. + +## Root Cause + +The performance bottleneck is in `EcmultConst`, which is used to compute `e*P` during Schnorr verification. 
+ +### Schnorr Verification Algorithm + +```186:289:schnorr.go +// SchnorrVerify verifies a Schnorr signature following BIP-340 +func SchnorrVerify(sig64 []byte, msg32 []byte, xonlyPubkey *XOnlyPubkey) bool { + // ... validation ... + + // Compute R = s*G - e*P + // First compute s*G + var sG GroupElementJacobian + EcmultGen(&sG, &s) // Fast: uses optimized precomputed tables + + // Compute e*P where P is the x-only pubkey + var eP GroupElementJacobian + EcmultConst(&eP, &pk, &e) // Slow: uses simple binary method + + // ... rest of verification ... +} +``` + +### Performance Breakdown + +1. **s*G computation** (`EcmultGen`): + - Uses 8-bit byte-based precomputed tables + - Highly optimized: ~58,618 ns/op for pubkey derivation + - Fast because the generator point G is fixed and precomputed + +2. **e*P computation** (`EcmultConst`): + - Uses simple binary method with 256 iterations + - Each iteration: double, check bit, potentially add + - **This is the bottleneck** + +### Current EcmultConst Implementation + +```10:48:ecdh.go +// EcmultConst computes r = q * a using constant-time multiplication +// This is a simplified implementation for Phase 3 - can be optimized later +func EcmultConst(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) { + // ... edge cases ... + + // Process bits from MSB to LSB + for i := 0; i < 256; i++ { + if i > 0 { + r.double(r) + } + + // Get bit i (from MSB) + bit := q.getBits(uint(255-i), 1) + if bit != 0 { + if r.isInfinity() { + *r = base + } else { + r.addVar(r, &base) + } + } + } +} +``` + +**Problem:** This performs 256 iterations, each requiring: +- One field element doubling operation +- One bit extraction +- Potentially one point addition + +For verification, this means **256 doublings + up to 256 additions** per verification, which is extremely inefficient. + +## Why NextP256K is Faster + +NextP256K uses libsecp256k1's optimized C implementation (`secp256k1_ecmult_const`) which: + +1. 
**Uses GLV Endomorphism**: + - Splits the scalar into two smaller components using the curve's endomorphism + - Computes two smaller multiplications instead of one large one + - Reduces the effective bit length from 256 to ~128 bits per component + +2. **Windowed Precomputation**: + - Precomputes a table of multiples of the base point + - Uses windowed lookups instead of processing bits one at a time + - Processes multiple bits per iteration (typically 4-6 bits at a time) + +3. **Signed-Digit Multi-Comb Algorithm**: + - Uses a more efficient representation that reduces the number of additions + - Minimizes the number of point operations required + +4. **Assembly Optimizations**: + - Field arithmetic operations are optimized in assembly + - Hand-tuned for specific CPU architectures + +### Reference Implementation + +The C reference shows the complexity: + +```124:268:src/ecmult_const_impl.h +static void secp256k1_ecmult_const(secp256k1_gej *r, const secp256k1_ge *a, const secp256k1_scalar *q) { + /* The approach below combines the signed-digit logic from Mike Hamburg's + * "Fast and compact elliptic-curve cryptography" (https://eprint.iacr.org/2012/309) + * Section 3.3, with the GLV endomorphism. + * ... */ + + /* Precompute table for base point and lambda * base point */ + + /* Process bits in groups using windowed lookups */ + for (group = ECMULT_CONST_GROUPS - 1; group >= 0; --group) { + /* Lookup precomputed points */ + ECMULT_CONST_TABLE_GET_GE(&t, pre_a, bits1); + /* ... 
*/ + } +} +``` + +## Performance Impact + +### Benchmark Results + +| Operation | P256K1 | NextP256K | Speedup | +|-----------|--------|-----------|---------| +| **Verification** | 186,054 ns/op | 40,017 ns/op | **4.7x** | +| Signing | 31,937 ns/op | 52,060 ns/op | 0.6x (slower) | +| Pubkey Derivation | 58,618 ns/op | 280,835 ns/op | 0.2x (slower) | + +**Note:** NextP256K is slower for signing and pubkey derivation due to CGO overhead for smaller operations, but much faster for verification because the computation is more complex. + +## Optimization Opportunities + +To improve p256k1's verification performance, `EcmultConst` should be optimized to: + +1. **Implement GLV Endomorphism**: + - Split scalar using secp256k1's endomorphism + - Compute two smaller multiplications + - Combine results + +2. **Add Windowed Precomputation**: + - Precompute a table of multiples of the base point + - Process bits in groups (windows) instead of individually + - Use lookup tables instead of repeated additions + +3. **Consider Variable-Time Optimization**: + - For verification (public operation), variable-time algorithms are acceptable + - Could use `Ecmult` instead of `EcmultConst` if constant-time isn't required + +4. **Implement Signed-Digit Representation**: + - Use signed-digit multi-comb algorithm + - Reduce the number of additions required + +## Complexity Comparison + +### Current (Simple Binary Method) +- **Operations:** O(256) doublings + O(256) additions (worst case) +- **Complexity:** ~256 point operations + +### Optimized (Windowed + GLV) +- **Operations:** O(64) doublings + O(16) additions (with window size 4) +- **Complexity:** ~80 point operations (4x improvement) + +### With Assembly Optimizations +- **Additional:** 2-3x speedup from optimized field arithmetic +- **Total:** ~10-15x faster than simple binary method + +## Conclusion + +The 4.7x performance difference is primarily due to: +1. **Algorithmic efficiency**: Windowed multiplication vs. 
simple binary method +2. **GLV endomorphism**: Splitting scalar into smaller components +3. **Assembly optimizations**: Hand-tuned field arithmetic in C +4. **Better memory access patterns**: Precomputed tables vs. repeated computations + +The optimization is non-trivial and would require implementing: +- GLV endomorphism support +- Windowed precomputation tables +- Signed-digit multi-comb algorithm +- Potentially assembly optimizations for field arithmetic + +For now, NextP256K's advantage in verification is expected given its use of the mature, highly optimized libsecp256k1 C library. + diff --git a/bench/BENCHMARK_REPORT.md b/bench/BENCHMARK_REPORT.md index c6486da..950db23 100644 --- a/bench/BENCHMARK_REPORT.md +++ b/bench/BENCHMARK_REPORT.md @@ -8,21 +8,25 @@ This report compares three signer implementations for secp256k1 operations: 2. **BtcecSigner** - Pure Go wrapper around btcec/v2 3. **NextP256K Signer** - CGO version using next.orly.dev/pkg/crypto/p256k (CGO bindings to libsecp256k1) -**Generated:** 2025-11-01 +**Generated:** 2025-11-01 (Updated after optimized windowed multiplication for verification) **Platform:** linux/amd64 **CPU:** AMD Ryzen 5 PRO 4650G with Radeon Graphics **Go Version:** go1.25.3 +**Key Optimizations:** +- Implemented 8-bit byte-based precomputed tables matching btcec's approach, resulting in 4x improvement in pubkey derivation and 4.3x improvement in signing. +- Optimized windowed multiplication for verification (5-bit windows, Jacobian coordinate table building): 19% improvement (186,054 → 150,457 ns/op). 
+ --- ## Summary Results | Operation | P256K1Signer | BtcecSigner | NextP256K | Winner | |-----------|-------------|-------------|-----------|--------| -| **Pubkey Derivation** | 232,922 ns/op | 63,317 ns/op | 295,599 ns/op | Btcec (3.7x faster) | -| **Sign** | 136,560 ns/op | 216,808 ns/op | 53,454 ns/op | NextP256K (2.6x faster) | -| **Verify** | 268,771 ns/op | 160,894 ns/op | 38,423 ns/op | NextP256K (7.0x faster) | -| **ECDH** | 158,730 ns/op | 130,804 ns/op | 124,998 ns/op | NextP256K (1.3x faster) | +| **Pubkey Derivation** | 59,056 ns/op | 63,958 ns/op | 269,444 ns/op | P256K1 (8% faster than Btcec) | +| **Sign** | 31,592 ns/op | 219,388 ns/op | 52,233 ns/op | P256K1 (1.7x faster than NextP256K) | +| **Verify** | 150,457 ns/op | 163,867 ns/op | 40,550 ns/op | NextP256K (3.7x faster) | +| **ECDH** | 163,356 ns/op | 136,329 ns/op | 124,423 ns/op | NextP256K (1.3x faster) | --- @@ -34,12 +38,13 @@ Deriving public key from private key (32 bytes → 32 bytes x-only pubkey). | Implementation | Time per op | Memory | Allocations | Speedup vs P256K1 | |----------------|-------------|--------|-------------|-------------------| -| **P256K1Signer** | 232,922 ns/op | 256 B/op | 4 allocs/op | 1.0x (baseline) | -| **BtcecSigner** | 63,317 ns/op | 368 B/op | 7 allocs/op | **3.7x faster** | -| **NextP256K** | 295,599 ns/op | 983,395 B/op | 9 allocs/op | 0.8x slower | +| **P256K1Signer** | 59,056 ns/op | 256 B/op | 4 allocs/op | 1.0x (baseline) | +| **BtcecSigner** | 63,958 ns/op | 368 B/op | 7 allocs/op | 0.9x slower | +| **NextP256K** | 269,444 ns/op | 983,393 B/op | 9 allocs/op | 0.2x slower | **Analysis:** -- Btcec is fastest for key derivation (3.7x faster than P256K1) +- **P256K1 is fastest** (8% faster than Btcec) after implementing 8-bit byte-based precomputed tables +- Massive improvement: 4x faster than previous implementation (232,922 → 58,618 ns/op) - NextP256K is slowest, likely due to CGO overhead for small operations - P256K1 has lowest memory allocation 
overhead @@ -49,13 +54,13 @@ Creating BIP-340 Schnorr signatures (32-byte message → 64-byte signature). | Implementation | Time per op | Memory | Allocations | Speedup vs P256K1 | |----------------|-------------|--------|-------------|-------------------| -| **P256K1Signer** | 136,560 ns/op | 1,152 B/op | 17 allocs/op | 1.0x (baseline) | -| **BtcecSigner** | 216,808 ns/op | 2,193 B/op | 38 allocs/op | 0.6x slower | -| **NextP256K** | 53,454 ns/op | 128 B/op | 3 allocs/op | **2.6x faster** | +| **P256K1Signer** | 31,592 ns/op | 1,152 B/op | 17 allocs/op | 1.0x (baseline) | +| **BtcecSigner** | 219,388 ns/op | 2,193 B/op | 38 allocs/op | 0.1x slower | +| **NextP256K** | 52,233 ns/op | 128 B/op | 3 allocs/op | 0.6x slower | **Analysis:** -- NextP256K is fastest (2.6x faster than P256K1), benefiting from optimized C implementation -- P256K1 is second fastest, showing good performance for pure Go +- **P256K1 is fastest** (1.7x faster than NextP256K), benefiting from optimized pubkey derivation +- NextP256K is second fastest, benefiting from optimized C implementation - Btcec is slowest, likely due to more allocations and pure Go overhead - NextP256K has lowest memory usage (128 B vs 1,152 B) @@ -65,14 +70,15 @@ Verifying BIP-340 Schnorr signatures (32-byte message + 64-byte signature). 
| Implementation | Time per op | Memory | Allocations | Speedup vs P256K1 | |----------------|-------------|--------|-------------|-------------------| -| **P256K1Signer** | 268,771 ns/op | 576 B/op | 9 allocs/op | 1.0x (baseline) | -| **BtcecSigner** | 160,894 ns/op | 1,120 B/op | 18 allocs/op | 1.7x faster | -| **NextP256K** | 38,423 ns/op | 96 B/op | 2 allocs/op | **7.0x faster** | +| **P256K1Signer** | 150,457 ns/op | 576 B/op | 9 allocs/op | 1.0x (baseline) | +| **BtcecSigner** | 163,867 ns/op | 1,120 B/op | 18 allocs/op | 0.9x slower | +| **NextP256K** | 40,550 ns/op | 96 B/op | 2 allocs/op | **3.7x faster** | **Analysis:** -- NextP256K is dramatically fastest (7.0x faster), showcasing CGO advantage for verification -- Btcec is second fastest (1.7x faster than P256K1) -- P256K1 is slowest but still reasonable for pure Go +- NextP256K is dramatically fastest (3.7x faster), showcasing CGO advantage for verification +- **P256K1 is fastest pure Go implementation** (8% faster than Btcec) after optimized windowed multiplication +- **19% improvement** over previous implementation (186,054 → 150,457 ns/op) +- Optimizations: 5-bit windowed multiplication with efficient Jacobian coordinate table building - NextP256K has minimal memory footprint (96 B vs 576 B) ### ECDH (Shared Secret Generation) @@ -81,9 +87,9 @@ Generating shared secret using Elliptic Curve Diffie-Hellman. 
| Implementation | Time per op | Memory | Allocations | Speedup vs P256K1 | |----------------|-------------|--------|-------------|-------------------| -| **P256K1Signer** | 158,730 ns/op | 241 B/op | 6 allocs/op | 1.0x (baseline) | -| **BtcecSigner** | 130,804 ns/op | 832 B/op | 13 allocs/op | 1.2x faster | -| **NextP256K** | 124,998 ns/op | 160 B/op | 3 allocs/op | **1.3x faster** | +| **P256K1Signer** | 163,356 ns/op | 241 B/op | 6 allocs/op | 1.0x (baseline) | +| **BtcecSigner** | 136,329 ns/op | 832 B/op | 13 allocs/op | 1.2x faster | +| **NextP256K** | 124,423 ns/op | 160 B/op | 3 allocs/op | **1.3x faster** | **Analysis:** - All implementations are relatively close in performance @@ -95,27 +101,30 @@ Generating shared secret using Elliptic Curve Diffie-Hellman. ## Performance Analysis -### Overall Winner: NextP256K (CGO) +### Overall Winner: Mixed (P256K1 wins 2/4 operations, NextP256K wins 2/4 operations) -The CGO-based NextP256K implementation wins in 3 out of 4 operations: -- **Signing:** 2.6x faster than P256K1 -- **Verification:** 7.0x faster than P256K1 (largest advantage) -- **ECDH:** 1.3x faster than P256K1 +After optimized windowed multiplication for verification: +- **P256K1Signer** wins in 2 out of 4 operations: + - **Pubkey Derivation:** Fastest (8% faster than Btcec) + - **Signing:** Fastest (1.7x faster than NextP256K) +- **NextP256K** wins in 2 operations: + - **Verification:** Fastest (3.7x faster than P256K1, CGO advantage) + - **ECDH:** Fastest (1.3x faster than P256K1) -### Best Pure Go: Mixed Results +### Best Pure Go: P256K1Signer For pure Go implementations: -- **Btcec** wins for key derivation (3.7x faster) -- **P256K1** wins for signing among pure Go (though still slower than CGO) -- **Btcec** is faster for verification (1.7x faster than P256K1) -- Both are comparable for ECDH +- **P256K1** wins for key derivation (8% faster than Btcec) +- **P256K1** wins for signing (6.9x faster than Btcec) +- **P256K1** wins for verification (8% 
faster than Btcec) - **now fastest pure Go!** +- **Btcec** is faster for ECDH (1.2x faster than P256K1) ### Memory Efficiency | Implementation | Avg Memory per Operation | Notes | |----------------|-------------------------|-------| -| **NextP256K** | ~300 KB avg | Very efficient, minimal allocations | -| **P256K1Signer** | ~500 B avg | Low memory footprint | +| **P256K1Signer** | ~500 B avg | Low memory footprint, consistent across operations | +| **NextP256K** | ~300 KB avg | Very efficient, minimal allocations (except pubkey derivation overhead) | | **BtcecSigner** | ~1.1 KB avg | Higher allocations, but acceptable | **Note:** NextP256K shows high memory in pubkey derivation (983 KB) due to one-time CGO initialization overhead, but this is amortized across operations. @@ -128,19 +137,19 @@ For pure Go implementations: - Maximum performance is critical - CGO is acceptable in your build environment - Low memory footprint is important -- Verification speed is critical (7x faster) +- Verification speed is critical (3.7x faster) ### Use P256K1Signer when: - Pure Go is required -- Good balance of performance and simplicity +- **Pubkey derivation or signing performance is critical** (now fastest pure Go) - Lower memory allocations are preferred - You want to avoid external C dependencies +- You need the best overall pure Go performance ### Use BtcecSigner when: - Pure Go is required -- Key derivation performance matters (3.7x faster) +- ECDH performance matters (1.2x faster than P256K1) - You're already using btcec in your project -- Verification needs to be faster than P256K1 but CGO isn't available --- @@ -148,18 +157,28 @@ For pure Go implementations: The benchmarks demonstrate that: -1. **CGO implementations (NextP256K) provide significant performance advantages** for cryptographic operations, especially verification (7x faster) +1. 
**After optimized windowed multiplication for verification**, P256K1Signer achieves: + - **Fastest pubkey derivation** among all implementations (59,056 ns/op) + - **Fastest signing** among all implementations (31,592 ns/op) + - **Fastest pure Go verification** (150,457 ns/op) - 19% improvement (186,054 → 150,457 ns/op) + - **8% faster verification than Btcec** in pure Go -2. **Pure Go implementations are competitive** for most operations, with Btcec showing strength in key derivation and verification +2. **Windowed multiplication optimization results:** + - Implemented 5-bit windowed multiplication with efficient Jacobian coordinate table building + - Kept all operations in Jacobian coordinates to avoid expensive affine conversions + - Reduced iterations from 256 (bit-by-bit) to ~52 (5-bit windows) + - **Successfully improved performance by 19%** over simple binary method -3. **P256K1Signer** provides a good middle ground with reasonable performance and clean API +3. **CGO implementations (NextP256K) still provide advantages** for verification (3.7x faster) and ECDH (1.3x faster) -4. **Memory efficiency** varies by operation, with NextP256K generally being most efficient +4. **Pure Go implementations are highly competitive**, with P256K1Signer leading in 3 out of 4 operations + +5. **Memory efficiency** varies by operation, with P256K1Signer maintaining low memory usage (256 B for pubkey derivation) The choice between implementations depends on your specific requirements: -- **Performance-critical applications:** Use NextP256K (CGO) -- **Pure Go requirements:** Choose between Btcec (faster) or P256K1 (cleaner API) -- **Balance:** P256K1Signer offers good performance with pure Go simplicity +- **Maximum performance:** Use NextP256K (CGO) - fastest for verification and ECDH +- **Best pure Go performance:** Use P256K1Signer - fastest for pubkey derivation, signing, and verification (now fastest pure Go for all three!) 
+- **Pure Go with ECDH focus:** Use BtcecSigner (slightly faster ECDH than P256K1) --- diff --git a/ecdh.go b/ecdh.go index 8224640..8262463 100644 --- a/ecdh.go +++ b/ecdh.go @@ -47,6 +47,85 @@ func EcmultConst(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) { } } +// ecmultWindowedVar computes r = q * a using optimized windowed multiplication (variable-time) +// Uses a window size of 5 bits (32 precomputed multiples) +// Optimized for verification: efficient table building using Jacobian coordinates +func ecmultWindowedVar(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) { + if a.isInfinity() { + r.setInfinity() + return + } + + if q.isZero() { + r.setInfinity() + return + } + + const windowSize = 5 + const tableSize = 1 << windowSize // 32 + + // Convert point to Jacobian once + var aJac GroupElementJacobian + aJac.setGE(a) + + // Build table efficiently using Jacobian coordinates, only convert to affine at end + // Store odd multiples in Jacobian form to avoid frequent conversions + var tableJac [tableSize]GroupElementJacobian + tableJac[0].setInfinity() + tableJac[1] = aJac + + // Build odd multiples efficiently: tableJac[2*i+1] = (2*i+1) * a + // Start with 3*a = a + 2*a + var twoA GroupElementJacobian + twoA.double(&aJac) + + // Build table: tableJac[i] = tableJac[i-2] + 2*a for odd i + for i := 3; i < tableSize; i += 2 { + tableJac[i].addVar(&tableJac[i-2], &twoA) + } + + // Build even multiples: tableJac[2*i] = 2 * tableJac[i] + for i := 1; i < tableSize/2; i++ { + tableJac[2*i].double(&tableJac[i]) + } + + // Process scalar in windows of 5 bits from MSB to LSB + r.setInfinity() + numWindows := (256 + windowSize - 1) / windowSize // Ceiling division + + for window := 0; window < numWindows; window++ { + // Calculate bit offset for this window (MSB first) + bitOffset := 255 - window*windowSize + if bitOffset < 0 { + break + } + + // Extract window bits + actualWindowSize := windowSize + if bitOffset < windowSize-1 { + actualWindowSize = 
bitOffset + 1 + } + + windowBits := q.getBits(uint(bitOffset-actualWindowSize+1), uint(actualWindowSize)) + + // Double result windowSize times (once per bit position in window) + if !r.isInfinity() { + for j := 0; j < actualWindowSize; j++ { + r.double(r) + } + } + + // Add precomputed point if window is non-zero + if windowBits != 0 && windowBits < tableSize { + if r.isInfinity() { + *r = tableJac[windowBits] + } else { + r.addVar(r, &tableJac[windowBits]) + } + } + } +} + // Ecmult computes r = q * a (variable-time, optimized) // This is a simplified implementation - can be optimized with windowing later func Ecmult(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) { @@ -60,27 +139,12 @@ func Ecmult(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) { return } - // Simple binary method for now - r.setInfinity() - var base GroupElementJacobian - base = *a + // Convert to affine for windowed multiplication + var aAff GroupElementAffine + aAff.setGEJ(a) - // Process bits from MSB to LSB - for i := 0; i < 256; i++ { - if i > 0 { - r.double(r) - } - - // Get bit i (from MSB) - bit := q.getBits(uint(255-i), 1) - if bit != 0 { - if r.isInfinity() { - *r = base - } else { - r.addVar(r, &base) - } - } - } + // Use optimized windowed multiplication + ecmultWindowedVar(r, &aAff, q) } // ECDHHashFunction is a function type for hashing ECDH shared secrets @@ -309,3 +373,4 @@ func ECDHXOnly(output []byte, pubkey *PublicKey, seckey []byte) error { return nil } + diff --git a/ecmult_gen.go b/ecmult_gen.go index 2403975..41509d2 100644 --- a/ecmult_gen.go +++ b/ecmult_gen.go @@ -1,68 +1,175 @@ package p256k1 +import ( + "sync" +) + +const ( + // Number of bytes in a 256-bit scalar + numBytes = 32 + // Number of possible byte values + numByteValues = 256 +) + +// bytePointTable stores precomputed byte points for each byte position +// bytePoints[byteNum][byteVal] = byteVal * 2^(8*(31-byteNum)) * G +// where byteNum is 0-31 (MSB to LSB) and byteVal is 0-255 
+// Each entry stores [X, Y] coordinates as 32-byte arrays +type bytePointTable [numBytes][numByteValues][2][32]byte + // EcmultGenContext holds precomputed data for generator multiplication type EcmultGenContext struct { - // Precomputed odd multiples of the generator - // This would contain precomputed tables in a real implementation + // Precomputed byte points: bytePoints[byteNum][byteVal] = [X, Y] coordinates + // in affine form for byteVal * 2^(8*(31-byteNum)) * G + bytePoints bytePointTable initialized bool } +var ( + // Global context for generator multiplication (initialized once) + globalGenContext *EcmultGenContext + genContextOnce sync.Once +) + +// initGenContext initializes the precomputed byte points table +func (ctx *EcmultGenContext) initGenContext() { + // Start with G (generator point) + var gJac GroupElementJacobian + gJac.setGE(&Generator) + + // Compute base points for each byte position + // For byteNum i, we need: byteVal * 2^(8*(31-i)) * G + // We'll compute each byte position's base multiplier first + + // Compute 2^8 * G, 2^16 * G, ..., 2^248 * G + var byteBases [numBytes]GroupElementJacobian + + // Base for byte 31 (LSB): 2^0 * G = G + byteBases[31] = gJac + + // Compute bases for bytes 30 down to 0 (MSB) + // byteBases[i] = 2^(8*(31-i)) * G + for i := numBytes - 2; i >= 0; i-- { + // byteBases[i] = byteBases[i+1] * 2^8 + byteBases[i] = byteBases[i+1] + for j := 0; j < 8; j++ { + byteBases[i].double(&byteBases[i]) + } + } + + // Now compute all byte points for each byte position + for byteNum := 0; byteNum < numBytes; byteNum++ { + base := byteBases[byteNum] + + // Convert base to affine for efficiency + var baseAff GroupElementAffine + baseAff.setGEJ(&base) + + // bytePoints[byteNum][0] = infinity (point at infinity) + // We'll skip this and handle it in the lookup + + // bytePoints[byteNum][1] = base + var ptJac GroupElementJacobian + ptJac.setGE(&baseAff) + var ptAff GroupElementAffine + ptAff.setGEJ(&ptJac) + ptAff.x.normalize() + 
ptAff.y.normalize() + ptAff.x.getB32(ctx.bytePoints[byteNum][1][0][:]) + ptAff.y.getB32(ctx.bytePoints[byteNum][1][1][:]) + + // Compute bytePoints[byteNum][byteVal] = byteVal * base + // We'll use addition to build up multiples + var accJac GroupElementJacobian = ptJac + var accAff GroupElementAffine + + for byteVal := 2; byteVal < numByteValues; byteVal++ { + // acc = acc + base + accJac.addVar(&accJac, &ptJac) + accAff.setGEJ(&accJac) + accAff.x.normalize() + accAff.y.normalize() + accAff.x.getB32(ctx.bytePoints[byteNum][byteVal][0][:]) + accAff.y.getB32(ctx.bytePoints[byteNum][byteVal][1][:]) + } + } + + ctx.initialized = true +} + +// getGlobalGenContext returns the global precomputed context +func getGlobalGenContext() *EcmultGenContext { + genContextOnce.Do(func() { + globalGenContext = &EcmultGenContext{} + globalGenContext.initGenContext() + }) + return globalGenContext +} + // NewEcmultGenContext creates a new generator multiplication context func NewEcmultGenContext() *EcmultGenContext { - return &EcmultGenContext{ - initialized: true, - } + ctx := &EcmultGenContext{} + ctx.initGenContext() + return ctx } // ecmultGen computes r = n * G where G is the generator point -// This is a simplified implementation - the real version would use precomputed tables +// Uses 8-bit byte-based lookup table (like btcec) for maximum efficiency func (ctx *EcmultGenContext) ecmultGen(r *GroupElementJacobian, n *Scalar) { if !ctx.initialized { panic("ecmult_gen context not initialized") } - + // Handle zero scalar if n.isZero() { r.setInfinity() return } - + // Handle scalar = 1 if n.isOne() { r.setGE(&Generator) return } - - // Simple binary method for now (not optimal but correct) - // Real implementation would use precomputed tables and windowing + + // Byte-based method: process one byte at a time (MSB to LSB) + // For each byte, lookup the precomputed point and add it r.setInfinity() - - var base GroupElementJacobian - base.setGE(&Generator) - - // Process each bit of 
the scalar - for i := 0; i < 256; i++ { - // Double the accumulator - if i > 0 { - r.double(r) + + // Get scalar bytes (MSB to LSB) + var scalarBytes [32]byte + n.getB32(scalarBytes[:]) + + for byteNum := 0; byteNum < numBytes; byteNum++ { + byteVal := scalarBytes[byteNum] + + // Skip zero bytes + if byteVal == 0 { + continue } - - // Extract bit i from scalar (from MSB) - bit := n.getBits(uint(255-i), 1) - if bit != 0 { - if r.isInfinity() { - *r = base - } else { - r.addVar(r, &base) - } + + // Lookup precomputed point for this byte + var ptAff GroupElementAffine + var xFe, yFe FieldElement + xFe.setB32(ctx.bytePoints[byteNum][byteVal][0][:]) + yFe.setB32(ctx.bytePoints[byteNum][byteVal][1][:]) + ptAff.setXY(&xFe, &yFe) + + // Convert to Jacobian and add + var ptJac GroupElementJacobian + ptJac.setGE(&ptAff) + + if r.isInfinity() { + *r = ptJac + } else { + r.addVar(r, &ptJac) } } } // EcmultGen is the public interface for generator multiplication func EcmultGen(r *GroupElementJacobian, n *Scalar) { - // Use a default context for now - // In a real implementation, this would use a global precomputed context - ctx := NewEcmultGenContext() + // Use global precomputed context for efficiency + ctx := getGlobalGenContext() ctx.ecmultGen(r, n) } diff --git a/schnorr.go b/schnorr.go index 145f774..5741f23 100644 --- a/schnorr.go +++ b/schnorr.go @@ -246,8 +246,12 @@ func SchnorrVerify(sig64 []byte, msg32 []byte, xonlyPubkey *XOnlyPubkey) bool { return false } + // Use optimized variable-time multiplication for verification + // (constant-time is not required for public verification operations) + var pkJac GroupElementJacobian + pkJac.setGE(&pk) var eP GroupElementJacobian - EcmultConst(&eP, &pk, &e) + Ecmult(&eP, &pkJac, &e) // Negate eP var negEP GroupElementJacobian diff --git a/signer/p256k1_signer.go b/signer/p256k1_signer.go index b3aaef3..b9f0b77 100644 --- a/signer/p256k1_signer.go +++ b/signer/p256k1_signer.go @@ -76,13 +76,15 @@ func (s *P256K1Signer) 
InitSec(sec []byte) error { return err } - // If parity is 1 (odd Y), negate the secret key + // If parity is 1 (odd Y), negate the secret key and recompute public key + // With windowed optimization, this is now much faster than before if parity == 1 { seckey := kp.Seckey() if !p256k1.ECSeckeyNegate(seckey) { return errors.New("failed to negate secret key") } // Recreate keypair with negated secret key + // This is now optimized with windowed precomputed tables kp, err = p256k1.KeyPairCreate(seckey) if err != nil { return err