Enhance secp256k1 ECDH and scalar operations with optimized windowed multiplication and GLV endomorphism

This commit introduces several optimizations for elliptic curve operations in the secp256k1 library. Key changes include the `ecmultStraussGLV` function, which performs scalar multiplication with a windowed wNAF double-and-add loop and is intended to host the Strauss algorithm with the GLV endomorphism (the endomorphism split is not applied by this function yet), together with the scalar helpers that support it: `wNAF`, `scalarMulShiftVar`, `splitLambda`, and the GLV constants. Windowed multiplication techniques are added to improve performance. Additionally, the benchmark tests have been reduced to the P256K1Signer implementation; the BtcecSigner and NextP256K comparison benchmarks are removed, which simplifies the benchmark file and makes its results easier to read.
2025-11-03 10:54:17 +00:00
parent c8efe6693c
commit e8649cae7b
4 changed files with 282 additions and 210 deletions
--- a/bench/comparison_bench_test.go
+++ b/bench/comparison_bench_test.go
@@ -1,5 +1,5 @@
-//go:build cgo
-// +build cgo
+//go:build !nocgo
+// +build !nocgo

 package bench

@@ -7,27 +7,18 @@ import (
 	"crypto/rand"
 	"testing"

-	p256knext "next.orly.dev/pkg/crypto/p256k"
 	"p256k1.mleku.dev/signer"
 )

-// This file contains benchmarks comparing the three signer implementations:
-// 1. P256K1Signer (this package's new port from Bitcoin Core secp256k1)
-// 2. BtcecSigner (pure Go btcec wrapper)
-// 3. NextP256K Signer (CGO version using next.orly.dev/pkg/crypto/p256k)
+// This file contains benchmarks for the P256K1Signer implementation
+// (pure Go port from Bitcoin Core secp256k1)

 var (
 	benchSeckey   []byte
 	benchMsghash  []byte
 	compBenchSignerP256K1  *signer.P256K1Signer
-	compBenchSignerBtcec   *signer.BtcecSigner
-	compBenchSignerNext    *p256knext.Signer
 	compBenchSignerP256K12 *signer.P256K1Signer
-	compBenchSignerBtcec2  *signer.BtcecSigner
-	compBenchSignerNext2   *p256knext.Signer
 	compBenchSigP256K1     []byte
-	compBenchSigBtcec      []byte
-	compBenchSigNext       []byte
 )

 func initComparisonBenchData() {
@@ -72,30 +63,6 @@ func initComparisonBenchData() {
 		panic(err)
 	}

-	// Setup BtcecSigner (pure Go)
-	signer2 := signer.NewBtcecSigner()
-	if err := signer2.InitSec(benchSeckey); err != nil {
-		panic(err)
-	}
-	compBenchSignerBtcec = signer2
-
-	compBenchSigBtcec, err = signer2.Sign(benchMsghash)
-	if err != nil {
-		panic(err)
-	}
-
-	// Setup NextP256K Signer (CGO version)
-	signer3 := &p256knext.Signer{}
-	if err := signer3.InitSec(benchSeckey); err != nil {
-		panic(err)
-	}
-	compBenchSignerNext = signer3
-
-	compBenchSigNext, err = signer3.Sign(benchMsghash)
-	if err != nil {
-		panic(err)
-	}
-
 	// Generate second key pair for ECDH
 	seckey2 := make([]byte, 32)
 	for {
@@ -115,24 +82,10 @@ func initComparisonBenchData() {
 		panic(err)
 	}
 	compBenchSignerP256K12 = signer12
-
-	// BtcecSigner second key pair
-	signer22 := signer.NewBtcecSigner()
-	if err := signer22.InitSec(seckey2); err != nil {
-		panic(err)
-	}
-	compBenchSignerBtcec2 = signer22
-
-	// NextP256K Signer second key pair
-	signer32 := &p256knext.Signer{}
-	if err := signer32.InitSec(seckey2); err != nil {
-		panic(err)
-	}
-	compBenchSignerNext2 = signer32
 }

-// BenchmarkPubkeyDerivation compares public key derivation from private key
-func BenchmarkPubkeyDerivation_P256K1(b *testing.B) {
+// BenchmarkPubkeyDerivation benchmarks public key derivation from private key
+func BenchmarkPubkeyDerivation(b *testing.B) {
 	if benchSeckey == nil {
 		initComparisonBenchData()
 	}
@@ -147,38 +100,9 @@ func BenchmarkPubkeyDerivation_P256K1(b *testing.B) {
 	}
 }

-func BenchmarkPubkeyDerivation_Btcec(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}

-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		s := signer.NewBtcecSigner()
-		if err := s.InitSec(benchSeckey); err != nil {
-			b.Fatalf("failed to create signer: %v", err)
-		}
-		_ = s.Pub()
-	}
-}
-
-func BenchmarkPubkeyDerivation_NextP256K(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		s := &p256knext.Signer{}
-		if err := s.InitSec(benchSeckey); err != nil {
-			b.Fatalf("failed to create signer: %v", err)
-		}
-		_ = s.Pub()
-	}
-}
-
-// BenchmarkSign compares Schnorr signing
-func BenchmarkSign_P256K1(b *testing.B) {
+// BenchmarkSign benchmarks Schnorr signing
+func BenchmarkSign(b *testing.B) {
 	if benchSeckey == nil {
 		initComparisonBenchData()
 	}
@@ -195,42 +119,9 @@ func BenchmarkSign_P256K1(b *testing.B) {
 	}
 }

-func BenchmarkSign_Btcec(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}

-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		if compBenchSignerBtcec == nil {
-			initComparisonBenchData()
-		}
-		_, err := compBenchSignerBtcec.Sign(benchMsghash)
-		if err != nil {
-			b.Fatalf("failed to sign: %v", err)
-		}
-	}
-}
-
-func BenchmarkSign_NextP256K(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		if compBenchSignerNext == nil {
-			initComparisonBenchData()
-		}
-		_, err := compBenchSignerNext.Sign(benchMsghash)
-		if err != nil {
-			b.Fatalf("failed to sign: %v", err)
-		}
-	}
-}
-
-// BenchmarkVerify compares Schnorr verification
-func BenchmarkVerify_P256K1(b *testing.B) {
+// BenchmarkVerify benchmarks Schnorr verification
+func BenchmarkVerify(b *testing.B) {
 	if benchSeckey == nil {
 		initComparisonBenchData()
 	}
@@ -255,58 +146,9 @@ func BenchmarkVerify_P256K1(b *testing.B) {
 	}
 }

-func BenchmarkVerify_Btcec(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}

-	if compBenchSignerBtcec == nil || compBenchSigBtcec == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		verifier := signer.NewBtcecSigner()
-		if err := verifier.InitPub(compBenchSignerBtcec.Pub()); err != nil {
-			b.Fatalf("failed to create verifier: %v", err)
-		}
-		valid, err := verifier.Verify(benchMsghash, compBenchSigBtcec)
-		if err != nil {
-			b.Fatalf("verification error: %v", err)
-		}
-		if !valid {
-			b.Fatalf("verification failed")
-		}
-	}
-}
-
-func BenchmarkVerify_NextP256K(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}
-
-	if compBenchSignerNext == nil || compBenchSigNext == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		verifier := &p256knext.Signer{}
-		if err := verifier.InitPub(compBenchSignerNext.Pub()); err != nil {
-			b.Fatalf("failed to create verifier: %v", err)
-		}
-		valid, err := verifier.Verify(benchMsghash, compBenchSigNext)
-		if err != nil {
-			b.Fatalf("verification error: %v", err)
-		}
-		if !valid {
-			b.Fatalf("verification failed")
-		}
-	}
-}
-
-// BenchmarkECDH compares ECDH shared secret generation
-func BenchmarkECDH_P256K1(b *testing.B) {
+// BenchmarkECDH benchmarks ECDH shared secret generation
+func BenchmarkECDH(b *testing.B) {
 	if benchSeckey == nil {
 		initComparisonBenchData()
 	}
@@ -323,37 +165,4 @@ func BenchmarkECDH_P256K1(b *testing.B) {
 	}
 }

-func BenchmarkECDH_Btcec(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		if compBenchSignerBtcec == nil || compBenchSignerBtcec2 == nil {
-			initComparisonBenchData()
-		}
-		_, err := compBenchSignerBtcec.ECDH(compBenchSignerBtcec2.Pub())
-		if err != nil {
-			b.Fatalf("ECDH failed: %v", err)
-		}
-	}
-}
-
-func BenchmarkECDH_NextP256K(b *testing.B) {
-	if benchSeckey == nil {
-		initComparisonBenchData()
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		if compBenchSignerNext == nil || compBenchSignerNext2 == nil {
-			initComparisonBenchData()
-		}
-		_, err := compBenchSignerNext.ECDH(compBenchSignerNext2.Pub())
-		if err != nil {
-			b.Fatalf("ECDH failed: %v", err)
-		}
-	}
-}

--- a/ecdh.go
+++ b/ecdh.go
@@ -2,9 +2,16 @@ package p256k1

 import (
 	"errors"
+	"fmt"
 	"unsafe"
 )

+const (
+	// Window sizes (in bits) for windowed (wNAF) elliptic curve multiplication
+	windowA = 5  // wNAF width for an arbitrary point; its table of odd multiples has 1<<(windowA-1) entries
+	windowG = 14 // Window size for generator (G); NOTE(review): not referenced by the code visible in this change
+)
+
 // EcmultConst computes r = q * a using constant-time multiplication
 // Uses simple binary method
 func EcmultConst(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
@@ -125,27 +132,107 @@ func ecmultWindowedVar(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar
 	}
 }

-// Ecmult computes r = q * a (variable-time, optimized)
-// This is a simplified implementation - can be optimized with windowing later
+// Ecmult computes r = q * a using optimized windowed multiplication
+// This provides good performance for verification and ECDH operations
 func Ecmult(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) {
 	if a.isInfinity() {
 		r.setInfinity()
 		return
 	}
-	
+
 	if q.isZero() {
 		r.setInfinity()
 		return
 	}
-	
+
 	// Convert to affine for windowed multiplication
 	var aAff GroupElementAffine
 	aAff.setGEJ(a)
-	
+
 	// Use optimized windowed multiplication
 	ecmultWindowedVar(r, &aAff, q)
 }

+// ecmultStraussGLV computes r = q * a with a windowed (wNAF, width windowA)
+// double-and-add loop over a table of precomputed multiples of a.
+// The GLV endomorphism split is not applied yet, despite the function's name.
+func ecmultStraussGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
+	// Multiplying the point at infinity, or multiplying by zero, gives infinity.
+	if a.isInfinity() || q.isZero() {
+		r.setInfinity()
+		return
+	}
+
+	// Lift the affine base point to Jacobian coordinates.
+	var base GroupElementJacobian
+	base.setGE(a)
+
+	// Table of multiples of the base point, indexed by (|digit|-1)/2.
+	var table [1 << (windowA - 1)]GroupElementJacobian
+	buildOddMultiples(&table, &base, windowA)
+
+	// Signed-digit (wNAF) expansion of the scalar; count is how many digits to scan.
+	var digits [257]int
+	count := q.wNAF(digits[:], windowA)
+
+	// Scan the digits from the most significant position down.
+	r.setInfinity()
+	for i := count - 1; i >= 0; i-- {
+		// One doubling per digit position.
+		r.double(r)
+
+		d := digits[i]
+		if d == 0 {
+			continue
+		}
+
+		// Look up the table entry for |d|; a negative digit uses its negation.
+		var term GroupElementJacobian
+		if d > 0 {
+			idx := (d - 1) / 2
+			if idx >= len(table) {
+				panic(fmt.Sprintf("wNAF positive index out of bounds: n=%d, idx=%d, len=%d", d, idx, len(table)))
+			}
+			term = table[idx]
+		} else {
+			idx := (-d - 1) / 2
+			if idx >= len(table) {
+				panic("wNAF index out of bounds (negative)")
+			}
+			// Negate the y coordinate of the copied table entry.
+			term = table[idx]
+			term.y.negate(&term.y, 1)
+		}
+
+		r.addVar(r, &term)
+	}
+}
+
+// buildOddMultiples fills pre with the odd multiples of a:
+// pre[i] = (2*i+1) * a for i = 0 to (1<<(w-1))-1; w must not exceed windowA, which sizes the table.
+func buildOddMultiples(pre *[1 << (windowA - 1)]GroupElementJacobian, a *GroupElementJacobian, w uint) {
+	tableSize := 1 << (w - 1)
+
+	// pre[0] = a (which is 1*a)
+	pre[0] = *a
+
+	if tableSize > 1 {
+		// Compute 2*a
+		var twoA GroupElementJacobian
+		twoA.double(a)
+
+		// Consecutive odd multiples differ by 2*a, so every slot is pre[i-1] + 2*a.
+		for i := 1; i < tableSize; i++ {
+			pre[i].addVar(&pre[i-1], &twoA)
+		}
+	}
+}
+
+// EcmultStraussGLV is the exported wrapper around ecmultStraussGLV (r = q * a); the wrapped routine does not apply the GLV split yet.
+func EcmultStraussGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
+	ecmultStraussGLV(r, a, q)
+}
+
 // ECDHHashFunction is a function type for hashing ECDH shared secrets
 type ECDHHashFunction func(output []byte, x32 []byte, y32 []byte) bool

@@ -203,7 +290,7 @@ func ECDH(output []byte, pubkey *PublicKey, seckey []byte, hashfp ECDHHashFuncti
 	if s.isZero() {
 		return errors.New("secret key cannot be zero")
 	}
-	
+
 	// Compute res = s * pt using optimized windowed multiplication (variable-time)
 	// ECDH doesn't require constant-time since the secret key is already known
 	var res GroupElementJacobian
--- a/field.go
+++ b/field.go
@@ -58,9 +58,9 @@ var (
 		magnitude:  0,
 		normalized: true,
 	}
+
 )

-// NewFieldElement creates a new field element
 func NewFieldElement() *FieldElement {
 	return &FieldElement{
 		n:          [5]uint64{0, 0, 0, 0, 0},
--- a/scalar.go
+++ b/scalar.go
@@ -39,6 +39,41 @@ var (

 	// ScalarOne represents the scalar 1
 	ScalarOne = Scalar{d: [4]uint64{1, 0, 0, 0}}
+
+	// GLV (Gallant-Lambert-Vanstone) endomorphism constants
+	// lambda is a primitive cube root of unity modulo n (the curve order)
+	secp256k1Lambda = Scalar{d: [4]uint64{
+		0xDF02967C1B23BD72, 0x122E22EA20816678,
+		0xA5261C028812645A, 0x5363AD4CC05C30E0,
+	}}
+
+	// Note: beta is defined in field.go as a FieldElement constant
+
+	// GLV basis vectors and constants for scalar splitting.
+	// Limbs are stored least-significant first (d[0] is the low word, as in ScalarOne).
+	// minusB1 = -b1 mod n and minusB2 = -b2 mod n for the GLV lattice basis vectors.
+	minusB1 = Scalar{d: [4]uint64{
+		0x6F547FA90ABFE4C3, 0xE4437ED6010E8828,
+		0x0000000000000000, 0x0000000000000000,
+	}}
+
+	minusB2 = Scalar{d: [4]uint64{
+		0xD765CDA83DB1562C, 0x8A280AC50774346D,
+		0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF,
+	}}
+
+	// Precomputed estimates for GLV scalar splitting:
+	// g1 = round(2^384 * b2 / n) and g2 = round(2^384 * (-b1) / n),
+	// where n is the curve order
+	g1 = Scalar{d: [4]uint64{
+		0xE893209A45DBB031, 0x3DAA8A1471E8CA7F,
+		0xE86C90E49284EB15, 0x3086D221A7D46BCD,
+	}}
+
+	g2 = Scalar{d: [4]uint64{
+		0x1571B4AE8AC47F71, 0x221208AC9DF506C6,
+		0x6F547FA90ABFE4C4, 0xE4437ED6010E8828,
+	}}
 )

 // setInt sets a scalar to a small integer value
@@ -789,3 +824,144 @@ func scalarReduce512(r *Scalar, l []uint64) {
 	}
 }

+// wNAF converts a scalar to Windowed Non-Adjacent Form representation
+// wNAF represents the scalar using digits in the range [-(2^(w-1)-1), 2^(w-1)-1]
+// with the property that non-zero digits are separated by at least w-1 zeros.
+// A scalar with bit 255 set is negated first and every digit's sign is flipped,
+// so the digits always encode s itself rather than its negation.
+//
+// Returns the number of digit positions to scan (at most 256) and fills the wnaf slice with the digits.
+// The wnaf slice must have at least 257 elements.
+func (s *Scalar) wNAF(wnaf []int, w uint) int {
+	if w < 2 || w > 31 {
+		panic("w must be between 2 and 31")
+	}
+	if len(wnaf) < 257 {
+		panic("wnaf slice must have at least 257 elements")
+	}
+
+	// Work on a copy; a value with the top bit set is handled as the negation of -s.
+	k := *s
+	sign := 1
+	if k.getBits(255, 1) == 1 {
+		k.negate(&k)
+		sign = -1
+	}
+
+	bits := 0
+	var carry uint32
+
+	for bit := 0; bit < 257; bit++ {
+		wnaf[bit] = 0
+	}
+
+	bit := 0
+	for bit < 256 {
+		if k.getBits(uint(bit), 1) == carry {
+			bit++
+			continue
+		}
+
+		window := w
+		if bit+int(window) > 256 {
+			window = uint(256 - bit)
+		}
+
+		// Signed 64-bit arithmetic: the digit goes negative when bit w-1 of the window is set.
+		word := int64(uint32(k.getBits(uint(bit), window)) + carry)
+		// Use the full width w (not the truncated window) so a short final window never borrows past bit 255.
+		carry = uint32(word>>(w-1)) & 1
+		word -= int64(carry) << w
+
+		wnaf[bit] = sign * int(word)
+		bits = bit + int(window) - 1
+
+		bit += int(window)
+	}
+
+	return bits + 1
+}
+
+// scalarMulShiftVar computes r = round(a * b / 2^shift) using variable-time arithmetic
+// This is used for the GLV scalar splitting algorithm
+// Only the low 256 bits of the shifted product are kept in r
+// (the callers in this file use shift = 384).
+func scalarMulShiftVar(r *Scalar, a *Scalar, b *Scalar, shift uint) {
+	if shift > 512 {
+		panic("shift too large")
+	}
+
+	var l [8]uint64
+	scalarMul512(l[:], a, b)
+
+	// Rounding bit: the highest bit shifted out, i.e. bit (shift-1) of l.
+	carry := uint64(0)
+	if shift > 0 {
+		carry = (l[(shift-1)>>6] >> ((shift - 1) & 63)) & 1
+	}
+
+	// Split the shift into whole 64-bit limbs and a remaining bit count.
+	limbShift := int(shift >> 6)
+	bitShift := shift & 63
+
+	for i := 0; i < 4; i++ {
+		src := limbShift + i
+		if src >= 8 {
+			r.d[i] = 0
+			continue
+		}
+
+		// The low part comes from l[src]; the bits shifted in at the top come from l[src+1].
+		val := l[src] >> bitShift
+		if bitShift > 0 && src+1 < 8 {
+			val |= l[src+1] << (64 - bitShift)
+		}
+		r.d[i] = val
+	}
+
+	// Add the rounding bit, propagating the carry into higher limbs.
+	for i := 0; i < 4 && carry != 0; i++ {
+		r.d[i] += carry
+		// A limb that did not wrap around to zero absorbs the carry.
+		if r.d[i] != 0 {
+			carry = 0
+		}
+	}
+
+	// Ensure result is reduced
+	scalarReduce(r, 0)
+}
+
+// splitLambda decomposes k into r1 and r2 such that r1 + lambda*r2 = k mod n,
+// where lambda is the secp256k1 endomorphism constant (secp256k1Lambda).
+// This is used for GLV (Gallant-Lambert-Vanstone) optimization.
+//
+// Two rounded quotients of k are combined into r2, and r1 then follows from
+// r1 = k - lambda*r2. Both parts are intended to be roughly 128 bits in size.
+// g1, g2, minusB1 and minusB2 are the package-level splitting constants.
+//
+// Nothing is returned: the results are stored in the receiver r1 and in r2.
+func (r1 *Scalar) splitLambda(r2 *Scalar, k *Scalar) {
+	// First half of r2: round(k * g1 / 2^384) * (-b1)
+	var quot1, part1 Scalar
+	scalarMulShiftVar(&quot1, k, &g1, 384)
+	scalarMul(&part1, &quot1, &minusB1)
+
+	// Second half of r2: round(k * g2 / 2^384) * (-b2)
+	var quot2, part2 Scalar
+	scalarMulShiftVar(&quot2, k, &g2, 384)
+	scalarMul(&part2, &quot2, &minusB2)
+
+	// r2 is the sum of the two halves
+	scalarAdd(r2, &part1, &part2)
+
+	// r1 = k - r2*lambda, formed as -(r2*lambda) + k
+	scalarMul(r1, r2, &secp256k1Lambda)
+	r1.negate(r1)
+	scalarAdd(r1, r1, k)
+
+	// Ensure the result is properly reduced
+	scalarReduce(r1, 0)
+	scalarReduce(r2, 0)
+}
+