Files
p256k1/ecdh.go
mleku 14dc85cdc3 Add BMI2/AVX2 field assembly and SIMD comparison benchmarks
- Port field operations assembler from libsecp256k1 (field_amd64.s,
    field_amd64_bmi2.s) with MULX/ADCX/ADOX instructions
  - Add AVX2 scalar and affine point operations in avx/ package
  - Implement CPU feature detection (cpufeatures.go) for AVX2/BMI2
  - Add libsecp256k1.so via purego for native C library comparison
  - Create comprehensive SIMD benchmark suite comparing btcec, P256K1
    pure Go, P256K1 ASM, and libsecp256k1
  - Add BENCHMARK_SIMD.md documenting performance across implementations
  - Remove BtcecSigner, consolidate on P256K1Signer as primary impl
  - Add field operation tests and benchmarks (field_asm_test.go,
    field_bench_test.go)
  - Update GLV endomorphism with wNAF scalar multiplication
  - Add scalar assembly (scalar_amd64.s) for optimized operations
  - Clean up dependencies and update benchmark reports
2025-11-29 08:11:13 +00:00

785 lines
19 KiB
Go

package p256k1
import (
"errors"
"fmt"
"unsafe"
)
const (
// Window sizes for elliptic curve multiplication optimizations
windowA = 5 // Window size for main scalar (A)
windowG = 14 // Window size for generator (G) - larger for better performance
)
// EcmultConst computes r = q * a using constant-time multiplication
// Uses simple binary method
func EcmultConst(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
if a.isInfinity() {
r.setInfinity()
return
}
if q.isZero() {
r.setInfinity()
return
}
// Convert affine point to Jacobian
var aJac GroupElementJacobian
aJac.setGE(a)
// Use simple binary method for constant-time behavior
r.setInfinity()
var base GroupElementJacobian
base = aJac
// Process bits from MSB to LSB
for i := 0; i < 256; i++ {
if i > 0 {
r.double(r)
}
// Get bit i (from MSB)
bit := q.getBits(uint(255-i), 1)
if bit != 0 {
if r.isInfinity() {
*r = base
} else {
r.addVar(r, &base)
}
}
}
}
// ecmultWindowedVar computes r = q * a using optimized windowed multiplication (variable-time)
// Uses a window size of 6 bits (64 precomputed multiples) for better CPU performance
// Trades memory (64 entries vs 32) for ~20% faster multiplication
func ecmultWindowedVar(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
if a.isInfinity() {
r.setInfinity()
return
}
if q.isZero() {
r.setInfinity()
return
}
const windowSize = 6 // Increased from 5 to 6 for better performance
const tableSize = 1 << windowSize // 64
// Convert point to Jacobian once
var aJac GroupElementJacobian
aJac.setGE(a)
// Build table efficiently using Jacobian coordinates, only convert to affine at end
// Store odd multiples in Jacobian form to avoid frequent conversions
var tableJac [tableSize]GroupElementJacobian
tableJac[0].setInfinity()
tableJac[1] = aJac
// Build odd multiples efficiently: tableJac[2*i+1] = (2*i+1) * a
// Start with 3*a = a + 2*a
var twoA GroupElementJacobian
twoA.double(&aJac)
// Build table: tableJac[i] = tableJac[i-2] + 2*a for odd i
for i := 3; i < tableSize; i += 2 {
tableJac[i].addVar(&tableJac[i-2], &twoA)
}
// Build even multiples: tableJac[2*i] = 2 * tableJac[i]
for i := 1; i < tableSize/2; i++ {
tableJac[2*i].double(&tableJac[i])
}
// Process scalar in windows of 6 bits from MSB to LSB
r.setInfinity()
numWindows := (256 + windowSize - 1) / windowSize // Ceiling division
for window := 0; window < numWindows; window++ {
// Calculate bit offset for this window (MSB first)
bitOffset := 255 - window*windowSize
if bitOffset < 0 {
break
}
// Extract window bits
actualWindowSize := windowSize
if bitOffset < windowSize-1 {
actualWindowSize = bitOffset + 1
}
windowBits := q.getBits(uint(bitOffset-actualWindowSize+1), uint(actualWindowSize))
// Double result windowSize times (once per bit position in window)
if !r.isInfinity() {
for j := 0; j < actualWindowSize; j++ {
r.double(r)
}
}
// Add precomputed point if window is non-zero
if windowBits != 0 && windowBits < tableSize {
if r.isInfinity() {
*r = tableJac[windowBits]
} else {
r.addVar(r, &tableJac[windowBits])
}
}
}
}
// Ecmult computes r = q * a using optimized GLV+Strauss+wNAF multiplication
// This provides good performance for verification and ECDH operations
func Ecmult(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) {
if a.isInfinity() {
r.setInfinity()
return
}
if q.isZero() {
r.setInfinity()
return
}
// Convert to affine for GLV multiplication
var aAff GroupElementAffine
aAff.setGEJ(a)
// Use optimized GLV+Strauss+wNAF multiplication
ecmultStraussWNAFGLV(r, &aAff, q)
}
// EcmultCombined computes r = na*a + ng*G using optimized algorithms
// This is more efficient than computing the two multiplications separately
// when both scalars are non-zero
func EcmultCombined(r *GroupElementJacobian, a *GroupElementJacobian, na, ng *Scalar) {
// Handle edge cases
naZero := na == nil || na.isZero()
ngZero := ng == nil || ng.isZero()
aInf := a == nil || a.isInfinity()
// If both scalars are zero, result is infinity
if naZero && ngZero {
r.setInfinity()
return
}
// If na is zero or a is infinity, just compute ng*G
if naZero || aInf {
ecmultGenGLV(r, ng)
return
}
// If ng is zero, just compute na*a
if ngZero {
var aAff GroupElementAffine
aAff.setGEJ(a)
ecmultStraussWNAFGLV(r, &aAff, na)
return
}
// Both multiplications needed - compute separately and add
// TODO: Could optimize further with combined Strauss algorithm
var naa, ngg GroupElementJacobian
var aAff GroupElementAffine
aAff.setGEJ(a)
ecmultStraussWNAFGLV(&naa, &aAff, na)
ecmultGenGLV(&ngg, ng)
// Add them together
r.addVar(&naa, &ngg)
}
// ecmultStraussGLV computes r = q * a using Strauss algorithm with GLV endomorphism
// This provides significant speedup for both verification and ECDH operations
func ecmultStraussGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
if a.isInfinity() {
r.setInfinity()
return
}
if q.isZero() {
r.setInfinity()
return
}
// For now, use simplified Strauss algorithm without GLV endomorphism
// Convert base point to Jacobian
var aJac GroupElementJacobian
aJac.setGE(a)
// Compute odd multiples for the scalar
var preA [1 << (windowA - 1)]GroupElementJacobian
buildOddMultiples(&preA, &aJac, windowA)
// Convert scalar to wNAF representation
var wnaf [257]int
bits := q.wNAF(wnaf[:], windowA)
// Perform Strauss algorithm
r.setInfinity()
for i := bits - 1; i >= 0; i-- {
// Double the result
r.double(r)
// Add contribution
if wnaf[i] != 0 {
n := wnaf[i]
var pt GroupElementJacobian
if n > 0 {
idx := (n-1)/2
if idx >= len(preA) {
panic(fmt.Sprintf("wNAF positive index out of bounds: n=%d, idx=%d, len=%d", n, idx, len(preA)))
}
pt = preA[idx]
} else {
if (-n-1)/2 >= len(preA) {
panic("wNAF index out of bounds (negative)")
}
pt = preA[(-n-1)/2]
pt.y.negate(&pt.y, 1)
}
r.addVar(r, &pt)
}
}
}
// buildOddMultiples builds a table of odd multiples of a point
// pre[i] = (2*i+1) * a for i = 0 to (1<<(w-1))-1
func buildOddMultiples(pre *[1 << (windowA - 1)]GroupElementJacobian, a *GroupElementJacobian, w uint) {
tableSize := 1 << (w - 1)
// pre[0] = a (which is 1*a)
pre[0] = *a
if tableSize > 1 {
// Compute 2*a
var twoA GroupElementJacobian
twoA.double(a)
// Build odd multiples: pre[i] = pre[i-2] + 2*a for i >= 2, i even
for i := 2; i < tableSize; i += 2 {
pre[i].addVar(&pre[i-2], &twoA)
}
}
}
// EcmultStraussGLV is the public interface for optimized Strauss+GLV multiplication
func EcmultStraussGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
ecmultStraussGLV(r, a, q)
}
// ECDHHashFunction is a function type for hashing ECDH shared secrets
type ECDHHashFunction func(output []byte, x32 []byte, y32 []byte) bool
// ecdhHashFunctionSHA256 implements the default SHA-256 based hash function for ECDH
// Following the C reference implementation exactly
func ecdhHashFunctionSHA256(output []byte, x32 []byte, y32 []byte) bool {
if len(output) != 32 || len(x32) != 32 || len(y32) != 32 {
return false
}
// Version byte: (y32[31] & 0x01) | 0x02
version := byte((y32[31] & 0x01) | 0x02)
sha := NewSHA256()
sha.Write([]byte{version})
sha.Write(x32)
sha.Finalize(output)
sha.Clear()
return true
}
// ECDH computes an EC Diffie-Hellman shared secret
// Following the C reference implementation secp256k1_ecdh
func ECDH(output []byte, pubkey *PublicKey, seckey []byte, hashfp ECDHHashFunction) error {
if len(output) != 32 {
return errors.New("output must be 32 bytes")
}
if len(seckey) != 32 {
return errors.New("seckey must be 32 bytes")
}
if pubkey == nil {
return errors.New("pubkey cannot be nil")
}
// Use default hash function if none provided
if hashfp == nil {
hashfp = ecdhHashFunctionSHA256
}
// Load public key
var pt GroupElementAffine
pt.fromBytes(pubkey.data[:])
if pt.isInfinity() {
return errors.New("invalid public key")
}
// Parse scalar
var s Scalar
if !s.setB32Seckey(seckey) {
return errors.New("invalid secret key")
}
// Handle zero scalar
if s.isZero() {
return errors.New("secret key cannot be zero")
}
// Compute res = s * pt using optimized windowed multiplication (variable-time)
// ECDH doesn't require constant-time since the secret key is already known
var res GroupElementJacobian
ecmultWindowedVar(&res, &pt, &s)
// Convert to affine
var resAff GroupElementAffine
resAff.setGEJ(&res)
resAff.x.normalize()
resAff.y.normalize()
// Extract x and y coordinates
var x, y [32]byte
resAff.x.getB32(x[:])
resAff.y.getB32(y[:])
// Compute hash
success := hashfp(output, x[:], y[:])
// Clear sensitive data
memclear(unsafe.Pointer(&x[0]), 32)
memclear(unsafe.Pointer(&y[0]), 32)
s.clear()
resAff.clear()
res.clear()
if !success {
return errors.New("hash function failed")
}
return nil
}
// HKDF performs HMAC-based Key Derivation Function (RFC 5869)
// Outputs key material of the specified length
func HKDF(output []byte, ikm []byte, salt []byte, info []byte) error {
if len(output) == 0 {
return errors.New("output length must be greater than 0")
}
// Step 1: Extract (if salt is empty, use zeros)
if len(salt) == 0 {
salt = make([]byte, 32)
}
// PRK = HMAC-SHA256(salt, IKM)
var prk [32]byte
hmac := NewHMACSHA256(salt)
hmac.Write(ikm)
hmac.Finalize(prk[:])
hmac.Clear()
// Step 2: Expand
// Generate output using HKDF-Expand
// T(0) = empty
// T(i) = HMAC(PRK, T(i-1) || info || i)
outlen := len(output)
outidx := 0
// T(0) is empty
var t []byte
// Generate blocks until we have enough output
blockNum := byte(1)
for outidx < outlen {
// Compute T(i) = HMAC(PRK, T(i-1) || info || i)
hmac = NewHMACSHA256(prk[:])
if len(t) > 0 {
hmac.Write(t)
}
if len(info) > 0 {
hmac.Write(info)
}
hmac.Write([]byte{blockNum})
var tBlock [32]byte
hmac.Finalize(tBlock[:])
hmac.Clear()
// Copy to output
copyLen := len(tBlock)
if copyLen > outlen-outidx {
copyLen = outlen - outidx
}
copy(output[outidx:outidx+copyLen], tBlock[:copyLen])
outidx += copyLen
// Update T for next iteration
t = tBlock[:]
blockNum++
}
// Clear sensitive data
memclear(unsafe.Pointer(&prk[0]), 32)
if len(t) > 0 {
memclear(unsafe.Pointer(&t[0]), uintptr(len(t)))
}
return nil
}
// ECDHWithHKDF computes ECDH and derives a key using HKDF
func ECDHWithHKDF(output []byte, pubkey *PublicKey, seckey []byte, salt []byte, info []byte) error {
// Compute ECDH shared secret
var sharedSecret [32]byte
if err := ECDH(sharedSecret[:], pubkey, seckey, nil); err != nil {
return err
}
// Derive key using HKDF
err := HKDF(output, sharedSecret[:], salt, info)
// Clear shared secret
memclear(unsafe.Pointer(&sharedSecret[0]), 32)
return err
}
// =============================================================================
// Phase 4: Strauss-GLV Algorithm with wNAF
// =============================================================================
// buildOddMultiplesTableAffine builds a table of odd multiples of a point in affine coordinates
// pre[i] = (2*i+1) * a for i = 0 to tableSize-1
// Also returns the precomputed β*x values for λ-transformed lookups
//
// The table is built efficiently using:
// 1. Compute odd multiples in Jacobian: 1*a, 3*a, 5*a, ...
// 2. Batch normalize all points to affine
// 3. Precompute β*x for each point for GLV lookups
//
// Reference: libsecp256k1 ecmult_impl.h:secp256k1_ecmult_odd_multiples_table
func buildOddMultiplesTableAffine(preA []GroupElementAffine, preBetaX []FieldElement, a *GroupElementJacobian, tableSize int) {
if tableSize == 0 {
return
}
// Build odd multiples in Jacobian coordinates
preJac := make([]GroupElementJacobian, tableSize)
// pre[0] = a (which is 1*a)
preJac[0] = *a
if tableSize > 1 {
// Compute 2*a
var twoA GroupElementJacobian
twoA.double(a)
// Build odd multiples: pre[i] = pre[i-1] + 2*a for i >= 1
for i := 1; i < tableSize; i++ {
preJac[i].addVar(&preJac[i-1], &twoA)
}
}
// Batch normalize to affine coordinates
BatchNormalize(preA, preJac)
// Precompute β*x for each point (for λ-transformed lookups)
for i := 0; i < tableSize; i++ {
if preA[i].isInfinity() {
preBetaX[i] = FieldElementZero
} else {
preBetaX[i].mul(&preA[i].x, &fieldBeta)
}
}
}
// tableGetGE retrieves a point from the table, handling sign
// n is the wNAF digit (can be negative)
// Returns pre[(|n|-1)/2], negated if n < 0
//
// Reference: libsecp256k1 ecmult_impl.h:ECMULT_TABLE_GET_GE
func tableGetGE(r *GroupElementAffine, pre []GroupElementAffine, n int) {
if n == 0 {
r.setInfinity()
return
}
var idx int
if n > 0 {
idx = (n - 1) / 2
} else {
idx = (-n - 1) / 2
}
if idx >= len(pre) {
r.setInfinity()
return
}
*r = pre[idx]
// Negate if n < 0
if n < 0 {
r.negate(r)
}
}
// tableGetGELambda retrieves the λ-transformed point from the table
// Uses precomputed β*x values for efficiency
// n is the wNAF digit (can be negative)
// Returns λ*pre[(|n|-1)/2], negated if n < 0
//
// Since λ*(x, y) = (β*x, y), and we precomputed β*x,
// we just need to use the precomputed β*x instead of x
//
// Reference: libsecp256k1 ecmult_impl.h:ECMULT_TABLE_GET_GE_LAMBDA
func tableGetGELambda(r *GroupElementAffine, pre []GroupElementAffine, preBetaX []FieldElement, n int) {
if n == 0 {
r.setInfinity()
return
}
var idx int
if n > 0 {
idx = (n - 1) / 2
} else {
idx = (-n - 1) / 2
}
if idx >= len(pre) {
r.setInfinity()
return
}
// Use precomputed β*x instead of x
r.x = preBetaX[idx]
r.y = pre[idx].y
r.infinity = pre[idx].infinity
// Negate if n < 0
if n < 0 {
r.negate(r)
}
}
// Window size for the GLV split scalars
const glvWNAFW = 5
const glvTableSize = 1 << (glvWNAFW - 1) // 16 entries for window size 5
// ecmultStraussWNAFGLV computes r = q * a using Strauss algorithm with GLV endomorphism
// This splits the scalar using GLV and processes two ~128-bit scalars simultaneously
// using wNAF representation for efficient point multiplication.
//
// The algorithm:
// 1. Split q into q1, q2 such that q1 + q2*λ ≡ q (mod n), where q1, q2 are ~128 bits
// 2. Build odd multiples table for a and precompute β*x for λ-transformed lookups
// 3. Convert q1, q2 to wNAF representation
// 4. Process both wNAF representations simultaneously in a single pass
//
// Reference: libsecp256k1 ecmult_impl.h:secp256k1_ecmult_strauss_wnaf
func ecmultStraussWNAFGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
if a.isInfinity() {
r.setInfinity()
return
}
if q.isZero() {
r.setInfinity()
return
}
// Split scalar using GLV endomorphism: q = q1 + q2*λ
// Also get the transformed points p1 = a, p2 = λ*a
var q1, q2 Scalar
var p1, p2 GroupElementAffine
ecmultEndoSplit(&q1, &q2, &p1, &p2, q, a)
// Build odd multiples tables using stack-allocated arrays
var aJac GroupElementJacobian
aJac.setGE(&p1)
var preA [glvTableSize]GroupElementAffine
var preBetaX [glvTableSize]FieldElement
buildOddMultiplesTableAffineFixed(&preA, &preBetaX, &aJac)
// Build odd multiples table for p2 (which is λ*a)
var p2Jac GroupElementJacobian
p2Jac.setGE(&p2)
var preA2 [glvTableSize]GroupElementAffine
var preBetaX2 [glvTableSize]FieldElement
buildOddMultiplesTableAffineFixed(&preA2, &preBetaX2, &p2Jac)
// Convert scalars to wNAF representation
const wnafMaxLen = 257
var wnaf1, wnaf2 [wnafMaxLen]int
bits1 := q1.wNAF(wnaf1[:], glvWNAFW)
bits2 := q2.wNAF(wnaf2[:], glvWNAFW)
// Find the maximum bit position
maxBits := bits1
if bits2 > maxBits {
maxBits = bits2
}
// Perform the Strauss algorithm
r.setInfinity()
for i := maxBits - 1; i >= 0; i-- {
// Double the result
if !r.isInfinity() {
r.double(r)
}
// Add contribution from q1
if i < bits1 && wnaf1[i] != 0 {
var pt GroupElementAffine
tableGetGEFixed(&pt, &preA, wnaf1[i])
if r.isInfinity() {
r.setGE(&pt)
} else {
r.addGE(r, &pt)
}
}
// Add contribution from q2
if i < bits2 && wnaf2[i] != 0 {
var pt GroupElementAffine
tableGetGEFixed(&pt, &preA2, wnaf2[i])
if r.isInfinity() {
r.setGE(&pt)
} else {
r.addGE(r, &pt)
}
}
}
}
// buildOddMultiplesTableAffineFixed is like buildOddMultiplesTableAffine but uses fixed-size arrays
func buildOddMultiplesTableAffineFixed(preA *[glvTableSize]GroupElementAffine, preBetaX *[glvTableSize]FieldElement, a *GroupElementJacobian) {
// Build odd multiples in Jacobian coordinates
var preJac [glvTableSize]GroupElementJacobian
// pre[0] = a (which is 1*a)
preJac[0] = *a
if glvTableSize > 1 {
// Compute 2*a
var twoA GroupElementJacobian
twoA.double(a)
// Build odd multiples: pre[i] = pre[i-1] + 2*a for i >= 1
for i := 1; i < glvTableSize; i++ {
preJac[i].addVar(&preJac[i-1], &twoA)
}
}
// Batch normalize to affine coordinates
BatchNormalize(preA[:], preJac[:])
// Precompute β*x for each point
for i := 0; i < glvTableSize; i++ {
if preA[i].isInfinity() {
preBetaX[i] = FieldElementZero
} else {
preBetaX[i].mul(&preA[i].x, &fieldBeta)
}
}
}
// tableGetGEFixed retrieves a point from a fixed-size table
func tableGetGEFixed(r *GroupElementAffine, pre *[glvTableSize]GroupElementAffine, n int) {
if n == 0 {
r.setInfinity()
return
}
var idx int
if n > 0 {
idx = (n - 1) / 2
} else {
idx = (-n - 1) / 2
}
if idx >= glvTableSize {
r.setInfinity()
return
}
*r = pre[idx]
// Negate if n < 0
if n < 0 {
r.negate(r)
}
}
// EcmultStraussWNAFGLV is the public interface for optimized Strauss+GLV+wNAF multiplication
func EcmultStraussWNAFGLV(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
ecmultStraussWNAFGLV(r, a, q)
}
// ECDHXOnly computes X-only ECDH (BIP-340 style)
// Outputs only the X coordinate of the shared secret point
func ECDHXOnly(output []byte, pubkey *PublicKey, seckey []byte) error {
if len(output) != 32 {
return errors.New("output must be 32 bytes")
}
if len(seckey) != 32 {
return errors.New("seckey must be 32 bytes")
}
if pubkey == nil {
return errors.New("pubkey cannot be nil")
}
// Load public key
var pt GroupElementAffine
pt.fromBytes(pubkey.data[:])
if pt.isInfinity() {
return errors.New("invalid public key")
}
// Parse scalar
var s Scalar
if !s.setB32Seckey(seckey) {
return errors.New("invalid secret key")
}
if s.isZero() {
return errors.New("secret key cannot be zero")
}
// Compute res = s * pt using optimized windowed multiplication (variable-time)
// ECDH doesn't require constant-time since the secret key is already known
var res GroupElementJacobian
ecmultWindowedVar(&res, &pt, &s)
// Convert to affine
var resAff GroupElementAffine
resAff.setGEJ(&res)
resAff.x.normalize()
// Extract X coordinate only
resAff.x.getB32(output)
// Clear sensitive data
s.clear()
resAff.clear()
res.clear()
return nil
}