package avx

import "math/bits"

// Scalar operations modulo the secp256k1 group order n.
// n = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141

// SetBytes loads s from b, read as a 32-byte big-endian integer, and brings
// the value below the group order n when it is not already.
//
// It panics if len(b) != 32. The result reports whether the decoded value was
// >= n (as decided by checkOverflow) and therefore had reduce applied.
func (s *Scalar) SetBytes(b []byte) bool {
	if len(b) != 32 {
		panic("scalar must be 32 bytes")
	}

	// be64 assembles the eight bytes starting at off into one big-endian word.
	be64 := func(off int) uint64 {
		var w uint64
		for _, c := range b[off : off+8] {
			w = w<<8 | uint64(c)
		}
		return w
	}

	// The first bytes of b are the most significant, so they land in D[1].Hi;
	// the last eight bytes form the least significant limb D[0].Lo.
	s.D[1].Hi = be64(0)
	s.D[1].Lo = be64(8)
	s.D[0].Hi = be64(16)
	s.D[0].Lo = be64(24)

	if !s.checkOverflow() {
		return false
	}
	s.reduce()
	return true
}

// Bytes serializes the scalar as 32 big-endian bytes, returned as an array.
func (s *Scalar) Bytes() [32]byte {
	// Limbs listed most significant first so that limb i fills out[8*i : 8*i+8].
	words := [4]uint64{s.D[1].Hi, s.D[1].Lo, s.D[0].Hi, s.D[0].Lo}

	var out [32]byte
	for i, w := range words {
		for k := 0; k < 8; k++ {
			// Byte k of the group is bits [56-8k, 63-8k] of the limb.
			out[8*i+k] = byte(w >> uint(56-8*k))
		}
	}
	return out
}

// IsZero reports whether both 128-bit halves of the scalar are zero.
func (s *Scalar) IsZero() bool {
	if !s.D[0].IsZero() {
		return false
	}
	return s.D[1].IsZero()
}

// IsOne reports whether the scalar equals one: the lowest 64-bit limb is 1
// and every higher limb is zero.
func (s *Scalar) IsOne() bool {
	if s.D[0].Lo != 1 || s.D[0].Hi != 0 {
		return false
	}
	return s.D[1].IsZero()
}

// Equal reports whether s and other hold the same four 64-bit limbs.
func (s *Scalar) Equal(other *Scalar) bool {
	// XOR of equal limbs is zero; OR-ing the four differences together is
	// zero only when every limb matches.
	diff := s.D[0].Lo ^ other.D[0].Lo
	diff |= s.D[0].Hi ^ other.D[0].Hi
	diff |= s.D[1].Lo ^ other.D[1].Lo
	diff |= s.D[1].Hi ^ other.D[1].Hi
	return diff == 0
}

// checkOverflow reports whether s >= ScalarN, comparing limb by limb from the
// most significant down.
func (s *Scalar) checkOverflow() bool {
	val := [4]uint64{s.D[1].Hi, s.D[1].Lo, s.D[0].Hi, s.D[0].Lo}
	mod := [4]uint64{ScalarN.D[1].Hi, ScalarN.D[1].Lo, ScalarN.D[0].Hi, ScalarN.D[0].Lo}

	for i := range val {
		// The first limb that differs decides the comparison.
		if val[i] != mod[i] {
			return val[i] > mod[i]
		}
	}
	// Every limb matched, so s == ScalarN, which counts as overflow.
	return true
}

// reduce subtracts n from s once. It does so by adding ScalarNC (the
// complement 2^256 - n) and discarding the carry out of the top limb, which
// is the same as s - n modulo 2^256.
func (s *Scalar) reduce() {
	var w [4]uint64
	var c uint64
	w[0], c = bits.Add64(s.D[0].Lo, ScalarNC.D[0].Lo, 0)
	w[1], c = bits.Add64(s.D[0].Hi, ScalarNC.D[0].Hi, c)
	w[2], c = bits.Add64(s.D[1].Lo, ScalarNC.D[1].Lo, c)
	w[3], _ = bits.Add64(s.D[1].Hi, ScalarNC.D[1].Hi, c) // final carry intentionally dropped

	s.D[0].Lo, s.D[0].Hi = w[0], w[1]
	s.D[1].Lo, s.D[1].Hi = w[2], w[3]
}

// Add sets s = a + b mod n and returns s.
//
// The 256-bit sum is followed by at most one call to reduce, so the result is
// fully reduced only when a and b are themselves below n.
func (s *Scalar) Add(a, b *Scalar) *Scalar {
	var sum [4]uint64
	var c uint64
	sum[0], c = bits.Add64(a.D[0].Lo, b.D[0].Lo, 0)
	sum[1], c = bits.Add64(a.D[0].Hi, b.D[0].Hi, c)
	sum[2], c = bits.Add64(a.D[1].Lo, b.D[1].Lo, c)
	sum[3], c = bits.Add64(a.D[1].Hi, b.D[1].Hi, c)

	s.D[0].Lo, s.D[0].Hi = sum[0], sum[1]
	s.D[1].Lo, s.D[1].Hi = sum[2], sum[3]

	// A carry out of bit 255 means the true sum exceeded 2^256; otherwise the
	// truncated sum may still be >= n. Either way one subtraction of n follows.
	needsReduce := c != 0
	if !needsReduce {
		needsReduce = s.checkOverflow()
	}
	if needsReduce {
		s.reduce()
	}
	return s
}

// Sub sets s = a - b mod n and returns s.
//
// The limbs are subtracted with borrow; if the subtraction underflows, n is
// added back once (the carry out of that addition is discarded).
func (s *Scalar) Sub(a, b *Scalar) *Scalar {
	var diff [4]uint64
	var br uint64
	diff[0], br = bits.Sub64(a.D[0].Lo, b.D[0].Lo, 0)
	diff[1], br = bits.Sub64(a.D[0].Hi, b.D[0].Hi, br)
	diff[2], br = bits.Sub64(a.D[1].Lo, b.D[1].Lo, br)
	diff[3], br = bits.Sub64(a.D[1].Hi, b.D[1].Hi, br)

	s.D[0].Lo, s.D[0].Hi = diff[0], diff[1]
	s.D[1].Lo, s.D[1].Hi = diff[2], diff[3]

	if br == 0 {
		// a >= b: the difference is already the answer.
		return s
	}

	// a < b: the limbs hold a - b + 2^256, so adding n modulo 2^256 gives a - b + n.
	var c uint64
	s.D[0].Lo, c = bits.Add64(s.D[0].Lo, ScalarN.D[0].Lo, 0)
	s.D[0].Hi, c = bits.Add64(s.D[0].Hi, ScalarN.D[0].Hi, c)
	s.D[1].Lo, c = bits.Add64(s.D[1].Lo, ScalarN.D[1].Lo, c)
	s.D[1].Hi, _ = bits.Add64(s.D[1].Hi, ScalarN.D[1].Hi, c)
	return s
}

// Negate sets s = -a mod n and returns s.
//
// Zero maps to ScalarZero; any other value is replaced by n - a.
func (s *Scalar) Negate(a *Scalar) *Scalar {
	if !a.IsZero() {
		var d [4]uint64
		var br uint64
		d[0], br = bits.Sub64(ScalarN.D[0].Lo, a.D[0].Lo, 0)
		d[1], br = bits.Sub64(ScalarN.D[0].Hi, a.D[0].Hi, br)
		d[2], br = bits.Sub64(ScalarN.D[1].Lo, a.D[1].Lo, br)
		d[3], _ = bits.Sub64(ScalarN.D[1].Hi, a.D[1].Hi, br) // final borrow ignored

		s.D[0].Lo, s.D[0].Hi = d[0], d[1]
		s.D[1].Lo, s.D[1].Hi = d[2], d[3]
		return s
	}

	// n - 0 would leave the unreduced value n, so zero is handled separately.
	*s = ScalarZero
	return s
}

// Mul sets s = a * b mod n and returns s. It forms the full 512-bit product
// with scalarMul512 and hands it to scalarReduce512.
func (s *Scalar) Mul(a, b *Scalar) *Scalar {
	var wide [8]uint64 // 512-bit product, least significant limb first
	scalarMul512(&wide, a, b)
	scalarReduce512(s, &wide)
	return s
}

// scalarMul512 writes the full 512-bit product a*b into prod as eight 64-bit
// limbs, prod[0] being the least significant. Any previous contents of prod
// are overwritten.
func scalarMul512(prod *[8]uint64, a, b *Scalar) {
	// Operands as little-endian 64-bit limbs.
	x := [4]uint64{a.D[0].Lo, a.D[0].Hi, a.D[1].Lo, a.D[1].Hi}
	y := [4]uint64{b.D[0].Lo, b.D[0].Hi, b.D[1].Lo, b.D[1].Hi}

	*prod = [8]uint64{}

	// Schoolbook multiplication, one row per limb of x.
	for i, xi := range x {
		var k uint64 // carry limb travelling along the row
		for j, yj := range y {
			hi, lo := bits.Mul64(xi, yj)

			// xi*yj + prod[i+j] + k is at most 2^128 - 1, so bumping hi by the
			// two carries below cannot wrap.
			var c uint64
			lo, c = bits.Add64(lo, prod[i+j], 0)
			hi += c
			lo, c = bits.Add64(lo, k, 0)
			hi += c

			prod[i+j] = lo
			k = hi
		}
		// prod[i+4] has not been written by any earlier row, so plain assignment suffices.
		prod[i+4] = k
	}
}

// scalarReduce512 sets s to the 512-bit value in prod reduced modulo n.
//
// prod holds eight 64-bit limbs, least significant first, and is not modified.
//
// The reduction relies on ScalarNC being 2^256 - n, so that
// 2^256 ≡ ScalarNC (mod n). Splitting the value as low + high*2^256, it is
// congruent to low + high*ScalarNC. Each folding pass replaces the value by
// that sum, which is smaller by exactly high*n, so the loop ends once the
// high half is zero. A single conditional subtraction of n then completes the
// reduction, which is sufficient as long as n > 2^255.
//
// The number of folding passes depends on the value being reduced, so the
// running time is data dependent.
func scalarReduce512(s *Scalar, prod *[8]uint64) {
	nc := [4]uint64{ScalarNC.D[0].Lo, ScalarNC.D[0].Hi, ScalarNC.D[1].Lo, ScalarNC.D[1].Hi}

	acc := *prod
	for acc[4]|acc[5]|acc[6]|acc[7] != 0 {
		// next = acc[0..3] + acc[4..7]*NC. Seeding next with the low half lets
		// the multiply-accumulate below perform the addition for free. The sum
		// is below 2^512, so eight limbs always suffice.
		var next [8]uint64
		copy(next[:4], acc[:4])

		for i := 0; i < 4; i++ {
			var carry uint64
			for j := 0; j < 4; j++ {
				hi, lo := bits.Mul64(acc[4+i], nc[j])

				// product + next[i+j] + carry <= 2^128 - 1, so hi cannot wrap.
				var c uint64
				lo, c = bits.Add64(lo, next[i+j], 0)
				hi += c
				lo, c = bits.Add64(lo, carry, 0)
				hi += c

				next[i+j] = lo
				carry = hi
			}
			// next[i+4] is still zero here: earlier rows wrote at most index i+3.
			next[i+4] = carry
		}
		acc = next
	}

	s.D[0].Lo, s.D[0].Hi = acc[0], acc[1]
	s.D[1].Lo, s.D[1].Hi = acc[2], acc[3]

	// The value is now below 2^256; subtract n once if it is still >= n.
	if s.checkOverflow() {
		s.reduce()
	}
}

// Sqr sets s = a*a mod n and returns s. It is a thin wrapper around Mul.
func (s *Scalar) Sqr(a *Scalar) *Scalar {
	s.Mul(a, a)
	return s
}

// Inverse sets s = a^(n-2) mod n and returns s. By Fermat's little theorem
// this is the multiplicative inverse of a modulo n.
//
// The exponentiation is right-to-left square-and-multiply over the 256 bits
// of n-2, using Mul and Sqr.
func (s *Scalar) Inverse(a *Scalar) *Scalar {
	// Exponent n-2 as big-endian bytes.
	exp := [32]byte{
		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE,
		0xBA, 0xAE, 0xDC, 0xE6, 0xAF, 0x48, 0xA0, 0x3B,
		0xBF, 0xD2, 0x5E, 0x8C, 0xD0, 0x36, 0x41, 0x3F,
	}

	acc := ScalarOne // running product
	pow := *a        // a^(2^bit) at the top of each iteration

	// Walk the exponent from bit 0 (least significant) to bit 255.
	for bit := 0; bit < 256; bit++ {
		octet := exp[31-bit/8]
		if (octet>>uint(bit%8))&1 == 1 {
			acc.Mul(&acc, &pow)
		}
		pow.Sqr(&pow)
	}

	*s = acc
	return s
}

// IsHigh reports whether s is strictly greater than ScalarNHalf, comparing
// limb by limb from the most significant down.
func (s *Scalar) IsHigh() bool {
	val := [4]uint64{s.D[1].Hi, s.D[1].Lo, s.D[0].Hi, s.D[0].Lo}
	half := [4]uint64{ScalarNHalf.D[1].Hi, ScalarNHalf.D[1].Lo, ScalarNHalf.D[0].Hi, ScalarNHalf.D[0].Lo}

	for i := range val {
		// The first limb that differs decides the comparison.
		if val[i] != half[i] {
			return val[i] > half[i]
		}
	}
	// Equal to ScalarNHalf is not considered high.
	return false
}

// CondNegate replaces s with its negation (via Negate) when cond is true and
// leaves it untouched otherwise. It returns s in both cases.
func (s *Scalar) CondNegate(cond bool) *Scalar {
	if !cond {
		return s
	}
	return s.Negate(s)
}