// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains the Go wrapper for the constant-time, 64-bit assembly
// implementation of P256. The optimizations performed here are described in
// detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

// +build amd64 arm64

package elliptic

import (
	
	
)

type (
	p256Curve struct {
		*CurveParams
	}

	p256Point struct {
		xyz [12]uint64
	}
)

var (
	p256            p256Curve
	p256Precomputed *[43][32 * 8]uint64
	precomputeOnce  sync.Once
)

func initP256() {
	// See FIPS 186-3, section D.2.3
	p256.CurveParams = &CurveParams{Name: "P-256"}
	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
	p256.BitSize = 256
}

func ( p256Curve) () *CurveParams {
	return .CurveParams
}

// Functions implemented in p256_asm_*64.s
// Montgomery multiplication modulo P256
//go:noescape
func p256Mul(, ,  []uint64)

// Montgomery square modulo P256, repeated n times (n >= 1)
//go:noescape
func p256Sqr(,  []uint64,  int)

// Montgomery multiplication by 1
//go:noescape
func p256FromMont(,  []uint64)

// iff cond == 1  val <- -val
//go:noescape
func p256NegCond( []uint64,  int)

// if cond == 0 res <- b; else res <- a
//go:noescape
func p256MovCond(, ,  []uint64,  int)

// Endianness swap
//go:noescape
func p256BigToLittle( []uint64,  []byte)

//go:noescape
func p256LittleToBig( []byte,  []uint64)

// Constant time table access
//go:noescape
func p256Select(,  []uint64,  int)

//go:noescape
func p256SelectBase(,  []uint64,  int)

// Montgomery multiplication modulo Ord(G)
//go:noescape
func p256OrdMul(, ,  []uint64)

// Montgomery square modulo Ord(G), repeated n times
//go:noescape
func p256OrdSqr(,  []uint64,  int)

// Point add with in2 being affine point
// If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1
// if zero == 0 -> res = in2
//go:noescape
func p256PointAddAffineAsm(, ,  []uint64, , ,  int)

// Point add. Returns one if the two input points were equal and zero
// otherwise. (Note that, due to the way that the equations work out, some
// representations of ∞ are considered equal to everything by this function.)
//go:noescape
func p256PointAddAsm(, ,  []uint64) int

// Point double
//go:noescape
func p256PointDoubleAsm(,  []uint64)

func ( p256Curve) ( *big.Int) *big.Int {
	if .Sign() < 0 {
		// This should never happen.
		 = new(big.Int).Neg()
	}

	if .Cmp(p256.N) >= 0 {
		// This should never happen.
		 = new(big.Int).Mod(, p256.N)
	}

	// table will store precomputed powers of x.
	var  [4 * 9]uint64
	var (
		      = [4*0 : 4*1]
		     = [4*1 : 4*2]
		    = [4*2 : 4*3]
		    = [4*3 : 4*4]
		   = [4*4 : 4*5]
		  = [4*5 : 4*6]
		 = [4*6 : 4*7]
		       = [4*7 : 4*8]
		       = [4*8 : 4*9]
	)

	fromBig([:], )
	// This code operates in the Montgomery domain where R = 2^256 mod n
	// and n is the order of the scalar field. (See initP256 for the
	// value.) Elements in the Montgomery domain take the form a×R and
	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
	// i.e. converts x into the Montgomery domain.
	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
	 := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
	p256OrdMul(, , )      // _1
	p256OrdSqr(, , 1)       // _10
	p256OrdMul(, , )     // _11
	p256OrdMul(, , )   // _101
	p256OrdMul(, , )  // _111
	p256OrdSqr(, , 1)     // _1010
	p256OrdMul(, , ) // _1111

	p256OrdSqr(, , 1)          // _10100
	p256OrdMul(, , )    // _10101
	p256OrdSqr(, , 1)     // _101010
	p256OrdMul(, , ) // _101111
	p256OrdMul(, , )     // _111111 = x6
	p256OrdSqr(, , 2)          // _11111100
	p256OrdMul(, , )        // _11111111 = x8
	p256OrdSqr(, , 8)          // _ff00
	p256OrdMul(, , )          // _ffff = x16
	p256OrdSqr(, , 16)         // _ffff0000
	p256OrdMul(, , )          // _ffffffff = x32

	p256OrdSqr(, , 64)
	p256OrdMul(, , )
	p256OrdSqr(, , 32)
	p256OrdMul(, , )

	 := []uint8{
		6, 5, 4, 5, 5,
		4, 3, 3, 5, 9,
		6, 2, 5, 6, 5,
		4, 5, 5, 3, 10,
		2, 5, 5, 3, 7, 6}
	 := [][]uint64{
		, , , , ,
		, , , , ,
		, , , , ,
		, , , , ,
		, , , , , }

	for ,  := range  {
		p256OrdSqr(, , int())
		p256OrdMul(, , [])
	}

	// Multiplying by one in the Montgomery domain converts a Montgomery
	// value out of the domain.
	 := []uint64{1, 0, 0, 0}
	p256OrdMul(, , )

	 := make([]byte, 32)
	p256LittleToBig(, )
	return new(big.Int).SetBytes()
}

// fromBig converts a *big.Int into a format used by this code.
func fromBig( []uint64,  *big.Int) {
	for  := range  {
		[] = 0
	}

	for ,  := range .Bits() {
		[] = uint64()
	}
}

// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
// to out. If the scalar is equal or greater than the order of the group, it's
// reduced modulo that order.
func p256GetScalar( []uint64,  []byte) {
	 := new(big.Int).SetBytes()

	if .Cmp(p256.N) >= 0 {
		.Mod(, p256.N)
	}
	fromBig(, )
}

// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
// underlying field of the curve. (See initP256 for the value.) Thus rr here is
// R×R mod p. See comment in Inverse about how this is used.
var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}

func maybeReduceModP( *big.Int) *big.Int {
	if .Cmp(p256.P) < 0 {
		return 
	}
	return new(big.Int).Mod(, p256.P)
}

func ( p256Curve) (,  *big.Int, ,  []byte) (,  *big.Int) {
	 := make([]uint64, 4)
	var ,  p256Point
	p256GetScalar(, )
	 := scalarIsZero()
	.p256BaseMult()

	p256GetScalar(, )
	 := scalarIsZero()
	fromBig(.xyz[0:4], maybeReduceModP())
	fromBig(.xyz[4:8], maybeReduceModP())
	p256Mul(.xyz[0:4], .xyz[0:4], rr[:])
	p256Mul(.xyz[4:8], .xyz[4:8], rr[:])

	// This sets r2's Z value to 1, in the Montgomery domain.
	.xyz[8] = 0x0000000000000001
	.xyz[9] = 0xffffffff00000000
	.xyz[10] = 0xffffffffffffffff
	.xyz[11] = 0x00000000fffffffe

	.p256ScalarMult()

	var ,  p256Point
	 := p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	.CopyConditional(&, )
	.CopyConditional(&, )
	.CopyConditional(&, )

	return .p256PointToAffine()
}

func ( p256Curve) ( []byte) (,  *big.Int) {
	 := make([]uint64, 4)
	p256GetScalar(, )

	var  p256Point
	.p256BaseMult()
	return .p256PointToAffine()
}

func ( p256Curve) (,  *big.Int,  []byte) (,  *big.Int) {
	 := make([]uint64, 4)
	p256GetScalar(, )

	var  p256Point
	fromBig(.xyz[0:4], maybeReduceModP())
	fromBig(.xyz[4:8], maybeReduceModP())
	p256Mul(.xyz[0:4], .xyz[0:4], rr[:])
	p256Mul(.xyz[4:8], .xyz[4:8], rr[:])
	// This sets r2's Z value to 1, in the Montgomery domain.
	.xyz[8] = 0x0000000000000001
	.xyz[9] = 0xffffffff00000000
	.xyz[10] = 0xffffffffffffffff
	.xyz[11] = 0x00000000fffffffe

	.p256ScalarMult()
	return .p256PointToAffine()
}

// uint64IsZero returns 1 if x is zero and zero otherwise.
func uint64IsZero( uint64) int {
	 = ^
	 &=  >> 32
	 &=  >> 16
	 &=  >> 8
	 &=  >> 4
	 &=  >> 2
	 &=  >> 1
	return int( & 1)
}

// scalarIsZero returns 1 if scalar represents the zero value, and zero
// otherwise.
func scalarIsZero( []uint64) int {
	return uint64IsZero([0] | [1] | [2] | [3])
}

func ( *p256Point) () (,  *big.Int) {
	 := make([]uint64, 4)
	 := make([]uint64, 4)
	p256Inverse(, .xyz[8:12])
	p256Sqr(, , 1)
	p256Mul(, , )

	p256Mul(, .xyz[0:4], )
	p256Mul(, .xyz[4:8], )

	p256FromMont(, )
	p256FromMont(, )

	 := make([]byte, 32)
	 := make([]byte, 32)
	p256LittleToBig(, )
	p256LittleToBig(, )

	return new(big.Int).SetBytes(), new(big.Int).SetBytes()
}

// CopyConditional copies overwrites p with src if v == 1, and leaves p
// unchanged if v == 0.
func ( *p256Point) ( *p256Point,  int) {
	 := uint64() - 1
	 := ^

	for ,  := range .xyz {
		.xyz[] = ( & ) | (.xyz[] & )
	}
}

// p256Inverse sets out to in^-1 mod p.
func p256Inverse(,  []uint64) {
	var  [6 * 4]uint64
	 := [4*0 : 4*0+4]
	 := [4*1 : 4*1+4]
	 := [4*2 : 4*2+4]
	 := [4*3 : 4*3+4]
	 := [4*4 : 4*4+4]

	p256Sqr(, , 1)
	p256Mul(, , ) // 3*p

	p256Sqr(, , 2)
	p256Mul(, , ) // f*p

	p256Sqr(, , 4)
	p256Mul(, , ) // ff*p

	p256Sqr(, , 8)
	p256Mul(, , ) // ffff*p

	p256Sqr(, , 16)
	p256Mul(, , ) // ffffffff*p

	p256Sqr(, , 32)
	p256Mul(, , )

	p256Sqr(, , 128)
	p256Mul(, , )

	p256Sqr(, , 32)
	p256Mul(, , )

	p256Sqr(, , 16)
	p256Mul(, , )

	p256Sqr(, , 8)
	p256Mul(, , )

	p256Sqr(, , 4)
	p256Mul(, , )

	p256Sqr(, , 2)
	p256Mul(, , )

	p256Sqr(, , 2)
	p256Mul(, , )
}

func ( *p256Point) ( *[16 * 4 * 3]uint64,  int) {
	copy([*12:], .xyz[:])
}

func boothW5( uint) (int, int) {
	var  uint = ^(( >> 5) - 1)
	var  uint = (1 << 6) -  - 1
	 = ( & ) | ( & (^))
	 = ( >> 1) + ( & 1)
	return int(), int( & 1)
}

func boothW6( uint) (int, int) {
	var  uint = ^(( >> 6) - 1)
	var  uint = (1 << 7) -  - 1
	 = ( & ) | ( & (^))
	 = ( >> 1) + ( & 1)
	return int(), int( & 1)
}

func initTable() {
	p256Precomputed = new([43][32 * 8]uint64)

	 := []uint64{
		0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6,
		0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85,
		0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe,
	}
	 := make([]uint64, 12)
	 := make([]uint64, 12)
	copy(, )

	 := make([]uint64, 4)
	 := make([]uint64, 4)
	for  := 0;  < 32; ++ {
		copy(, )
		for  := 0;  < 43; ++ {
			// The window size is 6 so we need to double 6 times.
			if  != 0 {
				for  := 0;  < 6; ++ {
					p256PointDoubleAsm(, )
				}
			}
			// Convert the point to affine form. (Its values are
			// still in Montgomery form however.)
			p256Inverse(, [8:12])
			p256Sqr(, , 1)
			p256Mul(, , )

			p256Mul([:4], [:4], )
			p256Mul([4:8], [4:8], )

			copy([8:12], [8:12])
			// Update the table entry
			copy(p256Precomputed[][*8:], [:8])
		}
		if  == 0 {
			p256PointDoubleAsm(, )
		} else {
			p256PointAddAsm(, , )
		}
	}
}

func ( *p256Point) ( []uint64) {
	precomputeOnce.Do(initTable)

	 := ([0] << 1) & 0x7f
	,  := boothW6(uint())
	p256SelectBase(.xyz[0:8], p256Precomputed[0][0:], )
	p256NegCond(.xyz[4:8], )

	// (This is one, in the Montgomery domain.)
	.xyz[8] = 0x0000000000000001
	.xyz[9] = 0xffffffff00000000
	.xyz[10] = 0xffffffffffffffff
	.xyz[11] = 0x00000000fffffffe

	var  p256Point
	// (This is one, in the Montgomery domain.)
	.xyz[8] = 0x0000000000000001
	.xyz[9] = 0xffffffff00000000
	.xyz[10] = 0xffffffffffffffff
	.xyz[11] = 0x00000000fffffffe

	 := uint(5)
	 := 

	for  := 1;  < 43; ++ {
		if  < 192 {
			 = (([/64] >> ( % 64)) + ([/64+1] << (64 - ( % 64)))) & 0x7f
		} else {
			 = ([/64] >> ( % 64)) & 0x7f
		}
		 += 6
		,  = boothW6(uint())
		p256SelectBase(.xyz[0:8], p256Precomputed[][0:], )
		p256PointAddAffineAsm(.xyz[0:12], .xyz[0:12], .xyz[0:8], , , )
		 |= 
	}
}

func ( *p256Point) ( []uint64) {
	// precomp is a table of precomputed points that stores powers of p
	// from p^1 to p^16.
	var  [16 * 4 * 3]uint64
	var , , ,  p256Point

	// Prepare the table
	.p256StorePoint(&, 0) // 1

	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	.p256StorePoint(&, 1)  // 2
	.p256StorePoint(&, 3)  // 4
	.p256StorePoint(&, 7)  // 8
	.p256StorePoint(&, 15) // 16

	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	.p256StorePoint(&, 2) // 3
	.p256StorePoint(&, 4) // 5
	.p256StorePoint(&, 8) // 9

	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	.p256StorePoint(&, 5) // 6
	.p256StorePoint(&, 9) // 10

	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	.p256StorePoint(&, 6)  // 7
	.p256StorePoint(&, 10) // 11

	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	.p256StorePoint(&, 11) // 12
	.p256StorePoint(&, 13) // 14

	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	.p256StorePoint(&, 12) // 13
	.p256StorePoint(&, 14) // 15

	// Start scanning the window from top bit
	 := uint(254)
	var ,  int

	 := ([/64] >> ( % 64)) & 0x3f
	, _ = boothW5(uint())

	p256Select(.xyz[0:12], [0:], )
	 := 

	for  > 4 {
		 -= 5
		p256PointDoubleAsm(.xyz[:], .xyz[:])
		p256PointDoubleAsm(.xyz[:], .xyz[:])
		p256PointDoubleAsm(.xyz[:], .xyz[:])
		p256PointDoubleAsm(.xyz[:], .xyz[:])
		p256PointDoubleAsm(.xyz[:], .xyz[:])

		if  < 192 {
			 = (([/64] >> ( % 64)) + ([/64+1] << (64 - ( % 64)))) & 0x3f
		} else {
			 = ([/64] >> ( % 64)) & 0x3f
		}

		,  = boothW5(uint())

		p256Select(.xyz[0:], [0:], )
		p256NegCond(.xyz[4:8], )
		p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
		p256MovCond(.xyz[0:12], .xyz[0:12], .xyz[0:12], )
		p256MovCond(.xyz[0:12], .xyz[0:12], .xyz[0:12], )
		 |= 
	}

	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])
	p256PointDoubleAsm(.xyz[:], .xyz[:])

	 = ([0] << 1) & 0x3f
	,  = boothW5(uint())

	p256Select(.xyz[0:], [0:], )
	p256NegCond(.xyz[4:8], )
	p256PointAddAsm(.xyz[:], .xyz[:], .xyz[:])
	p256MovCond(.xyz[0:12], .xyz[0:12], .xyz[0:12], )
	p256MovCond(.xyz[0:12], .xyz[0:12], .xyz[0:12], )
}