// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package mlkem

import (
	
	
	
)

// fieldElement is an integer modulo q, an element of ℤ_q. It is always reduced,
// that is, every value of this type is expected to be in the range [0, q).
type fieldElement uint16

// fieldCheckReduced checks that a value a is < q.
func fieldCheckReduced( uint16) (fieldElement, error) {
	if  >= q {
		return 0, errors.New("unreduced field element")
	}
	return fieldElement(), nil
}

// fieldReduceOnce reduces a value a < 2q.
func fieldReduceOnce( uint16) fieldElement {
	 :=  - q
	// If x underflowed, then x >= 2¹⁶ - q > 2¹⁵, so the top bit is set.
	 += ( >> 15) * q
	return fieldElement()
}

func fieldAdd(,  fieldElement) fieldElement {
	 := uint16( + )
	return fieldReduceOnce()
}

func fieldSub(,  fieldElement) fieldElement {
	 := uint16( -  + q)
	return fieldReduceOnce()
}

// barrettMultiplier and barrettShift approximate a division by q with a
// multiplication followed by a right shift. They are used by fieldReduce and
// compress.
const (
	barrettMultiplier = 5039 // 2¹² * 2¹² / q
	barrettShift      = 24   // log₂(2¹² * 2¹²)
)

// fieldReduce reduces a value a < 2q² using Barrett reduction, to avoid
// potentially variable-time division.
func fieldReduce( uint32) fieldElement {
	 := uint32((uint64() * barrettMultiplier) >> barrettShift)
	return fieldReduceOnce(uint16( - *q))
}

func fieldMul(,  fieldElement) fieldElement {
	 := uint32() * uint32()
	return fieldReduce()
}

// fieldMulSub returns a * (b - c). This operation is fused to save a
// fieldReduceOnce after the subtraction.
func fieldMulSub(, ,  fieldElement) fieldElement {
	 := uint32() * uint32(-+q)
	return fieldReduce()
}

// fieldAddMul returns a * b + c * d. This operation is fused to save a
// fieldReduceOnce and a fieldReduce.
func fieldAddMul(, , ,  fieldElement) fieldElement {
	 := uint32() * uint32()
	 += uint32() * uint32()
	return fieldReduce()
}

// compress maps a field element uniformly to the range 0 to 2ᵈ-1, according to
// FIPS 203, Definition 4.7.
func compress( fieldElement,  uint8) uint16 {
	// We want to compute (x * 2ᵈ) / q, rounded to nearest integer, with 1/2
	// rounding up (see FIPS 203, Section 2.3).

	// Barrett reduction produces a quotient and a remainder in the range [0, 2q),
	// such that dividend = quotient * q + remainder.
	 := uint32() <<  // x * 2ᵈ
	 := uint32(uint64() * barrettMultiplier >> barrettShift)
	 :=  - *q

	// Since the remainder is in the range [0, 2q), not [0, q), we need to
	// portion it into three spans for rounding.
	//
	//     [ 0,       q/2     ) -> round to 0
	//     [ q/2,     q + q/2 ) -> round to 1
	//     [ q + q/2, 2q      ) -> round to 2
	//
	// We can convert that to the following logic: add 1 if remainder > q/2,
	// then add 1 again if remainder > q + q/2.
	//
	// Note that if remainder > x, then ⌊x⌋ - remainder underflows, and the top
	// bit of the difference will be set.
	 += (q/2 - ) >> 31 & 1
	 += (q + q/2 - ) >> 31 & 1

	// quotient might have overflowed at this point, so reduce it by masking.
	var  uint32 = (1 << ) - 1
	return uint16( & )
}

// decompress maps a number x between 0 and 2ᵈ-1 uniformly to the full range of
// field elements, according to FIPS 203, Definition 4.8.
func decompress( uint16,  uint8) fieldElement {
	// We want to compute (y * q) / 2ᵈ, rounded to nearest integer, with 1/2
	// rounding up (see FIPS 203, Section 2.3).

	 := uint32() * q
	 :=  >>  // (y * q) / 2ᵈ

	// The d'th least-significant bit of the dividend (the most significant bit
	// of the remainder) is 1 for the top half of the values that divide to the
	// same quotient, which are the ones that round up.
	 +=  >> ( - 1) & 1

	// quotient is at most (2¹¹-1) * q / 2¹¹ + 1 = 3328, so it didn't overflow.
	return fieldElement()
}

// ringElement is a polynomial, an element of R_q, represented as an array
// according to FIPS 203, Section 2.4.4. It holds n coefficients, each a
// fieldElement.
type ringElement [n]fieldElement

// polyAdd adds two ringElements or nttElements.
func polyAdd[ ~[n]fieldElement](,  ) ( ) {
	for  := range  {
		[] = fieldAdd([], [])
	}
	return 
}

// polySub subtracts two ringElements or nttElements.
func polySub[ ~[n]fieldElement](,  ) ( ) {
	for  := range  {
		[] = fieldSub([], [])
	}
	return 
}

// polyByteEncode appends the 384-byte encoding of f to b.
//
// It implements ByteEncode₁₂, according to FIPS 203, Algorithm 5.
func polyByteEncode[ ~[n]fieldElement]( []byte,  ) []byte {
	,  := sliceForAppend(, encodingSize12)
	for  := 0;  < n;  += 2 {
		 := uint32([]) | uint32([+1])<<12
		[0] = uint8()
		[1] = uint8( >> 8)
		[2] = uint8( >> 16)
		 = [3:]
	}
	return 
}

// polyByteDecode decodes the 384-byte encoding of a polynomial, checking that
// all the coefficients are properly reduced. This fulfills the "Modulus check"
// step of ML-KEM Encapsulation.
//
// It implements ByteDecode₁₂, according to FIPS 203, Algorithm 6.
func polyByteDecode[ ~[n]fieldElement]( []byte) (, error) {
	if len() != encodingSize12 {
		return {}, errors.New("mlkem: invalid encoding length")
	}
	var  
	for  := 0;  < n;  += 2 {
		 := uint32([0]) | uint32([1])<<8 | uint32([2])<<16
		const  = 0b1111_1111_1111
		var  error
		if [],  = fieldCheckReduced(uint16( & ));  != nil {
			return {}, errors.New("mlkem: invalid polynomial encoding")
		}
		if [+1],  = fieldCheckReduced(uint16( >> 12));  != nil {
			return {}, errors.New("mlkem: invalid polynomial encoding")
		}
		 = [3:]
	}
	return , nil
}

// sliceForAppend takes a slice and a requested number of bytes. It returns a
// slice with the contents of the given slice followed by that many bytes and a
// second slice that aliases into it and contains only the extra bytes. If the
// original slice has sufficient capacity then no allocation is performed.
//
// When the capacity is reused, the extra bytes keep whatever values were
// already in the backing array; they are not zeroed.
func sliceForAppend(in []byte, n int) (head, tail []byte) {
	if total := len(in) + n; cap(in) >= total {
		head = in[:total]
	} else {
		head = make([]byte, total)
		copy(head, in)
	}
	tail = head[len(in):]
	return head, tail
}

// ringCompressAndEncode1 appends a 32-byte encoding of a ring element to s,
// compressing one coefficients per bit.
//
// It implements Compress₁, according to FIPS 203, Definition 4.7,
// followed by ByteEncode₁, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode1( []byte,  ringElement) []byte {
	,  := sliceForAppend(, encodingSize1)
	for  := range  {
		[] = 0
	}
	for  := range  {
		[/8] |= uint8(compress([], 1) << ( % 8))
	}
	return 
}

// ringDecodeAndDecompress1 decodes a 32-byte slice to a ring element where each
// bit is mapped to 0 or ⌈q/2⌋.
//
// It implements ByteDecode₁, according to FIPS 203, Algorithm 6,
// followed by Decompress₁, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress1( *[encodingSize1]byte) ringElement {
	var  ringElement
	for  := range  {
		 := [/8] >> ( % 8) & 1
		const  = (q + 1) / 2        // ⌈q/2⌋, rounded up per FIPS 203, Section 2.3
		[] = fieldElement() *  // 0 decompresses to 0, and 1 to ⌈q/2⌋
	}
	return 
}

// ringCompressAndEncode4 appends a 128-byte encoding of a ring element to s,
// compressing two coefficients per byte.
//
// It implements Compress₄, according to FIPS 203, Definition 4.7,
// followed by ByteEncode₄, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode4( []byte,  ringElement) []byte {
	,  := sliceForAppend(, encodingSize4)
	for  := 0;  < n;  += 2 {
		[/2] = uint8(compress([], 4) | compress([+1], 4)<<4)
	}
	return 
}

// ringDecodeAndDecompress4 decodes a 128-byte encoding of a ring element where
// each four bits are mapped to an equidistant distribution.
//
// It implements ByteDecode₄, according to FIPS 203, Algorithm 6,
// followed by Decompress₄, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress4( *[encodingSize4]byte) ringElement {
	var  ringElement
	for  := 0;  < n;  += 2 {
		[] = fieldElement(decompress(uint16([/2]&0b1111), 4))
		[+1] = fieldElement(decompress(uint16([/2]>>4), 4))
	}
	return 
}

// ringCompressAndEncode10 appends a 320-byte encoding of a ring element to s,
// compressing four coefficients per five bytes.
//
// It implements Compress₁₀, according to FIPS 203, Definition 4.7,
// followed by ByteEncode₁₀, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode10( []byte,  ringElement) []byte {
	,  := sliceForAppend(, encodingSize10)
	for  := 0;  < n;  += 4 {
		var  uint64
		 |= uint64(compress([], 10))
		 |= uint64(compress([+1], 10)) << 10
		 |= uint64(compress([+2], 10)) << 20
		 |= uint64(compress([+3], 10)) << 30
		[0] = uint8()
		[1] = uint8( >> 8)
		[2] = uint8( >> 16)
		[3] = uint8( >> 24)
		[4] = uint8( >> 32)
		 = [5:]
	}
	return 
}

// ringDecodeAndDecompress10 decodes a 320-byte encoding of a ring element where
// each ten bits are mapped to an equidistant distribution.
//
// It implements ByteDecode₁₀, according to FIPS 203, Algorithm 6,
// followed by Decompress₁₀, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress10( *[encodingSize10]byte) ringElement {
	 := [:]
	var  ringElement
	for  := 0;  < n;  += 4 {
		 := uint64([0]) | uint64([1])<<8 | uint64([2])<<16 | uint64([3])<<24 | uint64([4])<<32
		 = [5:]
		[] = fieldElement(decompress(uint16(>>0&0b11_1111_1111), 10))
		[+1] = fieldElement(decompress(uint16(>>10&0b11_1111_1111), 10))
		[+2] = fieldElement(decompress(uint16(>>20&0b11_1111_1111), 10))
		[+3] = fieldElement(decompress(uint16(>>30&0b11_1111_1111), 10))
	}
	return 
}

// ringCompressAndEncode appends an encoding of a ring element to s,
// compressing each coefficient to d bits.
//
// It implements Compress, according to FIPS 203, Definition 4.7,
// followed by ByteEncode, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode( []byte,  ringElement,  uint8) []byte {
	var  byte
	var  uint8
	for  := 0;  < n; ++ {
		 := compress([], )
		var  uint8
		for  <  {
			 |= byte(>>) << 
			 := min(8-, -)
			 += 
			 += 
			if  == 8 {
				 = append(, )
				 = 0
				 = 0
			}
		}
	}
	if  != 0 {
		panic("mlkem: internal error: bitsFilled != 0")
	}
	return 
}

// ringDecodeAndDecompress decodes an encoding of a ring element where
// each d bits are mapped to an equidistant distribution.
//
// It implements ByteDecode, according to FIPS 203, Algorithm 6,
// followed by Decompress, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress( []byte,  uint8) ringElement {
	var  ringElement
	var  uint8
	for  := 0;  < n; ++ {
		var  uint16
		var  uint8
		for  <  {
			 |= uint16([0]>>) << 
			 &= (1 << ) - 1
			 := min(8-, -)
			 += 
			 += 
			if  == 8 {
				 = [1:]
				 = 0
			}
		}
		[] = fieldElement(decompress(, ))
	}
	if len() != 0 {
		panic("mlkem: internal error: leftover bytes")
	}
	return 
}

// ringCompressAndEncode5 appends a 160-byte encoding of a ring element to s,
// compressing eight coefficients per five bytes.
//
// It implements Compress₅, according to FIPS 203, Definition 4.7,
// followed by ByteEncode₅, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode5( []byte,  ringElement) []byte {
	return ringCompressAndEncode(, , 5)
}

// ringDecodeAndDecompress5 decodes a 160-byte encoding of a ring element where
// each five bits are mapped to an equidistant distribution.
//
// It implements ByteDecode₅, according to FIPS 203, Algorithm 6,
// followed by Decompress₅, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress5( *[encodingSize5]byte) ringElement {
	return ringDecodeAndDecompress([:], 5)
}

// ringCompressAndEncode11 appends a 352-byte encoding of a ring element to s,
// compressing eight coefficients per eleven bytes.
//
// It implements Compress₁₁, according to FIPS 203, Definition 4.7,
// followed by ByteEncode₁₁, according to FIPS 203, Algorithm 5.
func ringCompressAndEncode11( []byte,  ringElement) []byte {
	return ringCompressAndEncode(, , 11)
}

// ringDecodeAndDecompress11 decodes a 352-byte encoding of a ring element where
// each eleven bits are mapped to an equidistant distribution.
//
// It implements ByteDecode₁₁, according to FIPS 203, Algorithm 6,
// followed by Decompress₁₁, according to FIPS 203, Definition 4.8.
func ringDecodeAndDecompress11( *[encodingSize11]byte) ringElement {
	return ringDecodeAndDecompress([:], 11)
}

// samplePolyCBD draws a ringElement from the special Dη distribution given a
// stream of random bytes generated by the PRF function, according to FIPS 203,
// Algorithm 8 and Definition 4.3.
func samplePolyCBD( []byte,  byte) ringElement {
	 := sha3.NewShake256()
	.Write()
	.Write([]byte{})
	 := make([]byte, 64*2) // η = 2
	.Read()

	// SamplePolyCBD simply draws four (2η) bits for each coefficient, and adds
	// the first two and subtracts the last two.

	var  ringElement
	for  := 0;  < n;  += 2 {
		 := [/2]
		, , ,  := >>7, >>6&1, >>5&1, >>4&1
		, , ,  := >>3&1, >>2&1, >>1&1, &1
		[] = fieldSub(fieldElement(+), fieldElement(+))
		[+1] = fieldSub(fieldElement(+), fieldElement(+))
	}
	return 
}

// nttElement is an NTT representation, an element of T_q, represented as an
// array according to FIPS 203, Section 2.4.4. It has the same underlying array
// type as ringElement, so values convert directly between the two types.
type nttElement [n]fieldElement

// gammas are the values ζ^(2·BitRev7(i)+1) mod q for each index i, according
// to FIPS 203, Appendix A (with negative values reduced to positive).
//
// nttMul reads gammas[i/2] when multiplying the coefficient pair (i, i+1).
var gammas = [128]fieldElement{17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, 2110, 1219, 2935, 394, 885, 2444, 2154, 1175}

// nttMul multiplies two nttElements.
//
// It implements MultiplyNTTs, according to FIPS 203, Algorithm 11.
func nttMul(,  nttElement) nttElement {
	var  nttElement
	// We use i += 2 for bounds check elimination. See https://go.dev/issue/66826.
	for  := 0;  < 256;  += 2 {
		,  := [], [+1]
		,  := [], [+1]
		[] = fieldAddMul(, , fieldMul(, ), gammas[/2])
		[+1] = fieldAddMul(, , , )
	}
	return 
}

// zetas are the values ζ^BitRev7(k) mod q for each index k, according to FIPS
// 203, Appendix A.
//
// ntt reads the entries starting at index 1 in increasing order, and
// inverseNTT reads them starting at index 127 in decreasing order.
var zetas = [128]fieldElement{1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154}

// ntt maps a ringElement to its nttElement representation.
//
// It implements NTT, according to FIPS 203, Algorithm 9.
func ntt( ringElement) nttElement {
	 := 1
	for  := 128;  >= 2;  /= 2 {
		for  := 0;  < 256;  += 2 *  {
			 := zetas[]
			++
			// Bounds check elimination hint.
			,  := [:+], [+:++]
			for  := 0;  < ; ++ {
				 := fieldMul(, [])
				[] = fieldSub([], )
				[] = fieldAdd([], )
			}
		}
	}
	return nttElement()
}

// inverseNTT maps a nttElement back to the ringElement it represents.
//
// It implements NTT⁻¹, according to FIPS 203, Algorithm 10.
func inverseNTT( nttElement) ringElement {
	 := 127
	for  := 2;  <= 128;  *= 2 {
		for  := 0;  < 256;  += 2 *  {
			 := zetas[]
			--
			// Bounds check elimination hint.
			,  := [:+], [+:++]
			for  := 0;  < ; ++ {
				 := []
				[] = fieldAdd(, [])
				[] = fieldMulSub(, [], )
			}
		}
	}
	for  := range  {
		[] = fieldMul([], 3303) // 3303 = 128⁻¹ mod q
	}
	return ringElement()
}

// sampleNTT draws a uniformly random nttElement from a stream of uniformly
// random bytes generated by the XOF function, according to FIPS 203,
// Algorithm 7.
func sampleNTT( []byte, ,  byte) nttElement {
	 := sha3.NewShake128()
	.Write()
	.Write([]byte{, })

	// SampleNTT essentially draws 12 bits at a time from r, interprets them in
	// little-endian, and rejects values higher than q, until it drew 256
	// values. (The rejection rate is approximately 19%.)
	//
	// To do this from a bytes stream, it draws three bytes at a time, and
	// splits them into two uint16 appropriately masked.
	//
	//               r₀              r₁              r₂
	//       |- - - - - - - -|- - - - - - - -|- - - - - - - -|
	//
	//               Uint16(r₀ || r₁)
	//       |- - - - - - - - - - - - - - - -|
	//       |- - - - - - - - - - - -|
	//                   d₁
	//
	//                                Uint16(r₁ || r₂)
	//                       |- - - - - - - - - - - - - - - -|
	//                               |- - - - - - - - - - - -|
	//                                           d₂
	//
	// Note that in little-endian, the rightmost bits are the most significant
	// bits (dropped with a mask) and the leftmost bits are the least
	// significant bits (dropped with a right shift).

	var  nttElement
	var  int        // index into a
	var  [24]byte // buffered reads from B
	 := len()  // index into buf, starts in a "buffer fully consumed" state
	for {
		if  >= len() {
			.Read([:])
			 = 0
		}
		 := byteorder.LEUint16([:]) & 0b1111_1111_1111
		 := byteorder.LEUint16([+1:]) >> 4
		 += 3
		if  < q {
			[] = fieldElement()
			++
		}
		if  >= len() {
			break
		}
		if  < q {
			[] = fieldElement()
			++
		}
		if  >= len() {
			break
		}
	}
	return 
}