// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file provides the generic implementation of Sum and MAC. Other files
// might provide optimized assembly implementations of some of this code.

package poly1305

import 

// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
// for a 64 bytes message is approximately
//
//     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5
//
// for some secret r and s. It can be computed sequentially like
//
//     for len(msg) > 0:
//         h += read(msg, 16)
//         h *= r
//         h %= 2¹³⁰ - 5
//     return h + s
//
// All the complexity is about doing performant constant-time math on numbers
// larger than any available numeric type.

func sumGeneric( *[TagSize]byte,  []byte,  *[32]byte) {
	 := newMACGeneric()
	.Write()
	.Sum()
}

func newMACGeneric( *[32]byte) macGeneric {
	 := macGeneric{}
	initialize(, &.macState)
	return 
}

// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
	// can grow larger during and after rounds. It must, however, remain below
	// 2 * (2¹³⁰ - 5).
	h [3]uint64
	// r and s are the private key components.
	r [2]uint64
	s [2]uint64
}

type macGeneric struct {
	macState

	buffer [TagSize]byte
	offset int
}

// Write splits the incoming message into TagSize chunks, and passes them to
// update. It buffers incomplete chunks.
func ( *macGeneric) ( []byte) (int, error) {
	 := len()
	if .offset > 0 {
		 := copy(.buffer[.offset:], )
		if .offset+ < TagSize {
			.offset += 
			return , nil
		}
		 = [:]
		.offset = 0
		updateGeneric(&.macState, .buffer[:])
	}
	if  := len() - (len() % TagSize);  > 0 {
		updateGeneric(&.macState, [:])
		 = [:]
	}
	if len() > 0 {
		.offset += copy(.buffer[.offset:], )
	}
	return , nil
}

// Sum flushes the last incomplete chunk from the buffer, if any, and generates
// the MAC output. It does not modify its state, in order to allow for multiple
// calls to Sum, even if no Write is allowed after Sum.
func ( *macGeneric) ( *[TagSize]byte) {
	 := .macState
	if .offset > 0 {
		updateGeneric(&, .buffer[:.offset])
	}
	finalize(, &.h, &.s)
}

// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
// clears some bits of the secret coefficient to make it possible to implement
// multiplication more efficiently.
const (
	rMask0 = 0x0FFFFFFC0FFFFFFF
	rMask1 = 0x0FFFFFFC0FFFFFFC
)

// initialize loads the 256-bit key into the two 128-bit secret values r and s.
func initialize( *[32]byte,  *macState) {
	.r[0] = binary.LittleEndian.Uint64([0:8]) & rMask0
	.r[1] = binary.LittleEndian.Uint64([8:16]) & rMask1
	.s[0] = binary.LittleEndian.Uint64([16:24])
	.s[1] = binary.LittleEndian.Uint64([24:32])
}

// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
// bits.Mul64 and bits.Add64 intrinsics.
type uint128 struct {
	lo, hi uint64
}

func mul64(,  uint64) uint128 {
	,  := bitsMul64(, )
	return uint128{, }
}

func add128(,  uint128) uint128 {
	,  := bitsAdd64(.lo, .lo, 0)
	,  := bitsAdd64(.hi, .hi, )
	if  != 0 {
		panic("poly1305: unexpected overflow")
	}
	return uint128{, }
}

func shiftRightBy2( uint128) uint128 {
	.lo = .lo>>2 | (.hi&3)<<62
	.hi = .hi >> 2
	return 
}

// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
// 128 bits of message, it computes
//
//	h₊ = (h + m) * r  mod  2¹³⁰ - 5
//
// If the msg length is not a multiple of TagSize, it assumes the last
// incomplete chunk is the final one.
func updateGeneric( *macState,  []byte) {
	, ,  := .h[0], .h[1], .h[2]
	,  := .r[0], .r[1]

	for len() > 0 {
		var  uint64

		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
		// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
		// reduced at the end of the multiplication below.
		//
		// The spec requires us to set a bit just above the message size, not to
		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
		// add 1 to the most significant (2¹²⁸) limb, h2.
		if len() >= TagSize {
			,  = bitsAdd64(, binary.LittleEndian.Uint64([0:8]), 0)
			,  = bitsAdd64(, binary.LittleEndian.Uint64([8:16]), )
			 +=  + 1

			 = [TagSize:]
		} else {
			var  [TagSize]byte
			copy([:], )
			[len()] = 1

			,  = bitsAdd64(, binary.LittleEndian.Uint64([0:8]), 0)
			,  = bitsAdd64(, binary.LittleEndian.Uint64([8:16]), )
			 += 

			 = nil
		}

		// Multiplication of big number limbs is similar to elementary school
		// columnar multiplication. Instead of digits, there are 64-bit limbs.
		//
		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
		//
		//                        h2    h1    h0  x
		//                              r1    r0  =
		//                       ----------------
		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
		//            +   h2r1  h1r1  h0r1
		//               ------------------------
		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
		//               ------------------------
		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
		//     +         m3.lo m2.lo m1.lo m0.lo
		//        -------------------------------
		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
		//
		// The main difference from pen-and-paper multiplication is that we do
		// carry propagation in a separate step, as if we wrote two digit sums
		// at first (the 128-bit limbs), and then carried the tens all at once.

		 := mul64(, )
		 := mul64(, )
		 := mul64(, )
		 := mul64(, )
		 := mul64(, )
		 := mul64(, )

		// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
		// top 4 bits cleared by rMask{0,1}, we know that their product is not going
		// to overflow 64 bits, so we can ignore the high part of the products.
		//
		// This also means that the product doesn't have a fifth limb (t4).
		if .hi != 0 {
			panic("poly1305: unexpected overflow")
		}
		if .hi != 0 {
			panic("poly1305: unexpected overflow")
		}

		 := 
		 := add128(, ) // These two additions don't overflow thanks again
		 := add128(, ) // to the 4 masked bits at the top of r0 and r1.
		 := 

		 := .lo
		,  := bitsAdd64(.lo, .hi, 0)
		,  := bitsAdd64(.lo, .hi, )
		,  := bitsAdd64(.lo, .hi, )

		// Now we have the result as 4 64-bit limbs, and we need to reduce it
		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
		// a cheap partial reduction according to the reduction identity
		//
		//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5
		//
		// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
		// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
		// assumptions we make about h in the rest of the code.
		//
		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23

		// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
		// Note that the carry bits are effectively shifted left by 2, in other
		// words, cc = c * 4 for the c in the reduction identity.
		, ,  = , , &maskLow2Bits
		 := uint128{ & maskNotLow2Bits, }

		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.

		,  = bitsAdd64(, .lo, 0)
		,  = bitsAdd64(, .hi, )
		 += 

		 = shiftRightBy2()

		,  = bitsAdd64(, .lo, 0)
		,  = bitsAdd64(, .hi, )
		 += 

		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
		//
		//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
	}

	.h[0], .h[1], .h[2] = , , 
}

const (
	maskLow2Bits    uint64 = 0x0000000000000003
	maskNotLow2Bits uint64 = ^maskLow2Bits
)

// select64 returns x if v == 1 and y if v == 0, in constant time.
func select64(, ,  uint64) uint64 { return ^(-1)& | (-1)& }

// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
const (
	p0 = 0xFFFFFFFFFFFFFFFB
	p1 = 0xFFFFFFFFFFFFFFFF
	p2 = 0x0000000000000003
)

// finalize completes the modular reduction of h and computes
//
//	out = h + s  mod  2¹²⁸
func finalize( *[TagSize]byte,  *[3]uint64,  *[2]uint64) {
	, ,  := [0], [1], [2]

	// After the partial reduction in updateGeneric, h might be more than
	// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
	// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
	// result if the subtraction underflows, and t otherwise.

	,  := bitsSub64(, p0, 0)
	,  := bitsSub64(, p1, )
	_,  = bitsSub64(, p2, )

	// h = h if h < p else h - p
	 = select64(, , )
	 = select64(, , )

	// Finally, we compute the last Poly1305 step
	//
	//     tag = h + s  mod  2¹²⁸
	//
	// by just doing a wide addition with the 128 low bits of h and discarding
	// the overflow.
	,  := bitsAdd64(, [0], 0)
	, _ = bitsAdd64(, [1], )

	binary.LittleEndian.PutUint64([0:8], )
	binary.LittleEndian.PutUint64([8:16], )
}