````// Copyright 2018 The Go Authors. All rights reserved.`
`// Use of this source code is governed by a BSD-style`
`// license that can be found in the LICENSE file.`

`// This file provides the generic implementation of Sum and MAC. Other files`
`// might provide optimized assembly implementations of some of this code.`

`package poly1305`

`import "encoding/binary"`

`// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag`
`// for a 64 bytes message is approximately`
`//`
`//     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5`
`//`
`// for some secret r and s. It can be computed sequentially like`
`//`
`//     for len(msg) > 0:`
`//         h += read(msg, 16)`
`//         h *= r`
`//         h %= 2¹³⁰ - 5`
`//     return h + s`
`//`
`// All the complexity is about doing performant constant-time math on numbers`
`// larger than any available numeric type.`

`func sumGeneric(out *[TagSize]byte, msg []byte, key *byte) {`
`	h := newMACGeneric(key)`
`	h.Write(msg)`
`	h.Sum(out)`
`}`

`func newMACGeneric(key *byte) macGeneric {`
`	m := macGeneric{}`
`	initialize(key, &m.macState)`
`	return m`
`}`

`// macState holds numbers in saturated 64-bit little-endian limbs. That is,`
`// the value of [x0, x1, x2] is x + x * 2⁶⁴ + x * 2¹²⁸.`
`type macState struct {`
`	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but`
`	// can grow larger during and after rounds. It must, however, remain below`
`	// 2 * (2¹³⁰ - 5).`
`	h uint64`
`	// r and s are the private key components.`
`	r uint64`
`	s uint64`
`}`

`type macGeneric struct {`
`	macState`

`	buffer [TagSize]byte`
`	offset int`
`}`

`// Write splits the incoming message into TagSize chunks, and passes them to`
`// update. It buffers incomplete chunks.`
`func (h *macGeneric) Write(p []byte) (int, error) {`
`	nn := len(p)`
`	if h.offset > 0 {`
`		n := copy(h.buffer[h.offset:], p)`
`		if h.offset+n < TagSize {`
`			h.offset += n`
`			return nn, nil`
`		}`
`		p = p[n:]`
`		h.offset = 0`
`		updateGeneric(&h.macState, h.buffer[:])`
`	}`
`	if n := len(p) - (len(p) % TagSize); n > 0 {`
`		updateGeneric(&h.macState, p[:n])`
`		p = p[n:]`
`	}`
`	if len(p) > 0 {`
`		h.offset += copy(h.buffer[h.offset:], p)`
`	}`
`	return nn, nil`
`}`

`// Sum flushes the last incomplete chunk from the buffer, if any, and generates`
`// the MAC output. It does not modify its state, in order to allow for multiple`
`// calls to Sum, even if no Write is allowed after Sum.`
`func (h *macGeneric) Sum(out *[TagSize]byte) {`
`	state := h.macState`
`	if h.offset > 0 {`
`		updateGeneric(&state, h.buffer[:h.offset])`
`	}`
`	finalize(out, &state.h, &state.s)`
`}`

`// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It`
`// clears some bits of the secret coefficient to make it possible to implement`
`// multiplication more efficiently.`
`const (`
`	rMask0 = 0x0FFFFFFC0FFFFFFF`
`	rMask1 = 0x0FFFFFFC0FFFFFFC`
`)`

`// initialize loads the 256-bit key into the two 128-bit secret values r and s.`
`func initialize(key *byte, m *macState) {`
`	m.r = binary.LittleEndian.Uint64(key[0:8]) & rMask0`
`	m.r = binary.LittleEndian.Uint64(key[8:16]) & rMask1`
`	m.s = binary.LittleEndian.Uint64(key[16:24])`
`	m.s = binary.LittleEndian.Uint64(key[24:32])`
`}`

`// uint128 holds a 128-bit number as two 64-bit limbs, for use with the`
`// bits.Mul64 and bits.Add64 intrinsics.`
`type uint128 struct {`
`	lo, hi uint64`
`}`

`func mul64(a, b uint64) uint128 {`
`	hi, lo := bitsMul64(a, b)`
`	return uint128{lo, hi}`
`}`

`func add128(a, b uint128) uint128 {`
`	lo, c := bitsAdd64(a.lo, b.lo, 0)`
`	hi, c := bitsAdd64(a.hi, b.hi, c)`
`	if c != 0 {`
`		panic("poly1305: unexpected overflow")`
`	}`
`	return uint128{lo, hi}`
`}`

`func shiftRightBy2(a uint128) uint128 {`
`	a.lo = a.lo>>2 | (a.hi&3)<<62`
`	a.hi = a.hi >> 2`
`	return a`
`}`

`// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of`
`// 128 bits of message, it computes`
`//`
`//	h₊ = (h + m) * r  mod  2¹³⁰ - 5`
`//`
`// If the msg length is not a multiple of TagSize, it assumes the last`
`// incomplete chunk is the final one.`
`func updateGeneric(state *macState, msg []byte) {`
`	h0, h1, h2 := state.h, state.h, state.h`
`	r0, r1 := state.r, state.r`

`	for len(msg) > 0 {`
`		var c uint64`

`		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.`
`		// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially`
`		// reduced at the end of the multiplication below.`
`		//`
`		// The spec requires us to set a bit just above the message size, not to`
`		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just`
`		// add 1 to the most significant (2¹²⁸) limb, h2.`
`		if len(msg) >= TagSize {`
`			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)`
`			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)`
`			h2 += c + 1`

`			msg = msg[TagSize:]`
`		} else {`
`			var buf [TagSize]byte`
`			copy(buf[:], msg)`
`			buf[len(msg)] = 1`

`			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)`
`			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)`
`			h2 += c`

`			msg = nil`
`		}`

`		// Multiplication of big number limbs is similar to elementary school`
`		// columnar multiplication. Instead of digits, there are 64-bit limbs.`
`		//`
`		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.`
`		//`
`		//                        h2    h1    h0  x`
`		//                              r1    r0  =`
`		//                       ----------------`
`		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products`
`		//            +   h2r1  h1r1  h0r1`
`		//               ------------------------`
`		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs`
`		//               ------------------------`
`		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation`
`		//     +         m3.lo m2.lo m1.lo m0.lo`
`		//        -------------------------------`
`		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs`
`		//`
`		// The main difference from pen-and-paper multiplication is that we do`
`		// carry propagation in a separate step, as if we wrote two digit sums`
`		// at first (the 128-bit limbs), and then carried the tens all at once.`

`		h0r0 := mul64(h0, r0)`
`		h1r0 := mul64(h1, r0)`
`		h2r0 := mul64(h2, r0)`
`		h0r1 := mul64(h0, r1)`
`		h1r1 := mul64(h1, r1)`
`		h2r1 := mul64(h2, r1)`

`		// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their`
`		// top 4 bits cleared by rMask{0,1}, we know that their product is not going`
`		// to overflow 64 bits, so we can ignore the high part of the products.`
`		//`
`		// This also means that the product doesn't have a fifth limb (t4).`
`		if h2r0.hi != 0 {`
`			panic("poly1305: unexpected overflow")`
`		}`
`		if h2r1.hi != 0 {`
`			panic("poly1305: unexpected overflow")`
`		}`

`		m0 := h0r0`
`		m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again`
`		m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.`
`		m3 := h2r1`

`		t0 := m0.lo`
`		t1, c := bitsAdd64(m1.lo, m0.hi, 0)`
`		t2, c := bitsAdd64(m2.lo, m1.hi, c)`
`		t3, _ := bitsAdd64(m3.lo, m2.hi, c)`

`		// Now we have the result as 4 64-bit limbs, and we need to reduce it`
`		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do`
`		// a cheap partial reduction according to the reduction identity`
`		//`
`		//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5`
`		//`
`		// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is`
`		// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the`
`		// assumptions we make about h in the rest of the code.`
`		//`
`		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23`

`		// We split the final result at the 2¹³⁰ mark into h and cc, the carry.`
`		// Note that the carry bits are effectively shifted left by 2, in other`
`		// words, cc = c * 4 for the c in the reduction identity.`
`		h0, h1, h2 = t0, t1, t2&maskLow2Bits`
`		cc := uint128{t2 & maskNotLow2Bits, t3}`

`		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.`

`		h0, c = bitsAdd64(h0, cc.lo, 0)`
`		h1, c = bitsAdd64(h1, cc.hi, c)`
`		h2 += c`

`		cc = shiftRightBy2(cc)`

`		h0, c = bitsAdd64(h0, cc.lo, 0)`
`		h1, c = bitsAdd64(h1, cc.hi, c)`
`		h2 += c`

`		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most`
`		//`
`		//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1`
`	}`

`	state.h, state.h, state.h = h0, h1, h2`
`}`

`const (`
`	maskLow2Bits    uint64 = 0x0000000000000003`
`	maskNotLow2Bits uint64 = ^maskLow2Bits`
`)`

`// select64 returns x if v == 1 and y if v == 0, in constant time.`
`func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }`

`// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.`
`const (`
`	p0 = 0xFFFFFFFFFFFFFFFB`
`	p1 = 0xFFFFFFFFFFFFFFFF`
`	p2 = 0x0000000000000003`
`)`

`// finalize completes the modular reduction of h and computes`
`//`
`//	out = h + s  mod  2¹²⁸`
`func finalize(out *[TagSize]byte, h *uint64, s *uint64) {`
`	h0, h1, h2 := h, h, h`

`	// After the partial reduction in updateGeneric, h might be more than`
`	// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction`
`	// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the`
`	// result if the subtraction underflows, and t otherwise.`

`	hMinusP0, b := bitsSub64(h0, p0, 0)`
`	hMinusP1, b := bitsSub64(h1, p1, b)`
`	_, b = bitsSub64(h2, p2, b)`

`	// h = h if h < p else h - p`
`	h0 = select64(b, h0, hMinusP0)`
`	h1 = select64(b, h1, hMinusP1)`

`	// Finally, we compute the last Poly1305 step`
`	//`
`	//     tag = h + s  mod  2¹²⁸`
`	//`
`	// by just doing a wide addition with the 128 low bits of h and discarding`
`	// the overflow.`
`	h0, c := bitsAdd64(h0, s, 0)`
`	h1, _ = bitsAdd64(h1, s, c)`

`	binary.LittleEndian.PutUint64(out[0:8], h0)`
`	binary.LittleEndian.PutUint64(out[8:16], h1)`
`}`
```