// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
//
// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
//
//	const1 const2 const3 const4
//	seed   seed   seed   seed
//	seed   seed   seed   seed
//	counter64     0      0
//
// We use the same constants as ChaCha20 does, a random seed,
// and a counter. Running ChaCha8 on this input produces
// a 4x4 matrix of pseudo-random values with as much entropy
// as the seed.
//
// Given SIMD registers that can hold N uint32s, it is possible
// to run N ChaCha8 block transformations in parallel by filling
// the first register with N copies of const1, the second
// with N copies of const2, and so on, and then running the operations.
//
// Each iteration of ChaCha8Rand operates over 32 bytes of input and
// produces 992 bytes of RNG output, plus 32 bytes of input for the next
// iteration.
//
// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
// produce 1024 bytes of output (16 blocks, with counters 0 to 15).
// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574 are subtracted from the 32-bit little-endian words at
// position 0, 1, 2, and 3 respectively, and an increasing counter
// starting at zero is subtracted from each word at position 12. Then,
// this stream is permuted such that for each sequence of four blocks,
// first we output the first four bytes of each block, then the next four
// bytes of each block, and so on. Finally, the last 32 bytes of output
// are used as the input of the next iteration, and the remaining 992
// bytes are the RNG output.
//
// See https://c2sp.org/chacha8rand for additional details.
//
// Normal ChaCha20 implementations for encryption use this same
// parallelism but then have to deinterlace the results so that
// it appears the blocks were generated separately. For the purposes
// of generating random numbers, the interlacing is fine.
// We are simply locked in to preserving the 4-way interlacing
// in any future optimizations.
package chacha8rand

import (
	"internal/goarch"
	"unsafe"
)

// setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
// Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
// the same way they would be in a 4-way SIMD implementation.
//
// Row r of b32 holds word r of each of the four blocks. Rows 0-3 receive
// the ChaCha constants, rows 4-11 receive the eight 32-bit halves of seed
// (low half first), row 12 receives counter+0 through counter+3 (one value
// per block), and rows 13-15 are zeroed.
func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
	// Convert to uint64 to do half as many stores to memory.
	b := (*[16][2]uint64)(unsafe.Pointer(b32))

	// Constants; same as in ChaCha20: "expand 32-byte k"
	b[0][0] = 0x61707865_61707865
	b[0][1] = 0x61707865_61707865

	b[1][0] = 0x3320646e_3320646e
	b[1][1] = 0x3320646e_3320646e

	b[2][0] = 0x79622d32_79622d32
	b[2][1] = 0x79622d32_79622d32

	b[3][0] = 0x6b206574_6b206574
	b[3][1] = 0x6b206574_6b206574

	// Seed values.
	// Each 32-bit seed word is duplicated into both halves of a uint64
	// so that two stores fill all four lanes of a row.
	var x64 uint64
	var x uint32

	x = uint32(seed[0])
	x64 = uint64(x)<<32 | uint64(x)
	b[4][0] = x64
	b[4][1] = x64

	x = uint32(seed[0] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[5][0] = x64
	b[5][1] = x64

	x = uint32(seed[1])
	x64 = uint64(x)<<32 | uint64(x)
	b[6][0] = x64
	b[6][1] = x64

	x = uint32(seed[1] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[7][0] = x64
	b[7][1] = x64

	x = uint32(seed[2])
	x64 = uint64(x)<<32 | uint64(x)
	b[8][0] = x64
	b[8][1] = x64

	x = uint32(seed[2] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[9][0] = x64
	b[9][1] = x64

	x = uint32(seed[3])
	x64 = uint64(x)<<32 | uint64(x)
	b[10][0] = x64
	b[10][1] = x64

	x = uint32(seed[3] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[11][0] = x64
	b[11][1] = x64

	// Counters.
	// The two uint32 lanes packed in each uint64 land in memory in an
	// order that depends on byte order, so the halves are swapped on
	// big-endian systems to keep lane k holding counter+k.
	if goarch.BigEndian {
		b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
		b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
	} else {
		b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
		b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
	}

	// Zeros.
	b[13][0] = 0
	b[13][1] = 0
	b[14][0] = 0
	b[14][1] = 0

	b[15][0] = 0
	b[15][1] = 0
}

// _ is never called; it exists only so that the compiler rejects the
// package if block (declared elsewhere in the package) and block_generic
// ever stop having the same type.
func _() {
	// block and block_generic must have same type
	x := block
	x = block_generic
	_ = x
}

// block_generic is the non-assembly block implementation,
// for use on systems without special assembly.
// Even on such systems, it is quite fast: on GOOS=386,
// ChaCha8 using this code generates random values faster than PCG-DXSM.
//
// It fills buf, viewed as 4 interlaced 16-word ChaCha8 blocks, from seed
// and counter (see setup), then runs the 8-round ChaCha8 transformation
// on each of the four blocks in place.
func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
	b := (*[16][4]uint32)(unsafe.Pointer(buf))

	setup(seed, b, counter)

	for i := range b[0] {
		// Load block i from b[*][i] into local variables.
		b0 := b[0][i]
		b1 := b[1][i]
		b2 := b[2][i]
		b3 := b[3][i]
		b4 := b[4][i]
		b5 := b[5][i]
		b6 := b[6][i]
		b7 := b[7][i]
		b8 := b[8][i]
		b9 := b[9][i]
		b10 := b[10][i]
		b11 := b[11][i]
		b12 := b[12][i]
		b13 := b[13][i]
		b14 := b[14][i]
		b15 := b[15][i]

		// 4 iterations of eight quarter-rounds each is 8 rounds
		for round := 0; round < 4; round++ {
			// Column round.
			b0, b4, b8, b12 = qr(b0, b4, b8, b12)
			b1, b5, b9, b13 = qr(b1, b5, b9, b13)
			b2, b6, b10, b14 = qr(b2, b6, b10, b14)
			b3, b7, b11, b15 = qr(b3, b7, b11, b15)

			// Diagonal round.
			b0, b5, b10, b15 = qr(b0, b5, b10, b15)
			b1, b6, b11, b12 = qr(b1, b6, b11, b12)
			b2, b7, b8, b13 = qr(b2, b7, b8, b13)
			b3, b4, b9, b14 = qr(b3, b4, b9, b14)
		}

		// Store block i back into b[*][i].
		// Add b4..b11 back to the original key material,
		// like in ChaCha20, to avoid trivial invertibility.
		// There is no entropy in b0..b3 and b12..b15
		// so we can skip the additions and save some time.
		b[0][i] = b0
		b[1][i] = b1
		b[2][i] = b2
		b[3][i] = b3
		b[4][i] += b4
		b[5][i] += b5
		b[6][i] += b6
		b[7][i] += b7
		b[8][i] += b8
		b[9][i] += b9
		b[10][i] += b10
		b[11][i] += b11
		b[12][i] = b12
		b[13][i] = b13
		b[14][i] = b14
		b[15][i] = b15
	}

	if goarch.BigEndian {
		// On a big-endian system, reading the uint32 pairs as uint64s
		// will word-swap them compared to little-endian, so we word-swap
		// them here first to make the next swap get the right answer.
		for i, x := range buf {
			buf[i] = x>>32 | x<<32
		}
	}
}

// qr is the (inlinable) ChaCha8 quarter round.
// It mixes the four input words with add, xor and left-rotations
// by 16, 12, 8 and 7 bits, and returns the updated a, b, c, d.
func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
	a += b
	d ^= a
	d = d<<16 | d>>16
	c += d
	b ^= c
	b = b<<12 | b>>20
	a += b
	d ^= a
	d = d<<8 | d>>24
	c += d
	b ^= c
	b = b<<7 | b>>25
	return a, b, c, d
}