// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ	(index*8)(SI), AX; \
	BSWAPQ	AX; \
	MOVQ	AX, (index*8)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	RORQ	$19, AX; \
	MOVQ	CX, DX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	MOVQ	((index-15)*8)(BP), BX; \
	XORQ	CX, AX; \
	MOVQ	BX, CX; \
	XORQ	DX, AX; \
	RORQ	$1, BX; \
	MOVQ	CX, DX; \
	SHRQ	$7, DX; \
	RORQ	$8, CX; \
	ADDQ	((index-7)*8)(BP), AX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ	$const, DX; \
	ADDQ	AX, h; \
	MOVQ	e, AX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	RORQ	$14, AX; \
	MOVQ	e, DX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	MOVQ	e, CX; \
	RORQ	$41, DX; \
	ANDQ	f, CX; \
	XORQ	AX, DX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ADDQ	DX, h; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	MOVQ	c, BX; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	ANDQ	b, BX; \
	RORQ	$34, DX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	DX, DI; \
	XORQ	CX, BX; \
	MOVQ	a, DX; \
	MOVQ	b, CX; \
	RORQ	$39, DX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	XORQ	DX, DI; \
	ADDQ	DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ	BX, h; \
	ADDQ	AX, d; \
	ADDQ	AX, h

#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

TEXT ·blockAMD64(SB),0,$648-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$7, DX
	SHLQ	$7, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 640(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVQ	(0*8)(BP), R8		// a = H0
	MOVQ	(1*8)(BP), R9		// b = H1
	MOVQ	(2*8)(BP), R10		// c = H2
	MOVQ	(3*8)(BP), R11		// d = H3
	MOVQ	(4*8)(BP), R12		// e = H4
	MOVQ	(5*8)(BP), R13		// f = H5
	MOVQ	(6*8)(BP), R14		// g = H6
	MOVQ	(7*8)(BP), R15		// h = H7

loop:
	MOVQ	SP, BP			// message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDQ	(0*8)(BP), R8	// H0 = a + H0
	MOVQ	R8, (0*8)(BP)
	ADDQ	(1*8)(BP), R9	// H1 = b + H1
	MOVQ	R9, (1*8)(BP)
	ADDQ	(2*8)(BP), R10	// H2 = c + H2
	MOVQ	R10, (2*8)(BP)
	ADDQ	(3*8)(BP), R11	// H3 = d + H3
	MOVQ	R11, (3*8)(BP)
	ADDQ	(4*8)(BP), R12	// H4 = e + H4
	MOVQ	R12, (4*8)(BP)
	ADDQ	(5*8)(BP), R13	// H5 = f + H5
	MOVQ	R13, (5*8)(BP)
	ADDQ	(6*8)(BP), R14	// H6 = g + H6
	MOVQ	R14, (6*8)(BP)
	ADDQ	(7*8)(BP), R15	// H7 = h + H7
	MOVQ	R15, (7*8)(BP)

	ADDQ	$128, SI
	CMPQ	SI, 640(SP)
	JB	loop

end:
	RET

// Version below is based on "Fast SHA512 Implementations on Intel
// Architecture Processors" White-paper
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>

#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

#define addm(p1, p2) \
	ADDQ p1, p2; \
	MOVQ p2, p1

#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU p2, p1;    \
	VPSHUFB p3, p1, p1

#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR   $RVAL, YSRC2, YDST, YDST

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32

TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX

	SHRQ $7, DX
	SHLQ $7, DX

	JZ   done_hash
	ADDQ DI, DX
	MOVQ DX, frame_INPEND(SP)

	MOVQ (0*8)(SI), AX
	MOVQ (1*8)(SI), BX
	MOVQ (2*8)(SI), CX
	MOVQ (3*8)(SI), R8
	MOVQ (4*8)(SI), DX
	MOVQ (5*8)(SI), R9
	MOVQ (6*8)(SI), R10
	MOVQ (7*8)(SI), R11

	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9

loop0:
	MOVQ ·_K+0(SB), BP

	// byte swap first 16 dwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ DI, frame_INP(SP)

	// schedule 64 input dwords, by doing 12 rounds of 4 each
	MOVQ $4, frame_SRND(SP)

loop1:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI
	MOVQ  R9, R15
	RORXQ $34, AX, R12

	XORQ  R14, R13
	XORQ  R10, R15
	RORXQ $14, DX, R14

	ANDQ  DX, R15
	XORQ  R14, R13
	RORXQ $39, AX, R14
	ADDQ  R11, R8

	ANDQ  BX, DI
	XORQ  R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y4

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y7, Y7, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ  1*8+frame_YFER(SP), R10
	ORQ   BX, DI

	MOVQ  DX, R15
	RORXQ $34, R11, R12
	XORQ  R14, R13
	XORQ  R9, R15

	RORXQ $14, R8, R14
	XORQ  R14, R13
	RORXQ $39, R11, R14
	ANDQ  R8, R15
	ADDQ  R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ  R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y4, Y4

	VPSRLQ $6, Y4, Y8

	MOVQ  R10, DI
	RORXQ $41, CX, R13
	ADDQ  2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ   AX, DI
	MOVQ  R8, R15
	XORQ  DX, R15

	RORXQ $34, R10, R12
	XORQ  R14, R13
	ANDQ  CX, R15

	RORXQ $14, CX, R14
	ADDQ  R9, BX
	ANDQ  R11, DI

	XORQ  R14, R13
	RORXQ $39, R10, R14
	XORQ  DX, R15

	XORQ  R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y4, Y3
	VPSLLQ $(64-19), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y4, Y3
	VPSLLQ $(64-61), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y4, Y4

	MOVQ  R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ  3*8+frame_YFER(SP), DX
	ORQ   R11, DI

	MOVQ  CX, R15
	RORXQ $34, R9, R12
	XORQ  R14, R13
	XORQ  R8, R15

	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DX, AX
	ANDQ  R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ  R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI
	MOVQ  BX, R15
	RORXQ $34, DX, R12

	XORQ  R14, R13
	XORQ  CX, R15
	RORXQ $14, AX, R14

	ANDQ  AX, R15
	XORQ  R14, R13
	RORXQ $39, DX, R14
	ADDQ  R8, R11

	ANDQ  R9, DI
	XORQ  R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y5

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y4, Y4, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ  1*8+frame_YFER(SP), CX
	ORQ   R9, DI

	MOVQ  AX, R15
	RORXQ $34, R8, R12
	XORQ  R14, R13
	XORQ  BX, R15

	RORXQ $14, R11, R14
	XORQ  R14, R13
	RORXQ $39, R8, R14
	ANDQ  R11, R15
	ADDQ  CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ  BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y5, Y5

	VPSRLQ $6, Y5, Y8

	MOVQ  CX, DI
	RORXQ $41, R10, R13
	ADDQ  2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ   DX, DI
	MOVQ  R11, R15
	XORQ  AX, R15

	RORXQ $34, CX, R12
	XORQ  R14, R13
	ANDQ  R10, R15

	RORXQ $14, R10, R14
	ADDQ  BX, R9
	ANDQ  R8, DI

	XORQ  R14, R13
	RORXQ $39, CX, R14
	XORQ  AX, R15

	XORQ  R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y5, Y3
	VPSLLQ $(64-19), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y5, Y3
	VPSLLQ $(64-61), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y5, Y5

	MOVQ  BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ  3*8+frame_YFER(SP), AX
	ORQ   R8, DI

	MOVQ  R10, R15
	RORXQ $34, BX, R12
	XORQ  R14, R13
	XORQ  R11, R15

	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  AX, DX
	ANDQ  CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ  R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	VPADDQ  2*32(BP), Y6, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y5, Y4, 8)

	VPADDQ Y6, Y0, Y0

	MY_VPALIGNR(Y1, Y7, Y6, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI
	MOVQ  R9, R15
	RORXQ $34, AX, R12

	XORQ  R14, R13
	XORQ  R10, R15
	RORXQ $14, DX, R14

	ANDQ  DX, R15
	XORQ  R14, R13
	RORXQ $39, AX, R14
	ADDQ  R11, R8

	ANDQ  BX, DI
	XORQ  R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y6

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y5, Y5, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ  1*8+frame_YFER(SP), R10
	ORQ   BX, DI

	MOVQ  DX, R15
	RORXQ $34, R11, R12
	XORQ  R14, R13
	XORQ  R9, R15

	RORXQ $14, R8, R14
	XORQ  R14, R13
	RORXQ $39, R11, R14
	ANDQ  R8, R15
	ADDQ  R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ  R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y6, Y6

	VPSRLQ $6, Y6, Y8

	MOVQ  R10, DI
	RORXQ $41, CX, R13
	ADDQ  2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ   AX, DI
	MOVQ  R8, R15
	XORQ  DX, R15

	RORXQ $34, R10, R12
	XORQ  R14, R13
	ANDQ  CX, R15

	RORXQ $14, CX, R14
	ADDQ  R9, BX
	ANDQ  R11, DI

	XORQ  R14, R13
	RORXQ $39, R10, R14
	XORQ  DX, R15

	XORQ  R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y6, Y3
	VPSLLQ $(64-19), Y6, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y6, Y3
	VPSLLQ $(64-61), Y6, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y6, Y6

	MOVQ  R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ  3*8+frame_YFER(SP), DX
	ORQ   R11, DI

	MOVQ  CX, R15
	RORXQ $34, R9, R12
	XORQ  R14, R13
	XORQ  R8, R15

	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DX, AX
	ANDQ  R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ  R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ  3*32(BP), Y7, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ    $(4*32), BP

	MY_VPALIGNR(Y0, Y6, Y5, 8)

	VPADDQ Y7, Y0, Y0

	MY_VPALIGNR(Y1, Y4, Y7, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI
	MOVQ  BX, R15
	RORXQ $34, DX, R12

	XORQ  R14, R13
	XORQ  CX, R15
	RORXQ $14, AX, R14

	ANDQ  AX, R15
	XORQ  R14, R13
	RORXQ $39, DX, R14
	ADDQ  R8, R11

	ANDQ  R9, DI
	XORQ  R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y7

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y6, Y6, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ  1*8+frame_YFER(SP), CX
	ORQ   R9, DI

	MOVQ  AX, R15
	RORXQ $34, R8, R12
	XORQ  R14, R13
	XORQ  BX, R15

	RORXQ $14, R11, R14
	XORQ  R14, R13
	RORXQ $39, R8, R14
	ANDQ  R11, R15
	ADDQ  CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ  BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y7, Y7

	VPSRLQ $6, Y7, Y8

	MOVQ  CX, DI
	RORXQ $41, R10, R13
	ADDQ  2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ   DX, DI
	MOVQ  R11, R15
	XORQ  AX, R15

	RORXQ $34, CX, R12
	XORQ  R14, R13
	ANDQ  R10, R15

	RORXQ $14, R10, R14
	ADDQ  BX, R9
	ANDQ  R8, DI

	XORQ  R14, R13
	RORXQ $39, CX, R14
	XORQ  AX, R15

	XORQ  R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y7, Y3
	VPSLLQ $(64-19), Y7, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y7, Y3
	VPSLLQ $(64-61), Y7, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y7, Y7

	MOVQ  BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ  3*8+frame_YFER(SP), AX
	ORQ   R8, DI

	MOVQ  R10, R15
	RORXQ $34, BX, R12
	XORQ  R14, R13
	XORQ  R11, R15

	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  AX, DX
	ANDQ  CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ  R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	SUBQ $1, frame_SRND(SP)
	JNE  loop1

	MOVQ $2, frame_SRND(SP)

loop2:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MOVQ  R9, R15
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	XORQ  R10, R15

	XORQ  R14, R13
	RORXQ $14, DX, R14
	ANDQ  DX, R15

	XORQ  R14, R13
	RORXQ $34, AX, R12
	XORQ  R10, R15
	RORXQ $39, AX, R14
	MOVQ  AX, DI

	XORQ  R12, R14
	RORXQ $28, AX, R12
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI

	XORQ R12, R14
	MOVQ AX, R12
	ANDQ BX, DI
	ANDQ CX, R12
	ADDQ R13, R15

	ADDQ R11, R8
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ  R15, R11
	MOVQ  DX, R15
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	XORQ  R9, R15

	XORQ  R14, R13
	RORXQ $14, R8, R14
	ANDQ  R8, R15
	ADDQ  DI, R11

	XORQ  R14, R13
	RORXQ $34, R11, R12
	XORQ  R9, R15
	RORXQ $39, R11, R14
	MOVQ  R11, DI

	XORQ  R12, R14
	RORXQ $28, R11, R12
	ADDQ  8*1+frame_YFER(SP), R10
	ORQ   BX, DI

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ AX, DI
	ANDQ BX, R12
	ADDQ R13, R15

	ADDQ R10, CX
	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX

	ADDQ  R15, R10
	MOVQ  R8, R15
	RORXQ $41, CX, R13
	RORXQ $18, CX, R14
	XORQ  DX, R15

	XORQ  R14, R13
	RORXQ $14, CX, R14
	ANDQ  CX, R15
	ADDQ  DI, R10

	XORQ  R14, R13
	RORXQ $34, R10, R12
	XORQ  DX, R15
	RORXQ $39, R10, R14
	MOVQ  R10, DI

	XORQ  R12, R14
	RORXQ $28, R10, R12
	ADDQ  8*2+frame_YFER(SP), R9
	ORQ   AX, DI

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ R11, DI
	ANDQ AX, R12
	ADDQ R13, R15

	ADDQ R9, BX
	ORQ  R12, DI
	ADDQ R14, R9

	ADDQ R15, BX

	ADDQ  R15, R9
	MOVQ  CX, R15
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	XORQ  R8, R15

	XORQ  R14, R13
	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DI, R9

	XORQ  R14, R13
	RORXQ $34, R9, R12
	XORQ  R8, R15
	RORXQ $39, R9, R14
	MOVQ  R9, DI

	XORQ  R12, R14
	RORXQ $28, R9, R12
	ADDQ  8*3+frame_YFER(SP), DX
	ORQ   R11, DI

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R10, DI
	ANDQ R11, R12
	ADDQ R13, R15

	ADDQ DX, AX
	ORQ  R12, DI
	ADDQ R14, DX

	ADDQ R15, AX

	ADDQ R15, DX

	ADDQ DI, DX

	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ    $(2*32), BP

	MOVQ  BX, R15
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	XORQ  CX, R15

	XORQ  R14, R13
	RORXQ $14, AX, R14
	ANDQ  AX, R15

	XORQ  R14, R13
	RORXQ $34, DX, R12
	XORQ  CX, R15
	RORXQ $39, DX, R14
	MOVQ  DX, DI

	XORQ  R12, R14
	RORXQ $28, DX, R12
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI

	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R9, DI
	ANDQ R10, R12
	ADDQ R13, R15

	ADDQ R8, R11
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ  R15, R8
	MOVQ  AX, R15
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	XORQ  BX, R15

	XORQ  R14, R13
	RORXQ $14, R11, R14
	ANDQ  R11, R15
	ADDQ  DI, R8

	XORQ  R14, R13
	RORXQ $34, R8, R12
	XORQ  BX, R15
	RORXQ $39, R8, R14
	MOVQ  R8, DI

	XORQ  R12, R14
	RORXQ $28, R8, R12
	ADDQ  8*1+frame_YFER(SP), CX
	ORQ   R9, DI

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ DX, DI
	ANDQ R9, R12
	ADDQ R13, R15

	ADDQ CX, R10
	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10

	ADDQ  R15, CX
	MOVQ  R11, R15
	RORXQ $41, R10, R13
	RORXQ $18, R10, R14
	XORQ  AX, R15

	XORQ  R14, R13
	RORXQ $14, R10, R14
	ANDQ  R10, R15
	ADDQ  DI, CX

	XORQ  R14, R13
	RORXQ $34, CX, R12
	XORQ  AX, R15
	RORXQ $39, CX, R14
	MOVQ  CX, DI

	XORQ  R12, R14
	RORXQ $28, CX, R12
	ADDQ  8*2+frame_YFER(SP), BX
	ORQ   DX, DI

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ R8, DI
	ANDQ DX, R12
	ADDQ R13, R15

	ADDQ BX, R9
	ORQ  R12, DI
	ADDQ R14, BX

	ADDQ R15, R9

	ADDQ  R15, BX
	MOVQ  R10, R15
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	XORQ  R11, R15

	XORQ  R14, R13
	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  DI, BX

	XORQ  R14, R13
	RORXQ $34, BX, R12
	XORQ  R11, R15
	RORXQ $39, BX, R14
	MOVQ  BX, DI

	XORQ  R12, R14
	RORXQ $28, BX, R12
	ADDQ  8*3+frame_YFER(SP), AX
	ORQ   R8, DI

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ CX, DI
	ANDQ R8, R12
	ADDQ R13, R15

	ADDQ AX, DX
	ORQ  R12, DI
	ADDQ R14, AX

	ADDQ R15, DX

	ADDQ R15, AX

	ADDQ DI, AX

	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5

	SUBQ $1, frame_SRND(SP)
	JNE  loop2

	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ frame_INP(SP), DI
	ADDQ $128, DI
	CMPQ DI, frame_INPEND(SP)
	JNE  loop0

done_hash:
	VZEROUPPER
	RET