// Code generated by command: go run gcm_amd64_asm.go -out ../../gcm_amd64.s -pkg aes. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func gcmAesFinish(productTable *[256]byte, tagMask *[16]byte, T *[16]byte, pLen uint64, dLen uint64)
// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3
//
// gcmAesFinish builds the final length block (pLen*8 in the low quadword,
// dLen*8 in the high quadword), XORs it into the running hash held in T,
// multiplies by the productTable entry at offset 224 (using its companion at
// offset 240 for the middle Karatsuba term), reduces with gcmPoly, reverses
// the byte order, XORs in tagMask and stores the 16-byte result back to T.
//
// Register use:
//   R8  productTable      R9  tagMask       R10 T
//   R11 pLen, in bits     R12 dLen, in bits
//   X0  low product, then the result        X1  high product
//   X2  middle product    X3  length block ^ T
//   X4  scratch           X5  tagMask       X6  bswapMask     X7  gcmPoly
TEXT ·gcmAesFinish(SB), NOSPLIT, $0-40
	MOVQ      pLen+24(FP), R11
	MOVQ      dLen+32(FP), R12
	MOVQ      T+16(FP), R10
	MOVQ      productTable+0(FP), R8
	MOVQ      tagMask+8(FP), R9

	// Lengths arrive in bytes; the length block carries them in bits.
	SHLQ      $0x03, R11
	SHLQ      $0x03, R12
	MOVQ      R11, X3
	PINSRQ    $0x01, R12, X3

	// Fold the running hash into the length block.
	MOVOU     (R10), X0
	PXOR      X0, X3

	// Karatsuba multiply: X0 = lo*lo, X1 = hi*hi, X2 = (hi^lo)*(table companion).
	MOVOU     224(R8), X0
	MOVOU     X0, X1
	MOVOU     240(R8), X2
	PSHUFD    $0x4e, X3, X4
	PXOR      X3, X4
	PCLMULQDQ $0x00, X3, X0
	PCLMULQDQ $0x11, X3, X1
	PCLMULQDQ $0x00, X4, X2

	// Recover the true middle term and split it across the low/high halves.
	PXOR      X0, X2
	PXOR      X1, X2
	MOVOU     X2, X4
	PSLLDQ    $0x08, X4
	PSRLDQ    $0x08, X2
	PXOR      X4, X0
	PXOR      X2, X1

	// Two identical reduction steps against gcmPoly, then add the high half.
	MOVOU     gcmPoly<>+0(SB), X7
	MOVOU     X7, X4
	PCLMULQDQ $0x01, X0, X4
	PSHUFD    $0x4e, X0, X0
	PXOR      X4, X0
	MOVOU     X7, X4
	PCLMULQDQ $0x01, X0, X4
	PSHUFD    $0x4e, X0, X0
	PXOR      X4, X0
	PXOR      X1, X0

	// Reverse the bytes, apply the tag mask and publish the result.
	MOVOU     bswapMask<>+0(SB), X6
	PSHUFB    X6, X0
	MOVOU     (R9), X5
	PXOR      X5, X0
	MOVOU     X0, (R10)
	RET

// bswapMask is a 16-byte PSHUFB control vector. Laid out in memory (little
// endian) its bytes are 15, 14, ..., 1, 0, so PSHUFB with this mask reverses
// the byte order of a full 128-bit register.
DATA bswapMask<>+0(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+8(SB)/8, $0x0001020304050607
GLOBL bswapMask<>(SB), RODATA|NOPTR, $16

// gcmPoly is the 128-bit constant used by the reduction steps in this file
// (PCLMULQDQ $0x01 multiplies its high quadword, 0xc2 << 56, by the low
// quadword of the accumulator) and as the conditional XOR value in the
// "H * 2" step of gcmAesInit. Low quadword is 1, high quadword is 0xc2 << 56.
DATA gcmPoly<>+0(SB)/8, $0x0000000000000001
DATA gcmPoly<>+8(SB)/8, $0xc200000000000000
GLOBL gcmPoly<>(SB), RODATA|NOPTR, $16

// func gcmAesInit(productTable *[256]byte, ks []uint32)
// Requires: AES, PCLMULQDQ, SSE2, SSSE3
//
// gcmAesInit derives the hash key from the expanded key schedule ks and fills
// productTable with eight 32-byte entries. Each entry is a 16-byte value
// followed by 16 bytes holding (high qword ^ low qword) of that value in both
// halves, which the multiply routines use as the Karatsuba middle operand.
//
// Layout produced (offsets into productTable):
//   224: H' (the byte-reversed, doubled hash key)   240: its Karatsuba companion
//   192, 160, ..., 0: the seven successive products by H' (H'^2 ... H'^8),
//   each followed at +16 by its Karatsuba companion.
//
// Registers: DI productTable (moves backwards in the loop), SI ks,
// DX ks_len/4 - 1, AX loop counter, X15 bswapMask, X14 gcmPoly,
// X0 H', X1 companion of H', X2/X3 current power and its companion,
// X11-X13 and X4 temporaries.
TEXT ·gcmAesInit(SB), NOSPLIT, $0-32
	MOVQ  productTable+0(FP), DI
	MOVQ  ks_base+8(FP), SI
	MOVQ  ks_len+16(FP), DX
	// DX = (number of uint32 words / 4) - 1, i.e. the number of 16-byte round
	// keys minus one. It is compared with 12 below to pick the round count.
	SHRQ  $0x02, DX
	DECQ  DX
	MOVOU bswapMask<>+0(SB), X15
	MOVOU gcmPoly<>+0(SB), X14

	// Encrypt block 0, with the AES key to generate the hash key H
	// Starting from round key 0 itself is the initial AddRoundKey applied to an
	// all-zero block (0 ^ rk0 = rk0).
	MOVOU  (SI), X0
	MOVOU  16(SI), X11
	AESENC X11, X0
	MOVOU  32(SI), X11
	AESENC X11, X0
	MOVOU  48(SI), X11
	AESENC X11, X0
	MOVOU  64(SI), X11
	AESENC X11, X0
	MOVOU  80(SI), X11
	AESENC X11, X0
	MOVOU  96(SI), X11
	AESENC X11, X0
	MOVOU  112(SI), X11
	AESENC X11, X0
	MOVOU  128(SI), X11
	AESENC X11, X0
	MOVOU  144(SI), X11
	AESENC X11, X0
	MOVOU  160(SI), X11
	// DX < 12: round key at 160 is the last one (10 rounds).
	// DX == 12: round key at 192 is the last one (12 rounds).
	// Otherwise the key at 224 is the last one (14 rounds).
	// The JE further down reuses the flags from this CMPQ; the AESENC and MOVOU
	// instructions in between do not modify flags.
	CMPQ   DX, $0x0c
	JB     initEncLast
	AESENC X11, X0
	MOVOU  176(SI), X11
	AESENC X11, X0
	MOVOU  192(SI), X11
	JE     initEncLast
	AESENC X11, X0
	MOVOU  208(SI), X11
	AESENC X11, X0
	MOVOU  224(SI), X11

initEncLast:
	// X11 holds the final round key selected above.
	AESENCLAST X11, X0
	// Reverse the byte order of H before doing arithmetic on it.
	PSHUFB     X15, X0

	// H * 2
	// 128-bit left shift by one bit built from 32-bit lane operations:
	//   X11 = all-ones in every lane if the top bit of X0 is set, masked with gcmPoly
	//   X12 = the top bit of each lane moved to bit 0 of the next higher lane
	//   X0  = each lane shifted left by one
	// XORing the three yields (X0 << 1) ^ (gcmPoly if the top bit was set).
	PSHUFD $0xff, X0, X11
	MOVOU  X0, X12
	PSRAL  $0x1f, X11
	PAND   X14, X11
	PSRLL  $0x1f, X12
	PSLLDQ $0x04, X12
	PSLLL  $0x01, X0
	PXOR   X11, X0
	PXOR   X12, X0

	// Karatsuba pre-computations
	// Entry at 224 is H'; 240 holds H' with its halves swapped XOR H', i.e.
	// (hi ^ lo) in both quadwords. X2/X3 start as copies for the power loop.
	MOVOU  X0, 224(DI)
	PSHUFD $0x4e, X0, X1
	PXOR   X0, X1
	MOVOU  X1, 240(DI)
	MOVOU  X0, X2
	MOVOU  X1, X3

	// Now prepare powers of H and pre-computations for them
	// Seven iterations; together with the entry stored above that makes eight.
	MOVQ $0x00000007, AX

initLoop:
	// Multiply the current power (X2, companion X3) by H' (X0, companion X1):
	// X11 = lo*lo, X12 = hi*hi, X13 = (hi^lo)*(hi^lo).
	MOVOU     X2, X11
	MOVOU     X2, X12
	MOVOU     X3, X13
	PCLMULQDQ $0x00, X0, X11
	PCLMULQDQ $0x11, X0, X12
	PCLMULQDQ $0x00, X1, X13
	// Recover the middle term and fold its halves into the low/high products.
	PXOR      X11, X13
	PXOR      X12, X13
	MOVOU     X13, X4
	PSLLDQ    $0x08, X4
	PSRLDQ    $0x08, X13
	PXOR      X4, X11
	PXOR      X13, X12
	// Two reduction steps against gcmPoly; the result ends up in X2.
	MOVOU     X14, X2
	PCLMULQDQ $0x01, X11, X2
	PSHUFD    $0x4e, X11, X11
	PXOR      X2, X11
	MOVOU     X14, X2
	PCLMULQDQ $0x01, X11, X2
	PSHUFD    $0x4e, X11, X11
	PXOR      X11, X2
	PXOR      X12, X2
	// Store the new power and its Karatsuba companion. DI moves back by 32
	// each iteration, so successive results land at 192, 160, ..., 0 relative
	// to the original productTable pointer.
	MOVOU     X2, 192(DI)
	PSHUFD    $0x4e, X2, X3
	PXOR      X2, X3
	MOVOU     X3, 208(DI)
	// JNE tests the zero flag set by DECQ; LEAQ does not modify flags.
	DECQ      AX
	LEAQ      -32(DI), DI
	JNE       initLoop
	RET

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
// Requires: PCLMULQDQ, SSE2, SSE4.1, SSSE3
//
// gcmAesData hashes data with the multipliers in productTable and stores the
// 16-byte result in T. The accumulator starts at zero; T is only written,
// never read. Input is consumed in three phases: 128-byte groups (eight
// blocks per reduction), single 16-byte blocks, and a zero-padded partial
// block. A length of exactly 13 bytes takes a dedicated load path.
//
// Registers: DI productTable, SI data pointer, DX bytes remaining, CX T,
// X15 bswapMask, X14 gcmPoly, X8/X9/X10 low/high/middle partial products
// (X8 also carries the hash between steps), X0-X7 data blocks,
// X11-X13 temporaries (X12/X13 hold the table entry at 224/240 in the
// single-block path).
TEXT ·gcmAesData(SB), NOSPLIT, $0-40
	MOVQ  productTable+0(FP), DI
	MOVQ  data_base+8(FP), SI
	MOVQ  data_len+16(FP), DX
	MOVQ  T+32(FP), CX
	// Hash accumulator starts at zero.
	PXOR  X8, X8
	MOVOU bswapMask<>+0(SB), X15
	MOVOU gcmPoly<>+0(SB), X14
	// Empty input: store the zero accumulator and return.
	TESTQ DX, DX
	JEQ   dataBail
	// Exactly 13 bytes: dedicated path (the label name suggests the TLS
	// additional-data size; not verifiable from this file).
	CMPQ  DX, $0x0d
	JE    dataTLS
	CMPQ  DX, $0x80
	JB    startSinglesLoop
	JMP   dataOctaLoop

dataTLS:
	// Load exactly 13 bytes (8 + 4 + 1) into X0 without reading past the
	// input, leaving the top three bytes zero. DX is cleared so that after
	// dataMul the singles loop falls through to dataBail.
	MOVOU  224(DI), X12
	MOVOU  240(DI), X13
	PXOR   X0, X0
	MOVQ   (SI), X0
	PINSRD $0x02, 8(SI), X0
	PINSRB $0x0c, 12(SI), X0
	XORQ   DX, DX
	JMP    dataMul

dataOctaLoop:
	// Process 128 bytes per pass: load eight blocks, reverse their bytes,
	// fold the running hash into the first block, then multiply block i by the
	// table entry at offset 32*i (companion at 32*i+16), summing the low,
	// high and middle products in X8, X9 and X10 before a single reduction.
	CMPQ      DX, $0x80
	JB        startSinglesLoop
	SUBQ      $0x80, DX
	MOVOU     (SI), X0
	MOVOU     16(SI), X1
	MOVOU     32(SI), X2
	MOVOU     48(SI), X3
	MOVOU     64(SI), X4
	MOVOU     80(SI), X5
	MOVOU     96(SI), X6
	MOVOU     112(SI), X7
	LEAQ      128(SI), SI
	PSHUFB    X15, X0
	PSHUFB    X15, X1
	PSHUFB    X15, X2
	PSHUFB    X15, X3
	PSHUFB    X15, X4
	PSHUFB    X15, X5
	PSHUFB    X15, X6
	PSHUFB    X15, X7
	// Block 0 (absorbs the running hash) times the entry at offset 0.
	PXOR      X8, X0
	MOVOU     (DI), X8
	MOVOU     16(DI), X10
	MOVOU     X8, X9
	PSHUFD    $0x4e, X0, X12
	PXOR      X0, X12
	PCLMULQDQ $0x00, X0, X8
	PCLMULQDQ $0x11, X0, X9
	PCLMULQDQ $0x00, X12, X10
	// Block 1 times the entry at offset 32.
	MOVOU     32(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X1, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X1, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X1, X12
	PXOR      X12, X1
	MOVOU     48(DI), X12
	PCLMULQDQ $0x00, X1, X12
	PXOR      X12, X10
	// Block 2 times the entry at offset 64.
	MOVOU     64(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X2, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X2, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X2, X12
	PXOR      X12, X2
	MOVOU     80(DI), X12
	PCLMULQDQ $0x00, X2, X12
	PXOR      X12, X10
	// Block 3 times the entry at offset 96.
	MOVOU     96(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X3, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X3, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X3, X12
	PXOR      X12, X3
	MOVOU     112(DI), X12
	PCLMULQDQ $0x00, X3, X12
	PXOR      X12, X10
	// Block 4 times the entry at offset 128.
	MOVOU     128(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X4, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X4, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X4, X12
	PXOR      X12, X4
	MOVOU     144(DI), X12
	PCLMULQDQ $0x00, X4, X12
	PXOR      X12, X10
	// Block 5 times the entry at offset 160.
	MOVOU     160(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X5, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X5, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X5, X12
	PXOR      X12, X5
	MOVOU     176(DI), X12
	PCLMULQDQ $0x00, X5, X12
	PXOR      X12, X10
	// Block 6 times the entry at offset 192.
	MOVOU     192(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X6, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X6, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X6, X12
	PXOR      X12, X6
	MOVOU     208(DI), X12
	PCLMULQDQ $0x00, X6, X12
	PXOR      X12, X10
	// Block 7 times the entry at offset 224.
	MOVOU     224(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X7, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X7, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X7, X12
	PXOR      X12, X7
	MOVOU     240(DI), X12
	PCLMULQDQ $0x00, X7, X12
	PXOR      X12, X10
	// Recover the middle term, fold it into the low/high sums, then apply two
	// reduction steps against gcmPoly. The reduced hash is left in X8.
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	JMP       dataOctaLoop

startSinglesLoop:
	// Single-block multiplies all use the table entry at 224 and its
	// companion at 240; keep them in X12/X13 for the rest of the function.
	MOVOU 224(DI), X12
	MOVOU 240(DI), X13

dataSinglesLoop:
	CMPQ  DX, $0x10
	JB    dataEnd
	SUBQ  $0x10, DX
	MOVOU (SI), X0

dataMul:
	// hash = (hash ^ byte-reversed block) * entry at 224, then reduce.
	// Also entered from dataTLS and dataLoadLoop with DX == 0 and X0 holding a
	// zero-padded partial block.
	PSHUFB    X15, X0
	PXOR      X8, X0
	MOVOU     X12, X8
	MOVOU     X13, X10
	MOVOU     X12, X9
	PSHUFD    $0x4e, X0, X11
	PXOR      X0, X11
	PCLMULQDQ $0x00, X0, X8
	PCLMULQDQ $0x11, X0, X9
	PCLMULQDQ $0x00, X11, X10
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	LEAQ      16(SI), SI
	JMP       dataSinglesLoop

dataEnd:
	// 0..15 bytes remain. If any, build a zero-padded block from them.
	TESTQ DX, DX
	JEQ   dataBail
	PXOR  X0, X0
	// Point SI at the last remaining byte; bytes are inserted back to front.
	LEAQ  -1(SI)(DX*1), SI

dataLoadLoop:
	// Shift X0 up one byte and insert the next byte at position 0, so the
	// first input byte ends up in byte 0 without reading past the buffer.
	// The loop leaves DX == 0, which terminates the singles loop after dataMul.
	PSLLDQ $0x01, X0
	PINSRB $0x00, (SI), X0
	LEAQ   -1(SI), SI
	DECQ   DX
	JNE    dataLoadLoop
	JMP    dataMul

dataBail:
	// Store the hash accumulator (no byte reversal applied here).
	MOVOU X8, (CX)
	RET

// func gcmAesEnc(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32)
// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3
//
// gcmAesEnc XORs src with an AES counter-mode keystream, writes the result to
// dst, and folds every output block into the 16-byte hash accumulator at T
// (read on entry, written on exit). ctr is read but not written back.
// The first keystream block uses the counter value in ctr plus one.
//
// Stack frame (256 bytes):
//   0..127(SP)    eight byte-reversed output blocks waiting to be hashed
//   128..255(SP)  eight counter blocks, already XORed with round key 0
//
// Registers:
//   DI productTable   DX dst   SI src   R9 bytes remaining   CX ctr   R8 T
//   AX ks             R13 ks_len/4 - 1 (compared with 12 to pick 10/12/14 rounds)
//   R10 32-bit counter in host byte order (reused as a table pointer in the tail)
//   R12 byte-swapped last word of round key 0      R11 scratch
//   X15 bswapMask     X14 gcmPoly
//   X8/X9/X10 low/high/middle partial products (X8 also carries the hash)
//   X0-X7 data blocks (X1-X7 hold round keys 1-7 in the single-block paths)
//   X11-X13 temporaries (X13 holds the table entry at 224 in the single-block paths)
TEXT ·gcmAesEnc(SB), $256-96
	MOVQ   productTable+0(FP), DI
	MOVQ   dst_base+8(FP), DX
	MOVQ   src_base+32(FP), SI
	MOVQ   src_len+40(FP), R9
	MOVQ   ctr+56(FP), CX
	MOVQ   T+64(FP), R8
	MOVQ   ks_base+72(FP), AX
	MOVQ   ks_len+80(FP), R13
	SHRQ   $0x02, R13
	DECQ   R13
	MOVOU  bswapMask<>+0(SB), X15
	MOVOU  gcmPoly<>+0(SB), X14
	// X8 = incoming hash accumulator.
	MOVOU  (R8), X8
	PXOR   X9, X9
	PXOR   X10, X10
	// X11 = counter block ^ round key 0: the template for every counter slot.
	// Slots differ only in their last four bytes, which are patched with
	// BSWAPL(R10 ^ R12) = big-endian counter ^ last word of round key 0.
	MOVOU  (CX), X0
	MOVL   12(CX), R10
	MOVOU  (AX), X11
	MOVL   12(AX), R12
	BSWAPL R10
	BSWAPL R12
	PXOR   X0, X11
	MOVOU  X11, 128(SP)
	// Advance the counter and patch it into slot 0 (bytes 140..143 of the frame).
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 140(SP)
	// Fewer than 128 bytes: skip the eight-block pipeline entirely.
	CMPQ   R9, $0x80
	JB     gcmAesEncSingles
	SUBQ   $0x80, R9

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU  X11, 144(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 156(SP)
	MOVOU  X11, 160(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 172(SP)
	MOVOU  X11, 176(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 188(SP)
	MOVOU  X11, 192(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 204(SP)
	MOVOU  X11, 208(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 220(SP)
	MOVOU  X11, 224(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 236(SP)
	MOVOU  X11, 240(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 252(SP)
	// First batch: load the eight counter blocks and run the AES rounds on
	// all of them. After each of rounds 1-8 one counter slot is refreshed with
	// the next counter value, ready for the following batch.
	MOVOU  128(SP), X0
	MOVOU  144(SP), X1
	MOVOU  160(SP), X2
	MOVOU  176(SP), X3
	MOVOU  192(SP), X4
	MOVOU  208(SP), X5
	MOVOU  224(SP), X6
	MOVOU  240(SP), X7
	MOVOU  16(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 140(SP)
	MOVOU  32(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 156(SP)
	MOVOU  48(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 172(SP)
	MOVOU  64(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 188(SP)
	MOVOU  80(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 204(SP)
	MOVOU  96(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 220(SP)
	MOVOU  112(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 236(SP)
	MOVOU  128(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 252(SP)
	MOVOU  144(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	// Key-length dispatch: R13 < 12 means the key at 160 is the last round
	// key; R13 == 12 means the key at 192 is; otherwise the key at 224 is.
	// The JE below reuses the flags from this CMPQ (AESENC/MOVOU leave flags alone).
	MOVOU  160(AX), X11
	CMPQ   R13, $0x0c
	JB     encLast1
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	MOVOU  176(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	MOVOU  192(AX), X11
	JE     encLast1
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	MOVOU  208(AX), X11
	AESENC X11, X0
	AESENC X11, X1
	AESENC X11, X2
	AESENC X11, X3
	AESENC X11, X4
	AESENC X11, X5
	AESENC X11, X6
	AESENC X11, X7
	MOVOU  224(AX), X11

encLast1:
	// Final round, then XOR the keystream with 128 bytes of src.
	AESENCLAST X11, X0
	AESENCLAST X11, X1
	AESENCLAST X11, X2
	AESENCLAST X11, X3
	AESENCLAST X11, X4
	AESENCLAST X11, X5
	AESENCLAST X11, X6
	AESENCLAST X11, X7
	MOVOU      (SI), X11
	PXOR       X11, X0
	MOVOU      16(SI), X11
	PXOR       X11, X1
	MOVOU      32(SI), X11
	PXOR       X11, X2
	MOVOU      48(SI), X11
	PXOR       X11, X3
	MOVOU      64(SI), X11
	PXOR       X11, X4
	MOVOU      80(SI), X11
	PXOR       X11, X5
	MOVOU      96(SI), X11
	PXOR       X11, X6
	MOVOU      112(SI), X11
	PXOR       X11, X7
	// Write the output blocks to dst, then byte-reverse each one for hashing.
	// The first block also absorbs the incoming hash (X8).
	MOVOU      X0, (DX)
	PSHUFB     X15, X0
	PXOR       X8, X0
	MOVOU      X1, 16(DX)
	PSHUFB     X15, X1
	MOVOU      X2, 32(DX)
	PSHUFB     X15, X2
	MOVOU      X3, 48(DX)
	PSHUFB     X15, X3
	MOVOU      X4, 64(DX)
	PSHUFB     X15, X4
	MOVOU      X5, 80(DX)
	PSHUFB     X15, X5
	MOVOU      X6, 96(DX)
	PSHUFB     X15, X6
	MOVOU      X7, 112(DX)
	PSHUFB     X15, X7
	// Stash the byte-reversed blocks; they are hashed during the next batch
	// (or in gcmAesEncOctetsEnd if this was the last full batch).
	MOVOU      X0, (SP)
	MOVOU      X1, 16(SP)
	MOVOU      X2, 32(SP)
	MOVOU      X3, 48(SP)
	MOVOU      X4, 64(SP)
	MOVOU      X5, 80(SP)
	MOVOU      X6, 96(SP)
	MOVOU      X7, 112(SP)
	LEAQ       128(SI), SI
	LEAQ       128(DX), DX

gcmAesEncOctetsLoop:
	// Each pass encrypts the next eight blocks while hashing the eight
	// stashed blocks from the previous pass. AES round i (1..7) is interleaved
	// with the multiply of stashed block i by the table entry at 32*i, and with
	// refreshing counter slot i-1.
	CMPQ      R9, $0x80
	JB        gcmAesEncOctetsEnd
	SUBQ      $0x80, R9
	MOVOU     128(SP), X0
	MOVOU     144(SP), X1
	MOVOU     160(SP), X2
	MOVOU     176(SP), X3
	MOVOU     192(SP), X4
	MOVOU     208(SP), X5
	MOVOU     224(SP), X6
	MOVOU     240(SP), X7
	// Stashed block 0 times the table entry at offset 0 starts the three sums.
	MOVOU     (SP), X11
	PSHUFD    $0x4e, X11, X12
	PXOR      X11, X12
	MOVOU     (DI), X8
	MOVOU     16(DI), X10
	MOVOU     X8, X9
	PCLMULQDQ $0x00, X12, X10
	PCLMULQDQ $0x00, X11, X8
	PCLMULQDQ $0x11, X11, X9
	// AES round 1 + stashed block 1.
	MOVOU     16(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     32(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     16(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     48(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 140(SP)
	// AES round 2 + stashed block 2.
	MOVOU     32(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     64(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     32(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     80(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 156(SP)
	// AES round 3 + stashed block 3.
	MOVOU     48(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     96(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     48(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     112(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 172(SP)
	// AES round 4 + stashed block 4.
	MOVOU     64(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     128(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     64(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     144(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 188(SP)
	// AES round 5 + stashed block 5.
	MOVOU     80(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     160(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     80(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     176(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 204(SP)
	// AES round 6 + stashed block 6.
	MOVOU     96(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     192(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     96(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     208(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 220(SP)
	// AES round 7 + stashed block 7.
	MOVOU     112(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     224(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     112(SP), X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     240(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 236(SP)
	// AES round 8 (no multiply left); refresh the last counter slot.
	MOVOU     128(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 252(SP)
	// Recover the middle term, fold it into the low/high sums and perform the
	// first reduction step; AES round 9 runs between the two reduction steps.
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     144(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	// Key-length dispatch, same scheme as before encLast1.
	MOVOU     160(AX), X11
	CMPQ      R13, $0x0c
	JB        encLast2
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     176(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     192(AX), X11
	JE        encLast2
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     208(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     224(AX), X11

encLast2:
	// Same tail as encLast1: final round, XOR with src, store to dst,
	// byte-reverse, fold the just-reduced hash (X8) into block 0 and stash.
	AESENCLAST X11, X0
	AESENCLAST X11, X1
	AESENCLAST X11, X2
	AESENCLAST X11, X3
	AESENCLAST X11, X4
	AESENCLAST X11, X5
	AESENCLAST X11, X6
	AESENCLAST X11, X7
	MOVOU      (SI), X11
	PXOR       X11, X0
	MOVOU      16(SI), X11
	PXOR       X11, X1
	MOVOU      32(SI), X11
	PXOR       X11, X2
	MOVOU      48(SI), X11
	PXOR       X11, X3
	MOVOU      64(SI), X11
	PXOR       X11, X4
	MOVOU      80(SI), X11
	PXOR       X11, X5
	MOVOU      96(SI), X11
	PXOR       X11, X6
	MOVOU      112(SI), X11
	PXOR       X11, X7
	MOVOU      X0, (DX)
	PSHUFB     X15, X0
	PXOR       X8, X0
	MOVOU      X1, 16(DX)
	PSHUFB     X15, X1
	MOVOU      X2, 32(DX)
	PSHUFB     X15, X2
	MOVOU      X3, 48(DX)
	PSHUFB     X15, X3
	MOVOU      X4, 64(DX)
	PSHUFB     X15, X4
	MOVOU      X5, 80(DX)
	PSHUFB     X15, X5
	MOVOU      X6, 96(DX)
	PSHUFB     X15, X6
	MOVOU      X7, 112(DX)
	PSHUFB     X15, X7
	MOVOU      X0, (SP)
	MOVOU      X1, 16(SP)
	MOVOU      X2, 32(SP)
	MOVOU      X3, 48(SP)
	MOVOU      X4, 64(SP)
	MOVOU      X5, 80(SP)
	MOVOU      X6, 96(SP)
	MOVOU      X7, 112(SP)
	LEAQ       128(SI), SI
	LEAQ       128(DX), DX
	JMP        gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:
	// Hash the last stashed batch of eight blocks; there is no encryption
	// left to overlap with. Stashed block i is multiplied by the table entry
	// at 32*i (companion at 32*i+16) and the sums are reduced once.
	MOVOU     (SP), X11
	MOVOU     (DI), X8
	MOVOU     16(DI), X10
	MOVOU     X8, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X11, X12
	PCLMULQDQ $0x00, X11, X8
	PCLMULQDQ $0x11, X11, X9
	PCLMULQDQ $0x00, X12, X10
	MOVOU     16(SP), X11
	MOVOU     32(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     48(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     32(SP), X11
	MOVOU     64(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     80(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     48(SP), X11
	MOVOU     96(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     112(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     64(SP), X11
	MOVOU     128(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     144(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     80(SP), X11
	MOVOU     160(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     176(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     96(SP), X11
	MOVOU     192(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     208(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	MOVOU     112(SP), X11
	MOVOU     224(DI), X12
	MOVOU     X12, X13
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PCLMULQDQ $0x11, X11, X13
	PXOR      X13, X9
	PSHUFD    $0x4e, X11, X12
	PXOR      X12, X11
	MOVOU     240(DI), X12
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X10
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	// Nothing left: done. Otherwise rewind R10 from the value held in counter
	// slot 7 to the value held in slot 0, the only slot the single-block code
	// below reads and refreshes.
	TESTQ     R9, R9
	JE        gcmAesEncDone
	SUBQ      $0x07, R10

gcmAesEncSingles:
	// Keep round keys 1-7 in X1-X7 and the table entry at 224 in X13 for the
	// single-block and tail paths.
	MOVOU 16(AX), X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU 64(AX), X4
	MOVOU 80(AX), X5
	MOVOU 96(AX), X6
	MOVOU 112(AX), X7
	MOVOU 224(DI), X13

gcmAesEncSinglesLoop:
	// One 16-byte block per pass: take the counter block from slot 0, refresh
	// the slot with the next counter, encrypt, XOR with src, store, hash.
	CMPQ   R9, $0x10
	JB     gcmAesEncTail
	SUBQ   $0x10, R9
	MOVOU  128(SP), X0
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 140(SP)
	AESENC X1, X0
	AESENC X2, X0
	AESENC X3, X0
	AESENC X4, X0
	AESENC X5, X0
	AESENC X6, X0
	AESENC X7, X0
	MOVOU  128(AX), X11
	AESENC X11, X0
	MOVOU  144(AX), X11
	AESENC X11, X0
	MOVOU  160(AX), X11
	CMPQ   R13, $0x0c
	JB     encLast3
	AESENC X11, X0
	MOVOU  176(AX), X11
	AESENC X11, X0
	MOVOU  192(AX), X11
	JE     encLast3
	AESENC X11, X0
	MOVOU  208(AX), X11
	AESENC X11, X0
	MOVOU  224(AX), X11

encLast3:
	AESENCLAST X11, X0
	MOVOU      (SI), X11
	PXOR       X11, X0
	MOVOU      X0, (DX)
	// hash = (hash ^ byte-reversed output block) * entry at 224, then reduce.
	PSHUFB     X15, X0
	PXOR       X8, X0
	MOVOU      X13, X8
	MOVOU      X13, X9
	MOVOU      240(DI), X10
	PSHUFD     $0x4e, X0, X11
	PXOR       X0, X11
	PCLMULQDQ  $0x00, X0, X8
	PCLMULQDQ  $0x11, X0, X9
	PCLMULQDQ  $0x00, X11, X10
	PXOR       X8, X10
	PXOR       X9, X10
	MOVOU      X10, X11
	PSRLDQ     $0x08, X10
	PSLLDQ     $0x08, X11
	PXOR       X10, X9
	PXOR       X11, X8
	MOVOU      X14, X11
	PCLMULQDQ  $0x01, X8, X11
	PSHUFD     $0x4e, X8, X8
	PXOR       X11, X8
	MOVOU      X14, X11
	PCLMULQDQ  $0x01, X8, X11
	PSHUFD     $0x4e, X8, X8
	PXOR       X11, X8
	PXOR       X9, X8
	LEAQ       16(SI), SI
	LEAQ       16(DX), DX
	JMP        gcmAesEncSinglesLoop

gcmAesEncTail:
	// 0..15 bytes remain. If any, generate one more keystream block from slot 0
	// (the counter is not advanced again).
	TESTQ  R9, R9
	JE     gcmAesEncDone
	MOVOU  128(SP), X0
	AESENC X1, X0
	AESENC X2, X0
	AESENC X3, X0
	AESENC X4, X0
	AESENC X5, X0
	AESENC X6, X0
	AESENC X7, X0
	MOVOU  128(AX), X11
	AESENC X11, X0
	MOVOU  144(AX), X11
	AESENC X11, X0
	MOVOU  160(AX), X11
	CMPQ   R13, $0x0c
	JB     encLast4
	AESENC X11, X0
	MOVOU  176(AX), X11
	AESENC X11, X0
	MOVOU  192(AX), X11
	JE     encLast4
	AESENC X11, X0
	MOVOU  208(AX), X11
	AESENC X11, X0
	MOVOU  224(AX), X11

encLast4:
	AESENCLAST X11, X0
	// X11 = keystream block. Point SI at the last remaining src byte and load
	// the andMask entry for R9 bytes (entry index R9-1, 16 bytes each) into X12.
	// R10 is reused as the table base; the counter is no longer needed.
	MOVOU      X0, X11
	LEAQ       -1(SI)(R9*1), SI
	MOVQ       R9, R11
	SHLQ       $0x04, R11
	LEAQ       andMask<>+0(SB), R10
	MOVOU      -16(R10)(R11*1), X12
	PXOR       X0, X0

ptxLoadLoop:
	// Assemble the partial src block byte by byte, last byte first, so no
	// memory beyond the end of src is read.
	PSLLDQ    $0x01, X0
	PINSRB    $0x00, (SI), X0
	LEAQ      -1(SI), SI
	DECQ      R9
	JNE       ptxLoadLoop
	// XOR with the keystream and clear the bytes beyond the message length.
	PXOR      X11, X0
	PAND      X12, X0
	// NOTE(review): this stores a full 16 bytes at dst even though fewer are
	// meaningful (the rest are zero after the PAND); it relies on the caller
	// providing at least 16 writable bytes here — confirm against the Go caller.
	MOVOU     X0, (DX)
	// Hash the zero-padded final block exactly like a full block.
	PSHUFB    X15, X0
	PXOR      X8, X0
	MOVOU     X13, X8
	MOVOU     X13, X9
	MOVOU     240(DI), X10
	PSHUFD    $0x4e, X0, X11
	PXOR      X0, X11
	PCLMULQDQ $0x00, X0, X8
	PCLMULQDQ $0x11, X0, X9
	PCLMULQDQ $0x00, X11, X10
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8

gcmAesEncDone:
	// Write the updated hash accumulator back to T.
	MOVOU X8, (R8)
	RET

// andMask is a table of fifteen 16-byte masks. The entry at offset 16*(n-1)
// has its n lowest-addressed bytes set to 0xff and the rest zero, for
// n = 1..15. gcmAesEnc indexes it as -16(base)(n*16) to keep only the first
// n bytes of a partial final block.
DATA andMask<>+0(SB)/8, $0x00000000000000ff
DATA andMask<>+8(SB)/8, $0x0000000000000000
DATA andMask<>+16(SB)/8, $0x000000000000ffff
DATA andMask<>+24(SB)/8, $0x0000000000000000
DATA andMask<>+32(SB)/8, $0x0000000000ffffff
DATA andMask<>+40(SB)/8, $0x0000000000000000
DATA andMask<>+48(SB)/8, $0x00000000ffffffff
DATA andMask<>+56(SB)/8, $0x0000000000000000
DATA andMask<>+64(SB)/8, $0x000000ffffffffff
DATA andMask<>+72(SB)/8, $0x0000000000000000
DATA andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA andMask<>+88(SB)/8, $0x0000000000000000
DATA andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA andMask<>+104(SB)/8, $0x0000000000000000
DATA andMask<>+112(SB)/8, $0xffffffffffffffff
DATA andMask<>+120(SB)/8, $0x0000000000000000
// From here on the low quadword is fully set and the high quadword grows.
DATA andMask<>+128(SB)/8, $0xffffffffffffffff
DATA andMask<>+136(SB)/8, $0x00000000000000ff
DATA andMask<>+144(SB)/8, $0xffffffffffffffff
DATA andMask<>+152(SB)/8, $0x000000000000ffff
DATA andMask<>+160(SB)/8, $0xffffffffffffffff
DATA andMask<>+168(SB)/8, $0x0000000000ffffff
DATA andMask<>+176(SB)/8, $0xffffffffffffffff
DATA andMask<>+184(SB)/8, $0x00000000ffffffff
DATA andMask<>+192(SB)/8, $0xffffffffffffffff
DATA andMask<>+200(SB)/8, $0x000000ffffffffff
DATA andMask<>+208(SB)/8, $0xffffffffffffffff
DATA andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA andMask<>+224(SB)/8, $0xffffffffffffffff
DATA andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL andMask<>(SB), RODATA|NOPTR, $240

// func gcmAesDec(productTable *[256]byte, dst []byte, src []byte, ctr *[16]byte, T *[16]byte, ks []uint32)
// Requires: AES, PCLMULQDQ, SSE2, SSE4.1, SSSE3
//
// gcmAesDec processes len(src) bytes of src: every block read from src is
// folded into the 16-byte accumulator at T (multiply by productTable entries,
// reduce with gcmPoly) and is XORed with an AES-encrypted counter block, the
// result being written to dst. The updated accumulator is stored back to T.
//
// Register roles:
//   DI  = productTable          SI  = dst write pointer
//   DX  = src read pointer      R9  = bytes of src still to process
//   CX  = ctr                   R8  = T
//   AX  = ks (round keys, 16 bytes apart)
//   R13 = len(ks)/4 - 1; compared with 12 to choose whether the final round
//         key is at 160(AX), 192(AX) or 224(AX)
//   R10 = running 32-bit counter (bytes 12..15 of ctr, byte-swapped)
//   R12 = bytes 12..15 of round key 0, byte-swapped
//   R11 = scratch
//   X15 = bswapMask, X14 = gcmPoly
//   X8  = accumulator (low product), X9 = high product, X10 = middle product
//
// The 128-byte frame holds eight 16-byte counter blocks that are already
// XORed with round key 0; only their last 4 bytes change between blocks.
//
// NOTE(review): the entries of productTable are consumed in 32-byte pairs
// from offset 0 (first block of an 8-block group) up to offset 224 (last
// block / single blocks) — presumably descending powers of the hash key;
// confirm against the routine that fills the table.
TEXT ·gcmAesDec(SB), $128-96
	MOVQ   productTable+0(FP), DI
	MOVQ   dst_base+8(FP), SI
	MOVQ   src_base+32(FP), DX
	MOVQ   src_len+40(FP), R9
	MOVQ   ctr+56(FP), CX
	MOVQ   T+64(FP), R8
	MOVQ   ks_base+72(FP), AX
	MOVQ   ks_len+80(FP), R13
	// R13 = len(ks)/4 - 1, used below to pick the last round key.
	SHRQ   $0x02, R13
	DECQ   R13
	MOVOU  bswapMask<>+0(SB), X15
	MOVOU  gcmPoly<>+0(SB), X14
	// X8 = current accumulator from T; X9 and X10 start at zero.
	MOVOU  (R8), X8
	PXOR   X9, X9
	PXOR   X10, X10
	// X11 = ctr XOR round key 0. R10/R12 keep the byte-swapped last words
	// of ctr and of round key 0 so that each new counter block only needs
	// its last 4 bytes recomputed as bswap(counter XOR R12).
	MOVOU  (CX), X0
	MOVL   12(CX), R10
	MOVOU  (AX), X11
	MOVL   12(AX), R12
	BSWAPL R10
	BSWAPL R12
	PXOR   X0, X11
	// Slot 0 = pre-whitened block for counter+1.
	MOVOU  X11, (SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 12(SP)
	// Fewer than 128 bytes: skip the 8-block path, only slot 0 is needed.
	CMPQ   R9, $0x80
	JB     gcmAesDecSingles
	// Fill slots 1..7 with the blocks for counter+2 .. counter+8.
	MOVOU  X11, 16(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 28(SP)
	MOVOU  X11, 32(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 44(SP)
	MOVOU  X11, 48(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 60(SP)
	MOVOU  X11, 64(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 76(SP)
	MOVOU  X11, 80(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 92(SP)
	MOVOU  X11, 96(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 108(SP)
	MOVOU  X11, 112(SP)
	ADDL   $0x01, R10
	MOVL   R10, R11
	XORL   R12, R11
	BSWAPL R11
	MOVL   R11, 124(SP)

// Main loop: 128 bytes (8 blocks) per iteration. AES rounds on the eight
// counter blocks X0..X7 are interleaved with the accumulator update over the
// eight src blocks, and with refilling the stack slots for the next iteration.
gcmAesDecOctetsLoop:
	CMPQ      R9, $0x80
	JB        gcmAesDecEndOctets
	SUBQ      $0x80, R9
	// X0..X7 = the eight pre-whitened counter blocks.
	MOVOU     (SP), X0
	MOVOU     16(SP), X1
	MOVOU     32(SP), X2
	MOVOU     48(SP), X3
	MOVOU     64(SP), X4
	MOVOU     80(SP), X5
	MOVOU     96(SP), X6
	MOVOU     112(SP), X7
	// Block 0: byte-swap, fold in the running accumulator, then start three
	// partial products with table entry 0: X8 = low*low, X9 = high*high,
	// X10 = (low^high of the block) * 16(DI).
	MOVOU     (DX), X11
	PSHUFB    X15, X11
	PXOR      X8, X11
	PSHUFD    $0x4e, X11, X12
	PXOR      X11, X12
	MOVOU     (DI), X8
	MOVOU     16(DI), X10
	MOVOU     X8, X9
	PCLMULQDQ $0x00, X12, X10
	PCLMULQDQ $0x00, X11, X8
	PCLMULQDQ $0x11, X11, X9
	// AES round 1 on all eight blocks, interleaved with multiplying src
	// block 1 by table entry 1 (32(DI)/48(DI)) and accumulating into
	// X8/X9/X10.
	MOVOU     16(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     32(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     16(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     48(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	// Refill slot 0 for the next iteration (its value is already in X0).
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 12(SP)
	// AES round 2 + src block 2 with table entry 2; refill slot 1.
	MOVOU     32(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     64(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     32(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     80(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 28(SP)
	// AES round 3 + src block 3 with table entry 3; refill slot 2.
	MOVOU     48(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     96(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     48(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     112(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 44(SP)
	// AES round 4 + src block 4 with table entry 4; refill slot 3.
	MOVOU     64(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     128(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     64(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     144(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 60(SP)
	// AES round 5 + src block 5 with table entry 5; refill slot 4.
	MOVOU     80(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     160(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     80(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     176(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 76(SP)
	// AES round 6 + src block 6 with table entry 6; refill slot 5.
	MOVOU     96(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     192(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     96(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     208(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 92(SP)
	// AES round 7 + src block 7 with table entry 7; refill slot 6.
	MOVOU     112(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	MOVOU     224(DI), X12
	MOVOU     X12, X13
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     112(DX), X11
	PSHUFB    X15, X11
	PCLMULQDQ $0x00, X11, X12
	PXOR      X12, X8
	PSHUFD    $0x4e, X11, X12
	PCLMULQDQ $0x11, X11, X13
	PXOR      X12, X11
	PXOR      X13, X9
	MOVOU     240(DI), X13
	PCLMULQDQ $0x00, X13, X11
	PXOR      X11, X10
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 108(SP)
	// AES round 8 (no multiply left to pair with); refill slot 7.
	MOVOU     128(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 124(SP)
	// Combine the three partial products: X10 ^= X8 ^ X9, then its low half
	// is shifted into the top of X8 and its high half into the bottom of X9.
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	// First reduction step with gcmPoly (X14).
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	// AES round 9.
	MOVOU     144(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	// Second reduction step, then fold in the high product: X8 is now the
	// updated accumulator.
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	// Choose the last round key by R13: below 12 it is 160(AX), equal to 12
	// it is 192(AX), otherwise 224(AX). The JE further down reuses the flags
	// of this CMPQ; the AESENC/MOVOU in between do not modify flags.
	MOVOU     160(AX), X11
	CMPQ      R13, $0x0c
	JB        decLast1
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     176(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     192(AX), X11
	JE        decLast1
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     208(AX), X11
	AESENC    X11, X0
	AESENC    X11, X1
	AESENC    X11, X2
	AESENC    X11, X3
	AESENC    X11, X4
	AESENC    X11, X5
	AESENC    X11, X6
	AESENC    X11, X7
	MOVOU     224(AX), X11

// X11 = last round key. Finish the eight key-stream blocks, XOR them with the
// eight src blocks, store 128 bytes to dst and advance both pointers.
decLast1:
	AESENCLAST X11, X0
	AESENCLAST X11, X1
	AESENCLAST X11, X2
	AESENCLAST X11, X3
	AESENCLAST X11, X4
	AESENCLAST X11, X5
	AESENCLAST X11, X6
	AESENCLAST X11, X7
	MOVOU      (DX), X11
	PXOR       X11, X0
	MOVOU      16(DX), X11
	PXOR       X11, X1
	MOVOU      32(DX), X11
	PXOR       X11, X2
	MOVOU      48(DX), X11
	PXOR       X11, X3
	MOVOU      64(DX), X11
	PXOR       X11, X4
	MOVOU      80(DX), X11
	PXOR       X11, X5
	MOVOU      96(DX), X11
	PXOR       X11, X6
	MOVOU      112(DX), X11
	PXOR       X11, X7
	MOVOU      X0, (SI)
	MOVOU      X1, 16(SI)
	MOVOU      X2, 32(SI)
	MOVOU      X3, 48(SI)
	MOVOU      X4, 64(SI)
	MOVOU      X5, 80(SI)
	MOVOU      X6, 96(SI)
	MOVOU      X7, 112(SI)
	LEAQ       128(SI), SI
	LEAQ       128(DX), DX
	JMP        gcmAesDecOctetsLoop

// The 8-block loop left R10 seven counts ahead of the block stored in slot 0.
// Rewind it so the one-block code below, which consumes slot 0 and then
// stores the block for R10+1 there, continues the sequence without a gap.
gcmAesDecEndOctets:
	SUBQ $0x07, R10

// One-block path setup: keep round keys 1..7 in X1..X7 and the first half of
// the table entry at offset 224 in X13 for the whole loop.
gcmAesDecSingles:
	MOVOU 16(AX), X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU 64(AX), X4
	MOVOU 80(AX), X5
	MOVOU 96(AX), X6
	MOVOU 112(AX), X7
	MOVOU 224(DI), X13

// One full 16-byte block per iteration.
gcmAesDecSinglesLoop:
	CMPQ      R9, $0x10
	JB        gcmAesDecTail
	SUBQ      $0x10, R9
	// X12 keeps the raw src block for the final XOR; X0 is the byte-swapped
	// copy folded into the accumulator.
	MOVOU     (DX), X0
	MOVOU     X0, X12
	PSHUFB    X15, X0
	PXOR      X8, X0
	// Multiply by the table entry at 224(DI)/240(DI): low, high and middle
	// partial products in X8, X9 and X10.
	MOVOU     X13, X8
	MOVOU     X13, X9
	MOVOU     240(DI), X10
	PCLMULQDQ $0x00, X0, X8
	PCLMULQDQ $0x11, X0, X9
	PSHUFD    $0x4e, X0, X11
	PXOR      X0, X11
	PCLMULQDQ $0x00, X11, X10
	// Combine the partial products and reduce twice with gcmPoly; the new
	// accumulator ends up in X8.
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	// Take the counter block from slot 0, then store the next one there.
	MOVOU     (SP), X0
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 12(SP)
	// Rounds 1..9, then 0, 2 or 4 extra rounds depending on R13 (the JE
	// reuses the flags of the CMPQ).
	AESENC    X1, X0
	AESENC    X2, X0
	AESENC    X3, X0
	AESENC    X4, X0
	AESENC    X5, X0
	AESENC    X6, X0
	AESENC    X7, X0
	MOVOU     128(AX), X11
	AESENC    X11, X0
	MOVOU     144(AX), X11
	AESENC    X11, X0
	MOVOU     160(AX), X11
	CMPQ      R13, $0x0c
	JB        decLast2
	AESENC    X11, X0
	MOVOU     176(AX), X11
	AESENC    X11, X0
	MOVOU     192(AX), X11
	JE        decLast2
	AESENC    X11, X0
	MOVOU     208(AX), X11
	AESENC    X11, X0
	MOVOU     224(AX), X11

// Last round, XOR with the saved src block, store 16 bytes and advance.
decLast2:
	AESENCLAST X11, X0
	PXOR       X12, X0
	MOVOU      X0, (SI)
	LEAQ       16(SI), SI
	LEAQ       16(DX), DX
	JMP        gcmAesDecSinglesLoop

// Final partial block: R9 is now in 0..15.
gcmAesDecTail:
	TESTQ     R9, R9
	JE        gcmAesDecDone
	// X12 = andMask entry R9-1 (address andMask - 16 + 16*R9): keeps the
	// first R9 bytes. R10 is reused as the table pointer, so it no longer
	// holds the counter from here on.
	MOVQ      R9, R11
	SHLQ      $0x04, R11
	LEAQ      andMask<>+0(SB), R10
	MOVOU     -16(R10)(R11*1), X12
	// NOTE(review): this loads a full 16 bytes although only R9 bytes of src
	// remain; presumably the caller guarantees the bytes that follow are
	// readable (e.g. data stored right after src) — confirm against callers.
	MOVOU     (DX), X0
	PAND      X12, X0
	// X12 keeps the masked src block for the final XOR.
	MOVOU     X0, X12
	PSHUFB    X15, X0
	PXOR      X8, X0
	// Same multiply-and-reduce as in the one-block loop, using the table
	// entry at 224(DI)/240(DI).
	MOVOU     224(DI), X8
	MOVOU     240(DI), X10
	MOVOU     X8, X9
	PCLMULQDQ $0x00, X0, X8
	PCLMULQDQ $0x11, X0, X9
	PSHUFD    $0x4e, X0, X11
	PXOR      X0, X11
	PCLMULQDQ $0x00, X11, X10
	PXOR      X8, X10
	PXOR      X9, X10
	MOVOU     X10, X11
	PSRLDQ    $0x08, X10
	PSLLDQ    $0x08, X11
	PXOR      X10, X9
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	MOVOU     X14, X11
	PCLMULQDQ $0x01, X8, X11
	PSHUFD    $0x4e, X8, X8
	PXOR      X11, X8
	PXOR      X9, X8
	// Last counter block. The slot update that follows is computed from R10,
	// which now holds the andMask address; the value written to 12(SP) is
	// not read again before RET.
	MOVOU     (SP), X0
	ADDL      $0x01, R10
	MOVL      R10, R11
	XORL      R12, R11
	BSWAPL    R11
	MOVL      R11, 12(SP)
	// X1..X7 still hold round keys 1..7 loaded at gcmAesDecSingles.
	AESENC    X1, X0
	AESENC    X2, X0
	AESENC    X3, X0
	AESENC    X4, X0
	AESENC    X5, X0
	AESENC    X6, X0
	AESENC    X7, X0
	MOVOU     128(AX), X11
	AESENC    X11, X0
	MOVOU     144(AX), X11
	AESENC    X11, X0
	MOVOU     160(AX), X11
	CMPQ      R13, $0x0c
	JB        decLast3
	AESENC    X11, X0
	MOVOU     176(AX), X11
	AESENC    X11, X0
	MOVOU     192(AX), X11
	JE        decLast3
	AESENC    X11, X0
	MOVOU     208(AX), X11
	AESENC    X11, X0
	MOVOU     224(AX), X11

decLast3:
	AESENCLAST X11, X0
	PXOR       X12, X0

// Write exactly R9 result bytes to dst, one at a time: store the lowest byte
// of X0, then shift X0 right by one byte.
ptxStoreLoop:
	PEXTRB $0x00, X0, (SI)
	PSRLDQ $0x01, X0
	LEAQ   1(SI), SI
	DECQ   R9
	JNE    ptxStoreLoop

// Store the updated accumulator back to T.
gcmAesDecDone:
	MOVOU X8, (R8)
	RET