// Code generated by command: go run sha512block_amd64_asm.go -out ../sha512block_amd64.s. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func blockAVX2(dig *Digest, p []byte)
// Requires: AVX, AVX2, BMI2
//
// blockAVX2 folds every complete 128-byte block of p into the eight 64-bit
// words found at offsets 0..56 of *dig. The length is rounded down to a
// multiple of 128, so trailing bytes are ignored; when p holds no full block
// nothing is read or written.
//
// Each block runs 80 rounds: loop1 executes 4 passes of 16 rounds while the
// interleaved vector code derives the following 16 schedule words, and loop2
// executes 2 passes of 8 rounds that only consume the words already derived.
//
// Register roles:
//   SI                  address of the eight state words
//   DI                  input cursor at the top of loop0; scratch inside the rounds
//   BP                  cursor into the ·_K constant table
//   AX BX CX R8         working variables a b c d at the top of each loop pass
//   DX R9 R10 R11       working variables e f g h at the top of each loop pass
//                       (the a..h roles rotate by one register every round)
//   R12 R13 R14 R15     per-round temporaries
//   Y4 Y5 Y6 Y7         the 16 live schedule quadwords, four per register
//   Y0 Y1 Y2 Y3 Y8      schedule temporaries
//   Y9                  byte-swap shuffle control
//
// Stack frame (56 bytes):
//   0(SP)..31(SP)       W+K for the four rounds currently being executed
//   32(SP)              pass counter for loop1 / loop2
//   40(SP)              saved input cursor (DI is clobbered by the rounds)
//   48(SP)              address one past the last full input block
TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ    dig+0(FP), SI
	MOVQ    p_base+8(FP), DI
	MOVQ    p_len+16(FP), DX
	// Round the length down to a multiple of 128; SHLQ leaves ZF set when the
	// result is zero, i.e. when there is no complete block to process.
	SHRQ    $0x07, DX
	SHLQ    $0x07, DX
	JZ      done_hash
	// DX = end of the last complete block; keep it in the frame because DX is
	// about to become working variable e.
	ADDQ    DI, DX
	MOVQ    DX, 48(SP)
	// Load the eight state words into the working registers a..h.
	MOVQ    (SI), AX
	MOVQ    8(SI), BX
	MOVQ    16(SI), CX
	MOVQ    24(SI), R8
	MOVQ    32(SI), DX
	MOVQ    40(SI), R9
	MOVQ    48(SI), R10
	MOVQ    56(SI), R11
	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>+0(SB), Y9

	// loop0: one iteration per 128-byte input block.
loop0:
	// BP = address of the round-constant table.
	// NOTE(review): ·_K is declared outside this file; the $ (address-of) form
	// assumes the symbol is the table itself, not a slice header - confirm.
	MOVQ    $·_K+0(SB), BP
	// Load the 16 input quadwords and reverse the bytes of each one.
	VMOVDQU (DI), Y4
	VPSHUFB Y9, Y4, Y4
	VMOVDQU 32(DI), Y5
	VPSHUFB Y9, Y5, Y5
	VMOVDQU 64(DI), Y6
	VPSHUFB Y9, Y6, Y6
	VMOVDQU 96(DI), Y7
	VPSHUFB Y9, Y7, Y7
	// DI doubles as the Maj scratch register below, so park the cursor.
	MOVQ    DI, 40(SP)
	// loop1 runs 4 times (4 x 16 rounds).
	MOVQ    $0x00000004, 32(SP)

	// loop1: 16 rounds per pass. Each group of four rounds first stages W+K for
	// the oldest schedule register, then overwrites that register with the four
	// schedule words needed 16 rounds later. Vector and scalar instructions are
	// interleaved on purpose; do not reorder.
loop1:
	// ---- group 0: stage Y4 + K[0..3]; rebuild Y4 ----
	VPADDQ     (BP), Y4, Y0
	VMOVDQU    Y0, (SP)
	// Y0 = W[t-7] (assembled from Y6/Y7) + W[t-16] (Y4).
	VPERM2F128 $0x03, Y6, Y7, Y0
	VPALIGNR   $0x08, Y6, Y0, Y0
	VPADDQ     Y4, Y0, Y0
	// Y1 = W[t-15] (assembled from Y4/Y5).
	VPERM2F128 $0x03, Y4, Y5, Y1
	VPALIGNR   $0x08, Y4, Y1, Y1
	// sigma0 pieces: Y3 = Y1 rotated right by 1, Y8 = Y1 >> 7.
	VPSRLQ     $0x01, Y1, Y2
	VPSLLQ     $0x3f, Y1, Y3
	VPOR       Y2, Y3, Y3
	VPSRLQ     $0x07, Y1, Y8
	// Round: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11, W+K at 0(SP).
	// R13 = e ror 41 ^ e ror 18 ^ e ror 14, R14 = a ror 34 ^ a ror 39 ^ a ror 28,
	// R15 = ((f ^ g) & e) ^ g, DI = ((a | c) & b) | (a & c).
	MOVQ       AX, DI
	RORXQ      $0x29, DX, R13
	RORXQ      $0x12, DX, R14
	ADDQ       (SP), R11
	ORQ        CX, DI
	MOVQ       R9, R15
	RORXQ      $0x22, AX, R12
	XORQ       R14, R13
	XORQ       R10, R15
	RORXQ      $0x0e, DX, R14
	ANDQ       DX, R15
	XORQ       R14, R13
	RORXQ      $0x27, AX, R14
	ADDQ       R11, R8
	ANDQ       BX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, AX, R12
	XORQ       R10, R15
	XORQ       R12, R14
	MOVQ       AX, R12
	ANDQ       CX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R11
	ADDQ       R15, R8
	ADDQ       R15, R11
	ADDQ       DI, R11
	// Finish sigma0: Y1 = (Y1 ror 8) ^ (Y1 ror 1) ^ (Y1 >> 7); add it into Y0.
	VPSRLQ     $0x08, Y1, Y2
	VPSLLQ     $0x38, Y1, Y1
	VPOR       Y2, Y1, Y1
	VPXOR      Y8, Y3, Y3
	VPXOR      Y1, Y3, Y1
	VPADDQ     Y1, Y0, Y0
	// Y4 = low 128 bits of Y0 in both halves; Y0 keeps only its high 128 bits.
	VPERM2F128 $0x00, Y0, Y0, Y4
	VPAND      MASK_YMM_LO<>+0(SB), Y0, Y0
	// Y2 = high 128 bits of Y7 in both halves (W[t-2] for the first two words).
	VPERM2F128 $0x11, Y7, Y7, Y2
	VPSRLQ     $0x06, Y2, Y8
	// Round: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10, W+K at 8(SP).
	MOVQ       R11, DI
	RORXQ      $0x29, R8, R13
	RORXQ      $0x12, R8, R14
	ADDQ       8(SP), R10
	ORQ        BX, DI
	MOVQ       DX, R15
	RORXQ      $0x22, R11, R12
	XORQ       R14, R13
	XORQ       R9, R15
	RORXQ      $0x0e, R8, R14
	XORQ       R14, R13
	RORXQ      $0x27, R11, R14
	ANDQ       R8, R15
	ADDQ       R10, CX
	ANDQ       AX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, R11, R12
	XORQ       R9, R15
	XORQ       R12, R14
	MOVQ       R11, R12
	ANDQ       BX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R10
	ADDQ       R15, CX
	ADDQ       R15, R10
	ADDQ       DI, R10
	// sigma1: Y8 = (Y2 >> 6) ^ (Y2 ror 19) ^ (Y2 ror 61); Y4 += Y8 completes
	// the first two new words.
	VPSRLQ     $0x13, Y2, Y3
	VPSLLQ     $0x2d, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y2, Y3
	VPSLLQ     $0x03, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y4, Y4
	VPSRLQ     $0x06, Y4, Y8
	// Round: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9, W+K at 16(SP).
	MOVQ       R10, DI
	RORXQ      $0x29, CX, R13
	ADDQ       16(SP), R9
	RORXQ      $0x12, CX, R14
	ORQ        AX, DI
	MOVQ       R8, R15
	XORQ       DX, R15
	RORXQ      $0x22, R10, R12
	XORQ       R14, R13
	ANDQ       CX, R15
	RORXQ      $0x0e, CX, R14
	ADDQ       R9, BX
	ANDQ       R11, DI
	XORQ       R14, R13
	RORXQ      $0x27, R10, R14
	XORQ       DX, R15
	XORQ       R12, R14
	RORXQ      $0x1c, R10, R12
	XORQ       R12, R14
	MOVQ       R10, R12
	ANDQ       AX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R9
	ADDQ       R15, BX
	ADDQ       R15, R9
	ADDQ       DI, R9
	// sigma1 of the two words just produced feeds the last two; the blend
	// keeps the low half of Y4 and takes the high half from Y2.
	VPSRLQ     $0x13, Y4, Y3
	VPSLLQ     $0x2d, Y4, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y4, Y3
	VPSLLQ     $0x03, Y4, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y0, Y2
	VPBLENDD   $0xf0, Y2, Y4, Y4
	// Round: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX, W+K at 24(SP).
	MOVQ       R9, DI
	RORXQ      $0x29, BX, R13
	RORXQ      $0x12, BX, R14
	ADDQ       24(SP), DX
	ORQ        R11, DI
	MOVQ       CX, R15
	RORXQ      $0x22, R9, R12
	XORQ       R14, R13
	XORQ       R8, R15
	RORXQ      $0x0e, BX, R14
	ANDQ       BX, R15
	ADDQ       DX, AX
	ANDQ       R10, DI
	XORQ       R14, R13
	XORQ       R8, R15
	RORXQ      $0x27, R9, R14
	ADDQ       R13, R15
	XORQ       R12, R14
	ADDQ       R15, AX
	RORXQ      $0x1c, R9, R12
	XORQ       R12, R14
	MOVQ       R9, R12
	ANDQ       R11, R12
	ORQ        R12, DI
	ADDQ       R14, DX
	ADDQ       R15, DX
	ADDQ       DI, DX
	// ---- group 1: stage Y5 + K[4..7]; rebuild Y5 (same steps as group 0
	// with the schedule registers rotated by one) ----
	VPADDQ     32(BP), Y5, Y0
	VMOVDQU    Y0, (SP)
	VPERM2F128 $0x03, Y7, Y4, Y0
	VPALIGNR   $0x08, Y7, Y0, Y0
	VPADDQ     Y5, Y0, Y0
	VPERM2F128 $0x03, Y5, Y6, Y1
	VPALIGNR   $0x08, Y5, Y1, Y1
	VPSRLQ     $0x01, Y1, Y2
	VPSLLQ     $0x3f, Y1, Y3
	VPOR       Y2, Y3, Y3
	VPSRLQ     $0x07, Y1, Y8
	// Round: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8, W+K at 0(SP).
	MOVQ       DX, DI
	RORXQ      $0x29, AX, R13
	RORXQ      $0x12, AX, R14
	ADDQ       (SP), R8
	ORQ        R10, DI
	MOVQ       BX, R15
	RORXQ      $0x22, DX, R12
	XORQ       R14, R13
	XORQ       CX, R15
	RORXQ      $0x0e, AX, R14
	ANDQ       AX, R15
	XORQ       R14, R13
	RORXQ      $0x27, DX, R14
	ADDQ       R8, R11
	ANDQ       R9, DI
	XORQ       R12, R14
	RORXQ      $0x1c, DX, R12
	XORQ       CX, R15
	XORQ       R12, R14
	MOVQ       DX, R12
	ANDQ       R10, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R8
	ADDQ       R15, R11
	ADDQ       R15, R8
	ADDQ       DI, R8
	VPSRLQ     $0x08, Y1, Y2
	VPSLLQ     $0x38, Y1, Y1
	VPOR       Y2, Y1, Y1
	VPXOR      Y8, Y3, Y3
	VPXOR      Y1, Y3, Y1
	VPADDQ     Y1, Y0, Y0
	VPERM2F128 $0x00, Y0, Y0, Y5
	VPAND      MASK_YMM_LO<>+0(SB), Y0, Y0
	VPERM2F128 $0x11, Y4, Y4, Y2
	VPSRLQ     $0x06, Y2, Y8
	// Round: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX, W+K at 8(SP).
	MOVQ       R8, DI
	RORXQ      $0x29, R11, R13
	RORXQ      $0x12, R11, R14
	ADDQ       8(SP), CX
	ORQ        R9, DI
	MOVQ       AX, R15
	RORXQ      $0x22, R8, R12
	XORQ       R14, R13
	XORQ       BX, R15
	RORXQ      $0x0e, R11, R14
	XORQ       R14, R13
	RORXQ      $0x27, R8, R14
	ANDQ       R11, R15
	ADDQ       CX, R10
	ANDQ       DX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, R8, R12
	XORQ       BX, R15
	XORQ       R12, R14
	MOVQ       R8, R12
	ANDQ       R9, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, CX
	ADDQ       R15, R10
	ADDQ       R15, CX
	ADDQ       DI, CX
	VPSRLQ     $0x13, Y2, Y3
	VPSLLQ     $0x2d, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y2, Y3
	VPSLLQ     $0x03, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y5, Y5
	VPSRLQ     $0x06, Y5, Y8
	// Round: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX, W+K at 16(SP).
	MOVQ       CX, DI
	RORXQ      $0x29, R10, R13
	ADDQ       16(SP), BX
	RORXQ      $0x12, R10, R14
	ORQ        DX, DI
	MOVQ       R11, R15
	XORQ       AX, R15
	RORXQ      $0x22, CX, R12
	XORQ       R14, R13
	ANDQ       R10, R15
	RORXQ      $0x0e, R10, R14
	ADDQ       BX, R9
	ANDQ       R8, DI
	XORQ       R14, R13
	RORXQ      $0x27, CX, R14
	XORQ       AX, R15
	XORQ       R12, R14
	RORXQ      $0x1c, CX, R12
	XORQ       R12, R14
	MOVQ       CX, R12
	ANDQ       DX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, BX
	ADDQ       R15, R9
	ADDQ       R15, BX
	ADDQ       DI, BX
	VPSRLQ     $0x13, Y5, Y3
	VPSLLQ     $0x2d, Y5, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y5, Y3
	VPSLLQ     $0x03, Y5, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y0, Y2
	VPBLENDD   $0xf0, Y2, Y5, Y5
	// Round: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX, W+K at 24(SP).
	MOVQ       BX, DI
	RORXQ      $0x29, R9, R13
	RORXQ      $0x12, R9, R14
	ADDQ       24(SP), AX
	ORQ        R8, DI
	MOVQ       R10, R15
	RORXQ      $0x22, BX, R12
	XORQ       R14, R13
	XORQ       R11, R15
	RORXQ      $0x0e, R9, R14
	ANDQ       R9, R15
	ADDQ       AX, DX
	ANDQ       CX, DI
	XORQ       R14, R13
	XORQ       R11, R15
	RORXQ      $0x27, BX, R14
	ADDQ       R13, R15
	XORQ       R12, R14
	ADDQ       R15, DX
	RORXQ      $0x1c, BX, R12
	XORQ       R12, R14
	MOVQ       BX, R12
	ANDQ       R8, R12
	ORQ        R12, DI
	ADDQ       R14, AX
	ADDQ       R15, AX
	ADDQ       DI, AX
	// ---- group 2: stage Y6 + K[8..11]; rebuild Y6. The a..h registers are
	// back in their loop-entry roles after eight rounds. ----
	VPADDQ     64(BP), Y6, Y0
	VMOVDQU    Y0, (SP)
	VPERM2F128 $0x03, Y4, Y5, Y0
	VPALIGNR   $0x08, Y4, Y0, Y0
	VPADDQ     Y6, Y0, Y0
	VPERM2F128 $0x03, Y6, Y7, Y1
	VPALIGNR   $0x08, Y6, Y1, Y1
	VPSRLQ     $0x01, Y1, Y2
	VPSLLQ     $0x3f, Y1, Y3
	VPOR       Y2, Y3, Y3
	VPSRLQ     $0x07, Y1, Y8
	// Round: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11, W+K at 0(SP).
	MOVQ       AX, DI
	RORXQ      $0x29, DX, R13
	RORXQ      $0x12, DX, R14
	ADDQ       (SP), R11
	ORQ        CX, DI
	MOVQ       R9, R15
	RORXQ      $0x22, AX, R12
	XORQ       R14, R13
	XORQ       R10, R15
	RORXQ      $0x0e, DX, R14
	ANDQ       DX, R15
	XORQ       R14, R13
	RORXQ      $0x27, AX, R14
	ADDQ       R11, R8
	ANDQ       BX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, AX, R12
	XORQ       R10, R15
	XORQ       R12, R14
	MOVQ       AX, R12
	ANDQ       CX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R11
	ADDQ       R15, R8
	ADDQ       R15, R11
	ADDQ       DI, R11
	VPSRLQ     $0x08, Y1, Y2
	VPSLLQ     $0x38, Y1, Y1
	VPOR       Y2, Y1, Y1
	VPXOR      Y8, Y3, Y3
	VPXOR      Y1, Y3, Y1
	VPADDQ     Y1, Y0, Y0
	VPERM2F128 $0x00, Y0, Y0, Y6
	VPAND      MASK_YMM_LO<>+0(SB), Y0, Y0
	VPERM2F128 $0x11, Y5, Y5, Y2
	VPSRLQ     $0x06, Y2, Y8
	// Round: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10, W+K at 8(SP).
	MOVQ       R11, DI
	RORXQ      $0x29, R8, R13
	RORXQ      $0x12, R8, R14
	ADDQ       8(SP), R10
	ORQ        BX, DI
	MOVQ       DX, R15
	RORXQ      $0x22, R11, R12
	XORQ       R14, R13
	XORQ       R9, R15
	RORXQ      $0x0e, R8, R14
	XORQ       R14, R13
	RORXQ      $0x27, R11, R14
	ANDQ       R8, R15
	ADDQ       R10, CX
	ANDQ       AX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, R11, R12
	XORQ       R9, R15
	XORQ       R12, R14
	MOVQ       R11, R12
	ANDQ       BX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R10
	ADDQ       R15, CX
	ADDQ       R15, R10
	ADDQ       DI, R10
	VPSRLQ     $0x13, Y2, Y3
	VPSLLQ     $0x2d, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y2, Y3
	VPSLLQ     $0x03, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y6, Y6
	VPSRLQ     $0x06, Y6, Y8
	// Round: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9, W+K at 16(SP).
	MOVQ       R10, DI
	RORXQ      $0x29, CX, R13
	ADDQ       16(SP), R9
	RORXQ      $0x12, CX, R14
	ORQ        AX, DI
	MOVQ       R8, R15
	XORQ       DX, R15
	RORXQ      $0x22, R10, R12
	XORQ       R14, R13
	ANDQ       CX, R15
	RORXQ      $0x0e, CX, R14
	ADDQ       R9, BX
	ANDQ       R11, DI
	XORQ       R14, R13
	RORXQ      $0x27, R10, R14
	XORQ       DX, R15
	XORQ       R12, R14
	RORXQ      $0x1c, R10, R12
	XORQ       R12, R14
	MOVQ       R10, R12
	ANDQ       AX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R9
	ADDQ       R15, BX
	ADDQ       R15, R9
	ADDQ       DI, R9
	VPSRLQ     $0x13, Y6, Y3
	VPSLLQ     $0x2d, Y6, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y6, Y3
	VPSLLQ     $0x03, Y6, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y0, Y2
	VPBLENDD   $0xf0, Y2, Y6, Y6
	// Round: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX, W+K at 24(SP).
	MOVQ       R9, DI
	RORXQ      $0x29, BX, R13
	RORXQ      $0x12, BX, R14
	ADDQ       24(SP), DX
	ORQ        R11, DI
	MOVQ       CX, R15
	RORXQ      $0x22, R9, R12
	XORQ       R14, R13
	XORQ       R8, R15
	RORXQ      $0x0e, BX, R14
	ANDQ       BX, R15
	ADDQ       DX, AX
	ANDQ       R10, DI
	XORQ       R14, R13
	XORQ       R8, R15
	RORXQ      $0x27, R9, R14
	ADDQ       R13, R15
	XORQ       R12, R14
	ADDQ       R15, AX
	RORXQ      $0x1c, R9, R12
	XORQ       R12, R14
	MOVQ       R9, R12
	ANDQ       R11, R12
	ORQ        R12, DI
	ADDQ       R14, DX
	ADDQ       R15, DX
	ADDQ       DI, DX
	// ---- group 3: stage Y7 + K[12..15]; rebuild Y7. BP then advances by
	// 0x80 bytes (16 constants) for the next pass. ----
	VPADDQ     96(BP), Y7, Y0
	VMOVDQU    Y0, (SP)
	ADDQ       $0x80, BP
	VPERM2F128 $0x03, Y5, Y6, Y0
	VPALIGNR   $0x08, Y5, Y0, Y0
	VPADDQ     Y7, Y0, Y0
	VPERM2F128 $0x03, Y7, Y4, Y1
	VPALIGNR   $0x08, Y7, Y1, Y1
	VPSRLQ     $0x01, Y1, Y2
	VPSLLQ     $0x3f, Y1, Y3
	VPOR       Y2, Y3, Y3
	VPSRLQ     $0x07, Y1, Y8
	// Round: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8, W+K at 0(SP).
	MOVQ       DX, DI
	RORXQ      $0x29, AX, R13
	RORXQ      $0x12, AX, R14
	ADDQ       (SP), R8
	ORQ        R10, DI
	MOVQ       BX, R15
	RORXQ      $0x22, DX, R12
	XORQ       R14, R13
	XORQ       CX, R15
	RORXQ      $0x0e, AX, R14
	ANDQ       AX, R15
	XORQ       R14, R13
	RORXQ      $0x27, DX, R14
	ADDQ       R8, R11
	ANDQ       R9, DI
	XORQ       R12, R14
	RORXQ      $0x1c, DX, R12
	XORQ       CX, R15
	XORQ       R12, R14
	MOVQ       DX, R12
	ANDQ       R10, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, R8
	ADDQ       R15, R11
	ADDQ       R15, R8
	ADDQ       DI, R8
	VPSRLQ     $0x08, Y1, Y2
	VPSLLQ     $0x38, Y1, Y1
	VPOR       Y2, Y1, Y1
	VPXOR      Y8, Y3, Y3
	VPXOR      Y1, Y3, Y1
	VPADDQ     Y1, Y0, Y0
	VPERM2F128 $0x00, Y0, Y0, Y7
	VPAND      MASK_YMM_LO<>+0(SB), Y0, Y0
	VPERM2F128 $0x11, Y6, Y6, Y2
	VPSRLQ     $0x06, Y2, Y8
	// Round: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX, W+K at 8(SP).
	MOVQ       R8, DI
	RORXQ      $0x29, R11, R13
	RORXQ      $0x12, R11, R14
	ADDQ       8(SP), CX
	ORQ        R9, DI
	MOVQ       AX, R15
	RORXQ      $0x22, R8, R12
	XORQ       R14, R13
	XORQ       BX, R15
	RORXQ      $0x0e, R11, R14
	XORQ       R14, R13
	RORXQ      $0x27, R8, R14
	ANDQ       R11, R15
	ADDQ       CX, R10
	ANDQ       DX, DI
	XORQ       R12, R14
	RORXQ      $0x1c, R8, R12
	XORQ       BX, R15
	XORQ       R12, R14
	MOVQ       R8, R12
	ANDQ       R9, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, CX
	ADDQ       R15, R10
	ADDQ       R15, CX
	ADDQ       DI, CX
	VPSRLQ     $0x13, Y2, Y3
	VPSLLQ     $0x2d, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y2, Y3
	VPSLLQ     $0x03, Y2, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y7, Y7
	VPSRLQ     $0x06, Y7, Y8
	// Round: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX, W+K at 16(SP).
	MOVQ       CX, DI
	RORXQ      $0x29, R10, R13
	ADDQ       16(SP), BX
	RORXQ      $0x12, R10, R14
	ORQ        DX, DI
	MOVQ       R11, R15
	XORQ       AX, R15
	RORXQ      $0x22, CX, R12
	XORQ       R14, R13
	ANDQ       R10, R15
	RORXQ      $0x0e, R10, R14
	ADDQ       BX, R9
	ANDQ       R8, DI
	XORQ       R14, R13
	RORXQ      $0x27, CX, R14
	XORQ       AX, R15
	XORQ       R12, R14
	RORXQ      $0x1c, CX, R12
	XORQ       R12, R14
	MOVQ       CX, R12
	ANDQ       DX, R12
	ADDQ       R13, R15
	ORQ        R12, DI
	ADDQ       R14, BX
	ADDQ       R15, R9
	ADDQ       R15, BX
	ADDQ       DI, BX
	VPSRLQ     $0x13, Y7, Y3
	VPSLLQ     $0x2d, Y7, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPSRLQ     $0x3d, Y7, Y3
	VPSLLQ     $0x03, Y7, Y1
	VPOR       Y1, Y3, Y3
	VPXOR      Y3, Y8, Y8
	VPADDQ     Y8, Y0, Y2
	VPBLENDD   $0xf0, Y2, Y7, Y7
	// Round: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX, W+K at 24(SP).
	MOVQ       BX, DI
	RORXQ      $0x29, R9, R13
	RORXQ      $0x12, R9, R14
	ADDQ       24(SP), AX
	ORQ        R8, DI
	MOVQ       R10, R15
	RORXQ      $0x22, BX, R12
	XORQ       R14, R13
	XORQ       R11, R15
	RORXQ      $0x0e, R9, R14
	ANDQ       R9, R15
	ADDQ       AX, DX
	ANDQ       CX, DI
	XORQ       R14, R13
	XORQ       R11, R15
	RORXQ      $0x27, BX, R14
	ADDQ       R13, R15
	XORQ       R12, R14
	ADDQ       R15, DX
	RORXQ      $0x1c, BX, R12
	XORQ       R12, R14
	MOVQ       BX, R12
	ANDQ       R8, R12
	ORQ        R12, DI
	ADDQ       R14, AX
	ADDQ       R15, AX
	ADDQ       DI, AX
	// Next 16-round pass, or fall through to the final 16 rounds.
	SUBQ       $0x01, 32(SP)
	JNE        loop1
	// loop2 runs twice (2 x 8 rounds).
	MOVQ       $0x00000002, 32(SP)

	// loop2: the final rounds. No new schedule words are derived here; each
	// pass stages Y4+K and Y5+K and runs eight scalar rounds. The last add of
	// each round (ADDQ DI, <new a>) is deferred into the following round.
loop2:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, (SP)
	// Round: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11, W+K at 0(SP).
	MOVQ    R9, R15
	RORXQ   $0x29, DX, R13
	RORXQ   $0x12, DX, R14
	XORQ    R10, R15
	XORQ    R14, R13
	RORXQ   $0x0e, DX, R14
	ANDQ    DX, R15
	XORQ    R14, R13
	RORXQ   $0x22, AX, R12
	XORQ    R10, R15
	RORXQ   $0x27, AX, R14
	MOVQ    AX, DI
	XORQ    R12, R14
	RORXQ   $0x1c, AX, R12
	ADDQ    (SP), R11
	ORQ     CX, DI
	XORQ    R12, R14
	MOVQ    AX, R12
	ANDQ    BX, DI
	ANDQ    CX, R12
	ADDQ    R13, R15
	ADDQ    R11, R8
	ORQ     R12, DI
	ADDQ    R14, R11
	ADDQ    R15, R8
	ADDQ    R15, R11
	// Round: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10, W+K at 8(SP).
	MOVQ    DX, R15
	RORXQ   $0x29, R8, R13
	RORXQ   $0x12, R8, R14
	XORQ    R9, R15
	XORQ    R14, R13
	RORXQ   $0x0e, R8, R14
	ANDQ    R8, R15
	ADDQ    DI, R11
	XORQ    R14, R13
	RORXQ   $0x22, R11, R12
	XORQ    R9, R15
	RORXQ   $0x27, R11, R14
	MOVQ    R11, DI
	XORQ    R12, R14
	RORXQ   $0x1c, R11, R12
	ADDQ    8(SP), R10
	ORQ     BX, DI
	XORQ    R12, R14
	MOVQ    R11, R12
	ANDQ    AX, DI
	ANDQ    BX, R12
	ADDQ    R13, R15
	ADDQ    R10, CX
	ORQ     R12, DI
	ADDQ    R14, R10
	ADDQ    R15, CX
	ADDQ    R15, R10
	// Round: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9, W+K at 16(SP).
	MOVQ    R8, R15
	RORXQ   $0x29, CX, R13
	RORXQ   $0x12, CX, R14
	XORQ    DX, R15
	XORQ    R14, R13
	RORXQ   $0x0e, CX, R14
	ANDQ    CX, R15
	ADDQ    DI, R10
	XORQ    R14, R13
	RORXQ   $0x22, R10, R12
	XORQ    DX, R15
	RORXQ   $0x27, R10, R14
	MOVQ    R10, DI
	XORQ    R12, R14
	RORXQ   $0x1c, R10, R12
	ADDQ    16(SP), R9
	ORQ     AX, DI
	XORQ    R12, R14
	MOVQ    R10, R12
	ANDQ    R11, DI
	ANDQ    AX, R12
	ADDQ    R13, R15
	ADDQ    R9, BX
	ORQ     R12, DI
	ADDQ    R14, R9
	ADDQ    R15, BX
	ADDQ    R15, R9
	// Round: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX, W+K at 24(SP).
	MOVQ    CX, R15
	RORXQ   $0x29, BX, R13
	RORXQ   $0x12, BX, R14
	XORQ    R8, R15
	XORQ    R14, R13
	RORXQ   $0x0e, BX, R14
	ANDQ    BX, R15
	ADDQ    DI, R9
	XORQ    R14, R13
	RORXQ   $0x22, R9, R12
	XORQ    R8, R15
	RORXQ   $0x27, R9, R14
	MOVQ    R9, DI
	XORQ    R12, R14
	RORXQ   $0x1c, R9, R12
	ADDQ    24(SP), DX
	ORQ     R11, DI
	XORQ    R12, R14
	MOVQ    R9, R12
	ANDQ    R10, DI
	ANDQ    R11, R12
	ADDQ    R13, R15
	ADDQ    DX, AX
	ORQ     R12, DI
	ADDQ    R14, DX
	ADDQ    R15, AX
	ADDQ    R15, DX
	ADDQ    DI, DX
	// Stage Y5 + K for the next four rounds; each loop2 pass consumes
	// 0x40 bytes (8 constants) from the table.
	VPADDQ  32(BP), Y5, Y0
	VMOVDQU Y0, (SP)
	ADDQ    $0x40, BP
	// Round: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8, W+K at 0(SP).
	MOVQ    BX, R15
	RORXQ   $0x29, AX, R13
	RORXQ   $0x12, AX, R14
	XORQ    CX, R15
	XORQ    R14, R13
	RORXQ   $0x0e, AX, R14
	ANDQ    AX, R15
	XORQ    R14, R13
	RORXQ   $0x22, DX, R12
	XORQ    CX, R15
	RORXQ   $0x27, DX, R14
	MOVQ    DX, DI
	XORQ    R12, R14
	RORXQ   $0x1c, DX, R12
	ADDQ    (SP), R8
	ORQ     R10, DI
	XORQ    R12, R14
	MOVQ    DX, R12
	ANDQ    R9, DI
	ANDQ    R10, R12
	ADDQ    R13, R15
	ADDQ    R8, R11
	ORQ     R12, DI
	ADDQ    R14, R8
	ADDQ    R15, R11
	ADDQ    R15, R8
	// Round: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX, W+K at 8(SP).
	MOVQ    AX, R15
	RORXQ   $0x29, R11, R13
	RORXQ   $0x12, R11, R14
	XORQ    BX, R15
	XORQ    R14, R13
	RORXQ   $0x0e, R11, R14
	ANDQ    R11, R15
	ADDQ    DI, R8
	XORQ    R14, R13
	RORXQ   $0x22, R8, R12
	XORQ    BX, R15
	RORXQ   $0x27, R8, R14
	MOVQ    R8, DI
	XORQ    R12, R14
	RORXQ   $0x1c, R8, R12
	ADDQ    8(SP), CX
	ORQ     R9, DI
	XORQ    R12, R14
	MOVQ    R8, R12
	ANDQ    DX, DI
	ANDQ    R9, R12
	ADDQ    R13, R15
	ADDQ    CX, R10
	ORQ     R12, DI
	ADDQ    R14, CX
	ADDQ    R15, R10
	ADDQ    R15, CX
	// Round: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX, W+K at 16(SP).
	MOVQ    R11, R15
	RORXQ   $0x29, R10, R13
	RORXQ   $0x12, R10, R14
	XORQ    AX, R15
	XORQ    R14, R13
	RORXQ   $0x0e, R10, R14
	ANDQ    R10, R15
	ADDQ    DI, CX
	XORQ    R14, R13
	RORXQ   $0x22, CX, R12
	XORQ    AX, R15
	RORXQ   $0x27, CX, R14
	MOVQ    CX, DI
	XORQ    R12, R14
	RORXQ   $0x1c, CX, R12
	ADDQ    16(SP), BX
	ORQ     DX, DI
	XORQ    R12, R14
	MOVQ    CX, R12
	ANDQ    R8, DI
	ANDQ    DX, R12
	ADDQ    R13, R15
	ADDQ    BX, R9
	ORQ     R12, DI
	ADDQ    R14, BX
	ADDQ    R15, R9
	ADDQ    R15, BX
	// Round: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX, W+K at 24(SP).
	MOVQ    R10, R15
	RORXQ   $0x29, R9, R13
	RORXQ   $0x12, R9, R14
	XORQ    R11, R15
	XORQ    R14, R13
	RORXQ   $0x0e, R9, R14
	ANDQ    R9, R15
	ADDQ    DI, BX
	XORQ    R14, R13
	RORXQ   $0x22, BX, R12
	XORQ    R11, R15
	RORXQ   $0x27, BX, R14
	MOVQ    BX, DI
	XORQ    R12, R14
	RORXQ   $0x1c, BX, R12
	ADDQ    24(SP), AX
	ORQ     R8, DI
	XORQ    R12, R14
	MOVQ    BX, R12
	ANDQ    CX, DI
	ANDQ    R8, R12
	ADDQ    R13, R15
	ADDQ    AX, DX
	ORQ     R12, DI
	ADDQ    R14, AX
	ADDQ    R15, DX
	ADDQ    R15, AX
	ADDQ    DI, AX
	// Shift the remaining schedule words down so the second pass reads them
	// from Y4/Y5.
	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5
	SUBQ    $0x01, 32(SP)
	JNE     loop2
	// Add the working variables into the stored state and keep the sums in the
	// registers as the starting values for the next block.
	ADDQ    (SI), AX
	MOVQ    AX, (SI)
	ADDQ    8(SI), BX
	MOVQ    BX, 8(SI)
	ADDQ    16(SI), CX
	MOVQ    CX, 16(SI)
	ADDQ    24(SI), R8
	MOVQ    R8, 24(SI)
	ADDQ    32(SI), DX
	MOVQ    DX, 32(SI)
	ADDQ    40(SI), R9
	MOVQ    R9, 40(SI)
	ADDQ    48(SI), R10
	MOVQ    R10, 48(SI)
	ADDQ    56(SI), R11
	MOVQ    R11, 56(SI)
	// Restore the input cursor, step to the next 128-byte block and loop until
	// it reaches the end pointer saved at 48(SP).
	MOVQ    40(SP), DI
	ADDQ    $0x80, DI
	CMPQ    DI, 48(SP)
	JNE     loop0

done_hash:
	// Zero the upper halves of the YMM registers before returning.
	VZEROUPPER
	RET

// PSHUFFLE_BYTE_FLIP_MASK is the 32-byte VPSHUFB control that blockAVX2 loads
// into Y9. Stored little-endian, the first 16 bytes are the indices
// 7,6,...,0,15,14,...,8, so each 64-bit quadword of the shuffled vector has its
// byte order reversed. The second 16 bytes carry the same indices plus 0x10;
// VPSHUFB only uses the low four bits of each index within a 128-bit lane, so
// the upper lane is reversed the same way.
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+8(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+16(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+24(SB)/8, $0x18191a1b1c1d1e1f
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), RODATA|NOPTR, $32

// MASK_YMM_LO is a 32-byte AND mask whose low 128 bits are zero and whose high
// 128 bits are all ones. blockAVX2 applies it with VPAND to clear the two low
// quadwords of Y0 while keeping the two high quadwords (note that, despite the
// name, it is the upper half that survives).
DATA MASK_YMM_LO<>+0(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+8(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+16(SB)/8, $0xffffffffffffffff
DATA MASK_YMM_LO<>+24(SB)/8, $0xffffffffffffffff
GLOBL MASK_YMM_LO<>(SB), RODATA|NOPTR, $32