// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Test-only.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
	MOVQ sizeClass+0(FP), CX
	MOVQ packed+8(FP), AX

	// Call the expander for this size class
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)
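
	// Each expander reads the packed mark bits through AX and leaves
	// the expanded bitmap in Z1+Z2, one bit per word of the span.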

	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
	VMOVDQU64 Z1, 0(DI)
	VMOVDQU64 Z2, 64(DI)
	VZEROUPPER
	RET

TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
	// Z1+Z2 = Expand the grey object mask into a grey word mask
	MOVQ objMarks+16(FP), AX
	MOVQ sizeClass+24(FP), CX
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)
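
	// In Go terms, the expansion is roughly the following sketch, where
	// objWords is the size of an object in this size class, in words
	// (the names are illustrative, not the runtime's):
	//
	//	for i := 0; i < numObjects; i++ {
	//		if markBit(objMarks, i) {
	//			for j := 0; j < objWords; j++ {
	//				setBit(wordMask, i*objWords+j)
	//			}
	//		}
	//	}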

	// Z3+Z4 = Load the pointer mask
	MOVQ ptrMask+32(FP), AX
	VMOVDQU64 0(AX), Z3
	VMOVDQU64 64(AX), Z4

	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
	VPANDQ Z1, Z3, Z1
	VPANDQ Z2, Z4, Z2
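
	// In Go terms, for each word i of the span, roughly:
	//
	//	scan[i] = greyWord[i] && hasPointer[i]
	//
	// Only words that lie in a grey object and hold a pointer need
	// to be scanned.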

	// Now each bit of Z1+Z2 represents one word of the span.
	// Thus, each byte covers 64 bytes of memory, which is also how
	// much we can fit in a Z register.
	//
	// We do a load/compress for each 64 byte frame.
	//
	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
	VPOPCNTB Z1, Z3 // Requires BITALG
	VPOPCNTB Z2, Z4
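
	// In Go terms, viewing the scan mask as a [128]uint8, this is
	// roughly (bits is math/bits; the names are illustrative):
	//
	//	for k, b := range scanMask {
	//		counts[k] = bits.OnesCount8(b)
	//	}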

	// Store the scan mask and word counts at 0(SP) and 128(SP).
	//
	// TODO: Is it better to read directly from the registers?
	VMOVDQU64 Z1, 0(SP)
	VMOVDQU64 Z2, 64(SP)
	VMOVDQU64 Z3, 128(SP)
	VMOVDQU64 Z4, 192(SP)
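
	// Stack layout: the 128 byte scan mask sits at 0(SP) and the
	// per-frame word counts sit at 128(SP), so the loop below can
	// address both with a single index.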

	// SI = Current address in span
	MOVQ mem+0(FP), SI
	// DI = Scan buffer base
	MOVQ bufp+8(FP), DI
	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
	MOVQ $0, DX

	// AX = address in scan mask, 128(AX) = address in popcount
	LEAQ 0(SP), AX

	// BX = 1 past the end of the scan mask
	LEAQ 128(SP), BX

	// Loop over the 64 byte frames in this span.

	// Align loop to a cache line so that performance is less sensitive
	// to how this function ends up laid out in memory. This is a hot
	// function in the GC, and this is a tight loop. We don't want
	// performance to waver wildly due to unrelated changes.
	PCALIGN $64
loop:
	// CX = Mask of words to load from this frame
	MOVBQZX 0(AX), CX
	// Skip empty frames.
	TESTQ CX, CX
	JZ skip

	// Load the 64 byte frame.
	KMOVB CX, K1
	VMOVDQA64 0(SI), Z1
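	// The load is unmasked: the whole frame lies within the span and is
	// safe to read; the compress below selects just the masked words.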

	// Collect just the pointers from the greyed objects into the scan buffer,
	// i.e., copy the words whose bits are set in the scan mask from Z1 into
	// contiguous memory.
	//
	// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
	// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
	// imposes a severe performance penalty of around an order of magnitude
	// compared to a register destination.
	//
	// This workaround is unfortunate on other microarchitectures, where a
	// memory destination is slightly faster than adding an additional move
	// instruction, but nowhere near an order of magnitude. It would be
	// nice to have a Genoa-only variant here.
	//
	// AMD Turin / Zen 5 fixes this issue.
	//
	// See
	// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
	VPCOMPRESSQ Z1, K1, Z2
	VMOVDQU64 Z2, (DI)(DX*8)
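
	// In Go terms, the compress and store above are roughly the
	// following, where frame is the 8 words loaded into Z1 and mask is
	// this frame's scan mask byte (names are illustrative):
	//
	//	n := 0
	//	for j := 0; j < 8; j++ {
	//		if mask&(1<<j) != 0 {
	//			buf[bufIdx+n] = frame[j]
	//			n++
	//		}
	//	}
	//
	// The store writes all 64 bytes of Z2, but only the first n words
	// are meaningful. The advance below accounts for this, so garbage
	// words are either overwritten by a later frame or excluded by the
	// returned count (assuming the buffer has slack for the overwrite).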

	// Advance the scan buffer position by the number of pointers.
	MOVBQZX 128(AX), CX
	ADDQ CX, DX

skip:
	ADDQ $64, SI
	ADDQ $1, AX
	CMPQ AX, BX
	JB loop

end:
	MOVL DX, count+40(FP)
	VZEROUPPER
	RET