// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// ExpandAVX512 runs the AVX-512 expander selected by sizeClass and copies
// the 128 bytes found in Z1+Z2 after the call to the caller's output buffer.
//
// Test-only.
//
// Arguments (three 8-byte slots, no results):
//   sizeClass+0(FP)   index into the ·gcExpandersAVX512 table
//   packed+8(FP)      value handed to the expander in AX
//   unpacked+16(FP)   destination pointer; 128 bytes are stored there
//
// NOTE(review): the expanders themselves are not defined in this part of
// the file. From this call site they appear to take their input in AX and
// leave their result in Z1+Z2; confirm against their definition, including
// which registers they clobber.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
MOVQ sizeClass+0(FP), CX
MOVQ packed+8(FP), AX
// Call the expander for this size class: an indirect call through the
// 8-byte table entry at index CX.
LEAQ ·gcExpandersAVX512(SB), BX
CALL (BX)(CX*8)
// The destination pointer is only loaded after the call, so it does not
// need to survive whatever the expander does to DI.
MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
// Unaligned stores: no alignment is required of the output buffer.
VMOVDQU64 Z1, 0(DI)
VMOVDQU64 Z2, 64(DI)
// Clear the upper vector register state before returning.
VZEROUPPER
RET
// scanSpanPackedAVX512 copies selected 8-byte words of a span into a scan
// buffer, packed contiguously, and returns how many words it copied.
//
// A word is selected when its bit is set in both the expanded object mark
// mask (produced by the size-class expander) and the pointer mask.
//
// Arguments:
//   mem+0(FP)        start of the memory to scan. The loop visits 128 frames
//                    of 64 bytes (8192 bytes) using aligned 64-byte loads.
//   bufp+8(FP)       base address of the scan buffer (output)
//   objMarks+16(FP)  value handed to the expander in AX
//   sizeClass+24(FP) index into the ·gcExpandersAVX512 table
//   ptrMask+32(FP)   address of a 128-byte pointer mask
//   count+40(FP)     result (32-bit): number of words written to the buffer
//
// Stack frame (256 bytes):
//   0(SP)..127(SP)    scan mask, one bit per word, one byte per 64-byte frame
//   128(SP)..255(SP)  population count of each scan mask byte
//
// NOTE(review): the expanders are not defined in this part of the file. From
// this call site they appear to take their input in AX and leave a 128-byte
// result in Z1+Z2; confirm against their definition.
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
// Z1+Z2 = Expand the grey object mask into a grey word mask
MOVQ objMarks+16(FP), AX
MOVQ sizeClass+24(FP), CX
// Indirect call through the 8-byte table entry at index CX.
LEAQ ·gcExpandersAVX512(SB), BX
CALL (BX)(CX*8)
// Z3+Z4 = Load the pointer mask
MOVQ ptrMask+32(FP), AX
VMOVDQU64 0(AX), Z3
VMOVDQU64 64(AX), Z4
// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
VPANDQ Z1, Z3, Z1
VPANDQ Z2, Z4, Z2
// Now each bit of Z1+Z2 represents one word of the span.
// Thus, each byte covers 64 bytes of memory, which is also how
// much we can fit in a Z register.
//
// We do a load/compress for each 64 byte frame.
//
// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
VPOPCNTB Z1, Z3 // Requires BITALG
VPOPCNTB Z2, Z4
// Store the scan mask and word counts at 0(SP) and 128(SP).
// The loop below reads them back one byte at a time.
//
// TODO: Is it better to read directly from the registers?
VMOVDQU64 Z1, 0(SP)
VMOVDQU64 Z2, 64(SP)
VMOVDQU64 Z3, 128(SP)
VMOVDQU64 Z4, 192(SP)
// SI = Current address in span
MOVQ mem+0(FP), SI
// DI = Scan buffer base
MOVQ bufp+8(FP), DI
// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
MOVQ $0, DX
// AX = address in scan mask, 128(AX) = address in popcount
LEAQ 0(SP), AX
// Loop over the 64 byte frames in this span.
// BX = 1 past the end of the scan mask
LEAQ 128(SP), BX
// Align loop to a cache line so that performance is less sensitive
// to how this function ends up laid out in memory. This is a hot
// function in the GC, and this is a tight loop. We don't want
// performance to waver wildly due to unrelated changes.
PCALIGN $64
loop:
// CX = Fetch the mask of words to load from this frame.
MOVBQZX 0(AX), CX
// Skip empty frames.
TESTQ CX, CX
JZ skip
// Load the 64 byte frame.
// VMOVDQA64 is the aligned form of the load, so this relies on SI (and
// therefore mem) being 64-byte aligned.
KMOVB CX, K1
VMOVDQA64 0(SI), Z1
// Collect just the pointers from the greyed objects into the scan buffer,
// i.e., copy the word indices in the mask from Z1 into contiguous memory.
//
// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
// imposes a severe performance penalty of around an order of magnitude
// compared to a register destination.
//
// This workaround is unfortunate on other microarchitectures, where a
// memory destination is slightly faster than adding an additional move
// instruction, but nowhere near an order of magnitude. It would be
// nice to have a Genoa-only variant here.
//
// AMD Turin / Zen 5 fixes this issue.
//
// See
// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
VPCOMPRESSQ Z1, K1, Z2
// The store below always writes a full 64 bytes, but DX only advances by
// the number of selected words, so only that many words are meaningful;
// the rest is overwritten by later stores or left as garbage.
// NOTE(review): this presumably requires the scan buffer to have at least
// 64 bytes of room past the current position; confirm against the caller.
VMOVDQU64 Z2, (DI)(DX*8)
// Advance the scan buffer position by the number of pointers.
// 128(AX) is the popcount byte that corresponds to the mask byte at 0(AX).
MOVBQZX 128(AX), CX
ADDQ CX, DX
skip:
// Advance to the next 64 byte frame and its mask byte.
ADDQ $64, SI
ADDQ $1, AX
// Continue while AX is (unsigned) below the end of the scan mask.
CMPQ AX, BX
JB loop
// No jump in this function targets the end label; the loop falls through here.
end:
// Return the number of words written as a 32-bit count.
MOVL DX, count+40(FP)
// Clear the upper vector register state before returning.
VZEROUPPER
RET
 |
The pages are generated with Golds v0.8.3-preview. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |