```
Source File
count_amd64.s
Belonging Package
internal/bytealg
```

`// Copyright 2018 The Go Authors. All rights reserved.`

`// Use of this source code is governed by a BSD-style`

`// license that can be found in the LICENSE file.`

`#include "go_asm.h"`

`#include "textflag.h"`

// Count entry point for a byte slice.
// Frame layout used below: b_base+0, b_len+8, c+24, ret+32 (40 bytes of args).
// Presumably backs a Go declaration like func Count(b []byte, c byte) int;
// the prototype is not visible in this file.
//
// When the CPU reports POPCNT, the arguments are moved into the registers
// countbody<> expects and control transfers there; otherwise the call is
// forwarded to countGeneric (defined outside this file) with the frame untouched.
TEXT ·Count(SB),NOSPLIT,$0-40
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JNE	count_generic
	// None of the loads below touch flags or depend on each other.
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	MOVB	c+24(FP), AL		// AL = byte sought
	MOVQ	b_len+8(FP), BX		// BX = data len
	MOVQ	b_base+0(FP), SI	// SI = data
	JMP	countbody<>(SB)
count_generic:
	JMP	·countGeneric(SB)

// Count entry point for a string.
// Frame layout used below: s_base+0, s_len+8, c+16, ret+24 (32 bytes of args).
// Presumably backs a Go declaration like func CountString(s string, c byte) int;
// the prototype is not visible in this file.
//
// When the CPU reports POPCNT, the arguments are moved into the registers
// countbody<> expects and control transfers there; otherwise the call is
// forwarded to countGenericString (defined outside this file) with the
// frame untouched.
TEXT ·CountString(SB),NOSPLIT,$0-32
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JNE	countstring_generic
	// None of the loads below touch flags or depend on each other.
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	MOVB	c+16(FP), AL		// AL = byte sought
	MOVQ	s_len+8(FP), BX		// BX = data len
	MOVQ	s_base+0(FP), SI	// SI = data
	JMP	countbody<>(SB)
countstring_generic:
	JMP	·countGenericString(SB)

// countbody<> counts how many of the BX bytes starting at SI equal AL and
// stores that count as a 64-bit value at (R8).
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// clobbers:
//   AX, BX, CX, DX, DI, R10, R11, R12, X0, X1, Y1, Y2, Y3, flags
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12	// Accumulator

	MOVQ	SI, DI	// DI = cursor over the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep going while a full 16-byte chunk starting at DI is in bounds.
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	// R10 = 0xFFFF with its low (16-BX) bits cleared, so only the
	// final BX bytes of the chunk are counted.
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	// The branch is taken when bits 4-11 of SI+16 are all zero, i.e.
	// SI lies in the last 16 bytes of a 4KB-aligned region.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	// R10 = (1 << BX) - 1: one bit per valid byte.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// R10 = 0xFFFF with its low (16-BX) bits cleared.
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1.
	// The load ends exactly at SI+BX, so it does not read past the data.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	// Reached only for BX > 32. Without AVX2, fall back to the SSE loop;
	// R12 and DI are already initialised for it.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// Y1 = sought byte in all 32 lanes
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	// Unsigned pointer comparison, matching the SSE loop above.
	CMPQ	DI, R11
	JBE	avx2_loop

	// If last block is already processed,
	// skip to the end.
	// The loop above exits only once DI > R11, so DI can never equal
	// R11 here. The data has been fully consumed exactly when DI has
	// reached its end, R11+32. CX is free: it is reloaded before its
	// next use below.
	LEAQ	32(R11), CX
	CMPQ	DI, CX
	JEQ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next.
	// R10 = 0xFFFFFFFF with its low (32-BX) bits cleared, so only the
	// final BX bytes of the block are counted.
	ANDQ	$31, BX
	MOVQ	$32,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET

The pages are generated with Golds v0.1.6-preview. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project and developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.