// Copyright 2025 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package asmgenimport ()// Note: Exported fields and methods are expected to be used// by function generators (like the ones in add.go and so on).// Unexported fields and methods should not be.// A Pipe manages the input and output data pipelines for a function's// memory operations.//// The input is one or more equal-length slices of words, so collectively// it can be viewed as a matrix, in which each slice is a row and each column// is a set of corresponding words from the different slices.// The output can be viewed the same way, although it is often just one row.typePipestruct { f *Func// function being generated label string// prefix for loop labels (default "loop") backward bool// processing columns in reverse started bool// Start has been called loaded bool// LoadPtrs has been called inPtr []RegPtr// input slice pointers hints []Hint// for each inPtr, a register hint to use for its data outPtr []RegPtr// output slice pointers index Reg// index register, if in use useIndexCounter bool// index counter requested indexCounter int// index is also counter (386); 0 no, -1 negative counter, +1 positive counter readOff int// read offset not yet added to index writeOff int// write offset not yet added to index factors []int// unrolling factors counts []Reg// iterations for each factor needWrite bool// need a write call during Loop1/LoopN maxColumns int// maximum columns during unrolled loop unrollStart func() // emit code at start of unrolled body unrollEnd func() // emit code end of unrolled body}// Pipe creates and returns a new pipe for use in the function f.func ( *Func) () *Pipe { := .Asm := &Pipe{f: ,label: "loop",maxColumns: 10000000, }if := .Arch.maxColumns; != 0 { .maxColumns = }return}// SetBackward sets the pipe to process the input and output columns in reverse order.// This is needed for left shifts, 
which might otherwise overwrite data they will read later.func ( *Pipe) () {if .loaded { .f.Asm.Fatalf("SetBackward after Start/LoadPtrs") } .backward = true}// SetUseIndexCounter sets the pipe to use an index counter if possible,// meaning the loop counter is also used as an index for accessing the slice data.// This clever trick is slower on modern processors, but it is still necessary on 386.// On non-386 systems, SetUseIndexCounter is a no-op.func ( *Pipe) () {if .f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it)return } .useIndexCounter = true}// SetLabel sets the label prefix for the loops emitted by the pipe.// The default prefix is "loop".func ( *Pipe) ( string) { .label = }// SetMaxColumns sets the maximum number of// columns processed in a single loop body call.func ( *Pipe) ( int) { .maxColumns = }// SetHint records that the inputs from the named vector// should be allocated with the given register hint.//// If the hint indicates a single register on the target architecture,// then SetHint calls SetMaxColumns(1), since the hinted register// can only be used for one value at a time.func ( *Pipe) ( string, Hint) {if == HintMemOK && !.f.Asm.Arch.memOK {return } := slices.Index(.f.inputs, )if < 0 { .f.Asm.Fatalf("unknown input name %s", ) }if .f.Asm.hint() != "" { .SetMaxColumns(1) }forlen(.hints) <= { .hints = append(.hints, HintNone) } .hints[] = }// LoadPtrs loads the slice pointer arguments into registers,// assuming that the slice length n has already been loaded// into the register n.//// Start will call LoadPtrs if it has not been called already.// LoadPtrs only needs to be called explicitly when code needs// to use LoadN before Start, like when the shift.go generators// read an initial word before the loop.func ( *Pipe) ( Reg) { := .f.Asmif .loaded { .Fatalf("pointers already loaded") }// Load the actual pointers. 
.loaded = truefor , := range .f.inputs { .inPtr = append(.inPtr, RegPtr(.f.Arg(+"_base"))) }for , := range .f.outputs { .outPtr = append(.outPtr, RegPtr(.f.Arg(+"_base"))) }// Decide the memory access strategy for LoadN and StoreN.switch {case .backward && .useIndexCounter:// Generator wants an index counter, meaning when the iteration counter // is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes). // The loop is moving backward through the slice, but the counter // is also moving backward, so not much to do. .Comment("run loop backward, using counter as positive index") .indexCounter = +1 .index = case !.backward && .useIndexCounter:// Generator wants an index counter, but the loop is moving forward. // To make the counter move in the direction of data access, // we negate the counter, counting up from -len(z) to -1. // To make the index access the right words, we add len(z)*WordBytes // to each of the pointers. // See comment below about the garbage collector (non-)implications // of pointing beyond the slice bounds. .Comment("use counter as negative index") .indexCounter = -1 .index = for , := range .inPtr { .AddWords(, , ) }for , := range .outPtr { .AddWords(, , ) } .Neg(, )case .backward:// Generator wants to run the loop backward. // We'll decrement the pointers before using them, // so position them at the very end of the slices. // If we had precise pointer information for assembly, // these pointers would cause problems with the garbage collector, // since they no longer point into the allocated slice, // but the garbage collector ignores unexpected values in assembly stacks, // and the actual slice pointers are still in the argument stack slots, // so the slices won't be collected early. // If we switched to the register ABI, we might have to rethink this. 
// (The same thing happens by the end of forward loops, // but it's less important since once the pointers go off the slice // in a forward loop, the loop is over and the slice won't be accessed anymore.) .Comment("run loop backward")for , := range .inPtr { .AddWords(, , ) }for , := range .outPtr { .AddWords(, , ) }case !.backward:// Nothing to do! }}// LoadN returns the next n columns of input words as a slice of rows.// Regs for inputs that have been marked using p.SetMemOK will be direct memory references.// Regs for other inputs will be newly allocated registers and must be freed.func ( *Pipe) ( int) [][]Reg { := .f.Asm := make([][]Reg, len(.inPtr))for , := range .inPtr { [] = make([]Reg, )switch {case .Arch.loadIncN != nil:// Load from memory and advance pointers at the same time.for := range [] { [][] = .f.Asm.Reg() }if .backward { .Arch.loadDecN(, , []) } else { .Arch.loadIncN(, , []) }default:// Load from memory using offsets. // We'll advance the pointers or the index counter later.for := range { := .readOff + if .backward { = -( + 1) }varRegif .indexCounter != 0 { = .Arch.memIndex(, *.Arch.WordBytes, .index, ) } else { = .mem( * .Arch.WordBytes) } := HintNoneif < len(.hints) { = .hints[] }if == HintMemOK { [][] = } else { := .f.Asm.RegHint() .Mov(, ) [][] = } } } } .readOff += return}// StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]).func ( *Pipe) ( [][]Reg) { .needWrite = false := .f.Asmiflen() != len(.outPtr) { .f.Asm.Fatalf("wrong number of output rows") } := len([0])for , := range .outPtr {switch {case .Arch.storeIncN != nil:// Store to memory and advance pointers at the same time.if .backward { .Arch.storeDecN(, , []) } else { .Arch.storeIncN(, , []) }default:// Store to memory using offsets. 
// We'll advance the pointers or the index counter later.for , := range [] { := .writeOff + if .backward { = -( + 1) }varRegif .indexCounter != 0 { = .Arch.memIndex(, *.Arch.WordBytes, .index, ) } else { = .mem( * .Arch.WordBytes) } .Mov(, ) } } } .writeOff += }// advancePtrs advances the pointers by step// or handles bookkeeping for an imminent index advance by step// that the caller will do.func ( *Pipe) ( int) { := .f.Asmswitch {case .Arch.loadIncN != nil:// nothing to dodefault:// Adjust read/write offsets for pointer advance (or imminent index advance). .readOff -= .writeOff -= if .indexCounter == 0 {// Advance pointers.if .backward { = - }for , := range .inPtr { .Add(.Imm(*.Arch.WordBytes), Reg(), Reg(), KeepCarry) }for , := range .outPtr { .Add(.Imm(*.Arch.WordBytes), Reg(), Reg(), KeepCarry) } } }}// DropInput deletes the named input from the pipe,// usually because it has been exhausted.// (This is not used yet but will be used in a future generator.)func ( *Pipe) ( string) { := slices.Index(.f.inputs, )if < 0 { .f.Asm.Fatalf("unknown input %s", ) } := .inPtr[] .f.Asm.Free(Reg()) .inPtr = slices.Delete(.inPtr, , +1) .f.inputs = slices.Delete(.f.inputs, , +1)iflen(.hints) > { .hints = slices.Delete(.hints, , +1) }}// Start prepares to loop over n columns.// The factors give a sequence of unrolling factors to use,// which must be either strictly increasing or strictly decreasing// and must include 1.// For example, 4, 1 means to process 4 elements at a time// and then 1 at a time for the final 0-3; specifying 1,4 instead// handles 0-3 elements first and then 4 at a time.// Similarly, 32, 4, 1 means to process 32 at a time,// then 4 at a time, then 1 at a time.//// One benefit of using 1, 4 instead of 4, 1 is that the body// processing 4 at a time needs more registers, and if it is// the final body, the register holding the fragment count (0-3)// has been freed and is available for use.//// Start may modify the carry flag.//// Start must be followed by a call 
to Loop1 or LoopN,// but it is permitted to emit other instructions first,// for example to set an initial carry flag.func ( *Pipe) ( Reg, ...int) { := .f.Asmif .started { .Fatalf("loop already started") }if .useIndexCounter && len() > 1 { .Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", ) } .started = trueif !.loaded {iflen() == 1 { .SetUseIndexCounter() } .LoadPtrs() }// If there were calls to LoadN between LoadPtrs and Start, // adjust the loop not to scan those columns, assuming that // either the code already called an equivalent StoreN or else // that it will do so after the loop.if := .readOff; != 0 {if .indexCounter < 0 {// Index is negated, so add off instead of subtracting. .Add(.Imm(), , , SmashCarry) } else { .Sub(.Imm(), , , SmashCarry) }if .indexCounter != 0 {// n is also the index we are using, so adjust readOff and writeOff // to continue to point at the same positions as before we changed n. .readOff -= .writeOff -= } } .Restart(, ...)}// Restart prepares to loop over an additional n columns,// beyond a previous loop run by p.Start/p.Loop.func ( *Pipe) ( Reg, ...int) { := .f.Asmif !.started { .Fatalf("pipe not started") } .factors = .counts = make([]Reg, len())iflen() == 0 { = []int{1} }// Compute the loop lengths for each unrolled section into separate registers. 
// We compute them all ahead of time in case the computation would smash // a carry flag that the loop bodies need preserved.iflen() > 1 { .Comment("compute unrolled loop lengths") }switch {default: .Fatalf("invalid factors %v", )case [0] == 1:// increasing loop factors := 1for , := range [1:] {if <= [] { .Fatalf("non-increasing factors %v", ) }if &(-1) != 0 { .Fatalf("non-power-of-two factors %v", ) } := .f.Asm.Reg() /= .And(.Imm(-1), , ) .Rsh(.Imm(bits.TrailingZeros(uint())), , ) *= .counts[] = } .counts[len(.counts)-1] = case [len()-1] == 1:// decreasing loop factorsfor , := range [:len()-1] {if <= [+1] { .Fatalf("non-decreasing factors %v", ) }if &(-1) != 0 { .Fatalf("non-power-of-two factors %v", ) } := .f.Asm.Reg() .Rsh(.Imm(bits.TrailingZeros(uint())), , ) .And(.Imm(-1), , ) .counts[] = } .counts[len(.counts)-1] = }}// Done frees all the registers allocated by the pipe.func ( *Pipe) () {for , := range .inPtr { .f.Asm.Free(Reg()) } .inPtr = nilfor , := range .outPtr { .f.Asm.Free(Reg()) } .outPtr = nil .index = Reg{}}// Loop emits code for the loop, calling block repeatedly to emit code that// handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p).// block must call p.StoreN(out) to write N output columns.// The out slice is a pre-allocated matrix of uninitialized Reg values.// block is expected to set each entry to the Reg that should be written// before calling p.StoreN(out).//// For example, if the loop is to be unrolled 4x in blocks of 2 columns each,// the sequence of calls to emit the unrolled loop body is://// start() // set by pAtUnrollStart// ... reads for 2 columns ...// block()// ... writes for 2 columns ...// ... reads for 2 columns ...// block()// ... 
writes for 2 columns ...// end() // set by p.AtUnrollEnd//// Any registers allocated during block are freed automatically when block returns.func ( *Pipe) ( func(, [][]Reg)) {if .factors == nil { .f.Asm.Fatalf("Pipe.Start not called") }for , := range .factors { := .counts[] .unroll(, , )if < len(.factors)-1 { .f.Asm.Free() } } .factors = nil}// AtUnrollStart sets a function to call at the start of an unrolled sequence.// See [Pipe.Loop] for details.func ( *Pipe) ( func()) { .unrollStart = }// AtUnrollEnd sets a function to call at the end of an unrolled sequence.// See [Pipe.Loop] for details.func ( *Pipe) ( func()) { .unrollEnd = }// unroll emits a single unrolled loop for the given factor, iterating n times.func ( *Pipe) ( Reg, int, func(, [][]Reg)) { := .f.Asm := fmt.Sprintf("%s%d", .label, )// Top of loop control flow. .Label()if .Arch.loopTop != "" { .Printf("\t"+.Arch.loopTop+"\n", , +"done") } else { .JmpZero(, +"done") } .Label( + "cont")// Unrolled loop body.if < .maxColumns { .Comment("unroll %dX", ) } else { .Comment("unroll %dX in batches of %d", , .maxColumns) }if .unrollStart != nil { .unrollStart() }for := 0; < ; { := min(-, .maxColumns) := .RegsUsed() := make([][]Reg, len(.outPtr))for := range { [] = make([]Reg, ) } := .LoadN() .needWrite = true (, )if .needWrite && len(.outPtr) > 0 { .Fatalf("missing p.Write1 or p.StoreN") } .SetRegsUsed() // free anything block allocated += }if .unrollEnd != nil { .unrollEnd() } .advancePtrs()// Bottom of loop control flow.switch {case .indexCounter >= 0 && .Arch.loopBottom != "": .Printf("\t"+.Arch.loopBottom+"\n", , +"cont")case .indexCounter >= 0: .Sub(.Imm(1), , , KeepCarry) .JmpNonZero(, +"cont")case .indexCounter < 0 && .Arch.loopBottomNeg != "": .Printf("\t"+.Arch.loopBottomNeg+"\n", , +"cont")case .indexCounter < 0: .Add(.Imm(1), , , KeepCarry) } .Label( + "done")}
The pages are generated with Golds v0.7.7-preview. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.