// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package asmgen

import (
	
	
	
)

// Note: Exported fields and methods are expected to be used
// by function generators (like the ones in add.go and so on).
// Unexported fields and methods should not be.

// A Pipe manages the input and output data pipelines for a function's
// memory operations.
//
// The input is one or more equal-length slices of words, so collectively
// it can be viewed as a matrix, in which each slice is a row and each column
// is a set of corresponding words from the different slices.
// The output can be viewed the same way, although it is often just one row.
//
// A pipe is configured with the Set* methods, then started with Start
// (which loads the slice pointers if LoadPtrs has not been called),
// then driven with Loop, and finally released with Done.
type Pipe struct {
	f               *Func    // function being generated
	label           string   // prefix for loop labels (default "loop")
	backward        bool     // processing columns in reverse
	started         bool     // Start has been called
	loaded          bool     // LoadPtrs has been called
	inPtr           []RegPtr // input slice pointers
	hints           []Hint   // for each inPtr, a register hint to use for its data
	outPtr          []RegPtr // output slice pointers
	index           Reg      // index register, if in use
	useIndexCounter bool     // index counter requested
	indexCounter    int      // index is also counter (386); 0 no, -1 negative counter, +1 positive counter
	readOff         int      // read offset not yet added to index
	writeOff        int      // write offset not yet added to index
	factors         []int    // unrolling factors
	counts          []Reg    // iterations for each factor
	needWrite       bool     // set before each Loop block call; cleared by StoreN
	maxColumns      int      // maximum columns during unrolled loop
	unrollStart     func()   // emit code at start of unrolled body
	unrollEnd       func()   // emit code at end of unrolled body
}

// Pipe creates and returns a new pipe for use in the function f.
//
// The new pipe uses the label prefix "loop". Its column limit is
// the architecture's maxColumns when that is nonzero; otherwise it is
// a very large value, which in practice means no limit.
func ( *Func) () *Pipe {
	 := .Asm
	 := &Pipe{
		f:          ,
		label:      "loop",
		maxColumns: 10000000,
	}
	// An architecture-declared limit overrides the default.
	if  := .Arch.maxColumns;  != 0 {
		.maxColumns = 
	}
	return 
}

// SetBackward sets the pipe to process the input and output columns in reverse order.
// This is needed for left shifts, which might otherwise overwrite data they will read later.
//
// SetBackward must be called before the slice pointers are loaded
// (by LoadPtrs, or by Start when LoadPtrs was not called explicitly);
// calling it afterward is a fatal error.
func ( *Pipe) () {
	// The pointer setup in LoadPtrs depends on the direction,
	// so the direction cannot change once the pointers are loaded.
	if .loaded {
		.f.Asm.Fatalf("SetBackward after Start/LoadPtrs")
	}
	.backward = true
}

// SetUseIndexCounter sets the pipe to use an index counter if possible,
// meaning the loop counter is also used as an index for accessing the slice data.
// This clever trick is slower on modern processors, but it is still necessary on 386.
// On non-386 systems, SetUseIndexCounter is a no-op.
//
// The request is only recorded here; LoadPtrs decides how the index is used.
func ( *Pipe) () {
	if .f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it)
		return
	}
	.useIndexCounter = true
}

// SetLabel sets the label prefix for the loops emitted by the pipe.
// The default prefix is "loop".
//
// Each emitted loop's labels are formed from the prefix followed by
// the loop's unrolling factor, plus "cont" and "done" suffixes.
func ( *Pipe) ( string) {
	.label = 
}

// SetMaxColumns sets the maximum number of
// columns processed in a single loop body call.
//
// When an unrolling factor exceeds this maximum, the unrolled body is
// emitted as several batches of at most this many columns each.
func ( *Pipe) ( int) {
	.maxColumns = 
}

// SetHint records that the inputs from the named vector
// should be allocated with the given register hint.
//
// If the hint indicates a single register on the target architecture,
// then SetHint calls SetMaxColumns(1), since the hinted register
// can only be used for one value at a time.
//
// A HintMemOK hint is silently ignored when the architecture's memOK
// flag is not set. Naming a vector that is not one of the function's
// inputs is a fatal error.
func ( *Pipe) ( string,  Hint) {
	if  == HintMemOK && !.f.Asm.Arch.memOK {
		return
	}
	 := slices.Index(.f.inputs, )
	if  < 0 {
		.f.Asm.Fatalf("unknown input name %s", )
	}
	// A non-empty result means the hint names a specific register,
	// so only one column can be in flight at a time.
	if .f.Asm.hint() != "" {
		.SetMaxColumns(1)
	}
	// Grow the hint list with HintNone entries so that it can be
	// indexed by the input's position.
	for len(.hints) <=  {
		.hints = append(.hints, HintNone)
	}
	.hints[] = 
}

// LoadPtrs loads the slice pointer arguments into registers,
// assuming that the slice length n has already been loaded
// into the register n.
//
// Start will call LoadPtrs if it has not been called already.
// LoadPtrs only needs to be called explicitly when code needs
// to use LoadN before Start, like when the shift.go generators
// read an initial word before the loop.
//
// LoadPtrs also fixes the memory access strategy used by LoadN and StoreN,
// based on the direction and on whether an index counter was requested.
// Calling LoadPtrs a second time is a fatal error.
func ( *Pipe) ( Reg) {
	 := .f.Asm
	if .loaded {
		.Fatalf("pointers already loaded")
	}

	// Load the actual pointers.
	// Each pointer comes from the function argument named <vector>_base.
	.loaded = true
	for ,  := range .f.inputs {
		.inPtr = append(.inPtr, RegPtr(.f.Arg(+"_base")))
	}
	for ,  := range .f.outputs {
		.outPtr = append(.outPtr, RegPtr(.f.Arg(+"_base")))
	}

	// Decide the memory access strategy for LoadN and StoreN.
	switch {
	case .backward && .useIndexCounter:
		// Generator wants an index counter, meaning when the iteration counter
		// is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes).
		// The loop is moving backward through the slice, but the counter
		// is also moving backward, so not much to do.
		.Comment("run loop backward, using counter as positive index")
		.indexCounter = +1
		.index = 

	case !.backward && .useIndexCounter:
		// Generator wants an index counter, but the loop is moving forward.
		// To make the counter move in the direction of data access,
		// we negate the counter, counting up from -len(z) to -1.
		// To make the index access the right words, we add len(z)*WordBytes
		// to each of the pointers.
		// See comment below about the garbage collector (non-)implications
		// of pointing beyond the slice bounds.
		.Comment("use counter as negative index")
		.indexCounter = -1
		.index = 
		for ,  := range .inPtr {
			.AddWords(, , )
		}
		for ,  := range .outPtr {
			.AddWords(, , )
		}
		.Neg(, )

	case .backward:
		// Generator wants to run the loop backward.
		// We'll decrement the pointers before using them,
		// so position them at the very end of the slices.
		// If we had precise pointer information for assembly,
		// these pointers would cause problems with the garbage collector,
		// since they no longer point into the allocated slice,
		// but the garbage collector ignores unexpected values in assembly stacks,
		// and the actual slice pointers are still in the argument stack slots,
		// so the slices won't be collected early.
		// If we switched to the register ABI, we might have to rethink this.
		// (The same thing happens by the end of forward loops,
		// but it's less important since once the pointers go off the slice
		// in a forward loop, the loop is over and the slice won't be accessed anymore.)
		.Comment("run loop backward")
		for ,  := range .inPtr {
			.AddWords(, , )
		}
		for ,  := range .outPtr {
			.AddWords(, , )
		}

	case !.backward:
		// Nothing to do!
		// Forward loop without an index counter: the pointers are used as loaded.
	}
}

// LoadN returns the next n columns of input words as a slice of rows.
// Regs for inputs that have been marked with the HintMemOK hint (see SetHint)
// will be direct memory references, except on architectures that provide
// loadIncN, which always load into registers.
// Regs for other inputs will be newly allocated registers and must be freed.
//
// The result has one row per input pointer, each of length n.
// LoadN advances the pipe's pending read offset by n.
func ( *Pipe) ( int) [][]Reg {
	 := .f.Asm
	 := make([][]Reg, len(.inPtr))
	for ,  := range .inPtr {
		[] = make([]Reg, )
		switch {
		case .Arch.loadIncN != nil:
			// Load from memory and advance pointers at the same time.
			// NOTE(review): only loadIncN is checked for nil; this presumes an
			// architecture that sets loadIncN also sets loadDecN. Confirm.
			for  := range [] {
				[][] = .f.Asm.Reg()
			}
			if .backward {
				.Arch.loadDecN(, , [])
			} else {
				.Arch.loadIncN(, , [])
			}

		default:
			// Load from memory using offsets.
			// We'll advance the pointers or the index counter later.
			for  := range  {
				 := .readOff + 
				if .backward {
					// Backward loops address words below the pointer,
					// so offset k becomes -(k+1).
					 = -( + 1)
				}
				var  Reg
				if .indexCounter != 0 {
					 = .Arch.memIndex(, *.Arch.WordBytes, .index, )
				} else {
					 = .mem( * .Arch.WordBytes)
				}
				// Inputs with no recorded hint default to HintNone.
				 := HintNone
				if  < len(.hints) {
					 = .hints[]
				}
				if  == HintMemOK {
					// Hand back the memory operand itself; no register is allocated.
					[][] = 
				} else {
					 := .f.Asm.RegHint()
					.Mov(, )
					[][] = 
				}
			}
		}
	}
	.readOff += 
	return 
}

// StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]).
//
// regs must have exactly one row per output pointer; otherwise StoreN
// reports a fatal error. StoreN clears the pipe's needWrite flag and
// advances the pending write offset by n.
func ( *Pipe) ( [][]Reg) {
	.needWrite = false
	 := .f.Asm
	if len() != len(.outPtr) {
		.f.Asm.Fatalf("wrong number of output rows")
	}
	// NOTE(review): the first row is indexed unconditionally, so this
	// presumably requires at least one output row. Confirm for pipes with no outputs.
	 := len([0])
	for ,  := range .outPtr {
		switch {
		case .Arch.storeIncN != nil:
			// Store to memory and advance pointers at the same time.
			if .backward {
				.Arch.storeDecN(, , [])
			} else {
				.Arch.storeIncN(, , [])
			}

		default:
			// Store to memory using offsets.
			// We'll advance the pointers or the index counter later.
			for ,  := range [] {
				 := .writeOff + 
				if .backward {
					// Backward loops address words below the pointer,
					// so offset k becomes -(k+1).
					 = -( + 1)
				}
				var  Reg
				if .indexCounter != 0 {
					 = .Arch.memIndex(, *.Arch.WordBytes, .index, )
				} else {
					 = .mem( * .Arch.WordBytes)
				}
				.Mov(, )
			}
		}
	}
	.writeOff += 
}

// advancePtrs advances the pointers by step
// or handles bookkeeping for an imminent index advance by step
// that the caller will do.
//
// On architectures that provide loadIncN, the loads and stores have
// already moved the pointers, so there is nothing to do.
// Otherwise the pending read and write offsets are reduced by step,
// and, when no index counter is in use, every input and output pointer
// is moved by step words (backward if the pipe runs backward).
// The pointer adjustments are emitted with KeepCarry.
func ( *Pipe) ( int) {
	 := .f.Asm
	switch {
	case .Arch.loadIncN != nil:
		// nothing to do

	default:
		// Adjust read/write offsets for pointer advance (or imminent index advance).
		.readOff -= 
		.writeOff -= 

		if .indexCounter == 0 {
			// Advance pointers.
			if .backward {
				 = -
			}
			for ,  := range .inPtr {
				.Add(.Imm(*.Arch.WordBytes), Reg(), Reg(), KeepCarry)
			}
			for ,  := range .outPtr {
				.Add(.Imm(*.Arch.WordBytes), Reg(), Reg(), KeepCarry)
			}
		}
	}
}

// DropInput deletes the named input from the pipe,
// usually because it has been exhausted.
// (This is not used yet but will be used in a future generator.)
//
// DropInput frees the input's pointer register and removes the input
// from the pipe's pointer list, the function's input list, and the
// hint list. Naming an unknown input is a fatal error.
// NOTE(review): the pointer list is indexed directly, so this presumably
// must be called after LoadPtrs. Confirm.
func ( *Pipe) ( string) {
	 := slices.Index(.f.inputs, )
	if  < 0 {
		.f.Asm.Fatalf("unknown input %s", )
	}
	 := .inPtr[]
	.f.Asm.Free(Reg())
	.inPtr = slices.Delete(.inPtr, , +1)
	.f.inputs = slices.Delete(.f.inputs, , +1)
	// The hint list may be shorter than the input list,
	// so only delete the entry if it exists.
	if len(.hints) >  {
		.hints = slices.Delete(.hints, , +1)
	}
}

// Start prepares to loop over n columns.
// The factors give a sequence of unrolling factors to use,
// which must be either strictly increasing or strictly decreasing
// and must include 1.
// For example, 4, 1 means to process 4 elements at a time
// and then 1 at a time for the final 0-3; specifying 1,4 instead
// handles 0-3 elements first and then 4 at a time.
// Similarly, 32, 4, 1 means to process 32 at a time,
// then 4 at a time, then 1 at a time.
//
// One benefit of using 1, 4 instead of 4, 1 is that the body
// processing 4 at a time needs more registers, and if it is
// the final body, the register holding the fragment count (0-3)
// has been freed and is available for use.
//
// Start may modify the carry flag.
//
// Start must be followed by a call to Loop,
// but it is permitted to emit other instructions first,
// for example to set an initial carry flag.
//
// Start may be called only once per pipe; use Restart for later loops.
// It is a fatal error to combine SetUseIndexCounter with more than one factor.
func ( *Pipe) ( Reg,  ...int) {
	 := .f.Asm
	if .started {
		.Fatalf("loop already started")
	}
	if .useIndexCounter && len() > 1 {
		.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", )
	}
	.started = true
	if !.loaded {
		// With a single factor, request the index counter automatically;
		// the request is a no-op on architectures without memIndex.
		if len() == 1 {
			.SetUseIndexCounter()
		}
		.LoadPtrs()
	}

	// If there were calls to LoadN between LoadPtrs and Start,
	// adjust the loop not to scan those columns, assuming that
	// either the code already called an equivalent StoreN or else
	// that it will do so after the loop.
	if  := .readOff;  != 0 {
		if .indexCounter < 0 {
			// Index is negated, so add off instead of subtracting.
			.Add(.Imm(), , , SmashCarry)
		} else {
			.Sub(.Imm(), , , SmashCarry)
		}
		if .indexCounter != 0 {
			// n is also the index we are using, so adjust readOff and writeOff
			// to continue to point at the same positions as before we changed n.
			.readOff -= 
			.writeOff -= 
		}
	}

	.Restart(, ...)
}

// Restart prepares to loop over an additional n columns,
// beyond a previous loop run by p.Start/p.Loop.
//
// The factors follow the same rules as for Start: the list must begin
// or end with 1 and be strictly increasing or strictly decreasing.
// Every factor other than 1 must also be a power of two.
// Violations are fatal errors, as is calling Restart before Start.
//
// Restart records the factors and computes one iteration-count register
// per factor, for use by Loop.
func ( *Pipe) ( Reg,  ...int) {
	 := .f.Asm
	if !.started {
		.Fatalf("pipe not started")
	}
	// NOTE(review): the factor list is recorded and the counts are sized
	// before the empty-list default below is applied; confirm that an
	// empty factor list is handled as intended.
	.factors = 
	.counts = make([]Reg, len())
	if len() == 0 {
		 = []int{1}
	}

	// Compute the loop lengths for each unrolled section into separate registers.
	// We compute them all ahead of time in case the computation would smash
	// a carry flag that the loop bodies need preserved.
	if len() > 1 {
		.Comment("compute unrolled loop lengths")
	}
	switch {
	default:
		.Fatalf("invalid factors %v", )

	case [0] == 1:
		// increasing loop factors
		// Each section except the last gets a newly allocated count register;
		// the last section's count is stored in the final slot after the loop.
		 := 1
		for ,  := range [1:] {
			if  <= [] {
				.Fatalf("non-increasing factors %v", )
			}
			if &(-1) != 0 {
				.Fatalf("non-power-of-two factors %v", )
			}
			 := .f.Asm.Reg()
			 /= 
			.And(.Imm(-1), , )
			.Rsh(.Imm(bits.TrailingZeros(uint())), , )
			 *= 
			.counts[] = 
		}
		.counts[len(.counts)-1] = 

	case [len()-1] == 1:
		// decreasing loop factors
		// Each section except the last gets a newly allocated count register
		// computed with a shift by log2 of the factor and a mask by factor-1.
		for ,  := range [:len()-1] {
			if  <= [+1] {
				.Fatalf("non-decreasing factors %v", )
			}
			if &(-1) != 0 {
				.Fatalf("non-power-of-two factors %v", )
			}
			 := .f.Asm.Reg()
			.Rsh(.Imm(bits.TrailingZeros(uint())), , )
			.And(.Imm(-1), , )
			.counts[] = 
		}
		.counts[len(.counts)-1] = 
	}
}

// Done frees all the registers allocated by the pipe.
//
// That is, it frees every input and output pointer register,
// empties both pointer lists, and resets the index register field
// to the zero Reg. The index register itself is not freed here.
func ( *Pipe) () {
	for ,  := range .inPtr {
		.f.Asm.Free(Reg())
	}
	.inPtr = nil
	for ,  := range .outPtr {
		.f.Asm.Free(Reg())
	}
	.outPtr = nil
	.index = Reg{}
}

// Loop emits code for the loop, calling block repeatedly to emit code that
// handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p).
// block must call p.StoreN(out) to write N output columns.
// The out slice is a pre-allocated matrix of uninitialized Reg values.
// block is expected to set each entry to the Reg that should be written
// before calling p.StoreN(out).
//
// For example, if the loop is to be unrolled 4x in blocks of 2 columns each,
// the sequence of calls to emit the unrolled loop body is:
//
//	start()  // set by p.AtUnrollStart
//	... reads for 2 columns ...
//	block()
//	... writes for 2 columns ...
//	... reads for 2 columns ...
//	block()
//	... writes for 2 columns ...
//	end()  // set by p.AtUnrollEnd
//
// Any registers allocated during block are freed automatically when block returns.
//
// Loop emits one unrolled loop per factor given to Start or Restart,
// in the order the factors were given. It is a fatal error to call Loop
// when no factors are pending. Loop clears the pending factors when it
// finishes, so Restart is needed before calling Loop again.
func ( *Pipe) ( func(,  [][]Reg)) {
	if .factors == nil {
		.f.Asm.Fatalf("Pipe.Start not called")
	}
	for ,  := range .factors {
		 := .counts[]
		.unroll(, , )
		// Free the count register of every section except the last.
		if  < len(.factors)-1 {
			.f.Asm.Free()
		}
	}
	.factors = nil
}

// AtUnrollStart sets a function to call at the start of an unrolled sequence.
// See [Pipe.Loop] for details.
//
// The function is called once per emitted loop body, before the first
// batch of loads. Passing nil disables the callback.
func ( *Pipe) ( func()) {
	.unrollStart = 
}

// AtUnrollEnd sets a function to call at the end of an unrolled sequence.
// See [Pipe.Loop] for details.
//
// The function is called once per emitted loop body, after the last
// batch and before the pointers are advanced. Passing nil disables the callback.
func ( *Pipe) ( func()) {
	.unrollEnd = 
}

// unroll emits a single unrolled loop for the given factor, iterating n times.
//
// The loop's label is the pipe's label prefix followed by the factor.
// The emitted code skips to the label's "done" suffix when n is zero and
// otherwise runs the body from the "cont" suffix. The body is emitted as
// batches of at most maxColumns columns, with block called once per batch.
func ( *Pipe) ( Reg,  int,  func(,  [][]Reg)) {
	 := .f.Asm
	 := fmt.Sprintf("%s%d", .label, )

	// Top of loop control flow.
	// Use the architecture's loopTop template if it has one;
	// otherwise emit a plain jump-if-zero to the done label.
	.Label()
	if .Arch.loopTop != "" {
		.Printf("\t"+.Arch.loopTop+"\n", , +"done")
	} else {
		.JmpZero(, +"done")
	}
	.Label( + "cont")

	// Unrolled loop body.
	if  < .maxColumns {
		.Comment("unroll %dX", )
	} else {
		.Comment("unroll %dX in batches of %d", , .maxColumns)
	}
	if .unrollStart != nil {
		.unrollStart()
	}
	for  := 0;  < ; {
		 := min(-, .maxColumns)
		// Snapshot the register allocation so that anything allocated
		// while emitting this batch can be released afterward.
		 := .RegsUsed()
		 := make([][]Reg, len(.outPtr))
		for  := range  {
			[] = make([]Reg, )
		}
		 := .LoadN()
		// StoreN clears needWrite; if it is still set after block returns
		// and the pipe has outputs, block forgot to write its results.
		.needWrite = true
		(, )
		if .needWrite && len(.outPtr) > 0 {
			.Fatalf("missing p.Write1 or p.StoreN")
		}
		.SetRegsUsed() // free anything block allocated
		 += 
	}
	if .unrollEnd != nil {
		.unrollEnd()
	}
	.advancePtrs()

	// Bottom of loop control flow.
	// A non-negative indexCounter counts n down to zero;
	// a negative one counts n up toward zero.
	switch {
	case .indexCounter >= 0 && .Arch.loopBottom != "":
		.Printf("\t"+.Arch.loopBottom+"\n", , +"cont")

	case .indexCounter >= 0:
		.Sub(.Imm(1), , , KeepCarry)
		.JmpNonZero(, +"cont")

	case .indexCounter < 0 && .Arch.loopBottomNeg != "":
		.Printf("\t"+.Arch.loopBottomNeg+"\n", , +"cont")

	case .indexCounter < 0:
		// NOTE(review): unlike the non-negative case above, no jump back to
		// the cont label is emitted after the add. Confirm whether this case
		// is reachable and whether a JmpNonZero is missing.
		.Add(.Imm(1), , , KeepCarry)
	}
	.Label( + "done")
}