// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package asmgen

// shiftVU generates lshVU and rshVU, which do
// z, c = x << s and z, c = x >> s, for 0 < s < _W.
func shiftVU( *Asm,  string) {
	// Because these routines can be called for z.Lsh(z, N) and z.Rsh(z, N),
	// the input and output slices may be aliased at different offsets.
	// For example (on 64-bit systems), during z.Lsh(z, 65), &z[0] == &x[1],
	// and during z.Rsh(z, 65), &z[1] == &x[0].
	// For left shift, we must process the slices from len(z)-1 down to 0,
	// so that we don't overwrite a word before we need to read it.
	// For right shift, we must process the slices from 0 up to len(z)-1.
	// The different traversals at least make the two cases more consistent,
	// since we're always delaying the output by one word compared
	// to the input.

	 := .Func("func " +  + "(z, x []Word, s uint) (c Word)")

	// Check for no input early, since we need to start by reading 1 word.
	 := .Arg("z_len")
	.JmpZero(, "ret0")

	// Start loop by reading first input word.
	 := .ArgHint("s", HintShiftCount)
	 := .Pipe()
	if  == "lshVU" {
		.SetBackward()
	}
	 := []int{1, 4}
	if .Arch == Arch386 {
		 = []int{1} // too few registers for more
		.SetUseIndexCounter()
	}
	.LoadPtrs()
	.Comment("shift first word into carry")
	 := .LoadN(1)[0][0]

	// Decide how to shift. On systems with a wide shift (x86), use that.
	// Otherwise, we need shift by s and negative (reverse) shift by 64-s or 32-s.
	 := .Lsh
	 := .LshWide
	 := .Rsh
	 := .RshReg
	if  == "rshVU" {
		 = .Rsh
		 = .RshWide
		 = .Lsh
		 = .LshReg
	}
	if .Arch.HasShiftWide() {
		// Use wide shift to avoid needing negative shifts.
		// The invariant is that prev holds the previous word (not shifted at all),
		// to be used as input into the wide shift.
		// After the loop finishes, prev holds the final output word to be written.
		 := .Reg()
		(, , .Imm(0), )
		.StoreArg(, "c")
		.Free()
		.Comment("shift remaining words")
		.Start(, ...)
		.Loop(func( [][]Reg,  [][]Reg) {
			// We reuse the input registers as output, delayed one cycle; prev is the first output.
			// After writing the outputs to memory, we can copy the final x value into prev
			// for the next iteration.
			 := 
			for ,  := range [0] {
				(, , , )
				[0][] = 
				 = 
			}
			.StoreN()
			.Mov(, )
		})
		.Comment("store final shifted bits")
		(, , )
	} else {
		// Construct values from x << s and x >> (64-s).
		// After the first word has been processed, the invariant is that
		// prev holds x << s, to be used as the high bits of the next output word,
		// once we find the low bits after reading the next input word.
		// After the loop finishes, prev holds the final output word to be written.
		 := .Reg()
		.Mov(.Imm(.Arch.WordBits), )
		.Sub(, , , SmashCarry)
		 := .Reg()
		(, , )
		(, , )
		.StoreArg(, "c")
		.Free()
		.Comment("shift remaining words")
		.Start(, ...)
		.Loop(func(,  [][]Reg) {
			if .HasRegShift() {
				// ARM (32-bit) allows shifts in most arithmetic expressions,
				// including OR, letting us combine the negShift and a.Or.
				// The simplest way to manage the registers is to do StoreN for
				// one output at a time, and since we don't use multi-register
				// stores on ARM, that doesn't hurt us.
				[0] = [0][:1]
				for ,  := range [0] {
					.Or((, ), , )
					[0][0] = 
					.StoreN()
					(, , )
				}
				return
			}
			// We reuse the input registers as output, delayed one cycle; z0 is the first output.
			 := .Reg()
			 := 
			for ,  := range [0] {
				(, , )
				.Or(, , )
				(, , )
				[0][] = 
				 = 
			}
			.StoreN()
		})
		.Comment("store final shifted bits")
	}
	.StoreN([][]Reg{{}})
	.Done()
	.Free()
	.Ret()

	// Return 0, used from above.
	.Label("ret0")
	.StoreArg(.Imm(0), "c")
	.Ret()
}