// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.simd && amd64

package archsimd

// These constants represent the source pattern for the four parameters
// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
// L means the element comes from the 'x' vector (Low), and
// H means it comes from the 'y' vector (High).
// The order of the letters corresponds to elements a, b, c, d.
// The underlying integer value is a bitmask where:
// Bit 0: Source of element 'a' (0 for x, 1 for y)
// Bit 1: Source of element 'b' (0 for x, 1 for y)
// Bit 2: Source of element 'c' (0 for x, 1 for y)
// Bit 3: Source of element 'd' (0 for x, 1 for y)
// Note that the least-significant bit is on the LEFT in this encoding.
const (
	_LLLL = iota // a:x, b:x, c:x, d:x
	_HLLL        // a:y, b:x, c:x, d:x
	_LHLL        // a:x, b:y, c:x, d:x
	_HHLL        // a:y, b:y, c:x, d:x
	_LLHL        // a:x, b:x, c:y, d:x
	_HLHL        // a:y, b:x, c:y, d:x
	_LHHL        // a:x, b:y, c:y, d:x
	_HHHL        // a:y, b:y, c:y, d:x
	_LLLH        // a:x, b:x, c:x, d:y
	_HLLH        // a:y, b:x, c:x, d:y
	_LHLH        // a:x, b:y, c:x, d:y
	_HHLH        // a:y, b:y, c:x, d:y
	_LLHH        // a:x, b:x, c:y, d:y
	_HLHH        // a:y, b:x, c:y, d:y
	_LHHH        // a:x, b:y, c:y, d:y
	_HHHH        // a:y, b:y, c:y, d:y
)

// These constants represent the source pattern for the four parameters
// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
// two-element vectors.
const (
	_LL = iota
	_HL
	_LH
	_HH
)

// SelectFromPair returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y.  When the selectors are constants and the selection can be
// implemented in a single instruction, it will be, otherwise it
// requires two.  a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Int32x4) (, , ,  uint8,  Int32x4) Int32x4 {
	// pattern gets the concatenation of "x or y?" bits
	// (0 == x, 1 == y)
	// This will determine operand choice/order and whether a second
	// instruction is needed.
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	// a-d are masked down to their offsets within x or y
	// this is not necessary for x, but this is easier on the
	// eyes and reduces the risk of an error now or later.
	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstant(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPair returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y.  When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two. a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Uint32x4) (, , ,  uint8,  Uint32x4) Uint32x4 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstant(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPair returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y.  When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two. a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Float32x4) (, , ,  uint8,  Float32x4) Float32x4 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstant(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstant(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstant(cscimm4(, , , ), )
		return .concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two. a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
//
//	returns {4,8,25,81,64,128,169,289}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Int32x8) (, , ,  uint8,  Int32x8) Int32x8 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two. a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
//
//	returns {4,8,25,81,64,128,169,289}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Uint32x8) (, , ,  uint8,  Uint32x8) Uint32x8 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two. a is the source index of the least element in the
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output.  For example,
// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
//
//	returns {4,8,25,81,64,128,169,289}
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
func ( Float32x8) (, , ,  uint8,  Float32x8) Float32x8 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func ( Int32x16) (, , ,  uint8,  Int32x16) Int32x16 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func ( Uint32x16) (, , ,  uint8,  Uint32x16) Uint32x16 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from  x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and can be the selection
// can be implemented in a single instruction, it will be, otherwise
// it requires two.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func ( Float32x16) (, , ,  uint8,  Float32x16) Float32x16 {
	 := >>2 + (&4)>>1 + ( & 4) + (&4)<<1

	, , ,  = &3, &3, &3, &3

	switch  {
	case _LLLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _LLHH:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )
	case _HHLL:
		return .concatSelectedConstantGrouped(cscimm4(, , , ), )

	case _HLLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHLL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _HLHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )
	case _LHHH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(0, 2, , ), )

	case _LLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _LLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )
	case _HHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(cscimm4(, , 0, 2), )

	case _LHLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, )
	case _HLHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, )
	case _HLLH:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, )
	case _LHHL:
		 := .concatSelectedConstantGrouped(cscimm4(, , , ), )
		return .concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, )
	}
	panic("missing case, switch should be exhaustive")
}

// cscimm4 converts the 4 vector element indices into a single
// uint8 for use as an immediate.
func cscimm4(, , ,  uint8) uint8 {
	return uint8( + <<2 + <<4 + <<6)
}

// cscimm2 converts the 2 vector element indices into a single
// uint8 for use as an immediate.
func cscimm2(,  uint8) uint8 {
	return uint8( + <<1)
}

// cscimm2g2 converts the 2 vector element indices into a single
// uint8 for use as an immediate, but duplicated for VSHUFPD
// to emulate grouped behavior of VSHUFPS
func cscimm2g2(,  uint8) uint8 {
	 := cscimm2(, )
	return  + <<2
}

// cscimm2g4 converts the 2 vector element indices into a single
// uint8 for use as an immediate, but with four copies for VSHUFPD
// to emulate grouped behavior of VSHUFPS
func cscimm2g4(,  uint8) uint8 {
	 := cscimm2g2(, )
	return  + <<4
}

// SelectFromPair returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Uint64x2) (,  uint8,  Uint64x2) Uint64x2 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _LH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HL:
		return .concatSelectedConstant(cscimm2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Uint64x4) (,  uint8,  Uint64x4) Uint64x4 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func ( Uint64x8) (,  uint8,  Uint64x8) Uint64x8 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPair returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Float64x2) (,  uint8,  Float64x2) Float64x2 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _LH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HL:
		return .concatSelectedConstant(cscimm2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Float64x4) (,  uint8,  Float64x4) Float64x4 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func ( Float64x8) (,  uint8,  Float64x8) Float64x8 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPair returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Int64x2) (,  uint8,  Int64x2) Int64x2 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _LH:
		return .concatSelectedConstant(cscimm2(, ), )
	case _HL:
		return .concatSelectedConstant(cscimm2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
func ( Int64x4) (,  uint8,  Int64x4) Int64x4 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g2(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y.  When the selectors are constants the selection can be
// implemented in a single instruction.
//
// If the selectors are not constant this will translate to a function
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func ( Int64x8) (,  uint8,  Int64x8) Int64x8 {
	 := (&2)>>1 + ( & 2)

	,  = &1, &1

	switch  {
	case _LL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _LH:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	case _HL:
		return .concatSelectedConstantGrouped(cscimm2g4(, ), )
	}
	panic("missing case, switch should be exhaustive")
}

/* PermuteScalars */

// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func ( Int32x4) (, , ,  uint8) Int32x4 {
	return .permuteScalars(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func ( Uint32x4) (, , ,  uint8) Uint32x4 {
	return .permuteScalars(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

/* PermuteScalarsGrouped */

// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func ( Int32x8) (, , ,  uint8) Int32x8 {
	return .permuteScalarsGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
//			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func ( Int32x16) (, , ,  uint8) Int32x16 {
	return .permuteScalarsGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func ( Uint32x8) (, , ,  uint8) Uint32x8 {
	return .permuteScalarsGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
//			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func ( Uint32x16) (, , ,  uint8) Uint32x16 {
	return .permuteScalarsGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

/* PermuteScalarsHi */

// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
//
// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func ( Int16x8) (, , ,  uint8) Int16x8 {
	return .permuteScalarsHi(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
//
// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func ( Uint16x8) (, , ,  uint8) Uint16x8 {
	return .permuteScalarsHi(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

/* PermuteScalarsHiGrouped */

// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//		  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
//			x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func ( Int16x16) (, , ,  uint8) Int16x16 {
	return .permuteScalarsHiGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//		  {x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
//			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
//			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func ( Int16x32) (, , ,  uint8) Int16x32 {
	return .permuteScalarsHiGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//	  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
//		x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func ( Uint16x16) (, , ,  uint8) Uint16x16 {
	return .permuteScalarsHiGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//		 {  x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
//			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
//			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func ( Uint16x32) (, , ,  uint8) Uint16x32 {
	return .permuteScalarsHiGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

/* PermuteScalarsLo */

// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func ( Int16x8) (, , ,  uint8) Int16x8 {
	return .permuteScalarsLo(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func ( Uint16x8) (, , ,  uint8) Uint16x8 {
	return .permuteScalarsLo(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

/* PermuteScalarsLoGrouped */

// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//	 {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
//		 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func ( Int16x16) (, , ,  uint8) Int16x16 {
	return .permuteScalarsLoGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
//		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
//		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func ( Int16x32) (, , ,  uint8) Int16x32 {
	return .permuteScalarsLoGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result = {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
//		x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func ( Uint16x16) (, , ,  uint8) Uint16x16 {
	return .permuteScalarsLoGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	 result =
//	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
//		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
//		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func ( Uint16x32) (, , ,  uint8) Uint16x32 {
	return .permuteScalarsLoGrouped(&3 | (&3)<<2 | (&3)<<4 | <<6)
}

// CarrylessMultiply computes one of four possible carryless
// multiplications of selected high and low halves of x and y,
// depending on the values of a and b, returning the 128-bit
// product in the concatenated two elements of the result.
// a selects the low (0) or high (1) element of x and
// b selects the low (0) or high (1) element of y.
//
// A carryless multiplication uses bitwise XOR instead of
// add-with-carry, for example (in base two):
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// This also models multiplication of polynomials with coefficients
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
// polynomial terms, but coefficients "add" with XOR.)
//
// constant values of a and b will result in better performance,
// otherwise the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX
func ( Uint64x2) (,  uint8,  Uint64x2) Uint64x2 {
	return .carrylessMultiply(&1+((&1)<<4), )
}

// CarrylessMultiplyGrouped computes one of four possible carryless
// multiplications of selected high and low halves of each of the two
// 128-bit lanes of x and y, depending on the values of a and b,
// and returns the four 128-bit products in the result's lanes.
// a selects the low (0) or high (1) elements of x's lanes and
// b selects the low (0) or high (1) elements of y's lanes.
//
// A carryless multiplication uses bitwise XOR instead of
// add-with-carry, for example (in base two):
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// This also models multiplication of polynomials with coefficients
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
// polynomial terms, but coefficients "add" with XOR.)
//
// constant values of a and b will result in better performance,
// otherwise the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func ( Uint64x4) (,  uint8,  Uint64x4) Uint64x4 {
	return .carrylessMultiply(&1+((&1)<<4), )
}

// CarrylessMultiplyGrouped computes one of four possible carryless
// multiplications of selected high and low halves of each of the four
// 128-bit lanes of x and y, depending on the values of a and b,
// and returns the four 128-bit products in the result's lanes.
// a selects the low (0) or high (1) elements of x's lanes and
// b selects the low (0) or high (1) elements of y's lanes.
//
// A carryless multiplication uses bitwise XOR instead of
// add-with-carry, for example (in base two):
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// This also models multiplication of polynomials with coefficients
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
// polynomial terms, but coefficients "add" with XOR.)
//
// constant values of a and b will result in better performance,
// otherwise the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func ( Uint64x8) (,  uint8,  Uint64x8) Uint64x8 {
	return .carrylessMultiply(&1+((&1)<<4), )
}