// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.simd

package archsimd

import "unsafe"

// Implementation of all the {Int,Uint}{8,16} load and store slice part
// functions and methods for 128-bit and 256-bit vectors.

/* pointer-punning functions for chunked slice part loads. */

// int16atP8 reinterprets p as a pointer to an int16 located at the same
// address, so that two adjacent int8 elements can be read or written with
// one 16-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 2 bytes are addressable starting at p.
func int16atP8(p *int8) *int16 {
	return (*int16)(unsafe.Pointer(p))
}

// int32atP8 reinterprets p as a pointer to an int32 located at the same
// address, so that four adjacent int8 elements can be read or written with
// one 32-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 4 bytes are addressable starting at p.
func int32atP8(p *int8) *int32 {
	return (*int32)(unsafe.Pointer(p))
}

// int64atP8 reinterprets p as a pointer to an int64 located at the same
// address, so that eight adjacent int8 elements can be read or written
// with one 64-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 8 bytes are addressable starting at p.
func int64atP8(p *int8) *int64 {
	return (*int64)(unsafe.Pointer(p))
}

// int32atP16 reinterprets p as a pointer to an int32 located at the same
// address, so that two adjacent int16 elements can be read or written with
// one 32-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 4 bytes are addressable starting at p.
func int32atP16(p *int16) *int32 {
	return (*int32)(unsafe.Pointer(p))
}

// int64atP16 reinterprets p as a pointer to an int64 located at the same
// address, so that four adjacent int16 elements can be read or written
// with one 64-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 8 bytes are addressable starting at p.
func int64atP16(p *int16) *int64 {
	return (*int64)(unsafe.Pointer(p))
}

// int64atP32 reinterprets p as a pointer to an int64 located at the same
// address, so that two adjacent int32 elements can be read or written with
// one 64-bit access.
// No bounds or alignment checking happens here; the caller must ensure
// that at least 8 bytes are addressable starting at p.
func int64atP32(p *int32) *int64 {
	return (*int64)(unsafe.Pointer(p))
}

// int32atP64 reinterprets p as a pointer to an int32 located at the same
// address. The int32 overlays the first 4 bytes in memory of the int64
// that p points to; which numeric half that is depends on the machine's
// byte order.
func int32atP64(p *int64) *int32 {
	return (*int32)(unsafe.Pointer(p))
}

/* These two masks are used by generated code */

// vecMask64 is a table of eight -1 entries (all bits set) followed by
// eight 0 entries. It is referenced by generated code that is not part of
// this file.
// NOTE(review): presumably callers slice it at a varying offset to obtain
// a mask with a chosen number of leading lanes enabled; confirm against
// the generated code.
var vecMask64 = [16]int64{
	// Entries 0-7: all bits set.
	-1, -1, -1, -1, -1, -1, -1, -1,
	// Entries 8-15: zero.
	0, 0, 0, 0, 0, 0, 0, 0,
}

// vecMask32 is a table of sixteen -1 entries (all bits set) followed by
// sixteen 0 entries. It is referenced by generated code that is not part
// of this file.
// NOTE(review): presumably callers slice it at a varying offset to obtain
// a mask with a chosen number of leading lanes enabled; confirm against
// the generated code.
var vecMask32 = [32]int32{
	// Entries 0-15: all bits set.
	-1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// Entries 16-31: zero.
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
}

/* 256-bit int vector loads and stores made from 128-bit parts */

// LoadInt8x32SlicePart loads a Int8x32 from the slice s.
// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
// If s has 32 or more elements, the function is equivalent to LoadInt8x32Slice.
func ( []int8) Int8x32 {
	 := len()
	if  >= 32 {
		return LoadInt8x32Slice()
	}
	var  Int8x32
	if  == 0 {
		return 
	}
	if  > 16 {
		return .SetLo(LoadInt8x16Slice()).SetHi(LoadInt8x16SlicePart([16:]))
	} else {
		return .SetLo(LoadInt8x16SlicePart())
	}
}

// LoadInt16x16SlicePart loads a Int16x16 from the slice s.
// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
// If s has 16 or more elements, the function is equivalent to LoadInt16x16Slice.
func ( []int16) Int16x16 {
	 := len()
	if  >= 16 {
		return LoadInt16x16Slice()
	}
	var  Int16x16
	if  == 0 {
		return 
	}
	if  > 8 {
		return .SetLo(LoadInt16x8Slice()).SetHi(LoadInt16x8SlicePart([8:]))
	} else {
		return .SetLo(LoadInt16x8SlicePart())
	}
}

// StoreSlicePart stores the elements of x into the slice s.
// It stores as many elements as will fit in s.
// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
func ( Int8x32) ( []int8) {
	 := len()
	if  >= 32 {
		.StoreSlice()
		return
	}
	if  == 0 {
		return
	}
	if  > 16 {
		.GetLo().StoreSlice()
		.GetHi().StoreSlicePart([16:])
	} else { // fits in one
		.GetLo().StoreSlicePart()
	}
}

// StoreSlicePart stores the elements of x into the slice s.
// It stores as many elements as will fit in s.
// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
func ( Int16x16) ( []int16) {
	 := len()
	if  >= 16 {
		.StoreSlice()
		return
	}
	if  == 0 {
		return
	}
	if  > 8 {
		.GetLo().StoreSlice()
		.GetHi().StoreSlicePart([8:])
	} else { // fits in one
		.GetLo().StoreSlicePart()
	}
}

/* 128-bit vector load and store slice parts for 8 and 16-bit int elements */

// LoadInt8x16SlicePart loads a Int8x16 from the slice s.
// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
// If s has 16 or more elements, the function is equivalent to LoadInt8x16Slice.
func ( []int8) Int8x16 {
	 := len()
	if  >= 16 {
		return LoadInt8x16Slice()
	}
	var  Int8x16
	if  == 0 {
		return 
	}
	if  >= 8 { // 8-15
		 = .AsInt64x2().SetElem(0, *int64atP8(&[0])).AsInt8x16()
		if  >= 12 { // 12, 13, 14, 15
			 = .AsInt32x4().SetElem(8/4, *int32atP8(&[8])).AsInt8x16()
			if  >= 14 {
				 = .AsInt16x8().SetElem(12/2, *int16atP8(&[12])).AsInt8x16()
				if  == 15 {
					 = .SetElem(14, [14])
				}
			} else if  == 13 {
				 = .SetElem(12, [12])
			}
		} else if  >= 10 { // 10, 11
			 = .AsInt16x8().SetElem(8/2, *int16atP8(&[8])).AsInt8x16()
			if  == 11 {
				 = .SetElem(10, [10])
			}
		} else if  == 9 {
			 = .SetElem(8, [8])
		}
	} else if  >= 4 { // 4-7
		 = .AsInt32x4().SetElem(0, *int32atP8(&[0])).AsInt8x16()
		if  >= 6 {
			 = .AsInt16x8().SetElem(4/2, *int16atP8(&[4])).AsInt8x16()
			if  == 7 {
				 = .SetElem(6, [6])
			}
		} else if  == 5 {
			 = .SetElem(4, [4])
		}
	} else if  >= 2 { // 2,3
		 = .AsInt16x8().SetElem(0, *int16atP8(&[0])).AsInt8x16()
		if  == 3 {
			 = .SetElem(2, [2])
		}
	} else { // l == 1
		 = .SetElem(0, [0])
	}
	return 
}

// StoreSlicePart stores the elements of x into the slice s.
// It stores as many elements as will fit in s.
// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
func ( Int8x16) ( []int8) {
	 := len()
	if  >= 16 {
		.StoreSlice()
		return
	}
	if  == 0 {
		return
	}
	if  >= 8 { // 8-15
		*int64atP8(&[0]) = .AsInt64x2().GetElem(0)
		if  >= 12 { // 12, 13, 14, 15
			*int32atP8(&[8]) = .AsInt32x4().GetElem(8 / 4)
			if  >= 14 {
				*int16atP8(&[12]) = .AsInt16x8().GetElem(12 / 2)
				if  == 15 {
					[14] = .GetElem(14)
				}
			} else if  == 13 {
				[12] = .GetElem(12)
			}
		} else if  >= 10 { // 10, 11
			*int16atP8(&[8]) = .AsInt16x8().GetElem(8 / 2)
			if  == 11 {
				[10] = .GetElem(10)
			}
		} else if  == 9 {
			[8] = .GetElem(8)
		}
	} else if  >= 4 { // 4-7
		*int32atP8(&[0]) = .AsInt32x4().GetElem(0)
		if  >= 6 {
			*int16atP8(&[4]) = .AsInt16x8().GetElem(4 / 2)
			if  == 7 {
				[6] = .GetElem(6)
			}
		} else if  == 5 {
			[4] = .GetElem(4)
		}
	} else if  >= 2 { // 2,3
		*int16atP8(&[0]) = .AsInt16x8().GetElem(0)
		if  == 3 {
			[2] = .GetElem(2)
		}
	} else { // l == 1
		[0] = .GetElem(0)
	}
}

// LoadInt16x8SlicePart loads a Int16x8 from the slice s.
// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
// If s has 8 or more elements, the function is equivalent to LoadInt16x8Slice.
func ( []int16) Int16x8 {
	 := len()
	if  >= 8 {
		return LoadInt16x8Slice()
	}
	var  Int16x8
	if  == 0 {
		return 
	}
	if  >= 4 { // 4-7
		 = .AsInt64x2().SetElem(0, *int64atP16(&[0])).AsInt16x8()
		if  >= 6 {
			 = .AsInt32x4().SetElem(4/2, *int32atP16(&[4])).AsInt16x8()
			if  == 7 {
				 = .SetElem(6, [6])
			}
		} else if  == 5 {
			 = .SetElem(4, [4])
		}
	} else if  >= 2 { // 2,3
		 = .AsInt32x4().SetElem(0, *int32atP16(&[0])).AsInt16x8()
		if  == 3 {
			 = .SetElem(2, [2])
		}
	} else { // l == 1
		 = .SetElem(0, [0])
	}
	return 
}

// StoreSlicePart stores the elements of x into the slice s.
// It stores as many elements as will fit in s.
// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
func ( Int16x8) ( []int16) {
	 := len()
	if  >= 8 {
		.StoreSlice()
		return
	}
	if  == 0 {
		return
	}
	if  >= 4 { // 4-7
		*int64atP16(&[0]) = .AsInt64x2().GetElem(0)
		if  >= 6 {
			*int32atP16(&[4]) = .AsInt32x4().GetElem(4 / 2)
			if  == 7 {
				[6] = .GetElem(6)
			}
		} else if  == 5 {
			[4] = .GetElem(4)
		}
	} else if  >= 2 { // 2,3
		*int32atP16(&[0]) = .AsInt32x4().GetElem(0)
		if  == 3 {
			[2] = .GetElem(2)
		}
	} else { // l == 1
		[0] = .GetElem(0)
	}
	return
}