// Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
// Requires: SSE2
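//
// Constant-time conditional move: res = b if cond == 0 and res = a otherwise.
// A branchy Go sketch of the semantics only (a flat 12-limb point layout is
// assumed for illustration), for reference:
//
//	func p256MovCondSketch(res, a, b *[12]uint64, cond int) {
//		if cond != 0 {
//			*res = *a
//		} else {
//			*res = *b
//		}
//	}
//
// The SSE2 code below is branch-free: it broadcasts cond with PSHUFD, turns
// it into an all-ones/all-zeros mask with PCMPEQL against zero, and combines
// the inputs 16 bytes at a time as (a AND NOT mask) XOR (b AND mask).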
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVQ    res+0(FP), DI
	MOVQ    a+8(FP), SI
	MOVQ    b+16(FP), CX
	MOVQ    cond+24(FP), X12
	PXOR    X13, X13
	PSHUFD  $0x00, X12, X12
	PCMPEQL X13, X12
	MOVOU   X12, X0
	MOVOU   (SI), X6
	PANDN   X6, X0
	MOVOU   X12, X1
	MOVOU   16(SI), X7
	PANDN   X7, X1
	MOVOU   X12, X2
	MOVOU   32(SI), X8
	PANDN   X8, X2
	MOVOU   X12, X3
	MOVOU   48(SI), X9
	PANDN   X9, X3
	MOVOU   X12, X4
	MOVOU   64(SI), X10
	PANDN   X10, X4
	MOVOU   X12, X5
	MOVOU   80(SI), X11
	PANDN   X11, X5
	MOVOU   (CX), X6
	MOVOU   16(CX), X7
	MOVOU   32(CX), X8
	MOVOU   48(CX), X9
	MOVOU   64(CX), X10
	MOVOU   80(CX), X11
	PAND    X12, X6
	PAND    X12, X7
	PAND    X12, X8
	PAND    X12, X9
	PAND    X12, X10
	PAND    X12, X11
	PXOR    X6, X0
	PXOR    X7, X1
	PXOR    X8, X2
	PXOR    X9, X3
	PXOR    X10, X4
	PXOR    X11, X5
	MOVOU   X0, (DI)
	MOVOU   X1, 16(DI)
	MOVOU   X2, 32(DI)
	MOVOU   X3, 48(DI)
	MOVOU   X4, 64(DI)
	MOVOU   X5, 80(DI)
	RET

// func p256NegCond(val *p256Element, cond int)
// Requires: CMOV
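//
// Constant-time conditional negation: if cond != 0, val = p - val mod p;
// otherwise val is left unchanged. The subtraction p - val is always
// performed and the original limbs are restored with CMOVQEQ when cond == 0,
// so the instruction and memory trace does not depend on cond.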
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVQ val+0(FP), DI
	MOVQ cond+8(FP), R14

	// acc = p256 (the field prime, in R8..R11)
	MOVQ $-1, R8
	MOVQ p256const0<>+0(SB), R9
	MOVQ $0x00000000, R10
	MOVQ p256const1<>+0(SB), R11

	// Load the original value
	MOVQ (DI), R13
	MOVQ 8(DI), SI
	MOVQ 16(DI), CX
	MOVQ 24(DI), R15

	// Speculatively subtract
	SUBQ R13, R8
	SBBQ SI, R9
	SBBQ CX, R10
	SBBQ R15, R11

	// If condition is 0, keep original value
	TESTQ   R14, R14
	CMOVQEQ R13, R8
	CMOVQEQ SI, R9
	CMOVQEQ CX, R10
	CMOVQEQ R15, R11

	// Store result
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET

DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8

DATA p256const1<>+0(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8
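
// p256const0 and p256const1 are the second and fourth little-endian 64-bit
// limbs of the P-256 prime
//
//	p = 2^256 - 2^224 + 2^192 + 2^96 - 1
//	  = [0xffffffffffffffff, 0x00000000ffffffff, 0x0, 0xffffffff00000001]
//
// Limbs 0 and 2 are all-ones and zero, so the code uses the immediates $-1
// and $0x00 for them instead of memory loads.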

// func p256Sqr(res *p256Element, in *p256Element, n int)
// Requires: CMOV
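//
// Montgomery squaring, iterated n times (n >= 1): with R = 2^256, each pass
// maps a*R -> a^2*R mod p, so the result stays in the Montgomery domain.
// Since p = -1 mod 2^64, the Montgomery factor for each reduction step is
// simply the lowest accumulator limb m, and the identity
//
//	m*p = ((m * 0xffffffff00000001) << 192) + (m << 96) - m
//
// lets each step fold m*p into the accumulator with a single MULQ (by
// p256const1) and two 32-bit shifts of m instead of a full 4-limb multiply.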
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

sqrLoop:
	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI

	// First reduction step
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10

	// Last reduction step
	XORQ R14, R14
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R8
	ADCQ R15, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256
	SUBQ    $-1, R8
	SBBQ    p256const0<>+0(SB), R9
	SBBQ    $0x00, R10
	SBBQ    p256const1<>+0(SB), R11
	SBBQ    $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ    R8, (DI)
	MOVQ    R9, 8(DI)
	MOVQ    R10, 16(DI)
	MOVQ    R11, 24(DI)
	MOVQ    DI, SI
	DECQ    BX
	JNE     sqrLoop
	RET

// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
// Requires: CMOV
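//
// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p. The structure
// interleaves one schoolbook row with one reduction step per limb of in2
// (pseudocode, not Go):
//
//	acc = 0
//	for i = 0..3:
//		acc += in1 * in2[i]       // "x * y[i]": four MULQs
//		m = acc mod 2^64          // -p^-1 mod 2^64 = 1, so m = acc[0]
//		acc = (acc + m*p) / 2^64  // exact; uses the m*p identity above
//	if acc >= p: acc -= p             // branch-free via CMOVQCS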
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13

	// First reduction step
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13
	XORQ R8, R8

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8
	XORQ R9, R9

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9
	XORQ R10, R10

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256
	SUBQ    $-1, R12
	SBBQ    p256const0<>+0(SB), R13
	SBBQ    $0x00, R8
	SBBQ    p256const1<>+0(SB), R9
	SBBQ    $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ    R12, (DI)
	MOVQ    R13, 8(DI)
	MOVQ    R8, 16(DI)
	MOVQ    R9, 24(DI)
	RET

// func p256FromMont(res *p256Element, in *p256Element)
// Requires: CMOV
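//
// Conversion out of the Montgomery domain: res = in * 2^-256 mod p, i.e. a
// Montgomery multiplication by 1, so only the four reduction stages and a
// final conditional subtraction are needed.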
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	XORQ R12, R12

	// Only reduce, no multiplications by the operands are needed
	// First stage
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	XORQ R13, R13

	// Second stage
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	XORQ R8, R8

	// Third stage
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	XORQ R9, R9

	// Last stage
	MOVQ    R11, AX
	MOVQ    R11, R15
	SHLQ    $0x20, R11
	MULQ    p256const1<>+0(SB)
	SHRQ    $0x20, R15
	ADDQ    R11, R12
	ADCQ    R15, R13
	ADCQ    AX, R8
	ADCQ    DX, R9
	MOVQ    R12, SI
	MOVQ    R13, R11
	MOVQ    R8, R14
	MOVQ    R9, R15
	SUBQ    $-1, R12
	SBBQ    p256const0<>+0(SB), R13
	SBBQ    $0x00, R8
	SBBQ    p256const1<>+0(SB), R9
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ    R12, (DI)
	MOVQ    R13, 8(DI)
	MOVQ    R8, 16(DI)
	MOVQ    R9, 24(DI)
	RET

// func p256Select(res *P256Point, table *p256Table, idx int)
// Requires: SSE2
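//
// Constant-time table lookup: reads all 16 table entries and keeps the one
// whose (1-based) index equals idx; idx == 0 leaves res all-zero. A branchy
// Go sketch of the semantics only (flat 12-limb points assumed), for
// reference:
//
//	func p256SelectSketch(res *[12]uint64, table *[16][12]uint64, idx int) {
//		*res = [12]uint64{}
//		for i := range table {
//			if i+1 == idx { // the asm does this compare branch-free
//				*res = table[i]
//			}
//		}
//	}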
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVQ    idx+16(FP), AX
	MOVQ    table+8(FP), DI
	MOVQ    res+0(FP), DX
	PXOR    X15, X15
	PCMPEQL X14, X14
	PSUBL   X14, X15
	MOVL    AX, X14
	PSHUFD  $0x00, X14, X14
	PXOR    X0, X0
	PXOR    X1, X1
	PXOR    X2, X2
	PXOR    X3, X3
	PXOR    X4, X4
	PXOR    X5, X5
	MOVQ    $0x00000010, AX
	MOVOU   X15, X13

loop_select:
	MOVOU   X13, X12
	PADDL   X15, X13
	PCMPEQL X14, X12
	MOVOU   (DI), X6
	MOVOU   16(DI), X7
	MOVOU   32(DI), X8
	MOVOU   48(DI), X9
	MOVOU   64(DI), X10
	MOVOU   80(DI), X11
	ADDQ    $0x60, DI
	PAND    X12, X6
	PAND    X12, X7
	PAND    X12, X8
	PAND    X12, X9
	PAND    X12, X10
	PAND    X12, X11
	PXOR    X6, X0
	PXOR    X7, X1
	PXOR    X8, X2
	PXOR    X9, X3
	PXOR    X10, X4
	PXOR    X11, X5
	DECQ    AX
	JNE     loop_select
	MOVOU   X0, (DX)
	MOVOU   X1, 16(DX)
	MOVOU   X2, 32(DX)
	MOVOU   X3, 48(DX)
	MOVOU   X4, 64(DX)
	MOVOU   X5, 80(DX)
	RET

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Requires: SSE2
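//
// Like p256Select, but over a table of 32 affine points of 64 bytes each.
// Every loop iteration scans 128 bytes, i.e. two adjacent entries under two
// separately computed masks, so 16 iterations cover all 32 entries. idx is
// again 1-based, with idx == 0 producing the all-zero value.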
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVQ    idx+16(FP), AX
	MOVQ    table+8(FP), DI
	MOVQ    res+0(FP), DX
	PXOR    X15, X15
	PCMPEQL X14, X14
	PSUBL   X14, X15
	MOVL    AX, X14
	PSHUFD  $0x00, X14, X14
	PXOR    X0, X0
	PXOR    X1, X1
	PXOR    X2, X2
	PXOR    X3, X3
	MOVQ    $0x00000010, AX
	MOVOU   X15, X13

loop_select_base:
	MOVOU   X13, X12
	PADDL   X15, X13
	PCMPEQL X14, X12
	MOVOU   (DI), X4
	MOVOU   16(DI), X5
	MOVOU   32(DI), X6
	MOVOU   48(DI), X7
	MOVOU   64(DI), X8
	MOVOU   80(DI), X9
	MOVOU   96(DI), X10
	MOVOU   112(DI), X11
	ADDQ    $0x80, DI
	PAND    X12, X4
	PAND    X12, X5
	PAND    X12, X6
	PAND    X12, X7
	MOVOU   X13, X12
	PADDL   X15, X13
	PCMPEQL X14, X12
	PAND    X12, X8
	PAND    X12, X9
	PAND    X12, X10
	PAND    X12, X11
	PXOR    X4, X0
	PXOR    X5, X1
	PXOR    X6, X2
	PXOR    X7, X3
	PXOR    X8, X0
	PXOR    X9, X1
	PXOR    X10, X2
	PXOR    X11, X3
	DECQ    AX
	JNE     loop_select_base
	MOVOU   X0, (DX)
	MOVOU   X1, 16(DX)
	MOVOU   X2, 32(DX)
	MOVOU   X3, 48(DX)
	RET

// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
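//
// Montgomery multiplication modulo the P-256 group order: res = in1 * in2 *
// 2^-256 mod p256ord. Each reduction step computes the generic Montgomery
// factor
//
//	m = (acc[0] * p256ordK0) mod 2^64
//
// and then folds m * p256ord into the accumulator with four MULQs against
// the stored limbs of the order.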
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13

	// First reduction step
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256ord
	SUBQ    p256ord<>+0(SB), R12
	SBBQ    p256ord<>+8(SB), R13
	SBBQ    p256ord<>+16(SB), R8
	SBBQ    p256ord<>+24(SB), R9
	SBBQ    $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ    R12, (DI)
	MOVQ    R13, 8(DI)
	MOVQ    R8, 16(DI)
	MOVQ    R9, 24(DI)
	RET

DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8

DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32
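
// p256ord is the order of the P-256 base point in little-endian 64-bit limbs,
// and p256ordK0 is the Montgomery constant -p256ord^-1 mod 2^64.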

// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
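//
// Montgomery squaring modulo the group order, iterated n times like p256Sqr.
// Unlike p256OrdMul, the reduction steps here use full MULQs only for the two
// low limbs of the order; the top limbs are 2^64 - 1 and 2^64 - 2^32, so
// those partial products are folded in as shifted additions and subtractions
// of the Montgomery factor m (the SUBQ/SBBQ and SHLQ/SHRQ runs below).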
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

ordSqrLoop:
	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI

	// First reduction step
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	MOVQ R14, R15
	ADCQ DX, R10
	ADCQ $0x00, R15
	SUBQ R14, R10
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R8
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R11
	ADCQ $0x00, R8
	SUBQ AX, R11
	SBBQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	MOVQ R14, R15
	ADCQ DX, R11
	ADCQ $0x00, R15
	SUBQ R14, R11
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R9
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R8
	ADCQ $0x00, R9
	SUBQ AX, R8
	SBBQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	MOVQ R14, R15
	ADCQ DX, R8
	ADCQ $0x00, R15
	SUBQ R14, R8
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R10
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R9
	ADCQ $0x00, R10
	SUBQ AX, R9
	SBBQ DX, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	MOVQ R14, R15
	ADCQ DX, R9
	ADCQ $0x00, R15
	SUBQ R14, R9
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R11
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R10
	ADCQ $0x00, R11
	SUBQ AX, R10
	SBBQ DX, R11
	XORQ R14, R14

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256ord
	SUBQ    p256ord<>+0(SB), R8
	SBBQ    p256ord<>+8(SB), R9
	SBBQ    p256ord<>+16(SB), R10
	SBBQ    p256ord<>+24(SB), R11
	SBBQ    $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ    R8, (DI)
	MOVQ    R9, 8(DI)
	MOVQ    R10, 16(DI)
	MOVQ    R11, 24(DI)
	MOVQ    DI, SI
	DECQ    BX
	JNE     ordSqrLoop
	RET

// func p256SubInternal()
// Requires: CMOV
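//
// The internal routines below use a register-based calling convention shared
// with their callers: acc = (R10, R11, R12, R13) and t = (R14, R15, DI, SI),
// each a little-endian 4x64-bit field element. p256SubInternal computes
// acc = acc - t mod p, returning the result in acc.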
TEXT p256SubInternal(SB), NOSPLIT, $0
	XORQ    AX, AX
	SUBQ    R14, R10
	SBBQ    R15, R11
	SBBQ    DI, R12
	SBBQ    SI, R13
	SBBQ    $0x00, AX
	MOVQ    R10, BX
	MOVQ    R11, CX
	MOVQ    R12, R8
	MOVQ    R13, R9
	ADDQ    $-1, R10
	ADCQ    p256const0<>+0(SB), R11
	ADCQ    $0x00, R12
	ADCQ    p256const1<>+0(SB), R13
	ANDQ    $0x01, AX
	CMOVQEQ BX, R10
	CMOVQEQ CX, R11
	CMOVQEQ R8, R12
	CMOVQEQ R9, R13
	RET

// func p256MulInternal()
// Requires: CMOV
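//
// p256MulInternal computes acc = acc * t * 2^-256 mod p (a Montgomery
// multiplication), with inputs in acc = (R10..R13) and t = (R14, R15, DI, SI)
// and the result back in acc. BX, CX, R8, R9 and BP hold the low half of the
// 512-bit product during accumulation.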
TEXT p256MulInternal(SB), NOSPLIT, $8
	MOVQ R10, AX
	MULQ R14
	MOVQ AX, BX
	MOVQ DX, CX
	MOVQ R10, AX
	MULQ R15
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ DI
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ SI
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ R14
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R15
	ADDQ BP, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ DI
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ SI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ R12, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ R15
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ DI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ SI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	MOVQ R13, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ R15
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ DI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ SI
	ADDQ BP, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R10
	ADCQ CX, R11
	ADCQ R8, R12
	ADCQ R9, R13
	ADCQ $0x00, BP

	// Copy result
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS BX, R10
	CMOVQCS CX, R11
	CMOVQCS R8, R12
	CMOVQCS R9, R13
	RET

// func p256SqrInternal()
// Requires: CMOV
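//
// p256SqrInternal computes acc = acc^2 * 2^-256 mod p, with input and result
// in acc = (R10..R13). The off-diagonal products are computed once and
// doubled, the diagonal squares are added in ("Missing products"), and the
// same four Montgomery reduction steps as in p256MulInternal follow.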
TEXT p256SqrInternal(SB), NOSPLIT, $8
	MOVQ R10, AX
	MULQ R11
	MOVQ AX, CX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ R11, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R13
	ADDQ BP, R14
	ADCQ $0x00, DX
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ R12, AX
	MULQ R13
	ADDQ AX, R15
	ADCQ $0x00, DX
	MOVQ DX, DI
	XORQ SI, SI

	// *2
	ADDQ CX, CX
	ADCQ R8, R8
	ADCQ R9, R9
	ADCQ R14, R14
	ADCQ R15, R15
	ADCQ DI, DI
	ADCQ $0x00, SI

	// Missing products
	MOVQ R10, AX
	MULQ AX
	MOVQ AX, BX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ AX
	ADDQ R10, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R12, AX
	MULQ AX
	ADDQ R10, R9
	ADCQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R13, AX
	MULQ AX
	ADDQ R10, R15
	ADCQ AX, DI
	ADCQ DX, SI

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R14
	ADCQ CX, R15
	ADCQ R8, DI
	ADCQ R9, SI
	ADCQ $0x00, BP

	// Copy result
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS R14, R10
	CMOVQCS R15, R11
	CMOVQCS DI, R12
	CMOVQCS SI, R13
	RET

// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
// Requires: CMOV, SSE2
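//
// Mixed Jacobian + affine point addition: in1 = (X1 : Y1 : Z1), in2 = (x2, y2)
// with implicit Z2 = 1, and y2 is negated first when sign != 0. With u1 = X1
// and s1 = Y1, the standard formulas computed below are
//
//	u2 = x2*Z1^2, s2 = y2*Z1^3, h = u2 - u1, r = s2 - s1
//	X3 = r^2 - h^3 - 2*u1*h^2
//	Y3 = r*(u1*h^2 - X3) - s1*h^3
//	Z3 = h*Z1
//
// These formulas do not cover the identity, so the SSE2 block at the end
// substitutes in1 when sel == 0 and the affine in2 (with Z = p256one, i.e.
// 1 in Montgomery form) when zero == 0, again in constant time.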
TEXT ·p256PointAddAffineAsm(SB), $512-48
	MOVQ  res+0(FP), AX
	MOVQ  in1+8(FP), BX
	MOVQ  in2+16(FP), CX
	MOVQ  sign+24(FP), DX
	MOVQ  sel+32(FP), R15
	MOVQ  zero+40(FP), DI
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)

	// Store pointer to result
	MOVQ AX, 480(SP)
	MOVL R15, 488(SP)
	MOVL DI, 492(SP)

	// Negate y2in based on sign
	MOVQ 32(CX), R10
	MOVQ 40(CX), R11
	MOVQ 48(CX), R12
	MOVQ 56(CX), R13
	MOVQ $-1, BX
	MOVQ p256const0<>+0(SB), CX
	MOVQ $0x00000000, R8
	MOVQ p256const1<>+0(SB), R9
	XORQ AX, AX

	// Speculatively subtract
	SUBQ R10, BX
	SBBQ R11, CX
	SBBQ R12, R8
	SBBQ R13, R9
	SBBQ $0x00, AX
	MOVQ BX, R14
	MOVQ CX, R15
	MOVQ R8, DI
	MOVQ R9, SI

	// Add in case the operand was > p256
	ADDQ    $-1, BX
	ADCQ    p256const0<>+0(SB), CX
	ADCQ    $0x00, R8
	ADCQ    p256const1<>+0(SB), R9
	ADCQ    $0x00, AX
	CMOVQNE R14, BX
	CMOVQNE R15, CX
	CMOVQNE DI, R8
	CMOVQNE SI, R9

	// If condition is 0, keep original value
	TESTQ   DX, DX
	CMOVQEQ R10, BX
	CMOVQEQ R11, CX
	CMOVQEQ R12, R8
	CMOVQEQ R13, R9

	// Store result
	MOVQ BX, 128(SP)
	MOVQ CX, 136(SP)
	MOVQ R8, 144(SP)
	MOVQ R9, 152(SP)

	// Begin point add
	MOVQ    64(SP), R10
	MOVQ    72(SP), R11
	MOVQ    80(SP), R12
	MOVQ    88(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 288(SP)
	MOVQ    R11, 296(SP)
	MOVQ    R12, 304(SP)
	MOVQ    R13, 312(SP)
	MOVQ    96(SP), R14
	MOVQ    104(SP), R15
	MOVQ    112(SP), DI
	MOVQ    120(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    (SP), R14
	MOVQ    8(SP), R15
	MOVQ    16(SP), DI
	MOVQ    24(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 320(SP)
	MOVQ    R11, 328(SP)
	MOVQ    R12, 336(SP)
	MOVQ    R13, 344(SP)
	MOVQ    64(SP), R14
	MOVQ    72(SP), R15
	MOVQ    80(SP), DI
	MOVQ    88(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 224(SP)
	MOVQ    R11, 232(SP)
	MOVQ    R12, 240(SP)
	MOVQ    R13, 248(SP)
	MOVQ    288(SP), R10
	MOVQ    296(SP), R11
	MOVQ    304(SP), R12
	MOVQ    312(SP), R13
	CALL    p256MulInternal(SB)
	MOVQ    128(SP), R14
	MOVQ    136(SP), R15
	MOVQ    144(SP), DI
	MOVQ    152(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 256(SP)
	MOVQ    R11, 264(SP)
	MOVQ    R12, 272(SP)
	MOVQ    R13, 280(SP)
	MOVQ    32(SP), R14
	MOVQ    40(SP), R15
	MOVQ    48(SP), DI
	MOVQ    56(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 352(SP)
	MOVQ    R11, 360(SP)
	MOVQ    R12, 368(SP)
	MOVQ    R13, 376(SP)
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 416(SP)
	MOVQ    R11, 424(SP)
	MOVQ    R12, 432(SP)
	MOVQ    R13, 440(SP)
	MOVQ    320(SP), R10
	MOVQ    328(SP), R11
	MOVQ    336(SP), R12
	MOVQ    344(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 384(SP)
	MOVQ    R11, 392(SP)
	MOVQ    R12, 400(SP)
	MOVQ    R13, 408(SP)
	MOVQ    320(SP), R14
	MOVQ    328(SP), R15
	MOVQ    336(SP), DI
	MOVQ    344(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 448(SP)
	MOVQ    R11, 456(SP)
	MOVQ    R12, 464(SP)
	MOVQ    R13, 472(SP)
	MOVQ    32(SP), R14
	MOVQ    40(SP), R15
	MOVQ    48(SP), DI
	MOVQ    56(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 256(SP)
	MOVQ    R11, 264(SP)
	MOVQ    R12, 272(SP)
	MOVQ    R13, 280(SP)
	MOVQ    (SP), R10
	MOVQ    8(SP), R11
	MOVQ    16(SP), R12
	MOVQ    24(SP), R13
	MOVQ    384(SP), R14
	MOVQ    392(SP), R15
	MOVQ    400(SP), DI
	MOVQ    408(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 320(SP)
	MOVQ    R11, 328(SP)
	MOVQ    R12, 336(SP)
	MOVQ    R13, 344(SP)
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    416(SP), R10
	MOVQ    424(SP), R11
	MOVQ    432(SP), R12
	MOVQ    440(SP), R13
	CALL    p256SubInternal(SB)
	MOVQ    448(SP), R14
	MOVQ    456(SP), R15
	MOVQ    464(SP), DI
	MOVQ    472(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 160(SP)
	MOVQ    R11, 168(SP)
	MOVQ    R12, 176(SP)
	MOVQ    R13, 184(SP)
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	MOVQ    320(SP), R10
	MOVQ    328(SP), R11
	MOVQ    336(SP), R12
	MOVQ    344(SP), R13
	CALL    p256SubInternal(SB)
	MOVQ    352(SP), R14
	MOVQ    360(SP), R15
	MOVQ    368(SP), DI
	MOVQ    376(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    256(SP), R14
	MOVQ    264(SP), R15
	MOVQ    272(SP), DI
	MOVQ    280(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 192(SP)
	MOVQ    R11, 200(SP)
	MOVQ    R12, 208(SP)
	MOVQ    R13, 216(SP)

	// Load stored values from stack
	MOVQ 480(SP), AX
	MOVL 488(SP), BX
	MOVL 492(SP), CX

	// The result is not valid if (sel == 0), conditional choose
	MOVOU   160(SP), X0
	MOVOU   176(SP), X1
	MOVOU   192(SP), X2
	MOVOU   208(SP), X3
	MOVOU   224(SP), X4
	MOVOU   240(SP), X5
	MOVL    BX, X6
	MOVL    CX, X7
	PXOR    X8, X8
	PCMPEQL X9, X9
	PSHUFD  $0x00, X6, X6
	PSHUFD  $0x00, X7, X7
	PCMPEQL X8, X6
	PCMPEQL X8, X7
	MOVOU   X6, X15
	PANDN   X9, X15
	MOVOU   (SP), X9
	MOVOU   16(SP), X10
	MOVOU   32(SP), X11
	MOVOU   48(SP), X12
	MOVOU   64(SP), X13
	MOVOU   80(SP), X14
	PAND    X15, X0
	PAND    X15, X1
	PAND    X15, X2
	PAND    X15, X3
	PAND    X15, X4
	PAND    X15, X5
	PAND    X6, X9
	PAND    X6, X10
	PAND    X6, X11
	PAND    X6, X12
	PAND    X6, X13
	PAND    X6, X14
	PXOR    X9, X0
	PXOR    X10, X1
	PXOR    X11, X2
	PXOR    X12, X3
	PXOR    X13, X4
	PXOR    X14, X5

	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU   X7, X15
	PANDN   X9, X15
	MOVOU   96(SP), X9
	MOVOU   112(SP), X10
	MOVOU   128(SP), X11
	MOVOU   144(SP), X12
	MOVOU   p256one<>+0(SB), X13
	MOVOU   p256one<>+16(SB), X14
	PAND    X15, X0
	PAND    X15, X1
	PAND    X15, X2
	PAND    X15, X3
	PAND    X15, X4
	PAND    X15, X5
	PAND    X7, X9
	PAND    X7, X10
	PAND    X7, X11
	PAND    X7, X12
	PAND    X7, X13
	PAND    X7, X14
	PXOR    X9, X0
	PXOR    X10, X1
	PXOR    X11, X2
	PXOR    X12, X3
	PXOR    X13, X4
	PXOR    X14, X5

	// Finally output the result
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)
	MOVQ  $0x00000000, 480(SP)
	RET

DATA p256one<>+0(SB)/8, $0x0000000000000001
DATA p256one<>+8(SB)/8, $0xffffffff00000000
DATA p256one<>+16(SB)/8, $0xffffffffffffffff
DATA p256one<>+24(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32
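
// p256one is the field element 1 in the Montgomery domain, i.e. 2^256 mod p.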

// func p256IsZero()
// Requires: CMOV
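//
// Sets AX to 1 if acc = (R10..R13) is 0 or p (both valid encodings of zero
// mod p) and to 0 otherwise.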
TEXT p256IsZero(SB), NOSPLIT, $0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $0x00000001, R15

	// Check whether [acc4..acc7] are all zero.
	MOVQ R10, R14
	ORQ  R11, R14
	ORQ  R12, R14
	ORQ  R13, R14

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ R15, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	XORQ $-1, R10
	XORQ p256const0<>+0(SB), R11
	XORQ p256const1<>+0(SB), R13
	ORQ  R11, R10
	ORQ  R12, R10
	ORQ  R13, R10

	// Set the zero flag if so.
	CMOVQEQ R15, AX
	RET

// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
// Requires: CMOV, SSE2
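//
// Full Jacobian point addition, res = in1 + in2, via the standard formulas
//
//	u1 = X1*Z2^2, u2 = X2*Z1^2, h = u2 - u1
//	s1 = Y1*Z2^3, s2 = Y2*Z1^3, r = s2 - s1
//	X3 = r^2 - h^3 - 2*u1*h^2
//	Y3 = r*(u1*h^2 - X3) - s1*h^3
//	Z3 = h*Z1*Z2
//
// The return value is 1 when both h == 0 and r == 0, i.e. the inputs had the
// same affine coordinates, the one case these formulas cannot handle; the
// caller is expected to fall back to point doubling then. Inputs at infinity
// are likewise left for the caller to handle.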
TEXT ·p256PointAddAsm(SB), $680-32
	// Move input to stack in order to free registers
	MOVQ  res+0(FP), AX
	MOVQ  in1+8(FP), BX
	MOVQ  in2+16(FP), CX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU 32(CX), X2
	MOVOU 48(CX), X3
	MOVOU 64(CX), X4
	MOVOU 80(CX), X5
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)
	MOVOU X2, 128(SP)
	MOVOU X3, 144(SP)
	MOVOU X4, 160(SP)
	MOVOU X5, 176(SP)

	// Store pointer to result
	MOVQ AX, 640(SP)

	// Begin point add
	MOVQ    160(SP), R10
	MOVQ    168(SP), R11
	MOVQ    176(SP), R12
	MOVQ    184(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 448(SP)
	MOVQ    R11, 456(SP)
	MOVQ    R12, 464(SP)
	MOVQ    R13, 472(SP)
	MOVQ    160(SP), R14
	MOVQ    168(SP), R15
	MOVQ    176(SP), DI
	MOVQ    184(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    32(SP), R14
	MOVQ    40(SP), R15
	MOVQ    48(SP), DI
	MOVQ    56(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 352(SP)
	MOVQ    R11, 360(SP)
	MOVQ    R12, 368(SP)
	MOVQ    R13, 376(SP)
	MOVQ    64(SP), R10
	MOVQ    72(SP), R11
	MOVQ    80(SP), R12
	MOVQ    88(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 416(SP)
	MOVQ    R11, 424(SP)
	MOVQ    R12, 432(SP)
	MOVQ    R13, 440(SP)
	MOVQ    64(SP), R14
	MOVQ    72(SP), R15
	MOVQ    80(SP), DI
	MOVQ    88(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    128(SP), R14
	MOVQ    136(SP), R15
	MOVQ    144(SP), DI
	MOVQ    152(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 384(SP)
	MOVQ    R11, 392(SP)
	MOVQ    R12, 400(SP)
	MOVQ    R13, 408(SP)
	MOVQ    352(SP), R14
	MOVQ    360(SP), R15
	MOVQ    368(SP), DI
	MOVQ    376(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 512(SP)
	MOVQ    R11, 520(SP)
	MOVQ    R12, 528(SP)
	MOVQ    R13, 536(SP)
	CALL    p256IsZero(SB)
	MOVQ    AX, 648(SP)
	MOVQ    448(SP), R10
	MOVQ    456(SP), R11
	MOVQ    464(SP), R12
	MOVQ    472(SP), R13
	MOVQ    (SP), R14
	MOVQ    8(SP), R15
	MOVQ    16(SP), DI
	MOVQ    24(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 288(SP)
	MOVQ    R11, 296(SP)
	MOVQ    R12, 304(SP)
	MOVQ    R13, 312(SP)
	MOVQ    416(SP), R10
	MOVQ    424(SP), R11
	MOVQ    432(SP), R12
	MOVQ    440(SP), R13
	MOVQ    96(SP), R14
	MOVQ    104(SP), R15
	MOVQ    112(SP), DI
	MOVQ    120(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 320(SP)
	MOVQ    R11, 328(SP)
	MOVQ    R12, 336(SP)
	MOVQ    R13, 344(SP)
	MOVQ    288(SP), R14
	MOVQ    296(SP), R15
	MOVQ    304(SP), DI
	MOVQ    312(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 480(SP)
	MOVQ    R11, 488(SP)
	MOVQ    R12, 496(SP)
	MOVQ    R13, 504(SP)
	CALL    p256IsZero(SB)
	ANDQ    648(SP), AX
	MOVQ    AX, 648(SP)
	MOVQ    512(SP), R10
	MOVQ    520(SP), R11
	MOVQ    528(SP), R12
	MOVQ    536(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 576(SP)
	MOVQ    R11, 584(SP)
	MOVQ    R12, 592(SP)
	MOVQ    R13, 600(SP)
	MOVQ    480(SP), R10
	MOVQ    488(SP), R11
	MOVQ    496(SP), R12
	MOVQ    504(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 544(SP)
	MOVQ    R11, 552(SP)
	MOVQ    R12, 560(SP)
	MOVQ    R13, 568(SP)
	MOVQ    480(SP), R14
	MOVQ    488(SP), R15
	MOVQ    496(SP), DI
	MOVQ    504(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 608(SP)
	MOVQ    R11, 616(SP)
	MOVQ    R12, 624(SP)
	MOVQ    R13, 632(SP)
	MOVQ    352(SP), R14
	MOVQ    360(SP), R15
	MOVQ    368(SP), DI
	MOVQ    376(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 384(SP)
	MOVQ    R11, 392(SP)
	MOVQ    R12, 400(SP)
	MOVQ    R13, 408(SP)
	MOVQ    64(SP), R10
	MOVQ    72(SP), R11
	MOVQ    80(SP), R12
	MOVQ    88(SP), R13
	MOVQ    160(SP), R14
	MOVQ    168(SP), R15
	MOVQ    176(SP), DI
	MOVQ    184(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    480(SP), R14
	MOVQ    488(SP), R15
	MOVQ    496(SP), DI
	MOVQ    504(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 256(SP)
	MOVQ    R11, 264(SP)
	MOVQ    R12, 272(SP)
	MOVQ    R13, 280(SP)
	MOVQ    544(SP), R10
	MOVQ    552(SP), R11
	MOVQ    560(SP), R12
	MOVQ    568(SP), R13
	MOVQ    288(SP), R14
	MOVQ    296(SP), R15
	MOVQ    304(SP), DI
	MOVQ    312(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 320(SP)
	MOVQ    R11, 328(SP)
	MOVQ    R12, 336(SP)
	MOVQ    R13, 344(SP)
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    576(SP), R10
	MOVQ    584(SP), R11
	MOVQ    592(SP), R12
	MOVQ    600(SP), R13
	CALL    p256SubInternal(SB)
	MOVQ    608(SP), R14
	MOVQ    616(SP), R15
	MOVQ    624(SP), DI
	MOVQ    632(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 192(SP)
	MOVQ    R11, 200(SP)
	MOVQ    R12, 208(SP)
	MOVQ    R13, 216(SP)
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	MOVQ    320(SP), R10
	MOVQ    328(SP), R11
	MOVQ    336(SP), R12
	MOVQ    344(SP), R13
	CALL    p256SubInternal(SB)
	MOVQ    512(SP), R14
	MOVQ    520(SP), R15
	MOVQ    528(SP), DI
	MOVQ    536(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    384(SP), R14
	MOVQ    392(SP), R15
	MOVQ    400(SP), DI
	MOVQ    408(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    R10, 224(SP)
	MOVQ    R11, 232(SP)
	MOVQ    R12, 240(SP)
	MOVQ    R13, 248(SP)
	MOVOU   192(SP), X0
	MOVOU   208(SP), X1
	MOVOU   224(SP), X2
	MOVOU   240(SP), X3
	MOVOU   256(SP), X4
	MOVOU   272(SP), X5

	// Finally output the result
	MOVQ  640(SP), AX
	MOVQ  $0x00000000, 640(SP)
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)
	MOVQ  648(SP), AX
	MOVQ  AX, ret+24(FP)
	RET

// func p256PointDoubleAsm(res *P256Point, in *P256Point)
// Requires: CMOV, SSE2
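//
// Jacobian point doubling, specialized for a = -3 curves: with
// alpha = 3*(X - Z^2)*(X + Z^2), which equals 3*X^2 + a*Z^4 when a = -3,
// and s = 4*X*Y^2, the code computes
//
//	Z3 = 2*Y*Z
//	X3 = alpha^2 - 2*s
//	Y3 = alpha*(s - X3) - 8*Y^4
//
// The "Divide by 2" block halves a field element by adding p when the value
// is odd and shifting the (at most 257-bit) sum right by one bit.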
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
	MOVQ  res+0(FP), AX
	MOVQ  in+8(FP), BX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)

	// Store pointer to result
	MOVQ AX, 224(SP)

	// Begin point double
	MOVQ    64(SP), R10
	MOVQ    72(SP), R11
	MOVQ    80(SP), R12
	MOVQ    88(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 160(SP)
	MOVQ    R11, 168(SP)
	MOVQ    R12, 176(SP)
	MOVQ    R13, 184(SP)
	MOVQ    (SP), R14
	MOVQ    8(SP), R15
	MOVQ    16(SP), DI
	MOVQ    24(SP), SI
	XORQ    AX, AX
	ADDQ    R14, R10
	ADCQ    R15, R11
	ADCQ    DI, R12
	ADCQ    SI, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    R14, 128(SP)
	MOVQ    R15, 136(SP)
	MOVQ    DI, 144(SP)
	MOVQ    SI, 152(SP)
	MOVQ    64(SP), R10
	MOVQ    72(SP), R11
	MOVQ    80(SP), R12
	MOVQ    88(SP), R13
	MOVQ    32(SP), R14
	MOVQ    40(SP), R15
	MOVQ    48(SP), DI
	MOVQ    56(SP), SI
	CALL    p256MulInternal(SB)
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    224(SP), AX

	// Store z
	MOVQ R14, 64(AX)
	MOVQ R15, 72(AX)
	MOVQ DI, 80(AX)
	MOVQ SI, 88(AX)
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 128(SP)
	MOVQ R11, 136(SP)
	MOVQ R12, 144(SP)
	MOVQ R13, 152(SP)

	// Multiply by 3
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    128(SP), R10
	MOVQ    136(SP), R11
	MOVQ    144(SP), R12
	MOVQ    152(SP), R13
	XORQ    AX, AX
	ADDQ    R14, R10
	ADCQ    R15, R11
	ADCQ    DI, R12
	ADCQ    SI, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    R14, 128(SP)
	MOVQ    R15, 136(SP)
	MOVQ    DI, 144(SP)
	MOVQ    SI, 152(SP)

	// ////////////////////////
	MOVQ    32(SP), R10
	MOVQ    40(SP), R11
	MOVQ    48(SP), R12
	MOVQ    56(SP), R13
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    R14, R10
	MOVQ    R15, R11
	MOVQ    DI, R12
	MOVQ    SI, R13
	CALL    p256SqrInternal(SB)
	MOVQ    R10, 96(SP)
	MOVQ    R11, 104(SP)
	MOVQ    R12, 112(SP)
	MOVQ    R13, 120(SP)
	CALL    p256SqrInternal(SB)

	// Divide by 2
	XORQ    AX, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	ADDQ    $-1, R10
	ADCQ    p256const0<>+0(SB), R11
	ADCQ    $0x00, R12
	ADCQ    p256const1<>+0(SB), R13
	ADCQ    $0x00, AX
	TESTQ   $0x00000001, R14
	CMOVQEQ R14, R10
	CMOVQEQ R15, R11
	CMOVQEQ DI, R12
	CMOVQEQ SI, R13
	ANDQ    R14, AX
	SHRQ    $0x01, R11, R10
	SHRQ    $0x01, R12, R11
	SHRQ    $0x01, R13, R12
	SHRQ    $0x01, AX, R13
	MOVQ    R10, 32(SP)
	MOVQ    R11, 40(SP)
	MOVQ    R12, 48(SP)
	MOVQ    R13, 56(SP)

	// /////////////////////////
	MOVQ    (SP), R10
	MOVQ    8(SP), R11
	MOVQ    16(SP), R12
	MOVQ    24(SP), R13
	MOVQ    96(SP), R14
	MOVQ    104(SP), R15
	MOVQ    112(SP), DI
	MOVQ    120(SP), SI
	CALL    p256MulInternal(SB)
	MOVQ    R10, 96(SP)
	MOVQ    R11, 104(SP)
	MOVQ    R12, 112(SP)
	MOVQ    R13, 120(SP)
	XORQ    AX, AX
	ADDQ    R10, R10
	ADCQ    R11, R11
	ADCQ    R12, R12
	ADCQ    R13, R13
	ADCQ    $+0, AX
	MOVQ    R10, R14
	MOVQ    R11, R15
	MOVQ    R12, DI
	MOVQ    R13, SI
	SUBQ    $-1, R14
	SBBQ    p256const0<>+0(SB), R15
	SBBQ    $+0, DI
	SBBQ    p256const1<>+0(SB), SI
	SBBQ    $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ    R14, 192(SP)
	MOVQ    R15, 200(SP)
	MOVQ    DI, 208(SP)
	MOVQ    SI, 216(SP)
	MOVQ    128(SP), R10
	MOVQ    136(SP), R11
	MOVQ    144(SP), R12
	MOVQ    152(SP), R13
	CALL    p256SqrInternal(SB)
	MOVQ    192(SP), R14
	MOVQ    200(SP), R15
	MOVQ    208(SP), DI
	MOVQ    216(SP), SI
	CALL    p256SubInternal(SB)
	MOVQ    224(SP), AX

	// Store x
	MOVQ R10, (AX)
	MOVQ R11, 8(AX)
	MOVQ R12, 16(AX)
	MOVQ R13, 24(AX)
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 96(SP), R10
	MOVQ 104(SP), R11
	MOVQ 112(SP), R12
	MOVQ 120(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 224(SP), AX

	// Store y
	MOVQ R10, 32(AX)
	MOVQ R11, 40(AX)
	MOVQ R12, 48(AX)
	MOVQ R13, 56(AX)

	// ///////////////////////
	MOVQ $0x00000000, 224(SP)
	RET