// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
//                            256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), RODATA, $8
GLOBL p256const1<>(SB), RODATA, $8
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32
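
// P256 uses the NIST prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, whose
// little-endian 64-bit limbs are
//   p[0] = 0xffffffffffffffff   p[1] = 0x00000000ffffffff (p256const0)
//   p[2] = 0x0000000000000000   p[3] = 0xffffffff00000001 (p256const1)
// p256ord holds the limbs of the group order, and p256ordK0 is the Montgomery
// constant -ord^-1 mod 2^64 used by p256OrdMul and p256OrdSqr. p256one is
// 2^256 mod p, i.e. the value 1 in the Montgomery domain; it also serves as
// the z coordinate of an affine point in p256PointAddAffineAsm.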
43
44/* ---------------------------------------*/
45// func p256LittleToBig(res []byte, in []uint64)
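// A field element is kept as four little-endian 64-bit limbs. Converting
// between that form and 32 big-endian bytes byte-swaps each limb and reverses
// the limb order, which is its own inverse, so p256LittleToBig can simply
// tail-jump into p256BigToLittle: both functions take their two slice
// arguments at the same frame offsets.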
46TEXT ·p256LittleToBig(SB),NOSPLIT,$0
47	JMP ·p256BigToLittle(SB)
48/* ---------------------------------------*/
49// func p256BigToLittle(res []uint64, in []byte)
50TEXT ·p256BigToLittle(SB),NOSPLIT,$0
51	MOVQ res+0(FP), res_ptr
52	MOVQ in+24(FP), x_ptr
53
54	MOVQ (8*0)(x_ptr), acc0
55	MOVQ (8*1)(x_ptr), acc1
56	MOVQ (8*2)(x_ptr), acc2
57	MOVQ (8*3)(x_ptr), acc3
58
59	BSWAPQ acc0
60	BSWAPQ acc1
61	BSWAPQ acc2
62	BSWAPQ acc3
63
64	MOVQ acc3, (8*0)(res_ptr)
65	MOVQ acc2, (8*1)(res_ptr)
66	MOVQ acc1, (8*2)(res_ptr)
67	MOVQ acc0, (8*3)(res_ptr)
68
69	RET
70/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0, res = b; otherwise res = a.
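// The selection is done with SSE masks rather than branches: X12 is set to
// all ones when cond == 0 and to all zeros otherwise, and each 16-byte chunk
// of the output is computed as (a AND NOT X12) XOR (b AND X12), so the code
// path and memory accesses do not depend on cond.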
73TEXT ·p256MovCond(SB),NOSPLIT,$0
74	MOVQ res+0(FP), res_ptr
75	MOVQ a+24(FP), x_ptr
76	MOVQ b+48(FP), y_ptr
77	MOVQ cond+72(FP), X12
78
79	PXOR X13, X13
80	PSHUFD $0, X12, X12
81	PCMPEQL X13, X12
82
83	MOVOU X12, X0
84	PANDN (16*0)(x_ptr), X0
85	MOVOU X12, X1
86	PANDN (16*1)(x_ptr), X1
87	MOVOU X12, X2
88	PANDN (16*2)(x_ptr), X2
89	MOVOU X12, X3
90	PANDN (16*3)(x_ptr), X3
91	MOVOU X12, X4
92	PANDN (16*4)(x_ptr), X4
93	MOVOU X12, X5
94	PANDN (16*5)(x_ptr), X5
95
96	MOVOU (16*0)(y_ptr), X6
97	MOVOU (16*1)(y_ptr), X7
98	MOVOU (16*2)(y_ptr), X8
99	MOVOU (16*3)(y_ptr), X9
100	MOVOU (16*4)(y_ptr), X10
101	MOVOU (16*5)(y_ptr), X11
102
103	PAND X12, X6
104	PAND X12, X7
105	PAND X12, X8
106	PAND X12, X9
107	PAND X12, X10
108	PAND X12, X11
109
110	PXOR X6, X0
111	PXOR X7, X1
112	PXOR X8, X2
113	PXOR X9, X3
114	PXOR X10, X4
115	PXOR X11, X5
116
117	MOVOU X0, (16*0)(res_ptr)
118	MOVOU X1, (16*1)(res_ptr)
119	MOVOU X2, (16*2)(res_ptr)
120	MOVOU X3, (16*3)(res_ptr)
121	MOVOU X4, (16*4)(res_ptr)
122	MOVOU X5, (16*5)(res_ptr)
123
124	RET
125/* ---------------------------------------*/
126// func p256NegCond(val []uint64, cond int)
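// If cond != 0, val is replaced with p - val, i.e. negated modulo p; if
// cond == 0 it is left unchanged. The subtraction is always performed and
// CMOVs pick the answer, so the timing does not depend on cond.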
127TEXT ·p256NegCond(SB),NOSPLIT,$0
128	MOVQ val+0(FP), res_ptr
129	MOVQ cond+24(FP), t0
130	// acc = poly
131	MOVQ $-1, acc0
132	MOVQ p256const0<>(SB), acc1
133	MOVQ $0, acc2
134	MOVQ p256const1<>(SB), acc3
135	// Load the original value
136	MOVQ (8*0)(res_ptr), acc5
137	MOVQ (8*1)(res_ptr), x_ptr
138	MOVQ (8*2)(res_ptr), y_ptr
139	MOVQ (8*3)(res_ptr), t1
140	// Speculatively subtract
141	SUBQ acc5, acc0
142	SBBQ x_ptr, acc1
143	SBBQ y_ptr, acc2
144	SBBQ t1, acc3
145	// If condition is 0, keep original value
146	TESTQ t0, t0
147	CMOVQEQ acc5, acc0
148	CMOVQEQ x_ptr, acc1
149	CMOVQEQ y_ptr, acc2
150	CMOVQEQ t1, acc3
151	// Store result
152	MOVQ acc0, (8*0)(res_ptr)
153	MOVQ acc1, (8*1)(res_ptr)
154	MOVQ acc2, (8*2)(res_ptr)
155	MOVQ acc3, (8*3)(res_ptr)
156
157	RET
158/* ---------------------------------------*/
159// func p256Sqr(res, in []uint64)
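// Montgomery square: res = in * in * 2^-256 mod p. The off-diagonal limb
// products are computed once and doubled, the diagonal squares are added in,
// and the 512-bit intermediate is folded back with four single-limb
// reduction steps followed by a final conditional subtraction of p.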
160TEXT ·p256Sqr(SB),NOSPLIT,$0
161	MOVQ res+0(FP), res_ptr
162	MOVQ in+24(FP), x_ptr
163	// y[1:] * y[0]
164	MOVQ (8*0)(x_ptr), t0
165
166	MOVQ (8*1)(x_ptr), AX
167	MULQ t0
168	MOVQ AX, acc1
169	MOVQ DX, acc2
170
171	MOVQ (8*2)(x_ptr), AX
172	MULQ t0
173	ADDQ AX, acc2
174	ADCQ $0, DX
175	MOVQ DX, acc3
176
177	MOVQ (8*3)(x_ptr), AX
178	MULQ t0
179	ADDQ AX, acc3
180	ADCQ $0, DX
181	MOVQ DX, acc4
182	// y[2:] * y[1]
183	MOVQ (8*1)(x_ptr), t0
184
185	MOVQ (8*2)(x_ptr), AX
186	MULQ t0
187	ADDQ AX, acc3
188	ADCQ $0, DX
189	MOVQ DX, t1
190
191	MOVQ (8*3)(x_ptr), AX
192	MULQ t0
193	ADDQ t1, acc4
194	ADCQ $0, DX
195	ADDQ AX, acc4
196	ADCQ $0, DX
197	MOVQ DX, acc5
198	// y[3] * y[2]
199	MOVQ (8*2)(x_ptr), t0
200
201	MOVQ (8*3)(x_ptr), AX
202	MULQ t0
203	ADDQ AX, acc5
204	ADCQ $0, DX
205	MOVQ DX, y_ptr
206	XORQ t1, t1
207	// *2
208	ADDQ acc1, acc1
209	ADCQ acc2, acc2
210	ADCQ acc3, acc3
211	ADCQ acc4, acc4
212	ADCQ acc5, acc5
213	ADCQ y_ptr, y_ptr
214	ADCQ $0, t1
215	// Missing products
216	MOVQ (8*0)(x_ptr), AX
217	MULQ AX
218	MOVQ AX, acc0
219	MOVQ DX, t0
220
221	MOVQ (8*1)(x_ptr), AX
222	MULQ AX
223	ADDQ t0, acc1
224	ADCQ AX, acc2
225	ADCQ $0, DX
226	MOVQ DX, t0
227
228	MOVQ (8*2)(x_ptr), AX
229	MULQ AX
230	ADDQ t0, acc3
231	ADCQ AX, acc4
232	ADCQ $0, DX
233	MOVQ DX, t0
234
235	MOVQ (8*3)(x_ptr), AX
236	MULQ AX
237	ADDQ t0, acc5
238	ADCQ AX, y_ptr
239	ADCQ DX, t1
240	MOVQ t1, x_ptr
241	// First reduction step
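	// Montgomery reduction of the low limb: because p ≡ -1 mod 2^64, the
	// factor -p^-1 mod 2^64 is 1 and the reduction multiplier is simply
	// m = acc0, so acc + m*p has a zero low limb. With
	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1 this amounts to adding m*2^96
	// (m<<32 into limb 1, m>>32 into limb 2) and the 128-bit product
	// m * 0xffffffff00000001 into limbs 3 and 4, then dropping the zeroed
	// low limb, i.e. acc = (acc + m*p) / 2^64.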
242	MOVQ acc0, AX
243	MOVQ acc0, t1
244	SHLQ $32, acc0
245	MULQ p256const1<>(SB)
246	SHRQ $32, t1
247	ADDQ acc0, acc1
248	ADCQ t1, acc2
249	ADCQ AX, acc3
250	ADCQ $0, DX
251	MOVQ DX, acc0
252	// Second reduction step
253	MOVQ acc1, AX
254	MOVQ acc1, t1
255	SHLQ $32, acc1
256	MULQ p256const1<>(SB)
257	SHRQ $32, t1
258	ADDQ acc1, acc2
259	ADCQ t1, acc3
260	ADCQ AX, acc0
261	ADCQ $0, DX
262	MOVQ DX, acc1
263	// Third reduction step
264	MOVQ acc2, AX
265	MOVQ acc2, t1
266	SHLQ $32, acc2
267	MULQ p256const1<>(SB)
268	SHRQ $32, t1
269	ADDQ acc2, acc3
270	ADCQ t1, acc0
271	ADCQ AX, acc1
272	ADCQ $0, DX
273	MOVQ DX, acc2
274	// Last reduction step
275	XORQ t0, t0
276	MOVQ acc3, AX
277	MOVQ acc3, t1
278	SHLQ $32, acc3
279	MULQ p256const1<>(SB)
280	SHRQ $32, t1
281	ADDQ acc3, acc0
282	ADCQ t1, acc1
283	ADCQ AX, acc2
284	ADCQ $0, DX
285	MOVQ DX, acc3
286	// Add bits [511:256] of the sqr result
287	ADCQ acc4, acc0
288	ADCQ acc5, acc1
289	ADCQ y_ptr, acc2
290	ADCQ x_ptr, acc3
291	ADCQ $0, t0
292
293	MOVQ acc0, acc4
294	MOVQ acc1, acc5
295	MOVQ acc2, y_ptr
296	MOVQ acc3, t1
297	// Subtract p256
298	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
300	SBBQ $0, acc2
301	SBBQ p256const1<>(SB), acc3
302	SBBQ $0, t0
303
304	CMOVQCS acc4, acc0
305	CMOVQCS acc5, acc1
306	CMOVQCS y_ptr, acc2
307	CMOVQCS t1, acc3
308
309	MOVQ acc0, (8*0)(res_ptr)
310	MOVQ acc1, (8*1)(res_ptr)
311	MOVQ acc2, (8*2)(res_ptr)
312	MOVQ acc3, (8*3)(res_ptr)
313
314	RET
315/* ---------------------------------------*/
316// func p256Mul(res, in1, in2 []uint64)
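// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p. The limbs of
// in2 are multiplied into the accumulator one at a time, interleaved with
// the same single-limb reduction used in p256Sqr, and the result is
// conditionally reduced by p at the end.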
317TEXT ·p256Mul(SB),NOSPLIT,$0
318	MOVQ res+0(FP), res_ptr
319	MOVQ in1+24(FP), x_ptr
320	MOVQ in2+48(FP), y_ptr
321	// x * y[0]
322	MOVQ (8*0)(y_ptr), t0
323
324	MOVQ (8*0)(x_ptr), AX
325	MULQ t0
326	MOVQ AX, acc0
327	MOVQ DX, acc1
328
329	MOVQ (8*1)(x_ptr), AX
330	MULQ t0
331	ADDQ AX, acc1
332	ADCQ $0, DX
333	MOVQ DX, acc2
334
335	MOVQ (8*2)(x_ptr), AX
336	MULQ t0
337	ADDQ AX, acc2
338	ADCQ $0, DX
339	MOVQ DX, acc3
340
341	MOVQ (8*3)(x_ptr), AX
342	MULQ t0
343	ADDQ AX, acc3
344	ADCQ $0, DX
345	MOVQ DX, acc4
346	XORQ acc5, acc5
347	// First reduction step
348	MOVQ acc0, AX
349	MOVQ acc0, t1
350	SHLQ $32, acc0
351	MULQ p256const1<>(SB)
352	SHRQ $32, t1
353	ADDQ acc0, acc1
354	ADCQ t1, acc2
355	ADCQ AX, acc3
356	ADCQ DX, acc4
357	ADCQ $0, acc5
358	XORQ acc0, acc0
359	// x * y[1]
360	MOVQ (8*1)(y_ptr), t0
361
362	MOVQ (8*0)(x_ptr), AX
363	MULQ t0
364	ADDQ AX, acc1
365	ADCQ $0, DX
366	MOVQ DX, t1
367
368	MOVQ (8*1)(x_ptr), AX
369	MULQ t0
370	ADDQ t1, acc2
371	ADCQ $0, DX
372	ADDQ AX, acc2
373	ADCQ $0, DX
374	MOVQ DX, t1
375
376	MOVQ (8*2)(x_ptr), AX
377	MULQ t0
378	ADDQ t1, acc3
379	ADCQ $0, DX
380	ADDQ AX, acc3
381	ADCQ $0, DX
382	MOVQ DX, t1
383
384	MOVQ (8*3)(x_ptr), AX
385	MULQ t0
386	ADDQ t1, acc4
387	ADCQ $0, DX
388	ADDQ AX, acc4
389	ADCQ DX, acc5
390	ADCQ $0, acc0
391	// Second reduction step
392	MOVQ acc1, AX
393	MOVQ acc1, t1
394	SHLQ $32, acc1
395	MULQ p256const1<>(SB)
396	SHRQ $32, t1
397	ADDQ acc1, acc2
398	ADCQ t1, acc3
399	ADCQ AX, acc4
400	ADCQ DX, acc5
401	ADCQ $0, acc0
402	XORQ acc1, acc1
403	// x * y[2]
404	MOVQ (8*2)(y_ptr), t0
405
406	MOVQ (8*0)(x_ptr), AX
407	MULQ t0
408	ADDQ AX, acc2
409	ADCQ $0, DX
410	MOVQ DX, t1
411
412	MOVQ (8*1)(x_ptr), AX
413	MULQ t0
414	ADDQ t1, acc3
415	ADCQ $0, DX
416	ADDQ AX, acc3
417	ADCQ $0, DX
418	MOVQ DX, t1
419
420	MOVQ (8*2)(x_ptr), AX
421	MULQ t0
422	ADDQ t1, acc4
423	ADCQ $0, DX
424	ADDQ AX, acc4
425	ADCQ $0, DX
426	MOVQ DX, t1
427
428	MOVQ (8*3)(x_ptr), AX
429	MULQ t0
430	ADDQ t1, acc5
431	ADCQ $0, DX
432	ADDQ AX, acc5
433	ADCQ DX, acc0
434	ADCQ $0, acc1
435	// Third reduction step
436	MOVQ acc2, AX
437	MOVQ acc2, t1
438	SHLQ $32, acc2
439	MULQ p256const1<>(SB)
440	SHRQ $32, t1
441	ADDQ acc2, acc3
442	ADCQ t1, acc4
443	ADCQ AX, acc5
444	ADCQ DX, acc0
445	ADCQ $0, acc1
446	XORQ acc2, acc2
447	// x * y[3]
448	MOVQ (8*3)(y_ptr), t0
449
450	MOVQ (8*0)(x_ptr), AX
451	MULQ t0
452	ADDQ AX, acc3
453	ADCQ $0, DX
454	MOVQ DX, t1
455
456	MOVQ (8*1)(x_ptr), AX
457	MULQ t0
458	ADDQ t1, acc4
459	ADCQ $0, DX
460	ADDQ AX, acc4
461	ADCQ $0, DX
462	MOVQ DX, t1
463
464	MOVQ (8*2)(x_ptr), AX
465	MULQ t0
466	ADDQ t1, acc5
467	ADCQ $0, DX
468	ADDQ AX, acc5
469	ADCQ $0, DX
470	MOVQ DX, t1
471
472	MOVQ (8*3)(x_ptr), AX
473	MULQ t0
474	ADDQ t1, acc0
475	ADCQ $0, DX
476	ADDQ AX, acc0
477	ADCQ DX, acc1
478	ADCQ $0, acc2
479	// Last reduction step
480	MOVQ acc3, AX
481	MOVQ acc3, t1
482	SHLQ $32, acc3
483	MULQ p256const1<>(SB)
484	SHRQ $32, t1
485	ADDQ acc3, acc4
486	ADCQ t1, acc5
487	ADCQ AX, acc0
488	ADCQ DX, acc1
489	ADCQ $0, acc2
490	// Copy result [255:0]
491	MOVQ acc4, x_ptr
492	MOVQ acc5, acc3
493	MOVQ acc0, t0
494	MOVQ acc1, t1
495	// Subtract p256
496	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
498	SBBQ $0, acc0
499	SBBQ p256const1<>(SB), acc1
500	SBBQ $0, acc2
501
502	CMOVQCS x_ptr, acc4
503	CMOVQCS acc3, acc5
504	CMOVQCS t0, acc0
505	CMOVQCS t1, acc1
506
507	MOVQ acc4, (8*0)(res_ptr)
508	MOVQ acc5, (8*1)(res_ptr)
509	MOVQ acc0, (8*2)(res_ptr)
510	MOVQ acc1, (8*3)(res_ptr)
511
512	RET
513/* ---------------------------------------*/
514// func p256FromMont(res, in []uint64)
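// Montgomery conversion out of the domain: res = in * 2^-256 mod p. This is
// a Montgomery multiplication by 1, so only the four reduction steps and the
// final conditional subtraction remain.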
515TEXT ·p256FromMont(SB),NOSPLIT,$0
516	MOVQ res+0(FP), res_ptr
517	MOVQ in+24(FP), x_ptr
518
519	MOVQ (8*0)(x_ptr), acc0
520	MOVQ (8*1)(x_ptr), acc1
521	MOVQ (8*2)(x_ptr), acc2
522	MOVQ (8*3)(x_ptr), acc3
523	XORQ acc4, acc4
524
525	// Only reduce, no multiplications are needed
526	// First stage
527	MOVQ acc0, AX
528	MOVQ acc0, t1
529	SHLQ $32, acc0
530	MULQ p256const1<>(SB)
531	SHRQ $32, t1
532	ADDQ acc0, acc1
533	ADCQ t1, acc2
534	ADCQ AX, acc3
535	ADCQ DX, acc4
536	XORQ acc5, acc5
537	// Second stage
538	MOVQ acc1, AX
539	MOVQ acc1, t1
540	SHLQ $32, acc1
541	MULQ p256const1<>(SB)
542	SHRQ $32, t1
543	ADDQ acc1, acc2
544	ADCQ t1, acc3
545	ADCQ AX, acc4
546	ADCQ DX, acc5
547	XORQ acc0, acc0
548	// Third stage
549	MOVQ acc2, AX
550	MOVQ acc2, t1
551	SHLQ $32, acc2
552	MULQ p256const1<>(SB)
553	SHRQ $32, t1
554	ADDQ acc2, acc3
555	ADCQ t1, acc4
556	ADCQ AX, acc5
557	ADCQ DX, acc0
558	XORQ acc1, acc1
559	// Last stage
560	MOVQ acc3, AX
561	MOVQ acc3, t1
562	SHLQ $32, acc3
563	MULQ p256const1<>(SB)
564	SHRQ $32, t1
565	ADDQ acc3, acc4
566	ADCQ t1, acc5
567	ADCQ AX, acc0
568	ADCQ DX, acc1
569
570	MOVQ acc4, x_ptr
571	MOVQ acc5, acc3
572	MOVQ acc0, t0
573	MOVQ acc1, t1
574
575	SUBQ $-1, acc4
576	SBBQ p256const0<>(SB), acc5
577	SBBQ $0, acc0
578	SBBQ p256const1<>(SB), acc1
579
580	CMOVQCS x_ptr, acc4
581	CMOVQCS acc3, acc5
582	CMOVQCS t0, acc0
583	CMOVQCS t1, acc1
584
585	MOVQ acc4, (8*0)(res_ptr)
586	MOVQ acc5, (8*1)(res_ptr)
587	MOVQ acc0, (8*2)(res_ptr)
588	MOVQ acc1, (8*3)(res_ptr)
589
590	RET
591/* ---------------------------------------*/
// Constant-time access to an arbitrary point table.
// The table is indexed from 1 to 15, i.e. entry i is stored at offset i-1
// (index 0 is implicitly the point at infinity and selects nothing).
595// func p256Select(point, table []uint64, idx int)
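// The loop below touches all 16 table entries unconditionally: X13 counts
// 1 through 16 and is compared against idx, so only the matching iteration
// yields an all-ones mask and contributes to the output. The memory access
// pattern is therefore independent of idx, and an idx of 0 matches nothing,
// leaving the output zero.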
596TEXT ·p256Select(SB),NOSPLIT,$0
597	MOVQ idx+48(FP),AX
598	MOVQ table+24(FP),DI
599	MOVQ point+0(FP),DX
600
601	PXOR X15, X15	// X15 = 0
602	PCMPEQL X14, X14 // X14 = -1
603	PSUBL X14, X15   // X15 = 1
604	MOVL AX, X14
605	PSHUFD $0, X14, X14
606
607	PXOR X0, X0
608	PXOR X1, X1
609	PXOR X2, X2
610	PXOR X3, X3
611	PXOR X4, X4
612	PXOR X5, X5
613	MOVQ $16, AX
614
615	MOVOU X15, X13
616
617loop_select:
618
619		MOVOU X13, X12
620		PADDL X15, X13
621		PCMPEQL X14, X12
622
623		MOVOU (16*0)(DI), X6
624		MOVOU (16*1)(DI), X7
625		MOVOU (16*2)(DI), X8
626		MOVOU (16*3)(DI), X9
627		MOVOU (16*4)(DI), X10
628		MOVOU (16*5)(DI), X11
629		ADDQ $(16*6), DI
630
631		PAND X12, X6
632		PAND X12, X7
633		PAND X12, X8
634		PAND X12, X9
635		PAND X12, X10
636		PAND X12, X11
637
638		PXOR X6, X0
639		PXOR X7, X1
640		PXOR X8, X2
641		PXOR X9, X3
642		PXOR X10, X4
643		PXOR X11, X5
644
645		DECQ AX
646		JNE loop_select
647
648	MOVOU X0, (16*0)(DX)
649	MOVOU X1, (16*1)(DX)
650	MOVOU X2, (16*2)(DX)
651	MOVOU X3, (16*3)(DX)
652	MOVOU X4, (16*4)(DX)
653	MOVOU X5, (16*5)(DX)
654
655	RET
656/* ---------------------------------------*/
// Constant-time access to the base point table.
658// func p256SelectBase(point, table []uint64, idx int)
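// Same masking technique as p256Select, but over a table of affine points
// (x||y, 64 bytes each). Every loop iteration loads two consecutive entries
// and advances the index mask twice, so the 32 iterations scan 64 entries.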
659TEXT ·p256SelectBase(SB),NOSPLIT,$0
660	MOVQ idx+48(FP),AX
661	MOVQ table+24(FP),DI
662	MOVQ point+0(FP),DX
663
664	PXOR X15, X15	// X15 = 0
665	PCMPEQL X14, X14 // X14 = -1
666	PSUBL X14, X15   // X15 = 1
667	MOVL AX, X14
668	PSHUFD $0, X14, X14
669
670	PXOR X0, X0
671	PXOR X1, X1
672	PXOR X2, X2
673	PXOR X3, X3
674	MOVQ $32, AX
675
676	MOVOU X15, X13
677
678loop_select_base:
679
680		MOVOU X13, X12
681		PADDL X15, X13
682		PCMPEQL X14, X12
683
684		MOVOU (16*0)(DI), X4
685		MOVOU (16*1)(DI), X5
686		MOVOU (16*2)(DI), X6
687		MOVOU (16*3)(DI), X7
688
689		MOVOU (16*4)(DI), X8
690		MOVOU (16*5)(DI), X9
691		MOVOU (16*6)(DI), X10
692		MOVOU (16*7)(DI), X11
693
694		ADDQ $(16*8), DI
695
696		PAND X12, X4
697		PAND X12, X5
698		PAND X12, X6
699		PAND X12, X7
700
701		MOVOU X13, X12
702		PADDL X15, X13
703		PCMPEQL X14, X12
704
705		PAND X12, X8
706		PAND X12, X9
707		PAND X12, X10
708		PAND X12, X11
709
710		PXOR X4, X0
711		PXOR X5, X1
712		PXOR X6, X2
713		PXOR X7, X3
714
715		PXOR X8, X0
716		PXOR X9, X1
717		PXOR X10, X2
718		PXOR X11, X3
719
720		DECQ AX
721		JNE loop_select_base
722
723	MOVOU X0, (16*0)(DX)
724	MOVOU X1, (16*1)(DX)
725	MOVOU X2, (16*2)(DX)
726	MOVOU X3, (16*3)(DX)
727
728	RET
729/* ---------------------------------------*/
730// func p256OrdMul(res, in1, in2 []uint64)
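// Montgomery multiplication modulo the group order: res = in1 * in2 * 2^-256
// mod ord(G). The order has no special low limb, so each reduction step
// derives its multiplier as acc_i * p256ordK0 (with p256ordK0 being
// -ord(G)^-1 mod 2^64) and then adds multiplier*ord with full 64x64-bit
// multiplies.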
731TEXT ·p256OrdMul(SB),NOSPLIT,$0
732	MOVQ res+0(FP), res_ptr
733	MOVQ in1+24(FP), x_ptr
734	MOVQ in2+48(FP), y_ptr
735	// x * y[0]
736	MOVQ (8*0)(y_ptr), t0
737
738	MOVQ (8*0)(x_ptr), AX
739	MULQ t0
740	MOVQ AX, acc0
741	MOVQ DX, acc1
742
743	MOVQ (8*1)(x_ptr), AX
744	MULQ t0
745	ADDQ AX, acc1
746	ADCQ $0, DX
747	MOVQ DX, acc2
748
749	MOVQ (8*2)(x_ptr), AX
750	MULQ t0
751	ADDQ AX, acc2
752	ADCQ $0, DX
753	MOVQ DX, acc3
754
755	MOVQ (8*3)(x_ptr), AX
756	MULQ t0
757	ADDQ AX, acc3
758	ADCQ $0, DX
759	MOVQ DX, acc4
760	XORQ acc5, acc5
761	// First reduction step
762	MOVQ acc0, AX
763	MULQ p256ordK0<>(SB)
764	MOVQ AX, t0
765
766	MOVQ p256ord<>+0x00(SB), AX
767	MULQ t0
768	ADDQ AX, acc0
769	ADCQ $0, DX
770	MOVQ DX, t1
771
772	MOVQ p256ord<>+0x08(SB), AX
773	MULQ t0
774	ADDQ t1, acc1
775	ADCQ $0, DX
776	ADDQ AX, acc1
777	ADCQ $0, DX
778	MOVQ DX, t1
779
780	MOVQ p256ord<>+0x10(SB), AX
781	MULQ t0
782	ADDQ t1, acc2
783	ADCQ $0, DX
784	ADDQ AX, acc2
785	ADCQ $0, DX
786	MOVQ DX, t1
787
788	MOVQ p256ord<>+0x18(SB), AX
789	MULQ t0
790	ADDQ t1, acc3
791	ADCQ $0, DX
792	ADDQ AX, acc3
793	ADCQ DX, acc4
794	ADCQ $0, acc5
795	// x * y[1]
796	MOVQ (8*1)(y_ptr), t0
797
798	MOVQ (8*0)(x_ptr), AX
799	MULQ t0
800	ADDQ AX, acc1
801	ADCQ $0, DX
802	MOVQ DX, t1
803
804	MOVQ (8*1)(x_ptr), AX
805	MULQ t0
806	ADDQ t1, acc2
807	ADCQ $0, DX
808	ADDQ AX, acc2
809	ADCQ $0, DX
810	MOVQ DX, t1
811
812	MOVQ (8*2)(x_ptr), AX
813	MULQ t0
814	ADDQ t1, acc3
815	ADCQ $0, DX
816	ADDQ AX, acc3
817	ADCQ $0, DX
818	MOVQ DX, t1
819
820	MOVQ (8*3)(x_ptr), AX
821	MULQ t0
822	ADDQ t1, acc4
823	ADCQ $0, DX
824	ADDQ AX, acc4
825	ADCQ DX, acc5
826	ADCQ $0, acc0
827	// Second reduction step
828	MOVQ acc1, AX
829	MULQ p256ordK0<>(SB)
830	MOVQ AX, t0
831
832	MOVQ p256ord<>+0x00(SB), AX
833	MULQ t0
834	ADDQ AX, acc1
835	ADCQ $0, DX
836	MOVQ DX, t1
837
838	MOVQ p256ord<>+0x08(SB), AX
839	MULQ t0
840	ADDQ t1, acc2
841	ADCQ $0, DX
842	ADDQ AX, acc2
843	ADCQ $0, DX
844	MOVQ DX, t1
845
846	MOVQ p256ord<>+0x10(SB), AX
847	MULQ t0
848	ADDQ t1, acc3
849	ADCQ $0, DX
850	ADDQ AX, acc3
851	ADCQ $0, DX
852	MOVQ DX, t1
853
854	MOVQ p256ord<>+0x18(SB), AX
855	MULQ t0
856	ADDQ t1, acc4
857	ADCQ $0, DX
858	ADDQ AX, acc4
859	ADCQ DX, acc5
860	ADCQ $0, acc0
861	// x * y[2]
862	MOVQ (8*2)(y_ptr), t0
863
864	MOVQ (8*0)(x_ptr), AX
865	MULQ t0
866	ADDQ AX, acc2
867	ADCQ $0, DX
868	MOVQ DX, t1
869
870	MOVQ (8*1)(x_ptr), AX
871	MULQ t0
872	ADDQ t1, acc3
873	ADCQ $0, DX
874	ADDQ AX, acc3
875	ADCQ $0, DX
876	MOVQ DX, t1
877
878	MOVQ (8*2)(x_ptr), AX
879	MULQ t0
880	ADDQ t1, acc4
881	ADCQ $0, DX
882	ADDQ AX, acc4
883	ADCQ $0, DX
884	MOVQ DX, t1
885
886	MOVQ (8*3)(x_ptr), AX
887	MULQ t0
888	ADDQ t1, acc5
889	ADCQ $0, DX
890	ADDQ AX, acc5
891	ADCQ DX, acc0
892	ADCQ $0, acc1
893	// Third reduction step
894	MOVQ acc2, AX
895	MULQ p256ordK0<>(SB)
896	MOVQ AX, t0
897
898	MOVQ p256ord<>+0x00(SB), AX
899	MULQ t0
900	ADDQ AX, acc2
901	ADCQ $0, DX
902	MOVQ DX, t1
903
904	MOVQ p256ord<>+0x08(SB), AX
905	MULQ t0
906	ADDQ t1, acc3
907	ADCQ $0, DX
908	ADDQ AX, acc3
909	ADCQ $0, DX
910	MOVQ DX, t1
911
912	MOVQ p256ord<>+0x10(SB), AX
913	MULQ t0
914	ADDQ t1, acc4
915	ADCQ $0, DX
916	ADDQ AX, acc4
917	ADCQ $0, DX
918	MOVQ DX, t1
919
920	MOVQ p256ord<>+0x18(SB), AX
921	MULQ t0
922	ADDQ t1, acc5
923	ADCQ $0, DX
924	ADDQ AX, acc5
925	ADCQ DX, acc0
926	ADCQ $0, acc1
927	// x * y[3]
928	MOVQ (8*3)(y_ptr), t0
929
930	MOVQ (8*0)(x_ptr), AX
931	MULQ t0
932	ADDQ AX, acc3
933	ADCQ $0, DX
934	MOVQ DX, t1
935
936	MOVQ (8*1)(x_ptr), AX
937	MULQ t0
938	ADDQ t1, acc4
939	ADCQ $0, DX
940	ADDQ AX, acc4
941	ADCQ $0, DX
942	MOVQ DX, t1
943
944	MOVQ (8*2)(x_ptr), AX
945	MULQ t0
946	ADDQ t1, acc5
947	ADCQ $0, DX
948	ADDQ AX, acc5
949	ADCQ $0, DX
950	MOVQ DX, t1
951
952	MOVQ (8*3)(x_ptr), AX
953	MULQ t0
954	ADDQ t1, acc0
955	ADCQ $0, DX
956	ADDQ AX, acc0
957	ADCQ DX, acc1
958	ADCQ $0, acc2
959	// Last reduction step
960	MOVQ acc3, AX
961	MULQ p256ordK0<>(SB)
962	MOVQ AX, t0
963
964	MOVQ p256ord<>+0x00(SB), AX
965	MULQ t0
966	ADDQ AX, acc3
967	ADCQ $0, DX
968	MOVQ DX, t1
969
970	MOVQ p256ord<>+0x08(SB), AX
971	MULQ t0
972	ADDQ t1, acc4
973	ADCQ $0, DX
974	ADDQ AX, acc4
975	ADCQ $0, DX
976	MOVQ DX, t1
977
978	MOVQ p256ord<>+0x10(SB), AX
979	MULQ t0
980	ADDQ t1, acc5
981	ADCQ $0, DX
982	ADDQ AX, acc5
983	ADCQ $0, DX
984	MOVQ DX, t1
985
986	MOVQ p256ord<>+0x18(SB), AX
987	MULQ t0
988	ADDQ t1, acc0
989	ADCQ $0, DX
990	ADDQ AX, acc0
991	ADCQ DX, acc1
992	ADCQ $0, acc2
993	// Copy result [255:0]
994	MOVQ acc4, x_ptr
995	MOVQ acc5, acc3
996	MOVQ acc0, t0
997	MOVQ acc1, t1
	// Subtract the group order
999	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
1001	SBBQ p256ord<>+0x10(SB), acc0
1002	SBBQ p256ord<>+0x18(SB), acc1
1003	SBBQ $0, acc2
1004
1005	CMOVQCS x_ptr, acc4
1006	CMOVQCS acc3, acc5
1007	CMOVQCS t0, acc0
1008	CMOVQCS t1, acc1
1009
1010	MOVQ acc4, (8*0)(res_ptr)
1011	MOVQ acc5, (8*1)(res_ptr)
1012	MOVQ acc0, (8*2)(res_ptr)
1013	MOVQ acc1, (8*3)(res_ptr)
1014
1015	RET
1016/* ---------------------------------------*/
1017// func p256OrdSqr(res, in []uint64, n int)
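// Performs n successive Montgomery squarings modulo the group order, feeding
// each result back in as the next input. The reduction follows p256OrdMul,
// except that the multiplications by ord[2] = 2^64 - 1 and
// ord[3] = 2^64 - 2^32 are replaced with cheaper add/subtract/shift
// sequences.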
1018TEXT ·p256OrdSqr(SB),NOSPLIT,$0
1019	MOVQ res+0(FP), res_ptr
1020	MOVQ in+24(FP), x_ptr
1021	MOVQ n+48(FP), BX
1022
1023ordSqrLoop:
1024
1025	// y[1:] * y[0]
1026	MOVQ (8*0)(x_ptr), t0
1027
1028	MOVQ (8*1)(x_ptr), AX
1029	MULQ t0
1030	MOVQ AX, acc1
1031	MOVQ DX, acc2
1032
1033	MOVQ (8*2)(x_ptr), AX
1034	MULQ t0
1035	ADDQ AX, acc2
1036	ADCQ $0, DX
1037	MOVQ DX, acc3
1038
1039	MOVQ (8*3)(x_ptr), AX
1040	MULQ t0
1041	ADDQ AX, acc3
1042	ADCQ $0, DX
1043	MOVQ DX, acc4
1044	// y[2:] * y[1]
1045	MOVQ (8*1)(x_ptr), t0
1046
1047	MOVQ (8*2)(x_ptr), AX
1048	MULQ t0
1049	ADDQ AX, acc3
1050	ADCQ $0, DX
1051	MOVQ DX, t1
1052
1053	MOVQ (8*3)(x_ptr), AX
1054	MULQ t0
1055	ADDQ t1, acc4
1056	ADCQ $0, DX
1057	ADDQ AX, acc4
1058	ADCQ $0, DX
1059	MOVQ DX, acc5
1060	// y[3] * y[2]
1061	MOVQ (8*2)(x_ptr), t0
1062
1063	MOVQ (8*3)(x_ptr), AX
1064	MULQ t0
1065	ADDQ AX, acc5
1066	ADCQ $0, DX
1067	MOVQ DX, y_ptr
1068	XORQ t1, t1
1069	// *2
1070	ADDQ acc1, acc1
1071	ADCQ acc2, acc2
1072	ADCQ acc3, acc3
1073	ADCQ acc4, acc4
1074	ADCQ acc5, acc5
1075	ADCQ y_ptr, y_ptr
1076	ADCQ $0, t1
1077	// Missing products
1078	MOVQ (8*0)(x_ptr), AX
1079	MULQ AX
1080	MOVQ AX, acc0
1081	MOVQ DX, t0
1082
1083	MOVQ (8*1)(x_ptr), AX
1084	MULQ AX
1085	ADDQ t0, acc1
1086	ADCQ AX, acc2
1087	ADCQ $0, DX
1088	MOVQ DX, t0
1089
1090	MOVQ (8*2)(x_ptr), AX
1091	MULQ AX
1092	ADDQ t0, acc3
1093	ADCQ AX, acc4
1094	ADCQ $0, DX
1095	MOVQ DX, t0
1096
1097	MOVQ (8*3)(x_ptr), AX
1098	MULQ AX
1099	ADDQ t0, acc5
1100	ADCQ AX, y_ptr
1101	ADCQ DX, t1
1102	MOVQ t1, x_ptr
1103	// First reduction step
1104	MOVQ acc0, AX
1105	MULQ p256ordK0<>(SB)
1106	MOVQ AX, t0
1107
1108	MOVQ p256ord<>+0x00(SB), AX
1109	MULQ t0
1110	ADDQ AX, acc0
1111	ADCQ $0, DX
1112	MOVQ DX, t1
1113
1114	MOVQ p256ord<>+0x08(SB), AX
1115	MULQ t0
1116	ADDQ t1, acc1
1117	ADCQ $0, DX
1118	ADDQ AX, acc1
1119
1120	MOVQ t0, t1
1121	ADCQ DX, acc2
1122	ADCQ $0, t1
1123	SUBQ t0, acc2
1124	SBBQ $0, t1
1125
1126	MOVQ t0, AX
1127	MOVQ t0, DX
1128	MOVQ t0, acc0
1129	SHLQ $32, AX
1130	SHRQ $32, DX
1131
1132	ADDQ t1, acc3
1133	ADCQ $0, acc0
1134	SUBQ AX, acc3
1135	SBBQ DX, acc0
1136	// Second reduction step
1137	MOVQ acc1, AX
1138	MULQ p256ordK0<>(SB)
1139	MOVQ AX, t0
1140
1141	MOVQ p256ord<>+0x00(SB), AX
1142	MULQ t0
1143	ADDQ AX, acc1
1144	ADCQ $0, DX
1145	MOVQ DX, t1
1146
1147	MOVQ p256ord<>+0x08(SB), AX
1148	MULQ t0
1149	ADDQ t1, acc2
1150	ADCQ $0, DX
1151	ADDQ AX, acc2
1152
1153	MOVQ t0, t1
1154	ADCQ DX, acc3
1155	ADCQ $0, t1
1156	SUBQ t0, acc3
1157	SBBQ $0, t1
1158
1159	MOVQ t0, AX
1160	MOVQ t0, DX
1161	MOVQ t0, acc1
1162	SHLQ $32, AX
1163	SHRQ $32, DX
1164
1165	ADDQ t1, acc0
1166	ADCQ $0, acc1
1167	SUBQ AX, acc0
1168	SBBQ DX, acc1
1169	// Third reduction step
1170	MOVQ acc2, AX
1171	MULQ p256ordK0<>(SB)
1172	MOVQ AX, t0
1173
1174	MOVQ p256ord<>+0x00(SB), AX
1175	MULQ t0
1176	ADDQ AX, acc2
1177	ADCQ $0, DX
1178	MOVQ DX, t1
1179
1180	MOVQ p256ord<>+0x08(SB), AX
1181	MULQ t0
1182	ADDQ t1, acc3
1183	ADCQ $0, DX
1184	ADDQ AX, acc3
1185
1186	MOVQ t0, t1
1187	ADCQ DX, acc0
1188	ADCQ $0, t1
1189	SUBQ t0, acc0
1190	SBBQ $0, t1
1191
1192	MOVQ t0, AX
1193	MOVQ t0, DX
1194	MOVQ t0, acc2
1195	SHLQ $32, AX
1196	SHRQ $32, DX
1197
1198	ADDQ t1, acc1
1199	ADCQ $0, acc2
1200	SUBQ AX, acc1
1201	SBBQ DX, acc2
1202	// Last reduction step
1203	MOVQ acc3, AX
1204	MULQ p256ordK0<>(SB)
1205	MOVQ AX, t0
1206
1207	MOVQ p256ord<>+0x00(SB), AX
1208	MULQ t0
1209	ADDQ AX, acc3
1210	ADCQ $0, DX
1211	MOVQ DX, t1
1212
1213	MOVQ p256ord<>+0x08(SB), AX
1214	MULQ t0
1215	ADDQ t1, acc0
1216	ADCQ $0, DX
1217	ADDQ AX, acc0
1218	ADCQ $0, DX
1219	MOVQ DX, t1
1220
1221	MOVQ t0, t1
1222	ADCQ DX, acc1
1223	ADCQ $0, t1
1224	SUBQ t0, acc1
1225	SBBQ $0, t1
1226
1227	MOVQ t0, AX
1228	MOVQ t0, DX
1229	MOVQ t0, acc3
1230	SHLQ $32, AX
1231	SHRQ $32, DX
1232
1233	ADDQ t1, acc2
1234	ADCQ $0, acc3
1235	SUBQ AX, acc2
1236	SBBQ DX, acc3
1237	XORQ t0, t0
1238	// Add bits [511:256] of the sqr result
1239	ADCQ acc4, acc0
1240	ADCQ acc5, acc1
1241	ADCQ y_ptr, acc2
1242	ADCQ x_ptr, acc3
1243	ADCQ $0, t0
1244
1245	MOVQ acc0, acc4
1246	MOVQ acc1, acc5
1247	MOVQ acc2, y_ptr
1248	MOVQ acc3, t1
	// Subtract the group order
1250	SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
1252	SBBQ p256ord<>+0x10(SB), acc2
1253	SBBQ p256ord<>+0x18(SB), acc3
1254	SBBQ $0, t0
1255
1256	CMOVQCS acc4, acc0
1257	CMOVQCS acc5, acc1
1258	CMOVQCS y_ptr, acc2
1259	CMOVQCS t1, acc3
1260
1261	MOVQ acc0, (8*0)(res_ptr)
1262	MOVQ acc1, (8*1)(res_ptr)
1263	MOVQ acc2, (8*2)(res_ptr)
1264	MOVQ acc3, (8*3)(res_ptr)
1265	MOVQ res_ptr, x_ptr
1266	DECQ BX
1267	JNE ordSqrLoop
1268
1269	RET
1270/* ---------------------------------------*/
1271#undef res_ptr
1272#undef x_ptr
1273#undef y_ptr
1274
1275#undef acc0
1276#undef acc1
1277#undef acc2
1278#undef acc3
1279#undef acc4
1280#undef acc5
1281#undef t0
1282#undef t1
1283/* ---------------------------------------*/
1284#define mul0 AX
1285#define mul1 DX
1286#define acc0 BX
1287#define acc1 CX
1288#define acc2 R8
1289#define acc3 R9
1290#define acc4 R10
1291#define acc5 R11
1292#define acc6 R12
1293#define acc7 R13
1294#define t0 R14
1295#define t1 R15
1296#define t2 DI
1297#define t3 SI
1298#define hlp BP
1299/* ---------------------------------------*/
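// The p256{Sub,Mul,Sqr}Internal routines below use a register-only calling
// convention: the first operand and the result live in acc4..acc7, the
// second operand (where there is one) in t0..t3, and acc0..acc3, mul0, mul1
// and hlp are used as scratch.
//
// p256SubInternal computes acc4..acc7 = (acc4..acc7 - t0..t3) mod p: it
// subtracts, speculatively adds p back, and selects the right value with
// CMOVs so the control flow does not depend on the data.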
1300TEXT p256SubInternal(SB),NOSPLIT,$0
1301	XORQ mul0, mul0
1302	SUBQ t0, acc4
1303	SBBQ t1, acc5
1304	SBBQ t2, acc6
1305	SBBQ t3, acc7
1306	SBBQ $0, mul0
1307
1308	MOVQ acc4, acc0
1309	MOVQ acc5, acc1
1310	MOVQ acc6, acc2
1311	MOVQ acc7, acc3
1312
1313	ADDQ $-1, acc4
1314	ADCQ p256const0<>(SB), acc5
1315	ADCQ $0, acc6
1316	ADCQ p256const1<>(SB), acc7
1317	ANDQ $1, mul0
1318
1319	CMOVQEQ acc0, acc4
1320	CMOVQEQ acc1, acc5
1321	CMOVQEQ acc2, acc6
1322	CMOVQEQ acc3, acc7
1323
1324	RET
1325/* ---------------------------------------*/
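// p256MulInternal computes the Montgomery product
// acc4..acc7 = acc4..acc7 * t0..t3 * 2^-256 mod p. t0..t3 are preserved.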
1326TEXT p256MulInternal(SB),NOSPLIT,$0
1327	MOVQ acc4, mul0
1328	MULQ t0
1329	MOVQ mul0, acc0
1330	MOVQ mul1, acc1
1331
1332	MOVQ acc4, mul0
1333	MULQ t1
1334	ADDQ mul0, acc1
1335	ADCQ $0, mul1
1336	MOVQ mul1, acc2
1337
1338	MOVQ acc4, mul0
1339	MULQ t2
1340	ADDQ mul0, acc2
1341	ADCQ $0, mul1
1342	MOVQ mul1, acc3
1343
1344	MOVQ acc4, mul0
1345	MULQ t3
1346	ADDQ mul0, acc3
1347	ADCQ $0, mul1
1348	MOVQ mul1, acc4
1349
1350	MOVQ acc5, mul0
1351	MULQ t0
1352	ADDQ mul0, acc1
1353	ADCQ $0, mul1
1354	MOVQ mul1, hlp
1355
1356	MOVQ acc5, mul0
1357	MULQ t1
1358	ADDQ hlp, acc2
1359	ADCQ $0, mul1
1360	ADDQ mul0, acc2
1361	ADCQ $0, mul1
1362	MOVQ mul1, hlp
1363
1364	MOVQ acc5, mul0
1365	MULQ t2
1366	ADDQ hlp, acc3
1367	ADCQ $0, mul1
1368	ADDQ mul0, acc3
1369	ADCQ $0, mul1
1370	MOVQ mul1, hlp
1371
1372	MOVQ acc5, mul0
1373	MULQ t3
1374	ADDQ hlp, acc4
1375	ADCQ $0, mul1
1376	ADDQ mul0, acc4
1377	ADCQ $0, mul1
1378	MOVQ mul1, acc5
1379
1380	MOVQ acc6, mul0
1381	MULQ t0
1382	ADDQ mul0, acc2
1383	ADCQ $0, mul1
1384	MOVQ mul1, hlp
1385
1386	MOVQ acc6, mul0
1387	MULQ t1
1388	ADDQ hlp, acc3
1389	ADCQ $0, mul1
1390	ADDQ mul0, acc3
1391	ADCQ $0, mul1
1392	MOVQ mul1, hlp
1393
1394	MOVQ acc6, mul0
1395	MULQ t2
1396	ADDQ hlp, acc4
1397	ADCQ $0, mul1
1398	ADDQ mul0, acc4
1399	ADCQ $0, mul1
1400	MOVQ mul1, hlp
1401
1402	MOVQ acc6, mul0
1403	MULQ t3
1404	ADDQ hlp, acc5
1405	ADCQ $0, mul1
1406	ADDQ mul0, acc5
1407	ADCQ $0, mul1
1408	MOVQ mul1, acc6
1409
1410	MOVQ acc7, mul0
1411	MULQ t0
1412	ADDQ mul0, acc3
1413	ADCQ $0, mul1
1414	MOVQ mul1, hlp
1415
1416	MOVQ acc7, mul0
1417	MULQ t1
1418	ADDQ hlp, acc4
1419	ADCQ $0, mul1
1420	ADDQ mul0, acc4
1421	ADCQ $0, mul1
1422	MOVQ mul1, hlp
1423
1424	MOVQ acc7, mul0
1425	MULQ t2
1426	ADDQ hlp, acc5
1427	ADCQ $0, mul1
1428	ADDQ mul0, acc5
1429	ADCQ $0, mul1
1430	MOVQ mul1, hlp
1431
1432	MOVQ acc7, mul0
1433	MULQ t3
1434	ADDQ hlp, acc6
1435	ADCQ $0, mul1
1436	ADDQ mul0, acc6
1437	ADCQ $0, mul1
1438	MOVQ mul1, acc7
1439	// First reduction step
1440	MOVQ acc0, mul0
1441	MOVQ acc0, hlp
1442	SHLQ $32, acc0
1443	MULQ p256const1<>(SB)
1444	SHRQ $32, hlp
1445	ADDQ acc0, acc1
1446	ADCQ hlp, acc2
1447	ADCQ mul0, acc3
1448	ADCQ $0, mul1
1449	MOVQ mul1, acc0
1450	// Second reduction step
1451	MOVQ acc1, mul0
1452	MOVQ acc1, hlp
1453	SHLQ $32, acc1
1454	MULQ p256const1<>(SB)
1455	SHRQ $32, hlp
1456	ADDQ acc1, acc2
1457	ADCQ hlp, acc3
1458	ADCQ mul0, acc0
1459	ADCQ $0, mul1
1460	MOVQ mul1, acc1
1461	// Third reduction step
1462	MOVQ acc2, mul0
1463	MOVQ acc2, hlp
1464	SHLQ $32, acc2
1465	MULQ p256const1<>(SB)
1466	SHRQ $32, hlp
1467	ADDQ acc2, acc3
1468	ADCQ hlp, acc0
1469	ADCQ mul0, acc1
1470	ADCQ $0, mul1
1471	MOVQ mul1, acc2
1472	// Last reduction step
1473	MOVQ acc3, mul0
1474	MOVQ acc3, hlp
1475	SHLQ $32, acc3
1476	MULQ p256const1<>(SB)
1477	SHRQ $32, hlp
1478	ADDQ acc3, acc0
1479	ADCQ hlp, acc1
1480	ADCQ mul0, acc2
1481	ADCQ $0, mul1
1482	MOVQ mul1, acc3
1483	BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
1484	// Add bits [511:256] of the result
1485	ADCQ acc0, acc4
1486	ADCQ acc1, acc5
1487	ADCQ acc2, acc6
1488	ADCQ acc3, acc7
1489	ADCQ $0, hlp
1490	// Copy result
1491	MOVQ acc4, acc0
1492	MOVQ acc5, acc1
1493	MOVQ acc6, acc2
1494	MOVQ acc7, acc3
1495	// Subtract p256
1496	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
1498	SBBQ $0, acc6
1499	SBBQ p256const1<>(SB), acc7
1500	SBBQ $0, hlp
1501	// If the result of the subtraction is negative, restore the previous result
1502	CMOVQCS acc0, acc4
1503	CMOVQCS acc1, acc5
1504	CMOVQCS acc2, acc6
1505	CMOVQCS acc3, acc7
1506
1507	RET
1508/* ---------------------------------------*/
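// p256SqrInternal computes the Montgomery square
// acc4..acc7 = acc4..acc7 * acc4..acc7 * 2^-256 mod p. Unlike
// p256MulInternal, t0..t3 are used as scratch and are clobbered.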
1509TEXT p256SqrInternal(SB),NOSPLIT,$0
1510
1511	MOVQ acc4, mul0
1512	MULQ acc5
1513	MOVQ mul0, acc1
1514	MOVQ mul1, acc2
1515
1516	MOVQ acc4, mul0
1517	MULQ acc6
1518	ADDQ mul0, acc2
1519	ADCQ $0, mul1
1520	MOVQ mul1, acc3
1521
1522	MOVQ acc4, mul0
1523	MULQ acc7
1524	ADDQ mul0, acc3
1525	ADCQ $0, mul1
1526	MOVQ mul1, t0
1527
1528	MOVQ acc5, mul0
1529	MULQ acc6
1530	ADDQ mul0, acc3
1531	ADCQ $0, mul1
1532	MOVQ mul1, hlp
1533
1534	MOVQ acc5, mul0
1535	MULQ acc7
1536	ADDQ hlp, t0
1537	ADCQ $0, mul1
1538	ADDQ mul0, t0
1539	ADCQ $0, mul1
1540	MOVQ mul1, t1
1541
1542	MOVQ acc6, mul0
1543	MULQ acc7
1544	ADDQ mul0, t1
1545	ADCQ $0, mul1
1546	MOVQ mul1, t2
1547	XORQ t3, t3
1548	// *2
1549	ADDQ acc1, acc1
1550	ADCQ acc2, acc2
1551	ADCQ acc3, acc3
1552	ADCQ t0, t0
1553	ADCQ t1, t1
1554	ADCQ t2, t2
1555	ADCQ $0, t3
1556	// Missing products
1557	MOVQ acc4, mul0
1558	MULQ mul0
1559	MOVQ mul0, acc0
1560	MOVQ DX, acc4
1561
1562	MOVQ acc5, mul0
1563	MULQ mul0
1564	ADDQ acc4, acc1
1565	ADCQ mul0, acc2
1566	ADCQ $0, DX
1567	MOVQ DX, acc4
1568
1569	MOVQ acc6, mul0
1570	MULQ mul0
1571	ADDQ acc4, acc3
1572	ADCQ mul0, t0
1573	ADCQ $0, DX
1574	MOVQ DX, acc4
1575
1576	MOVQ acc7, mul0
1577	MULQ mul0
1578	ADDQ acc4, t1
1579	ADCQ mul0, t2
1580	ADCQ DX, t3
1581	// First reduction step
1582	MOVQ acc0, mul0
1583	MOVQ acc0, hlp
1584	SHLQ $32, acc0
1585	MULQ p256const1<>(SB)
1586	SHRQ $32, hlp
1587	ADDQ acc0, acc1
1588	ADCQ hlp, acc2
1589	ADCQ mul0, acc3
1590	ADCQ $0, mul1
1591	MOVQ mul1, acc0
1592	// Second reduction step
1593	MOVQ acc1, mul0
1594	MOVQ acc1, hlp
1595	SHLQ $32, acc1
1596	MULQ p256const1<>(SB)
1597	SHRQ $32, hlp
1598	ADDQ acc1, acc2
1599	ADCQ hlp, acc3
1600	ADCQ mul0, acc0
1601	ADCQ $0, mul1
1602	MOVQ mul1, acc1
1603	// Third reduction step
1604	MOVQ acc2, mul0
1605	MOVQ acc2, hlp
1606	SHLQ $32, acc2
1607	MULQ p256const1<>(SB)
1608	SHRQ $32, hlp
1609	ADDQ acc2, acc3
1610	ADCQ hlp, acc0
1611	ADCQ mul0, acc1
1612	ADCQ $0, mul1
1613	MOVQ mul1, acc2
1614	// Last reduction step
1615	MOVQ acc3, mul0
1616	MOVQ acc3, hlp
1617	SHLQ $32, acc3
1618	MULQ p256const1<>(SB)
1619	SHRQ $32, hlp
1620	ADDQ acc3, acc0
1621	ADCQ hlp, acc1
1622	ADCQ mul0, acc2
1623	ADCQ $0, mul1
1624	MOVQ mul1, acc3
1625	BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00   // MOVQ $0, BP
1626	// Add bits [511:256] of the result
1627	ADCQ acc0, t0
1628	ADCQ acc1, t1
1629	ADCQ acc2, t2
1630	ADCQ acc3, t3
1631	ADCQ $0, hlp
1632	// Copy result
1633	MOVQ t0, acc4
1634	MOVQ t1, acc5
1635	MOVQ t2, acc6
1636	MOVQ t3, acc7
1637	// Subtract p256
1638	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
1640	SBBQ $0, acc6
1641	SBBQ p256const1<>(SB), acc7
1642	SBBQ $0, hlp
1643	// If the result of the subtraction is negative, restore the previous result
1644	CMOVQCS t0, acc4
1645	CMOVQCS t1, acc5
1646	CMOVQCS t2, acc6
1647	CMOVQCS t3, acc7
1648
1649	RET
1650/* ---------------------------------------*/
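// p256MulBy2Inline doubles the field element in acc4..acc7 modulo p. Note
// that the reduced result is left in t0..t3, while acc4..acc7 keep the
// unreduced sum.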
1651#define p256MulBy2Inline\
1652	XORQ mul0, mul0;\
1653	ADDQ acc4, acc4;\
1654	ADCQ acc5, acc5;\
1655	ADCQ acc6, acc6;\
1656	ADCQ acc7, acc7;\
1657	ADCQ $0, mul0;\
1658	MOVQ acc4, t0;\
1659	MOVQ acc5, t1;\
1660	MOVQ acc6, t2;\
1661	MOVQ acc7, t3;\
1662	SUBQ $-1, t0;\
1663	SBBQ p256const0<>(SB), t1;\
1664	SBBQ $0, t2;\
1665	SBBQ p256const1<>(SB), t3;\
1666	SBBQ $0, mul0;\
1667	CMOVQCS acc4, t0;\
1668	CMOVQCS acc5, t1;\
1669	CMOVQCS acc6, t2;\
1670	CMOVQCS acc7, t3;
1671/* ---------------------------------------*/
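// p256AddInline adds t0..t3 to acc4..acc7 modulo p. As with
// p256MulBy2Inline, the reduced result ends up in t0..t3.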
1672#define p256AddInline \
1673	XORQ mul0, mul0;\
1674	ADDQ t0, acc4;\
1675	ADCQ t1, acc5;\
1676	ADCQ t2, acc6;\
1677	ADCQ t3, acc7;\
1678	ADCQ $0, mul0;\
1679	MOVQ acc4, t0;\
1680	MOVQ acc5, t1;\
1681	MOVQ acc6, t2;\
1682	MOVQ acc7, t3;\
1683	SUBQ $-1, t0;\
1684	SBBQ p256const0<>(SB), t1;\
1685	SBBQ $0, t2;\
1686	SBBQ p256const1<>(SB), t3;\
1687	SBBQ $0, mul0;\
1688	CMOVQCS acc4, t0;\
1689	CMOVQCS acc5, t1;\
1690	CMOVQCS acc6, t2;\
1691	CMOVQCS acc7, t3;
1692/* ---------------------------------------*/
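// Helpers for moving a 256-bit value between a 32-byte stack slot and the
// register groups: LDacc/ST use acc4..acc7, LDt/STt use t0..t3, and
// acc2t/t2acc copy between the two groups.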
1693#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
1694#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
1695#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
1696#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
1697#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
1698#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
1699/* ---------------------------------------*/
1700#define x1in(off) (32*0 + off)(SP)
1701#define y1in(off) (32*1 + off)(SP)
1702#define z1in(off) (32*2 + off)(SP)
1703#define x2in(off) (32*3 + off)(SP)
1704#define y2in(off) (32*4 + off)(SP)
1705#define xout(off) (32*5 + off)(SP)
1706#define yout(off) (32*6 + off)(SP)
1707#define zout(off) (32*7 + off)(SP)
1708#define s2(off)   (32*8 + off)(SP)
1709#define z1sqr(off) (32*9 + off)(SP)
1710#define h(off)	  (32*10 + off)(SP)
1711#define r(off)	  (32*11 + off)(SP)
1712#define hsqr(off) (32*12 + off)(SP)
1713#define rsqr(off) (32*13 + off)(SP)
1714#define hcub(off) (32*14 + off)(SP)
1715#define rptr	  (32*15)(SP)
1716#define sel_save  (32*15 + 8)(SP)
1717#define zero_save (32*15 + 8 + 4)(SP)
1718
1719// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
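// All coordinates are in Montgomery form. The second input is an affine
// point (x2, y2) with an implicit z2 = 1, so u1 = x1, s1 = y1 and no z2
// terms appear. If sign != 0, y2 is negated modulo p first. The mixed
// Jacobian-affine addition formulas computed below are
//   z1z1 = z1^2        u2 = x2*z1z1        s2 = y2*z1*z1z1
//   h  = u2 - x1       r  = s2 - y1
//   x3 = r^2 - h^3 - 2*x1*h^2
//   y3 = r*(x1*h^2 - x3) - y1*h^3
//   z3 = h*z1
// These formulas do not cover doubling or infinity inputs; the blend at the
// end substitutes in1 when sel == 0 and (x2, y2, 1) when zero == 0.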
1720TEXT ·p256PointAddAffineAsm(SB),0,$512-96
1721	// Move input to stack in order to free registers
1722	MOVQ res+0(FP), AX
1723	MOVQ in1+24(FP), BX
1724	MOVQ in2+48(FP), CX
1725	MOVQ sign+72(FP), DX
1726	MOVQ sel+80(FP), t1
1727	MOVQ zero+88(FP), t2
1728
1729	MOVOU (16*0)(BX), X0
1730	MOVOU (16*1)(BX), X1
1731	MOVOU (16*2)(BX), X2
1732	MOVOU (16*3)(BX), X3
1733	MOVOU (16*4)(BX), X4
1734	MOVOU (16*5)(BX), X5
1735
1736	MOVOU X0, x1in(16*0)
1737	MOVOU X1, x1in(16*1)
1738	MOVOU X2, y1in(16*0)
1739	MOVOU X3, y1in(16*1)
1740	MOVOU X4, z1in(16*0)
1741	MOVOU X5, z1in(16*1)
1742
1743	MOVOU (16*0)(CX), X0
1744	MOVOU (16*1)(CX), X1
1745
1746	MOVOU X0, x2in(16*0)
1747	MOVOU X1, x2in(16*1)
1748	// Store pointer to result
1749	MOVQ mul0, rptr
1750	MOVL t1, sel_save
1751	MOVL t2, zero_save
1752	// Negate y2in based on sign
1753	MOVQ (16*2 + 8*0)(CX), acc4
1754	MOVQ (16*2 + 8*1)(CX), acc5
1755	MOVQ (16*2 + 8*2)(CX), acc6
1756	MOVQ (16*2 + 8*3)(CX), acc7
1757	MOVQ $-1, acc0
1758	MOVQ p256const0<>(SB), acc1
1759	MOVQ $0, acc2
1760	MOVQ p256const1<>(SB), acc3
1761	XORQ mul0, mul0
1762	// Speculatively subtract
1763	SUBQ acc4, acc0
1764	SBBQ acc5, acc1
1765	SBBQ acc6, acc2
1766	SBBQ acc7, acc3
1767	SBBQ $0, mul0
1768	MOVQ acc0, t0
1769	MOVQ acc1, t1
1770	MOVQ acc2, t2
1771	MOVQ acc3, t3
1772	// Add in case the operand was > p256
1773	ADDQ $-1, acc0
1774	ADCQ p256const0<>(SB), acc1
1775	ADCQ $0, acc2
1776	ADCQ p256const1<>(SB), acc3
1777	ADCQ $0, mul0
1778	CMOVQNE t0, acc0
1779	CMOVQNE t1, acc1
1780	CMOVQNE t2, acc2
1781	CMOVQNE t3, acc3
1782	// If condition is 0, keep original value
1783	TESTQ DX, DX
1784	CMOVQEQ acc4, acc0
1785	CMOVQEQ acc5, acc1
1786	CMOVQEQ acc6, acc2
1787	CMOVQEQ acc7, acc3
1788	// Store result
1789	MOVQ acc0, y2in(8*0)
1790	MOVQ acc1, y2in(8*1)
1791	MOVQ acc2, y2in(8*2)
1792	MOVQ acc3, y2in(8*3)
1793	// Begin point add
1794	LDacc (z1in)
1795	CALL p256SqrInternal(SB)	// z1ˆ2
1796	ST (z1sqr)
1797
1798	LDt (x2in)
1799	CALL p256MulInternal(SB)	// x2 * z1ˆ2
1800
1801	LDt (x1in)
1802	CALL p256SubInternal(SB)	// h = u2 - u1
1803	ST (h)
1804
1805	LDt (z1in)
1806	CALL p256MulInternal(SB)	// z3 = h * z1
1807	ST (zout)
1808
1809	LDacc (z1sqr)
1810	CALL p256MulInternal(SB)	// z1ˆ3
1811
1812	LDt (y2in)
1813	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
1814	ST (s2)
1815
1816	LDt (y1in)
1817	CALL p256SubInternal(SB)	// r = s2 - s1
1818	ST (r)
1819
1820	CALL p256SqrInternal(SB)	// rsqr = rˆ2
1821	ST (rsqr)
1822
1823	LDacc (h)
1824	CALL p256SqrInternal(SB)	// hsqr = hˆ2
1825	ST (hsqr)
1826
1827	LDt (h)
1828	CALL p256MulInternal(SB)	// hcub = hˆ3
1829	ST (hcub)
1830
1831	LDt (y1in)
1832	CALL p256MulInternal(SB)	// y1 * hˆ3
1833	ST (s2)
1834
1835	LDacc (x1in)
1836	LDt (hsqr)
1837	CALL p256MulInternal(SB)	// u1 * hˆ2
1838	ST (h)
1839
1840	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
1841	LDacc (rsqr)
1842	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
1843
1844	LDt (hcub)
1845	CALL p256SubInternal(SB)
1846	ST (xout)
1847
1848	MOVQ acc4, t0
1849	MOVQ acc5, t1
1850	MOVQ acc6, t2
1851	MOVQ acc7, t3
1852	LDacc (h)
1853	CALL p256SubInternal(SB)
1854
1855	LDt (r)
1856	CALL p256MulInternal(SB)
1857
1858	LDt (s2)
1859	CALL p256SubInternal(SB)
1860	ST (yout)
1861	// Load stored values from stack
1862	MOVQ rptr, AX
1863	MOVL sel_save, BX
1864	MOVL zero_save, CX
	// If sel == 0, the computed point is not valid; conditionally replace it with in1
1866	MOVOU xout(16*0), X0
1867	MOVOU xout(16*1), X1
1868	MOVOU yout(16*0), X2
1869	MOVOU yout(16*1), X3
1870	MOVOU zout(16*0), X4
1871	MOVOU zout(16*1), X5
1872
1873	MOVL BX, X6
1874	MOVL CX, X7
1875
1876	PXOR X8, X8
1877	PCMPEQL X9, X9
1878
1879	PSHUFD $0, X6, X6
1880	PSHUFD $0, X7, X7
1881
1882	PCMPEQL X8, X6
1883	PCMPEQL X8, X7
1884
1885	MOVOU X6, X15
1886	PANDN X9, X15
1887
1888	MOVOU x1in(16*0), X9
1889	MOVOU x1in(16*1), X10
1890	MOVOU y1in(16*0), X11
1891	MOVOU y1in(16*1), X12
1892	MOVOU z1in(16*0), X13
1893	MOVOU z1in(16*1), X14
1894
1895	PAND X15, X0
1896	PAND X15, X1
1897	PAND X15, X2
1898	PAND X15, X3
1899	PAND X15, X4
1900	PAND X15, X5
1901
1902	PAND X6, X9
1903	PAND X6, X10
1904	PAND X6, X11
1905	PAND X6, X12
1906	PAND X6, X13
1907	PAND X6, X14
1908
1909	PXOR X9, X0
1910	PXOR X10, X1
1911	PXOR X11, X2
1912	PXOR X12, X3
1913	PXOR X13, X4
1914	PXOR X14, X5
	// Similarly, if zero == 0 the first input was the point at infinity, so use (x2, y2, 1) instead
1916	PCMPEQL X9, X9
1917	MOVOU X7, X15
1918	PANDN X9, X15
1919
1920	MOVOU x2in(16*0), X9
1921	MOVOU x2in(16*1), X10
1922	MOVOU y2in(16*0), X11
1923	MOVOU y2in(16*1), X12
1924	MOVOU p256one<>+0x00(SB), X13
1925	MOVOU p256one<>+0x10(SB), X14
1926
1927	PAND X15, X0
1928	PAND X15, X1
1929	PAND X15, X2
1930	PAND X15, X3
1931	PAND X15, X4
1932	PAND X15, X5
1933
1934	PAND X7, X9
1935	PAND X7, X10
1936	PAND X7, X11
1937	PAND X7, X12
1938	PAND X7, X13
1939	PAND X7, X14
1940
1941	PXOR X9, X0
1942	PXOR X10, X1
1943	PXOR X11, X2
1944	PXOR X12, X3
1945	PXOR X13, X4
1946	PXOR X14, X5
1947	// Finally output the result
1948	MOVOU X0, (16*0)(AX)
1949	MOVOU X1, (16*1)(AX)
1950	MOVOU X2, (16*2)(AX)
1951	MOVOU X3, (16*3)(AX)
1952	MOVOU X4, (16*4)(AX)
1953	MOVOU X5, (16*5)(AX)
1954	MOVQ $0, rptr
1955
1956	RET
1957#undef x1in
1958#undef y1in
1959#undef z1in
1960#undef x2in
1961#undef y2in
1962#undef xout
1963#undef yout
1964#undef zout
1965#undef s2
1966#undef z1sqr
1967#undef h
1968#undef r
1969#undef hsqr
1970#undef rsqr
1971#undef hcub
1972#undef rptr
1973#undef sel_save
1974#undef zero_save
1975/* ---------------------------------------*/
1976#define x1in(off) (32*0 + off)(SP)
1977#define y1in(off) (32*1 + off)(SP)
1978#define z1in(off) (32*2 + off)(SP)
1979#define x2in(off) (32*3 + off)(SP)
1980#define y2in(off) (32*4 + off)(SP)
1981#define z2in(off) (32*5 + off)(SP)
1982
1983#define xout(off) (32*6 + off)(SP)
1984#define yout(off) (32*7 + off)(SP)
1985#define zout(off) (32*8 + off)(SP)
1986
1987#define u1(off)    (32*9 + off)(SP)
1988#define u2(off)    (32*10 + off)(SP)
1989#define s1(off)    (32*11 + off)(SP)
1990#define s2(off)    (32*12 + off)(SP)
1991#define z1sqr(off) (32*13 + off)(SP)
1992#define z2sqr(off) (32*14 + off)(SP)
1993#define h(off)     (32*15 + off)(SP)
1994#define r(off)     (32*16 + off)(SP)
1995#define hsqr(off)  (32*17 + off)(SP)
1996#define rsqr(off)  (32*18 + off)(SP)
1997#define hcub(off)  (32*19 + off)(SP)
1998#define rptr       (32*20)(SP)
1999
// func p256PointAddAsm(res, in1, in2 []uint64)
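// All coordinates are in Montgomery form. The full Jacobian addition
// formulas computed below are
//   u1 = x1*z2^2   u2 = x2*z1^2   s1 = y1*z2^3   s2 = y2*z1^3
//   h  = u2 - u1   r  = s2 - s1
//   x3 = r^2 - h^3 - 2*u1*h^2
//   y3 = r*(u1*h^2 - x3) - s1*h^3
//   z3 = z1*z2*h
// The doubling and point-at-infinity cases are not handled by these formulas.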
2001TEXT ·p256PointAddAsm(SB),0,$672-72
2002	// Move input to stack in order to free registers
2003	MOVQ res+0(FP), AX
2004	MOVQ in1+24(FP), BX
2005	MOVQ in2+48(FP), CX
2006
2007	MOVOU (16*0)(BX), X0
2008	MOVOU (16*1)(BX), X1
2009	MOVOU (16*2)(BX), X2
2010	MOVOU (16*3)(BX), X3
2011	MOVOU (16*4)(BX), X4
2012	MOVOU (16*5)(BX), X5
2013
2014	MOVOU X0, x1in(16*0)
2015	MOVOU X1, x1in(16*1)
2016	MOVOU X2, y1in(16*0)
2017	MOVOU X3, y1in(16*1)
2018	MOVOU X4, z1in(16*0)
2019	MOVOU X5, z1in(16*1)
2020
2021	MOVOU (16*0)(CX), X0
2022	MOVOU (16*1)(CX), X1
2023	MOVOU (16*2)(CX), X2
2024	MOVOU (16*3)(CX), X3
2025	MOVOU (16*4)(CX), X4
2026	MOVOU (16*5)(CX), X5
2027
2028	MOVOU X0, x2in(16*0)
2029	MOVOU X1, x2in(16*1)
2030	MOVOU X2, y2in(16*0)
2031	MOVOU X3, y2in(16*1)
2032	MOVOU X4, z2in(16*0)
2033	MOVOU X5, z2in(16*1)
2034	// Store pointer to result
2035	MOVQ AX, rptr
2036	// Begin point add
2037	LDacc (z2in)
2038	CALL p256SqrInternal(SB)	// z2ˆ2
2039	ST (z2sqr)
2040	LDt (z2in)
2041	CALL p256MulInternal(SB)	// z2ˆ3
2042	LDt (y1in)
2043	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
2044	ST (s1)
2045
2046	LDacc (z1in)
2047	CALL p256SqrInternal(SB)	// z1ˆ2
2048	ST (z1sqr)
2049	LDt (z1in)
2050	CALL p256MulInternal(SB)	// z1ˆ3
2051	LDt (y2in)
2052	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
2053	ST (s2)
2054
2055	LDt (s1)
2056	CALL p256SubInternal(SB)	// r = s2 - s1
2057	ST (r)
2058
2059	LDacc (z2sqr)
2060	LDt (x1in)
2061	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
2062	ST (u1)
2063	LDacc (z1sqr)
2064	LDt (x2in)
2065	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
2066	ST (u2)
2067
2068	LDt (u1)
2069	CALL p256SubInternal(SB)	// h = u2 - u1
2070	ST (h)
2071
2072	LDacc (r)
2073	CALL p256SqrInternal(SB)	// rsqr = rˆ2
2074	ST (rsqr)
2075
2076	LDacc (h)
2077	CALL p256SqrInternal(SB)	// hsqr = hˆ2
2078	ST (hsqr)
2079
2080	LDt (h)
2081	CALL p256MulInternal(SB)	// hcub = hˆ3
2082	ST (hcub)
2083
2084	LDt (s1)
2085	CALL p256MulInternal(SB)
2086	ST (s2)
2087
2088	LDacc (z1in)
2089	LDt (z2in)
2090	CALL p256MulInternal(SB)	// z1 * z2
2091	LDt (h)
2092	CALL p256MulInternal(SB)	// z1 * z2 * h
2093	ST (zout)
2094
2095	LDacc (hsqr)
2096	LDt (u1)
2097	CALL p256MulInternal(SB)	// hˆ2 * u1
2098	ST (u2)
2099
2100	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
2101	LDacc (rsqr)
2102	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
2103
2104	LDt (hcub)
2105	CALL p256SubInternal(SB)
2106	ST (xout)
2107
2108	MOVQ acc4, t0
2109	MOVQ acc5, t1
2110	MOVQ acc6, t2
2111	MOVQ acc7, t3
2112	LDacc (u2)
2113	CALL p256SubInternal(SB)
2114
2115	LDt (r)
2116	CALL p256MulInternal(SB)
2117
2118	LDt (s2)
2119	CALL p256SubInternal(SB)
2120	ST (yout)
2121
2122	MOVOU xout(16*0), X0
2123	MOVOU xout(16*1), X1
2124	MOVOU yout(16*0), X2
2125	MOVOU yout(16*1), X3
2126	MOVOU zout(16*0), X4
2127	MOVOU zout(16*1), X5
2128	// Finally output the result
2129	MOVQ rptr, AX
2130	MOVQ $0, rptr
2131	MOVOU X0, (16*0)(AX)
2132	MOVOU X1, (16*1)(AX)
2133	MOVOU X2, (16*2)(AX)
2134	MOVOU X3, (16*3)(AX)
2135	MOVOU X4, (16*4)(AX)
2136	MOVOU X5, (16*5)(AX)
2137
2138	RET
2139#undef x1in
2140#undef y1in
2141#undef z1in
2142#undef x2in
2143#undef y2in
2144#undef z2in
2145#undef xout
2146#undef yout
2147#undef zout
2148#undef s1
2149#undef s2
2150#undef u1
2151#undef u2
2152#undef z1sqr
2153#undef z2sqr
2154#undef h
2155#undef r
2156#undef hsqr
2157#undef rsqr
2158#undef hcub
2159#undef rptr
2160/* ---------------------------------------*/
2161#define x(off) (32*0 + off)(SP)
2162#define y(off) (32*1 + off)(SP)
2163#define z(off) (32*2 + off)(SP)
2164
2165#define s(off)	(32*3 + off)(SP)
2166#define m(off)	(32*4 + off)(SP)
2167#define zsqr(off) (32*5 + off)(SP)
2168#define tmp(off)  (32*6 + off)(SP)
2169#define rptr	  (32*7)(SP)
2170
// func p256PointDoubleAsm(res, in []uint64)
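// All coordinates are in Montgomery form. Jacobian doubling, using the a = -3
// shortcut m = 3*(x - z^2)*(x + z^2):
//   s  = 4*x*y^2
//   x3 = m^2 - 2*s
//   y3 = m*(s - x3) - 8*y^4
//   z3 = 2*y*z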
2172TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
2173	// Move input to stack in order to free registers
2174	MOVQ res+0(FP), AX
2175	MOVQ in+24(FP), BX
2176
2177	MOVOU (16*0)(BX), X0
2178	MOVOU (16*1)(BX), X1
2179	MOVOU (16*2)(BX), X2
2180	MOVOU (16*3)(BX), X3
2181	MOVOU (16*4)(BX), X4
2182	MOVOU (16*5)(BX), X5
2183
2184	MOVOU X0, x(16*0)
2185	MOVOU X1, x(16*1)
2186	MOVOU X2, y(16*0)
2187	MOVOU X3, y(16*1)
2188	MOVOU X4, z(16*0)
2189	MOVOU X5, z(16*1)
2190	// Store pointer to result
2191	MOVQ AX, rptr
2192	// Begin point double
2193	LDacc (z)
2194	CALL p256SqrInternal(SB)
2195	ST (zsqr)
2196
2197	LDt (x)
2198	p256AddInline
2199	STt (m)
2200
2201	LDacc (z)
2202	LDt (y)
2203	CALL p256MulInternal(SB)
2204	p256MulBy2Inline
2205	MOVQ rptr, AX
2206	// Store z
2207	MOVQ t0, (16*4 + 8*0)(AX)
2208	MOVQ t1, (16*4 + 8*1)(AX)
2209	MOVQ t2, (16*4 + 8*2)(AX)
2210	MOVQ t3, (16*4 + 8*3)(AX)
2211
2212	LDacc (x)
2213	LDt (zsqr)
2214	CALL p256SubInternal(SB)
2215	LDt (m)
2216	CALL p256MulInternal(SB)
2217	ST (m)
2218	// Multiply by 3
2219	p256MulBy2Inline
2220	LDacc (m)
2221	p256AddInline
2222	STt (m)
2223	////////////////////////
2224	LDacc (y)
2225	p256MulBy2Inline
2226	t2acc
2227	CALL p256SqrInternal(SB)
2228	ST (s)
2229	CALL p256SqrInternal(SB)
2230	// Divide by 2
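	// Halve acc4..acc7 modulo p: if the value is odd, add p first so the
	// sum is even, then shift the 257-bit result (carry in mul0) right by
	// one limb-wide step. The CMOVs and the AND keep this constant time.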
2231	XORQ mul0, mul0
2232	MOVQ acc4, t0
2233	MOVQ acc5, t1
2234	MOVQ acc6, t2
2235	MOVQ acc7, t3
2236
2237	ADDQ $-1, acc4
2238	ADCQ p256const0<>(SB), acc5
2239	ADCQ $0, acc6
2240	ADCQ p256const1<>(SB), acc7
2241	ADCQ $0, mul0
2242	TESTQ $1, t0
2243
2244	CMOVQEQ t0, acc4
2245	CMOVQEQ t1, acc5
2246	CMOVQEQ t2, acc6
2247	CMOVQEQ t3, acc7
2248	ANDQ t0, mul0
2249
2250	SHRQ $1, acc4:acc5
2251	SHRQ $1, acc5:acc6
2252	SHRQ $1, acc6:acc7
2253	SHRQ $1, acc7:mul0
2254	ST (y)
2255	/////////////////////////
2256	LDacc (x)
2257	LDt (s)
2258	CALL p256MulInternal(SB)
2259	ST (s)
2260	p256MulBy2Inline
2261	STt (tmp)
2262
2263	LDacc (m)
2264	CALL p256SqrInternal(SB)
2265	LDt (tmp)
2266	CALL p256SubInternal(SB)
2267
2268	MOVQ rptr, AX
2269	// Store x
2270	MOVQ acc4, (16*0 + 8*0)(AX)
2271	MOVQ acc5, (16*0 + 8*1)(AX)
2272	MOVQ acc6, (16*0 + 8*2)(AX)
2273	MOVQ acc7, (16*0 + 8*3)(AX)
2274
2275	acc2t
2276	LDacc (s)
2277	CALL p256SubInternal(SB)
2278
2279	LDt (m)
2280	CALL p256MulInternal(SB)
2281
2282	LDt (y)
2283	CALL p256SubInternal(SB)
2284	MOVQ rptr, AX
2285	// Store y
2286	MOVQ acc4, (16*2 + 8*0)(AX)
2287	MOVQ acc5, (16*2 + 8*1)(AX)
2288	MOVQ acc6, (16*2 + 8*2)(AX)
2289	MOVQ acc7, (16*2 + 8*3)(AX)
2290	///////////////////////
2291	MOVQ $0, rptr
2292
2293	RET
2294/* ---------------------------------------*/
2295
2296