1#! /usr/bin/env perl
2# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is for
# ECDSA sign; in the "with/without" column the lower coefficient is for
# ECDH key agreement and the higher one for ECDSA sign, the relatively
# fastest server-side operation. Keep in mind that +100% means a 2x
# improvement.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
57if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59	$avx = ($1>=2.19) + ($1>=2.22);
60	$addx = ($1>=2.23);
61}
62
63if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65	$avx = ($1>=2.09) + ($1>=2.10);
66	$addx = ($1>=2.10);
67}
68
69if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=11);
72	$addx = ($1>=12);
73}
74
75if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
76	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
77	$avx = ($ver>=3.0) + ($ver>=3.01);
78	$addx = ($ver>=3.03);
79}
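
# Informal summary of the flags derived above: $avx records how much
# AVX/AVX2 code the detected assembler can be trusted with (0, 1 or 2),
# and $addx enables the ADCX/ADOX/MULX (BMI2+ADX) code paths emitted
# further below when the toolchain is new enough.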
80
81$code.=<<___;
82.text
83.extern	OPENSSL_ia32cap_P
84
# The NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
# stored as four little-endian 64-bit limbs
86.align 64
87.Lpoly:
88.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
89
# RR = 2^512 mod p, precomputed for the NIST P-256 prime; Montgomery
# multiplication by RR converts a value into the Montgomery domain
91.LRR:
92.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
93
94.LOne:
95.long 1,1,1,1,1,1,1,1
96.LTwo:
97.long 2,2,2,2,2,2,2,2
98.LThree:
99.long 3,3,3,3,3,3,3,3
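# 1 in the Montgomery domain, i.e. 2^256 mod p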
100.LONE_mont:
101.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
102
# Constants for computations modulo ord(p256): the group order n as
# little-endian 64-bit limbs, followed by ordK = -n^-1 mod 2^64, the
# Montgomery constant used by the ord_* reduction steps below
104.Lord:
105.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
106.LordK:
107.quad 0xccd1c8aaee00bc4f
108___
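
# As a sanity aid, the table constants above can be re-derived with core
# Perl's Math::BigInt. The snippet below is only an illustrative sketch and
# is intentionally left commented out so it never runs at build time:
#
#   use Math::BigInt;
#   my $p   = Math::BigInt->new("0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
#   my $rr  = Math::BigInt->new(2)->bpow(512)->bmod($p);    # -> .LRR
#   my $one = Math::BigInt->new(2)->bpow(256)->bmod($p);    # -> .LONE_mont
#   my $ord = Math::BigInt->new("0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551");
#   my $k   = Math::BigInt->new("0xccd1c8aaee00bc4f");      # .LordK
#   # expectation: ($k * $ord + 1) is divisible by 2^64, i.e. ordK == -ord^-1 mod 2^64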
109
110{
111################################################################################
112# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
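#
# Computes res = 2*a mod p, assuming the input is fully reduced mod p: a
# plain 4-limb add-with-carry followed by a single conditional subtraction
# of p (the same pattern is used by the other linear helpers below).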
113
114my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
115my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
116my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
117
118$code.=<<___;
119
120.globl	ecp_nistz256_mul_by_2
121.type	ecp_nistz256_mul_by_2,\@function,2
122.align	64
123ecp_nistz256_mul_by_2:
124.cfi_startproc
125	push	%r12
126.cfi_push	%r12
127	push	%r13
128.cfi_push	%r13
129.Lmul_by_2_body:
130
131	mov	8*0($a_ptr), $a0
132	xor	$t4,$t4
133	mov	8*1($a_ptr), $a1
134	add	$a0, $a0		# a0:a3+a0:a3
135	mov	8*2($a_ptr), $a2
136	adc	$a1, $a1
137	mov	8*3($a_ptr), $a3
138	lea	.Lpoly(%rip), $a_ptr
139	 mov	$a0, $t0
140	adc	$a2, $a2
141	adc	$a3, $a3
142	 mov	$a1, $t1
143	adc	\$0, $t4
144
145	sub	8*0($a_ptr), $a0
146	 mov	$a2, $t2
147	sbb	8*1($a_ptr), $a1
148	sbb	8*2($a_ptr), $a2
149	 mov	$a3, $t3
150	sbb	8*3($a_ptr), $a3
151	sbb	\$0, $t4
152
153	cmovc	$t0, $a0
154	cmovc	$t1, $a1
155	mov	$a0, 8*0($r_ptr)
156	cmovc	$t2, $a2
157	mov	$a1, 8*1($r_ptr)
158	cmovc	$t3, $a3
159	mov	$a2, 8*2($r_ptr)
160	mov	$a3, 8*3($r_ptr)
161
162	mov	0(%rsp),%r13
163.cfi_restore	%r13
164	mov	8(%rsp),%r12
165.cfi_restore	%r12
166	lea	16(%rsp),%rsp
167.cfi_adjust_cfa_offset	-16
168.Lmul_by_2_epilogue:
169	ret
170.cfi_endproc
171.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
172
173################################################################################
174# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
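#
# Halving modulo p: compute a+p alongside a, keep a if it is even and a+p
# if it is odd (p is odd, so the selected value is always even), then shift
# the up-to-257-bit result right by one bit.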
175.globl	ecp_nistz256_div_by_2
176.type	ecp_nistz256_div_by_2,\@function,2
177.align	32
178ecp_nistz256_div_by_2:
179.cfi_startproc
180	push	%r12
181.cfi_push	%r12
182	push	%r13
183.cfi_push	%r13
184.Ldiv_by_2_body:
185
186	mov	8*0($a_ptr), $a0
187	mov	8*1($a_ptr), $a1
188	mov	8*2($a_ptr), $a2
189	 mov	$a0, $t0
190	mov	8*3($a_ptr), $a3
191	lea	.Lpoly(%rip), $a_ptr
192
193	 mov	$a1, $t1
194	xor	$t4, $t4
195	add	8*0($a_ptr), $a0
196	 mov	$a2, $t2
197	adc	8*1($a_ptr), $a1
198	adc	8*2($a_ptr), $a2
199	 mov	$a3, $t3
200	adc	8*3($a_ptr), $a3
201	adc	\$0, $t4
202	xor	$a_ptr, $a_ptr		# borrow $a_ptr
203	test	\$1, $t0
204
205	cmovz	$t0, $a0
206	cmovz	$t1, $a1
207	cmovz	$t2, $a2
208	cmovz	$t3, $a3
209	cmovz	$a_ptr, $t4
210
211	mov	$a1, $t0		# a0:a3>>1
212	shr	\$1, $a0
213	shl	\$63, $t0
214	mov	$a2, $t1
215	shr	\$1, $a1
216	or	$t0, $a0
217	shl	\$63, $t1
218	mov	$a3, $t2
219	shr	\$1, $a2
220	or	$t1, $a1
221	shl	\$63, $t2
222	shr	\$1, $a3
223	shl	\$63, $t4
224	or	$t2, $a2
225	or	$t4, $a3
226
227	mov	$a0, 8*0($r_ptr)
228	mov	$a1, 8*1($r_ptr)
229	mov	$a2, 8*2($r_ptr)
230	mov	$a3, 8*3($r_ptr)
231
232	mov	0(%rsp),%r13
233.cfi_restore	%r13
234	mov	8(%rsp),%r12
235.cfi_restore	%r12
236	lea	16(%rsp),%rsp
237.cfi_adjust_cfa_offset	-16
238.Ldiv_by_2_epilogue:
239	ret
240.cfi_endproc
241.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
242
243################################################################################
244# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
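#
# res = 3*a mod p, computed as (2*a mod p) + a mod p; each of the two steps
# ends with its own conditional subtraction of p.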
245.globl	ecp_nistz256_mul_by_3
246.type	ecp_nistz256_mul_by_3,\@function,2
247.align	32
248ecp_nistz256_mul_by_3:
249.cfi_startproc
250	push	%r12
251.cfi_push	%r12
252	push	%r13
253.cfi_push	%r13
254.Lmul_by_3_body:
255
256	mov	8*0($a_ptr), $a0
257	xor	$t4, $t4
258	mov	8*1($a_ptr), $a1
259	add	$a0, $a0		# a0:a3+a0:a3
260	mov	8*2($a_ptr), $a2
261	adc	$a1, $a1
262	mov	8*3($a_ptr), $a3
263	 mov	$a0, $t0
264	adc	$a2, $a2
265	adc	$a3, $a3
266	 mov	$a1, $t1
267	adc	\$0, $t4
268
269	sub	\$-1, $a0
270	 mov	$a2, $t2
271	sbb	.Lpoly+8*1(%rip), $a1
272	sbb	\$0, $a2
273	 mov	$a3, $t3
274	sbb	.Lpoly+8*3(%rip), $a3
275	sbb	\$0, $t4
276
277	cmovc	$t0, $a0
278	cmovc	$t1, $a1
279	cmovc	$t2, $a2
280	cmovc	$t3, $a3
281
282	xor	$t4, $t4
283	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
284	adc	8*1($a_ptr), $a1
285	 mov	$a0, $t0
286	adc	8*2($a_ptr), $a2
287	adc	8*3($a_ptr), $a3
288	 mov	$a1, $t1
289	adc	\$0, $t4
290
291	sub	\$-1, $a0
292	 mov	$a2, $t2
293	sbb	.Lpoly+8*1(%rip), $a1
294	sbb	\$0, $a2
295	 mov	$a3, $t3
296	sbb	.Lpoly+8*3(%rip), $a3
297	sbb	\$0, $t4
298
299	cmovc	$t0, $a0
300	cmovc	$t1, $a1
301	mov	$a0, 8*0($r_ptr)
302	cmovc	$t2, $a2
303	mov	$a1, 8*1($r_ptr)
304	cmovc	$t3, $a3
305	mov	$a2, 8*2($r_ptr)
306	mov	$a3, 8*3($r_ptr)
307
308	mov	0(%rsp),%r13
309.cfi_restore	%r13
310	mov	8(%rsp),%r12
311.cfi_restore	%r12
312	lea	16(%rsp),%rsp
313.cfi_adjust_cfa_offset	-16
314.Lmul_by_3_epilogue:
315	ret
316.cfi_endproc
317.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
318
319################################################################################
320# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
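#
# res = a + b mod p: a 4-limb addition with the carry kept in a fifth word,
# followed by one conditional subtraction of p.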
321.globl	ecp_nistz256_add
322.type	ecp_nistz256_add,\@function,3
323.align	32
324ecp_nistz256_add:
325.cfi_startproc
326	push	%r12
327.cfi_push	%r12
328	push	%r13
329.cfi_push	%r13
330.Ladd_body:
331
332	mov	8*0($a_ptr), $a0
333	xor	$t4, $t4
334	mov	8*1($a_ptr), $a1
335	mov	8*2($a_ptr), $a2
336	mov	8*3($a_ptr), $a3
337	lea	.Lpoly(%rip), $a_ptr
338
339	add	8*0($b_ptr), $a0
340	adc	8*1($b_ptr), $a1
341	 mov	$a0, $t0
342	adc	8*2($b_ptr), $a2
343	adc	8*3($b_ptr), $a3
344	 mov	$a1, $t1
345	adc	\$0, $t4
346
347	sub	8*0($a_ptr), $a0
348	 mov	$a2, $t2
349	sbb	8*1($a_ptr), $a1
350	sbb	8*2($a_ptr), $a2
351	 mov	$a3, $t3
352	sbb	8*3($a_ptr), $a3
353	sbb	\$0, $t4
354
355	cmovc	$t0, $a0
356	cmovc	$t1, $a1
357	mov	$a0, 8*0($r_ptr)
358	cmovc	$t2, $a2
359	mov	$a1, 8*1($r_ptr)
360	cmovc	$t3, $a3
361	mov	$a2, 8*2($r_ptr)
362	mov	$a3, 8*3($r_ptr)
363
364	mov	0(%rsp),%r13
365.cfi_restore	%r13
366	mov	8(%rsp),%r12
367.cfi_restore	%r12
368	lea	16(%rsp),%rsp
369.cfi_adjust_cfa_offset	-16
370.Ladd_epilogue:
371	ret
372.cfi_endproc
373.size	ecp_nistz256_add,.-ecp_nistz256_add
374
375################################################################################
376# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
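#
# res = a - b mod p: subtract, then add p back; the sum is kept only when
# the subtraction borrowed, otherwise the raw difference is the result.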
377.globl	ecp_nistz256_sub
378.type	ecp_nistz256_sub,\@function,3
379.align	32
380ecp_nistz256_sub:
381.cfi_startproc
382	push	%r12
383.cfi_push	%r12
384	push	%r13
385.cfi_push	%r13
386.Lsub_body:
387
388	mov	8*0($a_ptr), $a0
389	xor	$t4, $t4
390	mov	8*1($a_ptr), $a1
391	mov	8*2($a_ptr), $a2
392	mov	8*3($a_ptr), $a3
393	lea	.Lpoly(%rip), $a_ptr
394
395	sub	8*0($b_ptr), $a0
396	sbb	8*1($b_ptr), $a1
397	 mov	$a0, $t0
398	sbb	8*2($b_ptr), $a2
399	sbb	8*3($b_ptr), $a3
400	 mov	$a1, $t1
401	sbb	\$0, $t4
402
403	add	8*0($a_ptr), $a0
404	 mov	$a2, $t2
405	adc	8*1($a_ptr), $a1
406	adc	8*2($a_ptr), $a2
407	 mov	$a3, $t3
408	adc	8*3($a_ptr), $a3
409	test	$t4, $t4
410
411	cmovz	$t0, $a0
412	cmovz	$t1, $a1
413	mov	$a0, 8*0($r_ptr)
414	cmovz	$t2, $a2
415	mov	$a1, 8*1($r_ptr)
416	cmovz	$t3, $a3
417	mov	$a2, 8*2($r_ptr)
418	mov	$a3, 8*3($r_ptr)
419
420	mov	0(%rsp),%r13
421.cfi_restore	%r13
422	mov	8(%rsp),%r12
423.cfi_restore	%r12
424	lea	16(%rsp),%rsp
425.cfi_adjust_cfa_offset	-16
426.Lsub_epilogue:
427	ret
428.cfi_endproc
429.size	ecp_nistz256_sub,.-ecp_nistz256_sub
430
431################################################################################
432# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
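#
# res = -a mod p, i.e. p - a for a != 0 and 0 for a == 0; implemented as
# 0 - a with p added back when the subtraction borrows.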
433.globl	ecp_nistz256_neg
434.type	ecp_nistz256_neg,\@function,2
435.align	32
436ecp_nistz256_neg:
437.cfi_startproc
438	push	%r12
439.cfi_push	%r12
440	push	%r13
441.cfi_push	%r13
442.Lneg_body:
443
444	xor	$a0, $a0
445	xor	$a1, $a1
446	xor	$a2, $a2
447	xor	$a3, $a3
448	xor	$t4, $t4
449
450	sub	8*0($a_ptr), $a0
451	sbb	8*1($a_ptr), $a1
452	sbb	8*2($a_ptr), $a2
453	 mov	$a0, $t0
454	sbb	8*3($a_ptr), $a3
455	lea	.Lpoly(%rip), $a_ptr
456	 mov	$a1, $t1
457	sbb	\$0, $t4
458
459	add	8*0($a_ptr), $a0
460	 mov	$a2, $t2
461	adc	8*1($a_ptr), $a1
462	adc	8*2($a_ptr), $a2
463	 mov	$a3, $t3
464	adc	8*3($a_ptr), $a3
465	test	$t4, $t4
466
467	cmovz	$t0, $a0
468	cmovz	$t1, $a1
469	mov	$a0, 8*0($r_ptr)
470	cmovz	$t2, $a2
471	mov	$a1, 8*1($r_ptr)
472	cmovz	$t3, $a3
473	mov	$a2, 8*2($r_ptr)
474	mov	$a3, 8*3($r_ptr)
475
476	mov	0(%rsp),%r13
477.cfi_restore	%r13
478	mov	8(%rsp),%r12
479.cfi_restore	%r12
480	lea	16(%rsp),%rsp
481.cfi_adjust_cfa_offset	-16
482.Lneg_epilogue:
483	ret
484.cfi_endproc
485.size	ecp_nistz256_neg,.-ecp_nistz256_neg
486___
487}
488{
489my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
490my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
491my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
492my ($poly1,$poly3)=($acc6,$acc7);
493
494$code.=<<___;
495################################################################################
496# void ecp_nistz256_ord_mul_mont(
497#   uint64_t res[4],
498#   uint64_t a[4],
499#   uint64_t b[4]);
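#
# Montgomery multiplication modulo the group order:
#   res = a * b * 2^-256 mod ord(p256)
# One word of b is processed per pass, followed by a word-wise Montgomery
# reduction step using .LordK = -ord^-1 mod 2^64; a final conditional
# subtraction of ord keeps the result fully reduced.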
500
501.globl	ecp_nistz256_ord_mul_mont
502.type	ecp_nistz256_ord_mul_mont,\@function,3
503.align	32
504ecp_nistz256_ord_mul_mont:
505.cfi_startproc
506___
507$code.=<<___	if ($addx);
508	mov	\$0x80100, %ecx
509	and	OPENSSL_ia32cap_P+8(%rip), %ecx
510	cmp	\$0x80100, %ecx
511	je	.Lecp_nistz256_ord_mul_montx
512___
513$code.=<<___;
514	push	%rbp
515.cfi_push	%rbp
516	push	%rbx
517.cfi_push	%rbx
518	push	%r12
519.cfi_push	%r12
520	push	%r13
521.cfi_push	%r13
522	push	%r14
523.cfi_push	%r14
524	push	%r15
525.cfi_push	%r15
526.Lord_mul_body:
527
528	mov	8*0($b_org), %rax
529	mov	$b_org, $b_ptr
530	lea	.Lord(%rip), %r14
531	mov	.LordK(%rip), %r15
532
533	################################# * b[0]
534	mov	%rax, $t0
535	mulq	8*0($a_ptr)
536	mov	%rax, $acc0
537	mov	$t0, %rax
538	mov	%rdx, $acc1
539
540	mulq	8*1($a_ptr)
541	add	%rax, $acc1
542	mov	$t0, %rax
543	adc	\$0, %rdx
544	mov	%rdx, $acc2
545
546	mulq	8*2($a_ptr)
547	add	%rax, $acc2
548	mov	$t0, %rax
549	adc	\$0, %rdx
550
551	 mov	$acc0, $acc5
552	 imulq	%r15,$acc0
553
554	mov	%rdx, $acc3
555	mulq	8*3($a_ptr)
556	add	%rax, $acc3
557	 mov	$acc0, %rax
558	adc	\$0, %rdx
559	mov	%rdx, $acc4
560
561	################################# First reduction step
562	mulq	8*0(%r14)
563	mov	$acc0, $t1
564	add	%rax, $acc5		# guaranteed to be zero
565	mov	$acc0, %rax
566	adc	\$0, %rdx
567	mov	%rdx, $t0
568
569	sub	$acc0, $acc2
570	sbb	\$0, $acc0		# can't borrow
571
572	mulq	8*1(%r14)
573	add	$t0, $acc1
574	adc	\$0, %rdx
575	add	%rax, $acc1
576	mov	$t1, %rax
577	adc	%rdx, $acc2
578	mov	$t1, %rdx
579	adc	\$0, $acc0		# can't overflow
580
581	shl	\$32, %rax
582	shr	\$32, %rdx
583	sub	%rax, $acc3
584	 mov	8*1($b_ptr), %rax
585	sbb	%rdx, $t1		# can't borrow
586
587	add	$acc0, $acc3
588	adc	$t1, $acc4
589	adc	\$0, $acc5
590
591	################################# * b[1]
592	mov	%rax, $t0
593	mulq	8*0($a_ptr)
594	add	%rax, $acc1
595	mov	$t0, %rax
596	adc	\$0, %rdx
597	mov	%rdx, $t1
598
599	mulq	8*1($a_ptr)
600	add	$t1, $acc2
601	adc	\$0, %rdx
602	add	%rax, $acc2
603	mov	$t0, %rax
604	adc	\$0, %rdx
605	mov	%rdx, $t1
606
607	mulq	8*2($a_ptr)
608	add	$t1, $acc3
609	adc	\$0, %rdx
610	add	%rax, $acc3
611	mov	$t0, %rax
612	adc	\$0, %rdx
613
614	 mov	$acc1, $t0
615	 imulq	%r15, $acc1
616
617	mov	%rdx, $t1
618	mulq	8*3($a_ptr)
619	add	$t1, $acc4
620	adc	\$0, %rdx
621	xor	$acc0, $acc0
622	add	%rax, $acc4
623	 mov	$acc1, %rax
624	adc	%rdx, $acc5
625	adc	\$0, $acc0
626
627	################################# Second reduction step
628	mulq	8*0(%r14)
629	mov	$acc1, $t1
630	add	%rax, $t0		# guaranteed to be zero
631	mov	$acc1, %rax
632	adc	%rdx, $t0
633
634	sub	$acc1, $acc3
635	sbb	\$0, $acc1		# can't borrow
636
637	mulq	8*1(%r14)
638	add	$t0, $acc2
639	adc	\$0, %rdx
640	add	%rax, $acc2
641	mov	$t1, %rax
642	adc	%rdx, $acc3
643	mov	$t1, %rdx
644	adc	\$0, $acc1		# can't overflow
645
646	shl	\$32, %rax
647	shr	\$32, %rdx
648	sub	%rax, $acc4
649	 mov	8*2($b_ptr), %rax
650	sbb	%rdx, $t1		# can't borrow
651
652	add	$acc1, $acc4
653	adc	$t1, $acc5
654	adc	\$0, $acc0
655
656	################################## * b[2]
657	mov	%rax, $t0
658	mulq	8*0($a_ptr)
659	add	%rax, $acc2
660	mov	$t0, %rax
661	adc	\$0, %rdx
662	mov	%rdx, $t1
663
664	mulq	8*1($a_ptr)
665	add	$t1, $acc3
666	adc	\$0, %rdx
667	add	%rax, $acc3
668	mov	$t0, %rax
669	adc	\$0, %rdx
670	mov	%rdx, $t1
671
672	mulq	8*2($a_ptr)
673	add	$t1, $acc4
674	adc	\$0, %rdx
675	add	%rax, $acc4
676	mov	$t0, %rax
677	adc	\$0, %rdx
678
679	 mov	$acc2, $t0
680	 imulq	%r15, $acc2
681
682	mov	%rdx, $t1
683	mulq	8*3($a_ptr)
684	add	$t1, $acc5
685	adc	\$0, %rdx
686	xor	$acc1, $acc1
687	add	%rax, $acc5
688	 mov	$acc2, %rax
689	adc	%rdx, $acc0
690	adc	\$0, $acc1
691
692	################################# Third reduction step
693	mulq	8*0(%r14)
694	mov	$acc2, $t1
695	add	%rax, $t0		# guaranteed to be zero
696	mov	$acc2, %rax
697	adc	%rdx, $t0
698
699	sub	$acc2, $acc4
700	sbb	\$0, $acc2		# can't borrow
701
702	mulq	8*1(%r14)
703	add	$t0, $acc3
704	adc	\$0, %rdx
705	add	%rax, $acc3
706	mov	$t1, %rax
707	adc	%rdx, $acc4
708	mov	$t1, %rdx
709	adc	\$0, $acc2		# can't overflow
710
711	shl	\$32, %rax
712	shr	\$32, %rdx
713	sub	%rax, $acc5
714	 mov	8*3($b_ptr), %rax
715	sbb	%rdx, $t1		# can't borrow
716
717	add	$acc2, $acc5
718	adc	$t1, $acc0
719	adc	\$0, $acc1
720
721	################################# * b[3]
722	mov	%rax, $t0
723	mulq	8*0($a_ptr)
724	add	%rax, $acc3
725	mov	$t0, %rax
726	adc	\$0, %rdx
727	mov	%rdx, $t1
728
729	mulq	8*1($a_ptr)
730	add	$t1, $acc4
731	adc	\$0, %rdx
732	add	%rax, $acc4
733	mov	$t0, %rax
734	adc	\$0, %rdx
735	mov	%rdx, $t1
736
737	mulq	8*2($a_ptr)
738	add	$t1, $acc5
739	adc	\$0, %rdx
740	add	%rax, $acc5
741	mov	$t0, %rax
742	adc	\$0, %rdx
743
744	 mov	$acc3, $t0
745	 imulq	%r15, $acc3
746
747	mov	%rdx, $t1
748	mulq	8*3($a_ptr)
749	add	$t1, $acc0
750	adc	\$0, %rdx
751	xor	$acc2, $acc2
752	add	%rax, $acc0
753	 mov	$acc3, %rax
754	adc	%rdx, $acc1
755	adc	\$0, $acc2
756
757	################################# Last reduction step
758	mulq	8*0(%r14)
759	mov	$acc3, $t1
760	add	%rax, $t0		# guaranteed to be zero
761	mov	$acc3, %rax
762	adc	%rdx, $t0
763
764	sub	$acc3, $acc5
765	sbb	\$0, $acc3		# can't borrow
766
767	mulq	8*1(%r14)
768	add	$t0, $acc4
769	adc	\$0, %rdx
770	add	%rax, $acc4
771	mov	$t1, %rax
772	adc	%rdx, $acc5
773	mov	$t1, %rdx
774	adc	\$0, $acc3		# can't overflow
775
776	shl	\$32, %rax
777	shr	\$32, %rdx
778	sub	%rax, $acc0
779	sbb	%rdx, $t1		# can't borrow
780
781	add	$acc3, $acc0
782	adc	$t1, $acc1
783	adc	\$0, $acc2
784
785	################################# Subtract ord
786	 mov	$acc4, $a_ptr
787	sub	8*0(%r14), $acc4
788	 mov	$acc5, $acc3
789	sbb	8*1(%r14), $acc5
790	 mov	$acc0, $t0
791	sbb	8*2(%r14), $acc0
792	 mov	$acc1, $t1
793	sbb	8*3(%r14), $acc1
794	sbb	\$0, $acc2
795
796	cmovc	$a_ptr, $acc4
797	cmovc	$acc3, $acc5
798	cmovc	$t0, $acc0
799	cmovc	$t1, $acc1
800
801	mov	$acc4, 8*0($r_ptr)
802	mov	$acc5, 8*1($r_ptr)
803	mov	$acc0, 8*2($r_ptr)
804	mov	$acc1, 8*3($r_ptr)
805
806	mov	0(%rsp),%r15
807.cfi_restore	%r15
808	mov	8(%rsp),%r14
809.cfi_restore	%r14
810	mov	16(%rsp),%r13
811.cfi_restore	%r13
812	mov	24(%rsp),%r12
813.cfi_restore	%r12
814	mov	32(%rsp),%rbx
815.cfi_restore	%rbx
816	mov	40(%rsp),%rbp
817.cfi_restore	%rbp
818	lea	48(%rsp),%rsp
819.cfi_adjust_cfa_offset	-48
820.Lord_mul_epilogue:
821	ret
822.cfi_endproc
823.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
824
825################################################################################
826# void ecp_nistz256_ord_sqr_mont(
827#   uint64_t res[4],
828#   uint64_t a[4],
829#   int rep);
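#
# Performs rep (>= 1) successive Montgomery squarings modulo ord(p256);
# each iteration maps x to x^2 * 2^-256 mod ord, so the result stays in
# the Montgomery domain throughout.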
830
831.globl	ecp_nistz256_ord_sqr_mont
832.type	ecp_nistz256_ord_sqr_mont,\@function,3
833.align	32
834ecp_nistz256_ord_sqr_mont:
835.cfi_startproc
836___
837$code.=<<___	if ($addx);
838	mov	\$0x80100, %ecx
839	and	OPENSSL_ia32cap_P+8(%rip), %ecx
840	cmp	\$0x80100, %ecx
841	je	.Lecp_nistz256_ord_sqr_montx
842___
843$code.=<<___;
844	push	%rbp
845.cfi_push	%rbp
846	push	%rbx
847.cfi_push	%rbx
848	push	%r12
849.cfi_push	%r12
850	push	%r13
851.cfi_push	%r13
852	push	%r14
853.cfi_push	%r14
854	push	%r15
855.cfi_push	%r15
856.Lord_sqr_body:
857
858	mov	8*0($a_ptr), $acc0
859	mov	8*1($a_ptr), %rax
860	mov	8*2($a_ptr), $acc6
861	mov	8*3($a_ptr), $acc7
862	lea	.Lord(%rip), $a_ptr	# pointer to modulus
863	mov	$b_org, $b_ptr
864	jmp	.Loop_ord_sqr
865
866.align	32
867.Loop_ord_sqr:
868	################################# a[1:] * a[0]
869	mov	%rax, $t1		# put aside a[1]
870	mul	$acc0			# a[1] * a[0]
871	mov	%rax, $acc1
872	movq	$t1, %xmm1		# offload a[1]
873	mov	$acc6, %rax
874	mov	%rdx, $acc2
875
876	mul	$acc0			# a[2] * a[0]
877	add	%rax, $acc2
878	mov	$acc7, %rax
879	movq	$acc6, %xmm2		# offload a[2]
880	adc	\$0, %rdx
881	mov	%rdx, $acc3
882
883	mul	$acc0			# a[3] * a[0]
884	add	%rax, $acc3
885	mov	$acc7, %rax
886	movq	$acc7, %xmm3		# offload a[3]
887	adc	\$0, %rdx
888	mov	%rdx, $acc4
889
890	################################# a[3] * a[2]
891	mul	$acc6			# a[3] * a[2]
892	mov	%rax, $acc5
893	mov	$acc6, %rax
894	mov	%rdx, $acc6
895
896	################################# a[2:] * a[1]
897	mul	$t1			# a[2] * a[1]
898	add	%rax, $acc3
899	mov	$acc7, %rax
900	adc	\$0, %rdx
901	mov	%rdx, $acc7
902
903	mul	$t1			# a[3] * a[1]
904	add	%rax, $acc4
905	adc	\$0, %rdx
906
907	add	$acc7, $acc4
908	adc	%rdx, $acc5
909	adc	\$0, $acc6		# can't overflow
910
911	################################# *2
912	xor	$acc7, $acc7
913	mov	$acc0, %rax
914	add	$acc1, $acc1
915	adc	$acc2, $acc2
916	adc	$acc3, $acc3
917	adc	$acc4, $acc4
918	adc	$acc5, $acc5
919	adc	$acc6, $acc6
920	adc	\$0, $acc7
921
922	################################# Missing products
923	mul	%rax			# a[0] * a[0]
924	mov	%rax, $acc0
925	movq	%xmm1, %rax
926	mov	%rdx, $t1
927
928	mul	%rax			# a[1] * a[1]
929	add	$t1, $acc1
930	adc	%rax, $acc2
931	movq	%xmm2, %rax
932	adc	\$0, %rdx
933	mov	%rdx, $t1
934
935	mul	%rax			# a[2] * a[2]
936	add	$t1, $acc3
937	adc	%rax, $acc4
938	movq	%xmm3, %rax
939	adc	\$0, %rdx
940	mov	%rdx, $t1
941
942	 mov	$acc0, $t0
943	 imulq	8*4($a_ptr), $acc0	# *= .LordK
944
945	mul	%rax			# a[3] * a[3]
946	add	$t1, $acc5
947	adc	%rax, $acc6
948	 mov	8*0($a_ptr), %rax	# modulus[0]
949	adc	%rdx, $acc7		# can't overflow
950
951	################################# First reduction step
952	mul	$acc0
953	mov	$acc0, $t1
954	add	%rax, $t0		# guaranteed to be zero
955	mov	8*1($a_ptr), %rax	# modulus[1]
956	adc	%rdx, $t0
957
958	sub	$acc0, $acc2
959	sbb	\$0, $t1		# can't borrow
960
961	mul	$acc0
962	add	$t0, $acc1
963	adc	\$0, %rdx
964	add	%rax, $acc1
965	mov	$acc0, %rax
966	adc	%rdx, $acc2
967	mov	$acc0, %rdx
968	adc	\$0, $t1		# can't overflow
969
970	 mov	$acc1, $t0
971	 imulq	8*4($a_ptr), $acc1	# *= .LordK
972
973	shl	\$32, %rax
974	shr	\$32, %rdx
975	sub	%rax, $acc3
976	 mov	8*0($a_ptr), %rax
977	sbb	%rdx, $acc0		# can't borrow
978
979	add	$t1, $acc3
980	adc	\$0, $acc0		# can't overflow
981
982	################################# Second reduction step
983	mul	$acc1
984	mov	$acc1, $t1
985	add	%rax, $t0		# guaranteed to be zero
986	mov	8*1($a_ptr), %rax
987	adc	%rdx, $t0
988
989	sub	$acc1, $acc3
990	sbb	\$0, $t1		# can't borrow
991
992	mul	$acc1
993	add	$t0, $acc2
994	adc	\$0, %rdx
995	add	%rax, $acc2
996	mov	$acc1, %rax
997	adc	%rdx, $acc3
998	mov	$acc1, %rdx
999	adc	\$0, $t1		# can't overflow
1000
1001	 mov	$acc2, $t0
1002	 imulq	8*4($a_ptr), $acc2	# *= .LordK
1003
1004	shl	\$32, %rax
1005	shr	\$32, %rdx
1006	sub	%rax, $acc0
1007	 mov	8*0($a_ptr), %rax
1008	sbb	%rdx, $acc1		# can't borrow
1009
1010	add	$t1, $acc0
1011	adc	\$0, $acc1		# can't overflow
1012
1013	################################# Third reduction step
1014	mul	$acc2
1015	mov	$acc2, $t1
1016	add	%rax, $t0		# guaranteed to be zero
1017	mov	8*1($a_ptr), %rax
1018	adc	%rdx, $t0
1019
1020	sub	$acc2, $acc0
1021	sbb	\$0, $t1		# can't borrow
1022
1023	mul	$acc2
1024	add	$t0, $acc3
1025	adc	\$0, %rdx
1026	add	%rax, $acc3
1027	mov	$acc2, %rax
1028	adc	%rdx, $acc0
1029	mov	$acc2, %rdx
1030	adc	\$0, $t1		# can't overflow
1031
1032	 mov	$acc3, $t0
1033	 imulq	8*4($a_ptr), $acc3	# *= .LordK
1034
1035	shl	\$32, %rax
1036	shr	\$32, %rdx
1037	sub	%rax, $acc1
1038	 mov	8*0($a_ptr), %rax
1039	sbb	%rdx, $acc2		# can't borrow
1040
1041	add	$t1, $acc1
1042	adc	\$0, $acc2		# can't overflow
1043
1044	################################# Last reduction step
1045	mul	$acc3
1046	mov	$acc3, $t1
1047	add	%rax, $t0		# guaranteed to be zero
1048	mov	8*1($a_ptr), %rax
1049	adc	%rdx, $t0
1050
1051	sub	$acc3, $acc1
1052	sbb	\$0, $t1		# can't borrow
1053
1054	mul	$acc3
1055	add	$t0, $acc0
1056	adc	\$0, %rdx
1057	add	%rax, $acc0
1058	mov	$acc3, %rax
1059	adc	%rdx, $acc1
1060	mov	$acc3, %rdx
1061	adc	\$0, $t1		# can't overflow
1062
1063	shl	\$32, %rax
1064	shr	\$32, %rdx
1065	sub	%rax, $acc2
1066	sbb	%rdx, $acc3		# can't borrow
1067
1068	add	$t1, $acc2
1069	adc	\$0, $acc3		# can't overflow
1070
1071	################################# Add bits [511:256] of the sqr result
1072	xor	%rdx, %rdx
1073	add	$acc4, $acc0
1074	adc	$acc5, $acc1
1075	 mov	$acc0, $acc4
1076	adc	$acc6, $acc2
1077	adc	$acc7, $acc3
1078	 mov	$acc1, %rax
1079	adc	\$0, %rdx
1080
1081	################################# Compare to modulus
1082	sub	8*0($a_ptr), $acc0
1083	 mov	$acc2, $acc6
1084	sbb	8*1($a_ptr), $acc1
1085	sbb	8*2($a_ptr), $acc2
1086	 mov	$acc3, $acc7
1087	sbb	8*3($a_ptr), $acc3
1088	sbb	\$0, %rdx
1089
1090	cmovc	$acc4, $acc0
1091	cmovnc	$acc1, %rax
1092	cmovnc	$acc2, $acc6
1093	cmovnc	$acc3, $acc7
1094
1095	dec	$b_ptr
1096	jnz	.Loop_ord_sqr
1097
1098	mov	$acc0, 8*0($r_ptr)
1099	mov	%rax,  8*1($r_ptr)
1100	pxor	%xmm1, %xmm1
1101	mov	$acc6, 8*2($r_ptr)
1102	pxor	%xmm2, %xmm2
1103	mov	$acc7, 8*3($r_ptr)
1104	pxor	%xmm3, %xmm3
1105
1106	mov	0(%rsp),%r15
1107.cfi_restore	%r15
1108	mov	8(%rsp),%r14
1109.cfi_restore	%r14
1110	mov	16(%rsp),%r13
1111.cfi_restore	%r13
1112	mov	24(%rsp),%r12
1113.cfi_restore	%r12
1114	mov	32(%rsp),%rbx
1115.cfi_restore	%rbx
1116	mov	40(%rsp),%rbp
1117.cfi_restore	%rbp
1118	lea	48(%rsp),%rsp
1119.cfi_adjust_cfa_offset	-48
1120.Lord_sqr_epilogue:
1121	ret
1122.cfi_endproc
1123.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1124___
1125
1126$code.=<<___	if ($addx);
1127################################################################################
1128.type	ecp_nistz256_ord_mul_montx,\@function,3
1129.align	32
1130ecp_nistz256_ord_mul_montx:
1131.cfi_startproc
1132.Lecp_nistz256_ord_mul_montx:
1133	push	%rbp
1134.cfi_push	%rbp
1135	push	%rbx
1136.cfi_push	%rbx
1137	push	%r12
1138.cfi_push	%r12
1139	push	%r13
1140.cfi_push	%r13
1141	push	%r14
1142.cfi_push	%r14
1143	push	%r15
1144.cfi_push	%r15
1145.Lord_mulx_body:
1146
1147	mov	$b_org, $b_ptr
1148	mov	8*0($b_org), %rdx
1149	mov	8*0($a_ptr), $acc1
1150	mov	8*1($a_ptr), $acc2
1151	mov	8*2($a_ptr), $acc3
1152	mov	8*3($a_ptr), $acc4
1153	lea	-128($a_ptr), $a_ptr	# control u-op density
1154	lea	.Lord-128(%rip), %r14
1155	mov	.LordK(%rip), %r15
1156
1157	################################# Multiply by b[0]
1158	mulx	$acc1, $acc0, $acc1
1159	mulx	$acc2, $t0, $acc2
1160	mulx	$acc3, $t1, $acc3
1161	add	$t0, $acc1
1162	mulx	$acc4, $t0, $acc4
1163	 mov	$acc0, %rdx
1164	 mulx	%r15, %rdx, %rax
1165	adc	$t1, $acc2
1166	adc	$t0, $acc3
1167	adc	\$0, $acc4
1168
1169	################################# reduction
1170	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
1171	mulx	8*0+128(%r14), $t0, $t1
1172	adcx	$t0, $acc0		# guaranteed to be zero
1173	adox	$t1, $acc1
1174
1175	mulx	8*1+128(%r14), $t0, $t1
1176	adcx	$t0, $acc1
1177	adox	$t1, $acc2
1178
1179	mulx	8*2+128(%r14), $t0, $t1
1180	adcx	$t0, $acc2
1181	adox	$t1, $acc3
1182
1183	mulx	8*3+128(%r14), $t0, $t1
1184	 mov	8*1($b_ptr), %rdx
1185	adcx	$t0, $acc3
1186	adox	$t1, $acc4
1187	adcx	$acc0, $acc4
1188	adox	$acc0, $acc5
1189	adc	\$0, $acc5		# cf=0, of=0
1190
1191	################################# Multiply by b[1]
1192	mulx	8*0+128($a_ptr), $t0, $t1
1193	adcx	$t0, $acc1
1194	adox	$t1, $acc2
1195
1196	mulx	8*1+128($a_ptr), $t0, $t1
1197	adcx	$t0, $acc2
1198	adox	$t1, $acc3
1199
1200	mulx	8*2+128($a_ptr), $t0, $t1
1201	adcx	$t0, $acc3
1202	adox	$t1, $acc4
1203
1204	mulx	8*3+128($a_ptr), $t0, $t1
1205	 mov	$acc1, %rdx
1206	 mulx	%r15, %rdx, %rax
1207	adcx	$t0, $acc4
1208	adox	$t1, $acc5
1209
1210	adcx	$acc0, $acc5
1211	adox	$acc0, $acc0
1212	adc	\$0, $acc0		# cf=0, of=0
1213
1214	################################# reduction
1215	mulx	8*0+128(%r14), $t0, $t1
1216	adcx	$t0, $acc1		# guaranteed to be zero
1217	adox	$t1, $acc2
1218
1219	mulx	8*1+128(%r14), $t0, $t1
1220	adcx	$t0, $acc2
1221	adox	$t1, $acc3
1222
1223	mulx	8*2+128(%r14), $t0, $t1
1224	adcx	$t0, $acc3
1225	adox	$t1, $acc4
1226
1227	mulx	8*3+128(%r14), $t0, $t1
1228	 mov	8*2($b_ptr), %rdx
1229	adcx	$t0, $acc4
1230	adox	$t1, $acc5
1231	adcx	$acc1, $acc5
1232	adox	$acc1, $acc0
1233	adc	\$0, $acc0		# cf=0, of=0
1234
1235	################################# Multiply by b[2]
1236	mulx	8*0+128($a_ptr), $t0, $t1
1237	adcx	$t0, $acc2
1238	adox	$t1, $acc3
1239
1240	mulx	8*1+128($a_ptr), $t0, $t1
1241	adcx	$t0, $acc3
1242	adox	$t1, $acc4
1243
1244	mulx	8*2+128($a_ptr), $t0, $t1
1245	adcx	$t0, $acc4
1246	adox	$t1, $acc5
1247
1248	mulx	8*3+128($a_ptr), $t0, $t1
1249	 mov	$acc2, %rdx
1250	 mulx	%r15, %rdx, %rax
1251	adcx	$t0, $acc5
1252	adox	$t1, $acc0
1253
1254	adcx	$acc1, $acc0
1255	adox	$acc1, $acc1
1256	adc	\$0, $acc1		# cf=0, of=0
1257
1258	################################# reduction
1259	mulx	8*0+128(%r14), $t0, $t1
1260	adcx	$t0, $acc2		# guaranteed to be zero
1261	adox	$t1, $acc3
1262
1263	mulx	8*1+128(%r14), $t0, $t1
1264	adcx	$t0, $acc3
1265	adox	$t1, $acc4
1266
1267	mulx	8*2+128(%r14), $t0, $t1
1268	adcx	$t0, $acc4
1269	adox	$t1, $acc5
1270
1271	mulx	8*3+128(%r14), $t0, $t1
1272	 mov	8*3($b_ptr), %rdx
1273	adcx	$t0, $acc5
1274	adox	$t1, $acc0
1275	adcx	$acc2, $acc0
1276	adox	$acc2, $acc1
1277	adc	\$0, $acc1		# cf=0, of=0
1278
1279	################################# Multiply by b[3]
1280	mulx	8*0+128($a_ptr), $t0, $t1
1281	adcx	$t0, $acc3
1282	adox	$t1, $acc4
1283
1284	mulx	8*1+128($a_ptr), $t0, $t1
1285	adcx	$t0, $acc4
1286	adox	$t1, $acc5
1287
1288	mulx	8*2+128($a_ptr), $t0, $t1
1289	adcx	$t0, $acc5
1290	adox	$t1, $acc0
1291
1292	mulx	8*3+128($a_ptr), $t0, $t1
1293	 mov	$acc3, %rdx
1294	 mulx	%r15, %rdx, %rax
1295	adcx	$t0, $acc0
1296	adox	$t1, $acc1
1297
1298	adcx	$acc2, $acc1
1299	adox	$acc2, $acc2
1300	adc	\$0, $acc2		# cf=0, of=0
1301
1302	################################# reduction
1303	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc3		# guaranteed to be zero
1305	adox	$t1, $acc4
1306
1307	mulx	8*1+128(%r14), $t0, $t1
1308	adcx	$t0, $acc4
1309	adox	$t1, $acc5
1310
1311	mulx	8*2+128(%r14), $t0, $t1
1312	adcx	$t0, $acc5
1313	adox	$t1, $acc0
1314
1315	mulx	8*3+128(%r14), $t0, $t1
1316	lea	128(%r14),%r14
1317	 mov	$acc4, $t2
1318	adcx	$t0, $acc0
1319	adox	$t1, $acc1
1320	 mov	$acc5, $t3
1321	adcx	$acc3, $acc1
1322	adox	$acc3, $acc2
1323	adc	\$0, $acc2
1324
1325	#################################
	# Branch-less conditional subtraction of the group order
1327	 mov	$acc0, $t0
1328	sub	8*0(%r14), $acc4
1329	sbb	8*1(%r14), $acc5
1330	sbb	8*2(%r14), $acc0
1331	 mov	$acc1, $t1
1332	sbb	8*3(%r14), $acc1
1333	sbb	\$0, $acc2
1334
1335	cmovc	$t2, $acc4
1336	cmovc	$t3, $acc5
1337	cmovc	$t0, $acc0
1338	cmovc	$t1, $acc1
1339
1340	mov	$acc4, 8*0($r_ptr)
1341	mov	$acc5, 8*1($r_ptr)
1342	mov	$acc0, 8*2($r_ptr)
1343	mov	$acc1, 8*3($r_ptr)
1344
1345	mov	0(%rsp),%r15
1346.cfi_restore	%r15
1347	mov	8(%rsp),%r14
1348.cfi_restore	%r14
1349	mov	16(%rsp),%r13
1350.cfi_restore	%r13
1351	mov	24(%rsp),%r12
1352.cfi_restore	%r12
1353	mov	32(%rsp),%rbx
1354.cfi_restore	%rbx
1355	mov	40(%rsp),%rbp
1356.cfi_restore	%rbp
1357	lea	48(%rsp),%rsp
1358.cfi_adjust_cfa_offset	-48
1359.Lord_mulx_epilogue:
1360	ret
1361.cfi_endproc
1362.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1363
1364.type	ecp_nistz256_ord_sqr_montx,\@function,3
1365.align	32
1366ecp_nistz256_ord_sqr_montx:
1367.cfi_startproc
1368.Lecp_nistz256_ord_sqr_montx:
1369	push	%rbp
1370.cfi_push	%rbp
1371	push	%rbx
1372.cfi_push	%rbx
1373	push	%r12
1374.cfi_push	%r12
1375	push	%r13
1376.cfi_push	%r13
1377	push	%r14
1378.cfi_push	%r14
1379	push	%r15
1380.cfi_push	%r15
1381.Lord_sqrx_body:
1382
1383	mov	$b_org, $b_ptr
1384	mov	8*0($a_ptr), %rdx
1385	mov	8*1($a_ptr), $acc6
1386	mov	8*2($a_ptr), $acc7
1387	mov	8*3($a_ptr), $acc0
1388	lea	.Lord(%rip), $a_ptr
1389	jmp	.Loop_ord_sqrx
1390
1391.align	32
1392.Loop_ord_sqrx:
1393	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1394	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1395	 mov	%rdx, %rax		# offload a[0]
1396	 movq	$acc6, %xmm1		# offload a[1]
1397	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1398	 mov	$acc6, %rdx
1399	add	$t0, $acc2
1400	 movq	$acc7, %xmm2		# offload a[2]
1401	adc	$t1, $acc3
1402	adc	\$0, $acc4
1403	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1404	#################################
1405	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1406	adcx	$t0, $acc3
1407	adox	$t1, $acc4
1408
1409	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1410	 mov	$acc7, %rdx
1411	adcx	$t0, $acc4
1412	adox	$t1, $acc5
1413	adc	\$0, $acc5
1414	#################################
1415	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1416	mov	%rax, %rdx
1417	 movq	$acc0, %xmm3		# offload a[3]
1418	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1419	 adcx	$acc1, $acc1		# acc1:6<<1
1420	adox	$t0, $acc5
1421	 adcx	$acc2, $acc2
1422	adox	$acc7, $acc6		# of=0
1423
1424	################################# a[i]*a[i]
1425	mulx	%rdx, $acc0, $t1
1426	movq	%xmm1, %rdx
1427	 adcx	$acc3, $acc3
1428	adox	$t1, $acc1
1429	 adcx	$acc4, $acc4
1430	mulx	%rdx, $t0, $t4
1431	movq	%xmm2, %rdx
1432	 adcx	$acc5, $acc5
1433	adox	$t0, $acc2
1434	 adcx	$acc6, $acc6
1435	mulx	%rdx, $t0, $t1
1436	.byte	0x67
1437	movq	%xmm3, %rdx
1438	adox	$t4, $acc3
1439	 adcx	$acc7, $acc7
1440	adox	$t0, $acc4
1441	adox	$t1, $acc5
1442	mulx	%rdx, $t0, $t4
1443	adox	$t0, $acc6
1444	adox	$t4, $acc7
1445
1446	################################# reduction
1447	mov	$acc0, %rdx
1448	mulx	8*4($a_ptr), %rdx, $t0
1449
1450	xor	%rax, %rax		# cf=0, of=0
1451	mulx	8*0($a_ptr), $t0, $t1
1452	adcx	$t0, $acc0		# guaranteed to be zero
1453	adox	$t1, $acc1
1454	mulx	8*1($a_ptr), $t0, $t1
1455	adcx	$t0, $acc1
1456	adox	$t1, $acc2
1457	mulx	8*2($a_ptr), $t0, $t1
1458	adcx	$t0, $acc2
1459	adox	$t1, $acc3
1460	mulx	8*3($a_ptr), $t0, $t1
1461	adcx	$t0, $acc3
1462	adox	$t1, $acc0		# of=0
1463	adcx	%rax, $acc0		# cf=0
1464
1465	#################################
1466	mov	$acc1, %rdx
1467	mulx	8*4($a_ptr), %rdx, $t0
1468
1469	mulx	8*0($a_ptr), $t0, $t1
1470	adox	$t0, $acc1		# guaranteed to be zero
1471	adcx	$t1, $acc2
1472	mulx	8*1($a_ptr), $t0, $t1
1473	adox	$t0, $acc2
1474	adcx	$t1, $acc3
1475	mulx	8*2($a_ptr), $t0, $t1
1476	adox	$t0, $acc3
1477	adcx	$t1, $acc0
1478	mulx	8*3($a_ptr), $t0, $t1
1479	adox	$t0, $acc0
1480	adcx	$t1, $acc1		# cf=0
1481	adox	%rax, $acc1		# of=0
1482
1483	#################################
1484	mov	$acc2, %rdx
1485	mulx	8*4($a_ptr), %rdx, $t0
1486
1487	mulx	8*0($a_ptr), $t0, $t1
1488	adcx	$t0, $acc2		# guaranteed to be zero
1489	adox	$t1, $acc3
1490	mulx	8*1($a_ptr), $t0, $t1
1491	adcx	$t0, $acc3
1492	adox	$t1, $acc0
1493	mulx	8*2($a_ptr), $t0, $t1
1494	adcx	$t0, $acc0
1495	adox	$t1, $acc1
1496	mulx	8*3($a_ptr), $t0, $t1
1497	adcx	$t0, $acc1
1498	adox	$t1, $acc2		# of=0
1499	adcx	%rax, $acc2		# cf=0
1500
1501	#################################
1502	mov	$acc3, %rdx
1503	mulx	8*4($a_ptr), %rdx, $t0
1504
1505	mulx	8*0($a_ptr), $t0, $t1
1506	adox	$t0, $acc3		# guaranteed to be zero
1507	adcx	$t1, $acc0
1508	mulx	8*1($a_ptr), $t0, $t1
1509	adox	$t0, $acc0
1510	adcx	$t1, $acc1
1511	mulx	8*2($a_ptr), $t0, $t1
1512	adox	$t0, $acc1
1513	adcx	$t1, $acc2
1514	mulx	8*3($a_ptr), $t0, $t1
1515	adox	$t0, $acc2
1516	adcx	$t1, $acc3
1517	adox	%rax, $acc3
1518
1519	################################# accumulate upper half
1520	add	$acc0, $acc4		# add	$acc4, $acc0
1521	adc	$acc5, $acc1
1522	 mov	$acc4, %rdx
1523	adc	$acc6, $acc2
1524	adc	$acc7, $acc3
1525	 mov	$acc1, $acc6
1526	adc	\$0, %rax
1527
1528	################################# compare to modulus
1529	sub	8*0($a_ptr), $acc4
1530	 mov	$acc2, $acc7
1531	sbb	8*1($a_ptr), $acc1
1532	sbb	8*2($a_ptr), $acc2
1533	 mov	$acc3, $acc0
1534	sbb	8*3($a_ptr), $acc3
1535	sbb	\$0, %rax
1536
1537	cmovnc	$acc4, %rdx
1538	cmovnc	$acc1, $acc6
1539	cmovnc	$acc2, $acc7
1540	cmovnc	$acc3, $acc0
1541
1542	dec	$b_ptr
1543	jnz	.Loop_ord_sqrx
1544
1545	mov	%rdx, 8*0($r_ptr)
1546	mov	$acc6, 8*1($r_ptr)
1547	pxor	%xmm1, %xmm1
1548	mov	$acc7, 8*2($r_ptr)
1549	pxor	%xmm2, %xmm2
1550	mov	$acc0, 8*3($r_ptr)
1551	pxor	%xmm3, %xmm3
1552
1553	mov	0(%rsp),%r15
1554.cfi_restore	%r15
1555	mov	8(%rsp),%r14
1556.cfi_restore	%r14
1557	mov	16(%rsp),%r13
1558.cfi_restore	%r13
1559	mov	24(%rsp),%r12
1560.cfi_restore	%r12
1561	mov	32(%rsp),%rbx
1562.cfi_restore	%rbx
1563	mov	40(%rsp),%rbp
1564.cfi_restore	%rbp
1565	lea	48(%rsp),%rsp
1566.cfi_adjust_cfa_offset	-48
1567.Lord_sqrx_epilogue:
1568	ret
1569.cfi_endproc
1570.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1571___
1572
1573$code.=<<___;
1574################################################################################
1575# void ecp_nistz256_to_mont(
1576#   uint64_t res[4],
1577#   uint64_t in[4]);
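#
# Conversion into the Montgomery domain: res = in * RR * 2^-256 mod p
# = in * 2^256 mod p, implemented by jumping into the generic Montgomery
# multiplication with b = .LRR (2^512 mod p).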
1578.globl	ecp_nistz256_to_mont
1579.type	ecp_nistz256_to_mont,\@function,2
1580.align	32
1581ecp_nistz256_to_mont:
1582___
1583$code.=<<___	if ($addx);
1584	mov	\$0x80100, %ecx
1585	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1586___
1587$code.=<<___;
1588	lea	.LRR(%rip), $b_org
1589	jmp	.Lmul_mont
1590.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1591
1592################################################################################
1593# void ecp_nistz256_mul_mont(
1594#   uint64_t res[4],
1595#   uint64_t a[4],
1596#   uint64_t b[4]);
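#
# res = a * b * 2^-256 mod p (Montgomery multiplication); dispatches to the
# MULX/ADX path when both the CPU and the toolchain support it.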
1597
1598.globl	ecp_nistz256_mul_mont
1599.type	ecp_nistz256_mul_mont,\@function,3
1600.align	32
1601ecp_nistz256_mul_mont:
1602.cfi_startproc
1603___
1604$code.=<<___	if ($addx);
1605	mov	\$0x80100, %ecx
1606	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1607___
1608$code.=<<___;
1609.Lmul_mont:
1610	push	%rbp
1611.cfi_push	%rbp
1612	push	%rbx
1613.cfi_push	%rbx
1614	push	%r12
1615.cfi_push	%r12
1616	push	%r13
1617.cfi_push	%r13
1618	push	%r14
1619.cfi_push	%r14
1620	push	%r15
1621.cfi_push	%r15
1622.Lmul_body:
1623___
1624$code.=<<___	if ($addx);
1625	cmp	\$0x80100, %ecx
1626	je	.Lmul_montx
1627___
1628$code.=<<___;
1629	mov	$b_org, $b_ptr
1630	mov	8*0($b_org), %rax
1631	mov	8*0($a_ptr), $acc1
1632	mov	8*1($a_ptr), $acc2
1633	mov	8*2($a_ptr), $acc3
1634	mov	8*3($a_ptr), $acc4
1635
1636	call	__ecp_nistz256_mul_montq
1637___
1638$code.=<<___	if ($addx);
1639	jmp	.Lmul_mont_done
1640
1641.align	32
1642.Lmul_montx:
1643	mov	$b_org, $b_ptr
1644	mov	8*0($b_org), %rdx
1645	mov	8*0($a_ptr), $acc1
1646	mov	8*1($a_ptr), $acc2
1647	mov	8*2($a_ptr), $acc3
1648	mov	8*3($a_ptr), $acc4
1649	lea	-128($a_ptr), $a_ptr	# control u-op density
1650
1651	call	__ecp_nistz256_mul_montx
1652___
1653$code.=<<___;
1654.Lmul_mont_done:
1655	mov	0(%rsp),%r15
1656.cfi_restore	%r15
1657	mov	8(%rsp),%r14
1658.cfi_restore	%r14
1659	mov	16(%rsp),%r13
1660.cfi_restore	%r13
1661	mov	24(%rsp),%r12
1662.cfi_restore	%r12
1663	mov	32(%rsp),%rbx
1664.cfi_restore	%rbx
1665	mov	40(%rsp),%rbp
1666.cfi_restore	%rbp
1667	lea	48(%rsp),%rsp
1668.cfi_adjust_cfa_offset	-48
1669.Lmul_epilogue:
1670	ret
1671.cfi_endproc
1672.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1673
1674.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1675.align	32
1676__ecp_nistz256_mul_montq:
1677	########################################################################
1678	# Multiply a by b[0]
1679	mov	%rax, $t1
1680	mulq	$acc1
1681	mov	.Lpoly+8*1(%rip),$poly1
1682	mov	%rax, $acc0
1683	mov	$t1, %rax
1684	mov	%rdx, $acc1
1685
1686	mulq	$acc2
1687	mov	.Lpoly+8*3(%rip),$poly3
1688	add	%rax, $acc1
1689	mov	$t1, %rax
1690	adc	\$0, %rdx
1691	mov	%rdx, $acc2
1692
1693	mulq	$acc3
1694	add	%rax, $acc2
1695	mov	$t1, %rax
1696	adc	\$0, %rdx
1697	mov	%rdx, $acc3
1698
1699	mulq	$acc4
1700	add	%rax, $acc3
1701	 mov	$acc0, %rax
1702	adc	\$0, %rdx
1703	xor	$acc5, $acc5
1704	mov	%rdx, $acc4
1705
1706	########################################################################
1707	# First reduction step
1708	# Basically now we want to multiply acc[0] by p256,
1709	# and add the result to the acc.
1710	# Due to the special form of p256 we do some optimizations
1711	#
1712	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1713	# then we add acc[0] and get acc[0] x 2^96
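	#
	# Concretely, with m = acc[0]:
	#   m * p = m*2^256 - m*2^224 + m*2^192 + m*2^96 - m
	# Adding m back cancels the -m term, so the p[0..1] columns contribute
	# exactly m<<96, which the shl/shr pair below splits across the two
	# 64-bit limbs it straddles; the p[3] part is handled by the mulq
	# against .Lpoly[3].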
1714
1715	mov	$acc0, $t1
1716	shl	\$32, $acc0
1717	mulq	$poly3
1718	shr	\$32, $t1
1719	add	$acc0, $acc1		# +=acc[0]<<96
1720	adc	$t1, $acc2
1721	adc	%rax, $acc3
1722	 mov	8*1($b_ptr), %rax
1723	adc	%rdx, $acc4
1724	adc	\$0, $acc5
1725	xor	$acc0, $acc0
1726
1727	########################################################################
1728	# Multiply by b[1]
1729	mov	%rax, $t1
1730	mulq	8*0($a_ptr)
1731	add	%rax, $acc1
1732	mov	$t1, %rax
1733	adc	\$0, %rdx
1734	mov	%rdx, $t0
1735
1736	mulq	8*1($a_ptr)
1737	add	$t0, $acc2
1738	adc	\$0, %rdx
1739	add	%rax, $acc2
1740	mov	$t1, %rax
1741	adc	\$0, %rdx
1742	mov	%rdx, $t0
1743
1744	mulq	8*2($a_ptr)
1745	add	$t0, $acc3
1746	adc	\$0, %rdx
1747	add	%rax, $acc3
1748	mov	$t1, %rax
1749	adc	\$0, %rdx
1750	mov	%rdx, $t0
1751
1752	mulq	8*3($a_ptr)
1753	add	$t0, $acc4
1754	adc	\$0, %rdx
1755	add	%rax, $acc4
1756	 mov	$acc1, %rax
1757	adc	%rdx, $acc5
1758	adc	\$0, $acc0
1759
1760	########################################################################
1761	# Second reduction step
1762	mov	$acc1, $t1
1763	shl	\$32, $acc1
1764	mulq	$poly3
1765	shr	\$32, $t1
1766	add	$acc1, $acc2
1767	adc	$t1, $acc3
1768	adc	%rax, $acc4
1769	 mov	8*2($b_ptr), %rax
1770	adc	%rdx, $acc5
1771	adc	\$0, $acc0
1772	xor	$acc1, $acc1
1773
1774	########################################################################
1775	# Multiply by b[2]
1776	mov	%rax, $t1
1777	mulq	8*0($a_ptr)
1778	add	%rax, $acc2
1779	mov	$t1, %rax
1780	adc	\$0, %rdx
1781	mov	%rdx, $t0
1782
1783	mulq	8*1($a_ptr)
1784	add	$t0, $acc3
1785	adc	\$0, %rdx
1786	add	%rax, $acc3
1787	mov	$t1, %rax
1788	adc	\$0, %rdx
1789	mov	%rdx, $t0
1790
1791	mulq	8*2($a_ptr)
1792	add	$t0, $acc4
1793	adc	\$0, %rdx
1794	add	%rax, $acc4
1795	mov	$t1, %rax
1796	adc	\$0, %rdx
1797	mov	%rdx, $t0
1798
1799	mulq	8*3($a_ptr)
1800	add	$t0, $acc5
1801	adc	\$0, %rdx
1802	add	%rax, $acc5
1803	 mov	$acc2, %rax
1804	adc	%rdx, $acc0
1805	adc	\$0, $acc1
1806
1807	########################################################################
1808	# Third reduction step
1809	mov	$acc2, $t1
1810	shl	\$32, $acc2
1811	mulq	$poly3
1812	shr	\$32, $t1
1813	add	$acc2, $acc3
1814	adc	$t1, $acc4
1815	adc	%rax, $acc5
1816	 mov	8*3($b_ptr), %rax
1817	adc	%rdx, $acc0
1818	adc	\$0, $acc1
1819	xor	$acc2, $acc2
1820
1821	########################################################################
1822	# Multiply by b[3]
1823	mov	%rax, $t1
1824	mulq	8*0($a_ptr)
1825	add	%rax, $acc3
1826	mov	$t1, %rax
1827	adc	\$0, %rdx
1828	mov	%rdx, $t0
1829
1830	mulq	8*1($a_ptr)
1831	add	$t0, $acc4
1832	adc	\$0, %rdx
1833	add	%rax, $acc4
1834	mov	$t1, %rax
1835	adc	\$0, %rdx
1836	mov	%rdx, $t0
1837
1838	mulq	8*2($a_ptr)
1839	add	$t0, $acc5
1840	adc	\$0, %rdx
1841	add	%rax, $acc5
1842	mov	$t1, %rax
1843	adc	\$0, %rdx
1844	mov	%rdx, $t0
1845
1846	mulq	8*3($a_ptr)
1847	add	$t0, $acc0
1848	adc	\$0, %rdx
1849	add	%rax, $acc0
1850	 mov	$acc3, %rax
1851	adc	%rdx, $acc1
1852	adc	\$0, $acc2
1853
1854	########################################################################
1855	# Final reduction step
1856	mov	$acc3, $t1
1857	shl	\$32, $acc3
1858	mulq	$poly3
1859	shr	\$32, $t1
1860	add	$acc3, $acc4
1861	adc	$t1, $acc5
1862	 mov	$acc4, $t0
1863	adc	%rax, $acc0
1864	adc	%rdx, $acc1
1865	 mov	$acc5, $t1
1866	adc	\$0, $acc2
1867
1868	########################################################################
1869	# Branch-less conditional subtraction of P
1870	sub	\$-1, $acc4		# .Lpoly[0]
1871	 mov	$acc0, $t2
1872	sbb	$poly1, $acc5		# .Lpoly[1]
1873	sbb	\$0, $acc0		# .Lpoly[2]
1874	 mov	$acc1, $t3
1875	sbb	$poly3, $acc1		# .Lpoly[3]
1876	sbb	\$0, $acc2
1877
1878	cmovc	$t0, $acc4
1879	cmovc	$t1, $acc5
1880	mov	$acc4, 8*0($r_ptr)
1881	cmovc	$t2, $acc0
1882	mov	$acc5, 8*1($r_ptr)
1883	cmovc	$t3, $acc1
1884	mov	$acc0, 8*2($r_ptr)
1885	mov	$acc1, 8*3($r_ptr)
1886
1887	ret
1888.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1889
1890################################################################################
1891# void ecp_nistz256_sqr_mont(
1892#   uint64_t res[4],
1893#   uint64_t a[4]);
1894
1895# we optimize the square according to S.Gueron and V.Krasnov,
1896# "Speeding up Big-Number Squaring"
1897.globl	ecp_nistz256_sqr_mont
1898.type	ecp_nistz256_sqr_mont,\@function,2
1899.align	32
1900ecp_nistz256_sqr_mont:
1901.cfi_startproc
1902___
1903$code.=<<___	if ($addx);
1904	mov	\$0x80100, %ecx
1905	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1906___
1907$code.=<<___;
1908	push	%rbp
1909.cfi_push	%rbp
1910	push	%rbx
1911.cfi_push	%rbx
1912	push	%r12
1913.cfi_push	%r12
1914	push	%r13
1915.cfi_push	%r13
1916	push	%r14
1917.cfi_push	%r14
1918	push	%r15
1919.cfi_push	%r15
1920.Lsqr_body:
1921___
1922$code.=<<___	if ($addx);
1923	cmp	\$0x80100, %ecx
1924	je	.Lsqr_montx
1925___
1926$code.=<<___;
1927	mov	8*0($a_ptr), %rax
1928	mov	8*1($a_ptr), $acc6
1929	mov	8*2($a_ptr), $acc7
1930	mov	8*3($a_ptr), $acc0
1931
1932	call	__ecp_nistz256_sqr_montq
1933___
1934$code.=<<___	if ($addx);
1935	jmp	.Lsqr_mont_done
1936
1937.align	32
1938.Lsqr_montx:
1939	mov	8*0($a_ptr), %rdx
1940	mov	8*1($a_ptr), $acc6
1941	mov	8*2($a_ptr), $acc7
1942	mov	8*3($a_ptr), $acc0
1943	lea	-128($a_ptr), $a_ptr	# control u-op density
1944
1945	call	__ecp_nistz256_sqr_montx
1946___
1947$code.=<<___;
1948.Lsqr_mont_done:
1949	mov	0(%rsp),%r15
1950.cfi_restore	%r15
1951	mov	8(%rsp),%r14
1952.cfi_restore	%r14
1953	mov	16(%rsp),%r13
1954.cfi_restore	%r13
1955	mov	24(%rsp),%r12
1956.cfi_restore	%r12
1957	mov	32(%rsp),%rbx
1958.cfi_restore	%rbx
1959	mov	40(%rsp),%rbp
1960.cfi_restore	%rbp
1961	lea	48(%rsp),%rsp
1962.cfi_adjust_cfa_offset	-48
1963.Lsqr_epilogue:
1964	ret
1965.cfi_endproc
1966.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1967
1968.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1969.align	32
1970__ecp_nistz256_sqr_montq:
1971	mov	%rax, $acc5
1972	mulq	$acc6			# a[1]*a[0]
1973	mov	%rax, $acc1
1974	mov	$acc7, %rax
1975	mov	%rdx, $acc2
1976
1977	mulq	$acc5			# a[0]*a[2]
1978	add	%rax, $acc2
1979	mov	$acc0, %rax
1980	adc	\$0, %rdx
1981	mov	%rdx, $acc3
1982
1983	mulq	$acc5			# a[0]*a[3]
1984	add	%rax, $acc3
1985	 mov	$acc7, %rax
1986	adc	\$0, %rdx
1987	mov	%rdx, $acc4
1988
1989	#################################
1990	mulq	$acc6			# a[1]*a[2]
1991	add	%rax, $acc3
1992	mov	$acc0, %rax
1993	adc	\$0, %rdx
1994	mov	%rdx, $t1
1995
1996	mulq	$acc6			# a[1]*a[3]
1997	add	%rax, $acc4
1998	 mov	$acc0, %rax
1999	adc	\$0, %rdx
2000	add	$t1, $acc4
2001	mov	%rdx, $acc5
2002	adc	\$0, $acc5
2003
2004	#################################
2005	mulq	$acc7			# a[2]*a[3]
2006	xor	$acc7, $acc7
2007	add	%rax, $acc5
2008	 mov	8*0($a_ptr), %rax
2009	mov	%rdx, $acc6
2010	adc	\$0, $acc6
2011
2012	add	$acc1, $acc1		# acc1:6<<1
2013	adc	$acc2, $acc2
2014	adc	$acc3, $acc3
2015	adc	$acc4, $acc4
2016	adc	$acc5, $acc5
2017	adc	$acc6, $acc6
2018	adc	\$0, $acc7
2019
2020	mulq	%rax
2021	mov	%rax, $acc0
2022	mov	8*1($a_ptr), %rax
2023	mov	%rdx, $t0
2024
2025	mulq	%rax
2026	add	$t0, $acc1
2027	adc	%rax, $acc2
2028	mov	8*2($a_ptr), %rax
2029	adc	\$0, %rdx
2030	mov	%rdx, $t0
2031
2032	mulq	%rax
2033	add	$t0, $acc3
2034	adc	%rax, $acc4
2035	mov	8*3($a_ptr), %rax
2036	adc	\$0, %rdx
2037	mov	%rdx, $t0
2038
2039	mulq	%rax
2040	add	$t0, $acc5
2041	adc	%rax, $acc6
2042	 mov	$acc0, %rax
2043	adc	%rdx, $acc7
2044
2045	mov	.Lpoly+8*1(%rip), $a_ptr
2046	mov	.Lpoly+8*3(%rip), $t1
2047
2048	##########################################
2049	# Now the reduction
2050	# First iteration
2051	mov	$acc0, $t0
2052	shl	\$32, $acc0
2053	mulq	$t1
2054	shr	\$32, $t0
2055	add	$acc0, $acc1		# +=acc[0]<<96
2056	adc	$t0, $acc2
2057	adc	%rax, $acc3
2058	 mov	$acc1, %rax
2059	adc	\$0, %rdx
2060
2061	##########################################
2062	# Second iteration
2063	mov	$acc1, $t0
2064	shl	\$32, $acc1
2065	mov	%rdx, $acc0
2066	mulq	$t1
2067	shr	\$32, $t0
2068	add	$acc1, $acc2
2069	adc	$t0, $acc3
2070	adc	%rax, $acc0
2071	 mov	$acc2, %rax
2072	adc	\$0, %rdx
2073
2074	##########################################
2075	# Third iteration
2076	mov	$acc2, $t0
2077	shl	\$32, $acc2
2078	mov	%rdx, $acc1
2079	mulq	$t1
2080	shr	\$32, $t0
2081	add	$acc2, $acc3
2082	adc	$t0, $acc0
2083	adc	%rax, $acc1
2084	 mov	$acc3, %rax
2085	adc	\$0, %rdx
2086
2087	###########################################
2088	# Last iteration
2089	mov	$acc3, $t0
2090	shl	\$32, $acc3
2091	mov	%rdx, $acc2
2092	mulq	$t1
2093	shr	\$32, $t0
2094	add	$acc3, $acc0
2095	adc	$t0, $acc1
2096	adc	%rax, $acc2
2097	adc	\$0, %rdx
2098	xor	$acc3, $acc3
2099
2100	############################################
2101	# Add the rest of the acc
2102	add	$acc0, $acc4
2103	adc	$acc1, $acc5
2104	 mov	$acc4, $acc0
2105	adc	$acc2, $acc6
2106	adc	%rdx, $acc7
2107	 mov	$acc5, $acc1
2108	adc	\$0, $acc3
2109
2110	sub	\$-1, $acc4		# .Lpoly[0]
2111	 mov	$acc6, $acc2
2112	sbb	$a_ptr, $acc5		# .Lpoly[1]
2113	sbb	\$0, $acc6		# .Lpoly[2]
2114	 mov	$acc7, $t0
2115	sbb	$t1, $acc7		# .Lpoly[3]
2116	sbb	\$0, $acc3
2117
2118	cmovc	$acc0, $acc4
2119	cmovc	$acc1, $acc5
2120	mov	$acc4, 8*0($r_ptr)
2121	cmovc	$acc2, $acc6
2122	mov	$acc5, 8*1($r_ptr)
2123	cmovc	$t0, $acc7
2124	mov	$acc6, 8*2($r_ptr)
2125	mov	$acc7, 8*3($r_ptr)
2126
2127	ret
2128.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2129___
2130
2131if ($addx) {
2132$code.=<<___;
2133.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
2134.align	32
2135__ecp_nistz256_mul_montx:
2136	########################################################################
2137	# Multiply by b[0]
2138	mulx	$acc1, $acc0, $acc1
2139	mulx	$acc2, $t0, $acc2
2140	mov	\$32, $poly1
2141	xor	$acc5, $acc5		# cf=0
2142	mulx	$acc3, $t1, $acc3
2143	mov	.Lpoly+8*3(%rip), $poly3
2144	adc	$t0, $acc1
2145	mulx	$acc4, $t0, $acc4
2146	 mov	$acc0, %rdx
2147	adc	$t1, $acc2
2148	 shlx	$poly1,$acc0,$t1
2149	adc	$t0, $acc3
2150	 shrx	$poly1,$acc0,$t0
2151	adc	\$0, $acc4
2152
2153	########################################################################
2154	# First reduction step
2155	add	$t1, $acc1
2156	adc	$t0, $acc2
2157
2158	mulx	$poly3, $t0, $t1
2159	 mov	8*1($b_ptr), %rdx
2160	adc	$t0, $acc3
2161	adc	$t1, $acc4
2162	adc	\$0, $acc5
2163	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
2164
2165	########################################################################
2166	# Multiply by b[1]
2167	mulx	8*0+128($a_ptr), $t0, $t1
2168	adcx	$t0, $acc1
2169	adox	$t1, $acc2
2170
2171	mulx	8*1+128($a_ptr), $t0, $t1
2172	adcx	$t0, $acc2
2173	adox	$t1, $acc3
2174
2175	mulx	8*2+128($a_ptr), $t0, $t1
2176	adcx	$t0, $acc3
2177	adox	$t1, $acc4
2178
2179	mulx	8*3+128($a_ptr), $t0, $t1
2180	 mov	$acc1, %rdx
2181	adcx	$t0, $acc4
2182	 shlx	$poly1, $acc1, $t0
2183	adox	$t1, $acc5
2184	 shrx	$poly1, $acc1, $t1
2185
2186	adcx	$acc0, $acc5
2187	adox	$acc0, $acc0
2188	adc	\$0, $acc0
2189
2190	########################################################################
2191	# Second reduction step
2192	add	$t0, $acc2
2193	adc	$t1, $acc3
2194
2195	mulx	$poly3, $t0, $t1
2196	 mov	8*2($b_ptr), %rdx
2197	adc	$t0, $acc4
2198	adc	$t1, $acc5
2199	adc	\$0, $acc0
2200	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
2201
2202	########################################################################
2203	# Multiply by b[2]
2204	mulx	8*0+128($a_ptr), $t0, $t1
2205	adcx	$t0, $acc2
2206	adox	$t1, $acc3
2207
2208	mulx	8*1+128($a_ptr), $t0, $t1
2209	adcx	$t0, $acc3
2210	adox	$t1, $acc4
2211
2212	mulx	8*2+128($a_ptr), $t0, $t1
2213	adcx	$t0, $acc4
2214	adox	$t1, $acc5
2215
2216	mulx	8*3+128($a_ptr), $t0, $t1
2217	 mov	$acc2, %rdx
2218	adcx	$t0, $acc5
2219	 shlx	$poly1, $acc2, $t0
2220	adox	$t1, $acc0
2221	 shrx	$poly1, $acc2, $t1
2222
2223	adcx	$acc1, $acc0
2224	adox	$acc1, $acc1
2225	adc	\$0, $acc1
2226
2227	########################################################################
2228	# Third reduction step
2229	add	$t0, $acc3
2230	adc	$t1, $acc4
2231
2232	mulx	$poly3, $t0, $t1
2233	 mov	8*3($b_ptr), %rdx
2234	adc	$t0, $acc5
2235	adc	$t1, $acc0
2236	adc	\$0, $acc1
2237	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
2238
2239	########################################################################
2240	# Multiply by b[3]
2241	mulx	8*0+128($a_ptr), $t0, $t1
2242	adcx	$t0, $acc3
2243	adox	$t1, $acc4
2244
2245	mulx	8*1+128($a_ptr), $t0, $t1
2246	adcx	$t0, $acc4
2247	adox	$t1, $acc5
2248
2249	mulx	8*2+128($a_ptr), $t0, $t1
2250	adcx	$t0, $acc5
2251	adox	$t1, $acc0
2252
2253	mulx	8*3+128($a_ptr), $t0, $t1
2254	 mov	$acc3, %rdx
2255	adcx	$t0, $acc0
2256	 shlx	$poly1, $acc3, $t0
2257	adox	$t1, $acc1
2258	 shrx	$poly1, $acc3, $t1
2259
2260	adcx	$acc2, $acc1
2261	adox	$acc2, $acc2
2262	adc	\$0, $acc2
2263
2264	########################################################################
2265	# Fourth reduction step
2266	add	$t0, $acc4
2267	adc	$t1, $acc5
2268
2269	mulx	$poly3, $t0, $t1
2270	 mov	$acc4, $t2
2271	mov	.Lpoly+8*1(%rip), $poly1
2272	adc	$t0, $acc0
2273	 mov	$acc5, $t3
2274	adc	$t1, $acc1
2275	adc	\$0, $acc2
2276
2277	########################################################################
2278	# Branch-less conditional subtraction of P
2279	xor	%eax, %eax
2280	 mov	$acc0, $t0
2281	sbb	\$-1, $acc4		# .Lpoly[0]
2282	sbb	$poly1, $acc5		# .Lpoly[1]
2283	sbb	\$0, $acc0		# .Lpoly[2]
2284	 mov	$acc1, $t1
2285	sbb	$poly3, $acc1		# .Lpoly[3]
2286	sbb	\$0, $acc2
2287
2288	cmovc	$t2, $acc4
2289	cmovc	$t3, $acc5
2290	mov	$acc4, 8*0($r_ptr)
2291	cmovc	$t0, $acc0
2292	mov	$acc5, 8*1($r_ptr)
2293	cmovc	$t1, $acc1
2294	mov	$acc0, 8*2($r_ptr)
2295	mov	$acc1, 8*3($r_ptr)
2296
2297	ret
2298.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2299
2300.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
2301.align	32
2302__ecp_nistz256_sqr_montx:
2303	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
2304	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
2305	xor	%eax, %eax
2306	adc	$t0, $acc2
2307	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
2308	 mov	$acc6, %rdx
2309	adc	$t1, $acc3
2310	adc	\$0, $acc4
2311	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
2312
2313	#################################
2314	mulx	$acc7, $t0, $t1		# a[1]*a[2]
2315	adcx	$t0, $acc3
2316	adox	$t1, $acc4
2317
2318	mulx	$acc0, $t0, $t1		# a[1]*a[3]
2319	 mov	$acc7, %rdx
2320	adcx	$t0, $acc4
2321	adox	$t1, $acc5
2322	adc	\$0, $acc5
2323
2324	#################################
2325	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
2326	 mov	8*0+128($a_ptr), %rdx
2327	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
2328	 adcx	$acc1, $acc1		# acc1:6<<1
2329	adox	$t0, $acc5
2330	 adcx	$acc2, $acc2
2331	adox	$acc7, $acc6		# of=0
2332
2333	mulx	%rdx, $acc0, $t1
2334	mov	8*1+128($a_ptr), %rdx
2335	 adcx	$acc3, $acc3
2336	adox	$t1, $acc1
2337	 adcx	$acc4, $acc4
2338	mulx	%rdx, $t0, $t4
2339	mov	8*2+128($a_ptr), %rdx
2340	 adcx	$acc5, $acc5
2341	adox	$t0, $acc2
2342	 adcx	$acc6, $acc6
2343	.byte	0x67
2344	mulx	%rdx, $t0, $t1
2345	mov	8*3+128($a_ptr), %rdx
2346	adox	$t4, $acc3
2347	 adcx	$acc7, $acc7
2348	adox	$t0, $acc4
2349	 mov	\$32, $a_ptr
2350	adox	$t1, $acc5
2351	.byte	0x67,0x67
2352	mulx	%rdx, $t0, $t4
2353	 mov	.Lpoly+8*3(%rip), %rdx
2354	adox	$t0, $acc6
2355	 shlx	$a_ptr, $acc0, $t0
2356	adox	$t4, $acc7
2357	 shrx	$a_ptr, $acc0, $t4
2358	mov	%rdx,$t1
2359
2360	# reduction step 1
2361	add	$t0, $acc1
2362	adc	$t4, $acc2
2363
2364	mulx	$acc0, $t0, $acc0
2365	adc	$t0, $acc3
2366	 shlx	$a_ptr, $acc1, $t0
2367	adc	\$0, $acc0
2368	 shrx	$a_ptr, $acc1, $t4
2369
2370	# reduction step 2
2371	add	$t0, $acc2
2372	adc	$t4, $acc3
2373
2374	mulx	$acc1, $t0, $acc1
2375	adc	$t0, $acc0
2376	 shlx	$a_ptr, $acc2, $t0
2377	adc	\$0, $acc1
2378	 shrx	$a_ptr, $acc2, $t4
2379
2380	# reduction step 3
2381	add	$t0, $acc3
2382	adc	$t4, $acc0
2383
2384	mulx	$acc2, $t0, $acc2
2385	adc	$t0, $acc1
2386	 shlx	$a_ptr, $acc3, $t0
2387	adc	\$0, $acc2
2388	 shrx	$a_ptr, $acc3, $t4
2389
2390	# reduction step 4
2391	add	$t0, $acc0
2392	adc	$t4, $acc1
2393
2394	mulx	$acc3, $t0, $acc3
2395	adc	$t0, $acc2
2396	adc	\$0, $acc3
2397
2398	xor	$t3, $t3
2399	add	$acc0, $acc4		# accumulate upper half
2400	 mov	.Lpoly+8*1(%rip), $a_ptr
2401	adc	$acc1, $acc5
2402	 mov	$acc4, $acc0
2403	adc	$acc2, $acc6
2404	adc	$acc3, $acc7
2405	 mov	$acc5, $acc1
2406	adc	\$0, $t3
2407
2408	sub	\$-1, $acc4		# .Lpoly[0]
2409	 mov	$acc6, $acc2
2410	sbb	$a_ptr, $acc5		# .Lpoly[1]
2411	sbb	\$0, $acc6		# .Lpoly[2]
2412	 mov	$acc7, $acc3
2413	sbb	$t1, $acc7		# .Lpoly[3]
2414	sbb	\$0, $t3
2415
2416	cmovc	$acc0, $acc4
2417	cmovc	$acc1, $acc5
2418	mov	$acc4, 8*0($r_ptr)
2419	cmovc	$acc2, $acc6
2420	mov	$acc5, 8*1($r_ptr)
2421	cmovc	$acc3, $acc7
2422	mov	$acc6, 8*2($r_ptr)
2423	mov	$acc7, 8*3($r_ptr)
2424
2425	ret
2426.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2427___
2428}
2429}
2430{
2431my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2432my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2433my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2434
2435$code.=<<___;
2436################################################################################
2437# void ecp_nistz256_from_mont(
2438#   uint64_t res[4],
2439#   uint64_t in[4]);
2440# This one performs Montgomery multiplication by 1, so we only need the reduction
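# Equivalently res = in * 2^-256 mod p: four word-wise reduction passes
# (one per input limb) followed by a single conditional subtraction of p.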
2441
2442.globl	ecp_nistz256_from_mont
2443.type	ecp_nistz256_from_mont,\@function,2
2444.align	32
2445ecp_nistz256_from_mont:
2446.cfi_startproc
2447	push	%r12
2448.cfi_push	%r12
2449	push	%r13
2450.cfi_push	%r13
2451.Lfrom_body:
2452
2453	mov	8*0($in_ptr), %rax
2454	mov	.Lpoly+8*3(%rip), $t2
2455	mov	8*1($in_ptr), $acc1
2456	mov	8*2($in_ptr), $acc2
2457	mov	8*3($in_ptr), $acc3
2458	mov	%rax, $acc0
2459	mov	.Lpoly+8*1(%rip), $t1
2460
2461	#########################################
2462	# First iteration
2463	mov	%rax, $t0
2464	shl	\$32, $acc0
2465	mulq	$t2
2466	shr	\$32, $t0
2467	add	$acc0, $acc1
2468	adc	$t0, $acc2
2469	adc	%rax, $acc3
2470	 mov	$acc1, %rax
2471	adc	\$0, %rdx
2472
2473	#########################################
2474	# Second iteration
2475	mov	$acc1, $t0
2476	shl	\$32, $acc1
2477	mov	%rdx, $acc0
2478	mulq	$t2
2479	shr	\$32, $t0
2480	add	$acc1, $acc2
2481	adc	$t0, $acc3
2482	adc	%rax, $acc0
2483	 mov	$acc2, %rax
2484	adc	\$0, %rdx
2485
2486	##########################################
2487	# Third iteration
2488	mov	$acc2, $t0
2489	shl	\$32, $acc2
2490	mov	%rdx, $acc1
2491	mulq	$t2
2492	shr	\$32, $t0
2493	add	$acc2, $acc3
2494	adc	$t0, $acc0
2495	adc	%rax, $acc1
2496	 mov	$acc3, %rax
2497	adc	\$0, %rdx
2498
2499	###########################################
2500	# Last iteration
2501	mov	$acc3, $t0
2502	shl	\$32, $acc3
2503	mov	%rdx, $acc2
2504	mulq	$t2
2505	shr	\$32, $t0
2506	add	$acc3, $acc0
2507	adc	$t0, $acc1
2508	 mov	$acc0, $t0
2509	adc	%rax, $acc2
2510	 mov	$acc1, $in_ptr
2511	adc	\$0, %rdx
2512
2513	###########################################
2514	# Branch-less conditional subtraction
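	# The prime is subtracted unconditionally; the final sbb of a
	# register with itself captures the borrow, and the cmov-s below
	# use it to keep either the reduced value or the original one,
	# so no secret-dependent branch is taken.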
2515	sub	\$-1, $acc0
2516	 mov	$acc2, %rax
2517	sbb	$t1, $acc1
2518	sbb	\$0, $acc2
2519	 mov	%rdx, $acc3
2520	sbb	$t2, %rdx
2521	sbb	$t2, $t2
2522
2523	cmovnz	$t0, $acc0
2524	cmovnz	$in_ptr, $acc1
2525	mov	$acc0, 8*0($r_ptr)
2526	cmovnz	%rax, $acc2
2527	mov	$acc1, 8*1($r_ptr)
2528	cmovz	%rdx, $acc3
2529	mov	$acc2, 8*2($r_ptr)
2530	mov	$acc3, 8*3($r_ptr)
2531
2532	mov	0(%rsp),%r13
2533.cfi_restore	%r13
2534	mov	8(%rsp),%r12
2535.cfi_restore	%r12
2536	lea	16(%rsp),%rsp
2537.cfi_adjust_cfa_offset	-16
2538.Lfrom_epilogue:
2539	ret
2540.cfi_endproc
2541.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2542___
2543}
2544{
2545my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2546my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2547my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2548my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2549
2550$code.=<<___;
2551################################################################################
2552# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
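#
# Each w5 table entry is a Jacobian point, 3x4 64-bit limbs = 96 bytes.
# The index is effectively 1-based: entry i is stored at byte offset
# 96*(i-1), computed below as (3*index-3)<<5.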
2553.globl	ecp_nistz256_scatter_w5
2554.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
2555.align	32
2556ecp_nistz256_scatter_w5:
2557	lea	-3($index,$index,2), $index
2558	movdqa	0x00($in_t), %xmm0
2559	shl	\$5, $index
2560	movdqa	0x10($in_t), %xmm1
2561	movdqa	0x20($in_t), %xmm2
2562	movdqa	0x30($in_t), %xmm3
2563	movdqa	0x40($in_t), %xmm4
2564	movdqa	0x50($in_t), %xmm5
2565	movdqa	%xmm0, 0x00($val,$index)
2566	movdqa	%xmm1, 0x10($val,$index)
2567	movdqa	%xmm2, 0x20($val,$index)
2568	movdqa	%xmm3, 0x30($val,$index)
2569	movdqa	%xmm4, 0x40($val,$index)
2570	movdqa	%xmm5, 0x50($val,$index)
2571
2572	ret
2573.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2574
2575################################################################################
2576# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
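#
# The gather reads and masks all 16 table entries, so the memory access
# pattern does not depend on the (secret) index.  Roughly equivalent to
# the following illustrative C-style sketch (names are ours):
#
#	uint64_t val[12] = {0};
#	for (i = 1; i <= 16; i++) {
#		uint64_t mask = (i == index) ? ~0 : 0; /* via pcmpeqd, no branch */
#		for (j = 0; j < 12; j++)
#			val[j] |= in_t[(i - 1)*12 + j] & mask;
#	}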
2577.globl	ecp_nistz256_gather_w5
2578.type	ecp_nistz256_gather_w5,\@abi-omnipotent
2579.align	32
2580ecp_nistz256_gather_w5:
2581___
2582$code.=<<___	if ($avx>1);
2583	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2584	test	\$`1<<5`, %eax
2585	jnz	.Lavx2_gather_w5
2586___
2587$code.=<<___	if ($win64);
2588	lea	-0x88(%rsp), %rax
2589.LSEH_begin_ecp_nistz256_gather_w5:
2590	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2591	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2592	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2593	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2594	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2595	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2596	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2597	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2598	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2599	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2600	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2601___
2602$code.=<<___;
2603	movdqa	.LOne(%rip), $ONE
2604	movd	$index, $INDEX
2605
2606	pxor	$Ra, $Ra
2607	pxor	$Rb, $Rb
2608	pxor	$Rc, $Rc
2609	pxor	$Rd, $Rd
2610	pxor	$Re, $Re
2611	pxor	$Rf, $Rf
2612
2613	movdqa	$ONE, $M0
2614	pshufd	\$0, $INDEX, $INDEX
2615
2616	mov	\$16, %rax
2617.Lselect_loop_sse_w5:
2618
2619	movdqa	$M0, $TMP0
2620	paddd	$ONE, $M0
2621	pcmpeqd $INDEX, $TMP0
2622
2623	movdqa	16*0($in_t), $T0a
2624	movdqa	16*1($in_t), $T0b
2625	movdqa	16*2($in_t), $T0c
2626	movdqa	16*3($in_t), $T0d
2627	movdqa	16*4($in_t), $T0e
2628	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t
2630
2631	pand	$TMP0, $T0a
2632	pand	$TMP0, $T0b
2633	por	$T0a, $Ra
2634	pand	$TMP0, $T0c
2635	por	$T0b, $Rb
2636	pand	$TMP0, $T0d
2637	por	$T0c, $Rc
2638	pand	$TMP0, $T0e
2639	por	$T0d, $Rd
2640	pand	$TMP0, $T0f
2641	por	$T0e, $Re
2642	por	$T0f, $Rf
2643
2644	dec	%rax
2645	jnz	.Lselect_loop_sse_w5
2646
2647	movdqu	$Ra, 16*0($val)
2648	movdqu	$Rb, 16*1($val)
2649	movdqu	$Rc, 16*2($val)
2650	movdqu	$Rd, 16*3($val)
2651	movdqu	$Re, 16*4($val)
2652	movdqu	$Rf, 16*5($val)
2653___
2654$code.=<<___	if ($win64);
2655	movaps	(%rsp), %xmm6
2656	movaps	0x10(%rsp), %xmm7
2657	movaps	0x20(%rsp), %xmm8
2658	movaps	0x30(%rsp), %xmm9
2659	movaps	0x40(%rsp), %xmm10
2660	movaps	0x50(%rsp), %xmm11
2661	movaps	0x60(%rsp), %xmm12
2662	movaps	0x70(%rsp), %xmm13
2663	movaps	0x80(%rsp), %xmm14
2664	movaps	0x90(%rsp), %xmm15
2665	lea	0xa8(%rsp), %rsp
2666___
2667$code.=<<___;
2668	ret
2669.LSEH_end_ecp_nistz256_gather_w5:
2670.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2671
2672################################################################################
2673# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
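#
# w7 table entries are affine points, 2x4 64-bit limbs = 64 bytes each;
# the index is used directly, i.e. entry i goes to byte offset 64*i.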
2674.globl	ecp_nistz256_scatter_w7
2675.type	ecp_nistz256_scatter_w7,\@abi-omnipotent
2676.align	32
2677ecp_nistz256_scatter_w7:
2678	movdqu	0x00($in_t), %xmm0
2679	shl	\$6, $index
2680	movdqu	0x10($in_t), %xmm1
2681	movdqu	0x20($in_t), %xmm2
2682	movdqu	0x30($in_t), %xmm3
2683	movdqa	%xmm0, 0x00($val,$index)
2684	movdqa	%xmm1, 0x10($val,$index)
2685	movdqa	%xmm2, 0x20($val,$index)
2686	movdqa	%xmm3, 0x30($val,$index)
2687
2688	ret
2689.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2690
2691################################################################################
2692# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
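#
# Same constant-time pattern as gather_w5 above, but over 64 affine
# entries of 64 bytes each, accumulated in four xmm registers.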
2693.globl	ecp_nistz256_gather_w7
2694.type	ecp_nistz256_gather_w7,\@abi-omnipotent
2695.align	32
2696ecp_nistz256_gather_w7:
2697___
2698$code.=<<___	if ($avx>1);
2699	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2700	test	\$`1<<5`, %eax
2701	jnz	.Lavx2_gather_w7
2702___
2703$code.=<<___	if ($win64);
2704	lea	-0x88(%rsp), %rax
2705.LSEH_begin_ecp_nistz256_gather_w7:
2706	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2707	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2708	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2709	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2710	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2711	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2712	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2713	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2714	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2715	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2716	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2717___
2718$code.=<<___;
2719	movdqa	.LOne(%rip), $M0
2720	movd	$index, $INDEX
2721
2722	pxor	$Ra, $Ra
2723	pxor	$Rb, $Rb
2724	pxor	$Rc, $Rc
2725	pxor	$Rd, $Rd
2726
2727	movdqa	$M0, $ONE
2728	pshufd	\$0, $INDEX, $INDEX
2729	mov	\$64, %rax
2730
2731.Lselect_loop_sse_w7:
2732	movdqa	$M0, $TMP0
2733	paddd	$ONE, $M0
2734	movdqa	16*0($in_t), $T0a
2735	movdqa	16*1($in_t), $T0b
2736	pcmpeqd	$INDEX, $TMP0
2737	movdqa	16*2($in_t), $T0c
2738	movdqa	16*3($in_t), $T0d
2739	lea	16*4($in_t), $in_t
2740
2741	pand	$TMP0, $T0a
2742	pand	$TMP0, $T0b
2743	por	$T0a, $Ra
2744	pand	$TMP0, $T0c
2745	por	$T0b, $Rb
2746	pand	$TMP0, $T0d
2747	por	$T0c, $Rc
2748	prefetcht0	255($in_t)
2749	por	$T0d, $Rd
2750
2751	dec	%rax
2752	jnz	.Lselect_loop_sse_w7
2753
2754	movdqu	$Ra, 16*0($val)
2755	movdqu	$Rb, 16*1($val)
2756	movdqu	$Rc, 16*2($val)
2757	movdqu	$Rd, 16*3($val)
2758___
2759$code.=<<___	if ($win64);
2760	movaps	(%rsp), %xmm6
2761	movaps	0x10(%rsp), %xmm7
2762	movaps	0x20(%rsp), %xmm8
2763	movaps	0x30(%rsp), %xmm9
2764	movaps	0x40(%rsp), %xmm10
2765	movaps	0x50(%rsp), %xmm11
2766	movaps	0x60(%rsp), %xmm12
2767	movaps	0x70(%rsp), %xmm13
2768	movaps	0x80(%rsp), %xmm14
2769	movaps	0x90(%rsp), %xmm15
2770	lea	0xa8(%rsp), %rsp
2771___
2772$code.=<<___;
2773	ret
2774.LSEH_end_ecp_nistz256_gather_w7:
2775.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2776___
2777}
2778if ($avx>1) {
2779my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2780my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2781my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2782my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2783
2784$code.=<<___;
2785################################################################################
2786# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
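#
# AVX2 variant of the w5 gather: two table entries are examined per
# iteration (masks M0 and M1 track a pair of candidate indices), so 8
# iterations cover all 16 entries in constant time.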
2787.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2788.align	32
2789ecp_nistz256_avx2_gather_w5:
2790.Lavx2_gather_w5:
2791	vzeroupper
2792___
2793$code.=<<___	if ($win64);
2794	lea	-0x88(%rsp), %rax
2795	mov	%rsp,%r11
2796.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2797	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2798	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2799	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2801	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2802	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2803	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2804	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2805	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2806	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2807	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2808___
2809$code.=<<___;
2810	vmovdqa	.LTwo(%rip), $TWO
2811
2812	vpxor	$Ra, $Ra, $Ra
2813	vpxor	$Rb, $Rb, $Rb
2814	vpxor	$Rc, $Rc, $Rc
2815
2816	vmovdqa .LOne(%rip), $M0
2817	vmovdqa .LTwo(%rip), $M1
2818
2819	vmovd	$index, %xmm1
2820	vpermd	$INDEX, $Ra, $INDEX
2821
2822	mov	\$8, %rax
2823.Lselect_loop_avx2_w5:
2824
2825	vmovdqa	32*0($in_t), $T0a
2826	vmovdqa	32*1($in_t), $T0b
2827	vmovdqa	32*2($in_t), $T0c
2828
2829	vmovdqa	32*3($in_t), $T1a
2830	vmovdqa	32*4($in_t), $T1b
2831	vmovdqa	32*5($in_t), $T1c
2832
2833	vpcmpeqd	$INDEX, $M0, $TMP0
2834	vpcmpeqd	$INDEX, $M1, $TMP1
2835
2836	vpaddd	$TWO, $M0, $M0
2837	vpaddd	$TWO, $M1, $M1
2838	lea	32*6($in_t), $in_t
2839
2840	vpand	$TMP0, $T0a, $T0a
2841	vpand	$TMP0, $T0b, $T0b
2842	vpand	$TMP0, $T0c, $T0c
2843	vpand	$TMP1, $T1a, $T1a
2844	vpand	$TMP1, $T1b, $T1b
2845	vpand	$TMP1, $T1c, $T1c
2846
2847	vpxor	$T0a, $Ra, $Ra
2848	vpxor	$T0b, $Rb, $Rb
2849	vpxor	$T0c, $Rc, $Rc
2850	vpxor	$T1a, $Ra, $Ra
2851	vpxor	$T1b, $Rb, $Rb
2852	vpxor	$T1c, $Rc, $Rc
2853
2854	dec %rax
2855	jnz .Lselect_loop_avx2_w5
2856
2857	vmovdqu $Ra, 32*0($val)
2858	vmovdqu $Rb, 32*1($val)
2859	vmovdqu $Rc, 32*2($val)
2860	vzeroupper
2861___
2862$code.=<<___	if ($win64);
2863	movaps	(%rsp), %xmm6
2864	movaps	0x10(%rsp), %xmm7
2865	movaps	0x20(%rsp), %xmm8
2866	movaps	0x30(%rsp), %xmm9
2867	movaps	0x40(%rsp), %xmm10
2868	movaps	0x50(%rsp), %xmm11
2869	movaps	0x60(%rsp), %xmm12
2870	movaps	0x70(%rsp), %xmm13
2871	movaps	0x80(%rsp), %xmm14
2872	movaps	0x90(%rsp), %xmm15
2873	lea	(%r11), %rsp
2874___
2875$code.=<<___;
2876	ret
2877.LSEH_end_ecp_nistz256_avx2_gather_w5:
2878.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2879___
2880}
2881if ($avx>1) {
2882my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2883my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2884my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2885my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2886my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2887
2888$code.=<<___;
2889
2890################################################################################
2891# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2892.globl	ecp_nistz256_avx2_gather_w7
2893.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2894.align	32
2895ecp_nistz256_avx2_gather_w7:
2896.Lavx2_gather_w7:
2897	vzeroupper
2898___
2899$code.=<<___	if ($win64);
2900	mov	%rsp,%r11
2901	lea	-0x88(%rsp), %rax
2902.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2903	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2904	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2905	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2907	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2908	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2909	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2910	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2911	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2912	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2913	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2914___
2915$code.=<<___;
2916	vmovdqa	.LThree(%rip), $THREE
2917
2918	vpxor	$Ra, $Ra, $Ra
2919	vpxor	$Rb, $Rb, $Rb
2920
2921	vmovdqa .LOne(%rip), $M0
2922	vmovdqa .LTwo(%rip), $M1
2923	vmovdqa .LThree(%rip), $M2
2924
2925	vmovd	$index, %xmm1
2926	vpermd	$INDEX, $Ra, $INDEX
2927	# Skip index = 0, because it is implicitly the point at infinity
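	# Three table entries are examined per iteration, so 21 iterations
	# plus the single tail entry below cover all 64 entries in constant
	# time.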
2928
2929	mov	\$21, %rax
2930.Lselect_loop_avx2_w7:
2931
2932	vmovdqa	32*0($in_t), $T0a
2933	vmovdqa	32*1($in_t), $T0b
2934
2935	vmovdqa	32*2($in_t), $T1a
2936	vmovdqa	32*3($in_t), $T1b
2937
2938	vmovdqa	32*4($in_t), $T2a
2939	vmovdqa	32*5($in_t), $T2b
2940
2941	vpcmpeqd	$INDEX, $M0, $TMP0
2942	vpcmpeqd	$INDEX, $M1, $TMP1
2943	vpcmpeqd	$INDEX, $M2, $TMP2
2944
2945	vpaddd	$THREE, $M0, $M0
2946	vpaddd	$THREE, $M1, $M1
2947	vpaddd	$THREE, $M2, $M2
2948	lea	32*6($in_t), $in_t
2949
2950	vpand	$TMP0, $T0a, $T0a
2951	vpand	$TMP0, $T0b, $T0b
2952	vpand	$TMP1, $T1a, $T1a
2953	vpand	$TMP1, $T1b, $T1b
2954	vpand	$TMP2, $T2a, $T2a
2955	vpand	$TMP2, $T2b, $T2b
2956
2957	vpxor	$T0a, $Ra, $Ra
2958	vpxor	$T0b, $Rb, $Rb
2959	vpxor	$T1a, $Ra, $Ra
2960	vpxor	$T1b, $Rb, $Rb
2961	vpxor	$T2a, $Ra, $Ra
2962	vpxor	$T2b, $Rb, $Rb
2963
2964	dec %rax
2965	jnz .Lselect_loop_avx2_w7
2966
2967
2968	vmovdqa	32*0($in_t), $T0a
2969	vmovdqa	32*1($in_t), $T0b
2970
2971	vpcmpeqd	$INDEX, $M0, $TMP0
2972
2973	vpand	$TMP0, $T0a, $T0a
2974	vpand	$TMP0, $T0b, $T0b
2975
2976	vpxor	$T0a, $Ra, $Ra
2977	vpxor	$T0b, $Rb, $Rb
2978
2979	vmovdqu $Ra, 32*0($val)
2980	vmovdqu $Rb, 32*1($val)
2981	vzeroupper
2982___
2983$code.=<<___	if ($win64);
2984	movaps	(%rsp), %xmm6
2985	movaps	0x10(%rsp), %xmm7
2986	movaps	0x20(%rsp), %xmm8
2987	movaps	0x30(%rsp), %xmm9
2988	movaps	0x40(%rsp), %xmm10
2989	movaps	0x50(%rsp), %xmm11
2990	movaps	0x60(%rsp), %xmm12
2991	movaps	0x70(%rsp), %xmm13
2992	movaps	0x80(%rsp), %xmm14
2993	movaps	0x90(%rsp), %xmm15
2994	lea	(%r11), %rsp
2995___
2996$code.=<<___;
2997	ret
2998.LSEH_end_ecp_nistz256_avx2_gather_w7:
2999.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3000___
3001} else {
3002$code.=<<___;
3003.globl	ecp_nistz256_avx2_gather_w7
3004.type	ecp_nistz256_avx2_gather_w7,\@function,3
3005.align	32
3006ecp_nistz256_avx2_gather_w7:
3007	.byte	0x0f,0x0b	# ud2
3008	ret
3009.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3010___
3011}
3012{{{
3013########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow the
# out-of-order execution logic to overlap computations from the next
# step with tail processing from the current step. By using a tailored
# calling sequence we minimize inter-step overhead and give the
# processor a better shot at overlapping operations...
#
# You will notice that the input data is copied to the stack. Trouble
# is that there are no registers to spare for holding the original
# pointers, and reloading the pointers would create undesired
# dependencies on the effective-address calculation paths. In other
# words, it is done this way to favour the out-of-order execution logic.
3026#						<appro@openssl.org>
3027
3028my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3029my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3030my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3031my ($poly1,$poly3)=($acc6,$acc7);
3032
3033sub load_for_mul () {
3034my ($a,$b,$src0) = @_;
3035my $bias = $src0 eq "%rax" ? 0 : -128;
3036
3037"	mov	$b, $src0
3038	lea	$b, $b_ptr
3039	mov	8*0+$a, $acc1
3040	mov	8*1+$a, $acc2
3041	lea	$bias+$a, $a_ptr
3042	mov	8*2+$a, $acc3
3043	mov	8*3+$a, $acc4"
3044}
3045
3046sub load_for_sqr () {
3047my ($a,$src0) = @_;
3048my $bias = $src0 eq "%rax" ? 0 : -128;
3049
3050"	mov	8*0+$a, $src0
3051	mov	8*1+$a, $acc6
3052	lea	$bias+$a, $a_ptr
3053	mov	8*2+$a, $acc7
3054	mov	8*3+$a, $acc0"
3055}
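
# load_for_mul/load_for_sqr emit the register loads expected by the
# __ecp_nistz256_mul_mont*/__ecp_nistz256_sqr_mont* subroutines.  The
# -128 bias applied to $a_ptr on the mulx ("x") paths is compensated by
# the +128 displacements those code paths use when re-reading the
# operand from memory.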
3056
3057									{
3058########################################################################
3059# operate in 4-5-0-1 "name space" that matches multiplication output
3060#
3061my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3062
3063$code.=<<___;
3064.type	__ecp_nistz256_add_toq,\@abi-omnipotent
3065.align	32
3066__ecp_nistz256_add_toq:
3067	xor	$t4,$t4
3068	add	8*0($b_ptr), $a0
3069	adc	8*1($b_ptr), $a1
3070	 mov	$a0, $t0
3071	adc	8*2($b_ptr), $a2
3072	adc	8*3($b_ptr), $a3
3073	 mov	$a1, $t1
3074	adc	\$0, $t4
3075
3076	sub	\$-1, $a0
3077	 mov	$a2, $t2
3078	sbb	$poly1, $a1
3079	sbb	\$0, $a2
3080	 mov	$a3, $t3
3081	sbb	$poly3, $a3
3082	sbb	\$0, $t4
3083
3084	cmovc	$t0, $a0
3085	cmovc	$t1, $a1
3086	mov	$a0, 8*0($r_ptr)
3087	cmovc	$t2, $a2
3088	mov	$a1, 8*1($r_ptr)
3089	cmovc	$t3, $a3
3090	mov	$a2, 8*2($r_ptr)
3091	mov	$a3, 8*3($r_ptr)
3092
3093	ret
3094.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3095
3096.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
3097.align	32
3098__ecp_nistz256_sub_fromq:
3099	sub	8*0($b_ptr), $a0
3100	sbb	8*1($b_ptr), $a1
3101	 mov	$a0, $t0
3102	sbb	8*2($b_ptr), $a2
3103	sbb	8*3($b_ptr), $a3
3104	 mov	$a1, $t1
3105	sbb	$t4, $t4
3106
3107	add	\$-1, $a0
3108	 mov	$a2, $t2
3109	adc	$poly1, $a1
3110	adc	\$0, $a2
3111	 mov	$a3, $t3
3112	adc	$poly3, $a3
3113	test	$t4, $t4
3114
3115	cmovz	$t0, $a0
3116	cmovz	$t1, $a1
3117	mov	$a0, 8*0($r_ptr)
3118	cmovz	$t2, $a2
3119	mov	$a1, 8*1($r_ptr)
3120	cmovz	$t3, $a3
3121	mov	$a2, 8*2($r_ptr)
3122	mov	$a3, 8*3($r_ptr)
3123
3124	ret
3125.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3126
3127.type	__ecp_nistz256_subq,\@abi-omnipotent
3128.align	32
3129__ecp_nistz256_subq:
3130	sub	$a0, $t0
3131	sbb	$a1, $t1
3132	 mov	$t0, $a0
3133	sbb	$a2, $t2
3134	sbb	$a3, $t3
3135	 mov	$t1, $a1
3136	sbb	$t4, $t4
3137
3138	add	\$-1, $t0
3139	 mov	$t2, $a2
3140	adc	$poly1, $t1
3141	adc	\$0, $t2
3142	 mov	$t3, $a3
3143	adc	$poly3, $t3
3144	test	$t4, $t4
3145
3146	cmovnz	$t0, $a0
3147	cmovnz	$t1, $a1
3148	cmovnz	$t2, $a2
3149	cmovnz	$t3, $a3
3150
3151	ret
3152.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
3153
3154.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
3155.align	32
3156__ecp_nistz256_mul_by_2q:
3157	xor	$t4, $t4
3158	add	$a0, $a0		# a0:a3+a0:a3
3159	adc	$a1, $a1
3160	 mov	$a0, $t0
3161	adc	$a2, $a2
3162	adc	$a3, $a3
3163	 mov	$a1, $t1
3164	adc	\$0, $t4
3165
3166	sub	\$-1, $a0
3167	 mov	$a2, $t2
3168	sbb	$poly1, $a1
3169	sbb	\$0, $a2
3170	 mov	$a3, $t3
3171	sbb	$poly3, $a3
3172	sbb	\$0, $t4
3173
3174	cmovc	$t0, $a0
3175	cmovc	$t1, $a1
3176	mov	$a0, 8*0($r_ptr)
3177	cmovc	$t2, $a2
3178	mov	$a1, 8*1($r_ptr)
3179	cmovc	$t3, $a3
3180	mov	$a2, 8*2($r_ptr)
3181	mov	$a3, 8*3($r_ptr)
3182
3183	ret
3184.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3185___
3186									}
3187sub gen_double () {
3188    my $x = shift;
3189    my ($src0,$sfx,$bias);
3190    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
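    # Stack frame: five 32-byte slots at offsets 0,32,...,128 holding S,
    # M, Zsqr, a copy of the input x coordinate and a temporary.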
3191
3192    if ($x ne "x") {
3193	$src0 = "%rax";
3194	$sfx  = "";
3195	$bias = 0;
3196
3197$code.=<<___;
3198.globl	ecp_nistz256_point_double
3199.type	ecp_nistz256_point_double,\@function,2
3200.align	32
3201ecp_nistz256_point_double:
3202.cfi_startproc
3203___
3204$code.=<<___	if ($addx);
3205	mov	\$0x80100, %ecx
3206	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3207	cmp	\$0x80100, %ecx
3208	je	.Lpoint_doublex
3209___
3210    } else {
3211	$src0 = "%rdx";
3212	$sfx  = "x";
3213	$bias = 128;
3214
3215$code.=<<___;
3216.type	ecp_nistz256_point_doublex,\@function,2
3217.align	32
3218ecp_nistz256_point_doublex:
3219.cfi_startproc
3220.Lpoint_doublex:
3221___
3222    }
3223$code.=<<___;
3224	push	%rbp
3225.cfi_push	%rbp
3226	push	%rbx
3227.cfi_push	%rbx
3228	push	%r12
3229.cfi_push	%r12
3230	push	%r13
3231.cfi_push	%r13
3232	push	%r14
3233.cfi_push	%r14
3234	push	%r15
3235.cfi_push	%r15
3236	sub	\$32*5+8, %rsp
3237.cfi_adjust_cfa_offset	32*5+8
3238.Lpoint_double${x}_body:
3239
3240.Lpoint_double_shortcut$x:
3241	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
3242	mov	$a_ptr, $b_ptr			# backup copy
3243	movdqu	0x10($a_ptr), %xmm1
3244	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
3245	 mov	0x20+8*1($a_ptr), $acc5
3246	 mov	0x20+8*2($a_ptr), $acc0
3247	 mov	0x20+8*3($a_ptr), $acc1
3248	 mov	.Lpoly+8*1(%rip), $poly1
3249	 mov	.Lpoly+8*3(%rip), $poly3
3250	movdqa	%xmm0, $in_x(%rsp)
3251	movdqa	%xmm1, $in_x+0x10(%rsp)
3252	lea	0x20($r_ptr), $acc2
3253	lea	0x40($r_ptr), $acc3
3254	movq	$r_ptr, %xmm0
3255	movq	$acc2, %xmm1
3256	movq	$acc3, %xmm2
3257
3258	lea	$S(%rsp), $r_ptr
3259	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
3260
3261	mov	0x40+8*0($a_ptr), $src0
3262	mov	0x40+8*1($a_ptr), $acc6
3263	mov	0x40+8*2($a_ptr), $acc7
3264	mov	0x40+8*3($a_ptr), $acc0
3265	lea	0x40-$bias($a_ptr), $a_ptr
3266	lea	$Zsqr(%rsp), $r_ptr
3267	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
3268
3269	`&load_for_sqr("$S(%rsp)", "$src0")`
3270	lea	$S(%rsp), $r_ptr
3271	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
3272
3273	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
3274	mov	0x40+8*0($b_ptr), $acc1
3275	mov	0x40+8*1($b_ptr), $acc2
3276	mov	0x40+8*2($b_ptr), $acc3
3277	mov	0x40+8*3($b_ptr), $acc4
3278	lea	0x40-$bias($b_ptr), $a_ptr
3279	lea	0x20($b_ptr), $b_ptr
3280	movq	%xmm2, $r_ptr
3281	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
3282	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
3283
3284	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3285	mov	$in_x+8*1(%rsp), $acc5
3286	lea	$Zsqr(%rsp), $b_ptr
3287	mov	$in_x+8*2(%rsp), $acc0
3288	mov	$in_x+8*3(%rsp), $acc1
3289	lea	$M(%rsp), $r_ptr
3290	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
3291
3292	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3293	mov	$in_x+8*1(%rsp), $acc5
3294	lea	$Zsqr(%rsp), $b_ptr
3295	mov	$in_x+8*2(%rsp), $acc0
3296	mov	$in_x+8*3(%rsp), $acc1
3297	lea	$Zsqr(%rsp), $r_ptr
3298	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
3299
3300	`&load_for_sqr("$S(%rsp)", "$src0")`
3301	movq	%xmm1, $r_ptr
3302	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
3303___
3304{
3305######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3306# operate in 4-5-6-7 "name space" that matches squaring output
3307#
3308my ($poly1,$poly3)=($a_ptr,$t1);
3309my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
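
# Division by 2 modulo p: x+p is computed unconditionally, bit 0 of the
# original value selects (via cmovz) whichever of x and x+p is even,
# and that value is then shifted right by one bit across the four
# limbs, with the saved carry becoming the new top bit.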
3310
3311$code.=<<___;
3312	xor	$t4, $t4
3313	mov	$a0, $t0
3314	add	\$-1, $a0
3315	mov	$a1, $t1
3316	adc	$poly1, $a1
3317	mov	$a2, $t2
3318	adc	\$0, $a2
3319	mov	$a3, $t3
3320	adc	$poly3, $a3
3321	adc	\$0, $t4
3322	xor	$a_ptr, $a_ptr		# borrow $a_ptr
3323	test	\$1, $t0
3324
3325	cmovz	$t0, $a0
3326	cmovz	$t1, $a1
3327	cmovz	$t2, $a2
3328	cmovz	$t3, $a3
3329	cmovz	$a_ptr, $t4
3330
3331	mov	$a1, $t0		# a0:a3>>1
3332	shr	\$1, $a0
3333	shl	\$63, $t0
3334	mov	$a2, $t1
3335	shr	\$1, $a1
3336	or	$t0, $a0
3337	shl	\$63, $t1
3338	mov	$a3, $t2
3339	shr	\$1, $a2
3340	or	$t1, $a1
3341	shl	\$63, $t2
3342	mov	$a0, 8*0($r_ptr)
3343	shr	\$1, $a3
3344	mov	$a1, 8*1($r_ptr)
3345	shl	\$63, $t4
3346	or	$t2, $a2
3347	or	$t4, $a3
3348	mov	$a2, 8*2($r_ptr)
3349	mov	$a3, 8*3($r_ptr)
3350___
3351}
3352$code.=<<___;
3353	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3354	lea	$M(%rsp), $r_ptr
3355	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
3356
3357	lea	$tmp0(%rsp), $r_ptr
3358	call	__ecp_nistz256_mul_by_2$x
3359
3360	lea	$M(%rsp), $b_ptr
3361	lea	$M(%rsp), $r_ptr
3362	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
3363
3364	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3365	lea	$S(%rsp), $r_ptr
3366	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
3367
3368	lea	$tmp0(%rsp), $r_ptr
3369	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
3370
3371	`&load_for_sqr("$M(%rsp)", "$src0")`
3372	movq	%xmm0, $r_ptr
3373	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
3374
3375	lea	$tmp0(%rsp), $b_ptr
3376	mov	$acc6, $acc0			# harmonize sqr output and sub input
3377	mov	$acc7, $acc1
3378	mov	$a_ptr, $poly1
3379	mov	$t1, $poly3
3380	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
3381
3382	mov	$S+8*0(%rsp), $t0
3383	mov	$S+8*1(%rsp), $t1
3384	mov	$S+8*2(%rsp), $t2
3385	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
3386	lea	$S(%rsp), $r_ptr
3387	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
3388
3389	mov	$M(%rsp), $src0
3390	lea	$M(%rsp), $b_ptr
3391	mov	$acc4, $acc6			# harmonize sub output and mul input
3392	xor	%ecx, %ecx
3393	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
3394	mov	$acc5, $acc2
3395	mov	$acc5, $S+8*1(%rsp)
3396	cmovz	$acc0, $acc3
3397	mov	$acc0, $S+8*2(%rsp)
3398	lea	$S-$bias(%rsp), $a_ptr
3399	cmovz	$acc1, $acc4
3400	mov	$acc1, $S+8*3(%rsp)
3401	mov	$acc6, $acc1
3402	lea	$S(%rsp), $r_ptr
3403	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
3404
3405	movq	%xmm1, $b_ptr
3406	movq	%xmm1, $r_ptr
3407	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
3408
3409	lea	32*5+56(%rsp), %rsi
3410.cfi_def_cfa	%rsi,8
3411	mov	-48(%rsi),%r15
3412.cfi_restore	%r15
3413	mov	-40(%rsi),%r14
3414.cfi_restore	%r14
3415	mov	-32(%rsi),%r13
3416.cfi_restore	%r13
3417	mov	-24(%rsi),%r12
3418.cfi_restore	%r12
3419	mov	-16(%rsi),%rbx
3420.cfi_restore	%rbx
3421	mov	-8(%rsi),%rbp
3422.cfi_restore	%rbp
3423	lea	(%rsi),%rsp
3424.cfi_def_cfa_register	%rsp
3425.Lpoint_double${x}_epilogue:
3426	ret
3427.cfi_endproc
3428.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3429___
3430}
3431&gen_double("q");
3432
3433sub gen_add () {
3434    my $x = shift;
3435    my ($src0,$sfx,$bias);
3436    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3437	$U1,$U2,$S1,$S2,
3438	$res_x,$res_y,$res_z,
3439	$in1_x,$in1_y,$in1_z,
3440	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3441    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
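    # Z1sqr and Z2sqr are dead by the time Hsqr and Rsqr are computed,
    # so they share those two stack slots.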
3442
3443    if ($x ne "x") {
3444	$src0 = "%rax";
3445	$sfx  = "";
3446	$bias = 0;
3447
3448$code.=<<___;
3449.globl	ecp_nistz256_point_add
3450.type	ecp_nistz256_point_add,\@function,3
3451.align	32
3452ecp_nistz256_point_add:
3453.cfi_startproc
3454___
3455$code.=<<___	if ($addx);
3456	mov	\$0x80100, %ecx
3457	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3458	cmp	\$0x80100, %ecx
3459	je	.Lpoint_addx
3460___
3461    } else {
3462	$src0 = "%rdx";
3463	$sfx  = "x";
3464	$bias = 128;
3465
3466$code.=<<___;
3467.type	ecp_nistz256_point_addx,\@function,3
3468.align	32
3469ecp_nistz256_point_addx:
3470.cfi_startproc
3471.Lpoint_addx:
3472___
3473    }
3474$code.=<<___;
3475	push	%rbp
3476.cfi_push	%rbp
3477	push	%rbx
3478.cfi_push	%rbx
3479	push	%r12
3480.cfi_push	%r12
3481	push	%r13
3482.cfi_push	%r13
3483	push	%r14
3484.cfi_push	%r14
3485	push	%r15
3486.cfi_push	%r15
3487	sub	\$32*18+8, %rsp
3488.cfi_adjust_cfa_offset	32*18+8
3489.Lpoint_add${x}_body:
3490
3491	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3492	movdqu	0x10($a_ptr), %xmm1
3493	movdqu	0x20($a_ptr), %xmm2
3494	movdqu	0x30($a_ptr), %xmm3
3495	movdqu	0x40($a_ptr), %xmm4
3496	movdqu	0x50($a_ptr), %xmm5
3497	mov	$a_ptr, $b_ptr			# reassign
3498	mov	$b_org, $a_ptr			# reassign
3499	movdqa	%xmm0, $in1_x(%rsp)
3500	movdqa	%xmm1, $in1_x+0x10(%rsp)
3501	movdqa	%xmm2, $in1_y(%rsp)
3502	movdqa	%xmm3, $in1_y+0x10(%rsp)
3503	movdqa	%xmm4, $in1_z(%rsp)
3504	movdqa	%xmm5, $in1_z+0x10(%rsp)
3505	por	%xmm4, %xmm5
3506
3507	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3508	 pshufd	\$0xb1, %xmm5, %xmm3
3509	movdqu	0x10($a_ptr), %xmm1
3510	movdqu	0x20($a_ptr), %xmm2
3511	 por	%xmm3, %xmm5
3512	movdqu	0x30($a_ptr), %xmm3
3513	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3514	 mov	0x40+8*1($a_ptr), $acc6
3515	 mov	0x40+8*2($a_ptr), $acc7
3516	 mov	0x40+8*3($a_ptr), $acc0
3517	movdqa	%xmm0, $in2_x(%rsp)
3518	 pshufd	\$0x1e, %xmm5, %xmm4
3519	movdqa	%xmm1, $in2_x+0x10(%rsp)
3520	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3521	movdqu	0x50($a_ptr),%xmm1
3522	movdqa	%xmm2, $in2_y(%rsp)
3523	movdqa	%xmm3, $in2_y+0x10(%rsp)
3524	 por	%xmm4, %xmm5
3525	 pxor	%xmm4, %xmm4
3526	por	%xmm0, %xmm1
3527	 movq	$r_ptr, %xmm0			# save $r_ptr
3528
3529	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3530	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3531	 mov	$acc6, $in2_z+8*1(%rsp)
3532	 mov	$acc7, $in2_z+8*2(%rsp)
3533	 mov	$acc0, $in2_z+8*3(%rsp)
3534	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3535	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3536
3537	pcmpeqd	%xmm4, %xmm5
3538	pshufd	\$0xb1, %xmm1, %xmm4
3539	por	%xmm1, %xmm4
3540	pshufd	\$0, %xmm5, %xmm5		# in1infty
3541	pshufd	\$0x1e, %xmm4, %xmm3
3542	por	%xmm3, %xmm4
3543	pxor	%xmm3, %xmm3
3544	pcmpeqd	%xmm3, %xmm4
3545	pshufd	\$0, %xmm4, %xmm4		# in2infty
3546	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3547	 mov	0x40+8*1($b_ptr), $acc6
3548	 mov	0x40+8*2($b_ptr), $acc7
3549	 mov	0x40+8*3($b_ptr), $acc0
3550	movq	$b_ptr, %xmm1
3551
3552	lea	0x40-$bias($b_ptr), $a_ptr
3553	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3554	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3555
3556	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3557	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3558	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3559
3560	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3561	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3562	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3563
3564	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3565	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3566	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3567
3568	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3569	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3570	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3571
3572	lea	$S1(%rsp), $b_ptr
3573	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3574	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3575
3576	or	$acc5, $acc4			# see if result is zero
3577	movdqa	%xmm4, %xmm2
3578	or	$acc0, $acc4
3579	or	$acc1, $acc4
3580	por	%xmm5, %xmm2			# in1infty || in2infty
3581	movq	$acc4, %xmm3
3582
3583	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3584	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3585	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3586
3587	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3588	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3589	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3590
3591	lea	$U1(%rsp), $b_ptr
3592	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3593	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3594
3595	or	$acc5, $acc4			# see if result is zero
3596	or	$acc0, $acc4
3597	or	$acc1, $acc4
3598
3599	.byte	0x3e				# predict taken
3600	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
3601	movq	%xmm2, $acc0
3602	movq	%xmm3, $acc1
3603	test	$acc0, $acc0
3604	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
3605	test	$acc1, $acc1
3606	jz	.Ladd_double$x			# is_equal(S1,S2)?
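	# Here H is zero, R is non-zero and neither input is the point at
	# infinity: the inputs are inverses of each other, so the result is
	# the point at infinity and the output is simply zeroed below.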
3607
3608	movq	%xmm0, $r_ptr			# restore $r_ptr
3609	pxor	%xmm0, %xmm0
3610	movdqu	%xmm0, 0x00($r_ptr)
3611	movdqu	%xmm0, 0x10($r_ptr)
3612	movdqu	%xmm0, 0x20($r_ptr)
3613	movdqu	%xmm0, 0x30($r_ptr)
3614	movdqu	%xmm0, 0x40($r_ptr)
3615	movdqu	%xmm0, 0x50($r_ptr)
3616	jmp	.Ladd_done$x
3617
3618.align	32
3619.Ladd_double$x:
3620	movq	%xmm1, $a_ptr			# restore $a_ptr
3621	movq	%xmm0, $r_ptr			# restore $r_ptr
3622	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3623	jmp	.Lpoint_double_shortcut$x
3624
3625.align	32
3626.Ladd_proceed$x:
3627	`&load_for_sqr("$R(%rsp)", "$src0")`
3628	lea	$Rsqr(%rsp), $r_ptr		# R^2
3629	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3630
3631	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3632	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3633	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3634
3635	`&load_for_sqr("$H(%rsp)", "$src0")`
3636	lea	$Hsqr(%rsp), $r_ptr		# H^2
3637	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3638
3639	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3640	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3641	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3642
3643	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3644	lea	$Hcub(%rsp), $r_ptr		# H^3
3645	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3646
3647	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3648	lea	$U2(%rsp), $r_ptr		# U1*H^2
3649	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3650___
3651{
3652#######################################################################
3653# operate in 4-5-0-1 "name space" that matches multiplication output
3654#
3655my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3656my ($poly1, $poly3)=($acc6,$acc7);
3657
3658$code.=<<___;
3659	#lea	$U2(%rsp), $a_ptr
3660	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3661	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3662
3663	xor	$t4, $t4
3664	add	$acc0, $acc0		# a0:a3+a0:a3
3665	lea	$Rsqr(%rsp), $a_ptr
3666	adc	$acc1, $acc1
3667	 mov	$acc0, $t0
3668	adc	$acc2, $acc2
3669	adc	$acc3, $acc3
3670	 mov	$acc1, $t1
3671	adc	\$0, $t4
3672
3673	sub	\$-1, $acc0
3674	 mov	$acc2, $t2
3675	sbb	$poly1, $acc1
3676	sbb	\$0, $acc2
3677	 mov	$acc3, $t3
3678	sbb	$poly3, $acc3
3679	sbb	\$0, $t4
3680
3681	cmovc	$t0, $acc0
3682	mov	8*0($a_ptr), $t0
3683	cmovc	$t1, $acc1
3684	mov	8*1($a_ptr), $t1
3685	cmovc	$t2, $acc2
3686	mov	8*2($a_ptr), $t2
3687	cmovc	$t3, $acc3
3688	mov	8*3($a_ptr), $t3
3689
3690	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3691
3692	lea	$Hcub(%rsp), $b_ptr
3693	lea	$res_x(%rsp), $r_ptr
3694	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3695
3696	mov	$U2+8*0(%rsp), $t0
3697	mov	$U2+8*1(%rsp), $t1
3698	mov	$U2+8*2(%rsp), $t2
3699	mov	$U2+8*3(%rsp), $t3
3700	lea	$res_y(%rsp), $r_ptr
3701
3702	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3703
3704	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3706	mov	$acc2, 8*2($r_ptr)
3707	mov	$acc3, 8*3($r_ptr)
3708___
3709}
3710$code.=<<___;
3711	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3712	lea	$S2(%rsp), $r_ptr
3713	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3714
3715	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3716	lea	$res_y(%rsp), $r_ptr
3717	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3718
3719	lea	$S2(%rsp), $b_ptr
3720	lea	$res_y(%rsp), $r_ptr
3721	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3722
3723	movq	%xmm0, $r_ptr		# restore $r_ptr
3724
3725	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3726	movdqa	%xmm5, %xmm1
3727	pandn	$res_z(%rsp), %xmm0
3728	movdqa	%xmm5, %xmm2
3729	pandn	$res_z+0x10(%rsp), %xmm1
3730	movdqa	%xmm5, %xmm3
3731	pand	$in2_z(%rsp), %xmm2
3732	pand	$in2_z+0x10(%rsp), %xmm3
3733	por	%xmm0, %xmm2
3734	por	%xmm1, %xmm3
3735
3736	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3737	movdqa	%xmm4, %xmm1
3738	pandn	%xmm2, %xmm0
3739	movdqa	%xmm4, %xmm2
3740	pandn	%xmm3, %xmm1
3741	movdqa	%xmm4, %xmm3
3742	pand	$in1_z(%rsp), %xmm2
3743	pand	$in1_z+0x10(%rsp), %xmm3
3744	por	%xmm0, %xmm2
3745	por	%xmm1, %xmm3
3746	movdqu	%xmm2, 0x40($r_ptr)
3747	movdqu	%xmm3, 0x50($r_ptr)
3748
3749	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3750	movdqa	%xmm5, %xmm1
3751	pandn	$res_x(%rsp), %xmm0
3752	movdqa	%xmm5, %xmm2
3753	pandn	$res_x+0x10(%rsp), %xmm1
3754	movdqa	%xmm5, %xmm3
3755	pand	$in2_x(%rsp), %xmm2
3756	pand	$in2_x+0x10(%rsp), %xmm3
3757	por	%xmm0, %xmm2
3758	por	%xmm1, %xmm3
3759
3760	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3761	movdqa	%xmm4, %xmm1
3762	pandn	%xmm2, %xmm0
3763	movdqa	%xmm4, %xmm2
3764	pandn	%xmm3, %xmm1
3765	movdqa	%xmm4, %xmm3
3766	pand	$in1_x(%rsp), %xmm2
3767	pand	$in1_x+0x10(%rsp), %xmm3
3768	por	%xmm0, %xmm2
3769	por	%xmm1, %xmm3
3770	movdqu	%xmm2, 0x00($r_ptr)
3771	movdqu	%xmm3, 0x10($r_ptr)
3772
3773	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3774	movdqa	%xmm5, %xmm1
3775	pandn	$res_y(%rsp), %xmm0
3776	movdqa	%xmm5, %xmm2
3777	pandn	$res_y+0x10(%rsp), %xmm1
3778	movdqa	%xmm5, %xmm3
3779	pand	$in2_y(%rsp), %xmm2
3780	pand	$in2_y+0x10(%rsp), %xmm3
3781	por	%xmm0, %xmm2
3782	por	%xmm1, %xmm3
3783
3784	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3785	movdqa	%xmm4, %xmm1
3786	pandn	%xmm2, %xmm0
3787	movdqa	%xmm4, %xmm2
3788	pandn	%xmm3, %xmm1
3789	movdqa	%xmm4, %xmm3
3790	pand	$in1_y(%rsp), %xmm2
3791	pand	$in1_y+0x10(%rsp), %xmm3
3792	por	%xmm0, %xmm2
3793	por	%xmm1, %xmm3
3794	movdqu	%xmm2, 0x20($r_ptr)
3795	movdqu	%xmm3, 0x30($r_ptr)
3796
3797.Ladd_done$x:
3798	lea	32*18+56(%rsp), %rsi
3799.cfi_def_cfa	%rsi,8
3800	mov	-48(%rsi),%r15
3801.cfi_restore	%r15
3802	mov	-40(%rsi),%r14
3803.cfi_restore	%r14
3804	mov	-32(%rsi),%r13
3805.cfi_restore	%r13
3806	mov	-24(%rsi),%r12
3807.cfi_restore	%r12
3808	mov	-16(%rsi),%rbx
3809.cfi_restore	%rbx
3810	mov	-8(%rsi),%rbp
3811.cfi_restore	%rbp
3812	lea	(%rsi),%rsp
3813.cfi_def_cfa_register	%rsp
3814.Lpoint_add${x}_epilogue:
3815	ret
3816.cfi_endproc
3817.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3818___
3819}
3820&gen_add("q");
3821
3822sub gen_add_affine () {
3823    my $x = shift;
3824    my ($src0,$sfx,$bias);
3825    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3826	$res_x,$res_y,$res_z,
3827	$in1_x,$in1_y,$in1_z,
3828	$in2_x,$in2_y)=map(32*$_,(0..14));
3829    my $Z1sqr = $S2;
3830
3831    if ($x ne "x") {
3832	$src0 = "%rax";
3833	$sfx  = "";
3834	$bias = 0;
3835
3836$code.=<<___;
3837.globl	ecp_nistz256_point_add_affine
3838.type	ecp_nistz256_point_add_affine,\@function,3
3839.align	32
3840ecp_nistz256_point_add_affine:
3841.cfi_startproc
3842___
3843$code.=<<___	if ($addx);
3844	mov	\$0x80100, %ecx
3845	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3846	cmp	\$0x80100, %ecx
3847	je	.Lpoint_add_affinex
3848___
3849    } else {
3850	$src0 = "%rdx";
3851	$sfx  = "x";
3852	$bias = 128;
3853
3854$code.=<<___;
3855.type	ecp_nistz256_point_add_affinex,\@function,3
3856.align	32
3857ecp_nistz256_point_add_affinex:
3858.cfi_startproc
3859.Lpoint_add_affinex:
3860___
3861    }
3862$code.=<<___;
3863	push	%rbp
3864.cfi_push	%rbp
3865	push	%rbx
3866.cfi_push	%rbx
3867	push	%r12
3868.cfi_push	%r12
3869	push	%r13
3870.cfi_push	%r13
3871	push	%r14
3872.cfi_push	%r14
3873	push	%r15
3874.cfi_push	%r15
3875	sub	\$32*15+8, %rsp
3876.cfi_adjust_cfa_offset	32*15+8
3877.Ladd_affine${x}_body:
3878
3879	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3880	mov	$b_org, $b_ptr		# reassign
3881	movdqu	0x10($a_ptr), %xmm1
3882	movdqu	0x20($a_ptr), %xmm2
3883	movdqu	0x30($a_ptr), %xmm3
3884	movdqu	0x40($a_ptr), %xmm4
3885	movdqu	0x50($a_ptr), %xmm5
3886	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3887	 mov	0x40+8*1($a_ptr), $acc6
3888	 mov	0x40+8*2($a_ptr), $acc7
3889	 mov	0x40+8*3($a_ptr), $acc0
3890	movdqa	%xmm0, $in1_x(%rsp)
3891	movdqa	%xmm1, $in1_x+0x10(%rsp)
3892	movdqa	%xmm2, $in1_y(%rsp)
3893	movdqa	%xmm3, $in1_y+0x10(%rsp)
3894	movdqa	%xmm4, $in1_z(%rsp)
3895	movdqa	%xmm5, $in1_z+0x10(%rsp)
3896	por	%xmm4, %xmm5
3897
3898	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3899	 pshufd	\$0xb1, %xmm5, %xmm3
3900	movdqu	0x10($b_ptr), %xmm1
3901	movdqu	0x20($b_ptr), %xmm2
3902	 por	%xmm3, %xmm5
3903	movdqu	0x30($b_ptr), %xmm3
3904	movdqa	%xmm0, $in2_x(%rsp)
3905	 pshufd	\$0x1e, %xmm5, %xmm4
3906	movdqa	%xmm1, $in2_x+0x10(%rsp)
3907	por	%xmm0, %xmm1
3908	 movq	$r_ptr, %xmm0		# save $r_ptr
3909	movdqa	%xmm2, $in2_y(%rsp)
3910	movdqa	%xmm3, $in2_y+0x10(%rsp)
3911	por	%xmm2, %xmm3
3912	 por	%xmm4, %xmm5
3913	 pxor	%xmm4, %xmm4
3914	por	%xmm1, %xmm3
3915
3916	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3917	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3918	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3919
3920	pcmpeqd	%xmm4, %xmm5
3921	pshufd	\$0xb1, %xmm3, %xmm4
3922	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3923	 #lea	0x00($b_ptr), $b_ptr
3924	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3925	por	%xmm3, %xmm4
3926	pshufd	\$0, %xmm5, %xmm5		# in1infty
3927	pshufd	\$0x1e, %xmm4, %xmm3
3928	 mov	$acc5, $acc2
3929	por	%xmm3, %xmm4
3930	pxor	%xmm3, %xmm3
3931	 mov	$acc6, $acc3
3932	pcmpeqd	%xmm3, %xmm4
3933	pshufd	\$0, %xmm4, %xmm4		# in2infty
3934
3935	lea	$Z1sqr-$bias(%rsp), $a_ptr
3936	mov	$acc7, $acc4
3937	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3938	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3939
3940	lea	$in1_x(%rsp), $b_ptr
3941	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3942	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3943
3944	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3945	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3946	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3947
3948	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3949	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3950	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3951
3952	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3953	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3954	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3955
3956	lea	$in1_y(%rsp), $b_ptr
3957	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3958	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3959
3960	`&load_for_sqr("$H(%rsp)", "$src0")`
3961	lea	$Hsqr(%rsp), $r_ptr		# H^2
3962	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3963
3964	`&load_for_sqr("$R(%rsp)", "$src0")`
3965	lea	$Rsqr(%rsp), $r_ptr		# R^2
3966	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3967
3968	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3969	lea	$Hcub(%rsp), $r_ptr		# H^3
3970	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3971
3972	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3973	lea	$U2(%rsp), $r_ptr		# U1*H^2
3974	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
3975___
3976{
3977#######################################################################
3978# operate in 4-5-0-1 "name space" that matches multiplication output
3979#
3980my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3981my ($poly1, $poly3)=($acc6,$acc7);
3982
3983$code.=<<___;
3984	#lea	$U2(%rsp), $a_ptr
3985	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3986	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3987
3988	xor	$t4, $t4
3989	add	$acc0, $acc0		# a0:a3+a0:a3
3990	lea	$Rsqr(%rsp), $a_ptr
3991	adc	$acc1, $acc1
3992	 mov	$acc0, $t0
3993	adc	$acc2, $acc2
3994	adc	$acc3, $acc3
3995	 mov	$acc1, $t1
3996	adc	\$0, $t4
3997
3998	sub	\$-1, $acc0
3999	 mov	$acc2, $t2
4000	sbb	$poly1, $acc1
4001	sbb	\$0, $acc2
4002	 mov	$acc3, $t3
4003	sbb	$poly3, $acc3
4004	sbb	\$0, $t4
4005
4006	cmovc	$t0, $acc0
4007	mov	8*0($a_ptr), $t0
4008	cmovc	$t1, $acc1
4009	mov	8*1($a_ptr), $t1
4010	cmovc	$t2, $acc2
4011	mov	8*2($a_ptr), $t2
4012	cmovc	$t3, $acc3
4013	mov	8*3($a_ptr), $t3
4014
4015	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
4016
4017	lea	$Hcub(%rsp), $b_ptr
4018	lea	$res_x(%rsp), $r_ptr
4019	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
4020
4021	mov	$U2+8*0(%rsp), $t0
4022	mov	$U2+8*1(%rsp), $t1
4023	mov	$U2+8*2(%rsp), $t2
4024	mov	$U2+8*3(%rsp), $t3
4025	lea	$H(%rsp), $r_ptr
4026
4027	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
4028
4029	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
4031	mov	$acc2, 8*2($r_ptr)
4032	mov	$acc3, 8*3($r_ptr)
4033___
4034}
4035$code.=<<___;
4036	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4037	lea	$S2(%rsp), $r_ptr
4038	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
4039
4040	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4041	lea	$H(%rsp), $r_ptr
4042	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
4043
4044	lea	$S2(%rsp), $b_ptr
4045	lea	$res_y(%rsp), $r_ptr
4046	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
4047
4048	movq	%xmm0, $r_ptr		# restore $r_ptr
4049
4050	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
4051	movdqa	%xmm5, %xmm1
4052	pandn	$res_z(%rsp), %xmm0
4053	movdqa	%xmm5, %xmm2
4054	pandn	$res_z+0x10(%rsp), %xmm1
4055	movdqa	%xmm5, %xmm3
4056	pand	.LONE_mont(%rip), %xmm2
4057	pand	.LONE_mont+0x10(%rip), %xmm3
4058	por	%xmm0, %xmm2
4059	por	%xmm1, %xmm3
4060
4061	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
4062	movdqa	%xmm4, %xmm1
4063	pandn	%xmm2, %xmm0
4064	movdqa	%xmm4, %xmm2
4065	pandn	%xmm3, %xmm1
4066	movdqa	%xmm4, %xmm3
4067	pand	$in1_z(%rsp), %xmm2
4068	pand	$in1_z+0x10(%rsp), %xmm3
4069	por	%xmm0, %xmm2
4070	por	%xmm1, %xmm3
4071	movdqu	%xmm2, 0x40($r_ptr)
4072	movdqu	%xmm3, 0x50($r_ptr)
4073
4074	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
4075	movdqa	%xmm5, %xmm1
4076	pandn	$res_x(%rsp), %xmm0
4077	movdqa	%xmm5, %xmm2
4078	pandn	$res_x+0x10(%rsp), %xmm1
4079	movdqa	%xmm5, %xmm3
4080	pand	$in2_x(%rsp), %xmm2
4081	pand	$in2_x+0x10(%rsp), %xmm3
4082	por	%xmm0, %xmm2
4083	por	%xmm1, %xmm3
4084
4085	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
4086	movdqa	%xmm4, %xmm1
4087	pandn	%xmm2, %xmm0
4088	movdqa	%xmm4, %xmm2
4089	pandn	%xmm3, %xmm1
4090	movdqa	%xmm4, %xmm3
4091	pand	$in1_x(%rsp), %xmm2
4092	pand	$in1_x+0x10(%rsp), %xmm3
4093	por	%xmm0, %xmm2
4094	por	%xmm1, %xmm3
4095	movdqu	%xmm2, 0x00($r_ptr)
4096	movdqu	%xmm3, 0x10($r_ptr)
4097
4098	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
4099	movdqa	%xmm5, %xmm1
4100	pandn	$res_y(%rsp), %xmm0
4101	movdqa	%xmm5, %xmm2
4102	pandn	$res_y+0x10(%rsp), %xmm1
4103	movdqa	%xmm5, %xmm3
4104	pand	$in2_y(%rsp), %xmm2
4105	pand	$in2_y+0x10(%rsp), %xmm3
4106	por	%xmm0, %xmm2
4107	por	%xmm1, %xmm3
4108
4109	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
4110	movdqa	%xmm4, %xmm1
4111	pandn	%xmm2, %xmm0
4112	movdqa	%xmm4, %xmm2
4113	pandn	%xmm3, %xmm1
4114	movdqa	%xmm4, %xmm3
4115	pand	$in1_y(%rsp), %xmm2
4116	pand	$in1_y+0x10(%rsp), %xmm3
4117	por	%xmm0, %xmm2
4118	por	%xmm1, %xmm3
4119	movdqu	%xmm2, 0x20($r_ptr)
4120	movdqu	%xmm3, 0x30($r_ptr)
4121
4122	lea	32*15+56(%rsp), %rsi
4123.cfi_def_cfa	%rsi,8
4124	mov	-48(%rsi),%r15
4125.cfi_restore	%r15
4126	mov	-40(%rsi),%r14
4127.cfi_restore	%r14
4128	mov	-32(%rsi),%r13
4129.cfi_restore	%r13
4130	mov	-24(%rsi),%r12
4131.cfi_restore	%r12
4132	mov	-16(%rsi),%rbx
4133.cfi_restore	%rbx
4134	mov	-8(%rsi),%rbp
4135.cfi_restore	%rbp
4136	lea	(%rsi),%rsp
4137.cfi_def_cfa_register	%rsp
4138.Ladd_affine${x}_epilogue:
4139	ret
4140.cfi_endproc
4141.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4142___
4143}
4144&gen_add_affine("q");
4145
4146########################################################################
4147# AD*X magic
4148#
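# The "x" subroutines below are the variants used by the mulx/ADX code
# paths.  Each one starts by xor-ing a register with itself, which
# clears CF so that the leading adc/sbb behaves like a plain add/sub.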
4149if ($addx) {								{
4150########################################################################
4151# operate in 4-5-0-1 "name space" that matches multiplication output
4152#
4153my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4154
4155$code.=<<___;
4156.type	__ecp_nistz256_add_tox,\@abi-omnipotent
4157.align	32
4158__ecp_nistz256_add_tox:
4159	xor	$t4, $t4
4160	adc	8*0($b_ptr), $a0
4161	adc	8*1($b_ptr), $a1
4162	 mov	$a0, $t0
4163	adc	8*2($b_ptr), $a2
4164	adc	8*3($b_ptr), $a3
4165	 mov	$a1, $t1
4166	adc	\$0, $t4
4167
4168	xor	$t3, $t3
4169	sbb	\$-1, $a0
4170	 mov	$a2, $t2
4171	sbb	$poly1, $a1
4172	sbb	\$0, $a2
4173	 mov	$a3, $t3
4174	sbb	$poly3, $a3
4175	sbb	\$0, $t4
4176
4177	cmovc	$t0, $a0
4178	cmovc	$t1, $a1
4179	mov	$a0, 8*0($r_ptr)
4180	cmovc	$t2, $a2
4181	mov	$a1, 8*1($r_ptr)
4182	cmovc	$t3, $a3
4183	mov	$a2, 8*2($r_ptr)
4184	mov	$a3, 8*3($r_ptr)
4185
4186	ret
4187.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4188
4189.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
4190.align	32
4191__ecp_nistz256_sub_fromx:
4192	xor	$t4, $t4
4193	sbb	8*0($b_ptr), $a0
4194	sbb	8*1($b_ptr), $a1
4195	 mov	$a0, $t0
4196	sbb	8*2($b_ptr), $a2
4197	sbb	8*3($b_ptr), $a3
4198	 mov	$a1, $t1
4199	sbb	\$0, $t4
4200
4201	xor	$t3, $t3
4202	adc	\$-1, $a0
4203	 mov	$a2, $t2
4204	adc	$poly1, $a1
4205	adc	\$0, $a2
4206	 mov	$a3, $t3
4207	adc	$poly3, $a3
4208
4209	bt	\$0, $t4
4210	cmovnc	$t0, $a0
4211	cmovnc	$t1, $a1
4212	mov	$a0, 8*0($r_ptr)
4213	cmovnc	$t2, $a2
4214	mov	$a1, 8*1($r_ptr)
4215	cmovnc	$t3, $a3
4216	mov	$a2, 8*2($r_ptr)
4217	mov	$a3, 8*3($r_ptr)
4218
4219	ret
4220.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4221
4222.type	__ecp_nistz256_subx,\@abi-omnipotent
4223.align	32
4224__ecp_nistz256_subx:
4225	xor	$t4, $t4
4226	sbb	$a0, $t0
4227	sbb	$a1, $t1
4228	 mov	$t0, $a0
4229	sbb	$a2, $t2
4230	sbb	$a3, $t3
4231	 mov	$t1, $a1
4232	sbb	\$0, $t4
4233
	xor	$a3, $a3
4235	adc	\$-1, $t0
4236	 mov	$t2, $a2
4237	adc	$poly1, $t1
4238	adc	\$0, $t2
4239	 mov	$t3, $a3
4240	adc	$poly3, $t3
4241
4242	bt	\$0, $t4
4243	cmovc	$t0, $a0
4244	cmovc	$t1, $a1
4245	cmovc	$t2, $a2
4246	cmovc	$t3, $a3
4247
4248	ret
4249.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
4250
4251.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
4252.align	32
4253__ecp_nistz256_mul_by_2x:
4254	xor	$t4, $t4
4255	adc	$a0, $a0		# a0:a3+a0:a3
4256	adc	$a1, $a1
4257	 mov	$a0, $t0
4258	adc	$a2, $a2
4259	adc	$a3, $a3
4260	 mov	$a1, $t1
4261	adc	\$0, $t4
4262
4263	xor	$t3, $t3
4264	sbb	\$-1, $a0
4265	 mov	$a2, $t2
4266	sbb	$poly1, $a1
4267	sbb	\$0, $a2
4268	 mov	$a3, $t3
4269	sbb	$poly3, $a3
4270	sbb	\$0, $t4
4271
4272	cmovc	$t0, $a0
4273	cmovc	$t1, $a1
4274	mov	$a0, 8*0($r_ptr)
4275	cmovc	$t2, $a2
4276	mov	$a1, 8*1($r_ptr)
4277	cmovc	$t3, $a3
4278	mov	$a2, 8*2($r_ptr)
4279	mov	$a3, 8*3($r_ptr)
4280
4281	ret
4282.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4283___
4284									}
4285&gen_double("x");
4286&gen_add("x");
4287&gen_add_affine("x");
4288}
4289}}}
4290
4291# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4292#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4293if ($win64) {
4294$rec="%rcx";
4295$frame="%rdx";
4296$context="%r8";
4297$disp="%r9";
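
# short_handler serves routines whose prologue only pushes %r12/%r13;
# full_handler serves those with the full %rbp/%rbx/%r12-%r15 prologue,
# with HandlerData[2] supplying the frame size used to locate the saved
# registers.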
4298
4299$code.=<<___;
4300.extern	__imp_RtlVirtualUnwind
4301
4302.type	short_handler,\@abi-omnipotent
4303.align	16
4304short_handler:
4305	push	%rsi
4306	push	%rdi
4307	push	%rbx
4308	push	%rbp
4309	push	%r12
4310	push	%r13
4311	push	%r14
4312	push	%r15
4313	pushfq
4314	sub	\$64,%rsp
4315
4316	mov	120($context),%rax	# pull context->Rax
4317	mov	248($context),%rbx	# pull context->Rip
4318
4319	mov	8($disp),%rsi		# disp->ImageBase
4320	mov	56($disp),%r11		# disp->HandlerData
4321
4322	mov	0(%r11),%r10d		# HandlerData[0]
4323	lea	(%rsi,%r10),%r10	# end of prologue label
4324	cmp	%r10,%rbx		# context->Rip<end of prologue label
4325	jb	.Lcommon_seh_tail
4326
4327	mov	152($context),%rax	# pull context->Rsp
4328
4329	mov	4(%r11),%r10d		# HandlerData[1]
4330	lea	(%rsi,%r10),%r10	# epilogue label
4331	cmp	%r10,%rbx		# context->Rip>=epilogue label
4332	jae	.Lcommon_seh_tail
4333
4334	lea	16(%rax),%rax
4335
4336	mov	-8(%rax),%r12
4337	mov	-16(%rax),%r13
4338	mov	%r12,216($context)	# restore context->R12
4339	mov	%r13,224($context)	# restore context->R13
4340
4341	jmp	.Lcommon_seh_tail
4342.size	short_handler,.-short_handler
4343
4344.type	full_handler,\@abi-omnipotent
4345.align	16
4346full_handler:
4347	push	%rsi
4348	push	%rdi
4349	push	%rbx
4350	push	%rbp
4351	push	%r12
4352	push	%r13
4353	push	%r14
4354	push	%r15
4355	pushfq
4356	sub	\$64,%rsp
4357
4358	mov	120($context),%rax	# pull context->Rax
4359	mov	248($context),%rbx	# pull context->Rip
4360
4361	mov	8($disp),%rsi		# disp->ImageBase
4362	mov	56($disp),%r11		# disp->HandlerData
4363
4364	mov	0(%r11),%r10d		# HandlerData[0]
4365	lea	(%rsi,%r10),%r10	# end of prologue label
4366	cmp	%r10,%rbx		# context->Rip<end of prologue label
4367	jb	.Lcommon_seh_tail
4368
4369	mov	152($context),%rax	# pull context->Rsp
4370
4371	mov	4(%r11),%r10d		# HandlerData[1]
4372	lea	(%rsi,%r10),%r10	# epilogue label
4373	cmp	%r10,%rbx		# context->Rip>=epilogue label
4374	jae	.Lcommon_seh_tail
4375
4376	mov	8(%r11),%r10d		# HandlerData[2]
4377	lea	(%rax,%r10),%rax
4378
4379	mov	-8(%rax),%rbp
4380	mov	-16(%rax),%rbx
4381	mov	-24(%rax),%r12
4382	mov	-32(%rax),%r13
4383	mov	-40(%rax),%r14
4384	mov	-48(%rax),%r15
4385	mov	%rbx,144($context)	# restore context->Rbx
4386	mov	%rbp,160($context)	# restore context->Rbp
4387	mov	%r12,216($context)	# restore context->R12
4388	mov	%r13,224($context)	# restore context->R13
4389	mov	%r14,232($context)	# restore context->R14
4390	mov	%r15,240($context)	# restore context->R15
4391
4392.Lcommon_seh_tail:
4393	mov	8(%rax),%rdi
4394	mov	16(%rax),%rsi
4395	mov	%rax,152($context)	# restore context->Rsp
4396	mov	%rsi,168($context)	# restore context->Rsi
4397	mov	%rdi,176($context)	# restore context->Rdi
4398
4399	mov	40($disp),%rdi		# disp->ContextRecord
4400	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
4402	.long	0xa548f3fc		# cld; rep movsq
4403
4404	mov	$disp,%rsi
4405	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4406	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4407	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4408	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4409	mov	40(%rsi),%r10		# disp->ContextRecord
4410	lea	56(%rsi),%r11		# &disp->HandlerData
4411	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4412	mov	%r10,32(%rsp)		# arg5
4413	mov	%r11,40(%rsp)		# arg6
4414	mov	%r12,48(%rsp)		# arg7
4415	mov	%rcx,56(%rsp)		# arg8, (NULL)
4416	call	*__imp_RtlVirtualUnwind(%rip)
4417
4418	mov	\$1,%eax		# ExceptionContinueSearch
4419	add	\$64,%rsp
4420	popfq
4421	pop	%r15
4422	pop	%r14
4423	pop	%r13
4424	pop	%r12
4425	pop	%rbp
4426	pop	%rbx
4427	pop	%rdi
4428	pop	%rsi
4429	ret
4430.size	full_handler,.-full_handler
4431
4432.section	.pdata
4433.align	4
	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
	.rva	.LSEH_end_ecp_nistz256_mul_by_2
	.rva	.LSEH_info_ecp_nistz256_mul_by_2

	.rva	.LSEH_begin_ecp_nistz256_div_by_2
	.rva	.LSEH_end_ecp_nistz256_div_by_2
	.rva	.LSEH_info_ecp_nistz256_div_by_2

	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
	.rva	.LSEH_end_ecp_nistz256_mul_by_3
	.rva	.LSEH_info_ecp_nistz256_mul_by_3

	.rva	.LSEH_begin_ecp_nistz256_add
	.rva	.LSEH_end_ecp_nistz256_add
	.rva	.LSEH_info_ecp_nistz256_add

	.rva	.LSEH_begin_ecp_nistz256_sub
	.rva	.LSEH_end_ecp_nistz256_sub
	.rva	.LSEH_info_ecp_nistz256_sub

	.rva	.LSEH_begin_ecp_nistz256_neg
	.rva	.LSEH_end_ecp_nistz256_neg
	.rva	.LSEH_info_ecp_nistz256_neg

	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
___
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_to_mont
	.rva	.LSEH_end_ecp_nistz256_to_mont
	.rva	.LSEH_info_ecp_nistz256_to_mont

	.rva	.LSEH_begin_ecp_nistz256_mul_mont
	.rva	.LSEH_end_ecp_nistz256_mul_mont
	.rva	.LSEH_info_ecp_nistz256_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_sqr_mont

	.rva	.LSEH_begin_ecp_nistz256_from_mont
	.rva	.LSEH_end_ecp_nistz256_from_mont
	.rva	.LSEH_info_ecp_nistz256_from_mont

	.rva	.LSEH_begin_ecp_nistz256_gather_w5
	.rva	.LSEH_end_ecp_nistz256_gather_w5
	.rva	.LSEH_info_ecp_nistz256_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_gather_w7
	.rva	.LSEH_end_ecp_nistz256_gather_w7
	.rva	.LSEH_info_ecp_nistz256_gather_wX
___
$code.=<<___	if ($avx>1);
	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_point_double
	.rva	.LSEH_end_ecp_nistz256_point_double
	.rva	.LSEH_info_ecp_nistz256_point_double

	.rva	.LSEH_begin_ecp_nistz256_point_add
	.rva	.LSEH_end_ecp_nistz256_point_add
	.rva	.LSEH_info_ecp_nistz256_point_add

	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
	.rva	.LSEH_end_ecp_nistz256_point_add_affine
	.rva	.LSEH_info_ecp_nistz256_point_add_affine
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_point_doublex
	.rva	.LSEH_end_ecp_nistz256_point_doublex
	.rva	.LSEH_info_ecp_nistz256_point_doublex

	.rva	.LSEH_begin_ecp_nistz256_point_addx
	.rva	.LSEH_end_ecp_nistz256_point_addx
	.rva	.LSEH_info_ecp_nistz256_point_addx

	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
___
$code.=<<___;

.section	.xdata
.align	8
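# Each UNWIND_INFO below starts with version 1 and UNW_FLAG_EHANDLER
# (.byte 9,0,0,0), followed by the RVA of the language-specific handler
# and its HandlerData: the body and epilogue labels, plus, for
# full_handler, the frame size.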
.LSEH_info_ecp_nistz256_mul_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_div_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_mul_by_3:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_neg:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_ord_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___ if ($addx);
.LSEH_info_ecp_nistz256_ord_mul_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___;
.LSEH_info_ecp_nistz256_to_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_from_mont:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
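# The gather routines carry no language handler; their unwind info is a
# list of raw UNWIND_CODEs describing the xmm6-xmm15 saves and the stack
# allocation, shared by the w5 and w7 variants.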
.LSEH_info_ecp_nistz256_gather_wX:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
	.align	8
___
$code.=<<___	if ($avx>1);
.LSEH_info_ecp_nistz256_avx2_gather_wX:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
	.align	8
___
$code.=<<___;
.LSEH_info_ecp_nistz256_point_double:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_add:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affine:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
	.long	32*15+56,0
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_ecp_nistz256_point_doublex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_addx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affinex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
	.long	32*15+56,0
___
}

########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
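# Each TOBN(hi,lo) in the C table stands for the 64-bit limb (hi<<32)|lo
# and is re-emitted here as the two 32-bit words lo,hi, i.e. in
# little-endian word order.  For illustration (not a real table value),
# TOBN(0xAABBCCDD,0x11223344) would come out as
# ".long 0x11223344,0xAABBCCDD".  The sanity check below expects 37
# sub-tables of 64 points, each point two 256-bit coordinates, i.e.
# 64*16*37 32-bit words in total.
#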
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

die "insane number of elements" if ($#arr != 64*16*37-1);

print <<___;
.text
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,\@object
.align	4096
ecp_nistz256_precomputed:
___
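# 16 words per emitted .long line, i.e. one affine point per line
# (256-bit x followed by 256-bit y).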
while (@line=splice(@arr,0,16)) {
	print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
}
print <<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;