1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is for
# ECDSA sign; in the "with/without" column the lower coefficient is for
# ECDH key agreement and the higher one for ECDSA sign, the relatively
# fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
57if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59	$avx = ($1>=2.19) + ($1>=2.22);
60	$addx = ($1>=2.23);
61}
62
63if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65	$avx = ($1>=2.09) + ($1>=2.10);
66	$addx = ($1>=2.10);
67}
68
69if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=11);
72	$addx = ($1>=12);
73}
74
75if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
76	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
77	$avx = ($ver>=3.0) + ($ver>=3.01);
78	$addx = ($ver>=3.03);
79}
80
81$code.=<<___;
82.text
83.extern	OPENSSL_ia32cap_P
84
# The modulus, i.e. the NIST P-256 prime:
# p = 2^256 - 2^224 + 2^192 + 2^96 - 1, stored as four 64-bit limbs,
# least significant limb first
86.align 64
87.Lpoly:
88.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
89
# 2^512 mod P, precomputed for the NIST P-256 polynomial
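# (this is the Montgomery constant RR; ecp_nistz256_to_mont multiplies by it
# to map a value into the Montgomery domain, R = 2^256)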
91.LRR:
92.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
93
94.LOne:
95.long 1,1,1,1,1,1,1,1
96.LTwo:
97.long 2,2,2,2,2,2,2,2
98.LThree:
99.long 3,3,3,3,3,3,3,3
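# 1 in Montgomery representation, i.e. 2^256 mod P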
100.LONE_mont:
101.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
102
103# Constants for computations modulo ord(p256)
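# .Lord is the order of the P-256 group, least significant limb first;
# .LordK is the word-level Montgomery constant -ord^-1 mod 2^64, used to
# zero the low limb in each reduction step of the ord_* routines below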
104.Lord:
105.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
106.LordK:
107.quad 0xccd1c8aaee00bc4f
108___
109
110{
111################################################################################
112# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
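# res = 2*a mod p.  Like the other small helpers below, the reduction is
# branch-free: the adjusted and unadjusted candidates are both computed and
# the in-range one is selected with cmov, so execution time does not depend
# on the data.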
113
114my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
115my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
116my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
117
118$code.=<<___;
119
120.globl	ecp_nistz256_mul_by_2
121.type	ecp_nistz256_mul_by_2,\@function,2
122.align	64
123ecp_nistz256_mul_by_2:
124.cfi_startproc
125	push	%r12
126.cfi_push	%r12
127	push	%r13
128.cfi_push	%r13
129.Lmul_by_2_body:
130
131	mov	8*0($a_ptr), $a0
132	xor	$t4,$t4
133	mov	8*1($a_ptr), $a1
134	add	$a0, $a0		# a0:a3+a0:a3
135	mov	8*2($a_ptr), $a2
136	adc	$a1, $a1
137	mov	8*3($a_ptr), $a3
138	lea	.Lpoly(%rip), $a_ptr
139	 mov	$a0, $t0
140	adc	$a2, $a2
141	adc	$a3, $a3
142	 mov	$a1, $t1
143	adc	\$0, $t4
144
145	sub	8*0($a_ptr), $a0
146	 mov	$a2, $t2
147	sbb	8*1($a_ptr), $a1
148	sbb	8*2($a_ptr), $a2
149	 mov	$a3, $t3
150	sbb	8*3($a_ptr), $a3
151	sbb	\$0, $t4
152
153	cmovc	$t0, $a0
154	cmovc	$t1, $a1
155	mov	$a0, 8*0($r_ptr)
156	cmovc	$t2, $a2
157	mov	$a1, 8*1($r_ptr)
158	cmovc	$t3, $a3
159	mov	$a2, 8*2($r_ptr)
160	mov	$a3, 8*3($r_ptr)
161
162	mov	0(%rsp),%r13
163.cfi_restore	%r13
164	mov	8(%rsp),%r12
165.cfi_restore	%r12
166	lea	16(%rsp),%rsp
167.cfi_adjust_cfa_offset	-16
168.Lmul_by_2_epilogue:
169	ret
170.cfi_endproc
171.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
172
173################################################################################
174# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
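# res = a/2 mod p: p is added first so that an odd input becomes even, the
# cmov chain undoes the addition when the input was already even, and the
# final one-bit right shift then yields the exact halving.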
175.globl	ecp_nistz256_div_by_2
176.type	ecp_nistz256_div_by_2,\@function,2
177.align	32
178ecp_nistz256_div_by_2:
179.cfi_startproc
180	push	%r12
181.cfi_push	%r12
182	push	%r13
183.cfi_push	%r13
184.Ldiv_by_2_body:
185
186	mov	8*0($a_ptr), $a0
187	mov	8*1($a_ptr), $a1
188	mov	8*2($a_ptr), $a2
189	 mov	$a0, $t0
190	mov	8*3($a_ptr), $a3
191	lea	.Lpoly(%rip), $a_ptr
192
193	 mov	$a1, $t1
194	xor	$t4, $t4
195	add	8*0($a_ptr), $a0
196	 mov	$a2, $t2
197	adc	8*1($a_ptr), $a1
198	adc	8*2($a_ptr), $a2
199	 mov	$a3, $t3
200	adc	8*3($a_ptr), $a3
201	adc	\$0, $t4
202	xor	$a_ptr, $a_ptr		# borrow $a_ptr
203	test	\$1, $t0
204
205	cmovz	$t0, $a0
206	cmovz	$t1, $a1
207	cmovz	$t2, $a2
208	cmovz	$t3, $a3
209	cmovz	$a_ptr, $t4
210
211	mov	$a1, $t0		# a0:a3>>1
212	shr	\$1, $a0
213	shl	\$63, $t0
214	mov	$a2, $t1
215	shr	\$1, $a1
216	or	$t0, $a0
217	shl	\$63, $t1
218	mov	$a3, $t2
219	shr	\$1, $a2
220	or	$t1, $a1
221	shl	\$63, $t2
222	shr	\$1, $a3
223	shl	\$63, $t4
224	or	$t2, $a2
225	or	$t4, $a3
226
227	mov	$a0, 8*0($r_ptr)
228	mov	$a1, 8*1($r_ptr)
229	mov	$a2, 8*2($r_ptr)
230	mov	$a3, 8*3($r_ptr)
231
232	mov	0(%rsp),%r13
233.cfi_restore	%r13
234	mov	8(%rsp),%r12
235.cfi_restore	%r12
236	lea	16(%rsp),%rsp
237.cfi_adjust_cfa_offset	-16
238.Ldiv_by_2_epilogue:
239	ret
240.cfi_endproc
241.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
242
243################################################################################
244# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
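# res = 3*a mod p, computed as a doubling followed by an addition of a,
# with the branch-free conditional subtraction of p after each step.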
245.globl	ecp_nistz256_mul_by_3
246.type	ecp_nistz256_mul_by_3,\@function,2
247.align	32
248ecp_nistz256_mul_by_3:
249.cfi_startproc
250	push	%r12
251.cfi_push	%r12
252	push	%r13
253.cfi_push	%r13
254.Lmul_by_3_body:
255
256	mov	8*0($a_ptr), $a0
257	xor	$t4, $t4
258	mov	8*1($a_ptr), $a1
259	add	$a0, $a0		# a0:a3+a0:a3
260	mov	8*2($a_ptr), $a2
261	adc	$a1, $a1
262	mov	8*3($a_ptr), $a3
263	 mov	$a0, $t0
264	adc	$a2, $a2
265	adc	$a3, $a3
266	 mov	$a1, $t1
267	adc	\$0, $t4
268
269	sub	\$-1, $a0
270	 mov	$a2, $t2
271	sbb	.Lpoly+8*1(%rip), $a1
272	sbb	\$0, $a2
273	 mov	$a3, $t3
274	sbb	.Lpoly+8*3(%rip), $a3
275	sbb	\$0, $t4
276
277	cmovc	$t0, $a0
278	cmovc	$t1, $a1
279	cmovc	$t2, $a2
280	cmovc	$t3, $a3
281
282	xor	$t4, $t4
283	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
284	adc	8*1($a_ptr), $a1
285	 mov	$a0, $t0
286	adc	8*2($a_ptr), $a2
287	adc	8*3($a_ptr), $a3
288	 mov	$a1, $t1
289	adc	\$0, $t4
290
291	sub	\$-1, $a0
292	 mov	$a2, $t2
293	sbb	.Lpoly+8*1(%rip), $a1
294	sbb	\$0, $a2
295	 mov	$a3, $t3
296	sbb	.Lpoly+8*3(%rip), $a3
297	sbb	\$0, $t4
298
299	cmovc	$t0, $a0
300	cmovc	$t1, $a1
301	mov	$a0, 8*0($r_ptr)
302	cmovc	$t2, $a2
303	mov	$a1, 8*1($r_ptr)
304	cmovc	$t3, $a3
305	mov	$a2, 8*2($r_ptr)
306	mov	$a3, 8*3($r_ptr)
307
308	mov	0(%rsp),%r13
309.cfi_restore	%r13
310	mov	8(%rsp),%r12
311.cfi_restore	%r12
312	lea	16(%rsp),%rsp
313.cfi_adjust_cfa_offset	-16
314.Lmul_by_3_epilogue:
315	ret
316.cfi_endproc
317.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
318
319################################################################################
320# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
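# res = a+b mod p, using the same branch-free conditional subtraction of p.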
321.globl	ecp_nistz256_add
322.type	ecp_nistz256_add,\@function,3
323.align	32
324ecp_nistz256_add:
325.cfi_startproc
326	push	%r12
327.cfi_push	%r12
328	push	%r13
329.cfi_push	%r13
330.Ladd_body:
331
332	mov	8*0($a_ptr), $a0
333	xor	$t4, $t4
334	mov	8*1($a_ptr), $a1
335	mov	8*2($a_ptr), $a2
336	mov	8*3($a_ptr), $a3
337	lea	.Lpoly(%rip), $a_ptr
338
339	add	8*0($b_ptr), $a0
340	adc	8*1($b_ptr), $a1
341	 mov	$a0, $t0
342	adc	8*2($b_ptr), $a2
343	adc	8*3($b_ptr), $a3
344	 mov	$a1, $t1
345	adc	\$0, $t4
346
347	sub	8*0($a_ptr), $a0
348	 mov	$a2, $t2
349	sbb	8*1($a_ptr), $a1
350	sbb	8*2($a_ptr), $a2
351	 mov	$a3, $t3
352	sbb	8*3($a_ptr), $a3
353	sbb	\$0, $t4
354
355	cmovc	$t0, $a0
356	cmovc	$t1, $a1
357	mov	$a0, 8*0($r_ptr)
358	cmovc	$t2, $a2
359	mov	$a1, 8*1($r_ptr)
360	cmovc	$t3, $a3
361	mov	$a2, 8*2($r_ptr)
362	mov	$a3, 8*3($r_ptr)
363
364	mov	0(%rsp),%r13
365.cfi_restore	%r13
366	mov	8(%rsp),%r12
367.cfi_restore	%r12
368	lea	16(%rsp),%rsp
369.cfi_adjust_cfa_offset	-16
370.Ladd_epilogue:
371	ret
372.cfi_endproc
373.size	ecp_nistz256_add,.-ecp_nistz256_add
374
375################################################################################
376# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
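# res = a-b mod p: p is added back when the limb subtraction borrows.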
377.globl	ecp_nistz256_sub
378.type	ecp_nistz256_sub,\@function,3
379.align	32
380ecp_nistz256_sub:
381.cfi_startproc
382	push	%r12
383.cfi_push	%r12
384	push	%r13
385.cfi_push	%r13
386.Lsub_body:
387
388	mov	8*0($a_ptr), $a0
389	xor	$t4, $t4
390	mov	8*1($a_ptr), $a1
391	mov	8*2($a_ptr), $a2
392	mov	8*3($a_ptr), $a3
393	lea	.Lpoly(%rip), $a_ptr
394
395	sub	8*0($b_ptr), $a0
396	sbb	8*1($b_ptr), $a1
397	 mov	$a0, $t0
398	sbb	8*2($b_ptr), $a2
399	sbb	8*3($b_ptr), $a3
400	 mov	$a1, $t1
401	sbb	\$0, $t4
402
403	add	8*0($a_ptr), $a0
404	 mov	$a2, $t2
405	adc	8*1($a_ptr), $a1
406	adc	8*2($a_ptr), $a2
407	 mov	$a3, $t3
408	adc	8*3($a_ptr), $a3
409	test	$t4, $t4
410
411	cmovz	$t0, $a0
412	cmovz	$t1, $a1
413	mov	$a0, 8*0($r_ptr)
414	cmovz	$t2, $a2
415	mov	$a1, 8*1($r_ptr)
416	cmovz	$t3, $a3
417	mov	$a2, 8*2($r_ptr)
418	mov	$a3, 8*3($r_ptr)
419
420	mov	0(%rsp),%r13
421.cfi_restore	%r13
422	mov	8(%rsp),%r12
423.cfi_restore	%r12
424	lea	16(%rsp),%rsp
425.cfi_adjust_cfa_offset	-16
426.Lsub_epilogue:
427	ret
428.cfi_endproc
429.size	ecp_nistz256_sub,.-ecp_nistz256_sub
430
431################################################################################
432# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
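# res = -a mod p, computed as p-a (with 0 mapping to 0).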
433.globl	ecp_nistz256_neg
434.type	ecp_nistz256_neg,\@function,2
435.align	32
436ecp_nistz256_neg:
437.cfi_startproc
438	push	%r12
439.cfi_push	%r12
440	push	%r13
441.cfi_push	%r13
442.Lneg_body:
443
444	xor	$a0, $a0
445	xor	$a1, $a1
446	xor	$a2, $a2
447	xor	$a3, $a3
448	xor	$t4, $t4
449
450	sub	8*0($a_ptr), $a0
451	sbb	8*1($a_ptr), $a1
452	sbb	8*2($a_ptr), $a2
453	 mov	$a0, $t0
454	sbb	8*3($a_ptr), $a3
455	lea	.Lpoly(%rip), $a_ptr
456	 mov	$a1, $t1
457	sbb	\$0, $t4
458
459	add	8*0($a_ptr), $a0
460	 mov	$a2, $t2
461	adc	8*1($a_ptr), $a1
462	adc	8*2($a_ptr), $a2
463	 mov	$a3, $t3
464	adc	8*3($a_ptr), $a3
465	test	$t4, $t4
466
467	cmovz	$t0, $a0
468	cmovz	$t1, $a1
469	mov	$a0, 8*0($r_ptr)
470	cmovz	$t2, $a2
471	mov	$a1, 8*1($r_ptr)
472	cmovz	$t3, $a3
473	mov	$a2, 8*2($r_ptr)
474	mov	$a3, 8*3($r_ptr)
475
476	mov	0(%rsp),%r13
477.cfi_restore	%r13
478	mov	8(%rsp),%r12
479.cfi_restore	%r12
480	lea	16(%rsp),%rsp
481.cfi_adjust_cfa_offset	-16
482.Lneg_epilogue:
483	ret
484.cfi_endproc
485.size	ecp_nistz256_neg,.-ecp_nistz256_neg
486___
487}
488{
489my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
490my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
491my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
492my ($poly1,$poly3)=($acc6,$acc7);
493
494$code.=<<___;
495################################################################################
496# void ecp_nistz256_ord_mul_mont(
497#   uint64_t res[4],
498#   uint64_t a[4],
499#   uint64_t b[4]);
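# Montgomery multiplication modulo the group order:
#   res = a*b*2^-256 mod ord(p256)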
500
501.globl	ecp_nistz256_ord_mul_mont
502.type	ecp_nistz256_ord_mul_mont,\@function,3
503.align	32
504ecp_nistz256_ord_mul_mont:
505.cfi_startproc
506___
507$code.=<<___	if ($addx);
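	# 0x80100 combines the BMI2 and ADX feature bits reported in
	# OPENSSL_ia32cap_P; the mulx/adcx/adox code path below is taken
	# only when both CPU features are present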
508	mov	\$0x80100, %ecx
509	and	OPENSSL_ia32cap_P+8(%rip), %ecx
510	cmp	\$0x80100, %ecx
511	je	.Lecp_nistz256_ord_mul_montx
512___
513$code.=<<___;
514	push	%rbp
515.cfi_push	%rbp
516	push	%rbx
517.cfi_push	%rbx
518	push	%r12
519.cfi_push	%r12
520	push	%r13
521.cfi_push	%r13
522	push	%r14
523.cfi_push	%r14
524	push	%r15
525.cfi_push	%r15
526.Lord_mul_body:
527
528	mov	8*0($b_org), %rax
529	mov	$b_org, $b_ptr
530	lea	.Lord(%rip), %r14
531	mov	.LordK(%rip), %r15
532
533	################################# * b[0]
534	mov	%rax, $t0
535	mulq	8*0($a_ptr)
536	mov	%rax, $acc0
537	mov	$t0, %rax
538	mov	%rdx, $acc1
539
540	mulq	8*1($a_ptr)
541	add	%rax, $acc1
542	mov	$t0, %rax
543	adc	\$0, %rdx
544	mov	%rdx, $acc2
545
546	mulq	8*2($a_ptr)
547	add	%rax, $acc2
548	mov	$t0, %rax
549	adc	\$0, %rdx
550
551	 mov	$acc0, $acc5
552	 imulq	%r15,$acc0
553
554	mov	%rdx, $acc3
555	mulq	8*3($a_ptr)
556	add	%rax, $acc3
557	 mov	$acc0, %rax
558	adc	\$0, %rdx
559	mov	%rdx, $acc4
560
561	################################# First reduction step
562	mulq	8*0(%r14)
563	mov	$acc0, $t1
564	add	%rax, $acc5		# guaranteed to be zero
565	mov	$acc0, %rax
566	adc	\$0, %rdx
567	mov	%rdx, $t0
568
569	sub	$acc0, $acc2
570	sbb	\$0, $acc0		# can't borrow
571
572	mulq	8*1(%r14)
573	add	$t0, $acc1
574	adc	\$0, %rdx
575	add	%rax, $acc1
576	mov	$t1, %rax
577	adc	%rdx, $acc2
578	mov	$t1, %rdx
579	adc	\$0, $acc0		# can't overflow
580
581	shl	\$32, %rax
582	shr	\$32, %rdx
583	sub	%rax, $acc3
584	 mov	8*1($b_ptr), %rax
585	sbb	%rdx, $t1		# can't borrow
586
587	add	$acc0, $acc3
588	adc	$t1, $acc4
589	adc	\$0, $acc5
590
591	################################# * b[1]
592	mov	%rax, $t0
593	mulq	8*0($a_ptr)
594	add	%rax, $acc1
595	mov	$t0, %rax
596	adc	\$0, %rdx
597	mov	%rdx, $t1
598
599	mulq	8*1($a_ptr)
600	add	$t1, $acc2
601	adc	\$0, %rdx
602	add	%rax, $acc2
603	mov	$t0, %rax
604	adc	\$0, %rdx
605	mov	%rdx, $t1
606
607	mulq	8*2($a_ptr)
608	add	$t1, $acc3
609	adc	\$0, %rdx
610	add	%rax, $acc3
611	mov	$t0, %rax
612	adc	\$0, %rdx
613
614	 mov	$acc1, $t0
615	 imulq	%r15, $acc1
616
617	mov	%rdx, $t1
618	mulq	8*3($a_ptr)
619	add	$t1, $acc4
620	adc	\$0, %rdx
621	xor	$acc0, $acc0
622	add	%rax, $acc4
623	 mov	$acc1, %rax
624	adc	%rdx, $acc5
625	adc	\$0, $acc0
626
627	################################# Second reduction step
628	mulq	8*0(%r14)
629	mov	$acc1, $t1
630	add	%rax, $t0		# guaranteed to be zero
631	mov	$acc1, %rax
632	adc	%rdx, $t0
633
634	sub	$acc1, $acc3
635	sbb	\$0, $acc1		# can't borrow
636
637	mulq	8*1(%r14)
638	add	$t0, $acc2
639	adc	\$0, %rdx
640	add	%rax, $acc2
641	mov	$t1, %rax
642	adc	%rdx, $acc3
643	mov	$t1, %rdx
644	adc	\$0, $acc1		# can't overflow
645
646	shl	\$32, %rax
647	shr	\$32, %rdx
648	sub	%rax, $acc4
649	 mov	8*2($b_ptr), %rax
650	sbb	%rdx, $t1		# can't borrow
651
652	add	$acc1, $acc4
653	adc	$t1, $acc5
654	adc	\$0, $acc0
655
656	################################## * b[2]
657	mov	%rax, $t0
658	mulq	8*0($a_ptr)
659	add	%rax, $acc2
660	mov	$t0, %rax
661	adc	\$0, %rdx
662	mov	%rdx, $t1
663
664	mulq	8*1($a_ptr)
665	add	$t1, $acc3
666	adc	\$0, %rdx
667	add	%rax, $acc3
668	mov	$t0, %rax
669	adc	\$0, %rdx
670	mov	%rdx, $t1
671
672	mulq	8*2($a_ptr)
673	add	$t1, $acc4
674	adc	\$0, %rdx
675	add	%rax, $acc4
676	mov	$t0, %rax
677	adc	\$0, %rdx
678
679	 mov	$acc2, $t0
680	 imulq	%r15, $acc2
681
682	mov	%rdx, $t1
683	mulq	8*3($a_ptr)
684	add	$t1, $acc5
685	adc	\$0, %rdx
686	xor	$acc1, $acc1
687	add	%rax, $acc5
688	 mov	$acc2, %rax
689	adc	%rdx, $acc0
690	adc	\$0, $acc1
691
692	################################# Third reduction step
693	mulq	8*0(%r14)
694	mov	$acc2, $t1
695	add	%rax, $t0		# guaranteed to be zero
696	mov	$acc2, %rax
697	adc	%rdx, $t0
698
699	sub	$acc2, $acc4
700	sbb	\$0, $acc2		# can't borrow
701
702	mulq	8*1(%r14)
703	add	$t0, $acc3
704	adc	\$0, %rdx
705	add	%rax, $acc3
706	mov	$t1, %rax
707	adc	%rdx, $acc4
708	mov	$t1, %rdx
709	adc	\$0, $acc2		# can't overflow
710
711	shl	\$32, %rax
712	shr	\$32, %rdx
713	sub	%rax, $acc5
714	 mov	8*3($b_ptr), %rax
715	sbb	%rdx, $t1		# can't borrow
716
717	add	$acc2, $acc5
718	adc	$t1, $acc0
719	adc	\$0, $acc1
720
721	################################# * b[3]
722	mov	%rax, $t0
723	mulq	8*0($a_ptr)
724	add	%rax, $acc3
725	mov	$t0, %rax
726	adc	\$0, %rdx
727	mov	%rdx, $t1
728
729	mulq	8*1($a_ptr)
730	add	$t1, $acc4
731	adc	\$0, %rdx
732	add	%rax, $acc4
733	mov	$t0, %rax
734	adc	\$0, %rdx
735	mov	%rdx, $t1
736
737	mulq	8*2($a_ptr)
738	add	$t1, $acc5
739	adc	\$0, %rdx
740	add	%rax, $acc5
741	mov	$t0, %rax
742	adc	\$0, %rdx
743
744	 mov	$acc3, $t0
745	 imulq	%r15, $acc3
746
747	mov	%rdx, $t1
748	mulq	8*3($a_ptr)
749	add	$t1, $acc0
750	adc	\$0, %rdx
751	xor	$acc2, $acc2
752	add	%rax, $acc0
753	 mov	$acc3, %rax
754	adc	%rdx, $acc1
755	adc	\$0, $acc2
756
757	################################# Last reduction step
758	mulq	8*0(%r14)
759	mov	$acc3, $t1
760	add	%rax, $t0		# guaranteed to be zero
761	mov	$acc3, %rax
762	adc	%rdx, $t0
763
764	sub	$acc3, $acc5
765	sbb	\$0, $acc3		# can't borrow
766
767	mulq	8*1(%r14)
768	add	$t0, $acc4
769	adc	\$0, %rdx
770	add	%rax, $acc4
771	mov	$t1, %rax
772	adc	%rdx, $acc5
773	mov	$t1, %rdx
774	adc	\$0, $acc3		# can't overflow
775
776	shl	\$32, %rax
777	shr	\$32, %rdx
778	sub	%rax, $acc0
779	sbb	%rdx, $t1		# can't borrow
780
781	add	$acc3, $acc0
782	adc	$t1, $acc1
783	adc	\$0, $acc2
784
785	################################# Subtract ord
786	 mov	$acc4, $a_ptr
787	sub	8*0(%r14), $acc4
788	 mov	$acc5, $acc3
789	sbb	8*1(%r14), $acc5
790	 mov	$acc0, $t0
791	sbb	8*2(%r14), $acc0
792	 mov	$acc1, $t1
793	sbb	8*3(%r14), $acc1
794	sbb	\$0, $acc2
795
796	cmovc	$a_ptr, $acc4
797	cmovc	$acc3, $acc5
798	cmovc	$t0, $acc0
799	cmovc	$t1, $acc1
800
801	mov	$acc4, 8*0($r_ptr)
802	mov	$acc5, 8*1($r_ptr)
803	mov	$acc0, 8*2($r_ptr)
804	mov	$acc1, 8*3($r_ptr)
805
806	mov	0(%rsp),%r15
807.cfi_restore	%r15
808	mov	8(%rsp),%r14
809.cfi_restore	%r14
810	mov	16(%rsp),%r13
811.cfi_restore	%r13
812	mov	24(%rsp),%r12
813.cfi_restore	%r12
814	mov	32(%rsp),%rbx
815.cfi_restore	%rbx
816	mov	40(%rsp),%rbp
817.cfi_restore	%rbp
818	lea	48(%rsp),%rsp
819.cfi_adjust_cfa_offset	-48
820.Lord_mul_epilogue:
821	ret
822.cfi_endproc
823.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
824
825################################################################################
826# void ecp_nistz256_ord_sqr_mont(
827#   uint64_t res[4],
828#   uint64_t a[4],
829#   int rep);
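# Performs rep consecutive Montgomery squarings modulo ord(p256).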
830
831.globl	ecp_nistz256_ord_sqr_mont
832.type	ecp_nistz256_ord_sqr_mont,\@function,3
833.align	32
834ecp_nistz256_ord_sqr_mont:
835.cfi_startproc
836___
837$code.=<<___	if ($addx);
838	mov	\$0x80100, %ecx
839	and	OPENSSL_ia32cap_P+8(%rip), %ecx
840	cmp	\$0x80100, %ecx
841	je	.Lecp_nistz256_ord_sqr_montx
842___
843$code.=<<___;
844	push	%rbp
845.cfi_push	%rbp
846	push	%rbx
847.cfi_push	%rbx
848	push	%r12
849.cfi_push	%r12
850	push	%r13
851.cfi_push	%r13
852	push	%r14
853.cfi_push	%r14
854	push	%r15
855.cfi_push	%r15
856.Lord_sqr_body:
857
858	mov	8*0($a_ptr), $acc0
859	mov	8*1($a_ptr), %rax
860	mov	8*2($a_ptr), $acc6
861	mov	8*3($a_ptr), $acc7
862	lea	.Lord(%rip), $a_ptr	# pointer to modulus
863	mov	$b_org, $b_ptr
864	jmp	.Loop_ord_sqr
865
866.align	32
867.Loop_ord_sqr:
868	################################# a[1:] * a[0]
869	mov	%rax, $t1		# put aside a[1]
870	mul	$acc0			# a[1] * a[0]
871	mov	%rax, $acc1
872	movq	$t1, %xmm1		# offload a[1]
873	mov	$acc6, %rax
874	mov	%rdx, $acc2
875
876	mul	$acc0			# a[2] * a[0]
877	add	%rax, $acc2
878	mov	$acc7, %rax
879	movq	$acc6, %xmm2		# offload a[2]
880	adc	\$0, %rdx
881	mov	%rdx, $acc3
882
883	mul	$acc0			# a[3] * a[0]
884	add	%rax, $acc3
885	mov	$acc7, %rax
886	movq	$acc7, %xmm3		# offload a[3]
887	adc	\$0, %rdx
888	mov	%rdx, $acc4
889
890	################################# a[3] * a[2]
891	mul	$acc6			# a[3] * a[2]
892	mov	%rax, $acc5
893	mov	$acc6, %rax
894	mov	%rdx, $acc6
895
896	################################# a[2:] * a[1]
897	mul	$t1			# a[2] * a[1]
898	add	%rax, $acc3
899	mov	$acc7, %rax
900	adc	\$0, %rdx
901	mov	%rdx, $acc7
902
903	mul	$t1			# a[3] * a[1]
904	add	%rax, $acc4
905	adc	\$0, %rdx
906
907	add	$acc7, $acc4
908	adc	%rdx, $acc5
909	adc	\$0, $acc6		# can't overflow
910
911	################################# *2
912	xor	$acc7, $acc7
913	mov	$acc0, %rax
914	add	$acc1, $acc1
915	adc	$acc2, $acc2
916	adc	$acc3, $acc3
917	adc	$acc4, $acc4
918	adc	$acc5, $acc5
919	adc	$acc6, $acc6
920	adc	\$0, $acc7
921
922	################################# Missing products
923	mul	%rax			# a[0] * a[0]
924	mov	%rax, $acc0
925	movq	%xmm1, %rax
926	mov	%rdx, $t1
927
928	mul	%rax			# a[1] * a[1]
929	add	$t1, $acc1
930	adc	%rax, $acc2
931	movq	%xmm2, %rax
932	adc	\$0, %rdx
933	mov	%rdx, $t1
934
935	mul	%rax			# a[2] * a[2]
936	add	$t1, $acc3
937	adc	%rax, $acc4
938	movq	%xmm3, %rax
939	adc	\$0, %rdx
940	mov	%rdx, $t1
941
942	 mov	$acc0, $t0
943	 imulq	8*4($a_ptr), $acc0	# *= .LordK
944
945	mul	%rax			# a[3] * a[3]
946	add	$t1, $acc5
947	adc	%rax, $acc6
948	 mov	8*0($a_ptr), %rax	# modulus[0]
949	adc	%rdx, $acc7		# can't overflow
950
951	################################# First reduction step
952	mul	$acc0
953	mov	$acc0, $t1
954	add	%rax, $t0		# guaranteed to be zero
955	mov	8*1($a_ptr), %rax	# modulus[1]
956	adc	%rdx, $t0
957
958	sub	$acc0, $acc2
959	sbb	\$0, $t1		# can't borrow
960
961	mul	$acc0
962	add	$t0, $acc1
963	adc	\$0, %rdx
964	add	%rax, $acc1
965	mov	$acc0, %rax
966	adc	%rdx, $acc2
967	mov	$acc0, %rdx
968	adc	\$0, $t1		# can't overflow
969
970	 mov	$acc1, $t0
971	 imulq	8*4($a_ptr), $acc1	# *= .LordK
972
973	shl	\$32, %rax
974	shr	\$32, %rdx
975	sub	%rax, $acc3
976	 mov	8*0($a_ptr), %rax
977	sbb	%rdx, $acc0		# can't borrow
978
979	add	$t1, $acc3
980	adc	\$0, $acc0		# can't overflow
981
982	################################# Second reduction step
983	mul	$acc1
984	mov	$acc1, $t1
985	add	%rax, $t0		# guaranteed to be zero
986	mov	8*1($a_ptr), %rax
987	adc	%rdx, $t0
988
989	sub	$acc1, $acc3
990	sbb	\$0, $t1		# can't borrow
991
992	mul	$acc1
993	add	$t0, $acc2
994	adc	\$0, %rdx
995	add	%rax, $acc2
996	mov	$acc1, %rax
997	adc	%rdx, $acc3
998	mov	$acc1, %rdx
999	adc	\$0, $t1		# can't overflow
1000
1001	 mov	$acc2, $t0
1002	 imulq	8*4($a_ptr), $acc2	# *= .LordK
1003
1004	shl	\$32, %rax
1005	shr	\$32, %rdx
1006	sub	%rax, $acc0
1007	 mov	8*0($a_ptr), %rax
1008	sbb	%rdx, $acc1		# can't borrow
1009
1010	add	$t1, $acc0
1011	adc	\$0, $acc1		# can't overflow
1012
1013	################################# Third reduction step
1014	mul	$acc2
1015	mov	$acc2, $t1
1016	add	%rax, $t0		# guaranteed to be zero
1017	mov	8*1($a_ptr), %rax
1018	adc	%rdx, $t0
1019
1020	sub	$acc2, $acc0
1021	sbb	\$0, $t1		# can't borrow
1022
1023	mul	$acc2
1024	add	$t0, $acc3
1025	adc	\$0, %rdx
1026	add	%rax, $acc3
1027	mov	$acc2, %rax
1028	adc	%rdx, $acc0
1029	mov	$acc2, %rdx
1030	adc	\$0, $t1		# can't overflow
1031
1032	 mov	$acc3, $t0
1033	 imulq	8*4($a_ptr), $acc3	# *= .LordK
1034
1035	shl	\$32, %rax
1036	shr	\$32, %rdx
1037	sub	%rax, $acc1
1038	 mov	8*0($a_ptr), %rax
1039	sbb	%rdx, $acc2		# can't borrow
1040
1041	add	$t1, $acc1
1042	adc	\$0, $acc2		# can't overflow
1043
1044	################################# Last reduction step
1045	mul	$acc3
1046	mov	$acc3, $t1
1047	add	%rax, $t0		# guaranteed to be zero
1048	mov	8*1($a_ptr), %rax
1049	adc	%rdx, $t0
1050
1051	sub	$acc3, $acc1
1052	sbb	\$0, $t1		# can't borrow
1053
1054	mul	$acc3
1055	add	$t0, $acc0
1056	adc	\$0, %rdx
1057	add	%rax, $acc0
1058	mov	$acc3, %rax
1059	adc	%rdx, $acc1
1060	mov	$acc3, %rdx
1061	adc	\$0, $t1		# can't overflow
1062
1063	shl	\$32, %rax
1064	shr	\$32, %rdx
1065	sub	%rax, $acc2
1066	sbb	%rdx, $acc3		# can't borrow
1067
1068	add	$t1, $acc2
1069	adc	\$0, $acc3		# can't overflow
1070
1071	################################# Add bits [511:256] of the sqr result
1072	xor	%rdx, %rdx
1073	add	$acc4, $acc0
1074	adc	$acc5, $acc1
1075	 mov	$acc0, $acc4
1076	adc	$acc6, $acc2
1077	adc	$acc7, $acc3
1078	 mov	$acc1, %rax
1079	adc	\$0, %rdx
1080
1081	################################# Compare to modulus
1082	sub	8*0($a_ptr), $acc0
1083	 mov	$acc2, $acc6
1084	sbb	8*1($a_ptr), $acc1
1085	sbb	8*2($a_ptr), $acc2
1086	 mov	$acc3, $acc7
1087	sbb	8*3($a_ptr), $acc3
1088	sbb	\$0, %rdx
1089
1090	cmovc	$acc4, $acc0
1091	cmovnc	$acc1, %rax
1092	cmovnc	$acc2, $acc6
1093	cmovnc	$acc3, $acc7
1094
1095	dec	$b_ptr
1096	jnz	.Loop_ord_sqr
1097
1098	mov	$acc0, 8*0($r_ptr)
1099	mov	%rax,  8*1($r_ptr)
1100	pxor	%xmm1, %xmm1
1101	mov	$acc6, 8*2($r_ptr)
1102	pxor	%xmm2, %xmm2
1103	mov	$acc7, 8*3($r_ptr)
1104	pxor	%xmm3, %xmm3
1105
1106	mov	0(%rsp),%r15
1107.cfi_restore	%r15
1108	mov	8(%rsp),%r14
1109.cfi_restore	%r14
1110	mov	16(%rsp),%r13
1111.cfi_restore	%r13
1112	mov	24(%rsp),%r12
1113.cfi_restore	%r12
1114	mov	32(%rsp),%rbx
1115.cfi_restore	%rbx
1116	mov	40(%rsp),%rbp
1117.cfi_restore	%rbp
1118	lea	48(%rsp),%rsp
1119.cfi_adjust_cfa_offset	-48
1120.Lord_sqr_epilogue:
1121	ret
1122.cfi_endproc
1123.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1124___
1125
1126$code.=<<___	if ($addx);
1127################################################################################
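# ADX/BMI2 (mulx, adcx, adox) code paths for the two ord_* routines above,
# reached via the OPENSSL_ia32cap_P dispatch at their entry points.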
1128.type	ecp_nistz256_ord_mul_montx,\@function,3
1129.align	32
1130ecp_nistz256_ord_mul_montx:
1131.cfi_startproc
1132.Lecp_nistz256_ord_mul_montx:
1133	push	%rbp
1134.cfi_push	%rbp
1135	push	%rbx
1136.cfi_push	%rbx
1137	push	%r12
1138.cfi_push	%r12
1139	push	%r13
1140.cfi_push	%r13
1141	push	%r14
1142.cfi_push	%r14
1143	push	%r15
1144.cfi_push	%r15
1145.Lord_mulx_body:
1146
1147	mov	$b_org, $b_ptr
1148	mov	8*0($b_org), %rdx
1149	mov	8*0($a_ptr), $acc1
1150	mov	8*1($a_ptr), $acc2
1151	mov	8*2($a_ptr), $acc3
1152	mov	8*3($a_ptr), $acc4
1153	lea	-128($a_ptr), $a_ptr	# control u-op density
1154	lea	.Lord-128(%rip), %r14
1155	mov	.LordK(%rip), %r15
1156
1157	################################# Multiply by b[0]
1158	mulx	$acc1, $acc0, $acc1
1159	mulx	$acc2, $t0, $acc2
1160	mulx	$acc3, $t1, $acc3
1161	add	$t0, $acc1
1162	mulx	$acc4, $t0, $acc4
1163	 mov	$acc0, %rdx
1164	 mulx	%r15, %rdx, %rax
1165	adc	$t1, $acc2
1166	adc	$t0, $acc3
1167	adc	\$0, $acc4
1168
1169	################################# reduction
1170	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
1171	mulx	8*0+128(%r14), $t0, $t1
1172	adcx	$t0, $acc0		# guaranteed to be zero
1173	adox	$t1, $acc1
1174
1175	mulx	8*1+128(%r14), $t0, $t1
1176	adcx	$t0, $acc1
1177	adox	$t1, $acc2
1178
1179	mulx	8*2+128(%r14), $t0, $t1
1180	adcx	$t0, $acc2
1181	adox	$t1, $acc3
1182
1183	mulx	8*3+128(%r14), $t0, $t1
1184	 mov	8*1($b_ptr), %rdx
1185	adcx	$t0, $acc3
1186	adox	$t1, $acc4
1187	adcx	$acc0, $acc4
1188	adox	$acc0, $acc5
1189	adc	\$0, $acc5		# cf=0, of=0
1190
1191	################################# Multiply by b[1]
1192	mulx	8*0+128($a_ptr), $t0, $t1
1193	adcx	$t0, $acc1
1194	adox	$t1, $acc2
1195
1196	mulx	8*1+128($a_ptr), $t0, $t1
1197	adcx	$t0, $acc2
1198	adox	$t1, $acc3
1199
1200	mulx	8*2+128($a_ptr), $t0, $t1
1201	adcx	$t0, $acc3
1202	adox	$t1, $acc4
1203
1204	mulx	8*3+128($a_ptr), $t0, $t1
1205	 mov	$acc1, %rdx
1206	 mulx	%r15, %rdx, %rax
1207	adcx	$t0, $acc4
1208	adox	$t1, $acc5
1209
1210	adcx	$acc0, $acc5
1211	adox	$acc0, $acc0
1212	adc	\$0, $acc0		# cf=0, of=0
1213
1214	################################# reduction
1215	mulx	8*0+128(%r14), $t0, $t1
1216	adcx	$t0, $acc1		# guaranteed to be zero
1217	adox	$t1, $acc2
1218
1219	mulx	8*1+128(%r14), $t0, $t1
1220	adcx	$t0, $acc2
1221	adox	$t1, $acc3
1222
1223	mulx	8*2+128(%r14), $t0, $t1
1224	adcx	$t0, $acc3
1225	adox	$t1, $acc4
1226
1227	mulx	8*3+128(%r14), $t0, $t1
1228	 mov	8*2($b_ptr), %rdx
1229	adcx	$t0, $acc4
1230	adox	$t1, $acc5
1231	adcx	$acc1, $acc5
1232	adox	$acc1, $acc0
1233	adc	\$0, $acc0		# cf=0, of=0
1234
1235	################################# Multiply by b[2]
1236	mulx	8*0+128($a_ptr), $t0, $t1
1237	adcx	$t0, $acc2
1238	adox	$t1, $acc3
1239
1240	mulx	8*1+128($a_ptr), $t0, $t1
1241	adcx	$t0, $acc3
1242	adox	$t1, $acc4
1243
1244	mulx	8*2+128($a_ptr), $t0, $t1
1245	adcx	$t0, $acc4
1246	adox	$t1, $acc5
1247
1248	mulx	8*3+128($a_ptr), $t0, $t1
1249	 mov	$acc2, %rdx
1250	 mulx	%r15, %rdx, %rax
1251	adcx	$t0, $acc5
1252	adox	$t1, $acc0
1253
1254	adcx	$acc1, $acc0
1255	adox	$acc1, $acc1
1256	adc	\$0, $acc1		# cf=0, of=0
1257
1258	################################# reduction
1259	mulx	8*0+128(%r14), $t0, $t1
1260	adcx	$t0, $acc2		# guaranteed to be zero
1261	adox	$t1, $acc3
1262
1263	mulx	8*1+128(%r14), $t0, $t1
1264	adcx	$t0, $acc3
1265	adox	$t1, $acc4
1266
1267	mulx	8*2+128(%r14), $t0, $t1
1268	adcx	$t0, $acc4
1269	adox	$t1, $acc5
1270
1271	mulx	8*3+128(%r14), $t0, $t1
1272	 mov	8*3($b_ptr), %rdx
1273	adcx	$t0, $acc5
1274	adox	$t1, $acc0
1275	adcx	$acc2, $acc0
1276	adox	$acc2, $acc1
1277	adc	\$0, $acc1		# cf=0, of=0
1278
1279	################################# Multiply by b[3]
1280	mulx	8*0+128($a_ptr), $t0, $t1
1281	adcx	$t0, $acc3
1282	adox	$t1, $acc4
1283
1284	mulx	8*1+128($a_ptr), $t0, $t1
1285	adcx	$t0, $acc4
1286	adox	$t1, $acc5
1287
1288	mulx	8*2+128($a_ptr), $t0, $t1
1289	adcx	$t0, $acc5
1290	adox	$t1, $acc0
1291
1292	mulx	8*3+128($a_ptr), $t0, $t1
1293	 mov	$acc3, %rdx
1294	 mulx	%r15, %rdx, %rax
1295	adcx	$t0, $acc0
1296	adox	$t1, $acc1
1297
1298	adcx	$acc2, $acc1
1299	adox	$acc2, $acc2
1300	adc	\$0, $acc2		# cf=0, of=0
1301
1302	################################# reduction
1303	mulx	8*0+128(%r14), $t0, $t1
1304	adcx	$t0, $acc3		# guaranteed to be zero
1305	adox	$t1, $acc4
1306
1307	mulx	8*1+128(%r14), $t0, $t1
1308	adcx	$t0, $acc4
1309	adox	$t1, $acc5
1310
1311	mulx	8*2+128(%r14), $t0, $t1
1312	adcx	$t0, $acc5
1313	adox	$t1, $acc0
1314
1315	mulx	8*3+128(%r14), $t0, $t1
1316	lea	128(%r14),%r14
1317	 mov	$acc4, $t2
1318	adcx	$t0, $acc0
1319	adox	$t1, $acc1
1320	 mov	$acc5, $t3
1321	adcx	$acc3, $acc1
1322	adox	$acc3, $acc2
1323	adc	\$0, $acc2
1324
1325	#################################
1326	# Branch-less conditional subtraction of P
1327	 mov	$acc0, $t0
1328	sub	8*0(%r14), $acc4
1329	sbb	8*1(%r14), $acc5
1330	sbb	8*2(%r14), $acc0
1331	 mov	$acc1, $t1
1332	sbb	8*3(%r14), $acc1
1333	sbb	\$0, $acc2
1334
1335	cmovc	$t2, $acc4
1336	cmovc	$t3, $acc5
1337	cmovc	$t0, $acc0
1338	cmovc	$t1, $acc1
1339
1340	mov	$acc4, 8*0($r_ptr)
1341	mov	$acc5, 8*1($r_ptr)
1342	mov	$acc0, 8*2($r_ptr)
1343	mov	$acc1, 8*3($r_ptr)
1344
1345	mov	0(%rsp),%r15
1346.cfi_restore	%r15
1347	mov	8(%rsp),%r14
1348.cfi_restore	%r14
1349	mov	16(%rsp),%r13
1350.cfi_restore	%r13
1351	mov	24(%rsp),%r12
1352.cfi_restore	%r12
1353	mov	32(%rsp),%rbx
1354.cfi_restore	%rbx
1355	mov	40(%rsp),%rbp
1356.cfi_restore	%rbp
1357	lea	48(%rsp),%rsp
1358.cfi_adjust_cfa_offset	-48
1359.Lord_mulx_epilogue:
1360	ret
1361.cfi_endproc
1362.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1363
1364.type	ecp_nistz256_ord_sqr_montx,\@function,3
1365.align	32
1366ecp_nistz256_ord_sqr_montx:
1367.cfi_startproc
1368.Lecp_nistz256_ord_sqr_montx:
1369	push	%rbp
1370.cfi_push	%rbp
1371	push	%rbx
1372.cfi_push	%rbx
1373	push	%r12
1374.cfi_push	%r12
1375	push	%r13
1376.cfi_push	%r13
1377	push	%r14
1378.cfi_push	%r14
1379	push	%r15
1380.cfi_push	%r15
1381.Lord_sqrx_body:
1382
1383	mov	$b_org, $b_ptr
1384	mov	8*0($a_ptr), %rdx
1385	mov	8*1($a_ptr), $acc6
1386	mov	8*2($a_ptr), $acc7
1387	mov	8*3($a_ptr), $acc0
1388	lea	.Lord(%rip), $a_ptr
1389	jmp	.Loop_ord_sqrx
1390
1391.align	32
1392.Loop_ord_sqrx:
1393	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1394	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1395	 mov	%rdx, %rax		# offload a[0]
1396	 movq	$acc6, %xmm1		# offload a[1]
1397	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1398	 mov	$acc6, %rdx
1399	add	$t0, $acc2
1400	 movq	$acc7, %xmm2		# offload a[2]
1401	adc	$t1, $acc3
1402	adc	\$0, $acc4
1403	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1404	#################################
1405	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1406	adcx	$t0, $acc3
1407	adox	$t1, $acc4
1408
1409	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1410	 mov	$acc7, %rdx
1411	adcx	$t0, $acc4
1412	adox	$t1, $acc5
1413	adc	\$0, $acc5
1414	#################################
1415	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1416	mov	%rax, %rdx
1417	 movq	$acc0, %xmm3		# offload a[3]
1418	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1419	 adcx	$acc1, $acc1		# acc1:6<<1
1420	adox	$t0, $acc5
1421	 adcx	$acc2, $acc2
1422	adox	$acc7, $acc6		# of=0
1423
1424	################################# a[i]*a[i]
1425	mulx	%rdx, $acc0, $t1
1426	movq	%xmm1, %rdx
1427	 adcx	$acc3, $acc3
1428	adox	$t1, $acc1
1429	 adcx	$acc4, $acc4
1430	mulx	%rdx, $t0, $t4
1431	movq	%xmm2, %rdx
1432	 adcx	$acc5, $acc5
1433	adox	$t0, $acc2
1434	 adcx	$acc6, $acc6
1435	mulx	%rdx, $t0, $t1
1436	.byte	0x67
1437	movq	%xmm3, %rdx
1438	adox	$t4, $acc3
1439	 adcx	$acc7, $acc7
1440	adox	$t0, $acc4
1441	adox	$t1, $acc5
1442	mulx	%rdx, $t0, $t4
1443	adox	$t0, $acc6
1444	adox	$t4, $acc7
1445
1446	################################# reduction
1447	mov	$acc0, %rdx
1448	mulx	8*4($a_ptr), %rdx, $t0
1449
1450	xor	%rax, %rax		# cf=0, of=0
1451	mulx	8*0($a_ptr), $t0, $t1
1452	adcx	$t0, $acc0		# guaranteed to be zero
1453	adox	$t1, $acc1
1454	mulx	8*1($a_ptr), $t0, $t1
1455	adcx	$t0, $acc1
1456	adox	$t1, $acc2
1457	mulx	8*2($a_ptr), $t0, $t1
1458	adcx	$t0, $acc2
1459	adox	$t1, $acc3
1460	mulx	8*3($a_ptr), $t0, $t1
1461	adcx	$t0, $acc3
1462	adox	$t1, $acc0		# of=0
1463	adcx	%rax, $acc0		# cf=0
1464
1465	#################################
1466	mov	$acc1, %rdx
1467	mulx	8*4($a_ptr), %rdx, $t0
1468
1469	mulx	8*0($a_ptr), $t0, $t1
1470	adox	$t0, $acc1		# guaranteed to be zero
1471	adcx	$t1, $acc2
1472	mulx	8*1($a_ptr), $t0, $t1
1473	adox	$t0, $acc2
1474	adcx	$t1, $acc3
1475	mulx	8*2($a_ptr), $t0, $t1
1476	adox	$t0, $acc3
1477	adcx	$t1, $acc0
1478	mulx	8*3($a_ptr), $t0, $t1
1479	adox	$t0, $acc0
1480	adcx	$t1, $acc1		# cf=0
1481	adox	%rax, $acc1		# of=0
1482
1483	#################################
1484	mov	$acc2, %rdx
1485	mulx	8*4($a_ptr), %rdx, $t0
1486
1487	mulx	8*0($a_ptr), $t0, $t1
1488	adcx	$t0, $acc2		# guaranteed to be zero
1489	adox	$t1, $acc3
1490	mulx	8*1($a_ptr), $t0, $t1
1491	adcx	$t0, $acc3
1492	adox	$t1, $acc0
1493	mulx	8*2($a_ptr), $t0, $t1
1494	adcx	$t0, $acc0
1495	adox	$t1, $acc1
1496	mulx	8*3($a_ptr), $t0, $t1
1497	adcx	$t0, $acc1
1498	adox	$t1, $acc2		# of=0
1499	adcx	%rax, $acc2		# cf=0
1500
1501	#################################
1502	mov	$acc3, %rdx
1503	mulx	8*4($a_ptr), %rdx, $t0
1504
1505	mulx	8*0($a_ptr), $t0, $t1
1506	adox	$t0, $acc3		# guaranteed to be zero
1507	adcx	$t1, $acc0
1508	mulx	8*1($a_ptr), $t0, $t1
1509	adox	$t0, $acc0
1510	adcx	$t1, $acc1
1511	mulx	8*2($a_ptr), $t0, $t1
1512	adox	$t0, $acc1
1513	adcx	$t1, $acc2
1514	mulx	8*3($a_ptr), $t0, $t1
1515	adox	$t0, $acc2
1516	adcx	$t1, $acc3
1517	adox	%rax, $acc3
1518
1519	################################# accumulate upper half
1520	add	$acc0, $acc4		# add	$acc4, $acc0
1521	adc	$acc5, $acc1
1522	 mov	$acc4, %rdx
1523	adc	$acc6, $acc2
1524	adc	$acc7, $acc3
1525	 mov	$acc1, $acc6
1526	adc	\$0, %rax
1527
1528	################################# compare to modulus
1529	sub	8*0($a_ptr), $acc4
1530	 mov	$acc2, $acc7
1531	sbb	8*1($a_ptr), $acc1
1532	sbb	8*2($a_ptr), $acc2
1533	 mov	$acc3, $acc0
1534	sbb	8*3($a_ptr), $acc3
1535	sbb	\$0, %rax
1536
1537	cmovnc	$acc4, %rdx
1538	cmovnc	$acc1, $acc6
1539	cmovnc	$acc2, $acc7
1540	cmovnc	$acc3, $acc0
1541
1542	dec	$b_ptr
1543	jnz	.Loop_ord_sqrx
1544
1545	mov	%rdx, 8*0($r_ptr)
1546	mov	$acc6, 8*1($r_ptr)
1547	pxor	%xmm1, %xmm1
1548	mov	$acc7, 8*2($r_ptr)
1549	pxor	%xmm2, %xmm2
1550	mov	$acc0, 8*3($r_ptr)
1551	pxor	%xmm3, %xmm3
1552
1553	mov	0(%rsp),%r15
1554.cfi_restore	%r15
1555	mov	8(%rsp),%r14
1556.cfi_restore	%r14
1557	mov	16(%rsp),%r13
1558.cfi_restore	%r13
1559	mov	24(%rsp),%r12
1560.cfi_restore	%r12
1561	mov	32(%rsp),%rbx
1562.cfi_restore	%rbx
1563	mov	40(%rsp),%rbp
1564.cfi_restore	%rbp
1565	lea	48(%rsp),%rsp
1566.cfi_adjust_cfa_offset	-48
1567.Lord_sqrx_epilogue:
1568	ret
1569.cfi_endproc
1570.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1571___
1572
1573$code.=<<___;
1574################################################################################
1575# void ecp_nistz256_to_mont(
1576#   uint64_t res[4],
1577#   uint64_t in[4]);
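# Conversion into the Montgomery domain: res = in*2^256 mod p, implemented
# as a Montgomery multiplication by RR = 2^512 mod p (.LRR above).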
1578.globl	ecp_nistz256_to_mont
1579.type	ecp_nistz256_to_mont,\@function,2
1580.align	32
1581ecp_nistz256_to_mont:
1582.cfi_startproc
1583___
1584$code.=<<___	if ($addx);
1585	mov	\$0x80100, %ecx
1586	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1587___
1588$code.=<<___;
1589	lea	.LRR(%rip), $b_org
1590	jmp	.Lmul_mont
1591.cfi_endproc
1592.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1593
1594################################################################################
1595# void ecp_nistz256_mul_mont(
1596#   uint64_t res[4],
1597#   uint64_t a[4],
1598#   uint64_t b[4]);
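# res = a*b*2^-256 mod p, i.e. Montgomery multiplication with R = 2^256.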
1599
1600.globl	ecp_nistz256_mul_mont
1601.type	ecp_nistz256_mul_mont,\@function,3
1602.align	32
1603ecp_nistz256_mul_mont:
1604.cfi_startproc
1605___
1606$code.=<<___	if ($addx);
1607	mov	\$0x80100, %ecx
1608	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1609___
1610$code.=<<___;
1611.Lmul_mont:
1612	push	%rbp
1613.cfi_push	%rbp
1614	push	%rbx
1615.cfi_push	%rbx
1616	push	%r12
1617.cfi_push	%r12
1618	push	%r13
1619.cfi_push	%r13
1620	push	%r14
1621.cfi_push	%r14
1622	push	%r15
1623.cfi_push	%r15
1624.Lmul_body:
1625___
1626$code.=<<___	if ($addx);
1627	cmp	\$0x80100, %ecx
1628	je	.Lmul_montx
1629___
1630$code.=<<___;
1631	mov	$b_org, $b_ptr
1632	mov	8*0($b_org), %rax
1633	mov	8*0($a_ptr), $acc1
1634	mov	8*1($a_ptr), $acc2
1635	mov	8*2($a_ptr), $acc3
1636	mov	8*3($a_ptr), $acc4
1637
1638	call	__ecp_nistz256_mul_montq
1639___
1640$code.=<<___	if ($addx);
1641	jmp	.Lmul_mont_done
1642
1643.align	32
1644.Lmul_montx:
1645	mov	$b_org, $b_ptr
1646	mov	8*0($b_org), %rdx
1647	mov	8*0($a_ptr), $acc1
1648	mov	8*1($a_ptr), $acc2
1649	mov	8*2($a_ptr), $acc3
1650	mov	8*3($a_ptr), $acc4
1651	lea	-128($a_ptr), $a_ptr	# control u-op density
1652
1653	call	__ecp_nistz256_mul_montx
1654___
1655$code.=<<___;
1656.Lmul_mont_done:
1657	mov	0(%rsp),%r15
1658.cfi_restore	%r15
1659	mov	8(%rsp),%r14
1660.cfi_restore	%r14
1661	mov	16(%rsp),%r13
1662.cfi_restore	%r13
1663	mov	24(%rsp),%r12
1664.cfi_restore	%r12
1665	mov	32(%rsp),%rbx
1666.cfi_restore	%rbx
1667	mov	40(%rsp),%rbp
1668.cfi_restore	%rbp
1669	lea	48(%rsp),%rsp
1670.cfi_adjust_cfa_offset	-48
1671.Lmul_epilogue:
1672	ret
1673.cfi_endproc
1674.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1675
1676.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1677.align	32
1678__ecp_nistz256_mul_montq:
1679.cfi_startproc
1680	########################################################################
1681	# Multiply a by b[0]
1682	mov	%rax, $t1
1683	mulq	$acc1
1684	mov	.Lpoly+8*1(%rip),$poly1
1685	mov	%rax, $acc0
1686	mov	$t1, %rax
1687	mov	%rdx, $acc1
1688
1689	mulq	$acc2
1690	mov	.Lpoly+8*3(%rip),$poly3
1691	add	%rax, $acc1
1692	mov	$t1, %rax
1693	adc	\$0, %rdx
1694	mov	%rdx, $acc2
1695
1696	mulq	$acc3
1697	add	%rax, $acc2
1698	mov	$t1, %rax
1699	adc	\$0, %rdx
1700	mov	%rdx, $acc3
1701
1702	mulq	$acc4
1703	add	%rax, $acc3
1704	 mov	$acc0, %rax
1705	adc	\$0, %rdx
1706	xor	$acc5, $acc5
1707	mov	%rdx, $acc4
1708
1709	########################################################################
1710	# First reduction step
1711	# Basically now we want to multiply acc[0] by p256,
1712	# and add the result to the acc.
1713	# Due to the special form of p256 we do some optimizations
1714	#
1715	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1716	# then we add acc[0] and get acc[0] x 2^96
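	#
	# Note that the Montgomery factor comes for free here: the least
	# significant limb of p256 is 2^64-1, hence -p256^-1 mod 2^64 is 1,
	# so the factor is simply acc[0] itself and no extra multiplication
	# is needed to obtain it.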
1717
1718	mov	$acc0, $t1
1719	shl	\$32, $acc0
1720	mulq	$poly3
1721	shr	\$32, $t1
1722	add	$acc0, $acc1		# +=acc[0]<<96
1723	adc	$t1, $acc2
1724	adc	%rax, $acc3
1725	 mov	8*1($b_ptr), %rax
1726	adc	%rdx, $acc4
1727	adc	\$0, $acc5
1728	xor	$acc0, $acc0
1729
1730	########################################################################
1731	# Multiply by b[1]
1732	mov	%rax, $t1
1733	mulq	8*0($a_ptr)
1734	add	%rax, $acc1
1735	mov	$t1, %rax
1736	adc	\$0, %rdx
1737	mov	%rdx, $t0
1738
1739	mulq	8*1($a_ptr)
1740	add	$t0, $acc2
1741	adc	\$0, %rdx
1742	add	%rax, $acc2
1743	mov	$t1, %rax
1744	adc	\$0, %rdx
1745	mov	%rdx, $t0
1746
1747	mulq	8*2($a_ptr)
1748	add	$t0, $acc3
1749	adc	\$0, %rdx
1750	add	%rax, $acc3
1751	mov	$t1, %rax
1752	adc	\$0, %rdx
1753	mov	%rdx, $t0
1754
1755	mulq	8*3($a_ptr)
1756	add	$t0, $acc4
1757	adc	\$0, %rdx
1758	add	%rax, $acc4
1759	 mov	$acc1, %rax
1760	adc	%rdx, $acc5
1761	adc	\$0, $acc0
1762
1763	########################################################################
1764	# Second reduction step
1765	mov	$acc1, $t1
1766	shl	\$32, $acc1
1767	mulq	$poly3
1768	shr	\$32, $t1
1769	add	$acc1, $acc2
1770	adc	$t1, $acc3
1771	adc	%rax, $acc4
1772	 mov	8*2($b_ptr), %rax
1773	adc	%rdx, $acc5
1774	adc	\$0, $acc0
1775	xor	$acc1, $acc1
1776
1777	########################################################################
1778	# Multiply by b[2]
1779	mov	%rax, $t1
1780	mulq	8*0($a_ptr)
1781	add	%rax, $acc2
1782	mov	$t1, %rax
1783	adc	\$0, %rdx
1784	mov	%rdx, $t0
1785
1786	mulq	8*1($a_ptr)
1787	add	$t0, $acc3
1788	adc	\$0, %rdx
1789	add	%rax, $acc3
1790	mov	$t1, %rax
1791	adc	\$0, %rdx
1792	mov	%rdx, $t0
1793
1794	mulq	8*2($a_ptr)
1795	add	$t0, $acc4
1796	adc	\$0, %rdx
1797	add	%rax, $acc4
1798	mov	$t1, %rax
1799	adc	\$0, %rdx
1800	mov	%rdx, $t0
1801
1802	mulq	8*3($a_ptr)
1803	add	$t0, $acc5
1804	adc	\$0, %rdx
1805	add	%rax, $acc5
1806	 mov	$acc2, %rax
1807	adc	%rdx, $acc0
1808	adc	\$0, $acc1
1809
1810	########################################################################
1811	# Third reduction step
1812	mov	$acc2, $t1
1813	shl	\$32, $acc2
1814	mulq	$poly3
1815	shr	\$32, $t1
1816	add	$acc2, $acc3
1817	adc	$t1, $acc4
1818	adc	%rax, $acc5
1819	 mov	8*3($b_ptr), %rax
1820	adc	%rdx, $acc0
1821	adc	\$0, $acc1
1822	xor	$acc2, $acc2
1823
1824	########################################################################
1825	# Multiply by b[3]
1826	mov	%rax, $t1
1827	mulq	8*0($a_ptr)
1828	add	%rax, $acc3
1829	mov	$t1, %rax
1830	adc	\$0, %rdx
1831	mov	%rdx, $t0
1832
1833	mulq	8*1($a_ptr)
1834	add	$t0, $acc4
1835	adc	\$0, %rdx
1836	add	%rax, $acc4
1837	mov	$t1, %rax
1838	adc	\$0, %rdx
1839	mov	%rdx, $t0
1840
1841	mulq	8*2($a_ptr)
1842	add	$t0, $acc5
1843	adc	\$0, %rdx
1844	add	%rax, $acc5
1845	mov	$t1, %rax
1846	adc	\$0, %rdx
1847	mov	%rdx, $t0
1848
1849	mulq	8*3($a_ptr)
1850	add	$t0, $acc0
1851	adc	\$0, %rdx
1852	add	%rax, $acc0
1853	 mov	$acc3, %rax
1854	adc	%rdx, $acc1
1855	adc	\$0, $acc2
1856
1857	########################################################################
1858	# Final reduction step
1859	mov	$acc3, $t1
1860	shl	\$32, $acc3
1861	mulq	$poly3
1862	shr	\$32, $t1
1863	add	$acc3, $acc4
1864	adc	$t1, $acc5
1865	 mov	$acc4, $t0
1866	adc	%rax, $acc0
1867	adc	%rdx, $acc1
1868	 mov	$acc5, $t1
1869	adc	\$0, $acc2
1870
1871	########################################################################
1872	# Branch-less conditional subtraction of P
1873	sub	\$-1, $acc4		# .Lpoly[0]
1874	 mov	$acc0, $t2
1875	sbb	$poly1, $acc5		# .Lpoly[1]
1876	sbb	\$0, $acc0		# .Lpoly[2]
1877	 mov	$acc1, $t3
1878	sbb	$poly3, $acc1		# .Lpoly[3]
1879	sbb	\$0, $acc2
1880
1881	cmovc	$t0, $acc4
1882	cmovc	$t1, $acc5
1883	mov	$acc4, 8*0($r_ptr)
1884	cmovc	$t2, $acc0
1885	mov	$acc5, 8*1($r_ptr)
1886	cmovc	$t3, $acc1
1887	mov	$acc0, 8*2($r_ptr)
1888	mov	$acc1, 8*3($r_ptr)
1889
1890	ret
1891.cfi_endproc
1892.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1893
1894################################################################################
1895# void ecp_nistz256_sqr_mont(
1896#   uint64_t res[4],
1897#   uint64_t a[4]);
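# res = a*a*2^-256 mod p, i.e. Montgomery squaring.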
1898
1899# we optimize the square according to S.Gueron and V.Krasnov,
1900# "Speeding up Big-Number Squaring"
1901.globl	ecp_nistz256_sqr_mont
1902.type	ecp_nistz256_sqr_mont,\@function,2
1903.align	32
1904ecp_nistz256_sqr_mont:
1905.cfi_startproc
1906___
1907$code.=<<___	if ($addx);
1908	mov	\$0x80100, %ecx
1909	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1910___
1911$code.=<<___;
1912	push	%rbp
1913.cfi_push	%rbp
1914	push	%rbx
1915.cfi_push	%rbx
1916	push	%r12
1917.cfi_push	%r12
1918	push	%r13
1919.cfi_push	%r13
1920	push	%r14
1921.cfi_push	%r14
1922	push	%r15
1923.cfi_push	%r15
1924.Lsqr_body:
1925___
1926$code.=<<___	if ($addx);
1927	cmp	\$0x80100, %ecx
1928	je	.Lsqr_montx
1929___
1930$code.=<<___;
1931	mov	8*0($a_ptr), %rax
1932	mov	8*1($a_ptr), $acc6
1933	mov	8*2($a_ptr), $acc7
1934	mov	8*3($a_ptr), $acc0
1935
1936	call	__ecp_nistz256_sqr_montq
1937___
1938$code.=<<___	if ($addx);
1939	jmp	.Lsqr_mont_done
1940
1941.align	32
1942.Lsqr_montx:
1943	mov	8*0($a_ptr), %rdx
1944	mov	8*1($a_ptr), $acc6
1945	mov	8*2($a_ptr), $acc7
1946	mov	8*3($a_ptr), $acc0
1947	lea	-128($a_ptr), $a_ptr	# control u-op density
1948
1949	call	__ecp_nistz256_sqr_montx
1950___
1951$code.=<<___;
1952.Lsqr_mont_done:
1953	mov	0(%rsp),%r15
1954.cfi_restore	%r15
1955	mov	8(%rsp),%r14
1956.cfi_restore	%r14
1957	mov	16(%rsp),%r13
1958.cfi_restore	%r13
1959	mov	24(%rsp),%r12
1960.cfi_restore	%r12
1961	mov	32(%rsp),%rbx
1962.cfi_restore	%rbx
1963	mov	40(%rsp),%rbp
1964.cfi_restore	%rbp
1965	lea	48(%rsp),%rsp
1966.cfi_adjust_cfa_offset	-48
1967.Lsqr_epilogue:
1968	ret
1969.cfi_endproc
1970.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1971
1972.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1973.align	32
1974__ecp_nistz256_sqr_montq:
1975.cfi_startproc
1976	mov	%rax, $acc5
1977	mulq	$acc6			# a[1]*a[0]
1978	mov	%rax, $acc1
1979	mov	$acc7, %rax
1980	mov	%rdx, $acc2
1981
1982	mulq	$acc5			# a[0]*a[2]
1983	add	%rax, $acc2
1984	mov	$acc0, %rax
1985	adc	\$0, %rdx
1986	mov	%rdx, $acc3
1987
1988	mulq	$acc5			# a[0]*a[3]
1989	add	%rax, $acc3
1990	 mov	$acc7, %rax
1991	adc	\$0, %rdx
1992	mov	%rdx, $acc4
1993
1994	#################################
1995	mulq	$acc6			# a[1]*a[2]
1996	add	%rax, $acc3
1997	mov	$acc0, %rax
1998	adc	\$0, %rdx
1999	mov	%rdx, $t1
2000
2001	mulq	$acc6			# a[1]*a[3]
2002	add	%rax, $acc4
2003	 mov	$acc0, %rax
2004	adc	\$0, %rdx
2005	add	$t1, $acc4
2006	mov	%rdx, $acc5
2007	adc	\$0, $acc5
2008
2009	#################################
2010	mulq	$acc7			# a[2]*a[3]
2011	xor	$acc7, $acc7
2012	add	%rax, $acc5
2013	 mov	8*0($a_ptr), %rax
2014	mov	%rdx, $acc6
2015	adc	\$0, $acc6
2016
2017	add	$acc1, $acc1		# acc1:6<<1
2018	adc	$acc2, $acc2
2019	adc	$acc3, $acc3
2020	adc	$acc4, $acc4
2021	adc	$acc5, $acc5
2022	adc	$acc6, $acc6
2023	adc	\$0, $acc7
2024
2025	mulq	%rax
2026	mov	%rax, $acc0
2027	mov	8*1($a_ptr), %rax
2028	mov	%rdx, $t0
2029
2030	mulq	%rax
2031	add	$t0, $acc1
2032	adc	%rax, $acc2
2033	mov	8*2($a_ptr), %rax
2034	adc	\$0, %rdx
2035	mov	%rdx, $t0
2036
2037	mulq	%rax
2038	add	$t0, $acc3
2039	adc	%rax, $acc4
2040	mov	8*3($a_ptr), %rax
2041	adc	\$0, %rdx
2042	mov	%rdx, $t0
2043
2044	mulq	%rax
2045	add	$t0, $acc5
2046	adc	%rax, $acc6
2047	 mov	$acc0, %rax
2048	adc	%rdx, $acc7
2049
2050	mov	.Lpoly+8*1(%rip), $a_ptr
2051	mov	.Lpoly+8*3(%rip), $t1
2052
2053	##########################################
2054	# Now the reduction
2055	# First iteration
2056	mov	$acc0, $t0
2057	shl	\$32, $acc0
2058	mulq	$t1
2059	shr	\$32, $t0
2060	add	$acc0, $acc1		# +=acc[0]<<96
2061	adc	$t0, $acc2
2062	adc	%rax, $acc3
2063	 mov	$acc1, %rax
2064	adc	\$0, %rdx
2065
2066	##########################################
2067	# Second iteration
2068	mov	$acc1, $t0
2069	shl	\$32, $acc1
2070	mov	%rdx, $acc0
2071	mulq	$t1
2072	shr	\$32, $t0
2073	add	$acc1, $acc2
2074	adc	$t0, $acc3
2075	adc	%rax, $acc0
2076	 mov	$acc2, %rax
2077	adc	\$0, %rdx
2078
2079	##########################################
2080	# Third iteration
2081	mov	$acc2, $t0
2082	shl	\$32, $acc2
2083	mov	%rdx, $acc1
2084	mulq	$t1
2085	shr	\$32, $t0
2086	add	$acc2, $acc3
2087	adc	$t0, $acc0
2088	adc	%rax, $acc1
2089	 mov	$acc3, %rax
2090	adc	\$0, %rdx
2091
2092	###########################################
2093	# Last iteration
2094	mov	$acc3, $t0
2095	shl	\$32, $acc3
2096	mov	%rdx, $acc2
2097	mulq	$t1
2098	shr	\$32, $t0
2099	add	$acc3, $acc0
2100	adc	$t0, $acc1
2101	adc	%rax, $acc2
2102	adc	\$0, %rdx
2103	xor	$acc3, $acc3
2104
2105	############################################
2106	# Add the rest of the acc
2107	add	$acc0, $acc4
2108	adc	$acc1, $acc5
2109	 mov	$acc4, $acc0
2110	adc	$acc2, $acc6
2111	adc	%rdx, $acc7
2112	 mov	$acc5, $acc1
2113	adc	\$0, $acc3
2114
2115	sub	\$-1, $acc4		# .Lpoly[0]
2116	 mov	$acc6, $acc2
2117	sbb	$a_ptr, $acc5		# .Lpoly[1]
2118	sbb	\$0, $acc6		# .Lpoly[2]
2119	 mov	$acc7, $t0
2120	sbb	$t1, $acc7		# .Lpoly[3]
2121	sbb	\$0, $acc3
2122
2123	cmovc	$acc0, $acc4
2124	cmovc	$acc1, $acc5
2125	mov	$acc4, 8*0($r_ptr)
2126	cmovc	$acc2, $acc6
2127	mov	$acc5, 8*1($r_ptr)
2128	cmovc	$t0, $acc7
2129	mov	$acc6, 8*2($r_ptr)
2130	mov	$acc7, 8*3($r_ptr)
2131
2132	ret
2133.cfi_endproc
2134.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2135___
2136
2137if ($addx) {
2138$code.=<<___;
2139.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
2140.align	32
2141__ecp_nistz256_mul_montx:
2142.cfi_startproc
2143	########################################################################
2144	# Multiply by b[0]
2145	mulx	$acc1, $acc0, $acc1
2146	mulx	$acc2, $t0, $acc2
2147	mov	\$32, $poly1
2148	xor	$acc5, $acc5		# cf=0
2149	mulx	$acc3, $t1, $acc3
2150	mov	.Lpoly+8*3(%rip), $poly3
2151	adc	$t0, $acc1
2152	mulx	$acc4, $t0, $acc4
2153	 mov	$acc0, %rdx
2154	adc	$t1, $acc2
2155	 shlx	$poly1,$acc0,$t1
2156	adc	$t0, $acc3
2157	 shrx	$poly1,$acc0,$t0
2158	adc	\$0, $acc4
2159
2160	########################################################################
2161	# First reduction step
2162	add	$t1, $acc1
2163	adc	$t0, $acc2
2164
2165	mulx	$poly3, $t0, $t1
2166	 mov	8*1($b_ptr), %rdx
2167	adc	$t0, $acc3
2168	adc	$t1, $acc4
2169	adc	\$0, $acc5
2170	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
2171
2172	########################################################################
2173	# Multiply by b[1]
2174	mulx	8*0+128($a_ptr), $t0, $t1
2175	adcx	$t0, $acc1
2176	adox	$t1, $acc2
2177
2178	mulx	8*1+128($a_ptr), $t0, $t1
2179	adcx	$t0, $acc2
2180	adox	$t1, $acc3
2181
2182	mulx	8*2+128($a_ptr), $t0, $t1
2183	adcx	$t0, $acc3
2184	adox	$t1, $acc4
2185
2186	mulx	8*3+128($a_ptr), $t0, $t1
2187	 mov	$acc1, %rdx
2188	adcx	$t0, $acc4
2189	 shlx	$poly1, $acc1, $t0
2190	adox	$t1, $acc5
2191	 shrx	$poly1, $acc1, $t1
2192
2193	adcx	$acc0, $acc5
2194	adox	$acc0, $acc0
2195	adc	\$0, $acc0
2196
2197	########################################################################
2198	# Second reduction step
2199	add	$t0, $acc2
2200	adc	$t1, $acc3
2201
2202	mulx	$poly3, $t0, $t1
2203	 mov	8*2($b_ptr), %rdx
2204	adc	$t0, $acc4
2205	adc	$t1, $acc5
2206	adc	\$0, $acc0
2207	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
2208
2209	########################################################################
2210	# Multiply by b[2]
2211	mulx	8*0+128($a_ptr), $t0, $t1
2212	adcx	$t0, $acc2
2213	adox	$t1, $acc3
2214
2215	mulx	8*1+128($a_ptr), $t0, $t1
2216	adcx	$t0, $acc3
2217	adox	$t1, $acc4
2218
2219	mulx	8*2+128($a_ptr), $t0, $t1
2220	adcx	$t0, $acc4
2221	adox	$t1, $acc5
2222
2223	mulx	8*3+128($a_ptr), $t0, $t1
2224	 mov	$acc2, %rdx
2225	adcx	$t0, $acc5
2226	 shlx	$poly1, $acc2, $t0
2227	adox	$t1, $acc0
2228	 shrx	$poly1, $acc2, $t1
2229
2230	adcx	$acc1, $acc0
2231	adox	$acc1, $acc1
2232	adc	\$0, $acc1
2233
2234	########################################################################
2235	# Third reduction step
2236	add	$t0, $acc3
2237	adc	$t1, $acc4
2238
2239	mulx	$poly3, $t0, $t1
2240	 mov	8*3($b_ptr), %rdx
2241	adc	$t0, $acc5
2242	adc	$t1, $acc0
2243	adc	\$0, $acc1
2244	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
2245
2246	########################################################################
2247	# Multiply by b[3]
2248	mulx	8*0+128($a_ptr), $t0, $t1
2249	adcx	$t0, $acc3
2250	adox	$t1, $acc4
2251
2252	mulx	8*1+128($a_ptr), $t0, $t1
2253	adcx	$t0, $acc4
2254	adox	$t1, $acc5
2255
2256	mulx	8*2+128($a_ptr), $t0, $t1
2257	adcx	$t0, $acc5
2258	adox	$t1, $acc0
2259
2260	mulx	8*3+128($a_ptr), $t0, $t1
2261	 mov	$acc3, %rdx
2262	adcx	$t0, $acc0
2263	 shlx	$poly1, $acc3, $t0
2264	adox	$t1, $acc1
2265	 shrx	$poly1, $acc3, $t1
2266
2267	adcx	$acc2, $acc1
2268	adox	$acc2, $acc2
2269	adc	\$0, $acc2
2270
2271	########################################################################
2272	# Fourth reduction step
2273	add	$t0, $acc4
2274	adc	$t1, $acc5
2275
2276	mulx	$poly3, $t0, $t1
2277	 mov	$acc4, $t2
2278	mov	.Lpoly+8*1(%rip), $poly1
2279	adc	$t0, $acc0
2280	 mov	$acc5, $t3
2281	adc	$t1, $acc1
2282	adc	\$0, $acc2
2283
2284	########################################################################
2285	# Branch-less conditional subtraction of P
2286	xor	%eax, %eax
2287	 mov	$acc0, $t0
2288	sbb	\$-1, $acc4		# .Lpoly[0]
2289	sbb	$poly1, $acc5		# .Lpoly[1]
2290	sbb	\$0, $acc0		# .Lpoly[2]
2291	 mov	$acc1, $t1
2292	sbb	$poly3, $acc1		# .Lpoly[3]
2293	sbb	\$0, $acc2
2294
2295	cmovc	$t2, $acc4
2296	cmovc	$t3, $acc5
2297	mov	$acc4, 8*0($r_ptr)
2298	cmovc	$t0, $acc0
2299	mov	$acc5, 8*1($r_ptr)
2300	cmovc	$t1, $acc1
2301	mov	$acc0, 8*2($r_ptr)
2302	mov	$acc1, 8*3($r_ptr)
2303
2304	ret
2305.cfi_endproc
2306.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2307
2308.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
2309.align	32
2310__ecp_nistz256_sqr_montx:
2311.cfi_startproc
2312	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
2313	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
2314	xor	%eax, %eax
2315	adc	$t0, $acc2
2316	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
2317	 mov	$acc6, %rdx
2318	adc	$t1, $acc3
2319	adc	\$0, $acc4
2320	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
2321
2322	#################################
2323	mulx	$acc7, $t0, $t1		# a[1]*a[2]
2324	adcx	$t0, $acc3
2325	adox	$t1, $acc4
2326
2327	mulx	$acc0, $t0, $t1		# a[1]*a[3]
2328	 mov	$acc7, %rdx
2329	adcx	$t0, $acc4
2330	adox	$t1, $acc5
2331	adc	\$0, $acc5
2332
2333	#################################
2334	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
2335	 mov	8*0+128($a_ptr), %rdx
2336	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
2337	 adcx	$acc1, $acc1		# acc1:6<<1
2338	adox	$t0, $acc5
2339	 adcx	$acc2, $acc2
2340	adox	$acc7, $acc6		# of=0
2341
2342	mulx	%rdx, $acc0, $t1
2343	mov	8*1+128($a_ptr), %rdx
2344	 adcx	$acc3, $acc3
2345	adox	$t1, $acc1
2346	 adcx	$acc4, $acc4
2347	mulx	%rdx, $t0, $t4
2348	mov	8*2+128($a_ptr), %rdx
2349	 adcx	$acc5, $acc5
2350	adox	$t0, $acc2
2351	 adcx	$acc6, $acc6
2352	.byte	0x67
2353	mulx	%rdx, $t0, $t1
2354	mov	8*3+128($a_ptr), %rdx
2355	adox	$t4, $acc3
2356	 adcx	$acc7, $acc7
2357	adox	$t0, $acc4
2358	 mov	\$32, $a_ptr
2359	adox	$t1, $acc5
2360	.byte	0x67,0x67
2361	mulx	%rdx, $t0, $t4
2362	 mov	.Lpoly+8*3(%rip), %rdx
2363	adox	$t0, $acc6
2364	 shlx	$a_ptr, $acc0, $t0
2365	adox	$t4, $acc7
2366	 shrx	$a_ptr, $acc0, $t4
2367	mov	%rdx,$t1
2368
2369	# reduction step 1
2370	add	$t0, $acc1
2371	adc	$t4, $acc2
2372
2373	mulx	$acc0, $t0, $acc0
2374	adc	$t0, $acc3
2375	 shlx	$a_ptr, $acc1, $t0
2376	adc	\$0, $acc0
2377	 shrx	$a_ptr, $acc1, $t4
2378
2379	# reduction step 2
2380	add	$t0, $acc2
2381	adc	$t4, $acc3
2382
2383	mulx	$acc1, $t0, $acc1
2384	adc	$t0, $acc0
2385	 shlx	$a_ptr, $acc2, $t0
2386	adc	\$0, $acc1
2387	 shrx	$a_ptr, $acc2, $t4
2388
2389	# reduction step 3
2390	add	$t0, $acc3
2391	adc	$t4, $acc0
2392
2393	mulx	$acc2, $t0, $acc2
2394	adc	$t0, $acc1
2395	 shlx	$a_ptr, $acc3, $t0
2396	adc	\$0, $acc2
2397	 shrx	$a_ptr, $acc3, $t4
2398
2399	# reduction step 4
2400	add	$t0, $acc0
2401	adc	$t4, $acc1
2402
2403	mulx	$acc3, $t0, $acc3
2404	adc	$t0, $acc2
2405	adc	\$0, $acc3
2406
2407	xor	$t3, $t3
2408	add	$acc0, $acc4		# accumulate upper half
2409	 mov	.Lpoly+8*1(%rip), $a_ptr
2410	adc	$acc1, $acc5
2411	 mov	$acc4, $acc0
2412	adc	$acc2, $acc6
2413	adc	$acc3, $acc7
2414	 mov	$acc5, $acc1
2415	adc	\$0, $t3
2416
2417	sub	\$-1, $acc4		# .Lpoly[0]
2418	 mov	$acc6, $acc2
2419	sbb	$a_ptr, $acc5		# .Lpoly[1]
2420	sbb	\$0, $acc6		# .Lpoly[2]
2421	 mov	$acc7, $acc3
2422	sbb	$t1, $acc7		# .Lpoly[3]
2423	sbb	\$0, $t3
2424
2425	cmovc	$acc0, $acc4
2426	cmovc	$acc1, $acc5
2427	mov	$acc4, 8*0($r_ptr)
2428	cmovc	$acc2, $acc6
2429	mov	$acc5, 8*1($r_ptr)
2430	cmovc	$acc3, $acc7
2431	mov	$acc6, 8*2($r_ptr)
2432	mov	$acc7, 8*3($r_ptr)
2433
2434	ret
2435.cfi_endproc
2436.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2437___
2438}
2439}
2440{
2441my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2442my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2443my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2444
2445$code.=<<___;
2446################################################################################
2447# void ecp_nistz256_from_mont(
2448#   uint64_t res[4],
2449#   uint64_t in[4]);
2450# This one performs Montgomery multiplication by 1, so we only need the reduction
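# (equivalently res = in*2^-256 mod p, the conversion out of the Montgomery
# domain)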
2451
2452.globl	ecp_nistz256_from_mont
2453.type	ecp_nistz256_from_mont,\@function,2
2454.align	32
2455ecp_nistz256_from_mont:
2456.cfi_startproc
2457	push	%r12
2458.cfi_push	%r12
2459	push	%r13
2460.cfi_push	%r13
2461.Lfrom_body:
2462
2463	mov	8*0($in_ptr), %rax
2464	mov	.Lpoly+8*3(%rip), $t2
2465	mov	8*1($in_ptr), $acc1
2466	mov	8*2($in_ptr), $acc2
2467	mov	8*3($in_ptr), $acc3
2468	mov	%rax, $acc0
2469	mov	.Lpoly+8*1(%rip), $t1
2470
2471	#########################################
2472	# First iteration
2473	mov	%rax, $t0
2474	shl	\$32, $acc0
2475	mulq	$t2
2476	shr	\$32, $t0
2477	add	$acc0, $acc1
2478	adc	$t0, $acc2
2479	adc	%rax, $acc3
2480	 mov	$acc1, %rax
2481	adc	\$0, %rdx
2482
2483	#########################################
2484	# Second iteration
2485	mov	$acc1, $t0
2486	shl	\$32, $acc1
2487	mov	%rdx, $acc0
2488	mulq	$t2
2489	shr	\$32, $t0
2490	add	$acc1, $acc2
2491	adc	$t0, $acc3
2492	adc	%rax, $acc0
2493	 mov	$acc2, %rax
2494	adc	\$0, %rdx
2495
2496	##########################################
2497	# Third iteration
2498	mov	$acc2, $t0
2499	shl	\$32, $acc2
2500	mov	%rdx, $acc1
2501	mulq	$t2
2502	shr	\$32, $t0
2503	add	$acc2, $acc3
2504	adc	$t0, $acc0
2505	adc	%rax, $acc1
2506	 mov	$acc3, %rax
2507	adc	\$0, %rdx
2508
2509	###########################################
2510	# Last iteration
2511	mov	$acc3, $t0
2512	shl	\$32, $acc3
2513	mov	%rdx, $acc2
2514	mulq	$t2
2515	shr	\$32, $t0
2516	add	$acc3, $acc0
2517	adc	$t0, $acc1
2518	 mov	$acc0, $t0
2519	adc	%rax, $acc2
2520	 mov	$acc1, $in_ptr
2521	adc	\$0, %rdx
2522
2523	###########################################
2524	# Branch-less conditional subtraction
2525	sub	\$-1, $acc0
2526	 mov	$acc2, %rax
2527	sbb	$t1, $acc1
2528	sbb	\$0, $acc2
2529	 mov	%rdx, $acc3
2530	sbb	$t2, %rdx
2531	sbb	$t2, $t2
2532
2533	cmovnz	$t0, $acc0
2534	cmovnz	$in_ptr, $acc1
2535	mov	$acc0, 8*0($r_ptr)
2536	cmovnz	%rax, $acc2
2537	mov	$acc1, 8*1($r_ptr)
2538	cmovz	%rdx, $acc3
2539	mov	$acc2, 8*2($r_ptr)
2540	mov	$acc3, 8*3($r_ptr)
2541
2542	mov	0(%rsp),%r13
2543.cfi_restore	%r13
2544	mov	8(%rsp),%r12
2545.cfi_restore	%r12
2546	lea	16(%rsp),%rsp
2547.cfi_adjust_cfa_offset	-16
2548.Lfrom_epilogue:
2549	ret
2550.cfi_endproc
2551.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2552___
2553}
2554{
2555my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2556my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2557my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2558my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2559
2560$code.=<<___;
2561################################################################################
2562# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
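#
# The table evidently holds 16 rows of 96 bytes (one Jacobian point each) and
# the index is 1-based: the lea/shl pair below computes the byte offset
# 32*(3*index-3) = 96*(index-1).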
2563.globl	ecp_nistz256_scatter_w5
2564.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
2565.align	32
2566ecp_nistz256_scatter_w5:
2567.cfi_startproc
2568	lea	-3($index,$index,2), $index
2569	movdqa	0x00($in_t), %xmm0
2570	shl	\$5, $index
2571	movdqa	0x10($in_t), %xmm1
2572	movdqa	0x20($in_t), %xmm2
2573	movdqa	0x30($in_t), %xmm3
2574	movdqa	0x40($in_t), %xmm4
2575	movdqa	0x50($in_t), %xmm5
2576	movdqa	%xmm0, 0x00($val,$index)
2577	movdqa	%xmm1, 0x10($val,$index)
2578	movdqa	%xmm2, 0x20($val,$index)
2579	movdqa	%xmm3, 0x30($val,$index)
2580	movdqa	%xmm4, 0x40($val,$index)
2581	movdqa	%xmm5, 0x50($val,$index)
2582
2583	ret
2584.cfi_endproc
2585.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2586
2587################################################################################
2588# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
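#
# The gather is written to be constant-time: all 16 table rows are read and a
# compare-generated mask (pcmpeqd/pand/por below) selects the requested one,
# so the memory access pattern is independent of index. A rough C sketch of
# the idea (illustrative only; the helper name is made up, it needs
# <stdint.h>/<string.h>, and a real C version would additionally have to keep
# the compiler from reintroducing branches):
#
#	static void gather_w5_ref(uint64_t val[12],
#				  const uint64_t in_t[16 * 12], int index)
#	{
#		uint64_t acc[12] = { 0 };
#
#		for (int i = 1; i <= 16; i++) {
#			/* all-ones when i == index, all-zeros otherwise */
#			uint64_t mask = 0 - (uint64_t)(i == index);
#			for (int j = 0; j < 12; j++)
#				acc[j] |= in_t[(i - 1) * 12 + j] & mask;
#		}
#		memcpy(val, acc, sizeof(acc));
#	}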
2589.globl	ecp_nistz256_gather_w5
2590.type	ecp_nistz256_gather_w5,\@abi-omnipotent
2591.align	32
2592ecp_nistz256_gather_w5:
2593.cfi_startproc
2594___
2595$code.=<<___	if ($avx>1);
2596	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2597	test	\$`1<<5`, %eax
2598	jnz	.Lavx2_gather_w5
2599___
2600$code.=<<___	if ($win64);
2601	lea	-0x88(%rsp), %rax
2602.LSEH_begin_ecp_nistz256_gather_w5:
2603	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2604	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2605	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2606	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2607	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2608	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2609	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2610	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2611	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2612	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2613	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2614___
2615$code.=<<___;
2616	movdqa	.LOne(%rip), $ONE
2617	movd	$index, $INDEX
2618
2619	pxor	$Ra, $Ra
2620	pxor	$Rb, $Rb
2621	pxor	$Rc, $Rc
2622	pxor	$Rd, $Rd
2623	pxor	$Re, $Re
2624	pxor	$Rf, $Rf
2625
2626	movdqa	$ONE, $M0
2627	pshufd	\$0, $INDEX, $INDEX
2628
2629	mov	\$16, %rax
2630.Lselect_loop_sse_w5:
2631
2632	movdqa	$M0, $TMP0
2633	paddd	$ONE, $M0
2634	pcmpeqd $INDEX, $TMP0
2635
2636	movdqa	16*0($in_t), $T0a
2637	movdqa	16*1($in_t), $T0b
2638	movdqa	16*2($in_t), $T0c
2639	movdqa	16*3($in_t), $T0d
2640	movdqa	16*4($in_t), $T0e
2641	movdqa	16*5($in_t), $T0f
2642	lea 16*6($in_t), $in_t
2643
2644	pand	$TMP0, $T0a
2645	pand	$TMP0, $T0b
2646	por	$T0a, $Ra
2647	pand	$TMP0, $T0c
2648	por	$T0b, $Rb
2649	pand	$TMP0, $T0d
2650	por	$T0c, $Rc
2651	pand	$TMP0, $T0e
2652	por	$T0d, $Rd
2653	pand	$TMP0, $T0f
2654	por	$T0e, $Re
2655	por	$T0f, $Rf
2656
2657	dec	%rax
2658	jnz	.Lselect_loop_sse_w5
2659
2660	movdqu	$Ra, 16*0($val)
2661	movdqu	$Rb, 16*1($val)
2662	movdqu	$Rc, 16*2($val)
2663	movdqu	$Rd, 16*3($val)
2664	movdqu	$Re, 16*4($val)
2665	movdqu	$Rf, 16*5($val)
2666___
2667$code.=<<___	if ($win64);
2668	movaps	(%rsp), %xmm6
2669	movaps	0x10(%rsp), %xmm7
2670	movaps	0x20(%rsp), %xmm8
2671	movaps	0x30(%rsp), %xmm9
2672	movaps	0x40(%rsp), %xmm10
2673	movaps	0x50(%rsp), %xmm11
2674	movaps	0x60(%rsp), %xmm12
2675	movaps	0x70(%rsp), %xmm13
2676	movaps	0x80(%rsp), %xmm14
2677	movaps	0x90(%rsp), %xmm15
2678	lea	0xa8(%rsp), %rsp
2679___
2680$code.=<<___;
2681	ret
2682.cfi_endproc
2683.LSEH_end_ecp_nistz256_gather_w5:
2684.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2685
2686################################################################################
2687# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2688.globl	ecp_nistz256_scatter_w7
2689.type	ecp_nistz256_scatter_w7,\@abi-omnipotent
2690.align	32
2691ecp_nistz256_scatter_w7:
2692.cfi_startproc
2693	movdqu	0x00($in_t), %xmm0
2694	shl	\$6, $index
2695	movdqu	0x10($in_t), %xmm1
2696	movdqu	0x20($in_t), %xmm2
2697	movdqu	0x30($in_t), %xmm3
2698	movdqa	%xmm0, 0x00($val,$index)
2699	movdqa	%xmm1, 0x10($val,$index)
2700	movdqa	%xmm2, 0x20($val,$index)
2701	movdqa	%xmm3, 0x30($val,$index)
2702
2703	ret
2704.cfi_endproc
2705.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2706
2707################################################################################
2708# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
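#
# Same constant-time masking technique as ecp_nistz256_gather_w5 above, but
# over 64 rows of 64 bytes (affine x and y only), so the loop below runs 64
# times with four 128-bit accumulators instead of six.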
2709.globl	ecp_nistz256_gather_w7
2710.type	ecp_nistz256_gather_w7,\@abi-omnipotent
2711.align	32
2712ecp_nistz256_gather_w7:
2713.cfi_startproc
2714___
2715$code.=<<___	if ($avx>1);
2716	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2717	test	\$`1<<5`, %eax
2718	jnz	.Lavx2_gather_w7
2719___
2720$code.=<<___	if ($win64);
2721	lea	-0x88(%rsp), %rax
2722.LSEH_begin_ecp_nistz256_gather_w7:
2723	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2724	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2725	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2726	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2727	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2728	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2729	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2730	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2731	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2732	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2733	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2734___
2735$code.=<<___;
2736	movdqa	.LOne(%rip), $M0
2737	movd	$index, $INDEX
2738
2739	pxor	$Ra, $Ra
2740	pxor	$Rb, $Rb
2741	pxor	$Rc, $Rc
2742	pxor	$Rd, $Rd
2743
2744	movdqa	$M0, $ONE
2745	pshufd	\$0, $INDEX, $INDEX
2746	mov	\$64, %rax
2747
2748.Lselect_loop_sse_w7:
2749	movdqa	$M0, $TMP0
2750	paddd	$ONE, $M0
2751	movdqa	16*0($in_t), $T0a
2752	movdqa	16*1($in_t), $T0b
2753	pcmpeqd	$INDEX, $TMP0
2754	movdqa	16*2($in_t), $T0c
2755	movdqa	16*3($in_t), $T0d
2756	lea	16*4($in_t), $in_t
2757
2758	pand	$TMP0, $T0a
2759	pand	$TMP0, $T0b
2760	por	$T0a, $Ra
2761	pand	$TMP0, $T0c
2762	por	$T0b, $Rb
2763	pand	$TMP0, $T0d
2764	por	$T0c, $Rc
2765	prefetcht0	255($in_t)
2766	por	$T0d, $Rd
2767
2768	dec	%rax
2769	jnz	.Lselect_loop_sse_w7
2770
2771	movdqu	$Ra, 16*0($val)
2772	movdqu	$Rb, 16*1($val)
2773	movdqu	$Rc, 16*2($val)
2774	movdqu	$Rd, 16*3($val)
2775___
2776$code.=<<___	if ($win64);
2777	movaps	(%rsp), %xmm6
2778	movaps	0x10(%rsp), %xmm7
2779	movaps	0x20(%rsp), %xmm8
2780	movaps	0x30(%rsp), %xmm9
2781	movaps	0x40(%rsp), %xmm10
2782	movaps	0x50(%rsp), %xmm11
2783	movaps	0x60(%rsp), %xmm12
2784	movaps	0x70(%rsp), %xmm13
2785	movaps	0x80(%rsp), %xmm14
2786	movaps	0x90(%rsp), %xmm15
2787	lea	0xa8(%rsp), %rsp
2788___
2789$code.=<<___;
2790	ret
2791.cfi_endproc
2792.LSEH_end_ecp_nistz256_gather_w7:
2793.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2794___
2795}
2796if ($avx>1) {
2797my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2798my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2799my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2800my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2801
2802$code.=<<___;
2803################################################################################
2804# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
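#
# AVX2 variant of the w5 gather: two candidate rows are examined per
# iteration (the comparator vectors step by .LTwo), so the loop below runs
# 8 times to cover all 16 rows, accumulating into three 256-bit registers.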
2805.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2806.align	32
2807ecp_nistz256_avx2_gather_w5:
2808.cfi_startproc
2809.Lavx2_gather_w5:
2810	vzeroupper
2811___
2812$code.=<<___	if ($win64);
2813	lea	-0x88(%rsp), %rax
2814	mov	%rsp,%r11
2815.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2816	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2817	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2818	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2820	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2821	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2822	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2823	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2824	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2825	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2826	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2827___
2828$code.=<<___;
2829	vmovdqa	.LTwo(%rip), $TWO
2830
2831	vpxor	$Ra, $Ra, $Ra
2832	vpxor	$Rb, $Rb, $Rb
2833	vpxor	$Rc, $Rc, $Rc
2834
2835	vmovdqa .LOne(%rip), $M0
2836	vmovdqa .LTwo(%rip), $M1
2837
2838	vmovd	$index, %xmm1
2839	vpermd	$INDEX, $Ra, $INDEX
2840
2841	mov	\$8, %rax
2842.Lselect_loop_avx2_w5:
2843
2844	vmovdqa	32*0($in_t), $T0a
2845	vmovdqa	32*1($in_t), $T0b
2846	vmovdqa	32*2($in_t), $T0c
2847
2848	vmovdqa	32*3($in_t), $T1a
2849	vmovdqa	32*4($in_t), $T1b
2850	vmovdqa	32*5($in_t), $T1c
2851
2852	vpcmpeqd	$INDEX, $M0, $TMP0
2853	vpcmpeqd	$INDEX, $M1, $TMP1
2854
2855	vpaddd	$TWO, $M0, $M0
2856	vpaddd	$TWO, $M1, $M1
2857	lea	32*6($in_t), $in_t
2858
2859	vpand	$TMP0, $T0a, $T0a
2860	vpand	$TMP0, $T0b, $T0b
2861	vpand	$TMP0, $T0c, $T0c
2862	vpand	$TMP1, $T1a, $T1a
2863	vpand	$TMP1, $T1b, $T1b
2864	vpand	$TMP1, $T1c, $T1c
2865
2866	vpxor	$T0a, $Ra, $Ra
2867	vpxor	$T0b, $Rb, $Rb
2868	vpxor	$T0c, $Rc, $Rc
2869	vpxor	$T1a, $Ra, $Ra
2870	vpxor	$T1b, $Rb, $Rb
2871	vpxor	$T1c, $Rc, $Rc
2872
2873	dec %rax
2874	jnz .Lselect_loop_avx2_w5
2875
2876	vmovdqu $Ra, 32*0($val)
2877	vmovdqu $Rb, 32*1($val)
2878	vmovdqu $Rc, 32*2($val)
2879	vzeroupper
2880___
2881$code.=<<___	if ($win64);
2882	movaps	(%rsp), %xmm6
2883	movaps	0x10(%rsp), %xmm7
2884	movaps	0x20(%rsp), %xmm8
2885	movaps	0x30(%rsp), %xmm9
2886	movaps	0x40(%rsp), %xmm10
2887	movaps	0x50(%rsp), %xmm11
2888	movaps	0x60(%rsp), %xmm12
2889	movaps	0x70(%rsp), %xmm13
2890	movaps	0x80(%rsp), %xmm14
2891	movaps	0x90(%rsp), %xmm15
2892	lea	(%r11), %rsp
2893___
2894$code.=<<___;
2895	ret
2896.cfi_endproc
2897.LSEH_end_ecp_nistz256_avx2_gather_w5:
2898.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2899___
2900}
2901if ($avx>1) {
2902my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2903my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2904my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2905my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2906my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2907
2908$code.=<<___;
2909
2910################################################################################
2911# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2912.globl	ecp_nistz256_avx2_gather_w7
2913.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2914.align	32
2915ecp_nistz256_avx2_gather_w7:
2916.cfi_startproc
2917.Lavx2_gather_w7:
2918	vzeroupper
2919___
2920$code.=<<___	if ($win64);
2921	mov	%rsp,%r11
2922	lea	-0x88(%rsp), %rax
2923.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2924	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2925	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2926	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2928	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2929	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2930	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2931	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2932	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2933	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2934	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2935___
2936$code.=<<___;
2937	vmovdqa	.LThree(%rip), $THREE
2938
2939	vpxor	$Ra, $Ra, $Ra
2940	vpxor	$Rb, $Rb, $Rb
2941
2942	vmovdqa .LOne(%rip), $M0
2943	vmovdqa .LTwo(%rip), $M1
2944	vmovdqa .LThree(%rip), $M2
2945
2946	vmovd	$index, %xmm1
2947	vpermd	$INDEX, $Ra, $INDEX
2948	# Skip index = 0, because it is implicitly the point at infinity
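	# Three comparator vectors step by .LThree each iteration, so the 21
	# iterations below cover rows 1..63 and the tail after the loop picks
	# up row 64.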
2949
2950	mov	\$21, %rax
2951.Lselect_loop_avx2_w7:
2952
2953	vmovdqa	32*0($in_t), $T0a
2954	vmovdqa	32*1($in_t), $T0b
2955
2956	vmovdqa	32*2($in_t), $T1a
2957	vmovdqa	32*3($in_t), $T1b
2958
2959	vmovdqa	32*4($in_t), $T2a
2960	vmovdqa	32*5($in_t), $T2b
2961
2962	vpcmpeqd	$INDEX, $M0, $TMP0
2963	vpcmpeqd	$INDEX, $M1, $TMP1
2964	vpcmpeqd	$INDEX, $M2, $TMP2
2965
2966	vpaddd	$THREE, $M0, $M0
2967	vpaddd	$THREE, $M1, $M1
2968	vpaddd	$THREE, $M2, $M2
2969	lea	32*6($in_t), $in_t
2970
2971	vpand	$TMP0, $T0a, $T0a
2972	vpand	$TMP0, $T0b, $T0b
2973	vpand	$TMP1, $T1a, $T1a
2974	vpand	$TMP1, $T1b, $T1b
2975	vpand	$TMP2, $T2a, $T2a
2976	vpand	$TMP2, $T2b, $T2b
2977
2978	vpxor	$T0a, $Ra, $Ra
2979	vpxor	$T0b, $Rb, $Rb
2980	vpxor	$T1a, $Ra, $Ra
2981	vpxor	$T1b, $Rb, $Rb
2982	vpxor	$T2a, $Ra, $Ra
2983	vpxor	$T2b, $Rb, $Rb
2984
2985	dec %rax
2986	jnz .Lselect_loop_avx2_w7
2987
2988
2989	vmovdqa	32*0($in_t), $T0a
2990	vmovdqa	32*1($in_t), $T0b
2991
2992	vpcmpeqd	$INDEX, $M0, $TMP0
2993
2994	vpand	$TMP0, $T0a, $T0a
2995	vpand	$TMP0, $T0b, $T0b
2996
2997	vpxor	$T0a, $Ra, $Ra
2998	vpxor	$T0b, $Rb, $Rb
2999
3000	vmovdqu $Ra, 32*0($val)
3001	vmovdqu $Rb, 32*1($val)
3002	vzeroupper
3003___
3004$code.=<<___	if ($win64);
3005	movaps	(%rsp), %xmm6
3006	movaps	0x10(%rsp), %xmm7
3007	movaps	0x20(%rsp), %xmm8
3008	movaps	0x30(%rsp), %xmm9
3009	movaps	0x40(%rsp), %xmm10
3010	movaps	0x50(%rsp), %xmm11
3011	movaps	0x60(%rsp), %xmm12
3012	movaps	0x70(%rsp), %xmm13
3013	movaps	0x80(%rsp), %xmm14
3014	movaps	0x90(%rsp), %xmm15
3015	lea	(%r11), %rsp
3016___
3017$code.=<<___;
3018	ret
3019.cfi_endproc
3020.LSEH_end_ecp_nistz256_avx2_gather_w7:
3021.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3022___
3023} else {
3024$code.=<<___;
3025.globl	ecp_nistz256_avx2_gather_w7
3026.type	ecp_nistz256_avx2_gather_w7,\@function,3
3027.align	32
3028ecp_nistz256_avx2_gather_w7:
3029.cfi_startproc
3030	.byte	0x0f,0x0b	# ud2
3031	ret
3032.cfi_endproc
3033.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3034___
3035}
3036{{{
3037########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine routines. The key to performance in this case is to
# allow the out-of-order execution logic to overlap computations from the
# next step with tail processing from the current step. By using a tailored
# calling sequence we minimize inter-step overhead to give the processor a
# better shot at overlapping operations...
#
# You will notice that input data is copied to the stack. The trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading those pointers would create undesired dependencies on the
# effective-address calculation paths. In other words, it's all done to
# favour out-of-order execution logic.
3050#						<appro@openssl.org>
3051
3052my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3053my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3054my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3055my ($poly1,$poly3)=($acc6,$acc7);
3056
3057sub load_for_mul () {
3058my ($a,$b,$src0) = @_;
3059my $bias = $src0 eq "%rax" ? 0 : -128;
3060
3061"	mov	$b, $src0
3062	lea	$b, $b_ptr
3063	mov	8*0+$a, $acc1
3064	mov	8*1+$a, $acc2
3065	lea	$bias+$a, $a_ptr
3066	mov	8*2+$a, $acc3
3067	mov	8*3+$a, $acc4"
3068}
3069
3070sub load_for_sqr () {
3071my ($a,$src0) = @_;
3072my $bias = $src0 eq "%rax" ? 0 : -128;
3073
3074"	mov	8*0+$a, $src0
3075	mov	8*1+$a, $acc6
3076	lea	$bias+$a, $a_ptr
3077	mov	8*2+$a, $acc7
3078	mov	8*3+$a, $acc0"
3079}
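
# Note: the -128 bias applied to $a_ptr by the two helpers above compensates
# for the fixed +128 displacements hard-coded into the mulx/sqrx ("x") code
# paths (e.g. "mov 8*0+128($a_ptr), %rdx" in __ecp_nistz256_sqr_montx); the
# plain mulq-based paths take $src0 in %rax and use no bias.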
3080
3081									{
3082########################################################################
3083# operate in 4-5-0-1 "name space" that matches multiplication output
3084#
3085my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3086
3087$code.=<<___;
3088.type	__ecp_nistz256_add_toq,\@abi-omnipotent
3089.align	32
3090__ecp_nistz256_add_toq:
3091.cfi_startproc
3092	xor	$t4,$t4
3093	add	8*0($b_ptr), $a0
3094	adc	8*1($b_ptr), $a1
3095	 mov	$a0, $t0
3096	adc	8*2($b_ptr), $a2
3097	adc	8*3($b_ptr), $a3
3098	 mov	$a1, $t1
3099	adc	\$0, $t4
3100
3101	sub	\$-1, $a0
3102	 mov	$a2, $t2
3103	sbb	$poly1, $a1
3104	sbb	\$0, $a2
3105	 mov	$a3, $t3
3106	sbb	$poly3, $a3
3107	sbb	\$0, $t4
3108
3109	cmovc	$t0, $a0
3110	cmovc	$t1, $a1
3111	mov	$a0, 8*0($r_ptr)
3112	cmovc	$t2, $a2
3113	mov	$a1, 8*1($r_ptr)
3114	cmovc	$t3, $a3
3115	mov	$a2, 8*2($r_ptr)
3116	mov	$a3, 8*3($r_ptr)
3117
3118	ret
3119.cfi_endproc
3120.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3121
3122.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
3123.align	32
3124__ecp_nistz256_sub_fromq:
3125.cfi_startproc
3126	sub	8*0($b_ptr), $a0
3127	sbb	8*1($b_ptr), $a1
3128	 mov	$a0, $t0
3129	sbb	8*2($b_ptr), $a2
3130	sbb	8*3($b_ptr), $a3
3131	 mov	$a1, $t1
3132	sbb	$t4, $t4
3133
3134	add	\$-1, $a0
3135	 mov	$a2, $t2
3136	adc	$poly1, $a1
3137	adc	\$0, $a2
3138	 mov	$a3, $t3
3139	adc	$poly3, $a3
3140	test	$t4, $t4
3141
3142	cmovz	$t0, $a0
3143	cmovz	$t1, $a1
3144	mov	$a0, 8*0($r_ptr)
3145	cmovz	$t2, $a2
3146	mov	$a1, 8*1($r_ptr)
3147	cmovz	$t3, $a3
3148	mov	$a2, 8*2($r_ptr)
3149	mov	$a3, 8*3($r_ptr)
3150
3151	ret
3152.cfi_endproc
3153.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3154
3155.type	__ecp_nistz256_subq,\@abi-omnipotent
3156.align	32
3157__ecp_nistz256_subq:
3158.cfi_startproc
3159	sub	$a0, $t0
3160	sbb	$a1, $t1
3161	 mov	$t0, $a0
3162	sbb	$a2, $t2
3163	sbb	$a3, $t3
3164	 mov	$t1, $a1
3165	sbb	$t4, $t4
3166
3167	add	\$-1, $t0
3168	 mov	$t2, $a2
3169	adc	$poly1, $t1
3170	adc	\$0, $t2
3171	 mov	$t3, $a3
3172	adc	$poly3, $t3
3173	test	$t4, $t4
3174
3175	cmovnz	$t0, $a0
3176	cmovnz	$t1, $a1
3177	cmovnz	$t2, $a2
3178	cmovnz	$t3, $a3
3179
3180	ret
3181.cfi_endproc
3182.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
3183
3184.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
3185.align	32
3186__ecp_nistz256_mul_by_2q:
3187.cfi_startproc
3188	xor	$t4, $t4
3189	add	$a0, $a0		# a0:a3+a0:a3
3190	adc	$a1, $a1
3191	 mov	$a0, $t0
3192	adc	$a2, $a2
3193	adc	$a3, $a3
3194	 mov	$a1, $t1
3195	adc	\$0, $t4
3196
3197	sub	\$-1, $a0
3198	 mov	$a2, $t2
3199	sbb	$poly1, $a1
3200	sbb	\$0, $a2
3201	 mov	$a3, $t3
3202	sbb	$poly3, $a3
3203	sbb	\$0, $t4
3204
3205	cmovc	$t0, $a0
3206	cmovc	$t1, $a1
3207	mov	$a0, 8*0($r_ptr)
3208	cmovc	$t2, $a2
3209	mov	$a1, 8*1($r_ptr)
3210	cmovc	$t3, $a3
3211	mov	$a2, 8*2($r_ptr)
3212	mov	$a3, 8*3($r_ptr)
3213
3214	ret
3215.cfi_endproc
3216.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3217___
3218									}
3219sub gen_double () {
3220    my $x = shift;
3221    my ($src0,$sfx,$bias);
3222    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
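
    # For orientation, the quantities computed below are the usual Jacobian
    # doubling formulas (they match the p256_* step comments further down):
    #	S  = (2*Y1)^2 = 4*Y1^2
    #	Z3 = 2*Y1*Z1
    #	M  = 3*(X1 + Z1^2)*(X1 - Z1^2) = 3*(X1^2 - Z1^4)
    #	X3 = M^2 - 2*S*X1
    #	Y3 = M*(S*X1 - X3) - S^2/2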
3223
3224    if ($x ne "x") {
3225	$src0 = "%rax";
3226	$sfx  = "";
3227	$bias = 0;
3228
3229$code.=<<___;
3230.globl	ecp_nistz256_point_double
3231.type	ecp_nistz256_point_double,\@function,2
3232.align	32
3233ecp_nistz256_point_double:
3234.cfi_startproc
3235___
3236$code.=<<___	if ($addx);
3237	mov	\$0x80100, %ecx
3238	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3239	cmp	\$0x80100, %ecx
3240	je	.Lpoint_doublex
3241___
3242    } else {
3243	$src0 = "%rdx";
3244	$sfx  = "x";
3245	$bias = 128;
3246
3247$code.=<<___;
3248.type	ecp_nistz256_point_doublex,\@function,2
3249.align	32
3250ecp_nistz256_point_doublex:
3251.cfi_startproc
3252.Lpoint_doublex:
3253___
3254    }
3255$code.=<<___;
3256	push	%rbp
3257.cfi_push	%rbp
3258	push	%rbx
3259.cfi_push	%rbx
3260	push	%r12
3261.cfi_push	%r12
3262	push	%r13
3263.cfi_push	%r13
3264	push	%r14
3265.cfi_push	%r14
3266	push	%r15
3267.cfi_push	%r15
3268	sub	\$32*5+8, %rsp
3269.cfi_adjust_cfa_offset	32*5+8
3270.Lpoint_double${x}_body:
3271
3272.Lpoint_double_shortcut$x:
3273	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
3274	mov	$a_ptr, $b_ptr			# backup copy
3275	movdqu	0x10($a_ptr), %xmm1
3276	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
3277	 mov	0x20+8*1($a_ptr), $acc5
3278	 mov	0x20+8*2($a_ptr), $acc0
3279	 mov	0x20+8*3($a_ptr), $acc1
3280	 mov	.Lpoly+8*1(%rip), $poly1
3281	 mov	.Lpoly+8*3(%rip), $poly3
3282	movdqa	%xmm0, $in_x(%rsp)
3283	movdqa	%xmm1, $in_x+0x10(%rsp)
3284	lea	0x20($r_ptr), $acc2
3285	lea	0x40($r_ptr), $acc3
3286	movq	$r_ptr, %xmm0
3287	movq	$acc2, %xmm1
3288	movq	$acc3, %xmm2
3289
3290	lea	$S(%rsp), $r_ptr
3291	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
3292
3293	mov	0x40+8*0($a_ptr), $src0
3294	mov	0x40+8*1($a_ptr), $acc6
3295	mov	0x40+8*2($a_ptr), $acc7
3296	mov	0x40+8*3($a_ptr), $acc0
3297	lea	0x40-$bias($a_ptr), $a_ptr
3298	lea	$Zsqr(%rsp), $r_ptr
3299	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
3300
3301	`&load_for_sqr("$S(%rsp)", "$src0")`
3302	lea	$S(%rsp), $r_ptr
3303	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
3304
3305	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
3306	mov	0x40+8*0($b_ptr), $acc1
3307	mov	0x40+8*1($b_ptr), $acc2
3308	mov	0x40+8*2($b_ptr), $acc3
3309	mov	0x40+8*3($b_ptr), $acc4
3310	lea	0x40-$bias($b_ptr), $a_ptr
3311	lea	0x20($b_ptr), $b_ptr
3312	movq	%xmm2, $r_ptr
3313	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
3314	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
3315
3316	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3317	mov	$in_x+8*1(%rsp), $acc5
3318	lea	$Zsqr(%rsp), $b_ptr
3319	mov	$in_x+8*2(%rsp), $acc0
3320	mov	$in_x+8*3(%rsp), $acc1
3321	lea	$M(%rsp), $r_ptr
3322	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
3323
3324	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3325	mov	$in_x+8*1(%rsp), $acc5
3326	lea	$Zsqr(%rsp), $b_ptr
3327	mov	$in_x+8*2(%rsp), $acc0
3328	mov	$in_x+8*3(%rsp), $acc1
3329	lea	$Zsqr(%rsp), $r_ptr
3330	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
3331
3332	`&load_for_sqr("$S(%rsp)", "$src0")`
3333	movq	%xmm1, $r_ptr
3334	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
3335___
3336{
3337######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3338# operate in 4-5-6-7 "name space" that matches squaring output
3339#
3340my ($poly1,$poly3)=($a_ptr,$t1);
3341my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
3342
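# The inlined sequence below computes res_y = res_y/2 mod p: add p, keep the
# sum only when the input was odd (the cmovz chain restores the original
# otherwise), then shift the 257-bit value right by one, pulling the saved
# carry ($t4) into bit 255. A rough C equivalent (illustrative only; the
# helper name is made up, needs <stdint.h>/<string.h> and unsigned __int128):
#
#	static void p256_div_by_2_ref(uint64_t r[4], const uint64_t a[4])
#	{
#		static const uint64_t P[4] = { 0xffffffffffffffffULL,
#					       0x00000000ffffffffULL, 0,
#					       0xffffffff00000001ULL };
#		uint64_t t[4], carry = 0;
#
#		for (int i = 0; i < 4; i++) {
#			unsigned __int128 s =
#				(unsigned __int128)a[i] + P[i] + carry;
#			t[i] = (uint64_t)s;
#			carry = (uint64_t)(s >> 64);
#		}
#		if (!(a[0] & 1)) {	/* even: keep the original value */
#			memcpy(t, a, sizeof(t));
#			carry = 0;
#		}
#		for (int i = 0; i < 3; i++)
#			r[i] = (t[i] >> 1) | (t[i + 1] << 63);
#		r[3] = (t[3] >> 1) | (carry << 63);
#	}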
3343$code.=<<___;
3344	xor	$t4, $t4
3345	mov	$a0, $t0
3346	add	\$-1, $a0
3347	mov	$a1, $t1
3348	adc	$poly1, $a1
3349	mov	$a2, $t2
3350	adc	\$0, $a2
3351	mov	$a3, $t3
3352	adc	$poly3, $a3
3353	adc	\$0, $t4
3354	xor	$a_ptr, $a_ptr		# borrow $a_ptr
3355	test	\$1, $t0
3356
3357	cmovz	$t0, $a0
3358	cmovz	$t1, $a1
3359	cmovz	$t2, $a2
3360	cmovz	$t3, $a3
3361	cmovz	$a_ptr, $t4
3362
3363	mov	$a1, $t0		# a0:a3>>1
3364	shr	\$1, $a0
3365	shl	\$63, $t0
3366	mov	$a2, $t1
3367	shr	\$1, $a1
3368	or	$t0, $a0
3369	shl	\$63, $t1
3370	mov	$a3, $t2
3371	shr	\$1, $a2
3372	or	$t1, $a1
3373	shl	\$63, $t2
3374	mov	$a0, 8*0($r_ptr)
3375	shr	\$1, $a3
3376	mov	$a1, 8*1($r_ptr)
3377	shl	\$63, $t4
3378	or	$t2, $a2
3379	or	$t4, $a3
3380	mov	$a2, 8*2($r_ptr)
3381	mov	$a3, 8*3($r_ptr)
3382___
3383}
3384$code.=<<___;
3385	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3386	lea	$M(%rsp), $r_ptr
3387	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
3388
3389	lea	$tmp0(%rsp), $r_ptr
3390	call	__ecp_nistz256_mul_by_2$x
3391
3392	lea	$M(%rsp), $b_ptr
3393	lea	$M(%rsp), $r_ptr
3394	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
3395
3396	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3397	lea	$S(%rsp), $r_ptr
3398	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
3399
3400	lea	$tmp0(%rsp), $r_ptr
3401	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
3402
3403	`&load_for_sqr("$M(%rsp)", "$src0")`
3404	movq	%xmm0, $r_ptr
3405	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
3406
3407	lea	$tmp0(%rsp), $b_ptr
3408	mov	$acc6, $acc0			# harmonize sqr output and sub input
3409	mov	$acc7, $acc1
3410	mov	$a_ptr, $poly1
3411	mov	$t1, $poly3
3412	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
3413
3414	mov	$S+8*0(%rsp), $t0
3415	mov	$S+8*1(%rsp), $t1
3416	mov	$S+8*2(%rsp), $t2
3417	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
3418	lea	$S(%rsp), $r_ptr
3419	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
3420
3421	mov	$M(%rsp), $src0
3422	lea	$M(%rsp), $b_ptr
3423	mov	$acc4, $acc6			# harmonize sub output and mul input
3424	xor	%ecx, %ecx
3425	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
3426	mov	$acc5, $acc2
3427	mov	$acc5, $S+8*1(%rsp)
3428	cmovz	$acc0, $acc3
3429	mov	$acc0, $S+8*2(%rsp)
3430	lea	$S-$bias(%rsp), $a_ptr
3431	cmovz	$acc1, $acc4
3432	mov	$acc1, $S+8*3(%rsp)
3433	mov	$acc6, $acc1
3434	lea	$S(%rsp), $r_ptr
3435	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
3436
3437	movq	%xmm1, $b_ptr
3438	movq	%xmm1, $r_ptr
3439	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
3440
3441	lea	32*5+56(%rsp), %rsi
3442.cfi_def_cfa	%rsi,8
3443	mov	-48(%rsi),%r15
3444.cfi_restore	%r15
3445	mov	-40(%rsi),%r14
3446.cfi_restore	%r14
3447	mov	-32(%rsi),%r13
3448.cfi_restore	%r13
3449	mov	-24(%rsi),%r12
3450.cfi_restore	%r12
3451	mov	-16(%rsi),%rbx
3452.cfi_restore	%rbx
3453	mov	-8(%rsi),%rbp
3454.cfi_restore	%rbp
3455	lea	(%rsi),%rsp
3456.cfi_def_cfa_register	%rsp
3457.Lpoint_double${x}_epilogue:
3458	ret
3459.cfi_endproc
3460.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3461___
3462}
3463&gen_double("q");
3464
3465sub gen_add () {
3466    my $x = shift;
3467    my ($src0,$sfx,$bias);
3468    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3469	$U1,$U2,$S1,$S2,
3470	$res_x,$res_y,$res_z,
3471	$in1_x,$in1_y,$in1_z,
3472	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3473    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
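
    # For orientation, the sequence generated below is the familiar Jacobian
    # point addition (matching the p256_* step comments further down):
    #	Z1sqr = Z1^2;	Z2sqr = Z2^2
    #	U1 = X1*Z2^2;	U2 = X2*Z1^2;	H = U2 - U1
    #	S1 = Y1*Z2^3;	S2 = Y2*Z1^3;	R = S2 - S1
    #	X3 = R^2 - H^3 - 2*U1*H^2
    #	Y3 = R*(U1*H^2 - X3) - S1*H^3
    #	Z3 = H*Z1*Z2
    # If H and R both turn out to be zero with neither input at infinity,
    # the inputs are the same point and execution falls through to the
    # doubling path via .Ladd_double.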
3474
3475    if ($x ne "x") {
3476	$src0 = "%rax";
3477	$sfx  = "";
3478	$bias = 0;
3479
3480$code.=<<___;
3481.globl	ecp_nistz256_point_add
3482.type	ecp_nistz256_point_add,\@function,3
3483.align	32
3484ecp_nistz256_point_add:
3485.cfi_startproc
3486___
3487$code.=<<___	if ($addx);
3488	mov	\$0x80100, %ecx
3489	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3490	cmp	\$0x80100, %ecx
3491	je	.Lpoint_addx
3492___
3493    } else {
3494	$src0 = "%rdx";
3495	$sfx  = "x";
3496	$bias = 128;
3497
3498$code.=<<___;
3499.type	ecp_nistz256_point_addx,\@function,3
3500.align	32
3501ecp_nistz256_point_addx:
3502.cfi_startproc
3503.Lpoint_addx:
3504___
3505    }
3506$code.=<<___;
3507	push	%rbp
3508.cfi_push	%rbp
3509	push	%rbx
3510.cfi_push	%rbx
3511	push	%r12
3512.cfi_push	%r12
3513	push	%r13
3514.cfi_push	%r13
3515	push	%r14
3516.cfi_push	%r14
3517	push	%r15
3518.cfi_push	%r15
3519	sub	\$32*18+8, %rsp
3520.cfi_adjust_cfa_offset	32*18+8
3521.Lpoint_add${x}_body:
3522
3523	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3524	movdqu	0x10($a_ptr), %xmm1
3525	movdqu	0x20($a_ptr), %xmm2
3526	movdqu	0x30($a_ptr), %xmm3
3527	movdqu	0x40($a_ptr), %xmm4
3528	movdqu	0x50($a_ptr), %xmm5
3529	mov	$a_ptr, $b_ptr			# reassign
3530	mov	$b_org, $a_ptr			# reassign
3531	movdqa	%xmm0, $in1_x(%rsp)
3532	movdqa	%xmm1, $in1_x+0x10(%rsp)
3533	movdqa	%xmm2, $in1_y(%rsp)
3534	movdqa	%xmm3, $in1_y+0x10(%rsp)
3535	movdqa	%xmm4, $in1_z(%rsp)
3536	movdqa	%xmm5, $in1_z+0x10(%rsp)
3537	por	%xmm4, %xmm5
3538
3539	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3540	 pshufd	\$0xb1, %xmm5, %xmm3
3541	movdqu	0x10($a_ptr), %xmm1
3542	movdqu	0x20($a_ptr), %xmm2
3543	 por	%xmm3, %xmm5
3544	movdqu	0x30($a_ptr), %xmm3
3545	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3546	 mov	0x40+8*1($a_ptr), $acc6
3547	 mov	0x40+8*2($a_ptr), $acc7
3548	 mov	0x40+8*3($a_ptr), $acc0
3549	movdqa	%xmm0, $in2_x(%rsp)
3550	 pshufd	\$0x1e, %xmm5, %xmm4
3551	movdqa	%xmm1, $in2_x+0x10(%rsp)
3552	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3553	movdqu	0x50($a_ptr),%xmm1
3554	movdqa	%xmm2, $in2_y(%rsp)
3555	movdqa	%xmm3, $in2_y+0x10(%rsp)
3556	 por	%xmm4, %xmm5
3557	 pxor	%xmm4, %xmm4
3558	por	%xmm0, %xmm1
3559	 movq	$r_ptr, %xmm0			# save $r_ptr
3560
3561	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3562	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3563	 mov	$acc6, $in2_z+8*1(%rsp)
3564	 mov	$acc7, $in2_z+8*2(%rsp)
3565	 mov	$acc0, $in2_z+8*3(%rsp)
3566	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3567	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3568
3569	pcmpeqd	%xmm4, %xmm5
3570	pshufd	\$0xb1, %xmm1, %xmm4
3571	por	%xmm1, %xmm4
3572	pshufd	\$0, %xmm5, %xmm5		# in1infty
3573	pshufd	\$0x1e, %xmm4, %xmm3
3574	por	%xmm3, %xmm4
3575	pxor	%xmm3, %xmm3
3576	pcmpeqd	%xmm3, %xmm4
3577	pshufd	\$0, %xmm4, %xmm4		# in2infty
3578	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3579	 mov	0x40+8*1($b_ptr), $acc6
3580	 mov	0x40+8*2($b_ptr), $acc7
3581	 mov	0x40+8*3($b_ptr), $acc0
3582	movq	$b_ptr, %xmm1
3583
3584	lea	0x40-$bias($b_ptr), $a_ptr
3585	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3586	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3587
3588	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3589	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3590	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3591
3592	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3593	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3594	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3595
3596	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3597	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3598	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3599
3600	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3601	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3602	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3603
3604	lea	$S1(%rsp), $b_ptr
3605	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3606	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3607
3608	or	$acc5, $acc4			# see if result is zero
3609	movdqa	%xmm4, %xmm2
3610	or	$acc0, $acc4
3611	or	$acc1, $acc4
3612	por	%xmm5, %xmm2			# in1infty || in2infty
3613	movq	$acc4, %xmm3
3614
3615	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3616	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3617	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3618
3619	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3620	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3621	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3622
3623	lea	$U1(%rsp), $b_ptr
3624	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3625	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3626
3627	or	$acc5, $acc4			# see if result is zero
3628	or	$acc0, $acc4
3629	or	$acc1, $acc4			# !is_equal(U1, U2)
3630
3631	movq	%xmm2, $acc0			# in1infty | in2infty
3632	movq	%xmm3, $acc1			# !is_equal(S1, S2)
3633
3634	or	$acc0, $acc4
3635	or	$acc1, $acc4
3636
3637	# if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2))
3638	.byte	0x3e				# predict taken
3639	jnz	.Ladd_proceed$x
3640
3641.Ladd_double$x:
3642	movq	%xmm1, $a_ptr			# restore $a_ptr
3643	movq	%xmm0, $r_ptr			# restore $r_ptr
3644	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3645.cfi_adjust_cfa_offset	`-32*(18-5)`
3646	jmp	.Lpoint_double_shortcut$x
3647.cfi_adjust_cfa_offset	`32*(18-5)`
3648
3649.align	32
3650.Ladd_proceed$x:
3651	`&load_for_sqr("$R(%rsp)", "$src0")`
3652	lea	$Rsqr(%rsp), $r_ptr		# R^2
3653	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3654
3655	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3656	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3657	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3658
3659	`&load_for_sqr("$H(%rsp)", "$src0")`
3660	lea	$Hsqr(%rsp), $r_ptr		# H^2
3661	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3662
3663	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3664	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3665	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3666
3667	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3668	lea	$Hcub(%rsp), $r_ptr		# H^3
3669	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3670
3671	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3672	lea	$U2(%rsp), $r_ptr		# U1*H^2
3673	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3674___
3675{
3676#######################################################################
3677# operate in 4-5-0-1 "name space" that matches multiplication output
3678#
3679my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3680my ($poly1, $poly3)=($acc6,$acc7);
3681
3682$code.=<<___;
3683	#lea	$U2(%rsp), $a_ptr
3684	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3685	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3686
3687	xor	$t4, $t4
3688	add	$acc0, $acc0		# a0:a3+a0:a3
3689	lea	$Rsqr(%rsp), $a_ptr
3690	adc	$acc1, $acc1
3691	 mov	$acc0, $t0
3692	adc	$acc2, $acc2
3693	adc	$acc3, $acc3
3694	 mov	$acc1, $t1
3695	adc	\$0, $t4
3696
3697	sub	\$-1, $acc0
3698	 mov	$acc2, $t2
3699	sbb	$poly1, $acc1
3700	sbb	\$0, $acc2
3701	 mov	$acc3, $t3
3702	sbb	$poly3, $acc3
3703	sbb	\$0, $t4
3704
3705	cmovc	$t0, $acc0
3706	mov	8*0($a_ptr), $t0
3707	cmovc	$t1, $acc1
3708	mov	8*1($a_ptr), $t1
3709	cmovc	$t2, $acc2
3710	mov	8*2($a_ptr), $t2
3711	cmovc	$t3, $acc3
3712	mov	8*3($a_ptr), $t3
3713
3714	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3715
3716	lea	$Hcub(%rsp), $b_ptr
3717	lea	$res_x(%rsp), $r_ptr
3718	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3719
3720	mov	$U2+8*0(%rsp), $t0
3721	mov	$U2+8*1(%rsp), $t1
3722	mov	$U2+8*2(%rsp), $t2
3723	mov	$U2+8*3(%rsp), $t3
3724	lea	$res_y(%rsp), $r_ptr
3725
3726	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3727
	mov	$acc0, 8*0($r_ptr)		# save the result, because
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3730	mov	$acc2, 8*2($r_ptr)
3731	mov	$acc3, 8*3($r_ptr)
3732___
3733}
3734$code.=<<___;
3735	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3736	lea	$S2(%rsp), $r_ptr
3737	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3738
3739	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3740	lea	$res_y(%rsp), $r_ptr
3741	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3742
3743	lea	$S2(%rsp), $b_ptr
3744	lea	$res_y(%rsp), $r_ptr
3745	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3746
3747	movq	%xmm0, $r_ptr		# restore $r_ptr
3748
3749	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3750	movdqa	%xmm5, %xmm1
3751	pandn	$res_z(%rsp), %xmm0
3752	movdqa	%xmm5, %xmm2
3753	pandn	$res_z+0x10(%rsp), %xmm1
3754	movdqa	%xmm5, %xmm3
3755	pand	$in2_z(%rsp), %xmm2
3756	pand	$in2_z+0x10(%rsp), %xmm3
3757	por	%xmm0, %xmm2
3758	por	%xmm1, %xmm3
3759
3760	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3761	movdqa	%xmm4, %xmm1
3762	pandn	%xmm2, %xmm0
3763	movdqa	%xmm4, %xmm2
3764	pandn	%xmm3, %xmm1
3765	movdqa	%xmm4, %xmm3
3766	pand	$in1_z(%rsp), %xmm2
3767	pand	$in1_z+0x10(%rsp), %xmm3
3768	por	%xmm0, %xmm2
3769	por	%xmm1, %xmm3
3770	movdqu	%xmm2, 0x40($r_ptr)
3771	movdqu	%xmm3, 0x50($r_ptr)
3772
3773	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3774	movdqa	%xmm5, %xmm1
3775	pandn	$res_x(%rsp), %xmm0
3776	movdqa	%xmm5, %xmm2
3777	pandn	$res_x+0x10(%rsp), %xmm1
3778	movdqa	%xmm5, %xmm3
3779	pand	$in2_x(%rsp), %xmm2
3780	pand	$in2_x+0x10(%rsp), %xmm3
3781	por	%xmm0, %xmm2
3782	por	%xmm1, %xmm3
3783
3784	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3785	movdqa	%xmm4, %xmm1
3786	pandn	%xmm2, %xmm0
3787	movdqa	%xmm4, %xmm2
3788	pandn	%xmm3, %xmm1
3789	movdqa	%xmm4, %xmm3
3790	pand	$in1_x(%rsp), %xmm2
3791	pand	$in1_x+0x10(%rsp), %xmm3
3792	por	%xmm0, %xmm2
3793	por	%xmm1, %xmm3
3794	movdqu	%xmm2, 0x00($r_ptr)
3795	movdqu	%xmm3, 0x10($r_ptr)
3796
3797	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3798	movdqa	%xmm5, %xmm1
3799	pandn	$res_y(%rsp), %xmm0
3800	movdqa	%xmm5, %xmm2
3801	pandn	$res_y+0x10(%rsp), %xmm1
3802	movdqa	%xmm5, %xmm3
3803	pand	$in2_y(%rsp), %xmm2
3804	pand	$in2_y+0x10(%rsp), %xmm3
3805	por	%xmm0, %xmm2
3806	por	%xmm1, %xmm3
3807
3808	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3809	movdqa	%xmm4, %xmm1
3810	pandn	%xmm2, %xmm0
3811	movdqa	%xmm4, %xmm2
3812	pandn	%xmm3, %xmm1
3813	movdqa	%xmm4, %xmm3
3814	pand	$in1_y(%rsp), %xmm2
3815	pand	$in1_y+0x10(%rsp), %xmm3
3816	por	%xmm0, %xmm2
3817	por	%xmm1, %xmm3
3818	movdqu	%xmm2, 0x20($r_ptr)
3819	movdqu	%xmm3, 0x30($r_ptr)
3820
3821.Ladd_done$x:
3822	lea	32*18+56(%rsp), %rsi
3823.cfi_def_cfa	%rsi,8
3824	mov	-48(%rsi),%r15
3825.cfi_restore	%r15
3826	mov	-40(%rsi),%r14
3827.cfi_restore	%r14
3828	mov	-32(%rsi),%r13
3829.cfi_restore	%r13
3830	mov	-24(%rsi),%r12
3831.cfi_restore	%r12
3832	mov	-16(%rsi),%rbx
3833.cfi_restore	%rbx
3834	mov	-8(%rsi),%rbp
3835.cfi_restore	%rbp
3836	lea	(%rsi),%rsp
3837.cfi_def_cfa_register	%rsp
3838.Lpoint_add${x}_epilogue:
3839	ret
3840.cfi_endproc
3841.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3842___
3843}
3844&gen_add("q");
3845
3846sub gen_add_affine () {
3847    my $x = shift;
3848    my ($src0,$sfx,$bias);
3849    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3850	$res_x,$res_y,$res_z,
3851	$in1_x,$in1_y,$in1_z,
3852	$in2_x,$in2_y)=map(32*$_,(0..14));
3853    my $Z1sqr = $S2;
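
    # Same sequence as in gen_add above, specialized for an affine second
    # input (Z2 == 1, so the Z2^2/Z2^3 factors drop out):
    #	Z1sqr = Z1^2;	U2 = X2*Z1^2;	H = U2 - X1
    #	S2 = Y2*Z1^3;	R  = S2 - Y1
    #	X3 = R^2 - H^3 - 2*X1*H^2
    #	Y3 = R*(X1*H^2 - X3) - Y1*H^3
    #	Z3 = H*Z1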
3854
3855    if ($x ne "x") {
3856	$src0 = "%rax";
3857	$sfx  = "";
3858	$bias = 0;
3859
3860$code.=<<___;
3861.globl	ecp_nistz256_point_add_affine
3862.type	ecp_nistz256_point_add_affine,\@function,3
3863.align	32
3864ecp_nistz256_point_add_affine:
3865.cfi_startproc
3866___
3867$code.=<<___	if ($addx);
3868	mov	\$0x80100, %ecx
3869	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3870	cmp	\$0x80100, %ecx
3871	je	.Lpoint_add_affinex
3872___
3873    } else {
3874	$src0 = "%rdx";
3875	$sfx  = "x";
3876	$bias = 128;
3877
3878$code.=<<___;
3879.type	ecp_nistz256_point_add_affinex,\@function,3
3880.align	32
3881ecp_nistz256_point_add_affinex:
3882.cfi_startproc
3883.Lpoint_add_affinex:
3884___
3885    }
3886$code.=<<___;
3887	push	%rbp
3888.cfi_push	%rbp
3889	push	%rbx
3890.cfi_push	%rbx
3891	push	%r12
3892.cfi_push	%r12
3893	push	%r13
3894.cfi_push	%r13
3895	push	%r14
3896.cfi_push	%r14
3897	push	%r15
3898.cfi_push	%r15
3899	sub	\$32*15+8, %rsp
3900.cfi_adjust_cfa_offset	32*15+8
3901.Ladd_affine${x}_body:
3902
3903	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3904	mov	$b_org, $b_ptr		# reassign
3905	movdqu	0x10($a_ptr), %xmm1
3906	movdqu	0x20($a_ptr), %xmm2
3907	movdqu	0x30($a_ptr), %xmm3
3908	movdqu	0x40($a_ptr), %xmm4
3909	movdqu	0x50($a_ptr), %xmm5
3910	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3911	 mov	0x40+8*1($a_ptr), $acc6
3912	 mov	0x40+8*2($a_ptr), $acc7
3913	 mov	0x40+8*3($a_ptr), $acc0
3914	movdqa	%xmm0, $in1_x(%rsp)
3915	movdqa	%xmm1, $in1_x+0x10(%rsp)
3916	movdqa	%xmm2, $in1_y(%rsp)
3917	movdqa	%xmm3, $in1_y+0x10(%rsp)
3918	movdqa	%xmm4, $in1_z(%rsp)
3919	movdqa	%xmm5, $in1_z+0x10(%rsp)
3920	por	%xmm4, %xmm5
3921
3922	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3923	 pshufd	\$0xb1, %xmm5, %xmm3
3924	movdqu	0x10($b_ptr), %xmm1
3925	movdqu	0x20($b_ptr), %xmm2
3926	 por	%xmm3, %xmm5
3927	movdqu	0x30($b_ptr), %xmm3
3928	movdqa	%xmm0, $in2_x(%rsp)
3929	 pshufd	\$0x1e, %xmm5, %xmm4
3930	movdqa	%xmm1, $in2_x+0x10(%rsp)
3931	por	%xmm0, %xmm1
3932	 movq	$r_ptr, %xmm0		# save $r_ptr
3933	movdqa	%xmm2, $in2_y(%rsp)
3934	movdqa	%xmm3, $in2_y+0x10(%rsp)
3935	por	%xmm2, %xmm3
3936	 por	%xmm4, %xmm5
3937	 pxor	%xmm4, %xmm4
3938	por	%xmm1, %xmm3
3939
3940	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3941	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3942	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3943
3944	pcmpeqd	%xmm4, %xmm5
3945	pshufd	\$0xb1, %xmm3, %xmm4
3946	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3947	 #lea	0x00($b_ptr), $b_ptr
3948	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3949	por	%xmm3, %xmm4
3950	pshufd	\$0, %xmm5, %xmm5		# in1infty
3951	pshufd	\$0x1e, %xmm4, %xmm3
3952	 mov	$acc5, $acc2
3953	por	%xmm3, %xmm4
3954	pxor	%xmm3, %xmm3
3955	 mov	$acc6, $acc3
3956	pcmpeqd	%xmm3, %xmm4
3957	pshufd	\$0, %xmm4, %xmm4		# in2infty
3958
3959	lea	$Z1sqr-$bias(%rsp), $a_ptr
3960	mov	$acc7, $acc4
3961	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3962	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3963
3964	lea	$in1_x(%rsp), $b_ptr
3965	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3966	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3967
3968	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3969	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3970	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3971
3972	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3973	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3974	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3975
3976	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3977	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3978	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3979
3980	lea	$in1_y(%rsp), $b_ptr
3981	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3982	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3983
3984	`&load_for_sqr("$H(%rsp)", "$src0")`
3985	lea	$Hsqr(%rsp), $r_ptr		# H^2
3986	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3987
3988	`&load_for_sqr("$R(%rsp)", "$src0")`
3989	lea	$Rsqr(%rsp), $r_ptr		# R^2
3990	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3991
3992	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3993	lea	$Hcub(%rsp), $r_ptr		# H^3
3994	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3995
3996	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3997	lea	$U2(%rsp), $r_ptr		# U1*H^2
3998	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
3999___
4000{
4001#######################################################################
4002# operate in 4-5-0-1 "name space" that matches multiplication output
4003#
4004my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4005my ($poly1, $poly3)=($acc6,$acc7);
4006
4007$code.=<<___;
4008	#lea	$U2(%rsp), $a_ptr
4009	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
4010	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
4011
4012	xor	$t4, $t4
4013	add	$acc0, $acc0		# a0:a3+a0:a3
4014	lea	$Rsqr(%rsp), $a_ptr
4015	adc	$acc1, $acc1
4016	 mov	$acc0, $t0
4017	adc	$acc2, $acc2
4018	adc	$acc3, $acc3
4019	 mov	$acc1, $t1
4020	adc	\$0, $t4
4021
4022	sub	\$-1, $acc0
4023	 mov	$acc2, $t2
4024	sbb	$poly1, $acc1
4025	sbb	\$0, $acc2
4026	 mov	$acc3, $t3
4027	sbb	$poly3, $acc3
4028	sbb	\$0, $t4
4029
4030	cmovc	$t0, $acc0
4031	mov	8*0($a_ptr), $t0
4032	cmovc	$t1, $acc1
4033	mov	8*1($a_ptr), $t1
4034	cmovc	$t2, $acc2
4035	mov	8*2($a_ptr), $t2
4036	cmovc	$t3, $acc3
4037	mov	8*3($a_ptr), $t3
4038
4039	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
4040
4041	lea	$Hcub(%rsp), $b_ptr
4042	lea	$res_x(%rsp), $r_ptr
4043	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
4044
4045	mov	$U2+8*0(%rsp), $t0
4046	mov	$U2+8*1(%rsp), $t1
4047	mov	$U2+8*2(%rsp), $t2
4048	mov	$U2+8*3(%rsp), $t3
4049	lea	$H(%rsp), $r_ptr
4050
4051	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
4052
	mov	$acc0, 8*0($r_ptr)		# save the result, because
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
4055	mov	$acc2, 8*2($r_ptr)
4056	mov	$acc3, 8*3($r_ptr)
4057___
4058}
4059$code.=<<___;
4060	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4061	lea	$S2(%rsp), $r_ptr
4062	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
4063
4064	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4065	lea	$H(%rsp), $r_ptr
4066	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
4067
4068	lea	$S2(%rsp), $b_ptr
4069	lea	$res_y(%rsp), $r_ptr
4070	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
4071
4072	movq	%xmm0, $r_ptr		# restore $r_ptr
4073
4074	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
4075	movdqa	%xmm5, %xmm1
4076	pandn	$res_z(%rsp), %xmm0
4077	movdqa	%xmm5, %xmm2
4078	pandn	$res_z+0x10(%rsp), %xmm1
4079	movdqa	%xmm5, %xmm3
4080	pand	.LONE_mont(%rip), %xmm2
4081	pand	.LONE_mont+0x10(%rip), %xmm3
4082	por	%xmm0, %xmm2
4083	por	%xmm1, %xmm3
4084
4085	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
4086	movdqa	%xmm4, %xmm1
4087	pandn	%xmm2, %xmm0
4088	movdqa	%xmm4, %xmm2
4089	pandn	%xmm3, %xmm1
4090	movdqa	%xmm4, %xmm3
4091	pand	$in1_z(%rsp), %xmm2
4092	pand	$in1_z+0x10(%rsp), %xmm3
4093	por	%xmm0, %xmm2
4094	por	%xmm1, %xmm3
4095	movdqu	%xmm2, 0x40($r_ptr)
4096	movdqu	%xmm3, 0x50($r_ptr)
4097
4098	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
4099	movdqa	%xmm5, %xmm1
4100	pandn	$res_x(%rsp), %xmm0
4101	movdqa	%xmm5, %xmm2
4102	pandn	$res_x+0x10(%rsp), %xmm1
4103	movdqa	%xmm5, %xmm3
4104	pand	$in2_x(%rsp), %xmm2
4105	pand	$in2_x+0x10(%rsp), %xmm3
4106	por	%xmm0, %xmm2
4107	por	%xmm1, %xmm3
4108
4109	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
4110	movdqa	%xmm4, %xmm1
4111	pandn	%xmm2, %xmm0
4112	movdqa	%xmm4, %xmm2
4113	pandn	%xmm3, %xmm1
4114	movdqa	%xmm4, %xmm3
4115	pand	$in1_x(%rsp), %xmm2
4116	pand	$in1_x+0x10(%rsp), %xmm3
4117	por	%xmm0, %xmm2
4118	por	%xmm1, %xmm3
4119	movdqu	%xmm2, 0x00($r_ptr)
4120	movdqu	%xmm3, 0x10($r_ptr)
4121
4122	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
4123	movdqa	%xmm5, %xmm1
4124	pandn	$res_y(%rsp), %xmm0
4125	movdqa	%xmm5, %xmm2
4126	pandn	$res_y+0x10(%rsp), %xmm1
4127	movdqa	%xmm5, %xmm3
4128	pand	$in2_y(%rsp), %xmm2
4129	pand	$in2_y+0x10(%rsp), %xmm3
4130	por	%xmm0, %xmm2
4131	por	%xmm1, %xmm3
4132
4133	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
4134	movdqa	%xmm4, %xmm1
4135	pandn	%xmm2, %xmm0
4136	movdqa	%xmm4, %xmm2
4137	pandn	%xmm3, %xmm1
4138	movdqa	%xmm4, %xmm3
4139	pand	$in1_y(%rsp), %xmm2
4140	pand	$in1_y+0x10(%rsp), %xmm3
4141	por	%xmm0, %xmm2
4142	por	%xmm1, %xmm3
4143	movdqu	%xmm2, 0x20($r_ptr)
4144	movdqu	%xmm3, 0x30($r_ptr)
4145
4146	lea	32*15+56(%rsp), %rsi
4147.cfi_def_cfa	%rsi,8
4148	mov	-48(%rsi),%r15
4149.cfi_restore	%r15
4150	mov	-40(%rsi),%r14
4151.cfi_restore	%r14
4152	mov	-32(%rsi),%r13
4153.cfi_restore	%r13
4154	mov	-24(%rsi),%r12
4155.cfi_restore	%r12
4156	mov	-16(%rsi),%rbx
4157.cfi_restore	%rbx
4158	mov	-8(%rsi),%rbp
4159.cfi_restore	%rbp
4160	lea	(%rsi),%rsp
4161.cfi_def_cfa_register	%rsp
4162.Ladd_affine${x}_epilogue:
4163	ret
4164.cfi_endproc
4165.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4166___
4167}
4168&gen_add_affine("q");
4169
4170########################################################################
4171# AD*X magic
4172#
4173if ($addx) {								{
4174########################################################################
4175# operate in 4-5-0-1 "name space" that matches multiplication output
4176#
4177my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4178
4179$code.=<<___;
4180.type	__ecp_nistz256_add_tox,\@abi-omnipotent
4181.align	32
4182__ecp_nistz256_add_tox:
4183.cfi_startproc
4184	xor	$t4, $t4
4185	adc	8*0($b_ptr), $a0
4186	adc	8*1($b_ptr), $a1
4187	 mov	$a0, $t0
4188	adc	8*2($b_ptr), $a2
4189	adc	8*3($b_ptr), $a3
4190	 mov	$a1, $t1
4191	adc	\$0, $t4
4192
4193	xor	$t3, $t3
4194	sbb	\$-1, $a0
4195	 mov	$a2, $t2
4196	sbb	$poly1, $a1
4197	sbb	\$0, $a2
4198	 mov	$a3, $t3
4199	sbb	$poly3, $a3
4200	sbb	\$0, $t4
4201
4202	cmovc	$t0, $a0
4203	cmovc	$t1, $a1
4204	mov	$a0, 8*0($r_ptr)
4205	cmovc	$t2, $a2
4206	mov	$a1, 8*1($r_ptr)
4207	cmovc	$t3, $a3
4208	mov	$a2, 8*2($r_ptr)
4209	mov	$a3, 8*3($r_ptr)
4210
4211	ret
4212.cfi_endproc
4213.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4214
4215.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
4216.align	32
4217__ecp_nistz256_sub_fromx:
4218.cfi_startproc
4219	xor	$t4, $t4
4220	sbb	8*0($b_ptr), $a0
4221	sbb	8*1($b_ptr), $a1
4222	 mov	$a0, $t0
4223	sbb	8*2($b_ptr), $a2
4224	sbb	8*3($b_ptr), $a3
4225	 mov	$a1, $t1
4226	sbb	\$0, $t4
4227
4228	xor	$t3, $t3
4229	adc	\$-1, $a0
4230	 mov	$a2, $t2
4231	adc	$poly1, $a1
4232	adc	\$0, $a2
4233	 mov	$a3, $t3
4234	adc	$poly3, $a3
4235
4236	bt	\$0, $t4
4237	cmovnc	$t0, $a0
4238	cmovnc	$t1, $a1
4239	mov	$a0, 8*0($r_ptr)
4240	cmovnc	$t2, $a2
4241	mov	$a1, 8*1($r_ptr)
4242	cmovnc	$t3, $a3
4243	mov	$a2, 8*2($r_ptr)
4244	mov	$a3, 8*3($r_ptr)
4245
4246	ret
4247.cfi_endproc
4248.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4249
4250.type	__ecp_nistz256_subx,\@abi-omnipotent
4251.align	32
4252__ecp_nistz256_subx:
4253.cfi_startproc
4254	xor	$t4, $t4
4255	sbb	$a0, $t0
4256	sbb	$a1, $t1
4257	 mov	$t0, $a0
4258	sbb	$a2, $t2
4259	sbb	$a3, $t3
4260	 mov	$t1, $a1
4261	sbb	\$0, $t4
4262
	xor	$a3, $a3
4264	adc	\$-1, $t0
4265	 mov	$t2, $a2
4266	adc	$poly1, $t1
4267	adc	\$0, $t2
4268	 mov	$t3, $a3
4269	adc	$poly3, $t3
4270
4271	bt	\$0, $t4
4272	cmovc	$t0, $a0
4273	cmovc	$t1, $a1
4274	cmovc	$t2, $a2
4275	cmovc	$t3, $a3
4276
4277	ret
4278.cfi_endproc
4279.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
4280
4281.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
4282.align	32
4283__ecp_nistz256_mul_by_2x:
4284.cfi_startproc
4285	xor	$t4, $t4
4286	adc	$a0, $a0		# a0:a3+a0:a3
4287	adc	$a1, $a1
4288	 mov	$a0, $t0
4289	adc	$a2, $a2
4290	adc	$a3, $a3
4291	 mov	$a1, $t1
4292	adc	\$0, $t4
4293
4294	xor	$t3, $t3
4295	sbb	\$-1, $a0
4296	 mov	$a2, $t2
4297	sbb	$poly1, $a1
4298	sbb	\$0, $a2
4299	 mov	$a3, $t3
4300	sbb	$poly3, $a3
4301	sbb	\$0, $t4
4302
4303	cmovc	$t0, $a0
4304	cmovc	$t1, $a1
4305	mov	$a0, 8*0($r_ptr)
4306	cmovc	$t2, $a2
4307	mov	$a1, 8*1($r_ptr)
4308	cmovc	$t3, $a3
4309	mov	$a2, 8*2($r_ptr)
4310	mov	$a3, 8*3($r_ptr)
4311
4312	ret
4313.cfi_endproc
4314.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4315___
4316									}
4317&gen_double("x");
4318&gen_add("x");
4319&gen_add_affine("x");
4320}
4321}}}
4322
4323# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4324#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4325if ($win64) {
4326$rec="%rcx";
4327$frame="%rdx";
4328$context="%r8";
4329$disp="%r9";
4330
4331$code.=<<___;
4332.extern	__imp_RtlVirtualUnwind
4333
4334.type	short_handler,\@abi-omnipotent
4335.align	16
4336short_handler:
4337	push	%rsi
4338	push	%rdi
4339	push	%rbx
4340	push	%rbp
4341	push	%r12
4342	push	%r13
4343	push	%r14
4344	push	%r15
4345	pushfq
4346	sub	\$64,%rsp
4347
4348	mov	120($context),%rax	# pull context->Rax
4349	mov	248($context),%rbx	# pull context->Rip
4350
4351	mov	8($disp),%rsi		# disp->ImageBase
4352	mov	56($disp),%r11		# disp->HandlerData
4353
4354	mov	0(%r11),%r10d		# HandlerData[0]
4355	lea	(%rsi,%r10),%r10	# end of prologue label
4356	cmp	%r10,%rbx		# context->Rip<end of prologue label
4357	jb	.Lcommon_seh_tail
4358
4359	mov	152($context),%rax	# pull context->Rsp
4360
4361	mov	4(%r11),%r10d		# HandlerData[1]
4362	lea	(%rsi,%r10),%r10	# epilogue label
4363	cmp	%r10,%rbx		# context->Rip>=epilogue label
4364	jae	.Lcommon_seh_tail
4365
4366	lea	16(%rax),%rax
4367
4368	mov	-8(%rax),%r12
4369	mov	-16(%rax),%r13
4370	mov	%r12,216($context)	# restore context->R12
4371	mov	%r13,224($context)	# restore context->R13
4372
4373	jmp	.Lcommon_seh_tail
4374.size	short_handler,.-short_handler
4375
4376.type	full_handler,\@abi-omnipotent
4377.align	16
4378full_handler:
4379	push	%rsi
4380	push	%rdi
4381	push	%rbx
4382	push	%rbp
4383	push	%r12
4384	push	%r13
4385	push	%r14
4386	push	%r15
4387	pushfq
4388	sub	\$64,%rsp
4389
4390	mov	120($context),%rax	# pull context->Rax
4391	mov	248($context),%rbx	# pull context->Rip
4392
4393	mov	8($disp),%rsi		# disp->ImageBase
4394	mov	56($disp),%r11		# disp->HandlerData
4395
4396	mov	0(%r11),%r10d		# HandlerData[0]
4397	lea	(%rsi,%r10),%r10	# end of prologue label
4398	cmp	%r10,%rbx		# context->Rip<end of prologue label
4399	jb	.Lcommon_seh_tail
4400
4401	mov	152($context),%rax	# pull context->Rsp
4402
4403	mov	4(%r11),%r10d		# HandlerData[1]
4404	lea	(%rsi,%r10),%r10	# epilogue label
4405	cmp	%r10,%rbx		# context->Rip>=epilogue label
4406	jae	.Lcommon_seh_tail
4407
4408	mov	8(%r11),%r10d		# HandlerData[2]
4409	lea	(%rax,%r10),%rax
4410
4411	mov	-8(%rax),%rbp
4412	mov	-16(%rax),%rbx
4413	mov	-24(%rax),%r12
4414	mov	-32(%rax),%r13
4415	mov	-40(%rax),%r14
4416	mov	-48(%rax),%r15
4417	mov	%rbx,144($context)	# restore context->Rbx
4418	mov	%rbp,160($context)	# restore context->Rbp
4419	mov	%r12,216($context)	# restore context->R12
4420	mov	%r13,224($context)	# restore context->R13
4421	mov	%r14,232($context)	# restore context->R14
4422	mov	%r15,240($context)	# restore context->R15
4423
4424.Lcommon_seh_tail:
4425	mov	8(%rax),%rdi
4426	mov	16(%rax),%rsi
4427	mov	%rax,152($context)	# restore context->Rsp
4428	mov	%rsi,168($context)	# restore context->Rsi
4429	mov	%rdi,176($context)	# restore context->Rdi
4430
4431	mov	40($disp),%rdi		# disp->ContextRecord
4432	mov	$context,%rsi		# context
4433	mov	\$154,%ecx		# sizeof(CONTEXT)
4434	.long	0xa548f3fc		# cld; rep movsq
4435
4436	mov	$disp,%rsi
4437	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4438	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4439	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4440	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4441	mov	40(%rsi),%r10		# disp->ContextRecord
4442	lea	56(%rsi),%r11		# &disp->HandlerData
4443	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4444	mov	%r10,32(%rsp)		# arg5
4445	mov	%r11,40(%rsp)		# arg6
4446	mov	%r12,48(%rsp)		# arg7
4447	mov	%rcx,56(%rsp)		# arg8, (NULL)
4448	call	*__imp_RtlVirtualUnwind(%rip)
4449
4450	mov	\$1,%eax		# ExceptionContinueSearch
4451	add	\$64,%rsp
4452	popfq
4453	pop	%r15
4454	pop	%r14
4455	pop	%r13
4456	pop	%r12
4457	pop	%rbp
4458	pop	%rbx
4459	pop	%rdi
4460	pop	%rsi
4461	ret
4462.size	full_handler,.-full_handler
4463
4464.section	.pdata
4465.align	4
	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
	.rva	.LSEH_end_ecp_nistz256_mul_by_2
	.rva	.LSEH_info_ecp_nistz256_mul_by_2

	.rva	.LSEH_begin_ecp_nistz256_div_by_2
	.rva	.LSEH_end_ecp_nistz256_div_by_2
	.rva	.LSEH_info_ecp_nistz256_div_by_2

	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
	.rva	.LSEH_end_ecp_nistz256_mul_by_3
	.rva	.LSEH_info_ecp_nistz256_mul_by_3

	.rva	.LSEH_begin_ecp_nistz256_add
	.rva	.LSEH_end_ecp_nistz256_add
	.rva	.LSEH_info_ecp_nistz256_add

	.rva	.LSEH_begin_ecp_nistz256_sub
	.rva	.LSEH_end_ecp_nistz256_sub
	.rva	.LSEH_info_ecp_nistz256_sub

	.rva	.LSEH_begin_ecp_nistz256_neg
	.rva	.LSEH_end_ecp_nistz256_neg
	.rva	.LSEH_info_ecp_nistz256_neg

	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
___
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_to_mont
	.rva	.LSEH_end_ecp_nistz256_to_mont
	.rva	.LSEH_info_ecp_nistz256_to_mont

	.rva	.LSEH_begin_ecp_nistz256_mul_mont
	.rva	.LSEH_end_ecp_nistz256_mul_mont
	.rva	.LSEH_info_ecp_nistz256_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_sqr_mont

	.rva	.LSEH_begin_ecp_nistz256_from_mont
	.rva	.LSEH_end_ecp_nistz256_from_mont
	.rva	.LSEH_info_ecp_nistz256_from_mont

	.rva	.LSEH_begin_ecp_nistz256_gather_w5
	.rva	.LSEH_end_ecp_nistz256_gather_w5
	.rva	.LSEH_info_ecp_nistz256_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_gather_w7
	.rva	.LSEH_end_ecp_nistz256_gather_w7
	.rva	.LSEH_info_ecp_nistz256_gather_wX
___
$code.=<<___	if ($avx>1);
	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_point_double
	.rva	.LSEH_end_ecp_nistz256_point_double
	.rva	.LSEH_info_ecp_nistz256_point_double

	.rva	.LSEH_begin_ecp_nistz256_point_add
	.rva	.LSEH_end_ecp_nistz256_point_add
	.rva	.LSEH_info_ecp_nistz256_point_add

	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
	.rva	.LSEH_end_ecp_nistz256_point_add_affine
	.rva	.LSEH_info_ecp_nistz256_point_add_affine
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_point_doublex
	.rva	.LSEH_end_ecp_nistz256_point_doublex
	.rva	.LSEH_info_ecp_nistz256_point_doublex

	.rva	.LSEH_begin_ecp_nistz256_point_addx
	.rva	.LSEH_end_ecp_nistz256_point_addx
	.rva	.LSEH_info_ecp_nistz256_point_addx

	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
___
$code.=<<___;

.section	.xdata
.align	8
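# Unwind descriptors referenced from .pdata: the ".byte 9,0,0,0" header
# marks UNWIND_INFO version 1 with a language-specific handler; the .rva
# names the handler routine and HandlerData[] supplies the body/epilogue
# labels, plus the frame size for full_handler entries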
.LSEH_info_ecp_nistz256_mul_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_div_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_mul_by_3:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_neg:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_ord_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___ if ($addx);
.LSEH_info_ecp_nistz256_ord_mul_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___;
.LSEH_info_ecp_nistz256_to_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_from_mont:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_gather_wX:
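	# no handler routine here: the raw unwind codes below restore
	# xmm6-xmm15 and undo the 0xa8-byte stack allocation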
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
	.align	8
___
$code.=<<___	if ($avx>1);
.LSEH_info_ecp_nistz256_avx2_gather_wX:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
	.align	8
___
$code.=<<___;
.LSEH_info_ecp_nistz256_point_double:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_add:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affine:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
	.long	32*15+56,0
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_ecp_nistz256_point_doublex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_addx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affinex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
	.long	32*15+56,0
___
}

########################################################################
# Convert ecp_nistz256_table.c to the layout expected by ecp_nistz256_gather_w7
#
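# Each TOBN(hi32,lo32) constant in the C table is one 64-bit limb; it is
# split into two 32-bit words and emitted low word first, e.g. (hypothetical
# values) TOBN(0x11111111,0x22222222) becomes ".long 0x22222222,0x11111111".
#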
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

die "unexpected number of table elements: ".scalar(@arr) if ($#arr != 64*16*37-1);

print <<___;
.text
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,\@object
.align	4096
ecp_nistz256_precomputed:
___
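# emit the table as .long data, 16 32-bit words (64 bytes) per line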
while (@line=splice(@arr,0,16)) {
	print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
}
print <<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";