1#! /usr/bin/env perl
2# Copyright 2014-2019 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
37# Ranges denote minimum and maximum improvement coefficients depending
38# on benchmark. In "this/original" column lower coefficient is for
39# ECDSA sign, while in "with/without" - for ECDH key agreement, and
40# higher - for ECDSA sign, relatively fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
57if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59	$avx = ($1>=2.19) + ($1>=2.22);
60	$addx = ($1>=2.23);
61}
62
63if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65	$avx = ($1>=2.09) + ($1>=2.10);
66	$addx = ($1>=2.10);
67}
68
69if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=11);
72	$addx = ($1>=12);
73}
74
75if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
76	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
77	$avx = ($ver>=3.0) + ($ver>=3.01);
78	$addx = ($ver>=3.03);
79}
80
81$code.=<<___;
82.text
83.extern	OPENSSL_ia32cap_P
84
85# The polynomial
86.align 64
87.Lpoly:
88.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
89
90# 2^512 mod P precomputed for NIST P256 polynomial
91.LRR:
92.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
93
94.LOne:
95.long 1,1,1,1,1,1,1,1
96.LTwo:
97.long 2,2,2,2,2,2,2,2
98.LThree:
99.long 3,3,3,3,3,3,3,3
100.LONE_mont:
101.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
102
103# Constants for computations modulo ord(p256)
104.Lord:
105.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
106.LordK:
107.quad 0xccd1c8aaee00bc4f
108___
109
110{
111################################################################################
112# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
113
114my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
115my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
116my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
117
118$code.=<<___;
119
120.globl	ecp_nistz256_mul_by_2
121.type	ecp_nistz256_mul_by_2,\@function,2
122.align	64
123ecp_nistz256_mul_by_2:
124.cfi_startproc
125	push	%r12
126.cfi_push	%r12
127	push	%r13
128.cfi_push	%r13
129.Lmul_by_2_body:
130
131	mov	8*0($a_ptr), $a0
132	xor	$t4,$t4
133	mov	8*1($a_ptr), $a1
134	add	$a0, $a0		# a0:a3+a0:a3
135	mov	8*2($a_ptr), $a2
136	adc	$a1, $a1
137	mov	8*3($a_ptr), $a3
138	lea	.Lpoly(%rip), $a_ptr
139	 mov	$a0, $t0
140	adc	$a2, $a2
141	adc	$a3, $a3
142	 mov	$a1, $t1
143	adc	\$0, $t4
144
145	sub	8*0($a_ptr), $a0
146	 mov	$a2, $t2
147	sbb	8*1($a_ptr), $a1
148	sbb	8*2($a_ptr), $a2
149	 mov	$a3, $t3
150	sbb	8*3($a_ptr), $a3
151	sbb	\$0, $t4
152
153	cmovc	$t0, $a0
154	cmovc	$t1, $a1
155	mov	$a0, 8*0($r_ptr)
156	cmovc	$t2, $a2
157	mov	$a1, 8*1($r_ptr)
158	cmovc	$t3, $a3
159	mov	$a2, 8*2($r_ptr)
160	mov	$a3, 8*3($r_ptr)
161
162	mov	0(%rsp),%r13
163.cfi_restore	%r13
164	mov	8(%rsp),%r12
165.cfi_restore	%r12
166	lea	16(%rsp),%rsp
167.cfi_adjust_cfa_offset	-16
168.Lmul_by_2_epilogue:
169	ret
170.cfi_endproc
171.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
172
173################################################################################
174# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
175.globl	ecp_nistz256_div_by_2
176.type	ecp_nistz256_div_by_2,\@function,2
177.align	32
178ecp_nistz256_div_by_2:
179.cfi_startproc
180	push	%r12
181.cfi_push	%r12
182	push	%r13
183.cfi_push	%r13
184.Ldiv_by_2_body:
185
186	mov	8*0($a_ptr), $a0
187	mov	8*1($a_ptr), $a1
188	mov	8*2($a_ptr), $a2
189	 mov	$a0, $t0
190	mov	8*3($a_ptr), $a3
191	lea	.Lpoly(%rip), $a_ptr
192
193	 mov	$a1, $t1
194	xor	$t4, $t4
195	add	8*0($a_ptr), $a0
196	 mov	$a2, $t2
197	adc	8*1($a_ptr), $a1
198	adc	8*2($a_ptr), $a2
199	 mov	$a3, $t3
200	adc	8*3($a_ptr), $a3
201	adc	\$0, $t4
202	xor	$a_ptr, $a_ptr		# borrow $a_ptr
203	test	\$1, $t0
204
205	cmovz	$t0, $a0
206	cmovz	$t1, $a1
207	cmovz	$t2, $a2
208	cmovz	$t3, $a3
209	cmovz	$a_ptr, $t4
210
211	mov	$a1, $t0		# a0:a3>>1
212	shr	\$1, $a0
213	shl	\$63, $t0
214	mov	$a2, $t1
215	shr	\$1, $a1
216	or	$t0, $a0
217	shl	\$63, $t1
218	mov	$a3, $t2
219	shr	\$1, $a2
220	or	$t1, $a1
221	shl	\$63, $t2
222	shr	\$1, $a3
223	shl	\$63, $t4
224	or	$t2, $a2
225	or	$t4, $a3
226
227	mov	$a0, 8*0($r_ptr)
228	mov	$a1, 8*1($r_ptr)
229	mov	$a2, 8*2($r_ptr)
230	mov	$a3, 8*3($r_ptr)
231
232	mov	0(%rsp),%r13
233.cfi_restore	%r13
234	mov	8(%rsp),%r12
235.cfi_restore	%r12
236	lea	16(%rsp),%rsp
237.cfi_adjust_cfa_offset	-16
238.Ldiv_by_2_epilogue:
239	ret
240.cfi_endproc
241.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
242
243################################################################################
244# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
245.globl	ecp_nistz256_mul_by_3
246.type	ecp_nistz256_mul_by_3,\@function,2
247.align	32
248ecp_nistz256_mul_by_3:
249.cfi_startproc
250	push	%r12
251.cfi_push	%r12
252	push	%r13
253.cfi_push	%r13
254.Lmul_by_3_body:
255
256	mov	8*0($a_ptr), $a0
257	xor	$t4, $t4
258	mov	8*1($a_ptr), $a1
259	add	$a0, $a0		# a0:a3+a0:a3
260	mov	8*2($a_ptr), $a2
261	adc	$a1, $a1
262	mov	8*3($a_ptr), $a3
263	 mov	$a0, $t0
264	adc	$a2, $a2
265	adc	$a3, $a3
266	 mov	$a1, $t1
267	adc	\$0, $t4
268
269	sub	\$-1, $a0
270	 mov	$a2, $t2
271	sbb	.Lpoly+8*1(%rip), $a1
272	sbb	\$0, $a2
273	 mov	$a3, $t3
274	sbb	.Lpoly+8*3(%rip), $a3
275	sbb	\$0, $t4
276
277	cmovc	$t0, $a0
278	cmovc	$t1, $a1
279	cmovc	$t2, $a2
280	cmovc	$t3, $a3
281
282	xor	$t4, $t4
283	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
284	adc	8*1($a_ptr), $a1
285	 mov	$a0, $t0
286	adc	8*2($a_ptr), $a2
287	adc	8*3($a_ptr), $a3
288	 mov	$a1, $t1
289	adc	\$0, $t4
290
291	sub	\$-1, $a0
292	 mov	$a2, $t2
293	sbb	.Lpoly+8*1(%rip), $a1
294	sbb	\$0, $a2
295	 mov	$a3, $t3
296	sbb	.Lpoly+8*3(%rip), $a3
297	sbb	\$0, $t4
298
299	cmovc	$t0, $a0
300	cmovc	$t1, $a1
301	mov	$a0, 8*0($r_ptr)
302	cmovc	$t2, $a2
303	mov	$a1, 8*1($r_ptr)
304	cmovc	$t3, $a3
305	mov	$a2, 8*2($r_ptr)
306	mov	$a3, 8*3($r_ptr)
307
308	mov	0(%rsp),%r13
309.cfi_restore	%r13
310	mov	8(%rsp),%r12
311.cfi_restore	%r12
312	lea	16(%rsp),%rsp
313.cfi_adjust_cfa_offset	-16
314.Lmul_by_3_epilogue:
315	ret
316.cfi_endproc
317.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
318
319################################################################################
320# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
321.globl	ecp_nistz256_add
322.type	ecp_nistz256_add,\@function,3
323.align	32
324ecp_nistz256_add:
325.cfi_startproc
326	push	%r12
327.cfi_push	%r12
328	push	%r13
329.cfi_push	%r13
330.Ladd_body:
331
332	mov	8*0($a_ptr), $a0
333	xor	$t4, $t4
334	mov	8*1($a_ptr), $a1
335	mov	8*2($a_ptr), $a2
336	mov	8*3($a_ptr), $a3
337	lea	.Lpoly(%rip), $a_ptr
338
339	add	8*0($b_ptr), $a0
340	adc	8*1($b_ptr), $a1
341	 mov	$a0, $t0
342	adc	8*2($b_ptr), $a2
343	adc	8*3($b_ptr), $a3
344	 mov	$a1, $t1
345	adc	\$0, $t4
346
347	sub	8*0($a_ptr), $a0
348	 mov	$a2, $t2
349	sbb	8*1($a_ptr), $a1
350	sbb	8*2($a_ptr), $a2
351	 mov	$a3, $t3
352	sbb	8*3($a_ptr), $a3
353	sbb	\$0, $t4
354
355	cmovc	$t0, $a0
356	cmovc	$t1, $a1
357	mov	$a0, 8*0($r_ptr)
358	cmovc	$t2, $a2
359	mov	$a1, 8*1($r_ptr)
360	cmovc	$t3, $a3
361	mov	$a2, 8*2($r_ptr)
362	mov	$a3, 8*3($r_ptr)
363
364	mov	0(%rsp),%r13
365.cfi_restore	%r13
366	mov	8(%rsp),%r12
367.cfi_restore	%r12
368	lea	16(%rsp),%rsp
369.cfi_adjust_cfa_offset	-16
370.Ladd_epilogue:
371	ret
372.cfi_endproc
373.size	ecp_nistz256_add,.-ecp_nistz256_add
374
375################################################################################
376# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
377.globl	ecp_nistz256_sub
378.type	ecp_nistz256_sub,\@function,3
379.align	32
380ecp_nistz256_sub:
381.cfi_startproc
382	push	%r12
383.cfi_push	%r12
384	push	%r13
385.cfi_push	%r13
386.Lsub_body:
387
388	mov	8*0($a_ptr), $a0
389	xor	$t4, $t4
390	mov	8*1($a_ptr), $a1
391	mov	8*2($a_ptr), $a2
392	mov	8*3($a_ptr), $a3
393	lea	.Lpoly(%rip), $a_ptr
394
395	sub	8*0($b_ptr), $a0
396	sbb	8*1($b_ptr), $a1
397	 mov	$a0, $t0
398	sbb	8*2($b_ptr), $a2
399	sbb	8*3($b_ptr), $a3
400	 mov	$a1, $t1
401	sbb	\$0, $t4
402
403	add	8*0($a_ptr), $a0
404	 mov	$a2, $t2
405	adc	8*1($a_ptr), $a1
406	adc	8*2($a_ptr), $a2
407	 mov	$a3, $t3
408	adc	8*3($a_ptr), $a3
409	test	$t4, $t4
410
411	cmovz	$t0, $a0
412	cmovz	$t1, $a1
413	mov	$a0, 8*0($r_ptr)
414	cmovz	$t2, $a2
415	mov	$a1, 8*1($r_ptr)
416	cmovz	$t3, $a3
417	mov	$a2, 8*2($r_ptr)
418	mov	$a3, 8*3($r_ptr)
419
420	mov	0(%rsp),%r13
421.cfi_restore	%r13
422	mov	8(%rsp),%r12
423.cfi_restore	%r12
424	lea	16(%rsp),%rsp
425.cfi_adjust_cfa_offset	-16
426.Lsub_epilogue:
427	ret
428.cfi_endproc
429.size	ecp_nistz256_sub,.-ecp_nistz256_sub
430
431################################################################################
432# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
433.globl	ecp_nistz256_neg
434.type	ecp_nistz256_neg,\@function,2
435.align	32
436ecp_nistz256_neg:
437.cfi_startproc
438	push	%r12
439.cfi_push	%r12
440	push	%r13
441.cfi_push	%r13
442.Lneg_body:
443
444	xor	$a0, $a0
445	xor	$a1, $a1
446	xor	$a2, $a2
447	xor	$a3, $a3
448	xor	$t4, $t4
449
450	sub	8*0($a_ptr), $a0
451	sbb	8*1($a_ptr), $a1
452	sbb	8*2($a_ptr), $a2
453	 mov	$a0, $t0
454	sbb	8*3($a_ptr), $a3
455	lea	.Lpoly(%rip), $a_ptr
456	 mov	$a1, $t1
457	sbb	\$0, $t4
458
459	add	8*0($a_ptr), $a0
460	 mov	$a2, $t2
461	adc	8*1($a_ptr), $a1
462	adc	8*2($a_ptr), $a2
463	 mov	$a3, $t3
464	adc	8*3($a_ptr), $a3
465	test	$t4, $t4
466
467	cmovz	$t0, $a0
468	cmovz	$t1, $a1
469	mov	$a0, 8*0($r_ptr)
470	cmovz	$t2, $a2
471	mov	$a1, 8*1($r_ptr)
472	cmovz	$t3, $a3
473	mov	$a2, 8*2($r_ptr)
474	mov	$a3, 8*3($r_ptr)
475
476	mov	0(%rsp),%r13
477.cfi_restore	%r13
478	mov	8(%rsp),%r12
479.cfi_restore	%r12
480	lea	16(%rsp),%rsp
481.cfi_adjust_cfa_offset	-16
482.Lneg_epilogue:
483	ret
484.cfi_endproc
485.size	ecp_nistz256_neg,.-ecp_nistz256_neg
486___
487}
488{
489my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
490my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
491my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
492my ($poly1,$poly3)=($acc6,$acc7);
493
494$code.=<<___;
495################################################################################
496# void ecp_nistz256_ord_mul_mont(
497#   uint64_t res[4],
498#   uint64_t a[4],
499#   uint64_t b[4]);
500
501.globl	ecp_nistz256_ord_mul_mont
502.type	ecp_nistz256_ord_mul_mont,\@function,3
503.align	32
504ecp_nistz256_ord_mul_mont:
505.cfi_startproc
506___
507$code.=<<___	if ($addx);
508	mov	\$0x80100, %ecx
509	and	OPENSSL_ia32cap_P+8(%rip), %ecx
510	cmp	\$0x80100, %ecx
511	je	.Lecp_nistz256_ord_mul_montx
512___
513$code.=<<___;
514	push	%rbp
515.cfi_push	%rbp
516	push	%rbx
517.cfi_push	%rbx
518	push	%r12
519.cfi_push	%r12
520	push	%r13
521.cfi_push	%r13
522	push	%r14
523.cfi_push	%r14
524	push	%r15
525.cfi_push	%r15
526.Lord_mul_body:
527
528	mov	8*0($b_org), %rax
529	mov	$b_org, $b_ptr
530	lea	.Lord(%rip), %r14
531	mov	.LordK(%rip), %r15
532
533	################################# * b[0]
534	mov	%rax, $t0
535	mulq	8*0($a_ptr)
536	mov	%rax, $acc0
537	mov	$t0, %rax
538	mov	%rdx, $acc1
539
540	mulq	8*1($a_ptr)
541	add	%rax, $acc1
542	mov	$t0, %rax
543	adc	\$0, %rdx
544	mov	%rdx, $acc2
545
546	mulq	8*2($a_ptr)
547	add	%rax, $acc2
548	mov	$t0, %rax
549	adc	\$0, %rdx
550
551	 mov	$acc0, $acc5
552	 imulq	%r15,$acc0
553
554	mov	%rdx, $acc3
555	mulq	8*3($a_ptr)
556	add	%rax, $acc3
557	 mov	$acc0, %rax
558	adc	\$0, %rdx
559	mov	%rdx, $acc4
560
561	################################# First reduction step
562	mulq	8*0(%r14)
563	mov	$acc0, $t1
564	add	%rax, $acc5		# guaranteed to be zero
565	mov	$acc0, %rax
566	adc	\$0, %rdx
567	mov	%rdx, $t0
568
569	sub	$acc0, $acc2
570	sbb	\$0, $acc0		# can't borrow
571
572	mulq	8*1(%r14)
573	add	$t0, $acc1
574	adc	\$0, %rdx
575	add	%rax, $acc1
576	mov	$t1, %rax
577	adc	%rdx, $acc2
578	mov	$t1, %rdx
579	adc	\$0, $acc0		# can't overflow
580
581	shl	\$32, %rax
582	shr	\$32, %rdx
583	sub	%rax, $acc3
584	 mov	8*1($b_ptr), %rax
585	sbb	%rdx, $t1		# can't borrow
586
587	add	$acc0, $acc3
588	adc	$t1, $acc4
589	adc	\$0, $acc5
590
591	################################# * b[1]
592	mov	%rax, $t0
593	mulq	8*0($a_ptr)
594	add	%rax, $acc1
595	mov	$t0, %rax
596	adc	\$0, %rdx
597	mov	%rdx, $t1
598
599	mulq	8*1($a_ptr)
600	add	$t1, $acc2
601	adc	\$0, %rdx
602	add	%rax, $acc2
603	mov	$t0, %rax
604	adc	\$0, %rdx
605	mov	%rdx, $t1
606
607	mulq	8*2($a_ptr)
608	add	$t1, $acc3
609	adc	\$0, %rdx
610	add	%rax, $acc3
611	mov	$t0, %rax
612	adc	\$0, %rdx
613
614	 mov	$acc1, $t0
615	 imulq	%r15, $acc1
616
617	mov	%rdx, $t1
618	mulq	8*3($a_ptr)
619	add	$t1, $acc4
620	adc	\$0, %rdx
621	xor	$acc0, $acc0
622	add	%rax, $acc4
623	 mov	$acc1, %rax
624	adc	%rdx, $acc5
625	adc	\$0, $acc0
626
627	################################# Second reduction step
628	mulq	8*0(%r14)
629	mov	$acc1, $t1
630	add	%rax, $t0		# guaranteed to be zero
631	mov	$acc1, %rax
632	adc	%rdx, $t0
633
634	sub	$acc1, $acc3
635	sbb	\$0, $acc1		# can't borrow
636
637	mulq	8*1(%r14)
638	add	$t0, $acc2
639	adc	\$0, %rdx
640	add	%rax, $acc2
641	mov	$t1, %rax
642	adc	%rdx, $acc3
643	mov	$t1, %rdx
644	adc	\$0, $acc1		# can't overflow
645
646	shl	\$32, %rax
647	shr	\$32, %rdx
648	sub	%rax, $acc4
649	 mov	8*2($b_ptr), %rax
650	sbb	%rdx, $t1		# can't borrow
651
652	add	$acc1, $acc4
653	adc	$t1, $acc5
654	adc	\$0, $acc0
655
656	################################## * b[2]
657	mov	%rax, $t0
658	mulq	8*0($a_ptr)
659	add	%rax, $acc2
660	mov	$t0, %rax
661	adc	\$0, %rdx
662	mov	%rdx, $t1
663
664	mulq	8*1($a_ptr)
665	add	$t1, $acc3
666	adc	\$0, %rdx
667	add	%rax, $acc3
668	mov	$t0, %rax
669	adc	\$0, %rdx
670	mov	%rdx, $t1
671
672	mulq	8*2($a_ptr)
673	add	$t1, $acc4
674	adc	\$0, %rdx
675	add	%rax, $acc4
676	mov	$t0, %rax
677	adc	\$0, %rdx
678
679	 mov	$acc2, $t0
680	 imulq	%r15, $acc2
681
682	mov	%rdx, $t1
683	mulq	8*3($a_ptr)
684	add	$t1, $acc5
685	adc	\$0, %rdx
686	xor	$acc1, $acc1
687	add	%rax, $acc5
688	 mov	$acc2, %rax
689	adc	%rdx, $acc0
690	adc	\$0, $acc1
691
692	################################# Third reduction step
693	mulq	8*0(%r14)
694	mov	$acc2, $t1
695	add	%rax, $t0		# guaranteed to be zero
696	mov	$acc2, %rax
697	adc	%rdx, $t0
698
699	sub	$acc2, $acc4
700	sbb	\$0, $acc2		# can't borrow
701
702	mulq	8*1(%r14)
703	add	$t0, $acc3
704	adc	\$0, %rdx
705	add	%rax, $acc3
706	mov	$t1, %rax
707	adc	%rdx, $acc4
708	mov	$t1, %rdx
709	adc	\$0, $acc2		# can't overflow
710
711	shl	\$32, %rax
712	shr	\$32, %rdx
713	sub	%rax, $acc5
714	 mov	8*3($b_ptr), %rax
715	sbb	%rdx, $t1		# can't borrow
716
717	add	$acc2, $acc5
718	adc	$t1, $acc0
719	adc	\$0, $acc1
720
721	################################# * b[3]
722	mov	%rax, $t0
723	mulq	8*0($a_ptr)
724	add	%rax, $acc3
725	mov	$t0, %rax
726	adc	\$0, %rdx
727	mov	%rdx, $t1
728
729	mulq	8*1($a_ptr)
730	add	$t1, $acc4
731	adc	\$0, %rdx
732	add	%rax, $acc4
733	mov	$t0, %rax
734	adc	\$0, %rdx
735	mov	%rdx, $t1
736
737	mulq	8*2($a_ptr)
738	add	$t1, $acc5
739	adc	\$0, %rdx
740	add	%rax, $acc5
741	mov	$t0, %rax
742	adc	\$0, %rdx
743
744	 mov	$acc3, $t0
745	 imulq	%r15, $acc3
746
747	mov	%rdx, $t1
748	mulq	8*3($a_ptr)
749	add	$t1, $acc0
750	adc	\$0, %rdx
751	xor	$acc2, $acc2
752	add	%rax, $acc0
753	 mov	$acc3, %rax
754	adc	%rdx, $acc1
755	adc	\$0, $acc2
756
757	################################# Last reduction step
758	mulq	8*0(%r14)
759	mov	$acc3, $t1
760	add	%rax, $t0		# guaranteed to be zero
761	mov	$acc3, %rax
762	adc	%rdx, $t0
763
764	sub	$acc3, $acc5
765	sbb	\$0, $acc3		# can't borrow
766
767	mulq	8*1(%r14)
768	add	$t0, $acc4
769	adc	\$0, %rdx
770	add	%rax, $acc4
771	mov	$t1, %rax
772	adc	%rdx, $acc5
773	mov	$t1, %rdx
774	adc	\$0, $acc3		# can't overflow
775
776	shl	\$32, %rax
777	shr	\$32, %rdx
778	sub	%rax, $acc0
779	sbb	%rdx, $t1		# can't borrow
780
781	add	$acc3, $acc0
782	adc	$t1, $acc1
783	adc	\$0, $acc2
784
785	################################# Subtract ord
786	 mov	$acc4, $a_ptr
787	sub	8*0(%r14), $acc4
788	 mov	$acc5, $acc3
789	sbb	8*1(%r14), $acc5
790	 mov	$acc0, $t0
791	sbb	8*2(%r14), $acc0
792	 mov	$acc1, $t1
793	sbb	8*3(%r14), $acc1
794	sbb	\$0, $acc2
795
796	cmovc	$a_ptr, $acc4
797	cmovc	$acc3, $acc5
798	cmovc	$t0, $acc0
799	cmovc	$t1, $acc1
800
801	mov	$acc4, 8*0($r_ptr)
802	mov	$acc5, 8*1($r_ptr)
803	mov	$acc0, 8*2($r_ptr)
804	mov	$acc1, 8*3($r_ptr)
805
806	mov	0(%rsp),%r15
807.cfi_restore	%r15
808	mov	8(%rsp),%r14
809.cfi_restore	%r14
810	mov	16(%rsp),%r13
811.cfi_restore	%r13
812	mov	24(%rsp),%r12
813.cfi_restore	%r12
814	mov	32(%rsp),%rbx
815.cfi_restore	%rbx
816	mov	40(%rsp),%rbp
817.cfi_restore	%rbp
818	lea	48(%rsp),%rsp
819.cfi_adjust_cfa_offset	-48
820.Lord_mul_epilogue:
821	ret
822.cfi_endproc
823.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
824
825################################################################################
826# void ecp_nistz256_ord_sqr_mont(
827#   uint64_t res[4],
828#   uint64_t a[4],
829#   int rep);
830
831.globl	ecp_nistz256_ord_sqr_mont
832.type	ecp_nistz256_ord_sqr_mont,\@function,3
833.align	32
834ecp_nistz256_ord_sqr_mont:
835.cfi_startproc
836___
837$code.=<<___	if ($addx);
838	mov	\$0x80100, %ecx
839	and	OPENSSL_ia32cap_P+8(%rip), %ecx
840	cmp	\$0x80100, %ecx
841	je	.Lecp_nistz256_ord_sqr_montx
842___
843$code.=<<___;
844	push	%rbp
845.cfi_push	%rbp
846	push	%rbx
847.cfi_push	%rbx
848	push	%r12
849.cfi_push	%r12
850	push	%r13
851.cfi_push	%r13
852	push	%r14
853.cfi_push	%r14
854	push	%r15
855.cfi_push	%r15
856.Lord_sqr_body:
857
858	mov	8*0($a_ptr), $acc0
859	mov	8*1($a_ptr), %rax
860	mov	8*2($a_ptr), $acc6
861	mov	8*3($a_ptr), $acc7
862	lea	.Lord(%rip), $a_ptr	# pointer to modulus
863	mov	$b_org, $b_ptr
864	jmp	.Loop_ord_sqr
865
866.align	32
867.Loop_ord_sqr:
868	################################# a[1:] * a[0]
869	mov	%rax, $t1		# put aside a[1]
870	mul	$acc0			# a[1] * a[0]
871	mov	%rax, $acc1
872	movq	$t1, %xmm1		# offload a[1]
873	mov	$acc6, %rax
874	mov	%rdx, $acc2
875
876	mul	$acc0			# a[2] * a[0]
877	add	%rax, $acc2
878	mov	$acc7, %rax
879	movq	$acc6, %xmm2		# offload a[2]
880	adc	\$0, %rdx
881	mov	%rdx, $acc3
882
883	mul	$acc0			# a[3] * a[0]
884	add	%rax, $acc3
885	mov	$acc7, %rax
886	movq	$acc7, %xmm3		# offload a[3]
887	adc	\$0, %rdx
888	mov	%rdx, $acc4
889
890	################################# a[3] * a[2]
891	mul	$acc6			# a[3] * a[2]
892	mov	%rax, $acc5
893	mov	$acc6, %rax
894	mov	%rdx, $acc6
895
896	################################# a[2:] * a[1]
897	mul	$t1			# a[2] * a[1]
898	add	%rax, $acc3
899	mov	$acc7, %rax
900	adc	\$0, %rdx
901	mov	%rdx, $acc7
902
903	mul	$t1			# a[3] * a[1]
904	add	%rax, $acc4
905	adc	\$0, %rdx
906
907	add	$acc7, $acc4
908	adc	%rdx, $acc5
909	adc	\$0, $acc6		# can't overflow
910
911	################################# *2
912	xor	$acc7, $acc7
913	mov	$acc0, %rax
914	add	$acc1, $acc1
915	adc	$acc2, $acc2
916	adc	$acc3, $acc3
917	adc	$acc4, $acc4
918	adc	$acc5, $acc5
919	adc	$acc6, $acc6
920	adc	\$0, $acc7
921
922	################################# Missing products
923	mul	%rax			# a[0] * a[0]
924	mov	%rax, $acc0
925	movq	%xmm1, %rax
926	mov	%rdx, $t1
927
928	mul	%rax			# a[1] * a[1]
929	add	$t1, $acc1
930	adc	%rax, $acc2
931	movq	%xmm2, %rax
932	adc	\$0, %rdx
933	mov	%rdx, $t1
934
935	mul	%rax			# a[2] * a[2]
936	add	$t1, $acc3
937	adc	%rax, $acc4
938	movq	%xmm3, %rax
939	adc	\$0, %rdx
940	mov	%rdx, $t1
941
942	 mov	$acc0, $t0
943	 imulq	8*4($a_ptr), $acc0	# *= .LordK
944
945	mul	%rax			# a[3] * a[3]
946	add	$t1, $acc5
947	adc	%rax, $acc6
948	 mov	8*0($a_ptr), %rax	# modulus[0]
949	adc	%rdx, $acc7		# can't overflow
950
951	################################# First reduction step
952	mul	$acc0
953	mov	$acc0, $t1
954	add	%rax, $t0		# guaranteed to be zero
955	mov	8*1($a_ptr), %rax	# modulus[1]
956	adc	%rdx, $t0
957
958	sub	$acc0, $acc2
959	sbb	\$0, $t1		# can't borrow
960
961	mul	$acc0
962	add	$t0, $acc1
963	adc	\$0, %rdx
964	add	%rax, $acc1
965	mov	$acc0, %rax
966	adc	%rdx, $acc2
967	mov	$acc0, %rdx
968	adc	\$0, $t1		# can't overflow
969
970	 mov	$acc1, $t0
971	 imulq	8*4($a_ptr), $acc1	# *= .LordK
972
973	shl	\$32, %rax
974	shr	\$32, %rdx
975	sub	%rax, $acc3
976	 mov	8*0($a_ptr), %rax
977	sbb	%rdx, $acc0		# can't borrow
978
979	add	$t1, $acc3
980	adc	\$0, $acc0		# can't overflow
981
982	################################# Second reduction step
983	mul	$acc1
984	mov	$acc1, $t1
985	add	%rax, $t0		# guaranteed to be zero
986	mov	8*1($a_ptr), %rax
987	adc	%rdx, $t0
988
989	sub	$acc1, $acc3
990	sbb	\$0, $t1		# can't borrow
991
992	mul	$acc1
993	add	$t0, $acc2
994	adc	\$0, %rdx
995	add	%rax, $acc2
996	mov	$acc1, %rax
997	adc	%rdx, $acc3
998	mov	$acc1, %rdx
999	adc	\$0, $t1		# can't overflow
1000
1001	 mov	$acc2, $t0
1002	 imulq	8*4($a_ptr), $acc2	# *= .LordK
1003
1004	shl	\$32, %rax
1005	shr	\$32, %rdx
1006	sub	%rax, $acc0
1007	 mov	8*0($a_ptr), %rax
1008	sbb	%rdx, $acc1		# can't borrow
1009
1010	add	$t1, $acc0
1011	adc	\$0, $acc1		# can't overflow
1012
1013	################################# Third reduction step
1014	mul	$acc2
1015	mov	$acc2, $t1
1016	add	%rax, $t0		# guaranteed to be zero
1017	mov	8*1($a_ptr), %rax
1018	adc	%rdx, $t0
1019
1020	sub	$acc2, $acc0
1021	sbb	\$0, $t1		# can't borrow
1022
1023	mul	$acc2
1024	add	$t0, $acc3
1025	adc	\$0, %rdx
1026	add	%rax, $acc3
1027	mov	$acc2, %rax
1028	adc	%rdx, $acc0
1029	mov	$acc2, %rdx
1030	adc	\$0, $t1		# can't overflow
1031
1032	 mov	$acc3, $t0
1033	 imulq	8*4($a_ptr), $acc3	# *= .LordK
1034
1035	shl	\$32, %rax
1036	shr	\$32, %rdx
1037	sub	%rax, $acc1
1038	 mov	8*0($a_ptr), %rax
1039	sbb	%rdx, $acc2		# can't borrow
1040
1041	add	$t1, $acc1
1042	adc	\$0, $acc2		# can't overflow
1043
1044	################################# Last reduction step
1045	mul	$acc3
1046	mov	$acc3, $t1
1047	add	%rax, $t0		# guaranteed to be zero
1048	mov	8*1($a_ptr), %rax
1049	adc	%rdx, $t0
1050
1051	sub	$acc3, $acc1
1052	sbb	\$0, $t1		# can't borrow
1053
1054	mul	$acc3
1055	add	$t0, $acc0
1056	adc	\$0, %rdx
1057	add	%rax, $acc0
1058	mov	$acc3, %rax
1059	adc	%rdx, $acc1
1060	mov	$acc3, %rdx
1061	adc	\$0, $t1		# can't overflow
1062
1063	shl	\$32, %rax
1064	shr	\$32, %rdx
1065	sub	%rax, $acc2
1066	sbb	%rdx, $acc3		# can't borrow
1067
1068	add	$t1, $acc2
1069	adc	\$0, $acc3		# can't overflow
1070
1071	################################# Add bits [511:256] of the sqr result
1072	xor	%rdx, %rdx
1073	add	$acc4, $acc0
1074	adc	$acc5, $acc1
1075	 mov	$acc0, $acc4
1076	adc	$acc6, $acc2
1077	adc	$acc7, $acc3
1078	 mov	$acc1, %rax
1079	adc	\$0, %rdx
1080
1081	################################# Compare to modulus
1082	sub	8*0($a_ptr), $acc0
1083	 mov	$acc2, $acc6
1084	sbb	8*1($a_ptr), $acc1
1085	sbb	8*2($a_ptr), $acc2
1086	 mov	$acc3, $acc7
1087	sbb	8*3($a_ptr), $acc3
1088	sbb	\$0, %rdx
1089
1090	cmovc	$acc4, $acc0
1091	cmovnc	$acc1, %rax
1092	cmovnc	$acc2, $acc6
1093	cmovnc	$acc3, $acc7
1094
1095	dec	$b_ptr
1096	jnz	.Loop_ord_sqr
1097
1098	mov	$acc0, 8*0($r_ptr)
1099	mov	%rax,  8*1($r_ptr)
1100	pxor	%xmm1, %xmm1
1101	mov	$acc6, 8*2($r_ptr)
1102	pxor	%xmm2, %xmm2
1103	mov	$acc7, 8*3($r_ptr)
1104	pxor	%xmm3, %xmm3
1105
1106	mov	0(%rsp),%r15
1107.cfi_restore	%r15
1108	mov	8(%rsp),%r14
1109.cfi_restore	%r14
1110	mov	16(%rsp),%r13
1111.cfi_restore	%r13
1112	mov	24(%rsp),%r12
1113.cfi_restore	%r12
1114	mov	32(%rsp),%rbx
1115.cfi_restore	%rbx
1116	mov	40(%rsp),%rbp
1117.cfi_restore	%rbp
1118	lea	48(%rsp),%rsp
1119.cfi_adjust_cfa_offset	-48
1120.Lord_sqr_epilogue:
1121	ret
1122.cfi_endproc
1123.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1124___
1125
1126$code.=<<___	if ($addx);
1127################################################################################
1128.type	ecp_nistz256_ord_mul_montx,\@function,3
1129.align	32
1130ecp_nistz256_ord_mul_montx:
1131.cfi_startproc
1132.Lecp_nistz256_ord_mul_montx:
1133	push	%rbp
1134.cfi_push	%rbp
1135	push	%rbx
1136.cfi_push	%rbx
1137	push	%r12
1138.cfi_push	%r12
1139	push	%r13
1140.cfi_push	%r13
1141	push	%r14
1142.cfi_push	%r14
1143	push	%r15
1144.cfi_push	%r15
1145.Lord_mulx_body:
1146
1147	mov	$b_org, $b_ptr
1148	mov	8*0($b_org), %rdx
1149	mov	8*0($a_ptr), $acc1
1150	mov	8*1($a_ptr), $acc2
1151	mov	8*2($a_ptr), $acc3
1152	mov	8*3($a_ptr), $acc4
1153	lea	-128($a_ptr), $a_ptr	# control u-op density
1154	lea	.Lord-128(%rip), %r14
1155	mov	.LordK(%rip), %r15
1156
1157	################################# Multiply by b[0]
1158	mulx	$acc1, $acc0, $acc1
1159	mulx	$acc2, $t0, $acc2
1160	mulx	$acc3, $t1, $acc3
1161	add	$t0, $acc1
1162	mulx	$acc4, $t0, $acc4
1163	 mov	$acc0, %rdx
1164	 mulx	%r15, %rdx, %rax
1165	adc	$t1, $acc2
1166	adc	$t0, $acc3
1167	adc	\$0, $acc4
1168
1169	################################# reduction
1170	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
1171	mulx	8*0+128(%r14), $t0, $t1
1172	adcx	$t0, $acc0		# guaranteed to be zero
1173	adox	$t1, $acc1
1174
1175	mulx	8*1+128(%r14), $t0, $t1
1176	adcx	$t0, $acc1
1177	adox	$t1, $acc2
1178
1179	mulx	8*2+128(%r14), $t0, $t1
1180	adcx	$t0, $acc2
1181	adox	$t1, $acc3
1182
1183	mulx	8*3+128(%r14), $t0, $t1
1184	 mov	8*1($b_ptr), %rdx
1185	adcx	$t0, $acc3
1186	adox	$t1, $acc4
1187	adcx	$acc0, $acc4
1188	adox	$acc0, $acc5
1189	adc	\$0, $acc5		# cf=0, of=0
1190
1191	################################# Multiply by b[1]
1192	mulx	8*0+128($a_ptr), $t0, $t1
1193	adcx	$t0, $acc1
1194	adox	$t1, $acc2
1195
1196	mulx	8*1+128($a_ptr), $t0, $t1
1197	adcx	$t0, $acc2
1198	adox	$t1, $acc3
1199
1200	mulx	8*2+128($a_ptr), $t0, $t1
1201	adcx	$t0, $acc3
1202	adox	$t1, $acc4
1203
1204	mulx	8*3+128($a_ptr), $t0, $t1
1205	 mov	$acc1, %rdx
1206	 mulx	%r15, %rdx, %rax
1207	adcx	$t0, $acc4
1208	adox	$t1, $acc5
1209
1210	adcx	$acc0, $acc5
1211	adox	$acc0, $acc0
1212	adc	\$0, $acc0		# cf=0, of=0
1213
1214	################################# reduction
1215	mulx	8*0+128(%r14), $t0, $t1
1216	adcx	$t0, $acc1		# guaranteed to be zero
1217	adox	$t1, $acc2
1218
1219	mulx	8*1+128(%r14), $t0, $t1
1220	adcx	$t0, $acc2
1221	adox	$t1, $acc3
1222
1223	mulx	8*2+128(%r14), $t0, $t1
1224	adcx	$t0, $acc3
1225	adox	$t1, $acc4
1226
1227	mulx	8*3+128(%r14), $t0, $t1
1228	 mov	8*2($b_ptr), %rdx
1229	adcx	$t0, $acc4
1230	adox	$t1, $acc5
1231	adcx	$acc1, $acc5
1232	adox	$acc1, $acc0
1233	adc	\$0, $acc0		# cf=0, of=0
1234
1235	################################# Multiply by b[2]
1236	mulx	8*0+128($a_ptr), $t0, $t1
1237	adcx	$t0, $acc2
1238	adox	$t1, $acc3
1239
1240	mulx	8*1+128($a_ptr), $t0, $t1
1241	adcx	$t0, $acc3
1242	adox	$t1, $acc4
1243
1244	mulx	8*2+128($a_ptr), $t0, $t1
1245	adcx	$t0, $acc4
1246	adox	$t1, $acc5
1247
1248	mulx	8*3+128($a_ptr), $t0, $t1
1249	 mov	$acc2, %rdx
1250	 mulx	%r15, %rdx, %rax
1251	adcx	$t0, $acc5
1252	adox	$t1, $acc0
1253
1254	adcx	$acc1, $acc0
1255	adox	$acc1, $acc1
1256	adc	\$0, $acc1		# cf=0, of=0
1257
1258	################################# reduction
1259	mulx	8*0+128(%r14), $t0, $t1
1260	adcx	$t0, $acc2		# guaranteed to be zero
1261	adox	$t1, $acc3
1262
1263	mulx	8*1+128(%r14), $t0, $t1
1264	adcx	$t0, $acc3
1265	adox	$t1, $acc4
1266
1267	mulx	8*2+128(%r14), $t0, $t1
1268	adcx	$t0, $acc4
1269	adox	$t1, $acc5
1270
1271	mulx	8*3+128(%r14), $t0, $t1
1272	 mov	8*3($b_ptr), %rdx
1273	adcx	$t0, $acc5
1274	adox	$t1, $acc0
1275	adcx	$acc2, $acc0
1276	adox	$acc2, $acc1
1277	adc	\$0, $acc1		# cf=0, of=0
1278
1279	################################# Multiply by b[3]
1280	mulx	8*0+128($a_ptr), $t0, $t1
1281	adcx	$t0, $acc3
1282	adox	$t1, $acc4
1283
1284	mulx	8*1+128($a_ptr), $t0, $t1
1285	adcx	$t0, $acc4
1286	adox	$t1, $acc5
1287
1288	mulx	8*2+128($a_ptr), $t0, $t1
1289	adcx	$t0, $acc5
1290	adox	$t1, $acc0
1291
1292	mulx	8*3+128($a_ptr), $t0, $t1
1293	 mov	$acc3, %rdx
1294	 mulx	%r15, %rdx, %rax
1295	adcx	$t0, $acc0
1296	adox	$t1, $acc1
1297
1298	adcx	$acc2, $acc1
1299	adox	$acc2, $acc2
1300	adc	\$0, $acc2		# cf=0, of=0
1301
1302	################################# reduction
1303	mulx	8*0+128(%r14), $t0, $t1
1304	adcx	$t0, $acc3		# guaranteed to be zero
1305	adox	$t1, $acc4
1306
1307	mulx	8*1+128(%r14), $t0, $t1
1308	adcx	$t0, $acc4
1309	adox	$t1, $acc5
1310
1311	mulx	8*2+128(%r14), $t0, $t1
1312	adcx	$t0, $acc5
1313	adox	$t1, $acc0
1314
1315	mulx	8*3+128(%r14), $t0, $t1
1316	lea	128(%r14),%r14
1317	 mov	$acc4, $t2
1318	adcx	$t0, $acc0
1319	adox	$t1, $acc1
1320	 mov	$acc5, $t3
1321	adcx	$acc3, $acc1
1322	adox	$acc3, $acc2
1323	adc	\$0, $acc2
1324
1325	#################################
1326	# Branch-less conditional subtraction of P
1327	 mov	$acc0, $t0
1328	sub	8*0(%r14), $acc4
1329	sbb	8*1(%r14), $acc5
1330	sbb	8*2(%r14), $acc0
1331	 mov	$acc1, $t1
1332	sbb	8*3(%r14), $acc1
1333	sbb	\$0, $acc2
1334
1335	cmovc	$t2, $acc4
1336	cmovc	$t3, $acc5
1337	cmovc	$t0, $acc0
1338	cmovc	$t1, $acc1
1339
1340	mov	$acc4, 8*0($r_ptr)
1341	mov	$acc5, 8*1($r_ptr)
1342	mov	$acc0, 8*2($r_ptr)
1343	mov	$acc1, 8*3($r_ptr)
1344
1345	mov	0(%rsp),%r15
1346.cfi_restore	%r15
1347	mov	8(%rsp),%r14
1348.cfi_restore	%r14
1349	mov	16(%rsp),%r13
1350.cfi_restore	%r13
1351	mov	24(%rsp),%r12
1352.cfi_restore	%r12
1353	mov	32(%rsp),%rbx
1354.cfi_restore	%rbx
1355	mov	40(%rsp),%rbp
1356.cfi_restore	%rbp
1357	lea	48(%rsp),%rsp
1358.cfi_adjust_cfa_offset	-48
1359.Lord_mulx_epilogue:
1360	ret
1361.cfi_endproc
1362.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1363
1364.type	ecp_nistz256_ord_sqr_montx,\@function,3
1365.align	32
1366ecp_nistz256_ord_sqr_montx:
1367.cfi_startproc
1368.Lecp_nistz256_ord_sqr_montx:
1369	push	%rbp
1370.cfi_push	%rbp
1371	push	%rbx
1372.cfi_push	%rbx
1373	push	%r12
1374.cfi_push	%r12
1375	push	%r13
1376.cfi_push	%r13
1377	push	%r14
1378.cfi_push	%r14
1379	push	%r15
1380.cfi_push	%r15
1381.Lord_sqrx_body:
1382
1383	mov	$b_org, $b_ptr
1384	mov	8*0($a_ptr), %rdx
1385	mov	8*1($a_ptr), $acc6
1386	mov	8*2($a_ptr), $acc7
1387	mov	8*3($a_ptr), $acc0
1388	lea	.Lord(%rip), $a_ptr
1389	jmp	.Loop_ord_sqrx
1390
1391.align	32
1392.Loop_ord_sqrx:
1393	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1394	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1395	 mov	%rdx, %rax		# offload a[0]
1396	 movq	$acc6, %xmm1		# offload a[1]
1397	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1398	 mov	$acc6, %rdx
1399	add	$t0, $acc2
1400	 movq	$acc7, %xmm2		# offload a[2]
1401	adc	$t1, $acc3
1402	adc	\$0, $acc4
1403	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1404	#################################
1405	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1406	adcx	$t0, $acc3
1407	adox	$t1, $acc4
1408
1409	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1410	 mov	$acc7, %rdx
1411	adcx	$t0, $acc4
1412	adox	$t1, $acc5
1413	adc	\$0, $acc5
1414	#################################
1415	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1416	mov	%rax, %rdx
1417	 movq	$acc0, %xmm3		# offload a[3]
1418	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1419	 adcx	$acc1, $acc1		# acc1:6<<1
1420	adox	$t0, $acc5
1421	 adcx	$acc2, $acc2
1422	adox	$acc7, $acc6		# of=0
1423
1424	################################# a[i]*a[i]
1425	mulx	%rdx, $acc0, $t1
1426	movq	%xmm1, %rdx
1427	 adcx	$acc3, $acc3
1428	adox	$t1, $acc1
1429	 adcx	$acc4, $acc4
1430	mulx	%rdx, $t0, $t4
1431	movq	%xmm2, %rdx
1432	 adcx	$acc5, $acc5
1433	adox	$t0, $acc2
1434	 adcx	$acc6, $acc6
1435	mulx	%rdx, $t0, $t1
1436	.byte	0x67
1437	movq	%xmm3, %rdx
1438	adox	$t4, $acc3
1439	 adcx	$acc7, $acc7
1440	adox	$t0, $acc4
1441	adox	$t1, $acc5
1442	mulx	%rdx, $t0, $t4
1443	adox	$t0, $acc6
1444	adox	$t4, $acc7
1445
1446	################################# reduction
1447	mov	$acc0, %rdx
1448	mulx	8*4($a_ptr), %rdx, $t0
1449
1450	xor	%rax, %rax		# cf=0, of=0
1451	mulx	8*0($a_ptr), $t0, $t1
1452	adcx	$t0, $acc0		# guaranteed to be zero
1453	adox	$t1, $acc1
1454	mulx	8*1($a_ptr), $t0, $t1
1455	adcx	$t0, $acc1
1456	adox	$t1, $acc2
1457	mulx	8*2($a_ptr), $t0, $t1
1458	adcx	$t0, $acc2
1459	adox	$t1, $acc3
1460	mulx	8*3($a_ptr), $t0, $t1
1461	adcx	$t0, $acc3
1462	adox	$t1, $acc0		# of=0
1463	adcx	%rax, $acc0		# cf=0
1464
1465	#################################
1466	mov	$acc1, %rdx
1467	mulx	8*4($a_ptr), %rdx, $t0
1468
1469	mulx	8*0($a_ptr), $t0, $t1
1470	adox	$t0, $acc1		# guaranteed to be zero
1471	adcx	$t1, $acc2
1472	mulx	8*1($a_ptr), $t0, $t1
1473	adox	$t0, $acc2
1474	adcx	$t1, $acc3
1475	mulx	8*2($a_ptr), $t0, $t1
1476	adox	$t0, $acc3
1477	adcx	$t1, $acc0
1478	mulx	8*3($a_ptr), $t0, $t1
1479	adox	$t0, $acc0
1480	adcx	$t1, $acc1		# cf=0
1481	adox	%rax, $acc1		# of=0
1482
1483	#################################
1484	mov	$acc2, %rdx
1485	mulx	8*4($a_ptr), %rdx, $t0
1486
1487	mulx	8*0($a_ptr), $t0, $t1
1488	adcx	$t0, $acc2		# guaranteed to be zero
1489	adox	$t1, $acc3
1490	mulx	8*1($a_ptr), $t0, $t1
1491	adcx	$t0, $acc3
1492	adox	$t1, $acc0
1493	mulx	8*2($a_ptr), $t0, $t1
1494	adcx	$t0, $acc0
1495	adox	$t1, $acc1
1496	mulx	8*3($a_ptr), $t0, $t1
1497	adcx	$t0, $acc1
1498	adox	$t1, $acc2		# of=0
1499	adcx	%rax, $acc2		# cf=0
1500
1501	#################################
1502	mov	$acc3, %rdx
1503	mulx	8*4($a_ptr), %rdx, $t0
1504
1505	mulx	8*0($a_ptr), $t0, $t1
1506	adox	$t0, $acc3		# guaranteed to be zero
1507	adcx	$t1, $acc0
1508	mulx	8*1($a_ptr), $t0, $t1
1509	adox	$t0, $acc0
1510	adcx	$t1, $acc1
1511	mulx	8*2($a_ptr), $t0, $t1
1512	adox	$t0, $acc1
1513	adcx	$t1, $acc2
1514	mulx	8*3($a_ptr), $t0, $t1
1515	adox	$t0, $acc2
1516	adcx	$t1, $acc3
1517	adox	%rax, $acc3
1518
1519	################################# accumulate upper half
1520	add	$acc0, $acc4		# add	$acc4, $acc0
1521	adc	$acc5, $acc1
1522	 mov	$acc4, %rdx
1523	adc	$acc6, $acc2
1524	adc	$acc7, $acc3
1525	 mov	$acc1, $acc6
1526	adc	\$0, %rax
1527
1528	################################# compare to modulus
1529	sub	8*0($a_ptr), $acc4
1530	 mov	$acc2, $acc7
1531	sbb	8*1($a_ptr), $acc1
1532	sbb	8*2($a_ptr), $acc2
1533	 mov	$acc3, $acc0
1534	sbb	8*3($a_ptr), $acc3
1535	sbb	\$0, %rax
1536
1537	cmovnc	$acc4, %rdx
1538	cmovnc	$acc1, $acc6
1539	cmovnc	$acc2, $acc7
1540	cmovnc	$acc3, $acc0
1541
1542	dec	$b_ptr
1543	jnz	.Loop_ord_sqrx
1544
1545	mov	%rdx, 8*0($r_ptr)
1546	mov	$acc6, 8*1($r_ptr)
1547	pxor	%xmm1, %xmm1
1548	mov	$acc7, 8*2($r_ptr)
1549	pxor	%xmm2, %xmm2
1550	mov	$acc0, 8*3($r_ptr)
1551	pxor	%xmm3, %xmm3
1552
1553	mov	0(%rsp),%r15
1554.cfi_restore	%r15
1555	mov	8(%rsp),%r14
1556.cfi_restore	%r14
1557	mov	16(%rsp),%r13
1558.cfi_restore	%r13
1559	mov	24(%rsp),%r12
1560.cfi_restore	%r12
1561	mov	32(%rsp),%rbx
1562.cfi_restore	%rbx
1563	mov	40(%rsp),%rbp
1564.cfi_restore	%rbp
1565	lea	48(%rsp),%rsp
1566.cfi_adjust_cfa_offset	-48
1567.Lord_sqrx_epilogue:
1568	ret
1569.cfi_endproc
1570.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1571___
1572
1573$code.=<<___;
1574################################################################################
1575# void ecp_nistz256_to_mont(
1576#   uint64_t res[4],
1577#   uint64_t in[4]);
1578.globl	ecp_nistz256_to_mont
1579.type	ecp_nistz256_to_mont,\@function,2
1580.align	32
1581ecp_nistz256_to_mont:
1582___
1583$code.=<<___	if ($addx);
1584	mov	\$0x80100, %ecx
1585	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1586___
1587$code.=<<___;
1588	lea	.LRR(%rip), $b_org
1589	jmp	.Lmul_mont
1590.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1591
1592################################################################################
1593# void ecp_nistz256_mul_mont(
1594#   uint64_t res[4],
1595#   uint64_t a[4],
1596#   uint64_t b[4]);
1597
1598.globl	ecp_nistz256_mul_mont
1599.type	ecp_nistz256_mul_mont,\@function,3
1600.align	32
1601ecp_nistz256_mul_mont:
1602.cfi_startproc
1603___
1604$code.=<<___	if ($addx);
1605	mov	\$0x80100, %ecx
1606	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1607___
1608$code.=<<___;
1609.Lmul_mont:
1610	push	%rbp
1611.cfi_push	%rbp
1612	push	%rbx
1613.cfi_push	%rbx
1614	push	%r12
1615.cfi_push	%r12
1616	push	%r13
1617.cfi_push	%r13
1618	push	%r14
1619.cfi_push	%r14
1620	push	%r15
1621.cfi_push	%r15
1622.Lmul_body:
1623___
1624$code.=<<___	if ($addx);
1625	cmp	\$0x80100, %ecx
1626	je	.Lmul_montx
1627___
1628$code.=<<___;
1629	mov	$b_org, $b_ptr
1630	mov	8*0($b_org), %rax
1631	mov	8*0($a_ptr), $acc1
1632	mov	8*1($a_ptr), $acc2
1633	mov	8*2($a_ptr), $acc3
1634	mov	8*3($a_ptr), $acc4
1635
1636	call	__ecp_nistz256_mul_montq
1637___
1638$code.=<<___	if ($addx);
1639	jmp	.Lmul_mont_done
1640
1641.align	32
1642.Lmul_montx:
1643	mov	$b_org, $b_ptr
1644	mov	8*0($b_org), %rdx
1645	mov	8*0($a_ptr), $acc1
1646	mov	8*1($a_ptr), $acc2
1647	mov	8*2($a_ptr), $acc3
1648	mov	8*3($a_ptr), $acc4
1649	lea	-128($a_ptr), $a_ptr	# control u-op density
1650
1651	call	__ecp_nistz256_mul_montx
1652___
1653$code.=<<___;
1654.Lmul_mont_done:
1655	mov	0(%rsp),%r15
1656.cfi_restore	%r15
1657	mov	8(%rsp),%r14
1658.cfi_restore	%r14
1659	mov	16(%rsp),%r13
1660.cfi_restore	%r13
1661	mov	24(%rsp),%r12
1662.cfi_restore	%r12
1663	mov	32(%rsp),%rbx
1664.cfi_restore	%rbx
1665	mov	40(%rsp),%rbp
1666.cfi_restore	%rbp
1667	lea	48(%rsp),%rsp
1668.cfi_adjust_cfa_offset	-48
1669.Lmul_epilogue:
1670	ret
1671.cfi_endproc
1672.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1673
1674.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1675.align	32
1676__ecp_nistz256_mul_montq:
1677.cfi_startproc
1678	########################################################################
1679	# Multiply a by b[0]
1680	mov	%rax, $t1
1681	mulq	$acc1
1682	mov	.Lpoly+8*1(%rip),$poly1
1683	mov	%rax, $acc0
1684	mov	$t1, %rax
1685	mov	%rdx, $acc1
1686
1687	mulq	$acc2
1688	mov	.Lpoly+8*3(%rip),$poly3
1689	add	%rax, $acc1
1690	mov	$t1, %rax
1691	adc	\$0, %rdx
1692	mov	%rdx, $acc2
1693
1694	mulq	$acc3
1695	add	%rax, $acc2
1696	mov	$t1, %rax
1697	adc	\$0, %rdx
1698	mov	%rdx, $acc3
1699
1700	mulq	$acc4
1701	add	%rax, $acc3
1702	 mov	$acc0, %rax
1703	adc	\$0, %rdx
1704	xor	$acc5, $acc5
1705	mov	%rdx, $acc4
1706
1707	########################################################################
1708	# First reduction step
1709	# Basically now we want to multiply acc[0] by p256,
1710	# and add the result to the acc.
1711	# Due to the special form of p256 we do some optimizations
1712	#
1713	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1714	# then we add acc[0] and get acc[0] x 2^96
1715
1716	mov	$acc0, $t1
1717	shl	\$32, $acc0
1718	mulq	$poly3
1719	shr	\$32, $t1
1720	add	$acc0, $acc1		# +=acc[0]<<96
1721	adc	$t1, $acc2
1722	adc	%rax, $acc3
1723	 mov	8*1($b_ptr), %rax
1724	adc	%rdx, $acc4
1725	adc	\$0, $acc5
1726	xor	$acc0, $acc0
1727
1728	########################################################################
1729	# Multiply by b[1]
1730	mov	%rax, $t1
1731	mulq	8*0($a_ptr)
1732	add	%rax, $acc1
1733	mov	$t1, %rax
1734	adc	\$0, %rdx
1735	mov	%rdx, $t0
1736
1737	mulq	8*1($a_ptr)
1738	add	$t0, $acc2
1739	adc	\$0, %rdx
1740	add	%rax, $acc2
1741	mov	$t1, %rax
1742	adc	\$0, %rdx
1743	mov	%rdx, $t0
1744
1745	mulq	8*2($a_ptr)
1746	add	$t0, $acc3
1747	adc	\$0, %rdx
1748	add	%rax, $acc3
1749	mov	$t1, %rax
1750	adc	\$0, %rdx
1751	mov	%rdx, $t0
1752
1753	mulq	8*3($a_ptr)
1754	add	$t0, $acc4
1755	adc	\$0, %rdx
1756	add	%rax, $acc4
1757	 mov	$acc1, %rax
1758	adc	%rdx, $acc5
1759	adc	\$0, $acc0
1760
1761	########################################################################
1762	# Second reduction step
1763	mov	$acc1, $t1
1764	shl	\$32, $acc1
1765	mulq	$poly3
1766	shr	\$32, $t1
1767	add	$acc1, $acc2
1768	adc	$t1, $acc3
1769	adc	%rax, $acc4
1770	 mov	8*2($b_ptr), %rax
1771	adc	%rdx, $acc5
1772	adc	\$0, $acc0
1773	xor	$acc1, $acc1
1774
1775	########################################################################
1776	# Multiply by b[2]
1777	mov	%rax, $t1
1778	mulq	8*0($a_ptr)
1779	add	%rax, $acc2
1780	mov	$t1, %rax
1781	adc	\$0, %rdx
1782	mov	%rdx, $t0
1783
1784	mulq	8*1($a_ptr)
1785	add	$t0, $acc3
1786	adc	\$0, %rdx
1787	add	%rax, $acc3
1788	mov	$t1, %rax
1789	adc	\$0, %rdx
1790	mov	%rdx, $t0
1791
1792	mulq	8*2($a_ptr)
1793	add	$t0, $acc4
1794	adc	\$0, %rdx
1795	add	%rax, $acc4
1796	mov	$t1, %rax
1797	adc	\$0, %rdx
1798	mov	%rdx, $t0
1799
1800	mulq	8*3($a_ptr)
1801	add	$t0, $acc5
1802	adc	\$0, %rdx
1803	add	%rax, $acc5
1804	 mov	$acc2, %rax
1805	adc	%rdx, $acc0
1806	adc	\$0, $acc1
1807
1808	########################################################################
1809	# Third reduction step
1810	mov	$acc2, $t1
1811	shl	\$32, $acc2
1812	mulq	$poly3
1813	shr	\$32, $t1
1814	add	$acc2, $acc3
1815	adc	$t1, $acc4
1816	adc	%rax, $acc5
1817	 mov	8*3($b_ptr), %rax
1818	adc	%rdx, $acc0
1819	adc	\$0, $acc1
1820	xor	$acc2, $acc2
1821
1822	########################################################################
1823	# Multiply by b[3]
1824	mov	%rax, $t1
1825	mulq	8*0($a_ptr)
1826	add	%rax, $acc3
1827	mov	$t1, %rax
1828	adc	\$0, %rdx
1829	mov	%rdx, $t0
1830
1831	mulq	8*1($a_ptr)
1832	add	$t0, $acc4
1833	adc	\$0, %rdx
1834	add	%rax, $acc4
1835	mov	$t1, %rax
1836	adc	\$0, %rdx
1837	mov	%rdx, $t0
1838
1839	mulq	8*2($a_ptr)
1840	add	$t0, $acc5
1841	adc	\$0, %rdx
1842	add	%rax, $acc5
1843	mov	$t1, %rax
1844	adc	\$0, %rdx
1845	mov	%rdx, $t0
1846
1847	mulq	8*3($a_ptr)
1848	add	$t0, $acc0
1849	adc	\$0, %rdx
1850	add	%rax, $acc0
1851	 mov	$acc3, %rax
1852	adc	%rdx, $acc1
1853	adc	\$0, $acc2
1854
1855	########################################################################
1856	# Final reduction step
1857	mov	$acc3, $t1
1858	shl	\$32, $acc3
1859	mulq	$poly3
1860	shr	\$32, $t1
1861	add	$acc3, $acc4
1862	adc	$t1, $acc5
1863	 mov	$acc4, $t0
1864	adc	%rax, $acc0
1865	adc	%rdx, $acc1
1866	 mov	$acc5, $t1
1867	adc	\$0, $acc2
1868
1869	########################################################################
1870	# Branch-less conditional subtraction of P
1871	sub	\$-1, $acc4		# .Lpoly[0]
1872	 mov	$acc0, $t2
1873	sbb	$poly1, $acc5		# .Lpoly[1]
1874	sbb	\$0, $acc0		# .Lpoly[2]
1875	 mov	$acc1, $t3
1876	sbb	$poly3, $acc1		# .Lpoly[3]
1877	sbb	\$0, $acc2
1878
1879	cmovc	$t0, $acc4
1880	cmovc	$t1, $acc5
1881	mov	$acc4, 8*0($r_ptr)
1882	cmovc	$t2, $acc0
1883	mov	$acc5, 8*1($r_ptr)
1884	cmovc	$t3, $acc1
1885	mov	$acc0, 8*2($r_ptr)
1886	mov	$acc1, 8*3($r_ptr)
1887
1888	ret
1889.cfi_endproc
1890.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1891
1892################################################################################
1893# void ecp_nistz256_sqr_mont(
1894#   uint64_t res[4],
1895#   uint64_t a[4]);
1896
1897# we optimize the square according to S.Gueron and V.Krasnov,
1898# "Speeding up Big-Number Squaring"
1899.globl	ecp_nistz256_sqr_mont
1900.type	ecp_nistz256_sqr_mont,\@function,2
1901.align	32
1902ecp_nistz256_sqr_mont:
1903.cfi_startproc
1904___
1905$code.=<<___	if ($addx);
1906	mov	\$0x80100, %ecx
1907	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1908___
1909$code.=<<___;
1910	push	%rbp
1911.cfi_push	%rbp
1912	push	%rbx
1913.cfi_push	%rbx
1914	push	%r12
1915.cfi_push	%r12
1916	push	%r13
1917.cfi_push	%r13
1918	push	%r14
1919.cfi_push	%r14
1920	push	%r15
1921.cfi_push	%r15
1922.Lsqr_body:
1923___
1924$code.=<<___	if ($addx);
1925	cmp	\$0x80100, %ecx
1926	je	.Lsqr_montx
1927___
1928$code.=<<___;
1929	mov	8*0($a_ptr), %rax
1930	mov	8*1($a_ptr), $acc6
1931	mov	8*2($a_ptr), $acc7
1932	mov	8*3($a_ptr), $acc0
1933
1934	call	__ecp_nistz256_sqr_montq
1935___
1936$code.=<<___	if ($addx);
1937	jmp	.Lsqr_mont_done
1938
1939.align	32
1940.Lsqr_montx:
1941	mov	8*0($a_ptr), %rdx
1942	mov	8*1($a_ptr), $acc6
1943	mov	8*2($a_ptr), $acc7
1944	mov	8*3($a_ptr), $acc0
1945	lea	-128($a_ptr), $a_ptr	# control u-op density
1946
1947	call	__ecp_nistz256_sqr_montx
1948___
1949$code.=<<___;
1950.Lsqr_mont_done:
1951	mov	0(%rsp),%r15
1952.cfi_restore	%r15
1953	mov	8(%rsp),%r14
1954.cfi_restore	%r14
1955	mov	16(%rsp),%r13
1956.cfi_restore	%r13
1957	mov	24(%rsp),%r12
1958.cfi_restore	%r12
1959	mov	32(%rsp),%rbx
1960.cfi_restore	%rbx
1961	mov	40(%rsp),%rbp
1962.cfi_restore	%rbp
1963	lea	48(%rsp),%rsp
1964.cfi_adjust_cfa_offset	-48
1965.Lsqr_epilogue:
1966	ret
1967.cfi_endproc
1968.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1969
1970.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1971.align	32
1972__ecp_nistz256_sqr_montq:
1973.cfi_startproc
1974	mov	%rax, $acc5
1975	mulq	$acc6			# a[1]*a[0]
1976	mov	%rax, $acc1
1977	mov	$acc7, %rax
1978	mov	%rdx, $acc2
1979
1980	mulq	$acc5			# a[0]*a[2]
1981	add	%rax, $acc2
1982	mov	$acc0, %rax
1983	adc	\$0, %rdx
1984	mov	%rdx, $acc3
1985
1986	mulq	$acc5			# a[0]*a[3]
1987	add	%rax, $acc3
1988	 mov	$acc7, %rax
1989	adc	\$0, %rdx
1990	mov	%rdx, $acc4
1991
1992	#################################
1993	mulq	$acc6			# a[1]*a[2]
1994	add	%rax, $acc3
1995	mov	$acc0, %rax
1996	adc	\$0, %rdx
1997	mov	%rdx, $t1
1998
1999	mulq	$acc6			# a[1]*a[3]
2000	add	%rax, $acc4
2001	 mov	$acc0, %rax
2002	adc	\$0, %rdx
2003	add	$t1, $acc4
2004	mov	%rdx, $acc5
2005	adc	\$0, $acc5
2006
2007	#################################
2008	mulq	$acc7			# a[2]*a[3]
2009	xor	$acc7, $acc7
2010	add	%rax, $acc5
2011	 mov	8*0($a_ptr), %rax
2012	mov	%rdx, $acc6
2013	adc	\$0, $acc6
2014
2015	add	$acc1, $acc1		# acc1:6<<1
2016	adc	$acc2, $acc2
2017	adc	$acc3, $acc3
2018	adc	$acc4, $acc4
2019	adc	$acc5, $acc5
2020	adc	$acc6, $acc6
2021	adc	\$0, $acc7
2022
2023	mulq	%rax
2024	mov	%rax, $acc0
2025	mov	8*1($a_ptr), %rax
2026	mov	%rdx, $t0
2027
2028	mulq	%rax
2029	add	$t0, $acc1
2030	adc	%rax, $acc2
2031	mov	8*2($a_ptr), %rax
2032	adc	\$0, %rdx
2033	mov	%rdx, $t0
2034
2035	mulq	%rax
2036	add	$t0, $acc3
2037	adc	%rax, $acc4
2038	mov	8*3($a_ptr), %rax
2039	adc	\$0, %rdx
2040	mov	%rdx, $t0
2041
2042	mulq	%rax
2043	add	$t0, $acc5
2044	adc	%rax, $acc6
2045	 mov	$acc0, %rax
2046	adc	%rdx, $acc7
2047
2048	mov	.Lpoly+8*1(%rip), $a_ptr
2049	mov	.Lpoly+8*3(%rip), $t1
2050
2051	##########################################
2052	# Now the reduction
2053	# First iteration
2054	mov	$acc0, $t0
2055	shl	\$32, $acc0
2056	mulq	$t1
2057	shr	\$32, $t0
2058	add	$acc0, $acc1		# +=acc[0]<<96
2059	adc	$t0, $acc2
2060	adc	%rax, $acc3
2061	 mov	$acc1, %rax
2062	adc	\$0, %rdx
2063
2064	##########################################
2065	# Second iteration
2066	mov	$acc1, $t0
2067	shl	\$32, $acc1
2068	mov	%rdx, $acc0
2069	mulq	$t1
2070	shr	\$32, $t0
2071	add	$acc1, $acc2
2072	adc	$t0, $acc3
2073	adc	%rax, $acc0
2074	 mov	$acc2, %rax
2075	adc	\$0, %rdx
2076
2077	##########################################
2078	# Third iteration
2079	mov	$acc2, $t0
2080	shl	\$32, $acc2
2081	mov	%rdx, $acc1
2082	mulq	$t1
2083	shr	\$32, $t0
2084	add	$acc2, $acc3
2085	adc	$t0, $acc0
2086	adc	%rax, $acc1
2087	 mov	$acc3, %rax
2088	adc	\$0, %rdx
2089
2090	###########################################
2091	# Last iteration
2092	mov	$acc3, $t0
2093	shl	\$32, $acc3
2094	mov	%rdx, $acc2
2095	mulq	$t1
2096	shr	\$32, $t0
2097	add	$acc3, $acc0
2098	adc	$t0, $acc1
2099	adc	%rax, $acc2
2100	adc	\$0, %rdx
2101	xor	$acc3, $acc3
2102
2103	############################################
2104	# Add the rest of the acc
2105	add	$acc0, $acc4
2106	adc	$acc1, $acc5
2107	 mov	$acc4, $acc0
2108	adc	$acc2, $acc6
2109	adc	%rdx, $acc7
2110	 mov	$acc5, $acc1
2111	adc	\$0, $acc3
2112
2113	sub	\$-1, $acc4		# .Lpoly[0]
2114	 mov	$acc6, $acc2
2115	sbb	$a_ptr, $acc5		# .Lpoly[1]
2116	sbb	\$0, $acc6		# .Lpoly[2]
2117	 mov	$acc7, $t0
2118	sbb	$t1, $acc7		# .Lpoly[3]
2119	sbb	\$0, $acc3
2120
2121	cmovc	$acc0, $acc4
2122	cmovc	$acc1, $acc5
2123	mov	$acc4, 8*0($r_ptr)
2124	cmovc	$acc2, $acc6
2125	mov	$acc5, 8*1($r_ptr)
2126	cmovc	$t0, $acc7
2127	mov	$acc6, 8*2($r_ptr)
2128	mov	$acc7, 8*3($r_ptr)
2129
2130	ret
2131.cfi_endproc
2132.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2133___
2134
2135if ($addx) {
2136$code.=<<___;
2137.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
2138.align	32
2139__ecp_nistz256_mul_montx:
2140.cfi_startproc
2141	########################################################################
2142	# Multiply by b[0]
2143	mulx	$acc1, $acc0, $acc1
2144	mulx	$acc2, $t0, $acc2
2145	mov	\$32, $poly1
2146	xor	$acc5, $acc5		# cf=0
2147	mulx	$acc3, $t1, $acc3
2148	mov	.Lpoly+8*3(%rip), $poly3
2149	adc	$t0, $acc1
2150	mulx	$acc4, $t0, $acc4
2151	 mov	$acc0, %rdx
2152	adc	$t1, $acc2
2153	 shlx	$poly1,$acc0,$t1
2154	adc	$t0, $acc3
2155	 shrx	$poly1,$acc0,$t0
2156	adc	\$0, $acc4
2157
2158	########################################################################
2159	# First reduction step
2160	add	$t1, $acc1
2161	adc	$t0, $acc2
2162
2163	mulx	$poly3, $t0, $t1
2164	 mov	8*1($b_ptr), %rdx
2165	adc	$t0, $acc3
2166	adc	$t1, $acc4
2167	adc	\$0, $acc5
2168	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
2169
2170	########################################################################
2171	# Multiply by b[1]
2172	mulx	8*0+128($a_ptr), $t0, $t1
2173	adcx	$t0, $acc1
2174	adox	$t1, $acc2
2175
2176	mulx	8*1+128($a_ptr), $t0, $t1
2177	adcx	$t0, $acc2
2178	adox	$t1, $acc3
2179
2180	mulx	8*2+128($a_ptr), $t0, $t1
2181	adcx	$t0, $acc3
2182	adox	$t1, $acc4
2183
2184	mulx	8*3+128($a_ptr), $t0, $t1
2185	 mov	$acc1, %rdx
2186	adcx	$t0, $acc4
2187	 shlx	$poly1, $acc1, $t0
2188	adox	$t1, $acc5
2189	 shrx	$poly1, $acc1, $t1
2190
2191	adcx	$acc0, $acc5
2192	adox	$acc0, $acc0
2193	adc	\$0, $acc0
2194
2195	########################################################################
2196	# Second reduction step
2197	add	$t0, $acc2
2198	adc	$t1, $acc3
2199
2200	mulx	$poly3, $t0, $t1
2201	 mov	8*2($b_ptr), %rdx
2202	adc	$t0, $acc4
2203	adc	$t1, $acc5
2204	adc	\$0, $acc0
2205	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
2206
2207	########################################################################
2208	# Multiply by b[2]
2209	mulx	8*0+128($a_ptr), $t0, $t1
2210	adcx	$t0, $acc2
2211	adox	$t1, $acc3
2212
2213	mulx	8*1+128($a_ptr), $t0, $t1
2214	adcx	$t0, $acc3
2215	adox	$t1, $acc4
2216
2217	mulx	8*2+128($a_ptr), $t0, $t1
2218	adcx	$t0, $acc4
2219	adox	$t1, $acc5
2220
2221	mulx	8*3+128($a_ptr), $t0, $t1
2222	 mov	$acc2, %rdx
2223	adcx	$t0, $acc5
2224	 shlx	$poly1, $acc2, $t0
2225	adox	$t1, $acc0
2226	 shrx	$poly1, $acc2, $t1
2227
2228	adcx	$acc1, $acc0
2229	adox	$acc1, $acc1
2230	adc	\$0, $acc1
2231
2232	########################################################################
2233	# Third reduction step
2234	add	$t0, $acc3
2235	adc	$t1, $acc4
2236
2237	mulx	$poly3, $t0, $t1
2238	 mov	8*3($b_ptr), %rdx
2239	adc	$t0, $acc5
2240	adc	$t1, $acc0
2241	adc	\$0, $acc1
2242	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
2243
2244	########################################################################
2245	# Multiply by b[3]
2246	mulx	8*0+128($a_ptr), $t0, $t1
2247	adcx	$t0, $acc3
2248	adox	$t1, $acc4
2249
2250	mulx	8*1+128($a_ptr), $t0, $t1
2251	adcx	$t0, $acc4
2252	adox	$t1, $acc5
2253
2254	mulx	8*2+128($a_ptr), $t0, $t1
2255	adcx	$t0, $acc5
2256	adox	$t1, $acc0
2257
2258	mulx	8*3+128($a_ptr), $t0, $t1
2259	 mov	$acc3, %rdx
2260	adcx	$t0, $acc0
2261	 shlx	$poly1, $acc3, $t0
2262	adox	$t1, $acc1
2263	 shrx	$poly1, $acc3, $t1
2264
2265	adcx	$acc2, $acc1
2266	adox	$acc2, $acc2
2267	adc	\$0, $acc2
2268
2269	########################################################################
2270	# Fourth reduction step
2271	add	$t0, $acc4
2272	adc	$t1, $acc5
2273
2274	mulx	$poly3, $t0, $t1
2275	 mov	$acc4, $t2
2276	mov	.Lpoly+8*1(%rip), $poly1
2277	adc	$t0, $acc0
2278	 mov	$acc5, $t3
2279	adc	$t1, $acc1
2280	adc	\$0, $acc2
2281
2282	########################################################################
2283	# Branch-less conditional subtraction of P
2284	xor	%eax, %eax
2285	 mov	$acc0, $t0
2286	sbb	\$-1, $acc4		# .Lpoly[0]
2287	sbb	$poly1, $acc5		# .Lpoly[1]
2288	sbb	\$0, $acc0		# .Lpoly[2]
2289	 mov	$acc1, $t1
2290	sbb	$poly3, $acc1		# .Lpoly[3]
2291	sbb	\$0, $acc2
2292
2293	cmovc	$t2, $acc4
2294	cmovc	$t3, $acc5
2295	mov	$acc4, 8*0($r_ptr)
2296	cmovc	$t0, $acc0
2297	mov	$acc5, 8*1($r_ptr)
2298	cmovc	$t1, $acc1
2299	mov	$acc0, 8*2($r_ptr)
2300	mov	$acc1, 8*3($r_ptr)
2301
2302	ret
2303.cfi_endproc
2304.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2305
2306.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
2307.align	32
2308__ecp_nistz256_sqr_montx:
2309.cfi_startproc
2310	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
2311	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
2312	xor	%eax, %eax
2313	adc	$t0, $acc2
2314	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
2315	 mov	$acc6, %rdx
2316	adc	$t1, $acc3
2317	adc	\$0, $acc4
2318	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
2319
2320	#################################
2321	mulx	$acc7, $t0, $t1		# a[1]*a[2]
2322	adcx	$t0, $acc3
2323	adox	$t1, $acc4
2324
2325	mulx	$acc0, $t0, $t1		# a[1]*a[3]
2326	 mov	$acc7, %rdx
2327	adcx	$t0, $acc4
2328	adox	$t1, $acc5
2329	adc	\$0, $acc5
2330
2331	#################################
2332	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
2333	 mov	8*0+128($a_ptr), %rdx
2334	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
2335	 adcx	$acc1, $acc1		# acc1:6<<1
2336	adox	$t0, $acc5
2337	 adcx	$acc2, $acc2
2338	adox	$acc7, $acc6		# of=0
2339
2340	mulx	%rdx, $acc0, $t1
2341	mov	8*1+128($a_ptr), %rdx
2342	 adcx	$acc3, $acc3
2343	adox	$t1, $acc1
2344	 adcx	$acc4, $acc4
2345	mulx	%rdx, $t0, $t4
2346	mov	8*2+128($a_ptr), %rdx
2347	 adcx	$acc5, $acc5
2348	adox	$t0, $acc2
2349	 adcx	$acc6, $acc6
2350	.byte	0x67
2351	mulx	%rdx, $t0, $t1
2352	mov	8*3+128($a_ptr), %rdx
2353	adox	$t4, $acc3
2354	 adcx	$acc7, $acc7
2355	adox	$t0, $acc4
2356	 mov	\$32, $a_ptr
2357	adox	$t1, $acc5
2358	.byte	0x67,0x67
2359	mulx	%rdx, $t0, $t4
2360	 mov	.Lpoly+8*3(%rip), %rdx
2361	adox	$t0, $acc6
2362	 shlx	$a_ptr, $acc0, $t0
2363	adox	$t4, $acc7
2364	 shrx	$a_ptr, $acc0, $t4
2365	mov	%rdx,$t1
2366
2367	# reduction step 1
2368	add	$t0, $acc1
2369	adc	$t4, $acc2
2370
2371	mulx	$acc0, $t0, $acc0
2372	adc	$t0, $acc3
2373	 shlx	$a_ptr, $acc1, $t0
2374	adc	\$0, $acc0
2375	 shrx	$a_ptr, $acc1, $t4
2376
2377	# reduction step 2
2378	add	$t0, $acc2
2379	adc	$t4, $acc3
2380
2381	mulx	$acc1, $t0, $acc1
2382	adc	$t0, $acc0
2383	 shlx	$a_ptr, $acc2, $t0
2384	adc	\$0, $acc1
2385	 shrx	$a_ptr, $acc2, $t4
2386
2387	# reduction step 3
2388	add	$t0, $acc3
2389	adc	$t4, $acc0
2390
2391	mulx	$acc2, $t0, $acc2
2392	adc	$t0, $acc1
2393	 shlx	$a_ptr, $acc3, $t0
2394	adc	\$0, $acc2
2395	 shrx	$a_ptr, $acc3, $t4
2396
2397	# reduction step 4
2398	add	$t0, $acc0
2399	adc	$t4, $acc1
2400
2401	mulx	$acc3, $t0, $acc3
2402	adc	$t0, $acc2
2403	adc	\$0, $acc3
2404
2405	xor	$t3, $t3
2406	add	$acc0, $acc4		# accumulate upper half
2407	 mov	.Lpoly+8*1(%rip), $a_ptr
2408	adc	$acc1, $acc5
2409	 mov	$acc4, $acc0
2410	adc	$acc2, $acc6
2411	adc	$acc3, $acc7
2412	 mov	$acc5, $acc1
2413	adc	\$0, $t3
2414
2415	sub	\$-1, $acc4		# .Lpoly[0]
2416	 mov	$acc6, $acc2
2417	sbb	$a_ptr, $acc5		# .Lpoly[1]
2418	sbb	\$0, $acc6		# .Lpoly[2]
2419	 mov	$acc7, $acc3
2420	sbb	$t1, $acc7		# .Lpoly[3]
2421	sbb	\$0, $t3
2422
2423	cmovc	$acc0, $acc4
2424	cmovc	$acc1, $acc5
2425	mov	$acc4, 8*0($r_ptr)
2426	cmovc	$acc2, $acc6
2427	mov	$acc5, 8*1($r_ptr)
2428	cmovc	$acc3, $acc7
2429	mov	$acc6, 8*2($r_ptr)
2430	mov	$acc7, 8*3($r_ptr)
2431
2432	ret
2433.cfi_endproc
2434.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2435___
2436}
2437}
2438{
2439my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2440my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2441my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2442
2443$code.=<<___;
2444################################################################################
2445# void ecp_nistz256_from_mont(
2446#   uint64_t res[4],
2447#   uint64_t in[4]);
2448# This one performs Montgomery multiplication by 1, so we only need the reduction
2449
2450.globl	ecp_nistz256_from_mont
2451.type	ecp_nistz256_from_mont,\@function,2
2452.align	32
2453ecp_nistz256_from_mont:
2454.cfi_startproc
2455	push	%r12
2456.cfi_push	%r12
2457	push	%r13
2458.cfi_push	%r13
2459.Lfrom_body:
2460
2461	mov	8*0($in_ptr), %rax
2462	mov	.Lpoly+8*3(%rip), $t2
2463	mov	8*1($in_ptr), $acc1
2464	mov	8*2($in_ptr), $acc2
2465	mov	8*3($in_ptr), $acc3
2466	mov	%rax, $acc0
2467	mov	.Lpoly+8*1(%rip), $t1
2468
2469	#########################################
2470	# First iteration
2471	mov	%rax, $t0
2472	shl	\$32, $acc0
2473	mulq	$t2
2474	shr	\$32, $t0
2475	add	$acc0, $acc1
2476	adc	$t0, $acc2
2477	adc	%rax, $acc3
2478	 mov	$acc1, %rax
2479	adc	\$0, %rdx
2480
2481	#########################################
2482	# Second iteration
2483	mov	$acc1, $t0
2484	shl	\$32, $acc1
2485	mov	%rdx, $acc0
2486	mulq	$t2
2487	shr	\$32, $t0
2488	add	$acc1, $acc2
2489	adc	$t0, $acc3
2490	adc	%rax, $acc0
2491	 mov	$acc2, %rax
2492	adc	\$0, %rdx
2493
2494	##########################################
2495	# Third iteration
2496	mov	$acc2, $t0
2497	shl	\$32, $acc2
2498	mov	%rdx, $acc1
2499	mulq	$t2
2500	shr	\$32, $t0
2501	add	$acc2, $acc3
2502	adc	$t0, $acc0
2503	adc	%rax, $acc1
2504	 mov	$acc3, %rax
2505	adc	\$0, %rdx
2506
2507	###########################################
2508	# Last iteration
2509	mov	$acc3, $t0
2510	shl	\$32, $acc3
2511	mov	%rdx, $acc2
2512	mulq	$t2
2513	shr	\$32, $t0
2514	add	$acc3, $acc0
2515	adc	$t0, $acc1
2516	 mov	$acc0, $t0
2517	adc	%rax, $acc2
2518	 mov	$acc1, $in_ptr
2519	adc	\$0, %rdx
2520
2521	###########################################
2522	# Branch-less conditional subtraction
2523	sub	\$-1, $acc0
2524	 mov	$acc2, %rax
2525	sbb	$t1, $acc1
2526	sbb	\$0, $acc2
2527	 mov	%rdx, $acc3
2528	sbb	$t2, %rdx
2529	sbb	$t2, $t2
2530
2531	cmovnz	$t0, $acc0
2532	cmovnz	$in_ptr, $acc1
2533	mov	$acc0, 8*0($r_ptr)
2534	cmovnz	%rax, $acc2
2535	mov	$acc1, 8*1($r_ptr)
2536	cmovz	%rdx, $acc3
2537	mov	$acc2, 8*2($r_ptr)
2538	mov	$acc3, 8*3($r_ptr)
2539
2540	mov	0(%rsp),%r13
2541.cfi_restore	%r13
2542	mov	8(%rsp),%r12
2543.cfi_restore	%r12
2544	lea	16(%rsp),%rsp
2545.cfi_adjust_cfa_offset	-16
2546.Lfrom_epilogue:
2547	ret
2548.cfi_endproc
2549.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2550___
2551}
2552{
2553my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2554my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2555my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2556my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2557
2558$code.=<<___;
2559################################################################################
2560# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
2561.globl	ecp_nistz256_scatter_w5
2562.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
2563.align	32
2564ecp_nistz256_scatter_w5:
2565	lea	-3($index,$index,2), $index
2566	movdqa	0x00($in_t), %xmm0
2567	shl	\$5, $index
2568	movdqa	0x10($in_t), %xmm1
2569	movdqa	0x20($in_t), %xmm2
2570	movdqa	0x30($in_t), %xmm3
2571	movdqa	0x40($in_t), %xmm4
2572	movdqa	0x50($in_t), %xmm5
2573	movdqa	%xmm0, 0x00($val,$index)
2574	movdqa	%xmm1, 0x10($val,$index)
2575	movdqa	%xmm2, 0x20($val,$index)
2576	movdqa	%xmm3, 0x30($val,$index)
2577	movdqa	%xmm4, 0x40($val,$index)
2578	movdqa	%xmm5, 0x50($val,$index)
2579
2580	ret
2581.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2582
2583################################################################################
2584# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2585.globl	ecp_nistz256_gather_w5
2586.type	ecp_nistz256_gather_w5,\@abi-omnipotent
2587.align	32
2588ecp_nistz256_gather_w5:
2589.cfi_startproc
2590___
2591$code.=<<___	if ($avx>1);
2592	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2593	test	\$`1<<5`, %eax
2594	jnz	.Lavx2_gather_w5
2595___
2596$code.=<<___	if ($win64);
2597	lea	-0x88(%rsp), %rax
2598.LSEH_begin_ecp_nistz256_gather_w5:
2599	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2600	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2601	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2602	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2603	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2604	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2605	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2606	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2607	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2608	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2609	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2610___
2611$code.=<<___;
2612	movdqa	.LOne(%rip), $ONE
2613	movd	$index, $INDEX
2614
2615	pxor	$Ra, $Ra
2616	pxor	$Rb, $Rb
2617	pxor	$Rc, $Rc
2618	pxor	$Rd, $Rd
2619	pxor	$Re, $Re
2620	pxor	$Rf, $Rf
2621
2622	movdqa	$ONE, $M0
2623	pshufd	\$0, $INDEX, $INDEX
2624
2625	mov	\$16, %rax
2626.Lselect_loop_sse_w5:
2627
2628	movdqa	$M0, $TMP0
2629	paddd	$ONE, $M0
2630	pcmpeqd $INDEX, $TMP0
2631
2632	movdqa	16*0($in_t), $T0a
2633	movdqa	16*1($in_t), $T0b
2634	movdqa	16*2($in_t), $T0c
2635	movdqa	16*3($in_t), $T0d
2636	movdqa	16*4($in_t), $T0e
2637	movdqa	16*5($in_t), $T0f
2638	lea 16*6($in_t), $in_t
2639
2640	pand	$TMP0, $T0a
2641	pand	$TMP0, $T0b
2642	por	$T0a, $Ra
2643	pand	$TMP0, $T0c
2644	por	$T0b, $Rb
2645	pand	$TMP0, $T0d
2646	por	$T0c, $Rc
2647	pand	$TMP0, $T0e
2648	por	$T0d, $Rd
2649	pand	$TMP0, $T0f
2650	por	$T0e, $Re
2651	por	$T0f, $Rf
2652
2653	dec	%rax
2654	jnz	.Lselect_loop_sse_w5
2655
2656	movdqu	$Ra, 16*0($val)
2657	movdqu	$Rb, 16*1($val)
2658	movdqu	$Rc, 16*2($val)
2659	movdqu	$Rd, 16*3($val)
2660	movdqu	$Re, 16*4($val)
2661	movdqu	$Rf, 16*5($val)
2662___
2663$code.=<<___	if ($win64);
2664	movaps	(%rsp), %xmm6
2665	movaps	0x10(%rsp), %xmm7
2666	movaps	0x20(%rsp), %xmm8
2667	movaps	0x30(%rsp), %xmm9
2668	movaps	0x40(%rsp), %xmm10
2669	movaps	0x50(%rsp), %xmm11
2670	movaps	0x60(%rsp), %xmm12
2671	movaps	0x70(%rsp), %xmm13
2672	movaps	0x80(%rsp), %xmm14
2673	movaps	0x90(%rsp), %xmm15
2674	lea	0xa8(%rsp), %rsp
2675___
2676$code.=<<___;
2677	ret
2678.cfi_endproc
2679.LSEH_end_ecp_nistz256_gather_w5:
2680.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2681
2682################################################################################
2683# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2684.globl	ecp_nistz256_scatter_w7
2685.type	ecp_nistz256_scatter_w7,\@abi-omnipotent
2686.align	32
2687ecp_nistz256_scatter_w7:
2688	movdqu	0x00($in_t), %xmm0
2689	shl	\$6, $index
2690	movdqu	0x10($in_t), %xmm1
2691	movdqu	0x20($in_t), %xmm2
2692	movdqu	0x30($in_t), %xmm3
2693	movdqa	%xmm0, 0x00($val,$index)
2694	movdqa	%xmm1, 0x10($val,$index)
2695	movdqa	%xmm2, 0x20($val,$index)
2696	movdqa	%xmm3, 0x30($val,$index)
2697
2698	ret
2699.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2700
2701################################################################################
2702# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2703.globl	ecp_nistz256_gather_w7
2704.type	ecp_nistz256_gather_w7,\@abi-omnipotent
2705.align	32
2706ecp_nistz256_gather_w7:
2707.cfi_startproc
2708___
2709$code.=<<___	if ($avx>1);
2710	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2711	test	\$`1<<5`, %eax
2712	jnz	.Lavx2_gather_w7
2713___
2714$code.=<<___	if ($win64);
2715	lea	-0x88(%rsp), %rax
2716.LSEH_begin_ecp_nistz256_gather_w7:
2717	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2718	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2719	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2720	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2721	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2722	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2723	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2724	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2725	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2726	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2727	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2728___
2729$code.=<<___;
2730	movdqa	.LOne(%rip), $M0
2731	movd	$index, $INDEX
2732
2733	pxor	$Ra, $Ra
2734	pxor	$Rb, $Rb
2735	pxor	$Rc, $Rc
2736	pxor	$Rd, $Rd
2737
2738	movdqa	$M0, $ONE
2739	pshufd	\$0, $INDEX, $INDEX
2740	mov	\$64, %rax
2741
2742.Lselect_loop_sse_w7:
2743	movdqa	$M0, $TMP0
2744	paddd	$ONE, $M0
2745	movdqa	16*0($in_t), $T0a
2746	movdqa	16*1($in_t), $T0b
2747	pcmpeqd	$INDEX, $TMP0
2748	movdqa	16*2($in_t), $T0c
2749	movdqa	16*3($in_t), $T0d
2750	lea	16*4($in_t), $in_t
2751
2752	pand	$TMP0, $T0a
2753	pand	$TMP0, $T0b
2754	por	$T0a, $Ra
2755	pand	$TMP0, $T0c
2756	por	$T0b, $Rb
2757	pand	$TMP0, $T0d
2758	por	$T0c, $Rc
2759	prefetcht0	255($in_t)
2760	por	$T0d, $Rd
2761
2762	dec	%rax
2763	jnz	.Lselect_loop_sse_w7
2764
2765	movdqu	$Ra, 16*0($val)
2766	movdqu	$Rb, 16*1($val)
2767	movdqu	$Rc, 16*2($val)
2768	movdqu	$Rd, 16*3($val)
2769___
2770$code.=<<___	if ($win64);
2771	movaps	(%rsp), %xmm6
2772	movaps	0x10(%rsp), %xmm7
2773	movaps	0x20(%rsp), %xmm8
2774	movaps	0x30(%rsp), %xmm9
2775	movaps	0x40(%rsp), %xmm10
2776	movaps	0x50(%rsp), %xmm11
2777	movaps	0x60(%rsp), %xmm12
2778	movaps	0x70(%rsp), %xmm13
2779	movaps	0x80(%rsp), %xmm14
2780	movaps	0x90(%rsp), %xmm15
2781	lea	0xa8(%rsp), %rsp
2782___
2783$code.=<<___;
2784	ret
2785.cfi_endproc
2786.LSEH_end_ecp_nistz256_gather_w7:
2787.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2788___
2789}
2790if ($avx>1) {
2791my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2792my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2793my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2794my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2795
2796$code.=<<___;
2797################################################################################
2798# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2799.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2800.align	32
2801ecp_nistz256_avx2_gather_w5:
2802.cfi_startproc
2803.Lavx2_gather_w5:
2804	vzeroupper
2805___
2806$code.=<<___	if ($win64);
2807	lea	-0x88(%rsp), %rax
2808	mov	%rsp,%r11
2809.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2810	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2811	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2812	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2813	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 8(%rax)
2814	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2815	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2816	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2817	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2818	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2819	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2820	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2821___
2822$code.=<<___;
2823	vmovdqa	.LTwo(%rip), $TWO
2824
2825	vpxor	$Ra, $Ra, $Ra
2826	vpxor	$Rb, $Rb, $Rb
2827	vpxor	$Rc, $Rc, $Rc
2828
2829	vmovdqa .LOne(%rip), $M0
2830	vmovdqa .LTwo(%rip), $M1
2831
2832	vmovd	$index, %xmm1
2833	vpermd	$INDEX, $Ra, $INDEX
2834
2835	mov	\$8, %rax
2836.Lselect_loop_avx2_w5:
2837
2838	vmovdqa	32*0($in_t), $T0a
2839	vmovdqa	32*1($in_t), $T0b
2840	vmovdqa	32*2($in_t), $T0c
2841
2842	vmovdqa	32*3($in_t), $T1a
2843	vmovdqa	32*4($in_t), $T1b
2844	vmovdqa	32*5($in_t), $T1c
2845
2846	vpcmpeqd	$INDEX, $M0, $TMP0
2847	vpcmpeqd	$INDEX, $M1, $TMP1
2848
2849	vpaddd	$TWO, $M0, $M0
2850	vpaddd	$TWO, $M1, $M1
2851	lea	32*6($in_t), $in_t
2852
2853	vpand	$TMP0, $T0a, $T0a
2854	vpand	$TMP0, $T0b, $T0b
2855	vpand	$TMP0, $T0c, $T0c
2856	vpand	$TMP1, $T1a, $T1a
2857	vpand	$TMP1, $T1b, $T1b
2858	vpand	$TMP1, $T1c, $T1c
2859
2860	vpxor	$T0a, $Ra, $Ra
2861	vpxor	$T0b, $Rb, $Rb
2862	vpxor	$T0c, $Rc, $Rc
2863	vpxor	$T1a, $Ra, $Ra
2864	vpxor	$T1b, $Rb, $Rb
2865	vpxor	$T1c, $Rc, $Rc
2866
2867	dec %rax
2868	jnz .Lselect_loop_avx2_w5
2869
2870	vmovdqu $Ra, 32*0($val)
2871	vmovdqu $Rb, 32*1($val)
2872	vmovdqu $Rc, 32*2($val)
2873	vzeroupper
2874___
2875$code.=<<___	if ($win64);
2876	movaps	(%rsp), %xmm6
2877	movaps	0x10(%rsp), %xmm7
2878	movaps	0x20(%rsp), %xmm8
2879	movaps	0x30(%rsp), %xmm9
2880	movaps	0x40(%rsp), %xmm10
2881	movaps	0x50(%rsp), %xmm11
2882	movaps	0x60(%rsp), %xmm12
2883	movaps	0x70(%rsp), %xmm13
2884	movaps	0x80(%rsp), %xmm14
2885	movaps	0x90(%rsp), %xmm15
2886	lea	(%r11), %rsp
2887___
2888$code.=<<___;
2889	ret
2890.cfi_endproc
2891.LSEH_end_ecp_nistz256_avx2_gather_w5:
2892.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2893___
2894}
2895if ($avx>1) {
2896my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2897my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2898my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2899my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2900my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2901
2902$code.=<<___;
2903
2904################################################################################
2905# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2906.globl	ecp_nistz256_avx2_gather_w7
2907.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2908.align	32
2909ecp_nistz256_avx2_gather_w7:
2910.cfi_startproc
2911.Lavx2_gather_w7:
2912	vzeroupper
2913___
2914$code.=<<___	if ($win64);
2915	mov	%rsp,%r11
2916	lea	-0x88(%rsp), %rax
2917.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2918	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2919	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2920	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2921	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 8(%rax)
2922	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2923	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2924	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2925	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2926	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2927	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2928	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2929___
2930$code.=<<___;
2931	vmovdqa	.LThree(%rip), $THREE
2932
2933	vpxor	$Ra, $Ra, $Ra
2934	vpxor	$Rb, $Rb, $Rb
2935
2936	vmovdqa .LOne(%rip), $M0
2937	vmovdqa .LTwo(%rip), $M1
2938	vmovdqa .LThree(%rip), $M2
2939
2940	vmovd	$index, %xmm1
2941	vpermd	$INDEX, $Ra, $INDEX
2942	# Skip index = 0, because it is implicitly the point at infinity
2943
2944	mov	\$21, %rax
2945.Lselect_loop_avx2_w7:
2946
2947	vmovdqa	32*0($in_t), $T0a
2948	vmovdqa	32*1($in_t), $T0b
2949
2950	vmovdqa	32*2($in_t), $T1a
2951	vmovdqa	32*3($in_t), $T1b
2952
2953	vmovdqa	32*4($in_t), $T2a
2954	vmovdqa	32*5($in_t), $T2b
2955
2956	vpcmpeqd	$INDEX, $M0, $TMP0
2957	vpcmpeqd	$INDEX, $M1, $TMP1
2958	vpcmpeqd	$INDEX, $M2, $TMP2
2959
2960	vpaddd	$THREE, $M0, $M0
2961	vpaddd	$THREE, $M1, $M1
2962	vpaddd	$THREE, $M2, $M2
2963	lea	32*6($in_t), $in_t
2964
2965	vpand	$TMP0, $T0a, $T0a
2966	vpand	$TMP0, $T0b, $T0b
2967	vpand	$TMP1, $T1a, $T1a
2968	vpand	$TMP1, $T1b, $T1b
2969	vpand	$TMP2, $T2a, $T2a
2970	vpand	$TMP2, $T2b, $T2b
2971
2972	vpxor	$T0a, $Ra, $Ra
2973	vpxor	$T0b, $Rb, $Rb
2974	vpxor	$T1a, $Ra, $Ra
2975	vpxor	$T1b, $Rb, $Rb
2976	vpxor	$T2a, $Ra, $Ra
2977	vpxor	$T2b, $Rb, $Rb
2978
2979	dec %rax
2980	jnz .Lselect_loop_avx2_w7
2981
2982
2983	vmovdqa	32*0($in_t), $T0a
2984	vmovdqa	32*1($in_t), $T0b
2985
2986	vpcmpeqd	$INDEX, $M0, $TMP0
2987
2988	vpand	$TMP0, $T0a, $T0a
2989	vpand	$TMP0, $T0b, $T0b
2990
2991	vpxor	$T0a, $Ra, $Ra
2992	vpxor	$T0b, $Rb, $Rb
2993
2994	vmovdqu $Ra, 32*0($val)
2995	vmovdqu $Rb, 32*1($val)
2996	vzeroupper
2997___
2998$code.=<<___	if ($win64);
2999	movaps	(%rsp), %xmm6
3000	movaps	0x10(%rsp), %xmm7
3001	movaps	0x20(%rsp), %xmm8
3002	movaps	0x30(%rsp), %xmm9
3003	movaps	0x40(%rsp), %xmm10
3004	movaps	0x50(%rsp), %xmm11
3005	movaps	0x60(%rsp), %xmm12
3006	movaps	0x70(%rsp), %xmm13
3007	movaps	0x80(%rsp), %xmm14
3008	movaps	0x90(%rsp), %xmm15
3009	lea	(%r11), %rsp
3010___
3011$code.=<<___;
3012	ret
3013.cfi_endproc
3014.LSEH_end_ecp_nistz256_avx2_gather_w7:
3015.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3016___
3017} else {
3018$code.=<<___;
3019.globl	ecp_nistz256_avx2_gather_w7
3020.type	ecp_nistz256_avx2_gather_w7,\@function,3
3021.align	32
3022ecp_nistz256_avx2_gather_w7:
3023	.byte	0x0f,0x0b	# ud2
3024	ret
3025.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3026___
3027}
3028{{{
3029########################################################################
3030# This block implements higher level point_double, point_add and
3031# point_add_affine. The key to performance in this case is to allow
3032# out-of-order execution logic to overlap computations from next step
3033# with tail processing from current step. By using tailored calling
3034# sequence we minimize inter-step overhead to give processor better
3035# shot at overlapping operations...
3036#
3037# You will notice that input data is copied to stack. Trouble is that
3038# there are no registers to spare for holding original pointers and
3039# reloading them, pointers, would create undesired dependencies on
3040# effective addresses calculation paths. In other words it's too done
3041# to favour out-of-order execution logic.
3042#						<appro@openssl.org>
3043
3044my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3045my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3046my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3047my ($poly1,$poly3)=($acc6,$acc7);
3048
3049sub load_for_mul () {
3050my ($a,$b,$src0) = @_;
3051my $bias = $src0 eq "%rax" ? 0 : -128;
3052
3053"	mov	$b, $src0
3054	lea	$b, $b_ptr
3055	mov	8*0+$a, $acc1
3056	mov	8*1+$a, $acc2
3057	lea	$bias+$a, $a_ptr
3058	mov	8*2+$a, $acc3
3059	mov	8*3+$a, $acc4"
3060}
3061
3062sub load_for_sqr () {
3063my ($a,$src0) = @_;
3064my $bias = $src0 eq "%rax" ? 0 : -128;
3065
3066"	mov	8*0+$a, $src0
3067	mov	8*1+$a, $acc6
3068	lea	$bias+$a, $a_ptr
3069	mov	8*2+$a, $acc7
3070	mov	8*3+$a, $acc0"
3071}
3072
3073									{
3074########################################################################
3075# operate in 4-5-0-1 "name space" that matches multiplication output
3076#
3077my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3078
3079$code.=<<___;
3080.type	__ecp_nistz256_add_toq,\@abi-omnipotent
3081.align	32
3082__ecp_nistz256_add_toq:
3083.cfi_startproc
3084	xor	$t4,$t4
3085	add	8*0($b_ptr), $a0
3086	adc	8*1($b_ptr), $a1
3087	 mov	$a0, $t0
3088	adc	8*2($b_ptr), $a2
3089	adc	8*3($b_ptr), $a3
3090	 mov	$a1, $t1
3091	adc	\$0, $t4
3092
3093	sub	\$-1, $a0
3094	 mov	$a2, $t2
3095	sbb	$poly1, $a1
3096	sbb	\$0, $a2
3097	 mov	$a3, $t3
3098	sbb	$poly3, $a3
3099	sbb	\$0, $t4
3100
3101	cmovc	$t0, $a0
3102	cmovc	$t1, $a1
3103	mov	$a0, 8*0($r_ptr)
3104	cmovc	$t2, $a2
3105	mov	$a1, 8*1($r_ptr)
3106	cmovc	$t3, $a3
3107	mov	$a2, 8*2($r_ptr)
3108	mov	$a3, 8*3($r_ptr)
3109
3110	ret
3111.cfi_endproc
3112.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3113
3114.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
3115.align	32
3116__ecp_nistz256_sub_fromq:
3117.cfi_startproc
3118	sub	8*0($b_ptr), $a0
3119	sbb	8*1($b_ptr), $a1
3120	 mov	$a0, $t0
3121	sbb	8*2($b_ptr), $a2
3122	sbb	8*3($b_ptr), $a3
3123	 mov	$a1, $t1
3124	sbb	$t4, $t4
3125
3126	add	\$-1, $a0
3127	 mov	$a2, $t2
3128	adc	$poly1, $a1
3129	adc	\$0, $a2
3130	 mov	$a3, $t3
3131	adc	$poly3, $a3
3132	test	$t4, $t4
3133
3134	cmovz	$t0, $a0
3135	cmovz	$t1, $a1
3136	mov	$a0, 8*0($r_ptr)
3137	cmovz	$t2, $a2
3138	mov	$a1, 8*1($r_ptr)
3139	cmovz	$t3, $a3
3140	mov	$a2, 8*2($r_ptr)
3141	mov	$a3, 8*3($r_ptr)
3142
3143	ret
3144.cfi_endproc
3145.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3146
3147.type	__ecp_nistz256_subq,\@abi-omnipotent
3148.align	32
3149__ecp_nistz256_subq:
3150.cfi_startproc
3151	sub	$a0, $t0
3152	sbb	$a1, $t1
3153	 mov	$t0, $a0
3154	sbb	$a2, $t2
3155	sbb	$a3, $t3
3156	 mov	$t1, $a1
3157	sbb	$t4, $t4
3158
3159	add	\$-1, $t0
3160	 mov	$t2, $a2
3161	adc	$poly1, $t1
3162	adc	\$0, $t2
3163	 mov	$t3, $a3
3164	adc	$poly3, $t3
3165	test	$t4, $t4
3166
3167	cmovnz	$t0, $a0
3168	cmovnz	$t1, $a1
3169	cmovnz	$t2, $a2
3170	cmovnz	$t3, $a3
3171
3172	ret
3173.cfi_endproc
3174.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
3175
3176.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
3177.align	32
3178__ecp_nistz256_mul_by_2q:
3179.cfi_startproc
3180	xor	$t4, $t4
3181	add	$a0, $a0		# a0:a3+a0:a3
3182	adc	$a1, $a1
3183	 mov	$a0, $t0
3184	adc	$a2, $a2
3185	adc	$a3, $a3
3186	 mov	$a1, $t1
3187	adc	\$0, $t4
3188
3189	sub	\$-1, $a0
3190	 mov	$a2, $t2
3191	sbb	$poly1, $a1
3192	sbb	\$0, $a2
3193	 mov	$a3, $t3
3194	sbb	$poly3, $a3
3195	sbb	\$0, $t4
3196
3197	cmovc	$t0, $a0
3198	cmovc	$t1, $a1
3199	mov	$a0, 8*0($r_ptr)
3200	cmovc	$t2, $a2
3201	mov	$a1, 8*1($r_ptr)
3202	cmovc	$t3, $a3
3203	mov	$a2, 8*2($r_ptr)
3204	mov	$a3, 8*3($r_ptr)
3205
3206	ret
3207.cfi_endproc
3208.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3209___
3210									}
3211sub gen_double () {
3212    my $x = shift;
3213    my ($src0,$sfx,$bias);
3214    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
3215
3216    if ($x ne "x") {
3217	$src0 = "%rax";
3218	$sfx  = "";
3219	$bias = 0;
3220
3221$code.=<<___;
3222.globl	ecp_nistz256_point_double
3223.type	ecp_nistz256_point_double,\@function,2
3224.align	32
3225ecp_nistz256_point_double:
3226.cfi_startproc
3227___
3228$code.=<<___	if ($addx);
3229	mov	\$0x80100, %ecx
3230	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3231	cmp	\$0x80100, %ecx
3232	je	.Lpoint_doublex
3233___
3234    } else {
3235	$src0 = "%rdx";
3236	$sfx  = "x";
3237	$bias = 128;
3238
3239$code.=<<___;
3240.type	ecp_nistz256_point_doublex,\@function,2
3241.align	32
3242ecp_nistz256_point_doublex:
3243.cfi_startproc
3244.Lpoint_doublex:
3245___
3246    }
3247$code.=<<___;
3248	push	%rbp
3249.cfi_push	%rbp
3250	push	%rbx
3251.cfi_push	%rbx
3252	push	%r12
3253.cfi_push	%r12
3254	push	%r13
3255.cfi_push	%r13
3256	push	%r14
3257.cfi_push	%r14
3258	push	%r15
3259.cfi_push	%r15
3260	sub	\$32*5+8, %rsp
3261.cfi_adjust_cfa_offset	32*5+8
3262.Lpoint_double${x}_body:
3263
3264.Lpoint_double_shortcut$x:
3265	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
3266	mov	$a_ptr, $b_ptr			# backup copy
3267	movdqu	0x10($a_ptr), %xmm1
3268	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
3269	 mov	0x20+8*1($a_ptr), $acc5
3270	 mov	0x20+8*2($a_ptr), $acc0
3271	 mov	0x20+8*3($a_ptr), $acc1
3272	 mov	.Lpoly+8*1(%rip), $poly1
3273	 mov	.Lpoly+8*3(%rip), $poly3
3274	movdqa	%xmm0, $in_x(%rsp)
3275	movdqa	%xmm1, $in_x+0x10(%rsp)
3276	lea	0x20($r_ptr), $acc2
3277	lea	0x40($r_ptr), $acc3
3278	movq	$r_ptr, %xmm0
3279	movq	$acc2, %xmm1
3280	movq	$acc3, %xmm2
3281
3282	lea	$S(%rsp), $r_ptr
3283	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
3284
3285	mov	0x40+8*0($a_ptr), $src0
3286	mov	0x40+8*1($a_ptr), $acc6
3287	mov	0x40+8*2($a_ptr), $acc7
3288	mov	0x40+8*3($a_ptr), $acc0
3289	lea	0x40-$bias($a_ptr), $a_ptr
3290	lea	$Zsqr(%rsp), $r_ptr
3291	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
3292
3293	`&load_for_sqr("$S(%rsp)", "$src0")`
3294	lea	$S(%rsp), $r_ptr
3295	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
3296
3297	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
3298	mov	0x40+8*0($b_ptr), $acc1
3299	mov	0x40+8*1($b_ptr), $acc2
3300	mov	0x40+8*2($b_ptr), $acc3
3301	mov	0x40+8*3($b_ptr), $acc4
3302	lea	0x40-$bias($b_ptr), $a_ptr
3303	lea	0x20($b_ptr), $b_ptr
3304	movq	%xmm2, $r_ptr
3305	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
3306	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
3307
3308	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3309	mov	$in_x+8*1(%rsp), $acc5
3310	lea	$Zsqr(%rsp), $b_ptr
3311	mov	$in_x+8*2(%rsp), $acc0
3312	mov	$in_x+8*3(%rsp), $acc1
3313	lea	$M(%rsp), $r_ptr
3314	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
3315
3316	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3317	mov	$in_x+8*1(%rsp), $acc5
3318	lea	$Zsqr(%rsp), $b_ptr
3319	mov	$in_x+8*2(%rsp), $acc0
3320	mov	$in_x+8*3(%rsp), $acc1
3321	lea	$Zsqr(%rsp), $r_ptr
3322	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
3323
3324	`&load_for_sqr("$S(%rsp)", "$src0")`
3325	movq	%xmm1, $r_ptr
3326	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
3327___
3328{
3329######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3330# operate in 4-5-6-7 "name space" that matches squaring output
3331#
3332my ($poly1,$poly3)=($a_ptr,$t1);
3333my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
3334
3335$code.=<<___;
3336	xor	$t4, $t4
3337	mov	$a0, $t0
3338	add	\$-1, $a0
3339	mov	$a1, $t1
3340	adc	$poly1, $a1
3341	mov	$a2, $t2
3342	adc	\$0, $a2
3343	mov	$a3, $t3
3344	adc	$poly3, $a3
3345	adc	\$0, $t4
3346	xor	$a_ptr, $a_ptr		# borrow $a_ptr
3347	test	\$1, $t0
3348
3349	cmovz	$t0, $a0
3350	cmovz	$t1, $a1
3351	cmovz	$t2, $a2
3352	cmovz	$t3, $a3
3353	cmovz	$a_ptr, $t4
3354
3355	mov	$a1, $t0		# a0:a3>>1
3356	shr	\$1, $a0
3357	shl	\$63, $t0
3358	mov	$a2, $t1
3359	shr	\$1, $a1
3360	or	$t0, $a0
3361	shl	\$63, $t1
3362	mov	$a3, $t2
3363	shr	\$1, $a2
3364	or	$t1, $a1
3365	shl	\$63, $t2
3366	mov	$a0, 8*0($r_ptr)
3367	shr	\$1, $a3
3368	mov	$a1, 8*1($r_ptr)
3369	shl	\$63, $t4
3370	or	$t2, $a2
3371	or	$t4, $a3
3372	mov	$a2, 8*2($r_ptr)
3373	mov	$a3, 8*3($r_ptr)
3374___
3375}
3376$code.=<<___;
3377	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3378	lea	$M(%rsp), $r_ptr
3379	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
3380
3381	lea	$tmp0(%rsp), $r_ptr
3382	call	__ecp_nistz256_mul_by_2$x
3383
3384	lea	$M(%rsp), $b_ptr
3385	lea	$M(%rsp), $r_ptr
3386	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
3387
3388	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3389	lea	$S(%rsp), $r_ptr
3390	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
3391
3392	lea	$tmp0(%rsp), $r_ptr
3393	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
3394
3395	`&load_for_sqr("$M(%rsp)", "$src0")`
3396	movq	%xmm0, $r_ptr
3397	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
3398
3399	lea	$tmp0(%rsp), $b_ptr
3400	mov	$acc6, $acc0			# harmonize sqr output and sub input
3401	mov	$acc7, $acc1
3402	mov	$a_ptr, $poly1
3403	mov	$t1, $poly3
3404	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
3405
3406	mov	$S+8*0(%rsp), $t0
3407	mov	$S+8*1(%rsp), $t1
3408	mov	$S+8*2(%rsp), $t2
3409	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
3410	lea	$S(%rsp), $r_ptr
3411	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
3412
3413	mov	$M(%rsp), $src0
3414	lea	$M(%rsp), $b_ptr
3415	mov	$acc4, $acc6			# harmonize sub output and mul input
3416	xor	%ecx, %ecx
3417	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
3418	mov	$acc5, $acc2
3419	mov	$acc5, $S+8*1(%rsp)
3420	cmovz	$acc0, $acc3
3421	mov	$acc0, $S+8*2(%rsp)
3422	lea	$S-$bias(%rsp), $a_ptr
3423	cmovz	$acc1, $acc4
3424	mov	$acc1, $S+8*3(%rsp)
3425	mov	$acc6, $acc1
3426	lea	$S(%rsp), $r_ptr
3427	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
3428
3429	movq	%xmm1, $b_ptr
3430	movq	%xmm1, $r_ptr
3431	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
3432
3433	lea	32*5+56(%rsp), %rsi
3434.cfi_def_cfa	%rsi,8
3435	mov	-48(%rsi),%r15
3436.cfi_restore	%r15
3437	mov	-40(%rsi),%r14
3438.cfi_restore	%r14
3439	mov	-32(%rsi),%r13
3440.cfi_restore	%r13
3441	mov	-24(%rsi),%r12
3442.cfi_restore	%r12
3443	mov	-16(%rsi),%rbx
3444.cfi_restore	%rbx
3445	mov	-8(%rsi),%rbp
3446.cfi_restore	%rbp
3447	lea	(%rsi),%rsp
3448.cfi_def_cfa_register	%rsp
3449.Lpoint_double${x}_epilogue:
3450	ret
3451.cfi_endproc
3452.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3453___
3454}
3455&gen_double("q");
3456
3457sub gen_add () {
3458    my $x = shift;
3459    my ($src0,$sfx,$bias);
3460    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3461	$U1,$U2,$S1,$S2,
3462	$res_x,$res_y,$res_z,
3463	$in1_x,$in1_y,$in1_z,
3464	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3465    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
3466
3467    if ($x ne "x") {
3468	$src0 = "%rax";
3469	$sfx  = "";
3470	$bias = 0;
3471
3472$code.=<<___;
3473.globl	ecp_nistz256_point_add
3474.type	ecp_nistz256_point_add,\@function,3
3475.align	32
3476ecp_nistz256_point_add:
3477.cfi_startproc
3478___
3479$code.=<<___	if ($addx);
3480	mov	\$0x80100, %ecx
3481	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3482	cmp	\$0x80100, %ecx
3483	je	.Lpoint_addx
3484___
3485    } else {
3486	$src0 = "%rdx";
3487	$sfx  = "x";
3488	$bias = 128;
3489
3490$code.=<<___;
3491.type	ecp_nistz256_point_addx,\@function,3
3492.align	32
3493ecp_nistz256_point_addx:
3494.cfi_startproc
3495.Lpoint_addx:
3496___
3497    }
3498$code.=<<___;
3499	push	%rbp
3500.cfi_push	%rbp
3501	push	%rbx
3502.cfi_push	%rbx
3503	push	%r12
3504.cfi_push	%r12
3505	push	%r13
3506.cfi_push	%r13
3507	push	%r14
3508.cfi_push	%r14
3509	push	%r15
3510.cfi_push	%r15
3511	sub	\$32*18+8, %rsp
3512.cfi_adjust_cfa_offset	32*18+8
3513.Lpoint_add${x}_body:
3514
3515	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3516	movdqu	0x10($a_ptr), %xmm1
3517	movdqu	0x20($a_ptr), %xmm2
3518	movdqu	0x30($a_ptr), %xmm3
3519	movdqu	0x40($a_ptr), %xmm4
3520	movdqu	0x50($a_ptr), %xmm5
3521	mov	$a_ptr, $b_ptr			# reassign
3522	mov	$b_org, $a_ptr			# reassign
3523	movdqa	%xmm0, $in1_x(%rsp)
3524	movdqa	%xmm1, $in1_x+0x10(%rsp)
3525	movdqa	%xmm2, $in1_y(%rsp)
3526	movdqa	%xmm3, $in1_y+0x10(%rsp)
3527	movdqa	%xmm4, $in1_z(%rsp)
3528	movdqa	%xmm5, $in1_z+0x10(%rsp)
3529	por	%xmm4, %xmm5
3530
3531	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3532	 pshufd	\$0xb1, %xmm5, %xmm3
3533	movdqu	0x10($a_ptr), %xmm1
3534	movdqu	0x20($a_ptr), %xmm2
3535	 por	%xmm3, %xmm5
3536	movdqu	0x30($a_ptr), %xmm3
3537	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3538	 mov	0x40+8*1($a_ptr), $acc6
3539	 mov	0x40+8*2($a_ptr), $acc7
3540	 mov	0x40+8*3($a_ptr), $acc0
3541	movdqa	%xmm0, $in2_x(%rsp)
3542	 pshufd	\$0x1e, %xmm5, %xmm4
3543	movdqa	%xmm1, $in2_x+0x10(%rsp)
3544	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3545	movdqu	0x50($a_ptr),%xmm1
3546	movdqa	%xmm2, $in2_y(%rsp)
3547	movdqa	%xmm3, $in2_y+0x10(%rsp)
3548	 por	%xmm4, %xmm5
3549	 pxor	%xmm4, %xmm4
3550	por	%xmm0, %xmm1
3551	 movq	$r_ptr, %xmm0			# save $r_ptr
3552
3553	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3554	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3555	 mov	$acc6, $in2_z+8*1(%rsp)
3556	 mov	$acc7, $in2_z+8*2(%rsp)
3557	 mov	$acc0, $in2_z+8*3(%rsp)
3558	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3559	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3560
3561	pcmpeqd	%xmm4, %xmm5
3562	pshufd	\$0xb1, %xmm1, %xmm4
3563	por	%xmm1, %xmm4
3564	pshufd	\$0, %xmm5, %xmm5		# in1infty
3565	pshufd	\$0x1e, %xmm4, %xmm3
3566	por	%xmm3, %xmm4
3567	pxor	%xmm3, %xmm3
3568	pcmpeqd	%xmm3, %xmm4
3569	pshufd	\$0, %xmm4, %xmm4		# in2infty
3570	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3571	 mov	0x40+8*1($b_ptr), $acc6
3572	 mov	0x40+8*2($b_ptr), $acc7
3573	 mov	0x40+8*3($b_ptr), $acc0
3574	movq	$b_ptr, %xmm1
3575
3576	lea	0x40-$bias($b_ptr), $a_ptr
3577	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3578	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3579
3580	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3581	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3582	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3583
3584	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3585	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3586	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3587
3588	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3589	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3590	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3591
3592	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3593	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3594	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3595
3596	lea	$S1(%rsp), $b_ptr
3597	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3598	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3599
3600	or	$acc5, $acc4			# see if result is zero
3601	movdqa	%xmm4, %xmm2
3602	or	$acc0, $acc4
3603	or	$acc1, $acc4
3604	por	%xmm5, %xmm2			# in1infty || in2infty
3605	movq	$acc4, %xmm3
3606
3607	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3608	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3609	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3610
3611	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3612	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3613	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3614
3615	lea	$U1(%rsp), $b_ptr
3616	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3617	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3618
3619	or	$acc5, $acc4			# see if result is zero
3620	or	$acc0, $acc4
3621	or	$acc1, $acc4
3622
3623	.byte	0x3e				# predict taken
3624	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
3625	movq	%xmm2, $acc0
3626	movq	%xmm3, $acc1
3627	test	$acc0, $acc0
3628	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
3629	test	$acc1, $acc1
3630	jz	.Ladd_double$x			# is_equal(S1,S2)?
3631
3632	movq	%xmm0, $r_ptr			# restore $r_ptr
3633	pxor	%xmm0, %xmm0
3634	movdqu	%xmm0, 0x00($r_ptr)
3635	movdqu	%xmm0, 0x10($r_ptr)
3636	movdqu	%xmm0, 0x20($r_ptr)
3637	movdqu	%xmm0, 0x30($r_ptr)
3638	movdqu	%xmm0, 0x40($r_ptr)
3639	movdqu	%xmm0, 0x50($r_ptr)
3640	jmp	.Ladd_done$x
3641
3642.align	32
3643.Ladd_double$x:
3644	movq	%xmm1, $a_ptr			# restore $a_ptr
3645	movq	%xmm0, $r_ptr			# restore $r_ptr
3646	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3647.cfi_adjust_cfa_offset	`-32*(18-5)`
3648	jmp	.Lpoint_double_shortcut$x
3649.cfi_adjust_cfa_offset	`32*(18-5)`
3650
3651.align	32
3652.Ladd_proceed$x:
3653	`&load_for_sqr("$R(%rsp)", "$src0")`
3654	lea	$Rsqr(%rsp), $r_ptr		# R^2
3655	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3656
3657	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3658	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3659	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3660
3661	`&load_for_sqr("$H(%rsp)", "$src0")`
3662	lea	$Hsqr(%rsp), $r_ptr		# H^2
3663	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3664
3665	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3666	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3667	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3668
3669	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3670	lea	$Hcub(%rsp), $r_ptr		# H^3
3671	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3672
3673	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3674	lea	$U2(%rsp), $r_ptr		# U1*H^2
3675	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3676___
3677{
3678#######################################################################
3679# operate in 4-5-0-1 "name space" that matches multiplication output
3680#
3681my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3682my ($poly1, $poly3)=($acc6,$acc7);
3683
3684$code.=<<___;
3685	#lea	$U2(%rsp), $a_ptr
3686	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3687	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3688
3689	xor	$t4, $t4
3690	add	$acc0, $acc0		# a0:a3+a0:a3
3691	lea	$Rsqr(%rsp), $a_ptr
3692	adc	$acc1, $acc1
3693	 mov	$acc0, $t0
3694	adc	$acc2, $acc2
3695	adc	$acc3, $acc3
3696	 mov	$acc1, $t1
3697	adc	\$0, $t4
3698
3699	sub	\$-1, $acc0
3700	 mov	$acc2, $t2
3701	sbb	$poly1, $acc1
3702	sbb	\$0, $acc2
3703	 mov	$acc3, $t3
3704	sbb	$poly3, $acc3
3705	sbb	\$0, $t4
3706
3707	cmovc	$t0, $acc0
3708	mov	8*0($a_ptr), $t0
3709	cmovc	$t1, $acc1
3710	mov	8*1($a_ptr), $t1
3711	cmovc	$t2, $acc2
3712	mov	8*2($a_ptr), $t2
3713	cmovc	$t3, $acc3
3714	mov	8*3($a_ptr), $t3
3715
3716	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3717
3718	lea	$Hcub(%rsp), $b_ptr
3719	lea	$res_x(%rsp), $r_ptr
3720	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3721
3722	mov	$U2+8*0(%rsp), $t0
3723	mov	$U2+8*1(%rsp), $t1
3724	mov	$U2+8*2(%rsp), $t2
3725	mov	$U2+8*3(%rsp), $t3
3726	lea	$res_y(%rsp), $r_ptr
3727
3728	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3729
3730	mov	$acc0, 8*0($r_ptr)		# save the result, as
3731	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
3732	mov	$acc2, 8*2($r_ptr)
3733	mov	$acc3, 8*3($r_ptr)
3734___
3735}
3736$code.=<<___;
3737	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3738	lea	$S2(%rsp), $r_ptr
3739	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3740
3741	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3742	lea	$res_y(%rsp), $r_ptr
3743	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3744
3745	lea	$S2(%rsp), $b_ptr
3746	lea	$res_y(%rsp), $r_ptr
3747	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3748
3749	movq	%xmm0, $r_ptr		# restore $r_ptr
3750
3751	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3752	movdqa	%xmm5, %xmm1
3753	pandn	$res_z(%rsp), %xmm0
3754	movdqa	%xmm5, %xmm2
3755	pandn	$res_z+0x10(%rsp), %xmm1
3756	movdqa	%xmm5, %xmm3
3757	pand	$in2_z(%rsp), %xmm2
3758	pand	$in2_z+0x10(%rsp), %xmm3
3759	por	%xmm0, %xmm2
3760	por	%xmm1, %xmm3
3761
3762	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3763	movdqa	%xmm4, %xmm1
3764	pandn	%xmm2, %xmm0
3765	movdqa	%xmm4, %xmm2
3766	pandn	%xmm3, %xmm1
3767	movdqa	%xmm4, %xmm3
3768	pand	$in1_z(%rsp), %xmm2
3769	pand	$in1_z+0x10(%rsp), %xmm3
3770	por	%xmm0, %xmm2
3771	por	%xmm1, %xmm3
3772	movdqu	%xmm2, 0x40($r_ptr)
3773	movdqu	%xmm3, 0x50($r_ptr)
3774
3775	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3776	movdqa	%xmm5, %xmm1
3777	pandn	$res_x(%rsp), %xmm0
3778	movdqa	%xmm5, %xmm2
3779	pandn	$res_x+0x10(%rsp), %xmm1
3780	movdqa	%xmm5, %xmm3
3781	pand	$in2_x(%rsp), %xmm2
3782	pand	$in2_x+0x10(%rsp), %xmm3
3783	por	%xmm0, %xmm2
3784	por	%xmm1, %xmm3
3785
3786	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3787	movdqa	%xmm4, %xmm1
3788	pandn	%xmm2, %xmm0
3789	movdqa	%xmm4, %xmm2
3790	pandn	%xmm3, %xmm1
3791	movdqa	%xmm4, %xmm3
3792	pand	$in1_x(%rsp), %xmm2
3793	pand	$in1_x+0x10(%rsp), %xmm3
3794	por	%xmm0, %xmm2
3795	por	%xmm1, %xmm3
3796	movdqu	%xmm2, 0x00($r_ptr)
3797	movdqu	%xmm3, 0x10($r_ptr)
3798
3799	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3800	movdqa	%xmm5, %xmm1
3801	pandn	$res_y(%rsp), %xmm0
3802	movdqa	%xmm5, %xmm2
3803	pandn	$res_y+0x10(%rsp), %xmm1
3804	movdqa	%xmm5, %xmm3
3805	pand	$in2_y(%rsp), %xmm2
3806	pand	$in2_y+0x10(%rsp), %xmm3
3807	por	%xmm0, %xmm2
3808	por	%xmm1, %xmm3
3809
3810	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3811	movdqa	%xmm4, %xmm1
3812	pandn	%xmm2, %xmm0
3813	movdqa	%xmm4, %xmm2
3814	pandn	%xmm3, %xmm1
3815	movdqa	%xmm4, %xmm3
3816	pand	$in1_y(%rsp), %xmm2
3817	pand	$in1_y+0x10(%rsp), %xmm3
3818	por	%xmm0, %xmm2
3819	por	%xmm1, %xmm3
3820	movdqu	%xmm2, 0x20($r_ptr)
3821	movdqu	%xmm3, 0x30($r_ptr)
3822
3823.Ladd_done$x:
3824	lea	32*18+56(%rsp), %rsi
3825.cfi_def_cfa	%rsi,8
3826	mov	-48(%rsi),%r15
3827.cfi_restore	%r15
3828	mov	-40(%rsi),%r14
3829.cfi_restore	%r14
3830	mov	-32(%rsi),%r13
3831.cfi_restore	%r13
3832	mov	-24(%rsi),%r12
3833.cfi_restore	%r12
3834	mov	-16(%rsi),%rbx
3835.cfi_restore	%rbx
3836	mov	-8(%rsi),%rbp
3837.cfi_restore	%rbp
3838	lea	(%rsi),%rsp
3839.cfi_def_cfa_register	%rsp
3840.Lpoint_add${x}_epilogue:
3841	ret
3842.cfi_endproc
3843.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3844___
3845}
3846&gen_add("q");
3847
3848sub gen_add_affine () {
3849    my $x = shift;
3850    my ($src0,$sfx,$bias);
3851    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3852	$res_x,$res_y,$res_z,
3853	$in1_x,$in1_y,$in1_z,
3854	$in2_x,$in2_y)=map(32*$_,(0..14));
3855    my $Z1sqr = $S2;
3856
3857    if ($x ne "x") {
3858	$src0 = "%rax";
3859	$sfx  = "";
3860	$bias = 0;
3861
3862$code.=<<___;
3863.globl	ecp_nistz256_point_add_affine
3864.type	ecp_nistz256_point_add_affine,\@function,3
3865.align	32
3866ecp_nistz256_point_add_affine:
3867.cfi_startproc
3868___
3869$code.=<<___	if ($addx);
3870	mov	\$0x80100, %ecx
3871	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3872	cmp	\$0x80100, %ecx
3873	je	.Lpoint_add_affinex
3874___
3875    } else {
3876	$src0 = "%rdx";
3877	$sfx  = "x";
3878	$bias = 128;
3879
3880$code.=<<___;
3881.type	ecp_nistz256_point_add_affinex,\@function,3
3882.align	32
3883ecp_nistz256_point_add_affinex:
3884.cfi_startproc
3885.Lpoint_add_affinex:
3886___
3887    }
3888$code.=<<___;
3889	push	%rbp
3890.cfi_push	%rbp
3891	push	%rbx
3892.cfi_push	%rbx
3893	push	%r12
3894.cfi_push	%r12
3895	push	%r13
3896.cfi_push	%r13
3897	push	%r14
3898.cfi_push	%r14
3899	push	%r15
3900.cfi_push	%r15
3901	sub	\$32*15+8, %rsp
3902.cfi_adjust_cfa_offset	32*15+8
3903.Ladd_affine${x}_body:
3904
3905	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3906	mov	$b_org, $b_ptr		# reassign
3907	movdqu	0x10($a_ptr), %xmm1
3908	movdqu	0x20($a_ptr), %xmm2
3909	movdqu	0x30($a_ptr), %xmm3
3910	movdqu	0x40($a_ptr), %xmm4
3911	movdqu	0x50($a_ptr), %xmm5
3912	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3913	 mov	0x40+8*1($a_ptr), $acc6
3914	 mov	0x40+8*2($a_ptr), $acc7
3915	 mov	0x40+8*3($a_ptr), $acc0
3916	movdqa	%xmm0, $in1_x(%rsp)
3917	movdqa	%xmm1, $in1_x+0x10(%rsp)
3918	movdqa	%xmm2, $in1_y(%rsp)
3919	movdqa	%xmm3, $in1_y+0x10(%rsp)
3920	movdqa	%xmm4, $in1_z(%rsp)
3921	movdqa	%xmm5, $in1_z+0x10(%rsp)
3922	por	%xmm4, %xmm5
3923
3924	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3925	 pshufd	\$0xb1, %xmm5, %xmm3
3926	movdqu	0x10($b_ptr), %xmm1
3927	movdqu	0x20($b_ptr), %xmm2
3928	 por	%xmm3, %xmm5
3929	movdqu	0x30($b_ptr), %xmm3
3930	movdqa	%xmm0, $in2_x(%rsp)
3931	 pshufd	\$0x1e, %xmm5, %xmm4
3932	movdqa	%xmm1, $in2_x+0x10(%rsp)
3933	por	%xmm0, %xmm1
3934	 movq	$r_ptr, %xmm0		# save $r_ptr
3935	movdqa	%xmm2, $in2_y(%rsp)
3936	movdqa	%xmm3, $in2_y+0x10(%rsp)
3937	por	%xmm2, %xmm3
3938	 por	%xmm4, %xmm5
3939	 pxor	%xmm4, %xmm4
3940	por	%xmm1, %xmm3
3941
3942	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3943	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3944	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3945
3946	pcmpeqd	%xmm4, %xmm5
3947	pshufd	\$0xb1, %xmm3, %xmm4
3948	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3949	 #lea	0x00($b_ptr), $b_ptr
3950	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3951	por	%xmm3, %xmm4
3952	pshufd	\$0, %xmm5, %xmm5		# in1infty
3953	pshufd	\$0x1e, %xmm4, %xmm3
3954	 mov	$acc5, $acc2
3955	por	%xmm3, %xmm4
3956	pxor	%xmm3, %xmm3
3957	 mov	$acc6, $acc3
3958	pcmpeqd	%xmm3, %xmm4
3959	pshufd	\$0, %xmm4, %xmm4		# in2infty
3960
3961	lea	$Z1sqr-$bias(%rsp), $a_ptr
3962	mov	$acc7, $acc4
3963	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3964	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3965
3966	lea	$in1_x(%rsp), $b_ptr
3967	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3968	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3969
3970	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3971	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3972	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3973
3974	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3975	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3976	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3977
3978	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3979	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3980	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3981
3982	lea	$in1_y(%rsp), $b_ptr
3983	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3984	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3985
3986	`&load_for_sqr("$H(%rsp)", "$src0")`
3987	lea	$Hsqr(%rsp), $r_ptr		# H^2
3988	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3989
3990	`&load_for_sqr("$R(%rsp)", "$src0")`
3991	lea	$Rsqr(%rsp), $r_ptr		# R^2
3992	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3993
3994	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3995	lea	$Hcub(%rsp), $r_ptr		# H^3
3996	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3997
3998	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3999	lea	$U2(%rsp), $r_ptr		# U1*H^2
4000	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
4001___
4002{
4003#######################################################################
4004# operate in 4-5-0-1 "name space" that matches multiplication output
4005#
4006my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4007my ($poly1, $poly3)=($acc6,$acc7);
4008
4009$code.=<<___;
4010	#lea	$U2(%rsp), $a_ptr
4011	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
4012	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
4013
4014	xor	$t4, $t4
4015	add	$acc0, $acc0		# a0:a3+a0:a3
4016	lea	$Rsqr(%rsp), $a_ptr
4017	adc	$acc1, $acc1
4018	 mov	$acc0, $t0
4019	adc	$acc2, $acc2
4020	adc	$acc3, $acc3
4021	 mov	$acc1, $t1
4022	adc	\$0, $t4
4023
4024	sub	\$-1, $acc0
4025	 mov	$acc2, $t2
4026	sbb	$poly1, $acc1
4027	sbb	\$0, $acc2
4028	 mov	$acc3, $t3
4029	sbb	$poly3, $acc3
4030	sbb	\$0, $t4
4031
4032	cmovc	$t0, $acc0
4033	mov	8*0($a_ptr), $t0
4034	cmovc	$t1, $acc1
4035	mov	8*1($a_ptr), $t1
4036	cmovc	$t2, $acc2
4037	mov	8*2($a_ptr), $t2
4038	cmovc	$t3, $acc3
4039	mov	8*3($a_ptr), $t3
4040
4041	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
4042
4043	lea	$Hcub(%rsp), $b_ptr
4044	lea	$res_x(%rsp), $r_ptr
4045	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
4046
4047	mov	$U2+8*0(%rsp), $t0
4048	mov	$U2+8*1(%rsp), $t1
4049	mov	$U2+8*2(%rsp), $t2
4050	mov	$U2+8*3(%rsp), $t3
4051	lea	$H(%rsp), $r_ptr
4052
4053	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
4054
4055	mov	$acc0, 8*0($r_ptr)		# save the result, as
4056	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
4057	mov	$acc2, 8*2($r_ptr)
4058	mov	$acc3, 8*3($r_ptr)
4059___
4060}
4061$code.=<<___;
4062	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4063	lea	$S2(%rsp), $r_ptr
4064	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
4065
4066	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4067	lea	$H(%rsp), $r_ptr
4068	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
4069
4070	lea	$S2(%rsp), $b_ptr
4071	lea	$res_y(%rsp), $r_ptr
4072	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
4073
4074	movq	%xmm0, $r_ptr		# restore $r_ptr
4075
4076	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
4077	movdqa	%xmm5, %xmm1
4078	pandn	$res_z(%rsp), %xmm0
4079	movdqa	%xmm5, %xmm2
4080	pandn	$res_z+0x10(%rsp), %xmm1
4081	movdqa	%xmm5, %xmm3
4082	pand	.LONE_mont(%rip), %xmm2
4083	pand	.LONE_mont+0x10(%rip), %xmm3
4084	por	%xmm0, %xmm2
4085	por	%xmm1, %xmm3
4086
4087	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
4088	movdqa	%xmm4, %xmm1
4089	pandn	%xmm2, %xmm0
4090	movdqa	%xmm4, %xmm2
4091	pandn	%xmm3, %xmm1
4092	movdqa	%xmm4, %xmm3
4093	pand	$in1_z(%rsp), %xmm2
4094	pand	$in1_z+0x10(%rsp), %xmm3
4095	por	%xmm0, %xmm2
4096	por	%xmm1, %xmm3
4097	movdqu	%xmm2, 0x40($r_ptr)
4098	movdqu	%xmm3, 0x50($r_ptr)
4099
4100	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
4101	movdqa	%xmm5, %xmm1
4102	pandn	$res_x(%rsp), %xmm0
4103	movdqa	%xmm5, %xmm2
4104	pandn	$res_x+0x10(%rsp), %xmm1
4105	movdqa	%xmm5, %xmm3
4106	pand	$in2_x(%rsp), %xmm2
4107	pand	$in2_x+0x10(%rsp), %xmm3
4108	por	%xmm0, %xmm2
4109	por	%xmm1, %xmm3
4110
4111	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
4112	movdqa	%xmm4, %xmm1
4113	pandn	%xmm2, %xmm0
4114	movdqa	%xmm4, %xmm2
4115	pandn	%xmm3, %xmm1
4116	movdqa	%xmm4, %xmm3
4117	pand	$in1_x(%rsp), %xmm2
4118	pand	$in1_x+0x10(%rsp), %xmm3
4119	por	%xmm0, %xmm2
4120	por	%xmm1, %xmm3
4121	movdqu	%xmm2, 0x00($r_ptr)
4122	movdqu	%xmm3, 0x10($r_ptr)
4123
4124	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
4125	movdqa	%xmm5, %xmm1
4126	pandn	$res_y(%rsp), %xmm0
4127	movdqa	%xmm5, %xmm2
4128	pandn	$res_y+0x10(%rsp), %xmm1
4129	movdqa	%xmm5, %xmm3
4130	pand	$in2_y(%rsp), %xmm2
4131	pand	$in2_y+0x10(%rsp), %xmm3
4132	por	%xmm0, %xmm2
4133	por	%xmm1, %xmm3
4134
4135	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
4136	movdqa	%xmm4, %xmm1
4137	pandn	%xmm2, %xmm0
4138	movdqa	%xmm4, %xmm2
4139	pandn	%xmm3, %xmm1
4140	movdqa	%xmm4, %xmm3
4141	pand	$in1_y(%rsp), %xmm2
4142	pand	$in1_y+0x10(%rsp), %xmm3
4143	por	%xmm0, %xmm2
4144	por	%xmm1, %xmm3
4145	movdqu	%xmm2, 0x20($r_ptr)
4146	movdqu	%xmm3, 0x30($r_ptr)
4147
4148	lea	32*15+56(%rsp), %rsi
4149.cfi_def_cfa	%rsi,8
4150	mov	-48(%rsi),%r15
4151.cfi_restore	%r15
4152	mov	-40(%rsi),%r14
4153.cfi_restore	%r14
4154	mov	-32(%rsi),%r13
4155.cfi_restore	%r13
4156	mov	-24(%rsi),%r12
4157.cfi_restore	%r12
4158	mov	-16(%rsi),%rbx
4159.cfi_restore	%rbx
4160	mov	-8(%rsi),%rbp
4161.cfi_restore	%rbp
4162	lea	(%rsi),%rsp
4163.cfi_def_cfa_register	%rsp
4164.Ladd_affine${x}_epilogue:
4165	ret
4166.cfi_endproc
4167.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4168___
4169}
4170&gen_add_affine("q");
4171
4172########################################################################
4173# AD*X magic
4174#
4175if ($addx) {								{
4176########################################################################
4177# operate in 4-5-0-1 "name space" that matches multiplication output
4178#
4179my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4180
4181$code.=<<___;
4182.type	__ecp_nistz256_add_tox,\@abi-omnipotent
4183.align	32
4184__ecp_nistz256_add_tox:
4185.cfi_startproc
4186	xor	$t4, $t4
4187	adc	8*0($b_ptr), $a0
4188	adc	8*1($b_ptr), $a1
4189	 mov	$a0, $t0
4190	adc	8*2($b_ptr), $a2
4191	adc	8*3($b_ptr), $a3
4192	 mov	$a1, $t1
4193	adc	\$0, $t4
4194
4195	xor	$t3, $t3
4196	sbb	\$-1, $a0
4197	 mov	$a2, $t2
4198	sbb	$poly1, $a1
4199	sbb	\$0, $a2
4200	 mov	$a3, $t3
4201	sbb	$poly3, $a3
4202	sbb	\$0, $t4
4203
4204	cmovc	$t0, $a0
4205	cmovc	$t1, $a1
4206	mov	$a0, 8*0($r_ptr)
4207	cmovc	$t2, $a2
4208	mov	$a1, 8*1($r_ptr)
4209	cmovc	$t3, $a3
4210	mov	$a2, 8*2($r_ptr)
4211	mov	$a3, 8*3($r_ptr)
4212
4213	ret
4214.cfi_endproc
4215.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4216
4217.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
4218.align	32
4219__ecp_nistz256_sub_fromx:
4220.cfi_startproc
4221	xor	$t4, $t4
4222	sbb	8*0($b_ptr), $a0
4223	sbb	8*1($b_ptr), $a1
4224	 mov	$a0, $t0
4225	sbb	8*2($b_ptr), $a2
4226	sbb	8*3($b_ptr), $a3
4227	 mov	$a1, $t1
4228	sbb	\$0, $t4
4229
4230	xor	$t3, $t3
4231	adc	\$-1, $a0
4232	 mov	$a2, $t2
4233	adc	$poly1, $a1
4234	adc	\$0, $a2
4235	 mov	$a3, $t3
4236	adc	$poly3, $a3
4237
4238	bt	\$0, $t4
4239	cmovnc	$t0, $a0
4240	cmovnc	$t1, $a1
4241	mov	$a0, 8*0($r_ptr)
4242	cmovnc	$t2, $a2
4243	mov	$a1, 8*1($r_ptr)
4244	cmovnc	$t3, $a3
4245	mov	$a2, 8*2($r_ptr)
4246	mov	$a3, 8*3($r_ptr)
4247
4248	ret
4249.cfi_endproc
4250.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4251
4252.type	__ecp_nistz256_subx,\@abi-omnipotent
4253.align	32
4254__ecp_nistz256_subx:
4255.cfi_startproc
4256	xor	$t4, $t4
4257	sbb	$a0, $t0
4258	sbb	$a1, $t1
4259	 mov	$t0, $a0
4260	sbb	$a2, $t2
4261	sbb	$a3, $t3
4262	 mov	$t1, $a1
4263	sbb	\$0, $t4
4264
4265	xor	$a3 ,$a3
4266	adc	\$-1, $t0
4267	 mov	$t2, $a2
4268	adc	$poly1, $t1
4269	adc	\$0, $t2
4270	 mov	$t3, $a3
4271	adc	$poly3, $t3
4272
4273	bt	\$0, $t4
4274	cmovc	$t0, $a0
4275	cmovc	$t1, $a1
4276	cmovc	$t2, $a2
4277	cmovc	$t3, $a3
4278
4279	ret
4280.cfi_endproc
4281.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
4282
4283.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
4284.align	32
4285__ecp_nistz256_mul_by_2x:
4286.cfi_startproc
4287	xor	$t4, $t4
4288	adc	$a0, $a0		# a0:a3+a0:a3
4289	adc	$a1, $a1
4290	 mov	$a0, $t0
4291	adc	$a2, $a2
4292	adc	$a3, $a3
4293	 mov	$a1, $t1
4294	adc	\$0, $t4
4295
4296	xor	$t3, $t3
4297	sbb	\$-1, $a0
4298	 mov	$a2, $t2
4299	sbb	$poly1, $a1
4300	sbb	\$0, $a2
4301	 mov	$a3, $t3
4302	sbb	$poly3, $a3
4303	sbb	\$0, $t4
4304
4305	cmovc	$t0, $a0
4306	cmovc	$t1, $a1
4307	mov	$a0, 8*0($r_ptr)
4308	cmovc	$t2, $a2
4309	mov	$a1, 8*1($r_ptr)
4310	cmovc	$t3, $a3
4311	mov	$a2, 8*2($r_ptr)
4312	mov	$a3, 8*3($r_ptr)
4313
4314	ret
4315.cfi_endproc
4316.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4317___
4318									}
4319&gen_double("x");
4320&gen_add("x");
4321&gen_add_affine("x");
4322}
4323}}}
4324
4325# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4326#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4327if ($win64) {
4328$rec="%rcx";
4329$frame="%rdx";
4330$context="%r8";
4331$disp="%r9";
4332
4333$code.=<<___;
4334.extern	__imp_RtlVirtualUnwind
4335
4336.type	short_handler,\@abi-omnipotent
4337.align	16
4338short_handler:
4339	push	%rsi
4340	push	%rdi
4341	push	%rbx
4342	push	%rbp
4343	push	%r12
4344	push	%r13
4345	push	%r14
4346	push	%r15
4347	pushfq
4348	sub	\$64,%rsp
4349
4350	mov	120($context),%rax	# pull context->Rax
4351	mov	248($context),%rbx	# pull context->Rip
4352
4353	mov	8($disp),%rsi		# disp->ImageBase
4354	mov	56($disp),%r11		# disp->HandlerData
4355
4356	mov	0(%r11),%r10d		# HandlerData[0]
4357	lea	(%rsi,%r10),%r10	# end of prologue label
4358	cmp	%r10,%rbx		# context->Rip<end of prologue label
4359	jb	.Lcommon_seh_tail
4360
4361	mov	152($context),%rax	# pull context->Rsp
4362
4363	mov	4(%r11),%r10d		# HandlerData[1]
4364	lea	(%rsi,%r10),%r10	# epilogue label
4365	cmp	%r10,%rbx		# context->Rip>=epilogue label
4366	jae	.Lcommon_seh_tail
4367
4368	lea	16(%rax),%rax
4369
4370	mov	-8(%rax),%r12
4371	mov	-16(%rax),%r13
4372	mov	%r12,216($context)	# restore context->R12
4373	mov	%r13,224($context)	# restore context->R13
4374
4375	jmp	.Lcommon_seh_tail
4376.size	short_handler,.-short_handler
4377
4378.type	full_handler,\@abi-omnipotent
4379.align	16
4380full_handler:
4381	push	%rsi
4382	push	%rdi
4383	push	%rbx
4384	push	%rbp
4385	push	%r12
4386	push	%r13
4387	push	%r14
4388	push	%r15
4389	pushfq
4390	sub	\$64,%rsp
4391
4392	mov	120($context),%rax	# pull context->Rax
4393	mov	248($context),%rbx	# pull context->Rip
4394
4395	mov	8($disp),%rsi		# disp->ImageBase
4396	mov	56($disp),%r11		# disp->HandlerData
4397
4398	mov	0(%r11),%r10d		# HandlerData[0]
4399	lea	(%rsi,%r10),%r10	# end of prologue label
4400	cmp	%r10,%rbx		# context->Rip<end of prologue label
4401	jb	.Lcommon_seh_tail
4402
4403	mov	152($context),%rax	# pull context->Rsp
4404
4405	mov	4(%r11),%r10d		# HandlerData[1]
4406	lea	(%rsi,%r10),%r10	# epilogue label
4407	cmp	%r10,%rbx		# context->Rip>=epilogue label
4408	jae	.Lcommon_seh_tail
4409
4410	mov	8(%r11),%r10d		# HandlerData[2]
4411	lea	(%rax,%r10),%rax
4412
4413	mov	-8(%rax),%rbp
4414	mov	-16(%rax),%rbx
4415	mov	-24(%rax),%r12
4416	mov	-32(%rax),%r13
4417	mov	-40(%rax),%r14
4418	mov	-48(%rax),%r15
4419	mov	%rbx,144($context)	# restore context->Rbx
4420	mov	%rbp,160($context)	# restore context->Rbp
4421	mov	%r12,216($context)	# restore context->R12
4422	mov	%r13,224($context)	# restore context->R13
4423	mov	%r14,232($context)	# restore context->R14
4424	mov	%r15,240($context)	# restore context->R15
4425
4426.Lcommon_seh_tail:
4427	mov	8(%rax),%rdi
4428	mov	16(%rax),%rsi
4429	mov	%rax,152($context)	# restore context->Rsp
4430	mov	%rsi,168($context)	# restore context->Rsi
4431	mov	%rdi,176($context)	# restore context->Rdi
4432
4433	mov	40($disp),%rdi		# disp->ContextRecord
4434	mov	$context,%rsi		# context
4435	mov	\$154,%ecx		# sizeof(CONTEXT)
4436	.long	0xa548f3fc		# cld; rep movsq
4437
4438	mov	$disp,%rsi
4439	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4440	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4441	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4442	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4443	mov	40(%rsi),%r10		# disp->ContextRecord
4444	lea	56(%rsi),%r11		# &disp->HandlerData
4445	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4446	mov	%r10,32(%rsp)		# arg5
4447	mov	%r11,40(%rsp)		# arg6
4448	mov	%r12,48(%rsp)		# arg7
4449	mov	%rcx,56(%rsp)		# arg8, (NULL)
4450	call	*__imp_RtlVirtualUnwind(%rip)
4451
4452	mov	\$1,%eax		# ExceptionContinueSearch
4453	add	\$64,%rsp
4454	popfq
4455	pop	%r15
4456	pop	%r14
4457	pop	%r13
4458	pop	%r12
4459	pop	%rbp
4460	pop	%rbx
4461	pop	%rdi
4462	pop	%rsi
4463	ret
4464.size	full_handler,.-full_handler
4465
4466.section	.pdata
4467.align	4
4468	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
4469	.rva	.LSEH_end_ecp_nistz256_mul_by_2
4470	.rva	.LSEH_info_ecp_nistz256_mul_by_2
4471
4472	.rva	.LSEH_begin_ecp_nistz256_div_by_2
4473	.rva	.LSEH_end_ecp_nistz256_div_by_2
4474	.rva	.LSEH_info_ecp_nistz256_div_by_2
4475
4476	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
4477	.rva	.LSEH_end_ecp_nistz256_mul_by_3
4478	.rva	.LSEH_info_ecp_nistz256_mul_by_3
4479
4480	.rva	.LSEH_begin_ecp_nistz256_add
4481	.rva	.LSEH_end_ecp_nistz256_add
4482	.rva	.LSEH_info_ecp_nistz256_add
4483
4484	.rva	.LSEH_begin_ecp_nistz256_sub
4485	.rva	.LSEH_end_ecp_nistz256_sub
4486	.rva	.LSEH_info_ecp_nistz256_sub
4487
4488	.rva	.LSEH_begin_ecp_nistz256_neg
4489	.rva	.LSEH_end_ecp_nistz256_neg
4490	.rva	.LSEH_info_ecp_nistz256_neg
4491
4492	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
4493	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
4494	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
4495
4496	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
4497	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
4498	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
4499___
4500$code.=<<___	if ($addx);
4501	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
4502	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
4503	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
4504
4505	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
4506	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
4507	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
4508___
4509$code.=<<___;
4510	.rva	.LSEH_begin_ecp_nistz256_to_mont
4511	.rva	.LSEH_end_ecp_nistz256_to_mont
4512	.rva	.LSEH_info_ecp_nistz256_to_mont
4513
4514	.rva	.LSEH_begin_ecp_nistz256_mul_mont
4515	.rva	.LSEH_end_ecp_nistz256_mul_mont
4516	.rva	.LSEH_info_ecp_nistz256_mul_mont
4517
4518	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
4519	.rva	.LSEH_end_ecp_nistz256_sqr_mont
4520	.rva	.LSEH_info_ecp_nistz256_sqr_mont
4521
4522	.rva	.LSEH_begin_ecp_nistz256_from_mont
4523	.rva	.LSEH_end_ecp_nistz256_from_mont
4524	.rva	.LSEH_info_ecp_nistz256_from_mont
4525
4526	.rva	.LSEH_begin_ecp_nistz256_gather_w5
4527	.rva	.LSEH_end_ecp_nistz256_gather_w5
4528	.rva	.LSEH_info_ecp_nistz256_gather_wX
4529
4530	.rva	.LSEH_begin_ecp_nistz256_gather_w7
4531	.rva	.LSEH_end_ecp_nistz256_gather_w7
4532	.rva	.LSEH_info_ecp_nistz256_gather_wX
4533___
4534$code.=<<___	if ($avx>1);
4535	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
4536	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
4537	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
4538
4539	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
4540	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
4541	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
4542___
4543$code.=<<___;
4544	.rva	.LSEH_begin_ecp_nistz256_point_double
4545	.rva	.LSEH_end_ecp_nistz256_point_double
4546	.rva	.LSEH_info_ecp_nistz256_point_double
4547
4548	.rva	.LSEH_begin_ecp_nistz256_point_add
4549	.rva	.LSEH_end_ecp_nistz256_point_add
4550	.rva	.LSEH_info_ecp_nistz256_point_add
4551
4552	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
4553	.rva	.LSEH_end_ecp_nistz256_point_add_affine
4554	.rva	.LSEH_info_ecp_nistz256_point_add_affine
4555___
4556$code.=<<___ if ($addx);
4557	.rva	.LSEH_begin_ecp_nistz256_point_doublex
4558	.rva	.LSEH_end_ecp_nistz256_point_doublex
4559	.rva	.LSEH_info_ecp_nistz256_point_doublex
4560
4561	.rva	.LSEH_begin_ecp_nistz256_point_addx
4562	.rva	.LSEH_end_ecp_nistz256_point_addx
4563	.rva	.LSEH_info_ecp_nistz256_point_addx
4564
4565	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
4566	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
4567	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
4568___
4569$code.=<<___;
4570
4571.section	.xdata
4572.align	8
4573.LSEH_info_ecp_nistz256_mul_by_2:
4574	.byte	9,0,0,0
4575	.rva	short_handler
4576	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
4577.LSEH_info_ecp_nistz256_div_by_2:
4578	.byte	9,0,0,0
4579	.rva	short_handler
4580	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
4581.LSEH_info_ecp_nistz256_mul_by_3:
4582	.byte	9,0,0,0
4583	.rva	short_handler
4584	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
4585.LSEH_info_ecp_nistz256_add:
4586	.byte	9,0,0,0
4587	.rva	short_handler
4588	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
4589.LSEH_info_ecp_nistz256_sub:
4590	.byte	9,0,0,0
4591	.rva	short_handler
4592	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
4593.LSEH_info_ecp_nistz256_neg:
4594	.byte	9,0,0,0
4595	.rva	short_handler
4596	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
4597.LSEH_info_ecp_nistz256_ord_mul_mont:
4598	.byte	9,0,0,0
4599	.rva	full_handler
4600	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
4601	.long	48,0
4602.LSEH_info_ecp_nistz256_ord_sqr_mont:
4603	.byte	9,0,0,0
4604	.rva	full_handler
4605	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
4606	.long	48,0
4607___
4608$code.=<<___ if ($addx);
4609.LSEH_info_ecp_nistz256_ord_mul_montx:
4610	.byte	9,0,0,0
4611	.rva	full_handler
4612	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
4613	.long	48,0
4614.LSEH_info_ecp_nistz256_ord_sqr_montx:
4615	.byte	9,0,0,0
4616	.rva	full_handler
4617	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
4618	.long	48,0
4619___
4620$code.=<<___;
4621.LSEH_info_ecp_nistz256_to_mont:
4622	.byte	9,0,0,0
4623	.rva	full_handler
4624	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4625	.long	48,0
4626.LSEH_info_ecp_nistz256_mul_mont:
4627	.byte	9,0,0,0
4628	.rva	full_handler
4629	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4630	.long	48,0
4631.LSEH_info_ecp_nistz256_sqr_mont:
4632	.byte	9,0,0,0
4633	.rva	full_handler
4634	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
4635	.long	48,0
4636.LSEH_info_ecp_nistz256_from_mont:
4637	.byte	9,0,0,0
4638	.rva	short_handler
4639	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
4640.LSEH_info_ecp_nistz256_gather_wX:
4641	.byte	0x01,0x33,0x16,0x00
4642	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
4643	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
4644	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
4645	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
4646	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
4647	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
4648	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
4649	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
4650	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
4651	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
4652	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
4653	.align	8
4654___
4655$code.=<<___	if ($avx>1);
4656.LSEH_info_ecp_nistz256_avx2_gather_wX:
4657	.byte	0x01,0x36,0x17,0x0b
4658	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
4659	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
4660	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
4661	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
4662	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
4663	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
4664	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
4665	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
4666	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
4667	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
4668	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
4669	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
4670	.align	8
4671___
4672$code.=<<___;
4673.LSEH_info_ecp_nistz256_point_double:
4674	.byte	9,0,0,0
4675	.rva	full_handler
4676	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
4677	.long	32*5+56,0
4678.LSEH_info_ecp_nistz256_point_add:
4679	.byte	9,0,0,0
4680	.rva	full_handler
4681	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
4682	.long	32*18+56,0
4683.LSEH_info_ecp_nistz256_point_add_affine:
4684	.byte	9,0,0,0
4685	.rva	full_handler
4686	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
4687	.long	32*15+56,0
4688___
4689$code.=<<___ if ($addx);
4690.align	8
4691.LSEH_info_ecp_nistz256_point_doublex:
4692	.byte	9,0,0,0
4693	.rva	full_handler
4694	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
4695	.long	32*5+56,0
4696.LSEH_info_ecp_nistz256_point_addx:
4697	.byte	9,0,0,0
4698	.rva	full_handler
4699	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
4700	.long	32*18+56,0
4701.LSEH_info_ecp_nistz256_point_add_affinex:
4702	.byte	9,0,0,0
4703	.rva	full_handler
4704	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
4705	.long	32*15+56,0
4706___
4707}
4708
4709########################################################################
4710# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
4711#
4712open TABLE,"<ecp_nistz256_table.c"		or
4713open TABLE,"<${dir}../ecp_nistz256_table.c"	or
4714die "failed to open ecp_nistz256_table.c:",$!;
4715
4716use integer;
4717
4718foreach(<TABLE>) {
4719	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
4720}
4721close TABLE;
4722
4723die "insane number of elements" if ($#arr != 64*16*37-1);
4724
4725print <<___;
4726.text
4727.globl	ecp_nistz256_precomputed
4728.type	ecp_nistz256_precomputed,\@object
4729.align	4096
4730ecp_nistz256_precomputed:
4731___
4732while (@line=splice(@arr,0,16)) {
4733	print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
4734}
4735print <<___;
4736.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
4737___
4738
4739$code =~ s/\`([^\`]*)\`/eval $1/gem;
4740print $code;
4741close STDOUT;
4742