1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the Apache License 2.0 (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
37# Ranges denote minimum and maximum improvement coefficients depending
38# on benchmark. In "this/original" column lower coefficient is for
39# ECDSA sign, while in "with/without" - for ECDH key agreement, and
40# higher - for ECDSA sign, relatively fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
45$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
47
48$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49
50$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53die "can't locate x86_64-xlate.pl";
54
55open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
56    or die "can't call $xlate: $!";
57*STDOUT=*OUT;
58
59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61	$avx = ($1>=2.19) + ($1>=2.22);
62	$addx = ($1>=2.23);
63}
64
65if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
67	$avx = ($1>=2.09) + ($1>=2.10);
68	$addx = ($1>=2.10);
69}
70
71if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
72	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
73	$avx = ($1>=10) + ($1>=11);
74	$addx = ($1>=12);
75}
76
77if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
78	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
79	$avx = ($ver>=3.0) + ($ver>=3.01);
80	$addx = ($ver>=3.03);
81}
82
83$code.=<<___;
84.text
85.extern	OPENSSL_ia32cap_P
86
87# The polynomial
88.align 64
89.Lpoly:
90.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
91
92# 2^512 mod P precomputed for NIST P256 polynomial
93.LRR:
94.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
95
96.LOne:
97.long 1,1,1,1,1,1,1,1
98.LTwo:
99.long 2,2,2,2,2,2,2,2
100.LThree:
101.long 3,3,3,3,3,3,3,3
102.LONE_mont:
103.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
104
105# Constants for computations modulo ord(p256)
106.Lord:
107.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
108.LordK:
109.quad 0xccd1c8aaee00bc4f
110___
111
112{
113################################################################################
114# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
115
116my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
117my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
118my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
119
120$code.=<<___;
121
122.globl	ecp_nistz256_mul_by_2
123.type	ecp_nistz256_mul_by_2,\@function,2
124.align	64
125ecp_nistz256_mul_by_2:
126.cfi_startproc
127	push	%r12
128.cfi_push	%r12
129	push	%r13
130.cfi_push	%r13
131.Lmul_by_2_body:
132
133	mov	8*0($a_ptr), $a0
134	xor	$t4,$t4
135	mov	8*1($a_ptr), $a1
136	add	$a0, $a0		# a0:a3+a0:a3
137	mov	8*2($a_ptr), $a2
138	adc	$a1, $a1
139	mov	8*3($a_ptr), $a3
140	lea	.Lpoly(%rip), $a_ptr
141	 mov	$a0, $t0
142	adc	$a2, $a2
143	adc	$a3, $a3
144	 mov	$a1, $t1
145	adc	\$0, $t4
146
147	sub	8*0($a_ptr), $a0
148	 mov	$a2, $t2
149	sbb	8*1($a_ptr), $a1
150	sbb	8*2($a_ptr), $a2
151	 mov	$a3, $t3
152	sbb	8*3($a_ptr), $a3
153	sbb	\$0, $t4
154
155	cmovc	$t0, $a0
156	cmovc	$t1, $a1
157	mov	$a0, 8*0($r_ptr)
158	cmovc	$t2, $a2
159	mov	$a1, 8*1($r_ptr)
160	cmovc	$t3, $a3
161	mov	$a2, 8*2($r_ptr)
162	mov	$a3, 8*3($r_ptr)
163
164	mov	0(%rsp),%r13
165.cfi_restore	%r13
166	mov	8(%rsp),%r12
167.cfi_restore	%r12
168	lea	16(%rsp),%rsp
169.cfi_adjust_cfa_offset	-16
170.Lmul_by_2_epilogue:
171	ret
172.cfi_endproc
173.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
174
175################################################################################
176# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
177.globl	ecp_nistz256_div_by_2
178.type	ecp_nistz256_div_by_2,\@function,2
179.align	32
180ecp_nistz256_div_by_2:
181.cfi_startproc
182	push	%r12
183.cfi_push	%r12
184	push	%r13
185.cfi_push	%r13
186.Ldiv_by_2_body:
187
188	mov	8*0($a_ptr), $a0
189	mov	8*1($a_ptr), $a1
190	mov	8*2($a_ptr), $a2
191	 mov	$a0, $t0
192	mov	8*3($a_ptr), $a3
193	lea	.Lpoly(%rip), $a_ptr
194
195	 mov	$a1, $t1
196	xor	$t4, $t4
197	add	8*0($a_ptr), $a0
198	 mov	$a2, $t2
199	adc	8*1($a_ptr), $a1
200	adc	8*2($a_ptr), $a2
201	 mov	$a3, $t3
202	adc	8*3($a_ptr), $a3
203	adc	\$0, $t4
204	xor	$a_ptr, $a_ptr		# borrow $a_ptr
205	test	\$1, $t0
206
207	cmovz	$t0, $a0
208	cmovz	$t1, $a1
209	cmovz	$t2, $a2
210	cmovz	$t3, $a3
211	cmovz	$a_ptr, $t4
212
213	mov	$a1, $t0		# a0:a3>>1
214	shr	\$1, $a0
215	shl	\$63, $t0
216	mov	$a2, $t1
217	shr	\$1, $a1
218	or	$t0, $a0
219	shl	\$63, $t1
220	mov	$a3, $t2
221	shr	\$1, $a2
222	or	$t1, $a1
223	shl	\$63, $t2
224	shr	\$1, $a3
225	shl	\$63, $t4
226	or	$t2, $a2
227	or	$t4, $a3
228
229	mov	$a0, 8*0($r_ptr)
230	mov	$a1, 8*1($r_ptr)
231	mov	$a2, 8*2($r_ptr)
232	mov	$a3, 8*3($r_ptr)
233
234	mov	0(%rsp),%r13
235.cfi_restore	%r13
236	mov	8(%rsp),%r12
237.cfi_restore	%r12
238	lea	16(%rsp),%rsp
239.cfi_adjust_cfa_offset	-16
240.Ldiv_by_2_epilogue:
241	ret
242.cfi_endproc
243.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
244
245################################################################################
246# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
247.globl	ecp_nistz256_mul_by_3
248.type	ecp_nistz256_mul_by_3,\@function,2
249.align	32
250ecp_nistz256_mul_by_3:
251.cfi_startproc
252	push	%r12
253.cfi_push	%r12
254	push	%r13
255.cfi_push	%r13
256.Lmul_by_3_body:
257
258	mov	8*0($a_ptr), $a0
259	xor	$t4, $t4
260	mov	8*1($a_ptr), $a1
261	add	$a0, $a0		# a0:a3+a0:a3
262	mov	8*2($a_ptr), $a2
263	adc	$a1, $a1
264	mov	8*3($a_ptr), $a3
265	 mov	$a0, $t0
266	adc	$a2, $a2
267	adc	$a3, $a3
268	 mov	$a1, $t1
269	adc	\$0, $t4
270
271	sub	\$-1, $a0
272	 mov	$a2, $t2
273	sbb	.Lpoly+8*1(%rip), $a1
274	sbb	\$0, $a2
275	 mov	$a3, $t3
276	sbb	.Lpoly+8*3(%rip), $a3
277	sbb	\$0, $t4
278
279	cmovc	$t0, $a0
280	cmovc	$t1, $a1
281	cmovc	$t2, $a2
282	cmovc	$t3, $a3
283
284	xor	$t4, $t4
285	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
286	adc	8*1($a_ptr), $a1
287	 mov	$a0, $t0
288	adc	8*2($a_ptr), $a2
289	adc	8*3($a_ptr), $a3
290	 mov	$a1, $t1
291	adc	\$0, $t4
292
293	sub	\$-1, $a0
294	 mov	$a2, $t2
295	sbb	.Lpoly+8*1(%rip), $a1
296	sbb	\$0, $a2
297	 mov	$a3, $t3
298	sbb	.Lpoly+8*3(%rip), $a3
299	sbb	\$0, $t4
300
301	cmovc	$t0, $a0
302	cmovc	$t1, $a1
303	mov	$a0, 8*0($r_ptr)
304	cmovc	$t2, $a2
305	mov	$a1, 8*1($r_ptr)
306	cmovc	$t3, $a3
307	mov	$a2, 8*2($r_ptr)
308	mov	$a3, 8*3($r_ptr)
309
310	mov	0(%rsp),%r13
311.cfi_restore	%r13
312	mov	8(%rsp),%r12
313.cfi_restore	%r12
314	lea	16(%rsp),%rsp
315.cfi_adjust_cfa_offset	-16
316.Lmul_by_3_epilogue:
317	ret
318.cfi_endproc
319.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
320
321################################################################################
322# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
323.globl	ecp_nistz256_add
324.type	ecp_nistz256_add,\@function,3
325.align	32
326ecp_nistz256_add:
327.cfi_startproc
328	push	%r12
329.cfi_push	%r12
330	push	%r13
331.cfi_push	%r13
332.Ladd_body:
333
334	mov	8*0($a_ptr), $a0
335	xor	$t4, $t4
336	mov	8*1($a_ptr), $a1
337	mov	8*2($a_ptr), $a2
338	mov	8*3($a_ptr), $a3
339	lea	.Lpoly(%rip), $a_ptr
340
341	add	8*0($b_ptr), $a0
342	adc	8*1($b_ptr), $a1
343	 mov	$a0, $t0
344	adc	8*2($b_ptr), $a2
345	adc	8*3($b_ptr), $a3
346	 mov	$a1, $t1
347	adc	\$0, $t4
348
349	sub	8*0($a_ptr), $a0
350	 mov	$a2, $t2
351	sbb	8*1($a_ptr), $a1
352	sbb	8*2($a_ptr), $a2
353	 mov	$a3, $t3
354	sbb	8*3($a_ptr), $a3
355	sbb	\$0, $t4
356
357	cmovc	$t0, $a0
358	cmovc	$t1, $a1
359	mov	$a0, 8*0($r_ptr)
360	cmovc	$t2, $a2
361	mov	$a1, 8*1($r_ptr)
362	cmovc	$t3, $a3
363	mov	$a2, 8*2($r_ptr)
364	mov	$a3, 8*3($r_ptr)
365
366	mov	0(%rsp),%r13
367.cfi_restore	%r13
368	mov	8(%rsp),%r12
369.cfi_restore	%r12
370	lea	16(%rsp),%rsp
371.cfi_adjust_cfa_offset	-16
372.Ladd_epilogue:
373	ret
374.cfi_endproc
375.size	ecp_nistz256_add,.-ecp_nistz256_add
376
377################################################################################
378# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
379.globl	ecp_nistz256_sub
380.type	ecp_nistz256_sub,\@function,3
381.align	32
382ecp_nistz256_sub:
383.cfi_startproc
384	push	%r12
385.cfi_push	%r12
386	push	%r13
387.cfi_push	%r13
388.Lsub_body:
389
390	mov	8*0($a_ptr), $a0
391	xor	$t4, $t4
392	mov	8*1($a_ptr), $a1
393	mov	8*2($a_ptr), $a2
394	mov	8*3($a_ptr), $a3
395	lea	.Lpoly(%rip), $a_ptr
396
397	sub	8*0($b_ptr), $a0
398	sbb	8*1($b_ptr), $a1
399	 mov	$a0, $t0
400	sbb	8*2($b_ptr), $a2
401	sbb	8*3($b_ptr), $a3
402	 mov	$a1, $t1
403	sbb	\$0, $t4
404
405	add	8*0($a_ptr), $a0
406	 mov	$a2, $t2
407	adc	8*1($a_ptr), $a1
408	adc	8*2($a_ptr), $a2
409	 mov	$a3, $t3
410	adc	8*3($a_ptr), $a3
411	test	$t4, $t4
412
413	cmovz	$t0, $a0
414	cmovz	$t1, $a1
415	mov	$a0, 8*0($r_ptr)
416	cmovz	$t2, $a2
417	mov	$a1, 8*1($r_ptr)
418	cmovz	$t3, $a3
419	mov	$a2, 8*2($r_ptr)
420	mov	$a3, 8*3($r_ptr)
421
422	mov	0(%rsp),%r13
423.cfi_restore	%r13
424	mov	8(%rsp),%r12
425.cfi_restore	%r12
426	lea	16(%rsp),%rsp
427.cfi_adjust_cfa_offset	-16
428.Lsub_epilogue:
429	ret
430.cfi_endproc
431.size	ecp_nistz256_sub,.-ecp_nistz256_sub
432
433################################################################################
434# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
435.globl	ecp_nistz256_neg
436.type	ecp_nistz256_neg,\@function,2
437.align	32
438ecp_nistz256_neg:
439.cfi_startproc
440	push	%r12
441.cfi_push	%r12
442	push	%r13
443.cfi_push	%r13
444.Lneg_body:
445
446	xor	$a0, $a0
447	xor	$a1, $a1
448	xor	$a2, $a2
449	xor	$a3, $a3
450	xor	$t4, $t4
451
452	sub	8*0($a_ptr), $a0
453	sbb	8*1($a_ptr), $a1
454	sbb	8*2($a_ptr), $a2
455	 mov	$a0, $t0
456	sbb	8*3($a_ptr), $a3
457	lea	.Lpoly(%rip), $a_ptr
458	 mov	$a1, $t1
459	sbb	\$0, $t4
460
461	add	8*0($a_ptr), $a0
462	 mov	$a2, $t2
463	adc	8*1($a_ptr), $a1
464	adc	8*2($a_ptr), $a2
465	 mov	$a3, $t3
466	adc	8*3($a_ptr), $a3
467	test	$t4, $t4
468
469	cmovz	$t0, $a0
470	cmovz	$t1, $a1
471	mov	$a0, 8*0($r_ptr)
472	cmovz	$t2, $a2
473	mov	$a1, 8*1($r_ptr)
474	cmovz	$t3, $a3
475	mov	$a2, 8*2($r_ptr)
476	mov	$a3, 8*3($r_ptr)
477
478	mov	0(%rsp),%r13
479.cfi_restore	%r13
480	mov	8(%rsp),%r12
481.cfi_restore	%r12
482	lea	16(%rsp),%rsp
483.cfi_adjust_cfa_offset	-16
484.Lneg_epilogue:
485	ret
486.cfi_endproc
487.size	ecp_nistz256_neg,.-ecp_nistz256_neg
488___
489}
490{
491my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
492my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
493my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
494my ($poly1,$poly3)=($acc6,$acc7);
495
496$code.=<<___;
497################################################################################
498# void ecp_nistz256_ord_mul_mont(
499#   uint64_t res[4],
500#   uint64_t a[4],
501#   uint64_t b[4]);
502
503.globl	ecp_nistz256_ord_mul_mont
504.type	ecp_nistz256_ord_mul_mont,\@function,3
505.align	32
506ecp_nistz256_ord_mul_mont:
507.cfi_startproc
508___
509$code.=<<___	if ($addx);
510	mov	\$0x80100, %ecx
511	and	OPENSSL_ia32cap_P+8(%rip), %ecx
512	cmp	\$0x80100, %ecx
513	je	.Lecp_nistz256_ord_mul_montx
514___
515$code.=<<___;
516	push	%rbp
517.cfi_push	%rbp
518	push	%rbx
519.cfi_push	%rbx
520	push	%r12
521.cfi_push	%r12
522	push	%r13
523.cfi_push	%r13
524	push	%r14
525.cfi_push	%r14
526	push	%r15
527.cfi_push	%r15
528.Lord_mul_body:
529
530	mov	8*0($b_org), %rax
531	mov	$b_org, $b_ptr
532	lea	.Lord(%rip), %r14
533	mov	.LordK(%rip), %r15
534
535	################################# * b[0]
536	mov	%rax, $t0
537	mulq	8*0($a_ptr)
538	mov	%rax, $acc0
539	mov	$t0, %rax
540	mov	%rdx, $acc1
541
542	mulq	8*1($a_ptr)
543	add	%rax, $acc1
544	mov	$t0, %rax
545	adc	\$0, %rdx
546	mov	%rdx, $acc2
547
548	mulq	8*2($a_ptr)
549	add	%rax, $acc2
550	mov	$t0, %rax
551	adc	\$0, %rdx
552
553	 mov	$acc0, $acc5
554	 imulq	%r15,$acc0
555
556	mov	%rdx, $acc3
557	mulq	8*3($a_ptr)
558	add	%rax, $acc3
559	 mov	$acc0, %rax
560	adc	\$0, %rdx
561	mov	%rdx, $acc4
562
563	################################# First reduction step
564	mulq	8*0(%r14)
565	mov	$acc0, $t1
566	add	%rax, $acc5		# guaranteed to be zero
567	mov	$acc0, %rax
568	adc	\$0, %rdx
569	mov	%rdx, $t0
570
571	sub	$acc0, $acc2
572	sbb	\$0, $acc0		# can't borrow
573
574	mulq	8*1(%r14)
575	add	$t0, $acc1
576	adc	\$0, %rdx
577	add	%rax, $acc1
578	mov	$t1, %rax
579	adc	%rdx, $acc2
580	mov	$t1, %rdx
581	adc	\$0, $acc0		# can't overflow
582
583	shl	\$32, %rax
584	shr	\$32, %rdx
585	sub	%rax, $acc3
586	 mov	8*1($b_ptr), %rax
587	sbb	%rdx, $t1		# can't borrow
588
589	add	$acc0, $acc3
590	adc	$t1, $acc4
591	adc	\$0, $acc5
592
593	################################# * b[1]
594	mov	%rax, $t0
595	mulq	8*0($a_ptr)
596	add	%rax, $acc1
597	mov	$t0, %rax
598	adc	\$0, %rdx
599	mov	%rdx, $t1
600
601	mulq	8*1($a_ptr)
602	add	$t1, $acc2
603	adc	\$0, %rdx
604	add	%rax, $acc2
605	mov	$t0, %rax
606	adc	\$0, %rdx
607	mov	%rdx, $t1
608
609	mulq	8*2($a_ptr)
610	add	$t1, $acc3
611	adc	\$0, %rdx
612	add	%rax, $acc3
613	mov	$t0, %rax
614	adc	\$0, %rdx
615
616	 mov	$acc1, $t0
617	 imulq	%r15, $acc1
618
619	mov	%rdx, $t1
620	mulq	8*3($a_ptr)
621	add	$t1, $acc4
622	adc	\$0, %rdx
623	xor	$acc0, $acc0
624	add	%rax, $acc4
625	 mov	$acc1, %rax
626	adc	%rdx, $acc5
627	adc	\$0, $acc0
628
629	################################# Second reduction step
630	mulq	8*0(%r14)
631	mov	$acc1, $t1
632	add	%rax, $t0		# guaranteed to be zero
633	mov	$acc1, %rax
634	adc	%rdx, $t0
635
636	sub	$acc1, $acc3
637	sbb	\$0, $acc1		# can't borrow
638
639	mulq	8*1(%r14)
640	add	$t0, $acc2
641	adc	\$0, %rdx
642	add	%rax, $acc2
643	mov	$t1, %rax
644	adc	%rdx, $acc3
645	mov	$t1, %rdx
646	adc	\$0, $acc1		# can't overflow
647
648	shl	\$32, %rax
649	shr	\$32, %rdx
650	sub	%rax, $acc4
651	 mov	8*2($b_ptr), %rax
652	sbb	%rdx, $t1		# can't borrow
653
654	add	$acc1, $acc4
655	adc	$t1, $acc5
656	adc	\$0, $acc0
657
658	################################## * b[2]
659	mov	%rax, $t0
660	mulq	8*0($a_ptr)
661	add	%rax, $acc2
662	mov	$t0, %rax
663	adc	\$0, %rdx
664	mov	%rdx, $t1
665
666	mulq	8*1($a_ptr)
667	add	$t1, $acc3
668	adc	\$0, %rdx
669	add	%rax, $acc3
670	mov	$t0, %rax
671	adc	\$0, %rdx
672	mov	%rdx, $t1
673
674	mulq	8*2($a_ptr)
675	add	$t1, $acc4
676	adc	\$0, %rdx
677	add	%rax, $acc4
678	mov	$t0, %rax
679	adc	\$0, %rdx
680
681	 mov	$acc2, $t0
682	 imulq	%r15, $acc2
683
684	mov	%rdx, $t1
685	mulq	8*3($a_ptr)
686	add	$t1, $acc5
687	adc	\$0, %rdx
688	xor	$acc1, $acc1
689	add	%rax, $acc5
690	 mov	$acc2, %rax
691	adc	%rdx, $acc0
692	adc	\$0, $acc1
693
694	################################# Third reduction step
695	mulq	8*0(%r14)
696	mov	$acc2, $t1
697	add	%rax, $t0		# guaranteed to be zero
698	mov	$acc2, %rax
699	adc	%rdx, $t0
700
701	sub	$acc2, $acc4
702	sbb	\$0, $acc2		# can't borrow
703
704	mulq	8*1(%r14)
705	add	$t0, $acc3
706	adc	\$0, %rdx
707	add	%rax, $acc3
708	mov	$t1, %rax
709	adc	%rdx, $acc4
710	mov	$t1, %rdx
711	adc	\$0, $acc2		# can't overflow
712
713	shl	\$32, %rax
714	shr	\$32, %rdx
715	sub	%rax, $acc5
716	 mov	8*3($b_ptr), %rax
717	sbb	%rdx, $t1		# can't borrow
718
719	add	$acc2, $acc5
720	adc	$t1, $acc0
721	adc	\$0, $acc1
722
723	################################# * b[3]
724	mov	%rax, $t0
725	mulq	8*0($a_ptr)
726	add	%rax, $acc3
727	mov	$t0, %rax
728	adc	\$0, %rdx
729	mov	%rdx, $t1
730
731	mulq	8*1($a_ptr)
732	add	$t1, $acc4
733	adc	\$0, %rdx
734	add	%rax, $acc4
735	mov	$t0, %rax
736	adc	\$0, %rdx
737	mov	%rdx, $t1
738
739	mulq	8*2($a_ptr)
740	add	$t1, $acc5
741	adc	\$0, %rdx
742	add	%rax, $acc5
743	mov	$t0, %rax
744	adc	\$0, %rdx
745
746	 mov	$acc3, $t0
747	 imulq	%r15, $acc3
748
749	mov	%rdx, $t1
750	mulq	8*3($a_ptr)
751	add	$t1, $acc0
752	adc	\$0, %rdx
753	xor	$acc2, $acc2
754	add	%rax, $acc0
755	 mov	$acc3, %rax
756	adc	%rdx, $acc1
757	adc	\$0, $acc2
758
759	################################# Last reduction step
760	mulq	8*0(%r14)
761	mov	$acc3, $t1
762	add	%rax, $t0		# guaranteed to be zero
763	mov	$acc3, %rax
764	adc	%rdx, $t0
765
766	sub	$acc3, $acc5
767	sbb	\$0, $acc3		# can't borrow
768
769	mulq	8*1(%r14)
770	add	$t0, $acc4
771	adc	\$0, %rdx
772	add	%rax, $acc4
773	mov	$t1, %rax
774	adc	%rdx, $acc5
775	mov	$t1, %rdx
776	adc	\$0, $acc3		# can't overflow
777
778	shl	\$32, %rax
779	shr	\$32, %rdx
780	sub	%rax, $acc0
781	sbb	%rdx, $t1		# can't borrow
782
783	add	$acc3, $acc0
784	adc	$t1, $acc1
785	adc	\$0, $acc2
786
787	################################# Subtract ord
788	 mov	$acc4, $a_ptr
789	sub	8*0(%r14), $acc4
790	 mov	$acc5, $acc3
791	sbb	8*1(%r14), $acc5
792	 mov	$acc0, $t0
793	sbb	8*2(%r14), $acc0
794	 mov	$acc1, $t1
795	sbb	8*3(%r14), $acc1
796	sbb	\$0, $acc2
797
798	cmovc	$a_ptr, $acc4
799	cmovc	$acc3, $acc5
800	cmovc	$t0, $acc0
801	cmovc	$t1, $acc1
802
803	mov	$acc4, 8*0($r_ptr)
804	mov	$acc5, 8*1($r_ptr)
805	mov	$acc0, 8*2($r_ptr)
806	mov	$acc1, 8*3($r_ptr)
807
808	mov	0(%rsp),%r15
809.cfi_restore	%r15
810	mov	8(%rsp),%r14
811.cfi_restore	%r14
812	mov	16(%rsp),%r13
813.cfi_restore	%r13
814	mov	24(%rsp),%r12
815.cfi_restore	%r12
816	mov	32(%rsp),%rbx
817.cfi_restore	%rbx
818	mov	40(%rsp),%rbp
819.cfi_restore	%rbp
820	lea	48(%rsp),%rsp
821.cfi_adjust_cfa_offset	-48
822.Lord_mul_epilogue:
823	ret
824.cfi_endproc
825.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
826
827################################################################################
828# void ecp_nistz256_ord_sqr_mont(
829#   uint64_t res[4],
830#   uint64_t a[4],
831#   uint64_t rep);
832
833.globl	ecp_nistz256_ord_sqr_mont
834.type	ecp_nistz256_ord_sqr_mont,\@function,3
835.align	32
836ecp_nistz256_ord_sqr_mont:
837.cfi_startproc
838___
839$code.=<<___	if ($addx);
840	mov	\$0x80100, %ecx
841	and	OPENSSL_ia32cap_P+8(%rip), %ecx
842	cmp	\$0x80100, %ecx
843	je	.Lecp_nistz256_ord_sqr_montx
844___
845$code.=<<___;
846	push	%rbp
847.cfi_push	%rbp
848	push	%rbx
849.cfi_push	%rbx
850	push	%r12
851.cfi_push	%r12
852	push	%r13
853.cfi_push	%r13
854	push	%r14
855.cfi_push	%r14
856	push	%r15
857.cfi_push	%r15
858.Lord_sqr_body:
859
860	mov	8*0($a_ptr), $acc0
861	mov	8*1($a_ptr), %rax
862	mov	8*2($a_ptr), $acc6
863	mov	8*3($a_ptr), $acc7
864	lea	.Lord(%rip), $a_ptr	# pointer to modulus
865	mov	$b_org, $b_ptr
866	jmp	.Loop_ord_sqr
867
868.align	32
869.Loop_ord_sqr:
870	################################# a[1:] * a[0]
871	mov	%rax, $t1		# put aside a[1]
872	mul	$acc0			# a[1] * a[0]
873	mov	%rax, $acc1
874	movq	$t1, %xmm1		# offload a[1]
875	mov	$acc6, %rax
876	mov	%rdx, $acc2
877
878	mul	$acc0			# a[2] * a[0]
879	add	%rax, $acc2
880	mov	$acc7, %rax
881	movq	$acc6, %xmm2		# offload a[2]
882	adc	\$0, %rdx
883	mov	%rdx, $acc3
884
885	mul	$acc0			# a[3] * a[0]
886	add	%rax, $acc3
887	mov	$acc7, %rax
888	movq	$acc7, %xmm3		# offload a[3]
889	adc	\$0, %rdx
890	mov	%rdx, $acc4
891
892	################################# a[3] * a[2]
893	mul	$acc6			# a[3] * a[2]
894	mov	%rax, $acc5
895	mov	$acc6, %rax
896	mov	%rdx, $acc6
897
898	################################# a[2:] * a[1]
899	mul	$t1			# a[2] * a[1]
900	add	%rax, $acc3
901	mov	$acc7, %rax
902	adc	\$0, %rdx
903	mov	%rdx, $acc7
904
905	mul	$t1			# a[3] * a[1]
906	add	%rax, $acc4
907	adc	\$0, %rdx
908
909	add	$acc7, $acc4
910	adc	%rdx, $acc5
911	adc	\$0, $acc6		# can't overflow
912
913	################################# *2
914	xor	$acc7, $acc7
915	mov	$acc0, %rax
916	add	$acc1, $acc1
917	adc	$acc2, $acc2
918	adc	$acc3, $acc3
919	adc	$acc4, $acc4
920	adc	$acc5, $acc5
921	adc	$acc6, $acc6
922	adc	\$0, $acc7
923
924	################################# Missing products
925	mul	%rax			# a[0] * a[0]
926	mov	%rax, $acc0
927	movq	%xmm1, %rax
928	mov	%rdx, $t1
929
930	mul	%rax			# a[1] * a[1]
931	add	$t1, $acc1
932	adc	%rax, $acc2
933	movq	%xmm2, %rax
934	adc	\$0, %rdx
935	mov	%rdx, $t1
936
937	mul	%rax			# a[2] * a[2]
938	add	$t1, $acc3
939	adc	%rax, $acc4
940	movq	%xmm3, %rax
941	adc	\$0, %rdx
942	mov	%rdx, $t1
943
944	 mov	$acc0, $t0
945	 imulq	8*4($a_ptr), $acc0	# *= .LordK
946
947	mul	%rax			# a[3] * a[3]
948	add	$t1, $acc5
949	adc	%rax, $acc6
950	 mov	8*0($a_ptr), %rax	# modulus[0]
951	adc	%rdx, $acc7		# can't overflow
952
953	################################# First reduction step
954	mul	$acc0
955	mov	$acc0, $t1
956	add	%rax, $t0		# guaranteed to be zero
957	mov	8*1($a_ptr), %rax	# modulus[1]
958	adc	%rdx, $t0
959
960	sub	$acc0, $acc2
961	sbb	\$0, $t1		# can't borrow
962
963	mul	$acc0
964	add	$t0, $acc1
965	adc	\$0, %rdx
966	add	%rax, $acc1
967	mov	$acc0, %rax
968	adc	%rdx, $acc2
969	mov	$acc0, %rdx
970	adc	\$0, $t1		# can't overflow
971
972	 mov	$acc1, $t0
973	 imulq	8*4($a_ptr), $acc1	# *= .LordK
974
975	shl	\$32, %rax
976	shr	\$32, %rdx
977	sub	%rax, $acc3
978	 mov	8*0($a_ptr), %rax
979	sbb	%rdx, $acc0		# can't borrow
980
981	add	$t1, $acc3
982	adc	\$0, $acc0		# can't overflow
983
984	################################# Second reduction step
985	mul	$acc1
986	mov	$acc1, $t1
987	add	%rax, $t0		# guaranteed to be zero
988	mov	8*1($a_ptr), %rax
989	adc	%rdx, $t0
990
991	sub	$acc1, $acc3
992	sbb	\$0, $t1		# can't borrow
993
994	mul	$acc1
995	add	$t0, $acc2
996	adc	\$0, %rdx
997	add	%rax, $acc2
998	mov	$acc1, %rax
999	adc	%rdx, $acc3
1000	mov	$acc1, %rdx
1001	adc	\$0, $t1		# can't overflow
1002
1003	 mov	$acc2, $t0
1004	 imulq	8*4($a_ptr), $acc2	# *= .LordK
1005
1006	shl	\$32, %rax
1007	shr	\$32, %rdx
1008	sub	%rax, $acc0
1009	 mov	8*0($a_ptr), %rax
1010	sbb	%rdx, $acc1		# can't borrow
1011
1012	add	$t1, $acc0
1013	adc	\$0, $acc1		# can't overflow
1014
1015	################################# Third reduction step
1016	mul	$acc2
1017	mov	$acc2, $t1
1018	add	%rax, $t0		# guaranteed to be zero
1019	mov	8*1($a_ptr), %rax
1020	adc	%rdx, $t0
1021
1022	sub	$acc2, $acc0
1023	sbb	\$0, $t1		# can't borrow
1024
1025	mul	$acc2
1026	add	$t0, $acc3
1027	adc	\$0, %rdx
1028	add	%rax, $acc3
1029	mov	$acc2, %rax
1030	adc	%rdx, $acc0
1031	mov	$acc2, %rdx
1032	adc	\$0, $t1		# can't overflow
1033
1034	 mov	$acc3, $t0
1035	 imulq	8*4($a_ptr), $acc3	# *= .LordK
1036
1037	shl	\$32, %rax
1038	shr	\$32, %rdx
1039	sub	%rax, $acc1
1040	 mov	8*0($a_ptr), %rax
1041	sbb	%rdx, $acc2		# can't borrow
1042
1043	add	$t1, $acc1
1044	adc	\$0, $acc2		# can't overflow
1045
1046	################################# Last reduction step
1047	mul	$acc3
1048	mov	$acc3, $t1
1049	add	%rax, $t0		# guaranteed to be zero
1050	mov	8*1($a_ptr), %rax
1051	adc	%rdx, $t0
1052
1053	sub	$acc3, $acc1
1054	sbb	\$0, $t1		# can't borrow
1055
1056	mul	$acc3
1057	add	$t0, $acc0
1058	adc	\$0, %rdx
1059	add	%rax, $acc0
1060	mov	$acc3, %rax
1061	adc	%rdx, $acc1
1062	mov	$acc3, %rdx
1063	adc	\$0, $t1		# can't overflow
1064
1065	shl	\$32, %rax
1066	shr	\$32, %rdx
1067	sub	%rax, $acc2
1068	sbb	%rdx, $acc3		# can't borrow
1069
1070	add	$t1, $acc2
1071	adc	\$0, $acc3		# can't overflow
1072
1073	################################# Add bits [511:256] of the sqr result
1074	xor	%rdx, %rdx
1075	add	$acc4, $acc0
1076	adc	$acc5, $acc1
1077	 mov	$acc0, $acc4
1078	adc	$acc6, $acc2
1079	adc	$acc7, $acc3
1080	 mov	$acc1, %rax
1081	adc	\$0, %rdx
1082
1083	################################# Compare to modulus
1084	sub	8*0($a_ptr), $acc0
1085	 mov	$acc2, $acc6
1086	sbb	8*1($a_ptr), $acc1
1087	sbb	8*2($a_ptr), $acc2
1088	 mov	$acc3, $acc7
1089	sbb	8*3($a_ptr), $acc3
1090	sbb	\$0, %rdx
1091
1092	cmovc	$acc4, $acc0
1093	cmovnc	$acc1, %rax
1094	cmovnc	$acc2, $acc6
1095	cmovnc	$acc3, $acc7
1096
1097	dec	$b_ptr
1098	jnz	.Loop_ord_sqr
1099
1100	mov	$acc0, 8*0($r_ptr)
1101	mov	%rax,  8*1($r_ptr)
1102	pxor	%xmm1, %xmm1
1103	mov	$acc6, 8*2($r_ptr)
1104	pxor	%xmm2, %xmm2
1105	mov	$acc7, 8*3($r_ptr)
1106	pxor	%xmm3, %xmm3
1107
1108	mov	0(%rsp),%r15
1109.cfi_restore	%r15
1110	mov	8(%rsp),%r14
1111.cfi_restore	%r14
1112	mov	16(%rsp),%r13
1113.cfi_restore	%r13
1114	mov	24(%rsp),%r12
1115.cfi_restore	%r12
1116	mov	32(%rsp),%rbx
1117.cfi_restore	%rbx
1118	mov	40(%rsp),%rbp
1119.cfi_restore	%rbp
1120	lea	48(%rsp),%rsp
1121.cfi_adjust_cfa_offset	-48
1122.Lord_sqr_epilogue:
1123	ret
1124.cfi_endproc
1125.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1126___
1127
1128$code.=<<___	if ($addx);
1129################################################################################
1130.type	ecp_nistz256_ord_mul_montx,\@function,3
1131.align	32
1132ecp_nistz256_ord_mul_montx:
1133.cfi_startproc
1134.Lecp_nistz256_ord_mul_montx:
1135	push	%rbp
1136.cfi_push	%rbp
1137	push	%rbx
1138.cfi_push	%rbx
1139	push	%r12
1140.cfi_push	%r12
1141	push	%r13
1142.cfi_push	%r13
1143	push	%r14
1144.cfi_push	%r14
1145	push	%r15
1146.cfi_push	%r15
1147.Lord_mulx_body:
1148
1149	mov	$b_org, $b_ptr
1150	mov	8*0($b_org), %rdx
1151	mov	8*0($a_ptr), $acc1
1152	mov	8*1($a_ptr), $acc2
1153	mov	8*2($a_ptr), $acc3
1154	mov	8*3($a_ptr), $acc4
1155	lea	-128($a_ptr), $a_ptr	# control u-op density
1156	lea	.Lord-128(%rip), %r14
1157	mov	.LordK(%rip), %r15
1158
1159	################################# Multiply by b[0]
1160	mulx	$acc1, $acc0, $acc1
1161	mulx	$acc2, $t0, $acc2
1162	mulx	$acc3, $t1, $acc3
1163	add	$t0, $acc1
1164	mulx	$acc4, $t0, $acc4
1165	 mov	$acc0, %rdx
1166	 mulx	%r15, %rdx, %rax
1167	adc	$t1, $acc2
1168	adc	$t0, $acc3
1169	adc	\$0, $acc4
1170
1171	################################# reduction
1172	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
1173	mulx	8*0+128(%r14), $t0, $t1
1174	adcx	$t0, $acc0		# guaranteed to be zero
1175	adox	$t1, $acc1
1176
1177	mulx	8*1+128(%r14), $t0, $t1
1178	adcx	$t0, $acc1
1179	adox	$t1, $acc2
1180
1181	mulx	8*2+128(%r14), $t0, $t1
1182	adcx	$t0, $acc2
1183	adox	$t1, $acc3
1184
1185	mulx	8*3+128(%r14), $t0, $t1
1186	 mov	8*1($b_ptr), %rdx
1187	adcx	$t0, $acc3
1188	adox	$t1, $acc4
1189	adcx	$acc0, $acc4
1190	adox	$acc0, $acc5
1191	adc	\$0, $acc5		# cf=0, of=0
1192
1193	################################# Multiply by b[1]
1194	mulx	8*0+128($a_ptr), $t0, $t1
1195	adcx	$t0, $acc1
1196	adox	$t1, $acc2
1197
1198	mulx	8*1+128($a_ptr), $t0, $t1
1199	adcx	$t0, $acc2
1200	adox	$t1, $acc3
1201
1202	mulx	8*2+128($a_ptr), $t0, $t1
1203	adcx	$t0, $acc3
1204	adox	$t1, $acc4
1205
1206	mulx	8*3+128($a_ptr), $t0, $t1
1207	 mov	$acc1, %rdx
1208	 mulx	%r15, %rdx, %rax
1209	adcx	$t0, $acc4
1210	adox	$t1, $acc5
1211
1212	adcx	$acc0, $acc5
1213	adox	$acc0, $acc0
1214	adc	\$0, $acc0		# cf=0, of=0
1215
1216	################################# reduction
1217	mulx	8*0+128(%r14), $t0, $t1
1218	adcx	$t0, $acc1		# guaranteed to be zero
1219	adox	$t1, $acc2
1220
1221	mulx	8*1+128(%r14), $t0, $t1
1222	adcx	$t0, $acc2
1223	adox	$t1, $acc3
1224
1225	mulx	8*2+128(%r14), $t0, $t1
1226	adcx	$t0, $acc3
1227	adox	$t1, $acc4
1228
1229	mulx	8*3+128(%r14), $t0, $t1
1230	 mov	8*2($b_ptr), %rdx
1231	adcx	$t0, $acc4
1232	adox	$t1, $acc5
1233	adcx	$acc1, $acc5
1234	adox	$acc1, $acc0
1235	adc	\$0, $acc0		# cf=0, of=0
1236
1237	################################# Multiply by b[2]
1238	mulx	8*0+128($a_ptr), $t0, $t1
1239	adcx	$t0, $acc2
1240	adox	$t1, $acc3
1241
1242	mulx	8*1+128($a_ptr), $t0, $t1
1243	adcx	$t0, $acc3
1244	adox	$t1, $acc4
1245
1246	mulx	8*2+128($a_ptr), $t0, $t1
1247	adcx	$t0, $acc4
1248	adox	$t1, $acc5
1249
1250	mulx	8*3+128($a_ptr), $t0, $t1
1251	 mov	$acc2, %rdx
1252	 mulx	%r15, %rdx, %rax
1253	adcx	$t0, $acc5
1254	adox	$t1, $acc0
1255
1256	adcx	$acc1, $acc0
1257	adox	$acc1, $acc1
1258	adc	\$0, $acc1		# cf=0, of=0
1259
1260	################################# reduction
1261	mulx	8*0+128(%r14), $t0, $t1
1262	adcx	$t0, $acc2		# guaranteed to be zero
1263	adox	$t1, $acc3
1264
1265	mulx	8*1+128(%r14), $t0, $t1
1266	adcx	$t0, $acc3
1267	adox	$t1, $acc4
1268
1269	mulx	8*2+128(%r14), $t0, $t1
1270	adcx	$t0, $acc4
1271	adox	$t1, $acc5
1272
1273	mulx	8*3+128(%r14), $t0, $t1
1274	 mov	8*3($b_ptr), %rdx
1275	adcx	$t0, $acc5
1276	adox	$t1, $acc0
1277	adcx	$acc2, $acc0
1278	adox	$acc2, $acc1
1279	adc	\$0, $acc1		# cf=0, of=0
1280
1281	################################# Multiply by b[3]
1282	mulx	8*0+128($a_ptr), $t0, $t1
1283	adcx	$t0, $acc3
1284	adox	$t1, $acc4
1285
1286	mulx	8*1+128($a_ptr), $t0, $t1
1287	adcx	$t0, $acc4
1288	adox	$t1, $acc5
1289
1290	mulx	8*2+128($a_ptr), $t0, $t1
1291	adcx	$t0, $acc5
1292	adox	$t1, $acc0
1293
1294	mulx	8*3+128($a_ptr), $t0, $t1
1295	 mov	$acc3, %rdx
1296	 mulx	%r15, %rdx, %rax
1297	adcx	$t0, $acc0
1298	adox	$t1, $acc1
1299
1300	adcx	$acc2, $acc1
1301	adox	$acc2, $acc2
1302	adc	\$0, $acc2		# cf=0, of=0
1303
1304	################################# reduction
1305	mulx	8*0+128(%r14), $t0, $t1
1306	adcx	$t0, $acc3		# guaranteed to be zero
1307	adox	$t1, $acc4
1308
1309	mulx	8*1+128(%r14), $t0, $t1
1310	adcx	$t0, $acc4
1311	adox	$t1, $acc5
1312
1313	mulx	8*2+128(%r14), $t0, $t1
1314	adcx	$t0, $acc5
1315	adox	$t1, $acc0
1316
1317	mulx	8*3+128(%r14), $t0, $t1
1318	lea	128(%r14),%r14
1319	 mov	$acc4, $t2
1320	adcx	$t0, $acc0
1321	adox	$t1, $acc1
1322	 mov	$acc5, $t3
1323	adcx	$acc3, $acc1
1324	adox	$acc3, $acc2
1325	adc	\$0, $acc2
1326
1327	#################################
1328	# Branch-less conditional subtraction of P
1329	 mov	$acc0, $t0
1330	sub	8*0(%r14), $acc4
1331	sbb	8*1(%r14), $acc5
1332	sbb	8*2(%r14), $acc0
1333	 mov	$acc1, $t1
1334	sbb	8*3(%r14), $acc1
1335	sbb	\$0, $acc2
1336
1337	cmovc	$t2, $acc4
1338	cmovc	$t3, $acc5
1339	cmovc	$t0, $acc0
1340	cmovc	$t1, $acc1
1341
1342	mov	$acc4, 8*0($r_ptr)
1343	mov	$acc5, 8*1($r_ptr)
1344	mov	$acc0, 8*2($r_ptr)
1345	mov	$acc1, 8*3($r_ptr)
1346
1347	mov	0(%rsp),%r15
1348.cfi_restore	%r15
1349	mov	8(%rsp),%r14
1350.cfi_restore	%r14
1351	mov	16(%rsp),%r13
1352.cfi_restore	%r13
1353	mov	24(%rsp),%r12
1354.cfi_restore	%r12
1355	mov	32(%rsp),%rbx
1356.cfi_restore	%rbx
1357	mov	40(%rsp),%rbp
1358.cfi_restore	%rbp
1359	lea	48(%rsp),%rsp
1360.cfi_adjust_cfa_offset	-48
1361.Lord_mulx_epilogue:
1362	ret
1363.cfi_endproc
1364.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1365
1366.type	ecp_nistz256_ord_sqr_montx,\@function,3
1367.align	32
1368ecp_nistz256_ord_sqr_montx:
1369.cfi_startproc
1370.Lecp_nistz256_ord_sqr_montx:
1371	push	%rbp
1372.cfi_push	%rbp
1373	push	%rbx
1374.cfi_push	%rbx
1375	push	%r12
1376.cfi_push	%r12
1377	push	%r13
1378.cfi_push	%r13
1379	push	%r14
1380.cfi_push	%r14
1381	push	%r15
1382.cfi_push	%r15
1383.Lord_sqrx_body:
1384
1385	mov	$b_org, $b_ptr
1386	mov	8*0($a_ptr), %rdx
1387	mov	8*1($a_ptr), $acc6
1388	mov	8*2($a_ptr), $acc7
1389	mov	8*3($a_ptr), $acc0
1390	lea	.Lord(%rip), $a_ptr
1391	jmp	.Loop_ord_sqrx
1392
1393.align	32
1394.Loop_ord_sqrx:
1395	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1396	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1397	 mov	%rdx, %rax		# offload a[0]
1398	 movq	$acc6, %xmm1		# offload a[1]
1399	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1400	 mov	$acc6, %rdx
1401	add	$t0, $acc2
1402	 movq	$acc7, %xmm2		# offload a[2]
1403	adc	$t1, $acc3
1404	adc	\$0, $acc4
1405	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1406	#################################
1407	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1408	adcx	$t0, $acc3
1409	adox	$t1, $acc4
1410
1411	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1412	 mov	$acc7, %rdx
1413	adcx	$t0, $acc4
1414	adox	$t1, $acc5
1415	adc	\$0, $acc5
1416	#################################
1417	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1418	mov	%rax, %rdx
1419	 movq	$acc0, %xmm3		# offload a[3]
1420	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1421	 adcx	$acc1, $acc1		# acc1:6<<1
1422	adox	$t0, $acc5
1423	 adcx	$acc2, $acc2
1424	adox	$acc7, $acc6		# of=0
1425
1426	################################# a[i]*a[i]
1427	mulx	%rdx, $acc0, $t1
1428	movq	%xmm1, %rdx
1429	 adcx	$acc3, $acc3
1430	adox	$t1, $acc1
1431	 adcx	$acc4, $acc4
1432	mulx	%rdx, $t0, $t4
1433	movq	%xmm2, %rdx
1434	 adcx	$acc5, $acc5
1435	adox	$t0, $acc2
1436	 adcx	$acc6, $acc6
1437	mulx	%rdx, $t0, $t1
1438	.byte	0x67
1439	movq	%xmm3, %rdx
1440	adox	$t4, $acc3
1441	 adcx	$acc7, $acc7
1442	adox	$t0, $acc4
1443	adox	$t1, $acc5
1444	mulx	%rdx, $t0, $t4
1445	adox	$t0, $acc6
1446	adox	$t4, $acc7
1447
1448	################################# reduction
1449	mov	$acc0, %rdx
1450	mulx	8*4($a_ptr), %rdx, $t0
1451
1452	xor	%rax, %rax		# cf=0, of=0
1453	mulx	8*0($a_ptr), $t0, $t1
1454	adcx	$t0, $acc0		# guaranteed to be zero
1455	adox	$t1, $acc1
1456	mulx	8*1($a_ptr), $t0, $t1
1457	adcx	$t0, $acc1
1458	adox	$t1, $acc2
1459	mulx	8*2($a_ptr), $t0, $t1
1460	adcx	$t0, $acc2
1461	adox	$t1, $acc3
1462	mulx	8*3($a_ptr), $t0, $t1
1463	adcx	$t0, $acc3
1464	adox	$t1, $acc0		# of=0
1465	adcx	%rax, $acc0		# cf=0
1466
1467	#################################
1468	mov	$acc1, %rdx
1469	mulx	8*4($a_ptr), %rdx, $t0
1470
1471	mulx	8*0($a_ptr), $t0, $t1
1472	adox	$t0, $acc1		# guaranteed to be zero
1473	adcx	$t1, $acc2
1474	mulx	8*1($a_ptr), $t0, $t1
1475	adox	$t0, $acc2
1476	adcx	$t1, $acc3
1477	mulx	8*2($a_ptr), $t0, $t1
1478	adox	$t0, $acc3
1479	adcx	$t1, $acc0
1480	mulx	8*3($a_ptr), $t0, $t1
1481	adox	$t0, $acc0
1482	adcx	$t1, $acc1		# cf=0
1483	adox	%rax, $acc1		# of=0
1484
1485	#################################
1486	mov	$acc2, %rdx
1487	mulx	8*4($a_ptr), %rdx, $t0
1488
1489	mulx	8*0($a_ptr), $t0, $t1
1490	adcx	$t0, $acc2		# guaranteed to be zero
1491	adox	$t1, $acc3
1492	mulx	8*1($a_ptr), $t0, $t1
1493	adcx	$t0, $acc3
1494	adox	$t1, $acc0
1495	mulx	8*2($a_ptr), $t0, $t1
1496	adcx	$t0, $acc0
1497	adox	$t1, $acc1
1498	mulx	8*3($a_ptr), $t0, $t1
1499	adcx	$t0, $acc1
1500	adox	$t1, $acc2		# of=0
1501	adcx	%rax, $acc2		# cf=0
1502
1503	#################################
1504	mov	$acc3, %rdx
1505	mulx	8*4($a_ptr), %rdx, $t0
1506
1507	mulx	8*0($a_ptr), $t0, $t1
1508	adox	$t0, $acc3		# guaranteed to be zero
1509	adcx	$t1, $acc0
1510	mulx	8*1($a_ptr), $t0, $t1
1511	adox	$t0, $acc0
1512	adcx	$t1, $acc1
1513	mulx	8*2($a_ptr), $t0, $t1
1514	adox	$t0, $acc1
1515	adcx	$t1, $acc2
1516	mulx	8*3($a_ptr), $t0, $t1
1517	adox	$t0, $acc2
1518	adcx	$t1, $acc3
1519	adox	%rax, $acc3
1520
1521	################################# accumulate upper half
1522	add	$acc0, $acc4		# add	$acc4, $acc0
1523	adc	$acc5, $acc1
1524	 mov	$acc4, %rdx
1525	adc	$acc6, $acc2
1526	adc	$acc7, $acc3
1527	 mov	$acc1, $acc6
1528	adc	\$0, %rax
1529
1530	################################# compare to modulus
1531	sub	8*0($a_ptr), $acc4
1532	 mov	$acc2, $acc7
1533	sbb	8*1($a_ptr), $acc1
1534	sbb	8*2($a_ptr), $acc2
1535	 mov	$acc3, $acc0
1536	sbb	8*3($a_ptr), $acc3
1537	sbb	\$0, %rax
1538
1539	cmovnc	$acc4, %rdx
1540	cmovnc	$acc1, $acc6
1541	cmovnc	$acc2, $acc7
1542	cmovnc	$acc3, $acc0
1543
1544	dec	$b_ptr
1545	jnz	.Loop_ord_sqrx
1546
1547	mov	%rdx, 8*0($r_ptr)
1548	mov	$acc6, 8*1($r_ptr)
1549	pxor	%xmm1, %xmm1
1550	mov	$acc7, 8*2($r_ptr)
1551	pxor	%xmm2, %xmm2
1552	mov	$acc0, 8*3($r_ptr)
1553	pxor	%xmm3, %xmm3
1554
1555	mov	0(%rsp),%r15
1556.cfi_restore	%r15
1557	mov	8(%rsp),%r14
1558.cfi_restore	%r14
1559	mov	16(%rsp),%r13
1560.cfi_restore	%r13
1561	mov	24(%rsp),%r12
1562.cfi_restore	%r12
1563	mov	32(%rsp),%rbx
1564.cfi_restore	%rbx
1565	mov	40(%rsp),%rbp
1566.cfi_restore	%rbp
1567	lea	48(%rsp),%rsp
1568.cfi_adjust_cfa_offset	-48
1569.Lord_sqrx_epilogue:
1570	ret
1571.cfi_endproc
1572.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1573___
1574
1575$code.=<<___;
1576################################################################################
1577# void ecp_nistz256_to_mont(
1578#   uint64_t res[4],
1579#   uint64_t in[4]);
1580.globl	ecp_nistz256_to_mont
1581.type	ecp_nistz256_to_mont,\@function,2
1582.align	32
1583ecp_nistz256_to_mont:
1584.cfi_startproc
1585___
1586$code.=<<___	if ($addx);
1587	mov	\$0x80100, %ecx
1588	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1589___
1590$code.=<<___;
1591	lea	.LRR(%rip), $b_org
1592	jmp	.Lmul_mont
1593.cfi_endproc
1594.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1595
1596################################################################################
1597# void ecp_nistz256_mul_mont(
1598#   uint64_t res[4],
1599#   uint64_t a[4],
1600#   uint64_t b[4]);
1601
1602.globl	ecp_nistz256_mul_mont
1603.type	ecp_nistz256_mul_mont,\@function,3
1604.align	32
1605ecp_nistz256_mul_mont:
1606.cfi_startproc
1607___
1608$code.=<<___	if ($addx);
1609	mov	\$0x80100, %ecx
1610	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1611___
1612$code.=<<___;
1613.Lmul_mont:
1614	push	%rbp
1615.cfi_push	%rbp
1616	push	%rbx
1617.cfi_push	%rbx
1618	push	%r12
1619.cfi_push	%r12
1620	push	%r13
1621.cfi_push	%r13
1622	push	%r14
1623.cfi_push	%r14
1624	push	%r15
1625.cfi_push	%r15
1626.Lmul_body:
1627___
1628$code.=<<___	if ($addx);
1629	cmp	\$0x80100, %ecx
1630	je	.Lmul_montx
1631___
1632$code.=<<___;
1633	mov	$b_org, $b_ptr
1634	mov	8*0($b_org), %rax
1635	mov	8*0($a_ptr), $acc1
1636	mov	8*1($a_ptr), $acc2
1637	mov	8*2($a_ptr), $acc3
1638	mov	8*3($a_ptr), $acc4
1639
1640	call	__ecp_nistz256_mul_montq
1641___
1642$code.=<<___	if ($addx);
1643	jmp	.Lmul_mont_done
1644
1645.align	32
1646.Lmul_montx:
1647	mov	$b_org, $b_ptr
1648	mov	8*0($b_org), %rdx
1649	mov	8*0($a_ptr), $acc1
1650	mov	8*1($a_ptr), $acc2
1651	mov	8*2($a_ptr), $acc3
1652	mov	8*3($a_ptr), $acc4
1653	lea	-128($a_ptr), $a_ptr	# control u-op density
1654
1655	call	__ecp_nistz256_mul_montx
1656___
1657$code.=<<___;
1658.Lmul_mont_done:
1659	mov	0(%rsp),%r15
1660.cfi_restore	%r15
1661	mov	8(%rsp),%r14
1662.cfi_restore	%r14
1663	mov	16(%rsp),%r13
1664.cfi_restore	%r13
1665	mov	24(%rsp),%r12
1666.cfi_restore	%r12
1667	mov	32(%rsp),%rbx
1668.cfi_restore	%rbx
1669	mov	40(%rsp),%rbp
1670.cfi_restore	%rbp
1671	lea	48(%rsp),%rsp
1672.cfi_adjust_cfa_offset	-48
1673.Lmul_epilogue:
1674	ret
1675.cfi_endproc
1676.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1677
1678.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1679.align	32
1680__ecp_nistz256_mul_montq:
1681.cfi_startproc
1682	########################################################################
1683	# Multiply a by b[0]
1684	mov	%rax, $t1
1685	mulq	$acc1
1686	mov	.Lpoly+8*1(%rip),$poly1
1687	mov	%rax, $acc0
1688	mov	$t1, %rax
1689	mov	%rdx, $acc1
1690
1691	mulq	$acc2
1692	mov	.Lpoly+8*3(%rip),$poly3
1693	add	%rax, $acc1
1694	mov	$t1, %rax
1695	adc	\$0, %rdx
1696	mov	%rdx, $acc2
1697
1698	mulq	$acc3
1699	add	%rax, $acc2
1700	mov	$t1, %rax
1701	adc	\$0, %rdx
1702	mov	%rdx, $acc3
1703
1704	mulq	$acc4
1705	add	%rax, $acc3
1706	 mov	$acc0, %rax
1707	adc	\$0, %rdx
1708	xor	$acc5, $acc5
1709	mov	%rdx, $acc4
1710
1711	########################################################################
1712	# First reduction step
1713	# Basically now we want to multiply acc[0] by p256,
1714	# and add the result to the acc.
1715	# Due to the special form of p256 we do some optimizations
1716	#
1717	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1718	# then we add acc[0] and get acc[0] x 2^96
1719
1720	mov	$acc0, $t1
1721	shl	\$32, $acc0
1722	mulq	$poly3
1723	shr	\$32, $t1
1724	add	$acc0, $acc1		# +=acc[0]<<96
1725	adc	$t1, $acc2
1726	adc	%rax, $acc3
1727	 mov	8*1($b_ptr), %rax
1728	adc	%rdx, $acc4
1729	adc	\$0, $acc5
1730	xor	$acc0, $acc0
1731
1732	########################################################################
1733	# Multiply by b[1]
1734	mov	%rax, $t1
1735	mulq	8*0($a_ptr)
1736	add	%rax, $acc1
1737	mov	$t1, %rax
1738	adc	\$0, %rdx
1739	mov	%rdx, $t0
1740
1741	mulq	8*1($a_ptr)
1742	add	$t0, $acc2
1743	adc	\$0, %rdx
1744	add	%rax, $acc2
1745	mov	$t1, %rax
1746	adc	\$0, %rdx
1747	mov	%rdx, $t0
1748
1749	mulq	8*2($a_ptr)
1750	add	$t0, $acc3
1751	adc	\$0, %rdx
1752	add	%rax, $acc3
1753	mov	$t1, %rax
1754	adc	\$0, %rdx
1755	mov	%rdx, $t0
1756
1757	mulq	8*3($a_ptr)
1758	add	$t0, $acc4
1759	adc	\$0, %rdx
1760	add	%rax, $acc4
1761	 mov	$acc1, %rax
1762	adc	%rdx, $acc5
1763	adc	\$0, $acc0
1764
1765	########################################################################
1766	# Second reduction step
1767	mov	$acc1, $t1
1768	shl	\$32, $acc1
1769	mulq	$poly3
1770	shr	\$32, $t1
1771	add	$acc1, $acc2
1772	adc	$t1, $acc3
1773	adc	%rax, $acc4
1774	 mov	8*2($b_ptr), %rax
1775	adc	%rdx, $acc5
1776	adc	\$0, $acc0
1777	xor	$acc1, $acc1
1778
1779	########################################################################
1780	# Multiply by b[2]
1781	mov	%rax, $t1
1782	mulq	8*0($a_ptr)
1783	add	%rax, $acc2
1784	mov	$t1, %rax
1785	adc	\$0, %rdx
1786	mov	%rdx, $t0
1787
1788	mulq	8*1($a_ptr)
1789	add	$t0, $acc3
1790	adc	\$0, %rdx
1791	add	%rax, $acc3
1792	mov	$t1, %rax
1793	adc	\$0, %rdx
1794	mov	%rdx, $t0
1795
1796	mulq	8*2($a_ptr)
1797	add	$t0, $acc4
1798	adc	\$0, %rdx
1799	add	%rax, $acc4
1800	mov	$t1, %rax
1801	adc	\$0, %rdx
1802	mov	%rdx, $t0
1803
1804	mulq	8*3($a_ptr)
1805	add	$t0, $acc5
1806	adc	\$0, %rdx
1807	add	%rax, $acc5
1808	 mov	$acc2, %rax
1809	adc	%rdx, $acc0
1810	adc	\$0, $acc1
1811
1812	########################################################################
1813	# Third reduction step
1814	mov	$acc2, $t1
1815	shl	\$32, $acc2
1816	mulq	$poly3
1817	shr	\$32, $t1
1818	add	$acc2, $acc3
1819	adc	$t1, $acc4
1820	adc	%rax, $acc5
1821	 mov	8*3($b_ptr), %rax
1822	adc	%rdx, $acc0
1823	adc	\$0, $acc1
1824	xor	$acc2, $acc2
1825
1826	########################################################################
1827	# Multiply by b[3]
1828	mov	%rax, $t1
1829	mulq	8*0($a_ptr)
1830	add	%rax, $acc3
1831	mov	$t1, %rax
1832	adc	\$0, %rdx
1833	mov	%rdx, $t0
1834
1835	mulq	8*1($a_ptr)
1836	add	$t0, $acc4
1837	adc	\$0, %rdx
1838	add	%rax, $acc4
1839	mov	$t1, %rax
1840	adc	\$0, %rdx
1841	mov	%rdx, $t0
1842
1843	mulq	8*2($a_ptr)
1844	add	$t0, $acc5
1845	adc	\$0, %rdx
1846	add	%rax, $acc5
1847	mov	$t1, %rax
1848	adc	\$0, %rdx
1849	mov	%rdx, $t0
1850
1851	mulq	8*3($a_ptr)
1852	add	$t0, $acc0
1853	adc	\$0, %rdx
1854	add	%rax, $acc0
1855	 mov	$acc3, %rax
1856	adc	%rdx, $acc1
1857	adc	\$0, $acc2
1858
1859	########################################################################
1860	# Final reduction step
1861	mov	$acc3, $t1
1862	shl	\$32, $acc3
1863	mulq	$poly3
1864	shr	\$32, $t1
1865	add	$acc3, $acc4
1866	adc	$t1, $acc5
1867	 mov	$acc4, $t0
1868	adc	%rax, $acc0
1869	adc	%rdx, $acc1
1870	 mov	$acc5, $t1
1871	adc	\$0, $acc2
1872
1873	########################################################################
1874	# Branch-less conditional subtraction of P
1875	sub	\$-1, $acc4		# .Lpoly[0]
1876	 mov	$acc0, $t2
1877	sbb	$poly1, $acc5		# .Lpoly[1]
1878	sbb	\$0, $acc0		# .Lpoly[2]
1879	 mov	$acc1, $t3
1880	sbb	$poly3, $acc1		# .Lpoly[3]
1881	sbb	\$0, $acc2
1882
1883	cmovc	$t0, $acc4
1884	cmovc	$t1, $acc5
1885	mov	$acc4, 8*0($r_ptr)
1886	cmovc	$t2, $acc0
1887	mov	$acc5, 8*1($r_ptr)
1888	cmovc	$t3, $acc1
1889	mov	$acc0, 8*2($r_ptr)
1890	mov	$acc1, 8*3($r_ptr)
1891
1892	ret
1893.cfi_endproc
1894.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1895
1896################################################################################
1897# void ecp_nistz256_sqr_mont(
1898#   uint64_t res[4],
1899#   uint64_t a[4]);
1900
1901# we optimize the square according to S.Gueron and V.Krasnov,
1902# "Speeding up Big-Number Squaring"
1903.globl	ecp_nistz256_sqr_mont
1904.type	ecp_nistz256_sqr_mont,\@function,2
1905.align	32
1906ecp_nistz256_sqr_mont:
1907.cfi_startproc
1908___
1909$code.=<<___	if ($addx);
1910	mov	\$0x80100, %ecx
1911	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1912___
1913$code.=<<___;
1914	push	%rbp
1915.cfi_push	%rbp
1916	push	%rbx
1917.cfi_push	%rbx
1918	push	%r12
1919.cfi_push	%r12
1920	push	%r13
1921.cfi_push	%r13
1922	push	%r14
1923.cfi_push	%r14
1924	push	%r15
1925.cfi_push	%r15
1926.Lsqr_body:
1927___
1928$code.=<<___	if ($addx);
1929	cmp	\$0x80100, %ecx
1930	je	.Lsqr_montx
1931___
1932$code.=<<___;
1933	mov	8*0($a_ptr), %rax
1934	mov	8*1($a_ptr), $acc6
1935	mov	8*2($a_ptr), $acc7
1936	mov	8*3($a_ptr), $acc0
1937
1938	call	__ecp_nistz256_sqr_montq
1939___
1940$code.=<<___	if ($addx);
1941	jmp	.Lsqr_mont_done
1942
1943.align	32
1944.Lsqr_montx:
1945	mov	8*0($a_ptr), %rdx
1946	mov	8*1($a_ptr), $acc6
1947	mov	8*2($a_ptr), $acc7
1948	mov	8*3($a_ptr), $acc0
1949	lea	-128($a_ptr), $a_ptr	# control u-op density
1950
1951	call	__ecp_nistz256_sqr_montx
1952___
1953$code.=<<___;
1954.Lsqr_mont_done:
1955	mov	0(%rsp),%r15
1956.cfi_restore	%r15
1957	mov	8(%rsp),%r14
1958.cfi_restore	%r14
1959	mov	16(%rsp),%r13
1960.cfi_restore	%r13
1961	mov	24(%rsp),%r12
1962.cfi_restore	%r12
1963	mov	32(%rsp),%rbx
1964.cfi_restore	%rbx
1965	mov	40(%rsp),%rbp
1966.cfi_restore	%rbp
1967	lea	48(%rsp),%rsp
1968.cfi_adjust_cfa_offset	-48
1969.Lsqr_epilogue:
1970	ret
1971.cfi_endproc
1972.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1973
1974.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1975.align	32
1976__ecp_nistz256_sqr_montq:
1977.cfi_startproc
1978	mov	%rax, $acc5
1979	mulq	$acc6			# a[1]*a[0]
1980	mov	%rax, $acc1
1981	mov	$acc7, %rax
1982	mov	%rdx, $acc2
1983
1984	mulq	$acc5			# a[0]*a[2]
1985	add	%rax, $acc2
1986	mov	$acc0, %rax
1987	adc	\$0, %rdx
1988	mov	%rdx, $acc3
1989
1990	mulq	$acc5			# a[0]*a[3]
1991	add	%rax, $acc3
1992	 mov	$acc7, %rax
1993	adc	\$0, %rdx
1994	mov	%rdx, $acc4
1995
1996	#################################
1997	mulq	$acc6			# a[1]*a[2]
1998	add	%rax, $acc3
1999	mov	$acc0, %rax
2000	adc	\$0, %rdx
2001	mov	%rdx, $t1
2002
2003	mulq	$acc6			# a[1]*a[3]
2004	add	%rax, $acc4
2005	 mov	$acc0, %rax
2006	adc	\$0, %rdx
2007	add	$t1, $acc4
2008	mov	%rdx, $acc5
2009	adc	\$0, $acc5
2010
2011	#################################
2012	mulq	$acc7			# a[2]*a[3]
2013	xor	$acc7, $acc7
2014	add	%rax, $acc5
2015	 mov	8*0($a_ptr), %rax
2016	mov	%rdx, $acc6
2017	adc	\$0, $acc6
2018
2019	add	$acc1, $acc1		# acc1:6<<1
2020	adc	$acc2, $acc2
2021	adc	$acc3, $acc3
2022	adc	$acc4, $acc4
2023	adc	$acc5, $acc5
2024	adc	$acc6, $acc6
2025	adc	\$0, $acc7
2026
2027	mulq	%rax
2028	mov	%rax, $acc0
2029	mov	8*1($a_ptr), %rax
2030	mov	%rdx, $t0
2031
2032	mulq	%rax
2033	add	$t0, $acc1
2034	adc	%rax, $acc2
2035	mov	8*2($a_ptr), %rax
2036	adc	\$0, %rdx
2037	mov	%rdx, $t0
2038
2039	mulq	%rax
2040	add	$t0, $acc3
2041	adc	%rax, $acc4
2042	mov	8*3($a_ptr), %rax
2043	adc	\$0, %rdx
2044	mov	%rdx, $t0
2045
2046	mulq	%rax
2047	add	$t0, $acc5
2048	adc	%rax, $acc6
2049	 mov	$acc0, %rax
2050	adc	%rdx, $acc7
2051
2052	mov	.Lpoly+8*1(%rip), $a_ptr
2053	mov	.Lpoly+8*3(%rip), $t1
2054
2055	##########################################
2056	# Now the reduction
2057	# First iteration
2058	mov	$acc0, $t0
2059	shl	\$32, $acc0
2060	mulq	$t1
2061	shr	\$32, $t0
2062	add	$acc0, $acc1		# +=acc[0]<<96
2063	adc	$t0, $acc2
2064	adc	%rax, $acc3
2065	 mov	$acc1, %rax
2066	adc	\$0, %rdx
2067
2068	##########################################
2069	# Second iteration
2070	mov	$acc1, $t0
2071	shl	\$32, $acc1
2072	mov	%rdx, $acc0
2073	mulq	$t1
2074	shr	\$32, $t0
2075	add	$acc1, $acc2
2076	adc	$t0, $acc3
2077	adc	%rax, $acc0
2078	 mov	$acc2, %rax
2079	adc	\$0, %rdx
2080
2081	##########################################
2082	# Third iteration
2083	mov	$acc2, $t0
2084	shl	\$32, $acc2
2085	mov	%rdx, $acc1
2086	mulq	$t1
2087	shr	\$32, $t0
2088	add	$acc2, $acc3
2089	adc	$t0, $acc0
2090	adc	%rax, $acc1
2091	 mov	$acc3, %rax
2092	adc	\$0, %rdx
2093
2094	###########################################
2095	# Last iteration
2096	mov	$acc3, $t0
2097	shl	\$32, $acc3
2098	mov	%rdx, $acc2
2099	mulq	$t1
2100	shr	\$32, $t0
2101	add	$acc3, $acc0
2102	adc	$t0, $acc1
2103	adc	%rax, $acc2
2104	adc	\$0, %rdx
2105	xor	$acc3, $acc3
2106
2107	############################################
2108	# Add the rest of the acc
2109	add	$acc0, $acc4
2110	adc	$acc1, $acc5
2111	 mov	$acc4, $acc0
2112	adc	$acc2, $acc6
2113	adc	%rdx, $acc7
2114	 mov	$acc5, $acc1
2115	adc	\$0, $acc3
2116
2117	sub	\$-1, $acc4		# .Lpoly[0]
2118	 mov	$acc6, $acc2
2119	sbb	$a_ptr, $acc5		# .Lpoly[1]
2120	sbb	\$0, $acc6		# .Lpoly[2]
2121	 mov	$acc7, $t0
2122	sbb	$t1, $acc7		# .Lpoly[3]
2123	sbb	\$0, $acc3
2124
2125	cmovc	$acc0, $acc4
2126	cmovc	$acc1, $acc5
2127	mov	$acc4, 8*0($r_ptr)
2128	cmovc	$acc2, $acc6
2129	mov	$acc5, 8*1($r_ptr)
2130	cmovc	$t0, $acc7
2131	mov	$acc6, 8*2($r_ptr)
2132	mov	$acc7, 8*3($r_ptr)
2133
2134	ret
2135.cfi_endproc
2136.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2137___
2138
2139if ($addx) {
2140$code.=<<___;
2141.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
2142.align	32
2143__ecp_nistz256_mul_montx:
2144.cfi_startproc
2145	########################################################################
2146	# Multiply by b[0]
2147	mulx	$acc1, $acc0, $acc1
2148	mulx	$acc2, $t0, $acc2
2149	mov	\$32, $poly1
2150	xor	$acc5, $acc5		# cf=0
2151	mulx	$acc3, $t1, $acc3
2152	mov	.Lpoly+8*3(%rip), $poly3
2153	adc	$t0, $acc1
2154	mulx	$acc4, $t0, $acc4
2155	 mov	$acc0, %rdx
2156	adc	$t1, $acc2
2157	 shlx	$poly1,$acc0,$t1
2158	adc	$t0, $acc3
2159	 shrx	$poly1,$acc0,$t0
2160	adc	\$0, $acc4
2161
2162	########################################################################
2163	# First reduction step
2164	add	$t1, $acc1
2165	adc	$t0, $acc2
2166
2167	mulx	$poly3, $t0, $t1
2168	 mov	8*1($b_ptr), %rdx
2169	adc	$t0, $acc3
2170	adc	$t1, $acc4
2171	adc	\$0, $acc5
2172	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
2173
2174	########################################################################
2175	# Multiply by b[1]
2176	mulx	8*0+128($a_ptr), $t0, $t1
2177	adcx	$t0, $acc1
2178	adox	$t1, $acc2
2179
2180	mulx	8*1+128($a_ptr), $t0, $t1
2181	adcx	$t0, $acc2
2182	adox	$t1, $acc3
2183
2184	mulx	8*2+128($a_ptr), $t0, $t1
2185	adcx	$t0, $acc3
2186	adox	$t1, $acc4
2187
2188	mulx	8*3+128($a_ptr), $t0, $t1
2189	 mov	$acc1, %rdx
2190	adcx	$t0, $acc4
2191	 shlx	$poly1, $acc1, $t0
2192	adox	$t1, $acc5
2193	 shrx	$poly1, $acc1, $t1
2194
2195	adcx	$acc0, $acc5
2196	adox	$acc0, $acc0
2197	adc	\$0, $acc0
2198
2199	########################################################################
2200	# Second reduction step
2201	add	$t0, $acc2
2202	adc	$t1, $acc3
2203
2204	mulx	$poly3, $t0, $t1
2205	 mov	8*2($b_ptr), %rdx
2206	adc	$t0, $acc4
2207	adc	$t1, $acc5
2208	adc	\$0, $acc0
2209	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
2210
2211	########################################################################
2212	# Multiply by b[2]
2213	mulx	8*0+128($a_ptr), $t0, $t1
2214	adcx	$t0, $acc2
2215	adox	$t1, $acc3
2216
2217	mulx	8*1+128($a_ptr), $t0, $t1
2218	adcx	$t0, $acc3
2219	adox	$t1, $acc4
2220
2221	mulx	8*2+128($a_ptr), $t0, $t1
2222	adcx	$t0, $acc4
2223	adox	$t1, $acc5
2224
2225	mulx	8*3+128($a_ptr), $t0, $t1
2226	 mov	$acc2, %rdx
2227	adcx	$t0, $acc5
2228	 shlx	$poly1, $acc2, $t0
2229	adox	$t1, $acc0
2230	 shrx	$poly1, $acc2, $t1
2231
2232	adcx	$acc1, $acc0
2233	adox	$acc1, $acc1
2234	adc	\$0, $acc1
2235
2236	########################################################################
2237	# Third reduction step
2238	add	$t0, $acc3
2239	adc	$t1, $acc4
2240
2241	mulx	$poly3, $t0, $t1
2242	 mov	8*3($b_ptr), %rdx
2243	adc	$t0, $acc5
2244	adc	$t1, $acc0
2245	adc	\$0, $acc1
2246	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
2247
2248	########################################################################
2249	# Multiply by b[3]
2250	mulx	8*0+128($a_ptr), $t0, $t1
2251	adcx	$t0, $acc3
2252	adox	$t1, $acc4
2253
2254	mulx	8*1+128($a_ptr), $t0, $t1
2255	adcx	$t0, $acc4
2256	adox	$t1, $acc5
2257
2258	mulx	8*2+128($a_ptr), $t0, $t1
2259	adcx	$t0, $acc5
2260	adox	$t1, $acc0
2261
2262	mulx	8*3+128($a_ptr), $t0, $t1
2263	 mov	$acc3, %rdx
2264	adcx	$t0, $acc0
2265	 shlx	$poly1, $acc3, $t0
2266	adox	$t1, $acc1
2267	 shrx	$poly1, $acc3, $t1
2268
2269	adcx	$acc2, $acc1
2270	adox	$acc2, $acc2
2271	adc	\$0, $acc2
2272
2273	########################################################################
2274	# Fourth reduction step
2275	add	$t0, $acc4
2276	adc	$t1, $acc5
2277
2278	mulx	$poly3, $t0, $t1
2279	 mov	$acc4, $t2
2280	mov	.Lpoly+8*1(%rip), $poly1
2281	adc	$t0, $acc0
2282	 mov	$acc5, $t3
2283	adc	$t1, $acc1
2284	adc	\$0, $acc2
2285
2286	########################################################################
2287	# Branch-less conditional subtraction of P
2288	xor	%eax, %eax
2289	 mov	$acc0, $t0
2290	sbb	\$-1, $acc4		# .Lpoly[0]
2291	sbb	$poly1, $acc5		# .Lpoly[1]
2292	sbb	\$0, $acc0		# .Lpoly[2]
2293	 mov	$acc1, $t1
2294	sbb	$poly3, $acc1		# .Lpoly[3]
2295	sbb	\$0, $acc2
2296
2297	cmovc	$t2, $acc4
2298	cmovc	$t3, $acc5
2299	mov	$acc4, 8*0($r_ptr)
2300	cmovc	$t0, $acc0
2301	mov	$acc5, 8*1($r_ptr)
2302	cmovc	$t1, $acc1
2303	mov	$acc0, 8*2($r_ptr)
2304	mov	$acc1, 8*3($r_ptr)
2305
2306	ret
2307.cfi_endproc
2308.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2309
2310.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
2311.align	32
2312__ecp_nistz256_sqr_montx:
2313.cfi_startproc
2314	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
2315	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
2316	xor	%eax, %eax
2317	adc	$t0, $acc2
2318	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
2319	 mov	$acc6, %rdx
2320	adc	$t1, $acc3
2321	adc	\$0, $acc4
2322	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
2323
2324	#################################
2325	mulx	$acc7, $t0, $t1		# a[1]*a[2]
2326	adcx	$t0, $acc3
2327	adox	$t1, $acc4
2328
2329	mulx	$acc0, $t0, $t1		# a[1]*a[3]
2330	 mov	$acc7, %rdx
2331	adcx	$t0, $acc4
2332	adox	$t1, $acc5
2333	adc	\$0, $acc5
2334
2335	#################################
2336	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
2337	 mov	8*0+128($a_ptr), %rdx
2338	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
2339	 adcx	$acc1, $acc1		# acc1:6<<1
2340	adox	$t0, $acc5
2341	 adcx	$acc2, $acc2
2342	adox	$acc7, $acc6		# of=0
2343
2344	mulx	%rdx, $acc0, $t1
2345	mov	8*1+128($a_ptr), %rdx
2346	 adcx	$acc3, $acc3
2347	adox	$t1, $acc1
2348	 adcx	$acc4, $acc4
2349	mulx	%rdx, $t0, $t4
2350	mov	8*2+128($a_ptr), %rdx
2351	 adcx	$acc5, $acc5
2352	adox	$t0, $acc2
2353	 adcx	$acc6, $acc6
2354	.byte	0x67
2355	mulx	%rdx, $t0, $t1
2356	mov	8*3+128($a_ptr), %rdx
2357	adox	$t4, $acc3
2358	 adcx	$acc7, $acc7
2359	adox	$t0, $acc4
2360	 mov	\$32, $a_ptr
2361	adox	$t1, $acc5
2362	.byte	0x67,0x67
2363	mulx	%rdx, $t0, $t4
2364	 mov	.Lpoly+8*3(%rip), %rdx
2365	adox	$t0, $acc6
2366	 shlx	$a_ptr, $acc0, $t0
2367	adox	$t4, $acc7
2368	 shrx	$a_ptr, $acc0, $t4
2369	mov	%rdx,$t1
2370
2371	# reduction step 1
2372	add	$t0, $acc1
2373	adc	$t4, $acc2
2374
2375	mulx	$acc0, $t0, $acc0
2376	adc	$t0, $acc3
2377	 shlx	$a_ptr, $acc1, $t0
2378	adc	\$0, $acc0
2379	 shrx	$a_ptr, $acc1, $t4
2380
2381	# reduction step 2
2382	add	$t0, $acc2
2383	adc	$t4, $acc3
2384
2385	mulx	$acc1, $t0, $acc1
2386	adc	$t0, $acc0
2387	 shlx	$a_ptr, $acc2, $t0
2388	adc	\$0, $acc1
2389	 shrx	$a_ptr, $acc2, $t4
2390
2391	# reduction step 3
2392	add	$t0, $acc3
2393	adc	$t4, $acc0
2394
2395	mulx	$acc2, $t0, $acc2
2396	adc	$t0, $acc1
2397	 shlx	$a_ptr, $acc3, $t0
2398	adc	\$0, $acc2
2399	 shrx	$a_ptr, $acc3, $t4
2400
2401	# reduction step 4
2402	add	$t0, $acc0
2403	adc	$t4, $acc1
2404
2405	mulx	$acc3, $t0, $acc3
2406	adc	$t0, $acc2
2407	adc	\$0, $acc3
2408
2409	xor	$t3, $t3
2410	add	$acc0, $acc4		# accumulate upper half
2411	 mov	.Lpoly+8*1(%rip), $a_ptr
2412	adc	$acc1, $acc5
2413	 mov	$acc4, $acc0
2414	adc	$acc2, $acc6
2415	adc	$acc3, $acc7
2416	 mov	$acc5, $acc1
2417	adc	\$0, $t3
2418
2419	sub	\$-1, $acc4		# .Lpoly[0]
2420	 mov	$acc6, $acc2
2421	sbb	$a_ptr, $acc5		# .Lpoly[1]
2422	sbb	\$0, $acc6		# .Lpoly[2]
2423	 mov	$acc7, $acc3
2424	sbb	$t1, $acc7		# .Lpoly[3]
2425	sbb	\$0, $t3
2426
2427	cmovc	$acc0, $acc4
2428	cmovc	$acc1, $acc5
2429	mov	$acc4, 8*0($r_ptr)
2430	cmovc	$acc2, $acc6
2431	mov	$acc5, 8*1($r_ptr)
2432	cmovc	$acc3, $acc7
2433	mov	$acc6, 8*2($r_ptr)
2434	mov	$acc7, 8*3($r_ptr)
2435
2436	ret
2437.cfi_endproc
2438.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2439___
2440}
2441}
2442{
2443my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2444my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2445my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2446
2447$code.=<<___;
2448################################################################################
2449# void ecp_nistz256_from_mont(
2450#   uint64_t res[4],
2451#   uint64_t in[4]);
2452# This one performs Montgomery multiplication by 1, so we only need the reduction
2453
2454.globl	ecp_nistz256_from_mont
2455.type	ecp_nistz256_from_mont,\@function,2
2456.align	32
2457ecp_nistz256_from_mont:
2458.cfi_startproc
2459	push	%r12
2460.cfi_push	%r12
2461	push	%r13
2462.cfi_push	%r13
2463.Lfrom_body:
2464
2465	mov	8*0($in_ptr), %rax
2466	mov	.Lpoly+8*3(%rip), $t2
2467	mov	8*1($in_ptr), $acc1
2468	mov	8*2($in_ptr), $acc2
2469	mov	8*3($in_ptr), $acc3
2470	mov	%rax, $acc0
2471	mov	.Lpoly+8*1(%rip), $t1
2472
2473	#########################################
2474	# First iteration
2475	mov	%rax, $t0
2476	shl	\$32, $acc0
2477	mulq	$t2
2478	shr	\$32, $t0
2479	add	$acc0, $acc1
2480	adc	$t0, $acc2
2481	adc	%rax, $acc3
2482	 mov	$acc1, %rax
2483	adc	\$0, %rdx
2484
2485	#########################################
2486	# Second iteration
2487	mov	$acc1, $t0
2488	shl	\$32, $acc1
2489	mov	%rdx, $acc0
2490	mulq	$t2
2491	shr	\$32, $t0
2492	add	$acc1, $acc2
2493	adc	$t0, $acc3
2494	adc	%rax, $acc0
2495	 mov	$acc2, %rax
2496	adc	\$0, %rdx
2497
2498	##########################################
2499	# Third iteration
2500	mov	$acc2, $t0
2501	shl	\$32, $acc2
2502	mov	%rdx, $acc1
2503	mulq	$t2
2504	shr	\$32, $t0
2505	add	$acc2, $acc3
2506	adc	$t0, $acc0
2507	adc	%rax, $acc1
2508	 mov	$acc3, %rax
2509	adc	\$0, %rdx
2510
2511	###########################################
2512	# Last iteration
2513	mov	$acc3, $t0
2514	shl	\$32, $acc3
2515	mov	%rdx, $acc2
2516	mulq	$t2
2517	shr	\$32, $t0
2518	add	$acc3, $acc0
2519	adc	$t0, $acc1
2520	 mov	$acc0, $t0
2521	adc	%rax, $acc2
2522	 mov	$acc1, $in_ptr
2523	adc	\$0, %rdx
2524
2525	###########################################
2526	# Branch-less conditional subtraction
2527	sub	\$-1, $acc0
2528	 mov	$acc2, %rax
2529	sbb	$t1, $acc1
2530	sbb	\$0, $acc2
2531	 mov	%rdx, $acc3
2532	sbb	$t2, %rdx
2533	sbb	$t2, $t2
2534
2535	cmovnz	$t0, $acc0
2536	cmovnz	$in_ptr, $acc1
2537	mov	$acc0, 8*0($r_ptr)
2538	cmovnz	%rax, $acc2
2539	mov	$acc1, 8*1($r_ptr)
2540	cmovz	%rdx, $acc3
2541	mov	$acc2, 8*2($r_ptr)
2542	mov	$acc3, 8*3($r_ptr)
2543
2544	mov	0(%rsp),%r13
2545.cfi_restore	%r13
2546	mov	8(%rsp),%r12
2547.cfi_restore	%r12
2548	lea	16(%rsp),%rsp
2549.cfi_adjust_cfa_offset	-16
2550.Lfrom_epilogue:
2551	ret
2552.cfi_endproc
2553.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2554___
2555}
2556{
2557my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2558my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2559my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2560my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2561
2562$code.=<<___;
2563################################################################################
2564# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
2565.globl	ecp_nistz256_scatter_w5
2566.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
2567.align	32
2568ecp_nistz256_scatter_w5:
2569.cfi_startproc
2570	lea	-3($index,$index,2), $index
2571	movdqa	0x00($in_t), %xmm0
2572	shl	\$5, $index
2573	movdqa	0x10($in_t), %xmm1
2574	movdqa	0x20($in_t), %xmm2
2575	movdqa	0x30($in_t), %xmm3
2576	movdqa	0x40($in_t), %xmm4
2577	movdqa	0x50($in_t), %xmm5
2578	movdqa	%xmm0, 0x00($val,$index)
2579	movdqa	%xmm1, 0x10($val,$index)
2580	movdqa	%xmm2, 0x20($val,$index)
2581	movdqa	%xmm3, 0x30($val,$index)
2582	movdqa	%xmm4, 0x40($val,$index)
2583	movdqa	%xmm5, 0x50($val,$index)
2584
2585	ret
2586.cfi_endproc
2587.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2588
2589################################################################################
2590# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2591.globl	ecp_nistz256_gather_w5
2592.type	ecp_nistz256_gather_w5,\@abi-omnipotent
2593.align	32
2594ecp_nistz256_gather_w5:
2595.cfi_startproc
2596___
2597$code.=<<___	if ($avx>1);
2598	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2599	test	\$`1<<5`, %eax
2600	jnz	.Lavx2_gather_w5
2601___
2602$code.=<<___	if ($win64);
2603	lea	-0x88(%rsp), %rax
2604.LSEH_begin_ecp_nistz256_gather_w5:
2605	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2606	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2607	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2608	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2609	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2610	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2611	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2612	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2613	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2614	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2615	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2616___
2617$code.=<<___;
2618	movdqa	.LOne(%rip), $ONE
2619	movd	$index, $INDEX
2620
2621	pxor	$Ra, $Ra
2622	pxor	$Rb, $Rb
2623	pxor	$Rc, $Rc
2624	pxor	$Rd, $Rd
2625	pxor	$Re, $Re
2626	pxor	$Rf, $Rf
2627
2628	movdqa	$ONE, $M0
2629	pshufd	\$0, $INDEX, $INDEX
2630
2631	mov	\$16, %rax
2632.Lselect_loop_sse_w5:
2633
2634	movdqa	$M0, $TMP0
2635	paddd	$ONE, $M0
2636	pcmpeqd $INDEX, $TMP0
2637
2638	movdqa	16*0($in_t), $T0a
2639	movdqa	16*1($in_t), $T0b
2640	movdqa	16*2($in_t), $T0c
2641	movdqa	16*3($in_t), $T0d
2642	movdqa	16*4($in_t), $T0e
2643	movdqa	16*5($in_t), $T0f
2644	lea 16*6($in_t), $in_t
2645
2646	pand	$TMP0, $T0a
2647	pand	$TMP0, $T0b
2648	por	$T0a, $Ra
2649	pand	$TMP0, $T0c
2650	por	$T0b, $Rb
2651	pand	$TMP0, $T0d
2652	por	$T0c, $Rc
2653	pand	$TMP0, $T0e
2654	por	$T0d, $Rd
2655	pand	$TMP0, $T0f
2656	por	$T0e, $Re
2657	por	$T0f, $Rf
2658
2659	dec	%rax
2660	jnz	.Lselect_loop_sse_w5
2661
2662	movdqu	$Ra, 16*0($val)
2663	movdqu	$Rb, 16*1($val)
2664	movdqu	$Rc, 16*2($val)
2665	movdqu	$Rd, 16*3($val)
2666	movdqu	$Re, 16*4($val)
2667	movdqu	$Rf, 16*5($val)
2668___
2669$code.=<<___	if ($win64);
2670	movaps	(%rsp), %xmm6
2671	movaps	0x10(%rsp), %xmm7
2672	movaps	0x20(%rsp), %xmm8
2673	movaps	0x30(%rsp), %xmm9
2674	movaps	0x40(%rsp), %xmm10
2675	movaps	0x50(%rsp), %xmm11
2676	movaps	0x60(%rsp), %xmm12
2677	movaps	0x70(%rsp), %xmm13
2678	movaps	0x80(%rsp), %xmm14
2679	movaps	0x90(%rsp), %xmm15
2680	lea	0xa8(%rsp), %rsp
2681___
2682$code.=<<___;
2683	ret
2684.cfi_endproc
2685.LSEH_end_ecp_nistz256_gather_w5:
2686.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2687
2688################################################################################
2689# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2690.globl	ecp_nistz256_scatter_w7
2691.type	ecp_nistz256_scatter_w7,\@abi-omnipotent
2692.align	32
2693ecp_nistz256_scatter_w7:
2694.cfi_startproc
2695	movdqu	0x00($in_t), %xmm0
2696	shl	\$6, $index
2697	movdqu	0x10($in_t), %xmm1
2698	movdqu	0x20($in_t), %xmm2
2699	movdqu	0x30($in_t), %xmm3
2700	movdqa	%xmm0, 0x00($val,$index)
2701	movdqa	%xmm1, 0x10($val,$index)
2702	movdqa	%xmm2, 0x20($val,$index)
2703	movdqa	%xmm3, 0x30($val,$index)
2704
2705	ret
2706.cfi_endproc
2707.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2708
2709################################################################################
2710# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2711.globl	ecp_nistz256_gather_w7
2712.type	ecp_nistz256_gather_w7,\@abi-omnipotent
2713.align	32
2714ecp_nistz256_gather_w7:
2715.cfi_startproc
2716___
2717$code.=<<___	if ($avx>1);
2718	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2719	test	\$`1<<5`, %eax
2720	jnz	.Lavx2_gather_w7
2721___
2722$code.=<<___	if ($win64);
2723	lea	-0x88(%rsp), %rax
2724.LSEH_begin_ecp_nistz256_gather_w7:
2725	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2726	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2727	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2728	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2729	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2730	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2731	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2732	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2733	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2734	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2735	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2736___
2737$code.=<<___;
2738	movdqa	.LOne(%rip), $M0
2739	movd	$index, $INDEX
2740
2741	pxor	$Ra, $Ra
2742	pxor	$Rb, $Rb
2743	pxor	$Rc, $Rc
2744	pxor	$Rd, $Rd
2745
2746	movdqa	$M0, $ONE
2747	pshufd	\$0, $INDEX, $INDEX
2748	mov	\$64, %rax
2749
2750.Lselect_loop_sse_w7:
2751	movdqa	$M0, $TMP0
2752	paddd	$ONE, $M0
2753	movdqa	16*0($in_t), $T0a
2754	movdqa	16*1($in_t), $T0b
2755	pcmpeqd	$INDEX, $TMP0
2756	movdqa	16*2($in_t), $T0c
2757	movdqa	16*3($in_t), $T0d
2758	lea	16*4($in_t), $in_t
2759
2760	pand	$TMP0, $T0a
2761	pand	$TMP0, $T0b
2762	por	$T0a, $Ra
2763	pand	$TMP0, $T0c
2764	por	$T0b, $Rb
2765	pand	$TMP0, $T0d
2766	por	$T0c, $Rc
2767	prefetcht0	255($in_t)
2768	por	$T0d, $Rd
2769
2770	dec	%rax
2771	jnz	.Lselect_loop_sse_w7
2772
2773	movdqu	$Ra, 16*0($val)
2774	movdqu	$Rb, 16*1($val)
2775	movdqu	$Rc, 16*2($val)
2776	movdqu	$Rd, 16*3($val)
2777___
2778$code.=<<___	if ($win64);
2779	movaps	(%rsp), %xmm6
2780	movaps	0x10(%rsp), %xmm7
2781	movaps	0x20(%rsp), %xmm8
2782	movaps	0x30(%rsp), %xmm9
2783	movaps	0x40(%rsp), %xmm10
2784	movaps	0x50(%rsp), %xmm11
2785	movaps	0x60(%rsp), %xmm12
2786	movaps	0x70(%rsp), %xmm13
2787	movaps	0x80(%rsp), %xmm14
2788	movaps	0x90(%rsp), %xmm15
2789	lea	0xa8(%rsp), %rsp
2790___
2791$code.=<<___;
2792	ret
2793.cfi_endproc
2794.LSEH_end_ecp_nistz256_gather_w7:
2795.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2796___
2797}
2798if ($avx>1) {
2799my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2800my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2801my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2802my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2803
2804$code.=<<___;
2805################################################################################
2806# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2807.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2808.align	32
2809ecp_nistz256_avx2_gather_w5:
2810.cfi_startproc
2811.Lavx2_gather_w5:
2812	vzeroupper
2813___
2814$code.=<<___	if ($win64);
2815	lea	-0x88(%rsp), %rax
2816	mov	%rsp,%r11
2817.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2818	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2819	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2820	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2821	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 8(%rax)
2822	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2823	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2824	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2825	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2826	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2827	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2828	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2829___
2830$code.=<<___;
2831	vmovdqa	.LTwo(%rip), $TWO
2832
2833	vpxor	$Ra, $Ra, $Ra
2834	vpxor	$Rb, $Rb, $Rb
2835	vpxor	$Rc, $Rc, $Rc
2836
2837	vmovdqa .LOne(%rip), $M0
2838	vmovdqa .LTwo(%rip), $M1
2839
2840	vmovd	$index, %xmm1
2841	vpermd	$INDEX, $Ra, $INDEX
2842
2843	mov	\$8, %rax
2844.Lselect_loop_avx2_w5:
2845
2846	vmovdqa	32*0($in_t), $T0a
2847	vmovdqa	32*1($in_t), $T0b
2848	vmovdqa	32*2($in_t), $T0c
2849
2850	vmovdqa	32*3($in_t), $T1a
2851	vmovdqa	32*4($in_t), $T1b
2852	vmovdqa	32*5($in_t), $T1c
2853
2854	vpcmpeqd	$INDEX, $M0, $TMP0
2855	vpcmpeqd	$INDEX, $M1, $TMP1
2856
2857	vpaddd	$TWO, $M0, $M0
2858	vpaddd	$TWO, $M1, $M1
2859	lea	32*6($in_t), $in_t
2860
2861	vpand	$TMP0, $T0a, $T0a
2862	vpand	$TMP0, $T0b, $T0b
2863	vpand	$TMP0, $T0c, $T0c
2864	vpand	$TMP1, $T1a, $T1a
2865	vpand	$TMP1, $T1b, $T1b
2866	vpand	$TMP1, $T1c, $T1c
2867
2868	vpxor	$T0a, $Ra, $Ra
2869	vpxor	$T0b, $Rb, $Rb
2870	vpxor	$T0c, $Rc, $Rc
2871	vpxor	$T1a, $Ra, $Ra
2872	vpxor	$T1b, $Rb, $Rb
2873	vpxor	$T1c, $Rc, $Rc
2874
2875	dec %rax
2876	jnz .Lselect_loop_avx2_w5
2877
2878	vmovdqu $Ra, 32*0($val)
2879	vmovdqu $Rb, 32*1($val)
2880	vmovdqu $Rc, 32*2($val)
2881	vzeroupper
2882___
2883$code.=<<___	if ($win64);
2884	movaps	(%rsp), %xmm6
2885	movaps	0x10(%rsp), %xmm7
2886	movaps	0x20(%rsp), %xmm8
2887	movaps	0x30(%rsp), %xmm9
2888	movaps	0x40(%rsp), %xmm10
2889	movaps	0x50(%rsp), %xmm11
2890	movaps	0x60(%rsp), %xmm12
2891	movaps	0x70(%rsp), %xmm13
2892	movaps	0x80(%rsp), %xmm14
2893	movaps	0x90(%rsp), %xmm15
2894	lea	(%r11), %rsp
2895___
2896$code.=<<___;
2897	ret
2898.cfi_endproc
2899.LSEH_end_ecp_nistz256_avx2_gather_w5:
2900.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2901___
2902}
2903if ($avx>1) {
2904my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2905my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2906my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2907my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2908my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2909
2910$code.=<<___;
2911
2912################################################################################
2913# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2914.globl	ecp_nistz256_avx2_gather_w7
2915.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2916.align	32
2917ecp_nistz256_avx2_gather_w7:
2918.cfi_startproc
2919.Lavx2_gather_w7:
2920	vzeroupper
2921___
2922$code.=<<___	if ($win64);
2923	mov	%rsp,%r11
2924	lea	-0x88(%rsp), %rax
2925.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2926	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2927	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2928	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2929	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 8(%rax)
2930	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2931	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2932	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2933	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2934	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2935	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2936	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2937___
2938$code.=<<___;
2939	vmovdqa	.LThree(%rip), $THREE
2940
2941	vpxor	$Ra, $Ra, $Ra
2942	vpxor	$Rb, $Rb, $Rb
2943
2944	vmovdqa .LOne(%rip), $M0
2945	vmovdqa .LTwo(%rip), $M1
2946	vmovdqa .LThree(%rip), $M2
2947
2948	vmovd	$index, %xmm1
2949	vpermd	$INDEX, $Ra, $INDEX
2950	# Skip index = 0, because it is implicitly the point at infinity
2951
2952	mov	\$21, %rax
2953.Lselect_loop_avx2_w7:
2954
2955	vmovdqa	32*0($in_t), $T0a
2956	vmovdqa	32*1($in_t), $T0b
2957
2958	vmovdqa	32*2($in_t), $T1a
2959	vmovdqa	32*3($in_t), $T1b
2960
2961	vmovdqa	32*4($in_t), $T2a
2962	vmovdqa	32*5($in_t), $T2b
2963
2964	vpcmpeqd	$INDEX, $M0, $TMP0
2965	vpcmpeqd	$INDEX, $M1, $TMP1
2966	vpcmpeqd	$INDEX, $M2, $TMP2
2967
2968	vpaddd	$THREE, $M0, $M0
2969	vpaddd	$THREE, $M1, $M1
2970	vpaddd	$THREE, $M2, $M2
2971	lea	32*6($in_t), $in_t
2972
2973	vpand	$TMP0, $T0a, $T0a
2974	vpand	$TMP0, $T0b, $T0b
2975	vpand	$TMP1, $T1a, $T1a
2976	vpand	$TMP1, $T1b, $T1b
2977	vpand	$TMP2, $T2a, $T2a
2978	vpand	$TMP2, $T2b, $T2b
2979
2980	vpxor	$T0a, $Ra, $Ra
2981	vpxor	$T0b, $Rb, $Rb
2982	vpxor	$T1a, $Ra, $Ra
2983	vpxor	$T1b, $Rb, $Rb
2984	vpxor	$T2a, $Ra, $Ra
2985	vpxor	$T2b, $Rb, $Rb
2986
2987	dec %rax
2988	jnz .Lselect_loop_avx2_w7
2989
2990
2991	vmovdqa	32*0($in_t), $T0a
2992	vmovdqa	32*1($in_t), $T0b
2993
2994	vpcmpeqd	$INDEX, $M0, $TMP0
2995
2996	vpand	$TMP0, $T0a, $T0a
2997	vpand	$TMP0, $T0b, $T0b
2998
2999	vpxor	$T0a, $Ra, $Ra
3000	vpxor	$T0b, $Rb, $Rb
3001
3002	vmovdqu $Ra, 32*0($val)
3003	vmovdqu $Rb, 32*1($val)
3004	vzeroupper
3005___
3006$code.=<<___	if ($win64);
3007	movaps	(%rsp), %xmm6
3008	movaps	0x10(%rsp), %xmm7
3009	movaps	0x20(%rsp), %xmm8
3010	movaps	0x30(%rsp), %xmm9
3011	movaps	0x40(%rsp), %xmm10
3012	movaps	0x50(%rsp), %xmm11
3013	movaps	0x60(%rsp), %xmm12
3014	movaps	0x70(%rsp), %xmm13
3015	movaps	0x80(%rsp), %xmm14
3016	movaps	0x90(%rsp), %xmm15
3017	lea	(%r11), %rsp
3018___
3019$code.=<<___;
3020	ret
3021.cfi_endproc
3022.LSEH_end_ecp_nistz256_avx2_gather_w7:
3023.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3024___
3025} else {
3026$code.=<<___;
3027.globl	ecp_nistz256_avx2_gather_w7
3028.type	ecp_nistz256_avx2_gather_w7,\@function,3
3029.align	32
3030ecp_nistz256_avx2_gather_w7:
3031.cfi_startproc
3032	.byte	0x0f,0x0b	# ud2
3033	ret
3034.cfi_endproc
3035.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3036___
3037}
3038{{{
3039########################################################################
3040# This block implements higher level point_double, point_add and
3041# point_add_affine. The key to performance in this case is to allow
3042# out-of-order execution logic to overlap computations from next step
3043# with tail processing from current step. By using tailored calling
3044# sequence we minimize inter-step overhead to give processor better
3045# shot at overlapping operations...
3046#
3047# You will notice that input data is copied to stack. Trouble is that
3048# there are no registers to spare for holding original pointers and
3049# reloading them, pointers, would create undesired dependencies on
3050# effective addresses calculation paths. In other words it's too done
3051# to favour out-of-order execution logic.
3052#						<appro@openssl.org>
3053
3054my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3055my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3056my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3057my ($poly1,$poly3)=($acc6,$acc7);
3058
3059sub load_for_mul () {
3060my ($a,$b,$src0) = @_;
3061my $bias = $src0 eq "%rax" ? 0 : -128;
3062
3063"	mov	$b, $src0
3064	lea	$b, $b_ptr
3065	mov	8*0+$a, $acc1
3066	mov	8*1+$a, $acc2
3067	lea	$bias+$a, $a_ptr
3068	mov	8*2+$a, $acc3
3069	mov	8*3+$a, $acc4"
3070}
3071
3072sub load_for_sqr () {
3073my ($a,$src0) = @_;
3074my $bias = $src0 eq "%rax" ? 0 : -128;
3075
3076"	mov	8*0+$a, $src0
3077	mov	8*1+$a, $acc6
3078	lea	$bias+$a, $a_ptr
3079	mov	8*2+$a, $acc7
3080	mov	8*3+$a, $acc0"
3081}
3082
3083									{
3084########################################################################
3085# operate in 4-5-0-1 "name space" that matches multiplication output
3086#
3087my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3088
3089$code.=<<___;
3090.type	__ecp_nistz256_add_toq,\@abi-omnipotent
3091.align	32
3092__ecp_nistz256_add_toq:
3093.cfi_startproc
3094	xor	$t4,$t4
3095	add	8*0($b_ptr), $a0
3096	adc	8*1($b_ptr), $a1
3097	 mov	$a0, $t0
3098	adc	8*2($b_ptr), $a2
3099	adc	8*3($b_ptr), $a3
3100	 mov	$a1, $t1
3101	adc	\$0, $t4
3102
3103	sub	\$-1, $a0
3104	 mov	$a2, $t2
3105	sbb	$poly1, $a1
3106	sbb	\$0, $a2
3107	 mov	$a3, $t3
3108	sbb	$poly3, $a3
3109	sbb	\$0, $t4
3110
3111	cmovc	$t0, $a0
3112	cmovc	$t1, $a1
3113	mov	$a0, 8*0($r_ptr)
3114	cmovc	$t2, $a2
3115	mov	$a1, 8*1($r_ptr)
3116	cmovc	$t3, $a3
3117	mov	$a2, 8*2($r_ptr)
3118	mov	$a3, 8*3($r_ptr)
3119
3120	ret
3121.cfi_endproc
3122.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3123
3124.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
3125.align	32
3126__ecp_nistz256_sub_fromq:
3127.cfi_startproc
3128	sub	8*0($b_ptr), $a0
3129	sbb	8*1($b_ptr), $a1
3130	 mov	$a0, $t0
3131	sbb	8*2($b_ptr), $a2
3132	sbb	8*3($b_ptr), $a3
3133	 mov	$a1, $t1
3134	sbb	$t4, $t4
3135
3136	add	\$-1, $a0
3137	 mov	$a2, $t2
3138	adc	$poly1, $a1
3139	adc	\$0, $a2
3140	 mov	$a3, $t3
3141	adc	$poly3, $a3
3142	test	$t4, $t4
3143
3144	cmovz	$t0, $a0
3145	cmovz	$t1, $a1
3146	mov	$a0, 8*0($r_ptr)
3147	cmovz	$t2, $a2
3148	mov	$a1, 8*1($r_ptr)
3149	cmovz	$t3, $a3
3150	mov	$a2, 8*2($r_ptr)
3151	mov	$a3, 8*3($r_ptr)
3152
3153	ret
3154.cfi_endproc
3155.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3156
3157.type	__ecp_nistz256_subq,\@abi-omnipotent
3158.align	32
3159__ecp_nistz256_subq:
3160.cfi_startproc
3161	sub	$a0, $t0
3162	sbb	$a1, $t1
3163	 mov	$t0, $a0
3164	sbb	$a2, $t2
3165	sbb	$a3, $t3
3166	 mov	$t1, $a1
3167	sbb	$t4, $t4
3168
3169	add	\$-1, $t0
3170	 mov	$t2, $a2
3171	adc	$poly1, $t1
3172	adc	\$0, $t2
3173	 mov	$t3, $a3
3174	adc	$poly3, $t3
3175	test	$t4, $t4
3176
3177	cmovnz	$t0, $a0
3178	cmovnz	$t1, $a1
3179	cmovnz	$t2, $a2
3180	cmovnz	$t3, $a3
3181
3182	ret
3183.cfi_endproc
3184.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
3185
3186.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
3187.align	32
3188__ecp_nistz256_mul_by_2q:
3189.cfi_startproc
3190	xor	$t4, $t4
3191	add	$a0, $a0		# a0:a3+a0:a3
3192	adc	$a1, $a1
3193	 mov	$a0, $t0
3194	adc	$a2, $a2
3195	adc	$a3, $a3
3196	 mov	$a1, $t1
3197	adc	\$0, $t4
3198
3199	sub	\$-1, $a0
3200	 mov	$a2, $t2
3201	sbb	$poly1, $a1
3202	sbb	\$0, $a2
3203	 mov	$a3, $t3
3204	sbb	$poly3, $a3
3205	sbb	\$0, $t4
3206
3207	cmovc	$t0, $a0
3208	cmovc	$t1, $a1
3209	mov	$a0, 8*0($r_ptr)
3210	cmovc	$t2, $a2
3211	mov	$a1, 8*1($r_ptr)
3212	cmovc	$t3, $a3
3213	mov	$a2, 8*2($r_ptr)
3214	mov	$a3, 8*3($r_ptr)
3215
3216	ret
3217.cfi_endproc
3218.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3219___
3220									}
3221sub gen_double () {
3222    my $x = shift;
3223    my ($src0,$sfx,$bias);
3224    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
3225
3226    if ($x ne "x") {
3227	$src0 = "%rax";
3228	$sfx  = "";
3229	$bias = 0;
3230
3231$code.=<<___;
3232.globl	ecp_nistz256_point_double
3233.type	ecp_nistz256_point_double,\@function,2
3234.align	32
3235ecp_nistz256_point_double:
3236.cfi_startproc
3237___
3238$code.=<<___	if ($addx);
3239	mov	\$0x80100, %ecx
3240	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3241	cmp	\$0x80100, %ecx
3242	je	.Lpoint_doublex
3243___
3244    } else {
3245	$src0 = "%rdx";
3246	$sfx  = "x";
3247	$bias = 128;
3248
3249$code.=<<___;
3250.type	ecp_nistz256_point_doublex,\@function,2
3251.align	32
3252ecp_nistz256_point_doublex:
3253.cfi_startproc
3254.Lpoint_doublex:
3255___
3256    }
3257$code.=<<___;
3258	push	%rbp
3259.cfi_push	%rbp
3260	push	%rbx
3261.cfi_push	%rbx
3262	push	%r12
3263.cfi_push	%r12
3264	push	%r13
3265.cfi_push	%r13
3266	push	%r14
3267.cfi_push	%r14
3268	push	%r15
3269.cfi_push	%r15
3270	sub	\$32*5+8, %rsp
3271.cfi_adjust_cfa_offset	32*5+8
3272.Lpoint_double${x}_body:
3273
3274.Lpoint_double_shortcut$x:
3275	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
3276	mov	$a_ptr, $b_ptr			# backup copy
3277	movdqu	0x10($a_ptr), %xmm1
3278	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
3279	 mov	0x20+8*1($a_ptr), $acc5
3280	 mov	0x20+8*2($a_ptr), $acc0
3281	 mov	0x20+8*3($a_ptr), $acc1
3282	 mov	.Lpoly+8*1(%rip), $poly1
3283	 mov	.Lpoly+8*3(%rip), $poly3
3284	movdqa	%xmm0, $in_x(%rsp)
3285	movdqa	%xmm1, $in_x+0x10(%rsp)
3286	lea	0x20($r_ptr), $acc2
3287	lea	0x40($r_ptr), $acc3
3288	movq	$r_ptr, %xmm0
3289	movq	$acc2, %xmm1
3290	movq	$acc3, %xmm2
3291
3292	lea	$S(%rsp), $r_ptr
3293	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
3294
3295	mov	0x40+8*0($a_ptr), $src0
3296	mov	0x40+8*1($a_ptr), $acc6
3297	mov	0x40+8*2($a_ptr), $acc7
3298	mov	0x40+8*3($a_ptr), $acc0
3299	lea	0x40-$bias($a_ptr), $a_ptr
3300	lea	$Zsqr(%rsp), $r_ptr
3301	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
3302
3303	`&load_for_sqr("$S(%rsp)", "$src0")`
3304	lea	$S(%rsp), $r_ptr
3305	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
3306
3307	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
3308	mov	0x40+8*0($b_ptr), $acc1
3309	mov	0x40+8*1($b_ptr), $acc2
3310	mov	0x40+8*2($b_ptr), $acc3
3311	mov	0x40+8*3($b_ptr), $acc4
3312	lea	0x40-$bias($b_ptr), $a_ptr
3313	lea	0x20($b_ptr), $b_ptr
3314	movq	%xmm2, $r_ptr
3315	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
3316	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
3317
3318	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3319	mov	$in_x+8*1(%rsp), $acc5
3320	lea	$Zsqr(%rsp), $b_ptr
3321	mov	$in_x+8*2(%rsp), $acc0
3322	mov	$in_x+8*3(%rsp), $acc1
3323	lea	$M(%rsp), $r_ptr
3324	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
3325
3326	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3327	mov	$in_x+8*1(%rsp), $acc5
3328	lea	$Zsqr(%rsp), $b_ptr
3329	mov	$in_x+8*2(%rsp), $acc0
3330	mov	$in_x+8*3(%rsp), $acc1
3331	lea	$Zsqr(%rsp), $r_ptr
3332	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
3333
3334	`&load_for_sqr("$S(%rsp)", "$src0")`
3335	movq	%xmm1, $r_ptr
3336	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
3337___
3338{
3339######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3340# operate in 4-5-6-7 "name space" that matches squaring output
3341#
3342my ($poly1,$poly3)=($a_ptr,$t1);
3343my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
3344
3345$code.=<<___;
3346	xor	$t4, $t4
3347	mov	$a0, $t0
3348	add	\$-1, $a0
3349	mov	$a1, $t1
3350	adc	$poly1, $a1
3351	mov	$a2, $t2
3352	adc	\$0, $a2
3353	mov	$a3, $t3
3354	adc	$poly3, $a3
3355	adc	\$0, $t4
3356	xor	$a_ptr, $a_ptr		# borrow $a_ptr
3357	test	\$1, $t0
3358
3359	cmovz	$t0, $a0
3360	cmovz	$t1, $a1
3361	cmovz	$t2, $a2
3362	cmovz	$t3, $a3
3363	cmovz	$a_ptr, $t4
3364
3365	mov	$a1, $t0		# a0:a3>>1
3366	shr	\$1, $a0
3367	shl	\$63, $t0
3368	mov	$a2, $t1
3369	shr	\$1, $a1
3370	or	$t0, $a0
3371	shl	\$63, $t1
3372	mov	$a3, $t2
3373	shr	\$1, $a2
3374	or	$t1, $a1
3375	shl	\$63, $t2
3376	mov	$a0, 8*0($r_ptr)
3377	shr	\$1, $a3
3378	mov	$a1, 8*1($r_ptr)
3379	shl	\$63, $t4
3380	or	$t2, $a2
3381	or	$t4, $a3
3382	mov	$a2, 8*2($r_ptr)
3383	mov	$a3, 8*3($r_ptr)
3384___
3385}
3386$code.=<<___;
3387	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3388	lea	$M(%rsp), $r_ptr
3389	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
3390
3391	lea	$tmp0(%rsp), $r_ptr
3392	call	__ecp_nistz256_mul_by_2$x
3393
3394	lea	$M(%rsp), $b_ptr
3395	lea	$M(%rsp), $r_ptr
3396	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
3397
3398	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3399	lea	$S(%rsp), $r_ptr
3400	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
3401
3402	lea	$tmp0(%rsp), $r_ptr
3403	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
3404
3405	`&load_for_sqr("$M(%rsp)", "$src0")`
3406	movq	%xmm0, $r_ptr
3407	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
3408
3409	lea	$tmp0(%rsp), $b_ptr
3410	mov	$acc6, $acc0			# harmonize sqr output and sub input
3411	mov	$acc7, $acc1
3412	mov	$a_ptr, $poly1
3413	mov	$t1, $poly3
3414	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
3415
3416	mov	$S+8*0(%rsp), $t0
3417	mov	$S+8*1(%rsp), $t1
3418	mov	$S+8*2(%rsp), $t2
3419	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
3420	lea	$S(%rsp), $r_ptr
3421	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
3422
3423	mov	$M(%rsp), $src0
3424	lea	$M(%rsp), $b_ptr
3425	mov	$acc4, $acc6			# harmonize sub output and mul input
3426	xor	%ecx, %ecx
3427	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
3428	mov	$acc5, $acc2
3429	mov	$acc5, $S+8*1(%rsp)
3430	cmovz	$acc0, $acc3
3431	mov	$acc0, $S+8*2(%rsp)
3432	lea	$S-$bias(%rsp), $a_ptr
3433	cmovz	$acc1, $acc4
3434	mov	$acc1, $S+8*3(%rsp)
3435	mov	$acc6, $acc1
3436	lea	$S(%rsp), $r_ptr
3437	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
3438
3439	movq	%xmm1, $b_ptr
3440	movq	%xmm1, $r_ptr
3441	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
3442
3443	lea	32*5+56(%rsp), %rsi
3444.cfi_def_cfa	%rsi,8
3445	mov	-48(%rsi),%r15
3446.cfi_restore	%r15
3447	mov	-40(%rsi),%r14
3448.cfi_restore	%r14
3449	mov	-32(%rsi),%r13
3450.cfi_restore	%r13
3451	mov	-24(%rsi),%r12
3452.cfi_restore	%r12
3453	mov	-16(%rsi),%rbx
3454.cfi_restore	%rbx
3455	mov	-8(%rsi),%rbp
3456.cfi_restore	%rbp
3457	lea	(%rsi),%rsp
3458.cfi_def_cfa_register	%rsp
3459.Lpoint_double${x}_epilogue:
3460	ret
3461.cfi_endproc
3462.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3463___
3464}
3465&gen_double("q");
3466
3467sub gen_add () {
3468    my $x = shift;
3469    my ($src0,$sfx,$bias);
3470    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3471	$U1,$U2,$S1,$S2,
3472	$res_x,$res_y,$res_z,
3473	$in1_x,$in1_y,$in1_z,
3474	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3475    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
3476
3477    if ($x ne "x") {
3478	$src0 = "%rax";
3479	$sfx  = "";
3480	$bias = 0;
3481
3482$code.=<<___;
3483.globl	ecp_nistz256_point_add
3484.type	ecp_nistz256_point_add,\@function,3
3485.align	32
3486ecp_nistz256_point_add:
3487.cfi_startproc
3488___
3489$code.=<<___	if ($addx);
3490	mov	\$0x80100, %ecx
3491	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3492	cmp	\$0x80100, %ecx
3493	je	.Lpoint_addx
3494___
3495    } else {
3496	$src0 = "%rdx";
3497	$sfx  = "x";
3498	$bias = 128;
3499
3500$code.=<<___;
3501.type	ecp_nistz256_point_addx,\@function,3
3502.align	32
3503ecp_nistz256_point_addx:
3504.cfi_startproc
3505.Lpoint_addx:
3506___
3507    }
3508$code.=<<___;
3509	push	%rbp
3510.cfi_push	%rbp
3511	push	%rbx
3512.cfi_push	%rbx
3513	push	%r12
3514.cfi_push	%r12
3515	push	%r13
3516.cfi_push	%r13
3517	push	%r14
3518.cfi_push	%r14
3519	push	%r15
3520.cfi_push	%r15
3521	sub	\$32*18+8, %rsp
3522.cfi_adjust_cfa_offset	32*18+8
3523.Lpoint_add${x}_body:
3524
3525	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3526	movdqu	0x10($a_ptr), %xmm1
3527	movdqu	0x20($a_ptr), %xmm2
3528	movdqu	0x30($a_ptr), %xmm3
3529	movdqu	0x40($a_ptr), %xmm4
3530	movdqu	0x50($a_ptr), %xmm5
3531	mov	$a_ptr, $b_ptr			# reassign
3532	mov	$b_org, $a_ptr			# reassign
3533	movdqa	%xmm0, $in1_x(%rsp)
3534	movdqa	%xmm1, $in1_x+0x10(%rsp)
3535	movdqa	%xmm2, $in1_y(%rsp)
3536	movdqa	%xmm3, $in1_y+0x10(%rsp)
3537	movdqa	%xmm4, $in1_z(%rsp)
3538	movdqa	%xmm5, $in1_z+0x10(%rsp)
3539	por	%xmm4, %xmm5
3540
3541	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3542	 pshufd	\$0xb1, %xmm5, %xmm3
3543	movdqu	0x10($a_ptr), %xmm1
3544	movdqu	0x20($a_ptr), %xmm2
3545	 por	%xmm3, %xmm5
3546	movdqu	0x30($a_ptr), %xmm3
3547	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3548	 mov	0x40+8*1($a_ptr), $acc6
3549	 mov	0x40+8*2($a_ptr), $acc7
3550	 mov	0x40+8*3($a_ptr), $acc0
3551	movdqa	%xmm0, $in2_x(%rsp)
3552	 pshufd	\$0x1e, %xmm5, %xmm4
3553	movdqa	%xmm1, $in2_x+0x10(%rsp)
3554	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3555	movdqu	0x50($a_ptr),%xmm1
3556	movdqa	%xmm2, $in2_y(%rsp)
3557	movdqa	%xmm3, $in2_y+0x10(%rsp)
3558	 por	%xmm4, %xmm5
3559	 pxor	%xmm4, %xmm4
3560	por	%xmm0, %xmm1
3561	 movq	$r_ptr, %xmm0			# save $r_ptr
3562
3563	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3564	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3565	 mov	$acc6, $in2_z+8*1(%rsp)
3566	 mov	$acc7, $in2_z+8*2(%rsp)
3567	 mov	$acc0, $in2_z+8*3(%rsp)
3568	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3569	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3570
3571	pcmpeqd	%xmm4, %xmm5
3572	pshufd	\$0xb1, %xmm1, %xmm4
3573	por	%xmm1, %xmm4
3574	pshufd	\$0, %xmm5, %xmm5		# in1infty
3575	pshufd	\$0x1e, %xmm4, %xmm3
3576	por	%xmm3, %xmm4
3577	pxor	%xmm3, %xmm3
3578	pcmpeqd	%xmm3, %xmm4
3579	pshufd	\$0, %xmm4, %xmm4		# in2infty
3580	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3581	 mov	0x40+8*1($b_ptr), $acc6
3582	 mov	0x40+8*2($b_ptr), $acc7
3583	 mov	0x40+8*3($b_ptr), $acc0
3584	movq	$b_ptr, %xmm1
3585
3586	lea	0x40-$bias($b_ptr), $a_ptr
3587	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3588	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3589
3590	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3591	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3592	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3593
3594	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3595	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3596	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3597
3598	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3599	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3600	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3601
3602	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3603	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3604	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3605
3606	lea	$S1(%rsp), $b_ptr
3607	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3608	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3609
3610	or	$acc5, $acc4			# see if result is zero
3611	movdqa	%xmm4, %xmm2
3612	or	$acc0, $acc4
3613	or	$acc1, $acc4
3614	por	%xmm5, %xmm2			# in1infty || in2infty
3615	movq	$acc4, %xmm3
3616
3617	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3618	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3619	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3620
3621	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3622	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3623	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3624
3625	lea	$U1(%rsp), $b_ptr
3626	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3627	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3628
3629	or	$acc5, $acc4			# see if result is zero
3630	or	$acc0, $acc4
3631	or	$acc1, $acc4			# !is_equal(U1, U2)
3632
3633	movq	%xmm2, $acc0			# in1infty | in2infty
3634	movq	%xmm3, $acc1			# !is_equal(S1, S2)
3635
3636	or	$acc0, $acc4
3637	or	$acc1, $acc4
3638
3639	# if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2))
3640	.byte	0x3e				# predict taken
3641	jnz	.Ladd_proceed$x
3642
3643.Ladd_double$x:
3644	movq	%xmm1, $a_ptr			# restore $a_ptr
3645	movq	%xmm0, $r_ptr			# restore $r_ptr
3646	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3647.cfi_adjust_cfa_offset	`-32*(18-5)`
3648	jmp	.Lpoint_double_shortcut$x
3649.cfi_adjust_cfa_offset	`32*(18-5)`
3650
3651.align	32
3652.Ladd_proceed$x:
3653	`&load_for_sqr("$R(%rsp)", "$src0")`
3654	lea	$Rsqr(%rsp), $r_ptr		# R^2
3655	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3656
3657	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3658	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3659	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3660
3661	`&load_for_sqr("$H(%rsp)", "$src0")`
3662	lea	$Hsqr(%rsp), $r_ptr		# H^2
3663	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3664
3665	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3666	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3667	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3668
3669	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3670	lea	$Hcub(%rsp), $r_ptr		# H^3
3671	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3672
3673	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3674	lea	$U2(%rsp), $r_ptr		# U1*H^2
3675	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3676___
3677{
3678#######################################################################
3679# operate in 4-5-0-1 "name space" that matches multiplication output
3680#
3681my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3682my ($poly1, $poly3)=($acc6,$acc7);
3683
3684$code.=<<___;
3685	#lea	$U2(%rsp), $a_ptr
3686	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3687	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3688
3689	xor	$t4, $t4
3690	add	$acc0, $acc0		# a0:a3+a0:a3
3691	lea	$Rsqr(%rsp), $a_ptr
3692	adc	$acc1, $acc1
3693	 mov	$acc0, $t0
3694	adc	$acc2, $acc2
3695	adc	$acc3, $acc3
3696	 mov	$acc1, $t1
3697	adc	\$0, $t4
3698
3699	sub	\$-1, $acc0
3700	 mov	$acc2, $t2
3701	sbb	$poly1, $acc1
3702	sbb	\$0, $acc2
3703	 mov	$acc3, $t3
3704	sbb	$poly3, $acc3
3705	sbb	\$0, $t4
3706
3707	cmovc	$t0, $acc0
3708	mov	8*0($a_ptr), $t0
3709	cmovc	$t1, $acc1
3710	mov	8*1($a_ptr), $t1
3711	cmovc	$t2, $acc2
3712	mov	8*2($a_ptr), $t2
3713	cmovc	$t3, $acc3
3714	mov	8*3($a_ptr), $t3
3715
3716	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3717
3718	lea	$Hcub(%rsp), $b_ptr
3719	lea	$res_x(%rsp), $r_ptr
3720	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3721
3722	mov	$U2+8*0(%rsp), $t0
3723	mov	$U2+8*1(%rsp), $t1
3724	mov	$U2+8*2(%rsp), $t2
3725	mov	$U2+8*3(%rsp), $t3
3726	lea	$res_y(%rsp), $r_ptr
3727
3728	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3729
3730	mov	$acc0, 8*0($r_ptr)		# save the result, as
3731	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
3732	mov	$acc2, 8*2($r_ptr)
3733	mov	$acc3, 8*3($r_ptr)
3734___
3735}
3736$code.=<<___;
3737	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3738	lea	$S2(%rsp), $r_ptr
3739	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3740
3741	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3742	lea	$res_y(%rsp), $r_ptr
3743	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3744
3745	lea	$S2(%rsp), $b_ptr
3746	lea	$res_y(%rsp), $r_ptr
3747	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3748
3749	movq	%xmm0, $r_ptr		# restore $r_ptr
3750
3751	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3752	movdqa	%xmm5, %xmm1
3753	pandn	$res_z(%rsp), %xmm0
3754	movdqa	%xmm5, %xmm2
3755	pandn	$res_z+0x10(%rsp), %xmm1
3756	movdqa	%xmm5, %xmm3
3757	pand	$in2_z(%rsp), %xmm2
3758	pand	$in2_z+0x10(%rsp), %xmm3
3759	por	%xmm0, %xmm2
3760	por	%xmm1, %xmm3
3761
3762	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3763	movdqa	%xmm4, %xmm1
3764	pandn	%xmm2, %xmm0
3765	movdqa	%xmm4, %xmm2
3766	pandn	%xmm3, %xmm1
3767	movdqa	%xmm4, %xmm3
3768	pand	$in1_z(%rsp), %xmm2
3769	pand	$in1_z+0x10(%rsp), %xmm3
3770	por	%xmm0, %xmm2
3771	por	%xmm1, %xmm3
3772	movdqu	%xmm2, 0x40($r_ptr)
3773	movdqu	%xmm3, 0x50($r_ptr)
3774
3775	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3776	movdqa	%xmm5, %xmm1
3777	pandn	$res_x(%rsp), %xmm0
3778	movdqa	%xmm5, %xmm2
3779	pandn	$res_x+0x10(%rsp), %xmm1
3780	movdqa	%xmm5, %xmm3
3781	pand	$in2_x(%rsp), %xmm2
3782	pand	$in2_x+0x10(%rsp), %xmm3
3783	por	%xmm0, %xmm2
3784	por	%xmm1, %xmm3
3785
3786	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3787	movdqa	%xmm4, %xmm1
3788	pandn	%xmm2, %xmm0
3789	movdqa	%xmm4, %xmm2
3790	pandn	%xmm3, %xmm1
3791	movdqa	%xmm4, %xmm3
3792	pand	$in1_x(%rsp), %xmm2
3793	pand	$in1_x+0x10(%rsp), %xmm3
3794	por	%xmm0, %xmm2
3795	por	%xmm1, %xmm3
3796	movdqu	%xmm2, 0x00($r_ptr)
3797	movdqu	%xmm3, 0x10($r_ptr)
3798
3799	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3800	movdqa	%xmm5, %xmm1
3801	pandn	$res_y(%rsp), %xmm0
3802	movdqa	%xmm5, %xmm2
3803	pandn	$res_y+0x10(%rsp), %xmm1
3804	movdqa	%xmm5, %xmm3
3805	pand	$in2_y(%rsp), %xmm2
3806	pand	$in2_y+0x10(%rsp), %xmm3
3807	por	%xmm0, %xmm2
3808	por	%xmm1, %xmm3
3809
3810	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3811	movdqa	%xmm4, %xmm1
3812	pandn	%xmm2, %xmm0
3813	movdqa	%xmm4, %xmm2
3814	pandn	%xmm3, %xmm1
3815	movdqa	%xmm4, %xmm3
3816	pand	$in1_y(%rsp), %xmm2
3817	pand	$in1_y+0x10(%rsp), %xmm3
3818	por	%xmm0, %xmm2
3819	por	%xmm1, %xmm3
3820	movdqu	%xmm2, 0x20($r_ptr)
3821	movdqu	%xmm3, 0x30($r_ptr)
3822
3823.Ladd_done$x:
3824	lea	32*18+56(%rsp), %rsi
3825.cfi_def_cfa	%rsi,8
3826	mov	-48(%rsi),%r15
3827.cfi_restore	%r15
3828	mov	-40(%rsi),%r14
3829.cfi_restore	%r14
3830	mov	-32(%rsi),%r13
3831.cfi_restore	%r13
3832	mov	-24(%rsi),%r12
3833.cfi_restore	%r12
3834	mov	-16(%rsi),%rbx
3835.cfi_restore	%rbx
3836	mov	-8(%rsi),%rbp
3837.cfi_restore	%rbp
3838	lea	(%rsi),%rsp
3839.cfi_def_cfa_register	%rsp
3840.Lpoint_add${x}_epilogue:
3841	ret
3842.cfi_endproc
3843.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3844___
3845}
3846&gen_add("q");
3847
3848sub gen_add_affine () {
3849    my $x = shift;
3850    my ($src0,$sfx,$bias);
3851    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3852	$res_x,$res_y,$res_z,
3853	$in1_x,$in1_y,$in1_z,
3854	$in2_x,$in2_y)=map(32*$_,(0..14));
3855    my $Z1sqr = $S2;
3856
3857    if ($x ne "x") {
3858	$src0 = "%rax";
3859	$sfx  = "";
3860	$bias = 0;
3861
3862$code.=<<___;
3863.globl	ecp_nistz256_point_add_affine
3864.type	ecp_nistz256_point_add_affine,\@function,3
3865.align	32
3866ecp_nistz256_point_add_affine:
3867.cfi_startproc
3868___
3869$code.=<<___	if ($addx);
3870	mov	\$0x80100, %ecx
3871	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3872	cmp	\$0x80100, %ecx
3873	je	.Lpoint_add_affinex
3874___
3875    } else {
3876	$src0 = "%rdx";
3877	$sfx  = "x";
3878	$bias = 128;
3879
3880$code.=<<___;
3881.type	ecp_nistz256_point_add_affinex,\@function,3
3882.align	32
3883ecp_nistz256_point_add_affinex:
3884.cfi_startproc
3885.Lpoint_add_affinex:
3886___
3887    }
3888$code.=<<___;
3889	push	%rbp
3890.cfi_push	%rbp
3891	push	%rbx
3892.cfi_push	%rbx
3893	push	%r12
3894.cfi_push	%r12
3895	push	%r13
3896.cfi_push	%r13
3897	push	%r14
3898.cfi_push	%r14
3899	push	%r15
3900.cfi_push	%r15
3901	sub	\$32*15+8, %rsp
3902.cfi_adjust_cfa_offset	32*15+8
3903.Ladd_affine${x}_body:
3904
3905	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3906	mov	$b_org, $b_ptr		# reassign
3907	movdqu	0x10($a_ptr), %xmm1
3908	movdqu	0x20($a_ptr), %xmm2
3909	movdqu	0x30($a_ptr), %xmm3
3910	movdqu	0x40($a_ptr), %xmm4
3911	movdqu	0x50($a_ptr), %xmm5
3912	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3913	 mov	0x40+8*1($a_ptr), $acc6
3914	 mov	0x40+8*2($a_ptr), $acc7
3915	 mov	0x40+8*3($a_ptr), $acc0
3916	movdqa	%xmm0, $in1_x(%rsp)
3917	movdqa	%xmm1, $in1_x+0x10(%rsp)
3918	movdqa	%xmm2, $in1_y(%rsp)
3919	movdqa	%xmm3, $in1_y+0x10(%rsp)
3920	movdqa	%xmm4, $in1_z(%rsp)
3921	movdqa	%xmm5, $in1_z+0x10(%rsp)
3922	por	%xmm4, %xmm5
3923
3924	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3925	 pshufd	\$0xb1, %xmm5, %xmm3
3926	movdqu	0x10($b_ptr), %xmm1
3927	movdqu	0x20($b_ptr), %xmm2
3928	 por	%xmm3, %xmm5
3929	movdqu	0x30($b_ptr), %xmm3
3930	movdqa	%xmm0, $in2_x(%rsp)
3931	 pshufd	\$0x1e, %xmm5, %xmm4
3932	movdqa	%xmm1, $in2_x+0x10(%rsp)
3933	por	%xmm0, %xmm1
3934	 movq	$r_ptr, %xmm0		# save $r_ptr
3935	movdqa	%xmm2, $in2_y(%rsp)
3936	movdqa	%xmm3, $in2_y+0x10(%rsp)
3937	por	%xmm2, %xmm3
3938	 por	%xmm4, %xmm5
3939	 pxor	%xmm4, %xmm4
3940	por	%xmm1, %xmm3
3941
3942	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3943	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3944	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3945
3946	pcmpeqd	%xmm4, %xmm5
3947	pshufd	\$0xb1, %xmm3, %xmm4
3948	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3949	 #lea	0x00($b_ptr), $b_ptr
3950	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3951	por	%xmm3, %xmm4
3952	pshufd	\$0, %xmm5, %xmm5		# in1infty
3953	pshufd	\$0x1e, %xmm4, %xmm3
3954	 mov	$acc5, $acc2
3955	por	%xmm3, %xmm4
3956	pxor	%xmm3, %xmm3
3957	 mov	$acc6, $acc3
3958	pcmpeqd	%xmm3, %xmm4
3959	pshufd	\$0, %xmm4, %xmm4		# in2infty
3960
3961	lea	$Z1sqr-$bias(%rsp), $a_ptr
3962	mov	$acc7, $acc4
3963	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3964	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3965
3966	lea	$in1_x(%rsp), $b_ptr
3967	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3968	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3969
3970	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3971	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3972	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3973
3974	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3975	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3976	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3977
3978	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3979	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3980	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3981
3982	lea	$in1_y(%rsp), $b_ptr
3983	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3984	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3985
3986	`&load_for_sqr("$H(%rsp)", "$src0")`
3987	lea	$Hsqr(%rsp), $r_ptr		# H^2
3988	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3989
3990	`&load_for_sqr("$R(%rsp)", "$src0")`
3991	lea	$Rsqr(%rsp), $r_ptr		# R^2
3992	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3993
3994	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3995	lea	$Hcub(%rsp), $r_ptr		# H^3
3996	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3997
3998	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3999	lea	$U2(%rsp), $r_ptr		# U1*H^2
4000	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
4001___
4002{
4003#######################################################################
4004# operate in 4-5-0-1 "name space" that matches multiplication output
4005#
4006my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4007my ($poly1, $poly3)=($acc6,$acc7);
4008
4009$code.=<<___;
4010	#lea	$U2(%rsp), $a_ptr
4011	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
4012	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
4013
4014	xor	$t4, $t4
4015	add	$acc0, $acc0		# a0:a3+a0:a3
4016	lea	$Rsqr(%rsp), $a_ptr
4017	adc	$acc1, $acc1
4018	 mov	$acc0, $t0
4019	adc	$acc2, $acc2
4020	adc	$acc3, $acc3
4021	 mov	$acc1, $t1
4022	adc	\$0, $t4
4023
4024	sub	\$-1, $acc0
4025	 mov	$acc2, $t2
4026	sbb	$poly1, $acc1
4027	sbb	\$0, $acc2
4028	 mov	$acc3, $t3
4029	sbb	$poly3, $acc3
4030	sbb	\$0, $t4
4031
4032	cmovc	$t0, $acc0
4033	mov	8*0($a_ptr), $t0
4034	cmovc	$t1, $acc1
4035	mov	8*1($a_ptr), $t1
4036	cmovc	$t2, $acc2
4037	mov	8*2($a_ptr), $t2
4038	cmovc	$t3, $acc3
4039	mov	8*3($a_ptr), $t3
4040
4041	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
4042
4043	lea	$Hcub(%rsp), $b_ptr
4044	lea	$res_x(%rsp), $r_ptr
4045	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
4046
4047	mov	$U2+8*0(%rsp), $t0
4048	mov	$U2+8*1(%rsp), $t1
4049	mov	$U2+8*2(%rsp), $t2
4050	mov	$U2+8*3(%rsp), $t3
4051	lea	$H(%rsp), $r_ptr
4052
4053	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
4054
4055	mov	$acc0, 8*0($r_ptr)		# save the result, as
4056	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
4057	mov	$acc2, 8*2($r_ptr)
4058	mov	$acc3, 8*3($r_ptr)
4059___
4060}
4061$code.=<<___;
4062	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4063	lea	$S2(%rsp), $r_ptr
4064	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
4065
4066	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4067	lea	$H(%rsp), $r_ptr
4068	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
4069
4070	lea	$S2(%rsp), $b_ptr
4071	lea	$res_y(%rsp), $r_ptr
4072	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
4073
4074	movq	%xmm0, $r_ptr		# restore $r_ptr
4075
4076	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
4077	movdqa	%xmm5, %xmm1
4078	pandn	$res_z(%rsp), %xmm0
4079	movdqa	%xmm5, %xmm2
4080	pandn	$res_z+0x10(%rsp), %xmm1
4081	movdqa	%xmm5, %xmm3
4082	pand	.LONE_mont(%rip), %xmm2
4083	pand	.LONE_mont+0x10(%rip), %xmm3
4084	por	%xmm0, %xmm2
4085	por	%xmm1, %xmm3
4086
4087	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
4088	movdqa	%xmm4, %xmm1
4089	pandn	%xmm2, %xmm0
4090	movdqa	%xmm4, %xmm2
4091	pandn	%xmm3, %xmm1
4092	movdqa	%xmm4, %xmm3
4093	pand	$in1_z(%rsp), %xmm2
4094	pand	$in1_z+0x10(%rsp), %xmm3
4095	por	%xmm0, %xmm2
4096	por	%xmm1, %xmm3
4097	movdqu	%xmm2, 0x40($r_ptr)
4098	movdqu	%xmm3, 0x50($r_ptr)
4099
4100	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
4101	movdqa	%xmm5, %xmm1
4102	pandn	$res_x(%rsp), %xmm0
4103	movdqa	%xmm5, %xmm2
4104	pandn	$res_x+0x10(%rsp), %xmm1
4105	movdqa	%xmm5, %xmm3
4106	pand	$in2_x(%rsp), %xmm2
4107	pand	$in2_x+0x10(%rsp), %xmm3
4108	por	%xmm0, %xmm2
4109	por	%xmm1, %xmm3
4110
4111	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
4112	movdqa	%xmm4, %xmm1
4113	pandn	%xmm2, %xmm0
4114	movdqa	%xmm4, %xmm2
4115	pandn	%xmm3, %xmm1
4116	movdqa	%xmm4, %xmm3
4117	pand	$in1_x(%rsp), %xmm2
4118	pand	$in1_x+0x10(%rsp), %xmm3
4119	por	%xmm0, %xmm2
4120	por	%xmm1, %xmm3
4121	movdqu	%xmm2, 0x00($r_ptr)
4122	movdqu	%xmm3, 0x10($r_ptr)
4123
4124	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
4125	movdqa	%xmm5, %xmm1
4126	pandn	$res_y(%rsp), %xmm0
4127	movdqa	%xmm5, %xmm2
4128	pandn	$res_y+0x10(%rsp), %xmm1
4129	movdqa	%xmm5, %xmm3
4130	pand	$in2_y(%rsp), %xmm2
4131	pand	$in2_y+0x10(%rsp), %xmm3
4132	por	%xmm0, %xmm2
4133	por	%xmm1, %xmm3
4134
4135	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
4136	movdqa	%xmm4, %xmm1
4137	pandn	%xmm2, %xmm0
4138	movdqa	%xmm4, %xmm2
4139	pandn	%xmm3, %xmm1
4140	movdqa	%xmm4, %xmm3
4141	pand	$in1_y(%rsp), %xmm2
4142	pand	$in1_y+0x10(%rsp), %xmm3
4143	por	%xmm0, %xmm2
4144	por	%xmm1, %xmm3
4145	movdqu	%xmm2, 0x20($r_ptr)
4146	movdqu	%xmm3, 0x30($r_ptr)
4147
4148	lea	32*15+56(%rsp), %rsi
4149.cfi_def_cfa	%rsi,8
4150	mov	-48(%rsi),%r15
4151.cfi_restore	%r15
4152	mov	-40(%rsi),%r14
4153.cfi_restore	%r14
4154	mov	-32(%rsi),%r13
4155.cfi_restore	%r13
4156	mov	-24(%rsi),%r12
4157.cfi_restore	%r12
4158	mov	-16(%rsi),%rbx
4159.cfi_restore	%rbx
4160	mov	-8(%rsi),%rbp
4161.cfi_restore	%rbp
4162	lea	(%rsi),%rsp
4163.cfi_def_cfa_register	%rsp
4164.Ladd_affine${x}_epilogue:
4165	ret
4166.cfi_endproc
4167.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4168___
4169}
4170&gen_add_affine("q");
4171
4172########################################################################
4173# AD*X magic
4174#
4175if ($addx) {								{
4176########################################################################
4177# operate in 4-5-0-1 "name space" that matches multiplication output
4178#
4179my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4180
4181$code.=<<___;
4182.type	__ecp_nistz256_add_tox,\@abi-omnipotent
4183.align	32
4184__ecp_nistz256_add_tox:
4185.cfi_startproc
4186	xor	$t4, $t4
4187	adc	8*0($b_ptr), $a0
4188	adc	8*1($b_ptr), $a1
4189	 mov	$a0, $t0
4190	adc	8*2($b_ptr), $a2
4191	adc	8*3($b_ptr), $a3
4192	 mov	$a1, $t1
4193	adc	\$0, $t4
4194
4195	xor	$t3, $t3
4196	sbb	\$-1, $a0
4197	 mov	$a2, $t2
4198	sbb	$poly1, $a1
4199	sbb	\$0, $a2
4200	 mov	$a3, $t3
4201	sbb	$poly3, $a3
4202	sbb	\$0, $t4
4203
4204	cmovc	$t0, $a0
4205	cmovc	$t1, $a1
4206	mov	$a0, 8*0($r_ptr)
4207	cmovc	$t2, $a2
4208	mov	$a1, 8*1($r_ptr)
4209	cmovc	$t3, $a3
4210	mov	$a2, 8*2($r_ptr)
4211	mov	$a3, 8*3($r_ptr)
4212
4213	ret
4214.cfi_endproc
4215.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4216
4217.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
4218.align	32
4219__ecp_nistz256_sub_fromx:
4220.cfi_startproc
4221	xor	$t4, $t4
4222	sbb	8*0($b_ptr), $a0
4223	sbb	8*1($b_ptr), $a1
4224	 mov	$a0, $t0
4225	sbb	8*2($b_ptr), $a2
4226	sbb	8*3($b_ptr), $a3
4227	 mov	$a1, $t1
4228	sbb	\$0, $t4
4229
4230	xor	$t3, $t3
4231	adc	\$-1, $a0
4232	 mov	$a2, $t2
4233	adc	$poly1, $a1
4234	adc	\$0, $a2
4235	 mov	$a3, $t3
4236	adc	$poly3, $a3
4237
4238	bt	\$0, $t4
4239	cmovnc	$t0, $a0
4240	cmovnc	$t1, $a1
4241	mov	$a0, 8*0($r_ptr)
4242	cmovnc	$t2, $a2
4243	mov	$a1, 8*1($r_ptr)
4244	cmovnc	$t3, $a3
4245	mov	$a2, 8*2($r_ptr)
4246	mov	$a3, 8*3($r_ptr)
4247
4248	ret
4249.cfi_endproc
4250.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4251
4252.type	__ecp_nistz256_subx,\@abi-omnipotent
4253.align	32
4254__ecp_nistz256_subx:
4255.cfi_startproc
4256	xor	$t4, $t4
4257	sbb	$a0, $t0
4258	sbb	$a1, $t1
4259	 mov	$t0, $a0
4260	sbb	$a2, $t2
4261	sbb	$a3, $t3
4262	 mov	$t1, $a1
4263	sbb	\$0, $t4
4264
4265	xor	$a3 ,$a3
4266	adc	\$-1, $t0
4267	 mov	$t2, $a2
4268	adc	$poly1, $t1
4269	adc	\$0, $t2
4270	 mov	$t3, $a3
4271	adc	$poly3, $t3
4272
4273	bt	\$0, $t4
4274	cmovc	$t0, $a0
4275	cmovc	$t1, $a1
4276	cmovc	$t2, $a2
4277	cmovc	$t3, $a3
4278
4279	ret
4280.cfi_endproc
4281.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
4282
4283.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
4284.align	32
4285__ecp_nistz256_mul_by_2x:
4286.cfi_startproc
4287	xor	$t4, $t4
4288	adc	$a0, $a0		# a0:a3+a0:a3
4289	adc	$a1, $a1
4290	 mov	$a0, $t0
4291	adc	$a2, $a2
4292	adc	$a3, $a3
4293	 mov	$a1, $t1
4294	adc	\$0, $t4
4295
4296	xor	$t3, $t3
4297	sbb	\$-1, $a0
4298	 mov	$a2, $t2
4299	sbb	$poly1, $a1
4300	sbb	\$0, $a2
4301	 mov	$a3, $t3
4302	sbb	$poly3, $a3
4303	sbb	\$0, $t4
4304
4305	cmovc	$t0, $a0
4306	cmovc	$t1, $a1
4307	mov	$a0, 8*0($r_ptr)
4308	cmovc	$t2, $a2
4309	mov	$a1, 8*1($r_ptr)
4310	cmovc	$t3, $a3
4311	mov	$a2, 8*2($r_ptr)
4312	mov	$a3, 8*3($r_ptr)
4313
4314	ret
4315.cfi_endproc
4316.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4317___
4318									}
4319&gen_double("x");
4320&gen_add("x");
4321&gen_add_affine("x");
4322}
4323}}}
4324
4325# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4326#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4327if ($win64) {
4328$rec="%rcx";
4329$frame="%rdx";
4330$context="%r8";
4331$disp="%r9";
4332
4333$code.=<<___;
4334.extern	__imp_RtlVirtualUnwind
4335
4336.type	short_handler,\@abi-omnipotent
4337.align	16
4338short_handler:
4339	push	%rsi
4340	push	%rdi
4341	push	%rbx
4342	push	%rbp
4343	push	%r12
4344	push	%r13
4345	push	%r14
4346	push	%r15
4347	pushfq
4348	sub	\$64,%rsp
4349
4350	mov	120($context),%rax	# pull context->Rax
4351	mov	248($context),%rbx	# pull context->Rip
4352
4353	mov	8($disp),%rsi		# disp->ImageBase
4354	mov	56($disp),%r11		# disp->HandlerData
4355
4356	mov	0(%r11),%r10d		# HandlerData[0]
4357	lea	(%rsi,%r10),%r10	# end of prologue label
4358	cmp	%r10,%rbx		# context->Rip<end of prologue label
4359	jb	.Lcommon_seh_tail
4360
4361	mov	152($context),%rax	# pull context->Rsp
4362
4363	mov	4(%r11),%r10d		# HandlerData[1]
4364	lea	(%rsi,%r10),%r10	# epilogue label
4365	cmp	%r10,%rbx		# context->Rip>=epilogue label
4366	jae	.Lcommon_seh_tail
4367
4368	lea	16(%rax),%rax
4369
4370	mov	-8(%rax),%r12
4371	mov	-16(%rax),%r13
4372	mov	%r12,216($context)	# restore context->R12
4373	mov	%r13,224($context)	# restore context->R13
4374
4375	jmp	.Lcommon_seh_tail
4376.size	short_handler,.-short_handler
4377
4378.type	full_handler,\@abi-omnipotent
4379.align	16
4380full_handler:
4381	push	%rsi
4382	push	%rdi
4383	push	%rbx
4384	push	%rbp
4385	push	%r12
4386	push	%r13
4387	push	%r14
4388	push	%r15
4389	pushfq
4390	sub	\$64,%rsp
4391
4392	mov	120($context),%rax	# pull context->Rax
4393	mov	248($context),%rbx	# pull context->Rip
4394
4395	mov	8($disp),%rsi		# disp->ImageBase
4396	mov	56($disp),%r11		# disp->HandlerData
4397
4398	mov	0(%r11),%r10d		# HandlerData[0]
4399	lea	(%rsi,%r10),%r10	# end of prologue label
4400	cmp	%r10,%rbx		# context->Rip<end of prologue label
4401	jb	.Lcommon_seh_tail
4402
4403	mov	152($context),%rax	# pull context->Rsp
4404
4405	mov	4(%r11),%r10d		# HandlerData[1]
4406	lea	(%rsi,%r10),%r10	# epilogue label
4407	cmp	%r10,%rbx		# context->Rip>=epilogue label
4408	jae	.Lcommon_seh_tail
4409
4410	mov	8(%r11),%r10d		# HandlerData[2]
4411	lea	(%rax,%r10),%rax
4412
4413	mov	-8(%rax),%rbp
4414	mov	-16(%rax),%rbx
4415	mov	-24(%rax),%r12
4416	mov	-32(%rax),%r13
4417	mov	-40(%rax),%r14
4418	mov	-48(%rax),%r15
4419	mov	%rbx,144($context)	# restore context->Rbx
4420	mov	%rbp,160($context)	# restore context->Rbp
4421	mov	%r12,216($context)	# restore context->R12
4422	mov	%r13,224($context)	# restore context->R13
4423	mov	%r14,232($context)	# restore context->R14
4424	mov	%r15,240($context)	# restore context->R15
4425
4426.Lcommon_seh_tail:
4427	mov	8(%rax),%rdi
4428	mov	16(%rax),%rsi
4429	mov	%rax,152($context)	# restore context->Rsp
4430	mov	%rsi,168($context)	# restore context->Rsi
4431	mov	%rdi,176($context)	# restore context->Rdi
4432
4433	mov	40($disp),%rdi		# disp->ContextRecord
4434	mov	$context,%rsi		# context
4435	mov	\$154,%ecx		# sizeof(CONTEXT)
4436	.long	0xa548f3fc		# cld; rep movsq
4437
4438	mov	$disp,%rsi
4439	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4440	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4441	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4442	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4443	mov	40(%rsi),%r10		# disp->ContextRecord
4444	lea	56(%rsi),%r11		# &disp->HandlerData
4445	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4446	mov	%r10,32(%rsp)		# arg5
4447	mov	%r11,40(%rsp)		# arg6
4448	mov	%r12,48(%rsp)		# arg7
4449	mov	%rcx,56(%rsp)		# arg8, (NULL)
4450	call	*__imp_RtlVirtualUnwind(%rip)
4451
4452	mov	\$1,%eax		# ExceptionContinueSearch
4453	add	\$64,%rsp
4454	popfq
4455	pop	%r15
4456	pop	%r14
4457	pop	%r13
4458	pop	%r12
4459	pop	%rbp
4460	pop	%rbx
4461	pop	%rdi
4462	pop	%rsi
4463	ret
4464.size	full_handler,.-full_handler
4465
4466.section	.pdata
4467.align	4
4468	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
4469	.rva	.LSEH_end_ecp_nistz256_mul_by_2
4470	.rva	.LSEH_info_ecp_nistz256_mul_by_2
4471
4472	.rva	.LSEH_begin_ecp_nistz256_div_by_2
4473	.rva	.LSEH_end_ecp_nistz256_div_by_2
4474	.rva	.LSEH_info_ecp_nistz256_div_by_2
4475
4476	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
4477	.rva	.LSEH_end_ecp_nistz256_mul_by_3
4478	.rva	.LSEH_info_ecp_nistz256_mul_by_3
4479
4480	.rva	.LSEH_begin_ecp_nistz256_add
4481	.rva	.LSEH_end_ecp_nistz256_add
4482	.rva	.LSEH_info_ecp_nistz256_add
4483
4484	.rva	.LSEH_begin_ecp_nistz256_sub
4485	.rva	.LSEH_end_ecp_nistz256_sub
4486	.rva	.LSEH_info_ecp_nistz256_sub
4487
4488	.rva	.LSEH_begin_ecp_nistz256_neg
4489	.rva	.LSEH_end_ecp_nistz256_neg
4490	.rva	.LSEH_info_ecp_nistz256_neg
4491
4492	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
4493	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
4494	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
4495
4496	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
4497	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
4498	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
4499___
4500$code.=<<___	if ($addx);
4501	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
4502	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
4503	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
4504
4505	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
4506	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
4507	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
4508___
4509$code.=<<___;
4510	.rva	.LSEH_begin_ecp_nistz256_to_mont
4511	.rva	.LSEH_end_ecp_nistz256_to_mont
4512	.rva	.LSEH_info_ecp_nistz256_to_mont
4513
4514	.rva	.LSEH_begin_ecp_nistz256_mul_mont
4515	.rva	.LSEH_end_ecp_nistz256_mul_mont
4516	.rva	.LSEH_info_ecp_nistz256_mul_mont
4517
4518	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
4519	.rva	.LSEH_end_ecp_nistz256_sqr_mont
4520	.rva	.LSEH_info_ecp_nistz256_sqr_mont
4521
4522	.rva	.LSEH_begin_ecp_nistz256_from_mont
4523	.rva	.LSEH_end_ecp_nistz256_from_mont
4524	.rva	.LSEH_info_ecp_nistz256_from_mont
4525
4526	.rva	.LSEH_begin_ecp_nistz256_gather_w5
4527	.rva	.LSEH_end_ecp_nistz256_gather_w5
4528	.rva	.LSEH_info_ecp_nistz256_gather_wX
4529
4530	.rva	.LSEH_begin_ecp_nistz256_gather_w7
4531	.rva	.LSEH_end_ecp_nistz256_gather_w7
4532	.rva	.LSEH_info_ecp_nistz256_gather_wX
4533___
4534$code.=<<___	if ($avx>1);
4535	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
4536	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
4537	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
4538
4539	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
4540	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
4541	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
4542___
4543$code.=<<___;
4544	.rva	.LSEH_begin_ecp_nistz256_point_double
4545	.rva	.LSEH_end_ecp_nistz256_point_double
4546	.rva	.LSEH_info_ecp_nistz256_point_double
4547
4548	.rva	.LSEH_begin_ecp_nistz256_point_add
4549	.rva	.LSEH_end_ecp_nistz256_point_add
4550	.rva	.LSEH_info_ecp_nistz256_point_add
4551
4552	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
4553	.rva	.LSEH_end_ecp_nistz256_point_add_affine
4554	.rva	.LSEH_info_ecp_nistz256_point_add_affine
4555___
4556$code.=<<___ if ($addx);
4557	.rva	.LSEH_begin_ecp_nistz256_point_doublex
4558	.rva	.LSEH_end_ecp_nistz256_point_doublex
4559	.rva	.LSEH_info_ecp_nistz256_point_doublex
4560
4561	.rva	.LSEH_begin_ecp_nistz256_point_addx
4562	.rva	.LSEH_end_ecp_nistz256_point_addx
4563	.rva	.LSEH_info_ecp_nistz256_point_addx
4564
4565	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
4566	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
4567	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
4568___
4569$code.=<<___;
4570
4571.section	.xdata
4572.align	8
4573.LSEH_info_ecp_nistz256_mul_by_2:
4574	.byte	9,0,0,0
4575	.rva	short_handler
4576	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
4577.LSEH_info_ecp_nistz256_div_by_2:
4578	.byte	9,0,0,0
4579	.rva	short_handler
4580	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
4581.LSEH_info_ecp_nistz256_mul_by_3:
4582	.byte	9,0,0,0
4583	.rva	short_handler
4584	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
4585.LSEH_info_ecp_nistz256_add:
4586	.byte	9,0,0,0
4587	.rva	short_handler
4588	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
4589.LSEH_info_ecp_nistz256_sub:
4590	.byte	9,0,0,0
4591	.rva	short_handler
4592	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
4593.LSEH_info_ecp_nistz256_neg:
4594	.byte	9,0,0,0
4595	.rva	short_handler
4596	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
4597.LSEH_info_ecp_nistz256_ord_mul_mont:
4598	.byte	9,0,0,0
4599	.rva	full_handler
4600	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
4601	.long	48,0
4602.LSEH_info_ecp_nistz256_ord_sqr_mont:
4603	.byte	9,0,0,0
4604	.rva	full_handler
4605	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
4606	.long	48,0
4607___
4608$code.=<<___ if ($addx);
4609.LSEH_info_ecp_nistz256_ord_mul_montx:
4610	.byte	9,0,0,0
4611	.rva	full_handler
4612	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
4613	.long	48,0
4614.LSEH_info_ecp_nistz256_ord_sqr_montx:
4615	.byte	9,0,0,0
4616	.rva	full_handler
4617	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
4618	.long	48,0
4619___
4620$code.=<<___;
4621.LSEH_info_ecp_nistz256_to_mont:
4622	.byte	9,0,0,0
4623	.rva	full_handler
4624	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4625	.long	48,0
4626.LSEH_info_ecp_nistz256_mul_mont:
4627	.byte	9,0,0,0
4628	.rva	full_handler
4629	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4630	.long	48,0
4631.LSEH_info_ecp_nistz256_sqr_mont:
4632	.byte	9,0,0,0
4633	.rva	full_handler
4634	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
4635	.long	48,0
4636.LSEH_info_ecp_nistz256_from_mont:
4637	.byte	9,0,0,0
4638	.rva	short_handler
4639	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
4640.LSEH_info_ecp_nistz256_gather_wX:
4641	.byte	0x01,0x33,0x16,0x00
4642	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
4643	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
4644	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
4645	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
4646	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
4647	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
4648	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
4649	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
4650	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
4651	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
4652	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
4653	.align	8
4654___
4655$code.=<<___	if ($avx>1);
4656.LSEH_info_ecp_nistz256_avx2_gather_wX:
4657	.byte	0x01,0x36,0x17,0x0b
4658	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
4659	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
4660	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
4661	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
4662	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
4663	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
4664	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
4665	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
4666	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
4667	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
4668	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
4669	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
4670	.align	8
4671___
4672$code.=<<___;
4673.LSEH_info_ecp_nistz256_point_double:
4674	.byte	9,0,0,0
4675	.rva	full_handler
4676	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
4677	.long	32*5+56,0
4678.LSEH_info_ecp_nistz256_point_add:
4679	.byte	9,0,0,0
4680	.rva	full_handler
4681	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
4682	.long	32*18+56,0
4683.LSEH_info_ecp_nistz256_point_add_affine:
4684	.byte	9,0,0,0
4685	.rva	full_handler
4686	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
4687	.long	32*15+56,0
4688___
4689$code.=<<___ if ($addx);
4690.align	8
4691.LSEH_info_ecp_nistz256_point_doublex:
4692	.byte	9,0,0,0
4693	.rva	full_handler
4694	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
4695	.long	32*5+56,0
4696.LSEH_info_ecp_nistz256_point_addx:
4697	.byte	9,0,0,0
4698	.rva	full_handler
4699	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
4700	.long	32*18+56,0
4701.LSEH_info_ecp_nistz256_point_add_affinex:
4702	.byte	9,0,0,0
4703	.rva	full_handler
4704	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
4705	.long	32*15+56,0
4706___
4707}
4708
4709########################################################################
4710# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
4711#
4712open TABLE,"<ecp_nistz256_table.c"		or
4713open TABLE,"<${dir}../ecp_nistz256_table.c"	or
4714die "failed to open ecp_nistz256_table.c:",$!;
4715
4716use integer;
4717
4718foreach(<TABLE>) {
4719	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
4720}
4721close TABLE;
4722
4723die "insane number of elements" if ($#arr != 64*16*37-1);
4724
4725print <<___;
4726.text
4727.globl	ecp_nistz256_precomputed
4728.type	ecp_nistz256_precomputed,\@object
4729.align	4096
4730ecp_nistz256_precomputed:
4731___
4732while (@line=splice(@arr,0,16)) {
4733	print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
4734}
4735print <<___;
4736.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
4737___
4738
4739$code =~ s/\`([^\`]*)\`/eval $1/gem;
4740print $code;
4741close STDOUT or die "error closing STDOUT: $!";
4742