#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion smashes P256_POINT_AFFINE into individual bytes at
# 64-byte intervals, similar to
#	1111222233334444
#	1234123412341234
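# Editor's illustration (an unused reference sketch, not part of the
# original CRYPTOGAMS module): the same transposition in plain Perl.
# Given 64 point entries of 64 bytes each, row $i of the result holds
# byte $i of every entry, which is the layout ecp_nistz_gather_w7 reads
# back with 64-byte-strided byte loads.
sub _transpose_w7_sketch {
	my @entries = @_;		# 64 strings of 64 bytes each
	my @rows;
	for my $i (0..63) {
		$rows[$i] = join('',map { substr($_,$i,1) } @entries);
	}
	return @rows;
}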
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
// in $a0-$a3 and b[0] in $bi.
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
        # A reduction iteration is normally performed by accumulating
        # the result of multiplying the modulus by the "magic" digit
        # [and omitting the least significant word, which is guaranteed
        # to be 0], but thanks to the special form of the modulus, with
        # the "magic" digit being equal to the least significant word,
        # it can be performed with additions and subtractions alone.
        # Indeed:
        #
        #            ffff0001.00000000.0000ffff.ffffffff
        # *                                     abcdefgh
        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        #
        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
        # rewrite the above as:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
        #
        # or, marking redundant operations:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
        # - 0000abcd.efgh0000.--------.--------.--------
        #
        # (A whole-number sketch of this step follows this loop.)

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
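# Editor's illustration (an unused reference sketch, not part of the
# original module): the reduction step generated above, written as
# whole-number arithmetic. Because the least significant word of the
# P-256 modulus is 0xffffffffffffffff, i.e. p == -1 (mod 2^64), the
# Montgomery "magic" digit is simply acc[0] itself, which is what lets
# the assembly replace the multiplication by p with shifts, additions
# and subtractions.
sub _p256_mont_reduce_step_sketch {
	require Math::BigInt;
	my $acc = shift;			# Math::BigInt accumulator
	my $p = Math::BigInt->from_hex('0x'.
	    'ffffffff00000001'.'0000000000000000'.
	    '00000000ffffffff'.'ffffffffffffffff');
	my $two64 = Math::BigInt->bone->blsft(64);
	my $m = $acc->copy->bmod($two64);	# "magic" digit = acc[0]
	return $acc->copy->badd($m->bmul($p))->brsft(64);
}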
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
// in $a0-$a3.
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  The "can't overflow" remarks below mark carries into the high
	//  part of a multiplication result, which can't overflow because
	//  the high part can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	 lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	 lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
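# Editor's illustration (an unused reference sketch, not part of the
# original module): the squaring schedule drawn in the comments of
# __ecp_nistz256_sqr_mont above, on 16-bit limbs so that plain Perl
# integers suffice under the file's "use integer" pragma. Off-diagonal
# products are accumulated once and doubled, then the diagonal
# a[i]*a[i] terms are added.
sub _sqr_schedule_sketch {
	my @a = @_;			# four 16-bit limbs, least significant first
	my @acc = (0) x 8;
	for my $i (0..2) {
		for my $j ($i+1..3) {	# off-diagonal products, taken once...
			my $p = $a[$i]*$a[$j];
			$acc[$i+$j]   += $p & 0xffff;
			$acc[$i+$j+1] += $p >> 16;
		}
	}
	$_ *= 2 for @acc;		# ...then doubled (slots 0 and 7 still zero)
	for my $i (0..3) {		# add the diagonal squares
		my $p = $a[$i]*$a[$i];
		$acc[2*$i]   += $p & 0xffff;
		$acc[2*$i+1] += $p >> 16;
	}
	for my $i (0..6) {		# propagate carries into 16-bit limbs
		$acc[$i+1] += $acc[$i] >> 16;
		$acc[$i]   &= 0xffff;
	}
	return @acc;			# eight 16-bit limbs of the square
}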
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	 mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	 mov	$ap_real,$ap
	 ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	 ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$M]
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	 ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	 mov	$rp_real,$rp
	 mov	$ap_real,$ap
	 mov	$bp_real,$bp
	 ldr	$poly1,.Lpoly+8
	 ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[$ap_real]
	 ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$S1]
	 ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Z1sqr]
	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Hcub]
	 ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
	#
	# (A whole-number sketch of this step follows this loop.)
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	 mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	 mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	 mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	 mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
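# Editor's illustration (an unused reference sketch, not part of the
# original module): the ord reduction step above as whole-number
# arithmetic. Unlike the field modulus, the group order n has no word
# equal to -1 (mod 2^64), so each step needs the precomputed constant
# from .LordK, assumed here to be k = -n^-1 mod 2^64.
sub _ord_mont_reduce_step_sketch {
	require Math::BigInt;
	my $acc = shift;			# Math::BigInt accumulator
	my $n = Math::BigInt->from_hex('0x'.
	    'ffffffff00000000'.'ffffffffffffffff'.
	    'bce6faada7179e84'.'f3b9cac2fc632551');
	my $k = Math::BigInt->from_hex('0xccd1c8aaee00bc4f');
	my $two64 = Math::BigInt->bone->blsft(64);
	my $m = $acc->copy->bmod($two64)->bmul($k)->bmod($two64);
	return $acc->copy->badd($m->bmul($n))->brsft(64);
}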
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  The "can't overflow" remarks below mark carries into the high
	//  part of a multiplication result, which can't overflow because
	//  the high part can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	 mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
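# Editor's illustration (an unused reference sketch, not part of the
# original module): what the w7 gather below computes. Slot 0 of a
# window corresponds to the point at infinity and is never stored, so
# index 0 must return all-zero bytes; the assembly achieves this
# branch-free with csetm/and and indexes the transposed byte-row layout
# with 64-byte-strided byte loads.
sub _gather_w7_sketch {
	my ($rows,$idx) = @_;		# $rows: ref to 64 byte-rows of the table
	my $msk = $idx != 0 ? 0xff : 0x00;
	$idx-- if $idx != 0;		# stored points are at indices 1..64
	my $point = '';
	for my $i (0..63) {		# byte $i of the requested point
		$point .= chr(ord(substr($rows->[$i],$idx,1)) & $msk);
	}
	return $point;			# 64 zero bytes when the index was 0
}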
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush