#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));
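# Note that x18 is skipped in the map() above: it is the platform
# register under AAPCS64 (reserved e.g. on Darwin and Windows) and
# must not be used as scratch.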

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr,
# not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
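# In other words, byte i of point j (i = 0..63, j = 0..63) lands at
# offset i*64 + j of a window's 64*64-byte block: row i collects
# byte i of all 64 points, which is the order ecp_nistz_gather_w7
# reads them back in.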
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:	// 2^256 mod P, i.e. 1 in Montgomery representation
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
        # A reduction iteration is normally performed by accumulating
        # the result of multiplying the modulus by a "magic" digit [and
        # omitting the least significant word, which is guaranteed to
        # be 0], but thanks to the special form of the modulus, with
        # the "magic" digit equal to the least significant word, it can
        # be performed with additions and subtractions alone. Indeed:
        #
        #            ffff0001.00000000.0000ffff.ffffffff
        # *                                     abcdefgh
        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        #
        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
        # rewrite the above as:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
        #
        # or, marking redundant operations:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
        # - 0000abcd.efgh0000.--------.--------.--------
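        #
        # Concretely, with m = acc[0] and the modulus
        # P = 2^256 - 2^224 + 2^192 + 2^96 - 1, one has
        #
        #   P*m = m*0xffffffff00000001*2^192 + m*2^96 - m
        #
        # so acc + P*m amounts to: dropping acc[0] (the -m term
        # cancels it), adding m<<96 (the lsl/lsr pair prepared
        # above), and adding m*0xffffffff00000001 at the top two
        # words, where the subs/sbc pair below computes
        # t3:t2 = m*(2^64+1) - (m<<32) = m*0xffffffff00000001.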

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below marks carries into the high part of a
	//  multiplication result, which can't overflow, because the high
	//  part can never be all ones: for a,b < 2^64, hi(a*b) is at
	//  most 2^64-2, so adding a carry bit cannot wrap.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	 lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	 lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

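// Halving modulo P: if a is odd, add P first (P is odd, so a+P is
// even), then shift the 257-bit sum right by one; the carry out of
// the addition is kept in $ap and shifted back in at the top.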
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c.
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));
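#
# The sequence below implements the usual Jacobian doubling formulas
# (matching the C reference in ecp_nistz256.c): with S = 4*X*Y^2 and
# M = 3*(X - Z^2)*(X + Z^2),
#	res_x = M^2 - 2*S
#	res_y = M*(S - res_x) - 8*Y^4
#	res_z = 2*Y*Z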

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	 mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	 mov	$ap_real,$ap
	 ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	 ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$M]
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	 ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
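#
# The sequence below follows the usual Jacobian addition (matching the
# C reference in ecp_nistz256.c): with U1 = X1*Z2^2, U2 = X2*Z1^2,
# S1 = Y1*Z2^3, S2 = Y2*Z1^3, H = U2 - U1 and R = S2 - S1,
#	res_x = R^2 - H^3 - 2*U1*H^2
#	res_y = R*(U1*H^2 - res_x) - S1*H^3
#	res_z = H*Z1*Z2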

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	 mov	$rp_real,$rp
	 mov	$ap_real,$ap
	 mov	$bp_real,$bp
	 ldr	$poly1,.Lpoly+8
	 ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[$ap_real]
	 ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp,$acc0,$acc2

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2
	tst	$acc0,$acc0
	b.ne	.Ladd_proceed		// is_equal(U1,U2)?

	tst	$in1infty,$in2infty
	b.eq	.Ladd_proceed		// (in1infty || in2infty)?

	tst	$temp,$temp
	b.eq	.Ladd_double		// is_equal(S1,S2)?

	eor	$a0,$a0,$a0
	eor	$a1,$a1,$a1
	stp	$a0,$a1,[$rp_real]
	stp	$a0,$a1,[$rp_real,#16]
	stp	$a0,$a1,[$rp_real,#32]
	stp	$a0,$a1,[$rp_real,#48]
	stp	$a0,$a1,[$rp_real,#64]
	stp	$a0,$a1,[$rp_real,#80]
	b	.Ladd_done

.align	4
.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$S1]
	 ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0	// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
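#
# With in2 affine, i.e. Z2 = 1, the addition simplifies (matching the
# C reference in ecp_nistz256.c): U2 = X2*Z1^2, S2 = Y2*Z1^3,
# H = U2 - X1, R = S2 - Y1, res_z = H*Z1, with res_x and res_y
# computed as in the generic case.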

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Z1sqr]
	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Hcub]
	 ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
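	#
	# The "magic" digit is t4 = acc[0]*ordk mod 2^64, with
	# ordk = .LordK = -1/ord mod 2^64, so that the low word of
	# acc + ord*t4 vanishes and can be omitted, as in a generic
	# Montgomery reduction. The omitted low-word addition
	# acc[0] + lo(ord[0]*t4), which is either 0 or exactly 2^64,
	# is replaced by "subs xzr,acc0,#1" below, which recreates
	# just its carry.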
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	 mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	 mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	 mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	 mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below marks carries into the high part of a
	//  multiplication result, which can't overflow, because the high
	//  part can never be all ones: for a,b < 2^64, hi(a*b) is at
	//  most 2^64-2, so adding a carry bit cannot wrap.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	 mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# scatter-gather subroutines
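#
# The table layouts below are chosen for cache-timing resistance: w5
# entries are interleaved as 32-bit words with a 64-byte stride (hence
# the "lsl#2" index scaling), w7 entries as single bytes with the same
# stride, so every 64-byte cache line holds a slice of many table
# entries and the secret index is only ever used arithmetically, never
# as a branch or a full-entry offset.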
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

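	// $index is 1..16 for a valid entry, 0 for the point at
	// infinity: csetm gives x3 = all-ones if index != 0, adding it
	// rebases the index to 0..15, and the "ne" flags from the cmp
	// are reused by the csel's below to zero the output when the
	// index was 0.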
	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT;	# enforce flush
