#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# 3. All advertising materials mentioning features or use of this
#    software must display the following acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
#
# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
#    endorse or promote products derived from this software without
#    prior written permission. For written permission, please contact
#    openssl-core@openssl.org.
#
# 5. Products derived from this software may not be called "OpenSSL"
#    nor may "OpenSSL" appear in their names without prior written
#    permission of the OpenSSL Project.
#
# 6. Redistributions of any form whatsoever must retain the following
#    acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
#
# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
#
# This product includes cryptographic software written by Eric Young
# (eay@cryptsoft.com).  This product includes software written by Tim
# Hudson (tjh@cryptsoft.com).


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+120-360%
# Cortex-A53		+120-400%
# Cortex-A57		+120-350%
# X-Gene		+200-330%
# Denver		+140-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include <GFp/arm_arch.h>

.text
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,%function
.align	4
GFp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

// void	GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	GFp_nistz256_sqr_mont
.type	GFp_nistz256_sqr_mont,%function
.align	4
GFp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont

// void	GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	GFp_nistz256_add
.type	GFp_nistz256_add,%function
.align	4
GFp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_add,.-GFp_nistz256_add

// void	GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	GFp_nistz256_neg
.type	GFp_nistz256_neg,%function
.align	4
GFp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_neg,.-GFp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
// in $a0-$a3 and b[0] in $bi.
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
        # A reduction iteration is normally performed by accumulating
        # the result of multiplying the modulus by the "magic" digit
        # [and omitting the least significant word, which is guaranteed
        # to be 0], but thanks to the special form of the modulus and
        # the "magic" digit being equal to the least significant word,
        # it can be performed with additions and subtractions alone.
        # Indeed:
        #
        #            ffff0001.00000000.0000ffff.ffffffff
        # *                                     abcdefgh
        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        #
        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
        # rewrite the above as:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
        #
        # or, marking the redundant operations:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
        # - 0000abcd.efgh0000.--------.--------.--------
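        #
        # For reference, here is a plain C sketch of the whole
        # multiplication (not the code this file emits; u128 is a
        # typedef for unsigned __int128, and P[] spells out .Lpoly).
        # Because P[0] is all ones, n0 = -P^(-1) mod 2^64 is simply 1,
        # so the "magic" digit of every reduction step is just acc[0]:
        #
        #   typedef unsigned __int128 u128;
        #   static const uint64_t P[4] = {
        #       0xffffffffffffffffULL, 0x00000000ffffffffULL,
        #       0x0000000000000000ULL, 0xffffffff00000001ULL };
        #
        #   // r = a * b * 2^-256 mod P, CIOS-style
        #   static void mont_mul(uint64_t r[4], const uint64_t a[4],
        #                        const uint64_t b[4]) {
        #       uint64_t acc[6] = {0};
        #       for (int i = 0; i < 4; i++) {
        #           uint64_t carry = 0;
        #           for (int j = 0; j < 4; j++) {     // acc += a[] * b[i]
        #               u128 t = (u128)a[j] * b[i] + acc[j] + carry;
        #               acc[j] = (uint64_t)t;
        #               carry  = (uint64_t)(t >> 64);
        #           }
        #           u128 t = (u128)acc[4] + carry;
        #           acc[4] = (uint64_t)t;
        #           acc[5] = (uint64_t)(t >> 64);
        #
        #           uint64_t m = acc[0];              // "magic" digit
        #           carry = 0;
        #           for (int j = 0; j < 4; j++) {     // acc += m * P
        #               u128 v = (u128)m * P[j] + acc[j] + carry;
        #               acc[j] = (uint64_t)v;
        #               carry  = (uint64_t)(v >> 64);
        #           }
        #           t = (u128)acc[4] + carry;         // acc[0] is 0 now;
        #           acc[0] = acc[1];                  // shift down a word
        #           acc[1] = acc[2];
        #           acc[2] = acc[3];
        #           acc[3] = (uint64_t)t;
        #           acc[4] = acc[5] + (uint64_t)(t >> 64);
        #           acc[5] = 0;
        #       }
        #       // acc < 2*P here: subtract P and keep the difference
        #       // unless it borrows (the csel/lo at the end below)
        #       uint64_t d[4], borrow = 0;
        #       for (int j = 0; j < 4; j++) {
        #           u128 v = (u128)acc[j] - P[j] - borrow;
        #           d[j]   = (uint64_t)v;
        #           borrow = (uint64_t)(v >> 64) & 1;
        #       }
        #       for (int j = 0; j < 4; j++)
        #           r[j] = (borrow > acc[4]) ? acc[j] : d[j];
        #   }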

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
// in $a0-$a3.
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes $accx, i.e. the
	//  x-th accumulator word lives in register $accx
	//
	//  The "can't overflow" remarks below mark carries into the high
	//  part of a multiplication result, which cannot overflow because
	//  that part can never be all ones.
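	//
	//  A rough C model of this strategy (reference only, not part of
	//  the generated code; u128 stands for unsigned __int128): the
	//  cross products are computed once, doubled, and the squares
	//  a[i]*a[i] are added on top, exactly as the diagram suggests.
	//
	//      u128 t[8] = {0};
	//      for (int i = 0; i < 4; i++)          // cross products, once
	//          for (int j = i + 1; j < 4; j++) {
	//              u128 m = (u128)a[i] * a[j];
	//              t[i + j]     += (uint64_t)m;
	//              t[i + j + 1] += (uint64_t)(m >> 64);
	//          }
	//      for (int k = 0; k < 8; k++) t[k] += t[k];   // double them
	//      for (int i = 0; i < 4; i++) {        // add the squares
	//          u128 s = (u128)a[i] * a[i];
	//          t[2 * i]     += (uint64_t)s;
	//          t[2 * i + 1] += (uint64_t)(s >> 64);
	//      }
	//      u128 carry = 0;                      // propagate carries
	//      for (int k = 0; k < 8; k++) {
	//          carry += t[k];
	//          res[k] = (uint64_t)carry;
	//          carry >>= 64;
	//      }
	//
	//  The four reduction steps and the final conditional subtraction
	//  then proceed exactly as in the multiplication above.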

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	 lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	 lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded in
// $acc0-$acc3 and $t0-$t3. This is done because it is used in multiple
// contexts, e.g. for multiplication by 2 and 3.
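//
// A minimal C model (reference only; P[] holds .Lpoly's four words and
// u128 stands for unsigned __int128):
//
//     static void mod_add(uint64_t r[4], const uint64_t a[4],
//                         const uint64_t b[4]) {
//         uint64_t s[4], d[4], carry = 0, borrow = 0;
//         for (int i = 0; i < 4; i++) {          // s = a + b
//             u128 t = (u128)a[i] + b[i] + carry;
//             s[i] = (uint64_t)t;  carry = (uint64_t)(t >> 64);
//         }
//         for (int i = 0; i < 4; i++) {          // d = s - P
//             u128 t = (u128)s[i] - P[i] - borrow;
//             d[i] = (uint64_t)t;  borrow = (uint64_t)(t >> 64) & 1;
//         }
//         for (int i = 0; i < 4; i++)            // reduce iff s >= P
//             r[i] = (borrow > carry) ? s[i] : d[i];
//     }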
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
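
// __ecp_nistz256_sub_from computes ret = a-b and, if the subtraction
// borrows, adds the modulus back; __ecp_nistz256_sub_morf below is the
// same routine with the operands swapped ("morf" is "from" reversed).
// As a C model (same P[]/u128 conventions as above):
//
//     static void mod_sub(uint64_t r[4], const uint64_t a[4],
//                         const uint64_t b[4]) {
//         uint64_t d[4], s[4], borrow = 0, carry = 0;
//         for (int i = 0; i < 4; i++) {          // d = a - b
//             u128 t = (u128)a[i] - b[i] - borrow;
//             d[i] = (uint64_t)t;  borrow = (uint64_t)(t >> 64) & 1;
//         }
//         for (int i = 0; i < 4; i++) {          // s = d + P
//             u128 t = (u128)d[i] + P[i] + carry;
//             s[i] = (uint64_t)t;  carry = (uint64_t)(t >> 64);
//         }
//         for (int i = 0; i < 4; i++)            // add P back iff a < b
//             r[i] = borrow ? s[i] : d[i];
//     }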

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

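// __ecp_nistz256_div_by_2 halves a field element: an odd value is first
// made even by adding the (odd) modulus, then everything is shifted
// right one bit. A C model with an explicit branch (the code itself is
// branchless, selecting on the low bit with csel):
//
//     static void div_by_2(uint64_t r[4], const uint64_t a[4]) {
//         uint64_t s[5] = {a[0], a[1], a[2], a[3], 0};
//         if (a[0] & 1) {                        // odd: add the modulus
//             u128 t = 0;
//             for (int i = 0; i < 4; i++) {
//                 t += (u128)s[i] + P[i];
//                 s[i] = (uint64_t)t;  t >>= 64;
//             }
//             s[4] = (uint64_t)t;                // 257th bit
//         }
//         for (int i = 0; i < 4; i++)            // shift right one bit
//             r[i] = (s[i] >> 1) | (s[i + 1] << 63);
//     }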
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c.
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

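# For orientation, the exact sequence of field operations performed
# below, transcribed at the C level with the helper names used in the
# call-site comments (matching the statics in ecp_nistz256.c):
#
#	p256_mul_by_2(S, in_y);
#	p256_sqr_mont(Zsqr, in_z);
#	p256_add(M, Zsqr, in_x);
#	p256_sub(Zsqr, in_x, Zsqr);
#	p256_sqr_mont(S, S);
#	p256_mul_mont(tmp0, in_z, in_y);
#	p256_mul_by_2(res_z, tmp0);
#	p256_sqr_mont(tmp0, S);
#	p256_div_by_2(res_y, tmp0);
#	p256_mul_mont(M, M, Zsqr);
#	p256_mul_by_3(M, M);
#	p256_mul_mont(S, S, in_x);
#	p256_mul_by_2(tmp0, S);
#	p256_sqr_mont(res_x, M);
#	p256_sub(res_x, res_x, tmp0);
#	p256_sub(S, S, res_x);
#	p256_mul_mont(S, S, M);
#	p256_sub(res_y, S, res_y);
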
$code.=<<___;
.globl	GFp_nistz256_point_double
.type	GFp_nistz256_point_double,%function
.align	5
GFp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	 mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	 mov	$ap_real,$ap
	 ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	 ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$M]
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	 ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_double,.-GFp_nistz256_point_double
___
}

########################################################################
# void GFp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

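# As above, the field-operation sequence below at the C level, using the
# helper names from the call-site comments:
#
#	p256_sqr_mont(Z1sqr, in1_z);
#	p256_mul_mont(U2, Z1sqr, in2_x);
#	p256_sub(H, U2, in1_x);
#	p256_mul_mont(S2, Z1sqr, in1_z);
#	p256_mul_mont(res_z, H, in1_z);
#	p256_mul_mont(S2, S2, in2_y);
#	p256_sub(R, S2, in1_y);
#	p256_sqr_mont(Hsqr, H);
#	p256_sqr_mont(Rsqr, R);
#	p256_mul_mont(Hcub, Hsqr, H);
#	p256_mul_mont(U2, in1_x, Hsqr);
#	p256_mul_by_2(Hsqr, U2);
#	p256_sub(res_x, Rsqr, Hsqr);
#	p256_sub(res_x, res_x, Hcub);
#	p256_sub(res_y, U2, res_x);
#	p256_mul_mont(S2, in1_y, Hcub);
#	p256_mul_mont(res_y, res_y, R);
#	p256_sub(res_y, res_y, S2);
#
# followed by the constant-time fixup for inputs at infinity (see the
# conditional-move loop further down).
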
$code.=<<___;
.globl	GFp_nistz256_point_add_affine
.type	GFp_nistz256_point_add_affine,%function
.align	5
GFp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Z1sqr]
	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Hcub]
	 ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
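# The loop below does the constant-time result selection. The masks
# computed on entry hold all-ones when the corresponding input is NOT
# at infinity, so for every 32-byte coordinate the effect is, as a C
# sketch (the names here are descriptive, not from this file; the
# assembly realizes sel with cmp+csel rather than bit masks):
#
#	static uint64_t sel(uint64_t mask, uint64_t a, uint64_t b)
#	{
#		return (mask & a) | (~mask & b);	/* mask ? a : b */
#	}
#
#	tmp = sel(in1_not_inf, res, in2);	/* in1 at infinity -> in2 */
#	out = sel(in2_not_inf, tmp, in1);	/* in2 at infinity -> in1 */
#
# in2 is an affine point, so the z coordinate it contributes is
# .Lone_mont (1 in Montgomery form), substituted via the adr below.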
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
___
}	}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";