1#! /usr/bin/env perl
2# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for ARMv4.
18#
19# October 2014.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816. In the process of adaptation
23# original .c module was made 32-bit savvy in order to make this
24# implementation possible.
25#
26#			with/without -DECP_NISTZ256_ASM
27# Cortex-A8		+53-170%
28# Cortex-A9		+76-205%
29# Cortex-A15		+100-316%
30# Snapdragon S4		+66-187%
31#
32# Ranges denote minimum and maximum improvement coefficients depending
33# on benchmark. Lower coefficients are for ECDSA sign, server-side
34# operation. Keep in mind that +200% means 3x improvement.
35
36$flavour = shift;
37if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
38else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
39
40if ($flavour && $flavour ne "void") {
41    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
44    die "can't locate arm-xlate.pl";
45
46    open STDOUT,"| \"$^X\" $xlate $flavour $output";
47} else {
48    open STDOUT,">$output";
49}
50
51$code.=<<___;
52#include "arm_arch.h"
53
54.text
55#if defined(__thumb2__)
56.syntax	unified
57.thumb
58#else
59.code	32
60#endif
61___
62########################################################################
63# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
64#
65$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66open TABLE,"<ecp_nistz256_table.c"		or
67open TABLE,"<${dir}../ecp_nistz256_table.c"	or
68die "failed to open ecp_nistz256_table.c:",$!;
69
70use integer;
71
72foreach(<TABLE>) {
73	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
74}
75close TABLE;
76
77# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
78# 64*16*37-1 is because $#arr returns last valid index or @arr, not
79# amount of elements.
80die "insane number of elements" if ($#arr != 64*16*37-1);
81
82$code.=<<___;
83.globl	ecp_nistz256_precomputed
84.type	ecp_nistz256_precomputed,%object
85.align	12
86ecp_nistz256_precomputed:
87___
88########################################################################
89# this conversion smashes P256_POINT_AFFINE by individual bytes with
90# 64 byte interval, similar to
91#	1111222233334444
92#	1234123412341234
93for(1..37) {
94	@tbl = splice(@arr,0,64*16);
95	for($i=0;$i<64;$i++) {
96		undef @line;
97		for($j=0;$j<64;$j++) {
98			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
99		}
100		$code.=".byte\t";
101		$code.=join(',',map { sprintf "0x%02x",$_} @line);
102		$code.="\n";
103	}
104}
105$code.=<<___;
106.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
107.align	5
108.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
109.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
110.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
111.Lone:
112.long	1,0,0,0,0,0,0,0
113.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
114.align	6
115___
116
117########################################################################
118# common register layout, note that $t2 is link register, so that if
119# internal subroutine uses $t2, then it has to offload lr...
120
121($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
122		map("r$_",(0..12,14));
123($t0,$t3)=($ff,$a_ptr);
124
125$code.=<<___;
126@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
127.globl	ecp_nistz256_to_mont
128.type	ecp_nistz256_to_mont,%function
129ecp_nistz256_to_mont:
130	adr	$b_ptr,.LRR
131	b	.Lecp_nistz256_mul_mont
132.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
133
134@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
135.globl	ecp_nistz256_from_mont
136.type	ecp_nistz256_from_mont,%function
137ecp_nistz256_from_mont:
138	adr	$b_ptr,.Lone
139	b	.Lecp_nistz256_mul_mont
140.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
141
142@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
143.globl	ecp_nistz256_mul_by_2
144.type	ecp_nistz256_mul_by_2,%function
145.align	4
146ecp_nistz256_mul_by_2:
147	stmdb	sp!,{r4-r12,lr}
148	bl	__ecp_nistz256_mul_by_2
149#if __ARM_ARCH__>=5 || !defined(__thumb__)
150	ldmia	sp!,{r4-r12,pc}
151#else
152	ldmia	sp!,{r4-r12,lr}
153	bx	lr			@ interoperable with Thumb ISA:-)
154#endif
155.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
156
157.type	__ecp_nistz256_mul_by_2,%function
158.align	4
159__ecp_nistz256_mul_by_2:
160	ldr	$a0,[$a_ptr,#0]
161	ldr	$a1,[$a_ptr,#4]
162	ldr	$a2,[$a_ptr,#8]
163	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
164	ldr	$a3,[$a_ptr,#12]
165	adcs	$a1,$a1,$a1
166	ldr	$a4,[$a_ptr,#16]
167	adcs	$a2,$a2,$a2
168	ldr	$a5,[$a_ptr,#20]
169	adcs	$a3,$a3,$a3
170	ldr	$a6,[$a_ptr,#24]
171	adcs	$a4,$a4,$a4
172	ldr	$a7,[$a_ptr,#28]
173	adcs	$a5,$a5,$a5
174	adcs	$a6,$a6,$a6
175	mov	$ff,#0
176	adcs	$a7,$a7,$a7
177	adc	$ff,$ff,#0
178
179	b	.Lreduce_by_sub
180.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
181
182@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
183@					const BN_ULONG r2[8]);
184.globl	ecp_nistz256_add
185.type	ecp_nistz256_add,%function
186.align	4
187ecp_nistz256_add:
188	stmdb	sp!,{r4-r12,lr}
189	bl	__ecp_nistz256_add
190#if __ARM_ARCH__>=5 || !defined(__thumb__)
191	ldmia	sp!,{r4-r12,pc}
192#else
193	ldmia	sp!,{r4-r12,lr}
194	bx	lr			@ interoperable with Thumb ISA:-)
195#endif
196.size	ecp_nistz256_add,.-ecp_nistz256_add
197
198.type	__ecp_nistz256_add,%function
199.align	4
200__ecp_nistz256_add:
201	str	lr,[sp,#-4]!		@ push lr
202
203	ldr	$a0,[$a_ptr,#0]
204	ldr	$a1,[$a_ptr,#4]
205	ldr	$a2,[$a_ptr,#8]
206	ldr	$a3,[$a_ptr,#12]
207	ldr	$a4,[$a_ptr,#16]
208	 ldr	$t0,[$b_ptr,#0]
209	ldr	$a5,[$a_ptr,#20]
210	 ldr	$t1,[$b_ptr,#4]
211	ldr	$a6,[$a_ptr,#24]
212	 ldr	$t2,[$b_ptr,#8]
213	ldr	$a7,[$a_ptr,#28]
214	 ldr	$t3,[$b_ptr,#12]
215	adds	$a0,$a0,$t0
216	 ldr	$t0,[$b_ptr,#16]
217	adcs	$a1,$a1,$t1
218	 ldr	$t1,[$b_ptr,#20]
219	adcs	$a2,$a2,$t2
220	 ldr	$t2,[$b_ptr,#24]
221	adcs	$a3,$a3,$t3
222	 ldr	$t3,[$b_ptr,#28]
223	adcs	$a4,$a4,$t0
224	adcs	$a5,$a5,$t1
225	adcs	$a6,$a6,$t2
226	mov	$ff,#0
227	adcs	$a7,$a7,$t3
228	adc	$ff,$ff,#0
229	ldr	lr,[sp],#4		@ pop lr
230
231.Lreduce_by_sub:
232
233	@ if a+b >= modulus, subtract modulus.
234	@
235	@ But since comparison implies subtraction, we subtract
236	@ modulus and then add it back if subtraction borrowed.
237
238	subs	$a0,$a0,#-1
239	sbcs	$a1,$a1,#-1
240	sbcs	$a2,$a2,#-1
241	sbcs	$a3,$a3,#0
242	sbcs	$a4,$a4,#0
243	sbcs	$a5,$a5,#0
244	sbcs	$a6,$a6,#1
245	sbcs	$a7,$a7,#-1
246	sbc	$ff,$ff,#0
247
248	@ Note that because mod has special form, i.e. consists of
249	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
250	@ using value of borrow as a whole or extracting single bit.
251	@ Follow $ff register...
252
253	adds	$a0,$a0,$ff		@ add synthesized modulus
254	adcs	$a1,$a1,$ff
255	str	$a0,[$r_ptr,#0]
256	adcs	$a2,$a2,$ff
257	str	$a1,[$r_ptr,#4]
258	adcs	$a3,$a3,#0
259	str	$a2,[$r_ptr,#8]
260	adcs	$a4,$a4,#0
261	str	$a3,[$r_ptr,#12]
262	adcs	$a5,$a5,#0
263	str	$a4,[$r_ptr,#16]
264	adcs	$a6,$a6,$ff,lsr#31
265	str	$a5,[$r_ptr,#20]
266	adcs	$a7,$a7,$ff
267	str	$a6,[$r_ptr,#24]
268	str	$a7,[$r_ptr,#28]
269
270	mov	pc,lr
271.size	__ecp_nistz256_add,.-__ecp_nistz256_add
272
273@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
274.globl	ecp_nistz256_mul_by_3
275.type	ecp_nistz256_mul_by_3,%function
276.align	4
277ecp_nistz256_mul_by_3:
278	stmdb	sp!,{r4-r12,lr}
279	bl	__ecp_nistz256_mul_by_3
280#if __ARM_ARCH__>=5 || !defined(__thumb__)
281	ldmia	sp!,{r4-r12,pc}
282#else
283	ldmia	sp!,{r4-r12,lr}
284	bx	lr			@ interoperable with Thumb ISA:-)
285#endif
286.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
287
288.type	__ecp_nistz256_mul_by_3,%function
289.align	4
290__ecp_nistz256_mul_by_3:
291	str	lr,[sp,#-4]!		@ push lr
292
293	@ As multiplication by 3 is performed as 2*n+n, below are inline
294	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
295	@ corresponding subroutines for details.
296
297	ldr	$a0,[$a_ptr,#0]
298	ldr	$a1,[$a_ptr,#4]
299	ldr	$a2,[$a_ptr,#8]
300	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
301	ldr	$a3,[$a_ptr,#12]
302	adcs	$a1,$a1,$a1
303	ldr	$a4,[$a_ptr,#16]
304	adcs	$a2,$a2,$a2
305	ldr	$a5,[$a_ptr,#20]
306	adcs	$a3,$a3,$a3
307	ldr	$a6,[$a_ptr,#24]
308	adcs	$a4,$a4,$a4
309	ldr	$a7,[$a_ptr,#28]
310	adcs	$a5,$a5,$a5
311	adcs	$a6,$a6,$a6
312	mov	$ff,#0
313	adcs	$a7,$a7,$a7
314	adc	$ff,$ff,#0
315
316	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
317	sbcs	$a1,$a1,#-1
318	sbcs	$a2,$a2,#-1
319	sbcs	$a3,$a3,#0
320	sbcs	$a4,$a4,#0
321	sbcs	$a5,$a5,#0
322	sbcs	$a6,$a6,#1
323	sbcs	$a7,$a7,#-1
324	sbc	$ff,$ff,#0
325
326	adds	$a0,$a0,$ff		@ add synthesized modulus
327	adcs	$a1,$a1,$ff
328	adcs	$a2,$a2,$ff
329	adcs	$a3,$a3,#0
330	adcs	$a4,$a4,#0
331	 ldr	$b_ptr,[$a_ptr,#0]
332	adcs	$a5,$a5,#0
333	 ldr	$t1,[$a_ptr,#4]
334	adcs	$a6,$a6,$ff,lsr#31
335	 ldr	$t2,[$a_ptr,#8]
336	adc	$a7,$a7,$ff
337
338	ldr	$t0,[$a_ptr,#12]
339	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
340	ldr	$b_ptr,[$a_ptr,#16]
341	adcs	$a1,$a1,$t1
342	ldr	$t1,[$a_ptr,#20]
343	adcs	$a2,$a2,$t2
344	ldr	$t2,[$a_ptr,#24]
345	adcs	$a3,$a3,$t0
346	ldr	$t3,[$a_ptr,#28]
347	adcs	$a4,$a4,$b_ptr
348	adcs	$a5,$a5,$t1
349	adcs	$a6,$a6,$t2
350	mov	$ff,#0
351	adcs	$a7,$a7,$t3
352	adc	$ff,$ff,#0
353	ldr	lr,[sp],#4		@ pop lr
354
355	b	.Lreduce_by_sub
356.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
357
358@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
359.globl	ecp_nistz256_div_by_2
360.type	ecp_nistz256_div_by_2,%function
361.align	4
362ecp_nistz256_div_by_2:
363	stmdb	sp!,{r4-r12,lr}
364	bl	__ecp_nistz256_div_by_2
365#if __ARM_ARCH__>=5 || !defined(__thumb__)
366	ldmia	sp!,{r4-r12,pc}
367#else
368	ldmia	sp!,{r4-r12,lr}
369	bx	lr			@ interoperable with Thumb ISA:-)
370#endif
371.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
372
373.type	__ecp_nistz256_div_by_2,%function
374.align	4
375__ecp_nistz256_div_by_2:
376	@ ret = (a is odd ? a+mod : a) >> 1
377
378	ldr	$a0,[$a_ptr,#0]
379	ldr	$a1,[$a_ptr,#4]
380	ldr	$a2,[$a_ptr,#8]
381	mov	$ff,$a0,lsl#31		@ place least significant bit to most
382					@ significant position, now arithmetic
383					@ right shift by 31 will produce -1 or
384					@ 0, while logical right shift 1 or 0,
385					@ this is how modulus is conditionally
386					@ synthesized in this case...
387	ldr	$a3,[$a_ptr,#12]
388	adds	$a0,$a0,$ff,asr#31
389	ldr	$a4,[$a_ptr,#16]
390	adcs	$a1,$a1,$ff,asr#31
391	ldr	$a5,[$a_ptr,#20]
392	adcs	$a2,$a2,$ff,asr#31
393	ldr	$a6,[$a_ptr,#24]
394	adcs	$a3,$a3,#0
395	ldr	$a7,[$a_ptr,#28]
396	adcs	$a4,$a4,#0
397	 mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
398					@ because it doesn't affect flags
399	adcs	$a5,$a5,#0
400	 orr	$a0,$a0,$a1,lsl#31
401	adcs	$a6,$a6,$ff,lsr#31
402	mov	$b_ptr,#0
403	adcs	$a7,$a7,$ff,asr#31
404	 mov	$a1,$a1,lsr#1
405	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition
406
407	orr	$a1,$a1,$a2,lsl#31
408	mov	$a2,$a2,lsr#1
409	str	$a0,[$r_ptr,#0]
410	orr	$a2,$a2,$a3,lsl#31
411	mov	$a3,$a3,lsr#1
412	str	$a1,[$r_ptr,#4]
413	orr	$a3,$a3,$a4,lsl#31
414	mov	$a4,$a4,lsr#1
415	str	$a2,[$r_ptr,#8]
416	orr	$a4,$a4,$a5,lsl#31
417	mov	$a5,$a5,lsr#1
418	str	$a3,[$r_ptr,#12]
419	orr	$a5,$a5,$a6,lsl#31
420	mov	$a6,$a6,lsr#1
421	str	$a4,[$r_ptr,#16]
422	orr	$a6,$a6,$a7,lsl#31
423	mov	$a7,$a7,lsr#1
424	str	$a5,[$r_ptr,#20]
425	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
426	str	$a6,[$r_ptr,#24]
427	str	$a7,[$r_ptr,#28]
428
429	mov	pc,lr
430.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
431
432@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
433@				        const BN_ULONG r2[8]);
434.globl	ecp_nistz256_sub
435.type	ecp_nistz256_sub,%function
436.align	4
437ecp_nistz256_sub:
438	stmdb	sp!,{r4-r12,lr}
439	bl	__ecp_nistz256_sub
440#if __ARM_ARCH__>=5 || !defined(__thumb__)
441	ldmia	sp!,{r4-r12,pc}
442#else
443	ldmia	sp!,{r4-r12,lr}
444	bx	lr			@ interoperable with Thumb ISA:-)
445#endif
446.size	ecp_nistz256_sub,.-ecp_nistz256_sub
447
448.type	__ecp_nistz256_sub,%function
449.align	4
450__ecp_nistz256_sub:
451	str	lr,[sp,#-4]!		@ push lr
452
453	ldr	$a0,[$a_ptr,#0]
454	ldr	$a1,[$a_ptr,#4]
455	ldr	$a2,[$a_ptr,#8]
456	ldr	$a3,[$a_ptr,#12]
457	ldr	$a4,[$a_ptr,#16]
458	 ldr	$t0,[$b_ptr,#0]
459	ldr	$a5,[$a_ptr,#20]
460	 ldr	$t1,[$b_ptr,#4]
461	ldr	$a6,[$a_ptr,#24]
462	 ldr	$t2,[$b_ptr,#8]
463	ldr	$a7,[$a_ptr,#28]
464	 ldr	$t3,[$b_ptr,#12]
465	subs	$a0,$a0,$t0
466	 ldr	$t0,[$b_ptr,#16]
467	sbcs	$a1,$a1,$t1
468	 ldr	$t1,[$b_ptr,#20]
469	sbcs	$a2,$a2,$t2
470	 ldr	$t2,[$b_ptr,#24]
471	sbcs	$a3,$a3,$t3
472	 ldr	$t3,[$b_ptr,#28]
473	sbcs	$a4,$a4,$t0
474	sbcs	$a5,$a5,$t1
475	sbcs	$a6,$a6,$t2
476	sbcs	$a7,$a7,$t3
477	sbc	$ff,$ff,$ff		@ broadcast borrow bit
478	ldr	lr,[sp],#4		@ pop lr
479
480.Lreduce_by_add:
481
482	@ if a-b borrows, add modulus.
483	@
484	@ Note that because mod has special form, i.e. consists of
485	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
486	@ broadcasting borrow bit to a register, $ff, and using it as
487	@ a whole or extracting single bit.
488
489	adds	$a0,$a0,$ff		@ add synthesized modulus
490	adcs	$a1,$a1,$ff
491	str	$a0,[$r_ptr,#0]
492	adcs	$a2,$a2,$ff
493	str	$a1,[$r_ptr,#4]
494	adcs	$a3,$a3,#0
495	str	$a2,[$r_ptr,#8]
496	adcs	$a4,$a4,#0
497	str	$a3,[$r_ptr,#12]
498	adcs	$a5,$a5,#0
499	str	$a4,[$r_ptr,#16]
500	adcs	$a6,$a6,$ff,lsr#31
501	str	$a5,[$r_ptr,#20]
502	adcs	$a7,$a7,$ff
503	str	$a6,[$r_ptr,#24]
504	str	$a7,[$r_ptr,#28]
505
506	mov	pc,lr
507.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
508
509@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
510.globl	ecp_nistz256_neg
511.type	ecp_nistz256_neg,%function
512.align	4
513ecp_nistz256_neg:
514	stmdb	sp!,{r4-r12,lr}
515	bl	__ecp_nistz256_neg
516#if __ARM_ARCH__>=5 || !defined(__thumb__)
517	ldmia	sp!,{r4-r12,pc}
518#else
519	ldmia	sp!,{r4-r12,lr}
520	bx	lr			@ interoperable with Thumb ISA:-)
521#endif
522.size	ecp_nistz256_neg,.-ecp_nistz256_neg
523
524.type	__ecp_nistz256_neg,%function
525.align	4
526__ecp_nistz256_neg:
527	ldr	$a0,[$a_ptr,#0]
528	eor	$ff,$ff,$ff
529	ldr	$a1,[$a_ptr,#4]
530	ldr	$a2,[$a_ptr,#8]
531	subs	$a0,$ff,$a0
532	ldr	$a3,[$a_ptr,#12]
533	sbcs	$a1,$ff,$a1
534	ldr	$a4,[$a_ptr,#16]
535	sbcs	$a2,$ff,$a2
536	ldr	$a5,[$a_ptr,#20]
537	sbcs	$a3,$ff,$a3
538	ldr	$a6,[$a_ptr,#24]
539	sbcs	$a4,$ff,$a4
540	ldr	$a7,[$a_ptr,#28]
541	sbcs	$a5,$ff,$a5
542	sbcs	$a6,$ff,$a6
543	sbcs	$a7,$ff,$a7
544	sbc	$ff,$ff,$ff
545
546	b	.Lreduce_by_add
547.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
548___
549{
550my @acc=map("r$_",(3..11));
551my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
552
553$code.=<<___;
554@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
555.globl	ecp_nistz256_sqr_mont
556.type	ecp_nistz256_sqr_mont,%function
557.align	4
558ecp_nistz256_sqr_mont:
559	mov	$b_ptr,$a_ptr
560	b	.Lecp_nistz256_mul_mont
561.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
562
563@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
564@					     const BN_ULONG r2[8]);
565.globl	ecp_nistz256_mul_mont
566.type	ecp_nistz256_mul_mont,%function
567.align	4
568ecp_nistz256_mul_mont:
569.Lecp_nistz256_mul_mont:
570	stmdb	sp!,{r4-r12,lr}
571	bl	__ecp_nistz256_mul_mont
572#if __ARM_ARCH__>=5 || !defined(__thumb__)
573	ldmia	sp!,{r4-r12,pc}
574#else
575	ldmia	sp!,{r4-r12,lr}
576	bx	lr			@ interoperable with Thumb ISA:-)
577#endif
578.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
579
580.type	__ecp_nistz256_mul_mont,%function
581.align	4
582__ecp_nistz256_mul_mont:
583	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too
584
585	ldr	$bj,[$b_ptr,#0]			@ b[0]
586	ldmia	$a_ptr,{@acc[1]-@acc[8]}
587
588	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
589	stmdb	sp!,{$acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
590						@ that it can be addressed
591						@ without spending register
592						@ on address
593	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
594	umull	@acc[2],$t1,@acc[3],$bj
595	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
596	umull	@acc[3],$t2,@acc[4],$bj
597	adcs	@acc[2],@acc[2],$t0
598	umull	@acc[4],$t3,@acc[5],$bj
599	adcs	@acc[3],@acc[3],$t1
600	umull	@acc[5],$t0,@acc[6],$bj
601	adcs	@acc[4],@acc[4],$t2
602	umull	@acc[6],$t1,@acc[7],$bj
603	adcs	@acc[5],@acc[5],$t3
604	umull	@acc[7],$t2,@acc[8],$bj
605	adcs	@acc[6],@acc[6],$t0
606	adcs	@acc[7],@acc[7],$t1
607	eor	$t3,$t3,$t3			@ first overflow bit is zero
608	adc	@acc[8],$t2,#0
609___
610for(my $i=1;$i<8;$i++) {
611my $t4=@acc[0];
612
613	# Reduction iteration is normally performed by accumulating
614	# result of multiplication of modulus by "magic" digit [and
615	# omitting least significant word, which is guaranteed to
616	# be 0], but thanks to special form of modulus and "magic"
617	# digit being equal to least significant word, it can be
618	# performed with additions and subtractions alone. Indeed:
619	#
620	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
621	# *                                         abcd
622	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
623	#
624	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
625	# rewrite above as:
626	#
627	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
628	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
629	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
630	#
631	# or marking redundant operations:
632	#
633	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
634	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
635	# -      abcd.----.----.----.----.----.----.----
636
637$code.=<<___;
638	@ multiplication-less reduction $i
639	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
640	 ldr	$bj,[sp,#40]			@ restore b_ptr
641	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
642	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
643	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
644	 ldr	$t1,[sp,#0]			@ load a[0]
645	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
646	 ldr	$bj,[$bj,#4*$i]			@ load b[i]
647	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
648	 eor	$t0,$t0,$t0
649	adc	$t3,$t3,#0			@ overflow bit
650	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
651	 ldr	$t2,[sp,#4]			@ a[1]
652	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
653	 umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
654	 eor	$t1,$t1,$t1
655	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
656						@ that netto result is
657						@ addition of a value which
658						@ makes underflow impossible
659
660	ldr	$t3,[sp,#8]			@ a[2]
661	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
662	 str	@acc[0],[sp,#36]		@ temporarily offload overflow
663	eor	$t2,$t2,$t2
664	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
665	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
666	eor	$t3,$t3,$t3
667	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
668	ldr	$t0,[sp,#16]			@ a[4]
669	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
670	eor	$t4,$t4,$t4
671	adcs	@acc[3],@acc[3],$t1
672	ldr	$t1,[sp,#20]			@ a[5]
673	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
674	eor	$t0,$t0,$t0
675	adcs	@acc[4],@acc[4],$t2
676	ldr	$t2,[sp,#24]			@ a[6]
677	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
678	eor	$t1,$t1,$t1
679	adcs	@acc[5],@acc[5],$t3
680	ldr	$t3,[sp,#28]			@ a[7]
681	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
682	eor	$t2,$t2,$t2
683	adcs	@acc[6],@acc[6],$t4
684	 ldr	@acc[0],[sp,#36]		@ restore overflow bit
685	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
686	eor	$t3,$t3,$t3
687	adcs	@acc[7],@acc[7],$t0
688	adcs	@acc[8],@acc[8],$t1
689	adcs	@acc[0],$acc[0],$t2
690	adc	$t3,$t3,#0			@ new overflow bit
691___
692	push(@acc,shift(@acc));			# rotate registers, so that
693						# "r[i]" becomes r[i]
694}
695$code.=<<___;
696	@ last multiplication-less reduction
697	adds	@acc[3],@acc[3],@acc[0]
698	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
699	adcs	@acc[4],@acc[4],#0
700	adcs	@acc[5],@acc[5],#0
701	adcs	@acc[6],@acc[6],@acc[0]
702	adcs	@acc[7],@acc[7],#0
703	adcs	@acc[8],@acc[8],@acc[0]
704	adc	$t3,$t3,#0
705	subs	@acc[7],@acc[7],@acc[0]
706	sbcs	@acc[8],@acc[8],#0
707	sbc	@acc[0],$t3,#0			@ overflow bit
708
709	@ Final step is "if result > mod, subtract mod", but we do it
710	@ "other way around", namely subtract modulus from result
711	@ and if it borrowed, add modulus back.
712
713	adds	@acc[1],@acc[1],#1		@ subs	@acc[1],@acc[1],#-1
714	adcs	@acc[2],@acc[2],#0		@ sbcs	@acc[2],@acc[2],#-1
715	adcs	@acc[3],@acc[3],#0		@ sbcs	@acc[3],@acc[3],#-1
716	sbcs	@acc[4],@acc[4],#0
717	sbcs	@acc[5],@acc[5],#0
718	sbcs	@acc[6],@acc[6],#0
719	sbcs	@acc[7],@acc[7],#1
720	adcs	@acc[8],@acc[8],#0		@ sbcs	@acc[8],@acc[8],#-1
721	ldr	lr,[sp,#44]			@ restore lr
722	sbc	@acc[0],@acc[0],#0		@ broadcast borrow bit
723	add	sp,sp,#48
724
725	@ Note that because mod has special form, i.e. consists of
726	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
727	@ broadcasting borrow bit to a register, @acc[0], and using it as
728	@ a whole or extracting single bit.
729
730	adds	@acc[1],@acc[1],@acc[0]		@ add modulus or zero
731	adcs	@acc[2],@acc[2],@acc[0]
732	str	@acc[1],[$r_ptr,#0]
733	adcs	@acc[3],@acc[3],@acc[0]
734	str	@acc[2],[$r_ptr,#4]
735	adcs	@acc[4],@acc[4],#0
736	str	@acc[3],[$r_ptr,#8]
737	adcs	@acc[5],@acc[5],#0
738	str	@acc[4],[$r_ptr,#12]
739	adcs	@acc[6],@acc[6],#0
740	str	@acc[5],[$r_ptr,#16]
741	adcs	@acc[7],@acc[7],@acc[0],lsr#31
742	str	@acc[6],[$r_ptr,#20]
743	adc	@acc[8],@acc[8],@acc[0]
744	str	@acc[7],[$r_ptr,#24]
745	str	@acc[8],[$r_ptr,#28]
746
747	mov	pc,lr
748.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
749___
750}
751
752{
753my ($out,$inp,$index,$mask)=map("r$_",(0..3));
754$code.=<<___;
755@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
756@					 int r2);
757.globl	ecp_nistz256_scatter_w5
758.type	ecp_nistz256_scatter_w5,%function
759.align	5
760ecp_nistz256_scatter_w5:
761	stmdb	sp!,{r4-r11}
762
763	add	$out,$out,$index,lsl#2
764
765	ldmia	$inp!,{r4-r11}		@ X
766	str	r4,[$out,#64*0-4]
767	str	r5,[$out,#64*1-4]
768	str	r6,[$out,#64*2-4]
769	str	r7,[$out,#64*3-4]
770	str	r8,[$out,#64*4-4]
771	str	r9,[$out,#64*5-4]
772	str	r10,[$out,#64*6-4]
773	str	r11,[$out,#64*7-4]
774	add	$out,$out,#64*8
775
776	ldmia	$inp!,{r4-r11}		@ Y
777	str	r4,[$out,#64*0-4]
778	str	r5,[$out,#64*1-4]
779	str	r6,[$out,#64*2-4]
780	str	r7,[$out,#64*3-4]
781	str	r8,[$out,#64*4-4]
782	str	r9,[$out,#64*5-4]
783	str	r10,[$out,#64*6-4]
784	str	r11,[$out,#64*7-4]
785	add	$out,$out,#64*8
786
787	ldmia	$inp,{r4-r11}		@ Z
788	str	r4,[$out,#64*0-4]
789	str	r5,[$out,#64*1-4]
790	str	r6,[$out,#64*2-4]
791	str	r7,[$out,#64*3-4]
792	str	r8,[$out,#64*4-4]
793	str	r9,[$out,#64*5-4]
794	str	r10,[$out,#64*6-4]
795	str	r11,[$out,#64*7-4]
796
797	ldmia	sp!,{r4-r11}
798#if __ARM_ARCH__>=5 || defined(__thumb__)
799	bx	lr
800#else
801	mov	pc,lr
802#endif
803.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
804
805@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
806@					      int r2);
807.globl	ecp_nistz256_gather_w5
808.type	ecp_nistz256_gather_w5,%function
809.align	5
810ecp_nistz256_gather_w5:
811	stmdb	sp!,{r4-r11}
812
813	cmp	$index,#0
814	mov	$mask,#0
815#ifdef	__thumb2__
816	itt	ne
817#endif
818	subne	$index,$index,#1
819	movne	$mask,#-1
820	add	$inp,$inp,$index,lsl#2
821
822	ldr	r4,[$inp,#64*0]
823	ldr	r5,[$inp,#64*1]
824	ldr	r6,[$inp,#64*2]
825	and	r4,r4,$mask
826	ldr	r7,[$inp,#64*3]
827	and	r5,r5,$mask
828	ldr	r8,[$inp,#64*4]
829	and	r6,r6,$mask
830	ldr	r9,[$inp,#64*5]
831	and	r7,r7,$mask
832	ldr	r10,[$inp,#64*6]
833	and	r8,r8,$mask
834	ldr	r11,[$inp,#64*7]
835	add	$inp,$inp,#64*8
836	and	r9,r9,$mask
837	and	r10,r10,$mask
838	and	r11,r11,$mask
839	stmia	$out!,{r4-r11}	@ X
840
841	ldr	r4,[$inp,#64*0]
842	ldr	r5,[$inp,#64*1]
843	ldr	r6,[$inp,#64*2]
844	and	r4,r4,$mask
845	ldr	r7,[$inp,#64*3]
846	and	r5,r5,$mask
847	ldr	r8,[$inp,#64*4]
848	and	r6,r6,$mask
849	ldr	r9,[$inp,#64*5]
850	and	r7,r7,$mask
851	ldr	r10,[$inp,#64*6]
852	and	r8,r8,$mask
853	ldr	r11,[$inp,#64*7]
854	add	$inp,$inp,#64*8
855	and	r9,r9,$mask
856	and	r10,r10,$mask
857	and	r11,r11,$mask
858	stmia	$out!,{r4-r11}	@ Y
859
860	ldr	r4,[$inp,#64*0]
861	ldr	r5,[$inp,#64*1]
862	ldr	r6,[$inp,#64*2]
863	and	r4,r4,$mask
864	ldr	r7,[$inp,#64*3]
865	and	r5,r5,$mask
866	ldr	r8,[$inp,#64*4]
867	and	r6,r6,$mask
868	ldr	r9,[$inp,#64*5]
869	and	r7,r7,$mask
870	ldr	r10,[$inp,#64*6]
871	and	r8,r8,$mask
872	ldr	r11,[$inp,#64*7]
873	and	r9,r9,$mask
874	and	r10,r10,$mask
875	and	r11,r11,$mask
876	stmia	$out,{r4-r11}		@ Z
877
878	ldmia	sp!,{r4-r11}
879#if __ARM_ARCH__>=5 || defined(__thumb__)
880	bx	lr
881#else
882	mov	pc,lr
883#endif
884.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
885
886@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
887@					 int r2);
888.globl	ecp_nistz256_scatter_w7
889.type	ecp_nistz256_scatter_w7,%function
890.align	5
891ecp_nistz256_scatter_w7:
892	add	$out,$out,$index
893	mov	$index,#64/4
894.Loop_scatter_w7:
895	ldr	$mask,[$inp],#4
896	subs	$index,$index,#1
897	strb	$mask,[$out,#64*0]
898	mov	$mask,$mask,lsr#8
899	strb	$mask,[$out,#64*1]
900	mov	$mask,$mask,lsr#8
901	strb	$mask,[$out,#64*2]
902	mov	$mask,$mask,lsr#8
903	strb	$mask,[$out,#64*3]
904	add	$out,$out,#64*4
905	bne	.Loop_scatter_w7
906
907#if __ARM_ARCH__>=5 || defined(__thumb__)
908	bx	lr
909#else
910	mov	pc,lr
911#endif
912.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
913
914@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
915@						     int r2);
916.globl	ecp_nistz256_gather_w7
917.type	ecp_nistz256_gather_w7,%function
918.align	5
919ecp_nistz256_gather_w7:
920	stmdb	sp!,{r4-r7}
921
922	cmp	$index,#0
923	mov	$mask,#0
924#ifdef	__thumb2__
925	itt	ne
926#endif
927	subne	$index,$index,#1
928	movne	$mask,#-1
929	add	$inp,$inp,$index
930	mov	$index,#64/4
931	nop
932.Loop_gather_w7:
933	ldrb	r4,[$inp,#64*0]
934	subs	$index,$index,#1
935	ldrb	r5,[$inp,#64*1]
936	ldrb	r6,[$inp,#64*2]
937	ldrb	r7,[$inp,#64*3]
938	add	$inp,$inp,#64*4
939	orr	r4,r4,r5,lsl#8
940	orr	r4,r4,r6,lsl#16
941	orr	r4,r4,r7,lsl#24
942	and	r4,r4,$mask
943	str	r4,[$out],#4
944	bne	.Loop_gather_w7
945
946	ldmia	sp!,{r4-r7}
947#if __ARM_ARCH__>=5 || defined(__thumb__)
948	bx	lr
949#else
950	mov	pc,lr
951#endif
952.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
953___
954}
955if (0) {
956# In comparison to integer-only equivalent of below subroutine:
957#
958# Cortex-A8	+10%
959# Cortex-A9	-10%
960# Snapdragon S4	+5%
961#
962# As not all time is spent in multiplication, overall impact is deemed
963# too low to care about.
964
965my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
966my $mask="q4";
967my $mult="q5";
968my @AxB=map("q$_",(8..15));
969
970my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
971
972$code.=<<___;
973#if __ARM_ARCH__>=7
974.fpu	neon
975
976.globl	ecp_nistz256_mul_mont_neon
977.type	ecp_nistz256_mul_mont_neon,%function
978.align	5
979ecp_nistz256_mul_mont_neon:
980	mov	ip,sp
981	stmdb	sp!,{r4-r9}
982	vstmdb	sp!,{q4-q5}		@ ABI specification says so
983
984	sub		$toutptr,sp,#40
985	vld1.32		{${Bi}[0]},[$bptr,:32]!
986	veor		$zero,$zero,$zero
987	vld1.32		{$A0-$A3}, [$aptr]		@ can't specify :32 :-(
988	vzip.16		$Bi,$zero
989	mov		sp,$toutptr			@ alloca
990	vmov.i64	$mask,#0xffff
991
992	vmull.u32	@AxB[0],$Bi,${A0}[0]
993	vmull.u32	@AxB[1],$Bi,${A0}[1]
994	vmull.u32	@AxB[2],$Bi,${A1}[0]
995	vmull.u32	@AxB[3],$Bi,${A1}[1]
996	 vshr.u64	$temp,@AxB[0]#lo,#16
997	vmull.u32	@AxB[4],$Bi,${A2}[0]
998	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
999	vmull.u32	@AxB[5],$Bi,${A2}[1]
1000	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 32 bits of a[0]*b[0]
1001	vmull.u32	@AxB[6],$Bi,${A3}[0]
1002	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1003	vmull.u32	@AxB[7],$Bi,${A3}[1]
1004___
1005for($i=1;$i<8;$i++) {
1006$code.=<<___;
1007	 vld1.32	{${Bi}[0]},[$bptr,:32]!
1008	 veor		$zero,$zero,$zero
1009	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
1010	vshl.u64	$mult,@AxB[0],#32
1011	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1012	vsub.u64	$mult,$mult,@AxB[0]
1013	 vzip.16	$Bi,$zero
1014	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1015	vadd.u64	@AxB[7],@AxB[7],$mult
1016___
1017	push(@AxB,shift(@AxB));
1018$code.=<<___;
1019	vmlal.u32	@AxB[0],$Bi,${A0}[0]
1020	vmlal.u32	@AxB[1],$Bi,${A0}[1]
1021	vmlal.u32	@AxB[2],$Bi,${A1}[0]
1022	vmlal.u32	@AxB[3],$Bi,${A1}[1]
1023	 vshr.u64	$temp,@AxB[0]#lo,#16
1024	vmlal.u32	@AxB[4],$Bi,${A2}[0]
1025	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1026	vmlal.u32	@AxB[5],$Bi,${A2}[1]
1027	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 33 bits of a[0]*b[i]+t[0]
1028	vmlal.u32	@AxB[6],$Bi,${A3}[0]
1029	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1030	vmull.u32	@AxB[7],$Bi,${A3}[1]
1031___
1032}
1033$code.=<<___;
1034	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
1035	vshl.u64	$mult,@AxB[0],#32
1036	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1037	vsub.u64	$mult,$mult,@AxB[0]
1038	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1039	vadd.u64	@AxB[7],@AxB[7],$mult
1040
1041	vshr.u64	$temp,@AxB[1]#lo,#16		@ convert
1042	vadd.u64	@AxB[1]#hi,@AxB[1]#hi,$temp
1043	vshr.u64	$temp,@AxB[1]#hi,#16
1044	vzip.16		@AxB[1]#lo,@AxB[1]#hi
1045___
1046foreach (2..7) {
1047$code.=<<___;
1048	vadd.u64	@AxB[$_]#lo,@AxB[$_]#lo,$temp
1049	vst1.32		{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1050	vshr.u64	$temp,@AxB[$_]#lo,#16
1051	vadd.u64	@AxB[$_]#hi,@AxB[$_]#hi,$temp
1052	vshr.u64	$temp,@AxB[$_]#hi,#16
1053	vzip.16		@AxB[$_]#lo,@AxB[$_]#hi
1054___
1055}
1056$code.=<<___;
1057	vst1.32		{@AxB[7]#lo[0]},[$toutptr,:32]!
1058	vst1.32		{$temp},[$toutptr]		@ upper 33 bits
1059
1060	ldr	r1,[sp,#0]
1061	ldr	r2,[sp,#4]
1062	ldr	r3,[sp,#8]
1063	subs	r1,r1,#-1
1064	ldr	r4,[sp,#12]
1065	sbcs	r2,r2,#-1
1066	ldr	r5,[sp,#16]
1067	sbcs	r3,r3,#-1
1068	ldr	r6,[sp,#20]
1069	sbcs	r4,r4,#0
1070	ldr	r7,[sp,#24]
1071	sbcs	r5,r5,#0
1072	ldr	r8,[sp,#28]
1073	sbcs	r6,r6,#0
1074	ldr	r9,[sp,#32]				@ top-most bit
1075	sbcs	r7,r7,#1
1076	sub	sp,ip,#40+16
1077	sbcs	r8,r8,#-1
1078	sbc	r9,r9,#0
1079        vldmia  sp!,{q4-q5}
1080
1081	adds	r1,r1,r9
1082	adcs	r2,r2,r9
1083	str	r1,[$rptr,#0]
1084	adcs	r3,r3,r9
1085	str	r2,[$rptr,#4]
1086	adcs	r4,r4,#0
1087	str	r3,[$rptr,#8]
1088	adcs	r5,r5,#0
1089	str	r4,[$rptr,#12]
1090	adcs	r6,r6,#0
1091	str	r5,[$rptr,#16]
1092	adcs	r7,r7,r9,lsr#31
1093	str	r6,[$rptr,#20]
1094	adcs	r8,r8,r9
1095	str	r7,[$rptr,#24]
1096	str	r8,[$rptr,#28]
1097
1098        ldmia   sp!,{r4-r9}
1099	bx	lr
1100.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1101#endif
1102___
1103}
1104
1105{{{
1106########################################################################
1107# Below $aN assignment matches order in which 256-bit result appears in
1108# register bank at return from __ecp_nistz256_mul_mont, so that we can
1109# skip over reloading it from memory. This means that below functions
1110# use custom calling sequence accepting 256-bit input in registers,
1111# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1112#
1113# See their "normal" counterparts for insights on calculations.
1114
1115my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1116    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1117my $ff=$b_ptr;
1118
1119$code.=<<___;
1120.type	__ecp_nistz256_sub_from,%function
1121.align	5
1122__ecp_nistz256_sub_from:
1123	str	lr,[sp,#-4]!		@ push lr
1124
1125	 ldr	$t0,[$b_ptr,#0]
1126	 ldr	$t1,[$b_ptr,#4]
1127	 ldr	$t2,[$b_ptr,#8]
1128	 ldr	$t3,[$b_ptr,#12]
1129	subs	$a0,$a0,$t0
1130	 ldr	$t0,[$b_ptr,#16]
1131	sbcs	$a1,$a1,$t1
1132	 ldr	$t1,[$b_ptr,#20]
1133	sbcs	$a2,$a2,$t2
1134	 ldr	$t2,[$b_ptr,#24]
1135	sbcs	$a3,$a3,$t3
1136	 ldr	$t3,[$b_ptr,#28]
1137	sbcs	$a4,$a4,$t0
1138	sbcs	$a5,$a5,$t1
1139	sbcs	$a6,$a6,$t2
1140	sbcs	$a7,$a7,$t3
1141	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1142	ldr	lr,[sp],#4		@ pop lr
1143
1144	adds	$a0,$a0,$ff		@ add synthesized modulus
1145	adcs	$a1,$a1,$ff
1146	str	$a0,[$r_ptr,#0]
1147	adcs	$a2,$a2,$ff
1148	str	$a1,[$r_ptr,#4]
1149	adcs	$a3,$a3,#0
1150	str	$a2,[$r_ptr,#8]
1151	adcs	$a4,$a4,#0
1152	str	$a3,[$r_ptr,#12]
1153	adcs	$a5,$a5,#0
1154	str	$a4,[$r_ptr,#16]
1155	adcs	$a6,$a6,$ff,lsr#31
1156	str	$a5,[$r_ptr,#20]
1157	adcs	$a7,$a7,$ff
1158	str	$a6,[$r_ptr,#24]
1159	str	$a7,[$r_ptr,#28]
1160
1161	mov	pc,lr
1162.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1163
1164.type	__ecp_nistz256_sub_morf,%function
1165.align	5
1166__ecp_nistz256_sub_morf:
1167	str	lr,[sp,#-4]!		@ push lr
1168
1169	 ldr	$t0,[$b_ptr,#0]
1170	 ldr	$t1,[$b_ptr,#4]
1171	 ldr	$t2,[$b_ptr,#8]
1172	 ldr	$t3,[$b_ptr,#12]
1173	subs	$a0,$t0,$a0
1174	 ldr	$t0,[$b_ptr,#16]
1175	sbcs	$a1,$t1,$a1
1176	 ldr	$t1,[$b_ptr,#20]
1177	sbcs	$a2,$t2,$a2
1178	 ldr	$t2,[$b_ptr,#24]
1179	sbcs	$a3,$t3,$a3
1180	 ldr	$t3,[$b_ptr,#28]
1181	sbcs	$a4,$t0,$a4
1182	sbcs	$a5,$t1,$a5
1183	sbcs	$a6,$t2,$a6
1184	sbcs	$a7,$t3,$a7
1185	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1186	ldr	lr,[sp],#4		@ pop lr
1187
1188	adds	$a0,$a0,$ff		@ add synthesized modulus
1189	adcs	$a1,$a1,$ff
1190	str	$a0,[$r_ptr,#0]
1191	adcs	$a2,$a2,$ff
1192	str	$a1,[$r_ptr,#4]
1193	adcs	$a3,$a3,#0
1194	str	$a2,[$r_ptr,#8]
1195	adcs	$a4,$a4,#0
1196	str	$a3,[$r_ptr,#12]
1197	adcs	$a5,$a5,#0
1198	str	$a4,[$r_ptr,#16]
1199	adcs	$a6,$a6,$ff,lsr#31
1200	str	$a5,[$r_ptr,#20]
1201	adcs	$a7,$a7,$ff
1202	str	$a6,[$r_ptr,#24]
1203	str	$a7,[$r_ptr,#28]
1204
1205	mov	pc,lr
1206.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1207
1208.type	__ecp_nistz256_add_self,%function
1209.align	4
1210__ecp_nistz256_add_self:
1211	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
1212	adcs	$a1,$a1,$a1
1213	adcs	$a2,$a2,$a2
1214	adcs	$a3,$a3,$a3
1215	adcs	$a4,$a4,$a4
1216	adcs	$a5,$a5,$a5
1217	adcs	$a6,$a6,$a6
1218	mov	$ff,#0
1219	adcs	$a7,$a7,$a7
1220	adc	$ff,$ff,#0
1221
1222	@ if a+b >= modulus, subtract modulus.
1223	@
1224	@ But since comparison implies subtraction, we subtract
1225	@ modulus and then add it back if subtraction borrowed.
1226
1227	subs	$a0,$a0,#-1
1228	sbcs	$a1,$a1,#-1
1229	sbcs	$a2,$a2,#-1
1230	sbcs	$a3,$a3,#0
1231	sbcs	$a4,$a4,#0
1232	sbcs	$a5,$a5,#0
1233	sbcs	$a6,$a6,#1
1234	sbcs	$a7,$a7,#-1
1235	sbc	$ff,$ff,#0
1236
1237	@ Note that because mod has special form, i.e. consists of
1238	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1239	@ using value of borrow as a whole or extracting single bit.
1240	@ Follow $ff register...
1241
1242	adds	$a0,$a0,$ff		@ add synthesized modulus
1243	adcs	$a1,$a1,$ff
1244	str	$a0,[$r_ptr,#0]
1245	adcs	$a2,$a2,$ff
1246	str	$a1,[$r_ptr,#4]
1247	adcs	$a3,$a3,#0
1248	str	$a2,[$r_ptr,#8]
1249	adcs	$a4,$a4,#0
1250	str	$a3,[$r_ptr,#12]
1251	adcs	$a5,$a5,#0
1252	str	$a4,[$r_ptr,#16]
1253	adcs	$a6,$a6,$ff,lsr#31
1254	str	$a5,[$r_ptr,#20]
1255	adcs	$a7,$a7,$ff
1256	str	$a6,[$r_ptr,#24]
1257	str	$a7,[$r_ptr,#28]
1258
1259	mov	pc,lr
1260.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1261
1262___
1263
1264########################################################################
1265# following subroutines are "literal" implementation of those found in
1266# ecp_nistz256.c
1267#
1268########################################################################
1269# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1270#
1271{
1272my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1273# above map() describes stack layout with 5 temporary
1274# 256-bit vectors on top. Then note that we push
1275# starting from r0, which means that we have copy of
1276# input arguments just below these temporary vectors.
1277
1278$code.=<<___;
1279.globl	ecp_nistz256_point_double
1280.type	ecp_nistz256_point_double,%function
1281.align	5
1282ecp_nistz256_point_double:
1283	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1284	sub	sp,sp,#32*5
1285
1286.Lpoint_double_shortcut:
1287	add	r3,sp,#$in_x
1288	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
1289	stmia	r3,{r4-r11}
1290
1291	add	$r_ptr,sp,#$S
1292	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);
1293
1294	add	$b_ptr,$a_ptr,#32
1295	add	$a_ptr,$a_ptr,#32
1296	add	$r_ptr,sp,#$Zsqr
1297	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);
1298
1299	add	$a_ptr,sp,#$S
1300	add	$b_ptr,sp,#$S
1301	add	$r_ptr,sp,#$S
1302	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);
1303
1304	ldr	$b_ptr,[sp,#32*5+4]
1305	add	$a_ptr,$b_ptr,#32
1306	add	$b_ptr,$b_ptr,#64
1307	add	$r_ptr,sp,#$tmp0
1308	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);
1309
1310	ldr	$r_ptr,[sp,#32*5]
1311	add	$r_ptr,$r_ptr,#64
1312	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);
1313
1314	add	$a_ptr,sp,#$in_x
1315	add	$b_ptr,sp,#$Zsqr
1316	add	$r_ptr,sp,#$M
1317	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);
1318
1319	add	$a_ptr,sp,#$in_x
1320	add	$b_ptr,sp,#$Zsqr
1321	add	$r_ptr,sp,#$Zsqr
1322	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);
1323
1324	add	$a_ptr,sp,#$S
1325	add	$b_ptr,sp,#$S
1326	add	$r_ptr,sp,#$tmp0
1327	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);
1328
1329	add	$a_ptr,sp,#$Zsqr
1330	add	$b_ptr,sp,#$M
1331	add	$r_ptr,sp,#$M
1332	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);
1333
1334	ldr	$r_ptr,[sp,#32*5]
1335	add	$a_ptr,sp,#$tmp0
1336	add	$r_ptr,$r_ptr,#32
1337	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);
1338
1339	add	$a_ptr,sp,#$M
1340	add	$r_ptr,sp,#$M
1341	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);
1342
1343	add	$a_ptr,sp,#$in_x
1344	add	$b_ptr,sp,#$S
1345	add	$r_ptr,sp,#$S
1346	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);
1347
1348	add	$r_ptr,sp,#$tmp0
1349	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);
1350
1351	ldr	$r_ptr,[sp,#32*5]
1352	add	$a_ptr,sp,#$M
1353	add	$b_ptr,sp,#$M
1354	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);
1355
1356	add	$b_ptr,sp,#$tmp0
1357	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);
1358
1359	add	$b_ptr,sp,#$S
1360	add	$r_ptr,sp,#$S
1361	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);
1362
1363	add	$a_ptr,sp,#$M
1364	add	$b_ptr,sp,#$S
1365	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);
1366
1367	ldr	$r_ptr,[sp,#32*5]
1368	add	$b_ptr,$r_ptr,#32
1369	add	$r_ptr,$r_ptr,#32
1370	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);
1371
1372	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
1373#if __ARM_ARCH__>=5 || !defined(__thumb__)
1374	ldmia	sp!,{r4-r12,pc}
1375#else
1376	ldmia	sp!,{r4-r12,lr}
1377	bx	lr			@ interoperable with Thumb ISA:-)
1378#endif
1379.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
1380___
1381}
1382
1383########################################################################
1384# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1385#			      const P256_POINT *in2);
1386{
1387my ($res_x,$res_y,$res_z,
1388    $in1_x,$in1_y,$in1_z,
1389    $in2_x,$in2_y,$in2_z,
1390    $H,$Hsqr,$R,$Rsqr,$Hcub,
1391    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1392my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1393# above map() describes stack layout with 18 temporary
1394# 256-bit vectors on top. Then note that we push
1395# starting from r0, which means that we have copy of
1396# input arguments just below these temporary vectors.
1397# We use three of them for !in1infty, !in2intfy and
1398# result of check for zero.
1399
1400$code.=<<___;
1401.globl	ecp_nistz256_point_add
1402.type	ecp_nistz256_point_add,%function
1403.align	5
1404ecp_nistz256_point_add:
1405	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1406	sub	sp,sp,#32*18+16
1407
1408	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1409	add	r3,sp,#$in2_x
1410	stmia	r3!,{r4-r11}
1411	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1412	stmia	r3!,{r4-r11}
1413	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
1414	orr	r12,r4,r5
1415	orr	r12,r12,r6
1416	orr	r12,r12,r7
1417	orr	r12,r12,r8
1418	orr	r12,r12,r9
1419	orr	r12,r12,r10
1420	orr	r12,r12,r11
1421	cmp	r12,#0
1422#ifdef	__thumb2__
1423	it	ne
1424#endif
1425	movne	r12,#-1
1426	stmia	r3,{r4-r11}
1427	str	r12,[sp,#32*18+8]	@ !in2infty
1428
1429	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1430	add	r3,sp,#$in1_x
1431	stmia	r3!,{r4-r11}
1432	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1433	stmia	r3!,{r4-r11}
1434	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1435	orr	r12,r4,r5
1436	orr	r12,r12,r6
1437	orr	r12,r12,r7
1438	orr	r12,r12,r8
1439	orr	r12,r12,r9
1440	orr	r12,r12,r10
1441	orr	r12,r12,r11
1442	cmp	r12,#0
1443#ifdef	__thumb2__
1444	it	ne
1445#endif
1446	movne	r12,#-1
1447	stmia	r3,{r4-r11}
1448	str	r12,[sp,#32*18+4]	@ !in1infty
1449
1450	add	$a_ptr,sp,#$in2_z
1451	add	$b_ptr,sp,#$in2_z
1452	add	$r_ptr,sp,#$Z2sqr
1453	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);
1454
1455	add	$a_ptr,sp,#$in1_z
1456	add	$b_ptr,sp,#$in1_z
1457	add	$r_ptr,sp,#$Z1sqr
1458	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1459
1460	add	$a_ptr,sp,#$in2_z
1461	add	$b_ptr,sp,#$Z2sqr
1462	add	$r_ptr,sp,#$S1
1463	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);
1464
1465	add	$a_ptr,sp,#$in1_z
1466	add	$b_ptr,sp,#$Z1sqr
1467	add	$r_ptr,sp,#$S2
1468	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1469
1470	add	$a_ptr,sp,#$in1_y
1471	add	$b_ptr,sp,#$S1
1472	add	$r_ptr,sp,#$S1
1473	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);
1474
1475	add	$a_ptr,sp,#$in2_y
1476	add	$b_ptr,sp,#$S2
1477	add	$r_ptr,sp,#$S2
1478	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1479
1480	add	$b_ptr,sp,#$S1
1481	add	$r_ptr,sp,#$R
1482	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);
1483
1484	orr	$a0,$a0,$a1		@ see if result is zero
1485	orr	$a2,$a2,$a3
1486	orr	$a4,$a4,$a5
1487	orr	$a0,$a0,$a2
1488	orr	$a4,$a4,$a6
1489	orr	$a0,$a0,$a7
1490	 add	$a_ptr,sp,#$in1_x
1491	orr	$a0,$a0,$a4
1492	 add	$b_ptr,sp,#$Z2sqr
1493	str	$a0,[sp,#32*18+12]
1494
1495	add	$r_ptr,sp,#$U1
1496	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);
1497
1498	add	$a_ptr,sp,#$in2_x
1499	add	$b_ptr,sp,#$Z1sqr
1500	add	$r_ptr,sp,#$U2
1501	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);
1502
1503	add	$b_ptr,sp,#$U1
1504	add	$r_ptr,sp,#$H
1505	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);
1506
1507	orr	$a0,$a0,$a1		@ see if result is zero
1508	orr	$a2,$a2,$a3
1509	orr	$a4,$a4,$a5
1510	orr	$a0,$a0,$a2
1511	orr	$a4,$a4,$a6
1512	orr	$a0,$a0,$a7
1513	orrs	$a0,$a0,$a4
1514
1515	bne	.Ladd_proceed		@ is_equal(U1,U2)?
1516
1517	ldr	$t0,[sp,#32*18+4]
1518	ldr	$t1,[sp,#32*18+8]
1519	ldr	$t2,[sp,#32*18+12]
1520	tst	$t0,$t1
1521	beq	.Ladd_proceed		@ (in1infty || in2infty)?
1522	tst	$t2,$t2
1523	beq	.Ladd_double		@ is_equal(S1,S2)?
1524
1525	ldr	$r_ptr,[sp,#32*18+16]
1526	eor	r4,r4,r4
1527	eor	r5,r5,r5
1528	eor	r6,r6,r6
1529	eor	r7,r7,r7
1530	eor	r8,r8,r8
1531	eor	r9,r9,r9
1532	eor	r10,r10,r10
1533	eor	r11,r11,r11
1534	stmia	$r_ptr!,{r4-r11}
1535	stmia	$r_ptr!,{r4-r11}
1536	stmia	$r_ptr!,{r4-r11}
1537	b	.Ladd_done
1538
1539.align	4
1540.Ladd_double:
1541	ldr	$a_ptr,[sp,#32*18+20]
1542	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
1543	b	.Lpoint_double_shortcut
1544
1545.align	4
1546.Ladd_proceed:
1547	add	$a_ptr,sp,#$R
1548	add	$b_ptr,sp,#$R
1549	add	$r_ptr,sp,#$Rsqr
1550	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1551
1552	add	$a_ptr,sp,#$H
1553	add	$b_ptr,sp,#$in1_z
1554	add	$r_ptr,sp,#$res_z
1555	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1556
1557	add	$a_ptr,sp,#$H
1558	add	$b_ptr,sp,#$H
1559	add	$r_ptr,sp,#$Hsqr
1560	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1561
1562	add	$a_ptr,sp,#$in2_z
1563	add	$b_ptr,sp,#$res_z
1564	add	$r_ptr,sp,#$res_z
1565	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);
1566
1567	add	$a_ptr,sp,#$H
1568	add	$b_ptr,sp,#$Hsqr
1569	add	$r_ptr,sp,#$Hcub
1570	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1571
1572	add	$a_ptr,sp,#$Hsqr
1573	add	$b_ptr,sp,#$U1
1574	add	$r_ptr,sp,#$U2
1575	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);
1576
1577	add	$r_ptr,sp,#$Hsqr
1578	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1579
1580	add	$b_ptr,sp,#$Rsqr
1581	add	$r_ptr,sp,#$res_x
1582	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1583
1584	add	$b_ptr,sp,#$Hcub
1585	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1586
1587	add	$b_ptr,sp,#$U2
1588	add	$r_ptr,sp,#$res_y
1589	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1590
1591	add	$a_ptr,sp,#$Hcub
1592	add	$b_ptr,sp,#$S1
1593	add	$r_ptr,sp,#$S2
1594	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);
1595
1596	add	$a_ptr,sp,#$R
1597	add	$b_ptr,sp,#$res_y
1598	add	$r_ptr,sp,#$res_y
1599	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1600
1601	add	$b_ptr,sp,#$S2
1602	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1603
1604	ldr	r11,[sp,#32*18+4]	@ !in1intfy
1605	ldr	r12,[sp,#32*18+8]	@ !in2intfy
1606	add	r1,sp,#$res_x
1607	add	r2,sp,#$in2_x
1608	and	r10,r11,r12
1609	mvn	r11,r11
1610	add	r3,sp,#$in1_x
1611	and	r11,r11,r12
1612	mvn	r12,r12
1613	ldr	$r_ptr,[sp,#32*18+16]
1614___
1615for($i=0;$i<96;$i+=8) {			# conditional moves
1616$code.=<<___;
1617	ldmia	r1!,{r4-r5}		@ res_x
1618	ldmia	r2!,{r6-r7}		@ in2_x
1619	ldmia	r3!,{r8-r9}		@ in1_x
1620	and	r4,r4,r10
1621	and	r5,r5,r10
1622	and	r6,r6,r11
1623	and	r7,r7,r11
1624	and	r8,r8,r12
1625	and	r9,r9,r12
1626	orr	r4,r4,r6
1627	orr	r5,r5,r7
1628	orr	r4,r4,r8
1629	orr	r5,r5,r9
1630	stmia	$r_ptr!,{r4-r5}
1631___
1632}
1633$code.=<<___;
1634.Ladd_done:
1635	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
1636#if __ARM_ARCH__>=5 || !defined(__thumb__)
1637	ldmia	sp!,{r4-r12,pc}
1638#else
1639	ldmia	sp!,{r4-r12,lr}
1640	bx	lr			@ interoperable with Thumb ISA:-)
1641#endif
1642.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1643___
1644}
1645
1646########################################################################
1647# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1648#				     const P256_POINT_AFFINE *in2);
1649{
1650my ($res_x,$res_y,$res_z,
1651    $in1_x,$in1_y,$in1_z,
1652    $in2_x,$in2_y,
1653    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1654my $Z1sqr = $S2;
1655# above map() describes stack layout with 18 temporary
1656# 256-bit vectors on top. Then note that we push
1657# starting from r0, which means that we have copy of
1658# input arguments just below these temporary vectors.
1659# We use two of them for !in1infty, !in2intfy.
1660
1661my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1662
1663$code.=<<___;
1664.globl	ecp_nistz256_point_add_affine
1665.type	ecp_nistz256_point_add_affine,%function
1666.align	5
1667ecp_nistz256_point_add_affine:
1668	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1669	sub	sp,sp,#32*15
1670
1671	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1672	add	r3,sp,#$in1_x
1673	stmia	r3!,{r4-r11}
1674	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1675	stmia	r3!,{r4-r11}
1676	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1677	orr	r12,r4,r5
1678	orr	r12,r12,r6
1679	orr	r12,r12,r7
1680	orr	r12,r12,r8
1681	orr	r12,r12,r9
1682	orr	r12,r12,r10
1683	orr	r12,r12,r11
1684	cmp	r12,#0
1685#ifdef	__thumb2__
1686	it	ne
1687#endif
1688	movne	r12,#-1
1689	stmia	r3,{r4-r11}
1690	str	r12,[sp,#32*15+4]	@ !in1infty
1691
1692	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1693	add	r3,sp,#$in2_x
1694	orr	r12,r4,r5
1695	orr	r12,r12,r6
1696	orr	r12,r12,r7
1697	orr	r12,r12,r8
1698	orr	r12,r12,r9
1699	orr	r12,r12,r10
1700	orr	r12,r12,r11
1701	stmia	r3!,{r4-r11}
1702	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1703	orr	r12,r12,r4
1704	orr	r12,r12,r5
1705	orr	r12,r12,r6
1706	orr	r12,r12,r7
1707	orr	r12,r12,r8
1708	orr	r12,r12,r9
1709	orr	r12,r12,r10
1710	orr	r12,r12,r11
1711	stmia	r3!,{r4-r11}
1712	cmp	r12,#0
1713#ifdef	__thumb2__
1714	it	ne
1715#endif
1716	movne	r12,#-1
1717	str	r12,[sp,#32*15+8]	@ !in2infty
1718
1719	add	$a_ptr,sp,#$in1_z
1720	add	$b_ptr,sp,#$in1_z
1721	add	$r_ptr,sp,#$Z1sqr
1722	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1723
1724	add	$a_ptr,sp,#$Z1sqr
1725	add	$b_ptr,sp,#$in2_x
1726	add	$r_ptr,sp,#$U2
1727	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);
1728
1729	add	$b_ptr,sp,#$in1_x
1730	add	$r_ptr,sp,#$H
1731	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);
1732
1733	add	$a_ptr,sp,#$Z1sqr
1734	add	$b_ptr,sp,#$in1_z
1735	add	$r_ptr,sp,#$S2
1736	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1737
1738	add	$a_ptr,sp,#$H
1739	add	$b_ptr,sp,#$in1_z
1740	add	$r_ptr,sp,#$res_z
1741	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1742
1743	add	$a_ptr,sp,#$in2_y
1744	add	$b_ptr,sp,#$S2
1745	add	$r_ptr,sp,#$S2
1746	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1747
1748	add	$b_ptr,sp,#$in1_y
1749	add	$r_ptr,sp,#$R
1750	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);
1751
1752	add	$a_ptr,sp,#$H
1753	add	$b_ptr,sp,#$H
1754	add	$r_ptr,sp,#$Hsqr
1755	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1756
1757	add	$a_ptr,sp,#$R
1758	add	$b_ptr,sp,#$R
1759	add	$r_ptr,sp,#$Rsqr
1760	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1761
1762	add	$a_ptr,sp,#$H
1763	add	$b_ptr,sp,#$Hsqr
1764	add	$r_ptr,sp,#$Hcub
1765	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1766
1767	add	$a_ptr,sp,#$Hsqr
1768	add	$b_ptr,sp,#$in1_x
1769	add	$r_ptr,sp,#$U2
1770	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);
1771
1772	add	$r_ptr,sp,#$Hsqr
1773	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1774
1775	add	$b_ptr,sp,#$Rsqr
1776	add	$r_ptr,sp,#$res_x
1777	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1778
1779	add	$b_ptr,sp,#$Hcub
1780	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1781
1782	add	$b_ptr,sp,#$U2
1783	add	$r_ptr,sp,#$res_y
1784	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1785
1786	add	$a_ptr,sp,#$Hcub
1787	add	$b_ptr,sp,#$in1_y
1788	add	$r_ptr,sp,#$S2
1789	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);
1790
1791	add	$a_ptr,sp,#$R
1792	add	$b_ptr,sp,#$res_y
1793	add	$r_ptr,sp,#$res_y
1794	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1795
1796	add	$b_ptr,sp,#$S2
1797	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1798
1799	ldr	r11,[sp,#32*15+4]	@ !in1intfy
1800	ldr	r12,[sp,#32*15+8]	@ !in2intfy
1801	add	r1,sp,#$res_x
1802	add	r2,sp,#$in2_x
1803	and	r10,r11,r12
1804	mvn	r11,r11
1805	add	r3,sp,#$in1_x
1806	and	r11,r11,r12
1807	mvn	r12,r12
1808	ldr	$r_ptr,[sp,#32*15]
1809___
1810for($i=0;$i<64;$i+=8) {			# conditional moves
1811$code.=<<___;
1812	ldmia	r1!,{r4-r5}		@ res_x
1813	ldmia	r2!,{r6-r7}		@ in2_x
1814	ldmia	r3!,{r8-r9}		@ in1_x
1815	and	r4,r4,r10
1816	and	r5,r5,r10
1817	and	r6,r6,r11
1818	and	r7,r7,r11
1819	and	r8,r8,r12
1820	and	r9,r9,r12
1821	orr	r4,r4,r6
1822	orr	r5,r5,r7
1823	orr	r4,r4,r8
1824	orr	r5,r5,r9
1825	stmia	$r_ptr!,{r4-r5}
1826___
1827}
1828for(;$i<96;$i+=8) {
1829my $j=($i-64)/4;
1830$code.=<<___;
1831	ldmia	r1!,{r4-r5}		@ res_z
1832	ldmia	r3!,{r8-r9}		@ in1_z
1833	and	r4,r4,r10
1834	and	r5,r5,r10
1835	and	r6,r11,#@ONE_mont[$j]
1836	and	r7,r11,#@ONE_mont[$j+1]
1837	and	r8,r8,r12
1838	and	r9,r9,r12
1839	orr	r4,r4,r6
1840	orr	r5,r5,r7
1841	orr	r4,r4,r8
1842	orr	r5,r5,r9
1843	stmia	$r_ptr!,{r4-r5}
1844___
1845}
1846$code.=<<___;
1847	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
1848#if __ARM_ARCH__>=5 || !defined(__thumb__)
1849	ldmia	sp!,{r4-r12,pc}
1850#else
1851	ldmia	sp!,{r4-r12,lr}
1852	bx	lr			@ interoperable with Thumb ISA:-)
1853#endif
1854.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1855___
1856}					}}}
1857
1858foreach (split("\n",$code)) {
1859	s/\`([^\`]*)\`/eval $1/geo;
1860
1861	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1862
1863	print $_,"\n";
1864}
1865close STDOUT;	# enforce flush
1866