#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont
# code base and compiler-generated code with inlined umull and even
# umlal instructions. The latter means that this code didn't really
# have an "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.
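
# The routine below provides the bn_mul_mont hook expected by OpenSSL's
# bn layer. In C terms (assuming 32-bit BN_ULONG on this target) the
# contract is roughly
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#			const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# computing rp[] = ap[]*bp[]*R^-1 mod np[] with R = 2^(32*num) and
# *n0 = -np[0]^-1 mod 2^32; a zero return tells the caller to fall back
# to the generic code (here that happens for num < 2).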

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
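
# The resulting stack frame, with offsets relative to &tp[num-1] (the
# value kept in $num after the prologue), looks roughly like this:
#
#	+15*4	num		caller's stack argument
#	+14*4	&n0, then n0	caller's stack argument, later cached value
#	+13*4	bp		saved r2
#	+12*4	rp		saved r0
#	+2*4	{r4-r12,lr}	ten saved registers
#	+4	tp[num]		extra carry word
#	sp	tp[0..num-1]	alloca-ted temporary vector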

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

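@ First pass: tp[] = (ap[]*bp[0] + np[]*m)/2^32, where
@ m = (ap[0]*bp[0]*n0) mod 2^32 and n0 = -np[0]^-1 mod 2^32, so the
@ discarded low word is zero by construction; the carry out of the
@ loop lands in tp[num].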
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

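@ Outer loop: for each subsequent bp[i] add ap[]*bp[i] to tp[] and fold
@ in another np[]*m term, again dropping the zero low word, so tp[]
@ keeps tracking the running Montgomery product.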
.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
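
# Dlo()/Dhi() return the low/high double-word alias of a NEON quad
# register, i.e. qN maps to d(2*N) and d(2*N+1).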

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

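# A rough outline of the NEON path below: each 32-bit word of bp[] (and
# of the per-iteration Montgomery multiplier Ni) is split into 16-bit
# halves with vzip.16, so every vmull/vmlal.u32 accumulates
# 16-bit x 32-bit products into 64-bit lanes. The headroom in those
# lanes means carries only have to be fully propagated once, in the
# .LNEON_tail code at the end.
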
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

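@ Carry propagation: fold the 64-bit lane accumulators back into 32-bit
@ words, 16 bits at a time, storing the packed result ahead of the
@ conditional subtraction below.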
.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

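@ Carry clear here means the subtraction in .LNEON_sub borrowed, i.e.
@ tp[] is already fully reduced; otherwise the difference written to
@ rp[] is the answer. Select accordingly and wipe the scratch area.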
.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT;
