11f13597dSJung-uk Kim#!/usr/bin/env perl
21f13597dSJung-uk Kim
31f13597dSJung-uk Kim# ====================================================================
4*7bded2dbSJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51f13597dSJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and
61f13597dSJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further
71f13597dSJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/.
81f13597dSJung-uk Kim# ====================================================================
91f13597dSJung-uk Kim
101f13597dSJung-uk Kim# January 2007.
111f13597dSJung-uk Kim
121f13597dSJung-uk Kim# Montgomery multiplication for ARMv4.
131f13597dSJung-uk Kim#
141f13597dSJung-uk Kim# Performance improvement naturally varies among CPU implementations
151f13597dSJung-uk Kim# and compilers. The code was observed to provide +65-35% improvement
161f13597dSJung-uk Kim# [depending on key length, less for longer keys] on ARM920T, and
171f13597dSJung-uk Kim# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
181f13597dSJung-uk Kim# base and compiler generated code with in-lined umull and even umlal
191f13597dSJung-uk Kim# instructions. The latter means that this code didn't really have an
201f13597dSJung-uk Kim# "advantage" of utilizing some "secret" instruction.
211f13597dSJung-uk Kim#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it is
# exclusively about decorations; the ABI and instruction syntax are
# identical.
251f13597dSJung-uk Kim
26*7bded2dbSJung-uk Kim# November 2013
27*7bded2dbSJung-uk Kim#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.
40*7bded2dbSJung-uk Kim
# Consume command-line arguments until one looks like an output file name
# ("name.ext"); arguments that do not match the pattern are skipped.  The
# loop also stops, leaving $output false, once the argument list runs out.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when there is something to open.  Without a file name
# the generated assembly goes to the inherited STDOUT.  A failed open is
# fatal: silently falling back to the inherited STDOUT would leave the
# caller without the requested output file and with no error indication.
if (defined($output) && length($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
431f13597dSJung-uk Kim
# Register names used by the integer-only bn_mul_mont code.  They are plain
# strings that get interpolated into the assembly text.
#
# r0-r3: $num starts as num argument, but holds &tp[num-1]; r2 is known
# under three names ($bp, $bi, $rp).
($num,$ap,$bp,$np)=map("r$_",0..3);
$bi=$rp=$bp;
($tp,$aj,$nj,$tj,$n0)=map("r$_",4..8);
# r9 is left alone: reserved by ELF as platform specific, e.g. TLS pointer
($alo,$ahi,$nlo)=("r10","r11","r12");	# sl (gcc uses it to keep @GOT), fp, ip
# r13 is stack pointer
$nhi="r14";				# lr
# r15 is program counter

# Argument block layout relative to &tp[num-1], a.k.a. $num.  There are no
# slots for ap and np: they permanently reside in r1 and r3.
($_rp,$_bp,$_n0,$_num)=map("$num,#$_*4",12..15);
$_bpend=$_num;
681f13597dSJung-uk Kim
# Integer-only code path.  The heredoc below is the assembly text of the
# exported entry point bn_mul_mont; the $-names are the register aliases and
# stack offsets assigned above, interpolated while the string is built.
#
# As read by the code: r0=rp, r1=ap, r2=bp, r3=np, [sp,#0]=pointer to n0,
# [sp,#4]=num (count of 32-bit words).  On return r0 is 0 when num<2 (in
# which case nothing is computed) and 1 otherwise.
#
# Outline:
#   - when assembled with __ARM_MAX_ARCH__>=7, num is a multiple of 8 and
#     bit 0 of OPENSSL_armcap_P is set, control is handed over to
#     bn_mul8x_mont_neon with the original r0/r2 and sp restored;
#   - otherwise r4-r12,lr are saved and num+1 words of scratch (tp) are
#     carved out of the stack; .L1st processes bp[0], .Louter/.Linner the
#     remaining words of bp;
#   - .Lsub stores tp-np to rp, then .Lcopy copies either tp or rp (picked
#     by the final borrow) to rp while overwriting every word of tp.
#
# "ret" and "bx lr" are not final: both are rewritten by the substitutions
# applied to $code at the end of this file.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
243*7bded2dbSJung-uk Kim{
# Dlo(QREG): name of the double register that aliases the low half of the
# NEON quad register named in QREG, i.e. "qN" yields "d(2N)".  Returns an
# empty string when QREG contains no "q<number>".
# No prototype is declared: the sub takes one argument, and an empty
# prototype would only be satisfiable through the &-call form.
sub Dlo {
	my $qreg = shift;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}

# Dhi(QREG): same as Dlo, but for the high half, i.e. "qN" yields "d(2N+1)".
sub Dhi {
	my $qreg = shift;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
246*7bded2dbSJung-uk Kim
# Register names for the NEON code path; lexical to the enclosing block and
# interpolated into the assembly text that follows.

# d0-d3 and d4-d7: the two groups of four double registers that the code
# loads from $aptr and $nptr respectively.
my ($A0,$A1,$A2,$A3, $N0,$N1,$N2,$N3)=map("d$_",0..7);
# q4/q5 are scratch, q6-q13 are the eight 64-bit-lane accumulators.
my ($Z,$Temp, $A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",4..13);
my ($Bi,$Ni,$M0)=("d28","d29","d30");
# Low halves of the two scratch quads.
my ($zero,$temp)=(&Dlo($Z),&Dlo($Temp));

# Core registers; these $n0 and $num are separate lexicals for this block
# and hide the globals of the same name used by the integer code.
my ($rptr,$aptr,$bptr,$nptr,$n0,$num, $tinptr,$toutptr,$inner,$outer)=map("r$_",0..9);
257*7bded2dbSJung-uk Kim
# NEON code path, emitted only under __ARM_MAX_ARCH__>=7.  bn_mul8x_mont_neon
# is not declared .global; in this file it is reached by the branch in
# bn_mul_mont.  r0-r3 are used as rp, ap, bp and np; the remaining two
# parameters (pointer to n0, num) are loaded into r4/r5 from the stack
# location sp pointed at on entry.
#
# Outline:
#   - r4-r11 and d8-d15 are saved, then 16+16*num bytes of scratch are
#     reserved below them and sp is rounded down to a 64-byte boundary;
#   - num==8 is special-cased (.LNEON_outer8) and keeps all accumulators in
#     registers; other sizes go through .LNEON_1st, .LNEON_outer and
#     .LNEON_inner, spilling accumulators to the scratch area;
#   - .LNEON_tail/.LNEON_tail2 propagate carries and store the result as
#     32-bit words, .LNEON_sub subtracts np into rp, and .LNEON_copy_n_zap
#     copies the unsubtracted value to rp when the subtraction borrowed,
#     zeroing the scratch area as it goes.
#
# The &Dlo(...)/&Dhi(...) expressions in backticks are placeholders; they
# are evaluated by the substitution applied to $code at the end of this
# file, as is the rewriting of "ret".
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
663*7bded2dbSJung-uk Kim}
# Trailer: identification string and, for ARMv7+ builds, the common symbol
# that holds the capability word.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-processing of the accumulated assembly text.  The order of the last
# two substitutions matters: literal "bx lr" is turned into its opcode
# first, and only afterwards is "ret" expanded to a real "bx lr", so the
# latter survives as an instruction.
$code =~ s/\`([^\`]*)\`/eval $1/gem;		# evaluate `...` expressions, e.g. &Dlo("q6")
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

# Output errors are fatal.  Buffered write failures (e.g. a full disk) may
# only surface at close time; without the checks a truncated file would be
# left behind with a successful exit status.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";
677