#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# Usage: perl armv4-mont.pl [ignored args ...] [output-file]
#
# Arguments are consumed until one looks like a file name (a word
# character, then word characters or dashes, a dot and an extension);
# that argument becomes the output file.  If none matches, the generated
# assembly is written to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Only redirect when an output name was actually obtained, and fail
# loudly if it cannot be created instead of silently emitting nothing.
if (defined($output) && $output ne "") {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Register allocation for the integer-only code path.  Several names
# deliberately alias the same register (r2) because their live ranges
# do not overlap.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Each of these is a "base,#offset" operand pair that is interpolated
# inside [...] in the assembly template below.
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

# Integer-only bn_mul_mont.  On __ARM_MAX_ARCH__>=7 builds the prologue
# branches to bn_mul8x_mont_neon when num is a multiple of 8 and bit 0
# of OPENSSL_armcap_P is set.  The routine returns 0 in r0 when num<2
# and 1 otherwise.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# Map a NEON quad register name "qN" to the names of its low ("d2N") and
# high ("d2N+1") double-word halves.  Both return "" when the argument
# does not contain a q-register name.  They are invoked with a leading
# '&' (bypassing the empty prototype), both directly below and from the
# `...` snippets in the template that are eval'ed during post-processing.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# NEON register allocation; these lexicals shadow the integer-path
# globals ($n0, $num) for the remainder of this block only.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

# NEON code path, only assembled when __ARM_MAX_ARCH__>=7.  It is reached
# from bn_mul_mont above when num is a multiple of 8.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub	$toutptr,sp,#16
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	sub	$toutptr,$toutptr,$num,lsl#4
	vld1.32	{$A0-$A3}, [$aptr]!	@ can't specify :32 :-(
	and	$toutptr,$toutptr,#-64
	vld1.32	{${M0}[0]}, [$n0,:32]
	mov	sp,$toutptr		@ alloca
	veor	$zero,$zero,$zero
	subs	$inner,$num,#8
	vzip.16	$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor	$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub	$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov	$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov	$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov	$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov	$A2xB,$A3xB
	vmov	$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov	$A4xB,$A5xB
	vmov	$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov	$A6xB,$A7xB
	veor	$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	veor	$zero,$zero,$zero
	vzip.16	$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor	$zero,$zero,$zero
	subs	$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov	$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov	$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov	$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov	$A2xB,$A3xB
	vmov	$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov	$A4xB,$A5xB
	vmov	$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov	$A6xB,$A7xB
	veor	$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov	$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov	$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add	$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16	`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs	$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add	$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub	$aptr,$aptr,$num,lsl#2	@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub	$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor	$Z,$Z,$Z
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64	{$Z}, [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	sub	$nptr,$nptr,$num,lsl#2	@ rewind $nptr
	vld1.32	{$A0-$A3}, [$aptr]!
	veor	$zero,$zero,$zero
	mov	$toutptr,sp
	vzip.16	$Bi,$zero
	sub	$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor	$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs	$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vld1.64	{$A7xB}, [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add	$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub	$aptr,$aptr,$num,lsl#2	@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs	$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov	$toutptr,sp
	mov	$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64	{$A7xB}, [$tinptr, :128]!
	vzip.16	`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32	{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16	`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32	{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16	`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32	{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16	`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32	{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16	`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32	{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16	`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32	{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16	`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32	{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16	`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs	$inner,$inner,#8
	vst1.32	{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]	@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2	@ rewind $nptr
	subs	$aptr,sp,#0		@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr		@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]		@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp		@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11		@ rewind $rptr
	mov	$nptr,$bptr		@ second 3/4th of frame
	sbcs	r10,r10,#0		@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!	@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr		@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret				@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
# Common trailer: identification string and, on __ARM_MAX_ARCH__>=7
# builds, a common-symbol declaration of OPENSSL_armcap_P.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-processing; the order of the three passes matters.
# 1. Replace every `...` snippet in the template with the result of
#    evaluating it as Perl (the &Dlo/&Dhi calls).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Hard-code the encoding of every literal "bx lr".
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
# 3. Only now expand the "ret" pseudo-instruction, so the "bx lr" it
#    produces is left symbolic rather than re-encoded by pass 2.
$code =~ s/\bret\b/bx lr/gm;
print $code;
# Buffered write errors only surface at close; fail the build rather
# than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";