#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# Consume command-line arguments until one looks like a plain output
# file name ("name.ext", no directory part) or the arguments run out.
# Anything in front of it is skipped.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect the generated assembly to the output file when one was named;
# otherwise keep writing to the inherited STDOUT.  Failure to create the
# file is fatal, so that the caller does not end up with a missing or
# stale output file.
if (defined $output && length $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation for the integer-only code path (bn_mul_mont).
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

# Integer-only entry point.  When built for __ARM_MAX_ARCH__>=7 it tests
# num for divisibility by 8 and the NEON bit of OPENSSL_armcap_P, and
# branches to bn_mul8x_mont_neon when both conditions hold.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# Dlo("qN") / Dhi("qN") return the names of the low / high D-register
# halves of a Q register, i.e. "d(2N)" and "d(2N+1)", or "" when the
# argument does not contain a q-register name.  Both are declared with
# an empty prototype, so they have to be invoked as &Dlo(...)/&Dhi(...),
# which bypasses prototype checking.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# NEON register allocation.  These lexicals are private to this block;
# $n0 and $num shadow the globals used by the integer-only code above.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));	# d28-d30 are assigned, d31 is left over
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

# NEON code path.  Back-ticked expressions in the text below are
# evaluated by the post-processing pass at the end of the script.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub	$toutptr,sp,#16
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	sub	$toutptr,$toutptr,$num,lsl#4
	vld1.32	{$A0-$A3}, [$aptr]!		@ can't specify :32 :-(
	and	$toutptr,$toutptr,#-64
	vld1.32	{${M0}[0]}, [$n0,:32]
	mov	sp,$toutptr			@ alloca
	veor	$zero,$zero,$zero
	subs	$inner,$num,#8
	vzip.16	$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor	$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub	$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov	$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov	$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov	$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov	$A2xB,$A3xB
	vmov	$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov	$A4xB,$A5xB
	vmov	$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov	$A6xB,$A7xB
	veor	$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	veor	$zero,$zero,$zero
	vzip.16	$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor	$zero,$zero,$zero
	subs	$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov	$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov	$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov	$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov	$A2xB,$A3xB
	vmov	$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov	$A4xB,$A5xB
	vmov	$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov	$A6xB,$A7xB
	veor	$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov	$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov	$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add	$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16	`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs	$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add	$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub	$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub	$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor	$Z,$Z,$Z
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64	{$Z}, [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	sub	$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32	{$A0-$A3}, [$aptr]!
	veor	$zero,$zero,$zero
	mov	$toutptr,sp
	vzip.16	$Bi,$zero
	sub	$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor	$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs	$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vld1.64	{$A7xB}, [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add	$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub	$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs	$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov	$toutptr,sp
	mov	$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64	{$A7xB}, [$tinptr, :128]!
	vzip.16	`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32	{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16	`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32	{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16	`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32	{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16	`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32	{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16	`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32	{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16	`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32	{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64	{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16	`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32	{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16	`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs	$inner,$inner,#8
	vst1.32	{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-processing passes; their order matters:
#  1. replace every back-ticked expression with the result of eval'ing it;
#  2. replace each literal "bx lr" with the .word encoding below;
#  3. only then turn "ret" into "bx lr", so these are left as mnemonics.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
# Buffered write errors only surface at close time, so treat a failed
# close as fatal rather than silently leaving a truncated output file.
close STDOUT or die "error closing STDOUT: $!";