#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
56e71b7053SJung-uk Kim 57e71b7053SJung-uk Kim$flavour = shift; 58e71b7053SJung-uk Kimif ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } 59e71b7053SJung-uk Kimelse { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } 60e71b7053SJung-uk Kim 61e71b7053SJung-uk Kimif ($flavour && $flavour ne "void") { 62e71b7053SJung-uk Kim $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 63e71b7053SJung-uk Kim ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 64e71b7053SJung-uk Kim ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 65e71b7053SJung-uk Kim die "can't locate arm-xlate.pl"; 66e71b7053SJung-uk Kim 67e71b7053SJung-uk Kim open STDOUT,"| \"$^X\" $xlate $flavour $output"; 68e71b7053SJung-uk Kim} else { 691f13597dSJung-uk Kim open STDOUT,">$output"; 70e71b7053SJung-uk Kim} 711f13597dSJung-uk Kim 721f13597dSJung-uk Kim$num="r0"; # starts as num argument, but holds &tp[num-1] 731f13597dSJung-uk Kim$ap="r1"; 741f13597dSJung-uk Kim$bp="r2"; $bi="r2"; $rp="r2"; 751f13597dSJung-uk Kim$np="r3"; 761f13597dSJung-uk Kim$tp="r4"; 771f13597dSJung-uk Kim$aj="r5"; 781f13597dSJung-uk Kim$nj="r6"; 791f13597dSJung-uk Kim$tj="r7"; 801f13597dSJung-uk Kim$n0="r8"; 811f13597dSJung-uk Kim########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer 821f13597dSJung-uk Kim$alo="r10"; # sl, gcc uses it to keep @GOT 831f13597dSJung-uk Kim$ahi="r11"; # fp 841f13597dSJung-uk Kim$nlo="r12"; # ip 851f13597dSJung-uk Kim########### # r13 is stack pointer 861f13597dSJung-uk Kim$nhi="r14"; # lr 871f13597dSJung-uk Kim########### # r15 is program counter 881f13597dSJung-uk Kim 891f13597dSJung-uk Kim#### argument block layout relative to &tp[num-1], a.k.a. 
$num 901f13597dSJung-uk Kim$_rp="$num,#12*4"; 911f13597dSJung-uk Kim# ap permanently resides in r1 921f13597dSJung-uk Kim$_bp="$num,#13*4"; 931f13597dSJung-uk Kim# np permanently resides in r3 941f13597dSJung-uk Kim$_n0="$num,#14*4"; 951f13597dSJung-uk Kim$_num="$num,#15*4"; $_bpend=$_num; 961f13597dSJung-uk Kim 971f13597dSJung-uk Kim$code=<<___; 987bded2dbSJung-uk Kim#include "arm_arch.h" 997bded2dbSJung-uk Kim 1001f13597dSJung-uk Kim.text 101e71b7053SJung-uk Kim#if defined(__thumb2__) 102e71b7053SJung-uk Kim.syntax unified 103e71b7053SJung-uk Kim.thumb 104e71b7053SJung-uk Kim#else 1057bded2dbSJung-uk Kim.code 32 106e71b7053SJung-uk Kim#endif 1077bded2dbSJung-uk Kim 1087bded2dbSJung-uk Kim#if __ARM_MAX_ARCH__>=7 1097bded2dbSJung-uk Kim.align 5 1107bded2dbSJung-uk Kim.LOPENSSL_armcap: 111e71b7053SJung-uk Kim.word OPENSSL_armcap_P-.Lbn_mul_mont 1127bded2dbSJung-uk Kim#endif 1131f13597dSJung-uk Kim 1141f13597dSJung-uk Kim.global bn_mul_mont 1151f13597dSJung-uk Kim.type bn_mul_mont,%function 1161f13597dSJung-uk Kim 1177bded2dbSJung-uk Kim.align 5 1181f13597dSJung-uk Kimbn_mul_mont: 119e71b7053SJung-uk Kim.Lbn_mul_mont: 1207bded2dbSJung-uk Kim ldr ip,[sp,#4] @ load num 1211f13597dSJung-uk Kim stmdb sp!,{r0,r2} @ sp points at argument block 1227bded2dbSJung-uk Kim#if __ARM_MAX_ARCH__>=7 1237bded2dbSJung-uk Kim tst ip,#7 1247bded2dbSJung-uk Kim bne .Lialu 125e71b7053SJung-uk Kim adr r0,.Lbn_mul_mont 1267bded2dbSJung-uk Kim ldr r2,.LOPENSSL_armcap 1277bded2dbSJung-uk Kim ldr r0,[r0,r2] 128e71b7053SJung-uk Kim#ifdef __APPLE__ 129e71b7053SJung-uk Kim ldr r0,[r0] 130e71b7053SJung-uk Kim#endif 131e71b7053SJung-uk Kim tst r0,#ARMV7_NEON @ NEON available? 
1327bded2dbSJung-uk Kim ldmia sp, {r0,r2} 1337bded2dbSJung-uk Kim beq .Lialu 1347bded2dbSJung-uk Kim add sp,sp,#8 1357bded2dbSJung-uk Kim b bn_mul8x_mont_neon 1367bded2dbSJung-uk Kim.align 4 1377bded2dbSJung-uk Kim.Lialu: 1387bded2dbSJung-uk Kim#endif 1397bded2dbSJung-uk Kim cmp ip,#2 1407bded2dbSJung-uk Kim mov $num,ip @ load num 141e71b7053SJung-uk Kim#ifdef __thumb2__ 142e71b7053SJung-uk Kim ittt lt 143e71b7053SJung-uk Kim#endif 1441f13597dSJung-uk Kim movlt r0,#0 1451f13597dSJung-uk Kim addlt sp,sp,#2*4 1461f13597dSJung-uk Kim blt .Labrt 1471f13597dSJung-uk Kim 1481f13597dSJung-uk Kim stmdb sp!,{r4-r12,lr} @ save 10 registers 1491f13597dSJung-uk Kim 1501f13597dSJung-uk Kim mov $num,$num,lsl#2 @ rescale $num for byte count 1511f13597dSJung-uk Kim sub sp,sp,$num @ alloca(4*num) 1521f13597dSJung-uk Kim sub sp,sp,#4 @ +extra dword 1531f13597dSJung-uk Kim sub $num,$num,#4 @ "num=num-1" 1541f13597dSJung-uk Kim add $tp,$bp,$num @ &bp[num-1] 1551f13597dSJung-uk Kim 1561f13597dSJung-uk Kim add $num,sp,$num @ $num to point at &tp[num-1] 1571f13597dSJung-uk Kim ldr $n0,[$_n0] @ &n0 1581f13597dSJung-uk Kim ldr $bi,[$bp] @ bp[0] 1591f13597dSJung-uk Kim ldr $aj,[$ap],#4 @ ap[0],ap++ 1601f13597dSJung-uk Kim ldr $nj,[$np],#4 @ np[0],np++ 1611f13597dSJung-uk Kim ldr $n0,[$n0] @ *n0 1621f13597dSJung-uk Kim str $tp,[$_bpend] @ save &bp[num] 1631f13597dSJung-uk Kim 1641f13597dSJung-uk Kim umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] 1651f13597dSJung-uk Kim str $n0,[$_n0] @ save n0 value 1661f13597dSJung-uk Kim mul $n0,$alo,$n0 @ "tp[0]"*n0 1671f13597dSJung-uk Kim mov $nlo,#0 1681f13597dSJung-uk Kim umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" 1691f13597dSJung-uk Kim mov $tp,sp 1701f13597dSJung-uk Kim 1711f13597dSJung-uk Kim.L1st: 1721f13597dSJung-uk Kim ldr $aj,[$ap],#4 @ ap[j],ap++ 1731f13597dSJung-uk Kim mov $alo,$ahi 1741f13597dSJung-uk Kim ldr $nj,[$np],#4 @ np[j],np++ 1751f13597dSJung-uk Kim mov $ahi,#0 1761f13597dSJung-uk Kim umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] 1771f13597dSJung-uk 
Kim mov $nhi,#0 1781f13597dSJung-uk Kim umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 1791f13597dSJung-uk Kim adds $nlo,$nlo,$alo 1801f13597dSJung-uk Kim str $nlo,[$tp],#4 @ tp[j-1]=,tp++ 1811f13597dSJung-uk Kim adc $nlo,$nhi,#0 1821f13597dSJung-uk Kim cmp $tp,$num 1831f13597dSJung-uk Kim bne .L1st 1841f13597dSJung-uk Kim 1851f13597dSJung-uk Kim adds $nlo,$nlo,$ahi 1861f13597dSJung-uk Kim ldr $tp,[$_bp] @ restore bp 1871f13597dSJung-uk Kim mov $nhi,#0 1881f13597dSJung-uk Kim ldr $n0,[$_n0] @ restore n0 1891f13597dSJung-uk Kim adc $nhi,$nhi,#0 1901f13597dSJung-uk Kim str $nlo,[$num] @ tp[num-1]= 191e71b7053SJung-uk Kim mov $tj,sp 1921f13597dSJung-uk Kim str $nhi,[$num,#4] @ tp[num]= 1931f13597dSJung-uk Kim 1941f13597dSJung-uk Kim.Louter: 195e71b7053SJung-uk Kim sub $tj,$num,$tj @ "original" $num-1 value 1961f13597dSJung-uk Kim sub $ap,$ap,$tj @ "rewind" ap to &ap[1] 1971f13597dSJung-uk Kim ldr $bi,[$tp,#4]! @ *(++bp) 1981f13597dSJung-uk Kim sub $np,$np,$tj @ "rewind" np to &np[1] 1991f13597dSJung-uk Kim ldr $aj,[$ap,#-4] @ ap[0] 2001f13597dSJung-uk Kim ldr $alo,[sp] @ tp[0] 2011f13597dSJung-uk Kim ldr $nj,[$np,#-4] @ np[0] 2021f13597dSJung-uk Kim ldr $tj,[sp,#4] @ tp[1] 2031f13597dSJung-uk Kim 2041f13597dSJung-uk Kim mov $ahi,#0 2051f13597dSJung-uk Kim umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] 2061f13597dSJung-uk Kim str $tp,[$_bp] @ save bp 2071f13597dSJung-uk Kim mul $n0,$alo,$n0 2081f13597dSJung-uk Kim mov $nlo,#0 2091f13597dSJung-uk Kim umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" 2101f13597dSJung-uk Kim mov $tp,sp 2111f13597dSJung-uk Kim 2121f13597dSJung-uk Kim.Linner: 2131f13597dSJung-uk Kim ldr $aj,[$ap],#4 @ ap[j],ap++ 2141f13597dSJung-uk Kim adds $alo,$ahi,$tj @ +=tp[j] 2151f13597dSJung-uk Kim ldr $nj,[$np],#4 @ np[j],np++ 2161f13597dSJung-uk Kim mov $ahi,#0 2171f13597dSJung-uk Kim umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] 2181f13597dSJung-uk Kim mov $nhi,#0 2191f13597dSJung-uk Kim umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 2201f13597dSJung-uk Kim adc $ahi,$ahi,#0 
2211f13597dSJung-uk Kim ldr $tj,[$tp,#8] @ tp[j+1] 2221f13597dSJung-uk Kim adds $nlo,$nlo,$alo 2231f13597dSJung-uk Kim str $nlo,[$tp],#4 @ tp[j-1]=,tp++ 2241f13597dSJung-uk Kim adc $nlo,$nhi,#0 2251f13597dSJung-uk Kim cmp $tp,$num 2261f13597dSJung-uk Kim bne .Linner 2271f13597dSJung-uk Kim 2281f13597dSJung-uk Kim adds $nlo,$nlo,$ahi 2291f13597dSJung-uk Kim mov $nhi,#0 2301f13597dSJung-uk Kim ldr $tp,[$_bp] @ restore bp 2311f13597dSJung-uk Kim adc $nhi,$nhi,#0 2321f13597dSJung-uk Kim ldr $n0,[$_n0] @ restore n0 2331f13597dSJung-uk Kim adds $nlo,$nlo,$tj 2341f13597dSJung-uk Kim ldr $tj,[$_bpend] @ restore &bp[num] 2351f13597dSJung-uk Kim adc $nhi,$nhi,#0 2361f13597dSJung-uk Kim str $nlo,[$num] @ tp[num-1]= 2371f13597dSJung-uk Kim str $nhi,[$num,#4] @ tp[num]= 2381f13597dSJung-uk Kim 2391f13597dSJung-uk Kim cmp $tp,$tj 240e71b7053SJung-uk Kim#ifdef __thumb2__ 241e71b7053SJung-uk Kim itt ne 242e71b7053SJung-uk Kim#endif 243e71b7053SJung-uk Kim movne $tj,sp 2441f13597dSJung-uk Kim bne .Louter 2451f13597dSJung-uk Kim 2461f13597dSJung-uk Kim ldr $rp,[$_rp] @ pull rp 247e71b7053SJung-uk Kim mov $aj,sp 2481f13597dSJung-uk Kim add $num,$num,#4 @ $num to point at &tp[num] 249e71b7053SJung-uk Kim sub $aj,$num,$aj @ "original" num value 2501f13597dSJung-uk Kim mov $tp,sp @ "rewind" $tp 2511f13597dSJung-uk Kim mov $ap,$tp @ "borrow" $ap 2521f13597dSJung-uk Kim sub $np,$np,$aj @ "rewind" $np to &np[0] 2531f13597dSJung-uk Kim 2541f13597dSJung-uk Kim subs $tj,$tj,$tj @ "clear" carry flag 2551f13597dSJung-uk Kim.Lsub: ldr $tj,[$tp],#4 2561f13597dSJung-uk Kim ldr $nj,[$np],#4 2571f13597dSJung-uk Kim sbcs $tj,$tj,$nj @ tp[j]-np[j] 2581f13597dSJung-uk Kim str $tj,[$rp],#4 @ rp[j]= 2591f13597dSJung-uk Kim teq $tp,$num @ preserve carry 2601f13597dSJung-uk Kim bne .Lsub 2611f13597dSJung-uk Kim sbcs $nhi,$nhi,#0 @ upmost carry 2621f13597dSJung-uk Kim mov $tp,sp @ "rewind" $tp 2631f13597dSJung-uk Kim sub $rp,$rp,$aj @ "rewind" $rp 2641f13597dSJung-uk Kim 265dea77ea6SJung-uk Kim.Lcopy: ldr 
$tj,[$tp] @ conditional copy 266dea77ea6SJung-uk Kim ldr $aj,[$rp] 2671f13597dSJung-uk Kim str sp,[$tp],#4 @ zap tp 268dea77ea6SJung-uk Kim#ifdef __thumb2__ 269dea77ea6SJung-uk Kim it cc 270dea77ea6SJung-uk Kim#endif 271dea77ea6SJung-uk Kim movcc $aj,$tj 272dea77ea6SJung-uk Kim str $aj,[$rp],#4 273dea77ea6SJung-uk Kim teq $tp,$num @ preserve carry 2741f13597dSJung-uk Kim bne .Lcopy 2751f13597dSJung-uk Kim 276e71b7053SJung-uk Kim mov sp,$num 277e71b7053SJung-uk Kim add sp,sp,#4 @ skip over tp[num+1] 2781f13597dSJung-uk Kim ldmia sp!,{r4-r12,lr} @ restore registers 2791f13597dSJung-uk Kim add sp,sp,#2*4 @ skip over {r0,r2} 2801f13597dSJung-uk Kim mov r0,#1 2817bded2dbSJung-uk Kim.Labrt: 2827bded2dbSJung-uk Kim#if __ARM_ARCH__>=5 2837bded2dbSJung-uk Kim ret @ bx lr 2847bded2dbSJung-uk Kim#else 2857bded2dbSJung-uk Kim tst lr,#1 2861f13597dSJung-uk Kim moveq pc,lr @ be binary compatible with V4, yet 2871f13597dSJung-uk Kim bx lr @ interoperable with Thumb ISA:-) 2887bded2dbSJung-uk Kim#endif 2891f13597dSJung-uk Kim.size bn_mul_mont,.-bn_mul_mont 2907bded2dbSJung-uk Kim___ 2917bded2dbSJung-uk Kim{ 2927bded2dbSJung-uk Kimmy ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); 2937bded2dbSJung-uk Kimmy ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); 2947bded2dbSJung-uk Kimmy ($Z,$Temp)=("q4","q5"); 295e71b7053SJung-uk Kimmy @ACC=map("q$_",(6..13)); 2967bded2dbSJung-uk Kimmy ($Bi,$Ni,$M0)=map("d$_",(28..31)); 297e71b7053SJung-uk Kimmy $zero="$Z#lo"; 298e71b7053SJung-uk Kimmy $temp="$Temp#lo"; 2997bded2dbSJung-uk Kim 3007bded2dbSJung-uk Kimmy ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); 301e71b7053SJung-uk Kimmy ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); 3027bded2dbSJung-uk Kim 3037bded2dbSJung-uk Kim$code.=<<___; 3047bded2dbSJung-uk Kim#if __ARM_MAX_ARCH__>=7 3057bded2dbSJung-uk Kim.arch armv7-a 3067bded2dbSJung-uk Kim.fpu neon 3077bded2dbSJung-uk Kim 3087bded2dbSJung-uk Kim.type bn_mul8x_mont_neon,%function 3097bded2dbSJung-uk Kim.align 5 3107bded2dbSJung-uk 
Kimbn_mul8x_mont_neon: 3117bded2dbSJung-uk Kim mov ip,sp 3127bded2dbSJung-uk Kim stmdb sp!,{r4-r11} 3137bded2dbSJung-uk Kim vstmdb sp!,{d8-d15} @ ABI specification says so 3147bded2dbSJung-uk Kim ldmia ip,{r4-r5} @ load rest of parameter block 315e71b7053SJung-uk Kim mov ip,sp 3167bded2dbSJung-uk Kim 317e71b7053SJung-uk Kim cmp $num,#8 318e71b7053SJung-uk Kim bhi .LNEON_8n 319e71b7053SJung-uk Kim 320e71b7053SJung-uk Kim @ special case for $num==8, everything is in register bank... 321e71b7053SJung-uk Kim 3227bded2dbSJung-uk Kim vld1.32 {${Bi}[0]}, [$bptr,:32]! 323e71b7053SJung-uk Kim veor $zero,$zero,$zero 324e71b7053SJung-uk Kim sub $toutptr,sp,$num,lsl#4 3257bded2dbSJung-uk Kim vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( 3267bded2dbSJung-uk Kim and $toutptr,$toutptr,#-64 3277bded2dbSJung-uk Kim vld1.32 {${M0}[0]}, [$n0,:32] 3287bded2dbSJung-uk Kim mov sp,$toutptr @ alloca 3297bded2dbSJung-uk Kim vzip.16 $Bi,$zero 3307bded2dbSJung-uk Kim 331e71b7053SJung-uk Kim vmull.u32 @ACC[0],$Bi,${A0}[0] 332e71b7053SJung-uk Kim vmull.u32 @ACC[1],$Bi,${A0}[1] 333e71b7053SJung-uk Kim vmull.u32 @ACC[2],$Bi,${A1}[0] 334e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 335e71b7053SJung-uk Kim vmull.u32 @ACC[3],$Bi,${A1}[1] 3367bded2dbSJung-uk Kim 337e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 3387bded2dbSJung-uk Kim veor $zero,$zero,$zero 339e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 3407bded2dbSJung-uk Kim 341e71b7053SJung-uk Kim vmull.u32 @ACC[4],$Bi,${A2}[0] 3427bded2dbSJung-uk Kim vld1.32 {$N0-$N3}, [$nptr]! 
343e71b7053SJung-uk Kim vmull.u32 @ACC[5],$Bi,${A2}[1] 344e71b7053SJung-uk Kim vmull.u32 @ACC[6],$Bi,${A3}[0] 3457bded2dbSJung-uk Kim vzip.16 $Ni,$zero 346e71b7053SJung-uk Kim vmull.u32 @ACC[7],$Bi,${A3}[1] 3477bded2dbSJung-uk Kim 348e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 3497bded2dbSJung-uk Kim sub $outer,$num,#1 350e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 351e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 352e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 3537bded2dbSJung-uk Kim 354e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 355e71b7053SJung-uk Kim vmov $Temp,@ACC[0] 356e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 357e71b7053SJung-uk Kim vmov @ACC[0],@ACC[1] 358e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 359e71b7053SJung-uk Kim vmov @ACC[1],@ACC[2] 360e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 361e71b7053SJung-uk Kim vmov @ACC[2],@ACC[3] 362e71b7053SJung-uk Kim vmov @ACC[3],@ACC[4] 3637bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 364e71b7053SJung-uk Kim vmov @ACC[4],@ACC[5] 365e71b7053SJung-uk Kim vmov @ACC[5],@ACC[6] 366e71b7053SJung-uk Kim vadd.u64 $temp,$temp,$Temp#hi 367e71b7053SJung-uk Kim vmov @ACC[6],@ACC[7] 368e71b7053SJung-uk Kim veor @ACC[7],@ACC[7] 3697bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 3707bded2dbSJung-uk Kim 3717bded2dbSJung-uk Kim b .LNEON_outer8 3727bded2dbSJung-uk Kim 3737bded2dbSJung-uk Kim.align 4 3747bded2dbSJung-uk Kim.LNEON_outer8: 3757bded2dbSJung-uk Kim vld1.32 {${Bi}[0]}, [$bptr,:32]! 
3767bded2dbSJung-uk Kim veor $zero,$zero,$zero 3777bded2dbSJung-uk Kim vzip.16 $Bi,$zero 378e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp 3797bded2dbSJung-uk Kim 380e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 381e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 382e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 383e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 384e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 3857bded2dbSJung-uk Kim 386e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 3877bded2dbSJung-uk Kim veor $zero,$zero,$zero 3887bded2dbSJung-uk Kim subs $outer,$outer,#1 389e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 3907bded2dbSJung-uk Kim 391e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 392e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 393e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 3947bded2dbSJung-uk Kim vzip.16 $Ni,$zero 395e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 3967bded2dbSJung-uk Kim 397e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 398e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 399e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 400e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 4017bded2dbSJung-uk Kim 402e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 403e71b7053SJung-uk Kim vmov $Temp,@ACC[0] 404e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 405e71b7053SJung-uk Kim vmov @ACC[0],@ACC[1] 406e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 407e71b7053SJung-uk Kim vmov @ACC[1],@ACC[2] 408e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 409e71b7053SJung-uk Kim vmov @ACC[2],@ACC[3] 410e71b7053SJung-uk Kim vmov @ACC[3],@ACC[4] 4117bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 412e71b7053SJung-uk Kim vmov @ACC[4],@ACC[5] 413e71b7053SJung-uk Kim vmov @ACC[5],@ACC[6] 414e71b7053SJung-uk Kim vadd.u64 $temp,$temp,$Temp#hi 415e71b7053SJung-uk Kim vmov @ACC[6],@ACC[7] 416e71b7053SJung-uk Kim veor @ACC[7],@ACC[7] 4177bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 
4187bded2dbSJung-uk Kim 4197bded2dbSJung-uk Kim bne .LNEON_outer8 4207bded2dbSJung-uk Kim 421e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp 4227bded2dbSJung-uk Kim mov $toutptr,sp 423e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#lo,#16 4247bded2dbSJung-uk Kim mov $inner,$num 425e71b7053SJung-uk Kim vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp 426e71b7053SJung-uk Kim add $tinptr,sp,#96 427e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#hi,#16 428e71b7053SJung-uk Kim vzip.16 @ACC[0]#lo,@ACC[0]#hi 4297bded2dbSJung-uk Kim 430e71b7053SJung-uk Kim b .LNEON_tail_entry 4317bded2dbSJung-uk Kim 4327bded2dbSJung-uk Kim.align 4 433e71b7053SJung-uk Kim.LNEON_8n: 434e71b7053SJung-uk Kim veor @ACC[0],@ACC[0],@ACC[0] 435e71b7053SJung-uk Kim sub $toutptr,sp,#128 436e71b7053SJung-uk Kim veor @ACC[1],@ACC[1],@ACC[1] 437e71b7053SJung-uk Kim sub $toutptr,$toutptr,$num,lsl#4 438e71b7053SJung-uk Kim veor @ACC[2],@ACC[2],@ACC[2] 439e71b7053SJung-uk Kim and $toutptr,$toutptr,#-64 440e71b7053SJung-uk Kim veor @ACC[3],@ACC[3],@ACC[3] 441e71b7053SJung-uk Kim mov sp,$toutptr @ alloca 442e71b7053SJung-uk Kim veor @ACC[4],@ACC[4],@ACC[4] 443e71b7053SJung-uk Kim add $toutptr,$toutptr,#256 444e71b7053SJung-uk Kim veor @ACC[5],@ACC[5],@ACC[5] 4457bded2dbSJung-uk Kim sub $inner,$num,#8 446e71b7053SJung-uk Kim veor @ACC[6],@ACC[6],@ACC[6] 447e71b7053SJung-uk Kim veor @ACC[7],@ACC[7],@ACC[7] 4487bded2dbSJung-uk Kim 449e71b7053SJung-uk Kim.LNEON_8n_init: 450e71b7053SJung-uk Kim vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! 451e71b7053SJung-uk Kim subs $inner,$inner,#8 452e71b7053SJung-uk Kim vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! 453e71b7053SJung-uk Kim vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! 454e71b7053SJung-uk Kim vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! 455e71b7053SJung-uk Kim bne .LNEON_8n_init 4567bded2dbSJung-uk Kim 457e71b7053SJung-uk Kim add $tinptr,sp,#256 4587bded2dbSJung-uk Kim vld1.32 {$A0-$A3},[$aptr]! 
459e71b7053SJung-uk Kim add $bnptr,sp,#8 460e71b7053SJung-uk Kim vld1.32 {${M0}[0]},[$n0,:32] 461e71b7053SJung-uk Kim mov $outer,$num 462e71b7053SJung-uk Kim b .LNEON_8n_outer 4637bded2dbSJung-uk Kim 464e71b7053SJung-uk Kim.align 4 465e71b7053SJung-uk Kim.LNEON_8n_outer: 466e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ 467e71b7053SJung-uk Kim veor $zero,$zero,$zero 468e71b7053SJung-uk Kim vzip.16 $Bi,$zero 469e71b7053SJung-uk Kim add $toutptr,sp,#128 4707bded2dbSJung-uk Kim vld1.32 {$N0-$N3},[$nptr]! 4717bded2dbSJung-uk Kim 472e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 473e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 474e71b7053SJung-uk Kim veor $zero,$zero,$zero 475e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 476e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 477e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 478e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 479e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 480e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 481e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 482e71b7053SJung-uk Kim vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] 483e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 484e71b7053SJung-uk Kim vzip.16 $Ni,$zero 485e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 486e71b7053SJung-uk Kim___ 487e71b7053SJung-uk Kimfor ($i=0; $i<7;) { 488e71b7053SJung-uk Kim$code.=<<___; 489e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! 
@ *b++ 490e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 491e71b7053SJung-uk Kim veor $temp,$temp,$temp 492e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 493e71b7053SJung-uk Kim vzip.16 $Bi,$temp 494e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 495e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 496e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 497e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 498e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi 499e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 500e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 501e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 502e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 503e71b7053SJung-uk Kim vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo 504e71b7053SJung-uk Kim vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] 505e71b7053SJung-uk Kim___ 506e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); $i++; 507e71b7053SJung-uk Kim$code.=<<___; 508e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 509e71b7053SJung-uk Kim vld1.64 {@ACC[7]},[$tinptr,:128]! 510e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 511e71b7053SJung-uk Kim veor $zero,$zero,$zero 512e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 513e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 514e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 515e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 516e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 517e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 518e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 519e71b7053SJung-uk Kim vst1.32 {$Bi},[$bnptr,:64]! 
@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32	{$Bi},[sp,:64]	@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32	{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32	{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8	@ rewind
___
	# Rotate the accumulator name list left by one: the register that the
	# templates so far called @ACC[1] is @ACC[0] from here on.
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64	{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32	{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32	{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it	ne
	addne	$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Emit seven further copies (i = 1..7) of the inner-loop step.  Each copy is
# two templates with one @ACC rotation between them, so the second template
# addresses the accumulators shifted by one position.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32	{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64	{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64	{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32	{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it	ne
	addne	$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it	eq
	subeq	$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32	{$Bi},[sp,:64]	@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32	{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add	$bnptr,sp,#8	@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64	{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add	$tinptr,sp,#128
	vst1.64	{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor	q2,q2,q2	@ $N0-$N1
	vst1.64	{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor	q3,q3,q3	@ $N2-$N3
	vst1.64	{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64	{@ACC[6]},[$toutptr,:128]

	subs	$outer,$outer,#8
	vld1.64	{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64	{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64	{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64	{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt	ne
	subne	$nptr,$nptr,$num,lsl#2	@ rewind
	bne	.LNEON_8n_outer

	add	$toutptr,sp,#128
	vst1.64	{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64	{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64	{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64	{q2-q3}, [sp,:256]!
	vzip.16	@ACC[0]#lo,@ACC[0]#hi

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64	{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64	{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64	{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16	@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
# Seven tail steps (i = 1..7): each folds $temp into the next accumulator,
# stores one 32-bit word from the current one, and then rotates @ACC so the
# same template text addresses the following register pair.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32	{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16	@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	# Eighth rotation: after 7+1 shifts the 8-entry list is back in the
	# order it had on entry to the tail loop above.
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64	{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs	$inner,$inner,#8
	vst1.32	{@ACC[7]#lo[0]}, [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]	@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2	@ rewind $nptr
	subs	$aptr,sp,#0	@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr	@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]	@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11	@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11	@ rewind $rptr
	mov	$nptr,$bptr	@ second 3/4th of frame
	sbcs	r10,r10,#0	@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!	@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr	@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret	@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
# Trailer appended to the generated text: an identification string and,
# when __ARM_MAX_ARCH__>=7, the OPENSSL_armcap_P common symbol.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-process the accumulated text one line at a time and print it.
#   1. Text between backticks is evaluated as Perl and replaced by the result.
#   2. At most ONE of the following rewrites is then applied to the line,
#      because each s/// is true when it substituted and `or` short-circuits:
#        a. qN#lo / qN#hi  ->  d(2N) / d(2N+1)
#        b. ret            ->  bx lr
#        c. bx lr          ->  .word 0xe12fff1e
#      So a line written with `ret` stays `bx lr`, and only a line that
#      already spelled out `bx lr` is turned into the raw word.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Buffered output errors only surface at close, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";