#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.

# ---------------------------------------------------------------------
# Command line handling.
#
# The first argument is the "flavour" unless it already looks like a
# file name (word characters, a dot, an extension), in which case it is
# the output file and there is no flavour.  Otherwise the remaining
# arguments are consumed until one looks like a file name or the list is
# exhausted, leaving $output false.
#
# $flavour, $output, $dir and $xlate are intentionally left as package
# globals, as in the rest of this script.
# ---------------------------------------------------------------------
$flavour = shift;
if (defined($flavour) && $flavour =~ /\w[\w\-]*\.\w+$/) {
    $output = $flavour;
    undef $flavour;
}
else {
    while (($output = shift) && ($output !~ /\w[\w\-]*\.\w+$/)) {}
}

if ($flavour && $flavour ne "void") {
    # Directory part of this script's own path; empty when $0 carries no
    # directory separator (previously a failed match left $1 unchecked).
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
    die "can't locate arm-xlate.pl";

    # Pipe everything printed to STDOUT through the translator.  The
    # translator path and the output name are quoted so that paths with
    # spaces survive the shell; the output name is appended only when one
    # was actually given.
    my $cmd = "\"$^X\" \"$xlate\" $flavour";
    $cmd .= " \"$output\"" if defined($output) && length($output);
    open STDOUT, "| $cmd" or die "can't call $xlate: $!";
}
elsif (defined($output) && length($output)) {
    # No translation requested: write straight to the output file.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
# When no output file name was found, STDOUT is left as inherited.

# ---------------------------------------------------------------------
# Register names used by the integer-only code path.  $bp, $bi and $rp
# all name r2.
# ---------------------------------------------------------------------
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a.
#### $num
#
# Operand strings for the argument block.  Each value is the text
# "<base>,#<offset>" which the assembly template wraps in [...] to form
# a base+offset memory operand.  The base is $num (r0) once the template
# has repointed it at &tp[num-1].  Offsets are counted in 4-byte words:
# above tp[num-1] sit tp[num] and the ten registers pushed by
# "stmdb sp!,{r4-r12,lr}", so word 12 is the r0 (rp) saved by
# "stmdb sp!,{r0,r2}", word 13 is the saved r2 (bp), and words 14 and 15
# are the caller's stack arguments (pointer to n0, and num).
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
# The num slot is reused: the template stores the bp end pointer there
# and reloads it to decide when .Louter terminates.
$_num="$num,#15*4";	$_bpend=$_num;

# Assembly template for bn_mul_mont.  Perl interpolates the register
# names ($num, $ap, ...) and the operand strings above into the text.
#
# Flow visible in the template:
#   - num is loaded from the stack and r0/r2 are pushed.
#   - If __ARM_MAX_ARCH__>=7, num is a multiple of 8 (tst ip,#7) and the
#     ARMV7_NEON bit is set in the capability word reached through
#     OPENSSL_armcap_P, then r0/r2 are reloaded, the push is undone and
#     control branches to bn_mul8x_mont_neon.  Otherwise execution
#     continues at .Lialu.
#   - num < 2 returns 0 in r0 through .Labrt without further work.
#   - .L1st processes bp[0]; .Louter/.Linner process the remaining
#     words of bp.
#   - .Lsub writes tp-np to rp.  .Lcopy then either keeps that word or
#     replaces it with the tp word (movcc, on the carry left by the final
#     sbcs), overwriting every tp word on the way ("zap tp").
#   - On this path r0 is set to 1 before returning.
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# Register-name tables for the NEON template that starts below.  These
# are lexicals scoped to this brace block; $n0 and $num declared here
# (r4 and r5) shadow the file-level variables of the same name.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
# The list yields d28..d31; only the first three names are kept.
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
# "#lo" is appended to the q-register name as plain text.
# NOTE(review): presumably rewritten to a d-register by post-processing
# later in the file -- confirm, that code is not visible here.
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
# The list yields r6..r11; only the first five names are kept.
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

# Start of the bn_mul8x_mont_neon template (appended to $code).
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
Kim.align 5 3107bded2dbSJung-uk Kimbn_mul8x_mont_neon: 3117bded2dbSJung-uk Kim mov ip,sp 3127bded2dbSJung-uk Kim stmdb sp!,{r4-r11} 3137bded2dbSJung-uk Kim vstmdb sp!,{d8-d15} @ ABI specification says so 3147bded2dbSJung-uk Kim ldmia ip,{r4-r5} @ load rest of parameter block 315*e71b7053SJung-uk Kim mov ip,sp 3167bded2dbSJung-uk Kim 317*e71b7053SJung-uk Kim cmp $num,#8 318*e71b7053SJung-uk Kim bhi .LNEON_8n 319*e71b7053SJung-uk Kim 320*e71b7053SJung-uk Kim @ special case for $num==8, everything is in register bank... 321*e71b7053SJung-uk Kim 3227bded2dbSJung-uk Kim vld1.32 {${Bi}[0]}, [$bptr,:32]! 323*e71b7053SJung-uk Kim veor $zero,$zero,$zero 324*e71b7053SJung-uk Kim sub $toutptr,sp,$num,lsl#4 3257bded2dbSJung-uk Kim vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( 3267bded2dbSJung-uk Kim and $toutptr,$toutptr,#-64 3277bded2dbSJung-uk Kim vld1.32 {${M0}[0]}, [$n0,:32] 3287bded2dbSJung-uk Kim mov sp,$toutptr @ alloca 3297bded2dbSJung-uk Kim vzip.16 $Bi,$zero 3307bded2dbSJung-uk Kim 331*e71b7053SJung-uk Kim vmull.u32 @ACC[0],$Bi,${A0}[0] 332*e71b7053SJung-uk Kim vmull.u32 @ACC[1],$Bi,${A0}[1] 333*e71b7053SJung-uk Kim vmull.u32 @ACC[2],$Bi,${A1}[0] 334*e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 335*e71b7053SJung-uk Kim vmull.u32 @ACC[3],$Bi,${A1}[1] 3367bded2dbSJung-uk Kim 337*e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 3387bded2dbSJung-uk Kim veor $zero,$zero,$zero 339*e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 3407bded2dbSJung-uk Kim 341*e71b7053SJung-uk Kim vmull.u32 @ACC[4],$Bi,${A2}[0] 3427bded2dbSJung-uk Kim vld1.32 {$N0-$N3}, [$nptr]! 
343*e71b7053SJung-uk Kim vmull.u32 @ACC[5],$Bi,${A2}[1] 344*e71b7053SJung-uk Kim vmull.u32 @ACC[6],$Bi,${A3}[0] 3457bded2dbSJung-uk Kim vzip.16 $Ni,$zero 346*e71b7053SJung-uk Kim vmull.u32 @ACC[7],$Bi,${A3}[1] 3477bded2dbSJung-uk Kim 348*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 3497bded2dbSJung-uk Kim sub $outer,$num,#1 350*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 351*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 352*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 3537bded2dbSJung-uk Kim 354*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 355*e71b7053SJung-uk Kim vmov $Temp,@ACC[0] 356*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 357*e71b7053SJung-uk Kim vmov @ACC[0],@ACC[1] 358*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 359*e71b7053SJung-uk Kim vmov @ACC[1],@ACC[2] 360*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 361*e71b7053SJung-uk Kim vmov @ACC[2],@ACC[3] 362*e71b7053SJung-uk Kim vmov @ACC[3],@ACC[4] 3637bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 364*e71b7053SJung-uk Kim vmov @ACC[4],@ACC[5] 365*e71b7053SJung-uk Kim vmov @ACC[5],@ACC[6] 366*e71b7053SJung-uk Kim vadd.u64 $temp,$temp,$Temp#hi 367*e71b7053SJung-uk Kim vmov @ACC[6],@ACC[7] 368*e71b7053SJung-uk Kim veor @ACC[7],@ACC[7] 3697bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 3707bded2dbSJung-uk Kim 3717bded2dbSJung-uk Kim b .LNEON_outer8 3727bded2dbSJung-uk Kim 3737bded2dbSJung-uk Kim.align 4 3747bded2dbSJung-uk Kim.LNEON_outer8: 3757bded2dbSJung-uk Kim vld1.32 {${Bi}[0]}, [$bptr,:32]! 
3767bded2dbSJung-uk Kim veor $zero,$zero,$zero 3777bded2dbSJung-uk Kim vzip.16 $Bi,$zero 378*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp 3797bded2dbSJung-uk Kim 380*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 381*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 382*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 383*e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 384*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 3857bded2dbSJung-uk Kim 386*e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 3877bded2dbSJung-uk Kim veor $zero,$zero,$zero 3887bded2dbSJung-uk Kim subs $outer,$outer,#1 389*e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 3907bded2dbSJung-uk Kim 391*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 392*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 393*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 3947bded2dbSJung-uk Kim vzip.16 $Ni,$zero 395*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 3967bded2dbSJung-uk Kim 397*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 398*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 399*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 400*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 4017bded2dbSJung-uk Kim 402*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 403*e71b7053SJung-uk Kim vmov $Temp,@ACC[0] 404*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 405*e71b7053SJung-uk Kim vmov @ACC[0],@ACC[1] 406*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 407*e71b7053SJung-uk Kim vmov @ACC[1],@ACC[2] 408*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 409*e71b7053SJung-uk Kim vmov @ACC[2],@ACC[3] 410*e71b7053SJung-uk Kim vmov @ACC[3],@ACC[4] 4117bded2dbSJung-uk Kim vshr.u64 $temp,$temp,#16 412*e71b7053SJung-uk Kim vmov @ACC[4],@ACC[5] 413*e71b7053SJung-uk Kim vmov @ACC[5],@ACC[6] 414*e71b7053SJung-uk Kim vadd.u64 $temp,$temp,$Temp#hi 415*e71b7053SJung-uk Kim vmov @ACC[6],@ACC[7] 416*e71b7053SJung-uk Kim veor @ACC[7],@ACC[7] 4177bded2dbSJung-uk Kim 
vshr.u64 $temp,$temp,#16 4187bded2dbSJung-uk Kim 4197bded2dbSJung-uk Kim bne .LNEON_outer8 4207bded2dbSJung-uk Kim 421*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp 4227bded2dbSJung-uk Kim mov $toutptr,sp 423*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#lo,#16 4247bded2dbSJung-uk Kim mov $inner,$num 425*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp 426*e71b7053SJung-uk Kim add $tinptr,sp,#96 427*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#hi,#16 428*e71b7053SJung-uk Kim vzip.16 @ACC[0]#lo,@ACC[0]#hi 4297bded2dbSJung-uk Kim 430*e71b7053SJung-uk Kim b .LNEON_tail_entry 4317bded2dbSJung-uk Kim 4327bded2dbSJung-uk Kim.align 4 433*e71b7053SJung-uk Kim.LNEON_8n: 434*e71b7053SJung-uk Kim veor @ACC[0],@ACC[0],@ACC[0] 435*e71b7053SJung-uk Kim sub $toutptr,sp,#128 436*e71b7053SJung-uk Kim veor @ACC[1],@ACC[1],@ACC[1] 437*e71b7053SJung-uk Kim sub $toutptr,$toutptr,$num,lsl#4 438*e71b7053SJung-uk Kim veor @ACC[2],@ACC[2],@ACC[2] 439*e71b7053SJung-uk Kim and $toutptr,$toutptr,#-64 440*e71b7053SJung-uk Kim veor @ACC[3],@ACC[3],@ACC[3] 441*e71b7053SJung-uk Kim mov sp,$toutptr @ alloca 442*e71b7053SJung-uk Kim veor @ACC[4],@ACC[4],@ACC[4] 443*e71b7053SJung-uk Kim add $toutptr,$toutptr,#256 444*e71b7053SJung-uk Kim veor @ACC[5],@ACC[5],@ACC[5] 4457bded2dbSJung-uk Kim sub $inner,$num,#8 446*e71b7053SJung-uk Kim veor @ACC[6],@ACC[6],@ACC[6] 447*e71b7053SJung-uk Kim veor @ACC[7],@ACC[7],@ACC[7] 4487bded2dbSJung-uk Kim 449*e71b7053SJung-uk Kim.LNEON_8n_init: 450*e71b7053SJung-uk Kim vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! 451*e71b7053SJung-uk Kim subs $inner,$inner,#8 452*e71b7053SJung-uk Kim vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! 453*e71b7053SJung-uk Kim vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! 454*e71b7053SJung-uk Kim vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! 455*e71b7053SJung-uk Kim bne .LNEON_8n_init 4567bded2dbSJung-uk Kim 457*e71b7053SJung-uk Kim add $tinptr,sp,#256 4587bded2dbSJung-uk Kim vld1.32 {$A0-$A3},[$aptr]! 
459*e71b7053SJung-uk Kim add $bnptr,sp,#8 460*e71b7053SJung-uk Kim vld1.32 {${M0}[0]},[$n0,:32] 461*e71b7053SJung-uk Kim mov $outer,$num 462*e71b7053SJung-uk Kim b .LNEON_8n_outer 4637bded2dbSJung-uk Kim 464*e71b7053SJung-uk Kim.align 4 465*e71b7053SJung-uk Kim.LNEON_8n_outer: 466*e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ 467*e71b7053SJung-uk Kim veor $zero,$zero,$zero 468*e71b7053SJung-uk Kim vzip.16 $Bi,$zero 469*e71b7053SJung-uk Kim add $toutptr,sp,#128 4707bded2dbSJung-uk Kim vld1.32 {$N0-$N3},[$nptr]! 4717bded2dbSJung-uk Kim 472*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 473*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 474*e71b7053SJung-uk Kim veor $zero,$zero,$zero 475*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 476*e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 477*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 478*e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 479*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 480*e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 481*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 482*e71b7053SJung-uk Kim vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] 483*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 484*e71b7053SJung-uk Kim vzip.16 $Ni,$zero 485*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 486*e71b7053SJung-uk Kim___ 487*e71b7053SJung-uk Kimfor ($i=0; $i<7;) { 488*e71b7053SJung-uk Kim$code.=<<___; 489*e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! 
@ *b++ 490*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 491*e71b7053SJung-uk Kim veor $temp,$temp,$temp 492*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 493*e71b7053SJung-uk Kim vzip.16 $Bi,$temp 494*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 495*e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 496*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 497*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 498*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi 499*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 500*e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 501*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 502*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 503*e71b7053SJung-uk Kim vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo 504*e71b7053SJung-uk Kim vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] 505*e71b7053SJung-uk Kim___ 506*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); $i++; 507*e71b7053SJung-uk Kim$code.=<<___; 508*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 509*e71b7053SJung-uk Kim vld1.64 {@ACC[7]},[$tinptr,:128]! 510*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 511*e71b7053SJung-uk Kim veor $zero,$zero,$zero 512*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 513*e71b7053SJung-uk Kim vshl.i64 $Ni,@ACC[0]#hi,#16 514*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 515*e71b7053SJung-uk Kim vadd.u64 $Ni,$Ni,@ACC[0]#lo 516*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 517*e71b7053SJung-uk Kim vmul.u32 $Ni,$Ni,$M0 518*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 519*e71b7053SJung-uk Kim vst1.32 {$Bi},[$bnptr,:64]! 
@ put aside smashed b[8*i+$i] 520*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 521*e71b7053SJung-uk Kim vzip.16 $Ni,$zero 522*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 523*e71b7053SJung-uk Kim___ 524*e71b7053SJung-uk Kim} 525*e71b7053SJung-uk Kim$code.=<<___; 526*e71b7053SJung-uk Kim vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] 527*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 528*e71b7053SJung-uk Kim vld1.32 {$A0-$A3},[$aptr]! 529*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 530*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 531*e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 532*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 533*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 534*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi 535*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 536*e71b7053SJung-uk Kim vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 537*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 538*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 539*e71b7053SJung-uk Kim vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo 540*e71b7053SJung-uk Kim vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] 541*e71b7053SJung-uk Kim add $bnptr,sp,#8 @ rewind 542*e71b7053SJung-uk Kim___ 543*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 544*e71b7053SJung-uk Kim$code.=<<___; 545*e71b7053SJung-uk Kim sub $inner,$num,#8 546*e71b7053SJung-uk Kim b .LNEON_8n_inner 5477bded2dbSJung-uk Kim 548*e71b7053SJung-uk Kim.align 4 549*e71b7053SJung-uk Kim.LNEON_8n_inner: 5507bded2dbSJung-uk Kim subs $inner,$inner,#8 551*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 552*e71b7053SJung-uk Kim vld1.64 {@ACC[7]},[$tinptr,:128] 553*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 554*e71b7053SJung-uk Kim vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] 555*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 556*e71b7053SJung-uk Kim vld1.32 {$N0-$N3},[$nptr]! 
557*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 558*e71b7053SJung-uk Kim it ne 559*e71b7053SJung-uk Kim addne $tinptr,$tinptr,#16 @ don't advance in last iteration 560*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 561*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 562*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 563*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 564*e71b7053SJung-uk Kim___ 565*e71b7053SJung-uk Kimfor ($i=1; $i<8; $i++) { 566*e71b7053SJung-uk Kim$code.=<<___; 567*e71b7053SJung-uk Kim vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] 568*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 569*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 570*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 571*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 572*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 573*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 574*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 575*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 576*e71b7053SJung-uk Kim vst1.64 {@ACC[0]},[$toutptr,:128]! 577*e71b7053SJung-uk Kim___ 578*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 579*e71b7053SJung-uk Kim$code.=<<___; 580*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Bi,${A0}[0] 581*e71b7053SJung-uk Kim vld1.64 {@ACC[7]},[$tinptr,:128] 582*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Bi,${A0}[1] 583*e71b7053SJung-uk Kim vld1.32 {$Ni},[$bnptr,:64]! 
@ pull smashed m[8*i+$i] 584*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Bi,${A1}[0] 585*e71b7053SJung-uk Kim it ne 586*e71b7053SJung-uk Kim addne $tinptr,$tinptr,#16 @ don't advance in last iteration 587*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Bi,${A1}[1] 588*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Bi,${A2}[0] 589*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Bi,${A2}[1] 590*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Bi,${A3}[0] 591*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Bi,${A3}[1] 592*e71b7053SJung-uk Kim___ 593*e71b7053SJung-uk Kim} 594*e71b7053SJung-uk Kim$code.=<<___; 595*e71b7053SJung-uk Kim it eq 596*e71b7053SJung-uk Kim subeq $aptr,$aptr,$num,lsl#2 @ rewind 597*e71b7053SJung-uk Kim vmlal.u32 @ACC[0],$Ni,${N0}[0] 598*e71b7053SJung-uk Kim vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] 599*e71b7053SJung-uk Kim vmlal.u32 @ACC[1],$Ni,${N0}[1] 600*e71b7053SJung-uk Kim vld1.32 {$A0-$A3},[$aptr]! 601*e71b7053SJung-uk Kim vmlal.u32 @ACC[2],$Ni,${N1}[0] 602*e71b7053SJung-uk Kim add $bnptr,sp,#8 @ rewind 603*e71b7053SJung-uk Kim vmlal.u32 @ACC[3],$Ni,${N1}[1] 604*e71b7053SJung-uk Kim vmlal.u32 @ACC[4],$Ni,${N2}[0] 605*e71b7053SJung-uk Kim vmlal.u32 @ACC[5],$Ni,${N2}[1] 606*e71b7053SJung-uk Kim vmlal.u32 @ACC[6],$Ni,${N3}[0] 607*e71b7053SJung-uk Kim vst1.64 {@ACC[0]},[$toutptr,:128]! 608*e71b7053SJung-uk Kim vmlal.u32 @ACC[7],$Ni,${N3}[1] 6097bded2dbSJung-uk Kim 610*e71b7053SJung-uk Kim bne .LNEON_8n_inner 611*e71b7053SJung-uk Kim___ 612*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 613*e71b7053SJung-uk Kim$code.=<<___; 614*e71b7053SJung-uk Kim add $tinptr,sp,#128 615*e71b7053SJung-uk Kim vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! 616*e71b7053SJung-uk Kim veor q2,q2,q2 @ $N0-$N1 617*e71b7053SJung-uk Kim vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! 618*e71b7053SJung-uk Kim veor q3,q3,q3 @ $N2-$N3 619*e71b7053SJung-uk Kim vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! 
620*e71b7053SJung-uk Kim vst1.64 {@ACC[6]},[$toutptr,:128] 621*e71b7053SJung-uk Kim 622*e71b7053SJung-uk Kim subs $outer,$outer,#8 623*e71b7053SJung-uk Kim vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! 624*e71b7053SJung-uk Kim vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! 625*e71b7053SJung-uk Kim vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! 626*e71b7053SJung-uk Kim vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! 627*e71b7053SJung-uk Kim 628*e71b7053SJung-uk Kim itt ne 629*e71b7053SJung-uk Kim subne $nptr,$nptr,$num,lsl#2 @ rewind 630*e71b7053SJung-uk Kim bne .LNEON_8n_outer 631*e71b7053SJung-uk Kim 632*e71b7053SJung-uk Kim add $toutptr,sp,#128 633*e71b7053SJung-uk Kim vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame 634*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#lo,#16 635*e71b7053SJung-uk Kim vst1.64 {q2-q3},[sp,:256]! 636*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp 637*e71b7053SJung-uk Kim vst1.64 {q2-q3}, [sp,:256]! 638*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#hi,#16 639*e71b7053SJung-uk Kim vst1.64 {q2-q3}, [sp,:256]! 640*e71b7053SJung-uk Kim vzip.16 @ACC[0]#lo,@ACC[0]#hi 641*e71b7053SJung-uk Kim 642*e71b7053SJung-uk Kim mov $inner,$num 643*e71b7053SJung-uk Kim b .LNEON_tail_entry 644*e71b7053SJung-uk Kim 645*e71b7053SJung-uk Kim.align 4 646*e71b7053SJung-uk Kim.LNEON_tail: 647*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp 648*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#lo,#16 649*e71b7053SJung-uk Kim vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! 650*e71b7053SJung-uk Kim vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp 651*e71b7053SJung-uk Kim vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! 652*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[0]#hi,#16 653*e71b7053SJung-uk Kim vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! 
654*e71b7053SJung-uk Kim vzip.16 @ACC[0]#lo,@ACC[0]#hi 655*e71b7053SJung-uk Kim 656*e71b7053SJung-uk Kim.LNEON_tail_entry: 657*e71b7053SJung-uk Kim___ 658*e71b7053SJung-uk Kimfor ($i=1; $i<8; $i++) { 659*e71b7053SJung-uk Kim$code.=<<___; 660*e71b7053SJung-uk Kim vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp 661*e71b7053SJung-uk Kim vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! 662*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[1]#lo,#16 663*e71b7053SJung-uk Kim vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp 664*e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[1]#hi,#16 665*e71b7053SJung-uk Kim vzip.16 @ACC[1]#lo,@ACC[1]#hi 666*e71b7053SJung-uk Kim___ 667*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 668*e71b7053SJung-uk Kim} 669*e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 670*e71b7053SJung-uk Kim$code.=<<___; 671*e71b7053SJung-uk Kim vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! 672*e71b7053SJung-uk Kim subs $inner,$inner,#8 673*e71b7053SJung-uk Kim vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! 6747bded2dbSJung-uk Kim bne .LNEON_tail 6757bded2dbSJung-uk Kim 6767bded2dbSJung-uk Kim vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit 6777bded2dbSJung-uk Kim sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 6787bded2dbSJung-uk Kim subs $aptr,sp,#0 @ clear carry flag 6797bded2dbSJung-uk Kim add $bptr,sp,$num,lsl#2 6807bded2dbSJung-uk Kim 6817bded2dbSJung-uk Kim.LNEON_sub: 6827bded2dbSJung-uk Kim ldmia $aptr!, {r4-r7} 6837bded2dbSJung-uk Kim ldmia $nptr!, {r8-r11} 6847bded2dbSJung-uk Kim sbcs r8, r4,r8 6857bded2dbSJung-uk Kim sbcs r9, r5,r9 6867bded2dbSJung-uk Kim sbcs r10,r6,r10 6877bded2dbSJung-uk Kim sbcs r11,r7,r11 6887bded2dbSJung-uk Kim teq $aptr,$bptr @ preserves carry 6897bded2dbSJung-uk Kim stmia $rptr!, {r8-r11} 6907bded2dbSJung-uk Kim bne .LNEON_sub 6917bded2dbSJung-uk Kim 6927bded2dbSJung-uk Kim ldr r10, [$aptr] @ load top-most bit 693*e71b7053SJung-uk Kim mov r11,sp 6947bded2dbSJung-uk Kim veor q0,q0,q0 695*e71b7053SJung-uk Kim sub r11,$bptr,r11 @ this is num*4 6967bded2dbSJung-uk Kim veor q1,q1,q1 
6977bded2dbSJung-uk Kim mov $aptr,sp 6987bded2dbSJung-uk Kim sub $rptr,$rptr,r11 @ rewind $rptr 6997bded2dbSJung-uk Kim mov $nptr,$bptr @ second 3/4th of frame 7007bded2dbSJung-uk Kim sbcs r10,r10,#0 @ result is carry flag 7017bded2dbSJung-uk Kim 7027bded2dbSJung-uk Kim.LNEON_copy_n_zap: 7037bded2dbSJung-uk Kim ldmia $aptr!, {r4-r7} 7047bded2dbSJung-uk Kim ldmia $rptr, {r8-r11} 705*e71b7053SJung-uk Kim it cc 7067bded2dbSJung-uk Kim movcc r8, r4 7077bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 708*e71b7053SJung-uk Kim itt cc 7097bded2dbSJung-uk Kim movcc r9, r5 7107bded2dbSJung-uk Kim movcc r10,r6 7117bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 712*e71b7053SJung-uk Kim it cc 7137bded2dbSJung-uk Kim movcc r11,r7 7147bded2dbSJung-uk Kim ldmia $aptr, {r4-r7} 7157bded2dbSJung-uk Kim stmia $rptr!, {r8-r11} 7167bded2dbSJung-uk Kim sub $aptr,$aptr,#16 7177bded2dbSJung-uk Kim ldmia $rptr, {r8-r11} 718*e71b7053SJung-uk Kim it cc 7197bded2dbSJung-uk Kim movcc r8, r4 7207bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$aptr,:256]! @ wipe 721*e71b7053SJung-uk Kim itt cc 7227bded2dbSJung-uk Kim movcc r9, r5 7237bded2dbSJung-uk Kim movcc r10,r6 7247bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe
	it		cc
	movcc		r11,r7
	teq		$aptr,$bptr		@ preserves carry
	stmia		$rptr!, {r8-r11}
	bne		.LNEON_copy_n_zap

	mov		sp,ip
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r11}
	ret					@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
# Trailer shared by every code path: the identification string and, when
# __ARM_MAX_ARCH__>=7, the OPENSSL_armcap_P common symbol.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-process the accumulated assembly one line at a time and print it.
foreach (split("\n",$code)) {
	# Replace every `...` span with the result of evaluating the
	# enclosed Perl expression.
	s/\`([^\`]*)\`/eval $1/ge;

	# The three rewrites below are chained with "or", so a later one is
	# attempted only when the earlier ones changed nothing on this line:
	#   1. qN#lo / qN#hi  ->  d(2N) / d(2N+1)
	#   2. ret            ->  bx lr
	#   3. bx lr          ->  .word 0xe12fff1e
	# Because of the chaining, a "ret" rewritten by step 2 is left as
	# "bx lr" and is not turned into the .word form by step 3.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Output is buffered, so a failed write may only be reported when the handle
# is closed. Fail loudly instead of leaving a silently truncated file behind.
close STDOUT or die "error closing STDOUT: $!";