#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Route STDOUT: with a real flavour the generated text is piped through
# arm-xlate.pl (looked up next to this script, then in ../../perlasm/);
# otherwise it goes straight to $output when one was given.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Report the OS error ($!), not regex capture $1, which at this point
    # still holds the directory matched from $0 above.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} elsif ($output) {
    # Checked three-argument open: a failure must not silently leave the
    # generated code going to the inherited STDOUT.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Register names used by the integer-only code below.  $bp, $bi and $rp
# all name r2.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

# Integer-only bn_mul_mont.  Everything up to the closing ___ is emitted
# text with the Perl variables above interpolated as register names and
# addressing expressions.
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lbn_mul_mont
# endif
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	ldr	r0,.LOPENSSL_armcap
#if !defined(_WIN32)
	adr	r2,.Lbn_mul_mont
	ldr	r0,[r0,r2]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r0,[r0]
# endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# NEON code path (bn_mul8x_mont_neon).  The names below are lexical to
# this block and shadow the integer-path globals of the same name
# ($n0, $num).
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
# map yields d28..d31; only the first three names are bound.
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
# map yields r6..r11; only the first five names are bound.
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	vzip.16		$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

.align	4
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Unrolled steps 1..7 of the outer pass.  $i is advanced inside the body,
# between the two emitted halves, and @ACC is rotated by one name per step
# so the same template text addresses the next accumulator register.
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	# Rotate the accumulator names once more before emitting the inner loop.
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Unrolled steps 1..7 of the inner loop body; @ACC is rotated by one name
# between the two emitted halves of every step.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	# Final rotation of the accumulator names before the pass epilogue.
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$inner,$num
	b		.LNEON_tail_entry

.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
663e71b7053SJung-uk Kim vzip.16 @ACC[0]#lo,@ACC[0]#hi 664e71b7053SJung-uk Kim 665e71b7053SJung-uk Kim.LNEON_tail_entry: 666e71b7053SJung-uk Kim___ 667e71b7053SJung-uk Kimfor ($i=1; $i<8; $i++) { 668e71b7053SJung-uk Kim$code.=<<___; 669e71b7053SJung-uk Kim vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp 670e71b7053SJung-uk Kim vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! 671e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[1]#lo,#16 672e71b7053SJung-uk Kim vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp 673e71b7053SJung-uk Kim vshr.u64 $temp,@ACC[1]#hi,#16 674e71b7053SJung-uk Kim vzip.16 @ACC[1]#lo,@ACC[1]#hi 675e71b7053SJung-uk Kim___ 676e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 677e71b7053SJung-uk Kim} 678e71b7053SJung-uk Kim push(@ACC,shift(@ACC)); 679e71b7053SJung-uk Kim$code.=<<___; 680e71b7053SJung-uk Kim vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! 681e71b7053SJung-uk Kim subs $inner,$inner,#8 682e71b7053SJung-uk Kim vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! 6837bded2dbSJung-uk Kim bne .LNEON_tail 6847bded2dbSJung-uk Kim 6857bded2dbSJung-uk Kim vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit 6867bded2dbSJung-uk Kim sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 6877bded2dbSJung-uk Kim subs $aptr,sp,#0 @ clear carry flag 6887bded2dbSJung-uk Kim add $bptr,sp,$num,lsl#2 6897bded2dbSJung-uk Kim 6907bded2dbSJung-uk Kim.LNEON_sub: 6917bded2dbSJung-uk Kim ldmia $aptr!, {r4-r7} 6927bded2dbSJung-uk Kim ldmia $nptr!, {r8-r11} 6937bded2dbSJung-uk Kim sbcs r8, r4,r8 6947bded2dbSJung-uk Kim sbcs r9, r5,r9 6957bded2dbSJung-uk Kim sbcs r10,r6,r10 6967bded2dbSJung-uk Kim sbcs r11,r7,r11 6977bded2dbSJung-uk Kim teq $aptr,$bptr @ preserves carry 6987bded2dbSJung-uk Kim stmia $rptr!, {r8-r11} 6997bded2dbSJung-uk Kim bne .LNEON_sub 7007bded2dbSJung-uk Kim 7017bded2dbSJung-uk Kim ldr r10, [$aptr] @ load top-most bit 702e71b7053SJung-uk Kim mov r11,sp 7037bded2dbSJung-uk Kim veor q0,q0,q0 704e71b7053SJung-uk Kim sub r11,$bptr,r11 @ this is num*4 7057bded2dbSJung-uk Kim veor q1,q1,q1 7067bded2dbSJung-uk Kim mov 
$aptr,sp 7077bded2dbSJung-uk Kim sub $rptr,$rptr,r11 @ rewind $rptr 7087bded2dbSJung-uk Kim mov $nptr,$bptr @ second 3/4th of frame 7097bded2dbSJung-uk Kim sbcs r10,r10,#0 @ result is carry flag 7107bded2dbSJung-uk Kim 7117bded2dbSJung-uk Kim.LNEON_copy_n_zap: 7127bded2dbSJung-uk Kim ldmia $aptr!, {r4-r7} 7137bded2dbSJung-uk Kim ldmia $rptr, {r8-r11} 714e71b7053SJung-uk Kim it cc 7157bded2dbSJung-uk Kim movcc r8, r4 7167bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 717e71b7053SJung-uk Kim itt cc 7187bded2dbSJung-uk Kim movcc r9, r5 7197bded2dbSJung-uk Kim movcc r10,r6 7207bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 721e71b7053SJung-uk Kim it cc 7227bded2dbSJung-uk Kim movcc r11,r7 7237bded2dbSJung-uk Kim ldmia $aptr, {r4-r7} 7247bded2dbSJung-uk Kim stmia $rptr!, {r8-r11} 7257bded2dbSJung-uk Kim sub $aptr,$aptr,#16 7267bded2dbSJung-uk Kim ldmia $rptr, {r8-r11} 727e71b7053SJung-uk Kim it cc 7287bded2dbSJung-uk Kim movcc r8, r4 7297bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$aptr,:256]! @ wipe 730e71b7053SJung-uk Kim itt cc 7317bded2dbSJung-uk Kim movcc r9, r5 7327bded2dbSJung-uk Kim movcc r10,r6 7337bded2dbSJung-uk Kim vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe 734e71b7053SJung-uk Kim it cc 7357bded2dbSJung-uk Kim movcc r11,r7 7367bded2dbSJung-uk Kim teq $aptr,$bptr @ preserves carry 7377bded2dbSJung-uk Kim stmia $rptr!, {r8-r11} 7387bded2dbSJung-uk Kim bne .LNEON_copy_n_zap 7397bded2dbSJung-uk Kim 740e71b7053SJung-uk Kim mov sp,ip 7417bded2dbSJung-uk Kim vldmia sp!,{d8-d15} 7427bded2dbSJung-uk Kim ldmia sp!,{r4-r11} 7437bded2dbSJung-uk Kim ret @ bx lr 7447bded2dbSJung-uk Kim.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 7457bded2dbSJung-uk Kim#endif 7467bded2dbSJung-uk Kim___ 7477bded2dbSJung-uk Kim} 7487bded2dbSJung-uk Kim$code.=<<___; 7497bded2dbSJung-uk Kim.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 7501f13597dSJung-uk Kim.align 2 7517bded2dbSJung-uk Kim#if __ARM_MAX_ARCH__>=7 7527bded2dbSJung-uk Kim.comm OPENSSL_armcap_P,4,4 7537bded2dbSJung-uk Kim#endif 7541f13597dSJung-uk Kim___ 7551f13597dSJung-uk Kim 756e71b7053SJung-uk Kimforeach (split("\n",$code)) { 757e71b7053SJung-uk Kim s/\`([^\`]*)\`/eval $1/ge; 758e71b7053SJung-uk Kim 759e71b7053SJung-uk Kim s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or 760e71b7053SJung-uk Kim s/\bret\b/bx lr/g or 761e71b7053SJung-uk Kim s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 762e71b7053SJung-uk Kim 763e71b7053SJung-uk Kim print $_,"\n"; 764e71b7053SJung-uk Kim} 765e71b7053SJung-uk Kim 76617f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!"; 767