#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

use strict;
use warnings;

# Usage: perl <this script> [output-file]
#
# The last command-line argument, when present and true, names the file
# the generated assembly is written to; without it the assembly goes to
# the inherited STDOUT. A failed open is fatal, so the build never
# silently loses the generated code.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register assignment for the generated routine's arguments.
# int bn_mul_mont(
my $rp="%i0";	# BN_ULONG *rp,
my $ap="%i1";	# const BN_ULONG *ap,
my $bp="%i2";	# const BN_ULONG *bp,
my $np="%i3";	# const BN_ULONG *np,
my $n0="%i4";	# const BN_ULONG *n0,
my $num="%i5";	# int num);

# Symbolic names left for the C preprocessor to resolve; the template
# below includes "crypto/sparc_arch.h" before using them.
my $frame="STACK_FRAME";
my $bias="STACK_BIAS";

# Carry and accumulator registers.
my $car0="%o0";
my $car1="%o1";
my $car2="%o2";	# 1 bit
my $acc0="%o3";
my $acc1="%o4";
my $mask="%g1";	# 32 bits, what a waste...
my $tmp0="%g4";
my $tmp1="%g5";

# Loop counters (byte offsets, stepped by 4), multipliers and the
# pipelined ap[j]/np[j]/tp[j] words.
my $i="%l0";
my $j="%l1";
my $mul0="%l2";
my $mul1="%l3";
my $tp="%l4";
my $apj="%l5";
my $npj="%l6";
my $tpj="%l7";

my $fname="bn_mul_mont_int";

# General multiplication path plus the shared tail (.Ltail/.Lsub/.Lcopy).
# The routine returns 0 without doing any work when num < 4 and returns 1
# after the result has been written to rp. When ap == bp it branches to
# the dedicated squaring code emitted by the second template further down.
my $code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
my $sbit="%o5";

# Dedicated squaring path, entered from the template above when ap == bp;
# it finishes by branching back to the shared .Ltail.
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Replace every `...` span in the template with the result of evaluating
# it as a Perl expression, then emit the finished assembly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;

# Buffered write errors only surface at close, so treat a failure as fatal.
close STDOUT or die "error closing STDOUT: $!";