#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking the effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe and other VIS-free implementations
# deserve optimized code as much. Secondly, newly introduced UltraSPARC
# T1, a.k.a. Niagara, has a shared FPU, and concurrent FPU-intensive
# paths, such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped
# with several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having a decent user-land software
# implementation is important too. Finally, there are reasons like the
# desire to experiment with a dedicated squaring procedure. Yes, this
# module implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# The last command-line argument, when present, names the file to generate;
# with no argument the assembly is written to the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open keeps odd characters in the path from being read
    # as part of the mode, and a failure is reported instead of silently
    # leaving STDOUT pointing at its previous destination.
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Symbolic names for the SPARC registers referenced by the generated code.
# The trailing comments spell out the C prototype; the %i registers are
# where the arguments are found after the `save` at .Lenter.
#
# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

# Preprocessor symbols, not Perl values: they are emitted verbatim and are
# expected to be resolved when the output is preprocessed (the generated
# text includes crypto/sparc_arch.h).
$frame="STACK_FRAME";
$bias="STACK_BIAS";

# Carries and accumulators.
$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

# Loop counters (byte offsets), multipliers and the current ap/np/tp words.
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

# Name of the emitted global symbol.
$fname="bn_mul_mont_int";

# First part of the generated assembly: the entry point $fname, the generic
# multiplication path and the shared tail.  The text is SPARC assembly that
# is meant to go through the C preprocessor (it #includes
# crypto/sparc_arch.h; STACK_FRAME, STACK_BIAS and SIZE_T_CC are not defined
# in this file and presumably come from there).
#
#   $fname    leaf prologue: returns 0 when num (%o5) is below 4, otherwise
#             falls into .Lenter.
#   .Lenter   opens a register window, converts num to a byte count, loads
#             n0[0], bp[0], ap[0..1], np[0..1] and carves the temporary
#             vector tp out of the stack below the frame ("alloca").
#             When ap == bp it branches to .Lbn_sqr_mont (emitted by the
#             second heredoc in this file).
#   .L1st     first pass: ap[j]*bp[0] + np[j]*m, stored to tp[].
#   .Louter / .Linner
#             remaining passes over bp[i], accumulating into tp[].
#   .Ltail    .Lsub stores tp[]-np[] into rp[]; .Lcopy then overwrites rp[]
#             with tp[] when the subtraction borrowed (movcs) and zeroes
#             tp[]; the routine returns 1.
$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
# Second part of the generated assembly: the dedicated squaring path
# .Lbn_sqr_mont, reached from the ap == bp branch in the entry code, plus
# the closing .type/.size/.asciz directives for $fname.
#
# $sbit holds the bits carried between words when a cross product is
# doubled (see the "add acc0,acc0 / srlx acc0,32,sbit" sequences).  It is
# %o5, which the entry code only reads before its `save`.
#
#   .Lsqr_1st     first pass, products with ap[0].
#   .Lsqr_2nd     second pass, products with ap[1], accumulating into tp[].
#   .Lsqr_outer   later passes: .Lsqr_inner1 covers the leading words, which
#                 only take np[j]*m + tp[j]; .Lsqr_inner2 covers the rest,
#                 which also take the doubled ap[j]*ap[i] product.
#   .Lsqr_last    final pass, then a branch to the shared .Ltail.
$sbit="%o5";

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
# Replace every `...` span in the generated text with the value of the
# enclosed Perl expression, then write the result out.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Buffered write errors only surface at close, so its result is checked.
close STDOUT or die "error closing STDOUT: $!";