#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On a 21264, RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096-bit key lengths. The baseline is code generated
# by the vendor compiler with '-tune host' and using in-line
# assembler. Other benchmarks improve by 15-20%. To anchor this to
# something else: the code delivers approximately the same performance
# per GHz as AMD64, i.e. comparing a 1GHz 21264 to a 2GHz Opteron you
# will observe a ~2x difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);
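
# The function computes rp[] = ap[]*bp[]*2^(-64*num) mod np[], i.e.
# word-by-word Montgomery multiplication, with n0 pointing at
# -np[0]^(-1) mod 2^64 (the routine loads *n0 and works with the
# value). What follows is a rough C model of the algorithm, for
# illustration only: it assumes <stdint.h>, <string.h>, C99 VLAs,
# unsigned __int128 and inputs already reduced modulo np; the names
# are ours, not OpenSSL's. The assembly below additionally insists on
# num>=4 and returns 0 (in v0) when that does not hold.
#
#	static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
#	                         const uint64_t *bp, const uint64_t *np,
#	                         uint64_t n0, int num)
#	{
#	    uint64_t tp[num + 1];	/* result, plus top carry limb */
#	    memset(tp, 0, sizeof(tp));
#	    for (int i = 0; i < num; i++) {
#	        /* pick m so that the low limb of tp+ap*bp[i]+m*np is 0 */
#	        uint64_t m = (tp[0] + ap[0] * bp[i]) * n0;	/* mod 2^64 */
#	        unsigned __int128 ca = 0, cn = 0;
#	        for (int j = 0; j < num; j++) {
#	            ca = (unsigned __int128)ap[j] * bp[i] + tp[j] + (uint64_t)(ca >> 64);
#	            cn = (unsigned __int128)np[j] * m + (uint64_t)ca + (uint64_t)(cn >> 64);
#	            if (j > 0) tp[j - 1] = (uint64_t)cn;	/* /= 2^64 */
#	        }
#	        cn = (unsigned __int128)(ca >> 64) + (uint64_t)(cn >> 64) + tp[num];
#	        tp[num - 1] = (uint64_t)cn;
#	        tp[num] = (uint64_t)(cn >> 64);
#	    }
#	    uint64_t borrow = 0;	/* final conditional subtraction */
#	    for (int j = 0; j < num; j++) {
#	        unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
#	        rp[j] = (uint64_t)d;
#	        borrow = (uint64_t)(d >> 64) & 1;
#	    }
#	    if (borrow > tp[num])	/* tp < np: keep tp as is */
#	        memcpy(rp, tp, num * sizeof(uint64_t));
#	}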

$lo0="t0";	# low word of ap[j]*bp[i] accumulation
$hi0="t1";	# high word/carry of ap[j]*bp[i] accumulation
$lo1="t2";	# low word of np[j]*m1 accumulation
$hi1="t3";	# high word/carry of np[j]*m1 accumulation
$aj="t4";	# ap[j]
$bi="t5";	# bp[i]
$nj="t6";	# np[j]
$tp="t7";	# &tp[j]
$alo="t8";	# low(ap[j]*bp[i])
$ahi="t9";	# high(ap[j]*bp[i])
$nlo="t10";	# low(np[j]*m1)
$nhi="t11";	# high(np[j]*m1)
$tj="t12";	# tp[j], also borrowed as loop-condition scratch
$i="s3";	# outer loop counter
$j="s4";	# inner loop counter
$m1="s5";	# Montgomery multiplier, (tp[0]+ap[0]*bp[i])*n0 mod 2^64

$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-40(sp)	# allocate frame, save callee-saved registers
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-40
	.frame	fp,40,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num	# sign-extend 32-bit num
	mov	0,v0		# zero is the "failure" return value
	cmplt	$num,4,AT
	bne	AT,.Lexit	# bail out if num<4

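	# Carve out num+2 limbs for tp[] right below the frame and round
	# sp down to a 4KB boundary; the loads of ap[0], ap[1], bp[0]
	# and n0[0] are interleaved with the pointer arithmetic.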
	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)	# ap[1]
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0	# ap[0]*bp[0]
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)	# np[1]

	mulq	$lo0,$n0,$m1	# m1 = lo0*n0 mod 2^64

	mulq	$hi1,$m1,$lo1	# np[0]*m1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1	# low word becomes 0 by choice of m1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo	# ap[1]*bp[0]
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp		# tp = &tp[0]

	mulq	$nj,$m1,$nlo	# np[1]*m1
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
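
	# First pass (.L1st): tp[] = (ap[]*bp[0] + m1*np[]) / 2^64.
	# Multiplications for limb j are issued while limb j-1 is still
	# being accumulated, hiding the multiplier latency.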
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,($aj)	# ap[j]
	addl	$j,1,$j
	ldq	$nj,($nj)	# np[j]
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo	# ap[j]*bp[0]
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo	# np[j]*m1
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

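	# Tail of the first pass: fold the outstanding carries and store
	# the three top words, tp[num-2], tp[num-1] and tp[num].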
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)	# tp[num-2]

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)	# tp[num-1]
	stq	AT,16($tp)	# tp[num], carry bit

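	# Outer loop: for i=1..num-1 compute
	# tp[] = (tp[] + ap[]*bp[i] + m1*np[]) / 2^64, where
	# m1 = (tp[0]+ap[0]*bp[i])*n0 makes the discarded low limb zero.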
	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi	# &bp[i]
	ldq	$hi0,($ap)	# ap[0]
	ldq	$aj,8($ap)	# ap[1]
	ldq	$bi,($bi)	# bp[i]
	ldq	$hi1,($np)	# np[0]
	ldq	$nj,8($np)	# np[1]
	ldq	$tj,(sp)	# tp[0]

	mulq	$hi0,$bi,$lo0	# ap[0]*bp[i]
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0	# + tp[0]
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1	# m1 = lo0*n0 mod 2^64

	mulq	$hi1,$m1,$lo1	# np[0]*m1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1	# low word becomes 0 by choice of m1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo	# ap[1]*bp[i]
	mov	sp,$tp		# tp = &tp[0]
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo	# np[1]*m1
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
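
	# Inner loop: the same pipelined multiply-and-accumulate as
	# .L1st, with tp[j] from the previous pass folded in. The #U/#L
	# remarks annotate the intended 21264 issue slots.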
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

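	# Wind down the inner loop: absorb the two top words from the
	# previous pass, tp[num-1] and tp[num], and store the new top
	# words back.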
	ldq	$tj,8($tp)	# tp[num-1]
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)	# tp[num]
	addq	$lo1,$lo0,$j	# borrow $j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,($tp)	# tp[num-2]
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)	# tp[num-1]
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)	# tp[num]
	bne	$tj,.Louter

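	# The result in tp[] may still be >= np[]. Subtract np[] from
	# tp[] into rp[], recording the borrow, then select tp or rp
	# below.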
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,($tp)
	ldq	$lo1,($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# upmost overflow bit: 0 if tp>=np, -1 otherwise
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

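	# Copy the selected value into rp[], zeroing tp[] on the way so
	# that no intermediate data is left on the stack.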
.align	4
.Lcopy:	ldq	$aj,($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,40(sp)
	ret	(ra)
.end	bn_mul_mont
.rdata
.asciiz	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;
