#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is measured against code
# generated by the vendor compiler with '-tune host' and in-line
# assembler. Other benchmarks improve by 15-20%. To anchor it to
# something else, the code provides approximately the same performance
# per GHz as AMD64: comparing a 1GHz 21264 to a 2GHz Opteron, you'll
# observe a ~2x difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
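
# The routine computes rp[] = ap[]*bp[]*2^(-64*num) mod np[], i.e.
# word-serial Montgomery multiplication with 64-bit limbs: (hi0,lo0)
# accumulate the ap[j]*bp[i] products, (hi1,lo1) the np[j]*m1 ones,
# tp points into a temporary vector on the stack, and m1 is the
# per-iteration Montgomery multiplier derived from *n0, the negated
# inverse of np[0] mod 2^64.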

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	# save ra, s3-s5 and fp (see .mask below) and anchor the frame
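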
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
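	# sign-extend num; fewer than 4 limbs is not supported,
	# return v0=0 in that case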
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

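	# carve 8*num+16 bytes for the temporary vector tp[] off the
	# stack and round sp down to a 4096-byte boundary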
	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

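	# column 0: lo0:hi0 = ap[0]*bp[0]; m1 = lo0*n0 makes
	# ap[0]*bp[0]+m1*np[0] come out 0 mod 2^64, so only the
	# carry of the low word propagates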
	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
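	# 1st outer iteration: accumulate ap[j]*bp[0]+m1*np[j] across
	# j, storing the low words into tp[] and carrying hi0/hi1
	# forward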
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

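	# fold in the last products and store the top words of tp[]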
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

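	# outer loop: for i=1..num-1 accumulate ap[]*bp[i]+m1*np[]
	# on top of tp[]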
	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

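	# m1 = (tp[0]+ap[0]*bp[i])*n0 mod 2^64, the multiplier for
	# this iteration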
	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
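	# inner loop: tp[j-1] = tp[j]+ap[j]*bp[i]+m1*np[j] plus
	# carries; the #U/#L annotations mark the intended 21264
	# integer pipe for each instruction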
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

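	# wrap up the iteration: fold in the last products and the
	# two top words of tp[], then move on to the next bp[i]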
	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

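	# final reduction: rp[] = tp[]-np[]; if the subtraction
	# borrows past tp[num], keep tp[] instead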
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

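	# copy the selected vector to rp[] and zero tp[] on the fly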
.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

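	# epilogue; ra is never clobbered (no calls are made), hence
	# its reload stays commented out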
.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT or die "error closing STDOUT: $!";