#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On the 21264, RSA sign performance improves by 70/35/20/15 percent
# for 512/1024/2048/4096-bit key lengths. This is measured against
# code generated by the vendor compiler with '-tune host' and in-line
# assembler. Other benchmarks improve by 15-20%. To anchor this to
# something else, the code provides approximately the same performance
# per GHz as AMD64: comparing a 1GHz 21264 to a 2GHz Opteron, you will
# observe roughly a 2x difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

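# As a reference for what follows, here is a minimal C sketch of the
# word-by-word Montgomery multiplication this module implements. It is
# an illustration only, not OpenSSL's portable bn_mul_mont: it assumes
# a 64-bit BN_ULONG and a C99 compiler with unsigned __int128, and the
# name bn_mul_mont_ref is hypothetical.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	typedef uint64_t BN_ULONG;
#
#	int bn_mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
#	    const BN_ULONG *bp, const BN_ULONG *np,
#	    const BN_ULONG *n0p, int num)
#	{
#		if (num < 4)
#			return 0;	/* the assembly refuses num<4 too */
#
#		BN_ULONG tp[num + 2], n0 = *n0p, borrow = 0, mask;
#		unsigned __int128 t;
#
#		memset(tp, 0, sizeof(tp));
#		for (int i = 0; i < num; i++) {
#			BN_ULONG bi = bp[i], ca, cn;
#			/* m1 zeroes tp[0]+ap[0]*bi+m1*np[0] mod 2^64 */
#			BN_ULONG m1 = (tp[0] + ap[0] * bi) * n0;
#
#			t = (unsigned __int128)ap[0] * bi + tp[0];
#			ca = (BN_ULONG)(t >> 64);
#			t = (unsigned __int128)np[0] * m1 + (BN_ULONG)t;
#			cn = (BN_ULONG)(t >> 64);	/* low word is 0 */
#			for (int j = 1; j < num; j++) {
#				t = (unsigned __int128)ap[j] * bi + tp[j] + ca;
#				ca = (BN_ULONG)(t >> 64);
#				t = (unsigned __int128)np[j] * m1 + (BN_ULONG)t + cn;
#				cn = (BN_ULONG)(t >> 64);
#				tp[j - 1] = (BN_ULONG)t;	/* shift down a word */
#			}
#			t = (unsigned __int128)tp[num] + ca + cn;
#			tp[num - 1] = (BN_ULONG)t;
#			tp[num] = (BN_ULONG)(t >> 64);
#		}
#
#		/* conditional final subtraction: rp = tp - np if tp >= np */
#		for (int j = 0; j < num; j++) {
#			t = (unsigned __int128)tp[j] - np[j] - borrow;
#			rp[j] = (BN_ULONG)t;
#			borrow = (BN_ULONG)(t >> 64) & 1;
#		}
#		mask = tp[num] - borrow;	/* 0 or all-ones, as in .Lcopy */
#		for (int j = 0; j < num; j++)
#			rp[j] = (rp[j] & ~mask) | (tp[j] & mask);
#		return 1;
#	}
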
$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

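	# Carve 8*(num+2) bytes of scratch for tp[] out of the stack
	# and round sp down to a 4096-byte boundary.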
	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
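	# 1st outer iteration (i=0): tp[] = (ap[]*bp[0] + m1*np[]) >> 64
	# word by word, where m1 = ap[0]*bp[0]*n0 mod 2^64 makes the
	# dropped low word zero.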
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

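	# Remaining outer iterations: for (i=1; i<num; i++)
	# tp[] = (tp[] + ap[]*bp[i] + m1*np[]) >> 64, with
	# m1 = (tp[0] + ap[0]*bp[i])*n0 mod 2^64.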
	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
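	# Inner loop; the #U/#L tags note the 21264 integer issue pipe
	# each instruction is slotted for. Every pass folds ap[j]*bp[i],
	# m1*np[j], tp[j] and both carry chains together, storing the
	# shifted-down result word at tp[j-1].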
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

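	# Conditionally subtract the modulus: rp[] = tp[] - np[] with
	# borrow propagation; the select below decides whether rp or tp
	# holds the reduced result.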
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

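	# Branchlessly copy the selected result into rp[] and zero the
	# tp[] scratch on the way.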
.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT;