#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

use strict;
use warnings;

# The last command-line argument, when present (and true), names the
# output file; without one the generated assembly goes to the inherited
# STDOUT.  Three-argument open keeps the file name from being parsed as a
# mode, and a failed open aborts instead of silently writing elsewhere.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Symbolic register names for the incoming arguments.  They are
# interpolated into the assembly template below.
# int bn_mul_mont(
my $rp="a0";	# BN_ULONG *rp,
my $ap="a1";	# const BN_ULONG *ap,
my $bp="a2";	# const BN_ULONG *bp,
my $np="a3";	# const BN_ULONG *np,
my $n0="a4";	# const BN_ULONG *n0,
my $num="a5";	# int num);

# Working registers used by the generated code.
my $lo0="t0";
my $hi0="t1";
my $lo1="t2";
my $hi1="t3";
my $aj="t4";
my $bi="t5";
my $nj="t6";
my $tp="t7";
my $alo="t8";
my $ahi="t9";
my $nlo="t10";
my $nhi="t11";
my $tj="t12";
my $i="s3";	# outer loop counter (index into bp[])
my $j="s4";	# inner loop counter (index into ap[]/np[])
my $m1="s5";	# per-iteration multiplier, low word of $lo0*n0

# Assembly template.  Outline of the emitted routine:
#
#   prologue   - reserves a 48-byte frame, saves ra/s3/s4/s5/fp and keeps
#                the frame base in fp.
#   entry      - sign-extends num and returns 0 (v0) through .Lexit when
#                num < 4; otherwise lowers sp by num*8+16 bytes, masks it
#                with -4096 and uses the area at sp as the tp[] scratch
#                vector.  n0 is dereferenced into its own register.
#   .L1st      - first pass: ap[j]*bp[0] + np[j]*m1, results stored to tp[].
#   .Louter /
#   .Linner    - remaining passes for i = 1 .. num-1, which additionally
#                accumulate the previous contents of tp[].
#   .Lsub      - stores tp[] - np[] into rp[], tracking the borrow in $hi0.
#   .Lcopy     - per word, keeps the rp[] value when $hi0 (top word minus
#                borrow) is zero and the tp[] value otherwise, and zeroes
#                tp[] as it goes; then sets v0 to 1.
#   .Lexit     - restores sp from fp, reloads s3/s4/s5/fp and returns.
#
# The template is passed through the C preprocessor (it uses #ifdef and
# a /* */ comment), and "\@" keeps Perl from interpolating an array in
# the trailing identification string.
my $code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit the expanded template; buffered write errors surface at close, so
# a failed close is fatal.
print $code;
close STDOUT or die "error closing STDOUT: $!";