#! /usr/bin/env perl
# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264, RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096-bit key lengths. This is measured against code
# produced by the vendor compiler instructed to '-tune host', with
# in-line assembler. Other benchmarks improve by 15-20%. To anchor it
# to something else, the code provides approximately the same
# performance per GHz as AMD64: compare a 1GHz 21264 with a 2GHz
# Opteron and you'll observe a ~2x difference.

$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);
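#
# The routine computes rp[] = ap[]*bp[]*R^(-1) mod np[], where
# R = 2^(64*num) and n0 points at -np[0]^(-1) mod 2^64. It returns 1 on
# success and 0 when num<4, so that the caller can fall back to a
# generic code path for tiny inputs.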

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

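# For reference, below is a minimal C sketch of the word-by-word
# Montgomery multiplication the assembly implements. It is an
# illustration only: "u64", "u128" and "mont_mul_ref" are hypothetical
# names, and the (u128) products stand in for Alpha's mulq/umulh pair.
# Note that the assembly fuses the two inner loops of each outer
# iteration into a single pass over ap[] and np[].
#
#	typedef unsigned long u64;	/* BN_ULONG, 64 bits on Alpha */
#	typedef unsigned __int128 u128;
#
#	static int mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#				const u64 *np, u64 n0, int num)
#	{
#		u64 tp[num + 2], m1, borrow;	/* tp[] lives on sp below */
#		u128 t;
#		int i, j;
#
#		if (num < 4)	return 0;
#
#		for (i = 0; i < num + 2; i++)	tp[i] = 0;
#
#		for (i = 0; i < num; i++) {
#			/* tp += ap[] * bp[i] */
#			t = 0;
#			for (j = 0; j < num; j++) {
#				t = (u128)ap[j]*bp[i] + tp[j] + (u64)(t >> 64);
#				tp[j] = (u64)t;
#			}
#			t = (u128)tp[num] + (u64)(t >> 64);
#			tp[num] = (u64)t;
#			tp[num + 1] = (u64)(t >> 64);
#
#			/* tp += np[] * m1, then drop the now-zero low word */
#			m1 = tp[0] * n0;
#			t = (u128)np[0]*m1 + tp[0];
#			for (j = 1; j < num; j++) {
#				t = (u128)np[j]*m1 + tp[j] + (u64)(t >> 64);
#				tp[j - 1] = (u64)t;
#			}
#			t = (u128)tp[num] + (u64)(t >> 64);
#			tp[num - 1] = (u64)t;
#			tp[num] = tp[num + 1] + (u64)(t >> 64);
#		}
#
#		/* rp = tp - np if that does not underflow, else rp = tp */
#		borrow = 0;
#		for (j = 0; j < num; j++) {
#			u64 d = tp[j] - np[j];
#			u64 b = tp[j] < np[j];
#			rp[j] = d - borrow;
#			borrow = b | (d < borrow);
#		}
#		borrow = tp[num] - borrow;	/* non-zero means underflow */
#		for (j = 0; j < num; j++)
#			rp[j] = borrow ? tp[j] : rp[j];
#
#		return 1;
#	}
#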
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

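	# allocate 8*(num+2) bytes for tp[] below the frame and align
	# the stack pointer down to a 4096-byte boundary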
	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

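	# first outer iteration: lo0:hi0 = ap[0]*bp[0],
	# m1 = lo0*n0 mod 2^64, lo1:hi1 = np[0]*m1 + lo0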
	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
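
	# .L1st sums the remaining columns of ap[]*bp[0]+np[]*m1 and
	# stores each result one word down, which implements the
	# division of the running total by 2^64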
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

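	# wrap up the first pass: fold in the final partial products,
	# then store the last word, the top word and its carry-out bit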
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
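
	# outer loop: for each remaining word bp[i] compute
	# tp[] = (tp[] + ap[]*bp[i] + np[]*m1) / 2^64, with
	# m1 = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64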
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
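
	# the #L0/#U1/#L1/#U0 annotations below track which of the four
	# 21264 integer issue slots each instruction is meant to occupy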
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

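	# wrap up the outer iteration: fold in the last partial products
	# and the two top words of the previous tp[], then store the new
	# top words and carry bit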
	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

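	# .Lsub computes rp[] = tp[] - np[]; the borrow out of the last
	# word, subtracted from the carry word tp[num], tells whether
	# tp[] was at least np[]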
.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

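	# .Lcopy picks the final result with a conditional move rather
	# than a data-dependent branch: keep tp[i] on underflow, the
	# difference otherwise; tp[] is zapped as it is consumed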
.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT;