#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The effort
# was undertaken for several reasons. First of all, UltraSPARC is not
# the whole SPARCv9 universe, and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, would simply sink it.
# Yes, T1 is equipped with several integrated RSA/DSA accelerator
# circuits accessible through a kernel driver [only(*)], but having a
# decent user-land software implementation is important too. Finally,
# there was a desire to experiment with a dedicated squaring procedure.
# This module implements one, because it was easiest to draft in
# SPARCv9 instructions...

# (*)	An engine accessing the driver in question is on my TODO list.
#	For reference, the accelerator is estimated to give a 6- to
#	10-fold improvement on single-threaded RSA sign. It should be
#	noted that a 6-10x improvement factor does not actually mean
#	anything extraordinary in terms of absolute [single-threaded]
#	performance, as the SPARCv9 instruction set is by all means the
#	least suitable for high-performance crypto among the 64-bit
#	platforms. The 6-10x factor simply places T1 in the same
#	performance domain as, say, AMD64 and IA-64. The improvement on
#	RSA verify does not appear impressive at all, but it's the sign
#	operation that is far more critical/interesting.

# You might notice that the inner loops are modulo-scheduled:-) This
# has essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the
# sparcv9a module still has hidden potential [see the TODO list there],
# which is estimated to be larger than 20%...

$output = pop and open STDOUT,">$output";

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);
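
# For reference, here is a minimal pure-Perl sketch of the word-based
# Montgomery multiplication the generated code implements (illustration
# only: the sub is never called, assumes a 64-bit perl, and takes ap,
# bp, np as refs to arrays of 32-bit words, with n0 = -np[0]^-1 mod
# 2^32). Each outer round computes tp = (tp + ap*bp[i] + np*m)/2^32
# with m chosen so that the lowest word of the sum vanishes, which is
# exactly what the .L1st/.Louter loops below do with car0/car1/car2.
sub ref_bn_mul_mont {
	my ($ap,$bp,$np,$n0,$num) = @_;
	my $mask = 0xffffffff;
	my @tp = (0) x ($num+1);		# num words + 1-bit carry word
	for (my $i=0; $i<$num; $i++) {
		my $mul0 = $bp->[$i];
		my $car0 = $ap->[0]*$mul0 + $tp[0];
		my $mul1 = (($car0 & $mask)*$n0) & $mask;    # "t[0]"*n0
		my $car1 = $np->[0]*$mul1 + ($car0 & $mask); # low word is 0
		$car0 >>= 32; $car1 >>= 32;
		for (my $j=1; $j<$num; $j++) {
			$car0 += $ap->[$j]*$mul0 + $tp[$j];
			$car1 += $np->[$j]*$mul1 + ($car0 & $mask);
			$tp[$j-1] = $car1 & $mask;
			$car0 >>= 32; $car1 >>= 32;
		}
		$car1 += $car0 + $tp[$num];
		$tp[$num-1] = $car1 & $mask;
		$tp[$num]   = $car1 >> 32;	# the car2 bit
	}
	my ($borrow,@rp) = (0);			# conditional tp - np
	for (my $j=0; $j<$num; $j++) {
		my $d = $tp[$j] - $np->[$j] - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		$rp[$j] = $d & $mask;
	}
	return $tp[$num] >= $borrow ? @rp : @tp[0..$num-1];
}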

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

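! Lower %sp to make room for a num-word tp[] vector just above the new
! frame's register save area: take the real (unbiased) top of stack,
! subtract num*4 bytes (num is already scaled), align down to a
! 1024-byte boundary and re-apply the bias.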
	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

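! First pass (i=0): compute tp[] = (ap[]*bp[0] + np[]*m)/2^32, with
! m = (ap[0]*bp[0] mod 2^32)*n0 mod 2^32; car0 and car1 carry the
! high halves of the 64-bit mulx results from word to word.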
.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
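! Outer loop: for each subsequent word bp[i] compute
! tp[] = (tp[] + ap[]*bp[i] + np[]*m)/2^32, m again chosen via n0 so
! that the lowest word cancels; car2 keeps the 1-bit carry into the
! extra word of tp[].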
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

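! Final reduction: subtract np[] from tp[] and, depending on the
! resulting borrow and the leftover carry bit in car2, copy either
! the difference or tp[] itself out to rp[], zapping tp[] on the way.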
.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## the same code without the dedicated squaring procedure
######## that follows.
########
$sbit="%o5";
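
# Squaring exploits the identity
#	(sum a_i*2^(32*i))^2 = sum a_i^2*2^(64*i)
#	                     + 2*sum_{i<j} a_i*a_j*2^(32*(i+j)),
# so each cross product is computed once and doubled on the fly. Below
# is a minimal sketch (illustration only, never called) of the per-word
# doubling step the assembler performs, with sbit carrying the bit
# shifted out of one 32-bit word into the next:
sub ref_sqr_double {
	my ($acc,$sbit) = @_;		# 32-bit word, carried-in bit
	$acc = ($acc << 1) | $sbit;	# double and fold the bit in
	return ($acc & 0xffffffff, $acc >> 32);	# word out, bit out
}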

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

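! First squaring pass: the ap[j]*ap[0] cross products are doubled on
! the fly, sbit carrying the bit shifted out of each 32-bit word into
! the next; the high half of ap[0]*ap[0] was pre-halved into car0
! above so that the doubling restores it.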
.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

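! Second squaring pass (i=1): same doubling scheme for the ap[j]*ap[1]
! products, now also folding the previous pass's tp[] words into the
! sbit/carry stream.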
.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

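! General squaring pass (i>=2): for j<i the doubled ap[i]*ap[j] terms
! were already accumulated back in pass j, so .Lsqr_inner1 only adds
! np[j]*m to tp[]; the diagonal ap[i]^2 and the j>i cross terms then
! go through the doubling path in .Lsqr_inner2.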
.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

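! Last squaring pass: only the np[j]*m accumulation and the final
! doubled high half of the top ap[] word's square remain.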
.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";