1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# December 2005
18#
19# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20# for undertaken effort are multiple. First of all, UltraSPARC is not
21# the whole SPARCv9 universe and other VIS-free implementations deserve
22# optimized code as much. Secondly, newly introduced UltraSPARC T1,
23# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25# several integrated RSA/DSA accelerator circuits accessible through
26# kernel driver [only(*)], but having decent user-land software
27# implementation is important too. Finally, reasons like desire to
28# experiment with dedicated squaring procedure. Yes, this module
29# implements one, because it was easiest to draft it in SPARCv9
30# instructions...
31
32# (*)	Engine accessing the driver in question is on my TODO list.
33#	For reference, accelerator is estimated to give 6 to 10 times
34#	improvement on single-threaded RSA sign. It should be noted
35#	that 6-10x improvement coefficient does not actually mean
36#	something extraordinary in terms of absolute [single-threaded]
37#	performance, as SPARCv9 instruction set is by all means least
38#	suitable for high performance crypto among other 64 bit
39#	platforms. 6-10x factor simply places T1 in same performance
40#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
41#	appear impressive at all, but it's the sign operation which is
42#	far more critical/interesting.
43
44# You might notice that inner loops are modulo-scheduled:-) This has
45# essentially negligible impact on UltraSPARC performance, it's
46# Fujitsu SPARC64 V users who should notice and hopefully appreciate
47# the advantage... Currently this module surpasses sparcv9a-mont.pl
48# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49# module still have hidden potential [see TODO list there], which is
50# estimated to be larger than 20%...
51
52$output = pop;
53open STDOUT,">$output";
54
55# int bn_mul_mont(
56$rp="%i0";	# BN_ULONG *rp,
57$ap="%i1";	# const BN_ULONG *ap,
58$bp="%i2";	# const BN_ULONG *bp,
59$np="%i3";	# const BN_ULONG *np,
60$n0="%i4";	# const BN_ULONG *n0,
61$num="%i5";	# int num);
62
63$frame="STACK_FRAME";
64$bias="STACK_BIAS";
65
66$car0="%o0";
67$car1="%o1";
68$car2="%o2";	# 1 bit
69$acc0="%o3";
70$acc1="%o4";
71$mask="%g1";	# 32 bits, what a waste...
72$tmp0="%g4";
73$tmp1="%g5";
74
75$i="%l0";
76$j="%l1";
77$mul0="%l2";
78$mul1="%l3";
79$tp="%l4";
80$apj="%l5";
81$npj="%l6";
82$tpj="%l7";
83
84$fname="bn_mul_mont_int";
85
86$code=<<___;
87#include "sparc_arch.h"
88
89.section	".text",#alloc,#execinstr
90
91.global	$fname
92.align	32
93$fname:
94	cmp	%o5,4			! 128 bits minimum
95	bge,pt	%icc,.Lenter
96	sethi	%hi(0xffffffff),$mask
97	retl
98	clr	%o0
99.align	32
100.Lenter:
101	save	%sp,-$frame,%sp
102	sll	$num,2,$num		! num*=4
103	or	$mask,%lo(0xffffffff),$mask
104	ld	[$n0],$n0
105	cmp	$ap,$bp
106	and	$num,$mask,$num
107	ld	[$bp],$mul0		! bp[0]
108	nop
109
110	add	%sp,$bias,%o7		! real top of stack
111	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
112	sub	%o7,$num,%o7
113	ld	[$ap+4],$apj		! ap[1]
114	and	%o7,-1024,%o7
115	ld	[$np],$car1		! np[0]
116	sub	%o7,$bias,%sp		! alloca
117	ld	[$np+4],$npj		! np[1]
118	be,pt	SIZE_T_CC,.Lbn_sqr_mont
119	mov	12,$j
120
121	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
122	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
123	and	$car0,$mask,$acc0
124	add	%sp,$bias+$frame,$tp
125	ld	[$ap+8],$apj		!prologue!
126
127	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
128	and	$mul1,$mask,$mul1
129
130	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
131	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
132	srlx	$car0,32,$car0
133	add	$acc0,$car1,$car1
134	ld	[$np+8],$npj		!prologue!
135	srlx	$car1,32,$car1
136	mov	$tmp0,$acc0		!prologue!
137
138.L1st:
139	mulx	$apj,$mul0,$tmp0
140	mulx	$npj,$mul1,$tmp1
141	add	$acc0,$car0,$car0
142	ld	[$ap+$j],$apj		! ap[j]
143	and	$car0,$mask,$acc0
144	add	$acc1,$car1,$car1
145	ld	[$np+$j],$npj		! np[j]
146	srlx	$car0,32,$car0
147	add	$acc0,$car1,$car1
148	add	$j,4,$j			! j++
149	mov	$tmp0,$acc0
150	st	$car1,[$tp]
151	cmp	$j,$num
152	mov	$tmp1,$acc1
153	srlx	$car1,32,$car1
154	bl	%icc,.L1st
155	add	$tp,4,$tp		! tp++
156!.L1st
157
158	mulx	$apj,$mul0,$tmp0	!epilogue!
159	mulx	$npj,$mul1,$tmp1
160	add	$acc0,$car0,$car0
161	and	$car0,$mask,$acc0
162	add	$acc1,$car1,$car1
163	srlx	$car0,32,$car0
164	add	$acc0,$car1,$car1
165	st	$car1,[$tp]
166	srlx	$car1,32,$car1
167
168	add	$tmp0,$car0,$car0
169	and	$car0,$mask,$acc0
170	add	$tmp1,$car1,$car1
171	srlx	$car0,32,$car0
172	add	$acc0,$car1,$car1
173	st	$car1,[$tp+4]
174	srlx	$car1,32,$car1
175
176	add	$car0,$car1,$car1
177	st	$car1,[$tp+8]
178	srlx	$car1,32,$car2
179
180	mov	4,$i			! i++
181	ld	[$bp+4],$mul0		! bp[1]
182.Louter:
183	add	%sp,$bias+$frame,$tp
184	ld	[$ap],$car0		! ap[0]
185	ld	[$ap+4],$apj		! ap[1]
186	ld	[$np],$car1		! np[0]
187	ld	[$np+4],$npj		! np[1]
188	ld	[$tp],$tmp1		! tp[0]
189	ld	[$tp+4],$tpj		! tp[1]
190	mov	12,$j
191
192	mulx	$car0,$mul0,$car0
193	mulx	$apj,$mul0,$tmp0	!prologue!
194	add	$tmp1,$car0,$car0
195	ld	[$ap+8],$apj		!prologue!
196	and	$car0,$mask,$acc0
197
198	mulx	$n0,$acc0,$mul1
199	and	$mul1,$mask,$mul1
200
201	mulx	$car1,$mul1,$car1
202	mulx	$npj,$mul1,$acc1	!prologue!
203	srlx	$car0,32,$car0
204	add	$acc0,$car1,$car1
205	ld	[$np+8],$npj		!prologue!
206	srlx	$car1,32,$car1
207	mov	$tmp0,$acc0		!prologue!
208
209.Linner:
210	mulx	$apj,$mul0,$tmp0
211	mulx	$npj,$mul1,$tmp1
212	add	$tpj,$car0,$car0
213	ld	[$ap+$j],$apj		! ap[j]
214	add	$acc0,$car0,$car0
215	add	$acc1,$car1,$car1
216	ld	[$np+$j],$npj		! np[j]
217	and	$car0,$mask,$acc0
218	ld	[$tp+8],$tpj		! tp[j]
219	srlx	$car0,32,$car0
220	add	$acc0,$car1,$car1
221	add	$j,4,$j			! j++
222	mov	$tmp0,$acc0
223	st	$car1,[$tp]		! tp[j-1]
224	srlx	$car1,32,$car1
225	mov	$tmp1,$acc1
226	cmp	$j,$num
227	bl	%icc,.Linner
228	add	$tp,4,$tp		! tp++
229!.Linner
230
231	mulx	$apj,$mul0,$tmp0	!epilogue!
232	mulx	$npj,$mul1,$tmp1
233	add	$tpj,$car0,$car0
234	add	$acc0,$car0,$car0
235	ld	[$tp+8],$tpj		! tp[j]
236	and	$car0,$mask,$acc0
237	add	$acc1,$car1,$car1
238	srlx	$car0,32,$car0
239	add	$acc0,$car1,$car1
240	st	$car1,[$tp]		! tp[j-1]
241	srlx	$car1,32,$car1
242
243	add	$tpj,$car0,$car0
244	add	$tmp0,$car0,$car0
245	and	$car0,$mask,$acc0
246	add	$tmp1,$car1,$car1
247	add	$acc0,$car1,$car1
248	st	$car1,[$tp+4]		! tp[j-1]
249	srlx	$car0,32,$car0
250	add	$i,4,$i			! i++
251	srlx	$car1,32,$car1
252
253	add	$car0,$car1,$car1
254	cmp	$i,$num
255	add	$car2,$car1,$car1
256	st	$car1,[$tp+8]
257
258	srlx	$car1,32,$car2
259	bl,a	%icc,.Louter
260	ld	[$bp+$i],$mul0		! bp[i]
261!.Louter
262
263	add	$tp,12,$tp
264
265.Ltail:
266	add	$np,$num,$np
267	add	$rp,$num,$rp
268	sub	%g0,$num,%o7		! k=-num
269	ba	.Lsub
270	subcc	%g0,%g0,%g0		! clear %icc.c
271.align	16
272.Lsub:
273	ld	[$tp+%o7],%o0
274	ld	[$np+%o7],%o1
275	subccc	%o0,%o1,%o1		! tp[j]-np[j]
276	add	$rp,%o7,$i
277	add	%o7,4,%o7
278	brnz	%o7,.Lsub
279	st	%o1,[$i]
280	subccc	$car2,0,$car2		! handle upmost overflow bit
281	sub	%g0,$num,%o7
282
283.Lcopy:
284	ld	[$tp+%o7],%o1		! conditional copy
285	ld	[$rp+%o7],%o0
286	st	%g0,[$tp+%o7]		! zap tp
287	movcs	%icc,%o1,%o0
288	st	%o0,[$rp+%o7]
289	add	%o7,4,%o7
290	brnz	%o7,.Lcopy
291	nop
292	mov	1,%i0
293	ret
294	restore
295___
296
297########
298######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
299######## code without following dedicated squaring procedure.
300########
301$sbit="%o5";
302
303$code.=<<___;
304.align	32
305.Lbn_sqr_mont:
306	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
307	mulx	$apj,$mul0,$tmp0		!prologue!
308	and	$car0,$mask,$acc0
309	add	%sp,$bias+$frame,$tp
310	ld	[$ap+8],$apj			!prologue!
311
312	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
313	srlx	$car0,32,$car0
314	and	$mul1,$mask,$mul1
315
316	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
317	mulx	$npj,$mul1,$acc1		!prologue!
318	and	$car0,1,$sbit
319	ld	[$np+8],$npj			!prologue!
320	srlx	$car0,1,$car0
321	add	$acc0,$car1,$car1
322	srlx	$car1,32,$car1
323	mov	$tmp0,$acc0			!prologue!
324
325.Lsqr_1st:
326	mulx	$apj,$mul0,$tmp0
327	mulx	$npj,$mul1,$tmp1
328	add	$acc0,$car0,$car0		! ap[j]*a0+c0
329	add	$acc1,$car1,$car1
330	ld	[$ap+$j],$apj			! ap[j]
331	and	$car0,$mask,$acc0
332	ld	[$np+$j],$npj			! np[j]
333	srlx	$car0,32,$car0
334	add	$acc0,$acc0,$acc0
335	or	$sbit,$acc0,$acc0
336	mov	$tmp1,$acc1
337	srlx	$acc0,32,$sbit
338	add	$j,4,$j				! j++
339	and	$acc0,$mask,$acc0
340	cmp	$j,$num
341	add	$acc0,$car1,$car1
342	st	$car1,[$tp]
343	mov	$tmp0,$acc0
344	srlx	$car1,32,$car1
345	bl	%icc,.Lsqr_1st
346	add	$tp,4,$tp			! tp++
347!.Lsqr_1st
348
349	mulx	$apj,$mul0,$tmp0		! epilogue
350	mulx	$npj,$mul1,$tmp1
351	add	$acc0,$car0,$car0		! ap[j]*a0+c0
352	add	$acc1,$car1,$car1
353	and	$car0,$mask,$acc0
354	srlx	$car0,32,$car0
355	add	$acc0,$acc0,$acc0
356	or	$sbit,$acc0,$acc0
357	srlx	$acc0,32,$sbit
358	and	$acc0,$mask,$acc0
359	add	$acc0,$car1,$car1
360	st	$car1,[$tp]
361	srlx	$car1,32,$car1
362
363	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
364	add	$tmp1,$car1,$car1
365	and	$car0,$mask,$acc0
366	srlx	$car0,32,$car0
367	add	$acc0,$acc0,$acc0
368	or	$sbit,$acc0,$acc0
369	srlx	$acc0,32,$sbit
370	and	$acc0,$mask,$acc0
371	add	$acc0,$car1,$car1
372	st	$car1,[$tp+4]
373	srlx	$car1,32,$car1
374
375	add	$car0,$car0,$car0
376	or	$sbit,$car0,$car0
377	add	$car0,$car1,$car1
378	st	$car1,[$tp+8]
379	srlx	$car1,32,$car2
380
381	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
382	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
383	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
384	ld	[$ap+4],$mul0			! ap[1]
385	ld	[$ap+8],$apj			! ap[2]
386	ld	[$np],$car1			! np[0]
387	ld	[$np+4],$npj			! np[1]
388	mulx	$n0,$tmp0,$mul1
389
390	mulx	$mul0,$mul0,$car0
391	and	$mul1,$mask,$mul1
392
393	mulx	$car1,$mul1,$car1
394	mulx	$npj,$mul1,$acc1
395	add	$tmp0,$car1,$car1
396	and	$car0,$mask,$acc0
397	ld	[$np+8],$npj			! np[2]
398	srlx	$car1,32,$car1
399	add	$tmp1,$car1,$car1
400	srlx	$car0,32,$car0
401	add	$acc0,$car1,$car1
402	and	$car0,1,$sbit
403	add	$acc1,$car1,$car1
404	srlx	$car0,1,$car0
405	mov	12,$j
406	st	$car1,[%sp+$bias+$frame]	! tp[0]=
407	srlx	$car1,32,$car1
408	add	%sp,$bias+$frame+4,$tp
409
410.Lsqr_2nd:
411	mulx	$apj,$mul0,$acc0
412	mulx	$npj,$mul1,$acc1
413	add	$acc0,$car0,$car0
414	add	$tpj,$sbit,$sbit
415	ld	[$ap+$j],$apj			! ap[j]
416	and	$car0,$mask,$acc0
417	ld	[$np+$j],$npj			! np[j]
418	srlx	$car0,32,$car0
419	add	$acc1,$car1,$car1
420	ld	[$tp+8],$tpj			! tp[j]
421	add	$acc0,$acc0,$acc0
422	add	$j,4,$j				! j++
423	add	$sbit,$acc0,$acc0
424	srlx	$acc0,32,$sbit
425	and	$acc0,$mask,$acc0
426	cmp	$j,$num
427	add	$acc0,$car1,$car1
428	st	$car1,[$tp]			! tp[j-1]
429	srlx	$car1,32,$car1
430	bl	%icc,.Lsqr_2nd
431	add	$tp,4,$tp			! tp++
432!.Lsqr_2nd
433
434	mulx	$apj,$mul0,$acc0
435	mulx	$npj,$mul1,$acc1
436	add	$acc0,$car0,$car0
437	add	$tpj,$sbit,$sbit
438	and	$car0,$mask,$acc0
439	srlx	$car0,32,$car0
440	add	$acc1,$car1,$car1
441	add	$acc0,$acc0,$acc0
442	add	$sbit,$acc0,$acc0
443	srlx	$acc0,32,$sbit
444	and	$acc0,$mask,$acc0
445	add	$acc0,$car1,$car1
446	st	$car1,[$tp]			! tp[j-1]
447	srlx	$car1,32,$car1
448
449	add	$car0,$car0,$car0
450	add	$sbit,$car0,$car0
451	add	$car0,$car1,$car1
452	add	$car2,$car1,$car1
453	st	$car1,[$tp+4]
454	srlx	$car1,32,$car2
455
456	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
457	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
458	ld	[$ap+8],$mul0			! ap[2]
459	ld	[$np],$car1			! np[0]
460	ld	[$np+4],$npj			! np[1]
461	mulx	$n0,$tmp1,$mul1
462	and	$mul1,$mask,$mul1
463	mov	8,$i
464
465	mulx	$mul0,$mul0,$car0
466	mulx	$car1,$mul1,$car1
467	and	$car0,$mask,$acc0
468	add	$tmp1,$car1,$car1
469	srlx	$car0,32,$car0
470	add	%sp,$bias+$frame,$tp
471	srlx	$car1,32,$car1
472	and	$car0,1,$sbit
473	srlx	$car0,1,$car0
474	mov	4,$j
475
476.Lsqr_outer:
477.Lsqr_inner1:
478	mulx	$npj,$mul1,$acc1
479	add	$tpj,$car1,$car1
480	add	$j,4,$j
481	ld	[$tp+8],$tpj
482	cmp	$j,$i
483	add	$acc1,$car1,$car1
484	ld	[$np+$j],$npj
485	st	$car1,[$tp]
486	srlx	$car1,32,$car1
487	bl	%icc,.Lsqr_inner1
488	add	$tp,4,$tp
489!.Lsqr_inner1
490
491	add	$j,4,$j
492	ld	[$ap+$j],$apj			! ap[j]
493	mulx	$npj,$mul1,$acc1
494	add	$tpj,$car1,$car1
495	ld	[$np+$j],$npj			! np[j]
496	srlx	$car1,32,$tmp0
497	and	$car1,$mask,$car1
498	add	$tmp0,$sbit,$sbit
499	add	$acc0,$car1,$car1
500	ld	[$tp+8],$tpj			! tp[j]
501	add	$acc1,$car1,$car1
502	st	$car1,[$tp]
503	srlx	$car1,32,$car1
504
505	add	$j,4,$j
506	cmp	$j,$num
507	be,pn	%icc,.Lsqr_no_inner2
508	add	$tp,4,$tp
509
510.Lsqr_inner2:
511	mulx	$apj,$mul0,$acc0
512	mulx	$npj,$mul1,$acc1
513	add	$tpj,$sbit,$sbit
514	add	$acc0,$car0,$car0
515	ld	[$ap+$j],$apj			! ap[j]
516	and	$car0,$mask,$acc0
517	ld	[$np+$j],$npj			! np[j]
518	srlx	$car0,32,$car0
519	add	$acc0,$acc0,$acc0
520	ld	[$tp+8],$tpj			! tp[j]
521	add	$sbit,$acc0,$acc0
522	add	$j,4,$j				! j++
523	srlx	$acc0,32,$sbit
524	and	$acc0,$mask,$acc0
525	cmp	$j,$num
526	add	$acc0,$car1,$car1
527	add	$acc1,$car1,$car1
528	st	$car1,[$tp]			! tp[j-1]
529	srlx	$car1,32,$car1
530	bl	%icc,.Lsqr_inner2
531	add	$tp,4,$tp			! tp++
532
533.Lsqr_no_inner2:
534	mulx	$apj,$mul0,$acc0
535	mulx	$npj,$mul1,$acc1
536	add	$tpj,$sbit,$sbit
537	add	$acc0,$car0,$car0
538	and	$car0,$mask,$acc0
539	srlx	$car0,32,$car0
540	add	$acc0,$acc0,$acc0
541	add	$sbit,$acc0,$acc0
542	srlx	$acc0,32,$sbit
543	and	$acc0,$mask,$acc0
544	add	$acc0,$car1,$car1
545	add	$acc1,$car1,$car1
546	st	$car1,[$tp]			! tp[j-1]
547	srlx	$car1,32,$car1
548
549	add	$car0,$car0,$car0
550	add	$sbit,$car0,$car0
551	add	$car0,$car1,$car1
552	add	$car2,$car1,$car1
553	st	$car1,[$tp+4]
554	srlx	$car1,32,$car2
555
556	add	$i,4,$i				! i++
557	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
558	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
559	ld	[$ap+$i],$mul0			! ap[j]
560	ld	[$np],$car1			! np[0]
561	ld	[$np+4],$npj			! np[1]
562	mulx	$n0,$tmp1,$mul1
563	and	$mul1,$mask,$mul1
564	add	$i,4,$tmp0
565
566	mulx	$mul0,$mul0,$car0
567	mulx	$car1,$mul1,$car1
568	and	$car0,$mask,$acc0
569	add	$tmp1,$car1,$car1
570	srlx	$car0,32,$car0
571	add	%sp,$bias+$frame,$tp
572	srlx	$car1,32,$car1
573	and	$car0,1,$sbit
574	srlx	$car0,1,$car0
575
576	cmp	$tmp0,$num			! i<num-1
577	bl	%icc,.Lsqr_outer
578	mov	4,$j
579
580.Lsqr_last:
581	mulx	$npj,$mul1,$acc1
582	add	$tpj,$car1,$car1
583	add	$j,4,$j
584	ld	[$tp+8],$tpj
585	cmp	$j,$i
586	add	$acc1,$car1,$car1
587	ld	[$np+$j],$npj
588	st	$car1,[$tp]
589	srlx	$car1,32,$car1
590	bl	%icc,.Lsqr_last
591	add	$tp,4,$tp
592!.Lsqr_last
593
594	mulx	$npj,$mul1,$acc1
595	add	$tpj,$acc0,$acc0
596	srlx	$acc0,32,$tmp0
597	and	$acc0,$mask,$acc0
598	add	$tmp0,$sbit,$sbit
599	add	$acc0,$car1,$car1
600	add	$acc1,$car1,$car1
601	st	$car1,[$tp]
602	srlx	$car1,32,$car1
603
604	add	$car0,$car0,$car0		! recover $car0
605	add	$sbit,$car0,$car0
606	add	$car0,$car1,$car1
607	add	$car2,$car1,$car1
608	st	$car1,[$tp+4]
609	srlx	$car1,32,$car2
610
611	ba	.Ltail
612	add	$tp,8,$tp
613.type	$fname,#function
614.size	$fname,(.-$fname)
615.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
616.align	32
617___
618$code =~ s/\`([^\`]*)\`/eval($1)/gem;
619print $code;
620close STDOUT or die "error closing STDOUT: $!";
621