#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) a collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) a collection of "multi-op" subroutines that perform five squaring
#    operations and one multiplication on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It's surely possible to
# improve it [by deploying the 'mpmul' instruction], maybe in the
# future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
#
# 64-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
#
######################################################################
# 32-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
#
# 32-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
#
# 32-bit code is prone to performance degradation as the rate of
# interrupts dispatched to the CPU executing the code grows. This is
# because the standard interrupt-handling path in a 32-bit process
# context zeroes the upper halves of most integer registers used as
# input or output. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" by timer interrupts only,
# the penalty is hardly measurable. To mitigate this problem at higher
# interrupt rates, contemporary Linux kernels recognize the biased
# stack even in 32-bit process context and preserve full register
# contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
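#
# Because of this, the 32-bit paths below tag the frame pointers of
# their register windows with a sentinel bit pattern and return 0 when
# that pattern is found clobbered mid-operation, so that the caller can
# re-run the operation or fall back to the generic code. A minimal
# caller-side sketch in C (a hypothetical wrapper using the types from
# the prototypes further down, not the actual OpenSSL glue code):
#
#	extern int bn_mul_mont_t4_8(u64 *rp, const u64 *ap, const u64 *bp,
#	                            const u64 *np, const BN_ULONG *n0);
#
#	/* re-run until the hardware path completes undisturbed */
#	while (!bn_mul_mont_t4_8(rp, ap, bp, np, n0))
#		;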

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop;
open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
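# The generator below is instantiated for $NUM of 8, 16, 24 and 32
# 64-bit limbs, i.e. 512-, 1024-, 1536- and 2048-bit operands (see the
# loop right after the sub). The caller is expected to pick the entry
# point matching its operand length; a hedged C sketch of such a
# dispatcher (a hypothetical helper, not part of this module):
#
#	static int mul_mont_t4(u64 *rp, const u64 *ap, const u64 *bp,
#	                       const u64 *np, const BN_ULONG *n0, int num)
#	{
#	    switch (num) {	/* num = operand length in 64-bit limbs */
#	    case 8:  return bn_mul_mont_t4_8 (rp, ap, bp, np, n0);
#	    case 16: return bn_mul_mont_t4_16(rp, ap, bp, np, n0);
#	    case 24: return bn_mul_mont_t4_24(rp, ap, bp, np, n0);
#	    case 32: return bn_mul_mont_t4_32(rp, ap, bp, np, n0);
#	    default: return 0;	/* caller falls back to bn_mul_mont_t4() */
#	    }
#	}
#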
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}

########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
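# load_ccr() splits the power index into a one-hot flag pattern destined
# for %ccr and a byte offset into the first cache line of the power
# table. The same index arithmetic in C (a sketch of the computation
# only, using illustrative variable names):
#
#	q    = (pwr >> 2) & 7;		/* selects one of 8 strided candidates */
#	ptbl = ptbl + (pwr & 3) * 8;	/* offset within the first cache line  */
#	ccr  = 1 << q;			/* one-hot pattern later put into %ccr */
#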
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}
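# load_b() and load_b_pair() read all eight (respectively sixteen)
# strided candidates and keep exactly one of them by means of
# conditional moves keyed off the flag pattern that load_ccr() planted
# in %ccr, so the set of cache lines touched does not depend on the
# power index. A hedged C sketch of an equivalent gather (plain masks
# instead of the conditional-move trick; illustrative only):
#
#	typedef unsigned long long u64;
#
#	static u64 gather8(const u64 *tbl, unsigned int q)
#	{				/* q = (pwr >> 2) & 7, see load_ccr() */
#	    u64 r = 0;
#	    unsigned int k;
#
#	    for (k = 0; k < 8; k++) {
#	        u64 mask = (u64)0 - (u64)(k == q);  /* all-ones when k == q */
#	        r |= tbl[k * 4] & mask;		    /* entries 32 bytes apart */
#	    }
#	    return r;
#	}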

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
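# Each call performs five Montgomery squarings followed by one
# Montgomery multiplication with a value gathered from pwrtbl, i.e.
# in effect one 5-bit window step of a fixed-window exponentiation
# (see the "magic" section below). In C-like pseudo-code the per-call
# work amounts to roughly the following; mont_sqr(), mont_mul() and
# pwrtbl_entry() are illustrative names only, not actual functions:
#
#	for (i = 0; i < 5; i++)
#	    mont_sqr(tp, tp, np, n0);		/* tp = tp^2 mod n */
#	mont_mul(tp, tp, pwrtbl_entry(pwrtbl, pwr), np, n0);
#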
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
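#
# The fall-back follows the textbook word-by-word Montgomery
# multiplication that the .L1st/.Louter/.Linner/.Lsub/.Lcopy loops
# below implement: accumulate ap[]*bp[i] into the temporary, derive
# m1 = tp[0]*n0 mod 2^64, add m1*np[] so that the low word cancels,
# shift down by one word, and finish with one conditional subtraction
# of the modulus. A hedged C reference of the same arithmetic (a sketch
# only: it ignores the 32-bit half-word flipping and the details of the
# code generated below):
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;		/* assumed 128-bit helper */
#
#	static void mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#	                         const u64 *np,
#	                         u64 n0,	/* -1/np[0] mod 2^64 */
#	                         int num)
#	{
#	    u64 tp[num + 2];			/* C99 VLA, num is >=3 */
#	    u64 m1, borrow, keep;
#	    u128 acc, d;
#	    int i, j;
#
#	    for (j = 0; j < num + 2; j++)
#	        tp[j] = 0;
#
#	    for (i = 0; i < num; i++) {
#	        acc = 0;			/* tp[] += ap[] * bp[i] */
#	        for (j = 0; j < num; j++) {
#	            acc += (u128)ap[j] * bp[i] + tp[j];
#	            tp[j] = (u64)acc;
#	            acc >>= 64;
#	        }
#	        acc += tp[num];
#	        tp[num] = (u64)acc;
#	        tp[num + 1] = (u64)(acc >> 64);
#
#	        m1 = tp[0] * n0;		/* tp[] += np[] * m1, then   */
#	        acc = (u128)np[0] * m1 + tp[0];	/* shift down by one word    */
#	        acc >>= 64;			/* (the low word cancels)    */
#	        for (j = 1; j < num; j++) {
#	            acc += (u128)np[j] * m1 + tp[j];
#	            tp[j - 1] = (u64)acc;
#	            acc >>= 64;
#	        }
#	        acc += tp[num];
#	        tp[num - 1] = (u64)acc;
#	        tp[num] = tp[num + 1] + (u64)(acc >> 64);
#	    }
#
#	    borrow = 0;				/* rp[] = tp[] - np[] ...    */
#	    for (j = 0; j < num; j++) {
#	        d = (u128)tp[j] - np[j] - borrow;
#	        rp[j] = (u64)d;
#	        borrow = (u64)(d >> 127);	/* 1 if the step borrowed    */
#	    }
#	    keep = (u64)0 - (u64)(borrow > tp[num]);
#	    for (j = 0; j < num; j++)		/* ... unless that borrowed  */
#	        rp[j] = (keep & tp[j]) | (~keep & rp[j]);
#	}
#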
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");		# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";
