#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	MEASUREMENTS WITH cc ON A 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler-
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	The number of operations per second increases by almost 75%.
#
#	Here are performance numbers for 64-bit compiler-
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%.
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari@us.ibm.com
#

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
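
# Usage sketch (an assumption, matching how the other perlasm wrappers
# are driven: the flavour is anything matching /32/ or /64/, and the
# second argument is the output file handed down to ppc-xlate.pl):
#
#	perl ppc.pl linux32 ppc32.s
#	perl ppc.pl linux64 ppc64.s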

$data=<<EOF;
#--------------------------------------------------------------------
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up the code. Also made a single version which can
#	   be used with both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works with the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#	        you have to change the names of the routines and remove
#		the first . (dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines:
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#	        the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32->64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly:
#
#
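# For orientation, a C-level sketch of the comba macros being
# open-coded below (modelled on OpenSSL's bn_lcl.h comba macros;
# (c1,c2,c3) is a three-word column accumulator):
#
#	sqr_add_c(a,i,c1,c2,c3):	(c1,c2,c3) += a[i]*a[i]
#	sqr_add_c2(a,i,j,c1,c2,c3):	(c1,c2,c3) += 2*a[i]*a[j]
#
# Column k of the result collects every product a[i]*a[j] with i+j==k;
# once a column is complete its low word is stored to r[k] and the
# accumulator names rotate.
#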
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
						# instructions below.

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in the first iteration there is
						#nothing to add since c1=c2=c3=0.
						# Note c3 (r11) is NOT set to 0
						# here, but will be below.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch carry if any.
						# r9 = r0(=0) + carry

	addc		r10,r7,r10		# now add to temp result.
	addze		r11,r8			# r11 = r8 + carry (c3 was 0)
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# It makes tight use of the carry chain (addc/adde/addze).
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32->64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# A possible optimization, loading all 8 words of a into registers,
# doesn't provide any speedup.
#
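# Note on the doubled cross terms: bn_sqr_comba4 above doubles the
# product pair in place (addc r7,r7,r7; adde r8,r8,r8) before
# accumulating, while this routine simply adds the pair into the
# column accumulator twice. Both compute the same 2*a[i]*a[j] term:
#
#	(c1,c2,c3) += 2*(r8,r7)				# double, then add
#	(c1,c2,c3) += (r8,r7); (c1,c2,c3) += (r8,r7)	# add twice
#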

	xor		r0,r0,r0		#set r0 = 0. Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two-word number
	adde		r11,r8,r0		# (r8,r7) to the three-word
	addze		r9,r0			# accumulator (r9,r11,r10). NOTE: r0=0

	addc		r10,r7,r10		#add the two-word number
	adde		r11,r8,r11		# (r8,r7) to the three-word
	addze		r9,r9			# accumulator (r9,r11,r10).

	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2

						#sqr_add_c(a,1,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
						#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
						#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
						#sqr_add_c(a,4,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
						#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD		r5,`4*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
						#sqr_add_c(a,5,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
						#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD		r5,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD		r5,`5*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
						#sqr_add_c(a,6,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;

						#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD		r5,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
						#sqr_add_c(a,7,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;


	blr

	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32->64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
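# C-level sketch of the macro being open-coded below (modelled on
# OpenSSL's bn_lcl.h comba macros):
#
#	mul_add_c(a,b,c1,c2,c3):	(c1,c2,c3) += a*b
#
# i.e. the two-word product is added into the three-word column
# accumulator; column k of r collects every a[i]*b[j] with i+j==k.
#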
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6, `1*$BNSZ`(r4)
	$LD	r7, `0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32->64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
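# Same column scheme as bn_mul_comba4, extended to 8 words. A loop
# sketch of what the straight-line code below computes, (c1,c2,c3)
# being the rotating three-word accumulator:
#
#	for (k = 0; k < 15; k++) {
#		for (i = max(0,k-7); i <= min(7,k); i++)
#			(c1,c2,c3) += a[i]*b[k-i];
#		r[k] = c1; (c1,c2,c3) = (c2,c3,0);
#	}
#	r[15] = c1;
#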
	xor	r0,r0,r0		#r0=0. Used in addze below.

					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
					#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
					#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
					#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
					#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
					#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
					#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
					#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
					#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
					#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
					#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a
#               performance-critical loop.
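#
#	Reference semantics, as a C-style sketch:
#
#	borrow = 0;
#	for (i = 0; i < n; i++) {
#		r[i] = a[i] - b[i] - borrow;	/* mod 2^BN_BITS2 */
#		borrow = 1 if that subtraction wrapped, else 0;
#	}
#	return borrow;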

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# If r6 > 0 then result != 0.
				# In either case the carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + onescomplement(r8);
				# if carry = 1 this is r7-r8, otherwise
				# it is r7-r8-1, as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	blr
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a
#               performance-critical loop.
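#
#	Reference semantics, as a C-style sketch:
#
#	carry = 0;
#	for (i = 0; i < n; i++) {
#		r[i] = a[i] + b[i] + carry;	/* mod 2^BN_BITS2 */
#		carry = 1 if that addition wrapped, else 0;
#	}
#	return carry;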

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned-up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to num_bits_word. Since this was compiled
#	at only -O2, it could probably be squeezed further.
#
#	r3 = h
#	r4 = l
#	r5 = d

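#	Reference semantics (sketch): return the one-word quotient
#	((h << BN_BITS2) | l) / d, under the usual BN_div_words
#	contract that h < d on entry. The code normalizes d so its
#	top bit is set, then produces the quotient as two half-word
#	digits, each estimated from the high halves and corrected in
#	the inner loop, in the style of Knuth's Algorithm D.
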
	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = i (leading zeros), r8 = BN_BITS2-i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> (BN_BITS2-i))
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed,
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#run the outer loop exactly twice:
					#one quotient digit per iteration.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.

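#	Reference semantics, as a C-style sketch:
#
#	for (i = 0; i < n; i++) {
#		r[2*i]   = low  word of a[i]*a[i];
#		r[2*i+1] = high word of a[i]*a[i];
#	}
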
	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
					#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
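#
# Reference semantics, as a C-style sketch:
#
#	c = 0;
#	for (i = 0; i < num; i++) {
#		t = (double-word)ap[i]*w + c;
#		rp[i] = low word of t; c = high word of t;
#	}
#	return c;
#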
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#It will be taken care of
					#in the second spin below,
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect the carry
					#into r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Empirical evidence suggests that the unrolled version performs best.
#
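# Reference semantics, as a C-style sketch (same as bn_mul_words but
# accumulating into rp):
#
#	c = 0;
#	for (i = 0; i < num; i++) {
#		t = (double-word)ap[i]*w + rp[i] + c;
#		rp[i] = low word of t; c = high word of t;
#	}
#	return c;
#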
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0, used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
					#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above addze is NOT needed;
					#the carry is NOT ignored. It is
					#not affected by the multiply and
					#will be collected in the next spin.
	$ST	r9,`0*$BNSZ`(r3)

					#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0x00000000
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;
