xref: /freebsd/crypto/openssl/crypto/bn/asm/ppc.pl (revision 0957b409)
1#! /usr/bin/env perl
2# Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# Implemented as a Perl wrapper as we want to support several different
10# architectures with a single file. We pick up the target based on the
11# file name we are asked to generate.
12#
13# It should be noted though that this perl code is nothing like
14# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15# as a pre-processor to cover for platform differences in name decoration,
16# linker tables, 32-/64-bit instruction sets...
17#
18# As you might know there are several PowerPC ABIs in use. Most notably
19# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20# are similar enough to implement leaf(!) functions, which would be ABI
21# neutral. And that's what you find here: ABI neutral leaf functions.
22# In case you wonder what that is...
23#
24#       AIX performance
25#
26#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27#
28#	The following is the performance of 32-bit compiler
29#	generated code:
30#
31#	OpenSSL 0.9.6c 21 dec 2001
32#	built on: Tue Jun 11 11:06:51 EDT 2002
33#	options:bn(64,32) ...
34#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
35#                  sign    verify    sign/s verify/s
36#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
37#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
38#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
39#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
40#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
41#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
42#
43#	Same benchmark with this assembler code:
44#
45#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
46#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
47#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
48#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
49#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
50#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
51#
52#	Number of operations increases by almost 75%.
53#
54#	Here are performance numbers for 64-bit compiler
55#	generated code:
56#
57#	OpenSSL 0.9.6g [engine] 9 Aug 2002
58#	built on: Fri Apr 18 16:59:20 EDT 2003
59#	options:bn(64,64) ...
60#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61#                  sign    verify    sign/s verify/s
62#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
63#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
64#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
65#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
66#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
67#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
68#
69#	Same benchmark with this assembler code:
70#
71#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
72#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
73#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
74#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
75#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
76#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
77#
78#	Again, performance increases by about 75%.
79#
80#       Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
81#       OpenSSL 0.9.7c 30 Sep 2003
82#
83#       Original code.
84#
85#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
86#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
87#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
88#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
89#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
90#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
91#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
92#
93#       Same benchmark with this assembler code:
94#
95#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
96#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
97#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
98#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
99#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
100#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
101#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
102#
103#        Performance increase of ~60%
104#        Based on submission from Suresh N. Chari of IBM
105
106$flavour = shift;
107
108if ($flavour =~ /32/) {
109	$BITS=	32;
110	$BNSZ=	$BITS/8;
111	$ISA=	"\"ppc\"";
112
113	$LD=	"lwz";		# load
114	$LDU=	"lwzu";		# load and update
115	$ST=	"stw";		# store
116	$STU=	"stwu";		# store and update
117	$UMULL=	"mullw";	# unsigned multiply low
118	$UMULH=	"mulhwu";	# unsigned multiply high
119	$UDIV=	"divwu";	# unsigned divide
120	$UCMPI=	"cmplwi";	# unsigned compare with immediate
121	$UCMP=	"cmplw";	# unsigned compare
122	$CNTLZ=	"cntlzw";	# count leading zeros
123	$SHL=	"slw";		# shift left
124	$SHR=	"srw";		# unsigned shift right
125	$SHRI=	"srwi";		# unsigned shift right by immediate
126	$SHLI=	"slwi";		# shift left by immediate
127	$CLRU=	"clrlwi";	# clear upper bits
128	$INSR=	"insrwi";	# insert right
129	$ROTL=	"rotlwi";	# rotate left by immediate
130	$TR=	"tw";		# conditional trap
131} elsif ($flavour =~ /64/) {
132	$BITS=	64;
133	$BNSZ=	$BITS/8;
134	$ISA=	"\"ppc64\"";
135
136	# same as above, but 64-bit mnemonics...
137	$LD=	"ld";		# load
138	$LDU=	"ldu";		# load and update
139	$ST=	"std";		# store
140	$STU=	"stdu";		# store and update
141	$UMULL=	"mulld";	# unsigned multiply low
142	$UMULH=	"mulhdu";	# unsigned multiply high
143	$UDIV=	"divdu";	# unsigned divide
144	$UCMPI=	"cmpldi";	# unsigned compare with immediate
145	$UCMP=	"cmpld";	# unsigned compare
146	$CNTLZ=	"cntlzd";	# count leading zeros
147	$SHL=	"sld";		# shift left
148	$SHR=	"srd";		# unsigned shift right
149	$SHRI=	"srdi";		# unsigned shift right by immediate
150	$SHLI=	"sldi";		# shift left by immediate
151	$CLRU=	"clrldi";	# clear upper bits
152	$INSR=	"insrdi";	# insert right
153	$ROTL=	"rotldi";	# rotate left by immediate
154	$TR=	"td";		# conditional trap
155} else { die "nonsense $flavour"; }
156
157$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
158( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
159( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
160die "can't locate ppc-xlate.pl";
161
162open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
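
# Typical invocation (a sketch; the exact flavour strings come from the
# build system, and only the presence of "32" or "64" in the first
# argument matters here, per the checks above):
#
#	perl ppc.pl linux32 bn-ppc.s	# emit 32-bit mnemonics
#	perl ppc.pl linux64 bn-ppc64.s	# emit 64-bit mnemonics
#
# The second argument is passed straight through to ppc-xlate.pl, which
# treats it as the output file name.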
163
164$data=<<EOF;
165#--------------------------------------------------------------------
166#
167#
168#
169#
170#	File:		ppc32.s
171#
172#	Created by:	Suresh Chari
173#			IBM Thomas J. Watson Research Library
174#			Hawthorne, NY
175#
176#
177#	Description:	Optimized assembly routines for OpenSSL crypto
178#			on the 32-bit PowerPC platform.
179#
180#
181#	Version History
182#
183#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
184#	   cleaned up code. Also made a single version which can
185#	   be used for both the AIX and Linux compilers. See NOTE
186#	   below.
187#				12/05/03		Suresh Chari
188#			(with lots of help from)        Andy Polyakov
189#
190#	1. Initial version	10/20/02		Suresh Chari
191#
192#
193#	The following file works for the xlc, cc
194#	and gcc compilers.
195#
196#	NOTE:	To get the file to link correctly with the gcc compiler
197#	        you have to change the names of the routines and remove
198#		the first .(dot) character. This should automatically
199#		be done in the build process.
200#
201#	Hand optimized assembly code for the following routines
202#
203#	bn_sqr_comba4
204#	bn_sqr_comba8
205#	bn_mul_comba4
206#	bn_mul_comba8
207#	bn_sub_words
208#	bn_add_words
209#	bn_div_words
210#	bn_sqr_words
211#	bn_mul_words
212#	bn_mul_add_words
213#
214#	NOTE:	It is possible to optimize this code more for
215#	specific PowerPC or Power architectures. On the Northstar
216#	architecture the optimizations in this file do
217#	 NOT provide much improvement.
218#
219#	If you have comments or suggestions to improve this code, send
220#	me a note at schari\@us.ibm.com
221#
222#--------------------------------------------------------------------------
223#
224#	Defines to be used in the assembly code.
225#
226#.set r0,0	# we use it as storage for value of 0
227#.set SP,1	# preserved
228#.set RTOC,2	# preserved
229#.set r3,3	# 1st argument/return value
230#.set r4,4	# 2nd argument/volatile register
231#.set r5,5	# 3rd argument/volatile register
232#.set r6,6	# ...
233#.set r7,7
234#.set r8,8
235#.set r9,9
236#.set r10,10
237#.set r11,11
238#.set r12,12
239#.set r13,13	# not used, nor any other "below" it...
240
241#	Declare function names to be global
242#	NOTE:	For gcc these names MUST be changed to remove
243#	        the first . i.e. for example change ".bn_sqr_comba4"
244#		to "bn_sqr_comba4". This should be automatically done
245#		in the build.
246
247	.globl	.bn_sqr_comba4
248	.globl	.bn_sqr_comba8
249	.globl	.bn_mul_comba4
250	.globl	.bn_mul_comba8
251	.globl	.bn_sub_words
252	.globl	.bn_add_words
253	.globl	.bn_div_words
254	.globl	.bn_sqr_words
255	.globl	.bn_mul_words
256	.globl	.bn_mul_add_words
257
258# .text section
259
260	.machine	"any"
261
262#
263#	NOTE:	The following label name should be changed to
264#		"bn_sqr_comba4" i.e. remove the first dot
265#		for the gcc compiler. This should be automatically
266#		done in the build
267#
268
269.align	4
270.bn_sqr_comba4:
271#
272# Optimized version of bn_sqr_comba4.
273#
274# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
275# r3 contains r
276# r4 contains a
277#
278# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
279#
280# r5,r6 are the two BN_ULONGs being multiplied.
281# r7,r8 are the results of the 32x32 giving 64-bit multiply.
282# r9,r10, r11 are the equivalents of c1,c2, c3.
283# Here's the assembly
284#
285#
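# For reference, the sqr_add_c()/sqr_add_c2() steps named in the comments
# below do roughly the following in C (an illustrative sketch for the 32-bit
# flavour, not the exact OpenSSL macros; only the sqr_add_c/sqr_add_c2 names
# come from the comments, everything else is made up for the example):
#
#	#include <stdint.h>
#
#	/* add a[i]*a[i] into the running three-word column sum c3:c2:c1 */
#	static void sqr_add_c(const uint32_t *a, int i,
#	                      uint32_t *c1, uint32_t *c2, uint32_t *c3)
#	{
#	    uint64_t t  = (uint64_t)a[i] * a[i];
#	    uint64_t lo = (uint64_t)*c1 + (uint32_t)t;
#	    uint64_t hi = (uint64_t)*c2 + (uint32_t)(t >> 32) + (uint32_t)(lo >> 32);
#	    *c1  = (uint32_t)lo;
#	    *c2  = (uint32_t)hi;
#	    *c3 += (uint32_t)(hi >> 32);
#	}
#
#	/* add 2*a[i]*a[j] into c3:c2:c1; the doubling itself can carry,
#	 * which is what the extra addze instructions below catch */
#	static void sqr_add_c2(const uint32_t *a, int i, int j,
#	                       uint32_t *c1, uint32_t *c2, uint32_t *c3)
#	{
#	    uint64_t t  = (uint64_t)a[i] * a[j];
#	    uint32_t tl = (uint32_t)t, th = (uint32_t)(t >> 32);
#	    *c3 += th >> 31;                /* carry out of the doubling   */
#	    th   = (th << 1) | (tl >> 31);  /* (th:tl) = 2*a[i]*a[j]       */
#	    tl <<= 1;
#	    uint64_t lo = (uint64_t)*c1 + tl;
#	    uint64_t hi = (uint64_t)*c2 + th + (uint32_t)(lo >> 32);
#	    *c1  = (uint32_t)lo;
#	    *c2  = (uint32_t)hi;
#	    *c3 += (uint32_t)(hi >> 32);
#	}
#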
286	xor		r0,r0,r0		# set r0 = 0. Used in the addze
287						# instructions below
288
289						#sqr_add_c(a,0,c1,c2,c3)
290	$LD		r5,`0*$BNSZ`(r4)
291	$UMULL		r9,r5,r5
292	$UMULH		r10,r5,r5		#in first iteration. No need
293						#to add since c1=c2=c3=0.
294						# Note c3 (r11) is NOT set to 0
295						# here; it is set below.
296
297	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
298						# sqr_add_c2(a,1,0,c2,c3,c1);
299	$LD		r6,`1*$BNSZ`(r4)
300	$UMULL		r7,r5,r6
301	$UMULH		r8,r5,r6
302
303	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
304	adde		r8,r8,r8
305	addze		r9,r0			# catch carry if any.
306						# r9= r0(=0) and carry
307
308	addc		r10,r7,r10		# now add to temp result.
309	addze		r11,r8                  # r11 = r8 + carry (this sets c3)
310	addze		r9,r9
311
312	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
313						#sqr_add_c(a,1,c3,c1,c2)
314	$UMULL		r7,r6,r6
315	$UMULH		r8,r6,r6
316	addc		r11,r7,r11
317	adde		r9,r8,r9
318	addze		r10,r0
319						#sqr_add_c2(a,2,0,c3,c1,c2)
320	$LD		r6,`2*$BNSZ`(r4)
321	$UMULL		r7,r5,r6
322	$UMULH		r8,r5,r6
323
324	addc		r7,r7,r7
325	adde		r8,r8,r8
326	addze		r10,r10
327
328	addc		r11,r7,r11
329	adde		r9,r8,r9
330	addze		r10,r10
331	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
332						#sqr_add_c2(a,3,0,c1,c2,c3);
333	$LD		r6,`3*$BNSZ`(r4)
334	$UMULL		r7,r5,r6
335	$UMULH		r8,r5,r6
336	addc		r7,r7,r7
337	adde		r8,r8,r8
338	addze		r11,r0
339
340	addc		r9,r7,r9
341	adde		r10,r8,r10
342	addze		r11,r11
343						#sqr_add_c2(a,2,1,c1,c2,c3);
344	$LD		r5,`1*$BNSZ`(r4)
345	$LD		r6,`2*$BNSZ`(r4)
346	$UMULL		r7,r5,r6
347	$UMULH		r8,r5,r6
348
349	addc		r7,r7,r7
350	adde		r8,r8,r8
351	addze		r11,r11
352	addc		r9,r7,r9
353	adde		r10,r8,r10
354	addze		r11,r11
355	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
356						#sqr_add_c(a,2,c2,c3,c1);
357	$UMULL		r7,r6,r6
358	$UMULH		r8,r6,r6
359	addc		r10,r7,r10
360	adde		r11,r8,r11
361	addze		r9,r0
362						#sqr_add_c2(a,3,1,c2,c3,c1);
363	$LD		r6,`3*$BNSZ`(r4)
364	$UMULL		r7,r5,r6
365	$UMULH		r8,r5,r6
366	addc		r7,r7,r7
367	adde		r8,r8,r8
368	addze		r9,r9
369
370	addc		r10,r7,r10
371	adde		r11,r8,r11
372	addze		r9,r9
373	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
374						#sqr_add_c2(a,3,2,c3,c1,c2);
375	$LD		r5,`2*$BNSZ`(r4)
376	$UMULL		r7,r5,r6
377	$UMULH		r8,r5,r6
378	addc		r7,r7,r7
379	adde		r8,r8,r8
380	addze		r10,r0
381
382	addc		r11,r7,r11
383	adde		r9,r8,r9
384	addze		r10,r10
385	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
386						#sqr_add_c(a,3,c1,c2,c3);
387	$UMULL		r7,r6,r6
388	$UMULH		r8,r6,r6
389	addc		r9,r7,r9
390	adde		r10,r8,r10
391
392	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
393	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
394	blr
395	.long	0
396	.byte	0,12,0x14,0,0,0,2,0
397	.long	0
398.size	.bn_sqr_comba4,.-.bn_sqr_comba4
399
400#
401#	NOTE:	The following label name should be changed to
402#		"bn_sqr_comba8" i.e. remove the first dot
403#		for the gcc compiler. This should be automatically
404#		done in the build
405#
406
407.align	4
408.bn_sqr_comba8:
409#
410# This is an optimized version of the bn_sqr_comba8 routine.
411# Tightly uses the adde instruction
412#
413#
414# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
415# r3 contains r
416# r4 contains a
417#
418# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
419#
420# r5,r6 are the two BN_ULONGs being multiplied.
421# r7,r8 are the results of the 32x32 giving 64-bit multiply.
422# r9,r10, r11 are the equivalents of c1,c2, c3.
423#
424# A possible optimization of loading all 8 longs of a into registers
425# doesn't provide any speedup.
426#
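# The long run of sqr_add_c/sqr_add_c2 steps below just walks the result
# one output word (column) at a time.  In C the same column order looks
# roughly like this (a sketch built on the sqr_add_c/sqr_add_c2 helpers
# sketched before .bn_sqr_comba4 above; sqr_comba_ref is a made-up name,
# not OpenSSL code):
#
#	static void sqr_comba_ref(uint32_t *r, const uint32_t *a, int n)
#	{
#	    uint32_t c1 = 0, c2 = 0, c3 = 0;
#	    for (int k = 0; k < 2*n - 1; k++) {   /* one column per word */
#	        int i = (k < n) ? k : n - 1;
#	        for (int j = k - i; i >= j; i--, j++) {
#	            if (i == j) sqr_add_c (a, i,    &c1, &c2, &c3);
#	            else        sqr_add_c2(a, i, j, &c1, &c2, &c3);
#	        }
#	        r[k] = c1;                        /* emit column, rotate  */
#	        c1 = c2; c2 = c3; c3 = 0;         /* the carry words      */
#	    }
#	    r[2*n - 1] = c1;
#	}
#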
427
428	xor		r0,r0,r0		#set r0 = 0.Used in addze
429						#instructions below.
430
431						#sqr_add_c(a,0,c1,c2,c3);
432	$LD		r5,`0*$BNSZ`(r4)
433	$UMULL		r9,r5,r5		#1st iteration:	no carries.
434	$UMULH		r10,r5,r5
435	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
436						#sqr_add_c2(a,1,0,c2,c3,c1);
437	$LD		r6,`1*$BNSZ`(r4)
438	$UMULL		r7,r5,r6
439	$UMULH		r8,r5,r6
440
441	addc		r10,r7,r10		#add the two-register number
442	adde		r11,r8,r0 		# (r8,r7) to the three-register
443	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0
444
445	addc		r10,r7,r10		#add the two-register number
446	adde		r11,r8,r11 		# (r8,r7) to the three-register
447	addze		r9,r9			# number (r9,r11,r10).
448
449	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
450
451						#sqr_add_c(a,1,c3,c1,c2);
452	$UMULL		r7,r6,r6
453	$UMULH		r8,r6,r6
454	addc		r11,r7,r11
455	adde		r9,r8,r9
456	addze		r10,r0
457						#sqr_add_c2(a,2,0,c3,c1,c2);
458	$LD		r6,`2*$BNSZ`(r4)
459	$UMULL		r7,r5,r6
460	$UMULH		r8,r5,r6
461
462	addc		r11,r7,r11
463	adde		r9,r8,r9
464	addze		r10,r10
465
466	addc		r11,r7,r11
467	adde		r9,r8,r9
468	addze		r10,r10
469
470	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
471						#sqr_add_c2(a,3,0,c1,c2,c3);
472	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
473	$UMULL		r7,r5,r6
474	$UMULH		r8,r5,r6
475
476	addc		r9,r7,r9
477	adde		r10,r8,r10
478	addze		r11,r0
479
480	addc		r9,r7,r9
481	adde		r10,r8,r10
482	addze		r11,r11
483						#sqr_add_c2(a,2,1,c1,c2,c3);
484	$LD		r5,`1*$BNSZ`(r4)
485	$LD		r6,`2*$BNSZ`(r4)
486	$UMULL		r7,r5,r6
487	$UMULH		r8,r5,r6
488
489	addc		r9,r7,r9
490	adde		r10,r8,r10
491	addze		r11,r11
492
493	addc		r9,r7,r9
494	adde		r10,r8,r10
495	addze		r11,r11
496
497	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
498						#sqr_add_c(a,2,c2,c3,c1);
499	$UMULL		r7,r6,r6
500	$UMULH		r8,r6,r6
501
502	addc		r10,r7,r10
503	adde		r11,r8,r11
504	addze		r9,r0
505						#sqr_add_c2(a,3,1,c2,c3,c1);
506	$LD		r6,`3*$BNSZ`(r4)
507	$UMULL		r7,r5,r6
508	$UMULH		r8,r5,r6
509
510	addc		r10,r7,r10
511	adde		r11,r8,r11
512	addze		r9,r9
513
514	addc		r10,r7,r10
515	adde		r11,r8,r11
516	addze		r9,r9
517						#sqr_add_c2(a,4,0,c2,c3,c1);
518	$LD		r5,`0*$BNSZ`(r4)
519	$LD		r6,`4*$BNSZ`(r4)
520	$UMULL		r7,r5,r6
521	$UMULH		r8,r5,r6
522
523	addc		r10,r7,r10
524	adde		r11,r8,r11
525	addze		r9,r9
526
527	addc		r10,r7,r10
528	adde		r11,r8,r11
529	addze		r9,r9
530	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
531						#sqr_add_c2(a,5,0,c3,c1,c2);
532	$LD		r6,`5*$BNSZ`(r4)
533	$UMULL		r7,r5,r6
534	$UMULH		r8,r5,r6
535
536	addc		r11,r7,r11
537	adde		r9,r8,r9
538	addze		r10,r0
539
540	addc		r11,r7,r11
541	adde		r9,r8,r9
542	addze		r10,r10
543						#sqr_add_c2(a,4,1,c3,c1,c2);
544	$LD		r5,`1*$BNSZ`(r4)
545	$LD		r6,`4*$BNSZ`(r4)
546	$UMULL		r7,r5,r6
547	$UMULH		r8,r5,r6
548
549	addc		r11,r7,r11
550	adde		r9,r8,r9
551	addze		r10,r10
552
553	addc		r11,r7,r11
554	adde		r9,r8,r9
555	addze		r10,r10
556						#sqr_add_c2(a,3,2,c3,c1,c2);
557	$LD		r5,`2*$BNSZ`(r4)
558	$LD		r6,`3*$BNSZ`(r4)
559	$UMULL		r7,r5,r6
560	$UMULH		r8,r5,r6
561
562	addc		r11,r7,r11
563	adde		r9,r8,r9
564	addze		r10,r10
565
566	addc		r11,r7,r11
567	adde		r9,r8,r9
568	addze		r10,r10
569	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
570						#sqr_add_c(a,3,c1,c2,c3);
571	$UMULL		r7,r6,r6
572	$UMULH		r8,r6,r6
573	addc		r9,r7,r9
574	adde		r10,r8,r10
575	addze		r11,r0
576						#sqr_add_c2(a,4,2,c1,c2,c3);
577	$LD		r6,`4*$BNSZ`(r4)
578	$UMULL		r7,r5,r6
579	$UMULH		r8,r5,r6
580
581	addc		r9,r7,r9
582	adde		r10,r8,r10
583	addze		r11,r11
584
585	addc		r9,r7,r9
586	adde		r10,r8,r10
587	addze		r11,r11
588						#sqr_add_c2(a,5,1,c1,c2,c3);
589	$LD		r5,`1*$BNSZ`(r4)
590	$LD		r6,`5*$BNSZ`(r4)
591	$UMULL		r7,r5,r6
592	$UMULH		r8,r5,r6
593
594	addc		r9,r7,r9
595	adde		r10,r8,r10
596	addze		r11,r11
597
598	addc		r9,r7,r9
599	adde		r10,r8,r10
600	addze		r11,r11
601						#sqr_add_c2(a,6,0,c1,c2,c3);
602	$LD		r5,`0*$BNSZ`(r4)
603	$LD		r6,`6*$BNSZ`(r4)
604	$UMULL		r7,r5,r6
605	$UMULH		r8,r5,r6
606	addc		r9,r7,r9
607	adde		r10,r8,r10
608	addze		r11,r11
609	addc		r9,r7,r9
610	adde		r10,r8,r10
611	addze		r11,r11
612	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
613						#sqr_add_c2(a,7,0,c2,c3,c1);
614	$LD		r6,`7*$BNSZ`(r4)
615	$UMULL		r7,r5,r6
616	$UMULH		r8,r5,r6
617
618	addc		r10,r7,r10
619	adde		r11,r8,r11
620	addze		r9,r0
621	addc		r10,r7,r10
622	adde		r11,r8,r11
623	addze		r9,r9
624						#sqr_add_c2(a,6,1,c2,c3,c1);
625	$LD		r5,`1*$BNSZ`(r4)
626	$LD		r6,`6*$BNSZ`(r4)
627	$UMULL		r7,r5,r6
628	$UMULH		r8,r5,r6
629
630	addc		r10,r7,r10
631	adde		r11,r8,r11
632	addze		r9,r9
633	addc		r10,r7,r10
634	adde		r11,r8,r11
635	addze		r9,r9
636						#sqr_add_c2(a,5,2,c2,c3,c1);
637	$LD		r5,`2*$BNSZ`(r4)
638	$LD		r6,`5*$BNSZ`(r4)
639	$UMULL		r7,r5,r6
640	$UMULH		r8,r5,r6
641	addc		r10,r7,r10
642	adde		r11,r8,r11
643	addze		r9,r9
644	addc		r10,r7,r10
645	adde		r11,r8,r11
646	addze		r9,r9
647						#sqr_add_c2(a,4,3,c2,c3,c1);
648	$LD		r5,`3*$BNSZ`(r4)
649	$LD		r6,`4*$BNSZ`(r4)
650	$UMULL		r7,r5,r6
651	$UMULH		r8,r5,r6
652
653	addc		r10,r7,r10
654	adde		r11,r8,r11
655	addze		r9,r9
656	addc		r10,r7,r10
657	adde		r11,r8,r11
658	addze		r9,r9
659	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
660						#sqr_add_c(a,4,c3,c1,c2);
661	$UMULL		r7,r6,r6
662	$UMULH		r8,r6,r6
663	addc		r11,r7,r11
664	adde		r9,r8,r9
665	addze		r10,r0
666						#sqr_add_c2(a,5,3,c3,c1,c2);
667	$LD		r6,`5*$BNSZ`(r4)
668	$UMULL		r7,r5,r6
669	$UMULH		r8,r5,r6
670	addc		r11,r7,r11
671	adde		r9,r8,r9
672	addze		r10,r10
673	addc		r11,r7,r11
674	adde		r9,r8,r9
675	addze		r10,r10
676						#sqr_add_c2(a,6,2,c3,c1,c2);
677	$LD		r5,`2*$BNSZ`(r4)
678	$LD		r6,`6*$BNSZ`(r4)
679	$UMULL		r7,r5,r6
680	$UMULH		r8,r5,r6
681	addc		r11,r7,r11
682	adde		r9,r8,r9
683	addze		r10,r10
684
685	addc		r11,r7,r11
686	adde		r9,r8,r9
687	addze		r10,r10
688						#sqr_add_c2(a,7,1,c3,c1,c2);
689	$LD		r5,`1*$BNSZ`(r4)
690	$LD		r6,`7*$BNSZ`(r4)
691	$UMULL		r7,r5,r6
692	$UMULH		r8,r5,r6
693	addc		r11,r7,r11
694	adde		r9,r8,r9
695	addze		r10,r10
696	addc		r11,r7,r11
697	adde		r9,r8,r9
698	addze		r10,r10
699	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
700						#sqr_add_c2(a,7,2,c1,c2,c3);
701	$LD		r5,`2*$BNSZ`(r4)
702	$UMULL		r7,r5,r6
703	$UMULH		r8,r5,r6
704
705	addc		r9,r7,r9
706	adde		r10,r8,r10
707	addze		r11,r0
708	addc		r9,r7,r9
709	adde		r10,r8,r10
710	addze		r11,r11
711						#sqr_add_c2(a,6,3,c1,c2,c3);
712	$LD		r5,`3*$BNSZ`(r4)
713	$LD		r6,`6*$BNSZ`(r4)
714	$UMULL		r7,r5,r6
715	$UMULH		r8,r5,r6
716	addc		r9,r7,r9
717	adde		r10,r8,r10
718	addze		r11,r11
719	addc		r9,r7,r9
720	adde		r10,r8,r10
721	addze		r11,r11
722						#sqr_add_c2(a,5,4,c1,c2,c3);
723	$LD		r5,`4*$BNSZ`(r4)
724	$LD		r6,`5*$BNSZ`(r4)
725	$UMULL		r7,r5,r6
726	$UMULH		r8,r5,r6
727	addc		r9,r7,r9
728	adde		r10,r8,r10
729	addze		r11,r11
730	addc		r9,r7,r9
731	adde		r10,r8,r10
732	addze		r11,r11
733	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
734						#sqr_add_c(a,5,c2,c3,c1);
735	$UMULL		r7,r6,r6
736	$UMULH		r8,r6,r6
737	addc		r10,r7,r10
738	adde		r11,r8,r11
739	addze		r9,r0
740						#sqr_add_c2(a,6,4,c2,c3,c1);
741	$LD		r6,`6*$BNSZ`(r4)
742	$UMULL		r7,r5,r6
743	$UMULH		r8,r5,r6
744	addc		r10,r7,r10
745	adde		r11,r8,r11
746	addze		r9,r9
747	addc		r10,r7,r10
748	adde		r11,r8,r11
749	addze		r9,r9
750						#sqr_add_c2(a,7,3,c2,c3,c1);
751	$LD		r5,`3*$BNSZ`(r4)
752	$LD		r6,`7*$BNSZ`(r4)
753	$UMULL		r7,r5,r6
754	$UMULH		r8,r5,r6
755	addc		r10,r7,r10
756	adde		r11,r8,r11
757	addze		r9,r9
758	addc		r10,r7,r10
759	adde		r11,r8,r11
760	addze		r9,r9
761	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
762						#sqr_add_c2(a,7,4,c3,c1,c2);
763	$LD		r5,`4*$BNSZ`(r4)
764	$UMULL		r7,r5,r6
765	$UMULH		r8,r5,r6
766	addc		r11,r7,r11
767	adde		r9,r8,r9
768	addze		r10,r0
769	addc		r11,r7,r11
770	adde		r9,r8,r9
771	addze		r10,r10
772						#sqr_add_c2(a,6,5,c3,c1,c2);
773	$LD		r5,`5*$BNSZ`(r4)
774	$LD		r6,`6*$BNSZ`(r4)
775	$UMULL		r7,r5,r6
776	$UMULH		r8,r5,r6
777	addc		r11,r7,r11
778	adde		r9,r8,r9
779	addze		r10,r10
780	addc		r11,r7,r11
781	adde		r9,r8,r9
782	addze		r10,r10
783	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
784						#sqr_add_c(a,6,c1,c2,c3);
785	$UMULL		r7,r6,r6
786	$UMULH		r8,r6,r6
787	addc		r9,r7,r9
788	adde		r10,r8,r10
789	addze		r11,r0
790						#sqr_add_c2(a,7,5,c1,c2,c3)
791	$LD		r6,`7*$BNSZ`(r4)
792	$UMULL		r7,r5,r6
793	$UMULH		r8,r5,r6
794	addc		r9,r7,r9
795	adde		r10,r8,r10
796	addze		r11,r11
797	addc		r9,r7,r9
798	adde		r10,r8,r10
799	addze		r11,r11
800	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
801
802						#sqr_add_c2(a,7,6,c2,c3,c1)
803	$LD		r5,`6*$BNSZ`(r4)
804	$UMULL		r7,r5,r6
805	$UMULH		r8,r5,r6
806	addc		r10,r7,r10
807	adde		r11,r8,r11
808	addze		r9,r0
809	addc		r10,r7,r10
810	adde		r11,r8,r11
811	addze		r9,r9
812	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
813						#sqr_add_c(a,7,c3,c1,c2);
814	$UMULL		r7,r6,r6
815	$UMULH		r8,r6,r6
816	addc		r11,r7,r11
817	adde		r9,r8,r9
818	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
819	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
820
821
822	blr
823	.long	0
824	.byte	0,12,0x14,0,0,0,2,0
825	.long	0
826.size	.bn_sqr_comba8,.-.bn_sqr_comba8
827
828#
829#	NOTE:	The following label name should be changed to
830#		"bn_mul_comba4" i.e. remove the first dot
831#		for the gcc compiler. This should be automatically
832#		done in the build
833#
834
835.align	4
836.bn_mul_comba4:
837#
838# This is an optimized version of the bn_mul_comba4 routine.
839#
840# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
841# r3 contains r
842# r4 contains a
843# r5 contains b
844# r6, r7 are the 2 BN_ULONGs being multiplied.
845# r8, r9 are the results of the 32x32 giving 64-bit multiply.
846# r10, r11, r12 are the equivalents of c1, c2, and c3.
847#
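# The mul_add_c(a[i],b[j],...) steps named in the comments below accumulate
# one partial product into the running three-word column sum.  Roughly, in C
# (an illustrative sketch for the 32-bit flavour, not the exact OpenSSL macro):
#
#	static void mul_add_c(uint32_t ai, uint32_t bj,
#	                      uint32_t *c1, uint32_t *c2, uint32_t *c3)
#	{
#	    uint64_t t  = (uint64_t)ai * bj;
#	    uint64_t lo = (uint64_t)*c1 + (uint32_t)t;
#	    uint64_t hi = (uint64_t)*c2 + (uint32_t)(t >> 32) + (uint32_t)(lo >> 32);
#	    *c1  = (uint32_t)lo;
#	    *c2  = (uint32_t)hi;
#	    *c3 += (uint32_t)(hi >> 32);
#	}
#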
848	xor	r0,r0,r0		#r0=0. Used in addze below.
849					#mul_add_c(a[0],b[0],c1,c2,c3);
850	$LD	r6,`0*$BNSZ`(r4)
851	$LD	r7,`0*$BNSZ`(r5)
852	$UMULL	r10,r6,r7
853	$UMULH	r11,r6,r7
854	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
855					#mul_add_c(a[0],b[1],c2,c3,c1);
856	$LD	r7,`1*$BNSZ`(r5)
857	$UMULL	r8,r6,r7
858	$UMULH	r9,r6,r7
859	addc	r11,r8,r11
860	adde	r12,r9,r0
861	addze	r10,r0
862					#mul_add_c(a[1],b[0],c2,c3,c1);
863	$LD	r6, `1*$BNSZ`(r4)
864	$LD	r7, `0*$BNSZ`(r5)
865	$UMULL	r8,r6,r7
866	$UMULH	r9,r6,r7
867	addc	r11,r8,r11
868	adde	r12,r9,r12
869	addze	r10,r10
870	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
871					#mul_add_c(a[2],b[0],c3,c1,c2);
872	$LD	r6,`2*$BNSZ`(r4)
873	$UMULL	r8,r6,r7
874	$UMULH	r9,r6,r7
875	addc	r12,r8,r12
876	adde	r10,r9,r10
877	addze	r11,r0
878					#mul_add_c(a[1],b[1],c3,c1,c2);
879	$LD	r6,`1*$BNSZ`(r4)
880	$LD	r7,`1*$BNSZ`(r5)
881	$UMULL	r8,r6,r7
882	$UMULH	r9,r6,r7
883	addc	r12,r8,r12
884	adde	r10,r9,r10
885	addze	r11,r11
886					#mul_add_c(a[0],b[2],c3,c1,c2);
887	$LD	r6,`0*$BNSZ`(r4)
888	$LD	r7,`2*$BNSZ`(r5)
889	$UMULL	r8,r6,r7
890	$UMULH	r9,r6,r7
891	addc	r12,r8,r12
892	adde	r10,r9,r10
893	addze	r11,r11
894	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
895					#mul_add_c(a[0],b[3],c1,c2,c3);
896	$LD	r7,`3*$BNSZ`(r5)
897	$UMULL	r8,r6,r7
898	$UMULH	r9,r6,r7
899	addc	r10,r8,r10
900	adde	r11,r9,r11
901	addze	r12,r0
902					#mul_add_c(a[1],b[2],c1,c2,c3);
903	$LD	r6,`1*$BNSZ`(r4)
904	$LD	r7,`2*$BNSZ`(r5)
905	$UMULL	r8,r6,r7
906	$UMULH	r9,r6,r7
907	addc	r10,r8,r10
908	adde	r11,r9,r11
909	addze	r12,r12
910					#mul_add_c(a[2],b[1],c1,c2,c3);
911	$LD	r6,`2*$BNSZ`(r4)
912	$LD	r7,`1*$BNSZ`(r5)
913	$UMULL	r8,r6,r7
914	$UMULH	r9,r6,r7
915	addc	r10,r8,r10
916	adde	r11,r9,r11
917	addze	r12,r12
918					#mul_add_c(a[3],b[0],c1,c2,c3);
919	$LD	r6,`3*$BNSZ`(r4)
920	$LD	r7,`0*$BNSZ`(r5)
921	$UMULL	r8,r6,r7
922	$UMULH	r9,r6,r7
923	addc	r10,r8,r10
924	adde	r11,r9,r11
925	addze	r12,r12
926	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
927					#mul_add_c(a[3],b[1],c2,c3,c1);
928	$LD	r7,`1*$BNSZ`(r5)
929	$UMULL	r8,r6,r7
930	$UMULH	r9,r6,r7
931	addc	r11,r8,r11
932	adde	r12,r9,r12
933	addze	r10,r0
934					#mul_add_c(a[2],b[2],c2,c3,c1);
935	$LD	r6,`2*$BNSZ`(r4)
936	$LD	r7,`2*$BNSZ`(r5)
937	$UMULL	r8,r6,r7
938	$UMULH	r9,r6,r7
939	addc	r11,r8,r11
940	adde	r12,r9,r12
941	addze	r10,r10
942					#mul_add_c(a[1],b[3],c2,c3,c1);
943	$LD	r6,`1*$BNSZ`(r4)
944	$LD	r7,`3*$BNSZ`(r5)
945	$UMULL	r8,r6,r7
946	$UMULH	r9,r6,r7
947	addc	r11,r8,r11
948	adde	r12,r9,r12
949	addze	r10,r10
950	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
951					#mul_add_c(a[2],b[3],c3,c1,c2);
952	$LD	r6,`2*$BNSZ`(r4)
953	$UMULL	r8,r6,r7
954	$UMULH	r9,r6,r7
955	addc	r12,r8,r12
956	adde	r10,r9,r10
957	addze	r11,r0
958					#mul_add_c(a[3],b[2],c3,c1,c2);
959	$LD	r6,`3*$BNSZ`(r4)
960	$LD	r7,`2*$BNSZ`(r5)
961	$UMULL	r8,r6,r7
962	$UMULH	r9,r6,r7
963	addc	r12,r8,r12
964	adde	r10,r9,r10
965	addze	r11,r11
966	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
967					#mul_add_c(a[3],b[3],c1,c2,c3);
968	$LD	r7,`3*$BNSZ`(r5)
969	$UMULL	r8,r6,r7
970	$UMULH	r9,r6,r7
971	addc	r10,r8,r10
972	adde	r11,r9,r11
973
974	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
975	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
976	blr
977	.long	0
978	.byte	0,12,0x14,0,0,0,3,0
979	.long	0
980.size	.bn_mul_comba4,.-.bn_mul_comba4
981
982#
983#	NOTE:	The following label name should be changed to
984#		"bn_mul_comba8" i.e. remove the first dot
985#		for the gcc compiler. This should be automatically
986#		done in the build
987#
988
989.align	4
990.bn_mul_comba8:
991#
992# Optimized version of the bn_mul_comba8 routine.
993#
994# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
995# r3 contains r
996# r4 contains a
997# r5 contains b
998# r6, r7 are the 2 BN_ULONGs being multiplied.
999# r8, r9 are the results of the 32x32 giving 64-bit multiply.
1000# r10, r11, r12 are the equivalents of c1, c2, and c3.
1001#
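# As with the squaring routines, the steps below are a fully unrolled
# column-by-column (comba) product.  A rough C equivalent, using the
# mul_add_c helper sketched before .bn_mul_comba4 (mul_comba_ref is a
# made-up name, not OpenSSL code):
#
#	static void mul_comba_ref(uint32_t *r, const uint32_t *a,
#	                          const uint32_t *b, int n)
#	{
#	    uint32_t c1 = 0, c2 = 0, c3 = 0;
#	    for (int k = 0; k < 2*n - 1; k++) {
#	        int i = (k < n) ? k : n - 1;
#	        for (int j = k - i; i >= 0 && j < n; i--, j++)
#	            mul_add_c(a[i], b[j], &c1, &c2, &c3);
#	        r[k] = c1;
#	        c1 = c2; c2 = c3; c3 = 0;
#	    }
#	    r[2*n - 1] = c1;
#	}
#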
1002	xor	r0,r0,r0		#r0=0. Used in addze below.
1003
1004					#mul_add_c(a[0],b[0],c1,c2,c3);
1005	$LD	r6,`0*$BNSZ`(r4)	#a[0]
1006	$LD	r7,`0*$BNSZ`(r5)	#b[0]
1007	$UMULL	r10,r6,r7
1008	$UMULH	r11,r6,r7
1009	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
1010					#mul_add_c(a[0],b[1],c2,c3,c1);
1011	$LD	r7,`1*$BNSZ`(r5)
1012	$UMULL	r8,r6,r7
1013	$UMULH	r9,r6,r7
1014	addc	r11,r11,r8
1015	addze	r12,r9			# since we didn't set r12 to zero before.
1016	addze	r10,r0
1017					#mul_add_c(a[1],b[0],c2,c3,c1);
1018	$LD	r6,`1*$BNSZ`(r4)
1019	$LD	r7,`0*$BNSZ`(r5)
1020	$UMULL	r8,r6,r7
1021	$UMULH	r9,r6,r7
1022	addc	r11,r11,r8
1023	adde	r12,r12,r9
1024	addze	r10,r10
1025	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1026					#mul_add_c(a[2],b[0],c3,c1,c2);
1027	$LD	r6,`2*$BNSZ`(r4)
1028	$UMULL	r8,r6,r7
1029	$UMULH	r9,r6,r7
1030	addc	r12,r12,r8
1031	adde	r10,r10,r9
1032	addze	r11,r0
1033					#mul_add_c(a[1],b[1],c3,c1,c2);
1034	$LD	r6,`1*$BNSZ`(r4)
1035	$LD	r7,`1*$BNSZ`(r5)
1036	$UMULL	r8,r6,r7
1037	$UMULH	r9,r6,r7
1038	addc	r12,r12,r8
1039	adde	r10,r10,r9
1040	addze	r11,r11
1041					#mul_add_c(a[0],b[2],c3,c1,c2);
1042	$LD	r6,`0*$BNSZ`(r4)
1043	$LD	r7,`2*$BNSZ`(r5)
1044	$UMULL	r8,r6,r7
1045	$UMULH	r9,r6,r7
1046	addc	r12,r12,r8
1047	adde	r10,r10,r9
1048	addze	r11,r11
1049	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1050					#mul_add_c(a[0],b[3],c1,c2,c3);
1051	$LD	r7,`3*$BNSZ`(r5)
1052	$UMULL	r8,r6,r7
1053	$UMULH	r9,r6,r7
1054	addc	r10,r10,r8
1055	adde	r11,r11,r9
1056	addze	r12,r0
1057					#mul_add_c(a[1],b[2],c1,c2,c3);
1058	$LD	r6,`1*$BNSZ`(r4)
1059	$LD	r7,`2*$BNSZ`(r5)
1060	$UMULL	r8,r6,r7
1061	$UMULH	r9,r6,r7
1062	addc	r10,r10,r8
1063	adde	r11,r11,r9
1064	addze	r12,r12
1065
1066					#mul_add_c(a[2],b[1],c1,c2,c3);
1067	$LD	r6,`2*$BNSZ`(r4)
1068	$LD	r7,`1*$BNSZ`(r5)
1069	$UMULL	r8,r6,r7
1070	$UMULH	r9,r6,r7
1071	addc	r10,r10,r8
1072	adde	r11,r11,r9
1073	addze	r12,r12
1074					#mul_add_c(a[3],b[0],c1,c2,c3);
1075	$LD	r6,`3*$BNSZ`(r4)
1076	$LD	r7,`0*$BNSZ`(r5)
1077	$UMULL	r8,r6,r7
1078	$UMULH	r9,r6,r7
1079	addc	r10,r10,r8
1080	adde	r11,r11,r9
1081	addze	r12,r12
1082	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1083					#mul_add_c(a[4],b[0],c2,c3,c1);
1084	$LD	r6,`4*$BNSZ`(r4)
1085	$UMULL	r8,r6,r7
1086	$UMULH	r9,r6,r7
1087	addc	r11,r11,r8
1088	adde	r12,r12,r9
1089	addze	r10,r0
1090					#mul_add_c(a[3],b[1],c2,c3,c1);
1091	$LD	r6,`3*$BNSZ`(r4)
1092	$LD	r7,`1*$BNSZ`(r5)
1093	$UMULL	r8,r6,r7
1094	$UMULH	r9,r6,r7
1095	addc	r11,r11,r8
1096	adde	r12,r12,r9
1097	addze	r10,r10
1098					#mul_add_c(a[2],b[2],c2,c3,c1);
1099	$LD	r6,`2*$BNSZ`(r4)
1100	$LD	r7,`2*$BNSZ`(r5)
1101	$UMULL	r8,r6,r7
1102	$UMULH	r9,r6,r7
1103	addc	r11,r11,r8
1104	adde	r12,r12,r9
1105	addze	r10,r10
1106					#mul_add_c(a[1],b[3],c2,c3,c1);
1107	$LD	r6,`1*$BNSZ`(r4)
1108	$LD	r7,`3*$BNSZ`(r5)
1109	$UMULL	r8,r6,r7
1110	$UMULH	r9,r6,r7
1111	addc	r11,r11,r8
1112	adde	r12,r12,r9
1113	addze	r10,r10
1114					#mul_add_c(a[0],b[4],c2,c3,c1);
1115	$LD	r6,`0*$BNSZ`(r4)
1116	$LD	r7,`4*$BNSZ`(r5)
1117	$UMULL	r8,r6,r7
1118	$UMULH	r9,r6,r7
1119	addc	r11,r11,r8
1120	adde	r12,r12,r9
1121	addze	r10,r10
1122	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1123					#mul_add_c(a[0],b[5],c3,c1,c2);
1124	$LD	r7,`5*$BNSZ`(r5)
1125	$UMULL	r8,r6,r7
1126	$UMULH	r9,r6,r7
1127	addc	r12,r12,r8
1128	adde	r10,r10,r9
1129	addze	r11,r0
1130					#mul_add_c(a[1],b[4],c3,c1,c2);
1131	$LD	r6,`1*$BNSZ`(r4)
1132	$LD	r7,`4*$BNSZ`(r5)
1133	$UMULL	r8,r6,r7
1134	$UMULH	r9,r6,r7
1135	addc	r12,r12,r8
1136	adde	r10,r10,r9
1137	addze	r11,r11
1138					#mul_add_c(a[2],b[3],c3,c1,c2);
1139	$LD	r6,`2*$BNSZ`(r4)
1140	$LD	r7,`3*$BNSZ`(r5)
1141	$UMULL	r8,r6,r7
1142	$UMULH	r9,r6,r7
1143	addc	r12,r12,r8
1144	adde	r10,r10,r9
1145	addze	r11,r11
1146					#mul_add_c(a[3],b[2],c3,c1,c2);
1147	$LD	r6,`3*$BNSZ`(r4)
1148	$LD	r7,`2*$BNSZ`(r5)
1149	$UMULL	r8,r6,r7
1150	$UMULH	r9,r6,r7
1151	addc	r12,r12,r8
1152	adde	r10,r10,r9
1153	addze	r11,r11
1154					#mul_add_c(a[4],b[1],c3,c1,c2);
1155	$LD	r6,`4*$BNSZ`(r4)
1156	$LD	r7,`1*$BNSZ`(r5)
1157	$UMULL	r8,r6,r7
1158	$UMULH	r9,r6,r7
1159	addc	r12,r12,r8
1160	adde	r10,r10,r9
1161	addze	r11,r11
1162					#mul_add_c(a[5],b[0],c3,c1,c2);
1163	$LD	r6,`5*$BNSZ`(r4)
1164	$LD	r7,`0*$BNSZ`(r5)
1165	$UMULL	r8,r6,r7
1166	$UMULH	r9,r6,r7
1167	addc	r12,r12,r8
1168	adde	r10,r10,r9
1169	addze	r11,r11
1170	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1171					#mul_add_c(a[6],b[0],c1,c2,c3);
1172	$LD	r6,`6*$BNSZ`(r4)
1173	$UMULL	r8,r6,r7
1174	$UMULH	r9,r6,r7
1175	addc	r10,r10,r8
1176	adde	r11,r11,r9
1177	addze	r12,r0
1178					#mul_add_c(a[5],b[1],c1,c2,c3);
1179	$LD	r6,`5*$BNSZ`(r4)
1180	$LD	r7,`1*$BNSZ`(r5)
1181	$UMULL	r8,r6,r7
1182	$UMULH	r9,r6,r7
1183	addc	r10,r10,r8
1184	adde	r11,r11,r9
1185	addze	r12,r12
1186					#mul_add_c(a[4],b[2],c1,c2,c3);
1187	$LD	r6,`4*$BNSZ`(r4)
1188	$LD	r7,`2*$BNSZ`(r5)
1189	$UMULL	r8,r6,r7
1190	$UMULH	r9,r6,r7
1191	addc	r10,r10,r8
1192	adde	r11,r11,r9
1193	addze	r12,r12
1194					#mul_add_c(a[3],b[3],c1,c2,c3);
1195	$LD	r6,`3*$BNSZ`(r4)
1196	$LD	r7,`3*$BNSZ`(r5)
1197	$UMULL	r8,r6,r7
1198	$UMULH	r9,r6,r7
1199	addc	r10,r10,r8
1200	adde	r11,r11,r9
1201	addze	r12,r12
1202					#mul_add_c(a[2],b[4],c1,c2,c3);
1203	$LD	r6,`2*$BNSZ`(r4)
1204	$LD	r7,`4*$BNSZ`(r5)
1205	$UMULL	r8,r6,r7
1206	$UMULH	r9,r6,r7
1207	addc	r10,r10,r8
1208	adde	r11,r11,r9
1209	addze	r12,r12
1210					#mul_add_c(a[1],b[5],c1,c2,c3);
1211	$LD	r6,`1*$BNSZ`(r4)
1212	$LD	r7,`5*$BNSZ`(r5)
1213	$UMULL	r8,r6,r7
1214	$UMULH	r9,r6,r7
1215	addc	r10,r10,r8
1216	adde	r11,r11,r9
1217	addze	r12,r12
1218					#mul_add_c(a[0],b[6],c1,c2,c3);
1219	$LD	r6,`0*$BNSZ`(r4)
1220	$LD	r7,`6*$BNSZ`(r5)
1221	$UMULL	r8,r6,r7
1222	$UMULH	r9,r6,r7
1223	addc	r10,r10,r8
1224	adde	r11,r11,r9
1225	addze	r12,r12
1226	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1227					#mul_add_c(a[0],b[7],c2,c3,c1);
1228	$LD	r7,`7*$BNSZ`(r5)
1229	$UMULL	r8,r6,r7
1230	$UMULH	r9,r6,r7
1231	addc	r11,r11,r8
1232	adde	r12,r12,r9
1233	addze	r10,r0
1234					#mul_add_c(a[1],b[6],c2,c3,c1);
1235	$LD	r6,`1*$BNSZ`(r4)
1236	$LD	r7,`6*$BNSZ`(r5)
1237	$UMULL	r8,r6,r7
1238	$UMULH	r9,r6,r7
1239	addc	r11,r11,r8
1240	adde	r12,r12,r9
1241	addze	r10,r10
1242					#mul_add_c(a[2],b[5],c2,c3,c1);
1243	$LD	r6,`2*$BNSZ`(r4)
1244	$LD	r7,`5*$BNSZ`(r5)
1245	$UMULL	r8,r6,r7
1246	$UMULH	r9,r6,r7
1247	addc	r11,r11,r8
1248	adde	r12,r12,r9
1249	addze	r10,r10
1250					#mul_add_c(a[3],b[4],c2,c3,c1);
1251	$LD	r6,`3*$BNSZ`(r4)
1252	$LD	r7,`4*$BNSZ`(r5)
1253	$UMULL	r8,r6,r7
1254	$UMULH	r9,r6,r7
1255	addc	r11,r11,r8
1256	adde	r12,r12,r9
1257	addze	r10,r10
1258					#mul_add_c(a[4],b[3],c2,c3,c1);
1259	$LD	r6,`4*$BNSZ`(r4)
1260	$LD	r7,`3*$BNSZ`(r5)
1261	$UMULL	r8,r6,r7
1262	$UMULH	r9,r6,r7
1263	addc	r11,r11,r8
1264	adde	r12,r12,r9
1265	addze	r10,r10
1266					#mul_add_c(a[5],b[2],c2,c3,c1);
1267	$LD	r6,`5*$BNSZ`(r4)
1268	$LD	r7,`2*$BNSZ`(r5)
1269	$UMULL	r8,r6,r7
1270	$UMULH	r9,r6,r7
1271	addc	r11,r11,r8
1272	adde	r12,r12,r9
1273	addze	r10,r10
1274					#mul_add_c(a[6],b[1],c2,c3,c1);
1275	$LD	r6,`6*$BNSZ`(r4)
1276	$LD	r7,`1*$BNSZ`(r5)
1277	$UMULL	r8,r6,r7
1278	$UMULH	r9,r6,r7
1279	addc	r11,r11,r8
1280	adde	r12,r12,r9
1281	addze	r10,r10
1282					#mul_add_c(a[7],b[0],c2,c3,c1);
1283	$LD	r6,`7*$BNSZ`(r4)
1284	$LD	r7,`0*$BNSZ`(r5)
1285	$UMULL	r8,r6,r7
1286	$UMULH	r9,r6,r7
1287	addc	r11,r11,r8
1288	adde	r12,r12,r9
1289	addze	r10,r10
1290	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1291					#mul_add_c(a[7],b[1],c3,c1,c2);
1292	$LD	r7,`1*$BNSZ`(r5)
1293	$UMULL	r8,r6,r7
1294	$UMULH	r9,r6,r7
1295	addc	r12,r12,r8
1296	adde	r10,r10,r9
1297	addze	r11,r0
1298					#mul_add_c(a[6],b[2],c3,c1,c2);
1299	$LD	r6,`6*$BNSZ`(r4)
1300	$LD	r7,`2*$BNSZ`(r5)
1301	$UMULL	r8,r6,r7
1302	$UMULH	r9,r6,r7
1303	addc	r12,r12,r8
1304	adde	r10,r10,r9
1305	addze	r11,r11
1306					#mul_add_c(a[5],b[3],c3,c1,c2);
1307	$LD	r6,`5*$BNSZ`(r4)
1308	$LD	r7,`3*$BNSZ`(r5)
1309	$UMULL	r8,r6,r7
1310	$UMULH	r9,r6,r7
1311	addc	r12,r12,r8
1312	adde	r10,r10,r9
1313	addze	r11,r11
1314					#mul_add_c(a[4],b[4],c3,c1,c2);
1315	$LD	r6,`4*$BNSZ`(r4)
1316	$LD	r7,`4*$BNSZ`(r5)
1317	$UMULL	r8,r6,r7
1318	$UMULH	r9,r6,r7
1319	addc	r12,r12,r8
1320	adde	r10,r10,r9
1321	addze	r11,r11
1322					#mul_add_c(a[3],b[5],c3,c1,c2);
1323	$LD	r6,`3*$BNSZ`(r4)
1324	$LD	r7,`5*$BNSZ`(r5)
1325	$UMULL	r8,r6,r7
1326	$UMULH	r9,r6,r7
1327	addc	r12,r12,r8
1328	adde	r10,r10,r9
1329	addze	r11,r11
1330					#mul_add_c(a[2],b[6],c3,c1,c2);
1331	$LD	r6,`2*$BNSZ`(r4)
1332	$LD	r7,`6*$BNSZ`(r5)
1333	$UMULL	r8,r6,r7
1334	$UMULH	r9,r6,r7
1335	addc	r12,r12,r8
1336	adde	r10,r10,r9
1337	addze	r11,r11
1338					#mul_add_c(a[1],b[7],c3,c1,c2);
1339	$LD	r6,`1*$BNSZ`(r4)
1340	$LD	r7,`7*$BNSZ`(r5)
1341	$UMULL	r8,r6,r7
1342	$UMULH	r9,r6,r7
1343	addc	r12,r12,r8
1344	adde	r10,r10,r9
1345	addze	r11,r11
1346	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1347					#mul_add_c(a[2],b[7],c1,c2,c3);
1348	$LD	r6,`2*$BNSZ`(r4)
1349	$UMULL	r8,r6,r7
1350	$UMULH	r9,r6,r7
1351	addc	r10,r10,r8
1352	adde	r11,r11,r9
1353	addze	r12,r0
1354					#mul_add_c(a[3],b[6],c1,c2,c3);
1355	$LD	r6,`3*$BNSZ`(r4)
1356	$LD	r7,`6*$BNSZ`(r5)
1357	$UMULL	r8,r6,r7
1358	$UMULH	r9,r6,r7
1359	addc	r10,r10,r8
1360	adde	r11,r11,r9
1361	addze	r12,r12
1362					#mul_add_c(a[4],b[5],c1,c2,c3);
1363	$LD	r6,`4*$BNSZ`(r4)
1364	$LD	r7,`5*$BNSZ`(r5)
1365	$UMULL	r8,r6,r7
1366	$UMULH	r9,r6,r7
1367	addc	r10,r10,r8
1368	adde	r11,r11,r9
1369	addze	r12,r12
1370					#mul_add_c(a[5],b[4],c1,c2,c3);
1371	$LD	r6,`5*$BNSZ`(r4)
1372	$LD	r7,`4*$BNSZ`(r5)
1373	$UMULL	r8,r6,r7
1374	$UMULH	r9,r6,r7
1375	addc	r10,r10,r8
1376	adde	r11,r11,r9
1377	addze	r12,r12
1378					#mul_add_c(a[6],b[3],c1,c2,c3);
1379	$LD	r6,`6*$BNSZ`(r4)
1380	$LD	r7,`3*$BNSZ`(r5)
1381	$UMULL	r8,r6,r7
1382	$UMULH	r9,r6,r7
1383	addc	r10,r10,r8
1384	adde	r11,r11,r9
1385	addze	r12,r12
1386					#mul_add_c(a[7],b[2],c1,c2,c3);
1387	$LD	r6,`7*$BNSZ`(r4)
1388	$LD	r7,`2*$BNSZ`(r5)
1389	$UMULL	r8,r6,r7
1390	$UMULH	r9,r6,r7
1391	addc	r10,r10,r8
1392	adde	r11,r11,r9
1393	addze	r12,r12
1394	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1395					#mul_add_c(a[7],b[3],c2,c3,c1);
1396	$LD	r7,`3*$BNSZ`(r5)
1397	$UMULL	r8,r6,r7
1398	$UMULH	r9,r6,r7
1399	addc	r11,r11,r8
1400	adde	r12,r12,r9
1401	addze	r10,r0
1402					#mul_add_c(a[6],b[4],c2,c3,c1);
1403	$LD	r6,`6*$BNSZ`(r4)
1404	$LD	r7,`4*$BNSZ`(r5)
1405	$UMULL	r8,r6,r7
1406	$UMULH	r9,r6,r7
1407	addc	r11,r11,r8
1408	adde	r12,r12,r9
1409	addze	r10,r10
1410					#mul_add_c(a[5],b[5],c2,c3,c1);
1411	$LD	r6,`5*$BNSZ`(r4)
1412	$LD	r7,`5*$BNSZ`(r5)
1413	$UMULL	r8,r6,r7
1414	$UMULH	r9,r6,r7
1415	addc	r11,r11,r8
1416	adde	r12,r12,r9
1417	addze	r10,r10
1418					#mul_add_c(a[4],b[6],c2,c3,c1);
1419	$LD	r6,`4*$BNSZ`(r4)
1420	$LD	r7,`6*$BNSZ`(r5)
1421	$UMULL	r8,r6,r7
1422	$UMULH	r9,r6,r7
1423	addc	r11,r11,r8
1424	adde	r12,r12,r9
1425	addze	r10,r10
1426					#mul_add_c(a[3],b[7],c2,c3,c1);
1427	$LD	r6,`3*$BNSZ`(r4)
1428	$LD	r7,`7*$BNSZ`(r5)
1429	$UMULL	r8,r6,r7
1430	$UMULH	r9,r6,r7
1431	addc	r11,r11,r8
1432	adde	r12,r12,r9
1433	addze	r10,r10
1434	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1435					#mul_add_c(a[4],b[7],c3,c1,c2);
1436	$LD	r6,`4*$BNSZ`(r4)
1437	$UMULL	r8,r6,r7
1438	$UMULH	r9,r6,r7
1439	addc	r12,r12,r8
1440	adde	r10,r10,r9
1441	addze	r11,r0
1442					#mul_add_c(a[5],b[6],c3,c1,c2);
1443	$LD	r6,`5*$BNSZ`(r4)
1444	$LD	r7,`6*$BNSZ`(r5)
1445	$UMULL	r8,r6,r7
1446	$UMULH	r9,r6,r7
1447	addc	r12,r12,r8
1448	adde	r10,r10,r9
1449	addze	r11,r11
1450					#mul_add_c(a[6],b[5],c3,c1,c2);
1451	$LD	r6,`6*$BNSZ`(r4)
1452	$LD	r7,`5*$BNSZ`(r5)
1453	$UMULL	r8,r6,r7
1454	$UMULH	r9,r6,r7
1455	addc	r12,r12,r8
1456	adde	r10,r10,r9
1457	addze	r11,r11
1458					#mul_add_c(a[7],b[4],c3,c1,c2);
1459	$LD	r6,`7*$BNSZ`(r4)
1460	$LD	r7,`4*$BNSZ`(r5)
1461	$UMULL	r8,r6,r7
1462	$UMULH	r9,r6,r7
1463	addc	r12,r12,r8
1464	adde	r10,r10,r9
1465	addze	r11,r11
1466	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1467					#mul_add_c(a[7],b[5],c1,c2,c3);
1468	$LD	r7,`5*$BNSZ`(r5)
1469	$UMULL	r8,r6,r7
1470	$UMULH	r9,r6,r7
1471	addc	r10,r10,r8
1472	adde	r11,r11,r9
1473	addze	r12,r0
1474					#mul_add_c(a[6],b[6],c1,c2,c3);
1475	$LD	r6,`6*$BNSZ`(r4)
1476	$LD	r7,`6*$BNSZ`(r5)
1477	$UMULL	r8,r6,r7
1478	$UMULH	r9,r6,r7
1479	addc	r10,r10,r8
1480	adde	r11,r11,r9
1481	addze	r12,r12
1482					#mul_add_c(a[5],b[7],c1,c2,c3);
1483	$LD	r6,`5*$BNSZ`(r4)
1484	$LD	r7,`7*$BNSZ`(r5)
1485	$UMULL	r8,r6,r7
1486	$UMULH	r9,r6,r7
1487	addc	r10,r10,r8
1488	adde	r11,r11,r9
1489	addze	r12,r12
1490	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1491					#mul_add_c(a[6],b[7],c2,c3,c1);
1492	$LD	r6,`6*$BNSZ`(r4)
1493	$UMULL	r8,r6,r7
1494	$UMULH	r9,r6,r7
1495	addc	r11,r11,r8
1496	adde	r12,r12,r9
1497	addze	r10,r0
1498					#mul_add_c(a[7],b[6],c2,c3,c1);
1499	$LD	r6,`7*$BNSZ`(r4)
1500	$LD	r7,`6*$BNSZ`(r5)
1501	$UMULL	r8,r6,r7
1502	$UMULH	r9,r6,r7
1503	addc	r11,r11,r8
1504	adde	r12,r12,r9
1505	addze	r10,r10
1506	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1507					#mul_add_c(a[7],b[7],c3,c1,c2);
1508	$LD	r7,`7*$BNSZ`(r5)
1509	$UMULL	r8,r6,r7
1510	$UMULH	r9,r6,r7
1511	addc	r12,r12,r8
1512	adde	r10,r10,r9
1513	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1514	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1515	blr
1516	.long	0
1517	.byte	0,12,0x14,0,0,0,3,0
1518	.long	0
1519.size	.bn_mul_comba8,.-.bn_mul_comba8
1520
1521#
1522#	NOTE:	The following label name should be changed to
1523#		"bn_sub_words" i.e. remove the first dot
1524#		for the gcc compiler. This should be automatically
1525#		done in the build
1526#
1527#
1528.align	4
1529.bn_sub_words:
1530#
1531#	Handcoded version of bn_sub_words
1532#
1533#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1534#
1535#	r3 = r
1536#	r4 = a
1537#	r5 = b
1538#	r6 = n
1539#
1540#       Note:	No loop unrolling done since this is not a performance
1541#               critical loop.
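#
#	For reference, the loop below computes, in C terms (a sketch for the
#	32-bit flavour; bn_sub_words_ref is a made-up name):
#
#	uint32_t bn_sub_words_ref(uint32_t *r, const uint32_t *a,
#	                          const uint32_t *b, int n)
#	{
#	    uint32_t borrow = 0;
#	    for (int i = 0; i < n; i++) {
#	        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
#	        r[i]   = (uint32_t)t;
#	        borrow = (uint32_t)(t >> 63);  /* 1 if the subtraction wrapped */
#	    }
#	    return borrow;                     /* what the final andi. extracts */
#	}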
1542
1543	xor	r0,r0,r0	#set r0 = 0
1544#
1545#	check for r6 = 0 AND set carry bit.
1546#
1547	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
1548				# if r6 > 0 then result !=0
1549				# In either case carry bit is set.
1550	beq	Lppcasm_sub_adios
1551	addi	r4,r4,-$BNSZ
1552	addi	r3,r3,-$BNSZ
1553	addi	r5,r5,-$BNSZ
1554	mtctr	r6
1555Lppcasm_sub_mainloop:
1556	$LDU	r7,$BNSZ(r4)
1557	$LDU	r8,$BNSZ(r5)
1558	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
1559				# if carry = 1 this is r7-r8. Else it
1560				# is r7-r8 -1 as we need.
1561	$STU	r6,$BNSZ(r3)
1562	bdnz	Lppcasm_sub_mainloop
1563Lppcasm_sub_adios:
1564	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1565	andi.	r3,r3,1         # keep only last bit.
1566	blr
1567	.long	0
1568	.byte	0,12,0x14,0,0,0,4,0
1569	.long	0
1570.size	.bn_sub_words,.-.bn_sub_words
1571
1572#
1573#	NOTE:	The following label name should be changed to
1574#		"bn_add_words" i.e. remove the first dot
1575#		for the gcc compiler. This should be automatically
1576#		done in the build
1577#
1578
1579.align	4
1580.bn_add_words:
1581#
1582#	Handcoded version of bn_add_words
1583#
1584#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1585#
1586#	r3 = r
1587#	r4 = a
1588#	r5 = b
1589#	r6 = n
1590#
1591#       Note:	No loop unrolling done since this is not a performance
1592#               critical loop.
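#
#	For reference, in C terms the loop below is (a sketch for the 32-bit
#	flavour; bn_add_words_ref is a made-up name):
#
#	uint32_t bn_add_words_ref(uint32_t *r, const uint32_t *a,
#	                          const uint32_t *b, int n)
#	{
#	    uint32_t carry = 0;
#	    for (int i = 0; i < n; i++) {
#	        uint64_t t = (uint64_t)a[i] + b[i] + carry;
#	        r[i]  = (uint32_t)t;
#	        carry = (uint32_t)(t >> 32);   /* returned carry-out */
#	    }
#	    return carry;
#	}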
1593
1594	xor	r0,r0,r0
1595#
1596#	check for r6 = 0. Is this needed?
1597#
1598	addic.	r6,r6,0		#test r6 and clear carry bit.
1599	beq	Lppcasm_add_adios
1600	addi	r4,r4,-$BNSZ
1601	addi	r3,r3,-$BNSZ
1602	addi	r5,r5,-$BNSZ
1603	mtctr	r6
1604Lppcasm_add_mainloop:
1605	$LDU	r7,$BNSZ(r4)
1606	$LDU	r8,$BNSZ(r5)
1607	adde	r8,r7,r8
1608	$STU	r8,$BNSZ(r3)
1609	bdnz	Lppcasm_add_mainloop
1610Lppcasm_add_adios:
1611	addze	r3,r0			#return carry bit.
1612	blr
1613	.long	0
1614	.byte	0,12,0x14,0,0,0,4,0
1615	.long	0
1616.size	.bn_add_words,.-.bn_add_words
1617
1618#
1619#	NOTE:	The following label name should be changed to
1620#		"bn_div_words" i.e. remove the first dot
1621#		for the gcc compiler. This should be automatically
1622#		done in the build
1623#
1624
1625.align	4
1626.bn_div_words:
1627#
1628#	This is a cleaned up version of code generated by
1629#	the AIX compiler. The only optimization is to use
1630#	the PPC instruction to count leading zeros instead
1631#	of call to num_bits_word. Since this was compiled
1632#	only at level -O2 we can possibly squeeze it more?
1633#
1634#	r3 = h
1635#	r4 = l
1636#	r5 = d
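#
#	What the routine computes, as a simplified C model (a sketch for the
#	32-bit flavour using a double-width divide; the assembly below instead
#	normalizes d with the count-leading-zeros instruction and builds the
#	quotient in two half-word digits so that only word-sized operations
#	are needed; bn_div_words_model is a made-up name):
#
#	uint32_t bn_div_words_model(uint32_t h, uint32_t l, uint32_t d)
#	{
#	    if (d == 0)
#	        return (uint32_t)-1;          /* mirrors the early "li r3,-1"  */
#	    uint64_t n = ((uint64_t)h << 32) | l;
#	    return (uint32_t)(n / d);         /* quotient assumed to fit, i.e. */
#	}                                     /* callers keep h < d            */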
1637
1638	$UCMPI	0,r5,0			# compare r5 and 0
1639	bne	Lppcasm_div1		# proceed if d!=0
1640	li	r3,-1			# d=0 return -1
1641	blr
1642Lppcasm_div1:
1643	xor	r0,r0,r0		#r0=0
1644	li	r8,$BITS
1645	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1646	beq	Lppcasm_div2		#proceed if no leading zeros
1647	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
1648	$SHR.	r9,r3,r8		#are there any bits above r8'th?
1649	$TR	16,r9,r0		#if there are, signal to dump core...
1650Lppcasm_div2:
1651	$UCMP	0,r3,r5			#h>=d?
1652	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
1653	subf	r3,r5,r3		#h-=d ;
1654Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
1655	cmpi	0,0,r7,0		# is (i == 0)?
1656	beq	Lppcasm_div4
1657	$SHL	r3,r3,r7		# h = (h<< i)
1658	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
1659	$SHL	r5,r5,r7		# d<<=i
1660	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1661	$SHL	r4,r4,r7		# l <<=i
1662Lppcasm_div4:
1663	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1664					# dl will be computed when needed
1665					# as it saves registers.
1666	li	r6,2			#r6=2
1667	mtctr	r6			#counter will be in count.
1668Lppcasm_divouterloop:
1669	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1670	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1671					# compute here for innerloop.
1672	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1673	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
1674
1675	li	r8,-1
1676	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
1677	b	Lppcasm_div6
1678Lppcasm_div5:
1679	$UDIV	r8,r3,r9		#q = h/dh
1680Lppcasm_div6:
1681	$UMULL	r12,r9,r8		#th = q*dh
1682	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1683	$UMULL	r6,r8,r10		#tl = q*dl
1684
1685Lppcasm_divinnerloop:
1686	subf	r10,r12,r3		#t = h -th
1687	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
1688	addic.	r7,r7,0			#test if r7 == 0. used below.
1689					# now want to compute
1690					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1691					# the following 2 instructions do that
1692	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1693	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
1694	$UCMP	cr1,r6,r7		# compare (tl <= r7)
1695	bne	Lppcasm_divinnerexit
1696	ble	cr1,Lppcasm_divinnerexit
1697	addi	r8,r8,-1		#q--
1698	subf	r12,r9,r12		#th -=dh
1699	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1700	subf	r6,r10,r6		#tl -=dl
1701	b	Lppcasm_divinnerloop
1702Lppcasm_divinnerexit:
1703	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1704	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1705	$UCMP	cr1,r4,r11		# compare l and tl
1706	add	r12,r12,r10		# th+=t
1707	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
1708	addi	r12,r12,1		# th++
1709Lppcasm_div7:
1710	subf	r11,r11,r4		#r11=l-tl
1711	$UCMP	cr1,r3,r12		#compare h and th
1712	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1713	addi	r8,r8,-1		# q--
1714	add	r3,r5,r3		# h+=d
1715Lppcasm_div8:
1716	subf	r12,r12,r3		#r12 = h-th
1717	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1718					# want to compute
1719					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1720					# the following 2 instructions will do this.
1721	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1722	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1723	bdz	Lppcasm_div9		#if (count==0) break ;
1724	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1725	b	Lppcasm_divouterloop
1726Lppcasm_div9:
1727	or	r3,r8,r0
1728	blr
1729	.long	0
1730	.byte	0,12,0x14,0,0,0,3,0
1731	.long	0
1732.size	.bn_div_words,.-.bn_div_words
1733
1734#
1735#	NOTE:	The following label name should be changed to
1736#		"bn_sqr_words" i.e. remove the first dot
1737#		for the gcc compiler. This should be automatically
1738#		done in the build
1739#
1740.align	4
1741.bn_sqr_words:
1742#
1743#	Optimized version of bn_sqr_words
1744#
1745#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1746#
1747#	r3 = r
1748#	r4 = a
1749#	r5 = n
1750#
1751#	r6 = a[i].
1752#	r7,r8 = product.
1753#
1754#	No unrolling done here. Not performance critical.
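#
#	In C terms the loop below is simply (a sketch for the 32-bit flavour;
#	bn_sqr_words_ref is a made-up name):
#
#	void bn_sqr_words_ref(uint32_t *r, const uint32_t *a, int n)
#	{
#	    for (int i = 0; i < n; i++) {
#	        uint64_t t = (uint64_t)a[i] * a[i];
#	        r[2*i]     = (uint32_t)t;          /* low word, stored first  */
#	        r[2*i + 1] = (uint32_t)(t >> 32);  /* high word, stored next  */
#	    }
#	}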
1755
1756	addic.	r5,r5,0			#test r5.
1757	beq	Lppcasm_sqr_adios
1758	addi	r4,r4,-$BNSZ
1759	addi	r3,r3,-$BNSZ
1760	mtctr	r5
1761Lppcasm_sqr_mainloop:
1762					#sqr(r[0],r[1],a[0]);
1763	$LDU	r6,$BNSZ(r4)
1764	$UMULL	r7,r6,r6
1765	$UMULH  r8,r6,r6
1766	$STU	r7,$BNSZ(r3)
1767	$STU	r8,$BNSZ(r3)
1768	bdnz	Lppcasm_sqr_mainloop
1769Lppcasm_sqr_adios:
1770	blr
1771	.long	0
1772	.byte	0,12,0x14,0,0,0,3,0
1773	.long	0
1774.size	.bn_sqr_words,.-.bn_sqr_words
1775
1776#
1777#	NOTE:	The following label name should be changed to
1778#		"bn_mul_words" i.e. remove the first dot
1779#		for the gcc compiler. This should be automatically
1780#		done in the build
1781#
1782
1783.align	4
1784.bn_mul_words:
1785#
1786# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1787#
1788# r3 = rp
1789# r4 = ap
1790# r5 = num
1791# r6 = w
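#
# In C terms the unrolled loop below computes (a sketch for the 32-bit
# flavour; bn_mul_words_ref is a made-up name):
#
#	uint32_t bn_mul_words_ref(uint32_t *rp, const uint32_t *ap,
#	                          int num, uint32_t w)
#	{
#	    uint32_t carry = 0;
#	    for (int i = 0; i < num; i++) {
#	        uint64_t t = (uint64_t)ap[i] * w + carry;
#	        rp[i] = (uint32_t)t;
#	        carry = (uint32_t)(t >> 32);
#	    }
#	    return carry;
#	}
#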
1792	xor	r0,r0,r0
1793	xor	r12,r12,r12		# used for carry
1794	rlwinm.	r7,r5,30,2,31		# num >> 2
1795	beq	Lppcasm_mw_REM
1796	mtctr	r7
1797Lppcasm_mw_LOOP:
1798					#mul(rp[0],ap[0],w,c1);
1799	$LD	r8,`0*$BNSZ`(r4)
1800	$UMULL	r9,r6,r8
1801	$UMULH  r10,r6,r8
1802	addc	r9,r9,r12
1803	#addze	r10,r10			#carry is NOT ignored.
1804					#will be taken care of
1805					#in second spin below
1806					#using adde.
1807	$ST	r9,`0*$BNSZ`(r3)
1808					#mul(rp[1],ap[1],w,c1);
1809	$LD	r8,`1*$BNSZ`(r4)
1810	$UMULL	r11,r6,r8
1811	$UMULH  r12,r6,r8
1812	adde	r11,r11,r10
1813	#addze	r12,r12
1814	$ST	r11,`1*$BNSZ`(r3)
1815					#mul(rp[2],ap[2],w,c1);
1816	$LD	r8,`2*$BNSZ`(r4)
1817	$UMULL	r9,r6,r8
1818	$UMULH  r10,r6,r8
1819	adde	r9,r9,r12
1820	#addze	r10,r10
1821	$ST	r9,`2*$BNSZ`(r3)
1822					#mul(rp[3],ap[3],w,c1);
1823	$LD	r8,`3*$BNSZ`(r4)
1824	$UMULL	r11,r6,r8
1825	$UMULH  r12,r6,r8
1826	adde	r11,r11,r10
1827	addze	r12,r12			#this spin we collect carry into
1828					#r12
1829	$ST	r11,`3*$BNSZ`(r3)
1830
1831	addi	r3,r3,`4*$BNSZ`
1832	addi	r4,r4,`4*$BNSZ`
1833	bdnz	Lppcasm_mw_LOOP
1834
1835Lppcasm_mw_REM:
1836	andi.	r5,r5,0x3
1837	beq	Lppcasm_mw_OVER
1838					#mul(rp[0],ap[0],w,c1);
1839	$LD	r8,`0*$BNSZ`(r4)
1840	$UMULL	r9,r6,r8
1841	$UMULH  r10,r6,r8
1842	addc	r9,r9,r12
1843	addze	r10,r10
1844	$ST	r9,`0*$BNSZ`(r3)
1845	addi	r12,r10,0
1846
1847	addi	r5,r5,-1
1848	cmpli	0,0,r5,0
1849	beq	Lppcasm_mw_OVER
1850
1851
1852					#mul(rp[1],ap[1],w,c1);
1853	$LD	r8,`1*$BNSZ`(r4)
1854	$UMULL	r9,r6,r8
1855	$UMULH  r10,r6,r8
1856	addc	r9,r9,r12
1857	addze	r10,r10
1858	$ST	r9,`1*$BNSZ`(r3)
1859	addi	r12,r10,0
1860
1861	addi	r5,r5,-1
1862	cmpli	0,0,r5,0
1863	beq	Lppcasm_mw_OVER
1864
1865					#mul(rp[2],ap[2],w,c1);
1866	$LD	r8,`2*$BNSZ`(r4)
1867	$UMULL	r9,r6,r8
1868	$UMULH  r10,r6,r8
1869	addc	r9,r9,r12
1870	addze	r10,r10
1871	$ST	r9,`2*$BNSZ`(r3)
1872	addi	r12,r10,0
1873
1874Lppcasm_mw_OVER:
1875	addi	r3,r12,0
1876	blr
1877	.long	0
1878	.byte	0,12,0x14,0,0,0,4,0
1879	.long	0
1880.size	.bn_mul_words,.-.bn_mul_words
1881
1882#
1883#	NOTE:	The following label name should be changed to
1884#		"bn_mul_add_words" i.e. remove the first dot
1885#		for the gcc compiler. This should be automatically
1886#		done in the build
1887#
1888
1889.align	4
1890.bn_mul_add_words:
1891#
1892# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1893#
1894# r3 = rp
1895# r4 = ap
1896# r5 = num
1897# r6 = w
1898#
1899# empirical evidence suggests that the unrolled version performs best!!
1900#
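# In C terms the unrolled loop below computes (a sketch for the 32-bit
# flavour; bn_mul_add_words_ref is a made-up name):
#
#	uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
#	                              int num, uint32_t w)
#	{
#	    uint32_t carry = 0;
#	    for (int i = 0; i < num; i++) {
#	        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
#	        rp[i] = (uint32_t)t;           /* a[i]*w accumulated into r[i] */
#	        carry = (uint32_t)(t >> 32);   /* cannot overflow 64 bits      */
#	    }
#	    return carry;
#	}
#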
1901	xor	r0,r0,r0		#r0 = 0
1902	xor	r12,r12,r12  		#r12 = 0 . used for carry
1903	rlwinm.	r7,r5,30,2,31		# num >> 2
1904	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
1905	mtctr	r7
1906Lppcasm_maw_mainloop:
1907					#mul_add(rp[0],ap[0],w,c1);
1908	$LD	r8,`0*$BNSZ`(r4)
1909	$LD	r11,`0*$BNSZ`(r3)
1910	$UMULL	r9,r6,r8
1911	$UMULH  r10,r6,r8
1912	addc	r9,r9,r12		#r12 is carry.
1913	addze	r10,r10
1914	addc	r9,r9,r11
1915	#addze	r10,r10
1916					#the above instruction addze
1917					#is NOT needed. Carry will NOT
1918					#be ignored. It's not affected
1919					#by multiply and will be collected
1920					#in the next spin
1921	$ST	r9,`0*$BNSZ`(r3)
1922
1923					#mul_add(rp[1],ap[1],w,c1);
1924	$LD	r8,`1*$BNSZ`(r4)
1925	$LD	r9,`1*$BNSZ`(r3)
1926	$UMULL	r11,r6,r8
1927	$UMULH  r12,r6,r8
1928	adde	r11,r11,r10		#r10 is carry.
1929	addze	r12,r12
1930	addc	r11,r11,r9
1931	#addze	r12,r12
1932	$ST	r11,`1*$BNSZ`(r3)
1933
1934					#mul_add(rp[2],ap[2],w,c1);
1935	$LD	r8,`2*$BNSZ`(r4)
1936	$UMULL	r9,r6,r8
1937	$LD	r11,`2*$BNSZ`(r3)
1938	$UMULH  r10,r6,r8
1939	adde	r9,r9,r12
1940	addze	r10,r10
1941	addc	r9,r9,r11
1942	#addze	r10,r10
1943	$ST	r9,`2*$BNSZ`(r3)
1944
1945					#mul_add(rp[3],ap[3],w,c1);
1946	$LD	r8,`3*$BNSZ`(r4)
1947	$UMULL	r11,r6,r8
1948	$LD	r9,`3*$BNSZ`(r3)
1949	$UMULH  r12,r6,r8
1950	adde	r11,r11,r10
1951	addze	r12,r12
1952	addc	r11,r11,r9
1953	addze	r12,r12
1954	$ST	r11,`3*$BNSZ`(r3)
1955	addi	r3,r3,`4*$BNSZ`
1956	addi	r4,r4,`4*$BNSZ`
1957	bdnz	Lppcasm_maw_mainloop
1958
1959Lppcasm_maw_leftover:
1960	andi.	r5,r5,0x3
1961	beq	Lppcasm_maw_adios
1962	addi	r3,r3,-$BNSZ
1963	addi	r4,r4,-$BNSZ
1964					#mul_add(rp[0],ap[0],w,c1);
1965	mtctr	r5
1966	$LDU	r8,$BNSZ(r4)
1967	$UMULL	r9,r6,r8
1968	$UMULH  r10,r6,r8
1969	$LDU	r11,$BNSZ(r3)
1970	addc	r9,r9,r11
1971	addze	r10,r10
1972	addc	r9,r9,r12
1973	addze	r12,r10
1974	$ST	r9,0(r3)
1975
1976	bdz	Lppcasm_maw_adios
1977					#mul_add(rp[1],ap[1],w,c1);
1978	$LDU	r8,$BNSZ(r4)
1979	$UMULL	r9,r6,r8
1980	$UMULH  r10,r6,r8
1981	$LDU	r11,$BNSZ(r3)
1982	addc	r9,r9,r11
1983	addze	r10,r10
1984	addc	r9,r9,r12
1985	addze	r12,r10
1986	$ST	r9,0(r3)
1987
1988	bdz	Lppcasm_maw_adios
1989					#mul_add(rp[2],ap[2],w,c1);
1990	$LDU	r8,$BNSZ(r4)
1991	$UMULL	r9,r6,r8
1992	$UMULH  r10,r6,r8
1993	$LDU	r11,$BNSZ(r3)
1994	addc	r9,r9,r11
1995	addze	r10,r10
1996	addc	r9,r9,r12
1997	addze	r12,r10
1998	$ST	r9,0(r3)
1999
2000Lppcasm_maw_adios:
2001	addi	r3,r12,0
2002	blr
2003	.long	0
2004	.byte	0,12,0x14,0,0,0,4,0
2005	.long	0
2006.size	.bn_mul_add_words,.-.bn_mul_add_words
2007	.align	4
2008EOF
2009$data =~ s/\`([^\`]*)\`/eval $1/gem;
2010print $data;
2011close STDOUT;
2012