#! /usr/bin/env perl
# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

19# "Teaser" Montgomery multiplication module for PowerPC. It's possible
20# to gain a bit more by modulo-scheduling outer loop, then dedicated
21# squaring procedure should give further 20% and code can be adapted
22# for 32-bit application running on 64-bit CPU. As for the latter.
23# It won't be able to achieve "native" 64-bit performance, because in
24# 32-bit application context every addc instruction will have to be
25# expanded as addc, twice right shift by 32 and finally adde, etc.
26# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
27# for 64-bit application running on PPC970/G5 is:
28#
29# 512-bit	+65%
30# 1024-bit	+35%
31# 2048-bit	+18%
32# 4096-bit	+4%
33
# September 2016
#
# Add a multiplication procedure operating on lengths divisible by 4
# and a squaring procedure operating on lengths divisible by 8. Length
# is expressed in number of limbs. RSA private key operations are
# ~35-50% faster (more for longer keys) on contemporary high-end POWER
# processors in 64-bit builds, [mysteriously enough] more in 32-bit
# builds. On low-end 32-bit processors the performance improvement
# turned out to be marginal...

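# Below is a minimal, hedged Perl reference model of the word-serial
# Montgomery multiplication that the assembly in this file implements.
# "mont_mul_ref" is a hypothetical helper added for illustration only:
# it is never called, it uses 16-bit limbs so that native Perl integers
# stay exact, and it assumes $n0 == -n[0]^-1 mod 2^16.  It returns
# a*b*2^(-16*num) mod n, the quantity bn_mul_mont computes with
# $BITS-bit limbs.
sub mont_mul_ref {
	my ($a, $b, $n, $n0, $num) = @_;	# little-endian limb array refs
	my $mask = 0xffff;
	my @t = (0) x ($num + 1);
	for my $i (0 .. $num - 1) {
		my $c = 0;
		for my $j (0 .. $num - 1) {	# t += a[j]*b[i]
			my $v = $t[$j] + $a->[$j] * $b->[$i] + $c;
			$t[$j] = $v & $mask;
			$c = $v >> 16;
		}
		$t[$num] += $c;
		my $m = ($t[0] * $n0) & $mask;	# multiplier that zeroes t[0]
		$c = 0;
		for my $j (0 .. $num - 1) {	# t += m*n[j]
			my $v = $t[$j] + $m * $n->[$j] + $c;
			$t[$j] = $v & $mask;
			$c = $v >> 16;
		}
		$t[$num] += $c;
		shift @t;			# t[0] is now zero, so this
		push @t, 0;			# is an exact division by 2^16
	}
	my $c = 0;				# normalize the limbs
	for my $j (0 .. $num) {
		my $v = $t[$j] + $c;
		$t[$j] = $v & $mask;
		$c = $v >> 16;
	}
	my ($borrow, @r) = (0);			# final conditional subtraction
	for my $j (0 .. $num - 1) {
		my $v = $t[$j] - $n->[$j] - $borrow;
		$borrow = $v < 0 ? 1 : 0;
		push @r, $v & $mask;
	}
	return $t[$num] - $borrow >= 0 ? @r : @t[0 .. $num - 1];
}
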
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$sp="r1";
$toc="r2";
$rp="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";

{
my $ovf=$rp;
my $rp="r9";	# $rp is reassigned
my $aj="r10";
my $nj="r11";
my $tj="r12";
# non-volatile registers
my $i="r20";
my $j="r21";
my $tp="r22";
my $m0="r23";
my $m1="r24";
my $lo0="r25";
my $hi0="r26";
my $lo1="r27";
my $hi1="r28";
my $alo="r29";
my $ahi="r30";
my $nlo="r31";
#
my $nhi="r0";

$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont_int
.align	5
.bn_mul_mont_int:
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit

.align	4
Lcopy:				; conditional copy
	$LDX	$tj,$tp,$j
	$LDX	$aj,$rp,$j
	and	$tj,$tj,$ovf
	andc	$aj,$aj,$ovf
	$STX	$j,$tp,$j	; zap at once
	or	$aj,$aj,$tj
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int
___
}
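# A hedged Perl sketch of the Lsub/Lcopy tail above: the modulus is
# subtracted unconditionally, the borrow and the topmost overflow bit
# are folded into an all-ones or all-zero mask, and tp[] or the
# difference is then selected with AND/ANDC so that no data-dependent
# branch is taken.  "cond_copy_ref" is illustrative only, never called,
# and uses 16-bit limbs.
sub cond_copy_ref {
	my ($tp, $np, $ovf) = @_;	# limb array refs + overflow bit
	my $mask = 0xffff;
	my ($borrow, @diff) = (0);
	for my $j (0 .. $#$tp) {	# rp = tp - np, tracking the borrow
		my $v = $tp->[$j] - $np->[$j] - $borrow;
		$borrow = $v < 0 ? 1 : 0;
		push @diff, $v & $mask;
	}
	# ovf-borrow goes negative only when the subtraction borrowed
	# without a topmost carry; in that case the original tp[] is kept
	my $keep = $ovf - $borrow < 0 ? $mask : 0;
	return map { ($tp->[$_] & $keep) | ($diff[$_] & ($mask ^ $keep)) }
	       0 .. $#$tp;
}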
if (1) {
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
my  ($carry,$zero) = ($rp,"r0");

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
# +8*size_t	+-------------------------------+
#		| 4 "n0*t0"			|
#		.				.
#		.				.
# +12*size_t	+-------------------------------+
#		| size_t tmp[num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		| topmost carry			|
#		.				.
# -18*size_t	+-------------------------------+
#		| 18 saved gpr, r14-r31		|
#		.				.
#		.				.
#		+-------------------------------+
$code.=<<___;
.globl	.bn_mul4x_mont_int
.align	5
.bn_mul4x_mont_int:
	andi.	r0,$num,7
	bne	.Lmul4x_do
	$UCMP	$ap,$bp
	bne	.Lmul4x_do
	b	.Lsqr8x_do
.Lmul4x_do:
	slwi	$num,$num,`log($SIZE_T)/log(2)`
	mr	$a0,$sp
	li	$a1,-32*$SIZE_T
	sub	$a1,$a1,$num
	$STUX	$sp,$sp,$a1		# alloca

	$PUSH	r14,-$SIZE_T*18($a0)
	$PUSH	r15,-$SIZE_T*17($a0)
	$PUSH	r16,-$SIZE_T*16($a0)
	$PUSH	r17,-$SIZE_T*15($a0)
	$PUSH	r18,-$SIZE_T*14($a0)
	$PUSH	r19,-$SIZE_T*13($a0)
	$PUSH	r20,-$SIZE_T*12($a0)
	$PUSH	r21,-$SIZE_T*11($a0)
	$PUSH	r22,-$SIZE_T*10($a0)
	$PUSH	r23,-$SIZE_T*9($a0)
	$PUSH	r24,-$SIZE_T*8($a0)
	$PUSH	r25,-$SIZE_T*7($a0)
	$PUSH	r26,-$SIZE_T*6($a0)
	$PUSH	r27,-$SIZE_T*5($a0)
	$PUSH	r28,-$SIZE_T*4($a0)
	$PUSH	r29,-$SIZE_T*3($a0)
	$PUSH	r30,-$SIZE_T*2($a0)
	$PUSH	r31,-$SIZE_T*1($a0)

	subi	$ap,$ap,$SIZE_T		# bias by -1
	subi	$np,$np,$SIZE_T		# bias by -1
	subi	$rp,$rp,$SIZE_T		# bias by -1
	$LD	$n0,0($n0)		# *n0

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	subi	$t0,$t0,$SIZE_T*4	# &b[num-4]

	$LD	$bi,$SIZE_T*0($bp)	# b[0]
	li	$acc0,0
	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
	li	$acc1,0
	$LD	$a1,$SIZE_T*2($ap)
	li	$acc2,0
	$LD	$a2,$SIZE_T*3($ap)
	li	$acc3,0
	$LDU	$a3,$SIZE_T*4($ap)
	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)

	$PUSH	$rp,$SIZE_T*6($sp)	# offload rp and &b[num-4]
	$PUSH	$t0,$SIZE_T*7($sp)
	li	$carry,0
	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
	li	$cnt,0
	li	$zero,0
	b	.Loop_mul4x_1st_reduction

.align	5
.Loop_mul4x_1st_reduction:
	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[0])
	addze	$carry,$carry		# modulo-scheduled
	$UMULL	$t1,$a1,$bi
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$bi
	andi.	$cnt,$cnt,$SIZE_T*4-1
	$UMULL	$t3,$a3,$bi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[0])
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	$UMULL	$mi,$acc0,$n0		# t[0]*n0
	adde	$acc3,$acc3,$t3
	$UMULH	$t2,$a2,$bi
	addze	$acc4,$zero
	$UMULH	$t3,$a3,$bi
	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
	addc	$acc1,$acc1,$t0
	# (*)	mul	$t0,$m0,$mi	# lo(n[0..3]*t[0]*n0)
	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
	adde	$acc2,$acc2,$t1
	$UMULL	$t1,$m1,$mi
	adde	$acc3,$acc3,$t2
	$UMULL	$t2,$m2,$mi
	adde	$acc4,$acc4,$t3		# can't overflow
	$UMULL	$t3,$m3,$mi
	# (*)	addc	$acc0,$acc0,$t0
	# (*)	On the removal of the first multiplication and addition
	#	instructions: the outcome of the first addition is
	#	guaranteed to be zero, which leaves two computationally
	#	significant outcomes: it either carries or it doesn't.
	#	So when does it carry? Following the operations, the
	#	condition for carry turns out to be quite simple: $acc0
	#	being non-zero. The carry can therefore be calculated
	#	by adding -1 to $acc0, which is what the next
	#	instruction does.
	addic	$acc0,$acc0,-1		# (*), discarded
	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0)
	adde	$acc0,$acc1,$t1
	$UMULH	$t1,$m1,$mi
	adde	$acc1,$acc2,$t2
	$UMULH	$t2,$m2,$mi
	adde	$acc2,$acc3,$t3
	$UMULH	$t3,$m3,$mi
	adde	$acc3,$acc4,$carry
	addze	$carry,$zero
	addc	$acc0,$acc0,$t0
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	adde	$acc3,$acc3,$t3
	#addze	$carry,$carry
	bne	.Loop_mul4x_1st_reduction

	$UCMP	$ap_end,$ap
	beq	.Lmul4x4_post_condition

	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
	$LD	$a1,$SIZE_T*2($ap)
	$LD	$a2,$SIZE_T*3($ap)
	$LDU	$a3,$SIZE_T*4($ap)
	$LD	$mi,$SIZE_T*8($sp)	# a[0]*n0
	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)
	b	.Loop_mul4x_1st_tail

.align	5
.Loop_mul4x_1st_tail:
	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[i])
	addze	$carry,$carry		# modulo-scheduled
	$UMULL	$t1,$a1,$bi
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$bi
	andi.	$cnt,$cnt,$SIZE_T*4-1
	$UMULL	$t3,$a3,$bi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[i])
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	$UMULH	$t3,$a3,$bi
	addze	$acc4,$zero
	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
	addc	$acc1,$acc1,$t0
	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*a[0]*n0)
	adde	$acc2,$acc2,$t1
	$UMULL	$t1,$m1,$mi
	adde	$acc3,$acc3,$t2
	$UMULL	$t2,$m2,$mi
	adde	$acc4,$acc4,$t3		# can't overflow
	$UMULL	$t3,$m3,$mi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*a[0]*n0)
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$m1,$mi
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$m2,$mi
	adde	$acc3,$acc3,$t3
	adde	$acc4,$acc4,$carry
	$UMULH	$t3,$m3,$mi
	addze	$carry,$zero
	addi	$mi,$sp,$SIZE_T*8
	$LDX	$mi,$mi,$cnt		# next t[0]*n0
	$STU	$acc0,$SIZE_T($tp)	# word of result
	addc	$acc0,$acc1,$t0
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2
	adde	$acc3,$acc4,$t3
	#addze	$carry,$carry
	bne	.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	# rewound $ap
	$UCMP	$ap_end,$ap		# done yet?
	beq	.Lmul4x_proceed

	$LD	$a0,$SIZE_T*1($ap)
	$LD	$a1,$SIZE_T*2($ap)
	$LD	$a2,$SIZE_T*3($ap)
	$LDU	$a3,$SIZE_T*4($ap)
	$LD	$m0,$SIZE_T*1($np)
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	$LDU	$bi,$SIZE_T*4($bp)	# *++b
	addze	$carry,$carry		# topmost carry
	$LD	$a0,$SIZE_T*1($t1)
	$LD	$a1,$SIZE_T*2($t1)
	$LD	$a2,$SIZE_T*3($t1)
	$LD	$a3,$SIZE_T*4($t1)
	addi	$ap,$t1,$SIZE_T*4
	sub	$np,$np,$num		# rewind np

	$ST	$acc0,$SIZE_T*1($tp)	# result
	$ST	$acc1,$SIZE_T*2($tp)
	$ST	$acc2,$SIZE_T*3($tp)
	$ST	$acc3,$SIZE_T*4($tp)
	$ST	$carry,$SIZE_T*5($tp)	# save topmost carry
	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
	$LD	$acc1,$SIZE_T*13($sp)
	$LD	$acc2,$SIZE_T*14($sp)
	$LD	$acc3,$SIZE_T*15($sp)

	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)
	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
	li	$carry,0
	b	.Loop_mul4x_reduction

.align	5
.Loop_mul4x_reduction:
	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[4])
	addze	$carry,$carry		# modulo-scheduled
	$UMULL	$t1,$a1,$bi
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$bi
	andi.	$cnt,$cnt,$SIZE_T*4-1
	$UMULL	$t3,$a3,$bi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[4])
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	$UMULL	$mi,$acc0,$n0		# t[0]*n0
	adde	$acc3,$acc3,$t3
	$UMULH	$t2,$a2,$bi
	addze	$acc4,$zero
	$UMULH	$t3,$a3,$bi
	$LDX	$bi,$bp,$cnt		# next b[i]
	addc	$acc1,$acc1,$t0
	# (*)	mul	$t0,$m0,$mi
	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
	adde	$acc2,$acc2,$t1
	$UMULL	$t1,$m1,$mi		# lo(n[0..3]*t[0]*n0)
	adde	$acc3,$acc3,$t2
	$UMULL	$t2,$m2,$mi
	adde	$acc4,$acc4,$t3		# can't overflow
	$UMULL	$t3,$m3,$mi
	# (*)	addc	$acc0,$acc0,$t0
	addic	$acc0,$acc0,-1		# (*), discarded
	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0)
	adde	$acc0,$acc1,$t1
	$UMULH	$t1,$m1,$mi
	adde	$acc1,$acc2,$t2
	$UMULH	$t2,$m2,$mi
	adde	$acc2,$acc3,$t3
	$UMULH	$t3,$m3,$mi
	adde	$acc3,$acc4,$carry
	addze	$carry,$zero
	addc	$acc0,$acc0,$t0
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	adde	$acc3,$acc3,$t3
	#addze	$carry,$carry
	bne	.Loop_mul4x_reduction

	$LD	$t0,$SIZE_T*5($tp)	# t[4..7]
	addze	$carry,$carry
	$LD	$t1,$SIZE_T*6($tp)
	$LD	$t2,$SIZE_T*7($tp)
	$LD	$t3,$SIZE_T*8($tp)
	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
	$LD	$a1,$SIZE_T*2($ap)
	$LD	$a2,$SIZE_T*3($ap)
	$LDU	$a3,$SIZE_T*4($ap)
	addc	$acc0,$acc0,$t0
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	adde	$acc3,$acc3,$t3
	#addze	$carry,$carry

	$LD	$mi,$SIZE_T*8($sp)	# t[0]*n0
	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)
	b	.Loop_mul4x_tail

.align	5
.Loop_mul4x_tail:
	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[4])
	addze	$carry,$carry		# modulo-scheduled
	$UMULL	$t1,$a1,$bi
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$bi
	andi.	$cnt,$cnt,$SIZE_T*4-1
	$UMULL	$t3,$a3,$bi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[4])
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	$UMULH	$t3,$a3,$bi
	addze	$acc4,$zero
	$LDX	$bi,$bp,$cnt		# next b[i]
	addc	$acc1,$acc1,$t0
	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*t[0]*n0)
	adde	$acc2,$acc2,$t1
	$UMULL	$t1,$m1,$mi
	adde	$acc3,$acc3,$t2
	$UMULL	$t2,$m2,$mi
	adde	$acc4,$acc4,$t3		# can't overflow
	$UMULL	$t3,$m3,$mi
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*t[0]*n0)
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$m1,$mi
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$m2,$mi
	adde	$acc3,$acc3,$t3
	$UMULH	$t3,$m3,$mi
	adde	$acc4,$acc4,$carry
	addi	$mi,$sp,$SIZE_T*8
	$LDX	$mi,$mi,$cnt		# next a[0]*n0
	addze	$carry,$zero
	$STU	$acc0,$SIZE_T($tp)	# word of result
	addc	$acc0,$acc1,$t0
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2
	adde	$acc3,$acc4,$t3
	#addze	$carry,$carry
	bne	.Loop_mul4x_tail

	$LD	$t0,$SIZE_T*5($tp)	# next t[i] or topmost carry
	sub	$t1,$np,$num		# rewound np?
	addze	$carry,$carry
	$UCMP	$ap_end,$ap		# done yet?
	beq	.Loop_mul4x_break

	$LD	$t1,$SIZE_T*6($tp)
	$LD	$t2,$SIZE_T*7($tp)
	$LD	$t3,$SIZE_T*8($tp)
	$LD	$a0,$SIZE_T*1($ap)
	$LD	$a1,$SIZE_T*2($ap)
	$LD	$a2,$SIZE_T*3($ap)
	$LDU	$a3,$SIZE_T*4($ap)
	addc	$acc0,$acc0,$t0
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	adde	$acc3,$acc3,$t3
	#addze	$carry,$carry

	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
	$LD	$m1,$SIZE_T*2($np)
	$LD	$m2,$SIZE_T*3($np)
	$LDU	$m3,$SIZE_T*4($np)
	b	.Loop_mul4x_tail

.align	5
.Loop_mul4x_break:
	$POP	$t2,$SIZE_T*6($sp)	# pull rp and &b[num-4]
	$POP	$t3,$SIZE_T*7($sp)
	addc	$a0,$acc0,$t0		# accumulate topmost carry
	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
	addze	$a1,$acc1
	$LD	$acc1,$SIZE_T*13($sp)
	addze	$a2,$acc2
	$LD	$acc2,$SIZE_T*14($sp)
	addze	$a3,$acc3
	$LD	$acc3,$SIZE_T*15($sp)
	addze	$carry,$carry		# topmost carry
	$ST	$a0,$SIZE_T*1($tp)	# result
	sub	$ap,$ap_end,$num	# rewind ap
	$ST	$a1,$SIZE_T*2($tp)
	$ST	$a2,$SIZE_T*3($tp)
	$ST	$a3,$SIZE_T*4($tp)
	$ST	$carry,$SIZE_T*5($tp)	# store topmost carry

	$LD	$m0,$SIZE_T*1($t1)	# n[0..3]
	$LD	$m1,$SIZE_T*2($t1)
	$LD	$m2,$SIZE_T*3($t1)
	$LD	$m3,$SIZE_T*4($t1)
	addi	$np,$t1,$SIZE_T*4
	$UCMP	$bp,$t3			# done yet?
	beq	.Lmul4x_post

	$LDU	$bi,$SIZE_T*4($bp)
	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
	$LD	$a1,$SIZE_T*2($ap)
	$LD	$a2,$SIZE_T*3($ap)
	$LDU	$a3,$SIZE_T*4($ap)
	li	$carry,0
	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
	b	.Loop_mul4x_reduction

.align	5
.Lmul4x_post:
	# Final step. We check whether the result is larger than the
	# modulus, and if it is, subtract the modulus. But comparison
	# implies subtraction, so we subtract the modulus, check whether
	# the subtraction borrowed, and conditionally copy the original
	# value.
	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
	mr	$bp,$t2			# &rp[-1]
	subi	$cnt,$cnt,1
	mr	$ap_end,$t2		# &rp[-1] copy
	subfc	$t0,$m0,$acc0
	addi	$tp,$sp,$SIZE_T*15
	subfe	$t1,$m1,$acc1

	mtctr	$cnt
.Lmul4x_sub:
	$LD	$m0,$SIZE_T*1($np)
	$LD	$acc0,$SIZE_T*1($tp)
	subfe	$t2,$m2,$acc2
	$LD	$m1,$SIZE_T*2($np)
	$LD	$acc1,$SIZE_T*2($tp)
	subfe	$t3,$m3,$acc3
	$LD	$m2,$SIZE_T*3($np)
	$LD	$acc2,$SIZE_T*3($tp)
	$LDU	$m3,$SIZE_T*4($np)
	$LDU	$acc3,$SIZE_T*4($tp)
	$ST	$t0,$SIZE_T*1($bp)
	$ST	$t1,$SIZE_T*2($bp)
	subfe	$t0,$m0,$acc0
	$ST	$t2,$SIZE_T*3($bp)
	$STU	$t3,$SIZE_T*4($bp)
	subfe	$t1,$m1,$acc1
	bdnz	.Lmul4x_sub

	 $LD	$a0,$SIZE_T*1($ap_end)
	$ST	$t0,$SIZE_T*1($bp)
	 $LD	$t0,$SIZE_T*12($sp)
	subfe	$t2,$m2,$acc2
	 $LD	$a1,$SIZE_T*2($ap_end)
	$ST	$t1,$SIZE_T*2($bp)
	 $LD	$t1,$SIZE_T*13($sp)
	subfe	$t3,$m3,$acc3
	subfe	$carry,$zero,$carry	# did it borrow?
	 addi	$tp,$sp,$SIZE_T*12
	 $LD	$a2,$SIZE_T*3($ap_end)
	$ST	$t2,$SIZE_T*3($bp)
	 $LD	$t2,$SIZE_T*14($sp)
	 $LD	$a3,$SIZE_T*4($ap_end)
	$ST	$t3,$SIZE_T*4($bp)
	 $LD	$t3,$SIZE_T*15($sp)

	mtctr	$cnt
.Lmul4x_cond_copy:
	and	$t0,$t0,$carry
	andc	$a0,$a0,$carry
	$ST	$zero,$SIZE_T*0($tp)	# wipe stack clean
	and	$t1,$t1,$carry
	andc	$a1,$a1,$carry
	$ST	$zero,$SIZE_T*1($tp)
	and	$t2,$t2,$carry
	andc	$a2,$a2,$carry
	$ST	$zero,$SIZE_T*2($tp)
	and	$t3,$t3,$carry
	andc	$a3,$a3,$carry
	$ST	$zero,$SIZE_T*3($tp)
	or	$acc0,$t0,$a0
	$LD	$a0,$SIZE_T*5($ap_end)
	$LD	$t0,$SIZE_T*4($tp)
	or	$acc1,$t1,$a1
	$LD	$a1,$SIZE_T*6($ap_end)
	$LD	$t1,$SIZE_T*5($tp)
	or	$acc2,$t2,$a2
	$LD	$a2,$SIZE_T*7($ap_end)
	$LD	$t2,$SIZE_T*6($tp)
	or	$acc3,$t3,$a3
	$LD	$a3,$SIZE_T*8($ap_end)
	$LD	$t3,$SIZE_T*7($tp)
	addi	$tp,$tp,$SIZE_T*4
	$ST	$acc0,$SIZE_T*1($ap_end)
	$ST	$acc1,$SIZE_T*2($ap_end)
	$ST	$acc2,$SIZE_T*3($ap_end)
	$STU	$acc3,$SIZE_T*4($ap_end)
	bdnz	.Lmul4x_cond_copy

	$POP	$bp,0($sp)		# pull saved sp
	and	$t0,$t0,$carry
	andc	$a0,$a0,$carry
	$ST	$zero,$SIZE_T*0($tp)
	and	$t1,$t1,$carry
	andc	$a1,$a1,$carry
	$ST	$zero,$SIZE_T*1($tp)
	and	$t2,$t2,$carry
	andc	$a2,$a2,$carry
	$ST	$zero,$SIZE_T*2($tp)
	and	$t3,$t3,$carry
	andc	$a3,$a3,$carry
	$ST	$zero,$SIZE_T*3($tp)
	or	$acc0,$t0,$a0
	or	$acc1,$t1,$a1
	$ST	$zero,$SIZE_T*4($tp)
	or	$acc2,$t2,$a2
	or	$acc3,$t3,$a3
	$ST	$acc0,$SIZE_T*1($ap_end)
	$ST	$acc1,$SIZE_T*2($ap_end)
	$ST	$acc2,$SIZE_T*3($ap_end)
	$ST	$acc3,$SIZE_T*4($ap_end)

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	$POP	$ap,$SIZE_T*6($sp)	# pull &rp[-1]
	$POP	$bp,0($sp)		# pull saved sp
	addze	$carry,$carry		# modulo-scheduled
	# $acc0-3,$carry hold result, $m0-3 hold modulus
	subfc	$a0,$m0,$acc0
	subfe	$a1,$m1,$acc1
	subfe	$a2,$m2,$acc2
	subfe	$a3,$m3,$acc3
	subfe	$carry,$zero,$carry	# did it borrow?

	and	$m0,$m0,$carry
	and	$m1,$m1,$carry
	addc	$a0,$a0,$m0
	and	$m2,$m2,$carry
	adde	$a1,$a1,$m1
	and	$m3,$m3,$carry
	adde	$a2,$a2,$m2
	adde	$a3,$a3,$m3

	$ST	$a0,$SIZE_T*1($ap)	# write result
	$ST	$a1,$SIZE_T*2($ap)
	$ST	$a2,$SIZE_T*3($ap)
	$ST	$a3,$SIZE_T*4($ap)

.Lmul4x_done:
	$ST	$zero,$SIZE_T*8($sp)	# wipe stack clean
	$ST	$zero,$SIZE_T*9($sp)
	$ST	$zero,$SIZE_T*10($sp)
	$ST	$zero,$SIZE_T*11($sp)
	li	r3,1			# signal "done"
	$POP	r14,-$SIZE_T*18($bp)
	$POP	r15,-$SIZE_T*17($bp)
	$POP	r16,-$SIZE_T*16($bp)
	$POP	r17,-$SIZE_T*15($bp)
	$POP	r18,-$SIZE_T*14($bp)
	$POP	r19,-$SIZE_T*13($bp)
	$POP	r20,-$SIZE_T*12($bp)
	$POP	r21,-$SIZE_T*11($bp)
	$POP	r22,-$SIZE_T*10($bp)
	$POP	r23,-$SIZE_T*9($bp)
	$POP	r24,-$SIZE_T*8($bp)
	$POP	r25,-$SIZE_T*7($bp)
	$POP	r26,-$SIZE_T*6($bp)
	$POP	r27,-$SIZE_T*5($bp)
	$POP	r28,-$SIZE_T*4($bp)
	$POP	r29,-$SIZE_T*3($bp)
	$POP	r30,-$SIZE_T*2($bp)
	$POP	r31,-$SIZE_T*1($bp)
	mr	$sp,$bp
	blr
	.long	0
	.byte	0,12,4,0x20,0x80,18,6,0
	.long	0
.size	.bn_mul4x_mont_int,.-.bn_mul4x_mont_int
___
}

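# A hedged note on the "(*)" trick in the reduction loops above: the low
# limb of t+m*n is zero by construction, so its addition is dropped and
# only its carry is needed, and that carry is set exactly when the
# vanishing limb was non-zero.  "addic $acc0,$acc0,-1" computes it
# because adding -1 (i.e. 2^BITS-1) carries out iff the addend is
# non-zero.  "vanishing_limb_carry" is a hypothetical, never-called
# 16-bit model of that observation:
sub vanishing_limb_carry {
	my ($acc0) = @_;			# the limb about to be cancelled
	return (($acc0 + 0xffff) >> 16) & 1;	# 1 iff $acc0 != 0
}
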
if (1) {
########################################################################
# The following is a PPC adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
my ($cnt,$carry,$zero)=("r30","r31","r0");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
# +12*size_t	+-------------------------------+
#		| size_t tmp[2*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
# -18*size_t	+-------------------------------+
#		| 18 saved gpr, r14-r31		|
#		.				.
#		.				.
#		+-------------------------------+
$code.=<<___;
.align	5
__bn_sqr8x_mont:
.Lsqr8x_do:
	mr	$a0,$sp
	slwi	$a1,$num,`log($SIZE_T)/log(2)+1`
	li	$a2,-32*$SIZE_T
	sub	$a1,$a2,$a1
	slwi	$num,$num,`log($SIZE_T)/log(2)`
	$STUX	$sp,$sp,$a1		# alloca

	$PUSH	r14,-$SIZE_T*18($a0)
	$PUSH	r15,-$SIZE_T*17($a0)
	$PUSH	r16,-$SIZE_T*16($a0)
	$PUSH	r17,-$SIZE_T*15($a0)
	$PUSH	r18,-$SIZE_T*14($a0)
	$PUSH	r19,-$SIZE_T*13($a0)
	$PUSH	r20,-$SIZE_T*12($a0)
	$PUSH	r21,-$SIZE_T*11($a0)
	$PUSH	r22,-$SIZE_T*10($a0)
	$PUSH	r23,-$SIZE_T*9($a0)
	$PUSH	r24,-$SIZE_T*8($a0)
	$PUSH	r25,-$SIZE_T*7($a0)
	$PUSH	r26,-$SIZE_T*6($a0)
	$PUSH	r27,-$SIZE_T*5($a0)
	$PUSH	r28,-$SIZE_T*4($a0)
	$PUSH	r29,-$SIZE_T*3($a0)
	$PUSH	r30,-$SIZE_T*2($a0)
	$PUSH	r31,-$SIZE_T*1($a0)

	subi	$ap,$ap,$SIZE_T		# bias by -1
	subi	$t0,$np,$SIZE_T		# bias by -1
	subi	$rp,$rp,$SIZE_T		# bias by -1
	$LD	$n0,0($n0)		# *n0
	li	$zero,0

	add	$ap_end,$ap,$num
	$LD	$a0,$SIZE_T*1($ap)
	#li	$acc0,0
	$LD	$a1,$SIZE_T*2($ap)
	li	$acc1,0
	$LD	$a2,$SIZE_T*3($ap)
	li	$acc2,0
	$LD	$a3,$SIZE_T*4($ap)
	li	$acc3,0
	$LD	$a4,$SIZE_T*5($ap)
	li	$acc4,0
	$LD	$a5,$SIZE_T*6($ap)
	li	$acc5,0
	$LD	$a6,$SIZE_T*7($ap)
	li	$acc6,0
	$LDU	$a7,$SIZE_T*8($ap)
	li	$acc7,0

	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
	subic.	$cnt,$num,$SIZE_T*8
	b	.Lsqr8x_zero_start

.align	5
.Lsqr8x_zero:
	subic.	$cnt,$cnt,$SIZE_T*8
	$ST	$zero,$SIZE_T*1($tp)
	$ST	$zero,$SIZE_T*2($tp)
	$ST	$zero,$SIZE_T*3($tp)
	$ST	$zero,$SIZE_T*4($tp)
	$ST	$zero,$SIZE_T*5($tp)
	$ST	$zero,$SIZE_T*6($tp)
	$ST	$zero,$SIZE_T*7($tp)
	$ST	$zero,$SIZE_T*8($tp)
.Lsqr8x_zero_start:
	$ST	$zero,$SIZE_T*9($tp)
	$ST	$zero,$SIZE_T*10($tp)
	$ST	$zero,$SIZE_T*11($tp)
	$ST	$zero,$SIZE_T*12($tp)
	$ST	$zero,$SIZE_T*13($tp)
	$ST	$zero,$SIZE_T*14($tp)
	$ST	$zero,$SIZE_T*15($tp)
	$STU	$zero,$SIZE_T*16($tp)
	bne	.Lsqr8x_zero

	$PUSH	$rp,$SIZE_T*6($sp)	# offload &rp[-1]
	$PUSH	$t0,$SIZE_T*7($sp)	# offload &np[-1]
	$PUSH	$n0,$SIZE_T*8($sp)	# offload n0
	$PUSH	$tp,$SIZE_T*9($sp)	# &tp[2*num-1]
	$PUSH	$zero,$SIZE_T*10($sp)	# initial top-most carry
	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]

	# Multiply everything but a[i]*a[i]
.align	5
.Lsqr8x_outer_loop:
	#						  a[1]a[0]     (i)
	#					      a[2]a[0]
	#					  a[3]a[0]
	#				      a[4]a[0]
	#				  a[5]a[0]
	#			      a[6]a[0]
	#			  a[7]a[0]
	#					  a[2]a[1]	       (ii)
	#				      a[3]a[1]
	#				  a[4]a[1]
	#			      a[5]a[1]
	#			  a[6]a[1]
	#		      a[7]a[1]
	#				  a[3]a[2]		       (iii)
	#			      a[4]a[2]
	#			  a[5]a[2]
	#		      a[6]a[2]
	#		  a[7]a[2]
	#			  a[4]a[3]			       (iv)
	#		      a[5]a[3]
	#		  a[6]a[3]
	#	      a[7]a[3]
	#		  a[5]a[4]				       (v)
	#	      a[6]a[4]
	#	  a[7]a[4]
	#	  a[6]a[5]					       (vi)
	#     a[7]a[5]
	# a[7]a[6]						       (vii)

	$UMULL	$t0,$a1,$a0		# lo(a[1..7]*a[0])		(i)
	$UMULL	$t1,$a2,$a0
	$UMULL	$t2,$a3,$a0
	$UMULL	$t3,$a4,$a0
	addc	$acc1,$acc1,$t0		# t[1]+lo(a[1]*a[0])
	$UMULL	$t0,$a5,$a0
	adde	$acc2,$acc2,$t1
	$UMULL	$t1,$a6,$a0
	adde	$acc3,$acc3,$t2
	$UMULL	$t2,$a7,$a0
	adde	$acc4,$acc4,$t3
	$UMULH	$t3,$a1,$a0		# hi(a[1..7]*a[0])
	adde	$acc5,$acc5,$t0
	$UMULH	$t0,$a2,$a0
	adde	$acc6,$acc6,$t1
	$UMULH	$t1,$a3,$a0
	adde	$acc7,$acc7,$t2
	$UMULH	$t2,$a4,$a0
	$ST	$acc0,$SIZE_T*1($tp)	# t[0]
	addze	$acc0,$zero		# t[8]
	$ST	$acc1,$SIZE_T*2($tp)	# t[1]
	addc	$acc2,$acc2,$t3		# t[2]+hi(a[1]*a[0])
	$UMULH	$t3,$a5,$a0
	adde	$acc3,$acc3,$t0
	$UMULH	$t0,$a6,$a0
	adde	$acc4,$acc4,$t1
	$UMULH	$t1,$a7,$a0
	adde	$acc5,$acc5,$t2
	 $UMULL	$t2,$a2,$a1		# lo(a[2..7]*a[1])		(ii)
	adde	$acc6,$acc6,$t3
	 $UMULL	$t3,$a3,$a1
	adde	$acc7,$acc7,$t0
	 $UMULL	$t0,$a4,$a1
	adde	$acc0,$acc0,$t1

	$UMULL	$t1,$a5,$a1
	addc	$acc3,$acc3,$t2
	$UMULL	$t2,$a6,$a1
	adde	$acc4,$acc4,$t3
	$UMULL	$t3,$a7,$a1
	adde	$acc5,$acc5,$t0
	$UMULH	$t0,$a2,$a1		# hi(a[2..7]*a[1])
	adde	$acc6,$acc6,$t1
	$UMULH	$t1,$a3,$a1
	adde	$acc7,$acc7,$t2
	$UMULH	$t2,$a4,$a1
	adde	$acc0,$acc0,$t3
	$UMULH	$t3,$a5,$a1
	$ST	$acc2,$SIZE_T*3($tp)	# t[2]
	addze	$acc1,$zero		# t[9]
	$ST	$acc3,$SIZE_T*4($tp)	# t[3]
	addc	$acc4,$acc4,$t0
	$UMULH	$t0,$a6,$a1
	adde	$acc5,$acc5,$t1
	$UMULH	$t1,$a7,$a1
	adde	$acc6,$acc6,$t2
	 $UMULL	$t2,$a3,$a2		# lo(a[3..7]*a[2])		(iii)
	adde	$acc7,$acc7,$t3
	 $UMULL	$t3,$a4,$a2
	adde	$acc0,$acc0,$t0
	 $UMULL	$t0,$a5,$a2
	adde	$acc1,$acc1,$t1

	$UMULL	$t1,$a6,$a2
	addc	$acc5,$acc5,$t2
	$UMULL	$t2,$a7,$a2
	adde	$acc6,$acc6,$t3
	$UMULH	$t3,$a3,$a2		# hi(a[3..7]*a[2])
	adde	$acc7,$acc7,$t0
	$UMULH	$t0,$a4,$a2
	adde	$acc0,$acc0,$t1
	$UMULH	$t1,$a5,$a2
	adde	$acc1,$acc1,$t2
	$UMULH	$t2,$a6,$a2
	$ST	$acc4,$SIZE_T*5($tp)	# t[4]
	addze	$acc2,$zero		# t[10]
	$ST	$acc5,$SIZE_T*6($tp)	# t[5]
	addc	$acc6,$acc6,$t3
	$UMULH	$t3,$a7,$a2
	adde	$acc7,$acc7,$t0
	 $UMULL	$t0,$a4,$a3		# lo(a[4..7]*a[3])		(iv)
	adde	$acc0,$acc0,$t1
	 $UMULL	$t1,$a5,$a3
	adde	$acc1,$acc1,$t2
	 $UMULL	$t2,$a6,$a3
	adde	$acc2,$acc2,$t3

	$UMULL	$t3,$a7,$a3
	addc	$acc7,$acc7,$t0
	$UMULH	$t0,$a4,$a3		# hi(a[4..7]*a[3])
	adde	$acc0,$acc0,$t1
	$UMULH	$t1,$a5,$a3
	adde	$acc1,$acc1,$t2
	$UMULH	$t2,$a6,$a3
	adde	$acc2,$acc2,$t3
	$UMULH	$t3,$a7,$a3
	$ST	$acc6,$SIZE_T*7($tp)	# t[6]
	addze	$acc3,$zero		# t[11]
	$STU	$acc7,$SIZE_T*8($tp)	# t[7]
	addc	$acc0,$acc0,$t0
	 $UMULL	$t0,$a5,$a4		# lo(a[5..7]*a[4])		(v)
	adde	$acc1,$acc1,$t1
	 $UMULL	$t1,$a6,$a4
	adde	$acc2,$acc2,$t2
	 $UMULL	$t2,$a7,$a4
	adde	$acc3,$acc3,$t3

	$UMULH	$t3,$a5,$a4		# hi(a[5..7]*a[4])
	addc	$acc1,$acc1,$t0
	$UMULH	$t0,$a6,$a4
	adde	$acc2,$acc2,$t1
	$UMULH	$t1,$a7,$a4
	adde	$acc3,$acc3,$t2
	 $UMULL	$t2,$a6,$a5		# lo(a[6..7]*a[5])		(vi)
	addze	$acc4,$zero		# t[12]
	addc	$acc2,$acc2,$t3
	 $UMULL	$t3,$a7,$a5
	adde	$acc3,$acc3,$t0
	 $UMULH	$t0,$a6,$a5		# hi(a[6..7]*a[5])
	adde	$acc4,$acc4,$t1

	$UMULH	$t1,$a7,$a5
	addc	$acc3,$acc3,$t2
	 $UMULL	$t2,$a7,$a6		# lo(a[7]*a[6])			(vii)
	adde	$acc4,$acc4,$t3
	 $UMULH	$t3,$a7,$a6		# hi(a[7]*a[6])
	addze	$acc5,$zero		# t[13]
	addc	$acc4,$acc4,$t0
	$UCMP	$ap_end,$ap		# done yet?
	adde	$acc5,$acc5,$t1

	addc	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	# rewound ap
	addze	$acc6,$zero		# t[14]
	add	$acc6,$acc6,$t3

	beq	.Lsqr8x_outer_break

	mr	$n0,$a0
	$LD	$a0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($tp)
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($ap)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($ap)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($ap)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($ap)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($ap)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($ap)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($ap)
	subi	$rp,$ap,$SIZE_T*7
	addze	$acc7,$a7
	$LDU	$a7,$SIZE_T*8($ap)
	#addze	$carry,$zero		# moved below
	li	$cnt,0
	b	.Lsqr8x_mul

	#                                                          a[8]a[0]
	#                                                      a[9]a[0]
	#                                                  a[a]a[0]
	#                                              a[b]a[0]
	#                                          a[c]a[0]
	#                                      a[d]a[0]
	#                                  a[e]a[0]
	#                              a[f]a[0]
	#                                                      a[8]a[1]
	#                          a[f]a[1]........................
	#                                                  a[8]a[2]
	#                      a[f]a[2]........................
	#                                              a[8]a[3]
	#                  a[f]a[3]........................
	#                                          a[8]a[4]
	#              a[f]a[4]........................
	#                                      a[8]a[5]
	#          a[f]a[5]........................
	#                                  a[8]a[6]
	#      a[f]a[6]........................
	#                              a[8]a[7]
	#  a[f]a[7]........................
.align	5
.Lsqr8x_mul:
	$UMULL	$t0,$a0,$n0
	addze	$carry,$zero		# carry bit, modulo-scheduled
	$UMULL	$t1,$a1,$n0
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$n0
	andi.	$cnt,$cnt,$SIZE_T*8-1
	$UMULL	$t3,$a3,$n0
	addc	$acc0,$acc0,$t0
	$UMULL	$t0,$a4,$n0
	adde	$acc1,$acc1,$t1
	$UMULL	$t1,$a5,$n0
	adde	$acc2,$acc2,$t2
	$UMULL	$t2,$a6,$n0
	adde	$acc3,$acc3,$t3
	$UMULL	$t3,$a7,$n0
	adde	$acc4,$acc4,$t0
	$UMULH	$t0,$a0,$n0
	adde	$acc5,$acc5,$t1
	$UMULH	$t1,$a1,$n0
	adde	$acc6,$acc6,$t2
	$UMULH	$t2,$a2,$n0
	adde	$acc7,$acc7,$t3
	$UMULH	$t3,$a3,$n0
	addze	$carry,$carry
	$STU	$acc0,$SIZE_T($tp)
	addc	$acc0,$acc1,$t0
	$UMULH	$t0,$a4,$n0
	adde	$acc1,$acc2,$t1
	$UMULH	$t1,$a5,$n0
	adde	$acc2,$acc3,$t2
	$UMULH	$t2,$a6,$n0
	adde	$acc3,$acc4,$t3
	$UMULH	$t3,$a7,$n0
	$LDX	$n0,$rp,$cnt
	adde	$acc4,$acc5,$t0
	adde	$acc5,$acc6,$t1
	adde	$acc6,$acc7,$t2
	adde	$acc7,$carry,$t3
	#addze	$carry,$zero		# moved above
	bne	.Lsqr8x_mul
					# note that carry flag is guaranteed
					# to be zero at this point
	$UCMP	$ap,$ap_end		# done yet?
	beq	.Lsqr8x_break

	$LD	$a0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($tp)
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($ap)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($ap)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($ap)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($ap)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($ap)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($ap)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($ap)
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*8($ap)
	#addze	$carry,$zero		# moved above
	b	.Lsqr8x_mul

.align	5
.Lsqr8x_break:
	$LD	$a0,$SIZE_T*8($rp)
	addi	$ap,$rp,$SIZE_T*15
	$LD	$a1,$SIZE_T*9($rp)
	sub.	$t0,$ap_end,$ap		# is it last iteration?
	$LD	$a2,$SIZE_T*10($rp)
	sub	$t1,$tp,$t0
	$LD	$a3,$SIZE_T*11($rp)
	$LD	$a4,$SIZE_T*12($rp)
	$LD	$a5,$SIZE_T*13($rp)
	$LD	$a6,$SIZE_T*14($rp)
	$LD	$a7,$SIZE_T*15($rp)
	beq	.Lsqr8x_outer_loop

	$ST	$acc0,$SIZE_T*1($tp)
	$LD	$acc0,$SIZE_T*1($t1)
	$ST	$acc1,$SIZE_T*2($tp)
	$LD	$acc1,$SIZE_T*2($t1)
	$ST	$acc2,$SIZE_T*3($tp)
	$LD	$acc2,$SIZE_T*3($t1)
	$ST	$acc3,$SIZE_T*4($tp)
	$LD	$acc3,$SIZE_T*4($t1)
	$ST	$acc4,$SIZE_T*5($tp)
	$LD	$acc4,$SIZE_T*5($t1)
	$ST	$acc5,$SIZE_T*6($tp)
	$LD	$acc5,$SIZE_T*6($t1)
	$ST	$acc6,$SIZE_T*7($tp)
	$LD	$acc6,$SIZE_T*7($t1)
	$ST	$acc7,$SIZE_T*8($tp)
	$LD	$acc7,$SIZE_T*8($t1)
	mr	$tp,$t1
	b	.Lsqr8x_outer_loop

.align	5
.Lsqr8x_outer_break:
	####################################################################
	# Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	$LD	$a1,$SIZE_T*1($t0)	# recall that $t0 is &a[-1]
	$LD	$a3,$SIZE_T*2($t0)
	$LD	$a5,$SIZE_T*3($t0)
	$LD	$a7,$SIZE_T*4($t0)
	addi	$ap,$t0,$SIZE_T*4
					# "tp[x]" comments are for num==8 case
	$LD	$t1,$SIZE_T*13($sp)	# =tp[1], t[0] is not interesting
	$LD	$t2,$SIZE_T*14($sp)
	$LD	$t3,$SIZE_T*15($sp)
	$LD	$t0,$SIZE_T*16($sp)

	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
	$ST	$acc1,$SIZE_T*2($tp)
	subi	$cnt,$cnt,1
	$ST	$acc2,$SIZE_T*3($tp)
	$ST	$acc3,$SIZE_T*4($tp)
	$ST	$acc4,$SIZE_T*5($tp)
	$ST	$acc5,$SIZE_T*6($tp)
	$ST	$acc6,$SIZE_T*7($tp)
	#$ST	$acc7,$SIZE_T*8($tp)	# tp[15] is not interesting
	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
	$UMULL	$acc0,$a1,$a1
	$UMULH	$a1,$a1,$a1
	add	$acc1,$t1,$t1		# <<1
	$SHRI	$t1,$t1,$BITS-1
	$UMULL	$a2,$a3,$a3
	$UMULH	$a3,$a3,$a3
	addc	$acc1,$acc1,$a1
	add	$acc2,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	add	$acc3,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc2,$acc2,$t1

	mtctr	$cnt
.Lsqr4x_shift_n_add:
	$UMULL	$a4,$a5,$a5
	$UMULH	$a5,$a5,$a5
	$LD	$t1,$SIZE_T*6($tp)	# =tp[5]
	$LD	$a1,$SIZE_T*1($ap)
	adde	$acc2,$acc2,$a2
	add	$acc4,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc3,$acc3,$t2
	$LD	$t2,$SIZE_T*7($tp)	# =tp[6]
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*2($ap)
	add	$acc5,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc4,$acc4,$t3
	$LD	$t3,$SIZE_T*8($tp)	# =tp[7]
	$UMULL	$a6,$a7,$a7
	$UMULH	$a7,$a7,$a7
	adde	$acc4,$acc4,$a4
	add	$acc6,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc5,$acc5,$t0
	$LD	$t0,$SIZE_T*9($tp)	# =tp[8]
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*3($ap)
	add	$acc7,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc6,$acc6,$t1
	$LD	$t1,$SIZE_T*10($tp)	# =tp[9]
	$UMULL	$a0,$a1,$a1
	$UMULH	$a1,$a1,$a1
	adde	$acc6,$acc6,$a6
	$ST	$acc0,$SIZE_T*1($tp)	# tp[0]=
	add	$acc0,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc7,$acc7,$t2
	$LD	$t2,$SIZE_T*11($tp)	# =tp[10]
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*4($ap)
	$ST	$acc1,$SIZE_T*2($tp)	# tp[1]=
	add	$acc1,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc0,$acc0,$t3
	$LD	$t3,$SIZE_T*12($tp)	# =tp[11]
	$UMULL	$a2,$a3,$a3
	$UMULH	$a3,$a3,$a3
	adde	$acc0,$acc0,$a0
	$ST	$acc2,$SIZE_T*3($tp)	# tp[2]=
	add	$acc2,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc1,$acc1,$t0
	$LD	$t0,$SIZE_T*13($tp)	# =tp[12]
	adde	$acc1,$acc1,$a1
	$ST	$acc3,$SIZE_T*4($tp)	# tp[3]=
	$ST	$acc4,$SIZE_T*5($tp)	# tp[4]=
	$ST	$acc5,$SIZE_T*6($tp)	# tp[5]=
	$ST	$acc6,$SIZE_T*7($tp)	# tp[6]=
	$STU	$acc7,$SIZE_T*8($tp)	# tp[7]=
	add	$acc3,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc2,$acc2,$t1
	bdnz	.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	 $POP	$np,$SIZE_T*7($sp)	# pull &np[-1] and n0
	 $POP	$n0,$SIZE_T*8($sp)

	$UMULL	$a4,$a5,$a5
	$UMULH	$a5,$a5,$a5
	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
	 $LD	$acc0,$SIZE_T*12($sp)	# =tp[0]
	$LD	$t1,$SIZE_T*6($tp)	# =tp[13]
	adde	$acc2,$acc2,$a2
	add	$acc4,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc3,$acc3,$t2
	$LD	$t2,$SIZE_T*7($tp)	# =tp[14]
	adde	$acc3,$acc3,$a3
	add	$acc5,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc4,$acc4,$t3
	$UMULL	$a6,$a7,$a7
	$UMULH	$a7,$a7,$a7
	adde	$acc4,$acc4,$a4
	add	$acc6,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc5,$acc5,$t0
	$ST	$acc1,$SIZE_T*2($tp)	# tp[9]=
	 $LD	$acc1,$SIZE_T*13($sp)	# =tp[1]
	adde	$acc5,$acc5,$a5
	or	$acc6,$acc6,$t1
	 $LD	$a0,$SIZE_T*1($np)
	 $LD	$a1,$SIZE_T*2($np)
	adde	$acc6,$acc6,$a6
	 $LD	$a2,$SIZE_T*3($np)
	 $LD	$a3,$SIZE_T*4($np)
	adde	$acc7,$a7,$t2
	 $LD	$a4,$SIZE_T*5($np)
	 $LD	$a5,$SIZE_T*6($np)

	################################################################
	# Reduce by 8 limbs per iteration
	$UMULL	$na0,$n0,$acc0		# t[0]*n0
	li	$cnt,8
	$LD	$a6,$SIZE_T*7($np)
	add	$np_end,$np,$num
	$LDU	$a7,$SIZE_T*8($np)
	$ST	$acc2,$SIZE_T*3($tp)	# tp[10]=
	$LD	$acc2,$SIZE_T*14($sp)
	$ST	$acc3,$SIZE_T*4($tp)	# tp[11]=
	$LD	$acc3,$SIZE_T*15($sp)
	$ST	$acc4,$SIZE_T*5($tp)	# tp[12]=
	$LD	$acc4,$SIZE_T*16($sp)
	$ST	$acc5,$SIZE_T*6($tp)	# tp[13]=
	$LD	$acc5,$SIZE_T*17($sp)
	$ST	$acc6,$SIZE_T*7($tp)	# tp[14]=
	$LD	$acc6,$SIZE_T*18($sp)
	$ST	$acc7,$SIZE_T*8($tp)	# tp[15]=
	$LD	$acc7,$SIZE_T*19($sp)
	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
	mtctr	$cnt
	b	.Lsqr8x_reduction

.align	5
.Lsqr8x_reduction:
	# (*)	$UMULL	$t0,$a0,$na0	# lo(n[0-7])*lo(t[0]*n0)
	$UMULL	$t1,$a1,$na0
	$UMULL	$t2,$a2,$na0
	$STU	$na0,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
	$UMULL	$t3,$a3,$na0
	# (*)	addc	$acc0,$acc0,$t0
	addic	$acc0,$acc0,-1		# (*)
	$UMULL	$t0,$a4,$na0
	adde	$acc0,$acc1,$t1
	$UMULL	$t1,$a5,$na0
	adde	$acc1,$acc2,$t2
	$UMULL	$t2,$a6,$na0
	adde	$acc2,$acc3,$t3
	$UMULL	$t3,$a7,$na0
	adde	$acc3,$acc4,$t0
	$UMULH	$t0,$a0,$na0		# hi(n[0-7])*lo(t[0]*n0)
	adde	$acc4,$acc5,$t1
	$UMULH	$t1,$a1,$na0
	adde	$acc5,$acc6,$t2
	$UMULH	$t2,$a2,$na0
	adde	$acc6,$acc7,$t3
	$UMULH	$t3,$a3,$na0
	addze	$acc7,$zero
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a4,$na0
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a5,$na0
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$a6,$na0
	adde	$acc3,$acc3,$t3
	$UMULH	$t3,$a7,$na0
	$UMULL	$na0,$n0,$acc0		# next t[0]*n0
	adde	$acc4,$acc4,$t0
	adde	$acc5,$acc5,$t1
	adde	$acc6,$acc6,$t2
	adde	$acc7,$acc7,$t3
	bdnz	.Lsqr8x_reduction

	$LD	$t0,$SIZE_T*1($tp)
	$LD	$t1,$SIZE_T*2($tp)
	$LD	$t2,$SIZE_T*3($tp)
	$LD	$t3,$SIZE_T*4($tp)
	subi	$rp,$tp,$SIZE_T*7
	$UCMP	$np_end,$np		# done yet?
	addc	$acc0,$acc0,$t0
	$LD	$t0,$SIZE_T*5($tp)
	adde	$acc1,$acc1,$t1
	$LD	$t1,$SIZE_T*6($tp)
	adde	$acc2,$acc2,$t2
	$LD	$t2,$SIZE_T*7($tp)
	adde	$acc3,$acc3,$t3
	$LD	$t3,$SIZE_T*8($tp)
	adde	$acc4,$acc4,$t0
	adde	$acc5,$acc5,$t1
	adde	$acc6,$acc6,$t2
	adde	$acc7,$acc7,$t3
	#addze	$carry,$zero		# moved below
	beq	.Lsqr8x8_post_condition

	$LD	$n0,$SIZE_T*0($rp)
	$LD	$a0,$SIZE_T*1($np)
	$LD	$a1,$SIZE_T*2($np)
	$LD	$a2,$SIZE_T*3($np)
	$LD	$a3,$SIZE_T*4($np)
	$LD	$a4,$SIZE_T*5($np)
	$LD	$a5,$SIZE_T*6($np)
	$LD	$a6,$SIZE_T*7($np)
	$LDU	$a7,$SIZE_T*8($np)
	li	$cnt,0

.align	5
.Lsqr8x_tail:
	$UMULL	$t0,$a0,$n0
	addze	$carry,$zero		# carry bit, modulo-scheduled
	$UMULL	$t1,$a1,$n0
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$n0
	andi.	$cnt,$cnt,$SIZE_T*8-1
	$UMULL	$t3,$a3,$n0
	addc	$acc0,$acc0,$t0
	$UMULL	$t0,$a4,$n0
	adde	$acc1,$acc1,$t1
	$UMULL	$t1,$a5,$n0
	adde	$acc2,$acc2,$t2
	$UMULL	$t2,$a6,$n0
	adde	$acc3,$acc3,$t3
	$UMULL	$t3,$a7,$n0
	adde	$acc4,$acc4,$t0
	$UMULH	$t0,$a0,$n0
	adde	$acc5,$acc5,$t1
	$UMULH	$t1,$a1,$n0
	adde	$acc6,$acc6,$t2
	$UMULH	$t2,$a2,$n0
	adde	$acc7,$acc7,$t3
	$UMULH	$t3,$a3,$n0
	addze	$carry,$carry
	$STU	$acc0,$SIZE_T($tp)
	addc	$acc0,$acc1,$t0
	$UMULH	$t0,$a4,$n0
	adde	$acc1,$acc2,$t1
	$UMULH	$t1,$a5,$n0
	adde	$acc2,$acc3,$t2
	$UMULH	$t2,$a6,$n0
	adde	$acc3,$acc4,$t3
	$UMULH	$t3,$a7,$n0
	$LDX	$n0,$rp,$cnt
	adde	$acc4,$acc5,$t0
	adde	$acc5,$acc6,$t1
	adde	$acc6,$acc7,$t2
	adde	$acc7,$carry,$t3
	#addze	$carry,$zero		# moved above
	bne	.Lsqr8x_tail
					# note that carry flag is guaranteed
					# to be zero at this point
	$LD	$a0,$SIZE_T*1($tp)
	$POP	$carry,$SIZE_T*10($sp)	# pull top-most carry in case we break
	$UCMP	$np_end,$np		# done yet?
	$LD	$a1,$SIZE_T*2($tp)
	sub	$t2,$np_end,$num	# rewound np
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	beq	.Lsqr8x_tail_break

	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($np)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($np)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($np)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($np)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($np)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($np)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($np)
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*8($np)
	#addze	$carry,$zero		# moved above
	b	.Lsqr8x_tail

.align	5
.Lsqr8x_tail_break:
	$POP	$n0,$SIZE_T*8($sp)	# pull n0
	$POP	$t3,$SIZE_T*9($sp)	# &tp[2*num-1]
	addi	$cnt,$tp,$SIZE_T*8	# end of current t[num] window

	addic	$carry,$carry,-1	# "move" top-most carry to carry bit
	adde	$t0,$acc0,$a0
	$LD	$acc0,$SIZE_T*8($rp)
	$LD	$a0,$SIZE_T*1($t2)	# recall that $t2 is &n[-1]
	adde	$t1,$acc1,$a1
	$LD	$acc1,$SIZE_T*9($rp)
	$LD	$a1,$SIZE_T*2($t2)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($t2)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($t2)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($t2)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($t2)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($t2)
	adde	$acc7,$acc7,$a7
	$LD	$a7,$SIZE_T*8($t2)
	addi	$np,$t2,$SIZE_T*8
	addze	$t2,$zero		# top-most carry
	$UMULL	$na0,$n0,$acc0
	$ST	$t0,$SIZE_T*1($tp)
	$UCMP	$cnt,$t3		# did we hit the bottom?
	$ST	$t1,$SIZE_T*2($tp)
	li	$cnt,8
	$ST	$acc2,$SIZE_T*3($tp)
	$LD	$acc2,$SIZE_T*10($rp)
	$ST	$acc3,$SIZE_T*4($tp)
	$LD	$acc3,$SIZE_T*11($rp)
	$ST	$acc4,$SIZE_T*5($tp)
	$LD	$acc4,$SIZE_T*12($rp)
	$ST	$acc5,$SIZE_T*6($tp)
	$LD	$acc5,$SIZE_T*13($rp)
	$ST	$acc6,$SIZE_T*7($tp)
	$LD	$acc6,$SIZE_T*14($rp)
	$ST	$acc7,$SIZE_T*8($tp)
	$LD	$acc7,$SIZE_T*15($rp)
	$PUSH	$t2,$SIZE_T*10($sp)	# off-load top-most carry
	addi	$tp,$rp,$SIZE_T*7	# slide the window
	mtctr	$cnt
	bne	.Lsqr8x_reduction

	################################################################
	# Final step. We check whether the result is larger than the
	# modulus, and if it is, subtract the modulus. But comparison
	# implies subtraction, so we subtract the modulus, check whether
	# the subtraction borrowed, and conditionally copy the original
	# value.
	$POP	$rp,$SIZE_T*6($sp)	# pull &rp[-1]
	srwi	$cnt,$num,`log($SIZE_T)/log(2)+3`
	mr	$n0,$tp			# put tp aside
	addi	$tp,$tp,$SIZE_T*8
	subi	$cnt,$cnt,1
	subfc	$t0,$a0,$acc0
	subfe	$t1,$a1,$acc1
	mr	$carry,$t2
	mr	$ap_end,$rp		# $rp copy

	mtctr	$cnt
	b	.Lsqr8x_sub

.align	5
.Lsqr8x_sub:
	$LD	$a0,$SIZE_T*1($np)
	$LD	$acc0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($np)
	$LD	$acc1,$SIZE_T*2($tp)
	subfe	$t2,$a2,$acc2
	$LD	$a2,$SIZE_T*3($np)
	$LD	$acc2,$SIZE_T*3($tp)
	subfe	$t3,$a3,$acc3
	$LD	$a3,$SIZE_T*4($np)
	$LD	$acc3,$SIZE_T*4($tp)
	$ST	$t0,$SIZE_T*1($rp)
	subfe	$t0,$a4,$acc4
	$LD	$a4,$SIZE_T*5($np)
	$LD	$acc4,$SIZE_T*5($tp)
	$ST	$t1,$SIZE_T*2($rp)
	subfe	$t1,$a5,$acc5
	$LD	$a5,$SIZE_T*6($np)
	$LD	$acc5,$SIZE_T*6($tp)
	$ST	$t2,$SIZE_T*3($rp)
	subfe	$t2,$a6,$acc6
	$LD	$a6,$SIZE_T*7($np)
	$LD	$acc6,$SIZE_T*7($tp)
	$ST	$t3,$SIZE_T*4($rp)
	subfe	$t3,$a7,$acc7
	$LDU	$a7,$SIZE_T*8($np)
	$LDU	$acc7,$SIZE_T*8($tp)
	$ST	$t0,$SIZE_T*5($rp)
	subfe	$t0,$a0,$acc0
	$ST	$t1,$SIZE_T*6($rp)
	subfe	$t1,$a1,$acc1
	$ST	$t2,$SIZE_T*7($rp)
	$STU	$t3,$SIZE_T*8($rp)
	bdnz	.Lsqr8x_sub

	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
	 $LD	$a0,$SIZE_T*1($ap_end)	# original $rp
	 $LD	$acc0,$SIZE_T*1($n0)	# original $tp
	subi	$cnt,$cnt,1
	 $LD	$a1,$SIZE_T*2($ap_end)
	 $LD	$acc1,$SIZE_T*2($n0)
	subfe	$t2,$a2,$acc2
	 $LD	$a2,$SIZE_T*3($ap_end)
	 $LD	$acc2,$SIZE_T*3($n0)
	subfe	$t3,$a3,$acc3
	 $LD	$a3,$SIZE_T*4($ap_end)
	 $LDU	$acc3,$SIZE_T*4($n0)
	$ST	$t0,$SIZE_T*1($rp)
	subfe	$t0,$a4,$acc4
	$ST	$t1,$SIZE_T*2($rp)
	subfe	$t1,$a5,$acc5
	$ST	$t2,$SIZE_T*3($rp)
	subfe	$t2,$a6,$acc6
	$ST	$t3,$SIZE_T*4($rp)
	subfe	$t3,$a7,$acc7
	$ST	$t0,$SIZE_T*5($rp)
	subfe	$carry,$zero,$carry	# did it borrow?
	$ST	$t1,$SIZE_T*6($rp)
	$ST	$t2,$SIZE_T*7($rp)
	$ST	$t3,$SIZE_T*8($rp)

	addi	$tp,$sp,$SIZE_T*11
	mtctr	$cnt

.Lsqr4x_cond_copy:
	andc	$a0,$a0,$carry
	 $ST	$zero,-$SIZE_T*3($n0)	# wipe stack clean
	and	$acc0,$acc0,$carry
	 $ST	$zero,-$SIZE_T*2($n0)
	andc	$a1,$a1,$carry
	 $ST	$zero,-$SIZE_T*1($n0)
	and	$acc1,$acc1,$carry
	 $ST	$zero,-$SIZE_T*0($n0)
	andc	$a2,$a2,$carry
	 $ST	$zero,$SIZE_T*1($tp)
	and	$acc2,$acc2,$carry
	 $ST	$zero,$SIZE_T*2($tp)
	andc	$a3,$a3,$carry
	 $ST	$zero,$SIZE_T*3($tp)
	and	$acc3,$acc3,$carry
	 $STU	$zero,$SIZE_T*4($tp)
	or	$t0,$a0,$acc0
	$LD	$a0,$SIZE_T*5($ap_end)
	$LD	$acc0,$SIZE_T*1($n0)
	or	$t1,$a1,$acc1
	$LD	$a1,$SIZE_T*6($ap_end)
	$LD	$acc1,$SIZE_T*2($n0)
	or	$t2,$a2,$acc2
	$LD	$a2,$SIZE_T*7($ap_end)
	$LD	$acc2,$SIZE_T*3($n0)
	or	$t3,$a3,$acc3
	$LD	$a3,$SIZE_T*8($ap_end)
	$LDU	$acc3,$SIZE_T*4($n0)
	$ST	$t0,$SIZE_T*1($ap_end)
	$ST	$t1,$SIZE_T*2($ap_end)
	$ST	$t2,$SIZE_T*3($ap_end)
	$STU	$t3,$SIZE_T*4($ap_end)
	bdnz	.Lsqr4x_cond_copy

	$POP	$ap,0($sp)		# pull saved sp
	andc	$a0,$a0,$carry
	and	$acc0,$acc0,$carry
	andc	$a1,$a1,$carry
	and	$acc1,$acc1,$carry
	andc	$a2,$a2,$carry
	and	$acc2,$acc2,$carry
	andc	$a3,$a3,$carry
	and	$acc3,$acc3,$carry
	or	$t0,$a0,$acc0
	or	$t1,$a1,$acc1
	or	$t2,$a2,$acc2
	or	$t3,$a3,$acc3
	$ST	$t0,$SIZE_T*1($ap_end)
	$ST	$t1,$SIZE_T*2($ap_end)
	$ST	$t2,$SIZE_T*3($ap_end)
	$ST	$t3,$SIZE_T*4($ap_end)

	b	.Lsqr8x_done

.align	5
.Lsqr8x8_post_condition:
	$POP	$rp,$SIZE_T*6($sp)	# pull rp
	$POP	$ap,0($sp)		# pull saved sp
	addze	$carry,$zero

	# $acc0-7,$carry hold result, $a0-7 hold modulus
	subfc	$acc0,$a0,$acc0
	subfe	$acc1,$a1,$acc1
	 $ST	$zero,$SIZE_T*12($sp)	# wipe stack clean
	 $ST	$zero,$SIZE_T*13($sp)
	subfe	$acc2,$a2,$acc2
	 $ST	$zero,$SIZE_T*14($sp)
	 $ST	$zero,$SIZE_T*15($sp)
	subfe	$acc3,$a3,$acc3
	 $ST	$zero,$SIZE_T*16($sp)
	 $ST	$zero,$SIZE_T*17($sp)
	subfe	$acc4,$a4,$acc4
	 $ST	$zero,$SIZE_T*18($sp)
	 $ST	$zero,$SIZE_T*19($sp)
	subfe	$acc5,$a5,$acc5
	 $ST	$zero,$SIZE_T*20($sp)
	 $ST	$zero,$SIZE_T*21($sp)
	subfe	$acc6,$a6,$acc6
	 $ST	$zero,$SIZE_T*22($sp)
	 $ST	$zero,$SIZE_T*23($sp)
	subfe	$acc7,$a7,$acc7
	 $ST	$zero,$SIZE_T*24($sp)
	 $ST	$zero,$SIZE_T*25($sp)
	subfe	$carry,$zero,$carry	# did it borrow?
	 $ST	$zero,$SIZE_T*26($sp)
	 $ST	$zero,$SIZE_T*27($sp)

	and	$a0,$a0,$carry
	and	$a1,$a1,$carry
	addc	$acc0,$acc0,$a0		# add modulus back if borrowed
	and	$a2,$a2,$carry
	adde	$acc1,$acc1,$a1
	and	$a3,$a3,$carry
	adde	$acc2,$acc2,$a2
	and	$a4,$a4,$carry
	adde	$acc3,$acc3,$a3
	and	$a5,$a5,$carry
	adde	$acc4,$acc4,$a4
	and	$a6,$a6,$carry
	adde	$acc5,$acc5,$a5
	and	$a7,$a7,$carry
	adde	$acc6,$acc6,$a6
	adde	$acc7,$acc7,$a7
	$ST	$acc0,$SIZE_T*1($rp)
	$ST	$acc1,$SIZE_T*2($rp)
	$ST	$acc2,$SIZE_T*3($rp)
	$ST	$acc3,$SIZE_T*4($rp)
	$ST	$acc4,$SIZE_T*5($rp)
	$ST	$acc5,$SIZE_T*6($rp)
	$ST	$acc6,$SIZE_T*7($rp)
	$ST	$acc7,$SIZE_T*8($rp)

.Lsqr8x_done:
	$PUSH	$zero,$SIZE_T*8($sp)
	$PUSH	$zero,$SIZE_T*10($sp)

	$POP	r14,-$SIZE_T*18($ap)
	li	r3,1			# signal "done"
	$POP	r15,-$SIZE_T*17($ap)
	$POP	r16,-$SIZE_T*16($ap)
	$POP	r17,-$SIZE_T*15($ap)
	$POP	r18,-$SIZE_T*14($ap)
	$POP	r19,-$SIZE_T*13($ap)
	$POP	r20,-$SIZE_T*12($ap)
	$POP	r21,-$SIZE_T*11($ap)
	$POP	r22,-$SIZE_T*10($ap)
	$POP	r23,-$SIZE_T*9($ap)
	$POP	r24,-$SIZE_T*8($ap)
	$POP	r25,-$SIZE_T*7($ap)
	$POP	r26,-$SIZE_T*6($ap)
	$POP	r27,-$SIZE_T*5($ap)
	$POP	r28,-$SIZE_T*4($ap)
	$POP	r29,-$SIZE_T*3($ap)
	$POP	r30,-$SIZE_T*2($ap)
	$POP	r31,-$SIZE_T*1($ap)
	mr	$sp,$ap
	blr
	.long	0
	.byte	0,12,4,0x20,0x80,18,6,0
	.long	0
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}
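# A hedged model of the squaring schedule above: __bn_sqr8x_mont first
# accumulates each off-diagonal product a[i]*a[j] (i<j) once, then
# .Lsqr4x_shift_n_add doubles that triangle with shift-and-add while
# folding in the diagonal squares a[i]*a[i].  "sqr_ref" is a
# hypothetical, never-called 16-bit model of that decomposition:
sub sqr_ref {
	my @a = @_;				# little-endian 16-bit limbs
	my $mask = 0xffff;
	my @t = (0) x (2*@a);
	for my $i (0 .. $#a) {			# off-diagonal triangle, once
		for my $j ($i + 1 .. $#a) {
			$t[$i + $j] += $a[$i] * $a[$j];
		}
	}
	$_ *= 2 for @t;				# double the cross products...
	$t[2*$_] += $a[$_] * $a[$_] for 0 .. $#a;	# ...and add the squares
	my $c = 0;				# finally normalize the limbs
	for my $v (@t) {
		$v += $c;
		$c = $v >> 16;
		$v &= $mask;
	}
	return @t;
}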
$code.=<<___;
.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";