#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaking this effort is basically the following. Even
# though the Power 6 CPU operates at an incredible 4.7GHz clock frequency,
# its PKI performance was observed to be less than impressive, essentially
# as fast as a 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
# Well, it's not surprising that IBM had to make some sacrifices to boost
# the clock frequency that much, but no overall improvement? Having
# observed how much difference switching to the FPU made on UltraSPARC,
# playing the same stunt on Power 6 appeared appropriate... Unfortunately
# the resulting performance improvement is not as impressive, ~30%, and in
# absolute terms it is still very far from what one would expect of a
# 4.7GHz CPU. There is a chance that I'm doing something wrong, but in the
# absence of assembler-level micro-profiling data, or at least a decent
# platform guide, I can't tell... Better results might be achieved with
# VMX... Anyway, this module provides *worse* performance on other PowerPC
# implementations: ~40-15% slower on PPC970 depending on key length, and
# ~40% slower on Power 5 for all key lengths. As it's obviously
# inappropriate as a "best all-round" alternative, it has to be
# complemented with run-time CPU family detection. Oh! It should also be
# noted that, unlike on other PowerPC implementations, the IALU-based
# ppc-mont.pl module performs *suboptimally* on >=1024-bit key lengths on
# Power 6. It should also be noted that *everything* said so far applies
# to 64-bit builds! As far as a 32-bit application executed on a 64-bit
# CPU goes, this module is likely to become the preferred choice, because
# it's easy to adapt for that case and it *is* faster than 32-bit
# ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in a ~15% improvement over
# the original ppc64-mont.pl version, or an overall ~50% improvement over
# the ppc.pl module on Power 6. Compared to ppc-mont.pl on the same
# Power 6 CPU, this module is 5-150% faster depending on key length,
# [hereafter] more for longer keys. But compared to ppc-mont.pl on a
# 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive in
# absolute terms, but that's apparently just the way Power 6 is...

# December 2009

# Adapted for a 32-bit build, this module delivers a 25-120% performance
# improvement (yes, more than *twice* for longer keys) over 32-bit
# ppc-mont.pl on a 1.8GHz PPC970. However! This implementation utilizes
# 64-bit integer operations even in a 32-bit build, and the trouble is
# that most PPC operating systems don't preserve the upper halves of
# general purpose registers upon 32-bit signal delivery. They do preserve
# them upon context switch, but not upon signalling:-( This means that
# asynchronous signals have to be blocked upon entry to this subroutine.
# Signal masking (and of course the complementary unmasking) has quite an
# impact on performance, naturally larger for shorter keys. It's so severe
# that 512-bit key performance can be as low as 1/3 of the expected one.
# This is why, on such OSes, this routine is engaged only for longer key
# operations; see crypto/ppccap.c for further details. MacOS X is an
# exception and doesn't require signal masking, and that's where the above
# improvement coefficients were collected. For the others, the alternative
# would be to break the dependence on the upper halves of GPRs by sticking
# to 32-bit integer operations...

# December 2012

# Remove the above-mentioned dependence on the GPRs' upper halves in the
# 32-bit build. There is no signal masking overhead, but the integer
# instructions are *more* numerous... It's still "universally" faster than
# 32-bit ppc-mont.pl, but the improvement coefficient is not as impressive
# for longer keys...
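
# For reference, the operation bn_mul_mont performs, stripped of all limb
# bookkeeping, is a textbook Montgomery multiplication. A hypothetical
# one-shot sketch of it (not used by the generator; arguments are
# Math::BigInt objects, $n0 here is the full -n^{-1} mod R rather than
# the per-word n0 the assembly uses, and R = 2^(word_bits*num)):
sub _mont_mul_ref {			# reference only, never called
	my ($a, $b, $n, $n0, $R) = @_;
	my $t = $a * $b;
	my $m = ($t * $n0) % $R;	# m = -t/n mod R
	$t = ($t + $m * $n) / $R;	# exact division: low limbs cancel
	$t -= $n if $t >= $n;
	return $t;			# a*b*R^{-1} mod n
}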

$flavour = shift;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
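
# $LITTLE_ENDIAN is XOR-ed into the byte offsets of 32-bit loads and stores
# below: those offsets assume big-endian ordering of the two halves of each
# 64-bit slot, and offset^4 selects the correct half on little-endian
# flavours. A minimal sketch of the effect (hypothetical helper, never
# called at run time):
sub _word_offset { my ($off) = @_; return $off ^ $LITTLE_ENDIAN; }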

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=64;	# padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#     ..A3A2A1A0
#           dcba
#    -----------
#            A0a
#           A0b
#          A0c
#         A0d
#          A1a
#         A1b
#        A1c
#       A1d
#        A2a
#       A2b
#      A2c
#     A2d
#      A3a
#     A3b
#    A3c
#   A3d
#    ..a
#   ..b
#
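
# The FPU path works because products of 16-bit limbs fit exactly in a
# double's 53-bit mantissa, with room to spare for the accumulation the
# unrolled loop performs, so the fcfid/fmul/fmadd/fctid pipeline below is
# lossless. A minimal integer sketch of the limb split and 16-bit carry
# propagation (hypothetical helper, never called at run time; 32x32->64
# shown for brevity, the code below splits 64-bit words into four limbs):
sub _limb_mul_demo {
	my ($a, $b) = @_;		# 32-bit unsigned inputs
	my @a = ($a & 0xffff, ($a >> 16) & 0xffff);
	my @b = ($b & 0xffff, ($b >> 16) & 0xffff);
	my @acc = (0, 0, 0, 0);		# one column per 16-bit digit
	for my $i (0, 1) {
		for my $j (0, 1) {
			$acc[$i+$j] += $a[$i]*$b[$j];	# <2^33, exact in a double
		}
	}
	my ($carry, @out) = (0);
	for my $v (@acc) {
		$v += $carry;			# 16-bit carry propagation,
		$carry = int($v / 65536);	# as the srdi/insrdi pairs do below
		push @out, $v - 65536 * $carry;
	}
	return ($out[0] | $out[1] << 16,	# low 32 bits of a*b
		$out[2] | $out[3] << 16);	# high 32 bits of a*b
}
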
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
#   +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
#   +16*8	+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
#   +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
#   +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
#   -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
#   -12*8	+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+
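
# The nap_d base carved out above is rounded down to a 64-byte boundary in
# the prologue below with 'li $i,-64; and $nap_d,$nap_d,$i'. Equivalent
# arithmetic (hypothetical sketch, never called at run time):
sub _align64_down { my ($addr) = @_; return $addr & ~63; }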

$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
	bnelr-

	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
	addi	$nap_d,$nap_d,-8
	srwi	$j,$num,`3+1`	; counter register, num/2
	addi	$j,$j,-1
	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
	li	$carry,0
	mtctr	$j
___

$code.=<<___ if ($SIZE_T==8);
	ld	$a0,0($ap)		; pull ap[0] value
	ld	$t3,0($bp)		; bp[0]
	ld	$n0,0($n0)		; pull n0[0] value

	mulld	$t7,$a0,$t3		; ap[0]*bp[0]
	; transfer bp[0] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$a0,0($ap)		; pull ap[0,1] value
	mr	$n1,$n0
	lwz	$a1,4($ap)
	li	$c1,0
	lwz	$t1,0($bp)		; bp[0,1]
	lwz	$t3,4($bp)
	lwz	$n0,0($n1)		; pull n0[0,1] value
	lwz	$n1,4($n1)

	mullw	$t4,$a0,$t1		; mulld ap[0]*bp[0]
	mulhwu	$t5,$a0,$t1
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	; transfer bp[0] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	mr	$t0,$a0			; lwz	$t0,0($ap)
	mr	$t1,$a1			; lwz	$t1,4($ap)
	lwz	$t2,8($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t1,4($ap)
	lwz	$t2,8($ap)
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	 add	$t0,$t0,$carry		; can not overflow
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	 srdi	$carry,$t0,16
	 add	$t1,$t1,$carry
	 srdi	$carry,$t1,16

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 insrdi	$t0,$t1,16,32
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 add	$t2,$t2,$carry
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 srdi	$carry,$t2,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 insrdi	$t0,$t2,16,16
	 add	$t3,$t3,$carry
	 srdi	$carry,$t3,16

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 insrdi	$t0,$t3,16,0		; 0..63 bits
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 add	$t4,$t4,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 srdi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 add	$t5,$t5,$carry
	 srdi	$carry,$t5,16
	 insrdi	$t4,$t5,16,32

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 add	$t6,$t6,$carry
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 srdi	$carry,$t6,16
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 insrdi	$t4,$t6,16,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 add	$t7,$t7,$carry
	 insrdi	$t4,$t7,16,0		; 64..127 bits
	 srdi	$carry,$t7,16		; upper 33 bits

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 std	$t0,8($tp)		; tp[j-1]
	 stdu	$t4,16($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 stw	$t0,12($tp)		; tp[j-1]
	 stw	$t4,8($tp)
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 stw	$t2,20($tp)		; tp[j]
	 stwu	$t0,16($tp)
___
}
$code.=<<___;
	bdnz	L1st

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]

	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,8($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 stw	$t0,12($tp)		; tp[j-1]
	 stw	$t4,8($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	 stw	$t2,20($tp)		; tp[j]
	 stwu	$t0,16($tp)

	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,12($tp)		; tp[num-1]
	stw	$t4,8($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	subf	$nap_d,$t7,$nap_d	; rewind pointer

	li	$i,8			; i=1
.align	5
Louter:
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	li	$carry,0
	mtctr	$j
___
$code.=<<___ if ($SIZE_T==8);
	ldx	$t3,$bp,$i		; bp[i]

	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mulld	$t7,$a0,$t3		; ap[0]*bp[i]
	add	$t7,$t7,$t6		; ap[0]*bp[i]+tp[0]
	; transfer bp[i] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
	add	$t0,$bp,$i
	li	$c1,0
	lwz	$t1,0($t0)		; bp[i,i+1]
	lwz	$t3,4($t0)

	mullw	$t4,$a0,$t1		; ap[0]*bp[i]
	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
	mulhwu	$t5,$a0,$t1
	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	addc	$t4,$t4,$t0		; ap[0]*bp[i]+tp[0]
	adde	$t5,$t5,$t2
	; transfer bp[i] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___;
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)

	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lfd	$A0,8($nap_d)		; load a[j] in double format
	 lfd	$A1,16($nap_d)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
	 lfd	$A3,32($nap_d)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	 lfd	$A0,8($nap_d)		; load a[j] in double format
	 lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
	 lfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 ld	$t0,`$FRAME+0`($sp)
	 ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 ld	$t2,`$FRAME+16`($sp)
	 ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 add	$t0,$t0,$carry		; can not overflow
	 ld	$t4,`$FRAME+32`($sp)
	 ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 srdi	$carry,$t0,16
	 add	$t1,$t1,$carry
	 srdi	$carry,$t1,16
	 ld	$t6,`$FRAME+48`($sp)
	 ld	$t7,`$FRAME+56`($sp)

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 insrdi	$t0,$t1,16,32
	 ld	$t1,8($tp)		; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 srdi	$carry,$t2,16
	 insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 add	$t3,$t3,$carry
	 ldu	$t2,16($tp)		; tp[j+1]
	 srdi	$carry,$t3,16
	 insrdi	$t0,$t3,16,0		; 0..63 bits
	 add	$t4,$t4,$carry

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 srdi	$carry,$t5,16
	 insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 add	$t6,$t6,$carry
	 srdi	$carry,$t6,16
	 insrdi	$t4,$t6,16,16

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	 add	$t7,$t7,$carry
	 addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	 insrdi	$t4,$t7,16,0		; 64..127 bits
	 srdi	$carry,$t7,16		; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	 adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 addze	$carry,$carry
	 std	$t3,-16($tp)		; tp[j-1]
	 std	$t5,-8($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	 lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	 lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	 lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	 lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 lwz	$t2,12($tp)		; tp[j]
	 lwz	$t3,8($tp)
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0

	fctid	$T0a,$T0a
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	 addc	$t0,$t0,$t2
	 adde	$t4,$t4,$t3
	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	 addze	$carry,$carry
	 addze	$c1,$c1
	 stw	$t0,4($tp)		; tp[j-1]
	 stw	$t4,0($tp)
	fctid	$T2a,$T2a
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b

	 insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 lwz	$t6,20($tp)
	 lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	 stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	 stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	 stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	 stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	 stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	 stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	 stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	 stfd	$T3b,`$FRAME+56`($sp)
	 stw	$t2,-4($tp)		; tp[j]
	 stw	$t0,-8($tp)
___
}
$code.=<<___;
	bdnz	Linner

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)		; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry

	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]

	add	$carry,$carry,$ovf	; consume upmost overflow
	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 lwz	$t2,12($tp)		; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	 lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16

	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	 stw	$t0,4($tp)		; tp[j-1]
	 stw	$t4,0($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	 lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	 lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	 lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	 lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	 lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	 stw	$t2,-4($tp)		; tp[j]
	 stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)		; tp[num-1]
	stw	$t4,0($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___

$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
	ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	 stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	 stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	 stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___

$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1	; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname

.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";