1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* LOAD selects the integer load that matches the build's word size:
   lwz on 32-bit builds, ld on 64-bit builds. */
42#ifndef __64BIT__
43#define LOAD	lwz
44#else
45#define LOAD	ld
46#endif
47
/* Frame layout. STACKSIZE is the number of bytes the prologue subtracts
   from SP. FZERO is a scratch slot: the prologue stores an integer zero
   word there and the kernel reloads it with lfs to obtain a floating-point
   zero. ALPHA names a second slot; no use of it appears in the visible
   part of this file. */
48#ifdef __64BIT__
49#define STACKSIZE 320
50#define ALPHA   296(SP)
51#define FZERO	304(SP)
52#else
53#define STACKSIZE 240
54#define ALPHA   224(SP)
55#define FZERO	232(SP)
56#endif
57
/* Problem dimensions, taken from r3-r5. */
58#define	M	r3
59#define	N	r4
60#define	K	r5
61
/* Linux: registers that carry A, B, C, LDC and OFFSET. On 64-bit builds
   the prologue overwrites OFFSET (r6) with a value loaded from the
   caller's frame. */
62#ifdef linux
63#ifndef __64BIT__
64#define A	r6
65#define	B	r7
66#define	C	r8
67#define	LDC	r9
68#define OFFSET	r10
69#else
70#define A	r7
71#define	B	r8
72#define	C	r9
73#define	LDC	r10
74#define OFFSET	r6
75#endif
76#endif
77
/* AIX / Darwin: same arguments. The prologue always loads OFFSET from the
   caller's frame on these platforms, and additionally loads LDC from the
   frame for 32-bit DOUBLE builds. */
78#if defined(_AIX) || defined(__APPLE__)
79#if !defined(__64BIT__) && defined(DOUBLE)
80#define A	r8
81#define	B	r9
82#define	C	r10
83#define	LDC	r7
84#define OFFSET	r6
85#else
86#define A	r7
87#define	B	r8
88#define	C	r9
89#define	LDC	r10
90#define OFFSET	r6
91#endif
92#endif
93
/* Integer work registers (all within r18-r31, which the prologue saves):
     AORIG      base of the current A panel in the LN/RT variants
     TEMP       scratch
     KK         running offset count used to position AO/BO
     I, J       row-tile / column-panel counters derived from M and N
     AO, BO     running pointers into the packed A and B buffers
     CO1..CO4   pointers to four consecutive columns of C (LDC apart) */
94#define AORIG	r18
95#define TEMP	r19
96#define KK	r20
97#define	I	r21
98#define J	r22
99#define AO	r23
100#define	BO	r24
101#define	CO1	r25
102#define CO2	r26
103#define	CO3	r27
104#define	CO4	r28
105
/* Floating-point aliases used by the 4x4 inner loops (.L11 - .L16):
   A1-A6 receive values loaded from AO, B1-B10 values loaded from BO. */
106#define A1	f16
107#define A2	f17
108#define A3	f18
109#define A4	f19
110#define A5	f20
111#define A6	f21
112#define B1	f22
113#define B2	f23
114#define B3	f24
115#define B4	f25
116#define B5	f26
117#define B6	f27
118#define B7	f28
119#define B8	f29
120#define B9	f30
121#define B10	f31
122
/*
 * Kernel entry. PROLOGUE / PROFCODE are macros from common.h (not visible
 * here). This section builds the stack frame, saves f14-f31 and r18-r31,
 * fetches the arguments that live in the caller's frame, applies the
 * variant-specific (LN / RN / RT) start adjustments to A, B, C and KK, and
 * leaves early when M, N or K is not positive.
 */
123	PROLOGUE
124	PROFCODE
125
/* Reserve the frame. r0 = 0 is written to FZERO further down. */
126	addi	SP, SP, -STACKSIZE
127	li	r0, 0
128
/* Save f14-f31 at frame offsets 0..136. */
129	stfd	f14,    0(SP)
130	stfd	f15,    8(SP)
131	stfd	f16,   16(SP)
132	stfd	f17,   24(SP)
133
134	stfd	f18,   32(SP)
135	stfd	f19,   40(SP)
136	stfd	f20,   48(SP)
137	stfd	f21,   56(SP)
138
139	stfd	f22,   64(SP)
140	stfd	f23,   72(SP)
141	stfd	f24,   80(SP)
142	stfd	f25,   88(SP)
143
144	stfd	f26,   96(SP)
145	stfd	f27,  104(SP)
146	stfd	f28,  112(SP)
147	stfd	f29,  120(SP)
148
149	stfd	f30,  128(SP)
150	stfd	f31,  136(SP)
151
/* Save r18-r31 starting at offset 144 (8-byte slots on 64-bit builds,
   4-byte slots otherwise). */
152#ifdef __64BIT__
153	std	r31,  144(SP)
154	std	r30,  152(SP)
155	std	r29,  160(SP)
156	std	r28,  168(SP)
157	std	r27,  176(SP)
158	std	r26,  184(SP)
159	std	r25,  192(SP)
160	std	r24,  200(SP)
161	std	r23,  208(SP)
162	std	r22,  216(SP)
163	std	r21,  224(SP)
164	std	r20,  232(SP)
165	std	r19,  240(SP)
166	std	r18,  248(SP)
167#else
168	stw	r31,  144(SP)
169	stw	r30,  148(SP)
170	stw	r29,  152(SP)
171	stw	r28,  156(SP)
172	stw	r27,  160(SP)
173	stw	r26,  164(SP)
174	stw	r25,  168(SP)
175	stw	r24,  172(SP)
176	stw	r23,  176(SP)
177	stw	r22,  180(SP)
178	stw	r21,  184(SP)
179	stw	r20,  188(SP)
180	stw	r19,  192(SP)
181	stw	r18,  196(SP)
182#endif
183
/* Zero word that is later reloaded with lfs as a floating-point 0. */
184	stw	r0,  FZERO
185
/* AIX/Darwin 32-bit DOUBLE: LDC is read from the caller's frame
   (offsets are relative to the pre-adjustment SP, hence + STACKSIZE). */
186#if defined(_AIX) || defined(__APPLE__)
187#if !defined(__64BIT__) && defined(DOUBLE)
188	lwz	LDC,    56 + STACKSIZE(SP)
189#endif
190#endif
191
/* Scale LDC by 2^BASE_SHIFT. BASE_SHIFT comes from common.h; presumably
   log2 of the element size, turning LDC into a byte stride - confirm. */
192	slwi	LDC, LDC, BASE_SHIFT
193
/* OFFSET is read from the caller's frame on Linux 64-bit ... */
194#if defined(linux) && defined(__64BIT__)
195	ld	OFFSET,   112 + STACKSIZE(SP)
196#endif
197
/* ... and on every AIX/Darwin build (slot depends on word size / DOUBLE). */
198#if defined(_AIX) || defined(__APPLE__)
199#ifdef __64BIT__
200	ld	OFFSET,  112 + STACKSIZE(SP)
201#else
202#ifdef DOUBLE
203	lwz	OFFSET,   60 + STACKSIZE(SP)
204#else
205	lwz	OFFSET,   56 + STACKSIZE(SP)
206#endif
207#endif
208#endif
209
/* LN: start from the far end - A += M*K elements, C += M elements. */
210#ifdef LN
211	mullw	r0, M, K
212	slwi	r0, r0, BASE_SHIFT
213	add	A, A, r0
214
215	slwi	r0, M, BASE_SHIFT
216	add	C, C, r0
217#endif
218
/* RN: KK = -OFFSET. */
219#ifdef RN
220	neg	KK, OFFSET
221#endif
222
/* RT: B += N*K elements, C += N*LDC (LDC already scaled), KK = N - OFFSET. */
223#ifdef RT
224	mullw	r0, N, K
225	slwi	r0, r0, BASE_SHIFT
226	add	B, B, r0
227
228	mullw	r0, N, LDC
229	add	C, C, r0
230
231	sub	KK, N, OFFSET
232#endif
233
/* Nothing to do when any dimension is <= 0 (.L999 lies outside this
   excerpt). */
234	cmpwi	cr0, M, 0
235	ble	.L999
236	cmpwi	cr0, N, 0
237	ble	.L999
238	cmpwi	cr0, K, 0
239	ble	.L999
240
/* f0 = 0.0, used to clear the other accumulators. */
241	lfs	f0, FZERO
242
/* J = N >> 2: number of 4-column panels; with none, go to .L40 (outside
   this excerpt). */
243	srawi.	J, N,  2
244	ble	.L40
245	.align 4
246
/*
 * .L10 - setup for one panel of four C columns: position B and C (RT),
 * derive the four column pointers, initialise KK for the left-side
 * variants, clear accumulators f1-f15 from f0, and record the A base.
 */
247.L10:
248
/* RT walks panels backwards: B -= 4*K elements, C -= 4*LDC. */
249#ifdef RT
250	slwi	r0, K, 2 + BASE_SHIFT
251	sub	B, B, r0
252
253	slwi	r0, LDC, 2
254	sub	C, C, r0
255#endif
256
/* CO1..CO4 = C, C+LDC, C+2*LDC, C+3*LDC. */
257	mr	CO1, C
258	add	CO2, C,  LDC
259	add	CO3, CO2, LDC
260	add	CO4, CO3, LDC
261
/* LN: KK = M + OFFSET. */
262#ifdef LN
263	add	KK, M, OFFSET
264#endif
265
/* LT: KK = OFFSET. */
266#ifdef LT
267	mr	KK, OFFSET
268#endif
269
/* Copy the zero in f0 into accumulators f1-f15. */
270 	fmr	f1,  f0
271	fmr	f2,  f0
272	fmr	f3,  f0
273	fmr	f4,  f0
274	fmr	f5,  f0
275	fmr	f6,  f0
276	fmr	f7,  f0
277	fmr	f8,  f0
278	fmr	f9,  f0
279	fmr	f10, f0
280	fmr	f11, f0
281	fmr	f12, f0
282	fmr	f13, f0
283	fmr	f14, f0
284	fmr	f15, f0
285
/* LN/RT recompute AO from AORIG for every tile; LT/RN advance AO
   directly. */
286#if defined(LN) || defined(RT)
287	mr	AORIG, A
288#else
289	mr	AO, A
290#endif
/* Except for RT, step C forward to the next four columns now. */
291#ifndef RT
292	add	C,  CO4, LDC
293#endif
294
/*
 * .L30 - 1-row x 4-column tile, taken only when bit 0 of M is set.
 * Positions AO/BO, preloads four values from AO and eight from B/BO, and
 * loads CTR with the number of 4-step iterations for .L32.
 */
295.L30:
/* I = M & 1; skip to the 2-row tile when it is zero. */
296	andi.	I,  M,  1
297	ble	.L20
298
/* LT/RN: multiply over the first KK steps, reading B from its start. */
299#if defined(LT) || defined(RN)
300	LFD	f16,  0 * SIZE(AO)
301	LFD	f17,  1 * SIZE(AO)
302	LFD	f18,  2 * SIZE(AO)
303	LFD	f19,  3 * SIZE(AO)
304
305	LFD	f20,  0 * SIZE(B)
306	LFD	f21,  1 * SIZE(B)
307	LFD	f22,  2 * SIZE(B)
308	LFD	f23,  3 * SIZE(B)
309
310	LFD	f24,  4 * SIZE(B)
311	LFD	f25,  5 * SIZE(B)
312	LFD	f26,  6 * SIZE(B)
313	LFD	f27,  7 * SIZE(B)
314
/* CTR = KK / 4; the condition set here is consumed by the ble below. */
315	srawi.	r0, KK,  2
316	mtspr	CTR, r0
317	mr	BO,  B
318#else
319
/* LN: move the A base back by one row of K elements. */
320#ifdef LN
321	slwi	r0,   K,  BASE_SHIFT
322	sub	AORIG, AORIG, r0
323#endif
324
/* LN/RT: start at step KK - AO = AORIG + KK elements,
   BO = B + 4*KK elements - and run over the remaining K - KK steps. */
325	slwi	r0,   KK, 0 + BASE_SHIFT
326	slwi	TEMP, KK, 2 + BASE_SHIFT
327	add	AO, AORIG, r0
328	add	BO, B,     TEMP
329
330	sub	TEMP, K, KK
331
332	LFD	f16,  0 * SIZE(AO)
333	LFD	f17,  1 * SIZE(AO)
334	LFD	f18,  2 * SIZE(AO)
335	LFD	f19,  3 * SIZE(AO)
336
337	LFD	f20,  0 * SIZE(BO)
338	LFD	f21,  1 * SIZE(BO)
339	LFD	f22,  2 * SIZE(BO)
340	LFD	f23,  3 * SIZE(BO)
341
342	LFD	f24,  4 * SIZE(BO)
343	LFD	f25,  5 * SIZE(BO)
344	LFD	f26,  6 * SIZE(BO)
345	LFD	f27,  7 * SIZE(BO)
346
/* CTR = (K - KK) / 4. */
347	srawi.	r0, TEMP,  2
348	mtspr	CTR, r0
349#endif
/* No full group of four steps: go straight to the remainder loop. */
350	ble	.L35
351	.align 5
352
/*
 * .L32 - 1x4 tile, four multiply steps per iteration. Steps 0 and 2
 * accumulate into f0/f4/f8/f12, steps 1 and 3 into f1/f5/f9/f13; the two
 * sets are summed after the loop. Each iteration consumes 4 AO elements
 * and 16 BO elements, and loads for the next steps are interleaved with
 * the FMADDs.
 */
353.L32:
/* step 0: AO[0] (f16) times BO[0..3] (f20-f23) */
354	FMADD	f0,  f16, f20, f0
355	LFD	f20,  8 * SIZE(BO)
356	FMADD	f4,  f16, f21, f4
357	LFD	f21,  9 * SIZE(BO)
358	FMADD	f8,  f16, f22, f8
359	LFD	f22, 10 * SIZE(BO)
360	FMADD	f12, f16, f23, f12
361	LFD	f23, 11 * SIZE(BO)
/* update form: AO += 4 elements, f16 = first element of the next group */
362	LFDU	f16,  4 * SIZE(AO)
363
/* step 1: f17 times BO[4..7] (f24-f27) */
364	FMADD	f1,  f17, f24, f1
365	LFD	f24, 12 * SIZE(BO)
366	FMADD	f5,  f17, f25, f5
367	LFD	f25, 13 * SIZE(BO)
368	FMADD	f9,  f17, f26, f9
369	LFD	f26, 14 * SIZE(BO)
370	FMADD	f13, f17, f27, f13
371	LFD	f27, 15 * SIZE(BO)
372	LFD	f17,  1 * SIZE(AO)
373
/* step 2: f18 times BO[8..11]; the LFDU advances BO by 16 elements */
374	FMADD	f0,  f18, f20, f0
375	LFDU	f20, 16 * SIZE(BO)
376	FMADD	f4,  f18, f21, f4
377	LFD	f21,  1 * SIZE(BO)
378	FMADD	f8,  f18, f22, f8
379	LFD	f22,  2 * SIZE(BO)
380	FMADD	f12, f18, f23, f12
381	LFD	f23,  3 * SIZE(BO)
382	LFD	f18,  2 * SIZE(AO)
383
/* step 3: f19 times the former BO[12..15] */
384	FMADD	f1,  f19, f24, f1
385	LFD	f24,  4 * SIZE(BO)
386	FMADD	f5,  f19, f25, f5
387	LFD	f25,  5 * SIZE(BO)
388	FMADD	f9,  f19, f26, f9
389	LFD	f26,  6 * SIZE(BO)
390	FMADD	f13, f19, f27, f13
391	LFD	f27,  7 * SIZE(BO)
392	LFD	f19,  3 * SIZE(AO)
393	bdnz	.L32
394
/* Fold the odd-step partial sums into the even-step accumulators. */
395	fadd	f0,  f1,   f0
396	fadd	f4,  f5,   f4
397	fadd	f8,  f9,   f8
398	fadd	f12, f13, f12
399	.align 4
400
/*
 * .L35 / .L36 - 1x4 tile remainder: the last (count mod 4) multiply
 * steps, one per iteration, where count is KK for LT/RN and K - KK
 * (held in TEMP) otherwise.
 */
401.L35:
402#if defined(LT) || defined(RN)
403	andi.	r0, KK,  3
404#else
405	andi.	r0, TEMP, 3
406#endif
407	mtspr	CTR, r0
/* No leftover steps: go to the solve. */
408	ble+	.L38
409	.align 4
410
411.L36:
/* One step: f16 (AO element) times f20-f23 (four BO elements), then
   advance BO by 4 elements and AO by 1 element via the update-form loads. */
412	FMADD	f0,  f16, f20, f0
413	LFDU	f20,  4 * SIZE(BO)
414	FMADD	f4,  f16, f21, f4
415	LFD	f21,  1 * SIZE(BO)
416	FMADD	f8,  f16, f22, f8
417	LFD	f22,  2 * SIZE(BO)
418	FMADD	f12, f16, f23, f12
419	LFDU	f16,  1 * SIZE(AO)
420	LFD	f23,  3 * SIZE(BO)
421	bdnz	.L36
422	.align 4
423
/*
 * .L38 - 1x4 tile: subtract the accumulated products from the stored
 * right-hand side, apply the triangular solve for the active variant,
 * write the result to the packed buffer and to C, then reset the
 * accumulators and advance the pointers / KK for the next tile.
 *
 * Diagonal entries are applied with FMUL, so they are presumably stored
 * pre-inverted by the packing code - confirm against the packing routine.
 * FSUB / FNMSUB are macros from common.h; the notes below assume the
 * operand order of fsub (frA - frB) and fnmsub (frB - frA*frC).
 */
424.L38:
/* LN/RT: re-point AO and BO at the tile being solved:
   r0 = KK - 1 (LN) or KK - 4 (RT); AO = AORIG + r0 elements,
   BO = B + 4*r0 elements. */
425#if defined(LN) || defined(RT)
426#ifdef LN
427	subi	r0, KK, 1
428#else
429	subi	r0, KK, 4
430#endif
431	slwi	TEMP, r0, 0 + BASE_SHIFT
432	slwi	r0,   r0, 2 + BASE_SHIFT
433	add	AO, AORIG, TEMP
434	add	BO, B,     r0
435#endif
436
/* LN/LT: right-hand side is read from BO; value = loaded - accumulated. */
437#if defined(LN) || defined(LT)
438	LFD	f16,  0 * SIZE(BO)
439	LFD	f17,  1 * SIZE(BO)
440	LFD	f18,  2 * SIZE(BO)
441	LFD	f19,  3 * SIZE(BO)
442
443	FSUB	f0,  f16, f0
444	FSUB	f4,  f17, f4
445	FSUB	f8,  f18, f8
446	FSUB	f12, f19, f12
447#else
/* RN/RT: right-hand side is read from AO. */
448	LFD	f16,  0 * SIZE(AO)
449	LFD	f20,  1 * SIZE(AO)
450	LFD	f24,  2 * SIZE(AO)
451	LFD	f28,  3 * SIZE(AO)
452
453	FSUB	f0,  f16, f0
454	FSUB	f4,  f20, f4
455	FSUB	f8,  f24, f8
456	FSUB	f12, f28, f12
457#endif
458
/* LN: single row - scale all four values by AO[0]. */
459#ifdef LN
460	LFD	f21,  0 * SIZE(AO)
461
462	FMUL	f0,  f21, f0
463	FMUL	f4,  f21, f4
464	FMUL	f8,  f21, f8
465	FMUL	f12, f21, f12
466#endif
467
/* LT: same scaling by AO[0]. */
468#ifdef LT
469	LFD	f16,  0 * SIZE(AO)
470
471	FMUL	f0,  f16, f0
472	FMUL	f4,  f16, f4
473	FMUL	f8,  f16, f8
474	FMUL	f12, f16, f12
475#endif
476
/* RN: forward substitution across the four columns (f0, f4, f8, f12)
   using BO[0..3], BO[5..7], BO[10..11], BO[15] of a 4x4 block. */
477#ifdef RN
478	LFD	f16,  0 * SIZE(BO)
479	LFD	f17,  1 * SIZE(BO)
480	LFD	f18,  2 * SIZE(BO)
481	LFD	f19,  3 * SIZE(BO)
482
483	FMUL	f0,  f16, f0
484	FNMSUB	f4,  f17, f0,  f4
485	FNMSUB	f8,  f18, f0,  f8
486	FNMSUB	f12, f19, f0,  f12
487
488	LFD	f16,  5 * SIZE(BO)
489	LFD	f17,  6 * SIZE(BO)
490	LFD	f18,  7 * SIZE(BO)
491	LFD	f19, 10 * SIZE(BO)
492
493	LFD	f20, 11 * SIZE(BO)
494	LFD	f21, 15 * SIZE(BO)
495
496	FMUL	f4,  f16, f4
497	FNMSUB	f8,  f17, f4,  f8
498	FNMSUB	f12, f18, f4,  f12
499	FMUL	f8,  f19, f8
500	FNMSUB	f12, f20, f8,  f12
501	FMUL	f12, f21, f12
502#endif
503
/* RT: backward substitution (f12, f8, f4, f0) using BO[15..12],
   BO[10..8], BO[5..4], BO[0]. */
504#ifdef RT
505	LFD	f16, 15 * SIZE(BO)
506	LFD	f17, 14 * SIZE(BO)
507	LFD	f18, 13 * SIZE(BO)
508	LFD	f19, 12 * SIZE(BO)
509
510	FMUL	f12, f16, f12
511	FNMSUB	f8,  f17, f12, f8
512	FNMSUB	f4,  f18, f12, f4
513	FNMSUB	f0,  f19, f12, f0
514
515	LFD	f16, 10 * SIZE(BO)
516	LFD	f17,  9 * SIZE(BO)
517	LFD	f18,  8 * SIZE(BO)
518	LFD	f19,  5 * SIZE(BO)
519
520	FMUL	f8,  f16, f8
521
522	LFD	f20,  4 * SIZE(BO)
523	LFD	f21,  0 * SIZE(BO)
524
525	FNMSUB	f4,  f17, f8,  f4
526	FNMSUB	f0,  f18, f8,  f0
527
528	FMUL	f4,  f19, f4
529	FNMSUB	f0,  f20, f4,  f0
530	FMUL	f0,  f21, f0
531#endif
532
/* LN writes C backwards: step the column pointers back one element
   before storing. */
533#ifdef LN
534	subi	CO1, CO1, 1 * SIZE
535	subi	CO2, CO2, 1 * SIZE
536	subi	CO3, CO3, 1 * SIZE
537	subi	CO4, CO4, 1 * SIZE
538#endif
539
/* Write the solved values back over the right-hand side they came from
   (BO for LN/LT, AO for RN/RT). */
540#if defined(LN) || defined(LT)
541	STFD	f0,   0 * SIZE(BO)
542	STFD	f4,   1 * SIZE(BO)
543	STFD	f8,   2 * SIZE(BO)
544	STFD	f12,  3 * SIZE(BO)
545#else
546	STFD	f0,   0 * SIZE(AO)
547	STFD	f4,   1 * SIZE(AO)
548	STFD	f8,   2 * SIZE(AO)
549	STFD	f12,  3 * SIZE(AO)
550#endif
551
/* ... and into the four columns of C. */
552	STFD	f0,   0 * SIZE(CO1)
553	STFD	f4,   0 * SIZE(CO2)
554	STFD	f8,   0 * SIZE(CO3)
555	STFD	f12,  0 * SIZE(CO4)
556
/* Re-zero the accumulators this tile used. */
557	lfs	f0,  FZERO
558 	fmr	f1,  f0
559	fmr	f4,  f0
560	fmr	f5,  f0
561
562	fmr	f8,  f0
563	fmr	f9,  f0
564	fmr	f12, f0
565	fmr	f13, f0
566
/* All variants except LN move forward one element in C. */
567#ifndef LN
568	addi	CO1, CO1, 1 * SIZE
569	addi	CO2, CO2, 1 * SIZE
570	addi	CO3, CO3, 1 * SIZE
571	addi	CO4, CO4, 1 * SIZE
572#endif
573
/* RT: A base moves forward by one row of K elements. */
574#ifdef RT
575	slwi	r0, K, 0 + BASE_SHIFT
576	add	AORIG, AORIG, r0
577#endif
578
/* LT/RN: skip the K - KK steps not multiplied:
   AO += (K - KK) elements, BO += 4*(K - KK) elements. */
579#if defined(LT) || defined(RN)
580	sub	TEMP, K, KK
581	slwi	r0,   TEMP, 0 + BASE_SHIFT
582	slwi	TEMP, TEMP, 2 + BASE_SHIFT
583	add	AO, AO, r0
584	add	BO, BO, TEMP
585#endif
586
/* One row handled: KK moves down (LN) or up (LT) by 1. */
587#ifdef LN
588	subi	KK, KK, 1
589#endif
590
591#ifdef LT
592	addi	KK, KK, 1
593#endif
594	.align 4
595
/*
 * .L20 - 2-row x 4-column tile, taken only when bit 1 of M is set.
 * Same structure as the 1-row setup, with A addressed in groups of two
 * elements per step.
 */
596.L20:
/* I = M & 2; skip to the 4-row tiles when it is zero. */
597	andi.	I,  M,  2
598	ble	.L09
599
/* LT/RN: multiply over the first KK steps, reading B from its start. */
600#if defined(LT) || defined(RN)
601	LFD	f16,  0 * SIZE(AO)
602	LFD	f17,  1 * SIZE(AO)
603	LFD	f18,  2 * SIZE(AO)
604	LFD	f19,  3 * SIZE(AO)
605
606	LFD	f20,  0 * SIZE(B)
607	LFD	f21,  1 * SIZE(B)
608	LFD	f22,  2 * SIZE(B)
609	LFD	f23,  3 * SIZE(B)
610
611	LFD	f24,  4 * SIZE(B)
612	LFD	f25,  5 * SIZE(B)
613	LFD	f26,  6 * SIZE(B)
614	LFD	f27,  7 * SIZE(B)
615
/* CTR = KK / 4; the condition set here is consumed by the ble below. */
616	srawi.	r0, KK,  2
617	mtspr	CTR, r0
618	mr	BO,  B
619#else
620
/* LN: move the A base back by two rows (2*K elements). */
621#ifdef LN
622	slwi	r0,   K,  1 + BASE_SHIFT
623	sub	AORIG, AORIG, r0
624#endif
625
/* LN/RT: AO = AORIG + 2*KK elements, BO = B + 4*KK elements;
   TEMP = K - KK steps remain. */
626	slwi	r0,   KK, 1 + BASE_SHIFT
627	slwi	TEMP, KK, 2 + BASE_SHIFT
628	add	AO, AORIG, r0
629	add	BO, B,     TEMP
630
631	sub	TEMP, K, KK
632
633	LFD	f16,  0 * SIZE(AO)
634	LFD	f17,  1 * SIZE(AO)
635	LFD	f18,  2 * SIZE(AO)
636	LFD	f19,  3 * SIZE(AO)
637
638	LFD	f20,  0 * SIZE(BO)
639	LFD	f21,  1 * SIZE(BO)
640	LFD	f22,  2 * SIZE(BO)
641	LFD	f23,  3 * SIZE(BO)
642
643	LFD	f24,  4 * SIZE(BO)
644	LFD	f25,  5 * SIZE(BO)
645	LFD	f26,  6 * SIZE(BO)
646	LFD	f27,  7 * SIZE(BO)
647
/* CTR = (K - KK) / 4. */
648	srawi.	r0, TEMP,  2
649	mtspr	CTR, r0
650#endif
/* No full group of four steps: go straight to the remainder loop. */
651	ble	.L25
652	.align 5
653
/*
 * .L22 - 2x4 tile, four multiply steps per iteration. Steps 0 and 2 use
 * f16/f17 (the two A values of the step) and accumulate into
 * f0,f1 / f4,f5 / f8,f9 / f12,f13; steps 1 and 3 use f18/f19 and
 * accumulate into f2,f3 / f6,f7 / f10,f11 / f14,f15. The two sets are
 * summed after the loop. Each iteration consumes 8 AO and 16 BO elements.
 * The nop instructions are kept exactly as in the original; presumably
 * they pad the issue pattern - confirm before removing.
 */
654.L22:
/* step 0: f16,f17 times BO[0..3] */
655	FMADD	f0,  f16, f20, f0
656	nop
657	FMADD	f1,  f17, f20, f1
658	LFD	f20,  8 * SIZE(BO)
659	FMADD	f4,  f16, f21, f4
660	nop
661	FMADD	f5,  f17, f21, f5
662	LFD	f21,  9 * SIZE(BO)
663
664	FMADD	f8,  f16, f22, f8
665	nop
666	FMADD	f9,  f17, f22, f9
667	LFD	f22, 10 * SIZE(BO)
668	FMADD	f12, f16, f23, f12
669	LFD	f16,  4 * SIZE(AO)
670	FMADD	f13, f17, f23, f13
671	LFD	f23, 11 * SIZE(BO)
672
/* step 1: f18,f19 times BO[4..7] */
673	FMADD	f2,  f18, f24, f2
674	LFD	f17,  5 * SIZE(AO)
675	FMADD	f3,  f19, f24, f3
676	LFD	f24, 12 * SIZE(BO)
677	FMADD	f6,  f18, f25, f6
678	nop
679	FMADD	f7,  f19, f25, f7
680	LFD	f25, 13 * SIZE(BO)
681
682	FMADD	f10, f18, f26, f10
683	nop
684	FMADD	f11, f19, f26, f11
685	LFD	f26, 14 * SIZE(BO)
686	FMADD	f14, f18, f27, f14
687	LFD	f18,  6 * SIZE(AO)
688	FMADD	f15, f19, f27, f15
689	LFD	f27, 15 * SIZE(BO)
690
/* step 2: AO[4],AO[5] times BO[8..11]; the LFDU advances BO by 16 */
691	FMADD	f0,  f16, f20, f0
692	LFD	f19,  7 * SIZE(AO)
693	FMADD	f1,  f17, f20, f1
694	LFDU	f20, 16 * SIZE(BO)
695	FMADD	f4,  f16, f21, f4
696	nop
697	FMADD	f5,  f17, f21, f5
698	LFD	f21,  1 * SIZE(BO)
699
700	FMADD	f8,  f16, f22, f8
701	nop
702	FMADD	f9,  f17, f22, f9
703	LFD	f22,  2 * SIZE(BO)
704	FMADD	f12, f16, f23, f12
/* update form: AO += 8 elements */
705	LFDU	f16,  8 * SIZE(AO)
706	FMADD	f13, f17, f23, f13
707	LFD	f23,  3 * SIZE(BO)
708
/* step 3: former AO[6],AO[7] times former BO[12..15] */
709	FMADD	f2,  f18, f24, f2
710	LFD	f17,  1 * SIZE(AO)
711	FMADD	f3,  f19, f24, f3
712	LFD	f24,  4 * SIZE(BO)
713	FMADD	f6,  f18, f25, f6
714	nop
715	FMADD	f7,  f19, f25, f7
716	LFD	f25,  5 * SIZE(BO)
717
718	FMADD	f10, f18, f26, f10
719	nop
720	FMADD	f11, f19, f26, f11
721	LFD	f26,  6 * SIZE(BO)
722	FMADD	f14, f18, f27, f14
723	LFD	f18,  2 * SIZE(AO)
724	FMADD	f15, f19, f27, f15
725	LFD	f19,  3 * SIZE(AO)
726	LFD	f27,  7 * SIZE(BO)
727	bdnz	.L22
728
/* Fold the odd-step partial sums into the even-step accumulators. */
729	fadd	f0,  f2,  f0
730	fadd	f1,  f3,  f1
731	fadd	f4,  f6,  f4
732	fadd	f5,  f7,  f5
733	fadd	f8,  f10, f8
734	fadd	f9,  f11, f9
735	fadd	f12, f14, f12
736	fadd	f13, f15, f13
737	.align 4
738
/*
 * .L25 / .L26 - 2x4 tile remainder: the last (count mod 4) multiply
 * steps, one per iteration, where count is KK for LT/RN and K - KK
 * (held in TEMP) otherwise.
 */
739.L25:
740#if defined(LT) || defined(RN)
741	andi.	r0, KK,  3
742#else
743	andi.	r0, TEMP, 3
744#endif
745	mtspr	CTR, r0
/* No leftover steps: go to the solve. */
746	ble+	.L28
747	.align 4
748
749.L26:
/* One step: f16,f17 (two AO elements) times f20-f23 (four BO elements);
   the update-form loads advance BO by 4 and AO by 2 elements. */
750	FMADD	f0,  f16, f20, f0
751	nop
752	FMADD	f1,  f17, f20, f1
753	LFDU	f20,  4 * SIZE(BO)
754	FMADD	f4,  f16, f21, f4
755	nop
756	FMADD	f5,  f17, f21, f5
757	LFD	f21,  1 * SIZE(BO)
758
759	FMADD	f8,  f16, f22, f8
760	nop
761	FMADD	f9,  f17, f22, f9
762	LFD	f22,  2 * SIZE(BO)
763	FMADD	f12, f16, f23, f12
764	LFDU	f16,  2 * SIZE(AO)
765	FMADD	f13, f17, f23, f13
766	LFD	f17,  1 * SIZE(AO)
767	LFD	f23,  3 * SIZE(BO)
768	bdnz	.L26
769	.align 4
770
/*
 * .L28 - 2x4 tile: subtract the accumulated products from the stored
 * right-hand side, apply the triangular solve for the active variant,
 * write the result to the packed buffer and to C, then reset the
 * accumulators and advance the pointers / KK for the next tile.
 *
 * Register layout: f0,f1 = rows 0,1 of column 0; f4,f5 = column 1;
 * f8,f9 = column 2; f12,f13 = column 3.
 * Diagonal entries are applied with FMUL, so they are presumably stored
 * pre-inverted - confirm against the packing routine. The notes assume
 * FSUB / FNMSUB follow fsub (frA - frB) and fnmsub (frB - frA*frC).
 */
771.L28:
/* LN/RT: re-point AO and BO at the tile being solved:
   r0 = KK - 2 (LN) or KK - 4 (RT); AO = AORIG + 2*r0 elements,
   BO = B + 4*r0 elements. */
772#if defined(LN) || defined(RT)
773#ifdef LN
774	subi	r0, KK, 2
775#else
776	subi	r0, KK, 4
777#endif
778	slwi	TEMP, r0, 1 + BASE_SHIFT
779	slwi	r0,   r0, 2 + BASE_SHIFT
780	add	AO, AORIG, TEMP
781	add	BO, B,     r0
782#endif
783
/* LN/LT: right-hand side in BO, stored row by row (row 0 in BO[0..3],
   row 1 in BO[4..7]). */
784#if defined(LN) || defined(LT)
785	LFD	f16,  0 * SIZE(BO)
786	LFD	f17,  1 * SIZE(BO)
787	LFD	f18,  2 * SIZE(BO)
788	LFD	f19,  3 * SIZE(BO)
789
790	LFD	f20,  4 * SIZE(BO)
791 	LFD	f21,  5 * SIZE(BO)
792	LFD	f22,  6 * SIZE(BO)
793	LFD	f23,  7 * SIZE(BO)
794
795	FSUB	f0,  f16, f0
796	FSUB	f4,  f17, f4
797	FSUB	f8,  f18, f8
798	FSUB	f12, f19, f12
799
800	FSUB	f1,  f20, f1
801	FSUB	f5,  f21, f5
802	FSUB	f9,  f22, f9
803	FSUB	f13, f23, f13
804#else
/* RN/RT: right-hand side in AO, stored column by column (two rows per
   column). */
805	LFD	f16,  0 * SIZE(AO)
806	LFD	f17,  1 * SIZE(AO)
807	LFD	f20,  2 * SIZE(AO)
808 	LFD	f21,  3 * SIZE(AO)
809
810	LFD	f24,  4 * SIZE(AO)
811	LFD	f25,  5 * SIZE(AO)
812	LFD	f28,  6 * SIZE(AO)
813	LFD	f29,  7 * SIZE(AO)
814
815	FSUB	f0,  f16, f0
816	FSUB	f1,  f17, f1
817	FSUB	f4,  f20, f4
818	FSUB	f5,  f21, f5
819
820	FSUB	f8,  f24, f8
821	FSUB	f9,  f25, f9
822	FSUB	f12, f28, f12
823	FSUB	f13, f29, f13
824#endif
825
/* LN: 2x2 back substitution - row 1 scaled by AO[3], row 0 reduced by
   AO[2] * row 1 and then scaled by AO[0]. */
826#ifdef LN
827	LFD	f19,  3 * SIZE(AO)
828	LFD	f20,  2 * SIZE(AO)
829	LFD	f21,  0 * SIZE(AO)
830
831	FMUL	f1,  f19, f1
832	FMUL	f5,  f19, f5
833	FMUL	f9,  f19, f9
834	FMUL	f13, f19, f13
835
836	FNMSUB	f0,  f20, f1,  f0
837	FNMSUB	f4,  f20, f5,  f4
838	FNMSUB	f8,  f20, f9,  f8
839	FNMSUB	f12, f20, f13, f12
840
841	FMUL	f0,  f21, f0
842	FMUL	f4,  f21, f4
843	FMUL	f8,  f21, f8
844	FMUL	f12, f21, f12
845#endif
846
/* LT: 2x2 forward substitution - row 0 scaled by AO[0], row 1 reduced by
   AO[1] * row 0 and then scaled by AO[3]. */
847#ifdef LT
848	LFD	f16,  0 * SIZE(AO)
849	LFD	f17,  1 * SIZE(AO)
850
851	FMUL	f0,  f16, f0
852	FMUL	f4,  f16, f4
853	FMUL	f8,  f16, f8
854	FMUL	f12, f16, f12
855
856	FNMSUB	f1,  f17, f0,  f1
857	FNMSUB	f5,  f17, f4,  f5
858	FNMSUB	f9,  f17, f8,  f9
859	FNMSUB	f13, f17, f12, f13
860
861	LFD	f17,  3 * SIZE(AO)
862
863	FMUL	f1,  f17, f1
864	FMUL	f5,  f17, f5
865	FMUL	f9,  f17, f9
866	FMUL	f13, f17, f13
867#endif
868
/* RN: forward substitution across the four columns, both rows at once,
   using BO[0..3], BO[5..7], BO[10..11], BO[15]. */
869#ifdef RN
870	LFD	f16,  0 * SIZE(BO)
871	LFD	f17,  1 * SIZE(BO)
872	LFD	f18,  2 * SIZE(BO)
873	LFD	f19,  3 * SIZE(BO)
874
875	FMUL	f0,  f16, f0
876	FMUL	f1,  f16, f1
877	FNMSUB	f4,  f17, f0,  f4
878	FNMSUB	f5,  f17, f1,  f5
879	FNMSUB	f8,  f18, f0,  f8
880	FNMSUB	f9,  f18, f1,  f9
881	FNMSUB	f12, f19, f0,  f12
882	FNMSUB	f13, f19, f1,  f13
883
884	LFD	f16,  5 * SIZE(BO)
885	LFD	f17,  6 * SIZE(BO)
886	LFD	f18,  7 * SIZE(BO)
887	LFD	f19, 10 * SIZE(BO)
888
889	LFD	f20, 11 * SIZE(BO)
890	LFD	f21, 15 * SIZE(BO)
891
892	FMUL	f4,  f16, f4
893	FMUL	f5,  f16, f5
894	FNMSUB	f8,  f17, f4,  f8
895	FNMSUB	f9,  f17, f5,  f9
896	FNMSUB	f12, f18, f4,  f12
897	FNMSUB	f13, f18, f5,  f13
898
899	FMUL	f8,  f19, f8
900	FMUL	f9,  f19, f9
901	FNMSUB	f12, f20, f8,  f12
902	FNMSUB	f13, f20, f9,  f13
903	FMUL	f12, f21, f12
904	FMUL	f13, f21, f13
905#endif
906
/* RT: backward substitution from the last column, using BO[15..12],
   BO[10..8], BO[5..4], BO[0]. */
907#ifdef RT
908	LFD	f16, 15 * SIZE(BO)
909	LFD	f17, 14 * SIZE(BO)
910	LFD	f18, 13 * SIZE(BO)
911	LFD	f19, 12 * SIZE(BO)
912
913	FMUL	f12, f16, f12
914	FMUL	f13, f16, f13
915	FNMSUB	f8,  f17, f12, f8
916	FNMSUB	f9,  f17, f13, f9
917	FNMSUB	f4,  f18, f12, f4
918	FNMSUB	f5,  f18, f13, f5
919	FNMSUB	f0,  f19, f12, f0
920	FNMSUB	f1,  f19, f13, f1
921
922	LFD	f16, 10 * SIZE(BO)
923	LFD	f17,  9 * SIZE(BO)
924	LFD	f18,  8 * SIZE(BO)
925	LFD	f19,  5 * SIZE(BO)
926	LFD	f20,  4 * SIZE(BO)
927	LFD	f21,  0 * SIZE(BO)
928
929	FMUL	f8,  f16, f8
930	FMUL	f9,  f16, f9
931	FNMSUB	f4,  f17, f8,  f4
932	FNMSUB	f5,  f17, f9,  f5
933	FNMSUB	f0,  f18, f8,  f0
934	FNMSUB	f1,  f18, f9,  f1
935
936	FMUL	f4,  f19, f4
937	FMUL	f5,  f19, f5
938	FNMSUB	f0,  f20, f4,  f0
939	FNMSUB	f1,  f20, f5,  f1
940
941	FMUL	f0,  f21, f0
942	FMUL	f1,  f21, f1
943#endif
944
/* LN writes C backwards: step the column pointers back two elements
   before storing. */
945#ifdef LN
946	subi	CO1, CO1, 2 * SIZE
947	subi	CO2, CO2, 2 * SIZE
948	subi	CO3, CO3, 2 * SIZE
949	subi	CO4, CO4, 2 * SIZE
950#endif
951
/* Write the solved values back over the right-hand side, in the same
   layout they were read with. */
952#if defined(LN) || defined(LT)
953	STFD	f0,   0 * SIZE(BO)
954	STFD	f4,   1 * SIZE(BO)
955	STFD	f8,   2 * SIZE(BO)
956	STFD	f12,  3 * SIZE(BO)
957
958	STFD	f1,   4 * SIZE(BO)
959	STFD	f5,   5 * SIZE(BO)
960	STFD	f9,   6 * SIZE(BO)
961	STFD	f13,  7 * SIZE(BO)
962#else
963	STFD	f0,   0 * SIZE(AO)
964	STFD	f1,   1 * SIZE(AO)
965	STFD	f4,   2 * SIZE(AO)
966	STFD	f5,   3 * SIZE(AO)
967
968	STFD	f8,   4 * SIZE(AO)
969	STFD	f9,   5 * SIZE(AO)
970	STFD	f12,  6 * SIZE(AO)
971	STFD	f13,  7 * SIZE(AO)
972#endif
973
/* ... and into C: two rows in each of the four columns. */
974	STFD	f0,   0 * SIZE(CO1)
975	STFD	f1,   1 * SIZE(CO1)
976	STFD	f4,   0 * SIZE(CO2)
977	STFD	f5,   1 * SIZE(CO2)
978
979	STFD	f8,   0 * SIZE(CO3)
980	STFD	f9,   1 * SIZE(CO3)
981	STFD	f12,  0 * SIZE(CO4)
982	STFD	f13,  1 * SIZE(CO4)
983
/* Re-zero all sixteen accumulators. */
984	lfs	f0,  FZERO
985 	fmr	f1,  f0
986	fmr	f2,  f0
987	fmr	f3,  f0
988
989	fmr	f4,  f0
990	fmr	f5,  f0
991	fmr	f6,  f0
992	fmr	f7,  f0
993
994	fmr	f8,  f0
995	fmr	f9,  f0
996	fmr	f10, f0
997	fmr	f11, f0
998
999	fmr	f12, f0
1000	fmr	f13, f0
1001	fmr	f14, f0
1002	fmr	f15, f0
1003
/* All variants except LN move forward two elements in C. */
1004#ifndef LN
1005	addi	CO1, CO1, 2 * SIZE
1006	addi	CO2, CO2, 2 * SIZE
1007	addi	CO3, CO3, 2 * SIZE
1008	addi	CO4, CO4, 2 * SIZE
1009#endif
1010
/* RT: A base moves forward by two rows (2*K elements). */
1011#ifdef RT
1012	slwi	r0, K, 1 + BASE_SHIFT
1013	add	AORIG, AORIG, r0
1014#endif
1015
/* LT/RN: skip the K - KK steps not multiplied:
   AO += 2*(K - KK) elements, BO += 4*(K - KK) elements. */
1016#if defined(LT) || defined(RN)
1017	sub	TEMP, K, KK
1018	slwi	r0,   TEMP, 1 + BASE_SHIFT
1019	slwi	TEMP, TEMP, 2 + BASE_SHIFT
1020	add	AO, AO, r0
1021	add	BO, BO, TEMP
1022#endif
1023
/* Two rows handled: KK moves down (LN) or up (LT) by 2. */
1024#ifdef LN
1025	subi	KK, KK, 2
1026#endif
1027
1028#ifdef LT
1029	addi	KK, KK, 2
1030#endif
1031	.align 4
1032
/*
 * .L09 / .L11 - 4-row x 4-column tiles. I = M >> 2 is the tile count;
 * with none, control goes to .L39 (outside this excerpt). .L11 positions
 * AO/BO for one tile, preloads the values the unrolled loop expects
 * (A1, A2, A4, A5 from AO[0], [1], [4], [8]; B1-B4 from BO[0..3];
 * B5, B6, B7 from BO[4], [8], [12]) and loads CTR with the number of
 * 4-step iterations.
 */
1033.L09:
1034	srawi.	I, M,  2
1035	ble	.L39
1036	.align 4
1037
1038.L11:
/* LT/RN: multiply over the first KK steps, reading B from its start. */
1039#if defined(LT) || defined(RN)
1040	LFD	A1,  0 * SIZE(AO)
1041	LFD	A2,  1 * SIZE(AO)
1042	LFD	A4,  4 * SIZE(AO)
1043	LFD	A5,  8 * SIZE(AO)
1044
1045	LFD	B1,  0 * SIZE(B)
1046	LFD	B2,  1 * SIZE(B)
1047	LFD	B3,  2 * SIZE(B)
1048	LFD	B4,  3 * SIZE(B)
1049	LFD	B5,  4 * SIZE(B)
1050	LFD	B6,  8 * SIZE(B)
1051	LFD	B7, 12 * SIZE(B)
1052
/* CTR = KK / 4; the condition set here is consumed by the ble below. */
1053	srawi.	r0, KK,  2
1054	mtspr	CTR, r0
1055	mr	BO,  B
1056#else
1057
/* LN: move the A base back by four rows (4*K elements). */
1058#ifdef LN
1059	slwi	r0,   K,  2 + BASE_SHIFT
1060	sub	AORIG, AORIG, r0
1061#endif
1062
/* LN/RT: both buffers hold 4 elements per step here, so a single offset
   of 4*KK elements positions AO and BO; TEMP = K - KK steps remain. */
1063	slwi	TEMP, KK, 2 + BASE_SHIFT
1064	add	AO, AORIG, TEMP
1065	add	BO, B,     TEMP
1066
1067	sub	TEMP, K, KK
1068
1069	LFD	A1,  0 * SIZE(AO)
1070	LFD	A2,  1 * SIZE(AO)
1071	LFD	A4,  4 * SIZE(AO)
1072	LFD	A5,  8 * SIZE(AO)
1073
1074	LFD	B1,  0 * SIZE(BO)
1075	LFD	B2,  1 * SIZE(BO)
1076	LFD	B3,  2 * SIZE(BO)
1077	LFD	B4,  3 * SIZE(BO)
1078	LFD	B5,  4 * SIZE(BO)
1079	LFD	B6,  8 * SIZE(BO)
1080	LFD	B7, 12 * SIZE(BO)
1081
/* CTR = (K - KK) / 4. */
1082	srawi.	r0, TEMP,  2
1083	mtspr	CTR, r0
1084#endif
/* No full group of four steps: go straight to the remainder loop. */
1085	ble	.L15
1086	.align 4
1087
/*
 * .L12 - 4x4 tile, four multiply steps per iteration (16 AO and 16 BO
 * elements consumed). Accumulator f(4*c + r) collects the product of the
 * r-th AO value and the c-th BO value of each step, for r, c in 0..3.
 *
 * The loop is software-pipelined: on entry A1, A2, A4, A5 hold AO[0],
 * AO[1], AO[4], AO[8] and B1-B7 hold BO[0..3], BO[4], BO[8], BO[12].
 * Values for the next iteration are loaded from AO[16], [20], [24] and
 * BO[16..19], [20], [24], [28] before the pointers are bumped, so the loop
 * reads past the 16 elements it consumes. The nop instructions are kept
 * exactly as in the original; presumably they pad the issue pattern -
 * confirm before removing.
 */
1088.L12:
/* step 0, AO value 0 (A1) */
1089	FMADD	f0,  A1, B1, f0
1090	LFD	A3,  2 * SIZE(AO)
1091	FMADD	f4,  A1, B2, f4
1092	LFD	A6, 12 * SIZE(AO)
1093	FMADD	f8,  A1, B3, f8
1094	nop
1095	FMADD	f12, A1, B4, f12
1096	nop
1097
/* step 0, AO value 1 (A2); A1 is reloaded with AO[3] for value 3 */
1098	FMADD	f1,  A2, B1, f1
1099	LFD	A1,  3 * SIZE(AO)
1100	FMADD	f5,  A2, B2, f5
1101	nop
1102	FMADD	f9,  A2, B3, f9
1103	nop
1104	FMADD	f13, A2, B4, f13
1105	nop
1106
/* step 0, AO value 2 (A3); B8-B10 = BO[5..7] for step 1 */
1107	FMADD	f2,  A3, B1, f2
1108	nop
1109	FMADD	f6,  A3, B2, f6
1110	LFD	B8,  5 * SIZE(BO)
1111	FMADD	f10, A3, B3, f10
1112	LFD	B9,  6 * SIZE(BO)
1113	FMADD	f14, A3, B4, f14
1114	LFD	B10, 7 * SIZE(BO)
1115
/* step 0, AO value 3 (A1 = AO[3]); B1 = BO[16] for the next iteration */
1116	FMADD	f3,  A1, B1, f3
1117	LFD	A2,  5 * SIZE(AO)
1118	FMADD	f7,  A1, B2, f7
1119	LFD	B1, 16 * SIZE(BO)
1120	FMADD	f11, A1, B3, f11
1121	nop
1122	FMADD	f15, A1, B4, f15
1123	nop
1124
/* step 1: A4 = AO[4] with B5, B8, B9, B10 = BO[4..7];
   A1 = AO[16] for the next iteration */
1125	FMADD	f0,  A4, B5, f0
1126 	LFD	A3,  6 * SIZE(AO)
1127	FMADD	f4,  A4, B8, f4
1128	LFD	A1, 16 * SIZE(AO)
1129	FMADD	f8,  A4, B9, f8
1130	nop
1131	FMADD	f12, A4, B10, f12
1132	nop
1133
1134	FMADD	f1,  A2, B5, f1
1135	LFD	A4,  7 * SIZE(AO)
1136	FMADD	f5,  A2, B8, f5
1137	nop
1138	FMADD	f9,  A2, B9, f9
1139	nop
1140	FMADD	f13, A2, B10, f13
1141	nop
1142
1143	FMADD	f2,  A3, B5, f2
1144	nop
1145	FMADD	f6,  A3, B8, f6
1146	LFD	B2,  9 * SIZE(BO)
1147	FMADD	f10, A3, B9, f10
1148	LFD	B3, 10 * SIZE(BO)
1149	FMADD	f14, A3, B10, f14
1150	LFD	B4, 11 * SIZE(BO)
1151
1152	FMADD	f3,  A4, B5, f3
1153	LFD	A2,  9 * SIZE(AO)
1154	FMADD	f7,  A4, B8, f7
1155	LFD	B5, 20 * SIZE(BO)
1156	FMADD	f11, A4, B9, f11
1157	nop
1158	FMADD	f15, A4, B10, f15
1159	nop
1160
/* step 2: A5 = AO[8] with B6, B2, B3, B4 = BO[8..11];
   A4 = AO[20] for the next iteration */
1161	FMADD	f0,  A5, B6, f0
1162	LFD	A3, 10 * SIZE(AO)
1163	FMADD	f4,  A5, B2, f4
1164	LFD	A4, 20 * SIZE(AO)
1165	FMADD	f8,  A5, B3, f8
1166	nop
1167	FMADD	f12, A5, B4, f12
1168	nop
1169
1170	FMADD	f1,  A2, B6, f1
1171	LFD	A5, 11 * SIZE(AO)
1172	FMADD	f5,  A2, B2, f5
1173	nop
1174	FMADD	f9,  A2, B3, f9
1175	nop
1176	FMADD	f13, A2, B4, f13
1177	nop
1178
1179	FMADD	f2,  A3, B6, f2
1180	nop
1181	FMADD	f6,  A3, B2, f6
1182	LFD	B8, 13 * SIZE(BO)
1183	FMADD	f10, A3, B3, f10
1184	LFD	B9, 14 * SIZE(BO)
1185	FMADD	f14, A3, B4, f14
1186	LFD	B10,15 * SIZE(BO)
1187
1188	FMADD	f3,  A5, B6, f3
1189	LFD	A2, 13 * SIZE(AO)
1190	FMADD	f7,  A5, B2, f7
1191	LFD	B6, 24 * SIZE(BO)
1192	FMADD	f11, A5, B3, f11
1193	nop
1194	FMADD	f15, A5, B4, f15
1195	nop
1196
1197
/* step 3: A6 = AO[12] with B7, B8, B9, B10 = BO[12..15];
   A5 = AO[24] for the next iteration */
1198	FMADD	f0,  A6, B7, f0
1199	LFD	A3, 14 * SIZE(AO)
1200	FMADD	f4,  A6, B8, f4
1201	LFD	A5, 24 * SIZE(AO)
1202	FMADD	f8,  A6, B9, f8
1203	nop
1204	FMADD	f12, A6, B10, f12
1205	nop
1206
1207	FMADD	f1,  A2, B7, f1
1208	LFD	A6, 15 * SIZE(AO)
1209	FMADD	f5,  A2, B8, f5
1210	nop
1211	FMADD	f9,  A2, B9, f9
1212	nop
1213	FMADD	f13, A2, B10, f13
1214	nop
1215
/* AO is advanced by 16 elements here; the A2 load below already uses the
   new base. BO is advanced after its last old-base load (B7 = BO[28]). */
1216	FMADD	f2,  A3, B7, f2
1217	addi	AO, AO, 16 * SIZE
1218	FMADD	f6,  A3, B8, f6
1219	LFD	B2, 17 * SIZE(BO)
1220	FMADD	f10, A3, B9, f10
1221	LFD	B3, 18 * SIZE(BO)
1222	FMADD	f14, A3, B10, f14
1223	LFD	B4, 19 * SIZE(BO)
1224
1225	FMADD	f3,  A6, B7, f3
1226	LFD	A2,  1 * SIZE(AO)
1227	FMADD	f7,  A6, B8, f7
1228	LFD	B7, 28 * SIZE(BO)
1229	FMADD	f11, A6, B9, f11
1230	addi	BO, BO, 16 * SIZE
1231	FMADD	f15, A6, B10, f15
1232	bdnz	.L12
1233	.align 4
1234
/*
 * .L15 / .L16 - 4x4 tile remainder: the last (count mod 4) multiply
 * steps, one per iteration, where count is KK for LT/RN and K - KK
 * (held in TEMP) otherwise. Expects A1, A2 = AO[0], AO[1] and
 * B1-B4 = BO[0..3] on entry.
 */
1235.L15:
1236#if defined(LT) || defined(RN)
1237	andi.	r0, KK,  3
1238#else
1239	andi.	r0, TEMP, 3
1240#endif
1241	mtspr	CTR, r0
/* No leftover steps: go to the solve. */
1242	ble+	.L18
1243	.align 4
1244
1245.L16:
/* AO value 0 (A1); A3, A4 = AO[2], AO[3] are fetched meanwhile */
1246	FMADD	f0,  A1, B1, f0
1247	LFD	A3,  2 * SIZE(AO)
1248	FMADD	f4,  A1, B2, f4
1249	FMADD	f8,  A1, B3, f8
1250	FMADD	f12, A1, B4, f12
1251	LFD	A4,  3 * SIZE(AO)
1252
/* AO value 1 (A2); the LFDU advances AO by 4 elements */
1253	FMADD	f1,  A2, B1, f1
1254	FMADD	f5,  A2, B2, f5
1255	FMADD	f9,  A2, B3, f9
1256	FMADD	f13, A2, B4, f13
1257	LFDU	A1,  4 * SIZE(AO)
1258
/* AO value 2 (A3) */
1259	FMADD	f2,  A3, B1, f2
1260	FMADD	f6,  A3, B2, f6
1261	FMADD	f10, A3, B3, f10
1262	FMADD	f14, A3, B4, f14
1263	LFD	A2,  1 * SIZE(AO)
1264
/* AO value 3 (A4); the LFDU advances BO by 4 elements and B1-B4 are
   reloaded for the next step */
1265	FMADD	f3,  A4, B1, f3
1266	LFDU	B1,  4 * SIZE(BO)
1267	FMADD	f7,  A4, B2, f7
1268	LFD	B2,  1 * SIZE(BO)
1269	FMADD	f11, A4, B3, f11
1270	LFD	B3,  2 * SIZE(BO)
1271	FMADD	f15, A4, B4, f15
1272	LFD	B4,  3 * SIZE(BO)
1273	bdnz	.L16
1274	.align 4
1275
1276.L18:
1277#if defined(LN) || defined(RT)
1278	subi	r0, KK, 4
1279	slwi	r0, r0, 2 + BASE_SHIFT
1280	add	AO, AORIG, r0
1281	add	BO, B,     r0
1282#endif
1283
1284#if defined(LN) || defined(LT)
1285	LFD	f16,  0 * SIZE(BO)
1286	LFD	f17,  1 * SIZE(BO)
1287	LFD	f18,  2 * SIZE(BO)
1288	LFD	f19,  3 * SIZE(BO)
1289
1290	LFD	f20,  4 * SIZE(BO)
1291 	LFD	f21,  5 * SIZE(BO)
1292	LFD	f22,  6 * SIZE(BO)
1293	LFD	f23,  7 * SIZE(BO)
1294
1295	LFD	f24,  8 * SIZE(BO)
1296	LFD	f25,  9 * SIZE(BO)
1297	LFD	f26, 10 * SIZE(BO)
1298	LFD	f27, 11 * SIZE(BO)
1299
1300	LFD	f28, 12 * SIZE(BO)
1301	LFD	f29, 13 * SIZE(BO)
1302	LFD	f30, 14 * SIZE(BO)
1303	LFD	f31, 15 * SIZE(BO)
1304
1305	FSUB	f0,  f16, f0
1306	FSUB	f4,  f17, f4
1307	FSUB	f8,  f18, f8
1308	FSUB	f12, f19, f12
1309
1310	FSUB	f1,  f20, f1
1311	FSUB	f5,  f21, f5
1312	FSUB	f9,  f22, f9
1313	FSUB	f13, f23, f13
1314
1315	FSUB	f2,  f24, f2
1316	FSUB	f6,  f25, f6
1317	FSUB	f10, f26, f10
1318	FSUB	f14, f27, f14
1319
1320	FSUB	f3,  f28, f3
1321	FSUB	f7,  f29, f7
1322	FSUB	f11, f30, f11
1323	FSUB	f15, f31, f15
1324#else
1325	LFD	f16,  0 * SIZE(AO)
1326	LFD	f17,  1 * SIZE(AO)
1327	LFD	f18,  2 * SIZE(AO)
1328	LFD	f19,  3 * SIZE(AO)
1329
1330	LFD	f20,  4 * SIZE(AO)
1331 	LFD	f21,  5 * SIZE(AO)
1332	LFD	f22,  6 * SIZE(AO)
1333	LFD	f23,  7 * SIZE(AO)
1334
1335	LFD	f24,  8 * SIZE(AO)
1336	LFD	f25,  9 * SIZE(AO)
1337	LFD	f26, 10 * SIZE(AO)
1338	LFD	f27, 11 * SIZE(AO)
1339
1340	LFD	f28, 12 * SIZE(AO)
1341	LFD	f29, 13 * SIZE(AO)
1342	LFD	f30, 14 * SIZE(AO)
1343	LFD	f31, 15 * SIZE(AO)
1344
1345	FSUB	f0,  f16, f0
1346	FSUB	f1,  f17, f1
1347	FSUB	f2,  f18, f2
1348	FSUB	f3,  f19, f3
1349
1350	FSUB	f4,  f20, f4
1351	FSUB	f5,  f21, f5
1352	FSUB	f6,  f22, f6
1353	FSUB	f7,  f23, f7
1354
1355	FSUB	f8,  f24, f8
1356	FSUB	f9,  f25, f9
1357	FSUB	f10, f26, f10
1358	FSUB	f11, f27, f11
1359
1360	FSUB	f12, f28, f12
1361	FSUB	f13, f29, f13
1362	FSUB	f14, f30, f14
1363	FSUB	f15, f31, f15
1364#endif
1365
1366#ifdef LN
1367	LFD	f16, 15 * SIZE(AO)
1368	LFD	f17, 14 * SIZE(AO)
1369	LFD	f18, 13 * SIZE(AO)
1370	LFD	f19, 12 * SIZE(AO)
1371
1372	FMUL	f3,  f16, f3
1373	FMUL	f7,  f16, f7
1374	FMUL	f11, f16, f11
1375	FMUL	f15, f16, f15
1376
1377	FNMSUB	f2,  f17, f3,  f2
1378	FNMSUB	f6,  f17, f7,  f6
1379	FNMSUB	f10, f17, f11, f10
1380	FNMSUB	f14, f17, f15, f14
1381
1382	FNMSUB	f1,  f18, f3,  f1
1383	FNMSUB	f5,  f18, f7,  f5
1384	FNMSUB	f9,  f18, f11, f9
1385	FNMSUB	f13, f18, f15, f13
1386
1387	FNMSUB	f0,  f19, f3,  f0
1388	FNMSUB	f4,  f19, f7,  f4
1389	FNMSUB	f8,  f19, f11, f8
1390	FNMSUB	f12, f19, f15, f12
1391
1392	LFD	f16, 10 * SIZE(AO)
1393	LFD	f17,  9 * SIZE(AO)
1394	LFD	f18,  8 * SIZE(AO)
1395	LFD	f19,  5 * SIZE(AO)
1396
1397	FMUL	f2,  f16, f2
1398	FMUL	f6,  f16, f6
1399	FMUL	f10, f16, f10
1400	FMUL	f14, f16, f14
1401
1402	LFD	f20,  4 * SIZE(AO)
1403	LFD	f21,  0 * SIZE(AO)
1404
1405	FNMSUB	f1,  f17, f2,  f1
1406	FNMSUB	f5,  f17, f6,  f5
1407	FNMSUB	f9,  f17, f10, f9
1408	FNMSUB	f13, f17, f14,  f13
1409
1410	FNMSUB	f0,  f18, f2,  f0
1411	FNMSUB	f4,  f18, f6,  f4
1412	FNMSUB	f8,  f18, f10, f8
1413	FNMSUB	f12, f18, f14, f12
1414
1415	FMUL	f1,  f19, f1
1416	FMUL	f5,  f19, f5
1417	FMUL	f9,  f19, f9
1418	FMUL	f13, f19, f13
1419
1420	FNMSUB	f0,  f20, f1,  f0
1421	FNMSUB	f4,  f20, f5,  f4
1422	FNMSUB	f8,  f20, f9,  f8
1423	FNMSUB	f12, f20, f13, f12
1424
1425	FMUL	f0,  f21, f0
1426	FMUL	f4,  f21, f4
1427	FMUL	f8,  f21, f8
1428	FMUL	f12, f21, f12
1429#endif
1430
1431#ifdef LT
1432	LFD	f16,  0 * SIZE(AO)
1433	LFD	f17,  1 * SIZE(AO)
1434	LFD	f18,  2 * SIZE(AO)
1435	LFD	f19,  3 * SIZE(AO)
1436
1437	FMUL	f0,  f16, f0
1438	FMUL	f4,  f16, f4
1439	FMUL	f8,  f16, f8
1440	FMUL	f12, f16, f12
1441
1442	FNMSUB	f1,  f17, f0,  f1
1443	FNMSUB	f5,  f17, f4,  f5
1444	FNMSUB	f9,  f17, f8,  f9
1445	FNMSUB	f13, f17, f12, f13
1446
1447	FNMSUB	f2,  f18, f0,  f2
1448	FNMSUB	f6,  f18, f4,  f6
1449	FNMSUB	f10, f18, f8,  f10
1450	FNMSUB	f14, f18, f12, f14
1451
1452	FNMSUB	f3,  f19, f0,  f3
1453	FNMSUB	f7,  f19, f4,  f7
1454	FNMSUB	f11, f19, f8,  f11
1455	FNMSUB	f15, f19, f12, f15
1456
1457	LFD	f16,  5 * SIZE(AO)
1458	LFD	f17,  6 * SIZE(AO)
1459	LFD	f18,  7 * SIZE(AO)
1460	LFD	f19, 10 * SIZE(AO)
1461
1462	FMUL	f1,  f16, f1
1463	FMUL	f5,  f16, f5
1464	FMUL	f9,  f16, f9
1465	FMUL	f13, f16, f13
1466
1467	LFD	f20, 11 * SIZE(AO)
1468	LFD	f21, 15 * SIZE(AO)
1469
1470	FNMSUB	f2,  f17, f1,  f2
1471	FNMSUB	f6,  f17, f5,  f6
1472	FNMSUB	f10, f17, f9,  f10
1473	FNMSUB	f14, f17, f13, f14
1474
1475	FNMSUB	f3,  f18, f1,  f3
1476	FNMSUB	f7,  f18, f5,  f7
1477	FNMSUB	f11, f18, f9,  f11
1478	FNMSUB	f15, f18, f13, f15
1479
1480	FMUL	f2,  f19, f2
1481	FMUL	f6,  f19, f6
1482	FMUL	f10, f19, f10
1483	FMUL	f14, f19, f14
1484
1485	FNMSUB	f3,  f20, f2,  f3
1486	FNMSUB	f7,  f20, f6,  f7
1487	FNMSUB	f11, f20, f10, f11
1488	FNMSUB	f15, f20, f14, f15
1489
1490	FMUL	f3,  f21, f3
1491	FMUL	f7,  f21, f7
1492	FMUL	f11, f21, f11
1493	FMUL	f15, f21, f15
1494#endif
1495
1496#ifdef RN
1497	LFD	f16,  0 * SIZE(BO)
1498	LFD	f17,  1 * SIZE(BO)
1499	LFD	f18,  2 * SIZE(BO)
1500	LFD	f19,  3 * SIZE(BO)
1501
1502	FMUL	f0,  f16, f0
1503	FMUL	f1,  f16, f1
1504	FMUL	f2,  f16, f2
1505	FMUL	f3,  f16, f3
1506
1507	FNMSUB	f4,  f17, f0,  f4
1508	FNMSUB	f5,  f17, f1,  f5
1509	FNMSUB	f6,  f17, f2,  f6
1510	FNMSUB	f7,  f17, f3,  f7
1511
1512	FNMSUB	f8,  f18, f0,  f8
1513	FNMSUB	f9,  f18, f1,  f9
1514	FNMSUB	f10, f18, f2,  f10
1515	FNMSUB	f11, f18, f3,  f11
1516
1517	FNMSUB	f12, f19, f0,  f12
1518	FNMSUB	f13, f19, f1,  f13
1519	FNMSUB	f14, f19, f2,  f14
1520	FNMSUB	f15, f19, f3,  f15
1521
1522	LFD	f16,  5 * SIZE(BO)
1523	LFD	f17,  6 * SIZE(BO)
1524	LFD	f18,  7 * SIZE(BO)
1525	LFD	f19, 10 * SIZE(BO)
1526
1527	FMUL	f4,  f16, f4
1528	FMUL	f5,  f16, f5
1529	FMUL	f6,  f16, f6
1530	FMUL	f7,  f16, f7
1531
1532	LFD	f20, 11 * SIZE(BO)
1533	LFD	f21, 15 * SIZE(BO)
1534
1535	FNMSUB	f8,  f17, f4,  f8
1536	FNMSUB	f9,  f17, f5,  f9
1537	FNMSUB	f10, f17, f6,  f10
1538	FNMSUB	f11, f17, f7,  f11
1539
1540	FNMSUB	f12, f18, f4,  f12
1541	FNMSUB	f13, f18, f5,  f13
1542	FNMSUB	f14, f18, f6,  f14
1543	FNMSUB	f15, f18, f7,  f15
1544
1545	FMUL	f8,  f19, f8
1546	FMUL	f9,  f19, f9
1547	FMUL	f10, f19, f10
1548	FMUL	f11, f19, f11
1549
1550	FNMSUB	f12, f20, f8,  f12
1551	FNMSUB	f13, f20, f9,  f13
1552	FNMSUB	f14, f20, f10, f14
1553	FNMSUB	f15, f20, f11, f15
1554
1555	FMUL	f12, f21, f12
1556	FMUL	f13, f21, f13
1557	FMUL	f14, f21, f14
1558	FMUL	f15, f21, f15
1559#endif
1560
1561#ifdef RT
1562	LFD	f16, 15 * SIZE(BO)
1563	LFD	f17, 14 * SIZE(BO)
1564	LFD	f18, 13 * SIZE(BO)
1565	LFD	f19, 12 * SIZE(BO)
1566
1567	FMUL	f12, f16, f12
1568	FMUL	f13, f16, f13
1569	FMUL	f14, f16, f14
1570	FMUL	f15, f16, f15
1571
1572	FNMSUB	f8,  f17, f12, f8
1573	FNMSUB	f9,  f17, f13, f9
1574	FNMSUB	f10, f17, f14, f10
1575	FNMSUB	f11, f17, f15, f11
1576
1577	FNMSUB	f4,  f18, f12, f4
1578	FNMSUB	f5,  f18, f13, f5
1579	FNMSUB	f6,  f18, f14, f6
1580	FNMSUB	f7,  f18, f15, f7
1581
1582	FNMSUB	f0,  f19, f12, f0
1583	FNMSUB	f1,  f19, f13, f1
1584	FNMSUB	f2,  f19, f14, f2
1585	FNMSUB	f3,  f19, f15, f3
1586
1587	LFD	f16, 10 * SIZE(BO)
1588	LFD	f17,  9 * SIZE(BO)
1589	LFD	f18,  8 * SIZE(BO)
1590	LFD	f19,  5 * SIZE(BO)
1591
1592	FMUL	f8,  f16, f8
1593	FMUL	f9,  f16, f9
1594	FMUL	f10, f16, f10
1595	FMUL	f11, f16, f11
1596
1597	LFD	f20,  4 * SIZE(BO)
1598	LFD	f21,  0 * SIZE(BO)
1599
1600	FNMSUB	f4,  f17, f8,  f4
1601	FNMSUB	f5,  f17, f9,  f5
1602	FNMSUB	f6,  f17, f10, f6
1603	FNMSUB	f7,  f17, f11, f7
1604
1605	FNMSUB	f0,  f18, f8,  f0
1606	FNMSUB	f1,  f18, f9,  f1
1607	FNMSUB	f2,  f18, f10, f2
1608	FNMSUB	f3,  f18, f11, f3
1609
1610	FMUL	f4,  f19, f4
1611	FMUL	f5,  f19, f5
1612	FMUL	f6,  f19, f6
1613	FMUL	f7,  f19, f7
1614
1615	FNMSUB	f0,  f20, f4,  f0
1616	FNMSUB	f1,  f20, f5,  f1
1617	FNMSUB	f2,  f20, f6,  f2
1618	FNMSUB	f3,  f20, f7,  f3
1619
1620	FMUL	f0,  f21, f0
1621	FMUL	f1,  f21, f1
1622	FMUL	f2,  f21, f2
1623	FMUL	f3,  f21, f3
1624#endif
1625
1626#ifdef LN
1627	subi	CO1, CO1, 4 * SIZE
1628	subi	CO2, CO2, 4 * SIZE
1629	subi	CO3, CO3, 4 * SIZE
1630	subi	CO4, CO4, 4 * SIZE
1631#endif
1632
1633#if defined(LN) || defined(LT)
1634	STFD	f0,   0 * SIZE(BO)
1635	STFD	f4,   1 * SIZE(BO)
1636	STFD	f8,   2 * SIZE(BO)
1637	STFD	f12,  3 * SIZE(BO)
1638
1639	STFD	f1,   4 * SIZE(BO)
1640	STFD	f5,   5 * SIZE(BO)
1641	STFD	f9,   6 * SIZE(BO)
1642	STFD	f13,  7 * SIZE(BO)
1643
1644	STFD	f2,   8 * SIZE(BO)
1645	STFD	f6,   9 * SIZE(BO)
1646	STFD	f10, 10 * SIZE(BO)
1647	STFD	f14, 11 * SIZE(BO)
1648
1649	STFD	f3,  12 * SIZE(BO)
1650	STFD	f7,  13 * SIZE(BO)
1651	STFD	f11, 14 * SIZE(BO)
1652	STFD	f15, 15 * SIZE(BO)
1653#else
1654	STFD	f0,   0 * SIZE(AO)
1655	STFD	f1,   1 * SIZE(AO)
1656	STFD	f2,   2 * SIZE(AO)
1657	STFD	f3,   3 * SIZE(AO)
1658
1659	STFD	f4,   4 * SIZE(AO)
1660	STFD	f5,   5 * SIZE(AO)
1661	STFD	f6,   6 * SIZE(AO)
1662	STFD	f7,   7 * SIZE(AO)
1663
1664	STFD	f8,   8 * SIZE(AO)
1665	STFD	f9,   9 * SIZE(AO)
1666	STFD	f10, 10 * SIZE(AO)
1667	STFD	f11, 11 * SIZE(AO)
1668
1669	STFD	f12, 12 * SIZE(AO)
1670	STFD	f13, 13 * SIZE(AO)
1671	STFD	f14, 14 * SIZE(AO)
1672	STFD	f15, 15 * SIZE(AO)
1673#endif
1674
1675	STFD	f0,   0 * SIZE(CO1)
1676	STFD	f1,   1 * SIZE(CO1)
1677	STFD	f2,   2 * SIZE(CO1)
1678	STFD	f3,   3 * SIZE(CO1)
1679
1680	STFD	f4,   0 * SIZE(CO2)
1681	STFD	f5,   1 * SIZE(CO2)
1682	STFD	f6,   2 * SIZE(CO2)
1683	STFD	f7,   3 * SIZE(CO2)
1684
1685	STFD	f8,   0 * SIZE(CO3)
1686	STFD	f9,   1 * SIZE(CO3)
1687	STFD	f10,  2 * SIZE(CO3)
1688	STFD	f11,  3 * SIZE(CO3)
1689
1690	STFD	f12,  0 * SIZE(CO4)
1691	STFD	f13,  1 * SIZE(CO4)
1692	STFD	f14,  2 * SIZE(CO4)
1693	STFD	f15,  3 * SIZE(CO4)
1694
1695	lfs	f0,  FZERO
1696 	fmr	f1,  f0
1697	fmr	f2,  f0
1698	fmr	f3,  f0
1699
1700	fmr	f4,  f0
1701	fmr	f5,  f0
1702	fmr	f6,  f0
1703	fmr	f7,  f0
1704
1705	fmr	f8,  f0
1706	fmr	f9,  f0
1707	fmr	f10, f0
1708	fmr	f11, f0
1709
1710	fmr	f12, f0
1711	fmr	f13, f0
1712	fmr	f14, f0
1713	fmr	f15, f0
1714
1715#ifndef LN
1716	addi	CO1, CO1, 4 * SIZE
1717	addi	CO2, CO2, 4 * SIZE
1718	addi	CO3, CO3, 4 * SIZE
1719	addi	CO4, CO4, 4 * SIZE
1720#endif
1721
1722#ifdef RT
1723	slwi	r0, K, 2 + BASE_SHIFT
1724	add	AORIG, AORIG, r0
1725#endif
1726
1727#if defined(LT) || defined(RN)
1728	sub	TEMP, K, KK
1729	slwi	TEMP, TEMP, 2 + BASE_SHIFT
1730	add	AO, AO, TEMP
1731	add	BO, BO, TEMP
1732#endif
1733
1734#ifdef LT
1735	addi	KK, KK, 4
1736#endif
1737
1738#ifdef LN
1739	subi	KK, KK, 4
1740#endif
1741
1742	addic.	I, I, -1
1743	bgt+	.L11
1744	.align 4
1745
1746
1747.L39:
/*
 * Tail of one pass of the four-column panel loop (loop head .L10 is
 * outside this excerpt).  Steps B and KK for the next panel, then
 * decrements J and loops while it stays positive.
 */
1748#ifdef LN
/* LN: B moves forward by K << (2 + BASE_SHIFT) bytes (K rows, 4 columns). */
1749	slwi	r0, K, 2 + BASE_SHIFT
1750	add	B, B, r0
1751#endif
1752
1753#if defined(LT) || defined(RN)
/* LT/RN: BO was advanced past the panel by the tile code, so adopt it. */
1754	mr	B,  BO
1755#endif
1756
1757#ifdef RN
1758	addi	KK, KK, 4
1759#endif
1760
1761#ifdef RT
1762	subi	KK, KK, 4
1763#endif
1764
/*
 * addic. sets CR0 from the new J; lfs reloads f0 from FZERO and does not
 * touch CR0, so bgt still tests the result of addic.
 */
1765	addic.	J, J, -1
1766	lfs	f0, FZERO
1767	bgt	.L10
1768	.align 4
1769
1770.L40:
/*
 * Two-column panel: entered only when bit 1 of N is set (andi. yields
 * 0 or 2; ble skips to .L70 on zero).  Sets up the two C column
 * pointers, the starting KK, the A base pointer, and zeroes f1-f7 from
 * f0 (f0 is expected to hold zero on entry).
 */
1771	andi.	J, N,  2
1772	ble	.L70
1773
1774#ifdef RT
/* RT: step B back by K << (1 + BASE_SHIFT) and C back by 2 * LDC first. */
1775	slwi	r0, K, 1 + BASE_SHIFT
1776	sub	B, B, r0
1777
1778	slwi	r0, LDC, 1
1779	sub	C, C, r0
1780#endif
1781
1782	mr	CO1, C
1783	add	CO2, C,  LDC
1784
1785#ifdef LN
1786	add	KK, M, OFFSET
1787#endif
1788
1789#ifdef LT
1790	mr	KK, OFFSET
1791#endif
1792
1793 	fmr	f1,  f0
1794	fmr	f2,  f0
1795	fmr	f3,  f0
1796	fmr	f4,  f0
1797	fmr	f5,  f0
1798	fmr	f6,  f0
1799	fmr	f7,  f0
1800
1801#if defined(LN) || defined(RT)
/* LN/RT recompute AO from AORIG per tile; the other variants walk AO. */
1802	mr	AORIG, A
1803#else
1804	mr	AO, A
1805#endif
1806#ifndef RT
/* Non-RT: C already points past this panel (CO2 + LDC) for the next one. */
1807	add	C,  CO2, LDC
1808#endif
1809
1810.L60:
/*
 * Single-row tile of the two-column panel: runs only when M is odd
 * (otherwise falls to .L50).  Preloads four A values into f16-f19 and
 * eight B values into f20-f27, and sets CTR to (trip count >> 2) for
 * the four-way unrolled loop at .L62.  The trip count is KK for LT/RN
 * and K - KK (kept in TEMP) for LN/RT.
 */
1811	andi.	I,  M,  1
1812	ble	.L50
1813
1814#if defined(LT) || defined(RN)
1815	LFD	f16,  0 * SIZE(AO)
1816	LFD	f17,  1 * SIZE(AO)
1817	LFD	f18,  2 * SIZE(AO)
1818	LFD	f19,  3 * SIZE(AO)
1819
1820	LFD	f20,  0 * SIZE(B)
1821	LFD	f21,  1 * SIZE(B)
1822	LFD	f22,  2 * SIZE(B)
1823	LFD	f23,  3 * SIZE(B)
1824
1825	LFD	f24,  4 * SIZE(B)
1826	LFD	f25,  5 * SIZE(B)
1827	LFD	f26,  6 * SIZE(B)
1828	LFD	f27,  7 * SIZE(B)
1829
/* mtspr and mr leave CR0 alone; the ble after #endif tests srawi. */
1830	srawi.	r0, KK,  2
1831	mtspr	CTR, r0
1832	mr	BO,  B
1833#else
1834
1835#ifdef LN
/* LN walks A backwards: step AORIG back by one row of K elements. */
1836	slwi	r0,   K,  BASE_SHIFT
1837	sub	AORIG, AORIG, r0
1838#endif
1839
/* AO = AORIG + KK elements (1 per k), BO = B + 2 * KK elements (2 per k). */
1840	slwi	r0,   KK, 0 + BASE_SHIFT
1841	slwi	TEMP, KK, 1 + BASE_SHIFT
1842	add	AO, AORIG, r0
1843	add	BO, B,     TEMP
1844
1845	sub	TEMP, K, KK
1846
1847	LFD	f16,  0 * SIZE(AO)
1848	LFD	f17,  1 * SIZE(AO)
1849	LFD	f18,  2 * SIZE(AO)
1850	LFD	f19,  3 * SIZE(AO)
1851
1852	LFD	f20,  0 * SIZE(BO)
1853	LFD	f21,  1 * SIZE(BO)
1854	LFD	f22,  2 * SIZE(BO)
1855	LFD	f23,  3 * SIZE(BO)
1856
1857	LFD	f24,  4 * SIZE(BO)
1858	LFD	f25,  5 * SIZE(BO)
1859	LFD	f26,  6 * SIZE(BO)
1860	LFD	f27,  7 * SIZE(BO)
1861
1862	srawi.	r0, TEMP,  2
1863	mtspr	CTR, r0
1864#endif
/* No full group of four k steps: go straight to the remainder at .L65. */
1865	ble	.L65
1866	.align 5
1867
1868.L62:
/*
 * Main K loop of the 1x2 tile, unrolled by four.  Each pass consumes
 * four A values (f16-f19) and eight B values (f20-f27): f0/f1 take the
 * products of steps k and k+2, f2/f3 those of steps k+1 and k+3.  The
 * two pairs are summed later at .L68.  Loads for the next pass are
 * interleaved with the multiply-adds; LFDU advances BO by 8 and AO by 4
 * elements.
 */
1869	FMADD	f0,  f16, f20, f0
1870	LFDU	f20,  8 * SIZE(BO)
1871	FMADD	f1,  f16, f21, f1
1872	LFDU	f16,  4 * SIZE(AO)
1873	LFD	f21,  1 * SIZE(BO)
1874	FMADD	f2,  f17, f22, f2
1875	LFD	f22,  2 * SIZE(BO)
1876	FMADD	f3,  f17, f23, f3
1877	LFD	f17,  1 * SIZE(AO)
1878	LFD	f23,  3 * SIZE(BO)
1879
1880	FMADD	f0,  f18, f24, f0
1881	LFD	f24,  4 * SIZE(BO)
1882	FMADD	f1,  f18, f25, f1
1883	LFD	f18,  2 * SIZE(AO)
1884	LFD	f25,  5 * SIZE(BO)
1885	FMADD	f2,  f19, f26, f2
1886	LFD	f26,  6 * SIZE(BO)
1887	FMADD	f3,  f19, f27, f3
1888	LFD	f19,  3 * SIZE(AO)
1889	LFD	f27,  7 * SIZE(BO)
1890	bdnz	.L62
1891	.align 4
1892
1893.L65:
/*
 * Remainder setup for the 1x2 tile: CTR = trip count mod 4 (KK for
 * LT/RN, TEMP = K - KK otherwise).  Skips to .L68 when nothing is left.
 */
1894#if defined(LT) || defined(RN)
1895	andi.	r0, KK,  3
1896#else
1897	andi.	r0, TEMP, 3
1898#endif
1899	mtspr	CTR, r0
1900	ble+	.L68
1901	.align 4
1902
1903.L66:
/*
 * Remainder K loop of the 1x2 tile, one k step per pass: f0 and f1 each
 * gain the product of the A value in f16 with one of the two B values
 * (f20, f21).  AO advances by 1 element and BO by 2.
 */
1904	FMADD	f0,  f16, f20, f0
1905	LFDU	f20,  2 * SIZE(BO)
1906	FMADD	f1,  f16, f21, f1
1907	LFDU	f16,  1 * SIZE(AO)
1908	LFD	f21,  1 * SIZE(BO)
1909	bdnz	.L66
1910	.align 4
1911
1912.L68:
/*
 * Finish the 1x2 tile: fold the second pair of partial sums into f0/f1,
 * subtract the result from the right-hand side, apply the solve step
 * selected by LN/LT/RN/RT, store to the packed buffer and to C, then
 * advance pointers and KK for the next tile.
 */
1913	FADD	f0, f2, f0
1914	FADD	f1, f3, f1
1915
1916#if defined(LN) || defined(RT)
/*
 * Re-point AO/BO at the entries being solved: index KK - 1 for LN (tile
 * height 1) or KK - 2 for RT (panel width 2), scaled by 1 element per
 * index for AO and 2 for BO.
 */
1917#ifdef LN
1918	subi	r0, KK, 1
1919#else
1920	subi	r0, KK, 2
1921#endif
1922	slwi	TEMP, r0, 0 + BASE_SHIFT
1923	slwi	r0,   r0, 1 + BASE_SHIFT
1924	add	AO, AORIG, TEMP
1925	add	BO, B,     r0
1926#endif
1927
/*
 * Right-hand side minus accumulated products.  Left variants read it
 * from BO, right variants from AO.  (Assumes FSUB d,a,b computes a - b,
 * as the PowerPC fsub does.)
 */
1928#if defined(LN) || defined(LT)
1929	LFD	f16,  0 * SIZE(BO)
1930	LFD	f17,  1 * SIZE(BO)
1931
1932	FSUB	f0,  f16, f0
1933	FSUB	f1,  f17, f1
1934#else
1935	LFD	f16,  0 * SIZE(AO)
1936	LFD	f20,  1 * SIZE(AO)
1937
1938	FSUB	f0,  f16, f0
1939	FSUB	f1,  f20, f1
1940#endif
1941
/*
 * The solve multiplies by the diagonal entry rather than dividing, so
 * the packed triangle presumably stores reciprocal diagonals - TODO
 * confirm against the packing routine.
 */
1942#ifdef LN
1943	LFD	f21,  0 * SIZE(AO)
1944
1945	FMUL	f0,  f21, f0
1946	FMUL	f1,  f21, f1
1947#endif
1948
1949#ifdef LT
1950	LFD	f16,  0 * SIZE(AO)
1951
1952	FMUL	f0,  f16, f0
1953	FMUL	f1,  f16, f1
1954#endif
1955
/*
 * RN: forward substitution over the 2x2 block at BO (entries 0, 1, 3).
 * Assumes FNMSUB d,a,b,c computes c - a*b, as the PowerPC fnmsub does.
 */
1956#ifdef RN
1957	LFD	f16,  0 * SIZE(BO)
1958	LFD	f17,  1 * SIZE(BO)
1959	LFD	f18,  3 * SIZE(BO)
1960
1961	FMUL	f0,  f16, f0
1962	FNMSUB	f1,  f17, f0,  f1
1963	FMUL	f1,  f18, f1
1964#endif
1965
/* RT: backward substitution over the same block (entries 3, 2, 0). */
1966#ifdef RT
1967	LFD	f19,  3 * SIZE(BO)
1968	LFD	f20,  2 * SIZE(BO)
1969	LFD	f21,  0 * SIZE(BO)
1970
1971	FMUL	f1,  f19, f1
1972	FNMSUB	f0,  f20, f1,  f0
1973	FMUL	f0,  f21, f0
1974#endif
1975
1976#ifdef LN
/* LN walks C backwards: step both column pointers before storing. */
1977	subi	CO1, CO1, 1 * SIZE
1978	subi	CO2, CO2, 1 * SIZE
1979#endif
1980
/* Write the solved values back into the packed buffer ... */
1981#if defined(LN) || defined(LT)
1982	STFD	f0,   0 * SIZE(BO)
1983	STFD	f1,   1 * SIZE(BO)
1984#else
1985	STFD	f0,   0 * SIZE(AO)
1986	STFD	f1,   1 * SIZE(AO)
1987#endif
1988
/* ... and into C, one element in each of the two columns. */
1989	STFD	f0,   0 * SIZE(CO1)
1990	STFD	f1,   0 * SIZE(CO2)
1991
/*
 * Clear every accumulator this path dirtied.  The unrolled loop at .L62
 * accumulates into f2/f3 as well as f0/f1, and the tiles that follow
 * (.L50, .L42) start adding into f0-f7 without clearing them, so f2 and
 * f3 must be zeroed here.  f4-f7 are never written on this path and
 * still hold the zeros set at .L40.
 */
1992	lfs	f0,  FZERO
1993 	fmr	f1,  f0
1994	fmr	f2,  f0
1995	fmr	f3,  f0
1996
1997#ifndef LN
1998	addi	CO1, CO1, 1 * SIZE
1999	addi	CO2, CO2, 1 * SIZE
2000#endif
2001
2002#ifdef RT
2003	slwi	r0, K, 0 + BASE_SHIFT
2004	add	AORIG, AORIG, r0
2005#endif
2006
2007#if defined(LT) || defined(RN)
/* Skip the K - KK entries of this tile that the K loop did not consume. */
2008	sub	TEMP, K, KK
2009	slwi	r0,   TEMP, 0 + BASE_SHIFT
2010	slwi	TEMP, TEMP, 1 + BASE_SHIFT
2011	add	AO, AO, r0
2012	add	BO, BO, TEMP
2013#endif
2014
2015#ifdef LN
2016	subi	KK, KK, 1
2017#endif
2018
2019#ifdef LT
2020	addi	KK, KK, 1
2021#endif
2022	.align 4
2023
2024.L50:
/*
 * Two-row tile of the two-column panel: runs only when bit 1 of M is
 * set (otherwise falls to .L41).  Preloads four A values (f16-f19) and
 * eight B values (f20-f27) and sets CTR to (trip count >> 2) for the
 * unrolled loop at .L52.  Trip count is KK for LT/RN, K - KK otherwise.
 */
2025	andi.	I,  M,  2
2026	ble	.L41
2027
2028#if defined(LT) || defined(RN)
2029	LFD	f16,  0 * SIZE(AO)
2030	LFD	f17,  1 * SIZE(AO)
2031	LFD	f18,  2 * SIZE(AO)
2032	LFD	f19,  3 * SIZE(AO)
2033
2034	LFD	f20,  0 * SIZE(B)
2035	LFD	f21,  1 * SIZE(B)
2036	LFD	f22,  2 * SIZE(B)
2037	LFD	f23,  3 * SIZE(B)
2038
2039	LFD	f24,  4 * SIZE(B)
2040	LFD	f25,  5 * SIZE(B)
2041	LFD	f26,  6 * SIZE(B)
2042	LFD	f27,  7 * SIZE(B)
2043
2044	srawi.	r0, KK,  2
2045	mtspr	CTR, r0
2046	mr	BO,  B
2047#else
2048
2049#ifdef LN
/* LN walks A backwards: step AORIG back by two rows of K elements. */
2050	slwi	r0,   K,  1 + BASE_SHIFT
2051	sub	AORIG, AORIG, r0
2052#endif
2053
/* Both A and B hold 2 elements per k here, hence the equal shifts. */
2054	slwi	r0,   KK, 1 + BASE_SHIFT
2055	slwi	TEMP, KK, 1 + BASE_SHIFT
2056	add	AO, AORIG, r0
2057	add	BO, B,     TEMP
2058
2059	sub	TEMP, K, KK
2060
2061	LFD	f16,  0 * SIZE(AO)
2062	LFD	f17,  1 * SIZE(AO)
2063	LFD	f18,  2 * SIZE(AO)
2064	LFD	f19,  3 * SIZE(AO)
2065
2066	LFD	f20,  0 * SIZE(BO)
2067	LFD	f21,  1 * SIZE(BO)
2068	LFD	f22,  2 * SIZE(BO)
2069	LFD	f23,  3 * SIZE(BO)
2070
2071	LFD	f24,  4 * SIZE(BO)
2072	LFD	f25,  5 * SIZE(BO)
2073	LFD	f26,  6 * SIZE(BO)
2074	LFD	f27,  7 * SIZE(BO)
2075
2076	srawi.	r0, TEMP,  2
2077	mtspr	CTR, r0
2078#endif
/* CR0 still reflects srawi. (mtspr/mr do not change it). */
2079	ble	.L55
2080	.align 5
2081
2082.L52:
/*
 * Main K loop of the 2x2 tile, unrolled by four.  Steps k and k+2
 * accumulate into f0-f3, steps k+1 and k+3 into f4-f7; the two sets are
 * summed at .L58.  Within a set: f0/f1 = rows 0/1 times the first B
 * value, f2/f3 = rows 0/1 times the second.  Each pass advances both AO
 * and BO by 8 elements (via the LFDU loads).
 */
2083	FMADD	f0,  f16, f20, f0
2084	FMADD	f1,  f17, f20, f1
2085	LFDU	f20,  8 * SIZE(BO)
2086	FMADD	f2,  f16, f21, f2
2087	LFD	f16,  4 * SIZE(AO)
2088	FMADD	f3,  f17, f21, f3
2089	LFD	f17,  5 * SIZE(AO)
2090
2091	FMADD	f4,  f18, f22, f4
2092	LFD	f21,  1 * SIZE(BO)
2093	FMADD	f5,  f19, f22, f5
2094	LFD	f22,  2 * SIZE(BO)
2095	FMADD	f6,  f18, f23, f6
2096	LFD	f18,  6 * SIZE(AO)
2097	FMADD	f7,  f19, f23, f7
2098	LFD	f19,  7 * SIZE(AO)
2099
2100	FMADD	f0,  f16, f24, f0
2101	LFD	f23,  3 * SIZE(BO)
2102	FMADD	f1,  f17, f24, f1
2103	LFD	f24,  4 * SIZE(BO)
2104	FMADD	f2,  f16, f25, f2
2105	LFDU	f16,  8 * SIZE(AO)
2106	FMADD	f3,  f17, f25, f3
2107	LFD	f17,  1 * SIZE(AO)
2108
2109	FMADD	f4,  f18, f26, f4
2110	LFD	f25,  5 * SIZE(BO)
2111	FMADD	f5,  f19, f26, f5
2112	LFD	f26,  6 * SIZE(BO)
2113	FMADD	f6,  f18, f27, f6
2114	LFD	f18,  2 * SIZE(AO)
2115	FMADD	f7,  f19, f27, f7
2116	LFD	f19,  3 * SIZE(AO)
2117
2118	LFD	f27,  7 * SIZE(BO)
2119	bdnz	.L52
2120	.align 4
2121
2122.L55:
/*
 * Remainder setup for the 2x2 tile: CTR = trip count mod 4 (KK for
 * LT/RN, TEMP = K - KK otherwise).  Skips to .L58 when nothing is left.
 */
2123#if defined(LT) || defined(RN)
2124	andi.	r0, KK,  3
2125#else
2126	andi.	r0, TEMP, 3
2127#endif
2128	mtspr	CTR, r0
2129	ble+	.L58
2130	.align 4
2131
2132.L56:
/*
 * Remainder K loop of the 2x2 tile, one k step per pass: the two A
 * values (f16, f17) times the two B values (f20, f21) go into f0-f3.
 * AO and BO each advance by 2 elements.
 */
2133	FMADD	f0,  f16, f20, f0
2134	FMADD	f1,  f17, f20, f1
2135	LFDU	f20,  2 * SIZE(BO)
2136	FMADD	f2,  f16, f21, f2
2137	LFDU	f16,  2 * SIZE(AO)
2138	FMADD	f3,  f17, f21, f3
2139	LFD	f17,  1 * SIZE(AO)
2140	LFD	f21,  1 * SIZE(BO)
2141	bdnz	.L56
2142	.align 4
2143
2144.L58:
/*
 * Finish the 2x2 tile.  After the adds below f0/f1 hold rows 0/1 of the
 * first column and f2/f3 rows 0/1 of the second.  The tile is then
 * subtracted from the right-hand side, solved per LN/LT/RN/RT, stored
 * to the packed buffer and to C, and the pointers/KK are advanced.
 */
2145	FADD	f0, f4,  f0
2146	FADD	f1, f5,  f1
2147	FADD	f2, f6,  f2
2148	FADD	f3, f7,  f3
2149
2150#if defined(LN) || defined(RT)
/*
 * Both arms subtract 2 (tile height and panel width are both 2 here);
 * the #ifdef is kept for symmetry with the other tile sizes.
 */
2151#ifdef LN
2152	subi	r0, KK, 2
2153#else
2154	subi	r0, KK, 2
2155#endif
2156	slwi	TEMP, r0, 1 + BASE_SHIFT
2157	slwi	r0,   r0, 1 + BASE_SHIFT
2158	add	AO, AORIG, TEMP
2159	add	BO, B,     r0
2160#endif
2161
/*
 * Right-hand side minus accumulated products.  In the BO layout the two
 * columns of a row are adjacent (f0,f2 then f1,f3); in the AO layout the
 * two rows of a column are adjacent (f0,f1 then f2,f3).
 */
2162#if defined(LN) || defined(LT)
2163	LFD	f16,  0 * SIZE(BO)
2164	LFD	f17,  1 * SIZE(BO)
2165	LFD	f20,  2 * SIZE(BO)
2166 	LFD	f21,  3 * SIZE(BO)
2167
2168	FSUB	f0,  f16, f0
2169	FSUB	f2,  f17, f2
2170	FSUB	f1,  f20, f1
2171	FSUB	f3,  f21, f3
2172#else
2173	LFD	f16,  0 * SIZE(AO)
2174	LFD	f17,  1 * SIZE(AO)
2175	LFD	f20,  2 * SIZE(AO)
2176 	LFD	f21,  3 * SIZE(AO)
2177
2178	FSUB	f0,  f16, f0
2179	FSUB	f1,  f17, f1
2180	FSUB	f2,  f20, f2
2181	FSUB	f3,  f21, f3
2182#endif
2183
/*
 * LN: backward substitution over the 2x2 block at AO (entries 3, 2, 0),
 * row 1 first, then row 0.  Diagonal entries are multiplied, not
 * divided, so they are presumably stored as reciprocals - TODO confirm.
 */
2184#ifdef LN
2185	LFD	f19,  3 * SIZE(AO)
2186	LFD	f20,  2 * SIZE(AO)
2187	LFD	f21,  0 * SIZE(AO)
2188
2189	FMUL	f1,  f19, f1
2190	FMUL	f3,  f19, f3
2191
2192	FNMSUB	f0,  f20, f1,  f0
2193	FNMSUB	f2,  f20, f3,  f2
2194
2195	FMUL	f0,  f21, f0
2196	FMUL	f2,  f21, f2
2197#endif
2198
/* LT: forward substitution over the same block (entries 0, 1, 3). */
2199#ifdef LT
2200	LFD	f16,  0 * SIZE(AO)
2201	LFD	f17,  1 * SIZE(AO)
2202
2203	FMUL	f0,  f16, f0
2204	FMUL	f2,  f16, f2
2205	FNMSUB	f1,  f17, f0,  f1
2206	FNMSUB	f3,  f17, f2,  f3
2207
2208	LFD	f17,  3 * SIZE(AO)
2209
2210	FMUL	f1,  f17, f1
2211	FMUL	f3,  f17, f3
2212#endif
2213
/* RN: forward substitution across columns using BO entries 0, 1, 3. */
2214#ifdef RN
2215	LFD	f16,  0 * SIZE(BO)
2216	LFD	f17,  1 * SIZE(BO)
2217	LFD	f18,  3 * SIZE(BO)
2218
2219	FMUL	f0,  f16, f0
2220	FMUL	f1,  f16, f1
2221
2222	FNMSUB	f2,  f17, f0,  f2
2223	FNMSUB	f3,  f17, f1,  f3
2224	FMUL	f2,  f18, f2
2225	FMUL	f3,  f18, f3
2226#endif
2227
/* RT: backward substitution across columns using BO entries 3, 2, 0. */
2228#ifdef RT
2229	LFD	f19,  3 * SIZE(BO)
2230	LFD	f20,  2 * SIZE(BO)
2231	LFD	f21,  0 * SIZE(BO)
2232
2233	FMUL	f2,  f19, f2
2234	FMUL	f3,  f19, f3
2235	FNMSUB	f0,  f20, f2,  f0
2236	FNMSUB	f1,  f20, f3,  f1
2237	FMUL	f0,  f21, f0
2238	FMUL	f1,  f21, f1
2239#endif
2240
2241#ifdef LN
/* LN walks C backwards: step both column pointers before storing. */
2242	subi	CO1, CO1, 2 * SIZE
2243	subi	CO2, CO2, 2 * SIZE
2244#endif
2245
/* Write the solved tile back in the same layout it was read in. */
2246#if defined(LN) || defined(LT)
2247	STFD	f0,   0 * SIZE(BO)
2248	STFD	f2,   1 * SIZE(BO)
2249	STFD	f1,   2 * SIZE(BO)
2250	STFD	f3,   3 * SIZE(BO)
2251#else
2252	STFD	f0,   0 * SIZE(AO)
2253	STFD	f1,   1 * SIZE(AO)
2254	STFD	f2,   2 * SIZE(AO)
2255	STFD	f3,   3 * SIZE(AO)
2256#endif
2257
2258	STFD	f0,   0 * SIZE(CO1)
2259	STFD	f1,   1 * SIZE(CO1)
2260	STFD	f2,   0 * SIZE(CO2)
2261	STFD	f3,   1 * SIZE(CO2)
2262
/* Clear all eight accumulators for the next tile. */
2263	lfs	f0,  FZERO
2264 	fmr	f1,  f0
2265	fmr	f2,  f0
2266	fmr	f3,  f0
2267
2268	fmr	f4,  f0
2269	fmr	f5,  f0
2270	fmr	f6,  f0
2271	fmr	f7,  f0
2272
2273#ifndef LN
2274	addi	CO1, CO1, 2 * SIZE
2275	addi	CO2, CO2, 2 * SIZE
2276#endif
2277
2278#ifdef RT
2279	slwi	r0, K, 1 + BASE_SHIFT
2280	add	AORIG, AORIG, r0
2281#endif
2282
2283#if defined(LT) || defined(RN)
/* Skip the K - KK entries of this tile that the K loop did not consume. */
2284	sub	TEMP, K, KK
2285	slwi	r0,   TEMP, 1 + BASE_SHIFT
2286	slwi	TEMP, TEMP, 1 + BASE_SHIFT
2287	add	AO, AO, r0
2288	add	BO, BO, TEMP
2289#endif
2290
2291#ifdef LN
2292	subi	KK, KK, 2
2293#endif
2294
2295#ifdef LT
2296	addi	KK, KK, 2
2297#endif
2298	.align 4
2299
2300.L41:
/*
 * I = M >> 2: number of four-row tiles in the two-column panel.  With
 * none to do, jump to the panel epilogue at .L69.
 */
2301	srawi.	I, M,  2
2302	ble	.L69
2303	.align 4
2304
2305.L42:
/*
 * Head of the four-row tile loop for the two-column panel.  Preloads
 * four A values (f16-f19) and four B values (f20-f23) and sets CTR to
 * (trip count >> 2) for the unrolled loop at .L43.  Trip count is KK
 * for LT/RN and K - KK (kept in TEMP) for LN/RT.
 */
2306#if defined(LT) || defined(RN)
2307	LFD	f16,  0 * SIZE(AO)
2308	LFD	f17,  1 * SIZE(AO)
2309	LFD	f18,  2 * SIZE(AO)
2310	LFD	f19,  3 * SIZE(AO)
2311
2312	LFD	f20,  0 * SIZE(B)
2313	LFD	f21,  1 * SIZE(B)
2314	LFD	f22,  2 * SIZE(B)
2315	LFD	f23,  3 * SIZE(B)
2316
2317	srawi.	r0, KK,  2
2318	mtspr	CTR, r0
2319	mr	BO,  B
2320#else
2321
2322#ifdef LN
/* LN walks A backwards: step AORIG back by four rows of K elements. */
2323	slwi	r0,   K,  2 + BASE_SHIFT
2324	sub	AORIG, AORIG, r0
2325#endif
2326
/* AO = AORIG + 4 * KK elements, BO = B + 2 * KK elements. */
2327	slwi	r0,   KK, 2 + BASE_SHIFT
2328	slwi	TEMP, KK, 1 + BASE_SHIFT
2329	add	AO, AORIG, r0
2330	add	BO, B,     TEMP
2331
2332	sub	TEMP, K, KK
2333
2334	LFD	f16,  0 * SIZE(AO)
2335	LFD	f17,  1 * SIZE(AO)
2336	LFD	f18,  2 * SIZE(AO)
2337	LFD	f19,  3 * SIZE(AO)
2338
2339	LFD	f20,  0 * SIZE(BO)
2340	LFD	f21,  1 * SIZE(BO)
2341	LFD	f22,  2 * SIZE(BO)
2342	LFD	f23,  3 * SIZE(BO)
2343
2344	srawi.	r0, TEMP,  2
2345	mtspr	CTR, r0
2346#endif
/* CR0 still reflects srawi. (mtspr/mr do not change it). */
2347	ble	.L45
2348	.align 5
2349
2350.L43:
/*
 * Main K loop of the 4x2 tile, unrolled by four.  For every k step,
 * f0-f3 gain the four A values times the first B value and f4-f7 gain
 * them times the second.  B values alternate between the register pairs
 * f20/f21 and f22/f23 so loads for later steps overlap the multiply-adds.
 * Each pass advances AO by 16 elements and BO by 8 (via LFDU).
 */
2351	FMADD	f0,  f16, f20, f0
2352	FMADD	f1,  f17, f20, f1
2353	FMADD	f2,  f18, f20, f2
2354	FMADD	f3,  f19, f20, f3
2355	LFD	f20,  4 * SIZE(BO)
2356
2357	FMADD	f4,  f16, f21, f4
2358	LFD	f16,  4 * SIZE(AO)
2359	FMADD	f5,  f17, f21, f5
2360	LFD	f17,  5 * SIZE(AO)
2361	FMADD	f6,  f18, f21, f6
2362	LFD	f18,  6 * SIZE(AO)
2363	FMADD	f7,  f19, f21, f7
2364	LFD	f19,  7 * SIZE(AO)
2365
2366	FMADD	f0,  f16, f22, f0
2367	LFD	f21,  5 * SIZE(BO)
2368	FMADD	f1,  f17, f22, f1
2369	FMADD	f2,  f18, f22, f2
2370	FMADD	f3,  f19, f22, f3
2371	LFD	f22,  6 * SIZE(BO)
2372
2373	FMADD	f4,  f16, f23, f4
2374	LFD	f16,  8 * SIZE(AO)
2375	FMADD	f5,  f17, f23, f5
2376	LFD	f17,  9 * SIZE(AO)
2377	FMADD	f6,  f18, f23, f6
2378	LFD	f18, 10 * SIZE(AO)
2379	FMADD	f7,  f19, f23, f7
2380	LFD	f19, 11 * SIZE(AO)
2381
2382	FMADD	f0,  f16, f20, f0
2383	LFD	f23,  7 * SIZE(BO)
2384	FMADD	f1,  f17, f20, f1
2385	FMADD	f2,  f18, f20, f2
2386	FMADD	f3,  f19, f20, f3
2387	LFDU	f20,  8 * SIZE(BO)
2388
2389	FMADD	f4,  f16, f21, f4
2390	LFD	f16, 12 * SIZE(AO)
2391	FMADD	f5,  f17, f21, f5
2392	LFD	f17, 13 * SIZE(AO)
2393	FMADD	f6,  f18, f21, f6
2394	LFD	f18, 14 * SIZE(AO)
2395	FMADD	f7,  f19, f21, f7
2396	LFD	f19, 15 * SIZE(AO)
2397
2398	FMADD	f0,  f16, f22, f0
2399	LFD	f21,  1 * SIZE(BO)
2400	FMADD	f1,  f17, f22, f1
2401	FMADD	f2,  f18, f22, f2
2402	FMADD	f3,  f19, f22, f3
2403	LFD	f22,  2 * SIZE(BO)
2404
2405	FMADD	f4,  f16, f23, f4
2406	LFDU	f16, 16 * SIZE(AO)
2407	FMADD	f5,  f17, f23, f5
2408	LFD	f17,  1 * SIZE(AO)
2409	FMADD	f6,  f18, f23, f6
2410	LFD	f18,  2 * SIZE(AO)
2411	FMADD	f7,  f19, f23, f7
2412	LFD	f19,  3 * SIZE(AO)
2413
2414	LFD	f23,  3 * SIZE(BO)
2415	bdnz	.L43
2416	.align 4
2417
2418.L45:
/*
 * Remainder setup for the 4x2 tile: CTR = trip count mod 4 (KK for
 * LT/RN, TEMP = K - KK otherwise).  Skips to .L48 when nothing is left.
 */
2419#if defined(LT) || defined(RN)
2420	andi.	r0, KK,  3
2421#else
2422	andi.	r0, TEMP, 3
2423#endif
2424	mtspr	CTR, r0
2425	ble+	.L48
2426	.align 4
2427
2428.L46:
/*
 * Remainder K loop of the 4x2 tile, one k step per pass: f0-f3 gain the
 * four A values times f20, f4-f7 gain them times f21.  AO advances by 4
 * elements and BO by 2.
 */
2429	FMADD	f0,  f16, f20, f0
2430	FMADD	f1,  f17, f20, f1
2431	FMADD	f2,  f18, f20, f2
2432	FMADD	f3,  f19, f20, f3
2433	LFDU	f20,  2 * SIZE(BO)
2434
2435	FMADD	f4,  f16, f21, f4
2436	LFDU	f16,  4 * SIZE(AO)
2437	FMADD	f5,  f17, f21, f5
2438	LFD	f17,  1 * SIZE(AO)
2439	FMADD	f6,  f18, f21, f6
2440	LFD	f18,  2 * SIZE(AO)
2441	FMADD	f7,  f19, f21, f7
2442	LFD	f19,  3 * SIZE(AO)
2443	LFD	f21,  1 * SIZE(BO)
2444	bdnz	.L46
2445	.align 4
2446
2447.L48:
/*
 * Finish the 4x2 tile.  f0-f3 hold rows 0-3 of the first column and
 * f4-f7 rows 0-3 of the second.  The tile is subtracted from the
 * right-hand side, solved per LN/LT/RN/RT, stored to the packed buffer
 * and to C, and the pointers/KK are advanced before looping to .L42.
 */
2448#if defined(LN) || defined(RT)
/* Index of the block being solved: KK - 4 for LN, KK - 2 for RT. */
2449#ifdef LN
2450	subi	r0, KK, 4
2451#else
2452	subi	r0, KK, 2
2453#endif
2454	slwi	TEMP, r0, 2 + BASE_SHIFT
2455	slwi	r0,   r0, 1 + BASE_SHIFT
2456	add	AO, AORIG, TEMP
2457	add	BO, B,     r0
2458#endif
2459
/*
 * Right-hand side minus accumulated products.  In the BO layout the two
 * columns of each row are adjacent (f0,f4, f1,f5, ...); in the AO layout
 * the four rows of each column are adjacent (f0-f3, then f4-f7).
 */
2460#if defined(LN) || defined(LT)
2461	LFD	f16,  0 * SIZE(BO)
2462	LFD	f17,  1 * SIZE(BO)
2463	LFD	f20,  2 * SIZE(BO)
2464 	LFD	f21,  3 * SIZE(BO)
2465
2466	LFD	f24,  4 * SIZE(BO)
2467	LFD	f25,  5 * SIZE(BO)
2468	LFD	f28,  6 * SIZE(BO)
2469	LFD	f29,  7 * SIZE(BO)
2470
2471	FSUB	f0,  f16, f0
2472	FSUB	f4,  f17, f4
2473	FSUB	f1,  f20, f1
2474	FSUB	f5,  f21, f5
2475
2476	FSUB	f2,  f24, f2
2477	FSUB	f6,  f25, f6
2478	FSUB	f3,  f28, f3
2479	FSUB	f7,  f29, f7
2480#else
2481	LFD	f16,  0 * SIZE(AO)
2482	LFD	f17,  1 * SIZE(AO)
2483	LFD	f18,  2 * SIZE(AO)
2484	LFD	f19,  3 * SIZE(AO)
2485
2486	LFD	f20,  4 * SIZE(AO)
2487 	LFD	f21,  5 * SIZE(AO)
2488	LFD	f22,  6 * SIZE(AO)
2489	LFD	f23,  7 * SIZE(AO)
2490
2491	FSUB	f0,  f16, f0
2492	FSUB	f1,  f17, f1
2493	FSUB	f2,  f18, f2
2494	FSUB	f3,  f19, f3
2495
2496	FSUB	f4,  f20, f4
2497	FSUB	f5,  f21, f5
2498	FSUB	f6,  f22, f6
2499	FSUB	f7,  f23, f7
2500#endif
2501
/*
 * LN: backward substitution over the 4x4 block at AO, last row first.
 * Entries used: 15,14,13,12 / 10,9,8 / 5,4 / 0.  Diagonal entries
 * (15, 10, 5, 0) are multiplied, not divided, so they are presumably
 * stored as reciprocals - TODO confirm against the packing routine.
 */
2502#ifdef LN
2503	LFD	f16, 15 * SIZE(AO)
2504	LFD	f17, 14 * SIZE(AO)
2505	LFD	f18, 13 * SIZE(AO)
2506	LFD	f19, 12 * SIZE(AO)
2507
2508	FMUL	f3,  f16, f3
2509	FMUL	f7,  f16, f7
2510	FNMSUB	f2,  f17, f3,  f2
2511	FNMSUB	f6,  f17, f7,  f6
2512	FNMSUB	f1,  f18, f3,  f1
2513	FNMSUB	f5,  f18, f7,  f5
2514	FNMSUB	f0,  f19, f3,  f0
2515	FNMSUB	f4,  f19, f7,  f4
2516
2517	LFD	f16, 10 * SIZE(AO)
2518	LFD	f17,  9 * SIZE(AO)
2519	LFD	f18,  8 * SIZE(AO)
2520	LFD	f19,  5 * SIZE(AO)
2521
2522	LFD	f20,  4 * SIZE(AO)
2523	LFD	f21,  0 * SIZE(AO)
2524
2525	FMUL	f2,  f16, f2
2526	FMUL	f6,  f16, f6
2527	FNMSUB	f1,  f17, f2,  f1
2528	FNMSUB	f5,  f17, f6,  f5
2529	FNMSUB	f0,  f18, f2,  f0
2530	FNMSUB	f4,  f18, f6,  f4
2531
2532	FMUL	f1,  f19, f1
2533	FMUL	f5,  f19, f5
2534	FNMSUB	f0,  f20, f1,  f0
2535	FNMSUB	f4,  f20, f5,  f4
2536	FMUL	f0,  f21, f0
2537	FMUL	f4,  f21, f4
2538#endif
2539
/*
 * LT: forward substitution over the same block, first row first.
 * Entries used: 0,1,2,3 / 5,6,7 / 10,11 / 15.
 */
2540#ifdef LT
2541	LFD	f16,  0 * SIZE(AO)
2542	LFD	f17,  1 * SIZE(AO)
2543	LFD	f18,  2 * SIZE(AO)
2544	LFD	f19,  3 * SIZE(AO)
2545
2546	FMUL	f0,  f16, f0
2547	FMUL	f4,  f16, f4
2548	FNMSUB	f1,  f17, f0,  f1
2549	FNMSUB	f5,  f17, f4,  f5
2550
2551	FNMSUB	f2,  f18, f0,  f2
2552	FNMSUB	f6,  f18, f4,  f6
2553	FNMSUB	f3,  f19, f0,  f3
2554	FNMSUB	f7,  f19, f4,  f7
2555
2556	LFD	f17,  5 * SIZE(AO)
2557	LFD	f18,  6 * SIZE(AO)
2558	LFD	f19,  7 * SIZE(AO)
2559
2560	FMUL	f1,  f17, f1
2561	FMUL	f5,  f17, f5
2562
2563	FNMSUB	f2,  f18, f1,  f2
2564	FNMSUB	f6,  f18, f5,  f6
2565
2566	FNMSUB	f3,  f19, f1,  f3
2567	FNMSUB	f7,  f19, f5,  f7
2568
2569	LFD	f18, 10 * SIZE(AO)
2570	LFD	f19, 11 * SIZE(AO)
2571
2572	FMUL	f2,  f18, f2
2573	FMUL	f6,  f18, f6
2574
2575	FNMSUB	f3,  f19, f2,  f3
2576	FNMSUB	f7,  f19, f6,  f7
2577
2578	LFD	f19, 15 * SIZE(AO)
2579
2580	FMUL	f3,  f19, f3
2581	FMUL	f7,  f19, f7
2582#endif
2583
/* RN: forward substitution across the two columns (BO entries 0, 1, 3). */
2584#ifdef RN
2585	LFD	f16,  0 * SIZE(BO)
2586	LFD	f17,  1 * SIZE(BO)
2587	LFD	f18,  3 * SIZE(BO)
2588
2589	FMUL	f0,  f16, f0
2590	FMUL	f1,  f16, f1
2591	FMUL	f2,  f16, f2
2592	FMUL	f3,  f16, f3
2593
2594	FNMSUB	f4,  f17, f0,  f4
2595	FNMSUB	f5,  f17, f1,  f5
2596	FNMSUB	f6,  f17, f2,  f6
2597	FNMSUB	f7,  f17, f3,  f7
2598
2599	FMUL	f4,  f18, f4
2600	FMUL	f5,  f18, f5
2601	FMUL	f6,  f18, f6
2602	FMUL	f7,  f18, f7
2603#endif
2604
/* RT: backward substitution across the two columns (BO entries 3, 2, 0). */
2605#ifdef RT
2606	LFD	f19,  3 * SIZE(BO)
2607	LFD	f20,  2 * SIZE(BO)
2608	LFD	f21,  0 * SIZE(BO)
2609
2610	FMUL	f4,  f19, f4
2611	FMUL	f5,  f19, f5
2612	FMUL	f6,  f19, f6
2613	FMUL	f7,  f19, f7
2614
2615	FNMSUB	f0,  f20, f4,  f0
2616	FNMSUB	f1,  f20, f5,  f1
2617	FNMSUB	f2,  f20, f6,  f2
2618	FNMSUB	f3,  f20, f7,  f3
2619
2620	FMUL	f0,  f21, f0
2621	FMUL	f1,  f21, f1
2622	FMUL	f2,  f21, f2
2623	FMUL	f3,  f21, f3
2624#endif
2625
2626#ifdef LN
/* LN walks C backwards: step both column pointers before storing. */
2627	subi	CO1, CO1, 4 * SIZE
2628	subi	CO2, CO2, 4 * SIZE
2629#endif
2630
/* Write the solved tile back in the same layout it was read in. */
2631#if defined(LN) || defined(LT)
2632	STFD	f0,   0 * SIZE(BO)
2633	STFD	f4,   1 * SIZE(BO)
2634	STFD	f1,   2 * SIZE(BO)
2635	STFD	f5,   3 * SIZE(BO)
2636
2637	STFD	f2,   4 * SIZE(BO)
2638	STFD	f6,   5 * SIZE(BO)
2639	STFD	f3,   6 * SIZE(BO)
2640	STFD	f7,   7 * SIZE(BO)
2641#else
2642	STFD	f0,   0 * SIZE(AO)
2643	STFD	f1,   1 * SIZE(AO)
2644	STFD	f2,   2 * SIZE(AO)
2645	STFD	f3,   3 * SIZE(AO)
2646
2647	STFD	f4,   4 * SIZE(AO)
2648	STFD	f5,   5 * SIZE(AO)
2649	STFD	f6,   6 * SIZE(AO)
2650	STFD	f7,   7 * SIZE(AO)
2651#endif
2652
2653	STFD	f0,   0 * SIZE(CO1)
2654	STFD	f1,   1 * SIZE(CO1)
2655	STFD	f2,   2 * SIZE(CO1)
2656	STFD	f3,   3 * SIZE(CO1)
2657
2658	STFD	f4,   0 * SIZE(CO2)
2659	STFD	f5,   1 * SIZE(CO2)
2660	STFD	f6,   2 * SIZE(CO2)
2661	STFD	f7,   3 * SIZE(CO2)
2662
/* Clear all eight accumulators for the next tile. */
2663	lfs	f0,  FZERO
2664 	fmr	f1,  f0
2665	fmr	f2,  f0
2666	fmr	f3,  f0
2667
2668	fmr	f4,  f0
2669	fmr	f5,  f0
2670	fmr	f6,  f0
2671	fmr	f7,  f0
2672
2673#ifndef LN
2674	addi	CO1, CO1, 4 * SIZE
2675	addi	CO2, CO2, 4 * SIZE
2676#endif
2677
2678#ifdef RT
2679	slwi	r0, K, 2 + BASE_SHIFT
2680	add	AORIG, AORIG, r0
2681#endif
2682
2683#if defined(LT) || defined(RN)
/* Skip the K - KK entries of this tile that the K loop did not consume. */
2684	sub	TEMP, K, KK
2685	slwi	r0,   TEMP, 2 + BASE_SHIFT
2686	slwi	TEMP, TEMP, 1 + BASE_SHIFT
2687	add	AO, AO, r0
2688	add	BO, BO, TEMP
2689#endif
2690
2691#ifdef LN
2692	subi	KK, KK, 4
2693#endif
2694
2695#ifdef LT
2696	addi	KK, KK, 4
2697#endif
2698
/* Next four-row tile while I stays positive. */
2699	addic.	I, I, -1
2700	bgt+	.L42
2701	.align 4
2702
2703.L69:
/*
 * Epilogue of the two-column panel: steps B and KK past the panel and
 * reloads f0 from FZERO for the code that follows.
 */
2704#ifdef LN
/* LN: B moves forward by K << (1 + BASE_SHIFT) bytes (K rows, 2 columns). */
2705	slwi	r0, K, 1 + BASE_SHIFT
2706	add	B, B, r0
2707#endif
2708
2709#if defined(LT) || defined(RN)
/* LT/RN: BO was advanced past the panel by the tile code, so adopt it. */
2710	mr	B,  BO
2711#endif
2712
2713#ifdef RN
2714	addi	KK, KK, 2
2715#endif
2716
2717#ifdef RT
2718	subi	KK, KK, 2
2719#endif
2720	lfs	f0, FZERO
2721	.align 4
2722
2723.L70:
/*
 * Single-column panel: entered only when N is odd (otherwise jumps to
 * .L999, which is outside this excerpt).  Sets up the C column pointer,
 * the starting KK, the A base pointer, and zeroes f1-f3 from f0 (f0 is
 * expected to hold zero on entry).
 */
2724	andi.	J, N,  1
2725	ble	.L999
2726
2727#ifdef RT
/* RT: step B back by K << BASE_SHIFT and C back by LDC first. */
2728	slwi	r0, K, 0 + BASE_SHIFT
2729	sub	B, B, r0
2730
2731	sub	C, C, LDC
2732#endif
2733
2734	mr	CO1, C
2735
2736#ifdef LN
2737	add	KK, M, OFFSET
2738#endif
2739
2740#ifdef LT
2741	mr	KK, OFFSET
2742#endif
2743
2744 	fmr	f1,  f0
2745	fmr	f2,  f0
2746	fmr	f3,  f0
2747
2748#if defined(LN) || defined(RT)
/* LN/RT recompute AO from AORIG per tile; the other variants walk AO. */
2749	mr	AORIG, A
2750#else
2751	mr	AO, A
2752#endif
2753#ifndef RT
2754	add	C,  CO1, LDC
2755#endif
2756	.align 4
2757
2758.L90:
/*
 * Single-element tile of the single-column panel: runs only when M is
 * odd (otherwise falls to .L80).  Preloads four A values (f16-f19) and
 * four B values (f20-f23) and sets CTR to (trip count >> 3) for the
 * eight-way unrolled loop at .L92.  Trip count is KK for LT/RN and
 * K - KK (kept in TEMP) for LN/RT.
 */
2759	andi.	I,  M,  1
2760	ble	.L80
2761
2762#if defined(LT) || defined(RN)
2763	LFD	f16,  0 * SIZE(AO)
2764	LFD	f17,  1 * SIZE(AO)
2765	LFD	f18,  2 * SIZE(AO)
2766	LFD	f19,  3 * SIZE(AO)
2767
2768	LFD	f20,  0 * SIZE(B)
2769	LFD	f21,  1 * SIZE(B)
2770	LFD	f22,  2 * SIZE(B)
2771	LFD	f23,  3 * SIZE(B)
2772
2773	srawi.	r0, KK,  3
2774	mtspr	CTR, r0
2775	mr	BO,  B
2776#else
2777
2778#ifdef LN
/* LN walks A backwards: step AORIG back by one row of K elements. */
2779	slwi	r0,   K,  BASE_SHIFT
2780	sub	AORIG, AORIG, r0
2781#endif
2782
/* A and B both hold 1 element per k here, hence the equal shifts. */
2783	slwi	r0,   KK, 0 + BASE_SHIFT
2784	slwi	TEMP, KK, 0 + BASE_SHIFT
2785	add	AO, AORIG, r0
2786	add	BO, B,     TEMP
2787
2788	sub	TEMP, K, KK
2789
2790	LFD	f16,  0 * SIZE(AO)
2791	LFD	f17,  1 * SIZE(AO)
2792	LFD	f18,  2 * SIZE(AO)
2793	LFD	f19,  3 * SIZE(AO)
2794
2795	LFD	f20,  0 * SIZE(BO)
2796	LFD	f21,  1 * SIZE(BO)
2797	LFD	f22,  2 * SIZE(BO)
2798	LFD	f23,  3 * SIZE(BO)
2799
2800	srawi.	r0, TEMP,  3
2801	mtspr	CTR, r0
2802#endif
/* CR0 still reflects srawi. (mtspr/mr do not change it). */
2803	ble	.L95
2804	.align 5
2805
2806.L92:
/*
 * Main K loop of the 1x1 tile, unrolled by eight.  Four independent
 * accumulators (f0-f3) each take two of the eight products per pass;
 * they are summed at .L98.  AO and BO each advance by 8 elements (via
 * the LFDU loads).
 */
2807	FMADD	f0,  f16, f20, f0
2808	LFD	f16,  4 * SIZE(AO)
2809	LFD	f20,  4 * SIZE(BO)
2810	FMADD	f1,  f17, f21, f1
2811	LFD	f17,  5 * SIZE(AO)
2812	LFD	f21,  5 * SIZE(BO)
2813	FMADD	f2,  f18, f22, f2
2814	LFD	f18,  6 * SIZE(AO)
2815	LFD	f22,  6 * SIZE(BO)
2816	FMADD	f3,  f19, f23, f3
2817	LFD	f19,  7 * SIZE(AO)
2818	LFD	f23,  7 * SIZE(BO)
2819
2820	FMADD	f0,  f16, f20, f0
2821	LFDU	f16,  8 * SIZE(AO)
2822	LFDU	f20,  8 * SIZE(BO)
2823	FMADD	f1,  f17, f21, f1
2824	LFD	f17,  1 * SIZE(AO)
2825	LFD	f21,  1 * SIZE(BO)
2826	FMADD	f2,  f18, f22, f2
2827	LFD	f18,  2 * SIZE(AO)
2828	LFD	f22,  2 * SIZE(BO)
2829	FMADD	f3,  f19, f23, f3
2830	LFD	f19,  3 * SIZE(AO)
2831	LFD	f23,  3 * SIZE(BO)
2832	bdnz	.L92
2833	.align 4
2834
2835.L95:
/*
 * Remainder setup for the 1x1 tile: CTR = trip count mod 8 (KK for
 * LT/RN, TEMP = K - KK otherwise), matching the eight-way unroll of
 * .L92.  Skips to .L98 when nothing is left.
 */
2836#if defined(LT) || defined(RN)
2837	andi.	r0, KK,  7
2838#else
2839	andi.	r0, TEMP, 7
2840#endif
2841	mtspr	CTR, r0
2842	ble+	.L98
2843	.align 4
2844
2845.L96:
/*
 * Remainder K loop of the 1x1 tile: one multiply-add into f0 per pass,
 * with AO and BO each advancing by 1 element.
 */
2846	FMADD	f0,  f16, f20, f0
2847	LFDU	f16,  1 * SIZE(AO)
2848	LFDU	f20,  1 * SIZE(BO)
2849	bdnz	.L96
2850	.align 4
2851
2852.L98:
/*
 * Finish the 1x1 tile: reduce the four partial sums into f0, subtract
 * from the right-hand side, scale by the single triangular entry, store
 * to the packed buffer and to C, then advance pointers and KK.
 */
2853	FADD	f0, f1, f0
2854	FADD	f2, f3, f2
2855	FADD	f0, f2, f0
2856
2857#if defined(LN) || defined(RT)
/*
 * Both arms subtract 1 (tile height and panel width are both 1 here);
 * the #ifdef is kept for symmetry with the other tile sizes.
 */
2858#ifdef LN
2859	subi	r0, KK, 1
2860#else
2861	subi	r0, KK, 1
2862#endif
2863	slwi	TEMP, r0, 0 + BASE_SHIFT
2864	slwi	r0,   r0, 0 + BASE_SHIFT
2865	add	AO, AORIG, TEMP
2866	add	BO, B,     r0
2867#endif
2868
/* Right-hand side minus the accumulated dot product. */
2869#if defined(LN) || defined(LT)
2870	LFD	f16,  0 * SIZE(BO)
2871	FSUB	f0,  f16, f0
2872#else
2873	LFD	f16,  0 * SIZE(AO)
2874	FSUB	f0,  f16, f0
2875#endif
2876
/*
 * Each variant multiplies by one entry of the triangular operand (from
 * AO for left variants, BO for right variants); presumably the stored
 * reciprocal of the diagonal - TODO confirm against the packing routine.
 */
2877#ifdef LN
2878	LFD	f21,  0 * SIZE(AO)
2879	FMUL	f0,  f21, f0
2880#endif
2881
2882#ifdef LT
2883	LFD	f16,  0 * SIZE(AO)
2884	FMUL	f0,  f16, f0
2885#endif
2886
2887#ifdef RN
2888	LFD	f16,  0 * SIZE(BO)
2889	FMUL	f0,  f16, f0
2890#endif
2891
2892#ifdef RT
2893	LFD	f21,  0 * SIZE(BO)
2894	FMUL	f0,  f21, f0
2895#endif
2896
2897#ifdef LN
/* LN walks C backwards: step the column pointer before storing. */
2898	subi	CO1, CO1, 1 * SIZE
2899#endif
2900
2901#if defined(LN) || defined(LT)
2902	STFD	f0,   0 * SIZE(BO)
2903#else
2904	STFD	f0,   0 * SIZE(AO)
2905#endif
2906
2907	STFD	f0,   0 * SIZE(CO1)
2908
/* Clear the four accumulators used by this path. */
2909	lfs	f0,  FZERO
2910 	fmr	f1,  f0
2911	fmr	f2,  f0
2912	fmr	f3,  f0
2913
2914#ifndef LN
2915	addi	CO1, CO1, 1 * SIZE
2916#endif
2917
2918#ifdef RT
2919	slwi	r0, K, 0 + BASE_SHIFT
2920	add	AORIG, AORIG, r0
2921#endif
2922
2923#if defined(LT) || defined(RN)
/* Skip the K - KK entries of this tile that the K loop did not consume. */
2924	sub	TEMP, K, KK
2925	slwi	r0,   TEMP, 0 + BASE_SHIFT
2926	slwi	TEMP, TEMP, 0 + BASE_SHIFT
2927	add	AO, AO, r0
2928	add	BO, BO, TEMP
2929#endif
2930
2931#ifdef LN
2932	subi	KK, KK, 1
2933#endif
2934
2935#ifdef LT
2936	addi	KK, KK, 1
2937#endif
2938	.align 4
2939
2940.L80:
/*
 * Two-row tile of the single-column panel: runs only when bit 1 of M is
 * set (otherwise falls to .L71).  Preloads four A values (f16-f19) and
 * four B values (f20-f23) and sets CTR to (trip count >> 2) for the
 * unrolled loop at .L82.  Trip count is KK for LT/RN, K - KK otherwise.
 */
2941	andi.	I,  M,  2
2942	ble	.L71
2943
2944#if defined(LT) || defined(RN)
2945	LFD	f16,  0 * SIZE(AO)
2946	LFD	f17,  1 * SIZE(AO)
2947	LFD	f18,  2 * SIZE(AO)
2948	LFD	f19,  3 * SIZE(AO)
2949
2950	LFD	f20,  0 * SIZE(B)
2951	LFD	f21,  1 * SIZE(B)
2952	LFD	f22,  2 * SIZE(B)
2953	LFD	f23,  3 * SIZE(B)
2954
2955	srawi.	r0, KK,  2
2956	mtspr	CTR, r0
2957	mr	BO,  B
2958#else
2959
2960#ifdef LN
/* LN walks A backwards: step AORIG back by two rows of K elements. */
2961	slwi	r0,   K,  1 + BASE_SHIFT
2962	sub	AORIG, AORIG, r0
2963#endif
2964
/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
2965	slwi	r0,   KK, 1 + BASE_SHIFT
2966	slwi	TEMP, KK, 0 + BASE_SHIFT
2967	add	AO, AORIG, r0
2968	add	BO, B,     TEMP
2969
2970	sub	TEMP, K, KK
2971
2972	LFD	f16,  0 * SIZE(AO)
2973	LFD	f17,  1 * SIZE(AO)
2974	LFD	f18,  2 * SIZE(AO)
2975	LFD	f19,  3 * SIZE(AO)
2976
2977	LFD	f20,  0 * SIZE(BO)
2978	LFD	f21,  1 * SIZE(BO)
2979	LFD	f22,  2 * SIZE(BO)
2980	LFD	f23,  3 * SIZE(BO)
2981
2982	srawi.	r0, TEMP,  2
2983	mtspr	CTR, r0
2984#endif
/* CR0 still reflects srawi. (mtspr/mr do not change it). */
2985	ble	.L85
2986	.align 5
2987
2988.L82:
/*
 * Main K loop of the 2x1 tile, unrolled by four.  Steps k and k+2
 * accumulate rows 0/1 into f0/f1, steps k+1 and k+3 into f2/f3; the
 * pairs are summed at .L88.  Each pass advances AO by 8 elements and BO
 * by 4 (via the LFDU loads).
 */
2989	FMADD	f0,  f16, f20, f0
2990	LFD	f16,  4 * SIZE(AO)
2991	FMADD	f1,  f17, f20, f1
2992	LFDU	f20,  4 * SIZE(BO)
2993	LFD	f17,  5 * SIZE(AO)
2994	FMADD	f2,  f18, f21, f2
2995	LFD	f18,  6 * SIZE(AO)
2996	FMADD	f3,  f19, f21, f3
2997	LFD	f21,  1 * SIZE(BO)
2998	LFD	f19,  7 * SIZE(AO)
2999
3000	FMADD	f0,  f16, f22, f0
3001	LFDU	f16,  8 * SIZE(AO)
3002	FMADD	f1,  f17, f22, f1
3003	LFD	f22,  2 * SIZE(BO)
3004	LFD	f17,  1 * SIZE(AO)
3005	FMADD	f2,  f18, f23, f2
3006	LFD	f18,  2 * SIZE(AO)
3007	FMADD	f3,  f19, f23, f3
3008	LFD	f23,  3 * SIZE(BO)
3009	LFD	f19,  3 * SIZE(AO)
3010	bdnz	.L82
3011	.align 4
3012
3013.L85:
/*
 * Remainder setup for the 2x1 tile: CTR = trip count mod 4 (KK for
 * LT/RN, TEMP = K - KK otherwise).  Skips to .L88 when nothing is left.
 */
3014#if defined(LT) || defined(RN)
3015	andi.	r0, KK,  3
3016#else
3017	andi.	r0, TEMP, 3
3018#endif
3019	mtspr	CTR, r0
3020	ble+	.L88
3021	.align 4
3022
3023.L86:
/*
 * Remainder K loop of the 2x1 tile, one k step per pass: the two A
 * values (f16, f17) times the B value in f20 go into f0/f1.  AO advances
 * by 2 elements and BO by 1.
 */
3024	FMADD	f0,  f16, f20, f0
3025	LFDU	f16,  2 * SIZE(AO)
3026	FMADD	f1,  f17, f20, f1
3027	LFDU	f20,  1 * SIZE(BO)
3028	LFD	f17,  1 * SIZE(AO)
3029	bdnz	.L86
3030	.align 4
3031
3032.L88:
/*
 * Finish the 2x1 tile: fold the second pair of partial sums into f0/f1
 * (rows 0/1), subtract from the right-hand side, solve per
 * LN/LT/RN/RT, store to the packed buffer and to C, then advance
 * pointers and KK.
 */
3033	FADD	f0, f2, f0
3034	FADD	f1, f3, f1
3035
3036#if defined(LN) || defined(RT)
/* Index of the block being solved: KK - 2 for LN, KK - 1 for RT. */
3037#ifdef LN
3038	subi	r0, KK, 2
3039#else
3040	subi	r0, KK, 1
3041#endif
3042	slwi	TEMP, r0, 1 + BASE_SHIFT
3043	slwi	r0,   r0, 0 + BASE_SHIFT
3044	add	AO, AORIG, TEMP
3045	add	BO, B,     r0
3046#endif
3047
/* Right-hand side minus accumulated products (from BO or AO). */
3048#if defined(LN) || defined(LT)
3049	LFD	f16,  0 * SIZE(BO)
3050	LFD	f20,  1 * SIZE(BO)
3051
3052	FSUB	f0,  f16, f0
3053	FSUB	f1,  f20, f1
3054#else
3055	LFD	f16,  0 * SIZE(AO)
3056	LFD	f17,  1 * SIZE(AO)
3057
3058	FSUB	f0,  f16, f0
3059	FSUB	f1,  f17, f1
3060#endif
3061
/*
 * LN: backward substitution over the 2x2 block at AO (entries 3, 2, 0).
 * Diagonal entries are multiplied, not divided, so they are presumably
 * stored as reciprocals - TODO confirm against the packing routine.
 */
3062#ifdef LN
3063	LFD	f19,  3 * SIZE(AO)
3064	LFD	f20,  2 * SIZE(AO)
3065	LFD	f21,  0 * SIZE(AO)
3066
3067	FMUL	f1,  f19, f1
3068	FNMSUB	f0,  f20, f1,  f0
3069	FMUL	f0,  f21, f0
3070#endif
3071
/* LT: forward substitution over the same block (entries 0, 1, 3). */
3072#ifdef LT
3073	LFD	f16,  0 * SIZE(AO)
3074	LFD	f17,  1 * SIZE(AO)
3075
3076	FMUL	f0,  f16, f0
3077	FNMSUB	f1,  f17, f0,  f1
3078
3079	LFD	f17,  3 * SIZE(AO)
3080	FMUL	f1,  f17, f1
3081#endif
3082
/* RN/RT: one column, so both rows are scaled by the single BO entry. */
3083#ifdef RN
3084	LFD	f16,  0 * SIZE(BO)
3085
3086	FMUL	f0,  f16, f0
3087	FMUL	f1,  f16, f1
3088#endif
3089
3090#ifdef RT
3091	LFD	f21,  0 * SIZE(BO)
3092
3093	FMUL	f0,  f21, f0
3094	FMUL	f1,  f21, f1
3095#endif
3096
3097#ifdef LN
/* LN walks C backwards: step the column pointer before storing. */
3098	subi	CO1, CO1, 2 * SIZE
3099#endif
3100
3101#if defined(LN) || defined(LT)
3102	STFD	f0,   0 * SIZE(BO)
3103	STFD	f1,   1 * SIZE(BO)
3104#else
3105	STFD	f0,   0 * SIZE(AO)
3106	STFD	f1,   1 * SIZE(AO)
3107#endif
3108
3109	STFD	f0,   0 * SIZE(CO1)
3110	STFD	f1,   1 * SIZE(CO1)
3111
/* Clear the four accumulators used by this path. */
3112	lfs	f0,  FZERO
3113 	fmr	f1,  f0
3114	fmr	f2,  f0
3115	fmr	f3,  f0
3116
3117#ifndef LN
3118	addi	CO1, CO1, 2 * SIZE
3119#endif
3120
3121#ifdef RT
3122	slwi	r0, K, 1 + BASE_SHIFT
3123	add	AORIG, AORIG, r0
3124#endif
3125
3126#if defined(LT) || defined(RN)
/* Skip the K - KK entries of this tile that the K loop did not consume. */
3127	sub	TEMP, K, KK
3128	slwi	r0,   TEMP, 1 + BASE_SHIFT
3129	slwi	TEMP, TEMP, 0 + BASE_SHIFT
3130	add	AO, AO, r0
3131	add	BO, BO, TEMP
3132#endif
3133
3134#ifdef LN
3135	subi	KK, KK, 2
3136#endif
3137
3138#ifdef LT
3139	addi	KK, KK, 2
3140#endif
3141	.align 4
3142
/* .L71: entry to the four-element tile loop.  I = M >> 2 (arithmetic      */
/* shift).  The dot form sets CR0, and a result that is zero or negative   */
/* branches straight to the exit code at .L999.                            */
3143.L71:
3144	srawi.	I, M,  2
3145	ble	.L999
3146	.align 4
3147
/* .L72: per-tile setup, re-entered once per value of I (see the bgt+ at   */
/* the end of the .L78 section).  It preloads four values through AO into  */
/* f16-f19 and four values from the B side into f20-f23, then loads CTR    */
/* with the number of passes of the unrolled loop at .L73.                 */
3148.L72:
3149#if defined(LT) || defined(RN)
	/* LT/RN: AO is used as it stands, B values are read directly      */
	/* through B, and the pass count is KK >> 2.                       */
3150	LFD	f16,  0 * SIZE(AO)
3151	LFD	f17,  1 * SIZE(AO)
3152	LFD	f18,  2 * SIZE(AO)
3153	LFD	f19,  3 * SIZE(AO)
3154
3155	LFD	f20,  0 * SIZE(B)
3156	LFD	f21,  1 * SIZE(B)
3157	LFD	f22,  2 * SIZE(B)
3158	LFD	f23,  3 * SIZE(B)
3159
3160	srawi.	r0, KK,  2
3161	mtspr	CTR, r0
3162	mr	BO,  B	/* mr does not alter CR0, so the ble below still tests the srawi. result */
3163#else
3164
3165#ifdef LN
	/* LN only: AORIG -= K << (2 + BASE_SHIFT) before deriving AO. */
3166	slwi	r0,   K,  2 + BASE_SHIFT
3167	sub	AORIG, AORIG, r0
3168#endif
3169
	/* AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT). */
3170	slwi	r0,   KK, 2 + BASE_SHIFT
3171	slwi	TEMP, KK, 0 + BASE_SHIFT
3172	add	AO, AORIG, r0
3173	add	BO, B,     TEMP
3174
	/* TEMP = K - KK is the step count for this tile; it is read again */
	/* at .L75 for the remainder.                                      */
3175	sub	TEMP, K, KK
3176
3177	LFD	f16,  0 * SIZE(AO)
3178	LFD	f17,  1 * SIZE(AO)
3179	LFD	f18,  2 * SIZE(AO)
3180	LFD	f19,  3 * SIZE(AO)
3181
3182	LFD	f20,  0 * SIZE(BO)
3183	LFD	f21,  1 * SIZE(BO)
3184	LFD	f22,  2 * SIZE(BO)
3185	LFD	f23,  3 * SIZE(BO)
3186
3187	srawi.	r0, TEMP,  2
3188	mtspr	CTR, r0
3189#endif
	/* Skip the unrolled loop when the pass count is zero or negative. */
3190	ble	.L75
3191	.align 5
3192
/* .L73: unrolled accumulation loop, repeated CTR times by bdnz.           */
/* Each pass performs four steps.  In every step the four accumulators     */
/* f0-f3 each gain one AO value (f16-f19) times one shared BO value: f20   */
/* in step 1, f21 in step 2, f22 in step 3 and f23 in step 4.  Reloads of  */
/* the AO and BO registers are interleaved with the FMADDs.  Assuming LFDU */
/* is the update form of the load, one pass advances AO by 16 * SIZE and   */
/* BO by 4 * SIZE.                                                         */
3193.L73:
3194	FMADD	f0,  f16, f20, f0
3195	LFD	f16,  4 * SIZE(AO)
3196	FMADD	f1,  f17, f20, f1
3197	LFD	f17,  5 * SIZE(AO)
3198	FMADD	f2,  f18, f20, f2
3199	LFD	f18,  6 * SIZE(AO)
3200	FMADD	f3,  f19, f20, f3
3201	LFD	f19,  7 * SIZE(AO)
3202	LFDU	f20,  4 * SIZE(BO)	/* BO moves on; BO offsets 1..3 below are relative to the updated BO */
3203
3204	FMADD	f0,  f16, f21, f0
3205	LFD	f16,  8 * SIZE(AO)
3206	FMADD	f1,  f17, f21, f1
3207	LFD	f17,  9 * SIZE(AO)
3208	FMADD	f2,  f18, f21, f2
3209	LFD	f18, 10 * SIZE(AO)
3210	FMADD	f3,  f19, f21, f3
3211	LFD	f19, 11 * SIZE(AO)
3212	LFD	f21,  1 * SIZE(BO)
3213
3214	FMADD	f0,  f16, f22, f0
3215	LFD	f16, 12 * SIZE(AO)
3216	FMADD	f1,  f17, f22, f1
3217	LFD	f17, 13 * SIZE(AO)
3218	FMADD	f2,  f18, f22, f2
3219	LFD	f18, 14 * SIZE(AO)
3220	FMADD	f3,  f19, f22, f3
3221	LFD	f19, 15 * SIZE(AO)
3222	LFD	f22,  2 * SIZE(BO)
3223
3224	FMADD	f0,  f16, f23, f0
3225	LFDU	f16, 16 * SIZE(AO)	/* AO moves on; AO offsets 1..3 below are relative to the updated AO */
3226	FMADD	f1,  f17, f23, f1
3227	LFD	f17,  1 * SIZE(AO)
3228	FMADD	f2,  f18, f23, f2
3229	LFD	f18,  2 * SIZE(AO)
3230	FMADD	f3,  f19, f23, f3
3231	LFD	f19,  3 * SIZE(AO)
3232	LFD	f23,  3 * SIZE(BO)
3233	bdnz	.L73
3234	.align 4
3235
/* .L75: remainder handling after the unrolled loop at .L73.               */
/* The low two bits of the count (KK for LT/RN, TEMP = K - KK otherwise)   */
/* give the number of single steps still to run.  andi. sets CR0, so the   */
/* branch below skips to .L78 when nothing is left.                        */
3236.L75:
3237#if defined(LT) || defined(RN)
3238	andi.	r0, KK,  3
3239#else
3240	andi.	r0, TEMP, 3
3241#endif
3242	mtspr	CTR, r0
3243	ble+	.L78
3244	.align 4
3245
/* .L76: one step per pass.  f0-f3 each gain one AO value times the        */
/* current BO value (f20).  AO advances by 4 * SIZE and BO by 1 * SIZE per */
/* pass (assuming LFDU is the update form of the load).                    */
3246.L76:
3247	FMADD	f0,  f16, f20, f0
3248	LFDU	f16,  4 * SIZE(AO)
3249	FMADD	f1,  f17, f20, f1
3250	LFD	f17,  1 * SIZE(AO)
3251	FMADD	f2,  f18, f20, f2
3252	LFD	f18,  2 * SIZE(AO)
3253	FMADD	f3,  f19, f20, f3
3254	LFDU	f20,  1 * SIZE(BO)	/* reloaded only after the last FMADD that reads the old f20 */
3255	LFD	f19,  3 * SIZE(AO)
3256	bdnz	.L76
3257	.align 4
3258
/* .L78: finish the four-element tile.                                     */
/*   1. For LN/RT, re-point AO and BO relative to AORIG and B.             */
/*   2. Subtract the accumulated products f0-f3 from the stored values.    */
/*   3. Apply the variant-specific substitution (LN, LT, RN or RT).        */
/*   4. Store the result to the packed buffer (BO or AO) and to CO1.       */
/*   5. Reset f0-f3, advance CO1, AORIG, AO, BO and KK as the variant      */
/*      requires, then decrement I and loop back to .L72 while I > 0.      */
/* FSUB, FMUL, FNMSUB, LFD and STFD are macros defined outside this chunk; */
/* the notes below assume the usual PowerPC operand meanings.              */
3259.L78:
3260#if defined(LN) || defined(RT)
	/* r0 = KK - 4 for LN, KK - 1 for RT.                             */
	/* AO = AORIG + (r0 << (2 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT). */
3261#ifdef LN
3262	subi	r0, KK, 4
3263#else
3264	subi	r0, KK, 1
3265#endif
3266	slwi	TEMP, r0, 2 + BASE_SHIFT
3267	slwi	r0,   r0, 0 + BASE_SHIFT
3268	add	AO, AORIG, TEMP
3269	add	BO, B,     r0
3270#endif
3271
3272#if defined(LN) || defined(LT)
	/* LN/LT: the four values to update are read through BO. */
3273	LFD	f16,  0 * SIZE(BO)
3274	LFD	f20,  1 * SIZE(BO)
3275	LFD	f24,  2 * SIZE(BO)
3276	LFD	f28,  3 * SIZE(BO)
3277
3278	FSUB	f0,  f16, f0	/* fN = loaded value - accumulated sum */
3279	FSUB	f1,  f20, f1
3280	FSUB	f2,  f24, f2
3281	FSUB	f3,  f28, f3
3282#else
	/* RN/RT: the four values to update are read through AO. */
3283	LFD	f16,  0 * SIZE(AO)
3284	LFD	f17,  1 * SIZE(AO)
3285	LFD	f18,  2 * SIZE(AO)
3286	LFD	f19,  3 * SIZE(AO)
3287
3288	FSUB	f0,  f16, f0
3289	FSUB	f1,  f17, f1
3290	FSUB	f2,  f18, f2
3291	FSUB	f3,  f19, f3
3292#endif
3293
3294#ifdef LN
	/* Backward substitution over a 4x4 block read through AO,        */
	/* solving element 3 first and element 0 last.  Entries 15, 10, 5 */
	/* and 0 scale the element being solved; entries 14/13/12, 9/8    */
	/* and 4 eliminate it from the remaining elements.                */
	/* NOTE(review): the scaling entries are applied with FMUL, so    */
	/* the packed data presumably holds reciprocals; confirm against  */
	/* the packing routine.                                           */
3295	LFD	f16, 15 * SIZE(AO)
3296	LFD	f17, 14 * SIZE(AO)
3297	LFD	f18, 13 * SIZE(AO)
3298	LFD	f19, 12 * SIZE(AO)
3299
3300	FMUL	f3,  f16, f3
3301	FNMSUB	f2,  f17, f3,  f2	/* f2 = f2 - f17 * f3 */
3302	FNMSUB	f1,  f18, f3,  f1
3303	FNMSUB	f0,  f19, f3,  f0
3304
3305	LFD	f16, 10 * SIZE(AO)
3306	LFD	f17,  9 * SIZE(AO)
3307	LFD	f18,  8 * SIZE(AO)
3308	LFD	f19,  5 * SIZE(AO)
3309
3310	LFD	f20,  4 * SIZE(AO)
3311	LFD	f21,  0 * SIZE(AO)
3312
3313	FMUL	f2,  f16, f2
3314	FNMSUB	f1,  f17, f2,  f1
3315	FNMSUB	f0,  f18, f2,  f0
3316
3317	FMUL	f1,  f19, f1
3318	FNMSUB	f0,  f20, f1,  f0
3319	FMUL	f0,  f21, f0
3320#endif
3321
3322#ifdef LT
	/* Forward substitution over a 4x4 block read through AO, solving */
	/* element 0 first and element 3 last.  Entries 0, 5, 10 and 15   */
	/* scale the element being solved; entries 1/2/3, 6/7 and 11      */
	/* eliminate it from the remaining elements.                      */
3323	LFD	f16,  0 * SIZE(AO)
3324	LFD	f17,  1 * SIZE(AO)
3325	LFD	f18,  2 * SIZE(AO)
3326	LFD	f19,  3 * SIZE(AO)
3327
3328	FMUL	f0,  f16, f0
3329	FNMSUB	f1,  f17, f0,  f1	/* f1 = f1 - f17 * f0 */
3330	FNMSUB	f2,  f18, f0,  f2
3331	FNMSUB	f3,  f19, f0,  f3
3332
3333	LFD	f17,  5 * SIZE(AO)
3334	LFD	f18,  6 * SIZE(AO)
3335	LFD	f19,  7 * SIZE(AO)
3336
3337	FMUL	f1,  f17, f1
3338	FNMSUB	f2,  f18, f1,  f2
3339	FNMSUB	f3,  f19, f1,  f3
3340
3341	LFD	f18, 10 * SIZE(AO)
3342	LFD	f19, 11 * SIZE(AO)
3343
3344	FMUL	f2,  f18, f2
3345	FNMSUB	f3,  f19, f2,  f3
3346
3347	LFD	f19, 15 * SIZE(AO)
3348
3349	FMUL	f3,  f19, f3
3350#endif
3351
3352#ifdef RN
	/* Scale all four elements by the single value at BO. */
3353	LFD	f16,  0 * SIZE(BO)
3354
3355	FMUL	f0,  f16, f0
3356	FMUL	f1,  f16, f1
3357	FMUL	f2,  f16, f2
3358	FMUL	f3,  f16, f3
3359#endif
3360
3361#ifdef RT
	/* Scale all four elements by the single value at BO. */
3362	LFD	f21,  0 * SIZE(BO)
3363
3364	FMUL	f0,  f21, f0
3365	FMUL	f1,  f21, f1
3366	FMUL	f2,  f21, f2
3367	FMUL	f3,  f21, f3
3368#endif
3369
3370#ifdef LN
	/* LN steps CO1 back before the store below; the other variants */
	/* step CO1 forward after it.                                   */
3371	subi	CO1, CO1, 4 * SIZE
3372#endif
3373
	/* Write the solved values back to the packed buffer ... */
3374#if defined(LN) || defined(LT)
3375	STFD	f0,   0 * SIZE(BO)
3376	STFD	f1,   1 * SIZE(BO)
3377	STFD	f2,   2 * SIZE(BO)
3378	STFD	f3,   3 * SIZE(BO)
3379#else
3380	STFD	f0,   0 * SIZE(AO)
3381	STFD	f1,   1 * SIZE(AO)
3382	STFD	f2,   2 * SIZE(AO)
3383	STFD	f3,   3 * SIZE(AO)
3384#endif
3385
	/* ... and to the output location addressed by CO1. */
3386	STFD	f0,   0 * SIZE(CO1)
3387	STFD	f1,   1 * SIZE(CO1)
3388	STFD	f2,   2 * SIZE(CO1)
3389	STFD	f3,   3 * SIZE(CO1)
3390
	/* Re-initialise all four accumulators from FZERO (a macro defined */
	/* outside this chunk, presumably the location of a zero constant). */
3391	lfs	f0,  FZERO
3392 	fmr	f1,  f0
3393	fmr	f2,  f0
3394	fmr	f3,  f0
3395
3396#ifndef LN
3397	addi	CO1, CO1, 4 * SIZE
3398#endif
3399
3400#ifdef RT
	/* AORIG += K << (2 + BASE_SHIFT) */
3401	slwi	r0, K, 2 + BASE_SHIFT
3402	add	AORIG, AORIG, r0
3403#endif
3404
3405#if defined(LT) || defined(RN)
	/* Skip the K - KK steps that were not consumed:                  */
	/* AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
3406	sub	TEMP, K, KK
3407	slwi	r0,   TEMP, 2 + BASE_SHIFT
3408	slwi	TEMP, TEMP, 0 + BASE_SHIFT
3409	add	AO, AO, r0
3410	add	BO, BO, TEMP
3411#endif
3412
3413#ifdef LN
3414	subi	KK, KK, 4
3415#endif
3416
3417#ifdef LT
3418	addi	KK, KK, 4
3419#endif
3420
	/* Next tile: decrement I and repeat from .L72 while it stays positive. */
3421	addic.	I, I, -1
3422	bgt+	.L72
3423	.align 4
3424
/* .L999: common exit path.  Sets r3 to zero, reloads f14-f31 and r18-r31  */
/* from the frame at SP, releases STACKSIZE bytes of stack and returns.    */
/* The matching register saves are not visible in this chunk; they are     */
/* presumably made by the function prologue at the same offsets.  SP,      */
/* STACKSIZE and EPILOGUE are macros defined outside this chunk.           */
3425.L999:
3426	addi	r3, 0, 0	/* r3 = 0 (literal-zero base operand); presumably the return value */
3427
	/* Floating-point registers f14-f31: 8 bytes each at offsets 0..136. */
3428	lfd	f14,    0(SP)
3429	lfd	f15,    8(SP)
3430	lfd	f16,   16(SP)
3431	lfd	f17,   24(SP)
3432
3433	lfd	f18,   32(SP)
3434	lfd	f19,   40(SP)
3435	lfd	f20,   48(SP)
3436	lfd	f21,   56(SP)
3437
3438	lfd	f22,   64(SP)
3439	lfd	f23,   72(SP)
3440	lfd	f24,   80(SP)
3441	lfd	f25,   88(SP)
3442
3443	lfd	f26,   96(SP)
3444	lfd	f27,  104(SP)
3445	lfd	f28,  112(SP)
3446	lfd	f29,  120(SP)
3447
3448	lfd	f30,  128(SP)
3449	lfd	f31,  136(SP)
3450
	/* General-purpose registers r31 down to r18, starting at offset 144: */
	/* 8-byte slots with ld under __64BIT__, 4-byte slots with lwz         */
	/* otherwise.                                                          */
3451#ifdef __64BIT__
3452	ld	r31,  144(SP)
3453	ld	r30,  152(SP)
3454	ld	r29,  160(SP)
3455	ld	r28,  168(SP)
3456	ld	r27,  176(SP)
3457	ld	r26,  184(SP)
3458	ld	r25,  192(SP)
3459	ld	r24,  200(SP)
3460	ld	r23,  208(SP)
3461	ld	r22,  216(SP)
3462	ld	r21,  224(SP)
3463	ld	r20,  232(SP)
3464	ld	r19,  240(SP)
3465	ld	r18,  248(SP)
3466#else
3467	lwz	r31,  144(SP)
3468	lwz	r30,  148(SP)
3469	lwz	r29,  152(SP)
3470	lwz	r28,  156(SP)
3471	lwz	r27,  160(SP)
3472	lwz	r26,  164(SP)
3473	lwz	r25,  168(SP)
3474	lwz	r24,  172(SP)
3475	lwz	r23,  176(SP)
3476	lwz	r22,  180(SP)
3477	lwz	r21,  184(SP)
3478	lwz	r20,  188(SP)
3479	lwz	r19,  192(SP)
3480	lwz	r18,  196(SP)
3481#endif
3482
	/* Release the stack frame and return to the caller. */
3483	addi	SP, SP, STACKSIZE
3484
3485	blr
3486
3487	EPILOGUE
3488