1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* LOAD: native-width integer load -- lwz when __64BIT__ is not defined,   */
/* ld when it is.                                                          */
42#ifndef __64BIT__
43#define LOAD	lwz
44#else
45#define LOAD	ld
46#endif
47
/* Stack frame layout.  STACKSIZE is the amount subtracted from SP on      */
/* entry.  ALPHA is the frame slot that receives f1; FZERO is the slot     */
/* that is filled with an integer zero word and later read back with lfs   */
/* to obtain a floating-point zero for clearing the accumulators.          */
48#ifdef __64BIT__
49#define STACKSIZE 320
50#define ALPHA   296(SP)
51#define FZERO	304(SP)
52#else
53#define STACKSIZE 240
54#define ALPHA   224(SP)
55#define FZERO	232(SP)
56#endif
57
/* Register assignments.                                                   */
/* M, N and K live in r3-r5 on every platform.  The registers that hold    */
/* A, B, C, LDC and OFFSET depend on the OS macro (linux, _AIX,            */
/* __APPLE__), on __64BIT__ and, for 32-bit AIX/Apple, on DOUBLE.          */
58#define	M	r3
59#define	N	r4
60#define	K	r5
61
62#ifdef linux
63#ifndef __64BIT__
64#define A	r6
65#define	B	r7
66#define	C	r8
67#define	LDC	r9
68#define OFFSET	r10
69#else
70#define A	r7
71#define	B	r8
72#define	C	r9
73#define	LDC	r10
74#define OFFSET	r6
75#endif
76#endif
77
78#if defined(_AIX) || defined(__APPLE__)
79#if !defined(__64BIT__) && defined(DOUBLE)
/* In this configuration LDC is reloaded from the caller's frame in the    */
/* prologue, so r7 is only its destination register.                       */
80#define A	r8
81#define	B	r9
82#define	C	r10
83#define	LDC	r7
84#define OFFSET	r6
85#else
86#define A	r7
87#define	B	r8
88#define	C	r9
89#define	LDC	r10
90#define OFFSET	r6
91#endif
92#endif
93
/* Working registers used by the kernel body.  r18 and r19 are saved by    */
/* the prologue only when TRMMKERNEL is defined.                           */
94#define TEMP	r18	/* scratch: trip counts and pointer offsets (TRMM paths) */
95#define KK	r19	/* TRMM running offset, seeded from OFFSET */
96#define BB	r20	/* index used with dcbt on B */
97#define	I	r21	/* loop counter derived from M */
98#define J	r22	/* loop counter derived from N */
99#define AO	r23	/* running pointer into A */
100#define	BO	r24	/* running pointer into B */
101#define	CO1	r25	/* output pointer: C */
102#define CO2	r26	/* output pointer: CO1 + LDC */
103#define	CO3	r27	/* output pointer: CO2 + LDC */
104#define	CO4	r28	/* output pointer: CO3 + LDC */
105
/* Prefetch offsets handed to DCBT / DCBTST. */
106#define PREA	r29
107#define PREB	r30
108#define PREC	r31
109
110#ifndef NEEDPARAM
111
/* Kernel entry.                                                           */
/* Allocates a STACKSIZE-byte frame, saves f14-f31 and r20-r31 (plus       */
/* r18/r19 for TRMMKERNEL builds), spills f1 and a zero word to the frame, */
/* shifts LDC left by BASE_SHIFT (presumably converting an element count   */
/* to a byte stride -- BASE_SHIFT comes from common.h), fetches            */
/* stack-passed arguments where the platform requires it, and branches to  */
/* LL(999) (outside this excerpt) when M, N or K is not positive.          */
112	PROLOGUE
113	PROFCODE
114
115	addi	SP, SP, -STACKSIZE
116	li	r0, 0	/* zero word, stored to FZERO below */
117
/* Save floating-point registers f14-f31 at offsets 0..136. */
118	stfd	f14,    0(SP)
119	stfd	f15,    8(SP)
120	stfd	f16,   16(SP)
121	stfd	f17,   24(SP)
122
123	stfd	f18,   32(SP)
124	stfd	f19,   40(SP)
125	stfd	f20,   48(SP)
126	stfd	f21,   56(SP)
127
128	stfd	f22,   64(SP)
129	stfd	f23,   72(SP)
130	stfd	f24,   80(SP)
131	stfd	f25,   88(SP)
132
133	stfd	f26,   96(SP)
134	stfd	f27,  104(SP)
135	stfd	f28,  112(SP)
136	stfd	f29,  120(SP)
137
138	stfd	f30,  128(SP)
139	stfd	f31,  136(SP)
140
/* Save integer registers r20-r31 starting at offset 144; r18 and r19      */
/* (TEMP and KK) are saved only for TRMM builds, the only ones that use    */
/* them.  Slots are 8 bytes wide in 64-bit mode and 4 bytes otherwise.     */
141#ifdef __64BIT__
142	std	r31,  144(SP)
143	std	r30,  152(SP)
144	std	r29,  160(SP)
145	std	r28,  168(SP)
146	std	r27,  176(SP)
147	std	r26,  184(SP)
148	std	r25,  192(SP)
149	std	r24,  200(SP)
150	std	r23,  208(SP)
151	std	r22,  216(SP)
152	std	r21,  224(SP)
153	std	r20,  232(SP)
154#if defined(TRMMKERNEL)
155	std	r19,  240(SP)
156	std	r18,  248(SP)
157#endif
158#else
159	stw	r31,  144(SP)
160	stw	r30,  148(SP)
161	stw	r29,  152(SP)
162	stw	r28,  156(SP)
163	stw	r27,  160(SP)
164	stw	r26,  164(SP)
165	stw	r25,  168(SP)
166	stw	r24,  172(SP)
167	stw	r23,  176(SP)
168	stw	r22,  180(SP)
169	stw	r21,  184(SP)
170	stw	r20,  188(SP)
171#if defined(TRMMKERNEL)
172	stw	r19,  192(SP)
173	stw	r18,  196(SP)
174#endif
175#endif
176
/* Spill f1 so it can be reloaded into f30 at write-back time, and store   */
/* the zero word that lfs later reads as 0.0.                              */
177	stfd	f1,  ALPHA
178	stw	r0,  FZERO
179
/* 32-bit AIX/Apple double precision: LDC is read from the caller's frame. */
180#if defined(_AIX) || defined(__APPLE__)
181#if !defined(__64BIT__) && defined(DOUBLE)
182	lwz	LDC,    56 + STACKSIZE(SP)
183#endif
184#endif
185
186	slwi	LDC, LDC, BASE_SHIFT
187
/* TRMM builds: fetch OFFSET from the caller's frame on the platforms      */
/* where it is read from the stack.                                        */
188#if defined(TRMMKERNEL)
189#if defined(linux) && defined(__64BIT__)
190	ld	OFFSET,   112 + STACKSIZE(SP)
191#endif
192
193#if defined(_AIX) || defined(__APPLE__)
194#ifdef __64BIT__
195	ld	OFFSET,  112 + STACKSIZE(SP)
196#else
197#ifdef DOUBLE
198	lwz	OFFSET,   60 + STACKSIZE(SP)
199#else
200	lwz	OFFSET,   56 + STACKSIZE(SP)
201#endif
202#endif
203#endif
204#endif
205
/* Non-LEFT TRMM starts with KK = -OFFSET; the LEFT variant instead        */
/* copies OFFSET into KK at the top of each column panel.                  */
206#if defined(TRMMKERNEL) && !defined(LEFT)
207	neg	KK, OFFSET
208#endif
209
/* Nothing to do unless M, N and K are all positive (signed word compare). */
210	cmpwi	cr0, M, 0
211	ble	LL(999)
212	cmpwi	cr0, N, 0
213	ble	LL(999)
214	cmpwi	cr0, K, 0
215	ble	LL(999)
216
217#ifndef PREFETCHTEST
218/* Normal prefetch */
219#ifdef PPC970
220	li	PREC,   4 * SIZE
221#endif
222#ifdef POWER4
223	li	PREC,   4 * SIZE   /* is 12 best? */
224#endif
225#ifdef POWER5
226	li	PREC,   3 * SIZE
227#endif
228
229#else
230
231#ifdef linux
232#ifndef __64BIT__
233	mr	PREA,  r10
234	lwz	PREB,   8 + STACKSIZE(SP)
235	lwz	PREC,  12 + STACKSIZE(SP)
236#else
237	ld	PREA,  112 + STACKSIZE(SP)
238	ld	PREB,  120 + STACKSIZE(SP)
239	ld	PREC,  128 + STACKSIZE(SP)
240#endif
241#endif
242
243#if defined(_AIX) || defined(__APPLE__)
244#ifdef __64BIT__
245	ld	PREA,  112 + STACKSIZE(SP)
246	ld	PREB,  120 + STACKSIZE(SP)
247	ld	PREC,  128 + STACKSIZE(SP)
248#else
249#ifdef DOUBLE
250	lwz	PREA,   60 + STACKSIZE(SP)
251	lwz	PREB,   64 + STACKSIZE(SP)
252	lwz	PREC,   68 + STACKSIZE(SP)
253#else
254	lwz	PREA,   56 + STACKSIZE(SP)
255	lwz	PREB,   60 + STACKSIZE(SP)
256	lwz	PREC,   64 + STACKSIZE(SP)
257#endif
258#endif
259#endif
260
261#endif
262
263#ifndef PREFETCHTEST
264#ifdef PPC970
265#ifdef ALLOC_HUGETLB
266	li	PREA,   (16 *  1 * SIZE)
267	li	PREB,   (16 *  5 * SIZE)
268#else
269	li	PREA,   (16 * 19 * SIZE)
270	li	PREB,   (16 *  8 * SIZE)
271#endif
272#endif
273#ifdef POWER4
274#ifdef ALLOC_HUGETLB
275	li	PREA,   (16 *  1 * SIZE)
276	li	PREB,   (16 *  1 * SIZE)
277#else
278	li	PREA,   (16 *  2 * SIZE)
279	li	PREB,   (16 *  2 * SIZE)
280#endif
281#endif
282#ifdef POWER5
283#ifdef ALLOC_HUGETLB
284	li	PREA,   (16 *  7 * SIZE)
285	li	PREB,   (16 *  7 * SIZE)
286#else
287	li	PREA,   (16 * 12 * SIZE)
288	li	PREB,   (16 *  6 * SIZE)
289#endif
290#endif
291#endif
292
293	srawi.	J, N,  2
294	ble	LL(40)
295	.align 4
296
/* LL(10): top of the loop over groups of four columns (J = N >> 2).       */
/* Sets up the four output pointers, clears the sixteen accumulators       */
/* f0-f15 and prepares the loop over groups of four rows (I = M >> 2).     */
297LL(10):
/* CO1..CO4 address four locations in C spaced LDC bytes apart. */
298	mr	CO1, C
299	add	CO2, C,  LDC
300	add	CO3, CO2, LDC
301	add	CO4, CO3, LDC
302
/* LEFT TRMM restarts KK from OFFSET for every column panel. */
303#if defined(TRMMKERNEL) &&  defined(LEFT)
304	mr	KK, OFFSET
305#endif
306
/* BB = K << (BASE_SHIFT + 2); used below as the dcbt index on B. */
307	slwi	BB, K, BASE_SHIFT + 2
308
/* Read the zero word stored at FZERO as 0.0 and copy it into f1-f15. */
309	lfs	f0,  FZERO
310 	fmr	f1,  f0
311	fmr	f2,  f0
312	fmr	f3,  f0
313	fmr	f4,  f0
314	fmr	f5,  f0
315	fmr	f6,  f0
316	fmr	f7,  f0
317	fmr	f8,  f0
318	fmr	f9,  f0
319	fmr	f10, f0
320	fmr	f11, f0
321	fmr	f12, f0
322	fmr	f13, f0
323	fmr	f14, f0
324	fmr	f15, f0
325
/* I = M >> 2 sets CR0; the mr/add in between do not modify it, so the     */
/* ble below still tests I <= 0.  C is advanced past the four columns      */
/* handled by this panel.                                                  */
326	srawi.	I, M,  2
327	mr	AO, A
328	add	C,  CO4, LDC
329	ble	LL(20)
330	.align 4
331
332LL(11):
333#if defined(TRMMKERNEL)
334#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
335	LFD	f16,  0 * SIZE(AO)
336	LFD	f17,  1 * SIZE(AO)
337	LFD	f18,  2 * SIZE(AO)
338	LFD	f19,  3 * SIZE(AO)
339
340	LFD	f20,  0 * SIZE(B)
341	LFD	f21,  1 * SIZE(B)
342	LFD	f22,  2 * SIZE(B)
343	LFD	f23,  3 * SIZE(B)
344
345#ifdef POWER5
346	LFD	f28,  4 * SIZE(B)
347	LFD	f29,  5 * SIZE(B)
348	LFD	f30,  6 * SIZE(B)
349	LFD	f31,  7 * SIZE(B)
350#endif
351	mr	BO,  B
352#else
353	slwi	r0, KK, 2 + BASE_SHIFT
354	add	AO, AO, r0
355	add	BO, B,  r0
356
357	LFD	f16,  0 * SIZE(AO)
358	LFD	f17,  1 * SIZE(AO)
359	LFD	f18,  2 * SIZE(AO)
360	LFD	f19,  3 * SIZE(AO)
361
362	LFD	f20,  0 * SIZE(BO)
363	LFD	f21,  1 * SIZE(BO)
364	LFD	f22,  2 * SIZE(BO)
365	LFD	f23,  3 * SIZE(BO)
366
367#ifdef POWER5
368	LFD	f28,  4 * SIZE(BO)
369	LFD	f29,  5 * SIZE(BO)
370	LFD	f30,  6 * SIZE(BO)
371	LFD	f31,  7 * SIZE(BO)
372#endif
373#endif
374
375	DCBTST(CO1, PREC)
376	DCBTST(CO2, PREC)
377	DCBTST(CO3, PREC)
378	DCBTST(CO4, PREC)
379
380	dcbt	B, BB
381	addi	BB, BB, 16 * SIZE
382
383#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
384	sub	TEMP, K, KK
385#elif defined(LEFT)
386	addi	TEMP, KK, 4
387#else
388	addi	TEMP, KK, 4
389#endif
390	srawi.	TEMP,  TEMP,  2
391	mtspr	CTR, TEMP
392	ble	LL(15)
393
394#else
395
396	LFD	f16,  0 * SIZE(AO)
397	LFD	f17,  1 * SIZE(AO)
398	LFD	f18,  2 * SIZE(AO)
399	LFD	f19,  3 * SIZE(AO)
400
401	LFD	f20,  0 * SIZE(B)
402	LFD	f21,  1 * SIZE(B)
403	LFD	f22,  2 * SIZE(B)
404	LFD	f23,  3 * SIZE(B)
405
406#ifdef POWER5
407	LFD	f28,  4 * SIZE(B)
408	LFD	f29,  5 * SIZE(B)
409	LFD	f30,  6 * SIZE(B)
410	LFD	f31,  7 * SIZE(B)
411#endif
412
413	DCBTST(CO1, PREC)
414	DCBTST(CO2, PREC)
415	DCBTST(CO3, PREC)
416	DCBTST(CO4, PREC)
417
418	dcbt	B, BB
419	addi	BB, BB, 16 * SIZE
420
421	srawi.	r0,  K,  2
422	mtspr	CTR, r0
423	mr	BO,  B
424	ble	LL(15)
425#endif
426	.align 4
427
428LL(12):
429	FMADD	f0,  f16, f20, f0
430	FMADD	f5,  f17, f21, f5
431	FMADD	f10, f18, f22, f10
432	FMADD	f15, f19, f23, f15
433
434#if defined(ALLOC_HUGETLB) && !defined(POWER5)
435	LFD	f28,  4 * SIZE(BO)
436	LFD	f29,  5 * SIZE(BO)
437	LFD	f30,  6 * SIZE(BO)
438	LFD	f31,  7 * SIZE(BO)
439#endif
440
441	FMADD	f1,  f17, f20, f1
442	FMADD	f2,  f18, f20, f2
443	FMADD	f3,  f19, f20, f3
444	FMADD	f4,  f16, f21, f4
445
446#if !defined(ALLOC_HUGETLB) && !defined(POWER5)
447	LFD	f28,  4 * SIZE(BO)
448	LFD	f29,  5 * SIZE(BO)
449	LFD	f30,  6 * SIZE(BO)
450	LFD	f31,  7 * SIZE(BO)
451#endif
452
453	LFD	f24,  4 * SIZE(AO)
454	LFD	f25,  5 * SIZE(AO)
455	LFD	f26,  6 * SIZE(AO)
456	LFD	f27,  7 * SIZE(AO)
457
458	FMADD	f6,  f18, f21, f6
459	FMADD	f7,  f19, f21, f7
460	FMADD	f8,  f16, f22, f8
461	FMADD	f9,  f17, f22, f9
462
463	FMADD	f11, f19, f22, f11
464	FMADD	f12, f16, f23, f12
465	FMADD	f13, f17, f23, f13
466	FMADD	f14, f18, f23, f14
467
468	LFD	f20,  8 * SIZE(BO)
469	LFD	f21,  9 * SIZE(BO)
470	LFD	f22, 10 * SIZE(BO)
471	LFD	f23, 11 * SIZE(BO)
472
473	FMADD	f0,  f24, f28, f0
474	FMADD	f5,  f25, f29, f5
475	FMADD	f10, f26, f30, f10
476	FMADD	f15, f27, f31, f15
477
478	LFD	f16,  8 * SIZE(AO)
479	LFD	f17,  9 * SIZE(AO)
480	LFD	f18, 10 * SIZE(AO)
481	LFD	f19, 11 * SIZE(AO)
482
483	FMADD	f1,  f25, f28, f1
484	FMADD	f2,  f26, f28, f2
485	FMADD	f3,  f27, f28, f3
486	FMADD	f4,  f24, f29, f4
487
488	FMADD	f6,  f26, f29, f6
489	FMADD	f7,  f27, f29, f7
490	FMADD	f8,  f24, f30, f8
491	FMADD	f9,  f25, f30, f9
492
493	FMADD	f11, f27, f30, f11
494	FMADD	f12, f24, f31, f12
495	FMADD	f13, f25, f31, f13
496	FMADD	f14, f26, f31, f14
497
498	LFD	f28, 12 * SIZE(BO)
499	LFD	f29, 13 * SIZE(BO)
500	LFD	f30, 14 * SIZE(BO)
501	LFD	f31, 15 * SIZE(BO)
502
503	FMADD	f0,  f16, f20, f0
504	FMADD	f5,  f17, f21, f5
505	FMADD	f10, f18, f22, f10
506	FMADD	f15, f19, f23, f15
507
508	LFD	f24, 12 * SIZE(AO)
509	LFD	f25, 13 * SIZE(AO)
510	LFD	f26, 14 * SIZE(AO)
511	LFD	f27, 15 * SIZE(AO)
512
513	FMADD	f1,  f17, f20, f1
514	FMADD	f2,  f18, f20, f2
515	FMADD	f3,  f19, f20, f3
516	FMADD	f4,  f16, f21, f4
517
518	FMADD	f6,  f18, f21, f6
519	FMADD	f7,  f19, f21, f7
520	FMADD	f8,  f16, f22, f8
521	FMADD	f9,  f17, f22, f9
522
523	FMADD	f11, f19, f22, f11
524	FMADD	f12, f16, f23, f12
525	FMADD	f13, f17, f23, f13
526	FMADD	f14, f18, f23, f14
527
528#ifndef POWER5
529	LFD	f16, 16 * SIZE(AO)
530	LFD	f17, 17 * SIZE(AO)
531	LFD	f18, 18 * SIZE(AO)
532	LFD	f19, 19 * SIZE(AO)
533#else
534	LFD	f20, 16 * SIZE(BO)
535	LFD	f21, 17 * SIZE(BO)
536	LFD	f22, 18 * SIZE(BO)
537	LFD	f23, 19 * SIZE(BO)
538#endif
539
540	FMADD	f0,  f24, f28, f0
541	FMADD	f5,  f25, f29, f5
542	FMADD	f10, f26, f30, f10
543	FMADD	f15, f27, f31, f15
544
545#ifndef POWER5
546	LFD	f20, 16 * SIZE(BO)
547	LFD	f21, 17 * SIZE(BO)
548	LFD	f22, 18 * SIZE(BO)
549	LFD	f23, 19 * SIZE(BO)
550#else
551	LFD	f16, 16 * SIZE(AO)
552	LFD	f17, 17 * SIZE(AO)
553	LFD	f18, 18 * SIZE(AO)
554	LFD	f19, 19 * SIZE(AO)
555#endif
556
557	FMADD	f1,  f25, f28, f1
558	FMADD	f2,  f26, f28, f2
559	FMADD	f3,  f27, f28, f3
560	FMADD	f4,  f24, f29, f4
561
562	FMADD	f6,  f26, f29, f6
563	FMADD	f7,  f27, f29, f7
564	FMADD	f8,  f24, f30, f8
565	FMADD	f9,  f25, f30, f9
566
567	FMADD	f11, f27, f30, f11
568	FMADD	f12, f24, f31, f12
569	FMADD	f13, f25, f31, f13
570	FMADD	f14, f26, f31, f14
571
/* Optional nop padding in the LL(12) loop body.                           */
/* NOTE(review): 1024976 is not a power of two (1 MiB would be 1048576),   */
/* so these nops are assembled only when L2_SIZE is defined as exactly     */
/* 1024976 -- confirm the intended value before changing it.               */
572#if (L2_SIZE == 1024976) && defined (ALLOC_HUGETLB)
573	nop
574	nop
575	nop
576	nop
577#endif
578
579#ifdef POWER5
580	LFD	f28, 20 * SIZE(BO)
581	LFD	f29, 21 * SIZE(BO)
582	LFD	f30, 22 * SIZE(BO)
583	LFD	f31, 23 * SIZE(BO)
584#endif
585
586	addi	AO, AO, 16 * SIZE
587	addi	BO, BO, 16 * SIZE
588
589#ifdef PPC970
590#ifndef ALLOC_HUGETLB
591	DCBT(AO, PREA)
592#endif
593	DCBT(BO, PREB)
594#endif
595
596#ifdef POWER4
597#ifndef ALLOC_HUGETLB
598	DCBT(AO, PREA)
599#endif
600	DCBT(BO, PREB)
601#endif
602
603#ifdef POWER5
604#ifndef ALLOC_HUGETLB
605	DCBT(BO, PREB)
606	DCBT(AO, PREA)
607#endif
608#endif
609	bdnz	LL(12)
610	.align 4
611
612LL(15):
613	lfd	f30,  ALPHA
614
615#if defined(TRMMKERNEL)
616
617#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
618	sub	TEMP, K, KK
619#elif defined(LEFT)
620	addi	TEMP, KK, 4
621#else
622	addi	TEMP, KK, 4
623#endif
624
625	andi.	TEMP,  TEMP,  3
626	mtspr	CTR, TEMP
627#else
628
629	andi.	r0,  K,  3
630	mtspr	CTR, r0
631
632#endif
633	ble+	LL(18)
634	.align 4
635
636LL(16):
637	FMADD	f0,  f16, f20, f0
638	FMADD	f5,  f17, f21, f5
639	FMADD	f10, f18, f22, f10
640	FMADD	f15, f19, f23, f15
641
642	FMADD	f1,  f17, f20, f1
643	FMADD	f2,  f18, f20, f2
644	FMADD	f3,  f19, f20, f3
645	FMADD	f4,  f16, f21, f4
646
647	FMADD	f6,  f18, f21, f6
648	FMADD	f7,  f19, f21, f7
649	FMADD	f8,  f16, f22, f8
650	FMADD	f9,  f17, f22, f9
651
652	FMADD	f11, f19, f22, f11
653	FMADD	f12, f16, f23, f12
654	FMADD	f13, f17, f23, f13
655	FMADD	f14, f18, f23, f14
656
657	LFD	f16,  4 * SIZE(AO)
658	LFD	f17,  5 * SIZE(AO)
659	LFD	f18,  6 * SIZE(AO)
660	LFD	f19,  7 * SIZE(AO)
661
662	LFD	f20,  4 * SIZE(BO)
663	LFD	f21,  5 * SIZE(BO)
664	LFD	f22,  6 * SIZE(BO)
665	LFD	f23,  7 * SIZE(BO)
666
667	addi	BO, BO,  4 * SIZE
668	addi	AO, AO,  4 * SIZE
669	bdnz	LL(16)
670	.align 4
671
672LL(18):
673#ifndef TRMMKERNEL
674	LFD	f16, 0 * SIZE(CO1)
675	LFD	f17, 1 * SIZE(CO1)
676	LFD	f18, 2 * SIZE(CO1)
677	LFD	f19, 3 * SIZE(CO1)
678
679	LFD	f20, 0 * SIZE(CO2)
680	LFD	f21, 1 * SIZE(CO2)
681	LFD	f22, 2 * SIZE(CO2)
682	LFD	f23, 3 * SIZE(CO2)
683
684	FMADD	f0,  f0, f30, f16
685	FMADD	f1,  f1, f30, f17
686	FMADD	f2,  f2, f30, f18
687	FMADD	f3,  f3, f30, f19
688
689	FMADD	f4,  f4, f30, f20
690	FMADD	f5,  f5, f30, f21
691	FMADD	f6,  f6, f30, f22
692	FMADD	f7,  f7, f30, f23
693
694	LFD	f16, 0 * SIZE(CO3)
695	LFD	f17, 1 * SIZE(CO3)
696	LFD	f18, 2 * SIZE(CO3)
697	LFD	f19, 3 * SIZE(CO3)
698
699	LFD	f20, 0 * SIZE(CO4)
700	LFD	f21, 1 * SIZE(CO4)
701	LFD	f22, 2 * SIZE(CO4)
702	LFD	f23, 3 * SIZE(CO4)
703
704	FMADD	f8,  f8,  f30, f16
705	FMADD	f9,  f9,  f30, f17
706	FMADD	f10, f10, f30, f18
707	FMADD	f11, f11, f30, f19
708
709	FMADD	f12, f12, f30, f20
710	FMADD	f13, f13, f30, f21
711	FMADD	f14, f14, f30, f22
712	FMADD	f15, f15, f30, f23
713
714#else
715
716	FMUL	f0,  f0, f30
717	FMUL	f1,  f1, f30
718	FMUL	f2,  f2, f30
719	FMUL	f3,  f3, f30
720
721	FMUL	f4,  f4, f30
722	FMUL	f5,  f5, f30
723	FMUL	f6,  f6, f30
724	FMUL	f7,  f7, f30
725
726	FMUL	f8,  f8,  f30
727	FMUL	f9,  f9,  f30
728	FMUL	f10, f10, f30
729	FMUL	f11, f11, f30
730
731	FMUL	f12, f12, f30
732	FMUL	f13, f13, f30
733	FMUL	f14, f14, f30
734	FMUL	f15, f15, f30
735#endif
736
737	STFD	f0,  0 * SIZE(CO1)
738	STFD	f1,  1 * SIZE(CO1)
739	STFD	f2,  2 * SIZE(CO1)
740	STFD	f3,  3 * SIZE(CO1)
741
742	lfs	f0,  FZERO
743 	fmr	f1,  f0
744	fmr	f2,  f0
745	fmr	f3,  f0
746
747	STFD	f4,  0 * SIZE(CO2)
748	STFD	f5,  1 * SIZE(CO2)
749	STFD	f6,  2 * SIZE(CO2)
750	STFD	f7,  3 * SIZE(CO2)
751
752	fmr	f4,  f0
753	fmr	f5,  f0
754	fmr	f6,  f0
755	fmr	f7,  f0
756
757	STFD	f8,  0 * SIZE(CO3)
758	STFD	f9,  1 * SIZE(CO3)
759	STFD	f10, 2 * SIZE(CO3)
760	STFD	f11, 3 * SIZE(CO3)
761
762	fmr	f8,  f0
763	fmr	f9,  f0
764	fmr	f10, f0
765	fmr	f11, f0
766
767	STFD	f12, 0 * SIZE(CO4)
768	STFD	f13, 1 * SIZE(CO4)
769	STFD	f14, 2 * SIZE(CO4)
770	STFD	f15, 3 * SIZE(CO4)
771
772	fmr	f12, f0
773	fmr	f13, f0
774	fmr	f14, f0
775	fmr	f15, f0
776
777	addi	CO1, CO1, 4 * SIZE
778	addi	CO2, CO2, 4 * SIZE
779	addi	CO3, CO3, 4 * SIZE
780	addi	CO4, CO4, 4 * SIZE
781
782#ifdef TRMMKERNEL
783#if ( defined(LEFT) &&  defined(TRANSA)) || \
784    (!defined(LEFT) && !defined(TRANSA))
785	sub	TEMP, K, KK
786#ifdef LEFT
787	addi	TEMP, TEMP, -4
788#else
789	addi	TEMP, TEMP, -4
790#endif
791	slwi	TEMP, TEMP, 2 + BASE_SHIFT
792	add	AO, AO, TEMP
793	add	BO, BO, TEMP
794#endif
795
796#ifdef LEFT
797	addi	KK, KK, 4
798#endif
799#endif
800
801	addic.	I, I, -1
802	bgt+	LL(11)
803	.align 4
804
805LL(20):
806	andi.	I,  M,  2
807	ble	LL(30)
808
809#if defined(TRMMKERNEL)
810#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
811
812	LFD	f16,  0 * SIZE(AO)
813	LFD	f17,  1 * SIZE(AO)
814	LFD	f18,  2 * SIZE(AO)
815	LFD	f19,  3 * SIZE(AO)
816
817	LFD	f20,  0 * SIZE(B)
818	LFD	f21,  1 * SIZE(B)
819	LFD	f22,  2 * SIZE(B)
820	LFD	f23,  3 * SIZE(B)
821
822	LFD	f24,  4 * SIZE(B)
823	LFD	f25,  5 * SIZE(B)
824	LFD	f26,  6 * SIZE(B)
825	LFD	f27,  7 * SIZE(B)
826
827	mr	BO,  B
828#else
829	slwi	r0,   KK, 1 + BASE_SHIFT
830	slwi	TEMP, KK, 2 + BASE_SHIFT
831	add	AO, AO, r0
832	add	BO, B,  TEMP
833
834	LFD	f16,  0 * SIZE(AO)
835	LFD	f17,  1 * SIZE(AO)
836	LFD	f18,  2 * SIZE(AO)
837	LFD	f19,  3 * SIZE(AO)
838
839	LFD	f20,  0 * SIZE(BO)
840	LFD	f21,  1 * SIZE(BO)
841	LFD	f22,  2 * SIZE(BO)
842	LFD	f23,  3 * SIZE(BO)
843
844	LFD	f24,  4 * SIZE(BO)
845	LFD	f25,  5 * SIZE(BO)
846	LFD	f26,  6 * SIZE(BO)
847	LFD	f27,  7 * SIZE(BO)
848#endif
849
850#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
851	sub	TEMP, K, KK
852#elif defined(LEFT)
853	addi	TEMP, KK, 2
854#else
855	addi	TEMP, KK, 4
856#endif
857	srawi.	TEMP,  TEMP,  2
858	mtspr	CTR, TEMP
859
860#else
861	LFD	f16,  0 * SIZE(AO)
862	LFD	f17,  1 * SIZE(AO)
863	LFD	f18,  2 * SIZE(AO)
864	LFD	f19,  3 * SIZE(AO)
865
866	LFD	f20,  0 * SIZE(B)
867	LFD	f21,  1 * SIZE(B)
868	LFD	f22,  2 * SIZE(B)
869	LFD	f23,  3 * SIZE(B)
870
871	LFD	f24,  4 * SIZE(B)
872	LFD	f25,  5 * SIZE(B)
873	LFD	f26,  6 * SIZE(B)
874	LFD	f27,  7 * SIZE(B)
875
876	srawi.	r0,  K,  2
877	mtspr	CTR, r0
878	mr	BO,  B
879#endif
880	ble	LL(25)
881	.align 5
882
883LL(22):
884	FMADD	f0,  f16, f20, f0
885	FMADD	f1,  f17, f20, f1
886	FMADD	f4,  f16, f21, f4
887	FMADD	f5,  f17, f21, f5
888
889	FMADD	f8,  f16, f22, f8
890	FMADD	f9,  f17, f22, f9
891	FMADD	f12, f16, f23, f12
892	FMADD	f13, f17, f23, f13
893
894	LFD	f20,  8 * SIZE(BO)
895	LFD	f21,  9 * SIZE(BO)
896	LFD	f22, 10 * SIZE(BO)
897	LFD	f23, 11 * SIZE(BO)
898
899	FMADD	f2,  f18, f24, f2
900	FMADD	f3,  f19, f24, f3
901	FMADD	f6,  f18, f25, f6
902	FMADD	f7,  f19, f25, f7
903
904	FMADD	f10, f18, f26, f10
905	FMADD	f11, f19, f26, f11
906	FMADD	f14, f18, f27, f14
907	FMADD	f15, f19, f27, f15
908
909	LFD	f16,  4 * SIZE(AO)
910	LFD	f17,  5 * SIZE(AO)
911	LFD	f18,  6 * SIZE(AO)
912	LFD	f19,  7 * SIZE(AO)
913
914	FMADD	f0,  f16, f20, f0
915	FMADD	f1,  f17, f20, f1
916	FMADD	f4,  f16, f21, f4
917	FMADD	f5,  f17, f21, f5
918
919	LFD	f24, 12 * SIZE(BO)
920	LFD	f25, 13 * SIZE(BO)
921	LFD	f26, 14 * SIZE(BO)
922	LFD	f27, 15 * SIZE(BO)
923
924	FMADD	f8,  f16, f22, f8
925	FMADD	f9,  f17, f22, f9
926	FMADD	f12, f16, f23, f12
927	FMADD	f13, f17, f23, f13
928
929	LFD	f20, 16 * SIZE(BO)
930	LFD	f21, 17 * SIZE(BO)
931	LFD	f22, 18 * SIZE(BO)
932	LFD	f23, 19 * SIZE(BO)
933
934	FMADD	f2,  f18, f24, f2
935	FMADD	f3,  f19, f24, f3
936	FMADD	f6,  f18, f25, f6
937	FMADD	f7,  f19, f25, f7
938
939	FMADD	f10, f18, f26, f10
940	FMADD	f11, f19, f26, f11
941	FMADD	f14, f18, f27, f14
942	FMADD	f15, f19, f27, f15
943
944	LFD	f16,  8 * SIZE(AO)
945	LFD	f17,  9 * SIZE(AO)
946	LFD	f18, 10 * SIZE(AO)
947	LFD	f19, 11 * SIZE(AO)
948
949	LFD	f24, 20 * SIZE(BO)
950	LFD	f25, 21 * SIZE(BO)
951	LFD	f26, 22 * SIZE(BO)
952	LFD	f27, 23 * SIZE(BO)
953
954	addi	AO, AO,  8 * SIZE
955	addi	BO, BO, 16 * SIZE
956	DCBT(BO, PREB)
957	bdnz	LL(22)
958
959	fadd	f0,  f2,  f0
960	fadd	f1,  f3,  f1
961	fadd	f4,  f6,  f4
962	fadd	f5,  f7,  f5
963	fadd	f8,  f10, f8
964	fadd	f9,  f11, f9
965	fadd	f12, f14, f12
966	fadd	f13, f15, f13
967	.align 4
968
969LL(25):
970	lfd	f30,  ALPHA
971
972#if   defined(TRMMKERNEL)
973
974#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
975	sub	TEMP, K, KK
976#elif defined(LEFT)
977	addi	TEMP, KK, 2
978#else
979	addi	TEMP, KK, 4
980#endif
981	andi.	TEMP,  TEMP,  3
982	mtspr	CTR, TEMP
983
984#else
985
986	andi.	r0,  K,  3
987	mtspr	CTR, r0
988
989#endif
990	ble+	LL(28)
991	.align 4
992
993LL(26):
994	FMADD	f0,  f16, f20, f0
995	FMADD	f1,  f17, f20, f1
996	FMADD	f4,  f16, f21, f4
997	FMADD	f5,  f17, f21, f5
998
999	FMADD	f8,  f16, f22, f8
1000	FMADD	f9,  f17, f22, f9
1001	FMADD	f12, f16, f23, f12
1002	FMADD	f13, f17, f23, f13
1003
1004	LFD	f16,  2 * SIZE(AO)
1005	LFD	f17,  3 * SIZE(AO)
1006
1007	LFD	f20,  4 * SIZE(BO)
1008	LFD	f21,  5 * SIZE(BO)
1009	LFD	f22,  6 * SIZE(BO)
1010	LFD	f23,  7 * SIZE(BO)
1011
1012	addi	BO, BO,  4 * SIZE
1013	addi	AO, AO,  2 * SIZE
1014	bdnz	LL(26)
1015	.align 4
1016
1017LL(28):
1018#ifndef TRMMKERNEL
1019	LFD	f16, 0 * SIZE(CO1)
1020	LFD	f17, 1 * SIZE(CO1)
1021	LFD	f18, 0 * SIZE(CO2)
1022	LFD	f19, 1 * SIZE(CO2)
1023
1024	FMADD	f0,  f0, f30, f16
1025	FMADD	f1,  f1, f30, f17
1026	FMADD	f4,  f4, f30, f18
1027	FMADD	f5,  f5, f30, f19
1028
1029	LFD	f20, 0 * SIZE(CO3)
1030	LFD	f21, 1 * SIZE(CO3)
1031	LFD	f22, 0 * SIZE(CO4)
1032	LFD	f23, 1 * SIZE(CO4)
1033
1034	FMADD	f8,  f8,  f30, f20
1035	FMADD	f9,  f9,  f30, f21
1036	FMADD	f12, f12, f30, f22
1037	FMADD	f13, f13, f30, f23
1038#else
1039	FMUL	f0,  f0, f30
1040	FMUL	f1,  f1, f30
1041	FMUL	f4,  f4, f30
1042	FMUL	f5,  f5, f30
1043
1044	FMUL	f8,  f8,  f30
1045	FMUL	f9,  f9,  f30
1046	FMUL	f12, f12, f30
1047	FMUL	f13, f13, f30
1048#endif
1049
1050	STFD	f0,  0 * SIZE(CO1)
1051	STFD	f1,  1 * SIZE(CO1)
1052	STFD	f4,  0 * SIZE(CO2)
1053	STFD	f5,  1 * SIZE(CO2)
1054
1055	lfs	f0,  FZERO
1056 	fmr	f1,  f0
1057	fmr	f2,  f0
1058	fmr	f3,  f0
1059
1060	STFD	f8,  0 * SIZE(CO3)
1061	STFD	f9,  1 * SIZE(CO3)
1062	STFD	f12, 0 * SIZE(CO4)
1063	STFD	f13, 1 * SIZE(CO4)
1064
1065	fmr	f4,  f0
1066	fmr	f5,  f0
1067	fmr	f6,  f0
1068	fmr	f7,  f0
1069
1070	fmr	f8,  f0
1071	fmr	f9,  f0
1072	fmr	f10, f0
1073	fmr	f11, f0
1074
1075	fmr	f12, f0
1076	fmr	f13, f0
1077	fmr	f14, f0
1078	fmr	f15, f0
1079
1080	addi	CO1, CO1, 2 * SIZE
1081	addi	CO2, CO2, 2 * SIZE
1082	addi	CO3, CO3, 2 * SIZE
1083	addi	CO4, CO4, 2 * SIZE
1084
1085#ifdef TRMMKERNEL
1086#if ( defined(LEFT) &&  defined(TRANSA)) || \
1087    (!defined(LEFT) && !defined(TRANSA))
1088	sub	TEMP, K, KK
1089#ifdef LEFT
1090	addi	TEMP, TEMP, -2
1091#else
1092	addi	TEMP, TEMP, -4
1093#endif
1094	slwi	r0,   TEMP, 1 + BASE_SHIFT
1095	slwi	TEMP, TEMP, 2 + BASE_SHIFT
1096	add	AO, AO, r0
1097	add	BO, BO, TEMP
1098#endif
1099
1100#ifdef LEFT
1101	addi	KK, KK, 2
1102#endif
1103#endif
1104	.align 4
1105
1106LL(30):
1107	andi.	I,  M,  1
1108	ble	LL(39)
1109
1110#if   defined(TRMMKERNEL)
1111
1112#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1113
1114	LFD	f16,  0 * SIZE(AO)
1115	LFD	f17,  1 * SIZE(AO)
1116	LFD	f18,  2 * SIZE(AO)
1117	LFD	f19,  3 * SIZE(AO)
1118
1119	LFD	f20,  0 * SIZE(B)
1120	LFD	f21,  1 * SIZE(B)
1121	LFD	f22,  2 * SIZE(B)
1122	LFD	f23,  3 * SIZE(B)
1123
1124	LFD	f24,  4 * SIZE(B)
1125	LFD	f25,  5 * SIZE(B)
1126	LFD	f26,  6 * SIZE(B)
1127	LFD	f27,  7 * SIZE(B)
1128
1129	mr	BO,  B
1130#else
1131	slwi	r0,   KK, 0 + BASE_SHIFT
1132	slwi	TEMP, KK, 2 + BASE_SHIFT
1133	add	AO, AO, r0
1134	add	BO, B,  TEMP
1135
1136	LFD	f16,  0 * SIZE(AO)
1137	LFD	f17,  1 * SIZE(AO)
1138	LFD	f18,  2 * SIZE(AO)
1139	LFD	f19,  3 * SIZE(AO)
1140
1141	LFD	f20,  0 * SIZE(BO)
1142	LFD	f21,  1 * SIZE(BO)
1143	LFD	f22,  2 * SIZE(BO)
1144	LFD	f23,  3 * SIZE(BO)
1145
1146	LFD	f24,  4 * SIZE(BO)
1147	LFD	f25,  5 * SIZE(BO)
1148	LFD	f26,  6 * SIZE(BO)
1149	LFD	f27,  7 * SIZE(BO)
1150#endif
1151
1152#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1153	sub	TEMP, K, KK
1154#elif defined(LEFT)
1155	addi	TEMP, KK, 1
1156#else
1157	addi	TEMP, KK, 4
1158#endif
1159
1160	srawi.	TEMP,  TEMP,  2
1161	mtspr	CTR, TEMP
1162
1163#else
1164	LFD	f16,  0 * SIZE(AO)
1165	LFD	f17,  1 * SIZE(AO)
1166	LFD	f18,  2 * SIZE(AO)
1167	LFD	f19,  3 * SIZE(AO)
1168
1169	LFD	f20,  0 * SIZE(B)
1170	LFD	f21,  1 * SIZE(B)
1171	LFD	f22,  2 * SIZE(B)
1172	LFD	f23,  3 * SIZE(B)
1173
1174	LFD	f24,  4 * SIZE(B)
1175	LFD	f25,  5 * SIZE(B)
1176	LFD	f26,  6 * SIZE(B)
1177	LFD	f27,  7 * SIZE(B)
1178
1179	srawi.	r0,  K,  2
1180	mtspr	CTR, r0
1181	mr	BO,  B
1182#endif
1183	ble	LL(35)
1184	.align 5
1185
1186LL(32):
1187	FMADD	f0,  f16, f20, f0
1188	FMADD	f4,  f16, f21, f4
1189	FMADD	f8,  f16, f22, f8
1190	FMADD	f12, f16, f23, f12
1191
1192	LFD	f20,  8 * SIZE(BO)
1193	LFD	f21,  9 * SIZE(BO)
1194	LFD	f22, 10 * SIZE(BO)
1195	LFD	f23, 11 * SIZE(BO)
1196
1197	FMADD	f1,  f17, f24, f1
1198	FMADD	f5,  f17, f25, f5
1199	FMADD	f9,  f17, f26, f9
1200	FMADD	f13, f17, f27, f13
1201
1202	LFD	f24, 12 * SIZE(BO)
1203	LFD	f25, 13 * SIZE(BO)
1204	LFD	f26, 14 * SIZE(BO)
1205	LFD	f27, 15 * SIZE(BO)
1206
1207	FMADD	f0,  f18, f20, f0
1208	FMADD	f4,  f18, f21, f4
1209	FMADD	f8,  f18, f22, f8
1210	FMADD	f12, f18, f23, f12
1211
1212	LFD	f20, 16 * SIZE(BO)
1213	LFD	f21, 17 * SIZE(BO)
1214	LFD	f22, 18 * SIZE(BO)
1215	LFD	f23, 19 * SIZE(BO)
1216
1217	FMADD	f1,  f19, f24, f1
1218	FMADD	f5,  f19, f25, f5
1219	FMADD	f9,  f19, f26, f9
1220	FMADD	f13, f19, f27, f13
1221
1222	LFD	f16,  4 * SIZE(AO)
1223	LFD	f17,  5 * SIZE(AO)
1224	LFD	f18,  6 * SIZE(AO)
1225	LFD	f19,  7 * SIZE(AO)
1226
1227	LFD	f24, 20 * SIZE(BO)
1228	LFD	f25, 21 * SIZE(BO)
1229	LFD	f26, 22 * SIZE(BO)
1230	LFD	f27, 23 * SIZE(BO)
1231
1232	addi	AO, AO,  4 * SIZE
1233	addi	BO, BO, 16 * SIZE
1234	DCBT(BO, PREB)
1235	bdnz	LL(32)
1236
1237	fadd	f0,  f1,   f0
1238	fadd	f4,  f5,   f4
1239	fadd	f8,  f9,   f8
1240	fadd	f12, f13, f12
1241	.align 4
1242
1243LL(35):
1244	lfd	f30,  ALPHA
1245#if  defined(TRMMKERNEL)
1246
1247#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1248	sub	TEMP, K, KK
1249#elif defined(LEFT)
1250	addi	TEMP, KK, 1
1251#else
1252	addi	TEMP, KK, 4
1253#endif
1254	andi.	TEMP,  TEMP,  3
1255	mtspr	CTR, TEMP
1256
1257#else
1258	andi.	r0,  K,  3
1259	mtspr	CTR, r0
1260
1261#endif
1262	ble+	LL(38)
1263	.align 4
1264
/* LL(36): remainder loop over K for the one-row, four-column tile.        */
/* CTR holds the trip count (set at LL(35)).  f16 holds the current        */
/* element from AO and f20-f23 the four current elements from BO.          */
1265LL(36):
/* Accumulate f16 * {f20, f21, f22, f23} into f0, f4, f8 and f12. */
1266	FMADD	f0,  f16, f20, f0
1267	FMADD	f4,  f16, f21, f4
1268	FMADD	f8,  f16, f22, f8
1269	FMADD	f12, f16, f23, f12
1270
/* Fetch the operands for the next iteration before bumping the pointers. */
1271	LFD	f16,  1 * SIZE(AO)
1272
1273	LFD	f20,  4 * SIZE(BO)
1274	LFD	f21,  5 * SIZE(BO)
1275	LFD	f22,  6 * SIZE(BO)
1276	LFD	f23,  7 * SIZE(BO)
1277
/* One K step consumes four elements of B and one element of A. */
1278	addi	BO, BO,  4 * SIZE
1279	addi	AO, AO,  1 * SIZE
1280	bdnz	LL(36)
1281	.align 4
1282
/* LL(38): write back the one-row, four-column tile (the M & 1 tail).      */
/* On entry f0, f4, f8 and f12 hold the four accumulated products and f30  */
/* holds the value reloaded from ALPHA at LL(35).                          */
1283LL(38):
1284#ifndef TRMMKERNEL
/* GEMM: result = accumulator * f30 + the value already stored in C. */
1285	LFD	f16, 0 * SIZE(CO1)
1286	LFD	f18, 0 * SIZE(CO2)
1287	LFD	f20, 0 * SIZE(CO3)
1288	LFD	f22, 0 * SIZE(CO4)
1289
1290	FMADD	f0,  f0,  f30, f16
1291	FMADD	f4,  f4,  f30, f18
1292	FMADD	f8,  f8,  f30, f20
1293	FMADD	f12, f12, f30, f22
1294#else
/* TRMM: result = accumulator * f30; C is not read. */
1295	FMUL	f0,  f0,  f30
1296	FMUL	f4,  f4,  f30
1297	FMUL	f8,  f8,  f30
1298	FMUL	f12, f12, f30
1299#endif
1300
1301	STFD	f0,  0 * SIZE(CO1)
1302	STFD	f4,  0 * SIZE(CO2)
1303	STFD	f8,  0 * SIZE(CO3)
1304	STFD	f12, 0 * SIZE(CO4)
1305
/* Clear the accumulators this tile used (f0, f1, f4, f5, f8, f9, f12, f13). */
1306	lfs	f0,  FZERO
1307 	fmr	f1,  f0
1308	fmr	f4,  f0
1309	fmr	f5,  f0
1310
1311	fmr	f8,  f0
1312	fmr	f9,  f0
1313	fmr	f12, f0
1314	fmr	f13, f0
1315
1316#ifdef TRMMKERNEL
1317#if ( defined(LEFT) &&  defined(TRANSA)) || \
1318    (!defined(LEFT) && !defined(TRANSA))
/* Skip the part of the packed panels this tile did not traverse:          */
/* TEMP = K - KK - (1 for LEFT, 4 otherwise); AO advances by TEMP elements */
/* (one row) and BO by 4 * TEMP elements (four columns).                   */
1319	sub	TEMP, K, KK
1320#ifdef LEFT
1321	addi	TEMP, TEMP, -1
1322#else
1323	addi	TEMP, TEMP, -4
1324#endif
1325	slwi	r0,   TEMP, 0 + BASE_SHIFT
1326	slwi	TEMP, TEMP, 2 + BASE_SHIFT
1327	add	AO, AO, r0
1328	add	BO, BO, TEMP
1329#endif
1330
1331#ifdef LEFT
/* This tile covers a single row, so KK advances by 1 -- matching the +4   */
/* and +2 steps taken after the four-row and two-row tiles.  The previous  */
/* +2 step was inconsistent with the row count of this tile.               */
1332	addi	KK, KK, 1
1333#endif
1334#endif
1335	.align 4
1336
1337
/* LL(39): end of one four-column panel. */
1338LL(39):
/* Non-LEFT TRMM: KK advances by the four columns just processed. */
1339#if defined(TRMMKERNEL) && !defined(LEFT)
1340	addi	KK, KK, 4
1341#endif
1342
/* B continues from where the inner loops left BO; then decrement the      */
/* panel counter and loop back to LL(10) while it is still positive.       */
1343	mr	B,  BO
1344	addic.	J, J, -1
1345	bgt	LL(10)
1346	.align 4
1347
/* LL(40): setup for the two-column panel (taken when N & 2 is set).       */
/* Uses two output pointers and the eight accumulators f0-f7.              */
1348LL(40):
1349	mr	CO1, C
1350	add	CO2, C,  LDC
/* andi. yields a non-negative result, so ble is taken exactly when the    */
/* bit is clear; LL(70) lies outside this excerpt.                         */
1351	andi.	J, N,  2
1352	ble	LL(70)
1353
/* LEFT TRMM restarts KK from OFFSET for this panel. */
1354#if defined(TRMMKERNEL) && defined(LEFT)
1355	mr	KK, OFFSET
1356#endif
1357
1358
/* Read the zero word stored at FZERO as 0.0 and copy it into f1-f7. */
1359	lfs	f0,  FZERO
1360 	fmr	f1,  f0
1361	fmr	f2,  f0
1362	fmr	f3,  f0
1363	fmr	f4,  f0
1364	fmr	f5,  f0
1365	fmr	f6,  f0
1366	fmr	f7,  f0
1367
/* I = M >> 2 sets CR0; add and mr leave it untouched, so the ble below    */
/* tests I <= 0.  C is advanced past the two columns of this panel.        */
1368	srawi.	I, M,  2
1369	add	C,  CO2, LDC
1370	mr	AO, A
1371	ble	LL(50)
1372	.align 4
1373
1374LL(41):
1375#if defined(TRMMKERNEL)
1376#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1377
1378	LFD	f16,  0 * SIZE(AO)
1379	LFD	f17,  1 * SIZE(AO)
1380	LFD	f18,  2 * SIZE(AO)
1381	LFD	f19,  3 * SIZE(AO)
1382
1383	LFD	f20,  0 * SIZE(B)
1384	LFD	f21,  1 * SIZE(B)
1385	LFD	f22,  2 * SIZE(B)
1386	LFD	f23,  3 * SIZE(B)
1387
1388	mr	BO,  B
1389#else
1390	slwi	r0,   KK, 2 + BASE_SHIFT
1391	slwi	TEMP, KK, 1 + BASE_SHIFT
1392	add	AO, AO, r0
1393	add	BO, B,  TEMP
1394
1395	LFD	f16,  0 * SIZE(AO)
1396	LFD	f17,  1 * SIZE(AO)
1397	LFD	f18,  2 * SIZE(AO)
1398	LFD	f19,  3 * SIZE(AO)
1399
1400	LFD	f20,  0 * SIZE(BO)
1401	LFD	f21,  1 * SIZE(BO)
1402	LFD	f22,  2 * SIZE(BO)
1403	LFD	f23,  3 * SIZE(BO)
1404#endif
1405
1406	DCBTST(CO1, PREC)
1407	DCBTST(CO2, PREC)
1408
1409#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1410	sub	TEMP, K, KK
1411#elif defined(LEFT)
1412	addi	TEMP, KK, 4
1413#else
1414	addi	TEMP, KK, 2
1415#endif
1416	srawi.	TEMP,  TEMP,  2
1417	mtspr	CTR, TEMP
1418
1419#else
1420
1421	LFD	f16,  0 * SIZE(AO)
1422	LFD	f17,  1 * SIZE(AO)
1423	LFD	f18,  2 * SIZE(AO)
1424	LFD	f19,  3 * SIZE(AO)
1425
1426	LFD	f20,  0 * SIZE(B)
1427	LFD	f21,  1 * SIZE(B)
1428	LFD	f22,  2 * SIZE(B)
1429	LFD	f23,  3 * SIZE(B)
1430
1431	DCBTST(CO1, PREC)
1432	DCBTST(CO2, PREC)
1433
1434	srawi.	r0,  K,  2
1435	mtspr	CTR, r0
1436	mr	BO,  B
1437#endif
1438	ble	LL(45)
1439	.align 5
1440
/* LL(42): unrolled accumulation loop for the tile that is four values
 * wide on the AO side and two values wide on the BO side.
 * f16-f19 hold four AO values, f20/f21 and f22/f23 hold two pairs of
 * BO values, and f0-f3 / f4-f7 are the two accumulator groups (one per
 * BO value of a pair).  One pass performs four multiply-accumulate
 * steps and consumes 16 AO elements and 8 BO elements; the loads at the
 * bottom fetch the operands for the next pass (or for the tail loop).
 * The pass count was placed in CTR before the loop was entered. */
1441LL(42):
	/* step 1: AO[0..3] times the first BO pair (f20, f21) */
1442	FMADD	f0,  f16, f20, f0
1443	FMADD	f1,  f17, f20, f1
1444	FMADD	f2,  f18, f20, f2
1445	FMADD	f3,  f19, f20, f3
1446
1447	FMADD	f4,  f16, f21, f4
1448	FMADD	f5,  f17, f21, f5
1449	FMADD	f6,  f18, f21, f6
1450	FMADD	f7,  f19, f21, f7
1451
1452	LFD	f16,  4 * SIZE(AO)
1453	LFD	f17,  5 * SIZE(AO)
1454	LFD	f18,  6 * SIZE(AO)
1455	LFD	f19,  7 * SIZE(AO)
1456
	/* step 2: AO[4..7] times the second BO pair (f22, f23) */
1457	FMADD	f0,  f16, f22, f0
1458	FMADD	f1,  f17, f22, f1
1459	FMADD	f2,  f18, f22, f2
1460	FMADD	f3,  f19, f22, f3
1461
1462	FMADD	f4,  f16, f23, f4
1463	FMADD	f5,  f17, f23, f5
1464	FMADD	f6,  f18, f23, f6
1465	FMADD	f7,  f19, f23, f7
1466
1467	LFD	f16,  8 * SIZE(AO)
1468	LFD	f17,  9 * SIZE(AO)
1469	LFD	f18, 10 * SIZE(AO)
1470	LFD	f19, 11 * SIZE(AO)
1471
1472	LFD	f20,  4 * SIZE(BO)
1473	LFD	f21,  5 * SIZE(BO)
1474	LFD	f22,  6 * SIZE(BO)
1475	LFD	f23,  7 * SIZE(BO)
1476
	/* step 3: AO[8..11] times BO[4], BO[5] */
1477	FMADD	f0,  f16, f20, f0
1478	FMADD	f1,  f17, f20, f1
1479	FMADD	f2,  f18, f20, f2
1480	FMADD	f3,  f19, f20, f3
1481
1482	FMADD	f4,  f16, f21, f4
1483	FMADD	f5,  f17, f21, f5
1484	FMADD	f6,  f18, f21, f6
1485	FMADD	f7,  f19, f21, f7
1486
1487	LFD	f16, 12 * SIZE(AO)
1488	LFD	f17, 13 * SIZE(AO)
1489	LFD	f18, 14 * SIZE(AO)
1490	LFD	f19, 15 * SIZE(AO)
1491
	/* step 4: AO[12..15] times BO[6], BO[7] */
1492	FMADD	f0,  f16, f22, f0
1493	FMADD	f1,  f17, f22, f1
1494	FMADD	f2,  f18, f22, f2
1495	FMADD	f3,  f19, f22, f3
1496
1497	FMADD	f4,  f16, f23, f4
1498	FMADD	f5,  f17, f23, f5
1499	FMADD	f6,  f18, f23, f6
1500	FMADD	f7,  f19, f23, f7
1501
	/* preload the operands of the next pass before advancing the pointers */
1502	LFD	f16, 16 * SIZE(AO)
1503	LFD	f17, 17 * SIZE(AO)
1504	LFD	f18, 18 * SIZE(AO)
1505	LFD	f19, 19 * SIZE(AO)
1506
1507	LFD	f20,  8 * SIZE(BO)
1508	LFD	f21,  9 * SIZE(BO)
1509	LFD	f22, 10 * SIZE(BO)
1510	LFD	f23, 11 * SIZE(BO)
1511
1512	addi	AO, AO, 16 * SIZE
1513	addi	BO, BO,  8 * SIZE
	/* DCBT is a macro defined outside this view -- presumably a data-cache
	 * touch (prefetch hint) at BO + PREB; confirm against its definition. */
1514	DCBT(BO, PREB)
	/* decrement CTR and repeat while it is non-zero */
1515	bdnz	LL(42)
1516	.align 4
1517
/* LL(45): load alpha into f30 and set CTR to the number of steps left
 * over after the four-step unrolled loop (step count & 3).
 * With TRMMKERNEL the step count is K - KK, KK + 4 or KK + 2 depending
 * on LEFT/TRANSA; otherwise it is K.  When the remainder is zero the
 * tail loop is skipped and control goes straight to LL(48). */
1518LL(45):
1519	lfd	f30,  ALPHA
1520#if defined(TRMMKERNEL)
1521
1522#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1523	sub	TEMP, K, KK
1524#elif defined(LEFT)
1525	addi	TEMP, KK, 4
1526#else
1527	addi	TEMP, KK, 2
1528#endif
	/* andi. also sets CR0, which the ble+ below tests */
1529	andi.	TEMP,  TEMP,  3
1530	mtspr	CTR, TEMP
1531#else
1532	andi.	r0,  K,  3
1533	mtspr	CTR, r0
1534#endif
1535	ble+	LL(48)
1536	.align 4
1537
/* LL(46): tail loop for the 4-by-2 tile, one multiply-accumulate step
 * per pass.  Uses the operands already sitting in f16-f19 and f20/f21,
 * then loads the next four AO values and next two BO values and
 * advances AO by 4 elements and BO by 2 elements.  Runs CTR times. */
1538LL(46):
1539	FMADD	f0,  f16, f20, f0
1540	FMADD	f1,  f17, f20, f1
1541	FMADD	f2,  f18, f20, f2
1542	FMADD	f3,  f19, f20, f3
1543
1544	FMADD	f4,  f16, f21, f4
1545	FMADD	f5,  f17, f21, f5
1546	FMADD	f6,  f18, f21, f6
1547	FMADD	f7,  f19, f21, f7
1548
1549	LFD	f16,  4 * SIZE(AO)
1550	LFD	f17,  5 * SIZE(AO)
1551	LFD	f18,  6 * SIZE(AO)
1552	LFD	f19,  7 * SIZE(AO)
1553
1554	LFD	f20,  2 * SIZE(BO)
1555	LFD	f21,  3 * SIZE(BO)
1556
1557	addi	BO, BO,  2 * SIZE
1558	addi	AO, AO,  4 * SIZE
1559	bdnz	LL(46)
1560	.align 4
1561
/* LL(48): write back the 4-by-2 tile.
 * Without TRMMKERNEL the result is acc * alpha + (existing value read
 * from CO1/CO2); with TRMMKERNEL it is acc * alpha only.  f0-f3 go to
 * CO1[0..3], f4-f7 go to CO2[0..3].  The accumulators are then cleared,
 * CO1/CO2 advance by four elements, the TRMM bookkeeping (AO/BO/KK) is
 * updated, and the loop over I branches back to LL(41) while I > 0. */
1562LL(48):
1563#ifndef TRMMKERNEL
1564	LFD	f16, 0 * SIZE(CO1)
1565	LFD	f17, 1 * SIZE(CO1)
1566	LFD	f18, 2 * SIZE(CO1)
1567	LFD	f19, 3 * SIZE(CO1)
1568
1569	LFD	f20, 0 * SIZE(CO2)
1570	LFD	f21, 1 * SIZE(CO2)
1571	LFD	f22, 2 * SIZE(CO2)
1572	LFD	f23, 3 * SIZE(CO2)
1573
1574	FMADD	f0,  f0, f30, f16
1575	FMADD	f1,  f1, f30, f17
1576	FMADD	f2,  f2, f30, f18
1577	FMADD	f3,  f3, f30, f19
1578
1579	FMADD	f4,  f4, f30, f20
1580	FMADD	f5,  f5, f30, f21
1581	FMADD	f6,  f6, f30, f22
1582	FMADD	f7,  f7, f30, f23
1583#else
1584	FMUL	f0,  f0, f30
1585	FMUL	f1,  f1, f30
1586	FMUL	f2,  f2, f30
1587	FMUL	f3,  f3, f30
1588
1589	FMUL	f4,  f4, f30
1590	FMUL	f5,  f5, f30
1591	FMUL	f6,  f6, f30
1592	FMUL	f7,  f7, f30
1593#endif
1594
1595	STFD	f0,  0 * SIZE(CO1)
1596	STFD	f1,  1 * SIZE(CO1)
1597	STFD	f2,  2 * SIZE(CO1)
1598	STFD	f3,  3 * SIZE(CO1)
1599
	/* reload f0 from FZERO (presumably a stored 0.0 -- defined outside this
	 * view) and copy it into the other accumulators for the next tile */
1600	lfs	f0,  FZERO
1601 	fmr	f1,  f0
1602	fmr	f2,  f0
1603	fmr	f3,  f0
1604
1605	STFD	f4,  0 * SIZE(CO2)
1606	STFD	f5,  1 * SIZE(CO2)
1607	STFD	f6,  2 * SIZE(CO2)
1608	STFD	f7,  3 * SIZE(CO2)
1609
1610	fmr	f4,  f0
1611	fmr	f5,  f0
1612	fmr	f6,  f0
1613	fmr	f7,  f0
1614
1615	addi	CO1, CO1, 4 * SIZE
1616	addi	CO2, CO2, 4 * SIZE
1617
1618#ifdef TRMMKERNEL
1619#if ( defined(LEFT) &&  defined(TRANSA)) || \
1620    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (4 if LEFT else 2); advance AO by TEMP * 4 elements
	 * and BO by TEMP * 2 elements (BASE_SHIFT is presumably log2 of the
	 * element size -- confirm against its definition). */
1621	sub	TEMP, K, KK
1622#ifdef LEFT
1623	addi	TEMP, TEMP, -4
1624#else
1625	addi	TEMP, TEMP, -2
1626#endif
1627	slwi	r0,   TEMP, 2 + BASE_SHIFT
1628	slwi	TEMP, TEMP, 1 + BASE_SHIFT
1629	add	AO, AO, r0
1630	add	BO, BO, TEMP
1631#endif
1632
1633#ifdef LEFT
1634	addi	KK, KK, 4
1635#endif
1636#endif
1637
	/* one 4-wide tile done: decrement I and repeat while it stays positive */
1638	addic.	I, I, -1
1639	bgt+	LL(41)
1640	.align 4
1641
/* LL(50): set up the 2-by-2 tile, executed only when bit 1 of M is set
 * (otherwise branch to LL(60)).
 * Preloads f16-f19 with AO[0..3] and f20-f27 with eight values from the
 * B panel, points BO at the panel start, and puts (step count >> 2) in
 * CTR.  With TRMMKERNEL the step count and the starting offsets of
 * AO/BO depend on KK and on LEFT/TRANSA; otherwise the count is K and
 * BO = B.  If the shifted count is not positive, the unrolled loop is
 * skipped (branch to LL(55)). */
1642LL(50):
1643	andi.	I,  M,  2
1644	ble	LL(60)
1645
1646#if defined(TRMMKERNEL)
1647
1648#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1649
1650	LFD	f16,  0 * SIZE(AO)
1651	LFD	f17,  1 * SIZE(AO)
1652	LFD	f18,  2 * SIZE(AO)
1653	LFD	f19,  3 * SIZE(AO)
1654
1655	LFD	f20,  0 * SIZE(B)
1656	LFD	f21,  1 * SIZE(B)
1657	LFD	f22,  2 * SIZE(B)
1658	LFD	f23,  3 * SIZE(B)
1659
1660	LFD	f24,  4 * SIZE(B)
1661	LFD	f25,  5 * SIZE(B)
1662	LFD	f26,  6 * SIZE(B)
1663	LFD	f27,  7 * SIZE(B)
1664
1665	mr	BO,  B
1666#else
	/* start KK steps into both panels: AO += KK * 2 elements,
	 * BO = B + KK * 2 elements */
1667	slwi	r0,   KK, 1 + BASE_SHIFT
1668	slwi	TEMP, KK, 1 + BASE_SHIFT
1669	add	AO, AO, r0
1670	add	BO, B,  TEMP
1671
1672	LFD	f16,  0 * SIZE(AO)
1673	LFD	f17,  1 * SIZE(AO)
1674	LFD	f18,  2 * SIZE(AO)
1675	LFD	f19,  3 * SIZE(AO)
1676
1677	LFD	f20,  0 * SIZE(BO)
1678	LFD	f21,  1 * SIZE(BO)
1679	LFD	f22,  2 * SIZE(BO)
1680	LFD	f23,  3 * SIZE(BO)
1681
1682	LFD	f24,  4 * SIZE(BO)
1683	LFD	f25,  5 * SIZE(BO)
1684	LFD	f26,  6 * SIZE(BO)
1685	LFD	f27,  7 * SIZE(BO)
1686#endif
1687
1688#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1689	sub	TEMP, K, KK
1690#elif defined(LEFT)
1691	addi	TEMP, KK, 2
1692#else
1693	addi	TEMP, KK, 2
1694#endif
1695	srawi.	TEMP,  TEMP,  2
1696	mtspr	CTR, TEMP
1697
1698#else
1699	LFD	f16,  0 * SIZE(AO)
1700	LFD	f17,  1 * SIZE(AO)
1701	LFD	f18,  2 * SIZE(AO)
1702	LFD	f19,  3 * SIZE(AO)
1703
1704	LFD	f20,  0 * SIZE(B)
1705	LFD	f21,  1 * SIZE(B)
1706	LFD	f22,  2 * SIZE(B)
1707	LFD	f23,  3 * SIZE(B)
1708
1709	LFD	f24,  4 * SIZE(B)
1710	LFD	f25,  5 * SIZE(B)
1711	LFD	f26,  6 * SIZE(B)
1712	LFD	f27,  7 * SIZE(B)
1713
1714	srawi.	r0,  K,  2
1715	mtspr	CTR, r0
1716	mr	BO,  B
1717#endif
1718	ble	LL(55)
1719	.align 5
1720
/* LL(52): unrolled accumulation loop for the 2-by-2 tile, four steps
 * per pass.  Odd and even steps use separate accumulator sets (f0-f3
 * and f4-f7) that are summed later at LL(58).  One pass consumes
 * 8 AO elements and 8 BO elements; operands for the following pass are
 * loaded while the current ones are being used. */
1721LL(52):
	/* step 1 (f16,f17 x f20,f21) into f0-f3; step 2 (f18,f19 x f22,f23) into f4-f7 */
1722	FMADD	f0,  f16, f20, f0
1723	FMADD	f1,  f17, f20, f1
1724	FMADD	f2,  f16, f21, f2
1725	FMADD	f3,  f17, f21, f3
1726
1727	FMADD	f4,  f18, f22, f4
1728	FMADD	f5,  f19, f22, f5
1729	FMADD	f6,  f18, f23, f6
1730	FMADD	f7,  f19, f23, f7
1731
1732	LFD	f16,  4 * SIZE(AO)
1733	LFD	f17,  5 * SIZE(AO)
1734	LFD	f18,  6 * SIZE(AO)
1735	LFD	f19,  7 * SIZE(AO)
1736
1737	LFD	f20,  8 * SIZE(BO)
1738	LFD	f21,  9 * SIZE(BO)
1739	LFD	f22, 10 * SIZE(BO)
1740	LFD	f23, 11 * SIZE(BO)
1741
	/* steps 3 and 4 use the BO values preloaded in f24-f27 */
1742	FMADD	f0,  f16, f24, f0
1743	FMADD	f1,  f17, f24, f1
1744	FMADD	f2,  f16, f25, f2
1745	FMADD	f3,  f17, f25, f3
1746
1747	FMADD	f4,  f18, f26, f4
1748	FMADD	f5,  f19, f26, f5
1749	FMADD	f6,  f18, f27, f6
1750	FMADD	f7,  f19, f27, f7
1751
1752	LFD	f16,  8 * SIZE(AO)
1753	LFD	f17,  9 * SIZE(AO)
1754	LFD	f18, 10 * SIZE(AO)
1755	LFD	f19, 11 * SIZE(AO)
1756
1757	LFD	f24, 12 * SIZE(BO)
1758	LFD	f25, 13 * SIZE(BO)
1759	LFD	f26, 14 * SIZE(BO)
1760	LFD	f27, 15 * SIZE(BO)
1761
1762	addi	AO, AO,  8 * SIZE
1763	addi	BO, BO,  8 * SIZE
1764	DCBT(BO, PREB)
1765	bdnz	LL(52)
1766	.align 4
1767
/* LL(55): load alpha into f30 and set CTR to the leftover step count
 * (step count & 3) for the 2-by-2 tile.  The step count is K, or for
 * TRMMKERNEL either K - KK or KK + 2.  A zero remainder skips the tail
 * loop and goes to LL(58). */
1768LL(55):
1769	lfd	f30,  ALPHA
1770#if defined(TRMMKERNEL)
1771
1772#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1773	sub	TEMP, K, KK
1774#elif defined(LEFT)
1775	addi	TEMP, KK, 2
1776#else
1777	addi	TEMP, KK, 2
1778#endif
1779	andi.	TEMP,  TEMP,  3
1780	mtspr	CTR, TEMP
1781
1782#else
1783	andi.	r0,  K,  3
1784	mtspr	CTR, r0
1785#endif
1786	ble+	LL(58)
1787	.align 4
1788
/* LL(56): tail loop for the 2-by-2 tile, one step per pass.
 * Accumulates f16,f17 x f20,f21 into f0-f3, then loads the next two AO
 * and two BO values and advances both pointers by two elements. */
1789LL(56):
1790	FMADD	f0,  f16, f20, f0
1791	FMADD	f1,  f17, f20, f1
1792	FMADD	f2,  f16, f21, f2
1793	FMADD	f3,  f17, f21, f3
1794
1795	LFD	f16,  2 * SIZE(AO)
1796	LFD	f17,  3 * SIZE(AO)
1797	LFD	f20,  2 * SIZE(BO)
1798	LFD	f21,  3 * SIZE(BO)
1799
1800	addi	BO, BO,  2 * SIZE
1801	addi	AO, AO,  2 * SIZE
1802	bdnz	LL(56)
1803	.align 4
1804
/* LL(58): write back the 2-by-2 tile.
 * The two partial accumulator sets are first combined (f0-f3 += f4-f7).
 * Without TRMMKERNEL the stored value is sum * alpha + existing value;
 * with TRMMKERNEL it is sum * alpha.  f0,f1 go to CO1[0..1] and f2,f3
 * go to CO2[0..1].  Afterwards f0-f7 are cleared, CO1/CO2 advance by two
 * elements and the TRMM bookkeeping (AO/BO/KK) is updated. */
1805LL(58):
1806#ifndef TRMMKERNEL
1807	LFD	f16, 0 * SIZE(CO1)
1808	LFD	f17, 1 * SIZE(CO1)
1809	LFD	f18, 0 * SIZE(CO2)
1810	LFD	f19, 1 * SIZE(CO2)
1811
1812	FADD	f0, f4,  f0
1813	FADD	f1, f5,  f1
1814	FADD	f2, f6,  f2
1815	FADD	f3, f7,  f3
1816
1817	FMADD	f0,  f0, f30, f16
1818	FMADD	f1,  f1, f30, f17
1819	FMADD	f2,  f2, f30, f18
1820	FMADD	f3,  f3, f30, f19
1821#else
1822	FADD	f0, f4,  f0
1823	FADD	f1, f5,  f1
1824	FADD	f2, f6,  f2
1825	FADD	f3, f7,  f3
1826
1827	FMUL	f0,  f0, f30
1828	FMUL	f1,  f1, f30
1829	FMUL	f2,  f2, f30
1830	FMUL	f3,  f3, f30
1831#endif
1832
1833	STFD	f0,  0 * SIZE(CO1)
1834	STFD	f1,  1 * SIZE(CO1)
1835	STFD	f2,  0 * SIZE(CO2)
1836	STFD	f3,  1 * SIZE(CO2)
1837
1838	lfs	f0,  FZERO
1839 	fmr	f1,  f0
1840	fmr	f2,  f0
1841	fmr	f3,  f0
1842
1843	fmr	f4,  f0
1844	fmr	f5,  f0
1845	fmr	f6,  f0
1846	fmr	f7,  f0
1847
1848	addi	CO1, CO1, 2 * SIZE
1849	addi	CO2, CO2, 2 * SIZE
1850
1851#ifdef TRMMKERNEL
1852#if ( defined(LEFT) &&  defined(TRANSA)) || \
1853    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - 2; advance AO and BO by TEMP * 2 elements each */
1854	sub	TEMP, K, KK
1855#ifdef LEFT
1856	addi	TEMP, TEMP, -2
1857#else
1858	addi	TEMP, TEMP, -2
1859#endif
1860	slwi	r0,   TEMP, 1 + BASE_SHIFT
1861	slwi	TEMP, TEMP, 1 + BASE_SHIFT
1862	add	AO, AO, r0
1863	add	BO, BO, TEMP
1864#endif
1865
1866#ifdef LEFT
1867	addi	KK, KK, 2
1868#endif
1869#endif
1870	.align 4
1871
/* LL(60): set up the 1-by-2 tile, executed only when bit 0 of M is set
 * (otherwise branch to LL(69)).
 * Preloads f16-f19 with AO[0..3] (four consecutive steps of the single
 * row) and f20-f27 with eight B-panel values, points BO at the panel,
 * and puts (step count >> 2) in CTR.  With TRMMKERNEL the count is
 * K - KK, KK + 1 or KK + 2 and AO/BO may start KK steps in.  If the
 * shifted count is not positive the unrolled loop is skipped. */
1872LL(60):
1873	andi.	I,  M,  1
1874	ble	LL(69)
1875
1876#if defined(TRMMKERNEL)
1877
1878#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1879
1880	LFD	f16,  0 * SIZE(AO)
1881	LFD	f17,  1 * SIZE(AO)
1882	LFD	f18,  2 * SIZE(AO)
1883	LFD	f19,  3 * SIZE(AO)
1884
1885	LFD	f20,  0 * SIZE(B)
1886	LFD	f21,  1 * SIZE(B)
1887	LFD	f22,  2 * SIZE(B)
1888	LFD	f23,  3 * SIZE(B)
1889
1890	LFD	f24,  4 * SIZE(B)
1891	LFD	f25,  5 * SIZE(B)
1892	LFD	f26,  6 * SIZE(B)
1893	LFD	f27,  7 * SIZE(B)
1894
1895	mr	BO,  B
1896#else
	/* AO += KK * 1 element, BO = B + KK * 2 elements */
1897	slwi	r0,   KK, 0 + BASE_SHIFT
1898	slwi	TEMP, KK, 1 + BASE_SHIFT
1899	add	AO, AO, r0
1900	add	BO, B,  TEMP
1901
1902	LFD	f16,  0 * SIZE(AO)
1903	LFD	f17,  1 * SIZE(AO)
1904	LFD	f18,  2 * SIZE(AO)
1905	LFD	f19,  3 * SIZE(AO)
1906
1907	LFD	f20,  0 * SIZE(BO)
1908	LFD	f21,  1 * SIZE(BO)
1909	LFD	f22,  2 * SIZE(BO)
1910	LFD	f23,  3 * SIZE(BO)
1911
1912	LFD	f24,  4 * SIZE(BO)
1913	LFD	f25,  5 * SIZE(BO)
1914	LFD	f26,  6 * SIZE(BO)
1915	LFD	f27,  7 * SIZE(BO)
1916#endif
1917
1918#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1919	sub	TEMP, K, KK
1920#elif defined(LEFT)
1921	addi	TEMP, KK, 1
1922#else
1923	addi	TEMP, KK, 2
1924#endif
1925	srawi.	TEMP,  TEMP,  2
1926	mtspr	CTR, TEMP
1927#else
1928	LFD	f16,  0 * SIZE(AO)
1929	LFD	f17,  1 * SIZE(AO)
1930	LFD	f18,  2 * SIZE(AO)
1931	LFD	f19,  3 * SIZE(AO)
1932
1933	LFD	f20,  0 * SIZE(B)
1934	LFD	f21,  1 * SIZE(B)
1935	LFD	f22,  2 * SIZE(B)
1936	LFD	f23,  3 * SIZE(B)
1937
1938	LFD	f24,  4 * SIZE(B)
1939	LFD	f25,  5 * SIZE(B)
1940	LFD	f26,  6 * SIZE(B)
1941	LFD	f27,  7 * SIZE(B)
1942
1943	srawi.	r0,  K,  2
1944	mtspr	CTR, r0
1945	mr	BO,  B
1946#endif
1947	ble	LL(65)
1948	.align 5
1949
/* LL(62): unrolled accumulation loop for the 1-by-2 tile, four steps
 * per pass.  f16-f19 are the AO values of the four steps; each is
 * multiplied by its own pair of BO values.  Steps 1 and 3 accumulate
 * into f0/f1, steps 2 and 4 into f2/f3 (combined later at LL(68)).
 * One pass consumes 4 AO elements and 8 BO elements. */
1950LL(62):
1951	FMADD	f0,  f16, f20, f0
1952	FMADD	f1,  f16, f21, f1
1953	FMADD	f2,  f17, f22, f2
1954	FMADD	f3,  f17, f23, f3
1955
1956	LFD	f20,  8 * SIZE(BO)
1957	LFD	f21,  9 * SIZE(BO)
1958	LFD	f22, 10 * SIZE(BO)
1959	LFD	f23, 11 * SIZE(BO)
1960
1961	FMADD	f0,  f18, f24, f0
1962	FMADD	f1,  f18, f25, f1
1963	FMADD	f2,  f19, f26, f2
1964	FMADD	f3,  f19, f27, f3
1965
1966	LFD	f16,  4 * SIZE(AO)
1967	LFD	f17,  5 * SIZE(AO)
1968	LFD	f18,  6 * SIZE(AO)
1969	LFD	f19,  7 * SIZE(AO)
1970
1971	LFD	f24, 12 * SIZE(BO)
1972	LFD	f25, 13 * SIZE(BO)
1973	LFD	f26, 14 * SIZE(BO)
1974	LFD	f27, 15 * SIZE(BO)
1975
1976	addi	AO, AO,  4 * SIZE
1977	addi	BO, BO,  8 * SIZE
1978	bdnz	LL(62)
1979	.align 4
1980
/* LL(65): load alpha into f30 and set CTR to the leftover step count
 * (step count & 3) for the 1-by-2 tile.  The count is K, or for
 * TRMMKERNEL one of K - KK, KK + 1, KK + 2.  A zero remainder skips the
 * tail loop and goes to LL(68). */
1981LL(65):
1982	lfd	f30,  ALPHA
1983
1984#if defined(TRMMKERNEL)
1985
1986#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1987	sub	TEMP, K, KK
1988#elif defined(LEFT)
1989	addi	TEMP, KK, 1
1990#else
1991	addi	TEMP, KK, 2
1992#endif
1993	andi.	TEMP,  TEMP,  3
1994	mtspr	CTR, TEMP
1995
1996#else
1997	andi.	r0,  K,  3
1998	mtspr	CTR, r0
1999
2000#endif
2001	ble+	LL(68)
2002	.align 4
2003
/* LL(66): tail loop for the 1-by-2 tile, one step per pass.
 * Accumulates f16 x (f20, f21) into f0/f1, then loads the next AO value
 * and the next BO pair; AO advances by one element, BO by two. */
2004LL(66):
2005	FMADD	f0,  f16, f20, f0
2006	FMADD	f1,  f16, f21, f1
2007
2008	LFD	f16,  1 * SIZE(AO)
2009
2010	LFD	f20,  2 * SIZE(BO)
2011	LFD	f21,  3 * SIZE(BO)
2012
2013	addi	BO, BO,  2 * SIZE
2014	addi	AO, AO,  1 * SIZE
2015	bdnz	LL(66)
2016	.align 4
2017
/* LL(68): write back the 1-by-2 tile.
 * Combines the partial sums (f0 += f2, f1 += f3), scales by alpha
 * (adding the existing CO1/CO2 values unless TRMMKERNEL is defined) and
 * stores f0 to CO1[0] and f1 to CO2[0].  Then updates the TRMM
 * bookkeeping (AO/BO/KK). */
2018LL(68):
2019#ifndef TRMMKERNEL
2020	LFD	f16, 0 * SIZE(CO1)
2021	LFD	f18, 0 * SIZE(CO2)
2022
2023	FADD	f0, f2, f0
2024	FADD	f1, f3, f1
2025
2026	FMADD	f0,  f0,  f30, f16
2027	FMADD	f1,  f1,  f30, f18
2028#else
2029	FADD	f0, f2, f0
2030	FADD	f1, f3, f1
2031
2032	FMUL	f0,  f0,  f30
2033	FMUL	f1,  f1,  f30
2034#endif
2035
2036	STFD	f0,  0 * SIZE(CO1)
2037	STFD	f1,  0 * SIZE(CO2)
2038
	/* NOTE(review): only f0, f1, f4 and f5 are cleared here; f2 and f3
	 * still hold partial sums.  The next consumer visible in this file,
	 * the section at LL(70), clears f0-f3 itself before using them. */
2039	lfs	f0,  FZERO
2040 	fmr	f1,  f0
2041	fmr	f4,  f0
2042	fmr	f5,  f0
2043
2044
2045#ifdef TRMMKERNEL
2046#if ( defined(LEFT) &&  defined(TRANSA)) || \
2047    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (1 if LEFT else 2); AO += TEMP * 1 element,
	 * BO += TEMP * 2 elements */
2048	sub	TEMP, K, KK
2049#ifdef LEFT
2050	addi	TEMP, TEMP, -1
2051#else
2052	addi	TEMP, TEMP, -2
2053#endif
2054	slwi	r0,   TEMP, 0 + BASE_SHIFT
2055	slwi	TEMP, TEMP, 1 + BASE_SHIFT
2056	add	AO, AO, r0
2057	add	BO, BO, TEMP
2058#endif
2059
2060#ifdef LEFT
2061	addi	KK, KK, 1
2062#endif
2063#endif
2064	.align 4
2065
/* LL(69): end of the two-column section.  For TRMMKERNEL without LEFT,
 * KK advances by 2.  B is then set to BO, i.e. to wherever the last
 * tile left the B-panel pointer. */
2066LL(69):
2067#if defined(TRMMKERNEL) && !defined(LEFT)
2068	addi	KK, KK, 2
2069#endif
2070
2071	mr	B,  BO
2072	.align 4
2073
/* LL(70): entry of the single-column section, executed only when bit 0
 * of N is set (otherwise branch to the exit at LL(999)).
 * Sets CO1 = C, resets KK to OFFSET for TRMMKERNEL with LEFT, clears the
 * accumulators f0-f3, sets I = M >> 2 and AO = A, and skips to LL(80)
 * when there is no full 4-wide tile. */
2074LL(70):
2075	mr	CO1, C
2076	andi.	J, N,  1
2077	ble	LL(999)
2078
2079#if defined(TRMMKERNEL) && defined(LEFT)
2080	mr	KK, OFFSET
2081#endif
2082
2083	lfs	f0,  FZERO
2084 	fmr	f1,  f0
2085	fmr	f2,  f0
2086	fmr	f3,  f0
2087
	/* mr does not alter CR0, so the ble below still tests the srawi. result */
2088	srawi.	I, M,  2
2089	mr	AO, A
2090	ble	LL(80)
2091	.align 4
2092
/* LL(71): set up one 4-by-1 tile (top of the loop over I).
 * Preloads f16-f19 with AO[0..3] and f20-f23 with four B-panel values
 * (four consecutive steps of the single column), points BO at the
 * panel, touches the output line via DCBTST, and puts
 * (step count >> 2) in CTR.  With TRMMKERNEL the count is K - KK,
 * KK + 4 or KK + 1 and AO/BO may start KK steps in.  If the shifted
 * count is not positive the unrolled loop is skipped (LL(75)). */
2093LL(71):
2094#if defined(TRMMKERNEL)
2095
2096#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2097
2098	LFD	f16,  0 * SIZE(AO)
2099	LFD	f17,  1 * SIZE(AO)
2100	LFD	f18,  2 * SIZE(AO)
2101	LFD	f19,  3 * SIZE(AO)
2102
2103	LFD	f20,  0 * SIZE(B)
2104	LFD	f21,  1 * SIZE(B)
2105	LFD	f22,  2 * SIZE(B)
2106	LFD	f23,  3 * SIZE(B)
2107
2108	mr	BO,  B
2109#else
	/* AO += KK * 4 elements, BO = B + KK * 1 element */
2110	slwi	r0,   KK, 2 + BASE_SHIFT
2111	slwi	TEMP, KK, 0 + BASE_SHIFT
2112	add	AO, AO, r0
2113	add	BO, B,  TEMP
2114
2115	LFD	f16,  0 * SIZE(AO)
2116	LFD	f17,  1 * SIZE(AO)
2117	LFD	f18,  2 * SIZE(AO)
2118	LFD	f19,  3 * SIZE(AO)
2119
2120	LFD	f20,  0 * SIZE(BO)
2121	LFD	f21,  1 * SIZE(BO)
2122	LFD	f22,  2 * SIZE(BO)
2123	LFD	f23,  3 * SIZE(BO)
2124#endif
2125
	/* DCBTST is a macro defined outside this view -- presumably a
	 * cache-touch-for-store hint on the output location; confirm. */
2126	DCBTST(CO1, PREC)
2127
2128#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2129	sub	TEMP, K, KK
2130#elif defined(LEFT)
2131	addi	TEMP, KK, 4
2132#else
2133	addi	TEMP, KK, 1
2134#endif
2135	srawi.	TEMP,  TEMP,  2
2136	mtspr	CTR, TEMP
2137#else
2138	LFD	f16,  0 * SIZE(AO)
2139	LFD	f17,  1 * SIZE(AO)
2140	LFD	f18,  2 * SIZE(AO)
2141	LFD	f19,  3 * SIZE(AO)
2142
2143	LFD	f20,  0 * SIZE(B)
2144	LFD	f21,  1 * SIZE(B)
2145	LFD	f22,  2 * SIZE(B)
2146	LFD	f23,  3 * SIZE(B)
2147
2148	DCBTST(CO1, PREC)
2149
2150	srawi.	r0,  K,  2
2151	mtspr	CTR, r0
2152	mr	BO,  B
2153#endif
2154	ble	LL(75)
2155	.align 5
2156
/* LL(72): unrolled accumulation loop for the 4-by-1 tile, four steps
 * per pass.  f20-f23 are the BO values of the four steps; for each step
 * the four AO values in f16-f19 are multiplied by that step's BO value
 * and accumulated into f0-f3.  One pass consumes 16 AO elements and
 * 4 BO elements; the final loads prepare the next pass. */
2157LL(72):
2158	FMADD	f0,  f16, f20, f0
2159	FMADD	f1,  f17, f20, f1
2160	FMADD	f2,  f18, f20, f2
2161	FMADD	f3,  f19, f20, f3
2162
2163	LFD	f16,  4 * SIZE(AO)
2164	LFD	f17,  5 * SIZE(AO)
2165	LFD	f18,  6 * SIZE(AO)
2166	LFD	f19,  7 * SIZE(AO)
2167
2168	FMADD	f0,  f16, f21, f0
2169	FMADD	f1,  f17, f21, f1
2170	FMADD	f2,  f18, f21, f2
2171	FMADD	f3,  f19, f21, f3
2172
2173	LFD	f16,  8 * SIZE(AO)
2174	LFD	f17,  9 * SIZE(AO)
2175	LFD	f18, 10 * SIZE(AO)
2176	LFD	f19, 11 * SIZE(AO)
2177
2178	FMADD	f0,  f16, f22, f0
2179	FMADD	f1,  f17, f22, f1
2180	FMADD	f2,  f18, f22, f2
2181	FMADD	f3,  f19, f22, f3
2182
2183	LFD	f16, 12 * SIZE(AO)
2184	LFD	f17, 13 * SIZE(AO)
2185	LFD	f18, 14 * SIZE(AO)
2186	LFD	f19, 15 * SIZE(AO)
2187
2188	FMADD	f0,  f16, f23, f0
2189	FMADD	f1,  f17, f23, f1
2190	FMADD	f2,  f18, f23, f2
2191	FMADD	f3,  f19, f23, f3
2192
2193	LFD	f16, 16 * SIZE(AO)
2194	LFD	f17, 17 * SIZE(AO)
2195	LFD	f18, 18 * SIZE(AO)
2196	LFD	f19, 19 * SIZE(AO)
2197
2198	LFD	f20,  4 * SIZE(BO)
2199	LFD	f21,  5 * SIZE(BO)
2200	LFD	f22,  6 * SIZE(BO)
2201	LFD	f23,  7 * SIZE(BO)
2202
2203	addi	AO, AO, 16 * SIZE
2204	addi	BO, BO,  4 * SIZE
2205	DCBT(BO, PREB)
2206	bdnz	LL(72)
2207	.align 4
2208
/* LL(75): load alpha into f30 and set CTR to the leftover step count
 * (step count & 3) for the 4-by-1 tile.  The count is K, or for
 * TRMMKERNEL one of K - KK, KK + 4, KK + 1.  A zero remainder skips the
 * tail loop and goes to LL(78). */
2209LL(75):
2210	lfd	f30,  ALPHA
2211#if defined(TRMMKERNEL)
2212
2213#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2214	sub	TEMP, K, KK
2215#elif defined(LEFT)
2216	addi	TEMP, KK, 4
2217#else
2218	addi	TEMP, KK, 1
2219#endif
2220	andi.	TEMP,  TEMP,  3
2221	mtspr	CTR, TEMP
2222
2223#else
2224	andi.	r0,  K,  3
2225	mtspr	CTR, r0
2226
2227#endif
2228	ble+	LL(78)
2229	.align 4
2230
/* LL(76): tail loop for the 4-by-1 tile, one step per pass.
 * Accumulates f16-f19 x f20 into f0-f3, then loads the next four AO
 * values and the next BO value; AO advances by four elements, BO by one. */
2231LL(76):
2232	FMADD	f0,  f16, f20, f0
2233	FMADD	f1,  f17, f20, f1
2234	FMADD	f2,  f18, f20, f2
2235	FMADD	f3,  f19, f20, f3
2236
2237	LFD	f16,  4 * SIZE(AO)
2238	LFD	f17,  5 * SIZE(AO)
2239	LFD	f18,  6 * SIZE(AO)
2240	LFD	f19,  7 * SIZE(AO)
2241
2242	LFD	f20,  1 * SIZE(BO)
2243
2244	addi	BO, BO,  1 * SIZE
2245	addi	AO, AO,  4 * SIZE
2246	bdnz	LL(76)
2247	.align 4
2248
/* LL(78): write back the 4-by-1 tile.
 * Without TRMMKERNEL the stored value is acc * alpha + existing CO1
 * value; with TRMMKERNEL it is acc * alpha.  f0-f3 go to CO1[0..3].
 * Then f0-f3 are cleared, the TRMM bookkeeping (AO/BO/KK) is updated,
 * CO1 advances by four elements, and the loop over I branches back to
 * LL(71) while I > 0. */
2249LL(78):
2250#ifndef TRMMKERNEL
2251	LFD	f16, 0 * SIZE(CO1)
2252	LFD	f17, 1 * SIZE(CO1)
2253	LFD	f18, 2 * SIZE(CO1)
2254	LFD	f19, 3 * SIZE(CO1)
2255
2256	FMADD	f0,  f0, f30, f16
2257	FMADD	f1,  f1, f30, f17
2258	FMADD	f2,  f2, f30, f18
2259	FMADD	f3,  f3, f30, f19
2260#else
2261	FMUL	f0,  f0, f30
2262	FMUL	f1,  f1, f30
2263	FMUL	f2,  f2, f30
2264	FMUL	f3,  f3, f30
2265#endif
2266
2267	STFD	f0,  0 * SIZE(CO1)
2268	STFD	f1,  1 * SIZE(CO1)
2269	STFD	f2,  2 * SIZE(CO1)
2270	STFD	f3,  3 * SIZE(CO1)
2271
2272	lfs	f0,  FZERO
2273 	fmr	f1,  f0
2274	fmr	f2,  f0
2275	fmr	f3,  f0
2276
2277#ifdef TRMMKERNEL
2278#if ( defined(LEFT) &&  defined(TRANSA)) || \
2279    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (4 if LEFT else 1); AO += TEMP * 4 elements,
	 * BO += TEMP * 1 element */
2280	sub	TEMP, K, KK
2281#ifdef LEFT
2282	addi	TEMP, TEMP, -4
2283#else
2284	addi	TEMP, TEMP, -1
2285#endif
2286	slwi	r0  , TEMP, 2 + BASE_SHIFT
2287	slwi	TEMP, TEMP, 0 + BASE_SHIFT
2288	add	AO, AO, r0
2289	add	BO, BO, TEMP
2290#endif
2291
2292#ifdef LEFT
2293	addi	KK, KK, 4
2294#endif
2295#endif
2296
2297	addi	CO1, CO1, 4 * SIZE
2298	addic.	I, I, -1
2299	bgt+	LL(71)
2300	.align 4
2301
/* LL(80): set up the 2-by-1 tile, executed only when bit 1 of M is set
 * (otherwise branch to LL(90)).
 * Preloads f16-f19 with AO[0..3] (two steps of two values) and f20-f23
 * with four B-panel values (four steps), points BO at the panel, and
 * puts (step count >> 2) in CTR.  With TRMMKERNEL the count is K - KK,
 * KK + 2 or KK + 1 and AO/BO may start KK steps in.  If the shifted
 * count is not positive the unrolled loop is skipped (LL(85)). */
2302LL(80):
2303	andi.	I,  M,  2
2304	ble	LL(90)
2305
2306#if defined(TRMMKERNEL)
2307
2308#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2309
2310	LFD	f16,  0 * SIZE(AO)
2311	LFD	f17,  1 * SIZE(AO)
2312	LFD	f18,  2 * SIZE(AO)
2313	LFD	f19,  3 * SIZE(AO)
2314
2315	LFD	f20,  0 * SIZE(B)
2316	LFD	f21,  1 * SIZE(B)
2317	LFD	f22,  2 * SIZE(B)
2318	LFD	f23,  3 * SIZE(B)
2319
2320	mr	BO,  B
2321#else
	/* AO += KK * 2 elements, BO = B + KK * 1 element */
2322	slwi	r0,   KK, 1 + BASE_SHIFT
2323	slwi	TEMP, KK, 0 + BASE_SHIFT
2324	add	AO, AO, r0
2325	add	BO, B,  TEMP
2326
2327	LFD	f16,  0 * SIZE(AO)
2328	LFD	f17,  1 * SIZE(AO)
2329	LFD	f18,  2 * SIZE(AO)
2330	LFD	f19,  3 * SIZE(AO)
2331
2332	LFD	f20,  0 * SIZE(BO)
2333	LFD	f21,  1 * SIZE(BO)
2334	LFD	f22,  2 * SIZE(BO)
2335	LFD	f23,  3 * SIZE(BO)
2336#endif
2337
2338#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2339	sub	TEMP, K, KK
2340#elif defined(LEFT)
2341	addi	TEMP, KK, 2
2342#else
2343	addi	TEMP, KK, 1
2344#endif
2345	srawi.	TEMP,  TEMP,  2
2346	mtspr	CTR, TEMP
2347
2348#else
2349	LFD	f16,  0 * SIZE(AO)
2350	LFD	f17,  1 * SIZE(AO)
2351	LFD	f18,  2 * SIZE(AO)
2352	LFD	f19,  3 * SIZE(AO)
2353
2354	LFD	f20,  0 * SIZE(B)
2355	LFD	f21,  1 * SIZE(B)
2356	LFD	f22,  2 * SIZE(B)
2357	LFD	f23,  3 * SIZE(B)
2358
2359	srawi.	r0,  K,  2
2360	mtspr	CTR, r0
2361	mr	BO,  B
2362
2363#endif
2364	ble	LL(85)
2365	.align 5
2366
/* LL(82): unrolled accumulation loop for the 2-by-1 tile, four steps
 * per pass.  Odd steps accumulate into f0/f1 and even steps into f2/f3
 * (combined later at LL(88)).  One pass consumes 8 AO elements and
 * 4 BO elements; the final loads prepare the next pass. */
2367LL(82):
2368	FMADD	f0,  f16, f20, f0
2369	FMADD	f1,  f17, f20, f1
2370	FMADD	f2,  f18, f21, f2
2371	FMADD	f3,  f19, f21, f3
2372
2373	LFD	f16,  4 * SIZE(AO)
2374	LFD	f17,  5 * SIZE(AO)
2375	LFD	f18,  6 * SIZE(AO)
2376	LFD	f19,  7 * SIZE(AO)
2377
2378	FMADD	f0,  f16, f22, f0
2379	FMADD	f1,  f17, f22, f1
2380	FMADD	f2,  f18, f23, f2
2381	FMADD	f3,  f19, f23, f3
2382
2383	LFD	f16,  8 * SIZE(AO)
2384	LFD	f17,  9 * SIZE(AO)
2385	LFD	f18, 10 * SIZE(AO)
2386	LFD	f19, 11 * SIZE(AO)
2387
2388	LFD	f20,  4 * SIZE(BO)
2389	LFD	f21,  5 * SIZE(BO)
2390	LFD	f22,  6 * SIZE(BO)
2391	LFD	f23,  7 * SIZE(BO)
2392
2393	addi	AO, AO,  8 * SIZE
2394	addi	BO, BO,  4 * SIZE
2395	DCBT(BO, PREB)
2396	bdnz	LL(82)
2397	.align 4
2398
/* LL(85): load alpha into f30 and set CTR to the leftover step count
 * (step count & 3) for the 2-by-1 tile.  The count is K, or for
 * TRMMKERNEL one of K - KK, KK + 2, KK + 1.  A zero remainder skips the
 * tail loop and goes to LL(88). */
2399LL(85):
2400	lfd	f30,  ALPHA
2401#if defined(TRMMKERNEL)
2402
2403#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2404	sub	TEMP, K, KK
2405#elif defined(LEFT)
2406	addi	TEMP, KK, 2
2407#else
2408	addi	TEMP, KK, 1
2409#endif
2410	andi.	TEMP,  TEMP,  3
2411	mtspr	CTR, TEMP
2412
2413#else
2414
2415	andi.	r0,  K,  3
2416	mtspr	CTR, r0
2417
2418#endif
2419	ble+	LL(88)
2420	.align 4
2421
/* LL(86): tail loop for the 2-by-1 tile, one step per pass.
 * Accumulates (f16, f17) x f20 into f0/f1, then loads the next two AO
 * values and the next BO value; AO advances by two elements, BO by one. */
2422LL(86):
2423	FMADD	f0,  f16, f20, f0
2424	FMADD	f1,  f17, f20, f1
2425
2426	LFD	f16,  2 * SIZE(AO)
2427	LFD	f17,  3 * SIZE(AO)
2428	LFD	f20,  1 * SIZE(BO)
2429
2430	addi	BO, BO,  1 * SIZE
2431	addi	AO, AO,  2 * SIZE
2432	bdnz	LL(86)
2433	.align 4
2434
/* LL(88): write back the 2-by-1 tile.
 * Combines the partial sums (f0 += f2, f1 += f3), scales by alpha
 * (adding the existing CO1 values unless TRMMKERNEL is defined) and
 * stores f0/f1 to CO1[0..1].  Then clears f0-f3, advances CO1 by two
 * elements and updates the TRMM bookkeeping (AO/BO/KK). */
2435LL(88):
2436#ifndef TRMMKERNEL
2437	LFD	f16, 0 * SIZE(CO1)
2438	LFD	f17, 1 * SIZE(CO1)
2439
2440	FADD	f0, f2, f0
2441	FADD	f1, f3, f1
2442
2443	FMADD	f0,  f0, f30, f16
2444	FMADD	f1,  f1, f30, f17
2445#else
2446	FADD	f0, f2, f0
2447	FADD	f1, f3, f1
2448
2449	FMUL	f0,  f0, f30
2450	FMUL	f1,  f1, f30
2451#endif
2452
2453	STFD	f0,  0 * SIZE(CO1)
2454	STFD	f1,  1 * SIZE(CO1)
2455
2456	lfs	f0,  FZERO
2457 	fmr	f1,  f0
2458	fmr	f2,  f0
2459	fmr	f3,  f0
2460
2461	addi	CO1, CO1, 2 * SIZE
2462
2463#ifdef TRMMKERNEL
2464#if ( defined(LEFT) &&  defined(TRANSA)) || \
2465    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (2 if LEFT else 1); AO += TEMP * 2 elements,
	 * BO += TEMP * 1 element */
2466	sub	TEMP, K, KK
2467#ifdef LEFT
2468	addi	TEMP, TEMP, -2
2469#else
2470	addi	TEMP, TEMP, -1
2471#endif
2472	slwi	r0  , TEMP, 1 + BASE_SHIFT
2473	slwi	TEMP, TEMP, 0 + BASE_SHIFT
2474	add	AO, AO, r0
2475	add	BO, BO, TEMP
2476#endif
2477
2478#ifdef LEFT
2479	addi	KK, KK, 2
2480#endif
2481#endif
2482	.align 4
2483
/* LL(90): set up the 1-by-1 tile, executed only when bit 0 of M is set
 * (otherwise branch to the exit at LL(999)).
 * Preloads f16-f19 with AO[0..3] and f20-f23 with four B-panel values
 * (four consecutive steps), points BO at the panel, and puts
 * (step count >> 3) in CTR -- this tile's main loop is unrolled by
 * eight, not four.  With TRMMKERNEL the count is K - KK or KK + 1 and
 * AO/BO may start KK steps in.  If the shifted count is not positive
 * the unrolled loop is skipped (LL(95)). */
2484LL(90):
2485	andi.	I,  M,  1
2486	ble	LL(999)
2487
2488
2489#if defined(TRMMKERNEL)
2490
2491#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2492
2493	LFD	f16,  0 * SIZE(AO)
2494	LFD	f17,  1 * SIZE(AO)
2495	LFD	f18,  2 * SIZE(AO)
2496	LFD	f19,  3 * SIZE(AO)
2497
2498	LFD	f20,  0 * SIZE(B)
2499	LFD	f21,  1 * SIZE(B)
2500	LFD	f22,  2 * SIZE(B)
2501	LFD	f23,  3 * SIZE(B)
2502
2503	mr	BO,  B
2504#else
	/* AO += KK * 1 element, BO = B + KK * 1 element */
2505	slwi	r0,   KK, 0 + BASE_SHIFT
2506	slwi	TEMP, KK, 0 + BASE_SHIFT
2507	add	AO, AO, r0
2508	add	BO, B,  TEMP
2509
2510	LFD	f16,  0 * SIZE(AO)
2511	LFD	f17,  1 * SIZE(AO)
2512	LFD	f18,  2 * SIZE(AO)
2513	LFD	f19,  3 * SIZE(AO)
2514
2515	LFD	f20,  0 * SIZE(BO)
2516	LFD	f21,  1 * SIZE(BO)
2517	LFD	f22,  2 * SIZE(BO)
2518	LFD	f23,  3 * SIZE(BO)
2519#endif
2520
2521#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2522	sub	TEMP, K, KK
2523#elif defined(LEFT)
2524	addi	TEMP, KK, 1
2525#else
2526	addi	TEMP, KK, 1
2527#endif
2528	srawi.	TEMP,  TEMP,  3
2529	mtspr	CTR, TEMP
2530
2531#else
2532	LFD	f16,  0 * SIZE(AO)
2533	LFD	f17,  1 * SIZE(AO)
2534	LFD	f18,  2 * SIZE(AO)
2535	LFD	f19,  3 * SIZE(AO)
2536
2537	LFD	f20,  0 * SIZE(B)
2538	LFD	f21,  1 * SIZE(B)
2539	LFD	f22,  2 * SIZE(B)
2540	LFD	f23,  3 * SIZE(B)
2541
2542	srawi.	r0,  K,  3
2543	mtspr	CTR, r0
2544	mr	BO,  B
2545#endif
2546	ble	LL(95)
2547	.align 5
2548
/* LL(92): unrolled accumulation loop for the 1-by-1 tile (a dot
 * product), eight steps per pass.  Each group of four FMADDs multiplies
 * four consecutive AO values by the four matching BO values and keeps
 * four independent partial sums in f0-f3 (summed later at LL(98)).
 * One pass consumes 8 AO elements and 8 BO elements. */
2549LL(92):
2550	FMADD	f0,  f16, f20, f0
2551	FMADD	f1,  f17, f21, f1
2552	FMADD	f2,  f18, f22, f2
2553	FMADD	f3,  f19, f23, f3
2554
2555	LFD	f16,  4 * SIZE(AO)
2556	LFD	f17,  5 * SIZE(AO)
2557	LFD	f18,  6 * SIZE(AO)
2558	LFD	f19,  7 * SIZE(AO)
2559
2560	LFD	f20,  4 * SIZE(BO)
2561	LFD	f21,  5 * SIZE(BO)
2562	LFD	f22,  6 * SIZE(BO)
2563	LFD	f23,  7 * SIZE(BO)
2564
2565	FMADD	f0,  f16, f20, f0
2566	FMADD	f1,  f17, f21, f1
2567	FMADD	f2,  f18, f22, f2
2568	FMADD	f3,  f19, f23, f3
2569
2570	LFD	f16,  8 * SIZE(AO)
2571	LFD	f17,  9 * SIZE(AO)
2572	LFD	f18, 10 * SIZE(AO)
2573	LFD	f19, 11 * SIZE(AO)
2574
2575	LFD	f20,  8 * SIZE(BO)
2576	LFD	f21,  9 * SIZE(BO)
2577	LFD	f22, 10 * SIZE(BO)
2578	LFD	f23, 11 * SIZE(BO)
2579
2580	addi	AO, AO,  8 * SIZE
2581	addi	BO, BO,  8 * SIZE
2582	bdnz	LL(92)
2583	.align 4
2584
/* LL(95): load alpha into f30 and set CTR to the leftover step count
 * for the 1-by-1 tile.  The mask is 7 here because the main loop at
 * LL(92) handles eight steps per pass.  The count is K, or for
 * TRMMKERNEL either K - KK or KK + 1.  A zero remainder skips the tail
 * loop and goes to LL(98). */
2585LL(95):
2586	lfd	f30,  ALPHA
2587
2588#if defined(TRMMKERNEL)
2589
2590#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2591	sub	TEMP, K, KK
2592#elif defined(LEFT)
2593	addi	TEMP, KK, 1
2594#else
2595	addi	TEMP, KK, 1
2596#endif
2597	andi.	TEMP,  TEMP,  7
2598	mtspr	CTR, TEMP
2599
2600#else
2601
2602	andi.	r0,  K,  7
2603	mtspr	CTR, r0
2604
2605#endif
2606	ble+	LL(98)
2607	.align 4
2608
/* LL(96): tail loop for the 1-by-1 tile, one step per pass:
 * f0 += f16 * f20, then load the next AO and BO value and advance both
 * pointers by one element. */
2609LL(96):
2610	FMADD	f0,  f16, f20, f0
2611	LFD	f16,  1 * SIZE(AO)
2612	LFD	f20,  1 * SIZE(BO)
2613	addi	BO, BO,  1 * SIZE
2614	addi	AO, AO,  1 * SIZE
2615	bdnz	LL(96)
2616	.align 4
2617
/* LL(98): write back the 1-by-1 tile.
 * Reduces the four partial sums to one (f0 = (f0 + f1) + (f2 + f3)),
 * scales by alpha (adding the existing CO1 value unless TRMMKERNEL is
 * defined) and stores the result to CO1[0].  Falls through to the exit
 * at LL(999). */
2618LL(98):
2619#ifndef TRMMKERNEL
2620	LFD	f16, 0 * SIZE(CO1)
2621
2622	FADD	f0, f1, f0
2623	FADD	f2, f3, f2
2624	FADD	f0, f2, f0
2625
2626	FMADD	f0,  f0,  f30, f16
2627#else
2628	FADD	f0, f1, f0
2629	FADD	f2, f3, f2
2630	FADD	f0, f2, f0
2631
2632	FMUL	f0,  f0,  f30
2633#endif
2634
2635	STFD	f0,  0 * SIZE(CO1)
2636	.align 4
2637
/* LL(999): common exit path.
 * Sets r3 to 0, reloads f14-f31 and r20-r31 (plus r18/r19 for
 * TRMMKERNEL or TRSMKERNEL) from the frame at SP -- presumably the
 * values saved by the prologue, which is outside this view -- then
 * releases the frame (SP += STACKSIZE) and returns with blr. */
2638LL(999):
	/* r3 = 0 (the same encoding as li r3, 0) */
2639	addi	r3, 0, 0
2640
	/* floating-point registers: 18 doubles at offsets 0..136 */
2641	lfd	f14,    0(SP)
2642	lfd	f15,    8(SP)
2643	lfd	f16,   16(SP)
2644	lfd	f17,   24(SP)
2645
2646	lfd	f18,   32(SP)
2647	lfd	f19,   40(SP)
2648	lfd	f20,   48(SP)
2649	lfd	f21,   56(SP)
2650
2651	lfd	f22,   64(SP)
2652	lfd	f23,   72(SP)
2653	lfd	f24,   80(SP)
2654	lfd	f25,   88(SP)
2655
2656	lfd	f26,   96(SP)
2657	lfd	f27,  104(SP)
2658	lfd	f28,  112(SP)
2659	lfd	f29,  120(SP)
2660
2661	lfd	f30,  128(SP)
2662	lfd	f31,  136(SP)
2663
	/* general-purpose registers: 8-byte slots when __64BIT__ is defined,
	 * 4-byte slots otherwise, both starting at offset 144 */
2664#ifdef __64BIT__
2665	ld	r31,  144(SP)
2666	ld	r30,  152(SP)
2667	ld	r29,  160(SP)
2668	ld	r28,  168(SP)
2669	ld	r27,  176(SP)
2670	ld	r26,  184(SP)
2671	ld	r25,  192(SP)
2672	ld	r24,  200(SP)
2673	ld	r23,  208(SP)
2674	ld	r22,  216(SP)
2675	ld	r21,  224(SP)
2676	ld	r20,  232(SP)
2677#if defined(TRMMKERNEL) || defined(TRSMKERNEL)
2678	ld	r19,  240(SP)
2679	ld	r18,  248(SP)
2680#endif
2681#else
2682	lwz	r31,  144(SP)
2683	lwz	r30,  148(SP)
2684	lwz	r29,  152(SP)
2685	lwz	r28,  156(SP)
2686	lwz	r27,  160(SP)
2687	lwz	r26,  164(SP)
2688	lwz	r25,  168(SP)
2689	lwz	r24,  172(SP)
2690	lwz	r23,  176(SP)
2691	lwz	r22,  180(SP)
2692	lwz	r21,  184(SP)
2693	lwz	r20,  188(SP)
2694#if defined(TRMMKERNEL) || defined(TRSMKERNEL)
2695	lwz	r19,  192(SP)
2696	lwz	r18,  196(SP)
2697#endif
2698#endif
2699
2700	addi	SP, SP, STACKSIZE
2701
2702	blr
2703
2704	EPILOGUE
2705#endif
2706