1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/* Integer registers that the code below reads without first writing them,
   i.e. the incoming arguments: the loop extents M, N and K, the operand
   pointers A, B and C, and the stride LDC between consecutive columns of C
   (the prologue shifts LDC left by BASE_SHIFT). */
25#define M	$4
26#define	N	$5
27#define	K	$6
28#define A	$8
29#define B	$9
30#define C	$10
31#define LDC	$11
32
/* Running pointers into A and B for the tile currently being computed. */
33#define AO	$12
34#define BO	$13
35
/* Loop counters: I counts row tiles of M, J counts column panels of N,
   L counts steps of the K loop. */
36#define I	$2
37#define J	$3
38#define L	$7
39
/* NOTE(review): PREFETCHSIZE is not referenced in the visible part of
   this file. */
40#define PREFETCHSIZE (4 * 10)
41
/* CO1..CO8: one output pointer per column of C in the current panel;
   each is derived as the previous one plus LDC. */
42#define CO1	$14
43#define CO2	$15
44#define CO3	$16
45#define CO4	$17
46#define CO5	$18
47#define CO6	$19
48#define CO7	$20
49#define CO8	$21
50
/* BB: pointer used only as the address of load prefetches ("pref 0"). */
51#define BB	$22
52
/* Extra integer registers used only by the TRMM variant: OFFSET is loaded
   from the stack in the prologue, KK is the running offset derived from
   it, TEMP is scratch for trip counts and address arithmetic. */
53#if defined(TRMMKERNEL)
54#define OFFSET	$23
55#define KK	$24
56#define TEMP	$25
57#endif
58
/* a1..a4: values loaded from the A panel (through AO). */
59#define a1	$f0
60#define a2	$f1
61#define a3	$f27
62#define a4	$f28
63
/* b1..b8: values loaded from the B panel (through BO). */
64#define b1	$f2
65#define b2	$f3
66#define b3	$f4
67#define b4	$f5
68#define b5	$f6
69#define b6	$f7
70#define b7	$f8
71#define b8	$f9
72
/* a5 shares its register with b8; it is used as a fifth A value by the
   2-column kernel. */
73#define a5	b8
74
/* Accumulators.  cXY collects the products for column X (stored through
   COX) and row Y (Y = 1 is fed by the first A value of a k-step, Y = 2 by
   the second).  $f15 is skipped in the numbering because it is ALPHA. */
75#define c11	$f10
76#define c12	$f11
77#define c21	$f12
78#define c22	$f13
79#define c31	$f14
80#define c32	$f16
81#define c41	$f17
82#define c42	$f18
83#define c51	$f19
84#define c52	$f20
85#define c61	$f21
86#define c62	$f22
87#define c71	$f23
88#define c72	$f24
89#define c81	$f25
90#define c82	$f26
91
/* Scalar applied to every accumulator before it is written to C. */
92#define ALPHA	$f15
93
94	PROLOGUE
/* Function entry.  PROLOGUE comes from common.h; NOTE(review): the
   hand-filled nop/NOP slots after branches throughout this file suggest it
   selects ".set noreorder" - confirm against common.h. */
95
	/* Reserve a 160-byte frame for the registers saved below. */
96	daddiu	$sp, $sp, -160
97
	/* Save $16-$22; they are reused as CO3..CO8 and BB. */
98	SDARG	$16,   0($sp)
99	SDARG	$17,   8($sp)
100	SDARG	$18,  16($sp)
101	SDARG	$19,  24($sp)
102	SDARG	$20,  32($sp)
103	SDARG	$21,  40($sp)
104	SDARG	$22,  48($sp)
105
	/* Save $f24-$f28; they are reused as c72, c81, c82, a3 and a4. */
106	sdc1	$f24, 56($sp)
107	sdc1	$f25, 64($sp)
108	sdc1	$f26, 72($sp)
109	sdc1	$f27, 80($sp)
110	sdc1	$f28, 88($sp)
111
112#if defined(TRMMKERNEL)
	/* TRMM only: save the registers used for OFFSET, KK and TEMP. */
113	SDARG	$23,  96($sp)
114	SDARG	$24, 104($sp)
115	SDARG	$25, 112($sp)
116
	/* 160($sp) is the first slot above this frame; presumably the
	   stack-passed offset argument - verify against the caller. */
117	LDARG	OFFSET, 160($sp)
118#endif
119
120#ifndef __64BIT__
	/* When __64BIT__ is not defined, $f20-$f23 (c52, c61, c62, c71) are
	   saved as well. */
121	sdc1	$f20,120($sp)
122	sdc1	$f21,128($sp)
123	sdc1	$f22,136($sp)
124	sdc1	$f23,144($sp)
125#endif
126
	/* Scale LDC by BASE_SHIFT (presumably elements -> bytes) so it can
	   be added directly to the C pointers. */
127	dsll	LDC, LDC, BASE_SHIFT
128
129#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Right-side TRMM: KK starts at -OFFSET and grows per column panel. */
130	neg	KK, OFFSET
131#endif
132
	/* J = N / 8 = number of 8-column panels; none -> go to .L30. */
133	dsra	J,  N, 3
134	blez	J, .L30
135	nop
136
137.L10:
	/* Start of one 8-column panel.  Sets CO1..CO8 to the eight
	   consecutive columns of C (stride LDC), advances C past the panel,
	   rewinds AO to the start of A, decrements J and clears the
	   accumulators c11, c21, c31, c41, c51 and c61. */
138	move	CO1, C
139	MTC	$0,  c11
140	daddu	CO2, C,   LDC
141	move	AO, A
142	daddu	CO3, CO2, LDC
143	daddiu	J, J, -1
144	daddu	CO4, CO3, LDC
145	MOV	c21, c11
146	daddu	CO5, CO4, LDC
147	MOV	c31, c11
148	daddu	CO6, CO5, LDC
149	MOV	c41, c11
150	daddu	CO7, CO6, LDC
151	MOV	c51, c11
152	daddu	CO8, CO7, LDC
	/* I = M / 2 = number of 2-row tiles in this panel. */
153	dsra	I,  M, 1
154	daddu	C,   CO8, LDC
155
	/* BB = B + (K << (2 + BASE_SHIFT)); used below only as a prefetch
	   address. */
156	dsll	BB, K, 2 + BASE_SHIFT
157	daddu	BB, B, BB
158
159#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM: KK restarts from OFFSET for every panel. */
160	move	KK, OFFSET
161#endif
162
	/* No 2-row tiles -> handle the odd row at .L20.  The MOV sits in the
	   branch delay slot. */
163	blez	I, .L20
164	MOV	c61, c11
165
166.L11:
	/* Set-up for one 2x8 tile: position AO/BO, preload the first A and B
	   values, clear the remaining accumulators, compute the number of
	   4-step groups of the K loop (L) and execute the first quarter of
	   the first group so that .L12/.L13 can run software-pipelined. */
167#if defined(TRMMKERNEL)
168#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
169	move	BO,  B
170#else
	/* Skip the first KK k-steps: 2 A values and 8 B values per step. */
171	dsll	L,    KK, 1 + BASE_SHIFT
172	dsll	TEMP, KK, 3 + BASE_SHIFT
173
174	daddu	AO, AO, L
175	daddu	BO, B,  TEMP
176#endif
177
178	LD	a1,  0 * SIZE(AO)
179	MOV	c71, c11
180	LD	b1,  0 * SIZE(BO)
181	MOV	c81, c11
182
183	LD	a3,  4 * SIZE(AO)
184	MOV	c12, c11
185	LD	b2,  1 * SIZE(BO)
186	MOV	c22, c11
187
188	MOV	c32, c11
189	LD	b3,  2 * SIZE(BO)
190	MOV	c42, c11
191
192	LD	b4,  3 * SIZE(BO)
193	MOV	c52, c11
194	LD	b5,  4 * SIZE(BO)
195	MOV	c62, c11
196
197	LD	b6,  8 * SIZE(BO)
198	MOV	c72, c11
199	LD	b7, 12 * SIZE(BO)
200	MOV	c82, c11
201
	/* TEMP = number of k-steps this tile has to process. */
202#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
203	dsubu	TEMP, K, KK
204#elif defined(LEFT)
205	daddiu	TEMP, KK, 2
206#else
207	daddiu	TEMP, KK, 8
208#endif
209	dsra	L,  TEMP, 2
210
211	blez	L, .L15
212	NOP
213#else
	/* Plain GEMM: B is read from the start of the panel; BO is only set
	   in the delay slot of the branch at the end of this section. */
214	LD	a1,  0 * SIZE(AO)
215	MOV	c71, c11
216	LD	b1,  0 * SIZE(B)
217	MOV	c81, c11
218
	/* "pref 1" prefetches of the C columns that will be updated. */
219	pref	1, 3 * SIZE(CO1)
220	pref	1, 3 * SIZE(CO2)
221
222	LD	a3,  4 * SIZE(AO)
223	MOV	c12, c11
224	LD	b2,  1 * SIZE(B)
225	MOV	c22, c11
226
	/* L = K / 4 = number of 4-step groups. */
227	dsra	L,  K, 2
228	MOV	c32, c11
229	LD	b3,  2 * SIZE(B)
230	MOV	c42, c11
231
232	LD	b4,  3 * SIZE(B)
233	MOV	c52, c11
234	LD	b5,  4 * SIZE(B)
235	MOV	c62, c11
236
237	LD	b6,  8 * SIZE(B)
238	MOV	c72, c11
239	LD	b7, 12 * SIZE(B)
240	MOV	c82, c11
241
242	blez	L, .L15
243	move	BO,  B
244#endif
245
	/* Pipeline prologue: row 1 of k-step 0 against B[0..3].  If only one
	   4-step group exists (L becomes 0) go straight to the drain copy
	   at .L13; the last MADD is in the branch delay slot. */
246	MADD	c11, c11, a1, b1
247	LD	a2,  1 * SIZE(AO)
248	MADD	c21, c21, a1, b2
249	daddiu	L, L, -1
250	MADD	c31, c31, a1, b3
251	blez	L, .L13
252	MADD	c41, c41, a1, b4
	/* NOTE(review): every other prefetch of C in this routine uses an
	   offset of 3 * SIZE; the 2 * SIZE here may be unintentional.  It is
	   only a hint and does not affect results - confirm. */
253	pref	1, 2 * SIZE(CO3)
254	.align	3
255
256.L12:
	/* Steady-state body of the 2x8 K loop: four k-steps per iteration
	   (AO advances by 8 values, BO by 32).  On entry the row-1 products
	   of k-step 0 against B[0..3] have already been accumulated.
	   b2/b3/b4 are reloaded for every group of four MADDs, b1/b5 carry
	   elements 0/4 of the even k-steps and b6/b7 those of the odd
	   k-steps; a1/a3 hold row 1 of k-steps 0/2, a4 row 1 of k-steps 1/3,
	   a2 always row 2. */
	/* k-step 0 */
257	MADD	c12, c12, a2, b1
258	LD	b1, 16 * SIZE(BO)
259	MADD	c22, c22, a2, b2
260	LD	b2,  5 * SIZE(BO)
261	MADD	c32, c32, a2, b3
262	LD	b3,  6 * SIZE(BO)
263	MADD	c42, c42, a2, b4
264	LD	b4,  7 * SIZE(BO)
265
266	MADD	c51, c51, a1, b5
267	LD	a4,  2 * SIZE(AO)
268	MADD	c61, c61, a1, b2
269	NOP
270	MADD	c71, c71, a1, b3
271	NOP
272	MADD	c81, c81, a1, b4
273	LD	a1,  8 * SIZE(AO)
274
275	MADD	c52, c52, a2, b5
276	LD	b5, 20 * SIZE(BO)
277	MADD	c62, c62, a2, b2
278	LD	b2,  9 * SIZE(BO)
279	MADD	c72, c72, a2, b3
280	LD	b3, 10 * SIZE(BO)
281	MADD	c82, c82, a2, b4
282	LD	b4, 11 * SIZE(BO)
283
	/* k-step 1 */
284	MADD	c11, c11, a4, b6
285	LD	a2,  3 * SIZE(AO)
286	MADD	c21, c21, a4, b2
287	NOP
288	MADD	c31, c31, a4, b3
289	NOP
290	MADD	c41, c41, a4, b4
291	NOP
292
293	MADD	c12, c12, a2, b6
294	LD	b6, 24 * SIZE(BO)
295	MADD	c22, c22, a2, b2
296	LD	b2, 13 * SIZE(BO)
297	MADD	c32, c32, a2, b3
298	LD	b3, 14 * SIZE(BO)
299	MADD	c42, c42, a2, b4
300	LD	b4, 15 * SIZE(BO)
301
302	MADD	c51, c51, a4, b7
303	NOP
304	MADD	c61, c61, a4, b2
305	NOP
306	MADD	c71, c71, a4, b3
307	NOP
308	MADD	c81, c81, a4, b4
309	NOP
310
311	MADD	c52, c52, a2, b7
312	LD	b7, 28 * SIZE(BO)
313	MADD	c62, c62, a2, b2
314	LD	b2, 17 * SIZE(BO)
315	MADD	c72, c72, a2, b3
316	LD	b3, 18 * SIZE(BO)
317	MADD	c82, c82, a2, b4
318	LD	b4, 19 * SIZE(BO)
319
	/* k-step 2 */
320	MADD	c11, c11, a3, b1
321	LD	a2,  5 * SIZE(AO)
322	MADD	c21, c21, a3, b2
323	NOP
324	MADD	c31, c31, a3, b3
325	NOP
326	MADD	c41, c41, a3, b4
327	NOP
328
329	MADD	c12, c12, a2, b1
330	LD	b1, 32 * SIZE(BO)
331	MADD	c22, c22, a2, b2
332	LD	b2, 21 * SIZE(BO)
333	MADD	c32, c32, a2, b3
334	LD	b3, 22 * SIZE(BO)
335	MADD	c42, c42, a2, b4
336	LD	b4, 23 * SIZE(BO)
337
338	MADD	c51, c51, a3, b5
339	LD	a4,  6 * SIZE(AO)
340	MADD	c61, c61, a3, b2
341	NOP
342	MADD	c71, c71, a3, b3
343	NOP
344	MADD	c81, c81, a3, b4
345	LD	a3, 12 * SIZE(AO)
346
347	MADD	c52, c52, a2, b5
348	LD	b5, 36 * SIZE(BO)
349	MADD	c62, c62, a2, b2
350	LD	b2, 25 * SIZE(BO)
351	MADD	c72, c72, a2, b3
352	LD	b3, 26 * SIZE(BO)
353	MADD	c82, c82, a2, b4
354	LD	b4, 27 * SIZE(BO)
355
	/* k-step 3; the loop counter is decremented here. */
356	MADD	c11, c11, a4, b6
357	LD	a2,  7 * SIZE(AO)
358	MADD	c21, c21, a4, b2
359	NOP
360	MADD	c31, c31, a4, b3
361	NOP
362	MADD	c41, c41, a4, b4
363	daddiu	L, L, -1
364
365	MADD	c12, c12, a2, b6
366	LD	b6, 40 * SIZE(BO)
367	MADD	c22, c22, a2, b2
368	LD	b2, 29 * SIZE(BO)
369	MADD	c32, c32, a2, b3
370	LD	b3, 30 * SIZE(BO)
371	MADD	c42, c42, a2, b4
372	LD	b4, 31 * SIZE(BO)
373
	/* Advance both panel pointers; offsets below are relative to the
	   new AO/BO. */
374	MADD	c51, c51, a4, b7
375	daddiu	BO, BO, 32 * SIZE
376	MADD	c61, c61, a4, b2
377	daddiu	AO, AO,  8 * SIZE
378	MADD	c71, c71, a4, b3
379	NOP
380	MADD	c81, c81, a4, b4
381	NOP
382
383	MADD	c52, c52, a2, b7
384	LD	b7, 12 * SIZE(BO)
385	MADD	c62, c62, a2, b2
386	LD	b2,  1 * SIZE(BO)
387	MADD	c72, c72, a2, b3
388	LD	b3,  2 * SIZE(BO)
389	MADD	c82, c82, a2, b4
390	LD	b4,  3 * SIZE(BO)
391
	/* First quarter of k-step 0 of the next group (row 1 x B[0..3]); the
	   final MADD is in the delay slot of the loop branch. */
392	MADD	c11, c11, a1, b1
393	LD	a2,  1 * SIZE(AO)
394	MADD	c21, c21, a1, b2
395	NOP
396	MADD	c31, c31, a1, b3
397	bgtz	L, .L12
398	MADD	c41, c41, a1, b4
399	NOP
400	.align 3
401
402.L13:
	/* Drain copy of the .L12 body for the last 4-step group of the 2x8
	   tile.  It performs the same accumulations and pointer updates but
	   does not touch L, does not start another group at the end, and
	   uses some idle slots to prefetch the C columns CO4..CO7. */
	/* k-step 0 (row-1 x B[0..3] was already done before entry) */
403	MADD	c12, c12, a2, b1
404	LD	b1, 16 * SIZE(BO)
405	MADD	c22, c22, a2, b2
406	LD	b2,  5 * SIZE(BO)
407	MADD	c32, c32, a2, b3
408	LD	b3,  6 * SIZE(BO)
409	MADD	c42, c42, a2, b4
410	LD	b4,  7 * SIZE(BO)
411
412	MADD	c51, c51, a1, b5
413	NOP
414	MADD	c61, c61, a1, b2
415	LD	a4,  2 * SIZE(AO)
416	MADD	c71, c71, a1, b3
417	NOP
418	MADD	c81, c81, a1, b4
419	LD	a1,  8 * SIZE(AO)
420
421	MADD	c52, c52, a2, b5
422	LD	b5, 20 * SIZE(BO)
423	MADD	c62, c62, a2, b2
424	LD	b2,  9 * SIZE(BO)
425	MADD	c72, c72, a2, b3
426	LD	b3, 10 * SIZE(BO)
427	MADD	c82, c82, a2, b4
428	LD	b4, 11 * SIZE(BO)
429
	/* k-step 1 */
430	MADD	c11, c11, a4, b6
431	LD	a2,  3 * SIZE(AO)
432	MADD	c21, c21, a4, b2
433	NOP
434	MADD	c31, c31, a4, b3
435	pref	1, 3 * SIZE(CO4)
436	MADD	c41, c41, a4, b4
437	NOP
438
439	MADD	c12, c12, a2, b6
440	LD	b6, 24 * SIZE(BO)
441	MADD	c22, c22, a2, b2
442	LD	b2, 13 * SIZE(BO)
443	MADD	c32, c32, a2, b3
444	LD	b3, 14 * SIZE(BO)
445	MADD	c42, c42, a2, b4
446	LD	b4, 15 * SIZE(BO)
447
448	MADD	c51, c51, a4, b7
449	pref	1, 3 * SIZE(CO5)
450	MADD	c61, c61, a4, b2
451	NOP
452	MADD	c71, c71, a4, b3
453	pref	1, 3 * SIZE(CO6)
454	MADD	c81, c81, a4, b4
455	NOP
456
457	MADD	c52, c52, a2, b7
458	LD	b7, 28 * SIZE(BO)
459	MADD	c62, c62, a2, b2
460	LD	b2, 17 * SIZE(BO)
461	MADD	c72, c72, a2, b3
462	LD	b3, 18 * SIZE(BO)
463	MADD	c82, c82, a2, b4
464	LD	b4, 19 * SIZE(BO)
465
	/* k-step 2 */
466	MADD	c11, c11, a3, b1
467	LD	a2,  5 * SIZE(AO)
468	MADD	c21, c21, a3, b2
469	NOP
470	MADD	c31, c31, a3, b3
471	pref	1, 3 * SIZE(CO7)
472	MADD	c41, c41, a3, b4
473	NOP
474
475	MADD	c12, c12, a2, b1
476	LD	b1, 32 * SIZE(BO)
477	MADD	c22, c22, a2, b2
478	LD	b2, 21 * SIZE(BO)
479	MADD	c32, c32, a2, b3
480	LD	b3, 22 * SIZE(BO)
481	MADD	c42, c42, a2, b4
482	LD	b4, 23 * SIZE(BO)
483
	/* Unlike .L12, a3 is not reloaded after this group. */
484	MADD	c51, c51, a3, b5
485	NOP
486	MADD	c61, c61, a3, b2
487	LD	a4,  6 * SIZE(AO)
488	MADD	c71, c71, a3, b3
489	NOP
490	MADD	c81, c81, a3, b4
491	NOP
492
493	MADD	c52, c52, a2, b5
494	LD	b5, 36 * SIZE(BO)
495	MADD	c62, c62, a2, b2
496	LD	b2, 25 * SIZE(BO)
497	MADD	c72, c72, a2, b3
498	LD	b3, 26 * SIZE(BO)
499	MADD	c82, c82, a2, b4
500	LD	b4, 27 * SIZE(BO)
501
	/* k-step 3 */
502	MADD	c11, c11, a4, b6
503	LD	a2,  7 * SIZE(AO)
504	MADD	c21, c21, a4, b2
505	NOP
506	MADD	c31, c31, a4, b3
507	NOP
508	MADD	c41, c41, a4, b4
509	NOP
510
511	MADD	c12, c12, a2, b6
512	LD	b6, 40 * SIZE(BO)
513	MADD	c22, c22, a2, b2
514	LD	b2, 29 * SIZE(BO)
515	MADD	c32, c32, a2, b3
516	LD	b3, 30 * SIZE(BO)
517	MADD	c42, c42, a2, b4
518	LD	b4, 31 * SIZE(BO)
519
	/* Advance AO/BO past the four k-steps just consumed. */
520	MADD	c51, c51, a4, b7
521	daddiu	BO, BO, 32 * SIZE
522	MADD	c61, c61, a4, b2
523	daddiu	AO, AO,  8 * SIZE
524	MADD	c71, c71, a4, b3
525	NOP
526	MADD	c81, c81, a4, b4
527	NOP
528
	/* The reloads below leave b2..b4 and b7 relative to the new BO, so
	   a1 and b1..b5 are ready for the remainder loop at .L16. */
529	MADD	c52, c52, a2, b7
530	LD	b7, 12 * SIZE(BO)
531	MADD	c62, c62, a2, b2
532	LD	b2,  1 * SIZE(BO)
533	MADD	c72, c72, a2, b3
534	LD	b3,  2 * SIZE(BO)
535	MADD	c82, c82, a2, b4
536	LD	b4,  3 * SIZE(BO)
537	.align 3
538
539.L15:
	/* K remainder of the 2x8 tile: L = (K or TEMP) & 3 leftover k-steps.
	   The prefetch of CO8 sits in the branch delay slot. */
540#ifndef TRMMKERNEL
541	andi	L,  K, 3
542#else
543	andi	L,  TEMP, 3
544#endif
545	NOP
546	blez	L, .L18
547	pref	1, 3 * SIZE(CO8)
548	.align	3
549
550.L16:
	/* One k-step per iteration: all 16 accumulators are updated from
	   2 A values (a1, a2) and 8 B values; b2..b4 are reloaded mid-way
	   to hold B[5..7].  AO advances by 2 values and BO by 8. */
551	MADD	c11, c11, a1, b1
552	LD	a2,  1 * SIZE(AO)
553	MADD	c21, c21, a1, b2
554	NOP
555	MADD	c31, c31, a1, b3
556	NOP
557	MADD	c41, c41, a1, b4
558	NOP
559
560	MADD	c12, c12, a2, b1
561	LD	b1,  8 * SIZE(BO)
562	MADD	c22, c22, a2, b2
563	LD	b2,  5 * SIZE(BO)
564	MADD	c32, c32, a2, b3
565	LD	b3,  6 * SIZE(BO)
566	MADD	c42, c42, a2, b4
567	LD	b4,  7 * SIZE(BO)
568
569	MADD	c51, c51, a1, b5
570	daddiu	L, L, -1
571	MADD	c61, c61, a1, b2
572	daddiu	AO, AO,  2 * SIZE
573	MADD	c71, c71, a1, b3
574	daddiu	BO, BO,  8 * SIZE
575	MADD	c81, c81, a1, b4
576	LD	a1,  0 * SIZE(AO)
577
	/* Loads from here on are relative to the advanced AO/BO and prepare
	   the next k-step; the last one is in the branch delay slot. */
578	MADD	c52, c52, a2, b5
579	LD	b5,  4 * SIZE(BO)
580	MADD	c62, c62, a2, b2
581	LD	b2,  1 * SIZE(BO)
582	MADD	c72, c72, a2, b3
583	LD	b3,  2 * SIZE(BO)
584	MADD	c82, c82, a2, b4
585	bgtz	L, .L16
586	LD	b4,  3 * SIZE(BO)
587
588.L18:
	/* Write-back of the finished 2x8 tile, then loop to .L11 for the next
	   pair of rows.  CO1..CO8 are advanced by 2 values, so the stores use
	   offsets -2 and -1 relative to the advanced pointers. */
589#ifndef TRMMKERNEL
	/* GEMM: read the existing C values into $f0..$f7 (the a/b registers
	   are free now) and combine them with ALPHA and the accumulators via
	   MADD (presumably c = C + ALPHA * c; MADD is defined in common.h). */
590	LD	$f0, 0 * SIZE(CO1)
591	daddiu	CO3,CO3, 2 * SIZE
592	LD	$f1, 1 * SIZE(CO1)
593	daddiu	CO1,CO1, 2 * SIZE
594	LD	$f2, 0 * SIZE(CO2)
595	daddiu	CO4,CO4, 2 * SIZE
596	LD	$f3, 1 * SIZE(CO2)
597	daddiu	CO2,CO2, 2 * SIZE
598
599	LD	$f4, -2 * SIZE(CO3)
600	daddiu	CO5,CO5, 2 * SIZE
601	LD	$f5, -1 * SIZE(CO3)
602	daddiu	CO6,CO6, 2 * SIZE
603	LD	$f6, -2 * SIZE(CO4)
604	daddiu	CO7,CO7, 2 * SIZE
605	LD	$f7, -1 * SIZE(CO4)
606	daddiu	I, I, -1
607
608	MADD	c11, $f0, ALPHA, c11
609	LD	$f0,-2 * SIZE(CO5)
610	MADD	c12, $f1, ALPHA, c12
611	LD	$f1,-1 * SIZE(CO5)
612	MADD	c21, $f2, ALPHA, c21
613	LD	$f2,-2 * SIZE(CO6)
614	MADD	c22, $f3, ALPHA, c22
615	LD	$f3,-1 * SIZE(CO6)
616
	/* CO8 has not been advanced yet, hence offsets 0 and 1 for it. */
617	MADD	c31, $f4, ALPHA, c31
618	LD	$f4,-2 * SIZE(CO7)
619	MADD	c32, $f5, ALPHA, c32
620	LD	$f5,-1 * SIZE(CO7)
621	MADD	c41, $f6, ALPHA, c41
622	LD	$f6, 0 * SIZE(CO8)
623	MADD	c42, $f7, ALPHA, c42
624	LD	$f7, 1 * SIZE(CO8)
625
	/* Load prefetches through BB, which then moves on by 16 values. */
626	pref	0, 0 * SIZE(BB)
627	pref	0, 8 * SIZE(BB)
628
	/* Stores are interleaved with clearing c11..c61 for the next tile. */
629	ST	c11, -2 * SIZE(CO1)
630	MTC	$0,  c11
631	ST	c12, -1 * SIZE(CO1)
632	daddiu	CO8,CO8, 2 * SIZE
633	ST	c21, -2 * SIZE(CO2)
634	MOV	c21, c11
635	ST	c22, -1 * SIZE(CO2)
636	daddiu	BB, BB, 16 * SIZE
637
638	MADD	c51, $f0, ALPHA, c51
639	ST	c31, -2 * SIZE(CO3)
640	MADD	c52, $f1, ALPHA, c52
641	ST	c32, -1 * SIZE(CO3)
642	MADD	c61, $f2, ALPHA, c61
643	ST	c41, -2 * SIZE(CO4)
644	MADD	c62, $f3, ALPHA, c62
645	ST	c42, -1 * SIZE(CO4)
646
647	MADD	c71, $f4, ALPHA, c71
648	ST	c51, -2 * SIZE(CO5)
649	MADD	c72, $f5, ALPHA, c72
650	ST	c52, -1 * SIZE(CO5)
651	MADD	c81, $f6, ALPHA, c81
652	ST	c61, -2 * SIZE(CO6)
653	MADD	c82, $f7, ALPHA, c82
654	ST	c62, -1 * SIZE(CO6)
655
656	ST	c71, -2 * SIZE(CO7)
657	MOV	c31, c11
658	ST	c72, -1 * SIZE(CO7)
659	MOV	c41, c11
660
	/* Next 2-row tile while I > 0; c61 is cleared in the delay slot. */
661	ST	c81, -2 * SIZE(CO8)
662	MOV	c51, c11
663	ST	c82, -1 * SIZE(CO8)
664	bgtz	I, .L11
665	MOV	c61, c11
666#else
	/* TRMM: C is not read; each accumulator is multiplied by ALPHA and
	   stored.  a1 is set to zero and used to clear c11..c61 for the next
	   tile. */
667	daddiu	CO4,CO4, 2 * SIZE
668	daddiu	CO5,CO5, 2 * SIZE
669	daddiu	CO6,CO6, 2 * SIZE
670	daddiu	CO7,CO7, 2 * SIZE
671
672	pref	0, 0 * SIZE(BB)
673	pref	0, 8 * SIZE(BB)
674
675	MUL	c11, ALPHA, c11
676	daddiu	CO1,CO1, 2 * SIZE
677	MUL	c12, ALPHA, c12
678	MTC	$0,  a1
679	MUL	c21, ALPHA, c21
680	daddiu	CO2,CO2, 2 * SIZE
681	MUL	c22, ALPHA, c22
682	daddiu	CO3,CO3, 2 * SIZE
683
684	ST	c11, -2 * SIZE(CO1)
685	MUL	c31, ALPHA, c31
686	ST	c12, -1 * SIZE(CO1)
687	MUL	c32, ALPHA, c32
688	ST	c21, -2 * SIZE(CO2)
689	MUL	c41, ALPHA, c41
690	ST	c22, -1 * SIZE(CO2)
691	MUL	c42, ALPHA, c42
692
693	ST	c31, -2 * SIZE(CO3)
694	MUL	c51, ALPHA, c51
695	ST	c32, -1 * SIZE(CO3)
696	MUL	c52, ALPHA, c52
697	ST	c41, -2 * SIZE(CO4)
698	MUL	c61, ALPHA, c61
699	ST	c42, -1 * SIZE(CO4)
700	MUL	c62, ALPHA, c62
701
702	ST	c51, -2 * SIZE(CO5)
703	MUL	c71, ALPHA, c71
704	ST	c52, -1 * SIZE(CO5)
705	MUL	c72, ALPHA, c72
706	ST	c61, -2 * SIZE(CO6)
707	MUL	c81, ALPHA, c81
708	ST	c62, -1 * SIZE(CO6)
709	MUL	c82, ALPHA, c82
710
711	ST	c71, -2 * SIZE(CO7)
712	MOV	c11, a1
713	ST	c72, -1 * SIZE(CO7)
714	MOV	c21, a1
715
716	daddiu	CO8,CO8, 2 * SIZE
717	daddiu	BB, BB, 16 * SIZE
718
719	ST	c81, -2 * SIZE(CO8)
720	MOV	c31, a1
721	ST	c82, -1 * SIZE(CO8)
722	MOV	c41, a1
723
724	daddiu	I, I, -1
725	MOV	c51, a1
726
	/* In these variants the K loop stopped early: move AO/BO forward by
	   the remaining K - KK - (2 or 8) k-steps (2 A values and 8 B values
	   per step). */
727#if ( defined(LEFT) &&  defined(TRANSA)) || \
728    (!defined(LEFT) && !defined(TRANSA))
729	dsubu	TEMP, K, KK
730#ifdef LEFT
731	daddiu	TEMP, TEMP, -2
732#else
733	daddiu	TEMP, TEMP, -8
734#endif
735
736	dsll	L,    TEMP, 1 + BASE_SHIFT
737	dsll	TEMP, TEMP, 3 + BASE_SHIFT
738
739	daddu	AO, AO, L
740	daddu	BO, BO, TEMP
741#endif
742
743#ifdef LEFT
	/* Left side: the offset grows by the two rows just finished. */
744	daddiu	KK, KK, 2
745#endif
746
	/* Next 2-row tile while I > 0; c61 is cleared in the delay slot. */
747	bgtz	I, .L11
748	MOV	c61, a1
749#endif
750	.align 3
751
752.L20:
	/* Odd-row case of the 8-column panel: if M & 1 is set, compute one
	   1x8 tile using the accumulators c11, c21, ..., c81.  c61/c71 are
	   cleared here (c71 in the branch delay slot), c81 further down;
	   otherwise continue at .L29. */
753	andi	I,  M, 1
754	MOV	c61, c11
755	blez	I, .L29
756	MOV	c71, c11
757
758#if defined(TRMMKERNEL)
759#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
760	move	BO,  B
761#else
	/* Skip the first KK k-steps: 1 A value and 8 B values per step. */
762	dsll	L,    KK, 0 + BASE_SHIFT
763	dsll	TEMP, KK, 3 + BASE_SHIFT
764
765	daddu	AO, AO, L
766	daddu	BO, B,  TEMP
767#endif
768
	/* a1..a4: the A values of four consecutive k-steps. */
769	LD	a1,  0 * SIZE(AO)
770	LD	a2,  1 * SIZE(AO)
771	LD	a3,  2 * SIZE(AO)
772	LD	a4,  3 * SIZE(AO)
773
774	LD	b1,  0 * SIZE(BO)
775	LD	b2,  1 * SIZE(BO)
776	LD	b3,  2 * SIZE(BO)
777	LD	b4,  3 * SIZE(BO)
778	LD	b5,  4 * SIZE(BO)
779	LD	b6,  8 * SIZE(BO)
780	LD	b7, 12 * SIZE(BO)
781
	/* TEMP = number of k-steps for this tile; L = TEMP / 4. */
782#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
783	dsubu	TEMP, K, KK
784#elif defined(LEFT)
785	daddiu	TEMP, KK, 1
786#else
787	daddiu	TEMP, KK, 8
788#endif
789	dsra	L,  TEMP, 2
790
791	blez	L, .L25
792	MOV	c81, c11
793#else
794	LD	a1,  0 * SIZE(AO)
795	LD	a2,  1 * SIZE(AO)
796	LD	a3,  2 * SIZE(AO)
797	LD	a4,  3 * SIZE(AO)
798
799	LD	b1,  0 * SIZE(B)
800	LD	b2,  1 * SIZE(B)
801	LD	b3,  2 * SIZE(B)
802	LD	b4,  3 * SIZE(B)
803	LD	b5,  4 * SIZE(B)
804	LD	b6,  8 * SIZE(B)
805	LD	b7, 12 * SIZE(B)
806
	/* L = K / 4; BO is set to B in the branch delay slot. */
807	dsra	L,  K, 2
808	MOV	c81, c11
809
810	blez	L, .L25
811	move	BO,  B
812#endif
813	.align	3
814
815.L22:
	/* 1x8 K loop, four k-steps per iteration.  a1, a2, a3 and a4 carry
	   the A value of k-steps 0, 1, 2 and 3; each is multiplied with the
	   eight B values of its step.  AO advances by 4 values and BO by 32
	   part-way through, so later offsets are relative to the new
	   pointers. */
	/* k-step 0 */
816	MADD	c11, c11, a1, b1
817	LD	b1, 16 * SIZE(BO)
818	MADD	c21, c21, a1, b2
819	LD	b2,  5 * SIZE(BO)
820	MADD	c31, c31, a1, b3
821	LD	b3,  6 * SIZE(BO)
822	MADD	c41, c41, a1, b4
823	LD	b4,  7 * SIZE(BO)
824
825	MADD	c51, c51, a1, b5
826	LD	b5, 20 * SIZE(BO)
827	MADD	c61, c61, a1, b2
828	LD	b2,  9 * SIZE(BO)
829	MADD	c71, c71, a1, b3
830	LD	b3, 10 * SIZE(BO)
831	MADD	c81, c81, a1, b4
832	LD	b4, 11 * SIZE(BO)
833
834	LD	a1,  4 * SIZE(AO)
835	daddiu	L, L, -1
836
	/* k-step 1 */
837	MADD	c11, c11, a2, b6
838	LD	b6, 24 * SIZE(BO)
839	MADD	c21, c21, a2, b2
840	LD	b2, 13 * SIZE(BO)
841	MADD	c31, c31, a2, b3
842	LD	b3, 14 * SIZE(BO)
843	MADD	c41, c41, a2, b4
844	LD	b4, 15 * SIZE(BO)
845
846	MADD	c51, c51, a2, b7
847	LD	b7, 28 * SIZE(BO)
848	MADD	c61, c61, a2, b2
849	LD	b2, 17 * SIZE(BO)
850	MADD	c71, c71, a2, b3
851	LD	b3, 18 * SIZE(BO)
852	MADD	c81, c81, a2, b4
853	LD	b4, 19 * SIZE(BO)
854
855	LD	a2,  5 * SIZE(AO)
856	daddiu	AO, AO,  4 * SIZE
857
	/* k-step 2 */
858	MADD	c11, c11, a3, b1
859	LD	b1, 32 * SIZE(BO)
860	MADD	c21, c21, a3, b2
861	LD	b2, 21 * SIZE(BO)
862	MADD	c31, c31, a3, b3
863	LD	b3, 22 * SIZE(BO)
864	MADD	c41, c41, a3, b4
865	LD	b4, 23 * SIZE(BO)
866
867	MADD	c51, c51, a3, b5
868	LD	b5, 36 * SIZE(BO)
869	MADD	c61, c61, a3, b2
870	LD	b2, 25 * SIZE(BO)
871	MADD	c71, c71, a3, b3
872	LD	b3, 26 * SIZE(BO)
873	MADD	c81, c81, a3, b4
874	LD	b4, 27 * SIZE(BO)
875
876	LD	a3,  2 * SIZE(AO)
877	daddiu	BO, BO, 32 * SIZE
878
	/* k-step 3; negative offsets address the tail of the block that BO
	   has just been moved past. */
879	MADD	c11, c11, a4, b6
880	LD	b6,  8 * SIZE(BO)
881	MADD	c21, c21, a4, b2
882	LD	b2, -3 * SIZE(BO)
883	MADD	c31, c31, a4, b3
884	LD	b3, -2 * SIZE(BO)
885	MADD	c41, c41, a4, b4
886	LD	b4, -1 * SIZE(BO)
887
888	MADD	c51, c51, a4, b7
889	LD	b7, 12 * SIZE(BO)
890	MADD	c61, c61, a4, b2
891	LD	b2,  1 * SIZE(BO)
892	MADD	c71, c71, a4, b3
893	LD	b3,  2 * SIZE(BO)
894	MADD	c81, c81, a4, b4
895	LD	b4,  3 * SIZE(BO)
	/* Loop while L > 0; a4 is reloaded in the branch delay slot. */
896	bgtz	L, .L22
897	LD	a4,  3 * SIZE(AO)
898	.align 3
899
900.L25:
	/* K remainder of the 1x8 tile: L = (K or TEMP) & 3 leftover k-steps;
	   none -> write back at .L28. */
901#ifndef TRMMKERNEL
902	andi	L,  K, 3
903#else
904	andi	L,  TEMP, 3
905#endif
906	NOP
907	blez	L, .L28
908	NOP
909	.align	3
910
911.L26:
	/* One k-step per iteration: a1 times eight B values into c11..c81.
	   AO advances by 1 value and BO by 8 in the middle of the body. */
912	MADD	c11, c11, a1, b1
913	LD	b1,  8 * SIZE(BO)
914	MADD	c21, c21, a1, b2
915	LD	b2,  5 * SIZE(BO)
916	MADD	c31, c31, a1, b3
917	LD	b3,  6 * SIZE(BO)
918	MADD	c41, c41, a1, b4
919	LD	b4,  7 * SIZE(BO)
920
	/* "MOV a2, a2" copies a register onto itself; presumably a
	   scheduling filler - confirm. */
921	daddiu	L, L, -1
922	MOV	a2, a2
923	daddiu	AO, AO,  1 * SIZE
924	daddiu	BO, BO,  8 * SIZE
925
926	MADD	c51, c51, a1, b5
927	LD	b5,  4 * SIZE(BO)
928	MADD	c61, c61, a1, b2
929	LD	b2,  1 * SIZE(BO)
930	MADD	c71, c71, a1, b3
931	LD	b3,  2 * SIZE(BO)
932	MADD	c81, c81, a1, b4
933	LD	a1,  0 * SIZE(AO)
934
	/* b4 for the next k-step is loaded in the branch delay slot. */
935	bgtz	L, .L26
936	LD	b4,  3 * SIZE(BO)
937
938.L28:
	/* Write-back of the 1x8 tile: one value per column CO1..CO8. */
939#ifndef TRMMKERNEL
	/* GEMM: load the existing C values and combine them with ALPHA and
	   the accumulators via MADD (presumably c = C + ALPHA * c; MADD is
	   defined in common.h). */
940	LD	$f0, 0 * SIZE(CO1)
941	LD	$f1, 0 * SIZE(CO2)
942	LD	$f2, 0 * SIZE(CO3)
943	LD	$f3, 0 * SIZE(CO4)
944	MADD	c11, $f0, ALPHA, c11
945	LD	$f4, 0 * SIZE(CO5)
946	MADD	c21, $f1, ALPHA, c21
947	LD	$f5, 0 * SIZE(CO6)
948	MADD	c31, $f2, ALPHA, c31
949	LD	$f6, 0 * SIZE(CO7)
950	MADD	c41, $f3, ALPHA, c41
951	LD	$f7, 0 * SIZE(CO8)
952	MADD	c51, $f4, ALPHA, c51
953	ST	c11,  0 * SIZE(CO1)
954	MADD	c61, $f5, ALPHA, c61
955	ST	c21,  0 * SIZE(CO2)
956	MADD	c71, $f6, ALPHA, c71
957	ST	c31,  0 * SIZE(CO3)
958	MADD	c81, $f7, ALPHA, c81
959	ST	c41,  0 * SIZE(CO4)
960	ST	c51,  0 * SIZE(CO5)
961	ST	c61,  0 * SIZE(CO6)
962	ST	c71,  0 * SIZE(CO7)
963	ST	c81,  0 * SIZE(CO8)
964#else
	/* TRMM: C is not read; each accumulator is scaled by ALPHA and
	   stored. */
965	MUL	c11, ALPHA, c11
966	MUL	c21, ALPHA, c21
967	MUL	c31, ALPHA, c31
968	MUL	c41, ALPHA, c41
969
970	ST	c11,  0 * SIZE(CO1)
971	MUL	c51, ALPHA, c51
972	ST	c21,  0 * SIZE(CO2)
973	MUL	c61, ALPHA, c61
974	ST	c31,  0 * SIZE(CO3)
975	MUL	c71, ALPHA, c71
976	ST	c41,  0 * SIZE(CO4)
977	MUL	c81, ALPHA, c81
978
979	ST	c51,  0 * SIZE(CO5)
980	ST	c61,  0 * SIZE(CO6)
981	ST	c71,  0 * SIZE(CO7)
982	ST	c81,  0 * SIZE(CO8)
983
	/* Move AO/BO forward by the remaining K - KK - (1 or 8) k-steps
	   (1 A value and 8 B values per step). */
984#if ( defined(LEFT) &&  defined(TRANSA)) || \
985    (!defined(LEFT) && !defined(TRANSA))
986	dsubu	TEMP, K, KK
987#ifdef LEFT
988	daddiu	TEMP, TEMP, -1
989#else
990	daddiu	TEMP, TEMP, -8
991#endif
992
993	dsll	L,    TEMP, 0 + BASE_SHIFT
994	dsll	TEMP, TEMP, 3 + BASE_SHIFT
995
996	daddu	AO, AO, L
997	daddu	BO, BO, TEMP
998#endif
999
1000#ifdef LEFT
1001	daddiu	KK, KK, 1
1002#endif
1003#endif
1004	.align 3
1005
1006.L29:
	/* End of one 8-column panel.  Right-side TRMM advances KK by the
	   8 columns just finished.  Loop to .L10 while J > 0; B is set to BO
	   in the branch delay slot, so it is updated whether or not the
	   branch is taken. */
1007#if defined(TRMMKERNEL) && !defined(LEFT)
1008	daddiu	KK, KK, 8
1009#endif
1010
1011	bgtz	J, .L10
1012	move	B, BO
1013	.align 3
1014
1015.L30:
	/* 4-column panel, executed when bit 2 of N is set; otherwise go to
	   .L50.  "move AO, A" is in the branch delay slot and therefore runs
	   on both paths. */
1016	andi	J,  N, 4
1017	blez	J, .L50
1018	move	AO, A
1019
	/* CO1..CO4 = the four columns of C (stride LDC); C moves past them.
	   c11, c21, c31 (and c41 in the delay slot below) are cleared. */
1020	move	CO1, C
1021	MTC	$0,  c11
1022	daddu	CO2, C,   LDC
1023	daddu	CO3, CO2, LDC
1024	daddu	CO4, CO3, LDC
1025	MOV	c21, c11
1026	daddu	C,   CO4, LDC
1027	MOV	c31, c11
1028
1029#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM: KK restarts from OFFSET for this panel. */
1030	move	KK, OFFSET
1031#endif
1032
	/* I = M / 2 two-row tiles; none -> odd-row handling at .L40. */
1033	dsra	I,  M, 1
1034	blez	I, .L40
1035	MOV	c41, c11
1036
1037.L31:
	/* Set-up for one 2x4 tile: position AO/BO, preload a1/a3 and
	   b1..b7, clear c12, c22, c32, c42 and compute L, the number of
	   4-step groups of the K loop. */
1038#if defined(TRMMKERNEL)
1039#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1040	move	BO,  B
1041#else
	/* Skip the first KK k-steps: 2 A values and 4 B values per step. */
1042	dsll	L,    KK, 1 + BASE_SHIFT
1043	dsll	TEMP, KK, 2 + BASE_SHIFT
1044
1045	daddu	AO, AO, L
1046	daddu	BO, B,  TEMP
1047#endif
1048
1049	LD	a1,  0 * SIZE(AO)
1050	LD	a3,  4 * SIZE(AO)
1051
1052	LD	b1,  0 * SIZE(BO)
1053	MOV	c12, c11
1054	LD	b2,  1 * SIZE(BO)
1055	MOV	c22, c11
1056	LD	b3,  2 * SIZE(BO)
1057	MOV	c32, c11
1058	LD	b4,  3 * SIZE(BO)
1059	MOV	c42, c11
1060
1061	LD	b5,  4 * SIZE(BO)
1062	LD	b6,  8 * SIZE(BO)
1063	LD	b7, 12 * SIZE(BO)
1064
	/* TEMP = number of k-steps for this tile; L = TEMP / 4. */
1065#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1066	dsubu	TEMP, K, KK
1067#elif defined(LEFT)
1068	daddiu	TEMP, KK, 2
1069#else
1070	daddiu	TEMP, KK, 4
1071#endif
1072	dsra	L,  TEMP, 2
1073	blez	L, .L35
1074	NOP
1075#else
1076	LD	a1,  0 * SIZE(AO)
1077	LD	a3,  4 * SIZE(AO)
1078
1079	LD	b1,  0 * SIZE(B)
1080	MOV	c12, c11
1081	LD	b2,  1 * SIZE(B)
1082	MOV	c22, c11
1083	LD	b3,  2 * SIZE(B)
1084	MOV	c32, c11
1085	LD	b4,  3 * SIZE(B)
1086	MOV	c42, c11
1087
	/* L = K / 4; BO is set to B in the branch delay slot. */
1088	LD	b5,  4 * SIZE(B)
1089	dsra	L,  K, 2
1090	LD	b6,  8 * SIZE(B)
1091	LD	b7, 12 * SIZE(B)
1092
1093	blez	L, .L35
1094	move	BO,  B
1095#endif
1096	.align	3
1097
1098.L32:
	/* 2x4 K loop, four k-steps per iteration (AO advances by 8 values,
	   BO by 16).  a1/a3 carry the row-1 value, a2 the row-2 value.
	   b1, b5, b6 and b7 carry element 0 of k-steps 0, 1, 2 and 3;
	   b2..b4 are reloaded for every k-step. */
	/* k-step 0 */
1099	MADD	c11, c11, a1, b1
1100	LD	a2,  1 * SIZE(AO)
1101	MADD	c21, c21, a1, b2
1102	daddiu	L, L, -1
1103	MADD	c31, c31, a1, b3
1104	NOP
1105	MADD	c41, c41, a1, b4
1106	LD	a1,  2 * SIZE(AO)
1107
1108	MADD	c12, c12, a2, b1
1109	LD	b1, 16 * SIZE(BO)
1110	MADD	c22, c22, a2, b2
1111	LD	b2,  5 * SIZE(BO)
1112	MADD	c32, c32, a2, b3
1113	LD	b3,  6 * SIZE(BO)
1114	MADD	c42, c42, a2, b4
1115	LD	b4,  7 * SIZE(BO)
1116
	/* k-step 1 */
1117	MADD	c11, c11, a1, b5
1118	LD	a2,  3 * SIZE(AO)
1119	MADD	c21, c21, a1, b2
1120	NOP
1121	MADD	c31, c31, a1, b3
1122	NOP
1123	MADD	c41, c41, a1, b4
1124	LD	a1,  8 * SIZE(AO)
1125
1126	MADD	c12, c12, a2, b5
1127	LD	b5, 20 * SIZE(BO)
1128	MADD	c22, c22, a2, b2
1129	LD	b2,  9 * SIZE(BO)
1130	MADD	c32, c32, a2, b3
1131	LD	b3, 10 * SIZE(BO)
1132	MADD	c42, c42, a2, b4
1133	LD	b4, 11 * SIZE(BO)
1134
	/* k-step 2 */
1135	MADD	c11, c11, a3, b6
1136	LD	a2,  5 * SIZE(AO)
1137	MADD	c21, c21, a3, b2
1138	NOP
1139	MADD	c31, c31, a3, b3
1140	NOP
1141	MADD	c41, c41, a3, b4
1142	LD	a3,  6 * SIZE(AO)
1143
1144	MADD	c12, c12, a2, b6
1145	LD	b6, 24 * SIZE(BO)
1146	MADD	c22, c22, a2, b2
1147	LD	b2, 13 * SIZE(BO)
1148	MADD	c32, c32, a2, b3
1149	LD	b3, 14 * SIZE(BO)
1150	MADD	c42, c42, a2, b4
1151	LD	b4, 15 * SIZE(BO)
1152
	/* k-step 3; AO and BO are advanced here, so the loads that follow
	   are relative to the new pointers. */
1153	MADD	c11, c11, a3, b7
1154	LD	a2,  7 * SIZE(AO)
1155	MADD	c21, c21, a3, b2
1156	daddiu	AO, AO,  8 * SIZE
1157	MADD	c31, c31, a3, b3
1158	daddiu	BO, BO, 16 * SIZE
1159	MADD	c41, c41, a3, b4
1160	LD	a3,  4 * SIZE(AO)
1161
1162	MADD	c12, c12, a2, b7
1163	LD	b7, 12 * SIZE(BO)
1164	MADD	c22, c22, a2, b2
1165	LD	b2,  1 * SIZE(BO)
1166	MADD	c32, c32, a2, b3
1167	LD	b3,  2 * SIZE(BO)
1168	MADD	c42, c42, a2, b4
1169	NOP
1170
	/* Loop while L > 0; b4 is reloaded in the branch delay slot. */
1171	bgtz	L, .L32
1172	LD	b4,  3 * SIZE(BO)
1173	.align 3
1174
1175.L35:
	/* K remainder of the 2x4 tile: L = (K or TEMP) & 3 leftover k-steps;
	   none -> write back at .L38. */
1176#ifndef TRMMKERNEL
1177	andi	L,  K, 3
1178#else
1179	andi	L,  TEMP, 3
1180#endif
1181	NOP
1182	blez	L, .L38
1183	NOP
1184	.align	3
1185
1186.L36:
	/* One k-step per iteration: a1/a2 times b1..b4 into the eight
	   accumulators.  AO advances by 2 values inside the body; BO
	   advances by 4 in the branch delay slot, which is why the B
	   reloads use offsets 4..7. */
1187	MADD	c11, c11, a1, b1
1188	LD	a2,  1 * SIZE(AO)
1189	MADD	c21, c21, a1, b2
1190	daddiu	L, L, -1
1191	MADD	c31, c31, a1, b3
1192	daddiu	AO, AO,  2 * SIZE
1193	MADD	c41, c41, a1, b4
1194	LD	a1,  0 * SIZE(AO)
1195
1196	MADD	c12, c12, a2, b1
1197	LD	b1,  4 * SIZE(BO)
1198	MADD	c22, c22, a2, b2
1199	LD	b2,  5 * SIZE(BO)
1200	MADD	c32, c32, a2, b3
1201	LD	b3,  6 * SIZE(BO)
1202	MADD	c42, c42, a2, b4
1203	LD	b4,  7 * SIZE(BO)
1204
1205	bgtz	L, .L36
1206	daddiu	BO, BO,  4 * SIZE
1207
1208.L38:
	/* Write-back of the 2x4 tile, then loop to .L31 for the next pair of
	   rows.  CO1..CO4 are advanced by 2 values, so stores use offsets
	   -2/-1; c11, c21, c31 and c41 are cleared for the next tile. */
1209#ifndef TRMMKERNEL
	/* GEMM: load the existing C values and combine them with ALPHA and
	   the accumulators via MADD (presumably c = C + ALPHA * c; MADD is
	   defined in common.h). */
1210	LD	$f0, 0 * SIZE(CO1)
1211	daddiu	CO3,CO3, 2 * SIZE
1212	LD	$f1, 1 * SIZE(CO1)
1213	daddiu	CO1,CO1, 2 * SIZE
1214	LD	$f2, 0 * SIZE(CO2)
1215	daddiu	CO4,CO4, 2 * SIZE
1216	LD	$f3, 1 * SIZE(CO2)
1217	daddiu	CO2,CO2, 2 * SIZE
1218
1219	LD	$f4, -2 * SIZE(CO3)
1220	MADD	c11, $f0, ALPHA, c11
1221	LD	$f5, -1 * SIZE(CO3)
1222	MADD	c12, $f1, ALPHA, c12
1223	LD	$f6, -2 * SIZE(CO4)
1224	MADD	c21, $f2, ALPHA, c21
1225	LD	$f7, -1 * SIZE(CO4)
1226	MADD	c22, $f3, ALPHA, c22
1227
1228	MADD	c31, $f4, ALPHA, c31
1229	ST	c11, -2 * SIZE(CO1)
1230	MADD	c32, $f5, ALPHA, c32
1231	ST	c12, -1 * SIZE(CO1)
1232	MADD	c41, $f6, ALPHA, c41
1233	ST	c21, -2 * SIZE(CO2)
1234	MADD	c42, $f7, ALPHA, c42
1235	ST	c22, -1 * SIZE(CO2)
1236
1237	ST	c31, -2 * SIZE(CO3)
1238	MTC	$0,  c11
1239	ST	c32, -1 * SIZE(CO3)
1240	daddiu	I, I, -1
1241	ST	c41, -2 * SIZE(CO4)
1242	MOV	c21, c11
1243	ST	c42, -1 * SIZE(CO4)
1244	MOV	c31, c11
1245#else
	/* TRMM: C is not read; each accumulator is scaled by ALPHA and
	   stored. */
1246	MUL	c11, ALPHA, c11
1247	daddiu	CO3,CO3, 2 * SIZE
1248	MUL	c12, ALPHA, c12
1249	daddiu	CO1,CO1, 2 * SIZE
1250	MUL	c21, ALPHA, c21
1251	daddiu	CO4,CO4, 2 * SIZE
1252	MUL	c22, ALPHA, c22
1253	daddiu	CO2,CO2, 2 * SIZE
1254
1255	ST	c11, -2 * SIZE(CO1)
1256	MUL	c31, ALPHA, c31
1257	ST	c12, -1 * SIZE(CO1)
1258	MUL	c32, ALPHA, c32
1259	ST	c21, -2 * SIZE(CO2)
1260	MUL	c41, ALPHA, c41
1261	ST	c22, -1 * SIZE(CO2)
1262	MUL	c42, ALPHA, c42
1263
1264	ST	c31, -2 * SIZE(CO3)
1265	MTC	$0,  c11
1266	ST	c32, -1 * SIZE(CO3)
1267	daddiu	I, I, -1
1268	ST	c41, -2 * SIZE(CO4)
1269	MOV	c21, c11
1270	ST	c42, -1 * SIZE(CO4)
1271	MOV	c31, c11
1272
	/* Move AO/BO forward by the remaining K - KK - (2 or 4) k-steps
	   (2 A values and 4 B values per step). */
1273#if ( defined(LEFT) &&  defined(TRANSA)) || \
1274    (!defined(LEFT) && !defined(TRANSA))
1275	dsubu	TEMP, K, KK
1276#ifdef LEFT
1277	daddiu	TEMP, TEMP, -2
1278#else
1279	daddiu	TEMP, TEMP, -4
1280#endif
1281
1282	dsll	L,    TEMP, 1 + BASE_SHIFT
1283	dsll	TEMP, TEMP, 2 + BASE_SHIFT
1284
1285	daddu	AO, AO, L
1286	daddu	BO, BO, TEMP
1287#endif
1288
1289#ifdef LEFT
1290	daddiu	KK, KK, 2
1291#endif
1292#endif
1293
	/* Next 2-row tile while I > 0; c41 is cleared in the delay slot. */
1294	bgtz	I, .L31
1295	MOV	c41, c11
1296	.align 3
1297
1298.L40:
	/* Odd-row case of the 4-column panel: if M & 1 is set, compute one
	   1x4 tile in c11, c21, c31, c41; otherwise go to .L49.  c61, c71
	   and c81 are cleared as well, although the 1x4 code that follows
	   only accumulates into c11..c41. */
1299	andi	I,  M, 1
1300	blez	I, .L49
1301	MOV	c61, c11
1302
1303#if defined(TRMMKERNEL)
1304#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1305	move	BO,  B
1306#else
	/* Skip the first KK k-steps: 1 A value and 4 B values per step. */
1307	dsll	L,    KK, 0 + BASE_SHIFT
1308	dsll	TEMP, KK, 2 + BASE_SHIFT
1309
1310	daddu	AO, AO, L
1311	daddu	BO, B,  TEMP
1312#endif
1313
1314	LD	a1,  0 * SIZE(AO)
1315	MOV	c71, c11
1316	LD	a2,  1 * SIZE(AO)
1317	MOV	c81, c11
1318
1319	LD	b1,  0 * SIZE(BO)
1320	LD	b2,  1 * SIZE(BO)
1321	LD	b3,  2 * SIZE(BO)
1322	LD	b4,  3 * SIZE(BO)
1323	LD	b5,  4 * SIZE(BO)
1324	LD	b6,  8 * SIZE(BO)
1325	LD	b7, 12 * SIZE(BO)
1326
	/* TEMP = number of k-steps for this tile; L = TEMP / 4. */
1327#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1328	dsubu	TEMP, K, KK
1329#elif defined(LEFT)
1330	daddiu	TEMP, KK, 1
1331#else
1332	daddiu	TEMP, KK, 4
1333#endif
1334	dsra	L,  TEMP, 2
1335
1336	blez	L, .L45
1337	NOP
1338#else
1339	LD	a1,  0 * SIZE(AO)
1340	MOV	c71, c11
1341	LD	a2,  1 * SIZE(AO)
1342	MOV	c81, c11
1343
1344	LD	b1,  0 * SIZE(B)
1345	LD	b2,  1 * SIZE(B)
1346	LD	b3,  2 * SIZE(B)
1347	LD	b4,  3 * SIZE(B)
1348	LD	b5,  4 * SIZE(B)
1349	LD	b6,  8 * SIZE(B)
1350	LD	b7, 12 * SIZE(B)
1351
	/* L = K / 4; BO is set to B in the branch delay slot. */
1352	dsra	L,  K, 2
1353
1354	blez	L, .L45
1355	move	BO,  B
1356#endif
1357	.align	3
1358
1359.L42:
	/* 1x4 K loop, four k-steps per iteration (AO advances by 4 values,
	   BO by 16).  a1 carries the A value of k-step 0; a2 is reloaded to
	   carry those of k-steps 1, 2 and 3 in turn.  b1, b5, b6 and b7
	   carry element 0 of k-steps 0, 1, 2 and 3. */
	/* k-step 0 */
1360	MADD	c11, c11, a1, b1
1361	LD	b1, 16 * SIZE(BO)
1362	MADD	c21, c21, a1, b2
1363	LD	b2,  5 * SIZE(BO)
1364	MADD	c31, c31, a1, b3
1365	LD	b3,  6 * SIZE(BO)
1366	MADD	c41, c41, a1, b4
1367	LD	b4,  7 * SIZE(BO)
1368
1369	LD	a1,  4 * SIZE(AO)
1370	daddiu	L, L, -1
1371
	/* k-step 1 */
1372	MADD	c11, c11, a2, b5
1373	LD	b5, 20 * SIZE(BO)
1374	MADD	c21, c21, a2, b2
1375	LD	b2,  9 * SIZE(BO)
1376	MADD	c31, c31, a2, b3
1377	LD	b3, 10 * SIZE(BO)
1378	MADD	c41, c41, a2, b4
1379	LD	b4, 11 * SIZE(BO)
1380
1381	LD	a2,  2 * SIZE(AO)
1382	daddiu	AO, AO,  4 * SIZE
1383
	/* k-step 2 */
1384	MADD	c11, c11, a2, b6
1385	LD	b6, 24 * SIZE(BO)
1386	MADD	c21, c21, a2, b2
1387	LD	b2, 13 * SIZE(BO)
1388	MADD	c31, c31, a2, b3
1389	LD	b3, 14 * SIZE(BO)
1390	MADD	c41, c41, a2, b4
1391	LD	b4, 15 * SIZE(BO)
1392
	/* AO has already moved on, so -1 addresses the A value of k-step 3. */
1393	LD	a2, -1 * SIZE(AO)
1394	daddiu	BO, BO, 16 * SIZE
1395
	/* k-step 3; B offsets are relative to the advanced BO. */
1396	MADD	c11, c11, a2, b7
1397	LD	b7, 12 * SIZE(BO)
1398	MADD	c21, c21, a2, b2
1399	LD	b2,  1 * SIZE(BO)
1400	MADD	c31, c31, a2, b3
1401	LD	b3,  2 * SIZE(BO)
1402	MADD	c41, c41, a2, b4
1403	LD	b4,  3 * SIZE(BO)
1404
	/* Loop while L > 0; a2 is reloaded in the branch delay slot. */
1405	bgtz	L, .L42
1406	LD	a2,  1 * SIZE(AO)
1407	.align 3
1408
1409.L45:
	/* K remainder of the 1x4 tile: L = (K or TEMP) & 3 leftover k-steps;
	   none -> write back at .L48. */
1410#ifndef TRMMKERNEL
1411	andi	L,  K, 3
1412#else
1413	andi	L,  TEMP, 3
1414#endif
1415	NOP
1416	blez	L, .L48
1417	NOP
1418	.align	3
1419
1420.L46:
	/* One k-step per iteration: a1 times b1..b4 into c11..c41.  The
	   next a1 is loaded from offset 1 before AO is advanced by 1 value;
	   BO advances by 4 in the branch delay slot. */
1421	MADD	c11, c11, a1, b1
1422	LD	b1,  4 * SIZE(BO)
1423	MADD	c21, c21, a1, b2
1424	LD	b2,  5 * SIZE(BO)
1425	MADD	c31, c31, a1, b3
1426	LD	b3,  6 * SIZE(BO)
1427	MADD	c41, c41, a1, b4
1428	LD	a1,  1 * SIZE(AO)
1429
1430	LD	b4,  7 * SIZE(BO)
1431	daddiu	L, L, -1
1432
	/* "MOV a2, a2" copies a register onto itself; presumably a
	   scheduling filler - confirm. */
1433	daddiu	AO, AO,  1 * SIZE
1434	MOV	a2, a2
1435	bgtz	L, .L46
1436	daddiu	BO, BO,  4 * SIZE
1437
1438
1439.L48:
	/* Write-back of the 1x4 tile: one value per column CO1..CO4. */
1440#ifndef TRMMKERNEL
	/* GEMM: load the existing C values and combine them with ALPHA and
	   the accumulators via MADD (presumably c = C + ALPHA * c; MADD is
	   defined in common.h). */
1441	LD	$f0, 0 * SIZE(CO1)
1442	LD	$f1, 0 * SIZE(CO2)
1443	LD	$f2, 0 * SIZE(CO3)
1444	LD	$f3, 0 * SIZE(CO4)
1445
1446	MADD	c11, $f0, ALPHA, c11
1447	MADD	c21, $f1, ALPHA, c21
1448	MADD	c31, $f2, ALPHA, c31
1449	MADD	c41, $f3, ALPHA, c41
1450
1451	ST	c11,  0 * SIZE(CO1)
1452	ST	c21,  0 * SIZE(CO2)
1453	ST	c31,  0 * SIZE(CO3)
1454	ST	c41,  0 * SIZE(CO4)
1455#else
	/* TRMM: C is not read; each accumulator is scaled by ALPHA and
	   stored. */
1456	MUL	c11, ALPHA, c11
1457	MUL	c21, ALPHA, c21
1458	MUL	c31, ALPHA, c31
1459	MUL	c41, ALPHA, c41
1460
1461	ST	c11,  0 * SIZE(CO1)
1462	ST	c21,  0 * SIZE(CO2)
1463	ST	c31,  0 * SIZE(CO3)
1464	ST	c41,  0 * SIZE(CO4)
1465
	/* Move AO/BO forward by the remaining K - KK - (1 or 4) k-steps
	   (1 A value and 4 B values per step). */
1466#if ( defined(LEFT) &&  defined(TRANSA)) || \
1467    (!defined(LEFT) && !defined(TRANSA))
1468	dsubu	TEMP, K, KK
1469#ifdef LEFT
1470	daddiu	TEMP, TEMP, -1
1471#else
1472	daddiu	TEMP, TEMP, -4
1473#endif
1474
1475	dsll	L,    TEMP, 0 + BASE_SHIFT
1476	dsll	TEMP, TEMP, 2 + BASE_SHIFT
1477
1478	daddu	AO, AO, L
1479	daddu	BO, BO, TEMP
1480#endif
1481
1482#ifdef LEFT
1483	daddiu	KK, KK, 1
1484#endif
1485#endif
1486	.align 3
1487
1488.L49:
	/* End of the 4-column panel: right-side TRMM advances KK by the
	   4 columns just finished, and B is set to BO (the position reached
	   in the B panel). */
1489#if defined(TRMMKERNEL) && !defined(LEFT)
1490	daddiu	KK, KK, 4
1491#endif
1492	move	B, BO
1493	.align 3
1494
1495.L50:
	/* 2-column panel, executed when bit 1 of N is set; otherwise go to
	   .L70.  The instruction that follows the branch ("move AO, A") is
	   its delay slot and therefore runs on both paths. */
1496	andi	J,  N, 2
1497	blez	J, .L70
1498
1499	move	AO, A
	/* CO1/CO2 = the two columns of C for this panel. */
1500	move	CO1, C
1501	daddu	CO2, C,   LDC
1502
1503#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM: KK restarts from OFFSET for this panel. */
1504	move	KK, OFFSET
1505#endif
1506
	/* I = M / 2 two-row tiles; none -> .L60.  C is moved past the panel
	   in the branch delay slot. */
1507	dsra	I,  M, 1
1508	blez	I, .L60
1509	daddu	C,   CO2, LDC
1510
1511.L51:
	/* Set-up for one 2x2 tile: position AO/BO, clear c11, c21, c12 and
	   c22, preload a1, a2, a5 (the register shared with b8) and b1, b2,
	   b3, b5, b6, b7, and compute L, the number of 4-step groups of the
	   K loop. */
1512#if defined(TRMMKERNEL)
1513#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1514	move	BO,  B
1515#else
	/* Skip the first KK k-steps: 2 A values and 2 B values per step. */
1516	dsll	L,    KK, 1 + BASE_SHIFT
1517	dsll	TEMP, KK, 1 + BASE_SHIFT
1518
1519	daddu	AO, AO, L
1520	daddu	BO, B,  TEMP
1521#endif
1522
1523	LD	a1,  0 * SIZE(AO)
1524	MTC	$0,  c11
1525	LD	a2,  1 * SIZE(AO)
1526	MOV	c21, c11
1527	LD	a5,  4 * SIZE(AO)
1528
1529	LD	b1,  0 * SIZE(BO)
1530	MOV	c12, c11
1531	LD	b2,  1 * SIZE(BO)
1532	MOV	c22, c11
1533	LD	b3,  2 * SIZE(BO)
1534	LD	b5,  4 * SIZE(BO)
1535	LD	b6,  8 * SIZE(BO)
1536	LD	b7, 12 * SIZE(BO)
1537
	/* TEMP = number of k-steps for this tile (both non-subtracting
	   variants add 2, since the tile is 2 rows by 2 columns);
	   L = TEMP / 4. */
1538#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1539	dsubu	TEMP, K, KK
1540#elif defined(LEFT)
1541	daddiu	TEMP, KK, 2
1542#else
1543	daddiu	TEMP, KK, 2
1544#endif
1545	dsra	L,  TEMP, 2
1546	blez	L, .L55
1547	NOP
1548#else
1549	LD	a1,  0 * SIZE(AO)
1550	MTC	$0,  c11
1551	LD	a2,  1 * SIZE(AO)
1552	MOV	c21, c11
1553	LD	a5,  4 * SIZE(AO)
1554
1555	LD	b1,  0 * SIZE(B)
1556	MOV	c12, c11
1557	LD	b2,  1 * SIZE(B)
1558	MOV	c22, c11
1559	LD	b3,  2 * SIZE(B)
1560	LD	b5,  4 * SIZE(B)
	/* L = K / 4; BO is set to B in the branch delay slot. */
1561	dsra	L,  K, 2
1562	LD	b6,  8 * SIZE(B)
1563	LD	b7, 12 * SIZE(B)
1564
1565	blez	L, .L55
1566	move	BO,  B
1567#endif
1568	.align	3
1569
/*
 * .L52 -- main loop for the accumulators c11, c21, c12, c22, running L
 * times. Each pass issues sixteen MADDs in four groups and advances
 * both AO and BO by 8 * SIZE.
 * Loads are interleaved with the MADDs and fetch operands for later
 * groups; the loads from 8 * SIZE, 9 * SIZE, 10 * SIZE and 12 * SIZE
 * land at offsets 0, 1, 2 and 4 once the pointers have advanced, which
 * matches the registers .L51 preloads. Instruction order therefore
 * matters.
 * MADD is defined outside this excerpt; presumably d = s + x * y.
 */
1570.L52:
	/* group 1: a1, a2 with b1, b2 */
1571	MADD	c11, c11, a1, b1
1572	LD	a3,  2 * SIZE(AO)
1573	MADD	c21, c21, a1, b2
1574	LD	b4,  3 * SIZE(BO)
1575	MADD	c12, c12, a2, b1
1576	LD	a4,  3 * SIZE(AO)
1577	MADD	c22, c22, a2, b2
1578	LD	b1,  8 * SIZE(BO)
1579
	/* group 2: a3, a4 with b3, b4 */
1580	MADD	c11, c11, a3, b3
1581	LD	a1,  8 * SIZE(AO)
1582	MADD	c21, c21, a3, b4
1583	LD	b2,  5 * SIZE(BO)
1584	MADD	c12, c12, a4, b3
1585	LD	a2,  5 * SIZE(AO)
1586	MADD	c22, c22, a4, b4
1587	LD	b3,  6 * SIZE(BO)
1588
	/* group 3: a5, a2 with b5, b2 */
1589	MADD	c11, c11, a5, b5
1590	LD	a3,  6 * SIZE(AO)
1591	MADD	c21, c21, a5, b2
1592	LD	b4,  7 * SIZE(BO)
1593	MADD	c12, c12, a2, b5
1594	LD	a4,  7 * SIZE(AO)
1595	MADD	c22, c22, a2, b2
1596	LD	b5, 12 * SIZE(BO)
1597
	/* group 4: a3, a4 with b3, b4 */
1598	MADD	c11, c11, a3, b3
1599	LD	a5, 12 * SIZE(AO)
1600	MADD	c21, c21, a3, b4
1601	LD	b2,  9 * SIZE(BO)
1602	MADD	c12, c12, a4, b3
1603	LD	a2,  9 * SIZE(AO)
1604	MADD	c22, c22, a4, b4
1605	LD	b3, 10 * SIZE(BO)
1606
1607	daddiu	AO, AO,  8 * SIZE
1608	daddiu	L, L, -1
1609	bgtz	L, .L52
	/* NOTE(review): presumably the branch delay slot (requires
	   .set noreorder, which is not visible in this excerpt). */
1610	daddiu	BO, BO,  8 * SIZE
1611	.align 3
1612
/*
 * .L55 -- remainder count for the c11/c21/c12/c22 accumulators:
 * L = K & 3 (TRMM builds use TEMP & 3). When L <= 0 the loop at .L56
 * is skipped and control goes to .L58.
 */
1613.L55:
1614#ifndef TRMMKERNEL
1615	andi	L,  K, 3
1616#else
1617	andi	L,  TEMP, 3
1618#endif
1619	NOP
1620	blez	L, .L58
1621	NOP
1622	.align	3
1623
/*
 * .L56 -- remainder loop, one step per pass: four MADDs with a1, a2
 * and b1, b2, reloading a2, a1, b1, b2 along the way; AO and BO each
 * advance by 2 * SIZE.
 */
1624.L56:
1625	MADD	c11, c11, a1, b1
1626	LD	a2,  1 * SIZE(AO)
1627	MADD	c21, c21, a1, b2
1628	LD	a1,  2 * SIZE(AO)
1629
1630	MADD	c12, c12, a2, b1
1631	LD	b1,  2 * SIZE(BO)
1632	MADD	c22, c22, a2, b2
1633	LD	b2,  3 * SIZE(BO)
1634
1635	daddiu	L, L, -1
1636	daddiu	AO, AO,  2 * SIZE
1637	bgtz	L, .L56
	/* NOTE(review): presumably the branch delay slot. */
1638	daddiu	BO, BO,  2 * SIZE
1639
/*
 * .L58 -- store c11/c12 to CO1[0..1] and c21/c22 to CO2[0..1], then
 * decrement I and loop back to .L51 while I > 0.
 * CO1 and CO2 are advanced by 2 * SIZE before the stores, which is why
 * the stores use offsets -2 and -1.
 *   !TRMMKERNEL: the old destination values are loaded into $f0..$f3
 *                and merged via MADD with ALPHA (presumably
 *                C + ALPHA * acc; MADD is defined elsewhere).
 *   TRMMKERNEL:  the accumulators are MUL-ed with ALPHA and stored
 *                without reading the destination; AO, BO and KK are
 *                then adjusted.
 */
1640.L58:
1641#ifndef TRMMKERNEL
1642	LD	$f0, 0 * SIZE(CO1)
1643	daddiu	I, I, -1
1644	LD	$f1, 1 * SIZE(CO1)
1645	daddiu	CO1,CO1, 2 * SIZE
1646	LD	$f2, 0 * SIZE(CO2)
1647	NOP
1648	LD	$f3, 1 * SIZE(CO2)
1649	daddiu	CO2,CO2, 2 * SIZE
1650
1651	MADD	c11, $f0, ALPHA, c11
1652	MADD	c12, $f1, ALPHA, c12
1653	MADD	c21, $f2, ALPHA, c21
1654	MADD	c22, $f3, ALPHA, c22
1655
1656	ST	c11, -2 * SIZE(CO1)
1657	ST	c12, -1 * SIZE(CO1)
1658	ST	c21, -2 * SIZE(CO2)
1659	NOP
1660	bgtz	I, .L51
	/* NOTE(review): presumably the branch delay slot, so this store
	   happens on both the taken and the fall-through path. */
1661	ST	c22, -1 * SIZE(CO2)
1662#else
1663	daddiu	I, I, -1
1664
1665	daddiu	CO1,CO1, 2 * SIZE
1666	daddiu	CO2,CO2, 2 * SIZE
1667
1668	MUL	c11, ALPHA, c11
1669	MUL	c12, ALPHA, c12
1670	MUL	c21, ALPHA, c21
1671	MUL	c22, ALPHA, c22
1672
1673	ST	c11, -2 * SIZE(CO1)
1674	ST	c12, -1 * SIZE(CO1)
1675	ST	c21, -2 * SIZE(CO2)
1676	ST	c22, -1 * SIZE(CO2)
1677
	/* TEMP = K - KK - 2 in both the LEFT and non-LEFT case; AO and BO
	   each advance by TEMP << (1 + BASE_SHIFT). */
1678#if ( defined(LEFT) &&  defined(TRANSA)) || \
1679    (!defined(LEFT) && !defined(TRANSA))
1680	dsubu	TEMP, K, KK
1681#ifdef LEFT
1682	daddiu	TEMP, TEMP, -2
1683#else
1684	daddiu	TEMP, TEMP, -2
1685#endif
1686
1687	dsll	L,    TEMP, 1 + BASE_SHIFT
1688	dsll	TEMP, TEMP, 1 + BASE_SHIFT
1689
1690	daddu	AO, AO, L
1691	daddu	BO, BO, TEMP
1692#endif
1693
1694#ifdef LEFT
1695	daddiu	KK, KK, 2
1696#endif
1697
1698	bgtz	I, .L51
1699	NOP
1700#endif
1701	.align 3
1702
/*
 * .L60 -- handled when bit 0 of M is set (M & 1) inside the N & 2
 * section; otherwise control goes to .L69.
 * Seeds c11 via MTC $0 and copies it to c21/c31/c41 (presumably zeroing
 * them; MTC and MOV are macros defined elsewhere), preloads a1..a4 from
 * AO and b1..b7 from the B panel, computes L = (trip count) >> 2 and
 * branches to .L65 when L <= 0.
 *   TRMMKERNEL:  BO is B or B + (KK << (1 + BASE_SHIFT)); in the latter
 *                case AO advances by KK << BASE_SHIFT. The trip count
 *                is TEMP = K - KK, KK + 1 (LEFT) or KK + 2.
 *   !TRMMKERNEL: the trip count is K and BO is set to B.
 * NOTE(review): b5, b6 and b7 are loaded here but not read by .L62,
 * .L66 or .L68.
 */
1703.L60:
1704	andi	I,  M, 1
1705	blez	I, .L69
1706	NOP
1707
1708#if defined(TRMMKERNEL)
1709#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1710	move	BO,  B
1711#else
1712	dsll	L,    KK, 0 + BASE_SHIFT
1713	dsll	TEMP, KK, 1 + BASE_SHIFT
1714
1715	daddu	AO, AO, L
1716	daddu	BO, B,  TEMP
1717#endif
1718
1719	LD	a1,  0 * SIZE(AO)
1720	MTC	$0,  c11
1721	LD	a2,  1 * SIZE(AO)
1722	MOV	c21, c11
1723	LD	a3,  2 * SIZE(AO)
1724	MOV	c31, c11
1725	LD	a4,  3 * SIZE(AO)
1726	MOV	c41, c11
1727
1728	LD	b1,  0 * SIZE(BO)
1729	LD	b2,  1 * SIZE(BO)
1730	LD	b3,  2 * SIZE(BO)
1731	LD	b4,  3 * SIZE(BO)
1732	LD	b5,  4 * SIZE(BO)
1733	LD	b6,  8 * SIZE(BO)
1734	LD	b7, 12 * SIZE(BO)
1735
1736#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1737	dsubu	TEMP, K, KK
1738#elif defined(LEFT)
1739	daddiu	TEMP, KK, 1
1740#else
1741	daddiu	TEMP, KK, 2
1742#endif
1743	dsra	L,  TEMP, 2
1744	blez	L, .L65
1745	NOP
1746#else
1747	dsra	L,  K, 2
1748	LD	a1,  0 * SIZE(AO)
1749	MTC	$0,  c11
1750	LD	a2,  1 * SIZE(AO)
1751	MOV	c21, c11
1752	LD	a3,  2 * SIZE(AO)
1753	MOV	c31, c11
1754	LD	a4,  3 * SIZE(AO)
1755	MOV	c41, c11
1756
1757	LD	b1,  0 * SIZE(B)
1758	LD	b2,  1 * SIZE(B)
1759	LD	b3,  2 * SIZE(B)
1760	LD	b4,  3 * SIZE(B)
1761	LD	b5,  4 * SIZE(B)
1762	LD	b6,  8 * SIZE(B)
1763	LD	b7, 12 * SIZE(B)
1764
1765	blez	L, .L65
	/* NOTE(review): presumably the delay slot, so BO = B on both the
	   taken and the fall-through path. */
1766	move	BO,  B
1767#endif
1768	.align	3
1769
/*
 * .L62 -- main loop for the M & 1 row of the N & 2 section, running L
 * times. Each pass issues eight MADDs: a1 and a3 feed c11/c21, a2 and
 * a4 feed c31/c41, so c31/c41 act as a second pair of partial sums
 * that .L68 later adds into c11/c21.
 * Per pass AO advances by 4 * SIZE and BO by 8 * SIZE; the loads at
 * 4..7 * SIZE(AO) and 8..11 * SIZE(BO) prefetch the operands for the
 * next pass, so instruction order matters.
 */
1770.L62:
1771	MADD	c11, c11, a1, b1
1772	LD	b1,  4 * SIZE(BO)
1773	MADD	c21, c21, a1, b2
1774	LD	b2,  5 * SIZE(BO)
1775	MADD	c31, c31, a2, b3
1776	LD	b3,  6 * SIZE(BO)
1777	MADD	c41, c41, a2, b4
1778	LD	b4,  7 * SIZE(BO)
1779
1780	LD	a1,  4 * SIZE(AO)
1781	LD	a2,  5 * SIZE(AO)
1782
1783	MADD	c11, c11, a3, b1
1784	LD	b1,  8 * SIZE(BO)
1785	MADD	c21, c21, a3, b2
1786	LD	b2,  9 * SIZE(BO)
1787	MADD	c31, c31, a4, b3
1788	LD	b3, 10 * SIZE(BO)
1789	MADD	c41, c41, a4, b4
1790	LD	b4, 11 * SIZE(BO)
1791
1792	LD	a3,  6 * SIZE(AO)
1793	LD	a4,  7 * SIZE(AO)
1794
1795	daddiu	L, L, -1
1796	daddiu	AO, AO,  4 * SIZE
1797
1798	bgtz	L, .L62
	/* NOTE(review): presumably the branch delay slot (requires
	   .set noreorder, which is not visible in this excerpt). */
1799	daddiu	BO, BO,  8 * SIZE
1800	.align 3
1801
/*
 * .L65 -- remainder count for the M & 1 row of the N & 2 section:
 * L = K & 3 (TRMM builds use TEMP & 3). When L <= 0 the loop at .L66
 * is skipped and control goes to .L68.
 */
1802.L65:
1803#ifndef TRMMKERNEL
1804	andi	L,  K, 3
1805#else
1806	andi	L,  TEMP, 3
1807#endif
1808	NOP
1809	blez	L, .L68
1810	NOP
1811	.align	3
1812
/*
 * .L66 -- remainder loop, one step per pass: a1 is combined with b1
 * and b2 into c11 and c21; b1, b2 and a1 are reloaded, AO advances by
 * 1 * SIZE and BO by 2 * SIZE.
 */
1813.L66:
1814	MADD	c11, c11, a1, b1
1815	LD	b1,  2 * SIZE(BO)
1816	MADD	c21, c21, a1, b2
1817	LD	b2,  3 * SIZE(BO)
1818
1819	LD	a1,  1 * SIZE(AO)
1820	daddiu	L, L, -1
1821
1822	daddiu	AO, AO,  1 * SIZE
1823	bgtz	L, .L66
	/* NOTE(review): presumably the branch delay slot. */
1824	daddiu	BO, BO,  2 * SIZE
1825
1826
/*
 * .L68 -- fold the partial sums (c11 += c31, c21 += c41) and store the
 * two results to element 0 of CO1 and CO2.
 *   !TRMMKERNEL: the old destination values are loaded into $f0/$f1 and
 *                merged via MADD with ALPHA (presumably
 *                C + ALPHA * acc; MADD is defined elsewhere).
 *   TRMMKERNEL:  the sums are MUL-ed with ALPHA and stored without
 *                reading the destination; AO, BO and KK are then
 *                adjusted.
 */
1827.L68:
1828#ifndef TRMMKERNEL
1829	LD	$f0, 0 * SIZE(CO1)
1830	LD	$f1, 0 * SIZE(CO2)
1831
1832	ADD	c11, c11, c31
1833	ADD	c21, c21, c41
1834
1835	MADD	c11, $f0, ALPHA, c11
1836	MADD	c21, $f1, ALPHA, c21
1837
1838	ST	c11,  0 * SIZE(CO1)
1839	ST	c21,  0 * SIZE(CO2)
1840#else
1841	ADD	c11, c11, c31
1842	ADD	c21, c21, c41
1843
1844	MUL	c11, ALPHA, c11
1845	MUL	c21, ALPHA, c21
1846
1847	ST	c11,  0 * SIZE(CO1)
1848	ST	c21,  0 * SIZE(CO2)
1849
	/* TEMP = K - KK - (1 if LEFT, else 2); AO advances by
	   TEMP << BASE_SHIFT and BO by TEMP << (1 + BASE_SHIFT). */
1850#if ( defined(LEFT) &&  defined(TRANSA)) || \
1851    (!defined(LEFT) && !defined(TRANSA))
1852	dsubu	TEMP, K, KK
1853#ifdef LEFT
1854	daddiu	TEMP, TEMP, -1
1855#else
1856	daddiu	TEMP, TEMP, -2
1857#endif
1858
1859	dsll	L,    TEMP, 0 + BASE_SHIFT
1860	dsll	TEMP, TEMP, 1 + BASE_SHIFT
1861
1862	daddu	AO, AO, L
1863	daddu	BO, BO, TEMP
1864#endif
1865
1866#ifdef LEFT
1867	daddiu	KK, KK, 1
1868#endif
1869#endif
1870	.align 3
1871
/*
 * .L69 -- end of the N & 2 section: in TRMM builds without LEFT, KK is
 * increased by 2; in every build B is then set to the current value of
 * BO.
 */
1872.L69:
1873#if defined(TRMMKERNEL) && !defined(LEFT)
1874	daddiu	KK, KK, 2
1875#endif
1876	move	B, BO
1877	.align 3
1878
/*
 * .L70 -- entry of the pass taken when bit 0 of N is set (N & 1).
 * If the bit is clear, control goes to the epilogue at .L999.
 * Otherwise AO is reset to A and CO1 = C; TRMM LEFT builds reload KK
 * from OFFSET. I = M >> 1 is the trip count for .L71; when it is not
 * positive, control goes to .L80. C is advanced to CO1 + LDC.
 */
1879.L70:
1880	andi	J,  N, 1
1881	blez	J, .L999
1882
	/* NOTE(review): if delay slots are in effect (.set noreorder is not
	   visible here) this move also runs when the branch is taken. */
1883	move	AO, A
1884	move	CO1, C
1885
1886#if defined(TRMMKERNEL) &&  defined(LEFT)
1887	move	KK, OFFSET
1888#endif
1889
1890	dsra	I,  M, 1
1891	blez	I, .L80
	/* NOTE(review): presumably the delay slot of the blez above. */
1892	daddu	C,   CO1, LDC
1893
/*
 * .L71 -- set-up for one pass of the I loop in the N & 1 section.
 * Seeds c11 via MTC $0 and copies it to c21/c12/c22 (presumably zeroing
 * them; MTC and MOV are macros defined elsewhere), preloads a1, a2, a5
 * and b1, b2, b3, b5, b6, b7, computes L = (trip count) >> 2 and
 * branches to .L75 when L <= 0.
 *   TRMMKERNEL:  BO is B or B + (KK << BASE_SHIFT); in the latter case
 *                AO advances by KK << (1 + BASE_SHIFT). The trip count
 *                is TEMP = K - KK, KK + 2 (LEFT) or KK + 1.
 *   !TRMMKERNEL: the trip count is K and BO is set to B.
 * NOTE(review): .L72 and .L76 reload a1, a2 and b1 before every use and
 * read no other a or b register, so the values preloaded here appear
 * unused by them.
 */
1894.L71:
1895#if defined(TRMMKERNEL)
1896#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1897	move	BO,  B
1898#else
1899	dsll	L,    KK, 1 + BASE_SHIFT
1900	dsll	TEMP, KK, 0 + BASE_SHIFT
1901
1902	daddu	AO, AO, L
1903	daddu	BO, B,  TEMP
1904#endif
1905
1906	LD	a1,  0 * SIZE(AO)
1907	MTC	$0,  c11
1908	LD	a2,  1 * SIZE(AO)
1909	MOV	c21, c11
1910	LD	a5,  4 * SIZE(AO)
1911
1912	LD	b1,  0 * SIZE(BO)
1913	MOV	c12, c11
1914	LD	b2,  1 * SIZE(BO)
1915	MOV	c22, c11
1916	LD	b3,  2 * SIZE(BO)
1917	LD	b5,  4 * SIZE(BO)
1918	LD	b6,  8 * SIZE(BO)
1919	LD	b7, 12 * SIZE(BO)
1920
1921#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1922	dsubu	TEMP, K, KK
1923#elif defined(LEFT)
1924	daddiu	TEMP, KK, 2
1925#else
1926	daddiu	TEMP, KK, 1
1927#endif
1928	dsra	L,  TEMP, 2
1929	blez	L, .L75
1930	NOP
1931#else
1932	LD	a1,  0 * SIZE(AO)
1933	MTC	$0,  c11
1934	LD	a2,  1 * SIZE(AO)
1935	MOV	c21, c11
1936	LD	a5,  4 * SIZE(AO)
1937
1938	LD	b1,  0 * SIZE(B)
1939	MOV	c12, c11
1940	LD	b2,  1 * SIZE(B)
1941	MOV	c22, c11
1942	LD	b3,  2 * SIZE(B)
1943	LD	b5,  4 * SIZE(B)
1944	dsra	L,  K, 2
1945	LD	b6,  8 * SIZE(B)
1946	LD	b7, 12 * SIZE(B)
1947
1948	blez	L, .L75
	/* NOTE(review): presumably the delay slot, so BO = B on both the
	   taken and the fall-through path. */
1949	move	BO,  B
1950#endif
1951	.align	3
1952
/*
 * .L72 -- main loop of the N & 1 section for two rows, running L
 * times. Each pass performs four steps; every step loads a1, a2 from
 * AO and b1 from BO immediately before use and issues two MADDs into
 * c11 and c12. Per pass AO advances by 8 * SIZE and BO by 4 * SIZE.
 * Only c11 and c12 are written here; c21 and c22 are left untouched.
 */
1953.L72:
1954	LD	a1,  0 * SIZE(AO)
1955	LD	a2,  1 * SIZE(AO)
1956	LD	b1,  0 * SIZE(BO)
1957
1958	MADD	c11, c11, a1, b1
1959	MADD	c12, c12, a2, b1
1960
1961	LD	a1,  2 * SIZE(AO)
1962	LD	a2,  3 * SIZE(AO)
1963	LD	b1,  1 * SIZE(BO)
1964
1965	MADD	c11, c11, a1, b1
1966	MADD	c12, c12, a2, b1
1967
1968	LD	a1,  4 * SIZE(AO)
1969	LD	a2,  5 * SIZE(AO)
1970	LD	b1,  2 * SIZE(BO)
1971
1972	MADD	c11, c11, a1, b1
1973	MADD	c12, c12, a2, b1
1974
1975	LD	a1,  6 * SIZE(AO)
1976	LD	a2,  7 * SIZE(AO)
1977	LD	b1,  3 * SIZE(BO)
1978
1979	MADD	c11, c11, a1, b1
1980	MADD	c12, c12, a2, b1
1981
1982	daddiu	L, L, -1
1983	daddiu	AO, AO,  8 * SIZE
1984	bgtz	L, .L72
	/* NOTE(review): presumably the branch delay slot (requires
	   .set noreorder, which is not visible in this excerpt). */
1985	daddiu	BO, BO,  4 * SIZE
1986	.align 3
1987
/*
 * .L75 -- remainder count for the two-row tile of the N & 1 section:
 * L = K & 3 (TRMM builds use TEMP & 3). When L <= 0 the loop at .L76
 * is skipped and control goes to .L78.
 */
1988.L75:
1989#ifndef TRMMKERNEL
1990	andi	L,  K, 3
1991#else
1992	andi	L,  TEMP, 3
1993#endif
1994	NOP
1995	blez	L, .L78
1996	NOP
1997	.align	3
1998
/*
 * .L76 -- remainder loop, one step per pass: loads a1, a2 and b1,
 * issues two MADDs into c11 and c12, advances AO by 2 * SIZE and BO by
 * 1 * SIZE.
 */
1999.L76:
2000	LD	a1,  0 * SIZE(AO)
2001	LD	a2,  1 * SIZE(AO)
2002	LD	b1,  0 * SIZE(BO)
2003
2004	MADD	c11, c11, a1, b1
2005	MADD	c12, c12, a2, b1
2006
2007	daddiu	L, L, -1
2008	daddiu	AO, AO,  2 * SIZE
2009	bgtz	L, .L76
	/* NOTE(review): presumably the branch delay slot. */
2010	daddiu	BO, BO,  1 * SIZE
2011
/*
 * .L78 -- add c21 into c11 and c22 into c12, store the results to
 * CO1[0..1], decrement I and loop back to .L71 while I > 0.
 * CO1 is advanced by 2 * SIZE before the stores, which is why the
 * stores use offsets -2 and -1.
 * NOTE(review): c21 and c22 are seeded in .L71 and not written by .L72
 * or .L76, so the ADDs look like they add the seed value -- confirm.
 *   !TRMMKERNEL: the old destination values are loaded into $f0/$f1 and
 *                merged via MADD with ALPHA (presumably
 *                C + ALPHA * acc; MADD is defined elsewhere).
 *   TRMMKERNEL:  the sums are MUL-ed with ALPHA and stored without
 *                reading the destination; AO, BO and KK are then
 *                adjusted.
 */
2012.L78:
2013#ifndef TRMMKERNEL
2014	LD	$f0, 0 * SIZE(CO1)
2015	daddiu	I, I, -1
2016	LD	$f1, 1 * SIZE(CO1)
2017	daddiu	CO1,CO1, 2 * SIZE
2018
2019	ADD	c11, c11, c21
2020	ADD	c12, c12, c22
2021
2022	MADD	c11, $f0, ALPHA, c11
2023	MADD	c12, $f1, ALPHA, c12
2024
2025	ST	c11, -2 * SIZE(CO1)
2026	bgtz	I, .L71
	/* NOTE(review): presumably the branch delay slot, so this store
	   happens on both the taken and the fall-through path. */
2027	ST	c12, -1 * SIZE(CO1)
2028#else
2029	ADD	c11, c11, c21
2030	daddiu	I, I, -1
2031	ADD	c12, c12, c22
2032	daddiu	CO1,CO1, 2 * SIZE
2033
2034	MUL	c11, ALPHA, c11
2035	MUL	c12, ALPHA, c12
2036
2037	ST	c11, -2 * SIZE(CO1)
2038	ST	c12, -1 * SIZE(CO1)
2039
	/* TEMP = K - KK - (2 if LEFT, else 1); AO advances by
	   TEMP << (1 + BASE_SHIFT) and BO by TEMP << BASE_SHIFT. */
2040#if ( defined(LEFT) &&  defined(TRANSA)) || \
2041    (!defined(LEFT) && !defined(TRANSA))
2042	dsubu	TEMP, K, KK
2043#ifdef LEFT
2044	daddiu	TEMP, TEMP, -2
2045#else
2046	daddiu	TEMP, TEMP, -1
2047#endif
2048
2049	dsll	L,    TEMP, 1 + BASE_SHIFT
2050	dsll	TEMP, TEMP, 0 + BASE_SHIFT
2051
2052	daddu	AO, AO, L
2053	daddu	BO, BO, TEMP
2054#endif
2055
2056#ifdef LEFT
2057	daddiu	KK, KK, 2
2058#endif
2059
2060	bgtz	I, .L71
2061	NOP
2062#endif
2063	.align 3
2064
/*
 * .L80 -- handled when bit 0 of M is set (M & 1) inside the N & 1
 * section; otherwise control goes to .L89.
 * Seeds c11 via MTC $0 and copies it to c21 (presumably zeroing both;
 * MTC and MOV are macros defined elsewhere), preloads a1..a4 and
 * b1..b7, computes L = (trip count) >> 2 and branches to .L85 when
 * L <= 0.
 *   TRMMKERNEL:  BO is B or B + (KK << BASE_SHIFT); in the latter case
 *                AO advances by the same amount. The trip count is
 *                TEMP = K - KK or KK + 1.
 *   !TRMMKERNEL: the trip count is K and BO is set to B.
 * NOTE(review): .L82 and .L86 reload a1 and b1 before every use and
 * read no other a or b register, so the values preloaded here appear
 * unused by them.
 */
2065.L80:
2066	andi	I,  M, 1
2067	blez	I, .L89
2068	NOP
2069
2070#if defined(TRMMKERNEL)
2071#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2072	move	BO,  B
2073#else
2074	dsll	L,    KK, 0 + BASE_SHIFT
2075	dsll	TEMP, KK, 0 + BASE_SHIFT
2076
2077	daddu	AO, AO, L
2078	daddu	BO, B,  TEMP
2079#endif
2080
2081	LD	a1,  0 * SIZE(AO)
2082	MTC	$0,  c11
2083	LD	a2,  1 * SIZE(AO)
2084	MOV	c21, c11
2085	LD	a3,  2 * SIZE(AO)
2086	LD	a4,  3 * SIZE(AO)
2087
2088	LD	b1,  0 * SIZE(BO)
2089	LD	b2,  1 * SIZE(BO)
2090	LD	b3,  2 * SIZE(BO)
2091	LD	b4,  3 * SIZE(BO)
2092	LD	b5,  4 * SIZE(BO)
2093	LD	b6,  8 * SIZE(BO)
2094	LD	b7, 12 * SIZE(BO)
2095
	/* Both the LEFT and the non-LEFT fallback use KK + 1 here. */
2096#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2097	dsubu	TEMP, K, KK
2098#elif defined(LEFT)
2099	daddiu	TEMP, KK, 1
2100#else
2101	daddiu	TEMP, KK, 1
2102#endif
2103	dsra	L,  TEMP, 2
2104	blez	L, .L85
2105	NOP
2106#else
2107	LD	a1,  0 * SIZE(AO)
2108	MTC	$0,  c11
2109	LD	a2,  1 * SIZE(AO)
2110	MOV	c21, c11
2111	LD	a3,  2 * SIZE(AO)
2112	LD	a4,  3 * SIZE(AO)
2113
2114	LD	b1,  0 * SIZE(B)
2115	LD	b2,  1 * SIZE(B)
2116	LD	b3,  2 * SIZE(B)
2117	LD	b4,  3 * SIZE(B)
2118	LD	b5,  4 * SIZE(B)
2119	LD	b6,  8 * SIZE(B)
2120	LD	b7, 12 * SIZE(B)
2121
2122	dsra	L,  K, 2
2123	blez	L, .L85
	/* NOTE(review): presumably the delay slot, so BO = B on both the
	   taken and the fall-through path. */
2124	move	BO,  B
2125#endif
2126	.align	3
2127
/*
 * .L82 -- main loop for the single remaining element (M & 1, N & 1),
 * running L times. Each pass performs four steps; every step loads a1
 * and b1 immediately before use. Steps at offsets 0 and 2 accumulate
 * into c11, steps at offsets 1 and 3 into c21, so c21 is a second
 * partial sum that .L88 adds into c11. AO and BO each advance by
 * 4 * SIZE per pass.
 */
2128.L82:
2129	LD	a1,  0 * SIZE(AO)
2130	LD	b1,  0 * SIZE(BO)
2131
2132	MADD	c11, c11, a1, b1
2133
2134	LD	a1,  1 * SIZE(AO)
2135	LD	b1,  1 * SIZE(BO)
2136
2137	MADD	c21, c21, a1, b1
2138
2139	LD	a1,  2 * SIZE(AO)
2140	LD	b1,  2 * SIZE(BO)
2141
2142	MADD	c11, c11, a1, b1
2143
2144	LD	a1,  3 * SIZE(AO)
2145	LD	b1,  3 * SIZE(BO)
2146
2147	MADD	c21, c21, a1, b1
2148
2149	daddiu	L, L, -1
2150	daddiu	AO, AO,  4 * SIZE
2151	bgtz	L, .L82
	/* NOTE(review): presumably the branch delay slot (requires
	   .set noreorder, which is not visible in this excerpt). */
2152	daddiu	BO, BO,  4 * SIZE
2153	.align 3
2154
/*
 * .L85 -- remainder count for the single-element case:
 * L = K & 3 (TRMM builds use TEMP & 3). When L <= 0 the loop at .L86
 * is skipped and control goes to .L88.
 */
2155.L85:
2156#ifndef TRMMKERNEL
2157	andi	L,  K, 3
2158#else
2159	andi	L,  TEMP, 3
2160#endif
2161	NOP
2162	blez	L, .L88
2163	NOP
2164	.align	3
2165
/*
 * .L86 -- remainder loop, one step per pass: loads a1 and b1, issues
 * one MADD into c11, advances AO and BO by 1 * SIZE each.
 */
2166.L86:
2167	LD	a1,  0 * SIZE(AO)
2168	LD	b1,  0 * SIZE(BO)
2169
2170	MADD	c11, c11, a1, b1
2171
2172	daddiu	L, L, -1
2173	daddiu	AO, AO,  1 * SIZE
2174	bgtz	L, .L86
	/* NOTE(review): presumably the branch delay slot. */
2175	daddiu	BO, BO,  1 * SIZE
2176
2177
/*
 * .L88 -- fold the second partial sum (c11 += c21) and store the
 * result to element 0 of CO1.
 *   !TRMMKERNEL: the old destination value is loaded into $f0 and
 *                merged via MADD with ALPHA (presumably
 *                C + ALPHA * acc; MADD is defined elsewhere).
 *   TRMMKERNEL:  the sum is MUL-ed with ALPHA and stored without
 *                reading the destination. Unlike .L48/.L58/.L68/.L78,
 *                this path does not adjust AO, BO or KK.
 */
2178.L88:
2179#ifndef TRMMKERNEL
2180	LD	$f0, 0 * SIZE(CO1)
2181
2182	ADD	c11, c11, c21
2183	MADD	c11, $f0, ALPHA, c11
2184
2185	ST	c11,  0 * SIZE(CO1)
2186#else
2187	ADD	c11, c11, c21
2188	MUL	c11, ALPHA, c11
2189
2190	ST	c11,  0 * SIZE(CO1)
2191#endif
2192	.align 3
2193
/*
 * .L89 -- end of the N & 1 section: in TRMM builds without LEFT, KK is
 * increased by 1; in every build B is then set to the current value of
 * BO. Control falls through to the epilogue at .L999.
 */
2194.L89:
2195#if defined(TRMMKERNEL) && !defined(LEFT)
2196	daddiu	KK, KK, 1
2197#endif
2198	move	B, BO
2199	.align 3
2200
2201
/*
 * .L999 -- common exit: reload the registers kept in the 160-byte
 * stack frame, release the frame and return through $31.
 * Frame slots read here (offsets from $sp):
 *     0 ..  48   $16 .. $22
 *    56 ..  88   $f24 .. $f28
 *    96 .. 112   $23 .. $25          (TRMMKERNEL only)
 *   120 .. 144   $f20 .. $f23        (only when __64BIT__ is undefined)
 * The matching stores are in the prologue, outside this excerpt.
 * LDARG and EPILOGUE are macros defined elsewhere.
 */
2202.L999:
2203	LDARG	$16,   0($sp)
2204	LDARG	$17,   8($sp)
2205	LDARG	$18,  16($sp)
2206	LDARG	$19,  24($sp)
2207	LDARG	$20,  32($sp)
2208	LDARG	$21,  40($sp)
2209	LDARG	$22,  48($sp)
2210
2211	ldc1	$f24, 56($sp)
2212	ldc1	$f25, 64($sp)
2213	ldc1	$f26, 72($sp)
2214	ldc1	$f27, 80($sp)
2215	ldc1	$f28, 88($sp)
2216
	/* $23/$24/$25 are OFFSET/KK/TEMP, which exist only in TRMM builds. */
2217#if defined(TRMMKERNEL)
2218	LDARG	$23,  96($sp)
2219	LDARG	$24, 104($sp)
2220	LDARG	$25, 112($sp)
2221#endif
2222
2223#ifndef __64BIT__
2224	ldc1	$f20,120($sp)
2225	ldc1	$f21,128($sp)
2226	ldc1	$f22,136($sp)
2227	ldc1	$f23,144($sp)
2228#endif
2229
2230	j	$31
	/* NOTE(review): presumably the jump delay slot, so the frame is
	   released as part of the return (requires .set noreorder, which is
	   not visible in this excerpt). */
2231	daddiu	$sp, $sp, 160
2232
2233	EPILOGUE
2234