/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Integer register roles.
 * M, N, K, A, B and C are read without ever being loaded, i.e. they are
 * live on entry.  LDC is loaded from the caller's stack in the prologue.
 */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8

/* Running read pointers into the A and B operands. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts down from M, J counts column panels, L counts k-steps. */
#define I	$2
#define J	$3
#define L	$7

/* Write pointers, one per C column handled at a time (CO(n+1) = CO(n) + LDC). */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

/* Extra state for the TRMM build; OFFSET is loaded from the stack in the prologue. */
#if defined(TRMMKERNEL)
#define OFFSET	$18
#define KK	$19
#define TEMP	$20
#endif

/* Values loaded from AO.  a2 always holds an odd-indexed element of A. */
#define a1	$f0
#define a2	$f1
#define a3	$f28
#define a4	$f29

/* Values loaded from BO; b1..b8 are reused to hold C elements during write-back. */
#define b1	$f2
#define b2	$f3
#define b3	$f4
#define b4	$f5
#define b5	$f6
#define b6	$f7
#define b7	$f8
#define b8	$f9

/* Alias of b8; not referenced in this file. */
#define a5	b8

/*
 * Accumulators.  cX1 collects products of the even-indexed A element,
 * cX2 products of the odd-indexed A element; X = 1..8 follows B[0..7]
 * of the current k-step.  Note c32 skips $f15/$f16, which hold alpha.
 */
#define c11	$f10
#define c12	$f11
#define c21	$f12
#define c22	$f13
#define c31	$f14
#define c32	$f17
#define c41	$f18
#define c42	$f19
#define c51	$f20
#define c52	$f21
#define c61	$f22
#define c62	$f23
#define c71	$f24
#define c72	$f25
#define c81	$f26
#define c82	$f27

/* The two scalar factors applied during write-back (live on entry). */
#define ALPHA_R	$f15
#define ALPHA_I	$f16

/*
 * Per-variant choice of the fused operation used for each of the four
 * partial products of one A pair with one B pair:
 *   MADD1: A[even] * B[even]      MADD3: A[even] * B[odd]
 *   MADD2: A[odd]  * B[even]      MADD4: A[odd]  * B[odd]
 * MADD and NMSUB come from common.h (not visible here); presumably
 * d = c + a*b and d = c - a*b respectively -- confirm there.
 * Exactly one of the variant macros (NN, NT, ..., CC) is expected to be
 * defined by the build; if none is, MADD1..MADD4 stay undefined.
 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  MADD
#define MADD4	  NMSUB
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  NMSUB
#define MADD4	  MADD
#endif

#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  MADD
#define MADD4	  MADD
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  NMSUB
#define MADD4	  NMSUB
#endif

115	PROLOGUE
116
117	LDARG	LDC,   0($sp)
118	daddiu	$sp, $sp, -128
119
120	SDARG	$16,   0($sp)
121	SDARG	$17,   8($sp)
122	sdc1	$f24, 16($sp)
123	sdc1	$f25, 24($sp)
124	sdc1	$f26, 32($sp)
125	sdc1	$f27, 40($sp)
126	sdc1	$f28, 48($sp)
127	sdc1	$f29, 56($sp)
128
129#if defined(TRMMKERNEL)
130	SDARG	$18,  64($sp)
131	SDARG	$19,  72($sp)
132	SDARG	$20,  80($sp)
133
134	LDARG	OFFSET, 128 + 8($sp)
135#endif
136
137#ifndef __64BIT__
138	sdc1	$f20, 88($sp)
139	sdc1	$f21, 96($sp)
140	sdc1	$f22,104($sp)
141	sdc1	$f23,112($sp)
142#endif
143
144	dsll	LDC, LDC, ZBASE_SHIFT
145
146#if defined(TRMMKERNEL) && !defined(LEFT)
147	neg	KK, OFFSET
148#endif
149
150	dsra	J,  N, 2
151	blez	J, .L20
152	nop
153
/*
 * .L10: start of one four-column panel.
 * Sets CO1..CO4 to four consecutive columns of C (LDC apart), advances C
 * past them, rewinds AO to A, decrements J, sets I = M and clears the
 * accumulators c11..c61 (integer moves and FP clears are interleaved).
 */
.L10:
	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	move	AO, A
	daddu	CO3, CO2, LDC
	daddiu	J, J, -1
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	MOV	c31, c11
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	MOV	c41, c11
	MOV	c51, c11
	move	I,  M
	daddu	C,   CO4, LDC

	/* Nothing to do for this panel when M <= 0; c61 is cleared in the delay slot. */
	blez	I, .L19
	MOV	c61, c11

/*
 * .L11: set-up for one step of the I loop in the four-column panel.
 * Selects BO (and, for some TRMM variants, skips AO/BO forward by KK
 * k-steps), preloads a1, a3 and b1..b7, clears the accumulators not yet
 * cleared, and computes L = (k-count) / 4 where k-count is K, or TEMP in
 * the TRMM build.  With L <= 0 control goes straight to the remainder
 * code at .L15; otherwise the first A[0] x B[0..3] products are issued
 * here so that .L12/.L13 can start in the middle of a k-step.
 */
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* One k-step is 1 << ZBASE_SHIFT bytes of A and 4 << ZBASE_SHIFT bytes of B. */
	dsll	L,    KK,  ZBASE_SHIFT
	dsll	TEMP, KK, 2 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11

	MOV	c32, c11
	LD	b3,  2 * SIZE(BO)
	MOV	c42, c11

	LD	b4,  3 * SIZE(BO)
	MOV	c52, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c62, c11

	LD	b6,  8 * SIZE(BO)
	MOV	c72, c11
	LD	b7, 12 * SIZE(BO)
	MOV	c82, c11

	/* TEMP = number of k-steps to run for this tile in the TRMM build. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L,  TEMP, 2

	blez	L, .L15
	NOP
#else
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(B)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11

	dsra	L,  K, 2
	MOV	c32, c11
	LD	b3,  2 * SIZE(B)
	MOV	c42, c11

	LD	b4,  3 * SIZE(B)
	MOV	c52, c11
	LD	b5,  4 * SIZE(B)
	MOV	c62, c11

	LD	b6,  8 * SIZE(B)
	MOV	c72, c11
	LD	b7, 12 * SIZE(B)
	MOV	c82, c11

	/* BO = B is set in the delay slot, so it happens on both paths. */
	blez	L, .L15
	move	BO,  B
#endif

	/* Pipeline start: A[0] x B[0..3].  A single group of four goes to .L13. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	blez	L, .L13
	MADD3	c41, c41, a1, b4
	.align	3

/*
 * .L12: steady-state loop of the four-column panel, four k-steps per trip.
 * Entered with the A[0] x B[0..3] products of the first k-step already
 * issued.  Each trip consumes AO[0..7] and BO[0..31], advances AO by
 * 8 * SIZE and BO by 32 * SIZE, reloads the look-ahead registers
 * (a1, a3, b1, b5, b6, b7) and issues the A[0] x B[0..3] products of the
 * next trip before branching.  Offsets in the group comments are relative
 * to AO/BO at loop entry unless marked "new".
 */
.L12:
	/* A[1] x B[0..3] */
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* A[0] x B[4..7] */
	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	/* A[1] x B[4..7] */
	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* A[2] x B[8..11] */
	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	/* A[3] x B[8..11] */
	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* A[2] x B[12..15] */
	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	/* A[3] x B[12..15] */
	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* A[4] x B[16..19] */
	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	/* A[5] x B[16..19] */
	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	/* A[4] x B[20..23] */
	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	/* A[5] x B[20..23] */
	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* A[6] x B[24..27]; the trip counter is decremented here. */
	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	daddiu	L, L, -1

	/* A[7] x B[24..27] */
	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* A[6] x B[28..31]; AO and BO move on to the next group of four k-steps. */
	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	/* A[7] x B[28..31]; loads below use the new BO. */
	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)

	/* new A[0] x new B[0..3]: head of the next trip (or of .L13). */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	bgtz	L, .L12
	MADD3	c41, c41, a1, b4
	.align 3

/*
 * .L13: final group of four k-steps for the four-column panel.
 * Same instruction sequence as one trip of .L12 but without the counter
 * decrement and without issuing the head products of a following trip;
 * it falls through to .L15 with a1, b1..b5 loaded from the advanced
 * AO/BO.  Note that, like .L12, it issues look-ahead loads from offsets
 * beyond the data it multiplies (e.g. 36 and 40 * SIZE from BO).
 */
.L13:
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* Advance past the four k-steps just consumed. */
	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	/* Reload b2..b4 and b7 from the new BO for the remainder loop. */
	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)
	.align 3

/*
 * .L15/.L16: remainder of the k loop for the four-column panel.
 * L = (k-count) & 3, where k-count is K or, in the TRMM build, TEMP.
 * .L16 performs one full k-step per trip: AO[0..1] x BO[0..7], then
 * AO += 2 * SIZE and BO += 8 * SIZE, reloading a1 and b1..b5 for the
 * next trip.  Expects a1, b1..b5 already loaded on entry.
 */
.L15:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L18
	NOP
	.align	3

.L16:
	/* A[0] x B[0..3] */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	NOP

	/* A[1] x B[0..3]; b1 is reloaded with the next k-step's B[0]. */
	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* A[0] x B[4..7]; counter and pointers advance between the FP ops. */
	MADD1	c51, c51, a1, b5
	daddiu	L, L, -1
	MADD3	c61, c61, a1, b2
	daddiu	AO, AO,  2 * SIZE
	MADD1	c71, c71, a1, b3
	daddiu	BO, BO,  8 * SIZE
	MADD3	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	/* A[1] x B[4..7]; the last reload sits in the branch delay slot. */
	MADD2	c52, c52, a2, b5
	LD	b5,  4 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	bgtz	L, .L16
	LD	b4,  3 * SIZE(BO)

/*
 * .L18: write-back for the four-column panel, then loop control.
 * For each column n the partial sums are first folded pairwise:
 *     p = c(2n-1)1 + c(2n)2        q = c(2n-1)2 + c(2n)1
 * then combined with ALPHA_R / ALPHA_I and stored as a pair at COn
 * (MADD/NMSUB are common.h macros; assuming d = c + a*b and
 * d = c - a*b):
 *     out[0] = base[0] + ALPHA_R * p - ALPHA_I * q
 *     out[1] = base[1] + ALPHA_R * q + ALPHA_I * p
 * where base is the existing contents of C in the GEMM build and absent
 * (plain MUL) in the TRMM build.  Each COn advances by 2 * SIZE, I is
 * decremented, and c11..c61 are re-zeroed for the next pass of .L11.
 */
.L18:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21
	LD	b3,  0 * SIZE(CO2)
	ADD	c31, c31, c42
	LD	b4,  1 * SIZE(CO2)
	ADD	c32, c32, c41

	LD	b5,  0 * SIZE(CO3)
	ADD	c51, c51, c62
	LD	b6,  1 * SIZE(CO3)
	ADD	c52, c52, c61
	LD	b7,  0 * SIZE(CO4)
	ADD	c71, c71, c82
	LD	b8,  1 * SIZE(CO4)
	ADD	c72, c72, c81

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MADD	b2, b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MADD	b3, b3, ALPHA_R, c31
	daddiu	CO3,CO3, 2 * SIZE
	MADD	b4, b4, ALPHA_R, c32
	daddiu	CO4,CO4, 2 * SIZE

	MADD	b5, b5, ALPHA_R, c51
	daddiu	I, I, -1
	MADD	b6, b6, ALPHA_R, c52
	NOP
	MADD	b7, b7, ALPHA_R, c71
	NOP
	MADD	b8, b8, ALPHA_R, c72
	NOP

	/* c11 is cleared only after its last use as a source (MADD b2 below). */
	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	/* Stores use negative offsets because COn were already advanced. */
	ST	b1, -2 * SIZE(CO1)
	NMSUB	b5, b5, ALPHA_I, c52
	ST	b2, -1 * SIZE(CO1)
	MADD	b6, b6, ALPHA_I, c51
	ST	b3, -2 * SIZE(CO2)
	NMSUB	b7, b7, ALPHA_I, c72
	ST	b4, -1 * SIZE(CO2)
	MADD	b8, b8, ALPHA_I, c71

	ST	b5, -2 * SIZE(CO3)
	MOV	c21, c11
	ST	b6, -1 * SIZE(CO3)
	MOV	c31, c11
	ST	b7, -2 * SIZE(CO4)
	MOV	c41, c11
	ST	b8, -1 * SIZE(CO4)
	MOV	c51, c11

#else

	ADD	c11, c11, c22
	daddiu	CO1,CO1, 2 * SIZE
	ADD	c12, c12, c21
	daddiu	CO2,CO2, 2 * SIZE
	ADD	c31, c31, c42
	daddiu	CO3,CO3, 2 * SIZE
	ADD	c32, c32, c41
	daddiu	CO4,CO4, 2 * SIZE

	ADD	c51, c51, c62
	daddiu	I, I, -1
	ADD	c52, c52, c61
	ADD	c71, c71, c82
	ADD	c72, c72, c81

	/* TRMM build: C is not read; results start from ALPHA_R * sum. */
	MUL	b1, ALPHA_R, c11
	MUL	b2, ALPHA_R, c12
	MUL	b3, ALPHA_R, c31
	MUL	b4, ALPHA_R, c32

	MUL	b5, ALPHA_R, c51
	MUL	b6, ALPHA_R, c52
	MUL	b7, ALPHA_R, c71
	MUL	b8, ALPHA_R, c72

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	NMSUB	b5, b5, ALPHA_I, c52
	ST	b2, -1 * SIZE(CO1)
	MADD	b6, b6, ALPHA_I, c51
	ST	b3, -2 * SIZE(CO2)
	NMSUB	b7, b7, ALPHA_I, c72
	ST	b4, -1 * SIZE(CO2)
	MADD	b8, b8, ALPHA_I, c71

	ST	b5, -2 * SIZE(CO3)
	MOV	c21, c11
	ST	b6, -1 * SIZE(CO3)
	MOV	c31, c11
	ST	b7, -2 * SIZE(CO4)
	MOV	c41, c11
	ST	b8, -1 * SIZE(CO4)
	MOV	c51, c11

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the K - KK - (1 or 4) k-steps of A and B that were not consumed. */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 2 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	/* Next step of the I loop; c61 is re-zeroed in the delay slot. */
	bgtz	I, .L11
	MOV	c61, c11
	.align 3

/* .L19: end of one four-column panel; B moves on to where BO stopped. */
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 4
#endif

	bgtz	J, .L10
	move	B, BO
	.align 3

/*
 * .L20: two-column tail, executed when bit 1 of N is set.
 * Sets CO1/CO2 to the next two columns of C, advances C by 2 * LDC,
 * sets I = M and AO = A.  c11 is zeroed here; the other accumulators
 * are cleared at .L21.
 */
.L20:
	andi	J,  N, 2
	MTC	$0,  c11
	blez	J, .L30
	move	CO1, C

	daddu	CO2, C,   LDC
	daddu	C,   CO2, LDC

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	I,  M
	blez	I, .L29
	move	AO, A
	.align 3

/*
 * .L21: set-up for one step of the I loop in the two-column tail.
 * Selects BO (skipping KK k-steps of A and B for some TRMM variants),
 * preloads a1, a3 and b1..b5, clears c21..c42 from c11 and computes
 * L = (k-count) / 4; with L <= 0 control goes to the remainder code
 * at .L25.
 */
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* One k-step is 1 << ZBASE_SHIFT bytes of A and 2 << ZBASE_SHIFT bytes of B. */
	dsll	L,    KK,  ZBASE_SHIFT
	dsll	TEMP, KK, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)

	LD	b3,  2 * SIZE(BO)
	MOV	c12, c11
	LD	b4,  3 * SIZE(BO)
	MOV	c22, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c32, c11

	/* TEMP = number of k-steps to run for this tile in the TRMM build. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L25
	MOV	c42, c11

#else
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	dsra	L,  K, 2

	LD	b3,  2 * SIZE(B)
	MOV	c12, c11
	LD	b4,  3 * SIZE(B)
	MOV	c22, c11
	LD	b5,  4 * SIZE(B)
	MOV	c32, c11

	NOP
	MOV	c42, c11
	/* BO = B is set in the delay slot, so it happens on both paths. */
	blez	L, .L25
	move	BO,  B
#endif
	.align	3

/*
 * .L22: main k loop of the two-column tail, four k-steps per trip.
 * Each trip consumes AO[0..7] and BO[0..15], advances AO by 8 * SIZE
 * (mid-loop) and BO by 16 * SIZE (branch delay slot), and leaves a1, a3
 * and b1..b5 loaded for the next trip.  Offsets in the comments are
 * relative to AO/BO at loop entry.
 */
.L22:
	/* A[0] x B[0..3] */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	/* A[1] x B[0..3] */
	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* A[2] x B[4..7] */
	MADD1	c11, c11, a1, b5
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  8 * SIZE(AO)

	/* A[3] x B[4..7] */
	MADD2	c12, c12, a2, b5
	LD	b5, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* A[4] x B[8..11] */
	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  6 * SIZE(AO)

	/* A[5] x B[8..11] */
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* A[6] x B[12..15]; AO advances, so the a3 reload uses the new AO. */
	MADD1	c11, c11, a3, b5
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  4 * SIZE(AO)

	/* A[7] x B[12..15]; reloads still use the old BO (advanced in the delay slot). */
	MADD2	c12, c12, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 19 * SIZE(BO)

	bgtz	L, .L22
	daddiu	BO, BO, 16 * SIZE
	.align 3

/*
 * .L25/.L26: remainder of the k loop for the two-column tail.
 * L = (k-count) & 3, where k-count is K or, in the TRMM build, TEMP.
 * .L26 performs one k-step per trip: AO[0..1] x BO[0..3], advancing
 * BO by 4 * SIZE (mid-loop) and AO by 2 * SIZE (branch delay slot).
 */
.L25:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L28
	NOP
	.align	3

.L26:
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	daddiu	BO, BO,  4 * SIZE
	MADD3	c41, c41, a1, b4
	/* Next k-step's a1: offset 2 because AO is advanced only in the delay slot. */
	LD	a1,  2 * SIZE(AO)

	/* Reloads of b1..b4 read from the already advanced BO. */
	MADD2	c12, c12, a2, b1
	LD	b1,  0 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  3 * SIZE(BO)

	bgtz	L, .L26
	daddiu	AO, AO,  2 * SIZE

/*
 * .L28: write-back for the two-column tail, then loop control.
 * Folds c11 += c22, c12 += c21, c31 += c42, c32 += c41 and combines
 * each pair with ALPHA_R / ALPHA_I (MADD/NMSUB are common.h macros;
 * assuming d = c + a*b and d = c - a*b):
 *     out[0] = base[0] + ALPHA_R * p - ALPHA_I * q
 *     out[1] = base[1] + ALPHA_R * q + ALPHA_I * p
 * base is the existing C contents in the GEMM build and absent (plain
 * MUL) in the TRMM build.  The fourth store (b4) is placed in the delay
 * slot of the loop branch shared by both builds.
 */
.L28:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21
	LD	b3,  0 * SIZE(CO2)
	ADD	c31, c31, c42
	LD	b4,  1 * SIZE(CO2)
	ADD	c32, c32, c41

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MADD	b2, b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MADD	b3, b3, ALPHA_R, c31
	daddiu	I, I, -1
	MADD	b4, b4, ALPHA_R, c32

	/* c11 is cleared only after its last use as a source (MADD b2 below). */
	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	ST	b2, -1 * SIZE(CO1)
	ST	b3, -2 * SIZE(CO2)
#else
	ADD	c11, c11, c22
	ADD	c12, c12, c21
	ADD	c31, c31, c42
	ADD	c32, c32, c41

	MUL	b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MUL	b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MUL	b3, ALPHA_R, c31
	daddiu	I, I, -1
	MUL	b4, ALPHA_R, c32

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	ST	b2, -1 * SIZE(CO1)
	ST	b3, -2 * SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the K - KK - (1 or 2) k-steps of A and B that were not consumed. */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	/* Next step of the I loop; the final store rides in the delay slot. */
	bgtz	I, .L21
	ST	b4, -1 * SIZE(CO2)
	.align 3

/* .L29: end of the two-column tail; B moves on to where BO stopped. */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif

	move	B, BO
	.align 3

/*
 * .L30: single-column tail, executed when bit 0 of N is set; otherwise
 * control goes straight to the epilogue at .L999.  Sets CO1 to the last
 * column of C, I = M and AO = A; c11 is zeroed here.
 */
.L30:
	andi	J,  N, 1
	MTC	$0,  c11
	blez	J, .L999
	move	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	I,  M
	daddu	C,   CO1, LDC
	blez	I, .L39
	move	AO, A
	.align 3

/*
 * .L31: set-up for one step of the I loop in the single-column tail.
 * Selects BO (skipping KK k-steps for some TRMM variants; A and B use the
 * same byte stride here), preloads a1, a2, a3 and b1, b2, b3, clears
 * c21..c42 from c11 and computes L = (k-count) / 4; with L <= 0 control
 * goes to the remainder code at .L35.  Only c11, c12, c21 and c22 are
 * accumulated into by the loops that follow.
 */
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	TEMP, KK,  ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c12, c11
	NOP

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(BO)

	/* TEMP = number of k-steps to run; both non-subtracting branches add 1. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L,  TEMP, 2

	blez	L, .L35
	MOV	c42, c11
#else
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	MOV	c12, c11
	dsra	L,  K, 2

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(B)

	NOP
	MOV	c42, c11
	/* BO = B is set in the delay slot, so it happens on both paths. */
	blez	L, .L35
	move	BO,  B
#endif
	.align	3

/*
 * .L32: main k loop of the single-column tail, four k-steps per trip.
 * Each trip consumes AO[0..7] and BO[0..7], then advances both pointers
 * by 8 * SIZE (BO in the branch delay slot).  The loads interleaved with
 * the arithmetic leave a1, a2, a3, b1, b2, b3 holding offsets 0, 1, 4 of
 * the advanced AO/BO for the next trip.  Offsets in the comments are
 * relative to AO/BO at loop entry.
 */
.L32:
	/* k+0: A[0..1] x B[0..1] */
	MADD1	c11, c11, a1, b1
	LD	b4,  3 * SIZE(BO)
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	/* k+1: A[2..3] x B[2..3] */
	MADD1	c11, c11, a1, b1
	LD	b2,  5 * SIZE(BO)
	MADD3	c21, c21, a1, b4
	LD	a1,  8 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  5 * SIZE(AO)

	/* k+2: A[4..5] x B[4..5] */
	MADD1	c11, c11, a3, b3
	LD	b4,  7 * SIZE(BO)
	MADD3	c21, c21, a3, b2
	LD	a3,  6 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  7 * SIZE(AO)

	/* k+3: A[6..7] x B[6..7] */
	MADD1	c11, c11, a3, b3
	LD	b2,  9 * SIZE(BO)
	MADD3	c21, c21, a3, b4
	LD	a3, 12 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  9 * SIZE(AO)

	daddiu	AO, AO,  8 * SIZE
	daddiu	L, L, -1

	bgtz	L, .L32
	daddiu	BO, BO,  8 * SIZE
	.align 3

/*
 * .L35/.L36: remainder of the k loop for the single-column tail.
 * L = (k-count) & 3, where k-count is K or, in the TRMM build, TEMP.
 * .L36 performs one k-step per trip (AO[0..1] x BO[0..1]), reloads
 * a1, a2, b1, b2 from offsets 2 and 3, and advances AO and BO by
 * 2 * SIZE each (AO in the branch delay slot).
 */
.L35:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L38
	NOP
	.align	3

.L36:
	MADD1	c11, c11, a1, b1
	daddiu	L, L, -1
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	LD	b2,  3 * SIZE(BO)
	daddiu	BO, BO,  2 * SIZE
	bgtz	L, .L36
	daddiu	AO, AO,  2 * SIZE

/*
 * .L38: write-back for the single-column tail, then loop control.
 * Folds p = c11 + c22 and q = c12 + c21 and stores one pair at CO1
 * (MADD/NMSUB are common.h macros; assuming d = c + a*b and
 * d = c - a*b):
 *     out[0] = base[0] + ALPHA_R * p - ALPHA_I * q
 *     out[1] = base[1] + ALPHA_R * q + ALPHA_I * p
 * base is the existing C contents in the GEMM build and absent (plain
 * MUL) in the TRMM build.  Each build has its own loop branch back to
 * .L31 with the second store in the delay slot.
 */
.L38:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MADD	b2, b2, ALPHA_R, c12
	daddiu	I, I, -1

	/* c11 is cleared only after its last use as a source (MADD b2 below). */
	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11

	ST	b1, -2 * SIZE(CO1)
	NOP
	bgtz	I, .L31
	ST	b2, -1 * SIZE(CO1)
#else
	ADD	c11, c11, c22
	ADD	c12, c12, c21

	MUL	b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MUL	b2, ALPHA_R, c12
	daddiu	I, I, -1

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the K - KK - 1 k-steps of A and B that were not consumed. */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif

	ST	b1, -2 * SIZE(CO1)
	NOP
	bgtz	I, .L31
	ST	b2, -1 * SIZE(CO1)
#endif
	.align 3

/*
 * .L39: end of the single-column tail.  KK and B are updated for
 * symmetry with .L19/.L29; nothing after this point reads them.
 */
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 1
#endif
	move	B, BO
	.align 3


/*
 * .L999: common exit.  Restores every register saved in the prologue
 * from the same frame offsets, releases the 128-byte frame in the delay
 * slot of the return jump, and returns through $31.
 */
.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	ldc1	$f24, 16($sp)
	ldc1	$f25, 24($sp)
	ldc1	$f26, 32($sp)
	ldc1	$f27, 40($sp)
	ldc1	$f28, 48($sp)
	ldc1	$f29, 56($sp)

#if defined(TRMMKERNEL)
	LDARG	$18,  64($sp)
	LDARG	$19,  72($sp)
	LDARG	$20,  80($sp)
#endif

#ifndef __64BIT__
	ldc1	$f20, 88($sp)
	ldc1	$f21, 96($sp)
	ldc1	$f22,104($sp)
	ldc1	$f23,112($sp)
#endif

	j	$31
	daddiu	$sp, $sp, 128

	EPILOGUE
