1#define REALNAME ASMNAME
2#define ASSEMBLER
3#include "common.h"
4
5#define FETCH	ld
#  FETCH expands to the ld mnemonic. Every use visible in this file targets $0,
#  so the loaded value is discarded (presumably a software prefetch - confirm
#  for the target CPU).
#  The two macros below assemble a raw instruction word from a major opcode
#  (0x32 or 0x3A) and the numeric fields base, ft, offset and fq. Neither one
#  is referenced in the part of the file visible here.
6#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
7#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
8
9#define M	$4
#  Integer register aliases.
#  M, N, K, A, B, C and LDC name $4-$6 and $8-$11. On entry the kernel copies
#  M, N, K and A into MCO, NCO, KCO and AO (and B into BO), then reuses
#  M, N, K, A and B as loop counters and running pointers.
10#define	N	$5
11#define	K	$6
12#define A	$8
13#define B	$9
14#define C	$10
15#define LDC	$11
16
#  AO and BO hold the start of the current A panel and B panel.
17#define AO	$12
18#define BO	$13
19
#  CO1..CO4 point at up to four consecutive columns of C, LDC apart.
20#define CO1	$14
21#define CO2	$15
22#define CO3	$16
23#define CO4	$17
24
#  Saved copies of the K, M and N arguments.
25#define KCO	$18
26#define MCO	$19
27#define NCO	$20
28
#  SPANA and SPANB are byte distances derived from KCO. PREA and PREB are the
#  addresses passed to FETCH ahead of the A and B streams.
29#define SPANB	$21
30#define PREB	$23
31#define PREA	$24
32#define SPANA	$25
33
#  ALPHA shares $f15 with b7 (defined further down), so it is kept in a stack
#  slot while the inner loops run.
34#define ALPHA	$f15
35
36#if defined(TRMMKERNEL)
37#define	OFFSET	$2
38#define	KK	$3
39#define	TEMP	$7
40#endif
41
#  Bare register numbers matching A, B and CO1..CO4 ($8, $9, $14-$17).
#  Presumably operands for the gsLQC1/gsSQC1 encoders - unused in the part of
#  the file visible here.
42#define R8	8
43#define	R9	9
44#define R14	14
45#define R15	15
46#define R16	16
47#define R17 17
48
49#define	t11	$f30
#  Floating-point register aliases.
#  t11..t44 are the accumulators of the register tile and occupy $f16-$f31.
50#define	t21	$f31
51#define	t31	$f28
52#define	t41	$f29
53
54#define	t12	$f26
55#define	t22	$f27
56#define	t32	$f24
57#define	t42	$f25
58
59#define	t13	$f22
60#define	t23	$f23
61#define	t33	$f20
62#define	t43	$f21
63
64#define	t14	$f18
65#define	t24	$f19
66#define	t34	$f16
67#define	t44	$f17
68
#  c11..c34 hold values loaded from C during write-back and occupy $f0-$f14,
#  the same registers as a0..a7 and b0..b6 below. c44 reuses $f0 (also c11).
69#define	c11	$f0
70#define	c21	$f1
71#define	c31	$f2
72#define	c41	$f3
73
74#define	c12	$f4
75#define	c22	$f5
76#define	c32	$f6
77#define	c42	$f7
78
79#define	c13	$f8
80#define	c23	$f9
81#define	c33	$f10
82#define c43	$f11
83
84#define	c14	$f12
85#define	c24	$f13
86#define	c34	$f14
87#define	c44	$f0
88
#  a0..a7 ($f0-$f7) and b0..b7 ($f8-$f15) hold elements loaded from A and B.
#  b7 is $f15, the same register as ALPHA.
89#define	a0	$f0
90#define	a1	$f1
91#define	a2	$f2
92#define	a3	$f3
93#define	a4	$f4
94#define	a5	$f5
95#define	a6	$f6
96#define	a7	$f7
97#define	b0	$f8
98#define	b1	$f9
99#define	b2	$f10
100#define b3	$f11
101#define	b4	$f12
102#define	b5	$f13
103#define	b6	$f14
104#define	b7	$f15
105
#  Bare FP register numbers 31 down to 0. Presumably operands for the
#  gsLQC1/gsSQC1 encoders - unused in the part of the file visible here.
106#define F31 31
107#define F30 30
108#define F29 29
109#define F28 28
110#define F27 27
111#define F26 26
112#define F25 25
113#define F24 24
114#define F23 23
115#define F22 22
116#define F21 21
117#define F20 20
118#define F19 19
119#define F18 18
120#define F17 17
121#define F16 16
122#define F15 15
123#define F14 14
124#define F13 13
125#define F12 12
126#define F11 11
127#define F10 10
128#define F9 9
129#define F8 8
130#define F7 7
131#define F6 6
132#define F5 5
133#define F4 4
134#define F3 3
135#define F2 2
136#define F1 1
137#define F0 0
138
139	PROLOGUE
	#  Kernel entry. Allocates a 160-byte frame, saves $16-$25 and $f20-$f28
	#  in it, and keeps the last slot (offset 152) for ALPHA.
	#  NOTE(review): t41, t11 and t21 live in $f29, $f30 and $f31, which are
	#  written by the tile code but not saved here - confirm against the
	#  target ABI. $22 is saved although no alias in this file maps to it.
140
141	daddiu	$sp, $sp, -160
142	sd	$16,   0($sp)
143	sd	$17,   8($sp)
144	sd	$18,  16($sp)
145	sd	$19,  24($sp)
146	sd	$20,  32($sp)
147	sd	$21,  40($sp)
148	sd	$22,  48($sp)
149	ST	$f24, 56($sp)
150	ST	$f25, 64($sp)
151	ST	$f26, 72($sp)
152	ST	$f27, 80($sp)
153	ST	$f28, 88($sp)
154	sd	$23,  96($sp)
155	sd	$24, 104($sp)
156	sd	$25, 112($sp)
157	ST	$f20,120($sp)
158	ST	$f21,128($sp)
159	ST	$f22,136($sp)
160	ST	$f23,144($sp)
161
162
163	.align	5
164.L0_N4:									#  Loop N
	#  One-time setup: copy the arguments into MCO/NCO/KCO/AO/BO, scale LDC
	#  by the element size, and derive the panel counts and byte spans.
165	ST	ALPHA,152($sp)					#  Spill ALPHA, $f15 is reused as b7
166	move	MCO,M						#  Backup	M
167
168	move	NCO,N						#  Backup	N
169	move	KCO,K						#  Backup	K
170
171	move	AO,A						#  Backup	A_addr
172	dsra	N,NCO,2						#  N=NCO/4, number of 4-column panels
173
174	dsll	LDC,LDC,BASE_SHIFT			#  LDC in bytes (elements << BASE_SHIFT)
175	dsll	SPANB,KCO,2+BASE_SHIFT		#  SPANB=KCO*4(nr) elements, in bytes
176
177#if defined(TRMMKERNEL)
178	LDARG	OFFSET,160($sp)				#	Load OFFSET from the caller slot just above this frame
179#endif
180
181#if defined(TRMMKERNEL) && !defined(LEFT)
182	neg		KK,OFFSET
183#endif
184
185	move	BO,B						#  Backup	B_addr
186	beq		N,$0,.L0_N2					#  N=0 means NCO<4, skip the nr=4 panels
187	dsll	SPANA,KCO,1+BASE_SHIFT		#  (delay slot) SPANA = KCO*2 elements, in bytes
188
189.L0_N4_Lb:								#	mr=4,nr=4
	#  Start of one 4-column panel: point CO1..CO4 at four columns of C that
	#  are LDC apart, rewind A to AO, and set the FETCH pointers PREA/PREB.
190	move	CO1,C
191	dsra	M,MCO,2						#  M=MCO/4, number of 4-row tiles
192
193	move	A,AO						#  Reset A
194	daddu	CO2,C,LDC
195
196	daddu	PREB,BO,SPANB				#  PreB point next panelB
197	daddu	CO3,CO2,LDC
198
199	daddu	PREA,AO,SPANA
200	daddu	CO4,CO3,LDC
201
202#if defined(TRMMKERNEL) && defined(LEFT)
203	move	KK,OFFSET
204#endif
205	beqz	M,.L14_M2					#  No full 4-row tile, go to the mr=2 remainder
206	daddu	C,CO4,LDC					#	(delay slot) move C to next panel Cj
207
208.L10:
	#  Set-up for one mr=4,nr=4 tile: choose the B start address (and, for
	#  TRMM, advance A and B by KK), zero t11 with MTC and copy it to the other
	#  fifteen accumulators, preload a0-a3 and b0-b3, and compute the
	#  iteration count K of the 4-way unrolled loop. Branches to .L15 when
	#  that count is zero.
209#if defined(TRMMKERNEL)
210#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
211	move	B,BO						#	(SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
212#else
213	dsll	K,KK,2 + BASE_SHIFT			#  KK is the length that needs to span to the data part
214	dsll	TEMP,KK,2 + BASE_SHIFT
215
216	daddu	A,A,K						#  move A B to data part
217	daddu	B,BO,TEMP
218#endif
219
220	MTC		$0,t11						# 	GEMM part	NR=4,MR=4
221	LD	a0,0(A)
222
223	MOV	t21,t11
224	MOV	t31,t11
225	LD	a1,1*SIZE(A)
226
227	MOV	t41,t11
228	MOV	t12,t11
229	LD	b0,0(B)
230
231	MOV	t22,t11
232	MOV	t32,t11
233	LD	b1,1*SIZE(B)
234
235	MOV	t42,t11
236	LD	a2,2*SIZE(A)
237
238	MOV	t13,t11
239	MOV	t23,t11
240	LD	b2,2*SIZE(B)
241
242	MOV	t33,t11
243	MOV	t43,t11
244	LD	a3,3*SIZE(A)
245
246	MOV	t14,t11
247	MOV	t24,t11
248	LD	b3,3*SIZE(B)
249
250#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
251	dsubu	TEMP,KCO,KK					#  temp is the length of the data part
252#elif defined(LEFT)
253	daddiu	TEMP, KK, 4					#	S=L,U=L
254#else
255	daddiu	TEMP, KK, 4					#	S=R,U=U,for this two situation KK is the length of the data part
256#endif
257	dsra	K,TEMP,2					#  K=TEMP/4, iterations of the 4-way unrolled loop
258	MOV	t34,t11
259	beqz	K,.L15
260	MOV	t44,t11
261
262#else
263	move	B,BO						#	Reset B
264	MTC		$0,t11						# 	GEMM part	NR=4,MR=4
265	LD	a0,0(A)
266
267	MOV	t21,t11
268	MOV	t31,t11
269	LD	a1,1*SIZE(A)
270
271	MOV	t41,t11
272	MOV	t12,t11
273	LD	b0,0(B)
274
275	MOV	t22,t11
276	MOV	t32,t11
277	LD	b1,1*SIZE(B)
278
279	MOV	t42,t11
280	dsra	K,KCO,2						#  K=KCO/4, iterations of the 4-way unrolled loop
281	LD	a2,2*SIZE(A)
282
283	MOV	t13,t11
284	MOV	t23,t11
285	LD	b2,2*SIZE(B)
286
287	MOV	t33,t11
288	MOV	t43,t11
289	LD	a3,3*SIZE(A)
290
291	MOV	t14,t11
292	MOV	t24,t11
293	LD	b3,3*SIZE(B)
294
295	MOV	t34,t11
296	beqz	K,.L15
297	MOV	t44,t11							#	clear 16 results registers
298#endif
299
300	.align	5
301.L11:									#  kr=4
	#  Main loop of the 4x4 tile, four k-steps per iteration (.L11-.L14).
	#  Each step issues sixteen MADDs into t11..t44 while loading the operands
	#  of the following step into the other register set (a0-a3/b0-b3 and
	#  a4-a7/b4-b7 alternate). Per iteration A, B, PREA and PREB each advance
	#  by 16*SIZE and K is decremented.
	#  MADD d,c,x,y comes from common.h - presumably d = c + x*y. TODO confirm.
302	MADD	t11,t11,a0,b0
303	MADD	t21,t21,a1,b0
304	LD	a4,4*SIZE(A)
305
306	MADD	t12,t12,a0,b1
307	MADD	t22,t22,a1,b1
308	LD	a5,5*SIZE(A)
309
310	MADD	t31,t31,a2,b0
311	MADD	t41,t41,a3,b0
312	LD	b4,4*SIZE(B)
313
314	MADD	t32,t32,a2,b1
315	MADD	t42,t42,a3,b1
316	LD	b5,5*SIZE(B)
317	FETCH		$0,(PREB)
318
319	MADD	t13,t13,a0,b2
320	MADD	t23,t23,a1,b2
321	LD	a6,6*SIZE(A)
322
323	MADD	t14,t14,a0,b3
324	MADD	t24,t24,a1,b3
325	LD	b6,6*SIZE(B)
326	FETCH		$0,(PREA)
327
328	MADD	t33,t33,a2,b2
329	MADD	t43,t43,a3,b2
330	LD	a7,7*SIZE(A)
331
332	MADD	t34,t34,a2,b3
333	MADD	t44,t44,a3,b3
334	LD	b7,7*SIZE(B)
335
336.L12:
337	MADD	t11,t11,a4,b4
338	MADD	t21,t21,a5,b4
339	LD	a0,8*SIZE(A)
340
341	MADD	t12,t12,a4,b5
342	MADD	t22,t22,a5,b5
343	LD	a1,9*SIZE(A)
344
345	MADD	t31,t31,a6,b4
346	MADD	t41,t41,a7,b4
347	LD	b0,8*SIZE(B)
348
349	MADD	t32,t32,a6,b5
350	MADD	t42,t42,a7,b5
351	LD	b1,9*SIZE(B)
352
353	FETCH		$0,4*SIZE(PREB)
354	MADD	t13,t13,a4,b6
355	MADD	t23,t23,a5,b6
356	LD	a2,10*SIZE(A)
357
358	MADD	t14,t14,a4,b7
359	MADD	t24,t24,a5,b7
360	LD	b2,10*SIZE(B)
361
362	FETCH		$0,4*SIZE(PREA)
363	MADD	t33,t33,a6,b6
364	MADD	t43,t43,a7,b6
365	LD	a3,11*SIZE(A)
366
367	MADD	t34,t34,a6,b7
368	MADD	t44,t44,a7,b7
369	LD	b3,11*SIZE(B)
370
371.L13:
372	MADD	t11,t11,a0,b0
373	MADD	t21,t21,a1,b0
374	LD	a4,12*SIZE(A)
375
376	MADD	t12,t12,a0,b1
377	MADD	t22,t22,a1,b1
378	LD	a5,13*SIZE(A)
379
380	MADD	t31,t31,a2,b0
381	MADD	t41,t41,a3,b0
382	LD	b4,12*SIZE(B)
383
384	FETCH		$0,8*SIZE(PREA)
385	MADD	t32,t32,a2,b1
386	MADD	t42,t42,a3,b1
387	LD	b5,13*SIZE(B)
388
389	FETCH		$0,8*SIZE(PREB)
390	MADD	t13,t13,a0,b2
391	MADD	t23,t23,a1,b2
392	LD	a6,14*SIZE(A)
393
394	MADD	t14,t14,a0,b3
395	MADD	t24,t24,a1,b3
396	daddu	A,A,16*SIZE					#  4mr*4kr
397	LD	b6,14*SIZE(B)
398
399	MADD	t33,t33,a2,b2
400	MADD	t43,t43,a3,b2
401	daddu	B,B,16*SIZE					#	4nr*4kr
402	LD	a7,-1*SIZE(A)					#  A already advanced, -1 is element 15 of this iteration
403
404	MADD	t34,t34,a2,b3
405	MADD	t44,t44,a3,b3
406	LD	b7,-1*SIZE(B)					#  B already advanced, -1 is element 15 of this iteration
407
408.L14:
409	MADD	t11,t11,a4,b4
410	MADD	t21,t21,a5,b4
411	LD	a0,0(A)
412
413	MADD	t12,t12,a4,b5
414	MADD	t22,t22,a5,b5
415	LD	a1,1*SIZE(A)
416
417	MADD	t31,t31,a6,b4
418	MADD	t41,t41,a7,b4
419	daddiu	K,K,-1
420	LD	b0,0(B)
421
422	MADD	t32,t32,a6,b5
423	MADD	t42,t42,a7,b5
424	daddu	PREA,PREA,16*SIZE
425	LD	b1,1*SIZE(B)
426
427	FETCH		$0,12*SIZE(PREB)
428	MADD	t13,t13,a4,b6
429	MADD	t23,t23,a5,b6
430	LD	a2,2*SIZE(A)
431
432	FETCH		$0,-4*SIZE(PREA)
433	MADD	t14,t14,a4,b7
434	MADD	t24,t24,a5,b7
435	LD	b2,2*SIZE(B)
436
437	MADD	t33,t33,a6,b6
438	MADD	t43,t43,a7,b6
439	daddu	PREB,PREB,16*SIZE
440	LD	a3,3*SIZE(A)
441
442	MADD	t34,t34,a6,b7
443	MADD	t44,t44,a7,b7
444	bnez 	K,.L11
445	LD	b3,3*SIZE(B)					#  (delay slot) last preload for the next step
446
447
448.L15:									#  kr=2
	#  Remainder of the 4x4 tile when bit 1 of the k count is set (KCO for
	#  GEMM, TEMP for TRMM): two more k-steps (.L16, .L17). A, B, PREA and
	#  PREB each advance by 8*SIZE. Skipped via .L18 otherwise.
449#ifndef TRMMKERNEL
450	andi	K,KCO,2
451#else
452	andi	K,TEMP, 2
453#endif
454	beqz	K,.L18
455	nop
456
457.L16:
458	MADD	t11,t11,a0,b0
459	MADD	t21,t21,a1,b0
460	LD	a4,4*SIZE(A)
461
462	MADD	t12,t12,a0,b1
463	MADD	t22,t22,a1,b1
464	LD	a5,5*SIZE(A)
465
466	MADD	t31,t31,a2,b0
467	MADD	t41,t41,a3,b0
468	LD	b4,4*SIZE(B)
469
470	FETCH		$0,0(PREA)
471	MADD	t32,t32,a2,b1
472	MADD	t42,t42,a3,b1
473	LD	b5,5*SIZE(B)
474
475	FETCH		$0,0(PREB)
476	MADD	t13,t13,a0,b2
477	MADD	t23,t23,a1,b2
478	LD	a6,6*SIZE(A)
479
480	MADD	t14,t14,a0,b3
481	MADD	t24,t24,a1,b3
482	daddu	A,A,8*SIZE					#	4mr*2kr
483	LD	b6,6*SIZE(B)
484
485	MADD	t33,t33,a2,b2
486	MADD	t43,t43,a3,b2
487	daddu	B,B,8*SIZE					#	4nr*2kr
488	LD	a7,-1*SIZE(A)
489
490	MADD	t34,t34,a2,b3
491	MADD	t44,t44,a3,b3
492	LD	b7,-1*SIZE(B)
493
494.L17:
495	MADD	t11,t11,a4,b4
496	MADD	t21,t21,a5,b4
497	LD	a0,0*SIZE(A)
498
499	MADD	t12,t12,a4,b5
500	MADD	t22,t22,a5,b5
501	LD	a1,1*SIZE(A)
502
503	MADD	t31,t31,a6,b4
504	MADD	t41,t41,a7,b4
505	LD	b0,0*SIZE(B)
506
507	MADD	t32,t32,a6,b5
508	MADD	t42,t42,a7,b5
509	LD	b1,1*SIZE(B)
510
511	FETCH		$0,4*SIZE(PREB)
512	MADD	t13,t13,a4,b6
513	MADD	t23,t23,a5,b6
514	LD	a2,2*SIZE(A)
515
516	FETCH		$0,4*SIZE(PREA)
517	MADD	t14,t14,a4,b7
518	MADD	t24,t24,a5,b7
519	LD	b2,2*SIZE(B)
520
521	MADD	t33,t33,a6,b6
522	MADD	t43,t43,a7,b6
523	daddu	PREA,PREA,8*SIZE
524	LD	a3,3*SIZE(A)
525
526	MADD	t34,t34,a6,b7
527	MADD	t44,t44,a7,b7
528	daddu	PREB,PREB,8*SIZE
529	LD	b3,3*SIZE(B)
530
531
532.L18:									#	kr=1
	#  Final odd k-step of the 4x4 tile, taken when bit 0 of the k count is
	#  set. ALPHA is reloaded from its stack slot in the branch delay slot, so
	#  it is restored on both paths. Only a0-a3 and b0-b3 are read from here
	#  on, so overwriting $f15 (b7) is safe.
533#ifndef TRMMKERNEL
534	andi	K,KCO,1
535#else
536	andi	K,TEMP,1
537#endif
538	beqz	K,.L19
539	LD	ALPHA,152($sp)					#  Get ALPHA
540
541	FETCH		$0,0(PREB)
542	MADD	t11,t11,a0,b0
543	MADD	t21,t21,a1,b0
544	daddu	A,A,4*SIZE					#  	4mr*kr
545
546	MADD	t12,t12,a0,b1
547	MADD	t22,t22,a1,b1
548	daddu	B,B,4*SIZE					#	4nr*kr
549
550	FETCH		$0,0(PREA)
551	MADD	t31,t31,a2,b0
552	MADD	t41,t41,a3,b0
553	daddu	PREB,PREB,4*SIZE
554
555	MADD	t32,t32,a2,b1
556	MADD	t42,t42,a3,b1
557	daddu	PREA,PREA,4*SIZE
558
559	MADD	t13,t13,a0,b2
560	MADD	t23,t23,a1,b2
561
562	MADD	t14,t14,a0,b3
563	MADD	t24,t24,a1,b3
564
565	MADD	t33,t33,a2,b2
566	MADD	t43,t43,a3,b2
567
568	MADD	t34,t34,a2,b3
569	MADD	t44,t44,a3,b3
570
571.L19:									#  Write Back to C
	#  Write-back of the 4x4 tile.
	#  GEMM: load the sixteen C values, combine each with its accumulator and
	#  ALPHA through MADD, and store the result back. The c registers overlap
	#  a0-a7/b0-b6, which .L10 reloads before the next tile. c44 is $f0 again,
	#  it is loaded only after c11 has been consumed.
	#  TRMM: store ALPHA times each accumulator, then skip A and B past the
	#  part of the panels that was not multiplied and update KK.
	#  Both variants advance CO1..CO4 by 4*SIZE, decrement M and loop to .L10.
572#ifndef TRMMKERNEL
573	LD	c11,0(CO1)						#  GEMM write part
574	LD	c21,1*SIZE(CO1)					#  get 16 C
575	LD	c31,2*SIZE(CO1)
576	LD	c41,3*SIZE(CO1)
577
578	LD	c12,0(CO2)
579	MADD	t11,c11,t11,ALPHA
580	LD	c22,1*SIZE(CO2)
581	MADD	t21,c21,t21,ALPHA
582	LD	c32,2*SIZE(CO2)
583	MADD	t31,c31,t31,ALPHA
584	LD	c42,3*SIZE(CO2)
585	MADD	t41,c41,t41,ALPHA
586
587	LD	c13,0(CO3)
588	MADD	t12,c12,t12,ALPHA
589	LD	c23,1*SIZE(CO3)
590	MADD	t22,c22,t22,ALPHA
591	LD	c33,2*SIZE(CO3)
592	MADD	t32,c32,t32,ALPHA
593	LD	c43,3*SIZE(CO3)
594	MADD	t42,c42,t42,ALPHA
595
596	LD	c14,0(CO4)
597	MADD	t13,c13,t13,ALPHA
598	LD	c24,1*SIZE(CO4)
599	MADD	t23,c23,t23,ALPHA
600	LD	c34,2*SIZE(CO4)
601	MADD	t33,c33,t33,ALPHA
602	LD	c44,3*SIZE(CO4)					#  Reuses $f0, c11 was consumed above
603	MADD	t43,c43,t43,ALPHA
604
605	ST	t11,0(CO1)
606	MADD	t14,c14,t14,ALPHA
607	ST	t21,1*SIZE(CO1)
608	MADD	t24,c24,t24,ALPHA
609	ST	t31,2*SIZE(CO1)
610	MADD	t34,c34,t34,ALPHA
611	ST	t41,3*SIZE(CO1)
612	MADD	t44,c44,t44,ALPHA
613	daddiu	M,M,-1						#  M--
614
615	ST	t12,0(CO2)
616	ST	t22,1*SIZE(CO2)
617	ST	t32,2*SIZE(CO2)
618	ST	t42,3*SIZE(CO2)
619
620	ST	t13,0(CO3)
621	ST	t23,1*SIZE(CO3)
622	ST	t33,2*SIZE(CO3)
623	ST	t43,3*SIZE(CO3)
624
625	FETCH	$0,4*SIZE(CO1)
626	FETCH	$0,4*SIZE(CO2)
627	FETCH	$0,4*SIZE(CO3)
628	FETCH	$0,4*SIZE(CO4)
629
630	FETCH	$0,8*SIZE(CO1)
631	FETCH	$0,8*SIZE(CO2)
632	FETCH	$0,8*SIZE(CO3)
633	FETCH	$0,8*SIZE(CO4)
634
635	ST	t14,0(CO4)
636	daddu	CO1,CO1,4*SIZE				#  COi += 4
637	ST	t24,1*SIZE(CO4)
638	daddu	CO2,CO2,4*SIZE
639	ST	t34,2*SIZE(CO4)
640	daddu	CO3,CO3,4*SIZE
641	ST	t44,3*SIZE(CO4)
642	daddu	PREB,BO,SPANB					#  Rewind PREB for the next tile
643
644	bnez	M,.L10
645	daddu	CO4,CO4,4*SIZE					#  (delay slot)
646
647#else
648	MUL	t11, ALPHA, t11					#	TRMM write back part
649	MUL	t21, ALPHA, t21
650	MUL	t31, ALPHA, t31
651	MUL	t41, ALPHA, t41
652
653	ST	t11, 0 * SIZE(CO1)
654	MUL	t12, ALPHA, t12
655	ST	t21, 1 * SIZE(CO1)
656	MUL	t22, ALPHA, t22
657	ST	t31, 2 * SIZE(CO1)
658	MUL	t32, ALPHA, t32
659	ST	t41, 3 * SIZE(CO1)
660	MUL	t42, ALPHA, t42
661
662	ST	t12, 0 * SIZE(CO2)
663	MUL	t13, ALPHA, t13
664	ST	t22, 1 * SIZE(CO2)
665	MUL	t23, ALPHA, t23
666	ST	t32, 2 * SIZE(CO2)
667	MUL	t33, ALPHA, t33
668	ST	t42, 3 * SIZE(CO2)
669	MUL	t43, ALPHA, t43
670
671	ST	t13, 0 * SIZE(CO3)
672	MUL	t14, ALPHA, t14
673	ST	t23, 1 * SIZE(CO3)
674	MUL	t24, ALPHA, t24
675	ST	t33, 2 * SIZE(CO3)
676	MUL	t34, ALPHA, t34
677	ST	t43, 3 * SIZE(CO3)
678	MUL	t44, ALPHA, t44
679
680	ST	t14, 0 * SIZE(CO4)
681	daddiu	M,M,-1						#  M--
682	ST	t24, 1 * SIZE(CO4)
683	ST	t34, 2 * SIZE(CO4)
684	ST	t44, 3 * SIZE(CO4)
685	daddiu	CO1,CO1, 4 * SIZE
686	daddiu	CO2,CO2, 4 * SIZE
687	daddiu	CO3,CO3, 4 * SIZE
688	daddiu	CO4,CO4, 4 * SIZE
689
690	FETCH	$0,4*SIZE(CO1)
691	FETCH	$0,4*SIZE(CO2)
692	FETCH	$0,4*SIZE(CO3)
693	FETCH	$0,4*SIZE(CO4)
694
695	FETCH	$0,0(CO1)
696	FETCH	$0,0(CO2)
697	FETCH	$0,0(CO3)
698	FETCH	$0,0(CO4)
699
700#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
701	dsubu	TEMP,KCO,KK
702#ifdef LEFT
703	daddiu	TEMP,TEMP, -4
704#else
705	daddiu	TEMP,TEMP, -4
706#endif
707	dsll	K,TEMP,2 + BASE_SHIFT
708	dsll	TEMP,TEMP,2 + BASE_SHIFT
709	daddu	A,A,K						# 	mov A to the end of panel Ai
710	daddu	B,B,TEMP					# 	mov B to the end of panel Bj
711#endif
712
713#ifdef LEFT
714	daddiu	KK, KK,4
715#endif
716	bnez	M,.L10
717	nop
718#endif
719
720
721	.align 3
722.L14_M2:
	#  Row remainder of the nr=4 panel: handle a 2-row tile when bit 1 of MCO
	#  is set, otherwise continue at .L14_M1.
723	andi	M, MCO, 2					#	nr=4,mr=2
724	beqz	M,.L14_M1
725	nop
726
727.L20:
	#  Set-up for the mr=2,nr=4 tile: choose the B start address (TRMM also
	#  advances A and B by KK), clear the eight accumulators t11..t24,
	#  preload a0-a1 and b0-b3, and compute the unrolled iteration count K.
728#if defined(TRMMKERNEL)
729#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
730	move	B,BO						#	Reset B
731#else
732	dsll	K,KK,1 + BASE_SHIFT			#	mr=2
733	dsll	TEMP,KK,2 + BASE_SHIFT		#	nr=4
734	daddu	A,A,K
735	daddu	B,BO,TEMP
736#endif
737
738	LD	a0,0*SIZE(A)
739	MTC		$0,t11
740	LD	a1,1*SIZE(A)
741
742	MOV	t21,t11
743	LD	b0,0*SIZE(B)
744	MOV	t12,t11
745	LD	b1,1*SIZE(B)
746
747	MOV	t22,t11
748	LD	b2,2*SIZE(B)
749
750	MOV	t13,t11
751	MOV	t23,t11
752	LD	b3,3*SIZE(B)
753
754#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
755	dsubu	TEMP,KCO,KK
756#elif defined(LEFT)
757	daddiu	TEMP,KK,2					#	left part,controlled by mr, mr=2
758#else
759	daddiu	TEMP,KK,4					#  	right part,controlled by nr,nr=4
760#endif
761	dsra	K,TEMP,2					#  K=TEMP/4
762	MOV	t14,t11
763	beqz	K,.L25
764	MOV	t24,t11							#	clear 2*4=8 results registers
765
766#else
767	move	B,BO						#	Reset B
768	LD	a0,0*SIZE(A)
769	MTC		$0,t11
770	LD	a1,1*SIZE(A)
771
772	MOV	t21,t11
773	LD	b0,0*SIZE(B)
774	MOV	t12,t11
775	LD	b1,1*SIZE(B)
776
777	MOV	t22,t11
778	dsra	K,KCO,2						#  K=KCO/4
779	LD	b2,2*SIZE(B)
780
781	MOV	t13,t11
782	MOV	t23,t11
783	LD	b3,3*SIZE(B)
784
785	MOV	t14,t11
786	beqz	K,.L25
787	MOV	t24,t11
788
789#endif
790
791.L21:									#  nr=4,mr=2,kr=4
	#  Main loop of the 2x4 tile, four k-steps per iteration with eight MADDs
	#  each. Operands rotate through a0-a1/b0-b3, a4-a5/b4-b7, a2-a3/b0-b3 and
	#  a6-a7/b4-b7. Per iteration A advances by 8*SIZE and B by 16*SIZE.
792	MADD	t11,t11,a0,b0
793	LD	a4,2*SIZE(A)
794	MADD	t21,t21,a1,b0
795	LD	a5,3*SIZE(A)
796
797	MADD	t12,t12,a0,b1
798	LD	b4,4*SIZE(B)
799	MADD	t22,t22,a1,b1
800	LD	b5,5*SIZE(B)
801
802	MADD	t13,t13,a0,b2
803	LD	b6,6*SIZE(B)
804	MADD	t23,t23,a1,b2
805	LD	b7,7*SIZE(B)
806
807	MADD	t14,t14,a0,b3
808	MADD	t24,t24,a1,b3
809
810	MADD	t11,t11,a4,b4
811	LD	a2,4*SIZE(A)
812	MADD	t21,t21,a5,b4
813	LD	a3,5*SIZE(A)
814
815	MADD	t12,t12,a4,b5
816	LD	b0,8*SIZE(B)
817	MADD	t22,t22,a5,b5
818	LD	b1,9*SIZE(B)
819
820	MADD	t13,t13,a4,b6
821	LD	b2,10*SIZE(B)
822	MADD	t23,t23,a5,b6
823	LD	b3,11*SIZE(B)
824
825	MADD	t14,t14,a4,b7
826	MADD	t24,t24,a5,b7
827	daddiu	K,K,-1
828
829	MADD	t11,t11,a2,b0
830	LD	a6,6*SIZE(A)
831	MADD	t21,t21,a3,b0
832	LD	a7,7*SIZE(A)
833
834	MADD	t12,t12,a2,b1
835	LD	b4,12*SIZE(B)
836	MADD	t22,t22,a3,b1
837	LD	b5,13*SIZE(B)
838
839	MADD	t13,t13,a2,b2
840	LD	b6,14*SIZE(B)
841	MADD	t23,t23,a3,b2
842	LD	b7,15*SIZE(B)
843
844	MADD	t14,t14,a2,b3
845	MADD	t24,t24,a3,b3
846	daddu	A,A,8*SIZE					#  2mr*4kr
847	daddu	B,B,16*SIZE					#	4nr*4kr
848
849	MADD	t11,t11,a6,b4
850	LD	a0,0*SIZE(A)
851	MADD	t21,t21,a7,b4
852	LD	a1,1*SIZE(A)
853
854	MADD	t12,t12,a6,b5
855	LD	b0,0*SIZE(B)
856	MADD	t22,t22,a7,b5
857	LD	b1,1*SIZE(B)
858
859	MADD	t13,t13,a6,b6
860	LD	b2,2*SIZE(B)
861	MADD	t23,t23,a7,b6
862	LD	b3,3*SIZE(B)
863
864	MADD	t14,t14,a6,b7
865	bnez 	K,.L21
866	MADD	t24,t24,a7,b7					#  (delay slot) last MADD of the iteration
867
868
869.L25:
	#  Remainder of the 2x4 tile when bit 1 of the k count is set: two more
	#  k-steps (.L26, .L27). A advances by 4*SIZE and B by 8*SIZE.
870#ifndef TRMMKERNEL
871	andi	K,KCO,2						#	kr=2
872#else
873	andi	K,TEMP,2
874#endif
875	beqz	K,.L28
876	nop
877
878.L26:
879	MADD	t11,t11,a0,b0
880	LD	a4,2*SIZE(A)
881	MADD	t21,t21,a1,b0
882	LD	a5,3*SIZE(A)
883
884	MADD	t12,t12,a0,b1
885	LD	b4,4*SIZE(B)
886	MADD	t22,t22,a1,b1
887	LD	b5,5*SIZE(B)
888
889	MADD	t13,t13,a0,b2
890	LD	b6,6*SIZE(B)
891	MADD	t23,t23,a1,b2
892	LD	b7,7*SIZE(B)
893
894	MADD	t14,t14,a0,b3
895	MADD	t24,t24,a1,b3
896	daddu	A,A,4*SIZE					#  	2mr*2kr
897	daddu	B,B,8*SIZE					#	4nr*2kr
898
899.L27:
900	MADD	t11,t11,a4,b4
901	LD	a0,0*SIZE(A)
902	MADD	t21,t21,a5,b4
903	LD	a1,1*SIZE(A)
904
905	MADD	t12,t12,a4,b5
906	LD	b0,0*SIZE(B)
907	MADD	t22,t22,a5,b5
908	LD	b1,1*SIZE(B)
909
910	MADD	t13,t13,a4,b6
911	LD	b2,2*SIZE(B)
912	MADD	t23,t23,a5,b6
913	LD	b3,3*SIZE(B)
914
915	MADD	t14,t14,a4,b7
916	MADD	t24,t24,a5,b7
917
918
919.L28:									#	kr=1
	#  Final odd k-step of the 2x4 tile. ALPHA is reloaded in the branch delay
	#  slot, so it is restored whether or not the step runs.
920#ifndef TRMMKERNEL
921	andi	K,KCO,1
922#else
923	andi	K,TEMP,1
924#endif
925	beqz	K,.L29
926	LD	ALPHA,152($sp)					#  Get ALPHA
927
928	MADD	t11,t11,a0,b0
929	MADD	t21,t21,a1,b0
930	daddu	A,A,2*SIZE					#  2mr*kr
931	daddu	B,B,4*SIZE					#  4nr*kr
932
933	MADD	t12,t12,a0,b1
934	MADD	t22,t22,a1,b1
935
936	MADD	t13,t13,a0,b2
937	MADD	t23,t23,a1,b2
938
939	MADD	t14,t14,a0,b3
940	MADD	t24,t24,a1,b3
941
942.L29:									#  Write Back to C
	#  Write-back of the 2x4 tile. GEMM loads the eight C values, combines
	#  them with the accumulators and ALPHA through MADD and stores them.
	#  TRMM stores ALPHA times each accumulator and then adjusts A, B and KK.
	#  CO1..CO4 advance by 2*SIZE. There is no loop here, execution falls
	#  through to the mr=1 remainder.
943#ifndef TRMMKERNEL
944	LD	c11,0(CO1)						#	GEMM write back part
945	LD	c21,1*SIZE(CO1)
946
947	LD	c12,0(CO2)
948	LD	c22,1*SIZE(CO2)
949
950	LD	c13,0(CO3)
951	MADD	t11,c11,t11,ALPHA
952	LD	c23,1*SIZE(CO3)
953	MADD	t21,c21,t21,ALPHA
954
955	LD	c14,0(CO4)
956	MADD	t12,c12,t12,ALPHA
957	LD	c24,1*SIZE(CO4)
958	MADD	t22,c22,t22,ALPHA
959
960	ST	t11,0(CO1)
961	MADD	t13,c13,t13,ALPHA
962	ST	t21,1*SIZE(CO1)
963	MADD	t23,c23,t23,ALPHA
964
965	ST	t12,0(CO2)
966	MADD	t14,c14,t14,ALPHA
967	ST	t22,1*SIZE(CO2)
968	MADD	t24,c24,t24,ALPHA
969
970	ST	t13,0(CO3)
971	daddu	CO1,CO1,2*SIZE				#  COi += 2
972	ST	t23,1*SIZE(CO3)
973	daddu	CO2,CO2,2*SIZE
974
975	ST	t14,0(CO4)
976	daddu	CO3,CO3,2*SIZE
977	ST	t24,1*SIZE(CO4)
978	daddu	CO4,CO4,2*SIZE
979
980	FETCH	$0,0(CO1)
981	FETCH	$0,0(CO2)
982	FETCH	$0,0(CO3)
983	FETCH	$0,0(CO4)
984
985#else
986	MUL	t11, ALPHA, t11					#	TRMM write back part
987	MUL	t21, ALPHA, t21
988
989	ST	t11, 0 * SIZE(CO1)
990	MUL	t12, ALPHA, t12
991	ST	t21, 1 * SIZE(CO1)
992	MUL	t22, ALPHA, t22
993
994	ST	t12, 0 * SIZE(CO2)
995	MUL	t13, ALPHA, t13
996	ST	t22, 1 * SIZE(CO2)
997	MUL	t23, ALPHA, t23
998
999	ST	t13, 0 * SIZE(CO3)
1000	MUL	t14, ALPHA, t14
1001	ST	t23, 1 * SIZE(CO3)
1002	MUL	t24, ALPHA, t24
1003
1004	ST	t14, 0 * SIZE(CO4)
1005	ST	t24, 1 * SIZE(CO4)
1006
1007	daddiu	CO1,CO1, 2 * SIZE
1008	daddiu	CO2,CO2, 2 * SIZE
1009	daddiu	CO3,CO3, 2 * SIZE
1010	daddiu	CO4,CO4, 2 * SIZE
1011
1012	FETCH	$0,0(CO1)
1013	FETCH	$0,0(CO2)
1014	FETCH	$0,0(CO3)
1015	FETCH	$0,0(CO4)
1016
1017#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1018	dsubu	TEMP,KCO,KK
1019#ifdef LEFT
1020	daddiu	TEMP,TEMP,-2
1021#else
1022	daddiu	TEMP,TEMP,-4
1023#endif
1024	dsll	K,TEMP,1 + BASE_SHIFT
1025	dsll	TEMP,TEMP,2 + BASE_SHIFT
1026
1027	daddu	A,A,K						#	move A to next panel Ai
1028	daddu	B,B,TEMP					#	move B to next panel Bj
1029#endif
1030
1031#ifdef LEFT
1032	daddiu	KK, KK, 2
1033#endif
1034#endif
1035
1036
1037	.align 3
1038.L14_M1:
	#  Last row remainder of the nr=4 panel: handle a single row when bit 0
	#  of MCO is set, otherwise the panel is finished.
1039	andi	M,MCO,1						#	mr=1
1040	beqz	M,.L0_N4_Loop				#  	M = 0, finishing one panel Bj
1041	nop
1042
1043.L30:
	#  Set-up for the mr=1,nr=4 tile: choose the B start address (TRMM also
	#  advances A and B by KK), clear t11..t14, preload a0 and b0-b3, and
	#  compute the unrolled iteration count K.
1044#if defined(TRMMKERNEL)
1045#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1046	move	B,BO						#	Reset B
1047#else
1048	dsll	K,KK, BASE_SHIFT
1049	dsll	TEMP,KK,2 + BASE_SHIFT
1050
1051	daddu	A,A,K
1052	daddu	B,BO,TEMP
1053#endif
1054
1055	LD	a0, 0 * SIZE(A)					#	a0
1056
1057	MTC		$0,t11
1058	LD	b0,0*SIZE(B)
1059
1060	MOV	t12,t11
1061	LD	b1,1*SIZE(B)
1062
1063	MOV	t13,t11
1064	LD	b2,2*SIZE(B)
1065
1066	MOV	t14,t11
1067	LD	b3,3*SIZE(B)
1068
1069#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1070	dsubu	TEMP, KCO, KK
1071#elif defined(LEFT)
1072	daddiu	TEMP, KK, 1
1073#else
1074	daddiu	TEMP, KK, 4
1075#endif
1076	dsra	K,TEMP, 2					#  K=TEMP/4
1077	nop
1078	beqz	K,.L35
1079	nop
1080
1081#else
1082	move	B,BO						#	Reset B, GEMM part
1083	dsra	K,KCO,2						#  	K=KCO/4
1084	LD	a0, 0 * SIZE(A)					#	a0
1085
1086	MTC		$0,t11
1087	LD	b0,0*SIZE(B)
1088
1089	MOV	t12,t11
1090	LD	b1,1*SIZE(B)
1091
1092	MOV	t13,t11
1093	LD	b2,2*SIZE(B)
1094
1095	MOV	t14,t11
1096	beqz	K,.L35
1097	LD	b3,3*SIZE(B)
1098
1099#endif
1100
1101.L31:									#	nr=4,mr=1,kr=4
	#  Main loop of the 1x4 tile, four k-steps per iteration with four MADDs
	#  each. a0..a3 hold four consecutive A elements while b0-b3 and b4-b7
	#  alternate. Per iteration A advances by 4*SIZE and B by 16*SIZE.
1102	LD	a1,	1*SIZE(A)					#	load a1
1103	MADD	t11,t11,a0,b0
1104
1105	LD	b4,4*SIZE(B)
1106	LD	b5,5*SIZE(B)
1107	MADD	t12,t12,a0,b1
1108
1109	LD	b6,6*SIZE(B)
1110	LD	b7,7*SIZE(B)
1111	MADD	t13,t13,a0,b2
1112	MADD	t14,t14,a0,b3
1113
1114	LD	a2,	2*SIZE(A)					#	a2
1115	MADD	t11,t11,a1,b4
1116
1117	LD	b0,8*SIZE(B)
1118	LD	b1,9*SIZE(B)
1119	MADD	t12,t12,a1,b5
1120
1121	LD	b2,10*SIZE(B)
1122	LD	b3,11*SIZE(B)
1123	MADD	t13,t13,a1,b6
1124	MADD	t14,t14,a1,b7
1125
1126	LD	a3,	3*SIZE(A)					#	a3
1127	MADD	t11,t11,a2,b0
1128	daddiu	K,K,-1
1129
1130	LD	b4,12*SIZE(B)
1131	LD	b5,13*SIZE(B)
1132	MADD	t12,t12,a2,b1
1133	daddu	A,A,4*SIZE					#	1mr*4kr
1134
1135	LD	b6,14*SIZE(B)
1136	LD	b7,15*SIZE(B)
1137	MADD	t13,t13,a2,b2
1138	MADD	t14,t14,a2,b3
1139
1140	LD	a0,	0*SIZE(A)					#	a0
1141	daddu	B,B,16*SIZE					#	4nr*4kr
1142	MADD	t11,t11,a3,b4
1143
1144	LD	b0,0*SIZE(B)
1145	MADD	t12,t12,a3,b5
1146	LD	b1,1*SIZE(B)
1147	MADD	t13,t13,a3,b6
1148
1149	LD	b2,2*SIZE(B)
1150	MADD	t14,t14,a3,b7
1151	bnez 	K,.L31
1152	LD	b3,3*SIZE(B)					#  (delay slot) last preload for the next step
1153
1154
1155.L35:									#  kr=2
	#  Remainder of the 1x4 tile when bit 1 of the k count is set: two more
	#  k-steps (.L36, .L37). A advances by 2*SIZE and B by 8*SIZE.
1156#ifndef TRMMKERNEL
1157	andi	K,KCO,2
1158#else
1159	andi	K,TEMP,2
1160#endif
1161	beqz	K,.L38
1162	nop
1163
1164.L36:
1165	LD	a1,1*SIZE(A)					#	load a1
1166	MADD	t11,t11,a0,b0
1167
1168	LD	b4,4*SIZE(B)
1169	LD	b5,5*SIZE(B)
1170	MADD	t12,t12,a0,b1
1171	daddu	A,A,2*SIZE					#  	mr*2kr
1172
1173	LD	b6,6*SIZE(B)
1174	MADD	t13,t13,a0,b2
1175
1176	LD	b7,7*SIZE(B)
1177	MADD	t14,t14,a0,b3
1178	daddu	B,B,8*SIZE					#	4nr*2kr
1179
1180
1181.L37:
1182	LD	a0,0(A)
1183	MADD	t11,t11,a1,b4
1184
1185	LD	b0,0*SIZE(B)
1186	LD	b1,1*SIZE(B)
1187	MADD	t12,t12,a1,b5
1188
1189	LD	b2,2*SIZE(B)
1190	LD	b3,3*SIZE(B)
1191	MADD	t13,t13,a1,b6
1192	MADD	t14,t14,a1,b7
1193
1194
1195.L38:									#  	kr=1
	#  Final odd k-step of the 1x4 tile. ALPHA is reloaded in the branch delay
	#  slot, so it is restored whether or not the step runs.
1196#ifndef TRMMKERNEL
1197	andi	K,KCO,1
1198#else
1199	andi	K,TEMP,1
1200#endif
1201	beqz	K,.L39
1202	LD	ALPHA,152($sp)					#  Get ALPHA
1203
1204	MADD	t11,t11,a0,b0
1205	MADD	t12,t12,a0,b1
1206	daddu	A,A,1*SIZE
1207	daddu	B,B,4*SIZE
1208
1209	MADD	t13,t13,a0,b2
1210	MADD	t14,t14,a0,b3
1211
1212.L39:									#  Write Back
	#  Write-back of the 1x4 tile: one element in each of the four columns.
	#  GEMM combines the loaded C value with the accumulator and ALPHA through
	#  MADD. TRMM stores ALPHA times the accumulator and then adjusts A, B
	#  and KK. CO1..CO4 are not advanced here.
1213#ifndef TRMMKERNEL
1214	LD	c11,0(CO1)
1215	LD	c12,0(CO2)
1216	LD	c13,0(CO3)
1217	LD	c14,0(CO4)
1218
1219	MADD	t11,c11,t11,ALPHA
1220	MADD	t12,c12,t12,ALPHA
1221	MADD	t13,c13,t13,ALPHA
1222	MADD	t14,c14,t14,ALPHA
1223
1224	ST	t11,0(CO1)
1225	ST	t12,0(CO2)
1226	ST	t13,0(CO3)
1227	ST	t14,0(CO4)
1228#else
1229	MUL	t11, ALPHA, t11
1230	MUL	t12, ALPHA, t12
1231	MUL	t13, ALPHA, t13
1232	MUL	t14, ALPHA, t14
1233
1234	ST	t11,  0 * SIZE(CO1)
1235	ST	t12,  0 * SIZE(CO2)
1236	ST	t13,  0 * SIZE(CO3)
1237	ST	t14,  0 * SIZE(CO4)
1238
1239#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1240	dsubu	TEMP, KCO, KK
1241#ifdef LEFT
1242	daddiu	TEMP, TEMP, -1
1243#else
1244	daddiu	TEMP, TEMP, -4
1245#endif
1246
1247	dsll	K,TEMP, BASE_SHIFT
1248	dsll	TEMP,TEMP, 2 + BASE_SHIFT
1249
1250	daddu	A,A,K
1251	daddu	B,B,TEMP
1252#endif
1253
1254#ifdef LEFT
1255	daddiu	KK, KK, 1
1256#endif
1257#endif
1258
1259
1260	.align	3
1261.L0_N4_Loop:								#	mc finished
	#  End of one 4-column panel: decrement the panel counter N and loop to
	#  .L0_N4_Lb while panels remain. BO is updated from B in the branch delay
	#  slot, so it happens on both paths.
1262	daddiu	N,N,-1							#  N--
1263#if defined(TRMMKERNEL) && !defined(LEFT)
1264	daddiu	KK, KK,4
1265#endif
1266	bnez	N,.L0_N4_Lb
1267	move	BO,B							#  Set BO point to next panel Bj
1268
1269	.align	5
1270.L0_N2:
	#  Column remainder: handle one 2-column panel when bit 1 of NCO is set,
	#  otherwise continue at .L0_N1.
1271	andi	N,NCO,2							#  	nr = 2
1272	beqz	N,.L0_N1
1273	nop
1274
1275.L0_N2_Lb:
	#  Panel set-up for nr=2: CO1 and CO2 point at two columns of C that are
	#  LDC apart, A is rewound to AO, PREA is set, and C is advanced past
	#  both columns.
1276	move	CO1,C
1277	daddu	CO2,C,LDC
1278
1279	dsra	M,MCO,2							#  M=MCO/4, number of 4-row tiles
1280	move	A,AO							#  Reset A
1281
1282	daddu	PREA,AO,SPANA
1283	daddu	C,CO2,LDC
1284
1285#if defined(TRMMKERNEL) &&  defined(LEFT)
1286	move	KK, OFFSET
1287#endif
1288	beqz	M,.L12_M2
1289	nop
1290
1291.L40:
	#  Set-up for one mr=4,nr=2 tile: choose the B start address (TRMM also
	#  advances A and B by KK), clear the eight accumulators t11..t42,
	#  preload a0-a3 and b0-b1, and compute the unrolled iteration count K.
1292#if defined(TRMMKERNEL)
1293#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1294	move	B,BO							#	Reset B
1295#else
1296	dsll	K,KK, 2 + BASE_SHIFT
1297	dsll	TEMP, KK,1 + BASE_SHIFT
1298
1299	daddu	A,A,K
1300	daddu	B,BO,TEMP
1301#endif
1302	LD	a0,0*SIZE(A)
1303	MTC		$0,t11							#  	gemm part
1304	LD	a1,1*SIZE(A)
1305
1306	MOV	t21,t11
1307	LD	b0,0*SIZE(B)
1308	MOV	t31,t11
1309	LD	b1,1*SIZE(B)
1310
1311	MOV	t41,t11
1312	LD	a2,2*SIZE(A)
1313	LD	a3,3*SIZE(A)
1314
1315	MOV	t12,t11
1316	MOV	t22,t11
1317
1318#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1319	dsubu	TEMP,KCO,KK
1320#elif defined(LEFT)
1321	daddiu	TEMP, KK, 4
1322#else
1323	daddiu	TEMP, KK, 2
1324#endif
1325	dsra	K,TEMP,2						#  K=TEMP/4
1326	MOV	t32,t11
1327	beqz	K,.L45
1328	MOV	t42,t11
1329
1330#else
1331	move	B,BO							#	Reset B
1332	LD	a0,0*SIZE(A)
1333	MTC		$0,t11							#  	gemm part
1334	LD	a1,1*SIZE(A)
1335
1336	MOV	t21,t11
1337	LD	b0,0*SIZE(B)
1338	MOV	t31,t11
1339	LD	b1,1*SIZE(B)
1340
1341	MOV	t41,t11
1342	LD	a2,2*SIZE(A)
1343	dsra	K,KCO,2							#	K=KCO/4
1344	LD	a3,3*SIZE(A)
1345
1346	MOV	t12,t11
1347	MOV	t22,t11
1348
1349	MOV	t32,t11
1350	beqz	K,.L45
1351	MOV	t42,t11
1352
1353#endif
1354
1355.L41:										#  	nr=2,mr=kr=4
	#  Main loop of the 4x2 tile, four k-steps per iteration (.L41-.L44) with
	#  eight MADDs each. A operands alternate between a0-a3 and a4-a7, B
	#  operands rotate through b0-b1, b4-b5, b2-b3 and b6-b7. Per iteration
	#  A and PREA advance by 16*SIZE and B by 8*SIZE.
1356	MADD	t11,t11,a0,b0
1357	LD	a4,4*SIZE(A)
1358	MADD	t21,t21,a1,b0
1359	LD	a5,5*SIZE(A)
1360
1361	MADD	t12,t12,a0,b1
1362	LD	b4,2*SIZE(B)
1363	MADD	t22,t22,a1,b1
1364	LD	b5,3*SIZE(B)
1365
1366	MADD	t31,t31,a2,b0
1367	LD	a6,6*SIZE(A)
1368	MADD	t41,t41,a3,b0
1369	LD	a7,7*SIZE(A)
1370
1371	FETCH		$0,(PREA)
1372	MADD	t32,t32,a2,b1
1373	MADD	t42,t42,a3,b1
1374
1375.L42:
1376	MADD	t11,t11,a4,b4
1377	LD	a0,8*SIZE(A)
1378	MADD	t21,t21,a5,b4
1379	LD	a1,9*SIZE(A)
1380
1381	MADD	t12,t12,a4,b5
1382	LD	b2,4*SIZE(B)
1383	MADD	t22,t22,a5,b5
1384	LD	b3,5*SIZE(B)
1385
1386	MADD	t31,t31,a6,b4
1387	LD	a2,10*SIZE(A)
1388	MADD	t41,t41,a7,b4
1389	LD	a3,11*SIZE(A)
1390
1391	FETCH		$0,4*SIZE(PREA)
1392	MADD	t32,t32,a6,b5
1393	MADD	t42,t42,a7,b5
1394
1395.L43:
1396	MADD	t11,t11,a0,b2
1397	LD	a4,12*SIZE(A)
1398	MADD	t21,t21,a1,b2
1399	LD	a5,13*SIZE(A)
1400
1401	MADD	t12,t12,a0,b3
1402	LD	b6,6*SIZE(B)
1403	MADD	t22,t22,a1,b3
1404	LD	b7,7*SIZE(B)
1405
1406	MADD	t31,t31,a2,b2
1407	LD	a6,14*SIZE(A)
1408	MADD	t41,t41,a3,b2
1409	LD	a7,15*SIZE(A)
1410
1411	FETCH		$0,8*SIZE(PREA)
1412	MADD	t32,t32,a2,b3
1413	MADD	t42,t42,a3,b3
1414
1415	daddu	A,A,16*SIZE						#	4mr*4kr
1416	daddu	B,B,8*SIZE						#	2nr*4kr
1417
1418.L44:
1419	MADD	t11,t11,a4,b6
1420	LD	a0,0*SIZE(A)
1421	MADD	t21,t21,a5,b6
1422	LD	a1,1*SIZE(A)
1423
1424
1425	MADD	t12,t12,a4,b7
1426	LD	b0,0*SIZE(B)
1427	MADD	t22,t22,a5,b7
1428	LD	b1,1*SIZE(B)
1429
1430	daddiu	K,K,-1
1431	daddu	PREA,PREA,16*SIZE
1432
1433	MADD	t31,t31,a6,b6
1434	LD	a2,2*SIZE(A)
1435	MADD	t41,t41,a7,b6
1436	LD	a3,3*SIZE(A)
1437
1438	FETCH		$0,-4*SIZE(PREA)
1439	MADD	t32,t32,a6,b7
1440	bnez 	K,.L41
1441	MADD	t42,t42,a7,b7					#  (delay slot) last MADD of the iteration
1442
1443
1444.L45:										#  	kr=2
	#  Remainder of the 4x2 tile when bit 1 of the k count is set: two more
	#  k-steps (.L46, .L47). A and PREA advance by 8*SIZE and B by 4*SIZE.
1445#ifndef TRMMKERNEL
1446	andi	K,KCO,2
1447#else
1448	andi	K,TEMP,2
1449#endif
1450	beqz	K,.L48
1451	nop
1452
1453.L46:
1454	MADD	t11,t11,a0,b0
1455	LD	a4,4*SIZE(A)
1456	MADD	t21,t21,a1,b0
1457	LD	a5,5*SIZE(A)
1458
1459	MADD	t12,t12,a0,b1
1460	LD	b4,2*SIZE(B)
1461	MADD	t22,t22,a1,b1
1462	LD	b5,3*SIZE(B)
1463
1464	MADD	t31,t31,a2,b0
1465	LD	a6,6*SIZE(A)
1466	MADD	t41,t41,a3,b0
1467	LD	a7,7*SIZE(A)
1468
1469	FETCH		$0,0(PREA)
1470	MADD	t32,t32,a2,b1
1471	daddu	B,B,4*SIZE						#  B+=2(nr)*2(kr)=4*SIZE
1472
1473	MADD	t42,t42,a3,b1
1474	daddu	A,A,8*SIZE						#  A+=4(mr)*2(kr)=8*SIZE
1475
1476.L47:
1477	MADD	t11,t11,a4,b4
1478	LD	a0,0*SIZE(A)
1479	MADD	t21,t21,a5,b4
1480	LD	a1,1*SIZE(A)
1481
1482	MADD	t12,t12,a4,b5
1483	LD	b0,0*SIZE(B)
1484	MADD	t22,t22,a5,b5
1485	LD	b1,1*SIZE(B)
1486
1487	MADD	t31,t31,a6,b4
1488	LD	a2,2*SIZE(A)
1489	MADD	t41,t41,a7,b4
1490	LD	a3,3*SIZE(A)
1491
1492	FETCH		$0,4*SIZE(PREA)
1493	MADD	t32,t32,a6,b5
1494	MADD	t42,t42,a7,b5
1495	daddu	PREA,PREA,8*SIZE
1496
1497
1498
1499.L48:										#	 kr=1
	#  Final odd k-step of the 4x2 tile. ALPHA is reloaded in the branch delay
	#  slot, so it is restored whether or not the step runs.
1500#ifndef TRMMKERNEL
1501	andi	K,KCO,1
1502#else
1503	andi	K,TEMP,1
1504#endif
1505	beqz	K,.L49
1506	LD	ALPHA,152($sp)						#  Get ALPHA
1507
1508	FETCH		$0,0(PREA)
1509	MADD	t11,t11,a0,b0
1510	MADD	t21,t21,a1,b0
1511	daddu	A,A,4*SIZE						#  A+=4(mr)*1(kr)=4*SIZE
1512
1513	MADD	t12,t12,a0,b1
1514	MADD	t22,t22,a1,b1
1515	daddu	B,B,2*SIZE
1516	daddu	PREA,PREA,4*SIZE
1517
1518	MADD	t31,t31,a2,b0
1519	MADD	t41,t41,a3,b0
1520
1521	MADD	t32,t32,a2,b1
1522	MADD	t42,t42,a3,b1
1523
1524.L49:										#  Write Back
	#  Write-back of the 4x2 tile.
	#  GEMM: load the eight C values, combine each with its accumulator and
	#  ALPHA through MADD, and store the result back.
	#  TRMM: store ALPHA times each accumulator, then skip A and B past the
	#  part of the panels that was not multiplied and update KK.
	#  Both variants advance CO1 and CO2 by 4*SIZE, decrement M and loop to
	#  .L40 while 4-row tiles remain.
1525#ifndef TRMMKERNEL
1526	LD	c11,0(CO1)							#  gemm write back part, fetch 8 C
1527	LD	c21,1*SIZE(CO1)
1528	LD	c31,2*SIZE(CO1)
1529	LD	c41,3*SIZE(CO1)
1530
1531	LD	c12,0(CO2)
1532	MADD	t11,c11,t11,ALPHA
1533	LD	c22,1*SIZE(CO2)
1534	MADD	t21,c21,t21,ALPHA
1535	LD	c32,2*SIZE(CO2)
1536	MADD	t31,c31,t31,ALPHA
1537	LD	c42,3*SIZE(CO2)
1538	MADD	t41,c41,t41,ALPHA
1539
1540	ST	t11,0(CO1)
1541	MADD	t12,c12,t12,ALPHA
1542	ST	t21,1*SIZE(CO1)
1543	MADD	t22,c22,t22,ALPHA
1544	ST	t31,2*SIZE(CO1)
1545	MADD	t32,c32,t32,ALPHA
1546	ST	t41,3*SIZE(CO1)
1547	MADD	t42,c42,t42,ALPHA
1548	daddiu	M,M,-1
1549
1550	ST	t12,0(CO2)
1551	ST	t22,1*SIZE(CO2)
1552	ST	t32,2*SIZE(CO2)
1553	ST	t42,3*SIZE(CO2)
1554
1555	FETCH	$0,4*SIZE(CO1)
1556	FETCH	$0,4*SIZE(CO2)
1557	FETCH	$0,8*SIZE(CO1)
1558	FETCH	$0,8*SIZE(CO2)
1559
1560	daddu	CO1,CO1,4*SIZE
1561	bnez	M,.L40
1562	daddu	CO2,CO2,4*SIZE						#  (delay slot)
1563
1564#else
1565	MUL	t11, ALPHA, t11
1566	MUL	t21, ALPHA, t21
1567	MUL	t31, ALPHA, t31
1568	MUL	t41, ALPHA, t41
1569
1570	MUL	t12, ALPHA, t12
1571	ST	t11, 0 * SIZE(CO1)
1572	MUL	t22, ALPHA, t22
1573	ST	t21, 1 * SIZE(CO1)
1574	MUL	t32, ALPHA, t32
1575	ST	t31, 2 * SIZE(CO1)
1576	MUL	t42, ALPHA, t42
1577	ST	t41, 3 * SIZE(CO1)
1578
1579	ST	t12, 0 * SIZE(CO2)
1580	daddiu	M,M,-1
1581	ST	t22, 1 * SIZE(CO2)
1582	ST	t32, 2 * SIZE(CO2)
1583	ST	t42, 3 * SIZE(CO2)
1584
1585	daddiu	CO1,CO1, 4*SIZE
1586	daddiu	CO2,CO2, 4*SIZE
1587
1588	FETCH	$0,0(CO1)
1589	FETCH	$0,0(CO2)
	#  Offsets are in elements, as in the nr=4 TRMM write-back. The previous
	#  byte offset 4 was not doubleword aligned and only touched the line
	#  already fetched at offset 0.
1590	FETCH	$0,4*SIZE(CO1)
1591	FETCH	$0,4*SIZE(CO2)
1592
1593#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
1594	dsubu	TEMP, KCO, KK
1595#ifdef LEFT
1596	daddiu	TEMP, TEMP, -4
1597#else
1598	daddiu	TEMP, TEMP, -2
1599#endif
1600	dsll	K,TEMP, 2 + BASE_SHIFT
1601	dsll	TEMP, TEMP, 1 + BASE_SHIFT
1602
1603	daddu	A,A,K
1604	daddu	B,B,TEMP
1605#endif
1606
1607#ifdef LEFT
1608	daddiu	KK, KK, 4
1609#endif
1610	bnez	M,.L40
1611	nop
1612#endif
1613
1614
1615	.align 3
1616.L12_M2:
	#  Row remainder of the nr=2 panel: handle a 2-row tile when bit 1 of MCO
	#  is set, otherwise continue at .L12_M1.
1617	andi	M,MCO,2						#  	mr = 2
1618	beqz	M,.L12_M1
1619	nop
1620
1621.L50:
	#  Set-up for the mr=2,nr=2 tile: choose the B start address (TRMM also
	#  advances A and B by KK), clear t11, t21, t12 and t22, preload a0-a1 and
	#  b0-b1, and compute the unrolled iteration count K.
1622#if defined(TRMMKERNEL)
1623#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1624	move	B,BO
1625#else
1626	dsll	K,    KK, 1 + BASE_SHIFT	#mr=2
1627	dsll	TEMP, KK, 1 + BASE_SHIFT	#nr=2
1628
1629	daddu	A, A, K
1630	daddu	B, BO,  TEMP
1631#endif
1632	LD	a0,0*SIZE(A)
1633	LD	a1,1*SIZE(A)
1634
1635	MTC		$0,t11
1636	LD	b0,0*SIZE(B)
1637	MOV	t21,t11
1638	LD	b1,1*SIZE(B)
1639
1640#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1641	dsubu	TEMP, KCO, KK
1642#elif defined(LEFT)
1643	daddiu	TEMP, KK, 2
1644#else
1645	daddiu	TEMP, KK, 2
1646#endif
1647	dsra	K,TEMP,2					#  K=TEMP/4
1648	MOV	t12,t11
1649	beqz	K,.L55
1650	MOV	t22,t11
1651
1652#else
1653	move	B,BO
1654	LD	a0,0*SIZE(A)
1655	dsra	K,KCO,2						#  K=KCO/4
1656	LD	a1,1*SIZE(A)
1657
1658	MTC		$0,t11
1659	LD	b0,0*SIZE(B)
1660	MOV	t21,t11
1661	LD	b1,1*SIZE(B)
1662
1663	MOV	t12,t11
1664	beqz	K,.L55
1665	MOV	t22,t11
1666
1667#endif
1668
1669.L51:									#  nr=2 mr=2,kr=4
1670	MADD	t11,t11,a0,b0
1671	LD	a4,2*SIZE(A)
1672	MADD	t21,t21,a1,b0
1673	LD	b4,2*SIZE(B)
1674
1675	MADD	t12,t12,a0,b1
1676	LD	a5,3*SIZE(A)
1677	MADD	t22,t22,a1,b1
1678	LD	b5,3*SIZE(B)
1679
1680	MADD	t11,t11,a4,b4
1681	LD	a2,4*SIZE(A)
1682	MADD	t21,t21,a5,b4
1683	LD	b2,4*SIZE(B)
1684
1685	MADD	t12,t12,a4,b5
1686	LD	a3,5*SIZE(A)
1687	MADD	t22,t22,a5,b5
1688	daddiu	K,K,-1
1689	LD	b3,5*SIZE(B)
1690
1691	MADD	t11,t11,a2,b2
1692	LD	a6,6*SIZE(A)
1693	MADD	t21,t21,a3,b2
1694	daddu	A,A,8*SIZE					#  A+=2(mr)*4(kr)*8Byte=8*SIZE
1695	LD	b6,6*SIZE(B)
1696
1697	MADD	t12,t12,a2,b3
1698	daddu	B,B,8*SIZE					#  B+=2(nr)*4(kr)*8Byte=16*SIZE
1699	LD	a7,-1*SIZE(A)
1700	MADD	t22,t22,a3,b3
1701	LD	b7,-1*SIZE(B)
1702
1703	MADD	t11,t11,a6,b6
1704	LD	a0,0*SIZE(A)
1705	MADD	t21,t21,a7,b6
1706	LD	b0,0*SIZE(B)
1707
1708	MADD	t12,t12,a6,b7
1709	LD	a1,1*SIZE(A)
1710
1711	MADD	t22,t22,a7,b7
1712	bnez 	K,.L51
1713	LD	b1,1*SIZE(B)
1714
1715
1716.L55:									#  	kr=2
1717#ifndef TRMMKERNEL
1718	andi	K,KCO,2
1719#else
1720	andi	K,TEMP,2
1721#endif
1722	beqz	K,.L58
1723	nop
1724
1725.L56:
1726	MADD	t11,t11,a0,b0
1727	LD	a4,2*SIZE(A)
1728	MADD	t21,t21,a1,b0
1729	daddu	A,A,4*SIZE					#  A+=2(mr)*2(kr)*8Byte=32
1730	LD	b4,2*SIZE(B)
1731
1732	MADD	t12,t12,a0,b1
1733	daddu	B,B,4*SIZE					#	2nr*2kr
1734	LD	a5,-1*SIZE(A)
1735	MADD	t22,t22,a1,b1
1736	LD	b5,-1*SIZE(B)
1737
1738.L57:
1739	MADD	t11,t11,a4,b4
1740	LD	a0,0*SIZE(A)
1741	MADD	t21,t21,a5,b4
1742	LD	b0,0*SIZE(B)
1743
1744	MADD	t12,t12,a4,b5
1745	LD	a1,1*SIZE(A)
1746	MADD	t22,t22,a5,b5
1747	LD	b1,1*SIZE(B)
1748
1749.L58:									#  kr=1
1750#ifndef TRMMKERNEL
1751	andi	K,KCO,1
1752#else
1753	andi	K,TEMP, 1
1754#endif
1755	beqz	K,.L59
1756	LD	ALPHA,152($sp)					#  Get ALPHA
1757
1758	MADD	t11,t11,a0,b0
1759	MADD	t21,t21,a1,b0
1760	daddu	A,A,2*SIZE					#  	A+=2(mr)*1(kr)*8Byte=16
1761	daddu	B,B,2*SIZE					#	2nr*kr
1762
1763	MADD	t12,t12,a0,b1
1764	MADD	t22,t22,a1,b1
1765
1766
1767.L59:									#  Write Back
1768#ifndef TRMMKERNEL
1769	LD	c11,0(CO1)						#  write gemm part back Fetch 16 C
1770	LD	c21,1*SIZE(CO1)
1771	LD	c12,0(CO2)
1772	LD	c22,1*SIZE(CO2)
1773
1774	MADD	t11,c11,t11,ALPHA
1775	MADD	t21,c21,t21,ALPHA
1776	MADD	t12,c12,t12,ALPHA
1777	MADD	t22,c22,t22,ALPHA
1778
1779	ST	t11,0(CO1)
1780	ST	t21,1*SIZE(CO1)
1781	ST	t12,0(CO2)
1782	ST	t22,1*SIZE(CO2)
1783
1784	daddu	CO1,CO1,2*SIZE
1785	daddu	CO2,CO2,2*SIZE
1786
1787	FETCH	$0,0(CO1)
1788	FETCH	$0,0(CO2)
1789#else
1790	daddiu	M, M, -1
1791	daddiu	CO1,CO1, 2 * SIZE
1792	daddiu	CO2,CO2, 2 * SIZE
1793	MUL	t11, ALPHA, t11
1794	MUL	t21, ALPHA, t21
1795	MUL	t12, ALPHA, t12
1796	MUL	t22, ALPHA, t22
1797
1798	ST	t11, -2 * SIZE(CO1)
1799	ST	t21, -1 * SIZE(CO1)
1800	ST	t12, -2 * SIZE(CO2)
1801	ST	t22, -1 * SIZE(CO2)
1802
1803	FETCH	$0,0(CO1)
1804	FETCH	$0,0(CO2)
1805
1806#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1807	dsubu	TEMP, KCO, KK
1808#ifdef LEFT
1809	daddiu	TEMP, TEMP, -2
1810#else
1811	daddiu	TEMP, TEMP, -2
1812#endif
1813
1814	dsll	K,    TEMP, 1 + BASE_SHIFT
1815	dsll	TEMP, TEMP, 1 + BASE_SHIFT
1816
1817	daddu	A, A, K
1818	daddu	B, B, TEMP
1819#endif
1820
1821#ifdef LEFT
1822	daddiu	KK, KK, 2
1823#endif
1824#endif
1825
1826
	.align 3
.L12_M1:					#  nr=2 column pair: one 1x2 tile, taken only when bit 0 of MCO is set
	andi	M,MCO,1					#  	mr = 1
	beqz	M,.L0_N2_Loop
	nop

.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO					#	Reset B
#else
	dsll	K,    KK, 0 + BASE_SHIFT		#  mr=1 scaling for A
	dsll	TEMP, KK, 1 + BASE_SHIFT		#  nr=2 scaling for B

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	a0,0*SIZE(A)

	MTC		$0,t11				#  t11 is set from $0; t21/t12/t22 are copied from it below
	MOV	t21,t11
	LD	b0,0*SIZE(B)

	MOV	t12,t11
	LD	b1,1*SIZE(B)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = k steps to run for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2				#  K = TEMP/4 passes of the kr=4 loop
	MOV	t22,t11
	beqz	K,.L65
	nop

#else
	dsra	K,KCO,2					#  K = KCO/4 passes of the kr=4 loop
	move	B,BO					#  Reset B
	LD	a0,0*SIZE(A)

	MTC		$0,t11				#  t11 is set from $0; t21/t12/t22 are copied from it below
	MOV	t21,t11
	LD	b0,0*SIZE(B)

	MOV	t12,t11
	LD	b1,1*SIZE(B)
	beqz	K,.L65
	MOV	t22,t11

#endif

.L61:								#	nr=2,mr=1,kr=4
	LD	a4,	1*SIZE(A)				#	A element for k+1
	LD	b4, 2*SIZE(B)
	MADD	t11,t11,a0,b0

	LD	b5,3*SIZE(B)
	MADD	t12,t12,a0,b1

	LD	a2,	2*SIZE(A)				#	A element for k+2
	LD	b2,4*SIZE(B)
	MADD	t11,t11,a4,b4

	LD	b3,5*SIZE(B)
	MADD	t12,t12,a4,b5

	LD	a6,	3*SIZE(A)				#	A element for k+3
	daddiu	K,K,-1
	LD	b6,6*SIZE(B)
	MADD	t11,t11,a2,b2

	LD	b7,7*SIZE(B)
	MADD	t12,t12,a2,b3
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32

	LD	a0,	0*SIZE(A)				#  preload a0/b0/b1 for the next pass or the tails
	daddu	B,B,8*SIZE				#  B+=2(nr)*4(kr)*8Byte=8*SIZE

	LD	b0,0*SIZE(B)
	MADD	t11,t11,a6,b6

	LD	b1,1*SIZE(B)
	bnez 	K,.L61
	MADD	t12,t12,a6,b7



.L65:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L68
	nop

.L66:
	LD	a4,	1*SIZE(A)				#	A element for k+1
	MADD	t11,t11,a0,b0
	LD	b4,2*SIZE(B)
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16

	LD	b5,3*SIZE(B)
	MADD	t12,t12,a0,b1
	daddu	B,B,4*SIZE				#  B+=2(nr)*2(kr)*8Byte=32

.L67:
	LD	a0,0(A)						#	preload a0/b0/b1 for the kr=1 tail
	LD	b0,0*SIZE(B)
	MADD	t11,t11,a4,b4

	LD	b1,1*SIZE(B)
	MADD	t12,t12,a4,b5


.L68:								#   kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L69
	LD	ALPHA,152($sp)				#  Get ALPHA from the frame; ALPHA and b7 are both $f15, so it is loaded only after the loops

	MADD	t11,t11,a0,b0
	MADD	t12,t12,a0,b1
	daddu	A,A,1*SIZE				#  A+=1(mr)*1(kr)*8Byte=8
	daddu	B,B,2*SIZE				#  B+=2(nr)*1(kr)*8Byte=16


.L69:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  load the 1x2 tile of C (c regs alias the a regs)
	LD	c12,0(CO2)

	MADD	t11,c11,t11,ALPHA			#  combine: old C value plus ALPHA times the accumulator
	MADD	t12,c12,t12,ALPHA

	ST	t11,0(CO1)
	ST	t12,0(CO2)

	daddu	CO1,CO1,1*SIZE
	daddu	CO2,CO2,1*SIZE

#else
	MUL	t11, ALPHA, t11				#  trmm part: C is overwritten with ALPHA times the accumulator
	MUL	t12, ALPHA, t12

	ST	t11,  0 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)

	daddu	CO1,CO1,1*SIZE
	daddu	CO2,CO2,1*SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  k steps not consumed by this tile
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	K,    TEMP, 0 + BASE_SHIFT		#  mr=1 scaling for A
	dsll	TEMP, TEMP, 1 + BASE_SHIFT		#  nr=2 scaling for B

	daddu	A, A, K					#  step A and B over the unconsumed k steps
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1				#  KK advances by mr=1
#endif
#endif
2003
.L0_N2_Loop:					#  end of the nr=2 column pair
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2			#  KK advances by nr=2 in the non-LEFT trmm case
#endif
	move	BO, B				#  the next column panel starts where B stopped
2009
2010
	.align	5
.L0_N1:						#  last single column of C: 4x1 tiles first, then the mr=2 and mr=1 tails
	andi	N,NCO,1					#  nr = 1
	beqz	N,.L999
	nop

	move	CO1,C
	dsra	M,MCO,2					#  M = MCO/4 tiles of four rows

	move	A,AO					#  Reset A
	daddu	PREA,AO,SPANA			#  PREA runs SPANA bytes ahead of AO for the FETCH touches
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	beqz	M,.L11_M2
	daddu	C,CO1,LDC				#  C moves on by one column stride LDC

.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO					#	Reset B
#else
	dsll	K,    KK, 2 + BASE_SHIFT		#  mr=4 scaling for A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  nr=1 scaling for B

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 is set from $0; t21/t31/t41 are copied from it below
	LD	a0,0*SIZE(A)
	MOV	t21,t11
	LD	a1,1*SIZE(A)

	MOV	t31,t11
	LD	a2,2*SIZE(A)
	MOV	t41,t11
	LD	a3,3*SIZE(A)


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = k steps to run for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	K,TEMP,2				#  K = TEMP/4 passes of the kr=4 loop
	beqz	K,.L75
	nop
#else
	move	B, BO					#	Reset B
	dsra	K,KCO,2					#  K = KCO/4 passes of the kr=4 loop
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 is set from $0; t21/t31/t41 are copied from it below
	LD	a0,0*SIZE(A)
	MOV	t21,t11
	LD	a1,1*SIZE(A)

	MOV	t31,t11
	LD	a2,2*SIZE(A)
	MOV	t41,t11
	beqz	K,.L75
	LD	a3,3*SIZE(A)

#endif

.L71:								#  nr=1,mr=kr=4
	LD	b4,	1*SIZE(B)				#	B element for k+1
	MADD	t11,t11,a0,b0

	LD	a4,	4*SIZE(A)
	MADD	t21,t21,a1,b0

	LD	a5,	5*SIZE(A)
	FETCH		$0,(PREA)			#  FETCH is ld into $0: result discarded, presumably a cache touch ahead in A

	LD	a6,6*SIZE(A)
	MADD	t31,t31,a2,b0

	LD	a7,7*SIZE(A)
	MADD	t41,t41,a3,b0

.L72:
	LD	b2,	2*SIZE(B)				#	B element for k+2
	MADD	t11,t11,a4,b4

	LD	a0,8*SIZE(A)
	MADD	t21,t21,a5,b4

	LD	a1,9*SIZE(A)
	FETCH		$0,4*SIZE(PREA)

	LD	a2,10*SIZE(A)
	MADD	t31,t31,a6,b4

	LD	a3,11*SIZE(A)
	MADD	t41,t41,a7,b4

.L73:
	LD	b6,	3*SIZE(B)				#	B element for k+3
	MADD	t11,t11,a0,b2

	LD	a4,12*SIZE(A)
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32

	LD	a5,13*SIZE(A)
	MADD	t21,t21,a1,b2

	LD	a6,14*SIZE(A)
	FETCH		$0,8*SIZE(PREA)
	MADD	t31,t31,a2,b2

	LD	a7,15*SIZE(A)
	MADD	t41,t41,a3,b2
	daddu	A,A,16*SIZE				#  A+=4(mr)*4(kr)*8Byte=16*SIZE

.L74:
	LD	b0,	0*SIZE(B)				#  preload b0 and a0..a3 for the next pass or the tails
	MADD	t11,t11,a4,b6

	LD	a0,0*SIZE(A)
	daddu	PREA,PREA,16*SIZE

	LD	a1,1*SIZE(A)
	MADD	t21,t21,a5,b6

	LD	a2,2*SIZE(A)
	daddiu	K,K,-1
	MADD	t31,t31,a6,b6

	LD	a3,3*SIZE(A)
	MADD	t41,t41,a7,b6
	bnez 	K,.L71
	FETCH		$0,-32(PREA)			#  32 bytes below the already advanced PREA


.L75:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L78
	nop

.L76:
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0

	LD	a4,4*SIZE(A)
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16

	LD	a5,5*SIZE(A)
	MADD	t21,t21,a1,b0
	FETCH		$0,0(PREA)

	LD	a6,6*SIZE(A)
	MADD	t31,t31,a2,b0

	LD	a7,7*SIZE(A)
	MADD	t41,t41,a3,b0
	daddu	A,A,8*SIZE				#  A+=4(mr)*2(kr)*8Byte=8*SIZE

.L77:
	LD	b0,0(B)					#  preload b0 and a0..a3 for the kr=1 tail
	MADD	t11,t11,a4,b4

	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b4
	FETCH		$0,4*SIZE(PREA)

	LD	a1,1*SIZE(A)
	MADD	t31,t31,a6,b4

	LD	a2,2*SIZE(A)
	MADD	t41,t41,a7,b4

	LD	a3,3*SIZE(A)
	daddu	PREA,PREA,8*SIZE



.L78:								#   kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L79
	LD	ALPHA,152($sp)				#  Get ALPHA from the frame (ALPHA is $f15, the same register as b7)

	FETCH		$0,0(PREA)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE				#  A+=4(mr)*1(kr)*8Byte=32

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)*8Byte=8
	daddu	PREA,PREA,4*SIZE


.L79:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  load the 4x1 tile of C (c regs alias the a regs)
	LD	c21,1*SIZE(CO1)
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	MADD	t11,c11,t11,ALPHA			#  combine: old C value plus ALPHA times the accumulator
	MADD	t21,c21,t21,ALPHA
	MADD	t31,c31,t31,ALPHA
	MADD	t41,c41,t41,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)
	daddiu	M,M,-1					#  M--

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,8*SIZE(CO1)

	bnez	M,.L70					#  M!=0
	daddu	CO1,CO1,4*SIZE			#  COx += 4*8Byte
#else
	daddiu	M,M,-1					#  M--
	MUL	t11, ALPHA, t11				#  trmm part: C is overwritten with ALPHA times the accumulator
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,8*SIZE(CO1)

	daddu	CO1,CO1,4*SIZE
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  k steps not consumed by this tile
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	K,    TEMP, 2 + BASE_SHIFT		#  mr=4 scaling for A
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  nr=1 scaling for B

	daddu	A, A,K					#  step A and B over the unconsumed k steps
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4				#  KK advances by mr=4
#endif
	bnez	M,.L70
	nop
#endif
2277
2278
	.align 3
.L11_M2:					#  nr=1 column: one 2x1 tile, taken only when bit 1 of MCO is set
	andi	M,MCO,2					#  mr = 2
	beqz	M,.L11_M1
	nop

.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO					#  restart B at the panel base BO
#else
	dsll	K,    KK, 1 + BASE_SHIFT		#  mr=2 scaling for A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  nr=1 scaling for B

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 is set from $0; t21 is copied from it below
	MOV		t21,t11
	LD	a0,0*SIZE(A)
	LD	a1,1*SIZE(A)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = k steps to run for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	K,TEMP,2				#  K = TEMP/4 passes of the kr=4 loop
	beqz	K,.L85
	nop
#else
	move	B, BO
	dsra	K,KCO,2					#  K = KCO/4 passes of the kr=4 loop
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 is set from $0; t21 is copied from it below
	MOV		t21,t11
	LD	a0,0*SIZE(A)

	beqz	K,.L85
	LD	a1,1*SIZE(A)

#endif

.L81:								#  nr=1,mr=2,kr=4
	LD	b4,	1*SIZE(B)
	LD	a4,2*SIZE(A)
	MADD	t11,t11,a0,b0
	LD	a5,3*SIZE(A)
	MADD	t21,t21,a1,b0

	LD	b2,	2*SIZE(B)
	LD	a2,4*SIZE(A)
	MADD	t11,t11,a4,b4
	LD	a3,5*SIZE(A)
	MADD	t21,t21,a5,b4

	LD	b6,	3*SIZE(B)
	LD	a6,6*SIZE(A)
	MADD	t11,t11,a2,b2
	LD	a7,7*SIZE(A)
	MADD	t21,t21,a3,b2

	daddu	A,A,8*SIZE				#  A+=2(mr)*4(kr)*8Byte=8*SIZE
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32

	LD	b0,	0*SIZE(B)				#  preload b0/a0/a1 for the next pass or the tails
	daddiu	K,K,-1

	LD	a0,0*SIZE(A)
	MADD	t11,t11,a6,b6

	LD	a1,1*SIZE(A)
	bnez 	K,.L81
	MADD	t21,t21,a7,b6

.L85:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L88
	nop

.L86:
	LD	b4,	1*SIZE(B)
	LD	a4,2*SIZE(A)
	MADD	t11,t11,a0,b0
	LD	a5,3*SIZE(A)
	MADD	t21,t21,a1,b0

	daddu	A,A,4*SIZE				#  A+=2(mr)*2(kr)*8Byte=32
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16

	LD	b0,0(B)					#  preload b0/a0/a1 for the kr=1 tail
	LD	a0,0*SIZE(A)
	MADD	t11,t11,a4,b4
	LD	a1,1*SIZE(A)
	MADD	t21,t21,a5,b4



.L88:								#  kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L89
	LD	ALPHA,152($sp)				#  Get ALPHA from the frame (ALPHA is $f15, the same register as b7)

	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE				#  A+=2(mr)*1(kr)*8Byte=16
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)*8Byte=8


.L89:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  load the 2x1 tile of C (c regs alias the a regs)
	LD	c21,1*SIZE(CO1)

	MADD	t11,c11,t11,ALPHA			#  combine: old C value plus ALPHA times the accumulator
	MADD	t21,c21,t21,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)

	FETCH	$0,2*SIZE(CO1)				#  FETCH is ld into $0: result discarded, presumably a cache touch of the next C

	daddu	CO1,CO1,2*SIZE			#  COx += 2*8Byte

#else
	daddu	CO1,CO1,2*SIZE			#  COx += 2*8Byte
	MUL	t11, ALPHA, t11				#  trmm part: C is overwritten with ALPHA times the accumulator
	MUL	t21, ALPHA, t21

	FETCH	$0,0(CO1)
	ST	t11, -2 * SIZE(CO1)			#  CO1 already advanced, hence the negative offsets
	ST	t21, -1 * SIZE(CO1)
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  k steps not consumed by this tile
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	K,    TEMP, 1 + BASE_SHIFT		#  mr=2 scaling for A
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  nr=1 scaling for B

	daddu	A, A, K					#  step A and B over the unconsumed k steps
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2				#  KK advances by mr=2
#endif
#endif
2443
2444
	.align 3
.L11_M1:					#  nr=1 column: final 1x1 element, taken only when bit 0 of MCO is set
	andi		M,MCO,1				#   mr = 1
	beqz	M,.L999
	nop

.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,  BO					#  restart B at the panel base BO
#else
	dsll	K,    KK, 0 + BASE_SHIFT		#  mr=1 scaling for A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  nr=1 scaling for B

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	a0,	0*SIZE(A)
	LD	b0,	0*SIZE(B)
	MTC		$0,t11				#  single accumulator t11 is set from $0

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = k steps to run for this element
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	K,  TEMP, 2				#  K = TEMP/4 passes of the kr=4 loop
	beqz	K,.L95
	nop

#else
	move	B,  BO
	LD	a0,	0*SIZE(A)
	LD	b0,	0*SIZE(B)
	dsra	K,KCO,2					#  K = KCO/4 passes of the kr=4 loop
	beqz	K,.L95
	MTC		$0,t11				#  single accumulator t11 is set from $0
#endif

.L91:								#  nr=mr=1,kr=4
	LD	a4,	1*SIZE(A)
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0

	LD	a2,	2*SIZE(A)
	LD	b2,	2*SIZE(B)
	MADD	t11,t11,a4,b4

	LD	a6,	3*SIZE(A)
	LD	b6,	3*SIZE(B)
	MADD	t11,t11,a2,b2

	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32

	LD	a0,	0*SIZE(A)				#  preload a0/b0 for the next pass or the tails
	LD	b0,	0*SIZE(B)
	MADD	t11,t11,a6,b6

	daddiu	K,K,-1
	bnez 	K,.L91
	nop

.L95:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L98
	nop

.L96:
	LD	a4,	1*SIZE(A)
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16

	LD	b0,0(B)					#  preload a0/b0 for the kr=1 tail
	LD	a0,0(A)
	MADD	t11,t11,a4,b4

.L98:								#  kr=1
#ifndef TRMMKERNEL
	andi		K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L99
	LD	ALPHA,152($sp)				#  Get ALPHA from the frame (ALPHA is $f15, the same register as b7)

	MADD	t11,t11,a0,b0				#  last tile of the routine: A and B are not advanced here


.L99:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  load the single C element (c11 aliases a0)
	MADD	t11,c11,t11,ALPHA			#  combine: old C value plus ALPHA times the accumulator
	ST	t11,0(CO1)

#else
	MUL	t11, ALPHA, t11				#  trmm part: C is overwritten with ALPHA times the accumulator

	ST	t11,  0 * SIZE(CO1)
#endif
2553
2554
.L999:							#  End: reload saved registers from the 160-byte frame and return
	ld	$16,   0($sp)			#  integer registers $16..$22 from offsets 0..48
	ld	$17,   8($sp)
	ld	$18,  16($sp)
	ld	$19,  24($sp)
	ld	$20,  32($sp)
	ld	$21,  40($sp)
	ld	$22,  48($sp)
	LD	$f24, 56($sp)			#  floating point registers $f24..$f28 from offsets 56..88
	LD	$f25, 64($sp)
	LD	$f26, 72($sp)
	LD	$f27, 80($sp)
	LD	$f28, 88($sp)			#  NOTE(review): accumulators also use $f29-$f31, which are not reloaded here; confirm against the prologue and the target ABI
	ld	$23,  96($sp)			#  integer registers $23..$25 from offsets 96..112
	ld	$24, 104($sp)
	ld	$25, 112($sp)
	LD	$f20,120($sp)			#  floating point registers $f20..$f23 from offsets 120..144
	LD	$f21,128($sp)
	LD	$f22,136($sp)
	LD	$f23,144($sp)			#  offset 152 is the ALPHA slot read by the tile code, so nothing is reloaded from it

	j	$31				#  return through $31
	daddiu	$sp, $sp, 160			#  release the frame; placed after the jump, presumably in its delay slot

	EPILOGUE
2580