/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Every mnemonic-like macro used below (PROLOGUE, EPILOGUE, LD, ST, MADD,
   MOV, MTC, ADD, NOP, SDARG, LDARG) and the constants SIZE and ZBASE_SHIFT
   come from common.h, which is not visible in this file. */

/* Integer registers read as kernel inputs.  LDC is not passed in a
   register: the entry code loads it from the caller's stack. */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8

/* Running read pointers into the A and B operands. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts tiles along M, J panels along N, L steps along K. */
#define I	$2
#define J	$3
#define L	$7

/* Write pointers for up to eight C columns; consecutive ones are LDC apart. */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17
#define CO5	$18
#define CO6	$19
#define CO7	$20
#define CO8	$21

/* Only defined for TRMMKERNEL builds; none of the three names is
   referenced by the code in this file. */
#if defined(TRMMKERNEL)
#define OFFSET	$22
#define KK	$23
#define TEMP	$24
#endif

/* Values loaded from A. */
#define a1	$f0
#define a2	$f1
#define a3	$f28
#define a4	$f29

/* Values loaded from B. */
#define b1	$f2
#define b2	$f3
#define b3	$f4
#define b4	$f5
#define b5	$f6
#define b6	$f7
#define b7	$f8
#define b8	$f9

/* a5 shares $f9 with b8; the code uses the register only under the name a5. */
#define a5	b8

/* Accumulators cXY: X = C column (1..8), Y = row inside the 2-row tile.
   $f15 and $f16 are skipped because they hold ALPHA_R / ALPHA_I. */
#define c11	$f10
#define c12	$f11
#define c21	$f12
#define c22	$f13
#define c31	$f14
#define c32	$f17
#define c41	$f18
#define c42	$f19
#define c51	$f20
#define c52	$f21
#define c61	$f22
#define c62	$f23
#define c71	$f24
#define c72	$f25
#define c81	$f26
#define c82	$f27

/* Scale factors applied when an accumulator is added into C: ALPHA_R
   scales the value added to the even slot, ALPHA_I the odd slot. */
#define ALPHA_R	$f15
#define ALPHA_I	$f16

91	PROLOGUE
92
93	daddiu	$sp, $sp, -128
94
95	SDARG	$16,   0($sp)
96	SDARG	$17,   8($sp)
97	SDARG	$18,  16($sp)
98	SDARG	$19,  24($sp)
99	SDARG	$20,  32($sp)
100	SDARG	$21,  40($sp)
101	sdc1	$f24, 48($sp)
102	sdc1	$f25, 56($sp)
103	sdc1	$f26, 64($sp)
104	sdc1	$f27, 72($sp)
105	sdc1	$f28, 80($sp)
106	sdc1	$f29, 88($sp)
107
108	LDARG	LDC,  128($sp)
109
110	dsll	LDC, LDC, ZBASE_SHIFT
111
112	dsra	J,  N, 3
113	blez	J, .L30
114	nop
115
/* Top of the 8-column panel loop (runs J = N >> 3 times).
   CO1..CO8 are set to eight consecutive C columns LDC apart, C is moved
   past them, AO restarts at A, and c11..c61 are zeroed.  The pointer
   arithmetic is interleaved with the FP moves. */
.L10:
	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	move	AO, A
	daddu	CO3, CO2, LDC
	daddiu	J, J, -1
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	daddu	CO5, CO4, LDC
	MOV	c31, c11
	daddu	CO6, CO5, LDC
	MOV	c41, c11
	daddu	CO7, CO6, LDC
	MOV	c51, c11
	daddu	CO8, CO7, LDC
	/* I = M / 2 two-row tiles. */
	dsra	I,  M, 1
	daddu	C,   CO8, LDC

	/* No full tile: go to the single-row code.  The MOV sits in the
	   branch delay slot and runs either way. */
	blez	I, .L20
	MOV	c61, c11

/* Start of one 2x8 tile.  Zeroes the remaining accumulators (c71, c81 and
   every cX2; c11..c61 are already zero on entry), preloads the first A/B
   values, and sets L = K / 4 for the unrolled loop.  If L > 0 the first
   four multiply-adds of k step 0 are issued here so that .L12/.L13 can
   begin with the second group. */
.L11:
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(B)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11

	dsra	L,  K, 2
	MOV	c32, c11
	LD	b3,  2 * SIZE(B)
	MOV	c42, c11

	LD	b4,  3 * SIZE(B)
	MOV	c52, c11
	LD	b5,  4 * SIZE(B)
	MOV	c62, c11

	LD	b6,  8 * SIZE(B)
	MOV	c72, c11
	LD	b7, 12 * SIZE(B)
	MOV	c82, c11

	/* K < 4: only the one-step remainder loop runs.  BO = B is set in the
	   delay slot on both paths. */
	blez	L, .L15
	move	BO,  B

	/* First quarter of k step 0 (row 0, columns 1-4). */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	/* Exactly one unrolled iteration: use the non-looping copy at .L13. */
	blez	L, .L13
	MADD	c41, c41, a1, b4
	NOP
	.align	3

/* Steady-state 2x8 inner loop: four k steps per iteration (64
   multiply-adds), with the loads for later groups interleaved between them.
   Row 0 is fed from a1 / a4 / a3 / a4 (A offsets 0, 2, 4, 6), row 1 from a2
   (A offsets 1, 3, 5, 7).  On entry the first four multiply-adds of k step 0
   have already been issued; the loop ends by issuing the same four for the
   next iteration.  AO advances by 8 * SIZE and BO by 32 * SIZE per pass.
   Instruction order is deliberate; do not reorder. */
.L12:
	/* k step 0, remaining three quarters */
	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	LD	a4,  2 * SIZE(AO)
	MADD	c61, c61, a1, b2
	NOP
	MADD	c71, c71, a1, b3
	NOP
	MADD	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k step 1 */
	MADD	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a4, b7
	NOP
	MADD	c61, c61, a4, b2
	NOP
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k step 2 */
	MADD	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	NOP
	MADD	c41, c41, a3, b4
	NOP

	MADD	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD	c51, c51, a3, b5
	LD	a4,  6 * SIZE(AO)
	MADD	c61, c61, a3, b2
	NOP
	MADD	c71, c71, a3, b3
	NOP
	MADD	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k step 3 */
	MADD	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	daddiu	L, L, -1

	MADD	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* Pointers advance here; the offsets below are relative to the new
	   BO / AO. */
	MADD	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)

	/* First quarter of the next iteration's k step 0; the last
	   multiply-add sits in the branch delay slot. */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	bgtz	L, .L12
	MADD	c41, c41, a1, b4
	NOP
	.align 3

/* Final unrolled iteration of the 2x8 inner loop.  Same multiply-adds and
   loads as one pass of .L12, except that it does not decrement L, does not
   issue the first four multiply-adds of a following iteration, and does not
   branch; control falls through into the K & 3 remainder at .L15.  The
   loads at the end leave a1, b1..b7 holding the values at offsets
   0 / 0,1,2,3,4,8,12 from the advanced AO / BO. */
.L13:
	/* k step 0, remaining three quarters */
	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	NOP
	MADD	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD	c71, c71, a1, b3
	NOP
	MADD	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k step 1 */
	MADD	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a4, b7
	NOP
	MADD	c61, c61, a4, b2
	NOP
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k step 2 */
	MADD	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	NOP
	MADD	c41, c41, a3, b4
	NOP

	MADD	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD	c51, c51, a3, b5
	NOP
	MADD	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD	c71, c71, a3, b3
	NOP
	MADD	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k step 3 */
	MADD	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* Pointers advance here; the offsets below are relative to the new BO. */
	MADD	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)
	.align 3

/* K & 3 remainder for the 2x8 tile: one k step per pass of .L16
   (16 multiply-adds; AO += 2 * SIZE, BO += 8 * SIZE).  Expects a1 and
   b1..b5 to already hold the values at offset 0 of AO and offsets 0..4 of
   BO, and reloads them for the next pass. */
.L15:
	andi	L,  K, 3
	NOP
	blez	L, .L18
	NOP
	.align	3

.L16:
	/* row 0, columns 1-4 */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	NOP

	/* row 1, columns 1-4; b2..b4 are refilled with columns 6-8 */
	MADD	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* row 0, columns 5-8; counter and pointers advance in between */
	MADD	c51, c51, a1, b5
	daddiu	L, L, -1
	MADD	c61, c61, a1, b2
	daddiu	AO, AO,  2 * SIZE
	MADD	c71, c71, a1, b3
	daddiu	BO, BO,  8 * SIZE
	MADD	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	/* row 1, columns 5-8; the last reload sits in the branch delay slot */
	MADD	c52, c52, a2, b5
	LD	b5,  4 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	bgtz	L, .L16
	LD	b4,  3 * SIZE(BO)

/* Write-back for the 2x8 tile.  Each accumulator is added into two
   consecutive C slots: slot 0/2 receives ALPHA_R * c, slot 1/3 receives
   ALPHA_I * c (cX1 -> slots 0,1; cX2 -> slots 2,3 of column X).
   $f0-$f7 serve as scratch; they are the registers named a1, a2, b1..b6,
   which .L11 reloads before the next tile.  Every COn advances by
   4 * SIZE, and c11..c61 are re-zeroed for the next tile while the last
   stores are issued. */
.L18:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 2 * SIZE(CO1)
	LD	$f3, 3 * SIZE(CO1)

	/* columns 1 and 2 */
	LD	$f4, 0 * SIZE(CO2)
	MADD	$f0, $f0, ALPHA_R, c11
	LD	$f5, 1 * SIZE(CO2)
	MADD	$f1, $f1, ALPHA_I, c11
	LD	$f6, 2 * SIZE(CO2)
	MADD	$f2, $f2, ALPHA_R, c12
	LD	$f7, 3 * SIZE(CO2)
	MADD	$f3, $f3, ALPHA_I, c12

	MADD	$f4, $f4, ALPHA_R, c21
	ST	$f0,  0 * SIZE(CO1)
	MADD	$f5, $f5, ALPHA_I, c21
	ST	$f1,  1 * SIZE(CO1)
	MADD	$f6, $f6, ALPHA_R, c22
	ST	$f2,  2 * SIZE(CO1)
	MADD	$f7, $f7, ALPHA_I, c22
	ST	$f3,  3 * SIZE(CO1)

	/* columns 3 and 4 */
	LD	$f0, 0 * SIZE(CO3)
	LD	$f1, 1 * SIZE(CO3)
	LD	$f2, 2 * SIZE(CO3)
	LD	$f3, 3 * SIZE(CO3)

	ST	$f4,  0 * SIZE(CO2)
	ST	$f5,  1 * SIZE(CO2)
	ST	$f6,  2 * SIZE(CO2)
	ST	$f7,  3 * SIZE(CO2)

	LD	$f4, 0 * SIZE(CO4)
	LD	$f5, 1 * SIZE(CO4)
	LD	$f6, 2 * SIZE(CO4)
	LD	$f7, 3 * SIZE(CO4)

	MADD	$f0, $f0, ALPHA_R, c31
	MADD	$f1, $f1, ALPHA_I, c31
	MADD	$f2, $f2, ALPHA_R, c32
	MADD	$f3, $f3, ALPHA_I, c32

	MADD	$f4, $f4, ALPHA_R, c41
	ST	$f0,  0 * SIZE(CO3)
	MADD	$f5, $f5, ALPHA_I, c41
	ST	$f1,  1 * SIZE(CO3)
	MADD	$f6, $f6, ALPHA_R, c42
	ST	$f2,  2 * SIZE(CO3)
	MADD	$f7, $f7, ALPHA_I, c42
	ST	$f3,  3 * SIZE(CO3)

	/* columns 5 and 6 */
	LD	$f0, 0 * SIZE(CO5)
	LD	$f1, 1 * SIZE(CO5)
	LD	$f2, 2 * SIZE(CO5)
	LD	$f3, 3 * SIZE(CO5)

	ST	$f4,  0 * SIZE(CO4)
	ST	$f5,  1 * SIZE(CO4)
	ST	$f6,  2 * SIZE(CO4)
	ST	$f7,  3 * SIZE(CO4)

	LD	$f4, 0 * SIZE(CO6)
	LD	$f5, 1 * SIZE(CO6)
	LD	$f6, 2 * SIZE(CO6)
	LD	$f7, 3 * SIZE(CO6)

	/* CO1..CO4 are finished; advance them to the next tile. */
	MADD	$f0, $f0, ALPHA_R, c51
	daddiu	CO1,CO1, 4 * SIZE
	MADD	$f1, $f1, ALPHA_I, c51
	daddiu	CO2,CO2, 4 * SIZE
	MADD	$f2, $f2, ALPHA_R, c52
	daddiu	CO3,CO3, 4 * SIZE
	MADD	$f3, $f3, ALPHA_I, c52
	daddiu	CO4,CO4, 4 * SIZE

	MADD	$f4, $f4, ALPHA_R, c61
	ST	$f0,  0 * SIZE(CO5)
	MADD	$f5, $f5, ALPHA_I, c61
	ST	$f1,  1 * SIZE(CO5)
	MADD	$f6, $f6, ALPHA_R, c62
	ST	$f2,  2 * SIZE(CO5)
	MADD	$f7, $f7, ALPHA_I, c62
	ST	$f3,  3 * SIZE(CO5)

	/* columns 7 and 8 */
	LD	$f0, 0 * SIZE(CO7)
	LD	$f1, 1 * SIZE(CO7)
	LD	$f2, 2 * SIZE(CO7)
	LD	$f3, 3 * SIZE(CO7)

	ST	$f4,  0 * SIZE(CO6)
	ST	$f5,  1 * SIZE(CO6)
	ST	$f6,  2 * SIZE(CO6)
	ST	$f7,  3 * SIZE(CO6)

	/* c11 is no longer needed, so it is cleared here for the next tile. */
	LD	$f4, 0 * SIZE(CO8)
	daddiu	I, I, -1
	LD	$f5, 1 * SIZE(CO8)
	MTC	$0,  c11
	LD	$f6, 2 * SIZE(CO8)
	LD	$f7, 3 * SIZE(CO8)

	/* CO5..CO8 advance before the CO7/CO8 stores, hence the negative
	   store offsets below. */
	MADD	$f0, $f0, ALPHA_R, c71
	daddiu	CO5,CO5, 4 * SIZE
	MADD	$f1, $f1, ALPHA_I, c71
	daddiu	CO6,CO6, 4 * SIZE
	MADD	$f2, $f2, ALPHA_R, c72
	daddiu	CO7,CO7, 4 * SIZE
	MADD	$f3, $f3, ALPHA_I, c72
	daddiu	CO8,CO8, 4 * SIZE

	MADD	$f4, $f4, ALPHA_R, c81
	ST	$f0, -4 * SIZE(CO7)
	MADD	$f5, $f5, ALPHA_I, c81
	ST	$f1, -3 * SIZE(CO7)
	MADD	$f6, $f6, ALPHA_R, c82
	ST	$f2, -2 * SIZE(CO7)
	MADD	$f7, $f7, ALPHA_I, c82
	ST	$f3, -1 * SIZE(CO7)

	/* Re-zero c21..c61 for the next tile; the last MOV is in the branch
	   delay slot. */
	ST	$f4, -4 * SIZE(CO8)
	MOV	c21, c11
	ST	$f5, -3 * SIZE(CO8)
	MOV	c31, c11
	ST	$f6, -2 * SIZE(CO8)
	MOV	c41, c11
	ST	$f7, -1 * SIZE(CO8)
	MOV	c51, c11
	bgtz	I, .L11
	MOV	c61, c11
	.align 3

/* Single leftover row (M & 1) of the 8-column panel.  c11..c51 are already
   zero on entry; c61, c71 and c81 are cleared here.  Preloads four A values
   and b1..b7, then L = K / 4. */
.L20:
	andi	I,  M, 1
	MOV	c61, c11
	/* M even: nothing left in this panel (MOV runs in the delay slot). */
	blez	I, .L29
	MOV	c71, c11

	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	dsra	L,  K, 2
	MOV	c81, c11

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L25
	move	BO,  B
	.align	3

/* 1x8 inner loop, four k steps per iteration (a1, a2, a3, a4 in turn, each
   multiplied by eight B values).  AO advances by 4 * SIZE and BO by
   32 * SIZE; both move in the middle of the body, so later loads use
   offsets relative to the already-advanced pointers (including the
   negative BO offsets in the last step). */
.L22:
	/* k step 0 (a1) */
	MADD	c11, c11, a1, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c61, c61, a1, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c71, c71, a1, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c81, c81, a1, b4
	LD	b4, 11 * SIZE(BO)

	LD	a1,  4 * SIZE(AO)
	daddiu	L, L, -1

	/* k step 1 (a2) */
	MADD	c11, c11, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c61, c61, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c71, c71, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c81, c81, a2, b4
	LD	b4, 19 * SIZE(BO)

	LD	a2,  5 * SIZE(AO)
	daddiu	AO, AO,  4 * SIZE

	/* k step 2 (a3) */
	MADD	c11, c11, a3, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c21, c21, a3, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c31, c31, a3, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c41, c41, a3, b4
	LD	b4, 23 * SIZE(BO)

	MADD	c51, c51, a3, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c61, c61, a3, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c71, c71, a3, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c81, c81, a3, b4
	LD	b4, 27 * SIZE(BO)

	LD	a3,  2 * SIZE(AO)
	daddiu	BO, BO, 32 * SIZE

	/* k step 3 (a4) */
	MADD	c11, c11, a4, b6
	LD	b6,  8 * SIZE(BO)
	MADD	c21, c21, a4, b2
	LD	b2, -3 * SIZE(BO)
	MADD	c31, c31, a4, b3
	LD	b3, -2 * SIZE(BO)
	MADD	c41, c41, a4, b4
	LD	b4, -1 * SIZE(BO)

	MADD	c51, c51, a4, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c61, c61, a4, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c71, c71, a4, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c81, c81, a4, b4
	LD	b4,  3 * SIZE(BO)
	/* a4 for the next pass is reloaded in the branch delay slot. */
	bgtz	L, .L22
	LD	a4,  3 * SIZE(AO)
	.align 3

/* K & 3 remainder for the 1x8 row: one k step per pass of .L26
   (AO += 1 * SIZE, BO += 8 * SIZE).  Uses a1 and b1..b5 as left by the
   preload or by .L22, and reloads them for the next pass. */
.L25:
	andi	L,  K, 3
	NOP
	blez	L, .L28
	NOP
	.align	3

.L26:
	MADD	c11, c11, a1, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	daddiu	L, L, -1
	/* Copies a2 onto itself; the value is unchanged. */
	MOV	a2, a2
	daddiu	AO, AO,  1 * SIZE
	daddiu	BO, BO,  8 * SIZE

	MADD	c51, c51, a1, b5
	LD	b5,  4 * SIZE(BO)
	MADD	c61, c61, a1, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c71, c71, a1, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	bgtz	L, .L26
	LD	b4,  3 * SIZE(BO)

/* Write-back for the leftover row of the 8-column panel: for each column X,
   slot 0 of COX receives ALPHA_R * cX1 and slot 1 receives ALPHA_I * cX1.
   $f0-$f7 are scratch.  The CO pointers are not advanced here. */
.L28:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 0 * SIZE(CO2)
	LD	$f3, 1 * SIZE(CO2)

	LD	$f4, 0 * SIZE(CO3)
	MADD	$f0, $f0, ALPHA_R, c11
	LD	$f5, 1 * SIZE(CO3)
	MADD	$f1, $f1, ALPHA_I, c11
	LD	$f6, 0 * SIZE(CO4)
	MADD	$f2, $f2, ALPHA_R, c21
	LD	$f7, 1 * SIZE(CO4)
	MADD	$f3, $f3, ALPHA_I, c21

	MADD	$f4, $f4, ALPHA_R, c31
	ST	$f0,  0 * SIZE(CO1)
	MADD	$f5, $f5, ALPHA_I, c31
	ST	$f1,  1 * SIZE(CO1)
	MADD	$f6, $f6, ALPHA_R, c41
	ST	$f2,  0 * SIZE(CO2)
	MADD	$f7, $f7, ALPHA_I, c41
	ST	$f3,  1 * SIZE(CO2)

	LD	$f0, 0 * SIZE(CO5)
	LD	$f1, 1 * SIZE(CO5)
	LD	$f2, 0 * SIZE(CO6)
	LD	$f3, 1 * SIZE(CO6)

	ST	$f4,  0 * SIZE(CO3)
	ST	$f5,  1 * SIZE(CO3)
	ST	$f6,  0 * SIZE(CO4)
	ST	$f7,  1 * SIZE(CO4)

	LD	$f4, 0 * SIZE(CO7)
	MADD	$f0, $f0, ALPHA_R, c51
	LD	$f5, 1 * SIZE(CO7)
	MADD	$f1, $f1, ALPHA_I, c51
	LD	$f6, 0 * SIZE(CO8)
	MADD	$f2, $f2, ALPHA_R, c61
	LD	$f7, 1 * SIZE(CO8)
	MADD	$f3, $f3, ALPHA_I, c61

	MADD	$f4, $f4, ALPHA_R, c71
	ST	$f0,  0 * SIZE(CO5)
	MADD	$f5, $f5, ALPHA_I, c71
	ST	$f1,  1 * SIZE(CO5)
	MADD	$f6, $f6, ALPHA_R, c81
	ST	$f2,  0 * SIZE(CO6)
	MADD	$f7, $f7, ALPHA_I, c81
	ST	$f3,  1 * SIZE(CO6)

	ST	$f4,  0 * SIZE(CO7)
	ST	$f5,  1 * SIZE(CO7)
	ST	$f6,  0 * SIZE(CO8)
	ST	$f7,  1 * SIZE(CO8)
	.align 3

/* End of one 8-column panel: B takes the position BO reached (delay slot,
   executed whether or not the branch is taken), then loop while J > 0. */
.L29:
	bgtz	J, .L10
	move	B, BO
	.align 3

/* N & 4: one 4-column panel.  CO1..CO4 are four consecutive C columns,
   C moves past them, and c11..c41 are zeroed. */
.L30:
	andi	J,  N, 4
	/* AO = A sits in the delay slot and runs on both paths. */
	blez	J, .L50
	move	AO, A

	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	daddu	C,   CO4, LDC
	MOV	c31, c11

	/* I = M / 2 two-row tiles. */
	dsra	I,  M, 1
	blez	I, .L40
	MOV	c41, c11

/* Start of one 2x4 tile: zero the row-1 accumulators c12..c42 (c11..c41 are
   zero on entry), preload a1, a3 and b1..b7, and set L = K / 4. */
.L31:
	LD	a1,  0 * SIZE(AO)
	LD	a3,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11
	LD	b3,  2 * SIZE(B)
	MOV	c32, c11
	LD	b4,  3 * SIZE(B)
	MOV	c42, c11

	LD	b5,  4 * SIZE(B)
	dsra	L,  K, 2
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L35
	move	BO,  B
	.align	3

/* 2x4 inner loop, four k steps per iteration (32 multiply-adds).
   Row 0 is fed from a1 / a1 / a3 / a3 (A offsets 0, 2, 4, 6), row 1 from
   a2 (A offsets 1, 3, 5, 7).  AO advances by 8 * SIZE and BO by 16 * SIZE
   inside the last step. */
.L32:
	/* k step 0 */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* k step 1 */
	MADD	c11, c11, a1, b5
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c12, c12, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k step 2 */
	MADD	c11, c11, a3, b6
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	NOP
	MADD	c41, c41, a3, b4
	LD	a3,  6 * SIZE(AO)

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* k step 3; pointers advance here, later offsets are relative to the
	   new AO / BO. */
	MADD	c11, c11, a3, b7
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a3, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c31, c31, a3, b3
	daddiu	BO, BO, 16 * SIZE
	MADD	c41, c41, a3, b4
	LD	a3,  4 * SIZE(AO)

	MADD	c12, c12, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c42, c42, a2, b4
	NOP

	bgtz	L, .L32
	LD	b4,  3 * SIZE(BO)
	.align 3

/* K & 3 remainder for the 2x4 tile: one k step per pass of .L36
   (AO += 2 * SIZE, BO += 4 * SIZE).  BO is advanced in the branch delay
   slot, so the B reloads use offsets 4..7 from the old BO. */
.L35:
	andi	L,  K, 3
	NOP
	blez	L, .L38
	NOP
	.align	3

.L36:
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	daddiu	AO, AO,  2 * SIZE
	MADD	c41, c41, a1, b4
	LD	a1,  0 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	bgtz	L, .L36
	daddiu	BO, BO,  4 * SIZE

/* Write-back for the 2x4 tile: cX1 goes to slots 0,1 and cX2 to slots 2,3
   of column X, scaled by ALPHA_R (even slot) and ALPHA_I (odd slot).
   CO1..CO4 advance by 4 * SIZE; the CO3/CO4 stores happen after the
   advance, hence their negative offsets.  c11..c41 are re-zeroed for the
   next tile. */
.L38:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 2 * SIZE(CO1)
	LD	$f3, 3 * SIZE(CO1)

	LD	$f4, 0 * SIZE(CO2)
	LD	$f5, 1 * SIZE(CO2)
	LD	$f6, 2 * SIZE(CO2)
	LD	$f7, 3 * SIZE(CO2)

	MADD	$f0, $f0, ALPHA_R, c11
	MADD	$f1, $f1, ALPHA_I, c11
	MADD	$f2, $f2, ALPHA_R, c12
	MADD	$f3, $f3, ALPHA_I, c12

	MADD	$f4, $f4, ALPHA_R, c21
	ST	$f0,  0 * SIZE(CO1)
	MADD	$f5, $f5, ALPHA_I, c21
	ST	$f1,  1 * SIZE(CO1)
	MADD	$f6, $f6, ALPHA_R, c22
	ST	$f2,  2 * SIZE(CO1)
	MADD	$f7, $f7, ALPHA_I, c22
	ST	$f3,  3 * SIZE(CO1)

	LD	$f0, 0 * SIZE(CO3)
	LD	$f1, 1 * SIZE(CO3)
	LD	$f2, 2 * SIZE(CO3)
	LD	$f3, 3 * SIZE(CO3)

	ST	$f4,  0 * SIZE(CO2)
	MADD	$f0, $f0, ALPHA_R, c31
	ST	$f5,  1 * SIZE(CO2)
	MADD	$f1, $f1, ALPHA_I, c31
	ST	$f6,  2 * SIZE(CO2)
	MADD	$f2, $f2, ALPHA_R, c32
	ST	$f7,  3 * SIZE(CO2)
	MADD	$f3, $f3, ALPHA_I, c32

	LD	$f4, 0 * SIZE(CO4)
	LD	$f5, 1 * SIZE(CO4)
	LD	$f6, 2 * SIZE(CO4)
	LD	$f7, 3 * SIZE(CO4)

	MADD	$f4, $f4, ALPHA_R, c41
	daddiu	CO1,CO1, 4 * SIZE
	MADD	$f5, $f5, ALPHA_I, c41
	daddiu	CO2,CO2, 4 * SIZE
	MADD	$f6, $f6, ALPHA_R, c42
	daddiu	CO3,CO3, 4 * SIZE
	MADD	$f7, $f7, ALPHA_I, c42
	daddiu	CO4,CO4, 4 * SIZE

	ST	$f0, -4 * SIZE(CO3)
	daddiu	I, I, -1
	ST	$f1, -3 * SIZE(CO3)
	ST	$f2, -2 * SIZE(CO3)
	ST	$f3, -1 * SIZE(CO3)

	ST	$f4, -4 * SIZE(CO4)
	MTC	$0,  c11
	ST	$f5, -3 * SIZE(CO4)
	MOV	c21, c11
	ST	$f6, -2 * SIZE(CO4)
	MOV	c31, c11
	ST	$f7, -1 * SIZE(CO4)
	bgtz	I, .L31
	MOV	c41, c11
	.align 3

/* Single leftover row (M & 1) of the 4-column panel.  c11..c41 are zero on
   entry.  c61, c71 and c81 are also cleared here, although .L42, .L46 and
   .L48 never read them.  Preloads a1, a2 and b1..b7, then L = K / 4. */
.L40:
	andi	I,  M, 1
	blez	I, .L49
	MOV	c61, c11

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	a2,  1 * SIZE(AO)
	MOV	c81, c11

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	dsra	L,  K, 2

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L45
	move	BO,  B
	.align	3

/* 1x4 inner loop, four k steps per iteration.  a1 feeds step 0; a2 is
   reloaded between steps and feeds steps 1-3 (A offsets 1, 2, 3).
   AO advances by 4 * SIZE and BO by 16 * SIZE mid-body, so the later loads
   use offsets relative to the advanced pointers. */
.L42:
	/* k step 0 (a1) */
	MADD	c11, c11, a1, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	LD	a1,  4 * SIZE(AO)
	daddiu	L, L, -1

	/* k step 1 (a2 = A offset 1) */
	MADD	c11, c11, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 11 * SIZE(BO)

	LD	a2,  2 * SIZE(AO)
	daddiu	AO, AO,  4 * SIZE

	/* k step 2 (a2 = A offset 2) */
	MADD	c11, c11, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* AO has already advanced, so -1 addresses old offset 3. */
	LD	a2, -1 * SIZE(AO)
	daddiu	BO, BO, 16 * SIZE

	/* k step 3 (a2 = A offset 3) */
	MADD	c11, c11, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4,  3 * SIZE(BO)

	bgtz	L, .L42
	LD	a2,  1 * SIZE(AO)
	.align 3

/* K & 3 remainder for the 1x4 row: one k step per pass of .L46
   (AO += 1 * SIZE, BO += 4 * SIZE; BO moves in the branch delay slot). */
.L45:
	andi	L,  K, 3
	NOP
	blez	L, .L48
	NOP
	.align	3

.L46:
	MADD	c11, c11, a1, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	a1,  1 * SIZE(AO)

	LD	b4,  7 * SIZE(BO)
	daddiu	L, L, -1

	daddiu	AO, AO,  1 * SIZE
	/* Copies a2 onto itself; the value is unchanged. */
	MOV	a2, a2
	bgtz	L, .L46
	daddiu	BO, BO,  4 * SIZE


/* Write-back for the leftover row of the 4-column panel: slot 0 of COX
   receives ALPHA_R * cX1, slot 1 receives ALPHA_I * cX1.  The CO pointers
   are not advanced. */
.L48:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 0 * SIZE(CO2)
	LD	$f3, 1 * SIZE(CO2)

	LD	$f4, 0 * SIZE(CO3)
	MADD	$f0, $f0, ALPHA_R, c11
	LD	$f5, 1 * SIZE(CO3)
	MADD	$f1, $f1, ALPHA_I, c11
	LD	$f6, 0 * SIZE(CO4)
	MADD	$f2, $f2, ALPHA_R, c21
	LD	$f7, 1 * SIZE(CO4)
	MADD	$f3, $f3, ALPHA_I, c21

	MADD	$f4, $f4, ALPHA_R, c31
	ST	$f0,  0 * SIZE(CO1)
	MADD	$f5, $f5, ALPHA_I, c31
	ST	$f1,  1 * SIZE(CO1)
	MADD	$f6, $f6, ALPHA_R, c41
	ST	$f2,  0 * SIZE(CO2)
	MADD	$f7, $f7, ALPHA_I, c41
	ST	$f3,  1 * SIZE(CO2)

	ST	$f4,  0 * SIZE(CO3)
	ST	$f5,  1 * SIZE(CO3)
	ST	$f6,  0 * SIZE(CO4)
	ST	$f7,  1 * SIZE(CO4)
	.align 3

/* End of the 4-column panel: B takes the position BO reached. */
.L49:
	move	B, BO
	.align 3

/* N & 2: one 2-column panel.  CO1/CO2 are two consecutive C columns and
   C moves past them. */
.L50:
	andi	J,  N, 2
	blez	J, .L70

	/* Branch delay slot of the blez above: AO = A on both paths. */
	move	AO, A
	move	CO1, C
	daddu	CO2, C,   LDC

	/* I = M / 2 two-row tiles; C is advanced in the delay slot. */
	dsra	I,  M, 1
	blez	I, .L60
	daddu	C,   CO2, LDC

/* Start of one 2x2 tile: zero c11, c21, c12, c22, preload a1, a2, a5 and
   b1, b2, b3, b5, and set L = K / 4.
   b6 and b7 are deliberately not preloaded: neither .L52 nor .L56 reads
   them, and loading them would touch memory 8 * SIZE and 12 * SIZE past B
   for no purpose. */
.L51:
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a5,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11
	LD	b3,  2 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	dsra	L,  K, 2

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L55
	move	BO,  B
	.align	3

/* 2x2 inner loop, four k steps per iteration (16 multiply-adds).
   Row 0 is fed from a1 / a3 / a5 / a3, row 1 from a2 / a4 / a2 / a4;
   column 1 from b1 / b3 / b5 / b3, column 2 from b2 / b4 / b2 / b4.
   AO and BO both advance by 8 * SIZE at the end of the pass, so all load
   offsets inside the body are relative to the pass's starting pointers. */
.L52:
	/* k step 0 */
	MADD	c11, c11, a1, b1
	LD	a3,  2 * SIZE(AO)
	MADD	c21, c21, a1, b2
	LD	b4,  3 * SIZE(BO)
	MADD	c12, c12, a2, b1
	LD	a4,  3 * SIZE(AO)
	MADD	c22, c22, a2, b2
	LD	b1,  8 * SIZE(BO)

	/* k step 1 */
	MADD	c11, c11, a3, b3
	LD	a1,  8 * SIZE(AO)
	MADD	c21, c21, a3, b4
	LD	b2,  5 * SIZE(BO)
	MADD	c12, c12, a4, b3
	LD	a2,  5 * SIZE(AO)
	MADD	c22, c22, a4, b4
	LD	b3,  6 * SIZE(BO)

	/* k step 2 */
	MADD	c11, c11, a5, b5
	LD	a3,  6 * SIZE(AO)
	MADD	c21, c21, a5, b2
	LD	b4,  7 * SIZE(BO)
	MADD	c12, c12, a2, b5
	LD	a4,  7 * SIZE(AO)
	MADD	c22, c22, a2, b2
	LD	b5, 12 * SIZE(BO)

	/* k step 3; the trailing loads refill a5, a2, b2, b3 for the next pass */
	MADD	c11, c11, a3, b3
	LD	a5, 12 * SIZE(AO)
	MADD	c21, c21, a3, b4
	LD	b2,  9 * SIZE(BO)
	MADD	c12, c12, a4, b3
	LD	a2,  9 * SIZE(AO)
	MADD	c22, c22, a4, b4
	LD	b3, 10 * SIZE(BO)

	daddiu	AO, AO,  8 * SIZE
	daddiu	L, L, -1
	bgtz	L, .L52
	daddiu	BO, BO,  8 * SIZE
	.align 3

/* K & 3 remainder for the 2x2 tile: one k step per pass of .L56
   (AO += 2 * SIZE, BO += 2 * SIZE; BO moves in the branch delay slot). */
.L55:
	andi	L,  K, 3
	NOP
	blez	L, .L58
	NOP
	.align	3

.L56:
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  3 * SIZE(BO)

	daddiu	L, L, -1
	daddiu	AO, AO,  2 * SIZE
	bgtz	L, .L56
	daddiu	BO, BO,  2 * SIZE

/* Write-back for the 2x2 tile: cX1 goes to slots 0,1 and cX2 to slots 2,3
   of column X, scaled by ALPHA_R (even slot) and ALPHA_I (odd slot).
   CO1/CO2 advance by 4 * SIZE before the stores, hence the negative
   store offsets. */
.L58:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 2 * SIZE(CO1)
	LD	$f3, 3 * SIZE(CO1)

	LD	$f4, 0 * SIZE(CO2)
	LD	$f5, 1 * SIZE(CO2)
	LD	$f6, 2 * SIZE(CO2)
	LD	$f7, 3 * SIZE(CO2)

	MADD	$f0, $f0, ALPHA_R, c11
	daddiu	I, I, -1
	MADD	$f1, $f1, ALPHA_I, c11
	daddiu	CO1,CO1, 4 * SIZE
	MADD	$f2, $f2, ALPHA_R, c12
	daddiu	CO2,CO2, 4 * SIZE
	MADD	$f3, $f3, ALPHA_I, c12
	MADD	$f4, $f4, ALPHA_R, c21
	MADD	$f5, $f5, ALPHA_I, c21
	MADD	$f6, $f6, ALPHA_R, c22
	MADD	$f7, $f7, ALPHA_I, c22

	ST	$f0, -4 * SIZE(CO1)
	ST	$f1, -3 * SIZE(CO1)
	ST	$f2, -2 * SIZE(CO1)
	ST	$f3, -1 * SIZE(CO1)

	ST	$f4, -4 * SIZE(CO2)
	ST	$f5, -3 * SIZE(CO2)
	ST	$f6, -2 * SIZE(CO2)
	/* The last store sits in the branch delay slot. */
	bgtz	I, .L51
	ST	$f7, -1 * SIZE(CO2)
	.align 3

/* Single leftover row (M & 1) of the 2-column panel.  Zeroes c11..c41
   (c31/c41 act as a second pair of partial sums that .L68 folds into
   c11/c21), preloads a1..a4 and b1..b4, and sets L = K / 4.
   b5, b6 and b7 are deliberately not preloaded: .L62 and .L66 never read
   them, and loading them would touch memory up to 12 * SIZE past B for no
   purpose. */
.L60:
	andi	I,  M, 1
	blez	I, .L69
	NOP

	dsra	L,  K, 2
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a3,  2 * SIZE(AO)
	MOV	c31, c11
	LD	a4,  3 * SIZE(AO)
	MOV	c41, c11

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L65
	move	BO,  B
	.align	3

/* 1x2 inner loop, four k steps per iteration.  Even k steps (a1, a3)
   accumulate into c11/c21, odd k steps (a2, a4) into c31/c41.
   AO advances by 4 * SIZE, BO by 8 * SIZE (in the branch delay slot). */
.L62:
	/* k steps 0 (a1 -> c11/c21) and 1 (a2 -> c31/c41) */
	MADD	c11, c11, a1, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4,  7 * SIZE(BO)

	LD	a1,  4 * SIZE(AO)
	LD	a2,  5 * SIZE(AO)

	/* k steps 2 (a3 -> c11/c21) and 3 (a4 -> c31/c41) */
	MADD	c11, c11, a3, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c21, c21, a3, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c31, c31, a4, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c41, c41, a4, b4
	LD	b4, 11 * SIZE(BO)

	LD	a3,  6 * SIZE(AO)
	LD	a4,  7 * SIZE(AO)

	daddiu	L, L, -1
	daddiu	AO, AO,  4 * SIZE

	bgtz	L, .L62
	daddiu	BO, BO,  8 * SIZE
	.align 3

/* K & 3 remainder for the 1x2 row: one k step per pass of .L66, accumulated
   into c11/c21 (AO += 1 * SIZE, BO += 2 * SIZE). */
.L65:
	andi	L,  K, 3
	NOP
	blez	L, .L68
	NOP
	.align	3

.L66:
	MADD	c11, c11, a1, b1
	LD	b1,  2 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  3 * SIZE(BO)

	LD	a1,  1 * SIZE(AO)
	daddiu	L, L, -1

	daddiu	AO, AO,  1 * SIZE
	bgtz	L, .L66
	daddiu	BO, BO,  2 * SIZE


/* Write-back for the leftover row of the 2-column panel.  The two partial
   sums per column are combined first (c11 += c31, c21 += c41); then slot 0
   of each column receives ALPHA_R * c and slot 1 receives ALPHA_I * c. */
.L68:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 0 * SIZE(CO2)
	LD	$f3, 1 * SIZE(CO2)

	ADD	c11, c11, c31
	ADD	c21, c21, c41

	MADD	$f0, $f0, ALPHA_R, c11
	MADD	$f1, $f1, ALPHA_I, c11
	MADD	$f2, $f2, ALPHA_R, c21
	MADD	$f3, $f3, ALPHA_I, c21

	ST	$f0,  0 * SIZE(CO1)
	ST	$f1,  1 * SIZE(CO1)
	ST	$f2,  0 * SIZE(CO2)
	ST	$f3,  1 * SIZE(CO2)
	.align 3

/* End of the 2-column panel: B takes the position BO reached. */
.L69:
	move	B, BO
	.align 3

/* N & 1: the last single column.  CO1 points at it and C moves one column
   further. */
.L70:
	andi	J,  N, 1
	blez	J, .L999

	/* Branch delay slot of the blez above: AO = A on both paths. */
	move	AO, A
	move	CO1, C

	/* I = M / 2 two-row tiles; C is advanced in the delay slot. */
	dsra	I,  M, 1
	blez	I, .L80
	daddu	C,   CO1, LDC

/* Start of one 2x1 tile: zero c11, c21, c12, c22 and set L = K / 4.
   No operands are preloaded: .L72 and .L76 load a1, a2 and b1 themselves
   before every use and read nothing else, so preloads here would only
   touch memory (up to 12 * SIZE past B) whose values are never consumed.
   The dsra is placed between MTC and the first MOV so that the MOV does
   not directly follow the MTC that produces c11, as elsewhere in this
   file. */
.L71:
	MTC	$0,  c11
	dsra	L,  K, 2
	MOV	c21, c11
	MOV	c12, c11
	MOV	c22, c11

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L75
	move	BO,  B
	.align	3

/* 2x1 inner loop, four k steps per iteration, not software-pipelined:
   each step loads its own a1 (row 0), a2 (row 1) and b1, then accumulates
   into c11 / c12.  AO advances by 8 * SIZE, BO by 4 * SIZE. */
.L72:
	/* k step 0 */
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k step 1 */
	LD	a1,  2 * SIZE(AO)
	LD	a2,  3 * SIZE(AO)
	LD	b1,  1 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k step 2 */
	LD	a1,  4 * SIZE(AO)
	LD	a2,  5 * SIZE(AO)
	LD	b1,  2 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k step 3 */
	LD	a1,  6 * SIZE(AO)
	LD	a2,  7 * SIZE(AO)
	LD	b1,  3 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  8 * SIZE
	bgtz	L, .L72
	daddiu	BO, BO,  4 * SIZE
	.align 3

/* K & 3 remainder for the 2x1 tile: one k step per pass of .L76
   (AO += 2 * SIZE, BO += 1 * SIZE; BO moves in the branch delay slot). */
.L75:
	andi	L,  K, 3
	NOP
	blez	L, .L78
	NOP
	.align	3

.L76:
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  2 * SIZE
	bgtz	L, .L76
	daddiu	BO, BO,  1 * SIZE

/* Write-back for the 2x1 tile.  c21 / c22 are added into c11 / c12 first
   (the 2x1 loops never accumulate into them, so they still hold the zero
   written at .L71).  Then c11 goes to slots 0,1 and c12 to slots 2,3 of
   CO1, scaled by ALPHA_R (even slot) and ALPHA_I (odd slot).  CO1 advances
   by 4 * SIZE before the stores, hence the negative store offsets. */
.L78:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)
	LD	$f2, 2 * SIZE(CO1)
	LD	$f3, 3 * SIZE(CO1)

	ADD	c11, c11, c21
	daddiu	I, I, -1
	ADD	c12, c12, c22
	daddiu	CO1,CO1, 4 * SIZE

	MADD	$f0, $f0, ALPHA_R, c11
	MADD	$f1, $f1, ALPHA_I, c11
	MADD	$f2, $f2, ALPHA_R, c12
	MADD	$f3, $f3, ALPHA_I, c12

	ST	$f0, -4 * SIZE(CO1)
	ST	$f1, -3 * SIZE(CO1)
	ST	$f2, -2 * SIZE(CO1)

	/* The last store sits in the branch delay slot. */
	bgtz	I, .L71
	ST	$f3, -1 * SIZE(CO1)
	.align 3

/* Single leftover element (M & 1) of the last column: zero the two partial
   sums c11 / c21 and set L = K / 4.
   No operands are preloaded: .L82 and .L86 load a1 and b1 themselves before
   every use and read nothing else, so preloads here would only touch memory
   (up to 12 * SIZE past B) whose values are never consumed.  The dsra is
   placed between MTC and MOV so that the MOV does not directly follow the
   MTC that produces c11, as elsewhere in this file. */
.L80:
	andi	I,  M, 1
	blez	I, .L89
	NOP

	MTC	$0,  c11
	dsra	L,  K, 2
	MOV	c21, c11

	/* K < 4: remainder loop only.  BO = B is set on both paths. */
	blez	L, .L85
	move	BO,  B
	.align	3

/* 1x1 inner loop (dot product), four k steps per iteration.  Even k steps
   accumulate into c11 and odd k steps into c21; .L88 adds the two.
   AO and BO both advance by 4 * SIZE. */
.L82:
	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1

	LD	a1,  1 * SIZE(AO)
	LD	b1,  1 * SIZE(BO)

	MADD	c21, c21, a1, b1

	LD	a1,  2 * SIZE(AO)
	LD	b1,  2 * SIZE(BO)

	MADD	c11, c11, a1, b1

	LD	a1,  3 * SIZE(AO)
	LD	b1,  3 * SIZE(BO)

	MADD	c21, c21, a1, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  4 * SIZE
	bgtz	L, .L82
	daddiu	BO, BO,  4 * SIZE
	.align 3

/* K & 3 remainder for the 1x1 case: one k step per pass of .L86,
   accumulated into c11 (AO += 1 * SIZE, BO += 1 * SIZE). */
.L85:
	andi	L,  K, 3
	NOP
	blez	L, .L88
	NOP
	.align	3

.L86:
	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  1 * SIZE
	bgtz	L, .L86
	daddiu	BO, BO,  1 * SIZE


/* Write-back for the 1x1 case: combine the two partial sums (c11 += c21),
   then slot 0 of CO1 receives ALPHA_R * c11 and slot 1 receives
   ALPHA_I * c11. */
.L88:
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 1 * SIZE(CO1)

	ADD	c11, c11, c21
	MADD	$f0, $f0, ALPHA_R, c11
	MADD	$f1, $f1, ALPHA_I, c11

	ST	$f0,  0 * SIZE(CO1)
	ST	$f1,  1 * SIZE(CO1)
	.align 3

/* End of the 1-column panel: B takes the position BO reached. */
.L89:
	move	B, BO
	.align 3

/* Common exit: reload the registers stored at function entry from the same
   frame offsets, then return.  The frame is released in the delay slot of
   the jump. */
.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	LDARG	$18,  16($sp)
	LDARG	$19,  24($sp)
	LDARG	$20,  32($sp)
	LDARG	$21,  40($sp)
	ldc1	$f24, 48($sp)
	ldc1	$f25, 56($sp)
	ldc1	$f26, 64($sp)
	ldc1	$f27, 72($sp)
	ldc1	$f28, 80($sp)
	ldc1	$f29, 88($sp)

	j	$31
	daddiu	$sp, $sp, 128

	EPILOGUE
