1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
25#define M	$4	/* I is derived from this: first M & 1, then M >> 1 */
26#define	N	$5	/* outer count J is set to N >> 3 */
27#define	K	$6	/* inner loops run over KK or K - KK steps, depending on the variant */
28#define A	$8	/* base of the A operand; AO / AORIG are initialised from it */
29#define B	$9	/* base of the B operand; BO is initialised from it */
30#define C	$10	/* base of the output; CO1..CO8 are derived from it */
31#define LDC	$11	/* distance between successive CO pointers; shifted by BASE_SHIFT on entry */
32
33#define AO	$12	/* moving pointer into A */
34#define BO	$13	/* moving pointer into B */
35
36#define I	$2	/* counter derived from M */
37#define J	$3	/* counter derived from N */
38#define L	$7	/* inner-loop counter; also reused as an address temporary */
39
40#define CO1	$14	/* CO1 = C, each following CO pointer is the previous one plus LDC */
41#define CO2	$15
42#define CO3	$16
43#define CO4	$17
44#define CO5	$18
45#define CO6	$19
46#define CO7	$20
47#define CO8	$21
48
49#define OFFSET	$22	/* loaded from 144($sp) once the frame has been set up */
50#define KK	$23	/* running offset derived from OFFSET; stepped by the LT / LN code */
51#define TEMP	$24	/* scratch for address arithmetic */
52#define AORIG	$25	/* LN / RT only: A position from which AO is recomputed */
53
54#define a1	$f0	/* a1..a4: values loaded through AO */
55#define a2	$f1
56#define a3	$f27
57#define a4	$f28
58
59#define b1	$f2	/* b1..b8: values loaded through B / BO in the multiply loops; */
60#define b2	$f3	/* reused as general FP temporaries in the solve sequences */
61#define b3	$f4
62#define b4	$f5
63#define b5	$f6
64#define b6	$f7
65#define b7	$f8
66#define b8	$f9
67
68#define a5	b8	/* alias of b8 ($f9) */
69
70#define c11	$f10	/* cXY accumulators: X matches the CO pointer it is stored through */
71#define c12	$f11	/* (c1Y -> CO1, c2Y -> CO2, ...), Y is the element index (1 -> 0 * SIZE, 2 -> 1 * SIZE) */
72#define c21	$f12
73#define c22	$f13
74#define c31	$f14
75#define c32	$f16	/* numbering skips $f15, which is ALPHA below */
76#define c41	$f17
77#define c42	$f18
78#define c51	$f19
79#define c52	$f20
80#define c61	$f21
81#define c62	$f22
82#define c71	$f23
83#define c72	$f24
84#define c81	$f25
85#define c82	$f26
86
87#define ALPHA	$f15	/* the one FP register left out of the c-register sequence */
88
89	PROLOGUE
90
91	daddiu	$sp, $sp, -144	/* open a 144-byte stack frame */
92
93	SDARG	$16,   0($sp)	/* save $16..$21 (the CO3..CO8 aliases) */
94	SDARG	$17,   8($sp)
95	SDARG	$18,  16($sp)
96	SDARG	$19,  24($sp)
97	SDARG	$20,  32($sp)
98	SDARG	$21,  40($sp)
99	sdc1	$f24, 48($sp)	/* save $f24..$f28 (c72, c81, c82, a3, a4) */
100	sdc1	$f25, 56($sp)
101	sdc1	$f26, 64($sp)
102	sdc1	$f27, 72($sp)
103	sdc1	$f28, 80($sp)
104
105	SDARG	$22,  88($sp)	/* save $22..$25 (OFFSET, KK, TEMP, AORIG) */
106	SDARG	$23,  96($sp)
107	SDARG	$24, 104($sp)
108	SDARG	$25, 112($sp)
109
110#ifndef __64BIT__	/* additionally save $f20..$f23 when __64BIT__ is not defined */
111	sdc1	$f20,112($sp)	/* NOTE(review): same offset as the $25 slot stored just above - confirm against the matching restore code */
112	sdc1	$f21,120($sp)
113	sdc1	$f22,128($sp)
114	sdc1	$f23,136($sp)
115#endif
116
117	LDARG	OFFSET, 144($sp)	/* first slot above this frame; presumably a stack-passed argument - TODO confirm */
118
119	dsll	LDC, LDC, BASE_SHIFT	/* LDC <<= BASE_SHIFT; it is added directly to C pointers from here on */
120
121#ifdef LN	/* LN: start from the far end of A and C */
122	mult	M, K
123	mflo	TEMP	/* TEMP = low result word of M * K */
124
125	dsll	TEMP, TEMP, BASE_SHIFT
126	daddu	A, A, TEMP	/* A += (M * K) << BASE_SHIFT */
127
128	dsll	TEMP, M, BASE_SHIFT
129	daddu	C, C, TEMP	/* C += M << BASE_SHIFT */
130#endif
131
132#ifdef RN
133	neg	KK, OFFSET	/* RN: KK = -OFFSET */
134#endif
135
136#ifdef RT	/* RT: start from the far end of B and C */
137	mult	N, K
138	mflo	TEMP	/* TEMP = low result word of N * K */
139
140	dsll	TEMP, TEMP, BASE_SHIFT
141	daddu	B, B, TEMP	/* B += (N * K) << BASE_SHIFT */
142
143	mult	N, LDC
144	mflo	TEMP
145	daddu	C, C, TEMP	/* C += N * LDC (LDC was already shifted above) */
146
147	dsubu	KK, N, OFFSET	/* KK = N - OFFSET */
148#endif
149
150	dsra	J,  N, 3	/* J = N >> 3 */
151	blez	J, .L30	/* nothing to do in the .L10 loop when J <= 0 */
152	nop	/* fills the slot after the branch */
153
154.L10:
155#ifdef RT
156	dsll	TEMP, K, 3 + BASE_SHIFT
157	dsubu	B, B, TEMP
158
159	dsll	TEMP, LDC, 3
160	dsubu	C, C, TEMP
161#endif
162
163	move	CO1, C
164	MTC	$0,  c11
165	daddu	CO2, C,   LDC
166	daddu	CO3, CO2, LDC
167	daddiu	J, J, -1
168	daddu	CO4, CO3, LDC
169	MOV	c21, c11
170	daddu	CO5, CO4, LDC
171	MOV	c31, c11
172	daddu	CO6, CO5, LDC
173	MOV	c41, c11
174	daddu	CO7, CO6, LDC
175	MOV	c51, c11
176	daddu	CO8, CO7, LDC
177
178#ifdef LN
179	daddu	KK, M, OFFSET
180#endif
181
182#ifdef LT
183	move	KK, OFFSET
184#endif
185
186#if defined(LN) || defined(RT)
187	move	AORIG, A
188#else
189	move	AO, A
190#endif
191#ifndef RT
192	daddu	C,  CO8, LDC
193#endif
194
195	andi	I,  M, 1
196	MOV	c61, c11
197	blez	I, .L20
198	MOV	c71, c11
199
200#if defined(LT) || defined(RN)
201	LD	a1,  0 * SIZE(AO)
202	LD	a2,  1 * SIZE(AO)
203	LD	a3,  2 * SIZE(AO)
204	LD	a4,  3 * SIZE(AO)
205
206	LD	b1,  0 * SIZE(B)
207	LD	b2,  1 * SIZE(B)
208	LD	b3,  2 * SIZE(B)
209	LD	b4,  3 * SIZE(B)
210	LD	b5,  4 * SIZE(B)
211	LD	b6,  8 * SIZE(B)
212	LD	b7, 12 * SIZE(B)
213
214	dsra	L,  KK, 2
215	MOV	c81, c11
216
217	blez	L, .L25
218	move	BO,  B
219#else
220
221#ifdef LN
222	dsll	TEMP,   K,  0 + BASE_SHIFT
223	dsubu	AORIG, AORIG, TEMP
224#endif
225
226	dsll	L,    KK, 0 + BASE_SHIFT
227	dsll	TEMP, KK, 3 + BASE_SHIFT
228
229	daddu	AO, AORIG, L
230	daddu	BO, B,     TEMP
231
232	dsubu	TEMP, K, KK
233
234	LD	a1,  0 * SIZE(AO)
235	LD	a2,  1 * SIZE(AO)
236	LD	a3,  2 * SIZE(AO)
237	LD	a4,  3 * SIZE(AO)
238
239	LD	b1,  0 * SIZE(BO)
240	LD	b2,  1 * SIZE(BO)
241	LD	b3,  2 * SIZE(BO)
242	LD	b4,  3 * SIZE(BO)
243	LD	b5,  4 * SIZE(BO)
244	LD	b6,  8 * SIZE(BO)
245	LD	b7, 12 * SIZE(BO)
246
247	dsra	L,  TEMP, 2
248	MOV	c81, c11
249
250	blez	L, .L25
251	NOP
252#endif
253	.align	3
254
255.L22:
256	MADD	c11, c11, a1, b1
257	LD	b1, 16 * SIZE(BO)
258	MADD	c21, c21, a1, b2
259	LD	b2,  5 * SIZE(BO)
260	MADD	c31, c31, a1, b3
261	LD	b3,  6 * SIZE(BO)
262	MADD	c41, c41, a1, b4
263	LD	b4,  7 * SIZE(BO)
264
265	MADD	c51, c51, a1, b5
266	LD	b5, 20 * SIZE(BO)
267	MADD	c61, c61, a1, b2
268	LD	b2,  9 * SIZE(BO)
269	MADD	c71, c71, a1, b3
270	LD	b3, 10 * SIZE(BO)
271	MADD	c81, c81, a1, b4
272	LD	b4, 11 * SIZE(BO)
273
274	LD	a1,  4 * SIZE(AO)
275	daddiu	L, L, -1
276
277	MADD	c11, c11, a2, b6
278	LD	b6, 24 * SIZE(BO)
279	MADD	c21, c21, a2, b2
280	LD	b2, 13 * SIZE(BO)
281	MADD	c31, c31, a2, b3
282	LD	b3, 14 * SIZE(BO)
283	MADD	c41, c41, a2, b4
284	LD	b4, 15 * SIZE(BO)
285
286	MADD	c51, c51, a2, b7
287	LD	b7, 28 * SIZE(BO)
288	MADD	c61, c61, a2, b2
289	LD	b2, 17 * SIZE(BO)
290	MADD	c71, c71, a2, b3
291	LD	b3, 18 * SIZE(BO)
292	MADD	c81, c81, a2, b4
293	LD	b4, 19 * SIZE(BO)
294
295	LD	a2,  5 * SIZE(AO)
296	daddiu	AO, AO,  4 * SIZE
297
298	MADD	c11, c11, a3, b1
299	LD	b1, 32 * SIZE(BO)
300	MADD	c21, c21, a3, b2
301	LD	b2, 21 * SIZE(BO)
302	MADD	c31, c31, a3, b3
303	LD	b3, 22 * SIZE(BO)
304	MADD	c41, c41, a3, b4
305	LD	b4, 23 * SIZE(BO)
306
307	MADD	c51, c51, a3, b5
308	LD	b5, 36 * SIZE(BO)
309	MADD	c61, c61, a3, b2
310	LD	b2, 25 * SIZE(BO)
311	MADD	c71, c71, a3, b3
312	LD	b3, 26 * SIZE(BO)
313	MADD	c81, c81, a3, b4
314	LD	b4, 27 * SIZE(BO)
315
316	LD	a3,  2 * SIZE(AO)
317	daddiu	BO, BO, 32 * SIZE
318
319	MADD	c11, c11, a4, b6
320	LD	b6,  8 * SIZE(BO)
321	MADD	c21, c21, a4, b2
322	LD	b2, -3 * SIZE(BO)
323	MADD	c31, c31, a4, b3
324	LD	b3, -2 * SIZE(BO)
325	MADD	c41, c41, a4, b4
326	LD	b4, -1 * SIZE(BO)
327
328	MADD	c51, c51, a4, b7
329	LD	b7, 12 * SIZE(BO)
330	MADD	c61, c61, a4, b2
331	LD	b2,  1 * SIZE(BO)
332	MADD	c71, c71, a4, b3
333	LD	b3,  2 * SIZE(BO)
334	MADD	c81, c81, a4, b4
335	LD	b4,  3 * SIZE(BO)
336	bgtz	L, .L22
337	LD	a4,  3 * SIZE(AO)
338	.align 3
339
340.L25:
341#if defined(LT) || defined(RN)
342	andi	L, KK,  3
343#else
344	andi	L, TEMP, 3
345#endif
346	NOP
347	blez	L, .L28
348	NOP
349	.align	3
350
351.L26:
352	MADD	c11, c11, a1, b1
353	LD	b1,  8 * SIZE(BO)
354	MADD	c21, c21, a1, b2
355	LD	b2,  5 * SIZE(BO)
356	MADD	c31, c31, a1, b3
357	LD	b3,  6 * SIZE(BO)
358	MADD	c41, c41, a1, b4
359	LD	b4,  7 * SIZE(BO)
360
361	daddiu	L, L, -1
362	MOV	a2, a2
363	daddiu	AO, AO,  1 * SIZE
364	daddiu	BO, BO,  8 * SIZE
365
366	MADD	c51, c51, a1, b5
367	LD	b5,  4 * SIZE(BO)
368	MADD	c61, c61, a1, b2
369	LD	b2,  1 * SIZE(BO)
370	MADD	c71, c71, a1, b3
371	LD	b3,  2 * SIZE(BO)
372	MADD	c81, c81, a1, b4
373	LD	a1,  0 * SIZE(AO)
374
375	bgtz	L, .L26
376	LD	b4,  3 * SIZE(BO)
377
378.L28:
379#if defined(LN) || defined(RT)
380#ifdef LN
381	daddiu	TEMP, KK, -1
382#else
383	daddiu	TEMP, KK, -8
384#endif
385
386	dsll	L,    TEMP, 0 + BASE_SHIFT
387	dsll	TEMP, TEMP, 3 + BASE_SHIFT
388	daddu	AO, AORIG, L
389	daddu	BO, B,     TEMP
390#endif
391
392
393#if defined(LN) || defined(LT)
394	LD	b1,  0 * SIZE(BO)
395	LD	b2,  1 * SIZE(BO)
396	LD	b3,  2 * SIZE(BO)
397	LD	b4,  3 * SIZE(BO)
398	LD	b5,  4 * SIZE(BO)
399	LD	b6,  5 * SIZE(BO)
400	LD	b7,  6 * SIZE(BO)
401	LD	b8,  7 * SIZE(BO)
402
403	SUB	c11, b1, c11
404	SUB	c21, b2, c21
405	SUB	c31, b3, c31
406	SUB	c41, b4, c41
407	SUB	c51, b5, c51
408	SUB	c61, b6, c61
409	SUB	c71, b7, c71
410	SUB	c81, b8, c81
411#else
412	LD	b1,  0 * SIZE(AO)
413	LD	b2,  1 * SIZE(AO)
414	LD	b3,  2 * SIZE(AO)
415	LD	b4,  3 * SIZE(AO)
416	LD	b5,  4 * SIZE(AO)
417	LD	b6,  5 * SIZE(AO)
418	LD	b7,  6 * SIZE(AO)
419	LD	b8,  7 * SIZE(AO)
420
421	SUB	c11, b1, c11
422	SUB	c21, b2, c21
423	SUB	c31, b3, c31
424	SUB	c41, b4, c41
425	SUB	c51, b5, c51
426	SUB	c61, b6, c61
427	SUB	c71, b7, c71
428	SUB	c81, b8, c81
429#endif
430
431#if defined(LN) || defined(LT)
432	LD	b1,  0 * SIZE(AO)
433
434	MUL	c11, b1, c11
435	MUL	c21, b1, c21
436	MUL	c31, b1, c31
437	MUL	c41, b1, c41
438	MUL	c51, b1, c51
439	MUL	c61, b1, c61
440	MUL	c71, b1, c71
441	MUL	c81, b1, c81
442#endif
443
444#ifdef RN
445	LD	b1,  0 * SIZE(BO)
446	LD	b2,  1 * SIZE(BO)
447	LD	b3,  2 * SIZE(BO)
448	LD	b4,  3 * SIZE(BO)
449	LD	b5,  4 * SIZE(BO)
450	LD	b6,  5 * SIZE(BO)
451	LD	b7,  6 * SIZE(BO)
452	LD	b8,  7 * SIZE(BO)
453
454	MUL	c11, b1, c11
455
456	NMSUB	c21, c21, b2, c11
457	NMSUB	c31, c31, b3, c11
458	NMSUB	c41, c41, b4, c11
459	NMSUB	c51, c51, b5, c11
460	NMSUB	c61, c61, b6, c11
461	NMSUB	c71, c71, b7, c11
462	NMSUB	c81, c81, b8, c11
463
464	LD	b2,  9 * SIZE(BO)
465	LD	b3, 10 * SIZE(BO)
466	LD	b4, 11 * SIZE(BO)
467	LD	b5, 12 * SIZE(BO)
468	LD	b6, 13 * SIZE(BO)
469	LD	b7, 14 * SIZE(BO)
470	LD	b8, 15 * SIZE(BO)
471
472	MUL	c21, b2, c21
473
474	NMSUB	c31, c31, b3, c21
475	NMSUB	c41, c41, b4, c21
476	NMSUB	c51, c51, b5, c21
477	NMSUB	c61, c61, b6, c21
478	NMSUB	c71, c71, b7, c21
479	NMSUB	c81, c81, b8, c21
480
481	LD	b3, 18 * SIZE(BO)
482	LD	b4, 19 * SIZE(BO)
483	LD	b5, 20 * SIZE(BO)
484	LD	b6, 21 * SIZE(BO)
485	LD	b7, 22 * SIZE(BO)
486	LD	b8, 23 * SIZE(BO)
487
488	MUL	c31, b3, c31
489
490	NMSUB	c41, c41, b4, c31
491	NMSUB	c51, c51, b5, c31
492	NMSUB	c61, c61, b6, c31
493	NMSUB	c71, c71, b7, c31
494	NMSUB	c81, c81, b8, c31
495
496	LD	b4, 27 * SIZE(BO)
497	LD	b5, 28 * SIZE(BO)
498	LD	b6, 29 * SIZE(BO)
499	LD	b7, 30 * SIZE(BO)
500	LD	b8, 31 * SIZE(BO)
501
502	MUL	c41, b4, c41
503
504	NMSUB	c51, c51, b5, c41
505	NMSUB	c61, c61, b6, c41
506	NMSUB	c71, c71, b7, c41
507	NMSUB	c81, c81, b8, c41
508
509	LD	b5, 36 * SIZE(BO)
510	LD	b6, 37 * SIZE(BO)
511	LD	b7, 38 * SIZE(BO)
512	LD	b8, 39 * SIZE(BO)
513
514	MUL	c51, b5, c51
515
516	NMSUB	c61, c61, b6, c51
517	NMSUB	c71, c71, b7, c51
518	NMSUB	c81, c81, b8, c51
519
520	LD	b6, 45 * SIZE(BO)
521	LD	b7, 46 * SIZE(BO)
522	LD	b8, 47 * SIZE(BO)
523
524	MUL	c61, b6, c61
525
526	NMSUB	c71, c71, b7, c61
527	NMSUB	c81, c81, b8, c61
528
529	LD	b7, 54 * SIZE(BO)
530	LD	b8, 55 * SIZE(BO)
531
532	MUL	c71, b7, c71
533
534	NMSUB	c81, c81, b8, c71
535
536	LD	b8, 63 * SIZE(BO)
537
538	MUL	c81, b8, c81
539#endif
540
541#ifdef RT
542	LD	b1, 63 * SIZE(BO)
543	LD	b2, 62 * SIZE(BO)
544	LD	b3, 61 * SIZE(BO)
545	LD	b4, 60 * SIZE(BO)
546	LD	b5, 59 * SIZE(BO)
547	LD	b6, 58 * SIZE(BO)
548	LD	b7, 57 * SIZE(BO)
549	LD	b8, 56 * SIZE(BO)
550
551	MUL	c81, b1, c81
552
553	NMSUB	c71, c71, b2, c81
554	NMSUB	c61, c61, b3, c81
555	NMSUB	c51, c51, b4, c81
556	NMSUB	c41, c41, b5, c81
557	NMSUB	c31, c31, b6, c81
558	NMSUB	c21, c21, b7, c81
559	NMSUB	c11, c11, b8, c81
560
561	LD	b2, 54 * SIZE(BO)
562	LD	b3, 53 * SIZE(BO)
563	LD	b4, 52 * SIZE(BO)
564	LD	b5, 51 * SIZE(BO)
565	LD	b6, 50 * SIZE(BO)
566	LD	b7, 49 * SIZE(BO)
567	LD	b8, 48 * SIZE(BO)
568
569	MUL	c71, b2, c71
570
571	NMSUB	c61, c61, b3, c71
572	NMSUB	c51, c51, b4, c71
573	NMSUB	c41, c41, b5, c71
574	NMSUB	c31, c31, b6, c71
575	NMSUB	c21, c21, b7, c71
576	NMSUB	c11, c11, b8, c71
577
578	LD	b3, 45 * SIZE(BO)
579	LD	b4, 44 * SIZE(BO)
580	LD	b5, 43 * SIZE(BO)
581	LD	b6, 42 * SIZE(BO)
582	LD	b7, 41 * SIZE(BO)
583	LD	b8, 40 * SIZE(BO)
584
585	MUL	c61, b3, c61
586
587	NMSUB	c51, c51, b4, c61
588	NMSUB	c41, c41, b5, c61
589	NMSUB	c31, c31, b6, c61
590	NMSUB	c21, c21, b7, c61
591	NMSUB	c11, c11, b8, c61
592
593	LD	b4, 36 * SIZE(BO)
594	LD	b5, 35 * SIZE(BO)
595	LD	b6, 34 * SIZE(BO)
596	LD	b7, 33 * SIZE(BO)
597	LD	b8, 32 * SIZE(BO)
598
599	MUL	c51, b4, c51
600
601	NMSUB	c41, c41, b5, c51
602	NMSUB	c31, c31, b6, c51
603	NMSUB	c21, c21, b7, c51
604	NMSUB	c11, c11, b8, c51
605
606	LD	b5, 27 * SIZE(BO)
607	LD	b6, 26 * SIZE(BO)
608	LD	b7, 25 * SIZE(BO)
609	LD	b8, 24 * SIZE(BO)
610
611	MUL	c41, b5, c41
612
613	NMSUB	c31, c31, b6, c41
614	NMSUB	c21, c21, b7, c41
615	NMSUB	c11, c11, b8, c41
616
617	LD	b6, 18 * SIZE(BO)
618	LD	b7, 17 * SIZE(BO)
619	LD	b8, 16 * SIZE(BO)
620
621	MUL	c31, b6, c31
622
623	NMSUB	c21, c21, b7, c31
624	NMSUB	c11, c11, b8, c31
625
626	LD	b7,  9 * SIZE(BO)
627	LD	b8,  8 * SIZE(BO)
628
629	MUL	c21, b7, c21
630
631	NMSUB	c11, c11, b8, c21
632
633	LD	b8,  0 * SIZE(BO)
634
635	MUL	c11, b8, c11
636#endif
637
638#ifdef LN
639	daddiu	CO1, CO1, -1 * SIZE
640	daddiu	CO2, CO2, -1 * SIZE
641	daddiu	CO3, CO3, -1 * SIZE
642	daddiu	CO4, CO4, -1 * SIZE
643	daddiu	CO5, CO5, -1 * SIZE
644	daddiu	CO6, CO6, -1 * SIZE
645	daddiu	CO7, CO7, -1 * SIZE
646	daddiu	CO8, CO8, -1 * SIZE
647#endif
648
649#if defined(LN) || defined(LT)
650	ST	c11,  0 * SIZE(BO)
651	ST	c21,  1 * SIZE(BO)
652	ST	c31,  2 * SIZE(BO)
653	ST	c41,  3 * SIZE(BO)
654	ST	c51,  4 * SIZE(BO)
655	ST	c61,  5 * SIZE(BO)
656	ST	c71,  6 * SIZE(BO)
657	ST	c81,  7 * SIZE(BO)
658#else
659	ST	c11,  0 * SIZE(AO)
660	ST	c21,  1 * SIZE(AO)
661	ST	c31,  2 * SIZE(AO)
662	ST	c41,  3 * SIZE(AO)
663	ST	c51,  4 * SIZE(AO)
664	ST	c61,  5 * SIZE(AO)
665	ST	c71,  6 * SIZE(AO)
666	ST	c81,  7 * SIZE(AO)
667#endif
668
669	ST	c11,  0 * SIZE(CO1)
670	ST	c21,  0 * SIZE(CO2)
671	ST	c31,  0 * SIZE(CO3)
672	ST	c41,  0 * SIZE(CO4)
673	ST	c51,  0 * SIZE(CO5)
674	ST	c61,  0 * SIZE(CO6)
675	ST	c71,  0 * SIZE(CO7)
676	ST	c81,  0 * SIZE(CO8)
677
678	MTC	$0,  c11
679
680#ifndef LN
681	daddiu	CO1, CO1, 1 * SIZE
682	daddiu	CO2, CO2, 1 * SIZE
683	daddiu	CO3, CO3, 1 * SIZE
684	daddiu	CO4, CO4, 1 * SIZE
685	daddiu	CO5, CO5, 1 * SIZE
686	daddiu	CO6, CO6, 1 * SIZE
687	daddiu	CO7, CO7, 1 * SIZE
688	daddiu	CO8, CO8, 1 * SIZE
689#endif
690
691	MOV	c21, c11
692
693#ifdef RT
694	dsll	TEMP, K, BASE_SHIFT
695	daddu	AORIG, AORIG, TEMP
696#endif
697
698	MOV	c31, c11
699
700#if defined(LT) || defined(RN)
701	dsubu	TEMP, K, KK
702	dsll	L,    TEMP, 0 + BASE_SHIFT
703	dsll	TEMP, TEMP, 3 + BASE_SHIFT
704	daddu	AO, AO, L
705	daddu	BO, BO, TEMP
706#endif
707
708	MOV	c41, c11
709
710#ifdef LT
711	daddiu	KK, KK, 1
712#endif
713
714#ifdef LN
715	daddiu	KK, KK, -1
716#endif
717	.align 3
718
719.L20:
720	dsra	I,  M, 1
721	MOV	c51, c11
722	blez	I, .L29
723	MOV	c61, c11
724
725.L11:
726#if defined(LT) || defined(RN)
727	LD	a1,  0 * SIZE(AO)
728	MOV	c71, c11
729	LD	b1,  0 * SIZE(B)
730	MOV	c81, c11
731
732	LD	a3,  4 * SIZE(AO)
733	MOV	c12, c11
734	LD	b2,  1 * SIZE(B)
735	MOV	c22, c11
736
737	dsra	L,  KK, 2
738	MOV	c32, c11
739	LD	b3,  2 * SIZE(B)
740	MOV	c42, c11
741
742	LD	b4,  3 * SIZE(B)
743	MOV	c52, c11
744	LD	b5,  4 * SIZE(B)
745	MOV	c62, c11
746
747	LD	b6,  8 * SIZE(B)
748	MOV	c72, c11
749	LD	b7, 12 * SIZE(B)
750	MOV	c82, c11
751
752	blez	L, .L15
753	move	BO,  B
754#else
755
756#ifdef LN
757	dsll	TEMP,   K,  1 + BASE_SHIFT
758	dsubu	AORIG, AORIG, TEMP
759#endif
760
761	dsll	L,    KK, 1 + BASE_SHIFT
762	dsll	TEMP, KK, 3 + BASE_SHIFT
763
764	daddu	AO, AORIG, L
765	daddu	BO, B,     TEMP
766
767	dsubu	TEMP, K, KK
768
769	LD	a1,  0 * SIZE(AO)
770	MOV	c71, c11
771	LD	b1,  0 * SIZE(BO)
772	MOV	c81, c11
773
774	LD	a3,  4 * SIZE(AO)
775	MOV	c12, c11
776	LD	b2,  1 * SIZE(BO)
777	MOV	c22, c11
778
779	MOV	c32, c11
780	LD	b3,  2 * SIZE(BO)
781	MOV	c42, c11
782
783	LD	b4,  3 * SIZE(BO)
784	MOV	c52, c11
785	LD	b5,  4 * SIZE(BO)
786	MOV	c62, c11
787
788	LD	b6,  8 * SIZE(BO)
789	MOV	c72, c11
790	LD	b7, 12 * SIZE(BO)
791	MOV	c82, c11
792
793	dsra	L,  TEMP, 2
794	blez	L, .L15
795	NOP
796#endif
797
798	MADD	c11, c11, a1, b1
799	LD	a2,  1 * SIZE(AO)
800	MADD	c21, c21, a1, b2
801	daddiu	L, L, -1
802	MADD	c31, c31, a1, b3
803	blez	L, .L13
804	MADD	c41, c41, a1, b4
805	NOP
806	.align	3
807
808.L12:
809	MADD	c12, c12, a2, b1
810	LD	b1, 16 * SIZE(BO)
811	MADD	c22, c22, a2, b2
812	LD	b2,  5 * SIZE(BO)
813	MADD	c32, c32, a2, b3
814	LD	b3,  6 * SIZE(BO)
815	MADD	c42, c42, a2, b4
816	LD	b4,  7 * SIZE(BO)
817
818	MADD	c51, c51, a1, b5
819	NOP
820	MADD	c61, c61, a1, b2
821	LD	a4,  2 * SIZE(AO)
822	MADD	c71, c71, a1, b3
823	NOP
824	MADD	c81, c81, a1, b4
825	LD	a1,  8 * SIZE(AO)
826
827	MADD	c52, c52, a2, b5
828	LD	b5, 20 * SIZE(BO)
829	MADD	c62, c62, a2, b2
830	LD	b2,  9 * SIZE(BO)
831	MADD	c72, c72, a2, b3
832	LD	b3, 10 * SIZE(BO)
833	MADD	c82, c82, a2, b4
834	LD	b4, 11 * SIZE(BO)
835
836	MADD	c11, c11, a4, b6
837	LD	a2,  3 * SIZE(AO)
838	MADD	c21, c21, a4, b2
839	NOP
840	MADD	c31, c31, a4, b3
841	NOP
842	MADD	c41, c41, a4, b4
843	NOP
844
845	MADD	c12, c12, a2, b6
846	LD	b6, 24 * SIZE(BO)
847	MADD	c22, c22, a2, b2
848	LD	b2, 13 * SIZE(BO)
849	MADD	c32, c32, a2, b3
850	LD	b3, 14 * SIZE(BO)
851	MADD	c42, c42, a2, b4
852	LD	b4, 15 * SIZE(BO)
853
854	MADD	c51, c51, a4, b7
855	NOP
856	MADD	c61, c61, a4, b2
857	NOP
858	MADD	c71, c71, a4, b3
859	NOP
860	MADD	c81, c81, a4, b4
861	NOP
862
863	MADD	c52, c52, a2, b7
864	LD	b7, 28 * SIZE(BO)
865	MADD	c62, c62, a2, b2
866	LD	b2, 17 * SIZE(BO)
867	MADD	c72, c72, a2, b3
868	LD	b3, 18 * SIZE(BO)
869	MADD	c82, c82, a2, b4
870	LD	b4, 19 * SIZE(BO)
871
872	MADD	c11, c11, a3, b1
873	LD	a2,  5 * SIZE(AO)
874	MADD	c21, c21, a3, b2
875	NOP
876	MADD	c31, c31, a3, b3
877	NOP
878	MADD	c41, c41, a3, b4
879	NOP
880
881	MADD	c12, c12, a2, b1
882	LD	b1, 32 * SIZE(BO)
883	MADD	c22, c22, a2, b2
884	LD	b2, 21 * SIZE(BO)
885	MADD	c32, c32, a2, b3
886	LD	b3, 22 * SIZE(BO)
887	MADD	c42, c42, a2, b4
888	LD	b4, 23 * SIZE(BO)
889
890	MADD	c51, c51, a3, b5
891	NOP
892	MADD	c61, c61, a3, b2
893	LD	a4,  6 * SIZE(AO)
894	MADD	c71, c71, a3, b3
895	NOP
896	MADD	c81, c81, a3, b4
897	LD	a3, 12 * SIZE(AO)
898
899	MADD	c52, c52, a2, b5
900	LD	b5, 36 * SIZE(BO)
901	MADD	c62, c62, a2, b2
902	LD	b2, 25 * SIZE(BO)
903	MADD	c72, c72, a2, b3
904	LD	b3, 26 * SIZE(BO)
905	MADD	c82, c82, a2, b4
906	LD	b4, 27 * SIZE(BO)
907
908	MADD	c11, c11, a4, b6
909	LD	a2,  7 * SIZE(AO)
910	MADD	c21, c21, a4, b2
911	NOP
912	MADD	c31, c31, a4, b3
913	NOP
914	MADD	c41, c41, a4, b4
915	daddiu	L, L, -1
916
917	MADD	c12, c12, a2, b6
918	LD	b6, 40 * SIZE(BO)
919	MADD	c22, c22, a2, b2
920	LD	b2, 29 * SIZE(BO)
921	MADD	c32, c32, a2, b3
922	LD	b3, 30 * SIZE(BO)
923	MADD	c42, c42, a2, b4
924	LD	b4, 31 * SIZE(BO)
925
926	MADD	c51, c51, a4, b7
927	daddiu	BO, BO, 32 * SIZE
928	MADD	c61, c61, a4, b2
929	daddiu	AO, AO,  8 * SIZE
930	MADD	c71, c71, a4, b3
931	NOP
932	MADD	c81, c81, a4, b4
933	NOP
934
935	MADD	c52, c52, a2, b7
936	LD	b7, 12 * SIZE(BO)
937	MADD	c62, c62, a2, b2
938	LD	b2,  1 * SIZE(BO)
939	MADD	c72, c72, a2, b3
940	LD	b3,  2 * SIZE(BO)
941	MADD	c82, c82, a2, b4
942	LD	b4,  3 * SIZE(BO)
943
944	MADD	c11, c11, a1, b1
945	LD	a2,  1 * SIZE(AO)
946	MADD	c21, c21, a1, b2
947	NOP
948	MADD	c31, c31, a1, b3
949	bgtz	L, .L12
950	MADD	c41, c41, a1, b4
951	NOP
952	.align 3
953
954.L13:
955	MADD	c12, c12, a2, b1
956	LD	b1, 16 * SIZE(BO)
957	MADD	c22, c22, a2, b2
958	LD	b2,  5 * SIZE(BO)
959	MADD	c32, c32, a2, b3
960	LD	b3,  6 * SIZE(BO)
961	MADD	c42, c42, a2, b4
962	LD	b4,  7 * SIZE(BO)
963
964	MADD	c51, c51, a1, b5
965	NOP
966	MADD	c61, c61, a1, b2
967	LD	a4,  2 * SIZE(AO)
968	MADD	c71, c71, a1, b3
969	NOP
970	MADD	c81, c81, a1, b4
971	LD	a1,  8 * SIZE(AO)
972
973	MADD	c52, c52, a2, b5
974	LD	b5, 20 * SIZE(BO)
975	MADD	c62, c62, a2, b2
976	LD	b2,  9 * SIZE(BO)
977	MADD	c72, c72, a2, b3
978	LD	b3, 10 * SIZE(BO)
979	MADD	c82, c82, a2, b4
980	LD	b4, 11 * SIZE(BO)
981
982	MADD	c11, c11, a4, b6
983	LD	a2,  3 * SIZE(AO)
984	MADD	c21, c21, a4, b2
985	NOP
986	MADD	c31, c31, a4, b3
987	NOP
988	MADD	c41, c41, a4, b4
989	NOP
990
991	MADD	c12, c12, a2, b6
992	LD	b6, 24 * SIZE(BO)
993	MADD	c22, c22, a2, b2
994	LD	b2, 13 * SIZE(BO)
995	MADD	c32, c32, a2, b3
996	LD	b3, 14 * SIZE(BO)
997	MADD	c42, c42, a2, b4
998	LD	b4, 15 * SIZE(BO)
999
1000	MADD	c51, c51, a4, b7
1001	NOP
1002	MADD	c61, c61, a4, b2
1003	NOP
1004	MADD	c71, c71, a4, b3
1005	NOP
1006	MADD	c81, c81, a4, b4
1007	NOP
1008
1009	MADD	c52, c52, a2, b7
1010	LD	b7, 28 * SIZE(BO)
1011	MADD	c62, c62, a2, b2
1012	LD	b2, 17 * SIZE(BO)
1013	MADD	c72, c72, a2, b3
1014	LD	b3, 18 * SIZE(BO)
1015	MADD	c82, c82, a2, b4
1016	LD	b4, 19 * SIZE(BO)
1017
1018	MADD	c11, c11, a3, b1
1019	LD	a2,  5 * SIZE(AO)
1020	MADD	c21, c21, a3, b2
1021	NOP
1022	MADD	c31, c31, a3, b3
1023	NOP
1024	MADD	c41, c41, a3, b4
1025	NOP
1026
1027	MADD	c12, c12, a2, b1
1028	LD	b1, 32 * SIZE(BO)
1029	MADD	c22, c22, a2, b2
1030	LD	b2, 21 * SIZE(BO)
1031	MADD	c32, c32, a2, b3
1032	LD	b3, 22 * SIZE(BO)
1033	MADD	c42, c42, a2, b4
1034	LD	b4, 23 * SIZE(BO)
1035
1036	MADD	c51, c51, a3, b5
1037	NOP
1038	MADD	c61, c61, a3, b2
1039	LD	a4,  6 * SIZE(AO)
1040	MADD	c71, c71, a3, b3
1041	NOP
1042	MADD	c81, c81, a3, b4
1043	LD	a3, 12 * SIZE(AO)
1044
1045	MADD	c52, c52, a2, b5
1046	LD	b5, 36 * SIZE(BO)
1047	MADD	c62, c62, a2, b2
1048	LD	b2, 25 * SIZE(BO)
1049	MADD	c72, c72, a2, b3
1050	LD	b3, 26 * SIZE(BO)
1051	MADD	c82, c82, a2, b4
1052	LD	b4, 27 * SIZE(BO)
1053
1054	MADD	c11, c11, a4, b6
1055	LD	a2,  7 * SIZE(AO)
1056	MADD	c21, c21, a4, b2
1057	NOP
1058	MADD	c31, c31, a4, b3
1059	NOP
1060	MADD	c41, c41, a4, b4
1061	NOP
1062
1063	MADD	c12, c12, a2, b6
1064	LD	b6, 40 * SIZE(BO)
1065	MADD	c22, c22, a2, b2
1066	LD	b2, 29 * SIZE(BO)
1067	MADD	c32, c32, a2, b3
1068	LD	b3, 30 * SIZE(BO)
1069	MADD	c42, c42, a2, b4
1070	LD	b4, 31 * SIZE(BO)
1071
1072	MADD	c51, c51, a4, b7
1073	daddiu	BO, BO, 32 * SIZE
1074	MADD	c61, c61, a4, b2
1075	daddiu	AO, AO,  8 * SIZE
1076	MADD	c71, c71, a4, b3
1077	NOP
1078	MADD	c81, c81, a4, b4
1079	NOP
1080
1081	MADD	c52, c52, a2, b7
1082	LD	b7, 12 * SIZE(BO)
1083	MADD	c62, c62, a2, b2
1084	LD	b2,  1 * SIZE(BO)
1085	MADD	c72, c72, a2, b3
1086	LD	b3,  2 * SIZE(BO)
1087	MADD	c82, c82, a2, b4
1088	LD	b4,  3 * SIZE(BO)
1089	.align 3
1090
1091.L15:
1092#if defined(LT) || defined(RN)
1093	andi	L, KK,  3
1094#else
1095	andi	L, TEMP, 3
1096#endif
1097	blez	L, .L18
1098	NOP
1099	.align	3
1100
1101.L16:
1102	MADD	c11, c11, a1, b1
1103	LD	a2,  1 * SIZE(AO)
1104	MADD	c21, c21, a1, b2
1105	NOP
1106	MADD	c31, c31, a1, b3
1107	NOP
1108	MADD	c41, c41, a1, b4
1109	NOP
1110
1111	MADD	c12, c12, a2, b1
1112	LD	b1,  8 * SIZE(BO)
1113	MADD	c22, c22, a2, b2
1114	LD	b2,  5 * SIZE(BO)
1115	MADD	c32, c32, a2, b3
1116	LD	b3,  6 * SIZE(BO)
1117	MADD	c42, c42, a2, b4
1118	LD	b4,  7 * SIZE(BO)
1119
1120	MADD	c51, c51, a1, b5
1121	daddiu	L, L, -1
1122	MADD	c61, c61, a1, b2
1123	daddiu	AO, AO,  2 * SIZE
1124	MADD	c71, c71, a1, b3
1125	daddiu	BO, BO,  8 * SIZE
1126	MADD	c81, c81, a1, b4
1127	LD	a1,  0 * SIZE(AO)
1128
1129	MADD	c52, c52, a2, b5
1130	LD	b5,  4 * SIZE(BO)
1131	MADD	c62, c62, a2, b2
1132	LD	b2,  1 * SIZE(BO)
1133	MADD	c72, c72, a2, b3
1134	LD	b3,  2 * SIZE(BO)
1135	MADD	c82, c82, a2, b4
1136	bgtz	L, .L16
1137	LD	b4,  3 * SIZE(BO)
1138
1139.L18:
1140#if defined(LN) || defined(RT)
1141#ifdef LN
1142	daddiu	TEMP, KK, -2
1143#else
1144	daddiu	TEMP, KK, -8
1145#endif
1146
1147	dsll	L,    TEMP, 1 + BASE_SHIFT
1148	dsll	TEMP, TEMP, 3 + BASE_SHIFT
1149	daddu	AO, AORIG, L
1150	daddu	BO, B,     TEMP
1151#endif
1152
1153#if defined(LN) || defined(LT)
1154	LD	b1,  0 * SIZE(BO)
1155	LD	b2,  1 * SIZE(BO)
1156	LD	b3,  2 * SIZE(BO)
1157	LD	b4,  3 * SIZE(BO)
1158
1159	SUB	c11, b1, c11
1160	LD	b5,  4 * SIZE(BO)
1161	SUB	c21, b2, c21
1162	LD	b6,  5 * SIZE(BO)
1163	SUB	c31, b3, c31
1164	LD	b7,  6 * SIZE(BO)
1165	SUB	c41, b4, c41
1166	LD	b8,  7 * SIZE(BO)
1167
1168	SUB	c51, b5, c51
1169	LD	b1,  8 * SIZE(BO)
1170	SUB	c61, b6, c61
1171	LD	b2,  9 * SIZE(BO)
1172	SUB	c71, b7, c71
1173	LD	b3, 10 * SIZE(BO)
1174	SUB	c81, b8, c81
1175	LD	b4, 11 * SIZE(BO)
1176
1177	SUB	c12, b1, c12
1178	LD	b5, 12 * SIZE(BO)
1179	SUB	c22, b2, c22
1180	LD	b6, 13 * SIZE(BO)
1181	SUB	c32, b3, c32
1182	LD	b7, 14 * SIZE(BO)
1183	SUB	c42, b4, c42
1184	LD	b8, 15 * SIZE(BO)
1185
1186	SUB	c52, b5, c52
1187#ifdef LN
1188	LD	b1,  3 * SIZE(AO)
1189#else
1190	LD	b1,  0 * SIZE(AO)
1191#endif
1192	SUB	c62, b6, c62
1193	SUB	c72, b7, c72
1194	SUB	c82, b8, c82
1195#else
1196	LD	b1,  0 * SIZE(AO)
1197	LD	b2,  1 * SIZE(AO)
1198	LD	b3,  2 * SIZE(AO)
1199	LD	b4,  3 * SIZE(AO)
1200
1201	SUB	c11, b1, c11
1202	LD	b5,  4 * SIZE(AO)
1203	SUB	c12, b2, c12
1204	LD	b6,  5 * SIZE(AO)
1205	SUB	c21, b3, c21
1206	LD	b7,  6 * SIZE(AO)
1207	SUB	c22, b4, c22
1208	LD	b8,  7 * SIZE(AO)
1209
1210	SUB	c31, b5, c31
1211	LD	b1,  8 * SIZE(AO)
1212	SUB	c32, b6, c32
1213	LD	b2,  9 * SIZE(AO)
1214	SUB	c41, b7, c41
1215	LD	b3, 10 * SIZE(AO)
1216	SUB	c42, b8, c42
1217	LD	b4, 11 * SIZE(AO)
1218
1219	LD	b5, 12 * SIZE(AO)
1220	SUB	c51, b1, c51
1221	LD	b6, 13 * SIZE(AO)
1222	SUB	c52, b2, c52
1223	LD	b7, 14 * SIZE(AO)
1224	SUB	c61, b3, c61
1225	LD	b8, 15 * SIZE(AO)
1226	SUB	c62, b4, c62
1227
1228	SUB	c71, b5, c71
1229	SUB	c72, b6, c72
1230	SUB	c81, b7, c81
1231	SUB	c82, b8, c82
1232#endif
1233
1234#ifdef LN
1235	MUL	c12, b1, c12
1236	LD	b2,  2 * SIZE(AO)
1237	MUL	c22, b1, c22
1238	MUL	c32, b1, c32
1239	MUL	c42, b1, c42
1240	MUL	c52, b1, c52
1241	MUL	c62, b1, c62
1242	MUL	c72, b1, c72
1243	MUL	c82, b1, c82
1244
1245	NMSUB	c11, c11, b2, c12
1246	LD	b3,  0 * SIZE(AO)
1247	NMSUB	c21, c21, b2, c22
1248	NMSUB	c31, c31, b2, c32
1249	NMSUB	c41, c41, b2, c42
1250	NMSUB	c51, c51, b2, c52
1251	NMSUB	c61, c61, b2, c62
1252	NMSUB	c71, c71, b2, c72
1253	NMSUB	c81, c81, b2, c82
1254
1255	MUL	c11, b3, c11
1256	daddiu	CO1, CO1, -2 * SIZE
1257	MUL	c21, b3, c21
1258	daddiu	CO2, CO2, -2 * SIZE
1259	MUL	c31, b3, c31
1260	daddiu	CO3, CO3, -2 * SIZE
1261	MUL	c41, b3, c41
1262	daddiu	CO4, CO4, -2 * SIZE
1263	MUL	c51, b3, c51
1264	daddiu	CO5, CO5, -2 * SIZE
1265	MUL	c61, b3, c61
1266	daddiu	CO6, CO6, -2 * SIZE
1267	MUL	c71, b3, c71
1268	daddiu	CO7, CO7, -2 * SIZE
1269	MUL	c81, b3, c81
1270	daddiu	CO8, CO8, -2 * SIZE
1271#endif
1272
1273#ifdef LT
1274	MUL	c11, b1, c11
1275	LD	b2,  1 * SIZE(AO)
1276	MUL	c21, b1, c21
1277	MUL	c31, b1, c31
1278	MUL	c41, b1, c41
1279	MUL	c51, b1, c51
1280	MUL	c61, b1, c61
1281	MUL	c71, b1, c71
1282	MUL	c81, b1, c81
1283
1284	NMSUB	c12, c12, b2, c11
1285	LD	b3,  3 * SIZE(AO)
1286	NMSUB	c22, c22, b2, c21
1287	NMSUB	c32, c32, b2, c31
1288	NMSUB	c42, c42, b2, c41
1289	NMSUB	c52, c52, b2, c51
1290	NMSUB	c62, c62, b2, c61
1291	NMSUB	c72, c72, b2, c71
1292	NMSUB	c82, c82, b2, c81
1293
1294	MUL	c12, b3, c12
1295	MUL	c22, b3, c22
1296	MUL	c32, b3, c32
1297	MUL	c42, b3, c42
1298	MUL	c52, b3, c52
1299	MUL	c62, b3, c62
1300	MUL	c72, b3, c72
1301	MUL	c82, b3, c82
1302#endif
1303
1304#ifdef RN
1305	LD	b1,  0 * SIZE(BO)
1306	LD	b2,  1 * SIZE(BO)
1307	LD	b3,  2 * SIZE(BO)
1308	LD	b4,  3 * SIZE(BO)
1309
1310	MUL	c11, b1, c11
1311	MUL	c12, b1, c12
1312	LD	b5,  4 * SIZE(BO)
1313
1314	NMSUB	c21, c21, b2, c11
1315	NMSUB	c22, c22, b2, c12
1316	LD	b6,  5 * SIZE(BO)
1317	NMSUB	c31, c31, b3, c11
1318	NMSUB	c32, c32, b3, c12
1319	LD	b7,  6 * SIZE(BO)
1320	NMSUB	c41, c41, b4, c11
1321	NMSUB	c42, c42, b4, c12
1322	LD	b8,  7 * SIZE(BO)
1323
1324	NMSUB	c51, c51, b5, c11
1325	NMSUB	c52, c52, b5, c12
1326	LD	b2,  9 * SIZE(BO)
1327	NMSUB	c61, c61, b6, c11
1328	NMSUB	c62, c62, b6, c12
1329	LD	b3, 10 * SIZE(BO)
1330	NMSUB	c71, c71, b7, c11
1331	NMSUB	c72, c72, b7, c12
1332	LD	b4, 11 * SIZE(BO)
1333	NMSUB	c81, c81, b8, c11
1334	NMSUB	c82, c82, b8, c12
1335	LD	b5, 12 * SIZE(BO)
1336
1337	MUL	c21, b2, c21
1338	MUL	c22, b2, c22
1339	LD	b6, 13 * SIZE(BO)
1340
1341	NMSUB	c31, c31, b3, c21
1342	NMSUB	c32, c32, b3, c22
1343	LD	b7, 14 * SIZE(BO)
1344	NMSUB	c41, c41, b4, c21
1345	NMSUB	c42, c42, b4, c22
1346	LD	b8, 15 * SIZE(BO)
1347	NMSUB	c51, c51, b5, c21
1348	NMSUB	c52, c52, b5, c22
1349	LD	b3, 18 * SIZE(BO)
1350	NMSUB	c61, c61, b6, c21
1351	NMSUB	c62, c62, b6, c22
1352	LD	b4, 19 * SIZE(BO)
1353	NMSUB	c71, c71, b7, c21
1354	NMSUB	c72, c72, b7, c22
1355	LD	b5, 20 * SIZE(BO)
1356	NMSUB	c81, c81, b8, c21
1357	NMSUB	c82, c82, b8, c22
1358	LD	b6, 21 * SIZE(BO)
1359
1360	MUL	c31, b3, c31
1361	MUL	c32, b3, c32
1362	LD	b7, 22 * SIZE(BO)
1363
1364	NMSUB	c41, c41, b4, c31
1365	NMSUB	c42, c42, b4, c32
1366	LD	b8, 23 * SIZE(BO)
1367	NMSUB	c51, c51, b5, c31
1368	NMSUB	c52, c52, b5, c32
1369	LD	b4, 27 * SIZE(BO)
1370	NMSUB	c61, c61, b6, c31
1371	NMSUB	c62, c62, b6, c32
1372	LD	b5, 28 * SIZE(BO)
1373	NMSUB	c71, c71, b7, c31
1374	NMSUB	c72, c72, b7, c32
1375	LD	b6, 29 * SIZE(BO)
1376	NMSUB	c81, c81, b8, c31
1377	NMSUB	c82, c82, b8, c32
1378	LD	b7, 30 * SIZE(BO)
1379
1380	MUL	c41, b4, c41
1381	MUL	c42, b4, c42
1382	LD	b8, 31 * SIZE(BO)
1383
1384	NMSUB	c51, c51, b5, c41
1385	NMSUB	c52, c52, b5, c42
1386	LD	b5, 36 * SIZE(BO)
1387	NMSUB	c61, c61, b6, c41
1388	NMSUB	c62, c62, b6, c42
1389	LD	b6, 37 * SIZE(BO)
1390	NMSUB	c71, c71, b7, c41
1391	NMSUB	c72, c72, b7, c42
1392	LD	b7, 38 * SIZE(BO)
1393	NMSUB	c81, c81, b8, c41
1394	NMSUB	c82, c82, b8, c42
1395	LD	b8, 39 * SIZE(BO)
1396
1397	MUL	c51, b5, c51
1398	MUL	c52, b5, c52
1399
1400	NMSUB	c61, c61, b6, c51
1401	NMSUB	c62, c62, b6, c52
1402	LD	b6, 45 * SIZE(BO)
1403	NMSUB	c71, c71, b7, c51
1404	NMSUB	c72, c72, b7, c52
1405	LD	b7, 46 * SIZE(BO)
1406	NMSUB	c81, c81, b8, c51
1407	NMSUB	c82, c82, b8, c52
1408	LD	b8, 47 * SIZE(BO)
1409
1410	MUL	c61, b6, c61
1411	MUL	c62, b6, c62
1412
1413	NMSUB	c71, c71, b7, c61
1414	NMSUB	c72, c72, b7, c62
1415	LD	b7, 54 * SIZE(BO)
1416	NMSUB	c81, c81, b8, c61
1417	NMSUB	c82, c82, b8, c62
1418	LD	b8, 55 * SIZE(BO)
1419
1420	MUL	c71, b7, c71
1421	MUL	c72, b7, c72
1422
1423	NMSUB	c81, c81, b8, c71
1424	NMSUB	c82, c82, b8, c72
1425	LD	b8, 63 * SIZE(BO)
1426
1427	MUL	c81, b8, c81
1428	MUL	c82, b8, c82
1429#endif
1430
1431#ifdef RT
1432	LD	b1, 63 * SIZE(BO)
1433	LD	b2, 62 * SIZE(BO)
1434	LD	b3, 61 * SIZE(BO)
1435	LD	b4, 60 * SIZE(BO)
1436
1437	MUL	c81, b1, c81
1438	MUL	c82, b1, c82
1439	LD	b5, 59 * SIZE(BO)
1440
1441	NMSUB	c71, c71, b2, c81
1442	NMSUB	c72, c72, b2, c82
1443	LD	b6, 58 * SIZE(BO)
1444	NMSUB	c61, c61, b3, c81
1445	NMSUB	c62, c62, b3, c82
1446	LD	b7, 57 * SIZE(BO)
1447	NMSUB	c51, c51, b4, c81
1448	NMSUB	c52, c52, b4, c82
1449	LD	b8, 56 * SIZE(BO)
1450
1451	NMSUB	c41, c41, b5, c81
1452	NMSUB	c42, c42, b5, c82
1453	LD	b2, 54 * SIZE(BO)
1454	NMSUB	c31, c31, b6, c81
1455	NMSUB	c32, c32, b6, c82
1456	LD	b3, 53 * SIZE(BO)
1457	NMSUB	c21, c21, b7, c81
1458	NMSUB	c22, c22, b7, c82
1459	LD	b4, 52 * SIZE(BO)
1460	NMSUB	c11, c11, b8, c81
1461	NMSUB	c12, c12, b8, c82
1462	LD	b5, 51 * SIZE(BO)
1463
1464	MUL	c71, b2, c71
1465	MUL	c72, b2, c72
1466	LD	b6, 50 * SIZE(BO)
1467
1468	NMSUB	c61, c61, b3, c71
1469	NMSUB	c62, c62, b3, c72
1470	LD	b7, 49 * SIZE(BO)
1471	NMSUB	c51, c51, b4, c71
1472	NMSUB	c52, c52, b4, c72
1473	LD	b8, 48 * SIZE(BO)
1474	NMSUB	c41, c41, b5, c71
1475	NMSUB	c42, c42, b5, c72
1476	LD	b3, 45 * SIZE(BO)
1477	NMSUB	c31, c31, b6, c71
1478	NMSUB	c32, c32, b6, c72
1479	LD	b4, 44 * SIZE(BO)
1480	NMSUB	c21, c21, b7, c71
1481	NMSUB	c22, c22, b7, c72
1482	LD	b5, 43 * SIZE(BO)
1483	NMSUB	c11, c11, b8, c71
1484	NMSUB	c12, c12, b8, c72
1485	LD	b6, 42 * SIZE(BO)
1486
1487	MUL	c61, b3, c61
1488	MUL	c62, b3, c62
1489	LD	b7, 41 * SIZE(BO)
1490
1491	NMSUB	c51, c51, b4, c61
1492	NMSUB	c52, c52, b4, c62
1493	LD	b8, 40 * SIZE(BO)
1494	NMSUB	c41, c41, b5, c61
1495	NMSUB	c42, c42, b5, c62
1496	LD	b4, 36 * SIZE(BO)
1497	NMSUB	c31, c31, b6, c61
1498	NMSUB	c32, c32, b6, c62
1499	LD	b5, 35 * SIZE(BO)
1500	NMSUB	c21, c21, b7, c61
1501	NMSUB	c22, c22, b7, c62
1502	LD	b6, 34 * SIZE(BO)
1503	NMSUB	c11, c11, b8, c61
1504	NMSUB	c12, c12, b8, c62
1505	LD	b7, 33 * SIZE(BO)
1506
1507	MUL	c51, b4, c51
1508	MUL	c52, b4, c52
1509	LD	b8, 32 * SIZE(BO)
1510
1511	NMSUB	c41, c41, b5, c51
1512	NMSUB	c42, c42, b5, c52
1513	LD	b5, 27 * SIZE(BO)
1514	NMSUB	c31, c31, b6, c51
1515	NMSUB	c32, c32, b6, c52
1516	LD	b6, 26 * SIZE(BO)
1517	NMSUB	c21, c21, b7, c51
1518	NMSUB	c22, c22, b7, c52
1519	LD	b7, 25 * SIZE(BO)
1520	NMSUB	c11, c11, b8, c51
1521	NMSUB	c12, c12, b8, c52
1522	LD	b8, 24 * SIZE(BO)
1523
1524	MUL	c41, b5, c41
1525	MUL	c42, b5, c42
1526
1527	NMSUB	c31, c31, b6, c41
1528	NMSUB	c32, c32, b6, c42
1529	LD	b6, 18 * SIZE(BO)
1530	NMSUB	c21, c21, b7, c41
1531	NMSUB	c22, c22, b7, c42
1532	LD	b7, 17 * SIZE(BO)
1533	NMSUB	c11, c11, b8, c41
1534	NMSUB	c12, c12, b8, c42
1535	LD	b8, 16 * SIZE(BO)
1536
1537	MUL	c31, b6, c31
1538	MUL	c32, b6, c32
1539
1540	NMSUB	c21, c21, b7, c31
1541	NMSUB	c22, c22, b7, c32
1542	LD	b7,  9 * SIZE(BO)
1543	NMSUB	c11, c11, b8, c31
1544	NMSUB	c12, c12, b8, c32
1545	LD	b8,  8 * SIZE(BO)
1546
1547	MUL	c21, b7, c21
1548	MUL	c22, b7, c22
1549
1550	NMSUB	c11, c11, b8, c21
1551	NMSUB	c12, c12, b8, c22
1552	LD	b8,  0 * SIZE(BO)
1553
1554	MUL	c11, b8, c11
1555	MUL	c12, b8, c12
1556#endif
1557
1558#if defined(LN) || defined(LT)
1559	ST	c11,  0 * SIZE(BO)
1560	ST	c21,  1 * SIZE(BO)
1561	ST	c31,  2 * SIZE(BO)
1562	ST	c41,  3 * SIZE(BO)
1563	ST	c51,  4 * SIZE(BO)
1564	ST	c61,  5 * SIZE(BO)
1565	ST	c71,  6 * SIZE(BO)
1566	ST	c81,  7 * SIZE(BO)
1567
1568	ST	c12,  8 * SIZE(BO)
1569	ST	c22,  9 * SIZE(BO)
1570	ST	c32, 10 * SIZE(BO)
1571	ST	c42, 11 * SIZE(BO)
1572	ST	c52, 12 * SIZE(BO)
1573	ST	c62, 13 * SIZE(BO)
1574	ST	c72, 14 * SIZE(BO)
1575	ST	c82, 15 * SIZE(BO)
1576#else
1577	ST	c11,  0 * SIZE(AO)
1578	ST	c12,  1 * SIZE(AO)
1579	ST	c21,  2 * SIZE(AO)
1580	ST	c22,  3 * SIZE(AO)
1581	ST	c31,  4 * SIZE(AO)
1582	ST	c32,  5 * SIZE(AO)
1583	ST	c41,  6 * SIZE(AO)
1584	ST	c42,  7 * SIZE(AO)
1585
1586	ST	c51,  8 * SIZE(AO)
1587	ST	c52,  9 * SIZE(AO)
1588	ST	c61, 10 * SIZE(AO)
1589	ST	c62, 11 * SIZE(AO)
1590	ST	c71, 12 * SIZE(AO)
1591	ST	c72, 13 * SIZE(AO)
1592	ST	c81, 14 * SIZE(AO)
1593	ST	c82, 15 * SIZE(AO)
1594#endif
1595
1596	ST	c11,  0 * SIZE(CO1)
1597	ST	c12,  1 * SIZE(CO1)
1598	ST	c21,  0 * SIZE(CO2)
1599	ST	c22,  1 * SIZE(CO2)
1600	ST	c31,  0 * SIZE(CO3)
1601	ST	c32,  1 * SIZE(CO3)
1602	ST	c41,  0 * SIZE(CO4)
1603	ST	c42,  1 * SIZE(CO4)
1604	ST	c51,  0 * SIZE(CO5)
1605	ST	c52,  1 * SIZE(CO5)
1606	ST	c61,  0 * SIZE(CO6)
1607	ST	c62,  1 * SIZE(CO6)
1608	ST	c71,  0 * SIZE(CO7)
1609	ST	c72,  1 * SIZE(CO7)
1610	ST	c81,  0 * SIZE(CO8)
1611	ST	c82,  1 * SIZE(CO8)
1612
1613	MTC	$0,  a1
1614
1615#ifndef LN
1616	daddiu	CO1, CO1, 2 * SIZE
1617	daddiu	CO2, CO2, 2 * SIZE
1618	daddiu	CO3, CO3, 2 * SIZE
1619	daddiu	CO4, CO4, 2 * SIZE
1620	daddiu	CO5, CO5, 2 * SIZE
1621	daddiu	CO6, CO6, 2 * SIZE
1622	daddiu	CO7, CO7, 2 * SIZE
1623	daddiu	CO8, CO8, 2 * SIZE
1624#endif
1625
1626	MOV	c11, a1
1627	MOV	c21, a1
1628
1629#ifdef RT
1630	dsll	TEMP, K, 1 + BASE_SHIFT
1631	daddu	AORIG, AORIG, TEMP
1632#endif
1633
1634	MOV	c31, a1
1635	MOV	c41, a1
1636
1637#if defined(LT) || defined(RN)
1638	dsubu	TEMP, K, KK
1639	dsll	L,    TEMP, 1 + BASE_SHIFT
1640	dsll	TEMP, TEMP, 3 + BASE_SHIFT
1641	daddu	AO, AO, L
1642	daddu	BO, BO, TEMP
1643#endif
1644
1645#ifdef LT
1646	daddiu	KK, KK, 2
1647#endif
1648
1649#ifdef LN
1650	daddiu	KK, KK, -2
1651#endif
1652
1653	daddiu	I, I, -1
1654	MOV	c51, a1
1655
1656	bgtz	I, .L11
1657	MOV	c61, a1
1658	.align 3
1659
1660.L29:	/* Tail of one pass over an 8-wide panel: update B and KK for the active variant, then loop to .L10 while J > 0. */
1661#ifdef LN
1662	dsll	TEMP, K, 3 + BASE_SHIFT	/* TEMP = K << (3 + BASE_SHIFT) */
1663	daddu	B, B, TEMP	/* LN: step B forward by that amount */
1664#endif
1665
1666#if defined(LT) || defined(RN)
1667	move	B,  BO	/* LT/RN: B continues from the current BO */
1668#endif
1669
1670#ifdef RN
1671	daddiu	KK, KK,  8	/* RN: KK grows by the panel width */
1672#endif
1673
1674#ifdef RT
1675	daddiu	KK, KK, -8	/* RT: KK shrinks by the panel width */
1676#endif
1677
1678	bgtz	J, .L10
1679	NOP	/* explicit filler for the slot after the branch */
1680	.align 3
1681
1682.L30:	/* Panel taken only when (N & 4) is non-zero; sets up four C column pointers CO1..CO4 and zeroes c11..c41. */
1683	andi	J,  N, 4
1684	blez	J, .L50	/* bit clear: go on to the (N & 2) panel */
1685	move	AO, A	/* occupies the slot after the branch, so it runs on both paths */
1686
1687#ifdef RT
1688	dsll	TEMP, K, 2 + BASE_SHIFT
1689	dsubu	B, B, TEMP	/* RT: B -= K << (2 + BASE_SHIFT) */
1690
1691	dsll	TEMP, LDC, 2
1692	dsubu	C, C, TEMP	/* RT: C -= LDC << 2 */
1693#endif
1694
1695	move	CO1, C
1696	MTC	$0,  c11	/* c11 takes the value of integer register $0 */
1697	daddu	CO2, C,   LDC
1698	daddu	CO3, CO2, LDC
1699	MOV	c21, c11
1700	daddu	CO4, CO3, LDC	/* CO1..CO4 are spaced LDC apart */
1701	MOV	c31, c11
1702
1703#ifdef LN
1704	daddu	KK, M, OFFSET	/* LN: KK starts at M + OFFSET */
1705#endif
1706
1707#ifdef LT
1708	move	KK, OFFSET	/* LT: KK starts at OFFSET */
1709#endif
1710
1711#if defined(LN) || defined(RT)
1712	move	AORIG, A	/* LN/RT keep the A base in AORIG; AO is derived from it later */
1713#else
1714	move	AO, A
1715#endif
1716#ifndef RT
1717	daddu	C,  CO4, LDC	/* non-RT: C moves on past the four columns */
1718#endif
1719
1720	andi	I,  M, 1
1721	blez	I, .L40	/* even M: no single leftover row, go to the 2-row loop */
1722	MOV	c41, c11	/* slot after the branch: c41 is cleared on both paths */
1723
1724#if defined(LT) || defined(RN)
1725	LD	a1,  0 * SIZE(AO)	/* LT/RN path: preload operands straight from AO and B */
1726	MOV	c71, c11
1727	LD	a2,  1 * SIZE(AO)
1728	MOV	c81, c11
1729
1730	LD	b1,  0 * SIZE(B)
1731	LD	b2,  1 * SIZE(B)
1732	LD	b3,  2 * SIZE(B)
1733	LD	b4,  3 * SIZE(B)
1734	LD	b5,  4 * SIZE(B)
1735	LD	b6,  8 * SIZE(B)
1736	LD	b7, 12 * SIZE(B)
1737
1738	dsra	L,  KK, 2	/* L = KK >> 2: trip count of the 4-way unrolled loop .L42 */
1739
1740	blez	L, .L45
1741	move	BO,  B	/* slot after the branch: BO = B on both paths */
1742#else
1743#ifdef LN
1744	dsll	TEMP,   K,  BASE_SHIFT
1745	dsubu	AORIG, AORIG, TEMP	/* LN: AORIG -= K << BASE_SHIFT */
1746#endif
1747
1748	dsll	L,    KK, 0 + BASE_SHIFT
1749	dsll	TEMP, KK, 2 + BASE_SHIFT
1750
1751	daddu	AO, AORIG, L	/* AO = AORIG + (KK << BASE_SHIFT) */
1752	daddu	BO, B,     TEMP	/* BO = B + (KK << (2 + BASE_SHIFT)) */
1753
1754	dsubu	TEMP, K, KK	/* TEMP = K - KK; .L45 reads its low two bits on this path */
1755
1756	LD	a1,  0 * SIZE(AO)
1757	MOV	c71, c11
1758	LD	a2,  1 * SIZE(AO)
1759	MOV	c81, c11
1760
1761	LD	b1,  0 * SIZE(BO)
1762	LD	b2,  1 * SIZE(BO)
1763	LD	b3,  2 * SIZE(BO)
1764	LD	b4,  3 * SIZE(BO)
1765	LD	b5,  4 * SIZE(BO)
1766	LD	b6,  8 * SIZE(BO)
1767	LD	b7, 12 * SIZE(BO)
1768
1769	dsra	L,  TEMP, 2	/* L = (K - KK) >> 2 */
1770
1771	blez	L, .L45
1772	NOP
1773#endif
1774	.align	3
1775
1776.L42:	/* 4-way unrolled accumulate loop for one row and four columns: per pass AO advances 4*SIZE and BO 16*SIZE. */
1777	MADD	c11, c11, a1, b1
1778	LD	b1, 16 * SIZE(BO)	/* loads for later steps are interleaved with the multiply-adds */
1779	MADD	c21, c21, a1, b2
1780	LD	b2,  5 * SIZE(BO)
1781	MADD	c31, c31, a1, b3
1782	LD	b3,  6 * SIZE(BO)
1783	MADD	c41, c41, a1, b4
1784	LD	b4,  7 * SIZE(BO)
1785
1786	LD	a1,  4 * SIZE(AO)	/* a1 for the next pass */
1787	daddiu	L, L, -1
1788
1789	MADD	c11, c11, a2, b5
1790	LD	b5, 20 * SIZE(BO)
1791	MADD	c21, c21, a2, b2
1792	LD	b2,  9 * SIZE(BO)
1793	MADD	c31, c31, a2, b3
1794	LD	b3, 10 * SIZE(BO)
1795	MADD	c41, c41, a2, b4
1796	LD	b4, 11 * SIZE(BO)
1797
1798	LD	a2,  2 * SIZE(AO)
1799	daddiu	AO, AO,  4 * SIZE
1800
1801	MADD	c11, c11, a2, b6
1802	LD	b6, 24 * SIZE(BO)
1803	MADD	c21, c21, a2, b2
1804	LD	b2, 13 * SIZE(BO)
1805	MADD	c31, c31, a2, b3
1806	LD	b3, 14 * SIZE(BO)
1807	MADD	c41, c41, a2, b4
1808	LD	b4, 15 * SIZE(BO)
1809
1810	LD	a2, -1 * SIZE(AO)	/* AO was already advanced above, hence the negative offset */
1811	daddiu	BO, BO, 16 * SIZE
1812
1813	MADD	c11, c11, a2, b7
1814	LD	b7, 12 * SIZE(BO)	/* offsets from here on are relative to the advanced BO */
1815	MADD	c21, c21, a2, b2
1816	LD	b2,  1 * SIZE(BO)
1817	MADD	c31, c31, a2, b3
1818	LD	b3,  2 * SIZE(BO)
1819	MADD	c41, c41, a2, b4
1820	LD	b4,  3 * SIZE(BO)
1821
1822	bgtz	L, .L42
1823	LD	a2,  1 * SIZE(AO)	/* slot after the branch: reload a2 for the next pass or for .L45 */
1824	.align 3
1825
1826.L45:	/* Remainder of the unrolled loop: L = low two bits of the same count that .L42 used (KK or TEMP). */
1827#if defined(LT) || defined(RN)
1828	andi	L, KK,  3
1829#else
1830	andi	L, TEMP, 3
1831#endif
1832	NOP
1833	blez	L, .L48	/* nothing left over: go to the solve step */
1834	NOP
1835	.align	3
1836
1837.L46:	/* One step per pass: AO advances 1*SIZE and BO 4*SIZE. */
1838	MADD	c11, c11, a1, b1
1839	LD	b1,  4 * SIZE(BO)
1840	MADD	c21, c21, a1, b2
1841	LD	b2,  5 * SIZE(BO)
1842	MADD	c31, c31, a1, b3
1843	LD	b3,  6 * SIZE(BO)
1844	MADD	c41, c41, a1, b4
1845	LD	a1,  1 * SIZE(AO)
1846
1847	LD	b4,  7 * SIZE(BO)
1848	daddiu	L, L, -1
1849
1850	daddiu	AO, AO,  1 * SIZE
1851	MOV	a2, a2	/* source and destination are the same register */
1852	bgtz	L, .L46
1853	daddiu	BO, BO,  4 * SIZE	/* slot after the branch: executed on every pass */
1854
1855
1856.L48:	/* Solve and write-back for one row by four columns (accumulators c11, c21, c31, c41). */
1857#if defined(LN) || defined(RT)
1858#ifdef LN
1859	daddiu	TEMP, KK, -1
1860#else
1861	daddiu	TEMP, KK, -4
1862#endif
1863
1864	dsll	L,    TEMP, 0 + BASE_SHIFT
1865	dsll	TEMP, TEMP, 2 + BASE_SHIFT
1866	daddu	AO, AORIG, L	/* LN/RT: re-point AO and BO from AORIG and B using the adjusted KK */
1867	daddu	BO, B,     TEMP
1868#endif
1869
1870	/* NOTE(review): SUB, MUL and NMSUB are macros defined outside this chunk; destination-first operand order is assumed - confirm in common.h. */
1871#if defined(LN) || defined(LT)
1872	LD	b1,  0 * SIZE(BO)	/* LN/LT: the four values to combine with come from BO */
1873	LD	b2,  1 * SIZE(BO)
1874	LD	b3,  2 * SIZE(BO)
1875	LD	b4,  3 * SIZE(BO)
1876
1877	SUB	c11, b1, c11
1878	SUB	c21, b2, c21
1879	SUB	c31, b3, c31
1880	SUB	c41, b4, c41
1881#else
1882	LD	b1,  0 * SIZE(AO)	/* RN/RT: the four values come from AO instead */
1883	LD	b2,  1 * SIZE(AO)
1884	LD	b3,  2 * SIZE(AO)
1885	LD	b4,  3 * SIZE(AO)
1886
1887	SUB	c11, b1, c11
1888	SUB	c21, b2, c21
1889	SUB	c31, b3, c31
1890	SUB	c41, b4, c41
1891#endif
1892
1893#if defined(LN) || defined(LT)
1894	LD	b1,  0 * SIZE(AO)	/* LN/LT: a single factor from AO scales all four results */
1895
1896	MUL	c11, b1, c11
1897	MUL	c21, b1, c21
1898	MUL	c31, b1, c31
1899	MUL	c41, b1, c41
1900#endif
1901
1902#ifdef RN
1903	LD	b1,  0 * SIZE(BO)	/* RN: walks BO offsets 0-3, 5-7, 10-11, 15 in increasing order, resolving c11 first */
1904	LD	b2,  1 * SIZE(BO)
1905	LD	b3,  2 * SIZE(BO)
1906	LD	b4,  3 * SIZE(BO)
1907
1908	MUL	c11, b1, c11
1909
1910	NMSUB	c21, c21, b2, c11
1911	NMSUB	c31, c31, b3, c11
1912	NMSUB	c41, c41, b4, c11
1913
1914	LD	b2,  5 * SIZE(BO)
1915	LD	b3,  6 * SIZE(BO)
1916	LD	b4,  7 * SIZE(BO)
1917
1918	MUL	c21, b2, c21
1919
1920	NMSUB	c31, c31, b3, c21
1921	NMSUB	c41, c41, b4, c21
1922
1923	LD	b3, 10 * SIZE(BO)
1924	LD	b4, 11 * SIZE(BO)
1925
1926	MUL	c31, b3, c31
1927
1928	NMSUB	c41, c41, b4, c31
1929
1930	LD	b4, 15 * SIZE(BO)
1931
1932	MUL	c41, b4, c41
1933#endif
1934
1935#ifdef RT
1936	LD	b5, 15 * SIZE(BO)	/* RT: walks BO offsets 15-12, 10-8, 5-4, 0 in decreasing order, resolving c41 first */
1937	LD	b6, 14 * SIZE(BO)
1938	LD	b7, 13 * SIZE(BO)
1939	LD	b8, 12 * SIZE(BO)
1940
1941	MUL	c41, b5, c41
1942
1943	NMSUB	c31, c31, b6, c41
1944	NMSUB	c21, c21, b7, c41
1945	NMSUB	c11, c11, b8, c41
1946
1947	LD	b6, 10 * SIZE(BO)
1948	LD	b7,  9 * SIZE(BO)
1949	LD	b8,  8 * SIZE(BO)
1950
1951	MUL	c31, b6, c31
1952
1953	NMSUB	c21, c21, b7, c31
1954	NMSUB	c11, c11, b8, c31
1955
1956	LD	b7,  5 * SIZE(BO)
1957	LD	b8,  4 * SIZE(BO)
1958
1959	MUL	c21, b7, c21
1960
1961	NMSUB	c11, c11, b8, c21
1962
1963	LD	b8,  0 * SIZE(BO)
1964
1965	MUL	c11, b8, c11
1966#endif
1967
1968#ifdef LN
1969	daddiu	CO1, CO1, -1 * SIZE	/* LN: C pointers step back before the store, and are not advanced afterwards */
1970	daddiu	CO2, CO2, -1 * SIZE
1971	daddiu	CO3, CO3, -1 * SIZE
1972	daddiu	CO4, CO4, -1 * SIZE
1973#endif
1974
1975#if defined(LN) || defined(LT)
1976	ST	c11,  0 * SIZE(BO)	/* results are stored back over the buffer they were combined with (BO here, AO below) */
1977	ST	c21,  1 * SIZE(BO)
1978	ST	c31,  2 * SIZE(BO)
1979	ST	c41,  3 * SIZE(BO)
1980#else
1981	ST	c11,  0 * SIZE(AO)
1982	ST	c21,  1 * SIZE(AO)
1983	ST	c31,  2 * SIZE(AO)
1984	ST	c41,  3 * SIZE(AO)
1985#endif
1986
1987	ST	c11,  0 * SIZE(CO1)	/* and also into the four C columns */
1988	ST	c21,  0 * SIZE(CO2)
1989	ST	c31,  0 * SIZE(CO3)
1990	ST	c41,  0 * SIZE(CO4)
1991
1992	MTC	$0,  c11	/* re-clear accumulators for the code at .L40 and .L31 */
1993
1994#ifndef LN
1995	daddiu	CO1, CO1, 1 * SIZE
1996	daddiu	CO2, CO2, 1 * SIZE
1997	daddiu	CO3, CO3, 1 * SIZE
1998	daddiu	CO4, CO4, 1 * SIZE
1999#endif
2000
2001	MOV	c21, c11
2002
2003#ifdef RT
2004	dsll	TEMP, K, BASE_SHIFT
2005	daddu	AORIG, AORIG, TEMP	/* RT: AORIG += K << BASE_SHIFT */
2006#endif
2007
2008#if defined(LT) || defined(RN)
2009	dsubu	TEMP, K, KK
2010	dsll	L,    TEMP, 0 + BASE_SHIFT
2011	dsll	TEMP, TEMP, 2 + BASE_SHIFT
2012	daddu	AO, AO, L	/* LT/RN: advance AO and BO by the remaining (K - KK) steps */
2013	daddu	BO, BO, TEMP
2014#endif
2015
2016	MOV	c31, c11
2017
2018#ifdef LT
2019	daddiu	KK, KK, 1
2020#endif
2021
2022#ifdef LN
2023	daddiu	KK, KK, -1
2024#endif
2025	.align 3
2026
2027.L40:	/* Two-rows-at-a-time loop for the four-column panel: I = M >> 1 passes, each starting at .L31. */
2028	dsra	I,  M, 1
2029	MOV	c61, c11
2030	blez	I, .L49
2031	MOV	c41, c11	/* slot after the branch */
2032
2033.L31:	/* Pass prologue: preload a1, a3, b1..b7, clear c12..c42 and compute the unrolled trip count L. */
2034#if defined(LT) || defined(RN)
2035	LD	a1,  0 * SIZE(AO)
2036	LD	a3,  4 * SIZE(AO)
2037
2038	LD	b1,  0 * SIZE(B)
2039	MOV	c12, c11
2040	LD	b2,  1 * SIZE(B)
2041	MOV	c22, c11
2042	LD	b3,  2 * SIZE(B)
2043	MOV	c32, c11
2044	LD	b4,  3 * SIZE(B)
2045	MOV	c42, c11
2046
2047	LD	b5,  4 * SIZE(B)
2048	dsra	L,  KK, 2	/* L = KK >> 2 */
2049	LD	b6,  8 * SIZE(B)
2050	LD	b7, 12 * SIZE(B)
2051
2052	blez	L, .L35
2053	move	BO,  B	/* slot after the branch: BO = B on both paths */
2054#else
2055#ifdef LN
2056	dsll	TEMP,   K,  1 + BASE_SHIFT
2057	dsubu	AORIG, AORIG, TEMP	/* LN: AORIG -= K << (1 + BASE_SHIFT) */
2058#endif
2059
2060	dsll	L,    KK, 1 + BASE_SHIFT
2061	dsll	TEMP, KK, 2 + BASE_SHIFT
2062
2063	daddu	AO, AORIG, L	/* AO = AORIG + (KK << (1 + BASE_SHIFT)) */
2064	daddu	BO, B,     TEMP	/* BO = B + (KK << (2 + BASE_SHIFT)) */
2065
2066	dsubu	TEMP, K, KK	/* TEMP = K - KK; .L35 reads its low two bits on this path */
2067
2068	LD	a1,  0 * SIZE(AO)
2069	LD	a3,  4 * SIZE(AO)
2070
2071	LD	b1,  0 * SIZE(BO)
2072	MOV	c12, c11
2073	LD	b2,  1 * SIZE(BO)
2074	MOV	c22, c11
2075	LD	b3,  2 * SIZE(BO)
2076	MOV	c32, c11
2077	LD	b4,  3 * SIZE(BO)
2078	MOV	c42, c11
2079
2080	LD	b5,  4 * SIZE(BO)
2081	dsra	L,  TEMP, 2	/* L = (K - KK) >> 2 */
2082	LD	b6,  8 * SIZE(BO)
2083	LD	b7, 12 * SIZE(BO)
2084
2085	blez	L, .L35
2086	NOP
2087#endif
2088	.align	3
2089
2090.L32:	/* 4-way unrolled accumulate loop for two rows and four columns: per pass AO advances 8*SIZE and BO 16*SIZE. */
2091	MADD	c11, c11, a1, b1
2092	LD	a2,  1 * SIZE(AO)
2093	MADD	c21, c21, a1, b2
2094	daddiu	L, L, -1
2095	MADD	c31, c31, a1, b3
2096	NOP
2097	MADD	c41, c41, a1, b4
2098	LD	a1,  2 * SIZE(AO)
2099
2100	MADD	c12, c12, a2, b1	/* c1x accumulate with a1/a3, c2x-suffixed (cN2) with a2 */
2101	LD	b1, 16 * SIZE(BO)
2102	MADD	c22, c22, a2, b2
2103	LD	b2,  5 * SIZE(BO)
2104	MADD	c32, c32, a2, b3
2105	LD	b3,  6 * SIZE(BO)
2106	MADD	c42, c42, a2, b4
2107	LD	b4,  7 * SIZE(BO)
2108
2109	MADD	c11, c11, a1, b5
2110	LD	a2,  3 * SIZE(AO)
2111	MADD	c21, c21, a1, b2
2112	NOP
2113	MADD	c31, c31, a1, b3
2114	NOP
2115	MADD	c41, c41, a1, b4
2116	LD	a1,  8 * SIZE(AO)	/* a1 for the next pass */
2117
2118	MADD	c12, c12, a2, b5
2119	LD	b5, 20 * SIZE(BO)
2120	MADD	c22, c22, a2, b2
2121	LD	b2,  9 * SIZE(BO)
2122	MADD	c32, c32, a2, b3
2123	LD	b3, 10 * SIZE(BO)
2124	MADD	c42, c42, a2, b4
2125	LD	b4, 11 * SIZE(BO)
2126
2127	MADD	c11, c11, a3, b6
2128	LD	a2,  5 * SIZE(AO)
2129	MADD	c21, c21, a3, b2
2130	NOP
2131	MADD	c31, c31, a3, b3
2132	NOP
2133	MADD	c41, c41, a3, b4
2134	LD	a3,  6 * SIZE(AO)
2135
2136	MADD	c12, c12, a2, b6
2137	LD	b6, 24 * SIZE(BO)
2138	MADD	c22, c22, a2, b2
2139	LD	b2, 13 * SIZE(BO)
2140	MADD	c32, c32, a2, b3
2141	LD	b3, 14 * SIZE(BO)
2142	MADD	c42, c42, a2, b4
2143	LD	b4, 15 * SIZE(BO)
2144
2145	MADD	c11, c11, a3, b7
2146	LD	a2,  7 * SIZE(AO)
2147	MADD	c21, c21, a3, b2
2148	daddiu	AO, AO,  8 * SIZE
2149	MADD	c31, c31, a3, b3
2150	daddiu	BO, BO, 16 * SIZE
2151	MADD	c41, c41, a3, b4
2152	LD	a3,  4 * SIZE(AO)	/* offsets from here on are relative to the advanced AO and BO */
2153
2154	MADD	c12, c12, a2, b7
2155	LD	b7, 12 * SIZE(BO)
2156	MADD	c22, c22, a2, b2
2157	LD	b2,  1 * SIZE(BO)
2158	MADD	c32, c32, a2, b3
2159	LD	b3,  2 * SIZE(BO)
2160	MADD	c42, c42, a2, b4
2161	NOP
2162
2163	bgtz	L, .L32
2164	LD	b4,  3 * SIZE(BO)	/* slot after the branch: b4 reloaded on both paths */
2165	.align 3
2166
2167.L35:	/* Remainder of the unrolled loop: L = low two bits of the count used for .L32 (KK or TEMP). */
2168#if defined(LT) || defined(RN)
2169	andi	L, KK,  3
2170#else
2171	andi	L, TEMP, 3
2172#endif
2173	NOP
2174	blez	L, .L38	/* nothing left over: go to the solve step */
2175	NOP
2176	.align	3
2177
2178.L36:	/* One step per pass: AO advances 2*SIZE and BO 4*SIZE. */
2179	MADD	c11, c11, a1, b1
2180	LD	a2,  1 * SIZE(AO)
2181	MADD	c21, c21, a1, b2
2182	daddiu	L, L, -1
2183	MADD	c31, c31, a1, b3
2184	daddiu	AO, AO,  2 * SIZE
2185	MADD	c41, c41, a1, b4
2186	LD	a1,  0 * SIZE(AO)	/* AO already advanced, so offset 0 is the next step */
2187
2188	MADD	c12, c12, a2, b1
2189	LD	b1,  4 * SIZE(BO)
2190	MADD	c22, c22, a2, b2
2191	LD	b2,  5 * SIZE(BO)
2192	MADD	c32, c32, a2, b3
2193	LD	b3,  6 * SIZE(BO)
2194	MADD	c42, c42, a2, b4
2195	LD	b4,  7 * SIZE(BO)
2196
2197	bgtz	L, .L36
2198	daddiu	BO, BO,  4 * SIZE	/* slot after the branch: executed on every pass */
2199
2200.L38:	/* Solve and write-back for two rows by four columns (accumulators c11..c41 and c12..c42). */
2201#if defined(LN) || defined(RT)
2202#ifdef LN
2203	daddiu	TEMP, KK, -2
2204#else
2205	daddiu	TEMP, KK, -4
2206#endif
2207
2208	dsll	L,    TEMP, 1 + BASE_SHIFT
2209	dsll	TEMP, TEMP, 2 + BASE_SHIFT
2210	daddu	AO, AORIG, L	/* LN/RT: re-point AO and BO from AORIG and B using the adjusted KK */
2211	daddu	BO, B,     TEMP
2212#endif
2213
2214	/* NOTE(review): SUB, MUL and NMSUB are macros defined outside this chunk; destination-first operand order is assumed - confirm in common.h. */
2215#if defined(LN) || defined(LT)
2216	LD	b1,  0 * SIZE(BO)	/* LN/LT: eight values from BO, ordered c11,c21,c31,c41 then c12,c22,c32,c42 */
2217	LD	b2,  1 * SIZE(BO)
2218	LD	b3,  2 * SIZE(BO)
2219	LD	b4,  3 * SIZE(BO)
2220	LD	b5,  4 * SIZE(BO)
2221	LD	b6,  5 * SIZE(BO)
2222	LD	b7,  6 * SIZE(BO)
2223	LD	b8,  7 * SIZE(BO)
2224
2225	SUB	c11, b1, c11
2226	SUB	c21, b2, c21
2227	SUB	c31, b3, c31
2228	SUB	c41, b4, c41
2229	SUB	c12, b5, c12
2230	SUB	c22, b6, c22
2231	SUB	c32, b7, c32
2232	SUB	c42, b8, c42
2233#else
2234	LD	b1,  0 * SIZE(AO)	/* RN/RT: eight values from AO, ordered c11,c12,c21,c22,... (note the different interleave) */
2235	LD	b2,  1 * SIZE(AO)
2236	LD	b3,  2 * SIZE(AO)
2237	LD	b4,  3 * SIZE(AO)
2238	LD	b5,  4 * SIZE(AO)
2239	LD	b6,  5 * SIZE(AO)
2240	LD	b7,  6 * SIZE(AO)
2241	LD	b8,  7 * SIZE(AO)
2242
2243	SUB	c11, b1, c11
2244	SUB	c12, b2, c12
2245	SUB	c21, b3, c21
2246	SUB	c22, b4, c22
2247	SUB	c31, b5, c31
2248	SUB	c32, b6, c32
2249	SUB	c41, b7, c41
2250	SUB	c42, b8, c42
2251#endif
2252
2253#ifdef LN
2254	LD	b1,  3 * SIZE(AO)	/* LN: uses AO offsets 3, 2, 0 and resolves the cN2 row before the cN1 row */
2255	LD	b2,  2 * SIZE(AO)
2256	LD	b3,  0 * SIZE(AO)
2257
2258	MUL	c12, b1, c12
2259	MUL	c22, b1, c22
2260	MUL	c32, b1, c32
2261	MUL	c42, b1, c42
2262
2263	NMSUB	c11, c11, b2, c12
2264	NMSUB	c21, c21, b2, c22
2265	NMSUB	c31, c31, b2, c32
2266	NMSUB	c41, c41, b2, c42
2267
2268	MUL	c11, b3, c11
2269	MUL	c21, b3, c21
2270	MUL	c31, b3, c31
2271	MUL	c41, b3, c41
2272#endif
2273
2274#ifdef LT
2275	LD	b1,  0 * SIZE(AO)	/* LT: uses AO offsets 0, 1, 3 and resolves the cN1 row before the cN2 row */
2276	LD	b2,  1 * SIZE(AO)
2277	LD	b3,  3 * SIZE(AO)
2278
2279	MUL	c11, b1, c11
2280	MUL	c21, b1, c21
2281	MUL	c31, b1, c31
2282	MUL	c41, b1, c41
2283
2284	NMSUB	c12, c12, b2, c11
2285	NMSUB	c22, c22, b2, c21
2286	NMSUB	c32, c32, b2, c31
2287	NMSUB	c42, c42, b2, c41
2288
2289	MUL	c12, b3, c12
2290	MUL	c22, b3, c22
2291	MUL	c32, b3, c32
2292	MUL	c42, b3, c42
2293#endif
2294
2295#ifdef RN
2296	LD	b1,  0 * SIZE(BO)	/* RN: walks BO offsets 0-3, 5-7, 10-11, 15, resolving column 1 first */
2297	LD	b2,  1 * SIZE(BO)
2298	LD	b3,  2 * SIZE(BO)
2299	LD	b4,  3 * SIZE(BO)
2300
2301	MUL	c11, b1, c11
2302	MUL	c12, b1, c12
2303
2304	NMSUB	c21, c21, b2, c11
2305	NMSUB	c22, c22, b2, c12
2306	NMSUB	c31, c31, b3, c11
2307	NMSUB	c32, c32, b3, c12
2308	NMSUB	c41, c41, b4, c11
2309	NMSUB	c42, c42, b4, c12
2310
2311	LD	b2,  5 * SIZE(BO)
2312	LD	b3,  6 * SIZE(BO)
2313	LD	b4,  7 * SIZE(BO)
2314
2315	MUL	c21, b2, c21
2316	MUL	c22, b2, c22
2317
2318	NMSUB	c31, c31, b3, c21
2319	NMSUB	c32, c32, b3, c22
2320	NMSUB	c41, c41, b4, c21
2321	NMSUB	c42, c42, b4, c22
2322
2323	LD	b3, 10 * SIZE(BO)
2324	LD	b4, 11 * SIZE(BO)
2325
2326	MUL	c31, b3, c31
2327	MUL	c32, b3, c32
2328
2329	NMSUB	c41, c41, b4, c31
2330	NMSUB	c42, c42, b4, c32
2331
2332	LD	b4, 15 * SIZE(BO)
2333
2334	MUL	c41, b4, c41
2335	MUL	c42, b4, c42
2336#endif
2337
2338#ifdef RT
2339	LD	b5, 15 * SIZE(BO)	/* RT: walks BO offsets 15-12, 10-8, 5-4, 0, resolving column 4 first */
2340	LD	b6, 14 * SIZE(BO)
2341	LD	b7, 13 * SIZE(BO)
2342	LD	b8, 12 * SIZE(BO)
2343
2344	MUL	c41, b5, c41
2345	MUL	c42, b5, c42
2346
2347	NMSUB	c31, c31, b6, c41
2348	NMSUB	c32, c32, b6, c42
2349	NMSUB	c21, c21, b7, c41
2350	NMSUB	c22, c22, b7, c42
2351	NMSUB	c11, c11, b8, c41
2352	NMSUB	c12, c12, b8, c42
2353
2354	LD	b6, 10 * SIZE(BO)
2355	LD	b7,  9 * SIZE(BO)
2356	LD	b8,  8 * SIZE(BO)
2357
2358	MUL	c31, b6, c31
2359	MUL	c32, b6, c32
2360
2361	NMSUB	c21, c21, b7, c31
2362	NMSUB	c22, c22, b7, c32
2363	NMSUB	c11, c11, b8, c31
2364	NMSUB	c12, c12, b8, c32
2365
2366	LD	b7,  5 * SIZE(BO)
2367	LD	b8,  4 * SIZE(BO)
2368
2369	MUL	c21, b7, c21
2370	MUL	c22, b7, c22
2371
2372	NMSUB	c11, c11, b8, c21
2373	NMSUB	c12, c12, b8, c22
2374
2375	LD	b8,  0 * SIZE(BO)
2376
2377	MUL	c11, b8, c11
2378	MUL	c12, b8, c12
2379#endif
2380
2381#ifdef LN
2382	daddiu	CO1, CO1, -2 * SIZE	/* LN: C pointers step back before the store, and are not advanced afterwards */
2383	daddiu	CO2, CO2, -2 * SIZE
2384	daddiu	CO3, CO3, -2 * SIZE
2385	daddiu	CO4, CO4, -2 * SIZE
2386#endif
2387
2388#if defined(LN) || defined(LT)
2389	ST	c11,  0 * SIZE(BO)	/* results go back to the buffer they were combined with, in the same interleave as the loads above */
2390	ST	c21,  1 * SIZE(BO)
2391	ST	c31,  2 * SIZE(BO)
2392	ST	c41,  3 * SIZE(BO)
2393	ST	c12,  4 * SIZE(BO)
2394	ST	c22,  5 * SIZE(BO)
2395	ST	c32,  6 * SIZE(BO)
2396	ST	c42,  7 * SIZE(BO)
2397#else
2398	ST	c11,  0 * SIZE(AO)
2399	ST	c12,  1 * SIZE(AO)
2400	ST	c21,  2 * SIZE(AO)
2401	ST	c22,  3 * SIZE(AO)
2402	ST	c31,  4 * SIZE(AO)
2403	ST	c32,  5 * SIZE(AO)
2404	ST	c41,  6 * SIZE(AO)
2405	ST	c42,  7 * SIZE(AO)
2406#endif
2407
2408	ST	c11,  0 * SIZE(CO1)	/* and two consecutive elements into each of the four C columns */
2409	ST	c12,  1 * SIZE(CO1)
2410	ST	c21,  0 * SIZE(CO2)
2411	ST	c22,  1 * SIZE(CO2)
2412	ST	c31,  0 * SIZE(CO3)
2413	ST	c32,  1 * SIZE(CO3)
2414	ST	c41,  0 * SIZE(CO4)
2415	ST	c42,  1 * SIZE(CO4)
2416
2417#ifndef LN
2418	daddiu	CO1, CO1, 2 * SIZE
2419	daddiu	CO2, CO2, 2 * SIZE
2420	daddiu	CO3, CO3, 2 * SIZE
2421	daddiu	CO4, CO4, 2 * SIZE
2422#endif
2423
2424#ifdef RT
2425	dsll	TEMP, K, 1 + BASE_SHIFT
2426	daddu	AORIG, AORIG, TEMP	/* RT: AORIG += K << (1 + BASE_SHIFT) */
2427#endif
2428
2429#if defined(LT) || defined(RN)
2430	dsubu	TEMP, K, KK
2431	dsll	L,    TEMP, 1 + BASE_SHIFT
2432	dsll	TEMP, TEMP, 2 + BASE_SHIFT
2433	daddu	AO, AO, L	/* LT/RN: advance AO and BO by the remaining (K - KK) steps */
2434	daddu	BO, BO, TEMP
2435#endif
2436
2437#ifdef LT
2438	daddiu	KK, KK, 2
2439#endif
2440
2441#ifdef LN
2442	daddiu	KK, KK, -2
2443#endif
2444
2445	MTC	$0,  a1	/* a1 is borrowed as the source for re-clearing the accumulators */
2446
2447	MOV	c11, a1
2448	MOV	c21, a1
2449	MOV	c31, a1
2450
2451	daddiu	I, I, -1
2452
2453	bgtz	I, .L31	/* next pair of rows */
2454	MOV	c41, c11	/* slot after the branch */
2455	.align 3
2456
2457.L49:	/* End of the four-column panel: update B and KK for the active variant, then fall through to .L50. */
2458#ifdef LN
2459	dsll	TEMP, K, 2 + BASE_SHIFT
2460	daddu	B, B, TEMP	/* LN: B += K << (2 + BASE_SHIFT) */
2461#endif
2462
2463#if defined(LT) || defined(RN)
2464	move	B,  BO	/* LT/RN: B continues from the current BO */
2465#endif
2466
2467#ifdef RN
2468	daddiu	KK, KK,  4
2469#endif
2470
2471#ifdef RT
2472	daddiu	KK, KK, -4
2473#endif
2474	.align 3
2475
2476.L50:	/* Panel taken only when (N & 2) is non-zero; sets up two C column pointers CO1 and CO2. */
2477	andi	J,  N, 2
2478	blez	J, .L70	/* bit clear: go on to the (N & 1) panel */
2479	move	AO, A	/* explicit delay-slot instruction, as at .L30; previously the slot was whatever followed after preprocessing (dsll under RT, move AO otherwise) */
2480#ifdef RT
2481	dsll	TEMP, K, 1 + BASE_SHIFT
2482	dsubu	B, B, TEMP	/* RT: B -= K << (1 + BASE_SHIFT) */
2483
2484	dsll	TEMP, LDC, 1
2485	dsubu	C, C, TEMP	/* RT: C -= LDC << 1 */
2486#endif
2487
2488	/* AO = A is now done in the slot after the branch above. */
2489	move	CO1, C
2490	daddu	CO2, C,   LDC
2491
2492#ifdef LN
2493	daddu	KK, M, OFFSET	/* LN: KK starts at M + OFFSET */
2494#endif
2495
2496#ifdef LT
2497	move	KK, OFFSET	/* LT: KK starts at OFFSET */
2498#endif
2499
2500#if defined(LN) || defined(RT)
2501	move	AORIG, A	/* LN/RT keep the A base in AORIG; AO is derived from it later */
2502#else
2503	move	AO, A
2504#endif
2505#ifndef RT
2506	daddu	C,  CO2, LDC	/* non-RT: C moves on past the two columns */
2507#endif
2508
2509	andi	I,  M, 1
2510	blez	I, .L60	/* even M: no single leftover row, go to the 2-row loop */
2511	NOP
2512
2513#if defined(LT) || defined(RN)
2514	dsra	L,  KK, 2	/* L = KK >> 2: trip count of the unrolled loop .L62 */
2515	LD	a1,  0 * SIZE(AO)
2516	MTC	$0,  c11	/* clear the four accumulators used by .L62 */
2517	LD	a2,  1 * SIZE(AO)
2518	MOV	c21, c11
2519	LD	a3,  2 * SIZE(AO)
2520	MOV	c31, c11
2521	LD	a4,  3 * SIZE(AO)
2522	MOV	c41, c11
2523
2524	LD	b1,  0 * SIZE(B)
2525	LD	b2,  1 * SIZE(B)
2526	LD	b3,  2 * SIZE(B)
2527	LD	b4,  3 * SIZE(B)
2528	LD	b5,  4 * SIZE(B)
2529	LD	b6,  8 * SIZE(B)
2530	LD	b7, 12 * SIZE(B)
2531
2532	blez	L, .L65
2533	move	BO,  B	/* slot after the branch: BO = B on both paths */
2534#else
2535#ifdef LN
2536	dsll	TEMP,   K,  BASE_SHIFT
2537	dsubu	AORIG, AORIG, TEMP	/* LN: AORIG -= K << BASE_SHIFT */
2538#endif
2539
2540	dsll	L,    KK, 0 + BASE_SHIFT
2541	dsll	TEMP, KK, 1 + BASE_SHIFT
2542
2543	daddu	AO, AORIG, L	/* AO = AORIG + (KK << BASE_SHIFT) */
2544	daddu	BO, B,     TEMP	/* BO = B + (KK << (1 + BASE_SHIFT)) */
2545
2546	dsubu	TEMP, K, KK	/* TEMP = K - KK; .L65 reads its low two bits on this path */
2547
2548	dsra	L,  TEMP, 2	/* L = (K - KK) >> 2 */
2549	LD	a1,  0 * SIZE(AO)
2550	MTC	$0,  c11
2551	LD	a2,  1 * SIZE(AO)
2552	MOV	c21, c11
2553	LD	a3,  2 * SIZE(AO)
2554	MOV	c31, c11
2555	LD	a4,  3 * SIZE(AO)
2556	MOV	c41, c11
2557
2558	LD	b1,  0 * SIZE(BO)
2559	LD	b2,  1 * SIZE(BO)
2560	LD	b3,  2 * SIZE(BO)
2561	LD	b4,  3 * SIZE(BO)
2562	LD	b5,  4 * SIZE(BO)
2563	LD	b6,  8 * SIZE(BO)
2564	LD	b7, 12 * SIZE(BO)
2565
2566	blez	L, .L65
2567	NOP
2568#endif
2569	.align	3
2570
2571.L62:	/* 4-way unrolled accumulate loop for one row and two columns: per pass AO advances 4*SIZE and BO 8*SIZE. */
2572	MADD	c11, c11, a1, b1
2573	LD	b1,  4 * SIZE(BO)
2574	MADD	c21, c21, a1, b2
2575	LD	b2,  5 * SIZE(BO)
2576	MADD	c31, c31, a2, b3	/* c31/c41 are second partial sums; .L68 adds them into c11/c21 */
2577	LD	b3,  6 * SIZE(BO)
2578	MADD	c41, c41, a2, b4
2579	LD	b4,  7 * SIZE(BO)
2580
2581	LD	a1,  4 * SIZE(AO)	/* a1, a2 for the next pass */
2582	LD	a2,  5 * SIZE(AO)
2583
2584	MADD	c11, c11, a3, b1
2585	LD	b1,  8 * SIZE(BO)
2586	MADD	c21, c21, a3, b2
2587	LD	b2,  9 * SIZE(BO)
2588	MADD	c31, c31, a4, b3
2589	LD	b3, 10 * SIZE(BO)
2590	MADD	c41, c41, a4, b4
2591	LD	b4, 11 * SIZE(BO)
2592
2593	LD	a3,  6 * SIZE(AO)	/* a3, a4 for the next pass */
2594	LD	a4,  7 * SIZE(AO)
2595
2596	daddiu	L, L, -1
2597	daddiu	AO, AO,  4 * SIZE
2598
2599	bgtz	L, .L62
2600	daddiu	BO, BO,  8 * SIZE	/* slot after the branch: executed on every pass */
2601	.align 3
2602
2603.L65:	/* Remainder of the unrolled loop: L = low two bits of the count used for .L62 (KK or TEMP). */
2604#if defined(LT) || defined(RN)
2605	andi	L, KK,  3
2606#else
2607	andi	L, TEMP, 3
2608#endif
2609	NOP
2610	blez	L, .L68	/* nothing left over: go to the solve step */
2611	NOP
2612	.align	3
2613
2614.L66:	/* One step per pass: AO advances 1*SIZE and BO 2*SIZE; only c11 and c21 accumulate here. */
2615	MADD	c11, c11, a1, b1
2616	LD	b1,  2 * SIZE(BO)
2617	MADD	c21, c21, a1, b2
2618	LD	b2,  3 * SIZE(BO)
2619
2620	LD	a1,  1 * SIZE(AO)
2621	daddiu	L, L, -1
2622
2623	daddiu	AO, AO,  1 * SIZE
2624	bgtz	L, .L66
2625	daddiu	BO, BO,  2 * SIZE	/* slot after the branch: executed on every pass */
2626
2627
2628.L68:	/* Solve and write-back for one row by two columns. */
2629	ADD	c11, c11, c31	/* fold the second partial sums from .L62 into c11 and c21 */
2630	ADD	c21, c21, c41
2631
2632#if defined(LN) || defined(RT)
2633#ifdef LN
2634	daddiu	TEMP, KK, -1
2635#else
2636	daddiu	TEMP, KK, -2
2637#endif
2638
2639	dsll	L,    TEMP, 0 + BASE_SHIFT
2640	dsll	TEMP, TEMP, 1 + BASE_SHIFT
2641	daddu	AO, AORIG, L	/* LN/RT: re-point AO and BO from AORIG and B using the adjusted KK */
2642	daddu	BO, B,     TEMP
2643#endif
2644
2645	/* NOTE(review): SUB, MUL and NMSUB are macros defined outside this chunk; destination-first operand order is assumed - confirm in common.h. */
2646#if defined(LN) || defined(LT)
2647	LD	b1,  0 * SIZE(BO)
2648	LD	b2,  1 * SIZE(BO)
2649
2650	SUB	c11, b1, c11
2651	SUB	c21, b2, c21
2652#else
2653	LD	b1,  0 * SIZE(AO)
2654	LD	b2,  1 * SIZE(AO)
2655
2656	SUB	c11, b1, c11
2657	SUB	c21, b2, c21
2658#endif
2659
2660#if defined(LN) || defined(LT)
2661	LD	b3,  0 * SIZE(AO)	/* LN/LT: a single factor from AO scales both results */
2662
2663	MUL	c11, b3, c11
2664	MUL	c21, b3, c21
2665#endif
2666
2667#ifdef RN
2668	LD	b1,  0 * SIZE(BO)	/* RN: uses BO offsets 0, 1, 3 and resolves c11 before c21 */
2669	LD	b2,  1 * SIZE(BO)
2670	LD	b3,  3 * SIZE(BO)
2671
2672	MUL	c11, b1, c11
2673
2674	NMSUB	c21, c21, b2, c11
2675
2676	MUL	c21, b3, c21
2677#endif
2678
2679#ifdef RT
2680	LD	b1,  3 * SIZE(BO)	/* RT: uses BO offsets 3, 2, 0 and resolves c21 before c11 */
2681	LD	b2,  2 * SIZE(BO)
2682	LD	b3,  0 * SIZE(BO)
2683
2684	MUL	c21, b1, c21
2685
2686	NMSUB	c11, c11, b2, c21
2687
2688	MUL	c11, b3, c11
2689#endif
2690
2691#ifdef LN
2692	daddiu	CO1, CO1, -1 * SIZE	/* LN: C pointers step back before the store, and are not advanced afterwards */
2693	daddiu	CO2, CO2, -1 * SIZE
2694#endif
2695
2696#if defined(LN) || defined(LT)
2697	ST	c11,  0 * SIZE(BO)	/* results go back to the buffer they were combined with (BO here, AO below) */
2698	ST	c21,  1 * SIZE(BO)
2699#else
2700	ST	c11,  0 * SIZE(AO)
2701	ST	c21,  1 * SIZE(AO)
2702#endif
2703
2704	ST	c11,  0 * SIZE(CO1)	/* and also into the two C columns */
2705	ST	c21,  0 * SIZE(CO2)
2706
2707#ifndef LN
2708	daddiu	CO1, CO1, 1 * SIZE
2709	daddiu	CO2, CO2, 1 * SIZE
2710#endif
2711
2712#ifdef RT
2713	dsll	TEMP, K, 0 + BASE_SHIFT
2714	daddu	AORIG, AORIG, TEMP	/* RT: AORIG += K << BASE_SHIFT */
2715#endif
2716
2717#if defined(LT) || defined(RN)
2718	dsubu	TEMP, K, KK
2719	dsll	L,    TEMP, 0 + BASE_SHIFT
2720	dsll	TEMP, TEMP, 1 + BASE_SHIFT
2721	daddu	AO, AO, L	/* LT/RN: advance AO and BO by the remaining (K - KK) steps */
2722	daddu	BO, BO, TEMP
2723#endif
2724
2725#ifdef LT
2726	daddiu	KK, KK, 1
2727#endif
2728
2729#ifdef LN
2730	daddiu	KK, KK, -1
2731#endif
2732	.align 3
2733
2734.L60:	/* Two-rows-at-a-time loop for the two-column panel: I = M >> 1 passes, each starting at .L51. */
2735	dsra	I,  M, 1
2736	blez	I, .L69
2737	NOP
2738
2739.L51:	/* Pass prologue: preload a1, a2, a5 and b1..b7, clear c11, c21, c12, c22 and compute the unrolled trip count L. */
2740#if defined(LT) || defined(RN)
2741	LD	a1,  0 * SIZE(AO)
2742	MTC	$0,  c11
2743	LD	a2,  1 * SIZE(AO)
2744	MOV	c21, c11
2745	LD	a5,  4 * SIZE(AO)
2746
2747	LD	b1,  0 * SIZE(B)
2748	MOV	c12, c11
2749	LD	b2,  1 * SIZE(B)
2750	MOV	c22, c11
2751	LD	b3,  2 * SIZE(B)
2752	LD	b5,  4 * SIZE(B)
2753	dsra	L,  KK, 2	/* L = KK >> 2 */
2754	LD	b6,  8 * SIZE(B)
2755	LD	b7, 12 * SIZE(B)
2756
2757	blez	L, .L55
2758	move	BO,  B	/* slot after the branch: BO = B on both paths */
2759
2760#else
2761#ifdef LN
2762	dsll	TEMP,   K,  1 + BASE_SHIFT
2763	dsubu	AORIG, AORIG, TEMP	/* LN: AORIG -= K << (1 + BASE_SHIFT) */
2764#endif
2765
2766	dsll	L,    KK, 1 + BASE_SHIFT
2767	dsll	TEMP, KK, 1 + BASE_SHIFT	/* same shift for both: two rows of A and two columns of B per step */
2768
2769	daddu	AO, AORIG, L
2770	daddu	BO, B,     TEMP
2771
2772	dsubu	TEMP, K, KK	/* TEMP = K - KK; .L55 reads its low two bits on this path */
2773
2774	LD	a1,  0 * SIZE(AO)
2775	MTC	$0,  c11
2776	LD	a2,  1 * SIZE(AO)
2777	MOV	c21, c11
2778	LD	a5,  4 * SIZE(AO)
2779
2780	LD	b1,  0 * SIZE(BO)
2781	MOV	c12, c11
2782	LD	b2,  1 * SIZE(BO)
2783	MOV	c22, c11
2784	LD	b3,  2 * SIZE(BO)
2785	LD	b5,  4 * SIZE(BO)
2786	dsra	L,  TEMP, 2	/* L = (K - KK) >> 2 */
2787	LD	b6,  8 * SIZE(BO)
2788	LD	b7, 12 * SIZE(BO)
2789
2790	blez	L, .L55
2791	NOP
2792#endif
2793	.align	3
2794
2795.L52:	/* 4-way unrolled accumulate loop for two rows and two columns: per pass AO and BO each advance 8*SIZE. */
2796	MADD	c11, c11, a1, b1
2797	LD	a3,  2 * SIZE(AO)
2798	MADD	c21, c21, a1, b2
2799	LD	b4,  3 * SIZE(BO)
2800	MADD	c12, c12, a2, b1
2801	LD	a4,  3 * SIZE(AO)
2802	MADD	c22, c22, a2, b2
2803	LD	b1,  8 * SIZE(BO)	/* b1 for the next pass */
2804
2805	MADD	c11, c11, a3, b3
2806	LD	a1,  8 * SIZE(AO)	/* a1 for the next pass */
2807	MADD	c21, c21, a3, b4
2808	LD	b2,  5 * SIZE(BO)
2809	MADD	c12, c12, a4, b3
2810	LD	a2,  5 * SIZE(AO)
2811	MADD	c22, c22, a4, b4
2812	LD	b3,  6 * SIZE(BO)
2813
2814	MADD	c11, c11, a5, b5
2815	LD	a3,  6 * SIZE(AO)
2816	MADD	c21, c21, a5, b2
2817	LD	b4,  7 * SIZE(BO)
2818	MADD	c12, c12, a2, b5
2819	LD	a4,  7 * SIZE(AO)
2820	MADD	c22, c22, a2, b2
2821	LD	b5, 12 * SIZE(BO)	/* b5 for the next pass */
2822
2823	MADD	c11, c11, a3, b3
2824	LD	a5, 12 * SIZE(AO)	/* a5 for the next pass */
2825	MADD	c21, c21, a3, b4
2826	LD	b2,  9 * SIZE(BO)
2827	MADD	c12, c12, a4, b3
2828	LD	a2,  9 * SIZE(AO)
2829	MADD	c22, c22, a4, b4
2830	LD	b3, 10 * SIZE(BO)
2831
2832	daddiu	AO, AO,  8 * SIZE
2833	daddiu	L, L, -1
2834	bgtz	L, .L52
2835	daddiu	BO, BO,  8 * SIZE	/* slot after the branch: executed on every pass */
2836	.align 3
2837
2838.L55:	/* Remainder of the unrolled loop: L = low two bits of the count used for .L52 (KK or TEMP). */
2839#if defined(LT) || defined(RN)
2840	andi	L, KK,  3
2841#else
2842	andi	L, TEMP, 3
2843#endif
2844	NOP
2845	blez	L, .L58	/* nothing left over: go to the solve step */
2846	NOP
2847	.align	3
2848
2849.L56:	/* One step per pass: AO and BO each advance 2*SIZE. */
2850	MADD	c11, c11, a1, b1
2851	LD	a2,  1 * SIZE(AO)
2852	MADD	c21, c21, a1, b2
2853	LD	a1,  2 * SIZE(AO)
2854
2855	MADD	c12, c12, a2, b1
2856	LD	b1,  2 * SIZE(BO)
2857	MADD	c22, c22, a2, b2
2858	LD	b2,  3 * SIZE(BO)
2859
2860	daddiu	L, L, -1
2861	daddiu	AO, AO,  2 * SIZE
2862	bgtz	L, .L56
2863	daddiu	BO, BO,  2 * SIZE	/* slot after the branch: executed on every pass */
2864
2865.L58:	/* Solve and write-back for two rows by two columns (accumulators c11, c21, c12, c22). */
2866#if defined(LN) || defined(RT)
2867#ifdef LN
2868	daddiu	TEMP, KK, -2
2869#else
2870	daddiu	TEMP, KK, -2
2871#endif
2872	/* Both branches above subtract 2: rows per pass and columns per panel are equal here. */
2873	dsll	L,    TEMP, 1 + BASE_SHIFT
2874	dsll	TEMP, TEMP, 1 + BASE_SHIFT
2875	daddu	AO, AORIG, L	/* LN/RT: re-point AO and BO from AORIG and B using the adjusted KK */
2876	daddu	BO, B,     TEMP
2877#endif
2878
2879	/* NOTE(review): SUB, MUL and NMSUB are macros defined outside this chunk; destination-first operand order is assumed - confirm in common.h. */
2880#if defined(LN) || defined(LT)
2881	LD	b1,  0 * SIZE(BO)	/* LN/LT: four values from BO, ordered c11, c21, c12, c22 */
2882	LD	b2,  1 * SIZE(BO)
2883	LD	b3,  2 * SIZE(BO)
2884	LD	b4,  3 * SIZE(BO)
2885
2886	SUB	c11, b1, c11
2887	SUB	c21, b2, c21
2888	SUB	c12, b3, c12
2889	SUB	c22, b4, c22
2890#else
2891	LD	b1,  0 * SIZE(AO)	/* RN/RT: four values from AO, ordered c11, c12, c21, c22 */
2892	LD	b2,  1 * SIZE(AO)
2893	LD	b3,  2 * SIZE(AO)
2894	LD	b4,  3 * SIZE(AO)
2895
2896	SUB	c11, b1, c11
2897	SUB	c12, b2, c12
2898	SUB	c21, b3, c21
2899	SUB	c22, b4, c22
2900#endif
2901
2902#ifdef LN
2903	LD	b1,  3 * SIZE(AO)	/* LN: uses AO offsets 3, 2, 0 and resolves the cN2 row first */
2904	LD	b2,  2 * SIZE(AO)
2905	LD	b3,  0 * SIZE(AO)
2906
2907	MUL	c12, b1, c12
2908	MUL	c22, b1, c22
2909
2910	NMSUB	c11, c11, b2, c12
2911	NMSUB	c21, c21, b2, c22
2912
2913	MUL	c11, b3, c11
2914	MUL	c21, b3, c21
2915#endif
2916
2917#ifdef LT
2918	LD	b1,  0 * SIZE(AO)	/* LT: uses AO offsets 0, 1, 3 and resolves the cN1 row first */
2919	LD	b2,  1 * SIZE(AO)
2920	LD	b3,  3 * SIZE(AO)
2921
2922	MUL	c11, b1, c11
2923	MUL	c21, b1, c21
2924
2925	NMSUB	c12, c12, b2, c11
2926	NMSUB	c22, c22, b2, c21
2927
2928	MUL	c12, b3, c12
2929	MUL	c22, b3, c22
2930#endif
2931
2932#ifdef RN
2933	LD	b1,  0 * SIZE(BO)	/* RN: uses BO offsets 0, 1, 3 and resolves column 1 first */
2934	LD	b2,  1 * SIZE(BO)
2935	LD	b3,  3 * SIZE(BO)
2936
2937	MUL	c11, b1, c11
2938	MUL	c12, b1, c12
2939
2940	NMSUB	c21, c21, b2, c11
2941	NMSUB	c22, c22, b2, c12
2942
2943	MUL	c21, b3, c21
2944	MUL	c22, b3, c22
2945#endif
2946
2947#ifdef RT
2948	LD	b1,  3 * SIZE(BO)	/* RT: uses BO offsets 3, 2, 0 and resolves column 2 first */
2949	LD	b2,  2 * SIZE(BO)
2950	LD	b3,  0 * SIZE(BO)
2951
2952	MUL	c21, b1, c21
2953	MUL	c22, b1, c22
2954
2955	NMSUB	c11, c11, b2, c21
2956	NMSUB	c12, c12, b2, c22
2957
2958	MUL	c11, b3, c11
2959	MUL	c12, b3, c12
2960#endif
2961
2962#ifdef LN
2963	daddiu	CO1, CO1, -2 * SIZE	/* LN: C pointers step back before the store, and are not advanced afterwards */
2964	daddiu	CO2, CO2, -2 * SIZE
2965#endif
2966
2967#if defined(LN) || defined(LT)
2968	ST	c11,  0 * SIZE(BO)	/* results go back to the buffer they were combined with, in the same interleave as the loads above */
2969	ST	c21,  1 * SIZE(BO)
2970	ST	c12,  2 * SIZE(BO)
2971	ST	c22,  3 * SIZE(BO)
2972#else
2973	ST	c11,  0 * SIZE(AO)
2974	ST	c12,  1 * SIZE(AO)
2975	ST	c21,  2 * SIZE(AO)
2976	ST	c22,  3 * SIZE(AO)
2977#endif
2978
2979	ST	c11,  0 * SIZE(CO1)	/* and two consecutive elements into each of the two C columns */
2980	ST	c12,  1 * SIZE(CO1)
2981	ST	c21,  0 * SIZE(CO2)
2982	ST	c22,  1 * SIZE(CO2)
2983
2984#ifndef LN
2985	daddiu	CO1, CO1, 2 * SIZE
2986	daddiu	CO2, CO2, 2 * SIZE
2987#endif
2988
2989#ifdef RT
2990	dsll	TEMP, K, 1 + BASE_SHIFT
2991	daddu	AORIG, AORIG, TEMP	/* RT: AORIG += K << (1 + BASE_SHIFT) */
2992#endif
2993
2994#if defined(LT) || defined(RN)
2995	dsubu	TEMP, K, KK
2996	dsll	TEMP, TEMP, 1 + BASE_SHIFT
2997	daddu	AO, AO, TEMP	/* LT/RN: AO and BO advance by the same amount, (K - KK) << (1 + BASE_SHIFT) */
2998	daddu	BO, BO, TEMP
2999#endif
3000
3001#ifdef LT
3002	daddiu	KK, KK, 2
3003#endif
3004
3005#ifdef LN
3006	daddiu	KK, KK, -2
3007#endif
3008
3009	MTC	$0,  a1	/* a1 is borrowed as the source for the clears below */
3010
3011	MOV	c11, a1
3012	MOV	c21, a1
3013	MOV	c31, a1
3014
3015	daddiu	I, I, -1
3016
3017	bgtz	I, .L51	/* next pair of rows */
3018	MOV	c41, c11	/* slot after the branch */
3019	.align 3
3020
3021.L69:	/* End of the two-column panel: update B and KK for the active variant, then fall through to .L70. */
3022#ifdef LN
3023	dsll	TEMP, K, 1 + BASE_SHIFT
3024	daddu	B, B, TEMP	/* LN: B += K << (1 + BASE_SHIFT) */
3025#endif
3026
3027#if defined(LT) || defined(RN)
3028	move	B,  BO	/* LT/RN: B continues from the current BO */
3029#endif
3030
3031#ifdef RN
3032	daddiu	KK, KK,  2
3033#endif
3034
3035#ifdef RT
3036	daddiu	KK, KK, -2
3037#endif
3038	.align 3
3039
3040.L70:	/* Panel taken only when (N & 1) is non-zero; a single C column pointer CO1 is used. */
3041	andi	J,  N, 1
3042	blez	J, .L999	/* bit clear: nothing left, leave via .L999 */
3043	NOP
3044
3045#ifdef RT
3046	dsll	TEMP, K, BASE_SHIFT
3047	dsubu	B, B, TEMP	/* RT: B -= K << BASE_SHIFT */
3048
3049	dsubu	C, C, LDC	/* RT: C -= LDC */
3050#endif
3051
3052	move	AO, A
3053	move	CO1, C
3054
3055#ifdef LN
3056	daddu	KK, M, OFFSET	/* LN: KK starts at M + OFFSET */
3057#endif
3058
3059#ifdef LT
3060	move	KK, OFFSET	/* LT: KK starts at OFFSET */
3061#endif
3062
3063#if defined(LN) || defined(RT)
3064	move	AORIG, A	/* LN/RT keep the A base in AORIG; AO is derived from it later */
3065#else
3066	move	AO, A
3067#endif
3068#ifndef RT
3069	daddu	C,  CO1, LDC	/* non-RT: C moves on past the column */
3070#endif
3071
3072	andi	I,  M, 1
3073	blez	I, .L80	/* even M: no single leftover row, go to the 2-row loop */
3074	NOP
3075
3076#if defined(LT) || defined(RN)
3077	LD	a1,  0 * SIZE(AO)	/* NOTE(review): .L82 and .L86 reload a1 and b1 themselves before use, so these preloads look redundant - confirm */
3078	MTC	$0,  c11
3079	LD	a2,  1 * SIZE(AO)
3080	MOV	c21, c11
3081	LD	a3,  2 * SIZE(AO)
3082	LD	a4,  3 * SIZE(AO)
3083
3084	LD	b1,  0 * SIZE(B)
3085	LD	b2,  1 * SIZE(B)
3086	LD	b3,  2 * SIZE(B)
3087	LD	b4,  3 * SIZE(B)
3088	LD	b5,  4 * SIZE(B)
3089	LD	b6,  8 * SIZE(B)
3090	LD	b7, 12 * SIZE(B)
3091
3092	dsra	L,  KK, 2	/* L = KK >> 2: trip count of the unrolled loop .L82 */
3093	blez	L, .L85
3094	move	BO,  B	/* slot after the branch: BO = B on both paths */
3095#else
3096#ifdef LN
3097	dsll	TEMP,   K,  BASE_SHIFT
3098	dsubu	AORIG, AORIG, TEMP	/* LN: AORIG -= K << BASE_SHIFT */
3099#endif
3100
3101	dsll	TEMP, KK, BASE_SHIFT
3102
3103	daddu	AO, AORIG, TEMP	/* AO and BO use the same offset, KK << BASE_SHIFT */
3104	daddu	BO, B,     TEMP
3105
3106	dsubu	TEMP, K, KK	/* TEMP = K - KK; .L85 reads its low two bits on this path */
3107
3108	LD	a1,  0 * SIZE(AO)
3109	MTC	$0,  c11
3110	LD	a2,  1 * SIZE(AO)
3111	MOV	c21, c11
3112	LD	a3,  2 * SIZE(AO)
3113	LD	a4,  3 * SIZE(AO)
3114
3115	LD	b1,  0 * SIZE(BO)
3116	LD	b2,  1 * SIZE(BO)
3117	LD	b3,  2 * SIZE(BO)
3118	LD	b4,  3 * SIZE(BO)
3119	LD	b5,  4 * SIZE(BO)
3120	LD	b6,  8 * SIZE(BO)
3121	LD	b7, 12 * SIZE(BO)
3122
3123	dsra	L,  TEMP, 2	/* L = (K - KK) >> 2 */
3124	blez	L, .L85
3125	NOP
3126#endif
3127	.align	3
3128
3129.L82:	/* 4-way unrolled accumulate loop for one row and one column: per pass AO and BO each advance 4*SIZE. */
3130	LD	a1,  0 * SIZE(AO)	/* each step loads its own operands just before the multiply-add */
3131	LD	b1,  0 * SIZE(BO)
3132
3133	MADD	c11, c11, a1, b1
3134
3135	LD	a1,  1 * SIZE(AO)
3136	LD	b1,  1 * SIZE(BO)
3137
3138	MADD	c21, c21, a1, b1	/* odd steps go to c21; .L88 adds it into c11 */
3139
3140	LD	a1,  2 * SIZE(AO)
3141	LD	b1,  2 * SIZE(BO)
3142
3143	MADD	c11, c11, a1, b1
3144
3145	LD	a1,  3 * SIZE(AO)
3146	LD	b1,  3 * SIZE(BO)
3147
3148	MADD	c21, c21, a1, b1
3149
3150	daddiu	L, L, -1
3151	daddiu	AO, AO,  4 * SIZE
3152	bgtz	L, .L82
3153	daddiu	BO, BO,  4 * SIZE	/* slot after the branch: executed on every pass */
3154	.align 3
3155
3156.L85:	/* Remainder of the unrolled loop: L = low two bits of the count used for .L82 (KK or TEMP). */
3157#if defined(LT) || defined(RN)
3158	andi	L, KK,  3
3159#else
3160	andi	L, TEMP, 3
3161#endif
3162	NOP
3163	blez	L, .L88	/* nothing left over: go to the solve step */
3164	NOP
3165	.align	3
3166
3167.L86:	/* One step per pass: AO and BO each advance 1*SIZE; only c11 accumulates here. */
3168	LD	a1,  0 * SIZE(AO)
3169	LD	b1,  0 * SIZE(BO)
3170
3171	MADD	c11, c11, a1, b1
3172
3173	daddiu	L, L, -1
3174	daddiu	AO, AO,  1 * SIZE
3175	bgtz	L, .L86
3176	daddiu	BO, BO,  1 * SIZE	/* slot after the branch: executed on every pass */
3177
3178
/*
 * .L88: finish the single-row block - combine the two partial sums, apply
 * the update for this element, store it, and advance the pointers/counters.
 * ADD/SUB/MUL/LD/ST are macros defined outside this view; the comments below
 * assume the three-operand form (dst, src1, src2).
 */
3179.L88:
	/* c11 += c21. */
3180	ADD	c11, c11, c21
3181
3182#if defined(LN) || defined(RT)
	/* Both arms of the inner #ifdef are identical here: TEMP = KK - 1. */
3183#ifdef LN
3184	daddiu	TEMP, KK, -1
3185#else
3186	daddiu	TEMP, KK, -1
3187#endif
3188
	/* AO = AORIG + (TEMP << BASE_SHIFT), BO = B + (TEMP << BASE_SHIFT). */
3189	dsll	TEMP, TEMP, 0 + BASE_SHIFT
3190	daddu	AO, AORIG, TEMP
3191	daddu	BO, B,     TEMP
3192#endif
3193
3194
	/* c11 = loaded value - c11; the value is read from BO for LN/LT and
	   from AO for the other variants. */
3195#if defined(LN) || defined(LT)
3196	LD	b1,  0 * SIZE(BO)
3197
3198	SUB	c11, b1, c11
3199#else
3200	LD	b1,  0 * SIZE(AO)
3201
3202	SUB	c11, b1, c11
3203#endif
3204
	/* Scale c11 by the element at offset 0 of AO (LN/LT) or BO (RN/RT).
	   NOTE(review): a multiply is used where a triangular solve would
	   divide, so that element is presumably stored pre-inverted - confirm
	   against the packing code. */
3205#if defined(LN) || defined(LT)
3206	LD	b1,  0 * SIZE(AO)
3207
3208	MUL	c11, b1, c11
3209#endif
3210
3211#if defined(RN) || defined(RT)
3212	LD	b1,  0 * SIZE(BO)
3213
3214	MUL	c11, b1, c11
3215#endif
3216
3217#ifdef LN
	/* LN moves CO1 back by one element before the store. */
3218	daddiu	CO1, CO1, -1 * SIZE
3219#endif
3220
	/* Write the result back to the buffer it was read from (BO or AO) ... */
3221#if defined(LN) || defined(LT)
3222	ST	c11,  0 * SIZE(BO)
3223#else
3224	ST	c11,  0 * SIZE(AO)
3225#endif
3226
	/* ... and to the output location addressed by CO1. */
3227	ST	c11,  0 * SIZE(CO1)
3228
3229#ifndef LN
	/* All variants except LN advance CO1 by one element after the store. */
3230	daddiu	CO1, CO1, 1 * SIZE
3231#endif
3232
3233#ifdef RT
	/* RT: AORIG += K << BASE_SHIFT. */
3234	dsll	TEMP, K, BASE_SHIFT
3235	daddu	AORIG, AORIG, TEMP
3236#endif
3237
3238#if defined(LT) || defined(RN)
	/* LT/RN: advance AO and BO by (K - KK) << BASE_SHIFT. */
3239	dsubu	TEMP, K, KK
3240	dsll	TEMP, TEMP, 0 + BASE_SHIFT
3241	daddu	AO, AO, TEMP
3242	daddu	BO, BO, TEMP
3243#endif
3244
	/* KK moves by one for the row just processed: +1 for LT, -1 for LN. */
3245#ifdef LT
3246	daddiu	KK, KK, 1
3247#endif
3248
3249#ifdef LN
3250	daddiu	KK, KK, -1
3251#endif
3252	.align 3
3253
/* .L80: I = M >> 1, the number of two-row blocks; go to .L89 when there are
   none. */
3254.L80:
3255	dsra	I,  M, 1
3256	blez	I, .L89
3257	NOP
3258
/*
 * .L71: start of one two-row block (repeated I times): initialise the
 * accumulators c11/c21/c12/c22, preload operands and compute the unrolled
 * trip count L.
 * LD/MTC/MOV are macros defined outside this view; MTC from $0 followed by
 * the MOVs presumably leaves all four accumulators at zero.
 * NOTE(review): the preloaded a1/a2/b1/b2/b3 are reloaded before their next
 * visible use, and a5/b5/b6/b7 are not read in the visible code - confirm
 * before removing or reordering any of these loads.
 */
3259.L71:
3260#if defined(LT) || defined(RN)
3261	LD	a1,  0 * SIZE(AO)
3262	MTC	$0,  c11
3263	LD	a2,  1 * SIZE(AO)
3264	MOV	c21, c11
3265	LD	a5,  4 * SIZE(AO)
3266
3267	LD	b1,  0 * SIZE(B)
3268	MOV	c12, c11
3269	LD	b2,  1 * SIZE(B)
3270	MOV	c22, c11
3271	LD	b3,  2 * SIZE(B)
3272	LD	b5,  4 * SIZE(B)
	/* L = KK >> 2. */
3273	dsra	L,  KK, 2
3274	LD	b6,  8 * SIZE(B)
3275	LD	b7, 12 * SIZE(B)
3276
	/* BO = B follows the branch (its delay slot, assuming the noreorder
	   mode is set outside this view), so BO is set on both paths. */
3277	blez	L, .L75
3278	move	BO,  B
3279#else
3280#ifdef LN
	/* LN: step AORIG back by K << (1 + BASE_SHIFT). */
3281	dsll	TEMP,   K,  1 + BASE_SHIFT
3282	dsubu	AORIG, AORIG, TEMP
3283#endif
3284
	/* AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT). */
3285	dsll	L,    KK, 1 + BASE_SHIFT
3286	dsll	TEMP, KK, 0 + BASE_SHIFT
3287
3288	daddu	AO, AORIG, L
3289	daddu	BO, B,     TEMP
3290
	/* TEMP = K - KK; it drives both the unrolled loop count (TEMP >> 2)
	   and the remainder count (TEMP & 3) further down. */
3291	dsubu	TEMP, K, KK
3292
3293	LD	a1,  0 * SIZE(AO)
3294	MTC	$0,  c11
3295	LD	a2,  1 * SIZE(AO)
3296	MOV	c21, c11
3297	LD	a5,  4 * SIZE(AO)
3298
3299	LD	b1,  0 * SIZE(BO)
3300	MOV	c12, c11
3301	LD	b2,  1 * SIZE(BO)
3302	MOV	c22, c11
3303	LD	b3,  2 * SIZE(BO)
3304	LD	b5,  4 * SIZE(BO)
3305	dsra	L,  TEMP, 2
3306	LD	b6,  8 * SIZE(BO)
3307	LD	b7, 12 * SIZE(BO)
3308
3309	blez	L, .L75
3310	NOP
3311#endif
3312	.align	3
3313
/*
 * .L72: main loop of the two-row block, four steps per iteration. Each step
 * reads two consecutive elements from AO and one from BO and accumulates
 * a1 * b1 into c11 and a2 * b1 into c12.
 * MADD is a macro defined outside this view; presumably d = c + a * b for
 * operands (d, c, a, b).
 */
3314.L72:
3315	LD	a1,  0 * SIZE(AO)
3316	LD	a2,  1 * SIZE(AO)
3317	LD	b1,  0 * SIZE(BO)
3318
3319	MADD	c11, c11, a1, b1
3320	MADD	c12, c12, a2, b1
3321
3322	LD	a1,  2 * SIZE(AO)
3323	LD	a2,  3 * SIZE(AO)
3324	LD	b1,  1 * SIZE(BO)
3325
3326	MADD	c11, c11, a1, b1
3327	MADD	c12, c12, a2, b1
3328
3329	LD	a1,  4 * SIZE(AO)
3330	LD	a2,  5 * SIZE(AO)
3331	LD	b1,  2 * SIZE(BO)
3332
3333	MADD	c11, c11, a1, b1
3334	MADD	c12, c12, a2, b1
3335
3336	LD	a1,  6 * SIZE(AO)
3337	LD	a2,  7 * SIZE(AO)
3338	LD	b1,  3 * SIZE(BO)
3339
3340	MADD	c11, c11, a1, b1
3341	MADD	c12, c12, a2, b1
3342
	/* Count L down; AO advances by 8 * SIZE and BO by 4 * SIZE. The BO
	   update is placed after the branch (its delay slot). */
3343	daddiu	L, L, -1
3344	daddiu	AO, AO,  8 * SIZE
3345	bgtz	L, .L72
3346	daddiu	BO, BO,  4 * SIZE
3347	.align 3
3348
/* .L75: L = leftover step count for the two-row block, taken from the low
   two bits of KK (LT/RN) or TEMP (otherwise); go to .L78 when it is zero. */
3349.L75:
3350#if defined(LT) || defined(RN)
3351	andi	L, KK,  3
3352#else
3353	andi	L, TEMP, 3
3354#endif
3355	NOP
3356	blez	L, .L78
3357	NOP
3358	.align	3
3359
/* .L76: remainder loop of the two-row block, one step per iteration: two
   elements from AO and one from BO feed c11 and c12. AO advances by
   2 * SIZE, BO by 1 * SIZE (the BO update sits in the slot after the
   branch). */
3360.L76:
3361	LD	a1,  0 * SIZE(AO)
3362	LD	a2,  1 * SIZE(AO)
3363	LD	b1,  0 * SIZE(BO)
3364
3365	MADD	c11, c11, a1, b1
3366	MADD	c12, c12, a2, b1
3367
3368	daddiu	L, L, -1
3369	daddiu	AO, AO,  2 * SIZE
3370	bgtz	L, .L76
3371	daddiu	BO, BO,  1 * SIZE
3372
/*
 * .L78: finish one two-row block - fold in c21/c22, apply the update for
 * the two elements, store them, advance pointers/counters and loop back to
 * .L71 while I > 0.
 * ADD/SUB/MUL/NMSUB/LD/ST are macros defined outside this view; the
 * comments below assume (dst, src1, src2) for ADD/SUB/MUL and
 * d = c - a * b for NMSUB (d, c, a, b).
 */
3373.L78:
	/* c11 += c21, c12 += c22. The loops at .L72/.L76 never write c21 or
	   c22, so they still hold the value set at .L71. */
3374	ADD	c11, c11, c21
3375	ADD	c12, c12, c22
3376
3377#if defined(LN) || defined(RT)
	/* TEMP = KK - 2 for LN, KK - 1 otherwise. */
3378#ifdef LN
3379	daddiu	TEMP, KK, -2
3380#else
3381	daddiu	TEMP, KK, -1
3382#endif
3383
	/* AO = AORIG + (TEMP << (1 + BASE_SHIFT)), BO = B + (TEMP << BASE_SHIFT). */
3384	dsll	L,    TEMP, 1 + BASE_SHIFT
3385	dsll	TEMP, TEMP, 0 + BASE_SHIFT
3386	daddu	AO, AORIG, L
3387	daddu	BO, B,     TEMP
3388#endif
3389
3390
	/* c11 = v0 - c11, c12 = v1 - c12, where v0/v1 are the first two elements
	   at BO (LN/LT) or at AO (other variants). */
3391#if defined(LN) || defined(LT)
3392	LD	b1,  0 * SIZE(BO)
3393	LD	b2,  1 * SIZE(BO)
3394
3395	SUB	c11, b1, c11
3396	SUB	c12, b2, c12
3397#else
3398	LD	b1,  0 * SIZE(AO)
3399	LD	b2,  1 * SIZE(AO)
3400
3401	SUB	c11, b1, c11
3402	SUB	c12, b2, c12
3403#endif
3404
3405#ifdef LN
	/* LN: second row first - c12 *= AO[3]; c11 -= AO[2] * c12; c11 *= AO[0].
	   NOTE(review): multiplying by AO[3]/AO[0] suggests these entries are
	   stored pre-inverted - confirm against the packing code. */
3406	LD	b1,  3 * SIZE(AO)
3407	LD	b2,  2 * SIZE(AO)
3408	LD	b3,  0 * SIZE(AO)
3409
3410	MUL	c12, b1, c12
3411	NMSUB	c11, c11, b2, c12
3412	MUL	c11, b3, c11
3413#endif
3414
3415#ifdef LT
	/* LT: first row first - c11 *= AO[0]; c12 -= AO[1] * c11; c12 *= AO[3]. */
3416	LD	b1,  0 * SIZE(AO)
3417	LD	b2,  1 * SIZE(AO)
3418	LD	b3,  3 * SIZE(AO)
3419
3420	MUL	c11, b1, c11
3421	NMSUB	c12, c12, b2, c11
3422	MUL	c12, b3, c12
3423#endif
3424
3425#if defined(RN) || defined(RT)
	/* RN/RT: both results are scaled by the element at offset 0 of BO. */
3426	LD	b1,  0 * SIZE(BO)
3427
3428	MUL	c11, b1, c11
3429	MUL	c12, b1, c12
3430#endif
3431
3432#ifdef LN
	/* LN moves CO1 back by two elements before the stores. */
3433	daddiu	CO1, CO1, -2 * SIZE
3434#endif
3435
	/* Write the results back to the buffer they were read from ... */
3436#if defined(LN) || defined(LT)
3437	ST	c11,  0 * SIZE(BO)
3438	ST	c12,  1 * SIZE(BO)
3439#else
3440	ST	c11,  0 * SIZE(AO)
3441	ST	c12,  1 * SIZE(AO)
3442#endif
3443
	/* ... and to the output locations addressed by CO1. */
3444	ST	c11,  0 * SIZE(CO1)
3445	ST	c12,  1 * SIZE(CO1)
3446
3447#ifndef LN
	/* All variants except LN advance CO1 by two elements after the stores. */
3448	daddiu	CO1, CO1, 2 * SIZE
3449#endif
3450
3451#ifdef RT
	/* RT: AORIG += K << (1 + BASE_SHIFT). */
3452	dsll	TEMP, K, 1 + BASE_SHIFT
3453	daddu	AORIG, AORIG, TEMP
3454#endif
3455
3456#if defined(LT) || defined(RN)
	/* LT/RN: AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
3457	dsubu	TEMP, K, KK
3458	dsll	L,    TEMP, 1 + BASE_SHIFT
3459	dsll	TEMP, TEMP, 0 + BASE_SHIFT
3460	daddu	AO, AO, L
3461	daddu	BO, BO, TEMP
3462#endif
3463
	/* KK moves by two for the rows just processed: +2 for LT, -2 for LN. */
3464#ifdef LT
3465	daddiu	KK, KK, 2
3466#endif
3467
3468#ifdef LN
3469	daddiu	KK, KK, -2
3470#endif
3471
	/* Next two-row block while I > 0. */
3472	daddiu	I, I, -1
3473
3474	bgtz	I, .L71
3475	NOP
3476	.align 3
3477
3478
/* .L89: bookkeeping after all row blocks of this column; execution then
   falls through to the epilogue at .L999. */
3479.L89:
3480#ifdef LN
	/* LN: B += K << BASE_SHIFT. */
3481	dsll	TEMP, K, BASE_SHIFT
3482	daddu	B, B, TEMP
3483#endif
3484
3485#if defined(LT) || defined(RN)
	/* LT/RN: B continues from where BO ended. */
3486	move	B,  BO
3487#endif
3488
	/* KK moves by one for the column just processed: +1 for RN, -1 for RT. */
3489#ifdef RN
3490	daddiu	KK, KK,  1
3491#endif
3492
3493#ifdef RT
3494	daddiu	KK, KK, -1
3495#endif
3496	.align 3
3497
3498
/*
 * .L999: function exit. Reloads integer registers $16-$25 and floating-point
 * registers $f24-$f28 (plus $f20-$f23 when __64BIT__ is not defined) from
 * the stack frame, then returns through $31 and releases the 144-byte frame.
 * LDARG and EPILOGUE are macros defined outside this view; the matching
 * register saves are also outside this view.
 */
3499.L999:
3500	LDARG	$16,   0($sp)
3501	LDARG	$17,   8($sp)
3502	LDARG	$18,  16($sp)
3503	LDARG	$19,  24($sp)
3504	LDARG	$20,  32($sp)
3505	LDARG	$21,  40($sp)
3506	ldc1	$f24, 48($sp)
3507	ldc1	$f25, 56($sp)
3508	ldc1	$f26, 64($sp)
3509	ldc1	$f27, 72($sp)
3510	ldc1	$f28, 80($sp)
3511
3512	LDARG	$22,  88($sp)
3513	LDARG	$23,  96($sp)
3514	LDARG	$24, 104($sp)
3515	LDARG	$25, 112($sp)
3516
	/* NOTE(review): offset 112($sp) is read both by LDARG $25 above and by
	   ldc1 $f20 below when __64BIT__ is not defined - check the matching
	   stores to confirm the slot overlap is harmless. */
3517#ifndef __64BIT__
3518	ldc1	$f20,112($sp)
3519	ldc1	$f21,120($sp)
3520	ldc1	$f22,128($sp)
3521	ldc1	$f23,136($sp)
3522#endif
3523
	/* Return; the stack-pointer adjustment follows the jump (its delay
	   slot). */
3524	j	$31
3525	daddiu	$sp, $sp, 144
3526
3527	EPILOGUE
3528