1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2005. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
25#define APREFETCHSIZE 24
26#define APREFETCH_CATEGORY 0
27
28#define M	%i0
29#define N	%i1
30#define K	%i2
31
32#if defined(DOUBLE) && !defined(__64BIT__)
33#define A	%i5
34#define B	%i4
35#else
36#define A	%i4
37#define B	%i5
38#endif
39
40#define C	%o4
41#define LDC	%o5
42
43#define AO	%l0
44#define BO	%l1
45#define I	%l2
46#define J	%l3
47#define L	%l4
48
49#define C1	%o0
50#define C2	%o1
51#define C3	%o2
52#define C4	%o3
53
54#define C5	%l5
55#define	C6	%l6
56#define C7	%l7
57#define C8	%i3
58
59#define OFFSET	%g1
60#define	KK	%g2
61#define TEMP1	%g3
62#define TEMP2	%g4
63#define AORIG	%o7
64
65#ifdef DOUBLE
66#define c01	%f0
67#define c02	%f2
68#define c03	%f4
69#define c04	%f6
70#define c05	%f8
71#define c06	%f10
72#define c07	%f12
73#define c08	%f14
74#define c09	%f16
75#define c10	%f18
76#define c11	%f20
77#define c12	%f22
78#define c13	%f24
79#define c14	%f26
80#define c15	%f28
81#define c16	%f30
82
83#define a1	%f32
84#define a2	%f34
85#define a3	%f36
86#define a4	%f38
87#define a5	%f40
88
89#define b1	%f42
90#define b2	%f44
91#define b3	%f46
92#define b4	%f48
93#define b5	%f50
94#define b6	%f52
95#define b7	%f54
96#define b8	%f56
97#define b9	%f58
98
99#define cc01	0
100#define cc02	2
101#define cc03	4
102#define cc04	6
103#define cc05	8
104#define cc06	10
105#define cc07	12
106#define cc08	14
107#define cc09	16
108#define cc10	18
109#define cc11	20
110#define cc12	22
111#define cc13	24
112#define cc14	26
113#define cc15	28
114#define cc16	30
115
116#define aa1	 1
117#define aa2	 3
118#define aa3	 5
119#define aa4	 7
120#define aa5	 9
121
122#define bb1	11
123#define bb2	13
124#define bb3	15
125#define bb4	17
126#define bb5	19
127#define bb6	21
128#define bb7	23
129#define bb8	25
130#define bb9	27
131
132#else
133#define c01	%f0
134#define c02	%f1
135#define c03	%f2
136#define c04	%f3
137#define c05	%f4
138#define c06	%f5
139#define c07	%f6
140#define c08	%f7
141#define c09	%f8
142#define c10	%f9
143#define c11	%f10
144#define c12	%f11
145#define c13	%f12
146#define c14	%f13
147#define c15	%f14
148#define c16	%f15
149
150#define a1	%f16
151#define a2	%f17
152#define a3	%f18
153#define a4	%f19
154#define a5	%f20
155
156#define b1	%f21
157#define b2	%f22
158#define b3	%f23
159#define b4	%f24
160#define b5	%f25
161#define b6	%f26
162#define b7	%f27
163#define b8	%f28
164#define b9	%f29
165
166#define cc01	0
167#define cc02	1
168#define cc03	2
169#define cc04	3
170#define cc05	4
171#define cc06	5
172#define cc07	6
173#define cc08	7
174#define cc09	8
175#define cc10	9
176#define cc11	10
177#define cc12	11
178#define cc13	12
179#define cc14	13
180#define cc15	14
181#define cc16	15
182
183#define aa1	16
184#define aa2	17
185#define aa3	18
186#define aa4	19
187#define aa5	20
188
189#define bb1	21
190#define bb2	22
191#define bb3	23
192#define bb4	24
193#define bb5	25
194#define bb6	26
195#define bb7	27
196#define bb8	28
197#define bb9	29
198
199#endif
200
201        .register %g2, #scratch
202        .register %g3, #scratch
203
204	PROLOGUE
205	SAVESP
206	nop
207
208#ifndef __64BIT__
209
210#ifdef DOUBLE
211	ld	[%sp + STACK_START + 28], B
212	ld	[%sp + STACK_START + 32], C
213	ld	[%sp + STACK_START + 36], LDC
214	ld	[%sp + STACK_START + 40], OFFSET
215#else
216	ld	[%sp + STACK_START + 28], C
217	ld	[%sp + STACK_START + 32], LDC
218	ld	[%sp + STACK_START + 36], OFFSET
219#endif
220	st	%g1, [%sp + STACK_START +  8]
221	st	%g2, [%sp + STACK_START + 12]
222	st	%g3, [%sp + STACK_START + 16]
223	st	%g4, [%sp + STACK_START + 20]
224#else
225
226	ldx	[%sp+  STACK_START + 56], C
227	ldx	[%sp+  STACK_START + 64], LDC
228	ldx	[%sp+  STACK_START + 72], OFFSET
229
230	stx	%g1, [%sp + STACK_START + 32]
231	stx	%g2, [%sp + STACK_START + 40]
232	stx	%g3, [%sp + STACK_START + 48]
233	stx	%g4, [%sp + STACK_START + 56]
234#endif
235
236#if defined(TRMMKERNEL) && !defined(LEFT)
237	neg	OFFSET, KK
238#endif
239
240	sll	LDC, BASE_SHIFT, LDC
241
242#ifdef LN
243	smul	M, K, TEMP1
244	sll	TEMP1, BASE_SHIFT, TEMP1
245	add	A, TEMP1, A
246
247	sll	M, BASE_SHIFT, TEMP1
248	add	C, TEMP1, C
249#endif
250
251#ifdef RN
252	neg	OFFSET, KK
253#endif
254
255#ifdef RT
256	smul	N, K, TEMP1
257	sll	TEMP1, BASE_SHIFT, TEMP1
258	add	B, TEMP1, B
259
260	smul	N, LDC, TEMP1
261	add	C, TEMP1, C
262
263	sub	N, OFFSET, KK
264#endif
265
266	and	N, 1, J
267	cmp	J, 0
268	ble,pn	%icc, .LL50
269	nop
270
271#ifdef RT
272	sll	K, BASE_SHIFT, TEMP1
273	sub	B, TEMP1, B
274#endif
275
276#ifndef RT
277	mov	C,  C1
278	add	C1, LDC, C
279#else
280	sub	C,  LDC, C1
281	sub	C,  LDC, C
282#endif
283
284#ifdef LN
285	add	M, OFFSET, KK
286#endif
287
288#ifdef LT
289	mov	OFFSET, KK
290#endif
291
292#if defined(LN) || defined(RT)
293	mov	A, AORIG
294#else
295	mov	A, AO
296#endif
297
298	sra	M, 1, I
299	cmp	I, 0
300	ble,pn	%icc, .LL80
301	nop
302	.align 4
303
304.LL72:
305#if defined(LT) || defined(RN)
306	mov	B, BO
307#else
308#ifdef LN
309	sll	K,  BASE_SHIFT + 1, TEMP1
310	sub	AORIG, TEMP1, AORIG
311#endif
312
313	sll	KK, BASE_SHIFT + 1, TEMP1
314	sll	KK, BASE_SHIFT + 0, TEMP2
315
316	add	AORIG, TEMP1, AO
317	add	B,     TEMP2, BO
318#endif
319
320	LDF	[AO +  0 * SIZE], a1
321	LDF	[AO +  1 * SIZE], a2
322	LDF	[AO +  2 * SIZE], a3
323	LDF	[AO +  3 * SIZE], a4
324
325	LDF	[BO +  0 * SIZE], b1
326	LDF	[BO +  1 * SIZE], b2
327	LDF	[BO +  2 * SIZE], b3
328	FCLR	(cc01)
329	LDF	[BO +  3 * SIZE], b4
330	FCLR	(cc02)
331
332	prefetch [C1 + 2 * SIZE], 3
333
334#if defined(LT) || defined(RN)
335	sra	KK, 2, L
336#else
337	sub	K, KK, L
338	sra	L,  2, L
339#endif
340	cmp	L,  0
341	ble,pn	%icc, .LL75
342	nop
343
344.LL73:
345	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
346	add	L, -1, L
347
348	FMADD	(aa1, bb1, cc01, cc01)
349	LDF	[AO +  4 * SIZE], a1
350	FMADD	(aa2, bb1, cc02, cc02)
351	LDF	[AO +  5 * SIZE], a2
352
353	LDF	[BO +  4 * SIZE], b1
354	cmp	L, 0
355
356	FMADD	(aa3, bb2, cc01, cc01)
357	LDF	[AO +  6 * SIZE], a3
358	FMADD	(aa4, bb2, cc02, cc02)
359	LDF	[AO +  7 * SIZE], a4
360
361	LDF	[BO +  5 * SIZE], b2
362	add	BO,  4 * SIZE, BO
363
364	FMADD	(aa1, bb3, cc01, cc01)
365	LDF	[AO +  8 * SIZE], a1
366	FMADD	(aa2, bb3, cc02, cc02)
367	LDF	[AO +  9 * SIZE], a2
368
369	LDF	[BO +  2 * SIZE], b3
370	add	AO,  8 * SIZE, AO
371
372	FMADD	(aa3, bb4, cc01, cc01)
373	LDF	[AO +  2 * SIZE], a3
374	FMADD	(aa4, bb4, cc02, cc02)
375	LDF	[AO +  3 * SIZE], a4
376
377	bg,pt	%icc, .LL73
378	LDF	[BO +  3 * SIZE], b4
379	.align 4
380
381.LL75:
382#if defined(LT) || defined(RN)
383	and	KK, 3, L
384#else
385	sub	K, KK, L
386	and	L,  3, L
387#endif
388	cmp	L,  0
389	ble,a,pn %icc, .LL78
390	nop
391	.align 4
392
393.LL77:
394	FMADD	(aa1, bb1, cc01, cc01)
395	LDF	[AO + 2 * SIZE], a1
396	FMADD	(aa2, bb1, cc02, cc02)
397	LDF	[AO + 3 * SIZE], a2
398
399	LDF	[BO + 1 * SIZE], b1
400	add	L, -1, L
401	add	AO, 2 * SIZE, AO
402	cmp	L, 0
403	bg,pt	%icc, .LL77
404	add	BO, 1 * SIZE, BO
405	.align 4
406
407.LL78:
408#if defined(LN) || defined(RT)
409#ifdef LN
410	sub	KK, 2, TEMP1
411#else
412	sub	KK, 1, TEMP1
413#endif
414	sll	TEMP1, BASE_SHIFT + 1, TEMP2
415	sll	TEMP1, BASE_SHIFT + 0, TEMP1
416
417	add	AORIG, TEMP2, AO
418	add	B,     TEMP1, BO
419#endif
420
421#if defined(LN) || defined(LT)
422	LDF	[BO +  0 * SIZE], a1
423	LDF	[BO +  1 * SIZE], a2
424
425	FSUB	a1, c01, c01
426	FSUB	a2, c02, c02
427#else
428	LDF	[AO +  0 * SIZE], a1
429	LDF	[AO +  1 * SIZE], a2
430
431	FSUB	a1, c01, c01
432	FSUB	a2, c02, c02
433#endif
434
435#ifdef LN
436	LDF	[AO +  3 * SIZE], a1
437	LDF	[AO +  2 * SIZE], a2
438	LDF	[AO +  0 * SIZE], a3
439
440	FMUL	a1, c02, c02
441
442	FNMSUB	(aa2, cc02, cc01, cc01)
443
444	FMUL	a3, c01, c01
445#endif
446
447#ifdef LT
448	LDF	[AO +  0 * SIZE], a1
449	LDF	[AO +  1 * SIZE], a2
450	LDF	[AO +  3 * SIZE], a3
451
452	FMUL	a1, c01, c01
453
454	FNMSUB	(aa2, cc01, cc02, cc02)
455
456	FMUL	a3, c02, c02
457#endif
458
459#if defined(RN) || defined(RT)
460	LDF	[BO +  0 * SIZE], a1
461
462	FMUL	a1, c01, c01
463	FMUL	a1, c02, c02
464#endif
465
466#ifdef LN
467	add	C1, -2 * SIZE, C1
468#endif
469
470#if defined(LN) || defined(LT)
471	STF	c01, [BO +  0 * SIZE]
472	STF	c02, [BO +  1 * SIZE]
473#else
474	STF	c01, [AO +  0 * SIZE]
475	STF	c02, [AO +  1 * SIZE]
476#endif
477
478	STF	c01, [C1 + 0 * SIZE]
479	STF	c02, [C1 + 1 * SIZE]
480
481#ifndef LN
482	add	C1, 2 * SIZE, C1
483#endif
484
485#ifdef RT
486	sll	K, BASE_SHIFT + 1, TEMP1
487	add	AORIG, TEMP1, AORIG
488#endif
489
490#if defined(LT) || defined(RN)
491	sub	K, KK, TEMP1
492	sll	TEMP1, BASE_SHIFT + 1, TEMP2
493	sll	TEMP1, BASE_SHIFT + 0, TEMP1
494	add	AO, TEMP2, AO
495	add	BO, TEMP1, BO
496#endif
497
498#ifdef LT
499	add	KK, 2, KK
500#endif
501
502#ifdef LN
503	sub	KK, 2, KK
504#endif
505
506	add	I, -1, I
507	cmp	I, 0
508	bg,pt	%icc, .LL72
509	nop
510	.align 4
511
512.LL80:
513	and	M, 1, I
514	cmp	I, 0
515	ble,pn	%icc, .LL89
516	nop
517
518#if defined(LT) || defined(RN)
519	mov	B, BO
520#else
521#ifdef LN
522	sll	K,  BASE_SHIFT + 0, TEMP1
523	sub	AORIG, TEMP1, AORIG
524#endif
525
526	sll	KK, BASE_SHIFT + 0, TEMP1
527	sll	KK, BASE_SHIFT + 0, TEMP2
528
529	add	AORIG, TEMP1, AO
530	add	B,     TEMP2, BO
531#endif
532
533	LDF	[AO +  0 * SIZE], a1
534	LDF	[BO +  0 * SIZE], b1
535	LDF	[AO +  1 * SIZE], a2
536	LDF	[BO +  1 * SIZE], b2
537	LDF	[AO +  2 * SIZE], a3
538	LDF	[BO +  2 * SIZE], b3
539	LDF	[AO +  3 * SIZE], a4
540	LDF	[BO +  3 * SIZE], b4
541
542#if defined(LT) || defined(RN)
543	sra	KK, 2, L
544#else
545	sub	K, KK, L
546	sra	L,  2, L
547#endif
548	cmp	L,  0
549	ble,pn	%icc, .LL85
550	FCLR	(cc01)
551	.align 4
552
553.LL83:
554	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
555	add	L, -1, L
556
557	FMADD	(aa1, bb1, cc01, cc01)
558	LDF	[AO +  4 * SIZE], a1
559	LDF	[BO +  4 * SIZE], b1
560
561	FMADD	(aa2, bb2, cc01, cc01)
562	LDF	[AO +  5 * SIZE], a2
563	LDF	[BO +  5 * SIZE], b2
564
565	FMADD	(aa3, bb3, cc01, cc01)
566	LDF	[AO +  6 * SIZE], a3
567	LDF	[BO +  6 * SIZE], b3
568
569	FMADD	(aa4, bb4, cc01, cc01)
570	LDF	[AO +  7 * SIZE], a4
571	LDF	[BO +  7 * SIZE], b4
572
573	add	AO,  4 * SIZE, AO
574	cmp	L, 0
575
576	bg,pt	%icc, .LL83
577	add	BO,  4 * SIZE, BO
578	.align 4
579
580.LL85:
581#if defined(LT) || defined(RN)
582	and	KK, 3, L
583#else
584	sub	K, KK, L
585	and	L,  3, L
586#endif
587	cmp	L,  0
588	ble,a,pn %icc, .LL88
589	nop
590	.align 4
591
592.LL87:
593	FMADD	(aa1, bb1, cc01, cc01)
594	LDF	[AO + 1 * SIZE], a1
595	LDF	[BO + 1 * SIZE], b1
596
597	add	AO, 1 * SIZE, AO
598	add	L, -1, L
599	cmp	L, 0
600	bg,pt	%icc, .LL87
601	add	BO, 1 * SIZE, BO
602	.align 4
603
604.LL88:
605#if defined(LN) || defined(RT)
606#ifdef LN
607	sub	KK, 1, TEMP1
608#else
609	sub	KK, 1, TEMP1
610#endif
611	sll	TEMP1, BASE_SHIFT + 0, TEMP2
612	sll	TEMP1, BASE_SHIFT + 0, TEMP1
613
614	add	AORIG, TEMP2, AO
615	add	B,     TEMP1, BO
616#endif
617
618#if defined(LN) || defined(LT)
619	LDF	[BO +  0 * SIZE], a1
620
621	FSUB	a1, c01, c01
622#else
623	LDF	[AO +  0 * SIZE], a1
624
625	FSUB	a1, c01, c01
626#endif
627
628#if defined(LN) || defined(LT)
629	LDF	[AO +  0 * SIZE], a1
630
631	FMUL	a1, c01, c01
632#endif
633
634#if defined(RN) || defined(RT)
635	LDF	[BO +  0 * SIZE], a1
636
637	FMUL	a1, c01, c01
638#endif
639
640#ifdef LN
641	add	C1, -1 * SIZE, C1
642#endif
643
644#if defined(LN) || defined(LT)
645	STF	c01, [BO +  0 * SIZE]
646#else
647	STF	c01, [AO +  0 * SIZE]
648#endif
649
650	STF	c01, [C1 + 0 * SIZE]
651
652#ifdef RT
653	sll	K, BASE_SHIFT + 0, TEMP1
654	add	AORIG, TEMP1, AORIG
655#endif
656
657#if defined(LT) || defined(RN)
658	sub	K, KK, TEMP1
659	sll	TEMP1, BASE_SHIFT + 0, TEMP2
660	sll	TEMP1, BASE_SHIFT + 0, TEMP1
661	add	AO, TEMP2, AO
662	add	BO, TEMP1, BO
663#endif
664
665#ifdef LT
666	add	KK, 1, KK
667#endif
668
669#ifdef LN
670	sub	KK, 1, KK
671#endif
672	.align 4
673
674.LL89:
675#ifdef LN
676	sll	K, BASE_SHIFT, TEMP1
677	add	B, TEMP1, B
678#endif
679
680#if defined(LT) || defined(RN)
681	mov	BO, B
682#endif
683
684#ifdef RN
685	add	KK, 1, KK
686#endif
687
688#ifdef RT
689	sub	KK, 1, KK
690#endif
691	.align 4
692
693.LL50:
694	and	N, 2, J
695	cmp	J, 0
696	ble,pn	%icc, .LL30
697	nop
698
699#ifdef RT
700	sll	K, BASE_SHIFT + 1, TEMP1
701	sub	B, TEMP1, B
702#endif
703
704#ifndef RT
705	mov	C,  C1
706	add	C,  LDC, C2
707	add	C2, LDC, C
708#else
709	sub	C,  LDC, C2
710	sub	C2, LDC, C1
711	sub	C2, LDC, C
712#endif
713
714#ifdef LN
715	add	M, OFFSET, KK
716#endif
717
718#ifdef LT
719	mov	OFFSET, KK
720#endif
721
722#if defined(LN) || defined(RT)
723	mov	A, AORIG
724#else
725	mov	A, AO
726#endif
727
728	sra	M, 1, I
729	cmp	I, 0
730	ble,pn	%icc, .LL60
731	nop
732	.align 4
733
734.LL52:
735#if defined(LT) || defined(RN)
736	mov	B, BO
737#else
738#ifdef LN
739	sll	K,  BASE_SHIFT + 1, TEMP1
740	sub	AORIG, TEMP1, AORIG
741#endif
742
743	sll	KK, BASE_SHIFT + 1, TEMP1
744	sll	KK, BASE_SHIFT + 1, TEMP2
745
746	add	AORIG, TEMP1, AO
747	add	B,     TEMP2, BO
748#endif
749
750	LDF	[AO +  0 * SIZE], a1
751	LDF	[AO +  1 * SIZE], a2
752	LDF	[AO +  2 * SIZE], a3
753	LDF	[AO +  3 * SIZE], a4
754
755	LDF	[BO +  0 * SIZE], b1
756	LDF	[BO +  1 * SIZE], b2
757	LDF	[BO +  2 * SIZE], b3
758	FCLR	(cc01)
759	LDF	[BO +  3 * SIZE], b4
760	FCLR	(cc02)
761
762	LDF	[BO +  4 * SIZE], b5
763	FCLR	(cc03)
764	LDF	[BO +  5 * SIZE], b6
765	FCLR	(cc04)
766	LDF	[BO +  6 * SIZE], b7
767	FCLR	(cc05)
768	LDF	[BO +  7 * SIZE], b8
769	FCLR	(cc06)
770
771	prefetch [C1 + 2 * SIZE], 3
772	FCLR	(cc07)
773	prefetch [C2 + 2 * SIZE], 3
774	FCLR	(cc08)
775
776#if defined(LT) || defined(RN)
777	sra	KK, 2, L
778#else
779	sub	K, KK, L
780	sra	L,  2, L
781#endif
782	cmp	L,  0
783	ble,pn	%icc, .LL55
784	nop
785	.align 4
786
787.LL53:
788	FMADD	(aa1, bb1, cc01, cc01)
789	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
790	FMADD	(aa2, bb1, cc02, cc02)
791	LDF	[BO +  8 * SIZE], b1
792
793	FMADD	(aa1, bb2, cc03, cc03)
794	LDF	[AO +  4 * SIZE], a1
795	FMADD	(aa2, bb2, cc04, cc04)
796	LDF	[AO +  5 * SIZE], a2
797
798	FMADD	(aa3, bb3, cc01, cc01)
799	LDF	[BO +  9 * SIZE], b2
800	FMADD	(aa4, bb3, cc02, cc02)
801	LDF	[BO + 10 * SIZE], b3
802
803	FMADD	(aa3, bb4, cc03, cc03)
804	LDF	[AO +  6 * SIZE], a3
805	FMADD	(aa4, bb4, cc04, cc04)
806	LDF	[AO +  7 * SIZE], a4
807
808	FMADD	(aa1, bb5, cc01, cc01)
809	LDF	[BO + 11 * SIZE], b4
810	FMADD	(aa2, bb5, cc02, cc02)
811	LDF	[BO + 12 * SIZE], b5
812
813	FMADD	(aa1, bb6, cc03, cc03)
814	LDF	[AO +  8 * SIZE], a1
815	FMADD	(aa2, bb6, cc04, cc04)
816	LDF	[AO +  9 * SIZE], a2
817
818	FMADD	(aa3, bb7, cc01, cc01)
819	LDF	[BO + 13 * SIZE], b6
820
821	FMADD	(aa4, bb7, cc02, cc02)
822	LDF	[BO + 14 * SIZE], b7
823
824	FMADD	(aa3, bb8, cc03, cc03)
825	LDF	[AO + 10 * SIZE], a3
826	FMADD	(aa4, bb8, cc04, cc04)
827	LDF	[AO + 11 * SIZE], a4
828
829	add	AO,  8 * SIZE, AO
830	add	L, -1, L
831	add	BO,  8 * SIZE, BO
832	cmp	L, 0
833
834	bg,pt	%icc, .LL53
835	LDF	[BO +  7 * SIZE], b8
836	.align 4
837
838.LL55:
839#if defined(LT) || defined(RN)
840	and	KK, 3, L
841#else
842	sub	K, KK, L
843	and	L,  3, L
844#endif
845	cmp	L,  0
846	ble,a,pn %icc, .LL58
847	nop
848	.align 4
849
850.LL57:
851	FMADD	(aa1, bb1, cc01, cc01)
852	add	L, -1, L
853	FMADD	(aa2, bb1, cc02, cc02)
854	LDF	[BO + 2 * SIZE], b1
855
856	FMADD	(aa1, bb2, cc03, cc03)
857	LDF	[AO + 2 * SIZE], a1
858	FMADD	(aa2, bb2, cc04, cc04)
859	LDF	[AO + 3 * SIZE], a2
860
861	add	AO, 2 * SIZE, AO
862	cmp	L, 0
863	add	BO, 2 * SIZE, BO
864	bg,pt	%icc, .LL57
865	LDF	[BO + 1 * SIZE], b2
866	.align 4
867
868.LL58:
869#if defined(LN) || defined(RT)
870#ifdef LN
871	sub	KK, 2, TEMP1
872#else
873	sub	KK, 2, TEMP1
874#endif
875	sll	TEMP1, BASE_SHIFT + 1, TEMP2
876	sll	TEMP1, BASE_SHIFT + 1, TEMP1
877
878	add	AORIG, TEMP2, AO
879	add	B,     TEMP1, BO
880#endif
881
882#if defined(LN) || defined(LT)
883	LDF	[BO +  0 * SIZE], a1
884	LDF	[BO +  1 * SIZE], a2
885	LDF	[BO +  2 * SIZE], a3
886	LDF	[BO +  3 * SIZE], a4
887
888	FSUB	a1, c01, c01
889	FSUB	a2, c03, c03
890	FSUB	a3, c02, c02
891	FSUB	a4, c04, c04
892#else
893	LDF	[AO +  0 * SIZE], a1
894	LDF	[AO +  1 * SIZE], a2
895	LDF	[AO +  2 * SIZE], a3
896	LDF	[AO +  3 * SIZE], a4
897
898	FSUB	a1, c01, c01
899	FSUB	a2, c02, c02
900	FSUB	a3, c03, c03
901	FSUB	a4, c04, c04
902#endif
903
904#ifdef LN
905	LDF	[AO +  3 * SIZE], a1
906	LDF	[AO +  2 * SIZE], a2
907	LDF	[AO +  0 * SIZE], a3
908
909	FMUL	a1, c02, c02
910	FMUL	a1, c04, c04
911
912	FNMSUB	(aa2, cc02, cc01, cc01)
913	FNMSUB	(aa2, cc04, cc03, cc03)
914
915	FMUL	a3, c01, c01
916	FMUL	a3, c03, c03
917#endif
918
919#ifdef LT
920	LDF	[AO +  0 * SIZE], a1
921	LDF	[AO +  1 * SIZE], a2
922	LDF	[AO +  3 * SIZE], a3
923
924	FMUL	a1, c01, c01
925	FMUL	a1, c03, c03
926
927	FNMSUB	(aa2, cc01, cc02, cc02)
928	FNMSUB	(aa2, cc03, cc04, cc04)
929
930	FMUL	a3, c02, c02
931	FMUL	a3, c04, c04
932#endif
933
934#ifdef RN
935	LDF	[BO +  0 * SIZE], a1
936	LDF	[BO +  1 * SIZE], a2
937
938	FMUL	a1, c01, c01
939	FMUL	a1, c02, c02
940
941	FNMSUB	(aa2, cc01, cc03, cc03)
942	FNMSUB	(aa2, cc02, cc04, cc04)
943
944	LDF	[BO +  3 * SIZE], a1
945
946	FMUL	a1, c03, c03
947	FMUL	a1, c04, c04
948#endif
949
950#ifdef RT
951	LDF	[BO +  3 * SIZE], a1
952	LDF	[BO +  2 * SIZE], a2
953
954	FMUL	a1, c04, c04
955	FMUL	a1, c03, c03
956
957	FNMSUB	(aa2, cc04, cc02, cc02)
958	FNMSUB	(aa2, cc03, cc01, cc01)
959
960	LDF	[BO +  0 * SIZE], a1
961
962	FMUL	a1, c02, c02
963	FMUL	a1, c01, c01
964#endif
965
966#ifdef LN
967	add	C1, -2 * SIZE, C1
968	add	C2, -2 * SIZE, C2
969#endif
970
971#if defined(LN) || defined(LT)
972	STF	c01, [BO +  0 * SIZE]
973	STF	c03, [BO +  1 * SIZE]
974	STF	c02, [BO +  2 * SIZE]
975	STF	c04, [BO +  3 * SIZE]
976#else
977	STF	c01, [AO +  0 * SIZE]
978	STF	c02, [AO +  1 * SIZE]
979	STF	c03, [AO +  2 * SIZE]
980	STF	c04, [AO +  3 * SIZE]
981#endif
982
983	STF	c01, [C1 + 0 * SIZE]
984	STF	c02, [C1 + 1 * SIZE]
985	STF	c03, [C2 + 0 * SIZE]
986	STF	c04, [C2 + 1 * SIZE]
987
988#ifndef LN
989	add	C1, 2 * SIZE, C1
990	add	C2, 2 * SIZE, C2
991#endif
992
993#ifdef RT
994	sll	K, BASE_SHIFT + 1, TEMP1
995	add	AORIG, TEMP1, AORIG
996#endif
997
998#if defined(LT) || defined(RN)
999	sub	K, KK, TEMP1
1000	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1001	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1002	add	AO, TEMP2, AO
1003	add	BO, TEMP1, BO
1004#endif
1005
1006#ifdef LT
1007	add	KK, 2, KK
1008#endif
1009
1010#ifdef LN
1011	sub	KK, 2, KK
1012#endif
1013
1014	add	I, -1, I
1015	cmp	I, 0
1016	bg,pt	%icc, .LL52
1017	nop
1018	.align 4
1019
1020.LL60:
1021	and	M, 1, I
1022	cmp	I, 0
1023	ble,pn	%icc, .LL69
1024	nop
1025
1026#if defined(LT) || defined(RN)
1027	mov	B, BO
1028#else
1029#ifdef LN
1030	sll	K,  BASE_SHIFT + 0, TEMP1
1031	sub	AORIG, TEMP1, AORIG
1032#endif
1033
1034	sll	KK, BASE_SHIFT + 0, TEMP1
1035	sll	KK, BASE_SHIFT + 1, TEMP2
1036
1037	add	AORIG, TEMP1, AO
1038	add	B,     TEMP2, BO
1039#endif
1040
1041	LDF	[AO +  0 * SIZE], a1
1042	LDF	[AO +  1 * SIZE], a2
1043	LDF	[AO +  2 * SIZE], a3
1044	LDF	[AO +  3 * SIZE], a4
1045
1046	LDF	[BO +  0 * SIZE], b1
1047	LDF	[BO +  1 * SIZE], b2
1048	LDF	[BO +  2 * SIZE], b3
1049	LDF	[BO +  3 * SIZE], b4
1050	LDF	[BO +  4 * SIZE], b5
1051	LDF	[BO +  5 * SIZE], b6
1052	LDF	[BO +  6 * SIZE], b7
1053	FCLR	(cc01)
1054	LDF	[BO +  7 * SIZE], b8
1055	FCLR	(cc03)
1056
1057#if defined(LT) || defined(RN)
1058	sra	KK, 2, L
1059#else
1060	sub	K, KK, L
1061	sra	L,  2, L
1062#endif
1063	cmp	L,  0
1064	ble,pn	%icc, .LL65
1065	nop
1066	.align 4
1067
1068.LL63:
1069	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1070	add	L, -1, L
1071
1072	FMADD	(aa1, bb1, cc01, cc01)
1073	LDF	[BO +  8 * SIZE], b1
1074	FMADD	(aa1, bb2, cc03, cc03)
1075	LDF	[BO +  9 * SIZE], b2
1076
1077	LDF	[AO +  4 * SIZE], a1
1078	cmp	L, 0
1079
1080	FMADD	(aa2, bb3, cc01, cc01)
1081	LDF	[BO + 10 * SIZE], b3
1082	FMADD	(aa2, bb4, cc03, cc03)
1083	LDF	[BO + 11 * SIZE], b4
1084
1085	LDF	[AO +  5 * SIZE], a2
1086	add	AO,  4 * SIZE, AO
1087
1088	FMADD	(aa3, bb5, cc01, cc01)
1089	LDF	[BO + 12 * SIZE], b5
1090	FMADD	(aa3, bb6, cc03, cc03)
1091	LDF	[BO + 13 * SIZE], b6
1092
1093	LDF	[AO +  2 * SIZE], a3
1094	add	BO,  8 * SIZE, BO
1095
1096	FMADD	(aa4, bb7, cc01, cc01)
1097	LDF	[BO +  6 * SIZE], b7
1098	FMADD	(aa4, bb8, cc03, cc03)
1099	LDF	[BO + 7 * SIZE], b8
1100
1101	bg,pt	%icc, .LL63
1102	LDF	[AO +  3 * SIZE], a4
1103	.align 4
1104
1105.LL65:
1106#if defined(LT) || defined(RN)
1107	and	KK, 3, L
1108#else
1109	sub	K, KK, L
1110	and	L,  3, L
1111#endif
1112	cmp	L,  0
1113	ble,a,pn %icc, .LL68
1114	nop
1115	.align 4
1116
1117.LL67:
1118	FMADD	(aa1, bb1, cc01, cc01)
1119	LDF	[BO + 2 * SIZE], b1
1120	FMADD	(aa1, bb2, cc03, cc03)
1121	LDF	[BO + 3 * SIZE], b2
1122
1123	LDF	[AO + 1 * SIZE], a1
1124	add	L, -1, L
1125	add	AO, 1 * SIZE, AO
1126	cmp	L, 0
1127
1128	bg,pt	%icc, .LL67
1129	add	BO, 2 * SIZE, BO
1130	.align 4
1131
1132.LL68:
1133#if defined(LN) || defined(RT)
1134#ifdef LN
1135	sub	KK, 1, TEMP1
1136#else
1137	sub	KK, 2, TEMP1
1138#endif
1139	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1140	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1141
1142	add	AORIG, TEMP2, AO
1143	add	B,     TEMP1, BO
1144#endif
1145
1146#if defined(LN) || defined(LT)
1147	LDF	[BO +  0 * SIZE], a1
1148	LDF	[BO +  1 * SIZE], a2
1149
1150	FSUB	a1, c01, c01
1151	FSUB	a2, c03, c03
1152#else
1153	LDF	[AO +  0 * SIZE], a1
1154	LDF	[AO +  1 * SIZE], a2
1155
1156	FSUB	a1, c01, c01
1157	FSUB	a2, c03, c03
1158#endif
1159
1160#if defined(LN) || defined(LT)
1161	LDF	[AO +  0 * SIZE], a1
1162
1163	FMUL	a1, c01, c01
1164	FMUL	a1, c03, c03
1165#endif
1166
1167#ifdef RN
1168	LDF	[BO +  0 * SIZE], a1
1169	LDF	[BO +  1 * SIZE], a2
1170
1171	FMUL	a1, c01, c01
1172
1173	FNMSUB	(aa2, cc01, cc03, cc03)
1174
1175	LDF	[BO +  3 * SIZE], a1
1176
1177	FMUL	a1, c03, c03
1178#endif
1179
1180#ifdef RT
1181	LDF	[BO +  3 * SIZE], a1
1182	LDF	[BO +  2 * SIZE], a2
1183
1184	FMUL	a1, c03, c03
1185
1186	FNMSUB	(aa2, cc03, cc01, cc01)
1187
1188	LDF	[BO +  0 * SIZE], a1
1189
1190	FMUL	a1, c01, c01
1191#endif
1192
1193#ifdef LN
1194	add	C1, -1 * SIZE, C1
1195	add	C2, -1 * SIZE, C2
1196#endif
1197
1198#if defined(LN) || defined(LT)
1199	STF	c01, [BO +  0 * SIZE]
1200	STF	c03, [BO +  1 * SIZE]
1201#else
1202	STF	c01, [AO +  0 * SIZE]
1203	STF	c03, [AO +  1 * SIZE]
1204#endif
1205
1206	STF	c01, [C1 + 0 * SIZE]
1207	STF	c03, [C2 + 0 * SIZE]
1208
1209#ifdef RT
1210	sll	K, BASE_SHIFT + 0, TEMP1
1211	add	AORIG, TEMP1, AORIG
1212#endif
1213
1214#if defined(LT) || defined(RN)
1215	sub	K, KK, TEMP1
1216	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1217	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1218	add	AO, TEMP2, AO
1219	add	BO, TEMP1, BO
1220#endif
1221
1222#ifdef LT
1223	add	KK, 1, KK
1224#endif
1225
1226#ifdef LN
1227	sub	KK, 1, KK
1228#endif
1229	.align 4
1230
1231.LL69:
1232#ifdef LN
1233	sll	K, BASE_SHIFT + 1, TEMP1
1234	add	B, TEMP1, B
1235#endif
1236
1237#if defined(LT) || defined(RN)
1238	mov	BO, B
1239#endif
1240
1241#ifdef RN
1242	add	KK, 2, KK
1243#endif
1244
1245#ifdef RT
1246	sub	KK, 2, KK
1247#endif
1248	.align 4
1249
1250.LL30:
1251	and	N, 4, J
1252	cmp	J, 0
1253	ble,pn	%icc, .LL10
1254	nop
1255
1256#ifdef RT
1257	sll	K, BASE_SHIFT + 2, TEMP1
1258	sub	B, TEMP1, B
1259#endif
1260
1261#ifndef RT
1262	mov	C,  C1
1263	add	C,  LDC, C2
1264	add	C2, LDC, C3
1265	add	C3, LDC, C4
1266	add	C4, LDC, C
1267#else
1268	sub	C,  LDC, C4
1269	sub	C4, LDC, C3
1270	sub	C3, LDC, C2
1271	sub	C2, LDC, C1
1272	sub	C2, LDC, C
1273#endif
1274
1275#ifdef LN
1276	add	M, OFFSET, KK
1277#endif
1278
1279#ifdef LT
1280	mov	OFFSET, KK
1281#endif
1282
1283#if defined(LN) || defined(RT)
1284	mov	A, AORIG
1285#else
1286	mov	A, AO
1287#endif
1288
1289	sra	M, 1, I
1290	cmp	I, 0
1291	ble,pn	%icc, .LL40
1292	nop
1293	.align 4
1294
1295.LL32:
1296#if defined(LT) || defined(RN)
1297	mov	B, BO
1298#else
1299#ifdef LN
1300	sll	K,  BASE_SHIFT + 1, TEMP1
1301	sub	AORIG, TEMP1, AORIG
1302#endif
1303
1304	sll	KK, BASE_SHIFT + 1, TEMP1
1305	sll	KK, BASE_SHIFT + 2, TEMP2
1306
1307	add	AORIG, TEMP1, AO
1308	add	B,     TEMP2, BO
1309#endif
1310
1311	LDF	[AO +  0 * SIZE], a1
1312	LDF	[AO +  1 * SIZE], a2
1313
1314	LDF	[BO +  0 * SIZE], b1
1315	LDF	[BO +  1 * SIZE], b2
1316	LDF	[BO +  2 * SIZE], b3
1317	LDF	[BO +  3 * SIZE], b4
1318	LDF	[BO +  4 * SIZE], b5
1319
1320	LDF	[BO +  5 * SIZE], b6
1321	FCLR	(cc01)
1322	LDF	[BO +  6 * SIZE], b7
1323	FCLR	(cc02)
1324	LDF	[BO +  7 * SIZE], b8
1325	FCLR	(cc03)
1326	LDF	[BO +  8 * SIZE], b9
1327	FCLR	(cc04)
1328
1329	prefetch [C1 + 2 * SIZE], 3
1330	FCLR	(cc05)
1331	prefetch [C2 + 2 * SIZE], 3
1332	FCLR	(cc06)
1333	prefetch [C3 + 2 * SIZE], 3
1334	FCLR	(cc07)
1335	prefetch [C4 + 2 * SIZE], 3
1336	FCLR	(cc08)
1337
1338#if defined(LT) || defined(RN)
1339	sra	KK, 2, L
1340#else
1341	sub	K, KK, L
1342	sra	L,  2, L
1343#endif
1344	cmp	L,  0
1345	ble,pn	%icc, .LL35
1346	nop
1347	.align 4
1348
1349.LL33:
1350	FMADD	(aa1, bb1, cc01, cc01)
1351	LDF	[AO +  2 * SIZE], a3
1352	FMADD	(aa2, bb1, cc02, cc02)
1353	LDF	[AO +  3 * SIZE], a4
1354
1355	FMADD	(aa1, bb2, cc03, cc03)
1356	LDF	[BO + 16 * SIZE], b1
1357	FMADD	(aa2, bb2, cc04, cc04)
1358	LDF	[BO +  9 * SIZE], b2
1359
1360	FMADD	(aa1, bb3, cc05, cc05)
1361	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1362	FMADD	(aa2, bb3, cc06, cc06)
1363	add	L, -1, L
1364
1365	FMADD	(aa1, bb4, cc07, cc07)
1366	LDF	[BO + 10 * SIZE], b3
1367	FMADD	(aa2, bb4, cc08, cc08)
1368	LDF	[BO + 11 * SIZE], b4
1369
1370	FMADD	(aa3, bb5, cc01, cc01)
1371	LDF	[AO +  4 * SIZE], a1
1372	FMADD	(aa4, bb5, cc02, cc02)
1373	LDF	[AO +  5 * SIZE], a2
1374
1375	FMADD	(aa3, bb6, cc03, cc03)
1376	LDF	[BO + 12 * SIZE], b5
1377	FMADD	(aa4, bb6, cc04, cc04)
1378	LDF	[BO + 13 * SIZE], b6
1379
1380	FMADD	(aa3, bb7, cc05, cc05)
1381	cmp	L, 0
1382	FMADD	(aa4, bb7, cc06, cc06)
1383	add	AO,  8 * SIZE, AO
1384
1385	FMADD	(aa3, bb8, cc07, cc07)
1386	LDF	[BO + 14 * SIZE], b7
1387	FMADD	(aa4, bb8, cc08, cc08)
1388	LDF	[BO + 15 * SIZE], b8
1389
1390	FMADD	(aa1, bb9, cc01, cc01)
1391	LDF	[AO -  2 * SIZE], a3
1392	FMADD	(aa2, bb9, cc02, cc02)
1393	LDF	[AO -  1 * SIZE], a4
1394
1395	FMADD	(aa1, bb2, cc03, cc03)
1396	LDF	[BO + 24 * SIZE], b9
1397	FMADD	(aa2, bb2, cc04, cc04)
1398	LDF	[BO + 17 * SIZE], b2
1399
1400	FMADD	(aa1, bb3, cc05, cc05)
1401	add	BO, 16 * SIZE, BO
1402	FMADD	(aa2, bb3, cc06, cc06)
1403	nop
1404
1405	FMADD	(aa1, bb4, cc07, cc07)
1406	LDF	[BO +  2 * SIZE], b3
1407	FMADD	(aa2, bb4, cc08, cc08)
1408	LDF	[BO +  3 * SIZE], b4
1409
1410	FMADD	(aa3, bb5, cc01, cc01)
1411	LDF	[AO +  0 * SIZE], a1
1412	FMADD	(aa4, bb5, cc02, cc02)
1413	LDF	[AO +  1 * SIZE], a2
1414	FMADD	(aa3, bb6, cc03, cc03)
1415	LDF	[BO +  4 * SIZE], b5
1416	FMADD	(aa4, bb6, cc04, cc04)
1417	LDF	[BO +  5 * SIZE], b6
1418
1419	FMADD	(aa3, bb7, cc05, cc05)
1420	nop
1421	FMADD	(aa4, bb7, cc06, cc06)
1422	LDF	[BO +  6 * SIZE], b7
1423
1424	FMADD	(aa3, bb8, cc07, cc07)
1425	FMADD	(aa4, bb8, cc08, cc08)
1426	bg,pt	%icc, .LL33
1427	LDF	[BO +  7 * SIZE], b8
1428	.align 4
1429
1430.LL35:
1431#if defined(LT) || defined(RN)
1432	and	KK, 3, L
1433#else
1434	sub	K, KK, L
1435	and	L,  3, L
1436#endif
1437	cmp	L,  0
1438	ble,a,pn %icc, .LL38
1439	nop
1440	.align 4
1441
1442.LL37:
1443	FMADD	(aa1, bb1, cc01, cc01)
1444	add	L, -1, L
1445	FMADD	(aa2, bb1, cc02, cc02)
1446	LDF	[BO + 4 * SIZE], b1
1447
1448	FMADD	(aa1, bb2, cc03, cc03)
1449	add	AO, 2 * SIZE, AO
1450	FMADD	(aa2, bb2, cc04, cc04)
1451	LDF	[BO + 5 * SIZE], b2
1452
1453	FMADD	(aa1, bb3, cc05, cc05)
1454	cmp	L, 0
1455	FMADD	(aa2, bb3, cc06, cc06)
1456	LDF	[BO + 6 * SIZE], b3
1457
1458	FMADD	(aa1, bb4, cc07, cc07)
1459	LDF	[AO + 0 * SIZE], a1
1460	FMADD	(aa2, bb4, cc08, cc08)
1461	LDF	[AO + 1 * SIZE], a2
1462
1463	LDF	[BO + 7 * SIZE], b4
1464	bg,pt	%icc, .LL37
1465	add	BO, 4 * SIZE, BO
1466	.align 4
1467
1468.LL38:
1469#if defined(LN) || defined(RT)
1470#ifdef LN
1471	sub	KK, 2, TEMP1
1472#else
1473	sub	KK, 4, TEMP1
1474#endif
1475	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1476	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1477
1478	add	AORIG, TEMP2, AO
1479	add	B,     TEMP1, BO
1480#endif
1481
1482#if defined(LN) || defined(LT)
1483	LDF	[BO +  0 * SIZE], a1
1484	LDF	[BO +  1 * SIZE], a2
1485	LDF	[BO +  2 * SIZE], a3
1486	LDF	[BO +  3 * SIZE], a4
1487
1488	LDF	[BO +  4 * SIZE], b1
1489	LDF	[BO +  5 * SIZE], b2
1490	LDF	[BO +  6 * SIZE], b3
1491	LDF	[BO +  7 * SIZE], b4
1492
1493	FSUB	a1, c01, c01
1494	FSUB	a2, c03, c03
1495	FSUB	a3, c05, c05
1496	FSUB	a4, c07, c07
1497
1498	FSUB	b1, c02, c02
1499	FSUB	b2, c04, c04
1500	FSUB	b3, c06, c06
1501	FSUB	b4, c08, c08
1502#else
1503	LDF	[AO +  0 * SIZE], a1
1504	LDF	[AO +  1 * SIZE], a2
1505	LDF	[AO +  2 * SIZE], a3
1506	LDF	[AO +  3 * SIZE], a4
1507
1508	LDF	[AO +  4 * SIZE], b1
1509	LDF	[AO +  5 * SIZE], b2
1510	LDF	[AO +  6 * SIZE], b3
1511	LDF	[AO +  7 * SIZE], b4
1512
1513	FSUB	a1, c01, c01
1514	FSUB	a2, c02, c02
1515	FSUB	a3, c03, c03
1516	FSUB	a4, c04, c04
1517
1518	FSUB	b1, c05, c05
1519	FSUB	b2, c06, c06
1520	FSUB	b3, c07, c07
1521	FSUB	b4, c08, c08
1522
1523#endif
1524
1525#ifdef LN
1526	LDF	[AO +  3 * SIZE], a1
1527	LDF	[AO +  2 * SIZE], a2
1528	LDF	[AO +  0 * SIZE], a3
1529
1530	FMUL	a1, c02, c02
1531	FMUL	a1, c04, c04
1532	FMUL	a1, c06, c06
1533	FMUL	a1, c08, c08
1534
1535	FNMSUB	(aa2, cc02, cc01, cc01)
1536	FNMSUB	(aa2, cc04, cc03, cc03)
1537	FNMSUB	(aa2, cc06, cc05, cc05)
1538	FNMSUB	(aa2, cc08, cc07, cc07)
1539
1540	FMUL	a3, c01, c01
1541	FMUL	a3, c03, c03
1542	FMUL	a3, c05, c05
1543	FMUL	a3, c07, c07
1544#endif
1545
1546#ifdef LT
1547	LDF	[AO +  0 * SIZE], a1
1548	LDF	[AO +  1 * SIZE], a2
1549	LDF	[AO +  3 * SIZE], a3
1550
1551	FMUL	a1, c01, c01
1552	FMUL	a1, c03, c03
1553	FMUL	a1, c05, c05
1554	FMUL	a1, c07, c07
1555
1556	FNMSUB	(aa2, cc01, cc02, cc02)
1557	FNMSUB	(aa2, cc03, cc04, cc04)
1558	FNMSUB	(aa2, cc05, cc06, cc06)
1559	FNMSUB	(aa2, cc07, cc08, cc08)
1560
1561	FMUL	a3, c02, c02
1562	FMUL	a3, c04, c04
1563	FMUL	a3, c06, c06
1564	FMUL	a3, c08, c08
1565#endif
1566
1567#ifdef RN
1568	LDF	[BO +  0 * SIZE], a1
1569	LDF	[BO +  1 * SIZE], a2
1570	LDF	[BO +  2 * SIZE], a3
1571	LDF	[BO +  3 * SIZE], a4
1572
1573	FMUL	a1, c01, c01
1574	FMUL	a1, c02, c02
1575
1576	FNMSUB	(aa2, cc01, cc03, cc03)
1577	FNMSUB	(aa2, cc02, cc04, cc04)
1578	FNMSUB	(aa3, cc01, cc05, cc05)
1579	FNMSUB	(aa3, cc02, cc06, cc06)
1580	FNMSUB	(aa4, cc01, cc07, cc07)
1581	FNMSUB	(aa4, cc02, cc08, cc08)
1582
1583	LDF	[BO +  5 * SIZE], a1
1584	LDF	[BO +  6 * SIZE], a2
1585	LDF	[BO +  7 * SIZE], a3
1586
1587	FMUL	a1, c03, c03
1588	FMUL	a1, c04, c04
1589
1590	FNMSUB	(aa2, cc03, cc05, cc05)
1591	FNMSUB	(aa2, cc04, cc06, cc06)
1592	FNMSUB	(aa3, cc03, cc07, cc07)
1593	FNMSUB	(aa3, cc04, cc08, cc08)
1594
1595	LDF	[BO + 10 * SIZE], a1
1596	LDF	[BO + 11 * SIZE], a2
1597
1598	FMUL	a1, c05, c05
1599	FMUL	a1, c06, c06
1600
1601	FNMSUB	(aa2, cc05, cc07, cc07)
1602	FNMSUB	(aa2, cc06, cc08, cc08)
1603
1604	LDF	[BO + 15 * SIZE], a1
1605
1606	FMUL	a1, c07, c07
1607	FMUL	a1, c08, c08
1608#endif
1609
1610#ifdef RT
1611	LDF	[BO + 15 * SIZE], a1
1612	LDF	[BO + 14 * SIZE], a2
1613	LDF	[BO + 13 * SIZE], a3
1614	LDF	[BO + 12 * SIZE], a4
1615
1616	FMUL	a1, c08, c08
1617	FMUL	a1, c07, c07
1618
1619	FNMSUB	(aa2, cc08, cc06, cc06)
1620	FNMSUB	(aa2, cc07, cc05, cc05)
1621	FNMSUB	(aa3, cc08, cc04, cc04)
1622	FNMSUB	(aa3, cc07, cc03, cc03)
1623	FNMSUB	(aa4, cc08, cc02, cc02)
1624	FNMSUB	(aa4, cc07, cc01, cc01)
1625
1626	LDF	[BO + 10 * SIZE], a1
1627	LDF	[BO +  9 * SIZE], a2
1628	LDF	[BO +  8 * SIZE], a3
1629
1630	FMUL	a1, c06, c06
1631	FMUL	a1, c05, c05
1632
1633	FNMSUB	(aa2, cc06, cc04, cc04)
1634	FNMSUB	(aa2, cc05, cc03, cc03)
1635	FNMSUB	(aa3, cc06, cc02, cc02)
1636	FNMSUB	(aa3, cc05, cc01, cc01)
1637
1638	LDF	[BO +  5 * SIZE], a1
1639	LDF	[BO +  4 * SIZE], a2
1640
1641	FMUL	a1, c04, c04
1642	FMUL	a1, c03, c03
1643
1644	FNMSUB	(aa2, cc04, cc02, cc02)
1645	FNMSUB	(aa2, cc03, cc01, cc01)
1646
1647	LDF	[BO +  0 * SIZE], a1
1648
1649	FMUL	a1, c02, c02
1650	FMUL	a1, c01, c01
1651#endif
1652
1653#ifdef LN
1654	add	C1, -2 * SIZE, C1
1655	add	C2, -2 * SIZE, C2
1656	add	C3, -2 * SIZE, C3
1657	add	C4, -2 * SIZE, C4
1658#endif
1659
1660#if defined(LN) || defined(LT)
1661	STF	c01, [BO +  0 * SIZE]
1662	STF	c03, [BO +  1 * SIZE]
1663	STF	c05, [BO +  2 * SIZE]
1664	STF	c07, [BO +  3 * SIZE]
1665
1666	STF	c02, [BO +  4 * SIZE]
1667	STF	c04, [BO +  5 * SIZE]
1668	STF	c06, [BO +  6 * SIZE]
1669	STF	c08, [BO +  7 * SIZE]
1670#else
1671	STF	c01, [AO +  0 * SIZE]
1672	STF	c02, [AO +  1 * SIZE]
1673	STF	c03, [AO +  2 * SIZE]
1674	STF	c04, [AO +  3 * SIZE]
1675
1676	STF	c05, [AO +  4 * SIZE]
1677	STF	c06, [AO +  5 * SIZE]
1678	STF	c07, [AO +  6 * SIZE]
1679	STF	c08, [AO +  7 * SIZE]
1680#endif
1681
1682	STF	c01, [C1 + 0 * SIZE]
1683	STF	c02, [C1 + 1 * SIZE]
1684	STF	c03, [C2 + 0 * SIZE]
1685	STF	c04, [C2 + 1 * SIZE]
1686
1687	STF	c05, [C3 + 0 * SIZE]
1688	STF	c06, [C3 + 1 * SIZE]
1689	STF	c07, [C4 + 0 * SIZE]
1690	STF	c08, [C4 + 1 * SIZE]
1691
1692#ifndef LN
1693	add	C1, 2 * SIZE, C1
1694	add	C2, 2 * SIZE, C2
1695	add	C3, 2 * SIZE, C3
1696	add	C4, 2 * SIZE, C4
1697#endif
1698
1699#ifdef RT
1700	sll	K, BASE_SHIFT + 1, TEMP1
1701	add	AORIG, TEMP1, AORIG
1702#endif
1703
1704#if defined(LT) || defined(RN)
1705	sub	K, KK, TEMP1
1706	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1707	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1708	add	AO, TEMP2, AO
1709	add	BO, TEMP1, BO
1710#endif
1711
1712#ifdef LT
1713	add	KK, 2, KK
1714#endif
1715
1716#ifdef LN
1717	sub	KK, 2, KK
1718#endif
1719
1720	add	I, -1, I
1721	cmp	I, 0
1722	bg,pt	%icc, .LL32
1723	nop
1724
1725.LL40:
1726	and	M, 1, I
1727	cmp	I, 0
1728	ble,pn	%icc, .LL49
1729	nop
1730
1731#if defined(LT) || defined(RN)
1732	mov	B, BO
1733#else
1734#ifdef LN
1735	sll	K,  BASE_SHIFT + 0, TEMP1
1736	sub	AORIG, TEMP1, AORIG
1737#endif
1738
1739	sll	KK, BASE_SHIFT + 0, TEMP1
1740	sll	KK, BASE_SHIFT + 2, TEMP2
1741
1742	add	AORIG, TEMP1, AO
1743	add	B,     TEMP2, BO
1744#endif
1745
1746	LDF	[AO +  0 * SIZE], a1
1747	LDF	[AO +  1 * SIZE], a2
1748	LDF	[AO +  2 * SIZE], a3
1749	LDF	[AO +  3 * SIZE], a4
1750
1751	LDF	[BO +  0 * SIZE], b1
1752	LDF	[BO +  1 * SIZE], b2
1753	LDF	[BO +  2 * SIZE], b3
1754	LDF	[BO +  3 * SIZE], b4
1755	LDF	[BO +  4 * SIZE], b5
1756	LDF	[BO +  5 * SIZE], b6
1757	FCLR	(cc01)
1758	LDF	[BO +  6 * SIZE], b7
1759	FCLR	(cc03)
1760	LDF	[BO +  7 * SIZE], b8
1761	FCLR	(cc05)
1762	LDF	[BO +  8 * SIZE], b9
1763	FCLR	(cc07)
1764
1765#if defined(LT) || defined(RN)
1766	sra	KK, 2, L
1767#else
1768	sub	K, KK, L
1769	sra	L,  2, L
1770#endif
1771	cmp	L,  0
1772	ble,pn	%icc, .LL45
1773	nop
1774
1775.LL43:
1776	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1777	add	L, -1, L
1778
1779	FMADD	(aa1, bb1, cc01, cc01)
1780	LDF	[BO + 16 * SIZE], b1
1781	FMADD	(aa1, bb2, cc03, cc03)
1782	LDF	[BO +  9 * SIZE], b2
1783	FMADD	(aa1, bb3, cc05, cc05)
1784	LDF	[BO + 10 * SIZE], b3
1785	FMADD	(aa1, bb4, cc07, cc07)
1786	LDF	[BO + 11 * SIZE], b4
1787
1788	LDF	[AO +  4 * SIZE], a1
1789	cmp	L, 0
1790
1791	FMADD	(aa2, bb5, cc01, cc01)
1792	LDF	[BO + 12 * SIZE], b5
1793	FMADD	(aa2, bb6, cc03, cc03)
1794	LDF	[BO + 13 * SIZE], b6
1795	FMADD	(aa2, bb7, cc05, cc05)
1796	LDF	[BO + 14 * SIZE], b7
1797	FMADD	(aa2, bb8, cc07, cc07)
1798	LDF	[BO + 15 * SIZE], b8
1799
1800	LDF	[AO +  5 * SIZE], a2
1801	add	AO,  4 * SIZE, AO
1802
1803	FMADD	(aa3, bb9, cc01, cc01)
1804	LDF	[BO + 24 * SIZE], b9
1805	FMADD	(aa3, bb2, cc03, cc03)
1806	LDF	[BO + 17 * SIZE], b2
1807	FMADD	(aa3, bb3, cc05, cc05)
1808	LDF	[BO + 18 * SIZE], b3
1809	FMADD	(aa3, bb4, cc07, cc07)
1810	LDF	[BO + 19 * SIZE], b4
1811
1812	LDF	[AO +  2 * SIZE], a3
1813	add	BO, 16 * SIZE, BO
1814
1815	FMADD	(aa4, bb5, cc01, cc01)
1816	LDF	[BO +  4 * SIZE], b5
1817	FMADD	(aa4, bb6, cc03, cc03)
1818	LDF	[BO +  5 * SIZE], b6
1819	FMADD	(aa4, bb7, cc05, cc05)
1820	LDF	[BO +  6 * SIZE], b7
1821	FMADD	(aa4, bb8, cc07, cc07)
1822	LDF	[BO +  7 * SIZE], b8
1823
1824	bg,pt	%icc, .LL43
1825	LDF	[AO +  3 * SIZE], a4
1826	.align 4
1827
1828.LL45:
1829#if defined(LT) || defined(RN)
1830	and	KK, 3, L
1831#else
1832	sub	K, KK, L
1833	and	L,  3, L
1834#endif
1835	cmp	L,  0
1836	ble,a,pn %icc, .LL48
1837	nop
1838	.align 4
1839
1840.LL47:
1841	FMADD	(aa1, bb1, cc01, cc01)
1842	LDF	[BO + 4 * SIZE], b1
1843	add	L, -1, L
1844	FMADD	(aa1, bb2, cc03, cc03)
1845	LDF	[BO + 5 * SIZE], b2
1846	add	AO, 1 * SIZE, AO
1847
1848	FMADD	(aa1, bb3, cc05, cc05)
1849	LDF	[BO + 6 * SIZE], b3
1850	cmp	L, 0
1851	FMADD	(aa1, bb4, cc07, cc07)
1852	LDF	[BO + 7 * SIZE], b4
1853	add	BO, 4 * SIZE, BO
1854
1855	bg,pt	%icc, .LL47
1856	LDF	[AO + 0 * SIZE], a1
1857	.align 4
1858
1859.LL48:
1860#if defined(LN) || defined(RT)
1861#ifdef LN
1862	sub	KK, 1, TEMP1
1863#else
1864	sub	KK, 4, TEMP1
1865#endif
1866	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1867	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1868
1869	add	AORIG, TEMP2, AO
1870	add	B,     TEMP1, BO
1871#endif
1872
1873#if defined(LN) || defined(LT)
1874	LDF	[BO +  0 * SIZE], a1
1875	LDF	[BO +  1 * SIZE], a2
1876	LDF	[BO +  2 * SIZE], a3
1877	LDF	[BO +  3 * SIZE], a4
1878
1879	FSUB	a1, c01, c01
1880	FSUB	a2, c03, c03
1881	FSUB	a3, c05, c05
1882	FSUB	a4, c07, c07
1883#else
1884	LDF	[AO +  0 * SIZE], a1
1885	LDF	[AO +  1 * SIZE], a2
1886	LDF	[AO +  2 * SIZE], a3
1887	LDF	[AO +  3 * SIZE], a4
1888
1889	FSUB	a1, c01, c01
1890	FSUB	a2, c03, c03
1891	FSUB	a3, c05, c05
1892	FSUB	a4, c07, c07
1893#endif
1894
1895#if defined(LN) || defined(LT)
1896	LDF	[AO +  0 * SIZE], a1
1897
1898	FMUL	a1, c01, c01
1899	FMUL	a1, c03, c03
1900	FMUL	a1, c05, c05
1901	FMUL	a1, c07, c07
1902#endif
1903
1904#ifdef RN
1905	LDF	[BO +  0 * SIZE], a1
1906	LDF	[BO +  1 * SIZE], a2
1907	LDF	[BO +  2 * SIZE], a3
1908	LDF	[BO +  3 * SIZE], a4
1909
1910	FMUL	a1, c01, c01
1911
1912	FNMSUB	(aa2, cc01, cc03, cc03)
1913	FNMSUB	(aa3, cc01, cc05, cc05)
1914	FNMSUB	(aa4, cc01, cc07, cc07)
1915
1916	LDF	[BO +  5 * SIZE], a1
1917	LDF	[BO +  6 * SIZE], a2
1918	LDF	[BO +  7 * SIZE], a3
1919
1920	FMUL	a1, c03, c03
1921
1922	FNMSUB	(aa2, cc03, cc05, cc05)
1923	FNMSUB	(aa3, cc03, cc07, cc07)
1924
1925	LDF	[BO + 10 * SIZE], a1
1926	LDF	[BO + 11 * SIZE], a2
1927
1928	FMUL	a1, c05, c05
1929
1930	FNMSUB	(aa2, cc05, cc07, cc07)
1931
1932	LDF	[BO + 15 * SIZE], a1
1933
1934	FMUL	a1, c07, c07
1935#endif
1936
1937#ifdef RT
1938	LDF	[BO + 15 * SIZE], a1
1939	LDF	[BO + 14 * SIZE], a2
1940	LDF	[BO + 13 * SIZE], a3
1941	LDF	[BO + 12 * SIZE], a4
1942
1943	FMUL	a1, c07, c07
1944
1945	FNMSUB	(aa2, cc07, cc05, cc05)
1946	FNMSUB	(aa3, cc07, cc03, cc03)
1947	FNMSUB	(aa4, cc07, cc01, cc01)
1948
1949	LDF	[BO + 10 * SIZE], a1
1950	LDF	[BO +  9 * SIZE], a2
1951	LDF	[BO +  8 * SIZE], a3
1952
1953	FMUL	a1, c05, c05
1954
1955	FNMSUB	(aa2, cc05, cc03, cc03)
1956	FNMSUB	(aa3, cc05, cc01, cc01)
1957
1958	LDF	[BO +  5 * SIZE], a1
1959	LDF	[BO +  4 * SIZE], a2
1960
1961	FMUL	a1, c03, c03
1962
1963	FNMSUB	(aa2, cc03, cc01, cc01)
1964
1965	LDF	[BO +  0 * SIZE], a1
1966
1967	FMUL	a1, c01, c01
1968#endif
1969
1970#ifdef LN
1971	add	C1, -1 * SIZE, C1
1972	add	C2, -1 * SIZE, C2
1973	add	C3, -1 * SIZE, C3
1974	add	C4, -1 * SIZE, C4
1975#endif
1976
1977#if defined(LN) || defined(LT)
1978	STF	c01, [BO +  0 * SIZE]
1979	STF	c03, [BO +  1 * SIZE]
1980	STF	c05, [BO +  2 * SIZE]
1981	STF	c07, [BO +  3 * SIZE]
1982#else
1983	STF	c01, [AO +  0 * SIZE]
1984	STF	c03, [AO +  1 * SIZE]
1985	STF	c05, [AO +  2 * SIZE]
1986	STF	c07, [AO +  3 * SIZE]
1987#endif
1988
1989	STF	c01, [C1 + 0 * SIZE]
1990	STF	c03, [C2 + 0 * SIZE]
1991	STF	c05, [C3 + 0 * SIZE]
1992	STF	c07, [C4 + 0 * SIZE]
1993
1994#ifdef RT
1995	sll	K, BASE_SHIFT + 0, TEMP1
1996	add	AORIG, TEMP1, AORIG
1997#endif
1998
1999#if defined(LT) || defined(RN)
2000	sub	K, KK, TEMP1
2001	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2002	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2003	add	AO, TEMP2, AO
2004	add	BO, TEMP1, BO
2005#endif
2006
2007#ifdef LT
2008	add	KK, 1, KK
2009#endif
2010
2011#ifdef LN
2012	sub	KK, 1, KK
2013#endif
2014	.align 4
2015
2016.LL49:
2017#ifdef LN
2018	sll	K, BASE_SHIFT + 2, TEMP1
2019	add	B, TEMP1, B
2020#endif
2021
2022#if defined(LT) || defined(RN)
2023	mov	BO, B
2024#endif
2025
2026#ifdef RN
2027	add	KK, 4, KK
2028#endif
2029
2030#ifdef RT
2031	sub	KK, 4, KK
2032#endif
2033	.align 4
2034
2035.LL10:
2036	sra	N, 3, J
2037	cmp	J, 0
2038	ble,pn	%icc, .LL999
2039	nop
2040	.align 4
2041
2042.LL11:
2043#ifdef RT
2044	sll	K, BASE_SHIFT + 3, TEMP1
2045	sub	B, TEMP1, B
2046#endif
2047
2048#ifndef RT
2049	mov	C,  C1
2050	add	C,  LDC, C2
2051	add	C2, LDC, C3
2052	add	C3, LDC, C4
2053	add	C4, LDC, C5
2054	add	C5, LDC, C6
2055	add	C6, LDC, C7
2056	add	C7, LDC, C8
2057	add	C8, LDC, C
2058#else
2059	sub	C,  LDC, C8
2060	sub	C8, LDC, C7
2061	sub	C7, LDC, C6
2062	sub	C6, LDC, C5
2063	sub	C5, LDC, C4
2064	sub	C4, LDC, C3
2065	sub	C3, LDC, C2
2066	sub	C2, LDC, C1
2067	sub	C2, LDC, C
2068#endif
2069
2070#ifdef LN
2071	add	M, OFFSET, KK
2072#endif
2073
2074#ifdef LT
2075	mov	OFFSET, KK
2076#endif
2077
2078#if defined(LN) || defined(RT)
2079	mov	A, AORIG
2080#else
2081	mov	A, AO
2082#endif
2083
2084	sra	M, 1, I
2085	cmp	I, 0
2086	ble,pn	%icc, .LL20
2087	nop
2088	.align 4
2089
2090.LL12:
2091#if defined(LT) || defined(RN)
2092	mov	B, BO
2093#else
2094#ifdef LN
2095	sll	K,  BASE_SHIFT + 1, TEMP1
2096	sub	AORIG, TEMP1, AORIG
2097#endif
2098
2099	sll	KK, BASE_SHIFT + 1, TEMP1
2100	sll	KK, BASE_SHIFT + 3, TEMP2
2101
2102	add	AORIG, TEMP1, AO
2103	add	B,     TEMP2, BO
2104#endif
2105
2106	LDF	[AO +  0 * SIZE], a1
2107	LDF	[AO +  1 * SIZE], a2
2108	LDF	[AO +  8 * SIZE], a5
2109
2110	LDF	[BO +  0 * SIZE], b1
2111
2112	LDF	[BO +  1 * SIZE], b2
2113	FCLR	(cc01)
2114	LDF	[BO +  2 * SIZE], b3
2115	FCLR	(cc05)
2116	LDF	[BO +  3 * SIZE], b4
2117	FCLR	(cc09)
2118	LDF	[BO +  4 * SIZE], b5
2119	FCLR	(cc13)
2120
2121	LDF	[BO +  5 * SIZE], b6
2122	FCLR	(cc02)
2123	LDF	[BO +  6 * SIZE], b7
2124	FCLR	(cc06)
2125	LDF	[BO +  7 * SIZE], b8
2126	FCLR	(cc10)
2127	LDF	[BO +  8 * SIZE], b9
2128	FCLR	(cc14)
2129
2130	prefetch [C1 + 1 * SIZE], 3
2131	FCLR	(cc03)
2132	prefetch [C2 + 2 * SIZE], 3
2133	FCLR	(cc07)
2134	prefetch [C3 + 1 * SIZE], 3
2135	FCLR	(cc11)
2136	prefetch [C4 + 2 * SIZE], 3
2137	FCLR	(cc15)
2138
2139	prefetch [C5 + 1 * SIZE], 3
2140	FCLR	(cc04)
2141	prefetch [C6 + 2 * SIZE], 3
2142	FCLR	(cc08)
2143	prefetch [C7 + 1 * SIZE], 3
2144	FCLR	(cc12)
2145	prefetch [C8 + 2 * SIZE], 3
2146	FCLR	(cc16)
2147
2148#if defined(LT) || defined(RN)
2149	sra	KK, 3, L
2150#else
2151	sub	K, KK, L
2152	sra	L,  3, L
2153#endif
2154	cmp	L,  0
2155	ble,pn	%icc, .LL15
2156	nop
2157	.align 4
2158
2159.LL13:
2160	FMADD	(aa1, bb1, cc01, cc01)
2161	FMADD	(aa2, bb1, cc02, cc02)
2162	FMADD	(aa1, bb2, cc03, cc03)
2163	FMADD	(aa2, bb2, cc04, cc04)
2164
2165	FMADD	(aa1, bb3, cc05, cc05)
2166	LDF	[BO + 16 * SIZE], b1
2167	FMADD	(aa2, bb3, cc06, cc06)
2168	LDF	[BO +  9 * SIZE], b2
2169
2170	FMADD	(aa1, bb4, cc07, cc07)
2171	LDF	[BO + 10 * SIZE], b3
2172	FMADD	(aa2, bb4, cc08, cc08)
2173	LDF	[BO + 11 * SIZE], b4
2174
2175	FMADD	(aa1, bb5, cc09, cc09)
2176	LDF	[AO +  2 * SIZE], a3
2177	FMADD	(aa2, bb5, cc10, cc10)
2178	LDF	[AO +  3 * SIZE], a4
2179
2180	FMADD	(aa1, bb6, cc11, cc11)
2181	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2182	FMADD	(aa2, bb6, cc12, cc12)
2183	nop
2184
2185	FMADD	(aa1, bb7, cc13, cc13)
2186	LDF	[BO + 12 * SIZE], b5
2187	FMADD	(aa2, bb7, cc14, cc14)
2188	LDF	[BO + 13 * SIZE], b6
2189
2190	FMADD	(aa1, bb8, cc15, cc15)
2191	LDF	[BO + 14 * SIZE], b7
2192	FMADD	(aa2, bb8, cc16, cc16)
2193	LDF	[BO + 15 * SIZE], b8
2194
2195	FMADD	(aa3, bb9, cc01, cc01)
2196	FMADD	(aa4, bb9, cc02, cc02)
2197	FMADD	(aa3, bb2, cc03, cc03)
2198	FMADD	(aa4, bb2, cc04, cc04)
2199
2200	FMADD	(aa3, bb3, cc05, cc05)
2201	LDF	[BO + 24 * SIZE], b9
2202	FMADD	(aa4, bb3, cc06, cc06)
2203	LDF	[BO + 17 * SIZE], b2
2204
2205	FMADD	(aa3, bb4, cc07, cc07)
2206	LDF	[BO + 18 * SIZE], b3
2207	FMADD	(aa4, bb4, cc08, cc08)
2208	LDF	[BO + 19 * SIZE], b4
2209
2210	FMADD	(aa3, bb5, cc09, cc09)
2211	LDF	[AO +  4 * SIZE], a1
2212	FMADD	(aa4, bb5, cc10, cc10)
2213	LDF	[AO +  5 * SIZE], a2
2214
2215	FMADD	(aa3, bb6, cc11, cc11)
2216	add	L, -1, L
2217	FMADD	(aa4, bb6, cc12, cc12)
2218	nop
2219
2220	FMADD	(aa3, bb7, cc13, cc13)
2221	LDF	[BO + 20 * SIZE], b5
2222	FMADD	(aa4, bb7, cc14, cc14)
2223	LDF	[BO + 21 * SIZE], b6
2224
2225	FMADD	(aa3, bb8, cc15, cc15)
2226	LDF	[BO + 22 * SIZE], b7
2227	FMADD	(aa4, bb8, cc16, cc16)
2228	LDF	[BO + 23 * SIZE], b8
2229
2230	FMADD	(aa1, bb1, cc01, cc01)
2231	FMADD	(aa2, bb1, cc02, cc02)
2232	FMADD	(aa1, bb2, cc03, cc03)
2233	FMADD	(aa2, bb2, cc04, cc04)
2234
2235	FMADD	(aa1, bb3, cc05, cc05)
2236	LDF	[BO + 32 * SIZE], b1
2237	FMADD	(aa2, bb3, cc06, cc06)
2238	LDF	[BO + 25 * SIZE], b2
2239
2240	FMADD	(aa1, bb4, cc07, cc07)
2241	LDF	[BO + 26 * SIZE], b3
2242	FMADD	(aa2, bb4, cc08, cc08)
2243	LDF	[BO + 27 * SIZE], b4
2244
2245	FMADD	(aa1, bb5, cc09, cc09)
2246	LDF	[AO +  6 * SIZE], a3
2247	FMADD	(aa2, bb5, cc10, cc10)
2248	LDF	[AO +  7 * SIZE], a4
2249
2250	FMADD	(aa1, bb6, cc11, cc11)
2251	nop
2252	FMADD	(aa2, bb6, cc12, cc12)
2253	nop
2254
2255	FMADD	(aa1, bb7, cc13, cc13)
2256	LDF	[BO + 28 * SIZE], b5
2257	FMADD	(aa2, bb7, cc14, cc14)
2258	LDF	[BO + 29 * SIZE], b6
2259
2260	FMADD	(aa1, bb8, cc15, cc15)
2261	LDF	[BO + 30 * SIZE], b7
2262	FMADD	(aa2, bb8, cc16, cc16)
2263	LDF	[BO + 31 * SIZE], b8
2264
2265	FMADD	(aa3, bb9, cc01, cc01)
2266	FMADD	(aa4, bb9, cc02, cc02)
2267	FMADD	(aa3, bb2, cc03, cc03)
2268	FMADD	(aa4, bb2, cc04, cc04)
2269
2270	FMADD	(aa3, bb3, cc05, cc05)
2271	LDF	[BO + 40 * SIZE], b9
2272	FMADD	(aa4, bb3, cc06, cc06)
2273	LDF	[BO + 33 * SIZE], b2
2274
2275	FMADD	(aa3, bb4, cc07, cc07)
2276	LDF	[BO + 34 * SIZE], b3
2277	FMADD	(aa4, bb4, cc08, cc08)
2278	LDF	[BO + 35 * SIZE], b4
2279
2280	FMADD	(aa3, bb5, cc09, cc09)
2281	LDF	[AO + 16 * SIZE], a1  /****/
2282	FMADD	(aa4, bb5, cc10, cc10)
2283	LDF	[AO +  9 * SIZE], a2
2284
2285	FMADD	(aa3, bb6, cc11, cc11)
2286	nop
2287	FMADD	(aa4, bb6, cc12, cc12)
2288	nop
2289
2290	FMADD	(aa3, bb7, cc13, cc13)
2291	LDF	[BO + 36 * SIZE], b5
2292	FMADD	(aa4, bb7, cc14, cc14)
2293	LDF	[BO + 37 * SIZE], b6
2294
2295	FMADD	(aa3, bb8, cc15, cc15)
2296	LDF	[BO + 38 * SIZE], b7
2297	FMADD	(aa4, bb8, cc16, cc16)
2298	LDF	[BO + 39 * SIZE], b8
2299
2300	FMADD	(aa5, bb1, cc01, cc01)
2301	FMADD	(aa2, bb1, cc02, cc02)
2302	FMADD	(aa5, bb2, cc03, cc03)
2303	FMADD	(aa2, bb2, cc04, cc04)
2304
2305	FMADD	(aa5, bb3, cc05, cc05)
2306	LDF	[BO + 48 * SIZE], b1
2307	FMADD	(aa2, bb3, cc06, cc06)
2308	LDF	[BO + 41 * SIZE], b2
2309
2310	FMADD	(aa5, bb4, cc07, cc07)
2311	LDF	[BO + 42 * SIZE], b3
2312	FMADD	(aa2, bb4, cc08, cc08)
2313	LDF	[BO + 43 * SIZE], b4
2314
2315	FMADD	(aa5, bb5, cc09, cc09)
2316	LDF	[AO + 10 * SIZE], a3
2317	FMADD	(aa2, bb5, cc10, cc10)
2318	LDF	[AO + 11 * SIZE], a4
2319
2320	FMADD	(aa5, bb6, cc11, cc11)
2321	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
2322	FMADD	(aa2, bb6, cc12, cc12)
2323	nop
2324
2325	FMADD	(aa5, bb7, cc13, cc13)
2326	LDF	[BO + 44 * SIZE], b5
2327	FMADD	(aa2, bb7, cc14, cc14)
2328	LDF	[BO + 45 * SIZE], b6
2329
2330	FMADD	(aa5, bb8, cc15, cc15)
2331	LDF	[BO + 46 * SIZE], b7
2332	FMADD	(aa2, bb8, cc16, cc16)
2333	LDF	[BO + 47 * SIZE], b8
2334
2335	FMADD	(aa3, bb9, cc01, cc01)
2336	FMADD	(aa4, bb9, cc02, cc02)
2337	FMADD	(aa3, bb2, cc03, cc03)
2338	FMADD	(aa4, bb2, cc04, cc04)
2339
2340	FMADD	(aa3, bb3, cc05, cc05)
2341	LDF	[BO + 56 * SIZE], b9
2342	FMADD	(aa4, bb3, cc06, cc06)
2343	LDF	[BO + 49 * SIZE], b2
2344
2345	FMADD	(aa3, bb4, cc07, cc07)
2346	LDF	[BO + 50 * SIZE], b3
2347	FMADD	(aa4, bb4, cc08, cc08)
2348	LDF	[BO + 51 * SIZE], b4
2349
2350	FMADD	(aa3, bb5, cc09, cc09)
2351	LDF	[AO + 12 * SIZE], a5
2352	FMADD	(aa4, bb5, cc10, cc10)
2353	LDF	[AO + 13 * SIZE], a2
2354
2355	FMADD	(aa3, bb6, cc11, cc11)
2356	cmp	L, 0
2357	FMADD	(aa4, bb6, cc12, cc12)
2358	nop
2359
2360	FMADD	(aa3, bb7, cc13, cc13)
2361	LDF	[BO + 52 * SIZE], b5
2362	FMADD	(aa4, bb7, cc14, cc14)
2363	LDF	[BO + 53 * SIZE], b6
2364
2365	FMADD	(aa3, bb8, cc15, cc15)
2366	LDF	[BO + 54 * SIZE], b7
2367	FMADD	(aa4, bb8, cc16, cc16)
2368	LDF	[BO + 55 * SIZE], b8
2369
2370	FMADD	(aa5, bb1, cc01, cc01)
2371	FMADD	(aa2, bb1, cc02, cc02)
2372	FMADD	(aa5, bb2, cc03, cc03)
2373	FMADD	(aa2, bb2, cc04, cc04)
2374
2375	FMADD	(aa5, bb3, cc05, cc05)
2376	LDF	[BO + 64 * SIZE], b1
2377	FMADD	(aa2, bb3, cc06, cc06)
2378	LDF	[BO + 57 * SIZE], b2
2379
2380	FMADD	(aa5, bb4, cc07, cc07)
2381	LDF	[BO + 58 * SIZE], b3
2382	FMADD	(aa2, bb4, cc08, cc08)
2383	LDF	[BO + 59 * SIZE], b4
2384
2385	FMADD	(aa5, bb5, cc09, cc09)
2386	LDF	[AO + 14 * SIZE], a3
2387	FMADD	(aa2, bb5, cc10, cc10)
2388	LDF	[AO + 15 * SIZE], a4
2389
2390	FMADD	(aa5, bb6, cc11, cc11)
2391	add	BO, 64 * SIZE, BO
2392	FMADD	(aa2, bb6, cc12, cc12)
2393	add	AO, 16 * SIZE, AO
2394
2395	FMADD	(aa5, bb7, cc13, cc13)
2396	LDF	[BO -  4 * SIZE], b5
2397	FMADD	(aa2, bb7, cc14, cc14)
2398	LDF	[BO -  3 * SIZE], b6
2399
2400	FMADD	(aa5, bb8, cc15, cc15)
2401	LDF	[BO -  2 * SIZE], b7
2402	FMADD	(aa2, bb8, cc16, cc16)
2403	LDF	[BO -  1 * SIZE], b8
2404
2405	FMADD	(aa3, bb9, cc01, cc01)
2406	FMADD	(aa4, bb9, cc02, cc02)
2407	FMADD	(aa3, bb2, cc03, cc03)
2408	FMADD	(aa4, bb2, cc04, cc04)
2409
2410	FMADD	(aa3, bb3, cc05, cc05)
2411	LDF	[BO +  8 * SIZE], b9
2412	FMADD	(aa4, bb3, cc06, cc06)
2413	LDF	[BO +  1 * SIZE], b2
2414
2415	FMADD	(aa3, bb4, cc07, cc07)
2416	LDF	[BO +  2 * SIZE], b3
2417	FMADD	(aa4, bb4, cc08, cc08)
2418	LDF	[BO +  3 * SIZE], b4
2419
2420	FMADD	(aa3, bb5, cc09, cc09)
2421	LDF	[AO +  8 * SIZE], a5  /****/
2422	FMADD	(aa4, bb5, cc10, cc10)
2423	LDF	[AO +  1 * SIZE], a2
2424
2425	FMADD	(aa3, bb6, cc11, cc11)
2426	FMADD	(aa4, bb6, cc12, cc12)
2427
2428	FMADD	(aa3, bb7, cc13, cc13)
2429	LDF	[BO +  4 * SIZE], b5
2430	FMADD	(aa4, bb7, cc14, cc14)
2431	LDF	[BO +  5 * SIZE], b6
2432
2433	FMADD	(aa3, bb8, cc15, cc15)
2434	LDF	[BO +  6 * SIZE], b7
2435	FMADD	(aa4, bb8, cc16, cc16)
2436	ble,pn	%icc, .LL15
2437	LDF	[BO +  7 * SIZE], b8
2438
2439	FMADD	(aa1, bb1, cc01, cc01)
2440	FMADD	(aa2, bb1, cc02, cc02)
2441	FMADD	(aa1, bb2, cc03, cc03)
2442	FMADD	(aa2, bb2, cc04, cc04)
2443
2444	FMADD	(aa1, bb3, cc05, cc05)
2445	LDF	[BO + 16 * SIZE], b1
2446	FMADD	(aa2, bb3, cc06, cc06)
2447	LDF	[BO +  9 * SIZE], b2
2448
2449	FMADD	(aa1, bb4, cc07, cc07)
2450	LDF	[BO + 10 * SIZE], b3
2451	FMADD	(aa2, bb4, cc08, cc08)
2452	LDF	[BO + 11 * SIZE], b4
2453
2454	FMADD	(aa1, bb5, cc09, cc09)
2455	LDF	[AO +  2 * SIZE], a3
2456	FMADD	(aa2, bb5, cc10, cc10)
2457	LDF	[AO +  3 * SIZE], a4
2458
2459	FMADD	(aa1, bb6, cc11, cc11)
2460	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2461	FMADD	(aa2, bb6, cc12, cc12)
2462	nop
2463
2464	FMADD	(aa1, bb7, cc13, cc13)
2465	LDF	[BO + 12 * SIZE], b5
2466	FMADD	(aa2, bb7, cc14, cc14)
2467	LDF	[BO + 13 * SIZE], b6
2468
2469	FMADD	(aa1, bb8, cc15, cc15)
2470	LDF	[BO + 14 * SIZE], b7
2471	FMADD	(aa2, bb8, cc16, cc16)
2472	LDF	[BO + 15 * SIZE], b8
2473
2474	FMADD	(aa3, bb9, cc01, cc01)
2475	FMADD	(aa4, bb9, cc02, cc02)
2476	FMADD	(aa3, bb2, cc03, cc03)
2477	FMADD	(aa4, bb2, cc04, cc04)
2478
2479	FMADD	(aa3, bb3, cc05, cc05)
2480	LDF	[BO + 24 * SIZE], b9
2481	FMADD	(aa4, bb3, cc06, cc06)
2482	LDF	[BO + 17 * SIZE], b2
2483
2484	FMADD	(aa3, bb4, cc07, cc07)
2485	LDF	[BO + 18 * SIZE], b3
2486	FMADD	(aa4, bb4, cc08, cc08)
2487	LDF	[BO + 19 * SIZE], b4
2488
2489	FMADD	(aa3, bb5, cc09, cc09)
2490	LDF	[AO +  4 * SIZE], a1
2491	FMADD	(aa4, bb5, cc10, cc10)
2492	LDF	[AO +  5 * SIZE], a2
2493
2494	FMADD	(aa3, bb6, cc11, cc11)
2495	add	L, -1, L
2496	FMADD	(aa4, bb6, cc12, cc12)
2497	nop
2498
2499	FMADD	(aa3, bb7, cc13, cc13)
2500	LDF	[BO + 20 * SIZE], b5
2501	FMADD	(aa4, bb7, cc14, cc14)
2502	LDF	[BO + 21 * SIZE], b6
2503
2504	FMADD	(aa3, bb8, cc15, cc15)
2505	LDF	[BO + 22 * SIZE], b7
2506	FMADD	(aa4, bb8, cc16, cc16)
2507	LDF	[BO + 23 * SIZE], b8
2508
2509	FMADD	(aa1, bb1, cc01, cc01)
2510	FMADD	(aa2, bb1, cc02, cc02)
2511	FMADD	(aa1, bb2, cc03, cc03)
2512	FMADD	(aa2, bb2, cc04, cc04)
2513
2514	FMADD	(aa1, bb3, cc05, cc05)
2515	LDF	[BO + 32 * SIZE], b1
2516	FMADD	(aa2, bb3, cc06, cc06)
2517	LDF	[BO + 25 * SIZE], b2
2518
2519	FMADD	(aa1, bb4, cc07, cc07)
2520	LDF	[BO + 26 * SIZE], b3
2521	FMADD	(aa2, bb4, cc08, cc08)
2522	LDF	[BO + 27 * SIZE], b4
2523
2524	FMADD	(aa1, bb5, cc09, cc09)
2525	LDF	[AO +  6 * SIZE], a3
2526	FMADD	(aa2, bb5, cc10, cc10)
2527	LDF	[AO +  7 * SIZE], a4
2528
2529	FMADD	(aa1, bb6, cc11, cc11)
2530	nop
2531	FMADD	(aa2, bb6, cc12, cc12)
2532	nop
2533
2534	FMADD	(aa1, bb7, cc13, cc13)
2535	LDF	[BO + 28 * SIZE], b5
2536	FMADD	(aa2, bb7, cc14, cc14)
2537	LDF	[BO + 29 * SIZE], b6
2538
2539	FMADD	(aa1, bb8, cc15, cc15)
2540	LDF	[BO + 30 * SIZE], b7
2541	FMADD	(aa2, bb8, cc16, cc16)
2542	LDF	[BO + 31 * SIZE], b8
2543
2544	FMADD	(aa3, bb9, cc01, cc01)
2545	FMADD	(aa4, bb9, cc02, cc02)
2546	FMADD	(aa3, bb2, cc03, cc03)
2547	FMADD	(aa4, bb2, cc04, cc04)
2548
2549	FMADD	(aa3, bb3, cc05, cc05)
2550	LDF	[BO + 40 * SIZE], b9
2551	FMADD	(aa4, bb3, cc06, cc06)
2552	LDF	[BO + 33 * SIZE], b2
2553
2554	FMADD	(aa3, bb4, cc07, cc07)
2555	LDF	[BO + 34 * SIZE], b3
2556	FMADD	(aa4, bb4, cc08, cc08)
2557	LDF	[BO + 35 * SIZE], b4
2558
2559	FMADD	(aa3, bb5, cc09, cc09)
2560	LDF	[AO + 16 * SIZE], a1  /****/
2561	FMADD	(aa4, bb5, cc10, cc10)
2562	LDF	[AO +  9 * SIZE], a2
2563
2564	FMADD	(aa3, bb6, cc11, cc11)
2565	nop
2566	FMADD	(aa4, bb6, cc12, cc12)
2567	nop
2568
2569	FMADD	(aa3, bb7, cc13, cc13)
2570	LDF	[BO + 36 * SIZE], b5
2571	FMADD	(aa4, bb7, cc14, cc14)
2572	LDF	[BO + 37 * SIZE], b6
2573
2574	FMADD	(aa3, bb8, cc15, cc15)
2575	LDF	[BO + 38 * SIZE], b7
2576	FMADD	(aa4, bb8, cc16, cc16)
2577	LDF	[BO + 39 * SIZE], b8
2578
2579	FMADD	(aa5, bb1, cc01, cc01)
2580	FMADD	(aa2, bb1, cc02, cc02)
2581	FMADD	(aa5, bb2, cc03, cc03)
2582	FMADD	(aa2, bb2, cc04, cc04)
2583
2584	FMADD	(aa5, bb3, cc05, cc05)
2585	LDF	[BO + 48 * SIZE], b1
2586	FMADD	(aa2, bb3, cc06, cc06)
2587	LDF	[BO + 41 * SIZE], b2
2588
2589	FMADD	(aa5, bb4, cc07, cc07)
2590	LDF	[BO + 42 * SIZE], b3
2591	FMADD	(aa2, bb4, cc08, cc08)
2592	LDF	[BO + 43 * SIZE], b4
2593
2594	FMADD	(aa5, bb5, cc09, cc09)
2595	LDF	[AO + 10 * SIZE], a3
2596	FMADD	(aa2, bb5, cc10, cc10)
2597	LDF	[AO + 11 * SIZE], a4
2598
2599	FMADD	(aa5, bb6, cc11, cc11)
2600	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
2601	FMADD	(aa2, bb6, cc12, cc12)
2602	nop
2603
2604	FMADD	(aa5, bb7, cc13, cc13)
2605	LDF	[BO + 44 * SIZE], b5
2606	FMADD	(aa2, bb7, cc14, cc14)
2607	LDF	[BO + 45 * SIZE], b6
2608
2609	FMADD	(aa5, bb8, cc15, cc15)
2610	LDF	[BO + 46 * SIZE], b7
2611	FMADD	(aa2, bb8, cc16, cc16)
2612	LDF	[BO + 47 * SIZE], b8
2613
2614	FMADD	(aa3, bb9, cc01, cc01)
2615	FMADD	(aa4, bb9, cc02, cc02)
2616	FMADD	(aa3, bb2, cc03, cc03)
2617	FMADD	(aa4, bb2, cc04, cc04)
2618
2619	FMADD	(aa3, bb3, cc05, cc05)
2620	LDF	[BO + 56 * SIZE], b9
2621	FMADD	(aa4, bb3, cc06, cc06)
2622	LDF	[BO + 49 * SIZE], b2
2623
2624	FMADD	(aa3, bb4, cc07, cc07)
2625	LDF	[BO + 50 * SIZE], b3
2626	FMADD	(aa4, bb4, cc08, cc08)
2627	LDF	[BO + 51 * SIZE], b4
2628
2629	FMADD	(aa3, bb5, cc09, cc09)
2630	LDF	[AO + 12 * SIZE], a5
2631	FMADD	(aa4, bb5, cc10, cc10)
2632	LDF	[AO + 13 * SIZE], a2
2633
2634	FMADD	(aa3, bb6, cc11, cc11)
2635	cmp	L, 0
2636	FMADD	(aa4, bb6, cc12, cc12)
2637	nop
2638
2639	FMADD	(aa3, bb7, cc13, cc13)
2640	LDF	[BO + 52 * SIZE], b5
2641	FMADD	(aa4, bb7, cc14, cc14)
2642	LDF	[BO + 53 * SIZE], b6
2643
2644	FMADD	(aa3, bb8, cc15, cc15)
2645	LDF	[BO + 54 * SIZE], b7
2646	FMADD	(aa4, bb8, cc16, cc16)
2647	LDF	[BO + 55 * SIZE], b8
2648
2649	FMADD	(aa5, bb1, cc01, cc01)
2650	FMADD	(aa2, bb1, cc02, cc02)
2651	FMADD	(aa5, bb2, cc03, cc03)
2652	FMADD	(aa2, bb2, cc04, cc04)
2653
2654	FMADD	(aa5, bb3, cc05, cc05)
2655	LDF	[BO + 64 * SIZE], b1
2656	FMADD	(aa2, bb3, cc06, cc06)
2657	LDF	[BO + 57 * SIZE], b2
2658
2659	FMADD	(aa5, bb4, cc07, cc07)
2660	LDF	[BO + 58 * SIZE], b3
2661	FMADD	(aa2, bb4, cc08, cc08)
2662	LDF	[BO + 59 * SIZE], b4
2663
2664	FMADD	(aa5, bb5, cc09, cc09)
2665	LDF	[AO + 14 * SIZE], a3
2666	FMADD	(aa2, bb5, cc10, cc10)
2667	LDF	[AO + 15 * SIZE], a4
2668
2669	FMADD	(aa5, bb6, cc11, cc11)
2670	add	BO, 64 * SIZE, BO
2671	FMADD	(aa2, bb6, cc12, cc12)
2672	add	AO, 16 * SIZE, AO
2673
2674	FMADD	(aa5, bb7, cc13, cc13)
2675	LDF	[BO -  4 * SIZE], b5
2676	FMADD	(aa2, bb7, cc14, cc14)
2677	LDF	[BO -  3 * SIZE], b6
2678
2679	FMADD	(aa5, bb8, cc15, cc15)
2680	LDF	[BO -  2 * SIZE], b7
2681	FMADD	(aa2, bb8, cc16, cc16)
2682	LDF	[BO -  1 * SIZE], b8
2683
2684	FMADD	(aa3, bb9, cc01, cc01)
2685	FMADD	(aa4, bb9, cc02, cc02)
2686	FMADD	(aa3, bb2, cc03, cc03)
2687	FMADD	(aa4, bb2, cc04, cc04)
2688
2689	FMADD	(aa3, bb3, cc05, cc05)
2690	LDF	[BO +  8 * SIZE], b9
2691	FMADD	(aa4, bb3, cc06, cc06)
2692	LDF	[BO +  1 * SIZE], b2
2693
2694	FMADD	(aa3, bb4, cc07, cc07)
2695	LDF	[BO +  2 * SIZE], b3
2696	FMADD	(aa4, bb4, cc08, cc08)
2697	LDF	[BO +  3 * SIZE], b4
2698
2699	FMADD	(aa3, bb5, cc09, cc09)
2700	LDF	[AO +  8 * SIZE], a5  /****/
2701	FMADD	(aa4, bb5, cc10, cc10)
2702	LDF	[AO +  1 * SIZE], a2
2703
2704	FMADD	(aa3, bb6, cc11, cc11)
2705	FMADD	(aa4, bb6, cc12, cc12)
2706
2707	FMADD	(aa3, bb7, cc13, cc13)
2708	LDF	[BO +  4 * SIZE], b5
2709	FMADD	(aa4, bb7, cc14, cc14)
2710	LDF	[BO +  5 * SIZE], b6
2711
2712	FMADD	(aa3, bb8, cc15, cc15)
2713	LDF	[BO +  6 * SIZE], b7
2714	FMADD	(aa4, bb8, cc16, cc16)
2715	bg,pt	%icc, .LL13
2716	LDF	[BO +  7 * SIZE], b8
2717	.align 4
2718
2719.LL15:
2720#if defined(LT) || defined(RN)
2721	and	KK, 7, L
2722#else
2723	sub	K, KK, L
2724	and	L,  7, L
2725#endif
2726	cmp	L,  0
2727	ble,a,pn %icc, .LL18
2728	nop
2729	.align 4
2730
2731.LL17:
2732	FMADD	(aa1, bb1, cc01, cc01)
2733	add	L, -1, L
2734	FMADD	(aa2, bb1, cc02, cc02)
2735	nop
2736
2737	FMADD	(aa1, bb2, cc03, cc03)
2738	LDF	[BO +  8 * SIZE], b1
2739	FMADD	(aa2, bb2, cc04, cc04)
2740	LDF	[BO +  9 * SIZE], b2
2741
2742	FMADD	(aa1, bb3, cc05, cc05)
2743	cmp	L, 0
2744	FMADD	(aa2, bb3, cc06, cc06)
2745	nop
2746
2747	FMADD	(aa1, bb4, cc07, cc07)
2748	LDF	[BO + 10 * SIZE], b3
2749	FMADD	(aa2, bb4, cc08, cc08)
2750	LDF	[BO + 11 * SIZE], b4
2751
2752	FMADD	(aa1, bb5, cc09, cc09)
2753	nop
2754	FMADD	(aa2, bb5, cc10, cc10)
2755	nop
2756
2757	FMADD	(aa1, bb6, cc11, cc11)
2758	LDF	[BO + 12 * SIZE], b5
2759	FMADD	(aa2, bb6, cc12, cc12)
2760	LDF	[BO + 13 * SIZE], b6
2761
2762	FMADD	(aa1, bb7, cc13, cc13)
2763	add	AO, 2 * SIZE, AO
2764	FMADD	(aa2, bb7, cc14, cc14)
2765	add	BO, 8 * SIZE, BO
2766
2767	FMADD	(aa1, bb8, cc15, cc15)
2768	LDF	[AO +  0 * SIZE], a1
2769	FMADD	(aa2, bb8, cc16, cc16)
2770	LDF	[AO +  1 * SIZE], a2
2771
2772	LDF	[BO +  6 * SIZE], b7
2773	bg,pt	%icc, .LL17
2774	LDF	[BO +  7 * SIZE], b8
2775	nop
2776	.align 4
2777
2778.LL18:
2779#if defined(LN) || defined(RT)
2780#ifdef LN
2781	sub	KK, 2, TEMP1
2782#else
2783	sub	KK, 8, TEMP1
2784#endif
2785	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2786	sll	TEMP1, BASE_SHIFT + 3, TEMP1
2787
2788	add	AORIG, TEMP2, AO
2789	add	B,     TEMP1, BO
2790#endif
2791
2792#if defined(LN) || defined(LT)
2793	LDF	[BO +  0 * SIZE], a1
2794	LDF	[BO +  1 * SIZE], a2
2795	LDF	[BO +  2 * SIZE], a3
2796	LDF	[BO +  3 * SIZE], a4
2797
2798	LDF	[BO +  4 * SIZE], b1
2799	LDF	[BO +  5 * SIZE], b2
2800	LDF	[BO +  6 * SIZE], b3
2801	LDF	[BO +  7 * SIZE], b4
2802
2803	FSUB	a1, c01, c01
2804	FSUB	a2, c03, c03
2805	FSUB	a3, c05, c05
2806	FSUB	a4, c07, c07
2807
2808	FSUB	b1, c09, c09
2809	FSUB	b2, c11, c11
2810	FSUB	b3, c13, c13
2811	FSUB	b4, c15, c15
2812
2813	LDF	[BO +  8 * SIZE], a1
2814	LDF	[BO +  9 * SIZE], a2
2815	LDF	[BO + 10 * SIZE], a3
2816	LDF	[BO + 11 * SIZE], a4
2817
2818	LDF	[BO + 12 * SIZE], b1
2819	LDF	[BO + 13 * SIZE], b2
2820	LDF	[BO + 14 * SIZE], b3
2821	LDF	[BO + 15 * SIZE], b4
2822
2823	FSUB	a1, c02, c02
2824	FSUB	a2, c04, c04
2825	FSUB	a3, c06, c06
2826	FSUB	a4, c08, c08
2827
2828	FSUB	b1, c10, c10
2829	FSUB	b2, c12, c12
2830	FSUB	b3, c14, c14
2831	FSUB	b4, c16, c16
2832#else
2833	LDF	[AO +  0 * SIZE], a1
2834	LDF	[AO +  1 * SIZE], a2
2835	LDF	[AO +  2 * SIZE], a3
2836	LDF	[AO +  3 * SIZE], a4
2837
2838	LDF	[AO +  4 * SIZE], b1
2839	LDF	[AO +  5 * SIZE], b2
2840	LDF	[AO +  6 * SIZE], b3
2841	LDF	[AO +  7 * SIZE], b4
2842
2843	FSUB	a1, c01, c01
2844	FSUB	a2, c02, c02
2845	FSUB	a3, c03, c03
2846	FSUB	a4, c04, c04
2847
2848	FSUB	b1, c05, c05
2849	FSUB	b2, c06, c06
2850	FSUB	b3, c07, c07
2851	FSUB	b4, c08, c08
2852
2853	LDF	[AO +  8 * SIZE], a1
2854	LDF	[AO +  9 * SIZE], a2
2855	LDF	[AO + 10 * SIZE], a3
2856	LDF	[AO + 11 * SIZE], a4
2857
2858	LDF	[AO + 12 * SIZE], b1
2859	LDF	[AO + 13 * SIZE], b2
2860	LDF	[AO + 14 * SIZE], b3
2861	LDF	[AO + 15 * SIZE], b4
2862
2863	FSUB	a1, c09, c09
2864	FSUB	a2, c10, c10
2865	FSUB	a3, c11, c11
2866	FSUB	a4, c12, c12
2867
2868	FSUB	b1, c13, c13
2869	FSUB	b2, c14, c14
2870	FSUB	b3, c15, c15
2871	FSUB	b4, c16, c16
2872#endif
2873
2874#ifdef LN
2875	LDF	[AO +  3 * SIZE], a1
2876	LDF	[AO +  2 * SIZE], a2
2877	LDF	[AO +  0 * SIZE], a3
2878
2879	FMUL	a1, c02, c02
2880	FMUL	a1, c04, c04
2881	FMUL	a1, c06, c06
2882	FMUL	a1, c08, c08
2883	FMUL	a1, c10, c10
2884	FMUL	a1, c12, c12
2885	FMUL	a1, c14, c14
2886	FMUL	a1, c16, c16
2887
2888	FNMSUB	(aa2, cc02, cc01, cc01)
2889	FNMSUB	(aa2, cc04, cc03, cc03)
2890	FNMSUB	(aa2, cc06, cc05, cc05)
2891	FNMSUB	(aa2, cc08, cc07, cc07)
2892	FNMSUB	(aa2, cc10, cc09, cc09)
2893	FNMSUB	(aa2, cc12, cc11, cc11)
2894	FNMSUB	(aa2, cc14, cc13, cc13)
2895	FNMSUB	(aa2, cc16, cc15, cc15)
2896
2897	FMUL	a3, c01, c01
2898	FMUL	a3, c03, c03
2899	FMUL	a3, c05, c05
2900	FMUL	a3, c07, c07
2901	FMUL	a3, c09, c09
2902	FMUL	a3, c11, c11
2903	FMUL	a3, c13, c13
2904	FMUL	a3, c15, c15
2905#endif
2906
2907#ifdef LT
2908	LDF	[AO +  0 * SIZE], a1
2909	LDF	[AO +  1 * SIZE], a2
2910	LDF	[AO +  3 * SIZE], a3
2911
2912	FMUL	a1, c01, c01
2913	FMUL	a1, c03, c03
2914	FMUL	a1, c05, c05
2915	FMUL	a1, c07, c07
2916	FMUL	a1, c09, c09
2917	FMUL	a1, c11, c11
2918	FMUL	a1, c13, c13
2919	FMUL	a1, c15, c15
2920
2921	FNMSUB	(aa2, cc01, cc02, cc02)
2922	FNMSUB	(aa2, cc03, cc04, cc04)
2923	FNMSUB	(aa2, cc05, cc06, cc06)
2924	FNMSUB	(aa2, cc07, cc08, cc08)
2925	FNMSUB	(aa2, cc09, cc10, cc10)
2926	FNMSUB	(aa2, cc11, cc12, cc12)
2927	FNMSUB	(aa2, cc13, cc14, cc14)
2928	FNMSUB	(aa2, cc15, cc16, cc16)
2929
2930	FMUL	a3, c02, c02
2931	FMUL	a3, c04, c04
2932	FMUL	a3, c06, c06
2933	FMUL	a3, c08, c08
2934	FMUL	a3, c10, c10
2935	FMUL	a3, c12, c12
2936	FMUL	a3, c14, c14
2937	FMUL	a3, c16, c16
2938#endif
2939
2940#ifdef RN
2941	LDF	[BO +  0 * SIZE], a1
2942	LDF	[BO +  1 * SIZE], a2
2943	LDF	[BO +  2 * SIZE], a3
2944	LDF	[BO +  3 * SIZE], a4
2945	LDF	[BO +  4 * SIZE], b1
2946	LDF	[BO +  5 * SIZE], b2
2947	LDF	[BO +  6 * SIZE], b3
2948	LDF	[BO +  7 * SIZE], b4
2949
2950	FMUL	a1, c01, c01
2951	FMUL	a1, c02, c02
2952
2953	FNMSUB	(aa2, cc01, cc03, cc03)
2954	FNMSUB	(aa2, cc02, cc04, cc04)
2955	FNMSUB	(aa3, cc01, cc05, cc05)
2956	FNMSUB	(aa3, cc02, cc06, cc06)
2957	FNMSUB	(aa4, cc01, cc07, cc07)
2958	FNMSUB	(aa4, cc02, cc08, cc08)
2959	FNMSUB	(bb1, cc01, cc09, cc09)
2960	FNMSUB	(bb1, cc02, cc10, cc10)
2961	FNMSUB	(bb2, cc01, cc11, cc11)
2962	FNMSUB	(bb2, cc02, cc12, cc12)
2963	FNMSUB	(bb3, cc01, cc13, cc13)
2964	FNMSUB	(bb3, cc02, cc14, cc14)
2965	FNMSUB	(bb4, cc01, cc15, cc15)
2966	FNMSUB	(bb4, cc02, cc16, cc16)
2967
2968	LDF	[BO +  9 * SIZE], a1
2969	LDF	[BO + 10 * SIZE], a2
2970	LDF	[BO + 11 * SIZE], a3
2971	LDF	[BO + 12 * SIZE], a4
2972	LDF	[BO + 13 * SIZE], b1
2973	LDF	[BO + 14 * SIZE], b2
2974	LDF	[BO + 15 * SIZE], b3
2975
2976	FMUL	a1, c03, c03
2977	FMUL	a1, c04, c04
2978
2979	FNMSUB	(aa2, cc03, cc05, cc05)
2980	FNMSUB	(aa2, cc04, cc06, cc06)
2981	FNMSUB	(aa3, cc03, cc07, cc07)
2982	FNMSUB	(aa3, cc04, cc08, cc08)
2983	FNMSUB	(aa4, cc03, cc09, cc09)
2984	FNMSUB	(aa4, cc04, cc10, cc10)
2985	FNMSUB	(bb1, cc03, cc11, cc11)
2986	FNMSUB	(bb1, cc04, cc12, cc12)
2987	FNMSUB	(bb2, cc03, cc13, cc13)
2988	FNMSUB	(bb2, cc04, cc14, cc14)
2989	FNMSUB	(bb3, cc03, cc15, cc15)
2990	FNMSUB	(bb3, cc04, cc16, cc16)
2991
2992	LDF	[BO + 18 * SIZE], a1
2993	LDF	[BO + 19 * SIZE], a2
2994	LDF	[BO + 20 * SIZE], a3
2995	LDF	[BO + 21 * SIZE], a4
2996	LDF	[BO + 22 * SIZE], b1
2997	LDF	[BO + 23 * SIZE], b2
2998
2999	FMUL	a1, c05, c05
3000	FMUL	a1, c06, c06
3001
3002	FNMSUB	(aa2, cc05, cc07, cc07)
3003	FNMSUB	(aa2, cc06, cc08, cc08)
3004	FNMSUB	(aa3, cc05, cc09, cc09)
3005	FNMSUB	(aa3, cc06, cc10, cc10)
3006	FNMSUB	(aa4, cc05, cc11, cc11)
3007	FNMSUB	(aa4, cc06, cc12, cc12)
3008	FNMSUB	(bb1, cc05, cc13, cc13)
3009	FNMSUB	(bb1, cc06, cc14, cc14)
3010	FNMSUB	(bb2, cc05, cc15, cc15)
3011	FNMSUB	(bb2, cc06, cc16, cc16)
3012
3013	LDF	[BO + 27 * SIZE], a1
3014	LDF	[BO + 28 * SIZE], a2
3015	LDF	[BO + 29 * SIZE], a3
3016	LDF	[BO + 30 * SIZE], a4
3017	LDF	[BO + 31 * SIZE], b1
3018
3019	FMUL	a1, c07, c07
3020	FMUL	a1, c08, c08
3021
3022	FNMSUB	(aa2, cc07, cc09, cc09)
3023	FNMSUB	(aa2, cc08, cc10, cc10)
3024	FNMSUB	(aa3, cc07, cc11, cc11)
3025	FNMSUB	(aa3, cc08, cc12, cc12)
3026	FNMSUB	(aa4, cc07, cc13, cc13)
3027	FNMSUB	(aa4, cc08, cc14, cc14)
3028	FNMSUB	(bb1, cc07, cc15, cc15)
3029	FNMSUB	(bb1, cc08, cc16, cc16)
3030
3031	LDF	[BO + 36 * SIZE], a1
3032	LDF	[BO + 37 * SIZE], a2
3033	LDF	[BO + 38 * SIZE], a3
3034	LDF	[BO + 39 * SIZE], a4
3035
3036	FMUL	a1, c09, c09
3037	FMUL	a1, c10, c10
3038
3039	FNMSUB	(aa2, cc09, cc11, cc11)
3040	FNMSUB	(aa2, cc10, cc12, cc12)
3041	FNMSUB	(aa3, cc09, cc13, cc13)
3042	FNMSUB	(aa3, cc10, cc14, cc14)
3043	FNMSUB	(aa4, cc09, cc15, cc15)
3044	FNMSUB	(aa4, cc10, cc16, cc16)
3045
3046	LDF	[BO + 45 * SIZE], a1
3047	LDF	[BO + 46 * SIZE], a2
3048	LDF	[BO + 47 * SIZE], a3
3049
3050	FMUL	a1, c11, c11
3051	FMUL	a1, c12, c12
3052
3053	FNMSUB	(aa2, cc11, cc13, cc13)
3054	FNMSUB	(aa2, cc12, cc14, cc14)
3055	FNMSUB	(aa3, cc11, cc15, cc15)
3056	FNMSUB	(aa3, cc12, cc16, cc16)
3057
3058	LDF	[BO + 54 * SIZE], a1
3059	LDF	[BO + 55 * SIZE], a2
3060
3061	FMUL	a1, c13, c13
3062	FMUL	a1, c14, c14
3063
3064	FNMSUB	(aa2, cc13, cc15, cc15)
3065	FNMSUB	(aa2, cc14, cc16, cc16)
3066
3067	LDF	[BO + 63 * SIZE], a1
3068
3069	FMUL	a1, c15, c15
3070	FMUL	a1, c16, c16
3071#endif
3072
3073#ifdef RT
3074	LDF	[BO + 63 * SIZE], a1
3075	LDF	[BO + 62 * SIZE], a2
3076	LDF	[BO + 61 * SIZE], a3
3077	LDF	[BO + 60 * SIZE], a4
3078	LDF	[BO + 59 * SIZE], b1
3079	LDF	[BO + 58 * SIZE], b2
3080	LDF	[BO + 57 * SIZE], b3
3081	LDF	[BO + 56 * SIZE], b4
3082
3083	FMUL	a1, c16, c16
3084	FMUL	a1, c15, c15
3085
3086	FNMSUB	(aa2, cc16, cc14, cc14)
3087	FNMSUB	(aa2, cc15, cc13, cc13)
3088	FNMSUB	(aa3, cc16, cc12, cc12)
3089	FNMSUB	(aa3, cc15, cc11, cc11)
3090	FNMSUB	(aa4, cc16, cc10, cc10)
3091	FNMSUB	(aa4, cc15, cc09, cc09)
3092	FNMSUB	(bb1, cc16, cc08, cc08)
3093	FNMSUB	(bb1, cc15, cc07, cc07)
3094	FNMSUB	(bb2, cc16, cc06, cc06)
3095	FNMSUB	(bb2, cc15, cc05, cc05)
3096	FNMSUB	(bb3, cc16, cc04, cc04)
3097	FNMSUB	(bb3, cc15, cc03, cc03)
3098	FNMSUB	(bb4, cc16, cc02, cc02)
3099	FNMSUB	(bb4, cc15, cc01, cc01)
3100
3101	LDF	[BO + 54 * SIZE], a1
3102	LDF	[BO + 53 * SIZE], a2
3103	LDF	[BO + 52 * SIZE], a3
3104	LDF	[BO + 51 * SIZE], a4
3105	LDF	[BO + 50 * SIZE], b1
3106	LDF	[BO + 49 * SIZE], b2
3107	LDF	[BO + 48 * SIZE], b3
3108
3109	FMUL	a1, c14, c14
3110	FMUL	a1, c13, c13
3111
3112	FNMSUB	(aa2, cc14, cc12, cc12)
3113	FNMSUB	(aa2, cc13, cc11, cc11)
3114	FNMSUB	(aa3, cc14, cc10, cc10)
3115	FNMSUB	(aa3, cc13, cc09, cc09)
3116	FNMSUB	(aa4, cc14, cc08, cc08)
3117	FNMSUB	(aa4, cc13, cc07, cc07)
3118	FNMSUB	(bb1, cc14, cc06, cc06)
3119	FNMSUB	(bb1, cc13, cc05, cc05)
3120	FNMSUB	(bb2, cc14, cc04, cc04)
3121	FNMSUB	(bb2, cc13, cc03, cc03)
3122	FNMSUB	(bb3, cc14, cc02, cc02)
3123	FNMSUB	(bb3, cc13, cc01, cc01)
3124
3125	LDF	[BO + 45 * SIZE], a1
3126	LDF	[BO + 44 * SIZE], a2
3127	LDF	[BO + 43 * SIZE], a3
3128	LDF	[BO + 42 * SIZE], a4
3129	LDF	[BO + 41 * SIZE], b1
3130	LDF	[BO + 40 * SIZE], b2
3131
3132	FMUL	a1, c12, c12
3133	FMUL	a1, c11, c11
3134
3135	FNMSUB	(aa2, cc12, cc10, cc10)
3136	FNMSUB	(aa2, cc11, cc09, cc09)
3137	FNMSUB	(aa3, cc12, cc08, cc08)
3138	FNMSUB	(aa3, cc11, cc07, cc07)
3139	FNMSUB	(aa4, cc12, cc06, cc06)
3140	FNMSUB	(aa4, cc11, cc05, cc05)
3141	FNMSUB	(bb1, cc12, cc04, cc04)
3142	FNMSUB	(bb1, cc11, cc03, cc03)
3143	FNMSUB	(bb2, cc12, cc02, cc02)
3144	FNMSUB	(bb2, cc11, cc01, cc01)
3145
3146	LDF	[BO + 36 * SIZE], a1
3147	LDF	[BO + 35 * SIZE], a2
3148	LDF	[BO + 34 * SIZE], a3
3149	LDF	[BO + 33 * SIZE], a4
3150	LDF	[BO + 32 * SIZE], b1
3151
3152	FMUL	a1, c10, c10
3153	FMUL	a1, c09, c09
3154
3155	FNMSUB	(aa2, cc10, cc08, cc08)
3156	FNMSUB	(aa2, cc09, cc07, cc07)
3157	FNMSUB	(aa3, cc10, cc06, cc06)
3158	FNMSUB	(aa3, cc09, cc05, cc05)
3159	FNMSUB	(aa4, cc10, cc04, cc04)
3160	FNMSUB	(aa4, cc09, cc03, cc03)
3161	FNMSUB	(bb1, cc10, cc02, cc02)
3162	FNMSUB	(bb1, cc09, cc01, cc01)
3163
3164	LDF	[BO + 27 * SIZE], a1
3165	LDF	[BO + 26 * SIZE], a2
3166	LDF	[BO + 25 * SIZE], a3
3167	LDF	[BO + 24 * SIZE], a4
3168
3169	FMUL	a1, c08, c08
3170	FMUL	a1, c07, c07
3171
3172	FNMSUB	(aa2, cc08, cc06, cc06)
3173	FNMSUB	(aa2, cc07, cc05, cc05)
3174	FNMSUB	(aa3, cc08, cc04, cc04)
3175	FNMSUB	(aa3, cc07, cc03, cc03)
3176	FNMSUB	(aa4, cc08, cc02, cc02)
3177	FNMSUB	(aa4, cc07, cc01, cc01)
3178
3179	LDF	[BO + 18 * SIZE], a1
3180	LDF	[BO + 17 * SIZE], a2
3181	LDF	[BO + 16 * SIZE], a3
3182
3183	FMUL	a1, c06, c06
3184	FMUL	a1, c05, c05
3185
3186	FNMSUB	(aa2, cc06, cc04, cc04)
3187	FNMSUB	(aa2, cc05, cc03, cc03)
3188	FNMSUB	(aa3, cc06, cc02, cc02)
3189	FNMSUB	(aa3, cc05, cc01, cc01)
3190
3191	LDF	[BO +  9 * SIZE], a1
3192	LDF	[BO +  8 * SIZE], a2
3193
3194	FMUL	a1, c04, c04
3195	FMUL	a1, c03, c03
3196
3197	FNMSUB	(aa2, cc04, cc02, cc02)
3198	FNMSUB	(aa2, cc03, cc01, cc01)
3199
3200	LDF	[BO +  0 * SIZE], a1
3201
3202	FMUL	a1, c02, c02
3203	FMUL	a1, c01, c01
3204#endif
3205
3206#ifdef LN
3207	add	C1, -2 * SIZE, C1
3208	add	C2, -2 * SIZE, C2
3209	add	C3, -2 * SIZE, C3
3210	add	C4, -2 * SIZE, C4
3211	add	C5, -2 * SIZE, C5
3212	add	C6, -2 * SIZE, C6
3213	add	C7, -2 * SIZE, C7
3214	add	C8, -2 * SIZE, C8
3215#endif
3216
3217#if defined(LN) || defined(LT)
3218	STF	c01, [BO +  0 * SIZE]
3219	STF	c03, [BO +  1 * SIZE]
3220	STF	c05, [BO +  2 * SIZE]
3221	STF	c07, [BO +  3 * SIZE]
3222
3223	STF	c09, [BO +  4 * SIZE]
3224	STF	c11, [BO +  5 * SIZE]
3225	STF	c13, [BO +  6 * SIZE]
3226	STF	c15, [BO +  7 * SIZE]
3227
3228	STF	c02, [BO +  8 * SIZE]
3229	STF	c04, [BO +  9 * SIZE]
3230	STF	c06, [BO + 10 * SIZE]
3231	STF	c08, [BO + 11 * SIZE]
3232
3233	STF	c10, [BO + 12 * SIZE]
3234	STF	c12, [BO + 13 * SIZE]
3235	STF	c14, [BO + 14 * SIZE]
3236	STF	c16, [BO + 15 * SIZE]
3237#else
3238	STF	c01, [AO +  0 * SIZE]
3239	STF	c02, [AO +  1 * SIZE]
3240	STF	c03, [AO +  2 * SIZE]
3241	STF	c04, [AO +  3 * SIZE]
3242
3243	STF	c05, [AO +  4 * SIZE]
3244	STF	c06, [AO +  5 * SIZE]
3245	STF	c07, [AO +  6 * SIZE]
3246	STF	c08, [AO +  7 * SIZE]
3247
3248	STF	c09, [AO +  8 * SIZE]
3249	STF	c10, [AO +  9 * SIZE]
3250	STF	c11, [AO + 10 * SIZE]
3251	STF	c12, [AO + 11 * SIZE]
3252
3253	STF	c13, [AO + 12 * SIZE]
3254	STF	c14, [AO + 13 * SIZE]
3255	STF	c15, [AO + 14 * SIZE]
3256	STF	c16, [AO + 15 * SIZE]
3257#endif
3258
3259	STF	c01, [C1 + 0 * SIZE]
3260	STF	c02, [C1 + 1 * SIZE]
3261	STF	c03, [C2 + 0 * SIZE]
3262	STF	c04, [C2 + 1 * SIZE]
3263
3264	STF	c05, [C3 + 0 * SIZE]
3265	STF	c06, [C3 + 1 * SIZE]
3266	STF	c07, [C4 + 0 * SIZE]
3267	STF	c08, [C4 + 1 * SIZE]
3268
3269	STF	c09, [C5 + 0 * SIZE]
3270	STF	c10, [C5 + 1 * SIZE]
3271	STF	c11, [C6 + 0 * SIZE]
3272	STF	c12, [C6 + 1 * SIZE]
3273
3274	STF	c13, [C7 + 0 * SIZE]
3275	STF	c14, [C7 + 1 * SIZE]
3276	STF	c15, [C8 + 0 * SIZE]
3277	STF	c16, [C8 + 1 * SIZE]
3278
3279#ifndef LN
3280	add	C1, 2 * SIZE, C1
3281	add	C2, 2 * SIZE, C2
3282	add	C3, 2 * SIZE, C3
3283	add	C4, 2 * SIZE, C4
3284	add	C5, 2 * SIZE, C5
3285	add	C6, 2 * SIZE, C6
3286	add	C7, 2 * SIZE, C7
3287	add	C8, 2 * SIZE, C8
3288#endif
3289
3290#ifdef RT
3291	sll	K, BASE_SHIFT + 1, TEMP1
3292	add	AORIG, TEMP1, AORIG
3293#endif
3294
3295#if defined(LT) || defined(RN)
3296	sub	K, KK, TEMP1
3297	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3298	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3299	add	AO, TEMP2, AO
3300	add	BO, TEMP1, BO
3301#endif
3302
3303#ifdef LT
3304	add	KK, 2, KK
3305#endif
3306
3307#ifdef LN
3308	sub	KK, 2, KK
3309#endif
3310
3311	add	I, -1, I
3312	cmp	I, 0
3313	bg,pt	%icc, .LL12
3314	nop
3315	.align 4
3316
3317.LL20:
3318	and	M, 1, I
3319	cmp	I, 0
3320	ble,pn	%icc, .LL29
3321	nop
3322
3323#if defined(LT) || defined(RN)
3324	mov	B, BO
3325#else
3326#ifdef LN
3327	sll	K,  BASE_SHIFT + 0, TEMP1
3328	sub	AORIG, TEMP1, AORIG
3329#endif
3330
3331	sll	KK, BASE_SHIFT + 0, TEMP1
3332	sll	KK, BASE_SHIFT + 3, TEMP2
3333
3334	add	AORIG, TEMP1, AO
3335	add	B,     TEMP2, BO
3336#endif
3337
3338	LDF	[AO +  0 * SIZE], a1
3339	LDF	[AO +  1 * SIZE], a2
3340	LDF	[AO +  2 * SIZE], a3
3341	LDF	[AO +  3 * SIZE], a4
3342
3343	LDF	[BO +  0 * SIZE], b1
3344	FCLR	(cc01)
3345	LDF	[BO +  1 * SIZE], b2
3346	FCLR	(cc03)
3347	LDF	[BO +  2 * SIZE], b3
3348	FCLR	(cc05)
3349	LDF	[BO +  3 * SIZE], b4
3350	FCLR	(cc07)
3351	LDF	[BO +  4 * SIZE], b5
3352	FCLR	(cc09)
3353	LDF	[BO +  5 * SIZE], b6
3354	FCLR	(cc11)
3355	LDF	[BO +  6 * SIZE], b7
3356	FCLR	(cc13)
3357	LDF	[BO +  7 * SIZE], b8
3358	FCLR	(cc15)
3359
3360#if defined(LT) || defined(RN)
3361	sra	KK, 2, L
3362#else
3363	sub	K, KK, L
3364	sra	L,  2, L
3365#endif
3366	cmp	L,  0
3367	ble,pn	%icc, .LL25
3368	LDF	[BO +  8 * SIZE], b9
3369	.align 4
3370
3371.LL23:
3372	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3373	add	L, -1, L
3374
3375	FMADD	(aa1, bb1, cc01, cc01)
3376	LDF	[BO + 16 * SIZE], b1
3377	FMADD	(aa1, bb2, cc03, cc03)
3378	LDF	[BO +  9 * SIZE], b2
3379
3380	FMADD	(aa1, bb3, cc05, cc05)
3381	LDF	[BO + 10 * SIZE], b3
3382	FMADD	(aa1, bb4, cc07, cc07)
3383	LDF	[BO + 11 * SIZE], b4
3384
3385	FMADD	(aa1, bb5, cc09, cc09)
3386	LDF	[BO + 12 * SIZE], b5
3387	FMADD	(aa1, bb6, cc11, cc11)
3388	LDF	[BO + 13 * SIZE], b6
3389
3390	FMADD	(aa1, bb7, cc13, cc13)
3391	LDF	[BO + 14 * SIZE], b7
3392	FMADD	(aa1, bb8, cc15, cc15)
3393	LDF	[BO + 15 * SIZE], b8
3394
3395	FMADD	(aa2, bb9, cc01, cc01)
3396	LDF	[BO + 24 * SIZE], b9
3397	FMADD	(aa2, bb2, cc03, cc03)
3398	LDF	[BO + 17 * SIZE], b2
3399
3400	FMADD	(aa2, bb3, cc05, cc05)
3401	LDF	[BO + 18 * SIZE], b3
3402	FMADD	(aa2, bb4, cc07, cc07)
3403	LDF	[BO + 19 * SIZE], b4
3404
3405	FMADD	(aa2, bb5, cc09, cc09)
3406	LDF	[BO + 20 * SIZE], b5
3407	FMADD	(aa2, bb6, cc11, cc11)
3408	LDF	[BO + 21 * SIZE], b6
3409
3410	FMADD	(aa2, bb7, cc13, cc13)
3411	LDF	[BO + 22 * SIZE], b7
3412	FMADD	(aa2, bb8, cc15, cc15)
3413	LDF	[BO + 23 * SIZE], b8
3414
3415	LDF	[AO +  4 * SIZE], a1
3416	LDF	[AO +  5 * SIZE], a2
3417
3418	FMADD	(aa3, bb1, cc01, cc01)
3419	LDF	[BO + 32 * SIZE], b1
3420	FMADD	(aa3, bb2, cc03, cc03)
3421	LDF	[BO + 25 * SIZE], b2
3422
3423	FMADD	(aa3, bb3, cc05, cc05)
3424	LDF	[BO + 26 * SIZE], b3
3425	FMADD	(aa3, bb4, cc07, cc07)
3426	LDF	[BO + 27 * SIZE], b4
3427
3428	FMADD	(aa3, bb5, cc09, cc09)
3429	LDF	[BO + 28 * SIZE], b5
3430	FMADD	(aa3, bb6, cc11, cc11)
3431	LDF	[BO + 29 * SIZE], b6
3432
3433	FMADD	(aa3, bb7, cc13, cc13)
3434	LDF	[BO + 30 * SIZE], b7
3435	FMADD	(aa3, bb8, cc15, cc15)
3436	LDF	[BO + 31 * SIZE], b8
3437
3438	FMADD	(aa4, bb9, cc01, cc01)
3439	LDF	[BO + 40 * SIZE], b9
3440	FMADD	(aa4, bb2, cc03, cc03)
3441	LDF	[BO + 33 * SIZE], b2
3442
3443	FMADD	(aa4, bb3, cc05, cc05)
3444	LDF	[BO + 34 * SIZE], b3
3445	FMADD	(aa4, bb4, cc07, cc07)
3446	LDF	[BO + 35 * SIZE], b4
3447
3448	FMADD	(aa4, bb5, cc09, cc09)
3449	LDF	[BO + 36 * SIZE], b5
3450	FMADD	(aa4, bb6, cc11, cc11)
3451	LDF	[BO + 37 * SIZE], b6
3452
3453	FMADD	(aa4, bb7, cc13, cc13)
3454	LDF	[BO + 38 * SIZE], b7
3455	FMADD	(aa4, bb8, cc15, cc15)
3456	LDF	[BO + 39 * SIZE], b8
3457
3458	LDF	[AO +  6 * SIZE], a3
3459	LDF	[AO +  7 * SIZE], a4
3460
3461	add	AO,  4 * SIZE, AO
3462	cmp	L, 0
3463	bg,pt	%icc, .LL23
3464	add	BO, 32 * SIZE, BO
3465	.align 4
3466
3467.LL25:
3468#if defined(LT) || defined(RN)
3469	and	KK, 3, L
3470#else
3471	sub	K, KK, L
3472	and	L,  3, L
3473#endif
3474	cmp	L,  0
3475	ble,a,pn %icc, .LL28
3476	nop
3477	.align 4
3478
3479.LL27:
3480	FMADD	(aa1, bb1, cc01, cc01)
3481	LDF	[BO +  8 * SIZE], b1
3482	FMADD	(aa1, bb2, cc03, cc03)
3483	LDF	[BO +  9 * SIZE], b2
3484
3485	FMADD	(aa1, bb3, cc05, cc05)
3486	LDF	[BO + 10 * SIZE], b3
3487	FMADD	(aa1, bb4, cc07, cc07)
3488	LDF	[BO + 11 * SIZE], b4
3489
3490	FMADD	(aa1, bb5, cc09, cc09)
3491	LDF	[BO + 12 * SIZE], b5
3492	FMADD	(aa1, bb6, cc11, cc11)
3493	LDF	[BO + 13 * SIZE], b6
3494
3495	FMADD	(aa1, bb7, cc13, cc13)
3496	LDF	[BO + 14 * SIZE], b7
3497	FMADD	(aa1, bb8, cc15, cc15)
3498	LDF	[BO + 15 * SIZE], b8
3499
3500	LDF	[AO +  1 * SIZE], a1
3501	add	AO, 1 * SIZE, AO
3502
3503	add	L, -1, L
3504	cmp	L, 0
3505	bg,pt	%icc, .LL27
3506	add	BO, 8 * SIZE, BO
3507	.align 4
3508
3509.LL28:
3510#if defined(LN) || defined(RT)
3511#ifdef LN
3512	sub	KK, 1, TEMP1
3513#else
3514	sub	KK, 8, TEMP1
3515#endif
3516	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3517	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3518
3519	add	AORIG, TEMP2, AO
3520	add	B,     TEMP1, BO
3521#endif
3522
3523#if defined(LN) || defined(LT)
3524	LDF	[BO +  0 * SIZE], a1
3525	LDF	[BO +  1 * SIZE], a2
3526	LDF	[BO +  2 * SIZE], a3
3527	LDF	[BO +  3 * SIZE], a4
3528
3529	LDF	[BO +  4 * SIZE], b1
3530	LDF	[BO +  5 * SIZE], b2
3531	LDF	[BO +  6 * SIZE], b3
3532	LDF	[BO +  7 * SIZE], b4
3533
3534	FSUB	a1, c01, c01
3535	FSUB	a2, c03, c03
3536	FSUB	a3, c05, c05
3537	FSUB	a4, c07, c07
3538
3539	FSUB	b1, c09, c09
3540	FSUB	b2, c11, c11
3541	FSUB	b3, c13, c13
3542	FSUB	b4, c15, c15
3543#else
3544	LDF	[AO +  0 * SIZE], a1
3545	LDF	[AO +  1 * SIZE], a2
3546	LDF	[AO +  2 * SIZE], a3
3547	LDF	[AO +  3 * SIZE], a4
3548
3549	LDF	[AO +  4 * SIZE], b1
3550	LDF	[AO +  5 * SIZE], b2
3551	LDF	[AO +  6 * SIZE], b3
3552	LDF	[AO +  7 * SIZE], b4
3553
3554	FSUB	a1, c01, c01
3555	FSUB	a2, c03, c03
3556	FSUB	a3, c05, c05
3557	FSUB	a4, c07, c07
3558
3559	FSUB	b1, c09, c09
3560	FSUB	b2, c11, c11
3561	FSUB	b3, c13, c13
3562	FSUB	b4, c15, c15
3563#endif
3564
3565#if defined(LN) || defined(LT)
3566	LDF	[AO +  0 * SIZE], a1
3567
3568	FMUL	a1, c01, c01
3569	FMUL	a1, c03, c03
3570	FMUL	a1, c05, c05
3571	FMUL	a1, c07, c07
3572	FMUL	a1, c09, c09
3573	FMUL	a1, c11, c11
3574	FMUL	a1, c13, c13
3575	FMUL	a1, c15, c15
3576#endif
3577
3578#ifdef RN
3579	LDF	[BO +  0 * SIZE], a1
3580	LDF	[BO +  1 * SIZE], a2
3581	LDF	[BO +  2 * SIZE], a3
3582	LDF	[BO +  3 * SIZE], a4
3583	LDF	[BO +  4 * SIZE], b1
3584	LDF	[BO +  5 * SIZE], b2
3585	LDF	[BO +  6 * SIZE], b3
3586	LDF	[BO +  7 * SIZE], b4
3587
3588	FMUL	a1, c01, c01
3589
3590	FNMSUB	(aa2, cc01, cc03, cc03)
3591	FNMSUB	(aa3, cc01, cc05, cc05)
3592	FNMSUB	(aa4, cc01, cc07, cc07)
3593	FNMSUB	(bb1, cc01, cc09, cc09)
3594	FNMSUB	(bb2, cc01, cc11, cc11)
3595	FNMSUB	(bb3, cc01, cc13, cc13)
3596	FNMSUB	(bb4, cc01, cc15, cc15)
3597
3598	LDF	[BO +  9 * SIZE], a1
3599	LDF	[BO + 10 * SIZE], a2
3600	LDF	[BO + 11 * SIZE], a3
3601	LDF	[BO + 12 * SIZE], a4
3602	LDF	[BO + 13 * SIZE], b1
3603	LDF	[BO + 14 * SIZE], b2
3604	LDF	[BO + 15 * SIZE], b3
3605
3606	FMUL	a1, c03, c03
3607
3608	FNMSUB	(aa2, cc03, cc05, cc05)
3609	FNMSUB	(aa3, cc03, cc07, cc07)
3610	FNMSUB	(aa4, cc03, cc09, cc09)
3611	FNMSUB	(bb1, cc03, cc11, cc11)
3612	FNMSUB	(bb2, cc03, cc13, cc13)
3613	FNMSUB	(bb3, cc03, cc15, cc15)
3614
3615	LDF	[BO + 18 * SIZE], a1
3616	LDF	[BO + 19 * SIZE], a2
3617	LDF	[BO + 20 * SIZE], a3
3618	LDF	[BO + 21 * SIZE], a4
3619	LDF	[BO + 22 * SIZE], b1
3620	LDF	[BO + 23 * SIZE], b2
3621
3622	FMUL	a1, c05, c05
3623
3624	FNMSUB	(aa2, cc05, cc07, cc07)
3625	FNMSUB	(aa3, cc05, cc09, cc09)
3626	FNMSUB	(aa4, cc05, cc11, cc11)
3627	FNMSUB	(bb1, cc05, cc13, cc13)
3628	FNMSUB	(bb2, cc05, cc15, cc15)
3629
3630	LDF	[BO + 27 * SIZE], a1
3631	LDF	[BO + 28 * SIZE], a2
3632	LDF	[BO + 29 * SIZE], a3
3633	LDF	[BO + 30 * SIZE], a4
3634	LDF	[BO + 31 * SIZE], b1
3635
3636	FMUL	a1, c07, c07
3637
3638	FNMSUB	(aa2, cc07, cc09, cc09)
3639	FNMSUB	(aa3, cc07, cc11, cc11)
3640	FNMSUB	(aa4, cc07, cc13, cc13)
3641	FNMSUB	(bb1, cc07, cc15, cc15)
3642
3643	LDF	[BO + 36 * SIZE], a1
3644	LDF	[BO + 37 * SIZE], a2
3645	LDF	[BO + 38 * SIZE], a3
3646	LDF	[BO + 39 * SIZE], a4
3647
3648	FMUL	a1, c09, c09
3649
3650	FNMSUB	(aa2, cc09, cc11, cc11)
3651	FNMSUB	(aa3, cc09, cc13, cc13)
3652	FNMSUB	(aa4, cc09, cc15, cc15)
3653
3654	LDF	[BO + 45 * SIZE], a1
3655	LDF	[BO + 46 * SIZE], a2
3656	LDF	[BO + 47 * SIZE], a3
3657
3658	FMUL	a1, c11, c11
3659
3660	FNMSUB	(aa2, cc11, cc13, cc13)
3661	FNMSUB	(aa3, cc11, cc15, cc15)
3662
3663	LDF	[BO + 54 * SIZE], a1
3664	LDF	[BO + 55 * SIZE], a2
3665
3666	FMUL	a1, c13, c13
3667
3668	FNMSUB	(aa2, cc13, cc15, cc15)
3669
3670	LDF	[BO + 63 * SIZE], a1
3671
3672	FMUL	a1, c15, c15
3673#endif
3674
3675#ifdef RT
3676	LDF	[BO + 63 * SIZE], a1
3677	LDF	[BO + 62 * SIZE], a2
3678	LDF	[BO + 61 * SIZE], a3
3679	LDF	[BO + 60 * SIZE], a4
3680	LDF	[BO + 59 * SIZE], b1
3681	LDF	[BO + 58 * SIZE], b2
3682	LDF	[BO + 57 * SIZE], b3
3683	LDF	[BO + 56 * SIZE], b4
3684
3685	FMUL	a1, c15, c15
3686
3687	FNMSUB	(aa2, cc15, cc13, cc13)
3688	FNMSUB	(aa3, cc15, cc11, cc11)
3689	FNMSUB	(aa4, cc15, cc09, cc09)
3690	FNMSUB	(bb1, cc15, cc07, cc07)
3691	FNMSUB	(bb2, cc15, cc05, cc05)
3692	FNMSUB	(bb3, cc15, cc03, cc03)
3693	FNMSUB	(bb4, cc15, cc01, cc01)
3694
3695	LDF	[BO + 54 * SIZE], a1
3696	LDF	[BO + 53 * SIZE], a2
3697	LDF	[BO + 52 * SIZE], a3
3698	LDF	[BO + 51 * SIZE], a4
3699	LDF	[BO + 50 * SIZE], b1
3700	LDF	[BO + 49 * SIZE], b2
3701	LDF	[BO + 48 * SIZE], b3
3702
3703	FMUL	a1, c13, c13
3704
3705	FNMSUB	(aa2, cc13, cc11, cc11)
3706	FNMSUB	(aa3, cc13, cc09, cc09)
3707	FNMSUB	(aa4, cc13, cc07, cc07)
3708	FNMSUB	(bb1, cc13, cc05, cc05)
3709	FNMSUB	(bb2, cc13, cc03, cc03)
3710	FNMSUB	(bb3, cc13, cc01, cc01)
3711
3712	LDF	[BO + 45 * SIZE], a1
3713	LDF	[BO + 44 * SIZE], a2
3714	LDF	[BO + 43 * SIZE], a3
3715	LDF	[BO + 42 * SIZE], a4
3716	LDF	[BO + 41 * SIZE], b1
3717	LDF	[BO + 40 * SIZE], b2
3718
3719	FMUL	a1, c11, c11
3720
3721	FNMSUB	(aa2, cc11, cc09, cc09)
3722	FNMSUB	(aa3, cc11, cc07, cc07)
3723	FNMSUB	(aa4, cc11, cc05, cc05)
3724	FNMSUB	(bb1, cc11, cc03, cc03)
3725	FNMSUB	(bb2, cc11, cc01, cc01)
3726
3727	LDF	[BO + 36 * SIZE], a1
3728	LDF	[BO + 35 * SIZE], a2
3729	LDF	[BO + 34 * SIZE], a3
3730	LDF	[BO + 33 * SIZE], a4
3731	LDF	[BO + 32 * SIZE], b1
3732
3733	FMUL	a1, c09, c09
3734
3735	FNMSUB	(aa2, cc09, cc07, cc07)
3736	FNMSUB	(aa3, cc09, cc05, cc05)
3737	FNMSUB	(aa4, cc09, cc03, cc03)
3738	FNMSUB	(bb1, cc09, cc01, cc01)
3739
3740	LDF	[BO + 27 * SIZE], a1
3741	LDF	[BO + 26 * SIZE], a2
3742	LDF	[BO + 25 * SIZE], a3
3743	LDF	[BO + 24 * SIZE], a4
3744
3745	FMUL	a1, c07, c07
3746
3747	FNMSUB	(aa2, cc07, cc05, cc05)
3748	FNMSUB	(aa3, cc07, cc03, cc03)
3749	FNMSUB	(aa4, cc07, cc01, cc01)
3750
3751	LDF	[BO + 18 * SIZE], a1
3752	LDF	[BO + 17 * SIZE], a2
3753	LDF	[BO + 16 * SIZE], a3
3754
3755	FMUL	a1, c05, c05
3756
3757	FNMSUB	(aa2, cc05, cc03, cc03)
3758	FNMSUB	(aa3, cc05, cc01, cc01)
3759
3760	LDF	[BO +  9 * SIZE], a1
3761	LDF	[BO +  8 * SIZE], a2
3762
3763	FMUL	a1, c03, c03
3764
3765	FNMSUB	(aa2, cc03, cc01, cc01)
3766
3767	LDF	[BO +  0 * SIZE], a1
3768
3769	FMUL	a1, c01, c01
3770#endif
3771
3772#ifdef LN
3773	add	C1, -1 * SIZE, C1
3774	add	C2, -1 * SIZE, C2
3775	add	C3, -1 * SIZE, C3
3776	add	C4, -1 * SIZE, C4
3777	add	C5, -1 * SIZE, C5
3778	add	C6, -1 * SIZE, C6
3779	add	C7, -1 * SIZE, C7
3780	add	C8, -1 * SIZE, C8
3781#endif
3782
3783#if defined(LN) || defined(LT)
3784	STF	c01, [BO +  0 * SIZE]
3785	STF	c03, [BO +  1 * SIZE]
3786	STF	c05, [BO +  2 * SIZE]
3787	STF	c07, [BO +  3 * SIZE]
3788
3789	STF	c09, [BO +  4 * SIZE]
3790	STF	c11, [BO +  5 * SIZE]
3791	STF	c13, [BO +  6 * SIZE]
3792	STF	c15, [BO +  7 * SIZE]
3793#else
3794	STF	c01, [AO +  0 * SIZE]
3795	STF	c03, [AO +  1 * SIZE]
3796	STF	c05, [AO +  2 * SIZE]
3797	STF	c07, [AO +  3 * SIZE]
3798
3799	STF	c09, [AO +  4 * SIZE]
3800	STF	c11, [AO +  5 * SIZE]
3801	STF	c13, [AO +  6 * SIZE]
3802	STF	c15, [AO +  7 * SIZE]
3803#endif
3804
3805	STF	c01, [C1 + 0 * SIZE]
3806	STF	c03, [C2 + 0 * SIZE]
3807	STF	c05, [C3 + 0 * SIZE]
3808	STF	c07, [C4 + 0 * SIZE]
3809
3810	STF	c09, [C5 + 0 * SIZE]
3811	STF	c11, [C6 + 0 * SIZE]
3812	STF	c13, [C7 + 0 * SIZE]
3813	STF	c15, [C8 + 0 * SIZE]
3814
3815#ifdef RT
3816	sll	K, BASE_SHIFT + 0, TEMP1
3817	add	AORIG, TEMP1, AORIG
3818#endif
3819
3820#if defined(LT) || defined(RN)
3821	sub	K, KK, TEMP1
3822	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3823	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3824	add	AO, TEMP2, AO
3825	add	BO, TEMP1, BO
3826#endif
3827
3828#ifdef LT
3829	add	KK, 1, KK
3830#endif
3831
3832#ifdef LN
3833	sub	KK, 1, KK
3834#endif
3835	.align 4
3836
3837.LL29:
3838#ifdef LN
3839	sll	K, BASE_SHIFT + 3, TEMP1
3840	add	B, TEMP1, B
3841#endif
3842
3843#if defined(LT) || defined(RN)
3844	mov	BO, B
3845#endif
3846
3847#ifdef RN
3848	add	KK, 8, KK
3849#endif
3850
3851#ifdef RT
3852	sub	KK, 8, KK
3853#endif
3854
3855	add	J, -1, J
3856	cmp	J, 0
3857	bg,pt	%icc, .LL11
3858	nop
3859	.align 4
3860
3861.LL999:
3862#ifdef TRMMKERNEL
3863#ifndef __64BIT__
3864	ld	[%sp + STACK_START +  8], %g1
3865	ld	[%sp + STACK_START + 12], %g2
3866	ld	[%sp + STACK_START + 16], %g3
3867	ld	[%sp + STACK_START + 20], %g4
3868#else
3869	ldx	[%sp + STACK_START + 32], %g1
3870	ldx	[%sp + STACK_START + 40], %g2
3871	ldx	[%sp + STACK_START + 48], %g3
3872	ldx	[%sp + STACK_START + 56], %g4
3873#endif
3874#endif
3875
3876	return	%i7 + 8
3877	clr	%o0
3878
3879	EPILOGUE
3880