1/*********************************************************************/
2/* Copyright 2005-2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer register aliases and prefetch tuning used by this kernel.
   All names are plain preprocessor aliases for SPARC registers. */

/* Distance ahead of AO (multiplied by SIZE at each use) and the second
   operand handed to the `prefetch` instructions issued on the A stream. */
42#define APREFETCHSIZE 24
43#define APREFETCH_CATEGORY 0
44
/* M, N and K arrive in the first three incoming registers.  M drives the
   tile-row loops, N selects which column panels run, and K bounds the
   depth loops. */
45#define M	%i0
46#define N	%i1
47#define K	%i2
48
/* A and B trade registers when DOUBLE is built without __64BIT__; in that
   configuration the prologue also reloads B from the stack. */
49#if defined(DOUBLE) && !defined(__64BIT__)
50#define A	%i5
51#define B	%i4
52#else
53#define A	%i4
54#define B	%i5
55#endif
56
/* C and LDC are loaded from the stack by the prologue; LDC is then
   shifted left by BASE_SHIFT. */
57#define C	%o4
58#define LDC	%o5
59
/* AO/BO: working pointers into the A and B data for the current tile.
   I: tile counter along M.  J: result of the N bit test for a panel.
   L: depth-loop trip counter. */
60#define AO	%l0
61#define BO	%l1
62#define I	%l2
63#define J	%l3
64#define L	%l4
65
/* Column pointers into C for the current panel. */
66#define C1	%o0
67#define C2	%o1
68#define C3	%o2
69#define C4	%o3
70
/* Further column pointers.  They are not referenced in the part of the
   file visible here -- presumably used by a wider panel; confirm. */
71#define C5	%l5
72#define	C6	%l6
73#define C7	%l7
74#define C8	%i3
75
/* OFFSET: argument loaded from the stack by the prologue.
   KK: running offset derived from OFFSET, M or N depending on the variant.
   TEMP1/TEMP2: scratch for address arithmetic.
   AORIG: base of A kept for the LN and RT variants. */
76#define OFFSET	%g1
77#define	KK	%g2
78#define TEMP1	%g3
79#define TEMP2	%g4
80#define AORIG	%o7
81
/* Floating-point register aliases.
     c01..c16  accumulators for the current tile of C
     a1..a5    values loaded from the A stream (also reused as scratch
               while solving)
     b1..b9    values loaded from the B stream
   The ccNN/aaN/bbN numbers are the operands given to the FCLR, FMADD and
   FNMSUB macros, which are not defined in this file. */
82#ifdef DOUBLE
/* DOUBLE: every alias is an even-numbered register. */
83#define c01	%f0
84#define c02	%f2
85#define c03	%f4
86#define c04	%f6
87#define c05	%f8
88#define c06	%f10
89#define c07	%f12
90#define c08	%f14
91#define c09	%f16
92#define c10	%f18
93#define c11	%f20
94#define c12	%f22
95#define c13	%f24
96#define c14	%f26
97#define c15	%f28
98#define c16	%f30
99
100#define a1	%f32
101#define a2	%f34
102#define a3	%f36
103#define a4	%f38
104#define a5	%f40
105
106#define b1	%f42
107#define b2	%f44
108#define b3	%f46
109#define b4	%f48
110#define b5	%f50
111#define b6	%f52
112#define b7	%f54
113#define b8	%f56
114#define b9	%f58
115
/* For c01..c16 the macro operand equals the %f register number. */
116#define cc01	0
117#define cc02	2
118#define cc03	4
119#define cc04	6
120#define cc05	8
121#define cc06	10
122#define cc07	12
123#define cc08	14
124#define cc09	16
125#define cc10	18
126#define cc11	20
127#define cc12	22
128#define cc13	24
129#define cc14	26
130#define cc15	28
131#define cc16	30
132
/* For a1..a5 and b1..b9 the operand (1, 3, 5, ...) is NOT the %f number
   (32, 34, 36, ...).  NOTE(review): this looks like the SPARC V9 5-bit
   encoding of the upper double registers -- confirm against the macro
   definitions. */
133#define aa1	 1
134#define aa2	 3
135#define aa3	 5
136#define aa4	 7
137#define aa5	 9
138
139#define bb1	11
140#define bb2	13
141#define bb3	15
142#define bb4	17
143#define bb5	19
144#define bb6	21
145#define bb7	23
146#define bb8	25
147#define bb9	27
148
149#else
/* Single precision: consecutive registers, and every macro operand below
   equals the %f register number of the matching alias. */
150#define c01	%f0
151#define c02	%f1
152#define c03	%f2
153#define c04	%f3
154#define c05	%f4
155#define c06	%f5
156#define c07	%f6
157#define c08	%f7
158#define c09	%f8
159#define c10	%f9
160#define c11	%f10
161#define c12	%f11
162#define c13	%f12
163#define c14	%f13
164#define c15	%f14
165#define c16	%f15
166
167#define a1	%f16
168#define a2	%f17
169#define a3	%f18
170#define a4	%f19
171#define a5	%f20
172
173#define b1	%f21
174#define b2	%f22
175#define b3	%f23
176#define b4	%f24
177#define b5	%f25
178#define b6	%f26
179#define b7	%f27
180#define b8	%f28
181#define b9	%f29
182
183#define cc01	0
184#define cc02	1
185#define cc03	2
186#define cc04	3
187#define cc05	4
188#define cc06	5
189#define cc07	6
190#define cc08	7
191#define cc09	8
192#define cc10	9
193#define cc11	10
194#define cc12	11
195#define cc13	12
196#define cc14	13
197#define cc15	14
198#define cc16	15
199
200#define aa1	16
201#define aa2	17
202#define aa3	18
203#define aa4	19
204#define aa5	20
205
206#define bb1	21
207#define bb2	22
208#define bb3	23
209#define bb4	24
210#define bb5	25
211#define bb6	26
212#define bb7	27
213#define bb8	28
214#define bb9	29
215
216#endif
217
/* Declare to the assembler that %g2 and %g3 are used as scratch
   registers in this file. */
218        .register %g2, #scratch
219        .register %g3, #scratch
220
/* Kernel entry.  PROLOGUE and SAVESP are macros defined outside this
   file (presumably the symbol header and the register-window save). */
221	PROLOGUE
222	SAVESP
223	nop
224
225#ifndef __64BIT__
226
/* Non-64-bit build: fetch the arguments that live on the stack.  With
   DOUBLE, B is read from the stack as well. */
227#ifdef DOUBLE
228	ld	[%sp + STACK_START + 28], B
229	ld	[%sp + STACK_START + 32], C
230	ld	[%sp + STACK_START + 36], LDC
231	ld	[%sp + STACK_START + 40], OFFSET
232#else
233	ld	[%sp + STACK_START + 28], C
234	ld	[%sp + STACK_START + 32], LDC
235	ld	[%sp + STACK_START + 36], OFFSET
236#endif
/* Spill %g1..%g4 to the stack.  NOTE(review): OFFSET is %g1 and has
   already been loaded above, so the %g1 value stored here is OFFSET, not
   the value %g1 held on entry -- confirm this is intended. */
237	st	%g1, [%sp + STACK_START +  8]
238	st	%g2, [%sp + STACK_START + 12]
239	st	%g3, [%sp + STACK_START + 16]
240	st	%g4, [%sp + STACK_START + 20]
241#else
242
/* 64-bit build: C, LDC and OFFSET come from the stack. */
243	ldx	[%sp+  STACK_START + 56], C
244	ldx	[%sp+  STACK_START + 64], LDC
245	ldx	[%sp+  STACK_START + 72], OFFSET
246
/* Spill %g1..%g4.  The %g4 store reuses the slot at +56 that C was just
   read from; as in the 32-bit path, %g1 already holds OFFSET here. */
247	stx	%g1, [%sp + STACK_START + 32]
248	stx	%g2, [%sp + STACK_START + 40]
249	stx	%g3, [%sp + STACK_START + 48]
250	stx	%g4, [%sp + STACK_START + 56]
251#endif
252
253#if defined(TRMMKERNEL) && !defined(LEFT)
254	neg	OFFSET, KK
255#endif
256
/* LDC <<= BASE_SHIFT (presumably converting an element count to bytes;
   BASE_SHIFT is defined outside this file). */
257	sll	LDC, BASE_SHIFT, LDC
258
259#ifdef LN
/* LN: move A forward by (M * K) << BASE_SHIFT and C forward by
   M << BASE_SHIFT. */
260	smul	M, K, TEMP1
261	sll	TEMP1, BASE_SHIFT, TEMP1
262	add	A, TEMP1, A
263
264	sll	M, BASE_SHIFT, TEMP1
265	add	C, TEMP1, C
266#endif
267
268#ifdef RN
/* RN: KK = -OFFSET. */
269	neg	OFFSET, KK
270#endif
271
272#ifdef RT
/* RT: move B forward by (N * K) << BASE_SHIFT, C forward by N * LDC,
   and set KK = N - OFFSET. */
273	smul	N, K, TEMP1
274	sll	TEMP1, BASE_SHIFT, TEMP1
275	add	B, TEMP1, B
276
277	smul	N, LDC, TEMP1
278	add	C, TEMP1, C
279
280	sub	N, OFFSET, KK
281#endif
282
/* One-column panel: runs only when bit 0 of N is set, otherwise skip
   to .LL50. */
283	and	N, 1, J
284	cmp	J, 0
285	ble,pn	%icc, .LL50
286	nop
287
288#ifdef RT
/* RT: step B back by K << BASE_SHIFT. */
289	sll	K, BASE_SHIFT, TEMP1
290	sub	B, TEMP1, B
291#endif
292
/* Pick the column pointer C1 and move C past this panel: forward by LDC
   normally, backward by LDC for RT. */
293#ifndef RT
294	mov	C,  C1
295	add	C1, LDC, C
296#else
297	sub	C,  LDC, C1
298	sub	C,  LDC, C
299#endif
300
301#ifdef LN
/* LN: KK = M + OFFSET. */
302	add	M, OFFSET, KK
303#endif
304
305#ifdef LT
/* LT: KK = OFFSET. */
306	mov	OFFSET, KK
307#endif
308
/* LN and RT keep the A base in AORIG and derive AO per tile; the other
   variants walk AO directly from A. */
309#if defined(LN) || defined(RT)
310	mov	A, AORIG
311#else
312	mov	A, AO
313#endif
314
/* I = M >> 1 two-row tiles; none means go to the single-row tail. */
315	sra	M, 1, I
316	cmp	I, 0
317	ble,pn	%icc, .LL80
318	nop
319	.align 4
320
/* Start of one 2-row by 1-column tile: position AO/BO, preload operands,
   clear the two accumulators and compute the unrolled trip count. */
321.LL72:
322#if defined(LT) || defined(RN)
323	mov	B, BO
324#else
325#ifdef LN
/* LN: step AORIG back by K << (BASE_SHIFT + 1). */
326	sll	K,  BASE_SHIFT + 1, TEMP1
327	sub	AORIG, TEMP1, AORIG
328#endif
329
/* AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << BASE_SHIFT). */
330	sll	KK, BASE_SHIFT + 1, TEMP1
331	sll	KK, BASE_SHIFT + 0, TEMP2
332
333	add	AORIG, TEMP1, AO
334	add	B,     TEMP2, BO
335#endif
336
/* Preload four A values and four B values for the unrolled loop. */
337	LDF	[AO +  0 * SIZE], a1
338	LDF	[AO +  1 * SIZE], a2
339	LDF	[AO +  2 * SIZE], a3
340	LDF	[AO +  3 * SIZE], a4
341
342	LDF	[BO +  0 * SIZE], b1
343	LDF	[BO +  1 * SIZE], b2
344	LDF	[BO +  2 * SIZE], b3
/* FCLR is a macro defined outside this file; presumably it zeroes the
   named accumulator. */
345	FCLR	(cc01)
346	LDF	[BO +  3 * SIZE], b4
347	FCLR	(cc02)
348
349	prefetch [C1 + 2 * SIZE], 3
350
/* L = number of four-step groups: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
351#if defined(LT) || defined(RN)
352	sra	KK, 2, L
353#else
354	sub	K, KK, L
355	sra	L,  2, L
356#endif
357	cmp	L,  0
358	ble,pn	%icc, .LL75
359	nop
360
/* 2x1 depth loop, four steps per iteration.  Each step feeds one B value
   (b1..b4) and an A pair (a1/a2 or a3/a4) into cc01 and cc02 through the
   FMADD macro (defined outside this file; presumably last = first *
   second + third).  Loads for later steps are interleaved with the
   arithmetic.  Per iteration AO advances 8 * SIZE and BO 4 * SIZE. */
361.LL73:
362	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
363	add	L, -1, L
364
365	FMADD	(aa1, bb1, cc01, cc01)
366	LDF	[AO +  4 * SIZE], a1
367	FMADD	(aa2, bb1, cc02, cc02)
368	LDF	[AO +  5 * SIZE], a2
369
370	LDF	[BO +  4 * SIZE], b1
/* Condition codes for the loop branch are set here; nothing below
   uses a cc-setting instruction before the branch. */
371	cmp	L, 0
372
373	FMADD	(aa3, bb2, cc01, cc01)
374	LDF	[AO +  6 * SIZE], a3
375	FMADD	(aa4, bb2, cc02, cc02)
376	LDF	[AO +  7 * SIZE], a4
377
378	LDF	[BO +  5 * SIZE], b2
/* BO moves now, so the remaining BO offsets are relative to the new BO. */
379	add	BO,  4 * SIZE, BO
380
381	FMADD	(aa1, bb3, cc01, cc01)
382	LDF	[AO +  8 * SIZE], a1
383	FMADD	(aa2, bb3, cc02, cc02)
384	LDF	[AO +  9 * SIZE], a2
385
386	LDF	[BO +  2 * SIZE], b3
/* AO moves now, so the remaining AO offsets are relative to the new AO. */
387	add	AO,  8 * SIZE, AO
388
389	FMADD	(aa3, bb4, cc01, cc01)
390	LDF	[AO +  2 * SIZE], a3
391	FMADD	(aa4, bb4, cc02, cc02)
392	LDF	[AO +  3 * SIZE], a4
393
/* The b4 reload sits in the branch delay slot. */
394	bg,pt	%icc, .LL73
395	LDF	[BO +  3 * SIZE], b4
396	.align 4
397
/* 2x1 remainder: L = (depth count) & 3 single steps that the unrolled
   loop did not cover.  The depth count is KK for LT and RN, K - KK
   otherwise. */
398.LL75:
399#if defined(LT) || defined(RN)
400	and	KK, 3, L
401#else
402	sub	K, KK, L
403	and	L,  3, L
404#endif
405	cmp	L,  0
406	ble,a,pn %icc, .LL78
407	nop
408	.align 4
409
/* One depth step per iteration: AO advances 2 * SIZE, BO 1 * SIZE (the
   BO increment is in the branch delay slot). */
410.LL77:
411	FMADD	(aa1, bb1, cc01, cc01)
412	LDF	[AO + 2 * SIZE], a1
413	FMADD	(aa2, bb1, cc02, cc02)
414	LDF	[AO + 3 * SIZE], a2
415
416	LDF	[BO + 1 * SIZE], b1
417	add	L, -1, L
418	add	AO, 2 * SIZE, AO
419	cmp	L, 0
420	bg,pt	%icc, .LL77
421	add	BO, 1 * SIZE, BO
422	.align 4
423
/* 2x1 solve and write-back.  The accumulated products are subtracted
   from the stored right-hand side, the small triangular system is
   applied, and the result goes both to the packed buffer and to C. */
424.LL78:
425#if defined(LN) || defined(RT)
/* LN/RT: re-point AO and BO.  TEMP1 = KK - 2 (LN) or KK - 1 (RT);
   AO = AORIG + (TEMP1 << (BASE_SHIFT + 1)), BO = B + (TEMP1 << BASE_SHIFT). */
426#ifdef LN
427	sub	KK, 2, TEMP1
428#else
429	sub	KK, 1, TEMP1
430#endif
431	sll	TEMP1, BASE_SHIFT + 1, TEMP2
432	sll	TEMP1, BASE_SHIFT + 0, TEMP1
433
434	add	AORIG, TEMP2, AO
435	add	B,     TEMP1, BO
436#endif
437
/* Load the two right-hand-side values (from BO for LN/LT, from AO for
   RN/RT) and apply FSUB; presumably c = loaded value - c. */
438#if defined(LN) || defined(LT)
439	LDF	[BO +  0 * SIZE], a1
440	LDF	[BO +  1 * SIZE], a2
441
442	FSUB	a1, c01, c01
443	FSUB	a2, c02, c02
444#else
445	LDF	[AO +  0 * SIZE], a1
446	LDF	[AO +  1 * SIZE], a2
447
448	FSUB	a1, c01, c01
449	FSUB	a2, c02, c02
450#endif
451
452#ifdef LN
/* LN: process c02 first, then c01, using AO[3], AO[2] and AO[0].  The
   diagonal entries are multiplied in rather than divided by, so they are
   presumably stored pre-inverted -- confirm against the packing code. */
453	LDF	[AO +  3 * SIZE], a1
454	LDF	[AO +  2 * SIZE], a2
455	LDF	[AO +  0 * SIZE], a3
456
457	FMUL	a1, c02, c02
458
/* FNMSUB is a macro defined outside this file; presumably
   cc01 = cc01 - aa2 * cc02. */
459	FNMSUB	(aa2, cc02, cc01, cc01)
460
461	FMUL	a3, c01, c01
462#endif
463
464#ifdef LT
/* LT: process c01 first, then c02, using AO[0], AO[1] and AO[3]. */
465	LDF	[AO +  0 * SIZE], a1
466	LDF	[AO +  1 * SIZE], a2
467	LDF	[AO +  3 * SIZE], a3
468
469	FMUL	a1, c01, c01
470
471	FNMSUB	(aa2, cc01, cc02, cc02)
472
473	FMUL	a3, c02, c02
474#endif
475
476#if defined(RN) || defined(RT)
/* RN/RT: a single column, so both rows are scaled by BO[0]. */
477	LDF	[BO +  0 * SIZE], a1
478
479	FMUL	a1, c01, c01
480	FMUL	a1, c02, c02
481#endif
482
483#ifdef LN
/* LN: step C1 back by two elements before storing. */
484	add	C1, -2 * SIZE, C1
485#endif
486
/* Write the result back into the packed buffer ... */
487#if defined(LN) || defined(LT)
488	STF	c01, [BO +  0 * SIZE]
489	STF	c02, [BO +  1 * SIZE]
490#else
491	STF	c01, [AO +  0 * SIZE]
492	STF	c02, [AO +  1 * SIZE]
493#endif
494
/* ... and into C. */
495	STF	c01, [C1 + 0 * SIZE]
496	STF	c02, [C1 + 1 * SIZE]
497
498#ifndef LN
499	add	C1, 2 * SIZE, C1
500#endif
501
502#ifdef RT
/* RT: AORIG += K << (BASE_SHIFT + 1), i.e. on to the next two-row tile. */
503	sll	K, BASE_SHIFT + 1, TEMP1
504	add	AORIG, TEMP1, AORIG
505#endif
506
507#if defined(LT) || defined(RN)
/* LT/RN: skip the K - KK depth steps that were not traversed. */
508	sub	K, KK, TEMP1
509	sll	TEMP1, BASE_SHIFT + 1, TEMP2
510	sll	TEMP1, BASE_SHIFT + 0, TEMP1
511	add	AO, TEMP2, AO
512	add	BO, TEMP1, BO
513#endif
514
515#ifdef LT
516	add	KK, 2, KK
517#endif
518
519#ifdef LN
520	sub	KK, 2, KK
521#endif
522
/* Next two-row tile. */
523	add	I, -1, I
524	cmp	I, 0
525	bg,pt	%icc, .LL72
526	nop
527	.align 4
528
/* Single-row tail of the one-column panel: runs only when M is odd. */
529.LL80:
530	and	M, 1, I
531	cmp	I, 0
532	ble,pn	%icc, .LL89
533	nop
534
535#if defined(LT) || defined(RN)
536	mov	B, BO
537#else
538#ifdef LN
/* LN: step AORIG back by K << BASE_SHIFT. */
539	sll	K,  BASE_SHIFT + 0, TEMP1
540	sub	AORIG, TEMP1, AORIG
541#endif
542
/* Both AO and BO are offset by KK << BASE_SHIFT from their bases. */
543	sll	KK, BASE_SHIFT + 0, TEMP1
544	sll	KK, BASE_SHIFT + 0, TEMP2
545
546	add	AORIG, TEMP1, AO
547	add	B,     TEMP2, BO
548#endif
549
/* Preload four A/B pairs for the unrolled loop. */
550	LDF	[AO +  0 * SIZE], a1
551	LDF	[BO +  0 * SIZE], b1
552	LDF	[AO +  1 * SIZE], a2
553	LDF	[BO +  1 * SIZE], b2
554	LDF	[AO +  2 * SIZE], a3
555	LDF	[BO +  2 * SIZE], b3
556	LDF	[AO +  3 * SIZE], a4
557	LDF	[BO +  3 * SIZE], b4
558
559#if defined(LT) || defined(RN)
560	sra	KK, 2, L
561#else
562	sub	K, KK, L
563	sra	L,  2, L
564#endif
565	cmp	L,  0
/* The branch is not annulled, so the FCLR in its delay slot runs on
   both the taken and the fall-through path. */
566	ble,pn	%icc, .LL85
567	FCLR	(cc01)
568	.align 4
569
/* 1x1 depth loop, four steps per iteration: four FMADDs into cc01, each
   followed by reloading its A and B operand for the next iteration.
   AO and BO both advance 4 * SIZE (BO in the branch delay slot). */
570.LL83:
571	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
572	add	L, -1, L
573
574	FMADD	(aa1, bb1, cc01, cc01)
575	LDF	[AO +  4 * SIZE], a1
576	LDF	[BO +  4 * SIZE], b1
577
578	FMADD	(aa2, bb2, cc01, cc01)
579	LDF	[AO +  5 * SIZE], a2
580	LDF	[BO +  5 * SIZE], b2
581
582	FMADD	(aa3, bb3, cc01, cc01)
583	LDF	[AO +  6 * SIZE], a3
584	LDF	[BO +  6 * SIZE], b3
585
586	FMADD	(aa4, bb4, cc01, cc01)
587	LDF	[AO +  7 * SIZE], a4
588	LDF	[BO +  7 * SIZE], b4
589
590	add	AO,  4 * SIZE, AO
591	cmp	L, 0
592
593	bg,pt	%icc, .LL83
594	add	BO,  4 * SIZE, BO
595	.align 4
596
/* 1x1 remainder: L = (depth count) & 3 leftover single steps. */
597.LL85:
598#if defined(LT) || defined(RN)
599	and	KK, 3, L
600#else
601	sub	K, KK, L
602	and	L,  3, L
603#endif
604	cmp	L,  0
605	ble,a,pn %icc, .LL88
606	nop
607	.align 4
608
/* One step per iteration; AO and BO each advance 1 * SIZE. */
609.LL87:
610	FMADD	(aa1, bb1, cc01, cc01)
611	LDF	[AO + 1 * SIZE], a1
612	LDF	[BO + 1 * SIZE], b1
613
614	add	AO, 1 * SIZE, AO
615	add	L, -1, L
616	cmp	L, 0
617	bg,pt	%icc, .LL87
618	add	BO, 1 * SIZE, BO
619	.align 4
620
/* 1x1 solve and write-back. */
621.LL88:
622#if defined(LN) || defined(RT)
/* LN/RT: both branches compute TEMP1 = KK - 1; AO and BO are then
   re-pointed at AORIG/B plus TEMP1 << BASE_SHIFT. */
623#ifdef LN
624	sub	KK, 1, TEMP1
625#else
626	sub	KK, 1, TEMP1
627#endif
628	sll	TEMP1, BASE_SHIFT + 0, TEMP2
629	sll	TEMP1, BASE_SHIFT + 0, TEMP1
630
631	add	AORIG, TEMP2, AO
632	add	B,     TEMP1, BO
633#endif
634
/* Load the right-hand-side value (BO for LN/LT, AO for RN/RT) and apply
   FSUB; presumably c01 = loaded value - c01. */
635#if defined(LN) || defined(LT)
636	LDF	[BO +  0 * SIZE], a1
637
638	FSUB	a1, c01, c01
639#else
640	LDF	[AO +  0 * SIZE], a1
641
642	FSUB	a1, c01, c01
643#endif
644
/* Scale by the single diagonal entry: AO[0] for LN/LT, BO[0] for RN/RT. */
645#if defined(LN) || defined(LT)
646	LDF	[AO +  0 * SIZE], a1
647
648	FMUL	a1, c01, c01
649#endif
650
651#if defined(RN) || defined(RT)
652	LDF	[BO +  0 * SIZE], a1
653
654	FMUL	a1, c01, c01
655#endif
656
657#ifdef LN
658	add	C1, -1 * SIZE, C1
659#endif
660
/* Store to the packed buffer and to C. */
661#if defined(LN) || defined(LT)
662	STF	c01, [BO +  0 * SIZE]
663#else
664	STF	c01, [AO +  0 * SIZE]
665#endif
666
667	STF	c01, [C1 + 0 * SIZE]
668
669#ifdef RT
670	sll	K, BASE_SHIFT + 0, TEMP1
671	add	AORIG, TEMP1, AORIG
672#endif
673
674#if defined(LT) || defined(RN)
/* LT/RN: skip the K - KK depth steps that were not traversed. */
675	sub	K, KK, TEMP1
676	sll	TEMP1, BASE_SHIFT + 0, TEMP2
677	sll	TEMP1, BASE_SHIFT + 0, TEMP1
678	add	AO, TEMP2, AO
679	add	BO, TEMP1, BO
680#endif
681
682#ifdef LT
683	add	KK, 1, KK
684#endif
685
686#ifdef LN
687	sub	KK, 1, KK
688#endif
689	.align 4
690
/* End of the one-column panel: move B and KK on to the next panel. */
691.LL89:
692#ifdef LN
/* LN: B += K << BASE_SHIFT. */
693	sll	K, BASE_SHIFT, TEMP1
694	add	B, TEMP1, B
695#endif
696
697#if defined(LT) || defined(RN)
/* LT/RN: BO already points just past this panel's B data. */
698	mov	BO, B
699#endif
700
701#ifdef RN
702	add	KK, 1, KK
703#endif
704
705#ifdef RT
706	sub	KK, 1, KK
707#endif
708	.align 4
709
/* Two-column panel: runs only when bit 1 of N is set, otherwise skip
   to .LL30. */
710.LL50:
711	and	N, 2, J
712	cmp	J, 0
713	ble,pn	%icc, .LL30
714	nop
715
716#ifdef RT
/* RT: step B back by K << (BASE_SHIFT + 1). */
717	sll	K, BASE_SHIFT + 1, TEMP1
718	sub	B, TEMP1, B
719#endif
720
/* Set the column pointers C1 and C2 (one LDC apart) and move C past the
   panel: forward by two columns normally, backward for RT. */
721#ifndef RT
722	mov	C,  C1
723	add	C,  LDC, C2
724	add	C2, LDC, C
725#else
726	sub	C,  LDC, C2
727	sub	C2, LDC, C1
728	sub	C2, LDC, C
729#endif
730
731#ifdef LN
732	add	M, OFFSET, KK
733#endif
734
735#ifdef LT
736	mov	OFFSET, KK
737#endif
738
739#if defined(LN) || defined(RT)
740	mov	A, AORIG
741#else
742	mov	A, AO
743#endif
744
/* I = M >> 1 two-row tiles; none means go to the single-row tail. */
745	sra	M, 1, I
746	cmp	I, 0
747	ble,pn	%icc, .LL60
748	nop
749	.align 4
750
/* Start of one 2-row by 2-column tile: position AO/BO, preload operands,
   clear accumulators and compute the unrolled trip count. */
751.LL52:
752#if defined(LT) || defined(RN)
753	mov	B, BO
754#else
755#ifdef LN
756	sll	K,  BASE_SHIFT + 1, TEMP1
757	sub	AORIG, TEMP1, AORIG
758#endif
759
/* AO and BO are both offset by KK << (BASE_SHIFT + 1). */
760	sll	KK, BASE_SHIFT + 1, TEMP1
761	sll	KK, BASE_SHIFT + 1, TEMP2
762
763	add	AORIG, TEMP1, AO
764	add	B,     TEMP2, BO
765#endif
766
767	LDF	[AO +  0 * SIZE], a1
768	LDF	[AO +  1 * SIZE], a2
769	LDF	[AO +  2 * SIZE], a3
770	LDF	[AO +  3 * SIZE], a4
771
/* Eight B values are preloaded, interleaved with the FCLR macro calls.
   cc01..cc08 are cleared here although only cc01..cc04 are used by this
   tile's loops. */
772	LDF	[BO +  0 * SIZE], b1
773	LDF	[BO +  1 * SIZE], b2
774	LDF	[BO +  2 * SIZE], b3
775	FCLR	(cc01)
776	LDF	[BO +  3 * SIZE], b4
777	FCLR	(cc02)
778
779	LDF	[BO +  4 * SIZE], b5
780	FCLR	(cc03)
781	LDF	[BO +  5 * SIZE], b6
782	FCLR	(cc04)
783	LDF	[BO +  6 * SIZE], b7
784	FCLR	(cc05)
785	LDF	[BO +  7 * SIZE], b8
786	FCLR	(cc06)
787
788	prefetch [C1 + 2 * SIZE], 3
789	FCLR	(cc07)
790	prefetch [C2 + 2 * SIZE], 3
791	FCLR	(cc08)
792
/* L = number of four-step groups: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
793#if defined(LT) || defined(RN)
794	sra	KK, 2, L
795#else
796	sub	K, KK, L
797	sra	L,  2, L
798#endif
799	cmp	L,  0
800	ble,pn	%icc, .LL55
801	nop
802	.align 4
803
/* 2x2 depth loop, four steps per iteration.  Each step uses an A pair
   (a1/a2 or a3/a4) and a B pair (b1/b2, b3/b4, b5/b6, b7/b8) to update
   cc01..cc04 through the FMADD macro (defined outside this file).
   AO and BO both advance 8 * SIZE per iteration. */
804.LL53:
805	FMADD	(aa1, bb1, cc01, cc01)
806	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
807	FMADD	(aa2, bb1, cc02, cc02)
808	LDF	[BO +  8 * SIZE], b1
809
810	FMADD	(aa1, bb2, cc03, cc03)
811	LDF	[AO +  4 * SIZE], a1
812	FMADD	(aa2, bb2, cc04, cc04)
813	LDF	[AO +  5 * SIZE], a2
814
815	FMADD	(aa3, bb3, cc01, cc01)
816	LDF	[BO +  9 * SIZE], b2
817	FMADD	(aa4, bb3, cc02, cc02)
818	LDF	[BO + 10 * SIZE], b3
819
820	FMADD	(aa3, bb4, cc03, cc03)
821	LDF	[AO +  6 * SIZE], a3
822	FMADD	(aa4, bb4, cc04, cc04)
823	LDF	[AO +  7 * SIZE], a4
824
825	FMADD	(aa1, bb5, cc01, cc01)
826	LDF	[BO + 11 * SIZE], b4
827	FMADD	(aa2, bb5, cc02, cc02)
828	LDF	[BO + 12 * SIZE], b5
829
830	FMADD	(aa1, bb6, cc03, cc03)
831	LDF	[AO +  8 * SIZE], a1
832	FMADD	(aa2, bb6, cc04, cc04)
833	LDF	[AO +  9 * SIZE], a2
834
835	FMADD	(aa3, bb7, cc01, cc01)
836	LDF	[BO + 13 * SIZE], b6
837
838	FMADD	(aa4, bb7, cc02, cc02)
839	LDF	[BO + 14 * SIZE], b7
840
841	FMADD	(aa3, bb8, cc03, cc03)
842	LDF	[AO + 10 * SIZE], a3
843	FMADD	(aa4, bb8, cc04, cc04)
844	LDF	[AO + 11 * SIZE], a4
845
846	add	AO,  8 * SIZE, AO
847	add	L, -1, L
848	add	BO,  8 * SIZE, BO
849	cmp	L, 0
850
/* b8 for the next iteration is reloaded in the delay slot, relative to
   the already advanced BO. */
851	bg,pt	%icc, .LL53
852	LDF	[BO +  7 * SIZE], b8
853	.align 4
854
/* 2x2 remainder: L = (depth count) & 3 leftover single steps. */
855.LL55:
856#if defined(LT) || defined(RN)
857	and	KK, 3, L
858#else
859	sub	K, KK, L
860	and	L,  3, L
861#endif
862	cmp	L,  0
863	ble,a,pn %icc, .LL58
864	nop
865	.align 4
866
/* One step per iteration; AO and BO each advance 2 * SIZE.  The b2
   reload in the delay slot is relative to the advanced BO. */
867.LL57:
868	FMADD	(aa1, bb1, cc01, cc01)
869	add	L, -1, L
870	FMADD	(aa2, bb1, cc02, cc02)
871	LDF	[BO + 2 * SIZE], b1
872
873	FMADD	(aa1, bb2, cc03, cc03)
874	LDF	[AO + 2 * SIZE], a1
875	FMADD	(aa2, bb2, cc04, cc04)
876	LDF	[AO + 3 * SIZE], a2
877
878	add	AO, 2 * SIZE, AO
879	cmp	L, 0
880	add	BO, 2 * SIZE, BO
881	bg,pt	%icc, .LL57
882	LDF	[BO + 1 * SIZE], b2
883	.align 4
884
/* 2x2 solve and write-back.  c01/c02 are stored to column C1 and c03/c04
   to column C2 (see the STF block further down). */
885.LL58:
886#if defined(LN) || defined(RT)
/* LN/RT: both branches compute TEMP1 = KK - 2; AO and BO are re-pointed
   at AORIG/B plus TEMP1 << (BASE_SHIFT + 1). */
887#ifdef LN
888	sub	KK, 2, TEMP1
889#else
890	sub	KK, 2, TEMP1
891#endif
892	sll	TEMP1, BASE_SHIFT + 1, TEMP2
893	sll	TEMP1, BASE_SHIFT + 1, TEMP1
894
895	add	AORIG, TEMP2, AO
896	add	B,     TEMP1, BO
897#endif
898
/* Apply FSUB against the stored right-hand side (presumably c = loaded
   value - c).  The buffer order differs by variant: for LN/LT the BO
   buffer holds c01, c03, c02, c04; for RN/RT the AO buffer holds c01,
   c02, c03, c04. */
899#if defined(LN) || defined(LT)
900	LDF	[BO +  0 * SIZE], a1
901	LDF	[BO +  1 * SIZE], a2
902	LDF	[BO +  2 * SIZE], a3
903	LDF	[BO +  3 * SIZE], a4
904
905	FSUB	a1, c01, c01
906	FSUB	a2, c03, c03
907	FSUB	a3, c02, c02
908	FSUB	a4, c04, c04
909#else
910	LDF	[AO +  0 * SIZE], a1
911	LDF	[AO +  1 * SIZE], a2
912	LDF	[AO +  2 * SIZE], a3
913	LDF	[AO +  3 * SIZE], a4
914
915	FSUB	a1, c01, c01
916	FSUB	a2, c02, c02
917	FSUB	a3, c03, c03
918	FSUB	a4, c04, c04
919#endif
920
921#ifdef LN
/* LN: second row first (scaled by AO[3]), eliminated from the first row
   with AO[2], then the first row scaled by AO[0].  Diagonal entries are
   multiplied in, so they are presumably stored pre-inverted. */
922	LDF	[AO +  3 * SIZE], a1
923	LDF	[AO +  2 * SIZE], a2
924	LDF	[AO +  0 * SIZE], a3
925
926	FMUL	a1, c02, c02
927	FMUL	a1, c04, c04
928
929	FNMSUB	(aa2, cc02, cc01, cc01)
930	FNMSUB	(aa2, cc04, cc03, cc03)
931
932	FMUL	a3, c01, c01
933	FMUL	a3, c03, c03
934#endif
935
936#ifdef LT
/* LT: first row first (AO[0]), eliminated from the second row with
   AO[1], then the second row scaled by AO[3]. */
937	LDF	[AO +  0 * SIZE], a1
938	LDF	[AO +  1 * SIZE], a2
939	LDF	[AO +  3 * SIZE], a3
940
941	FMUL	a1, c01, c01
942	FMUL	a1, c03, c03
943
944	FNMSUB	(aa2, cc01, cc02, cc02)
945	FNMSUB	(aa2, cc03, cc04, cc04)
946
947	FMUL	a3, c02, c02
948	FMUL	a3, c04, c04
949#endif
950
951#ifdef RN
/* RN: first column first (BO[0]), eliminated from the second column
   with BO[1], then the second column scaled by BO[3]. */
952	LDF	[BO +  0 * SIZE], a1
953	LDF	[BO +  1 * SIZE], a2
954
955	FMUL	a1, c01, c01
956	FMUL	a1, c02, c02
957
958	FNMSUB	(aa2, cc01, cc03, cc03)
959	FNMSUB	(aa2, cc02, cc04, cc04)
960
961	LDF	[BO +  3 * SIZE], a1
962
963	FMUL	a1, c03, c03
964	FMUL	a1, c04, c04
965#endif
966
967#ifdef RT
/* RT: second column first (BO[3]), eliminated from the first column
   with BO[2], then the first column scaled by BO[0]. */
968	LDF	[BO +  3 * SIZE], a1
969	LDF	[BO +  2 * SIZE], a2
970
971	FMUL	a1, c04, c04
972	FMUL	a1, c03, c03
973
974	FNMSUB	(aa2, cc04, cc02, cc02)
975	FNMSUB	(aa2, cc03, cc01, cc01)
976
977	LDF	[BO +  0 * SIZE], a1
978
979	FMUL	a1, c02, c02
980	FMUL	a1, c01, c01
981#endif
982
983#ifdef LN
/* LN: step both column pointers back by two elements before storing. */
984	add	C1, -2 * SIZE, C1
985	add	C2, -2 * SIZE, C2
986#endif
987
/* Write the result back to the packed buffer, in the same order it was
   read above ... */
988#if defined(LN) || defined(LT)
989	STF	c01, [BO +  0 * SIZE]
990	STF	c03, [BO +  1 * SIZE]
991	STF	c02, [BO +  2 * SIZE]
992	STF	c04, [BO +  3 * SIZE]
993#else
994	STF	c01, [AO +  0 * SIZE]
995	STF	c02, [AO +  1 * SIZE]
996	STF	c03, [AO +  2 * SIZE]
997	STF	c04, [AO +  3 * SIZE]
998#endif
999
/* ... and to the two columns of C. */
1000	STF	c01, [C1 + 0 * SIZE]
1001	STF	c02, [C1 + 1 * SIZE]
1002	STF	c03, [C2 + 0 * SIZE]
1003	STF	c04, [C2 + 1 * SIZE]
1004
1005#ifndef LN
1006	add	C1, 2 * SIZE, C1
1007	add	C2, 2 * SIZE, C2
1008#endif
1009
1010#ifdef RT
1011	sll	K, BASE_SHIFT + 1, TEMP1
1012	add	AORIG, TEMP1, AORIG
1013#endif
1014
1015#if defined(LT) || defined(RN)
/* LT/RN: skip the K - KK depth steps that were not traversed. */
1016	sub	K, KK, TEMP1
1017	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1018	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1019	add	AO, TEMP2, AO
1020	add	BO, TEMP1, BO
1021#endif
1022
1023#ifdef LT
1024	add	KK, 2, KK
1025#endif
1026
1027#ifdef LN
1028	sub	KK, 2, KK
1029#endif
1030
/* Next two-row tile. */
1031	add	I, -1, I
1032	cmp	I, 0
1033	bg,pt	%icc, .LL52
1034	nop
1035	.align 4
1036
/* Single-row tail of the two-column panel: runs only when M is odd. */
1037.LL60:
1038	and	M, 1, I
1039	cmp	I, 0
1040	ble,pn	%icc, .LL69
1041	nop
1042
1043#if defined(LT) || defined(RN)
1044	mov	B, BO
1045#else
1046#ifdef LN
1047	sll	K,  BASE_SHIFT + 0, TEMP1
1048	sub	AORIG, TEMP1, AORIG
1049#endif
1050
/* AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 1)). */
1051	sll	KK, BASE_SHIFT + 0, TEMP1
1052	sll	KK, BASE_SHIFT + 1, TEMP2
1053
1054	add	AORIG, TEMP1, AO
1055	add	B,     TEMP2, BO
1056#endif
1057
/* Preload four A values and eight B values; the two accumulators used
   by this tile are cc01 and cc03. */
1058	LDF	[AO +  0 * SIZE], a1
1059	LDF	[AO +  1 * SIZE], a2
1060	LDF	[AO +  2 * SIZE], a3
1061	LDF	[AO +  3 * SIZE], a4
1062
1063	LDF	[BO +  0 * SIZE], b1
1064	LDF	[BO +  1 * SIZE], b2
1065	LDF	[BO +  2 * SIZE], b3
1066	LDF	[BO +  3 * SIZE], b4
1067	LDF	[BO +  4 * SIZE], b5
1068	LDF	[BO +  5 * SIZE], b6
1069	LDF	[BO +  6 * SIZE], b7
1070	FCLR	(cc01)
1071	LDF	[BO +  7 * SIZE], b8
1072	FCLR	(cc03)
1073
1074#if defined(LT) || defined(RN)
1075	sra	KK, 2, L
1076#else
1077	sub	K, KK, L
1078	sra	L,  2, L
1079#endif
1080	cmp	L,  0
1081	ble,pn	%icc, .LL65
1082	nop
1083	.align 4
1084
/* 1x2 depth loop, four steps per iteration.  Step n uses one A value
   (a1..a4) with a B pair (b1/b2 .. b7/b8) to update cc01 and cc03.
   AO advances 4 * SIZE and BO 8 * SIZE, both mid-loop, so later offsets
   are relative to the advanced pointers. */
1085.LL63:
1086	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1087	add	L, -1, L
1088
1089	FMADD	(aa1, bb1, cc01, cc01)
1090	LDF	[BO +  8 * SIZE], b1
1091	FMADD	(aa1, bb2, cc03, cc03)
1092	LDF	[BO +  9 * SIZE], b2
1093
1094	LDF	[AO +  4 * SIZE], a1
/* Condition codes for the loop branch are set here. */
1095	cmp	L, 0
1096
1097	FMADD	(aa2, bb3, cc01, cc01)
1098	LDF	[BO + 10 * SIZE], b3
1099	FMADD	(aa2, bb4, cc03, cc03)
1100	LDF	[BO + 11 * SIZE], b4
1101
1102	LDF	[AO +  5 * SIZE], a2
1103	add	AO,  4 * SIZE, AO
1104
1105	FMADD	(aa3, bb5, cc01, cc01)
1106	LDF	[BO + 12 * SIZE], b5
1107	FMADD	(aa3, bb6, cc03, cc03)
1108	LDF	[BO + 13 * SIZE], b6
1109
1110	LDF	[AO +  2 * SIZE], a3
1111	add	BO,  8 * SIZE, BO
1112
1113	FMADD	(aa4, bb7, cc01, cc01)
1114	LDF	[BO +  6 * SIZE], b7
1115	FMADD	(aa4, bb8, cc03, cc03)
1116	LDF	[BO + 7 * SIZE], b8
1117
/* a4 for the next iteration is reloaded in the delay slot. */
1118	bg,pt	%icc, .LL63
1119	LDF	[AO +  3 * SIZE], a4
1120	.align 4
1121
/* 1x2 remainder: L = (depth count) & 3 leftover single steps. */
1122.LL65:
1123#if defined(LT) || defined(RN)
1124	and	KK, 3, L
1125#else
1126	sub	K, KK, L
1127	and	L,  3, L
1128#endif
1129	cmp	L,  0
1130	ble,a,pn %icc, .LL68
1131	nop
1132	.align 4
1133
/* One step per iteration; AO advances 1 * SIZE and BO 2 * SIZE (the BO
   increment is in the branch delay slot). */
1134.LL67:
1135	FMADD	(aa1, bb1, cc01, cc01)
1136	LDF	[BO + 2 * SIZE], b1
1137	FMADD	(aa1, bb2, cc03, cc03)
1138	LDF	[BO + 3 * SIZE], b2
1139
1140	LDF	[AO + 1 * SIZE], a1
1141	add	L, -1, L
1142	add	AO, 1 * SIZE, AO
1143	cmp	L, 0
1144
1145	bg,pt	%icc, .LL67
1146	add	BO, 2 * SIZE, BO
1147	.align 4
1148
/* 1x2 solve and write-back.  c01 goes to column C1 and c03 to column C2. */
1149.LL68:
1150#if defined(LN) || defined(RT)
/* LN/RT: TEMP1 = KK - 1 (LN) or KK - 2 (RT);
   AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (BASE_SHIFT + 1)). */
1151#ifdef LN
1152	sub	KK, 1, TEMP1
1153#else
1154	sub	KK, 2, TEMP1
1155#endif
1156	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1157	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1158
1159	add	AORIG, TEMP2, AO
1160	add	B,     TEMP1, BO
1161#endif
1162
/* Apply FSUB against the stored right-hand side (BO for LN/LT, AO for
   RN/RT); presumably c = loaded value - c. */
1163#if defined(LN) || defined(LT)
1164	LDF	[BO +  0 * SIZE], a1
1165	LDF	[BO +  1 * SIZE], a2
1166
1167	FSUB	a1, c01, c01
1168	FSUB	a2, c03, c03
1169#else
1170	LDF	[AO +  0 * SIZE], a1
1171	LDF	[AO +  1 * SIZE], a2
1172
1173	FSUB	a1, c01, c01
1174	FSUB	a2, c03, c03
1175#endif
1176
1177#if defined(LN) || defined(LT)
/* LN/LT: a single row, so both columns are scaled by AO[0]. */
1178	LDF	[AO +  0 * SIZE], a1
1179
1180	FMUL	a1, c01, c01
1181	FMUL	a1, c03, c03
1182#endif
1183
1184#ifdef RN
/* RN: c01 scaled by BO[0], eliminated from c03 with BO[1], then c03
   scaled by BO[3]. */
1185	LDF	[BO +  0 * SIZE], a1
1186	LDF	[BO +  1 * SIZE], a2
1187
1188	FMUL	a1, c01, c01
1189
1190	FNMSUB	(aa2, cc01, cc03, cc03)
1191
1192	LDF	[BO +  3 * SIZE], a1
1193
1194	FMUL	a1, c03, c03
1195#endif
1196
1197#ifdef RT
/* RT: c03 scaled by BO[3], eliminated from c01 with BO[2], then c01
   scaled by BO[0]. */
1198	LDF	[BO +  3 * SIZE], a1
1199	LDF	[BO +  2 * SIZE], a2
1200
1201	FMUL	a1, c03, c03
1202
1203	FNMSUB	(aa2, cc03, cc01, cc01)
1204
1205	LDF	[BO +  0 * SIZE], a1
1206
1207	FMUL	a1, c01, c01
1208#endif
1209
1210#ifdef LN
1211	add	C1, -1 * SIZE, C1
1212	add	C2, -1 * SIZE, C2
1213#endif
1214
/* Store to the packed buffer and to C. */
1215#if defined(LN) || defined(LT)
1216	STF	c01, [BO +  0 * SIZE]
1217	STF	c03, [BO +  1 * SIZE]
1218#else
1219	STF	c01, [AO +  0 * SIZE]
1220	STF	c03, [AO +  1 * SIZE]
1221#endif
1222
1223	STF	c01, [C1 + 0 * SIZE]
1224	STF	c03, [C2 + 0 * SIZE]
1225
1226#ifdef RT
1227	sll	K, BASE_SHIFT + 0, TEMP1
1228	add	AORIG, TEMP1, AORIG
1229#endif
1230
1231#if defined(LT) || defined(RN)
/* LT/RN: skip the K - KK depth steps that were not traversed. */
1232	sub	K, KK, TEMP1
1233	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1234	sll	TEMP1, BASE_SHIFT + 1, TEMP1
1235	add	AO, TEMP2, AO
1236	add	BO, TEMP1, BO
1237#endif
1238
1239#ifdef LT
1240	add	KK, 1, KK
1241#endif
1242
1243#ifdef LN
1244	sub	KK, 1, KK
1245#endif
1246	.align 4
1247
/* End of the two-column panel: move B and KK on to the next panel. */
1248.LL69:
1249#ifdef LN
/* LN: B += K << (BASE_SHIFT + 1). */
1250	sll	K, BASE_SHIFT + 1, TEMP1
1251	add	B, TEMP1, B
1252#endif
1253
1254#if defined(LT) || defined(RN)
/* LT/RN: BO already points just past this panel's B data. */
1255	mov	BO, B
1256#endif
1257
1258#ifdef RN
1259	add	KK, 2, KK
1260#endif
1261
1262#ifdef RT
1263	sub	KK, 2, KK
1264#endif
1265	.align 4
1266
/* Four-column panel: runs only when bit 2 of N is set, otherwise skip
   to .LL10. */
1267.LL30:
1268	and	N, 4, J
1269	cmp	J, 0
1270	ble,pn	%icc, .LL10
1271	nop
1272
1273#ifdef RT
/* RT: step B back by K << (BASE_SHIFT + 2). */
1274	sll	K, BASE_SHIFT + 2, TEMP1
1275	sub	B, TEMP1, B
1276#endif
1277
/* Set the column pointers C1..C4 (one LDC apart) and move C past the
   panel: forward by four columns normally, backward for RT. */
1278#ifndef RT
1279	mov	C,  C1
1280	add	C,  LDC, C2
1281	add	C2, LDC, C3
1282	add	C3, LDC, C4
1283	add	C4, LDC, C
1284#else
1285	sub	C,  LDC, C4
1286	sub	C4, LDC, C3
1287	sub	C3, LDC, C2
1288	sub	C2, LDC, C1
1289	sub	C2, LDC, C
1290#endif
1291
1292#ifdef LN
1293	add	M, OFFSET, KK
1294#endif
1295
1296#ifdef LT
1297	mov	OFFSET, KK
1298#endif
1299
1300#if defined(LN) || defined(RT)
1301	mov	A, AORIG
1302#else
1303	mov	A, AO
1304#endif
1305
/* I = M >> 1 two-row tiles; none means go to .LL40. */
1306	sra	M, 1, I
1307	cmp	I, 0
1308	ble,pn	%icc, .LL40
1309	nop
1310	.align 4
1311
/* Start of one 2-row by 4-column tile: position AO/BO, preload operands,
   clear the eight accumulators and compute the unrolled trip count. */
1312.LL32:
1313#if defined(LT) || defined(RN)
1314	mov	B, BO
1315#else
1316#ifdef LN
1317	sll	K,  BASE_SHIFT + 1, TEMP1
1318	sub	AORIG, TEMP1, AORIG
1319#endif
1320
/* AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << (BASE_SHIFT + 2)). */
1321	sll	KK, BASE_SHIFT + 1, TEMP1
1322	sll	KK, BASE_SHIFT + 2, TEMP2
1323
1324	add	AORIG, TEMP1, AO
1325	add	B,     TEMP2, BO
1326#endif
1327
/* Only a1/a2 are preloaded here; a3/a4 are loaded inside the loop. */
1328	LDF	[AO +  0 * SIZE], a1
1329	LDF	[AO +  1 * SIZE], a2
1330
/* Nine B values are preloaded (b9 is BO[8]), interleaved with the FCLR
   macro calls. */
1331	LDF	[BO +  0 * SIZE], b1
1332	LDF	[BO +  1 * SIZE], b2
1333	LDF	[BO +  2 * SIZE], b3
1334	LDF	[BO +  3 * SIZE], b4
1335	LDF	[BO +  4 * SIZE], b5
1336
1337	LDF	[BO +  5 * SIZE], b6
1338	FCLR	(cc01)
1339	LDF	[BO +  6 * SIZE], b7
1340	FCLR	(cc02)
1341	LDF	[BO +  7 * SIZE], b8
1342	FCLR	(cc03)
1343	LDF	[BO +  8 * SIZE], b9
1344	FCLR	(cc04)
1345
1346	prefetch [C1 + 2 * SIZE], 3
1347	FCLR	(cc05)
1348	prefetch [C2 + 2 * SIZE], 3
1349	FCLR	(cc06)
1350	prefetch [C3 + 2 * SIZE], 3
1351	FCLR	(cc07)
1352	prefetch [C4 + 2 * SIZE], 3
1353	FCLR	(cc08)
1354
/* L = number of four-step groups: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
1355#if defined(LT) || defined(RN)
1356	sra	KK, 2, L
1357#else
1358	sub	K, KK, L
1359	sra	L,  2, L
1360#endif
1361	cmp	L,  0
1362	ble,pn	%icc, .LL35
1363	nop
1364	.align 4
1365
/* 2x4 depth loop, four steps per iteration.  Each step uses an A pair
   (a1/a2 or a3/a4) and four B values to update cc01..cc08 through the
   FMADD macro (defined outside this file).  Steps 1 and 3 read b1..b4
   (b9 stands in for b1 on step 3); steps 2 and 4 read b5..b8.
   AO advances 8 * SIZE and BO 16 * SIZE, both mid-loop, so later offsets
   are relative to the advanced pointers. */
1366.LL33:
/* Step 1: a1/a2 with b1..b4. */
1367	FMADD	(aa1, bb1, cc01, cc01)
1368	LDF	[AO +  2 * SIZE], a3
1369	FMADD	(aa2, bb1, cc02, cc02)
1370	LDF	[AO +  3 * SIZE], a4
1371
1372	FMADD	(aa1, bb2, cc03, cc03)
/* b1 is reloaded for the next iteration already here. */
1373	LDF	[BO + 16 * SIZE], b1
1374	FMADD	(aa2, bb2, cc04, cc04)
1375	LDF	[BO +  9 * SIZE], b2
1376
1377	FMADD	(aa1, bb3, cc05, cc05)
1378	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1379	FMADD	(aa2, bb3, cc06, cc06)
1380	add	L, -1, L
1381
1382	FMADD	(aa1, bb4, cc07, cc07)
1383	LDF	[BO + 10 * SIZE], b3
1384	FMADD	(aa2, bb4, cc08, cc08)
1385	LDF	[BO + 11 * SIZE], b4
1386
/* Step 2: a3/a4 with b5..b8. */
1387	FMADD	(aa3, bb5, cc01, cc01)
1388	LDF	[AO +  4 * SIZE], a1
1389	FMADD	(aa4, bb5, cc02, cc02)
1390	LDF	[AO +  5 * SIZE], a2
1391
1392	FMADD	(aa3, bb6, cc03, cc03)
1393	LDF	[BO + 12 * SIZE], b5
1394	FMADD	(aa4, bb6, cc04, cc04)
1395	LDF	[BO + 13 * SIZE], b6
1396
1397	FMADD	(aa3, bb7, cc05, cc05)
/* Condition codes for the loop branch are set here. */
1398	cmp	L, 0
1399	FMADD	(aa4, bb7, cc06, cc06)
1400	add	AO,  8 * SIZE, AO
1401
1402	FMADD	(aa3, bb8, cc07, cc07)
1403	LDF	[BO + 14 * SIZE], b7
1404	FMADD	(aa4, bb8, cc08, cc08)
1405	LDF	[BO + 15 * SIZE], b8
1406
/* Step 3: a1/a2 with b9, b2, b3, b4. */
1407	FMADD	(aa1, bb9, cc01, cc01)
1408	LDF	[AO -  2 * SIZE], a3
1409	FMADD	(aa2, bb9, cc02, cc02)
1410	LDF	[AO -  1 * SIZE], a4
1411
1412	FMADD	(aa1, bb2, cc03, cc03)
1413	LDF	[BO + 24 * SIZE], b9
1414	FMADD	(aa2, bb2, cc04, cc04)
1415	LDF	[BO + 17 * SIZE], b2
1416
1417	FMADD	(aa1, bb3, cc05, cc05)
1418	add	BO, 16 * SIZE, BO
1419	FMADD	(aa2, bb3, cc06, cc06)
1420	nop
1421
1422	FMADD	(aa1, bb4, cc07, cc07)
1423	LDF	[BO +  2 * SIZE], b3
1424	FMADD	(aa2, bb4, cc08, cc08)
1425	LDF	[BO +  3 * SIZE], b4
1426
/* Step 4: a3/a4 with b5..b8. */
1427	FMADD	(aa3, bb5, cc01, cc01)
1428	LDF	[AO +  0 * SIZE], a1
1429	FMADD	(aa4, bb5, cc02, cc02)
1430	LDF	[AO +  1 * SIZE], a2
1431	FMADD	(aa3, bb6, cc03, cc03)
1432	LDF	[BO +  4 * SIZE], b5
1433	FMADD	(aa4, bb6, cc04, cc04)
1434	LDF	[BO +  5 * SIZE], b6
1435
1436	FMADD	(aa3, bb7, cc05, cc05)
1437	nop
1438	FMADD	(aa4, bb7, cc06, cc06)
1439	LDF	[BO +  6 * SIZE], b7
1440
1441	FMADD	(aa3, bb8, cc07, cc07)
1442	FMADD	(aa4, bb8, cc08, cc08)
/* b8 for the next iteration is reloaded in the delay slot. */
1443	bg,pt	%icc, .LL33
1444	LDF	[BO +  7 * SIZE], b8
1445	.align 4
1446
/* 2x4 remainder: L = (depth count) & 3 leftover single steps. */
1447.LL35:
1448#if defined(LT) || defined(RN)
1449	and	KK, 3, L
1450#else
1451	sub	K, KK, L
1452	and	L,  3, L
1453#endif
1454	cmp	L,  0
1455	ble,a,pn %icc, .LL38
1456	nop
1457	.align 4
1458
/* One step per iteration: a1/a2 with b1..b4 update cc01..cc08.  AO
   advances 2 * SIZE mid-loop (so the a1/a2 reloads use offsets 0 and 1);
   BO advances 4 * SIZE in the branch delay slot. */
1459.LL37:
1460	FMADD	(aa1, bb1, cc01, cc01)
1461	add	L, -1, L
1462	FMADD	(aa2, bb1, cc02, cc02)
1463	LDF	[BO + 4 * SIZE], b1
1464
1465	FMADD	(aa1, bb2, cc03, cc03)
1466	add	AO, 2 * SIZE, AO
1467	FMADD	(aa2, bb2, cc04, cc04)
1468	LDF	[BO + 5 * SIZE], b2
1469
1470	FMADD	(aa1, bb3, cc05, cc05)
1471	cmp	L, 0
1472	FMADD	(aa2, bb3, cc06, cc06)
1473	LDF	[BO + 6 * SIZE], b3
1474
1475	FMADD	(aa1, bb4, cc07, cc07)
1476	LDF	[AO + 0 * SIZE], a1
1477	FMADD	(aa2, bb4, cc08, cc08)
1478	LDF	[AO + 1 * SIZE], a2
1479
1480	LDF	[BO + 7 * SIZE], b4
1481	bg,pt	%icc, .LL37
1482	add	BO, 4 * SIZE, BO
1483	.align 4
1484
/*
 * .LL38: solve and write-back stage for one 2-row by 4-column tile
 * held in accumulators c01..c08, then loop back to .LL32 while I > 0.
 *
 * Steps, selected at build time by the LN / LT / RN / RT macros:
 *   1. LN/RT only: recompute AO and BO from AORIG and B using
 *      (KK - 2) for LN or (KK - 4) for RT, shifted by BASE_SHIFT + 1
 *      and BASE_SHIFT + 2 respectively.
 *   2. Load eight packed values (from BO for LN/LT, from AO for RN/RT)
 *      and FSUB them against the accumulators.
 *   3. Substitution: LN/LT use three coefficients read from AO
 *      (a 2x2 triangle), RN/RT use ten coefficients read from BO
 *      (a 4x4 triangle), applied with FMUL and FNMSUB.
 *   4. Store the results into the packed buffer (BO or AO) and into
 *      the four output columns C1..C4.
 *   5. Advance the C pointers, AORIG/AO/BO and KK for the next tile.
 *
 * NOTE(review): LDF/STF/FSUB/FMUL/FNMSUB are macros defined outside this
 * chunk.  FSUB a, c, c presumably yields c = a - c and FNMSUB (x, y, z, z)
 * presumably yields z = z - x * y; the FMUL-by-diagonal pattern presumably
 * relies on diagonal entries being stored pre-inverted -- confirm.
 */
1485.LL38:
1486#if defined(LN) || defined(RT)
1487#ifdef LN
1488	sub	KK, 2, TEMP1
1489#else
1490	sub	KK, 4, TEMP1
1491#endif
	/* TEMP2 = offset into the 2-wide panel, TEMP1 = offset into the 4-wide panel */
1492	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1493	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1494
1495	add	AORIG, TEMP2, AO
1496	add	B,     TEMP1, BO
1497#endif
1498
1499#if defined(LN) || defined(LT)
	/* LN/LT: packed values come from BO; odd-numbered accumulators first, then even */
1500	LDF	[BO +  0 * SIZE], a1
1501	LDF	[BO +  1 * SIZE], a2
1502	LDF	[BO +  2 * SIZE], a3
1503	LDF	[BO +  3 * SIZE], a4
1504
1505	LDF	[BO +  4 * SIZE], b1
1506	LDF	[BO +  5 * SIZE], b2
1507	LDF	[BO +  6 * SIZE], b3
1508	LDF	[BO +  7 * SIZE], b4
1509
1510	FSUB	a1, c01, c01
1511	FSUB	a2, c03, c03
1512	FSUB	a3, c05, c05
1513	FSUB	a4, c07, c07
1514
1515	FSUB	b1, c02, c02
1516	FSUB	b2, c04, c04
1517	FSUB	b3, c06, c06
1518	FSUB	b4, c08, c08
1519#else
	/* RN/RT: packed values come from AO in accumulator order c01..c08 */
1520	LDF	[AO +  0 * SIZE], a1
1521	LDF	[AO +  1 * SIZE], a2
1522	LDF	[AO +  2 * SIZE], a3
1523	LDF	[AO +  3 * SIZE], a4
1524
1525	LDF	[AO +  4 * SIZE], b1
1526	LDF	[AO +  5 * SIZE], b2
1527	LDF	[AO +  6 * SIZE], b3
1528	LDF	[AO +  7 * SIZE], b4
1529
1530	FSUB	a1, c01, c01
1531	FSUB	a2, c02, c02
1532	FSUB	a3, c03, c03
1533	FSUB	a4, c04, c04
1534
1535	FSUB	b1, c05, c05
1536	FSUB	b2, c06, c06
1537	FSUB	b3, c07, c07
1538	FSUB	b4, c08, c08
1539
1540#endif
1541
1542#ifdef LN
	/* LN: scale the even accumulators first, eliminate into the odd ones, then scale those */
1543	LDF	[AO +  3 * SIZE], a1
1544	LDF	[AO +  2 * SIZE], a2
1545	LDF	[AO +  0 * SIZE], a3
1546
1547	FMUL	a1, c02, c02
1548	FMUL	a1, c04, c04
1549	FMUL	a1, c06, c06
1550	FMUL	a1, c08, c08
1551
1552	FNMSUB	(aa2, cc02, cc01, cc01)
1553	FNMSUB	(aa2, cc04, cc03, cc03)
1554	FNMSUB	(aa2, cc06, cc05, cc05)
1555	FNMSUB	(aa2, cc08, cc07, cc07)
1556
1557	FMUL	a3, c01, c01
1558	FMUL	a3, c03, c03
1559	FMUL	a3, c05, c05
1560	FMUL	a3, c07, c07
1561#endif
1562
1563#ifdef LT
	/* LT: mirror image of LN -- odd accumulators first, then the even ones */
1564	LDF	[AO +  0 * SIZE], a1
1565	LDF	[AO +  1 * SIZE], a2
1566	LDF	[AO +  3 * SIZE], a3
1567
1568	FMUL	a1, c01, c01
1569	FMUL	a1, c03, c03
1570	FMUL	a1, c05, c05
1571	FMUL	a1, c07, c07
1572
1573	FNMSUB	(aa2, cc01, cc02, cc02)
1574	FNMSUB	(aa2, cc03, cc04, cc04)
1575	FNMSUB	(aa2, cc05, cc06, cc06)
1576	FNMSUB	(aa2, cc07, cc08, cc08)
1577
1578	FMUL	a3, c02, c02
1579	FMUL	a3, c04, c04
1580	FMUL	a3, c06, c06
1581	FMUL	a3, c08, c08
1582#endif
1583
1584#ifdef RN
	/* RN: forward sweep over the column pairs (c01,c02) -> (c07,c08); */
	/* coefficients are read from BO at offsets 0..3, 5..7, 10..11, 15 */
1585	LDF	[BO +  0 * SIZE], a1
1586	LDF	[BO +  1 * SIZE], a2
1587	LDF	[BO +  2 * SIZE], a3
1588	LDF	[BO +  3 * SIZE], a4
1589
1590	FMUL	a1, c01, c01
1591	FMUL	a1, c02, c02
1592
1593	FNMSUB	(aa2, cc01, cc03, cc03)
1594	FNMSUB	(aa2, cc02, cc04, cc04)
1595	FNMSUB	(aa3, cc01, cc05, cc05)
1596	FNMSUB	(aa3, cc02, cc06, cc06)
1597	FNMSUB	(aa4, cc01, cc07, cc07)
1598	FNMSUB	(aa4, cc02, cc08, cc08)
1599
1600	LDF	[BO +  5 * SIZE], a1
1601	LDF	[BO +  6 * SIZE], a2
1602	LDF	[BO +  7 * SIZE], a3
1603
1604	FMUL	a1, c03, c03
1605	FMUL	a1, c04, c04
1606
1607	FNMSUB	(aa2, cc03, cc05, cc05)
1608	FNMSUB	(aa2, cc04, cc06, cc06)
1609	FNMSUB	(aa3, cc03, cc07, cc07)
1610	FNMSUB	(aa3, cc04, cc08, cc08)
1611
1612	LDF	[BO + 10 * SIZE], a1
1613	LDF	[BO + 11 * SIZE], a2
1614
1615	FMUL	a1, c05, c05
1616	FMUL	a1, c06, c06
1617
1618	FNMSUB	(aa2, cc05, cc07, cc07)
1619	FNMSUB	(aa2, cc06, cc08, cc08)
1620
1621	LDF	[BO + 15 * SIZE], a1
1622
1623	FMUL	a1, c07, c07
1624	FMUL	a1, c08, c08
1625#endif
1626
1627#ifdef RT
	/* RT: backward sweep over the column pairs (c07,c08) -> (c01,c02); */
	/* coefficients are read from BO at offsets 15..12, 10..8, 5..4, 0 */
1628	LDF	[BO + 15 * SIZE], a1
1629	LDF	[BO + 14 * SIZE], a2
1630	LDF	[BO + 13 * SIZE], a3
1631	LDF	[BO + 12 * SIZE], a4
1632
1633	FMUL	a1, c08, c08
1634	FMUL	a1, c07, c07
1635
1636	FNMSUB	(aa2, cc08, cc06, cc06)
1637	FNMSUB	(aa2, cc07, cc05, cc05)
1638	FNMSUB	(aa3, cc08, cc04, cc04)
1639	FNMSUB	(aa3, cc07, cc03, cc03)
1640	FNMSUB	(aa4, cc08, cc02, cc02)
1641	FNMSUB	(aa4, cc07, cc01, cc01)
1642
1643	LDF	[BO + 10 * SIZE], a1
1644	LDF	[BO +  9 * SIZE], a2
1645	LDF	[BO +  8 * SIZE], a3
1646
1647	FMUL	a1, c06, c06
1648	FMUL	a1, c05, c05
1649
1650	FNMSUB	(aa2, cc06, cc04, cc04)
1651	FNMSUB	(aa2, cc05, cc03, cc03)
1652	FNMSUB	(aa3, cc06, cc02, cc02)
1653	FNMSUB	(aa3, cc05, cc01, cc01)
1654
1655	LDF	[BO +  5 * SIZE], a1
1656	LDF	[BO +  4 * SIZE], a2
1657
1658	FMUL	a1, c04, c04
1659	FMUL	a1, c03, c03
1660
1661	FNMSUB	(aa2, cc04, cc02, cc02)
1662	FNMSUB	(aa2, cc03, cc01, cc01)
1663
1664	LDF	[BO +  0 * SIZE], a1
1665
1666	FMUL	a1, c02, c02
1667	FMUL	a1, c01, c01
1668#endif
1669
1670#ifdef LN
	/* LN: step the output pointers back by two elements before storing */
1671	add	C1, -2 * SIZE, C1
1672	add	C2, -2 * SIZE, C2
1673	add	C3, -2 * SIZE, C3
1674	add	C4, -2 * SIZE, C4
1675#endif
1676
	/* Write the solved values back to the packed buffer, same layout as the loads above */
1677#if defined(LN) || defined(LT)
1678	STF	c01, [BO +  0 * SIZE]
1679	STF	c03, [BO +  1 * SIZE]
1680	STF	c05, [BO +  2 * SIZE]
1681	STF	c07, [BO +  3 * SIZE]
1682
1683	STF	c02, [BO +  4 * SIZE]
1684	STF	c04, [BO +  5 * SIZE]
1685	STF	c06, [BO +  6 * SIZE]
1686	STF	c08, [BO +  7 * SIZE]
1687#else
1688	STF	c01, [AO +  0 * SIZE]
1689	STF	c02, [AO +  1 * SIZE]
1690	STF	c03, [AO +  2 * SIZE]
1691	STF	c04, [AO +  3 * SIZE]
1692
1693	STF	c05, [AO +  4 * SIZE]
1694	STF	c06, [AO +  5 * SIZE]
1695	STF	c07, [AO +  6 * SIZE]
1696	STF	c08, [AO +  7 * SIZE]
1697#endif
1698
	/* Write the tile to the output: two consecutive elements in each of C1..C4 */
1699	STF	c01, [C1 + 0 * SIZE]
1700	STF	c02, [C1 + 1 * SIZE]
1701	STF	c03, [C2 + 0 * SIZE]
1702	STF	c04, [C2 + 1 * SIZE]
1703
1704	STF	c05, [C3 + 0 * SIZE]
1705	STF	c06, [C3 + 1 * SIZE]
1706	STF	c07, [C4 + 0 * SIZE]
1707	STF	c08, [C4 + 1 * SIZE]
1708
1709#ifndef LN
1710	add	C1, 2 * SIZE, C1
1711	add	C2, 2 * SIZE, C2
1712	add	C3, 2 * SIZE, C3
1713	add	C4, 2 * SIZE, C4
1714#endif
1715
1716#ifdef RT
	/* RT: advance AORIG by K << (BASE_SHIFT + 1) bytes */
1717	sll	K, BASE_SHIFT + 1, TEMP1
1718	add	AORIG, TEMP1, AORIG
1719#endif
1720
1721#if defined(LT) || defined(RN)
	/* LT/RN: skip the remaining (K - KK) steps of both packed panels */
1722	sub	K, KK, TEMP1
1723	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1724	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1725	add	AO, TEMP2, AO
1726	add	BO, TEMP1, BO
1727#endif
1728
1729#ifdef LT
1730	add	KK, 2, KK
1731#endif
1732
1733#ifdef LN
1734	sub	KK, 2, KK
1735#endif
1736
	/* Next 2-row tile of this panel; the nop fills the branch delay slot */
1737	add	I, -1, I
1738	cmp	I, 0
1739	bg,pt	%icc, .LL32
1740	nop
1741
/*
 * .LL40: set up the single leftover row (M & 1) of the 4-column panel.
 * Branches to .LL49 when M is even.  Otherwise positions AO/BO,
 * preloads a1..a4 from AO and b1..b9 from BO, clears the four
 * accumulators cc01/cc03/cc05/cc07, and computes the trip count L of
 * the 4-step unrolled loop .LL43 (KK >> 2 for LT/RN, (K - KK) >> 2
 * otherwise).  Falls through to .LL43, or jumps to .LL45 when L <= 0.
 */
1742.LL40:
1743	and	M, 1, I
1744	cmp	I, 0
1745	ble,pn	%icc, .LL49
1746	nop
1747
1748#if defined(LT) || defined(RN)
1749	mov	B, BO
1750#else
1751#ifdef LN
	/* LN: move AORIG back by K << BASE_SHIFT bytes (one packed row) */
1752	sll	K,  BASE_SHIFT + 0, TEMP1
1753	sub	AORIG, TEMP1, AORIG
1754#endif
1755
	/* LN/RT: start KK steps into the 1-wide (AO) and 4-wide (BO) panels */
1756	sll	KK, BASE_SHIFT + 0, TEMP1
1757	sll	KK, BASE_SHIFT + 2, TEMP2
1758
1759	add	AORIG, TEMP1, AO
1760	add	B,     TEMP2, BO
1761#endif
1762
	/* Preload operands for .LL43; the FCLR macros are interleaved with the loads */
1763	LDF	[AO +  0 * SIZE], a1
1764	LDF	[AO +  1 * SIZE], a2
1765	LDF	[AO +  2 * SIZE], a3
1766	LDF	[AO +  3 * SIZE], a4
1767
1768	LDF	[BO +  0 * SIZE], b1
1769	LDF	[BO +  1 * SIZE], b2
1770	LDF	[BO +  2 * SIZE], b3
1771	LDF	[BO +  3 * SIZE], b4
1772	LDF	[BO +  4 * SIZE], b5
1773	LDF	[BO +  5 * SIZE], b6
1774	FCLR	(cc01)
1775	LDF	[BO +  6 * SIZE], b7
1776	FCLR	(cc03)
1777	LDF	[BO +  7 * SIZE], b8
1778	FCLR	(cc05)
1779	LDF	[BO +  8 * SIZE], b9
1780	FCLR	(cc07)
1781
1782#if defined(LT) || defined(RN)
1783	sra	KK, 2, L
1784#else
1785	sub	K, KK, L
1786	sra	L,  2, L
1787#endif
1788	cmp	L,  0
1789	ble,pn	%icc, .LL45
1790	nop
1791
/*
 * .LL43: accumulation loop for the 1x4 tile, four k-steps per
 * iteration.  Each step multiplies one AO value (a1..a4) by four BO
 * values and adds into cc01/cc03/cc05/cc07.  Per iteration AO advances
 * by 4 elements and BO by 16.  Loads for the following steps are
 * interleaved with the FMADDs, so the instruction order must be kept.
 * The loop repeats while L > 0.
 *
 * NOTE(review): FMADD (x, y, z, z) is a macro defined outside this chunk;
 * presumably z = x * y + z.
 */
1792.LL43:
1793	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1794	add	L, -1, L
1795
	/* k-step 0: uses a1 and b1..b4 */
1796	FMADD	(aa1, bb1, cc01, cc01)
1797	LDF	[BO + 16 * SIZE], b1
1798	FMADD	(aa1, bb2, cc03, cc03)
1799	LDF	[BO +  9 * SIZE], b2
1800	FMADD	(aa1, bb3, cc05, cc05)
1801	LDF	[BO + 10 * SIZE], b3
1802	FMADD	(aa1, bb4, cc07, cc07)
1803	LDF	[BO + 11 * SIZE], b4
1804
1805	LDF	[AO +  4 * SIZE], a1
	/* sets %icc for the bg at the bottom; nothing in between writes the condition codes */
1806	cmp	L, 0
1807
	/* k-step 1: uses a2 and b5..b8 */
1808	FMADD	(aa2, bb5, cc01, cc01)
1809	LDF	[BO + 12 * SIZE], b5
1810	FMADD	(aa2, bb6, cc03, cc03)
1811	LDF	[BO + 13 * SIZE], b6
1812	FMADD	(aa2, bb7, cc05, cc05)
1813	LDF	[BO + 14 * SIZE], b7
1814	FMADD	(aa2, bb8, cc07, cc07)
1815	LDF	[BO + 15 * SIZE], b8
1816
1817	LDF	[AO +  5 * SIZE], a2
1818	add	AO,  4 * SIZE, AO
1819
	/* k-step 2: uses a3 and b9, b2..b4 */
1820	FMADD	(aa3, bb9, cc01, cc01)
1821	LDF	[BO + 24 * SIZE], b9
1822	FMADD	(aa3, bb2, cc03, cc03)
1823	LDF	[BO + 17 * SIZE], b2
1824	FMADD	(aa3, bb3, cc05, cc05)
1825	LDF	[BO + 18 * SIZE], b3
1826	FMADD	(aa3, bb4, cc07, cc07)
1827	LDF	[BO + 19 * SIZE], b4
1828
	/* AO has already been advanced, so offsets 2 and 3 address the next iteration */
1829	LDF	[AO +  2 * SIZE], a3
1830	add	BO, 16 * SIZE, BO
1831
	/* k-step 3: uses a4 and b5..b8 */
1832	FMADD	(aa4, bb5, cc01, cc01)
1833	LDF	[BO +  4 * SIZE], b5
1834	FMADD	(aa4, bb6, cc03, cc03)
1835	LDF	[BO +  5 * SIZE], b6
1836	FMADD	(aa4, bb7, cc05, cc05)
1837	LDF	[BO +  6 * SIZE], b7
1838	FMADD	(aa4, bb8, cc07, cc07)
1839	LDF	[BO +  7 * SIZE], b8
1840
	/* the a4 reload sits in the branch delay slot and runs on every pass */
1841	bg,pt	%icc, .LL43
1842	LDF	[AO +  3 * SIZE], a4
1843	.align 4
1844
/*
 * .LL45: compute the number of k-steps left over after the 4-step
 * unrolled loop (KK & 3 for LT/RN, (K - KK) & 3 otherwise) and skip to
 * the solve stage .LL48 when there are none.  The branch has the annul
 * bit set and its delay slot holds only a nop.
 */
1845.LL45:
1846#if defined(LT) || defined(RN)
1847	and	KK, 3, L
1848#else
1849	sub	K, KK, L
1850	and	L,  3, L
1851#endif
1852	cmp	L,  0
1853	ble,a,pn %icc, .LL48
1854	nop
1855	.align 4
1856
/*
 * .LL47: remainder loop for the 1x4 tile, one k-step per iteration.
 * Accumulates a1 * b1..b4 into cc01/cc03/cc05/cc07, reloads b1..b4 from
 * the next group of four BO values, and advances AO by 1 element and BO
 * by 4.  Repeats while L > 0.
 */
1857.LL47:
1858	FMADD	(aa1, bb1, cc01, cc01)
1859	LDF	[BO + 4 * SIZE], b1
1860	add	L, -1, L
1861	FMADD	(aa1, bb2, cc03, cc03)
1862	LDF	[BO + 5 * SIZE], b2
1863	add	AO, 1 * SIZE, AO
1864
1865	FMADD	(aa1, bb3, cc05, cc05)
1866	LDF	[BO + 6 * SIZE], b3
1867	cmp	L, 0
1868	FMADD	(aa1, bb4, cc07, cc07)
1869	LDF	[BO + 7 * SIZE], b4
1870	add	BO, 4 * SIZE, BO
1871
	/* delay slot: reload a1 from the already-advanced AO after its last use above */
1872	bg,pt	%icc, .LL47
1873	LDF	[AO + 0 * SIZE], a1
1874	.align 4
1875
/*
 * .LL48: solve and write-back stage for the 1-row by 4-column tile
 * held in cc01/cc03/cc05/cc07.
 *
 *   1. LN/RT only: recompute AO and BO from AORIG and B using
 *      (KK - 1) for LN or (KK - 4) for RT.
 *   2. Load four packed values (BO for LN/LT, AO for RN/RT) and FSUB
 *      them against the accumulators.
 *   3. LN/LT: scale all four results by the single coefficient at
 *      AO[0].  RN/RT: forward / backward sweep with the 4x4 triangle
 *      of coefficients read from BO.
 *   4. Store the results into the packed buffer and into C1..C4.
 *   5. Advance AORIG/AO/BO and KK.  The C pointers are only moved for
 *      LN (before the store); this block does not advance them after.
 *
 * NOTE(review): FSUB a, c, c presumably yields c = a - c and
 * FNMSUB (x, y, z, z) presumably yields z = z - x * y; the multiply by the
 * diagonal coefficient presumably relies on it being stored pre-inverted.
 */
1876.LL48:
1877#if defined(LN) || defined(RT)
1878#ifdef LN
1879	sub	KK, 1, TEMP1
1880#else
1881	sub	KK, 4, TEMP1
1882#endif
1883	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1884	sll	TEMP1, BASE_SHIFT + 2, TEMP1
1885
1886	add	AORIG, TEMP2, AO
1887	add	B,     TEMP1, BO
1888#endif
1889
	/* Both branches update the same accumulators; only the source buffer differs */
1890#if defined(LN) || defined(LT)
1891	LDF	[BO +  0 * SIZE], a1
1892	LDF	[BO +  1 * SIZE], a2
1893	LDF	[BO +  2 * SIZE], a3
1894	LDF	[BO +  3 * SIZE], a4
1895
1896	FSUB	a1, c01, c01
1897	FSUB	a2, c03, c03
1898	FSUB	a3, c05, c05
1899	FSUB	a4, c07, c07
1900#else
1901	LDF	[AO +  0 * SIZE], a1
1902	LDF	[AO +  1 * SIZE], a2
1903	LDF	[AO +  2 * SIZE], a3
1904	LDF	[AO +  3 * SIZE], a4
1905
1906	FSUB	a1, c01, c01
1907	FSUB	a2, c03, c03
1908	FSUB	a3, c05, c05
1909	FSUB	a4, c07, c07
1910#endif
1911
1912#if defined(LN) || defined(LT)
	/* single-row case: one coefficient scales every column */
1913	LDF	[AO +  0 * SIZE], a1
1914
1915	FMUL	a1, c01, c01
1916	FMUL	a1, c03, c03
1917	FMUL	a1, c05, c05
1918	FMUL	a1, c07, c07
1919#endif
1920
1921#ifdef RN
	/* RN: forward sweep c01 -> c03 -> c05 -> c07 */
1922	LDF	[BO +  0 * SIZE], a1
1923	LDF	[BO +  1 * SIZE], a2
1924	LDF	[BO +  2 * SIZE], a3
1925	LDF	[BO +  3 * SIZE], a4
1926
1927	FMUL	a1, c01, c01
1928
1929	FNMSUB	(aa2, cc01, cc03, cc03)
1930	FNMSUB	(aa3, cc01, cc05, cc05)
1931	FNMSUB	(aa4, cc01, cc07, cc07)
1932
1933	LDF	[BO +  5 * SIZE], a1
1934	LDF	[BO +  6 * SIZE], a2
1935	LDF	[BO +  7 * SIZE], a3
1936
1937	FMUL	a1, c03, c03
1938
1939	FNMSUB	(aa2, cc03, cc05, cc05)
1940	FNMSUB	(aa3, cc03, cc07, cc07)
1941
1942	LDF	[BO + 10 * SIZE], a1
1943	LDF	[BO + 11 * SIZE], a2
1944
1945	FMUL	a1, c05, c05
1946
1947	FNMSUB	(aa2, cc05, cc07, cc07)
1948
1949	LDF	[BO + 15 * SIZE], a1
1950
1951	FMUL	a1, c07, c07
1952#endif
1953
1954#ifdef RT
	/* RT: backward sweep c07 -> c05 -> c03 -> c01 */
1955	LDF	[BO + 15 * SIZE], a1
1956	LDF	[BO + 14 * SIZE], a2
1957	LDF	[BO + 13 * SIZE], a3
1958	LDF	[BO + 12 * SIZE], a4
1959
1960	FMUL	a1, c07, c07
1961
1962	FNMSUB	(aa2, cc07, cc05, cc05)
1963	FNMSUB	(aa3, cc07, cc03, cc03)
1964	FNMSUB	(aa4, cc07, cc01, cc01)
1965
1966	LDF	[BO + 10 * SIZE], a1
1967	LDF	[BO +  9 * SIZE], a2
1968	LDF	[BO +  8 * SIZE], a3
1969
1970	FMUL	a1, c05, c05
1971
1972	FNMSUB	(aa2, cc05, cc03, cc03)
1973	FNMSUB	(aa3, cc05, cc01, cc01)
1974
1975	LDF	[BO +  5 * SIZE], a1
1976	LDF	[BO +  4 * SIZE], a2
1977
1978	FMUL	a1, c03, c03
1979
1980	FNMSUB	(aa2, cc03, cc01, cc01)
1981
1982	LDF	[BO +  0 * SIZE], a1
1983
1984	FMUL	a1, c01, c01
1985#endif
1986
1987#ifdef LN
1988	add	C1, -1 * SIZE, C1
1989	add	C2, -1 * SIZE, C2
1990	add	C3, -1 * SIZE, C3
1991	add	C4, -1 * SIZE, C4
1992#endif
1993
	/* Write the solved values back to the packed buffer */
1994#if defined(LN) || defined(LT)
1995	STF	c01, [BO +  0 * SIZE]
1996	STF	c03, [BO +  1 * SIZE]
1997	STF	c05, [BO +  2 * SIZE]
1998	STF	c07, [BO +  3 * SIZE]
1999#else
2000	STF	c01, [AO +  0 * SIZE]
2001	STF	c03, [AO +  1 * SIZE]
2002	STF	c05, [AO +  2 * SIZE]
2003	STF	c07, [AO +  3 * SIZE]
2004#endif
2005
	/* One element into each of the four output columns */
2006	STF	c01, [C1 + 0 * SIZE]
2007	STF	c03, [C2 + 0 * SIZE]
2008	STF	c05, [C3 + 0 * SIZE]
2009	STF	c07, [C4 + 0 * SIZE]
2010
2011#ifdef RT
2012	sll	K, BASE_SHIFT + 0, TEMP1
2013	add	AORIG, TEMP1, AORIG
2014#endif
2015
2016#if defined(LT) || defined(RN)
	/* LT/RN: skip the remaining (K - KK) steps of both packed panels */
2017	sub	K, KK, TEMP1
2018	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2019	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2020	add	AO, TEMP2, AO
2021	add	BO, TEMP1, BO
2022#endif
2023
2024#ifdef LT
2025	add	KK, 1, KK
2026#endif
2027
2028#ifdef LN
2029	sub	KK, 1, KK
2030#endif
2031	.align 4
2032
/*
 * .LL49: bookkeeping at the end of the 4-column panel.
 *   LN:    B  += K << (BASE_SHIFT + 2)
 *   LT/RN: B   = BO (BO was advanced while the panel was processed)
 *   RN:    KK += 4
 *   RT:    KK -= 4
 * Falls through to .LL10.
 */
2033.LL49:
2034#ifdef LN
2035	sll	K, BASE_SHIFT + 2, TEMP1
2036	add	B, TEMP1, B
2037#endif
2038
2039#if defined(LT) || defined(RN)
2040	mov	BO, B
2041#endif
2042
2043#ifdef RN
2044	add	KK, 4, KK
2045#endif
2046
2047#ifdef RT
2048	sub	KK, 4, KK
2049#endif
2050	.align 4
2051
/*
 * .LL10: entry to the 8-column panel loop.  J = N >> 3 is the number
 * of full 8-column panels; when there are none, branch to .LL999
 * (defined outside this chunk).
 */
2052.LL10:
2053	sra	N, 3, J
2054	cmp	J, 0
2055	ble,pn	%icc, .LL999
2056	nop
2057	.align 4
2058
/*
 * .LL11: per-panel setup for one 8-column panel.
 *   - RT: move B back by K << (BASE_SHIFT + 3) bytes.
 *   - Derive the eight column pointers C1..C8, spaced LDC apart.
 *     Non-RT walks forward from C and leaves C just past C8; RT walks
 *     backward from C and leaves C equal to C1.
 *   - Initialise KK (LN: M + OFFSET, LT: OFFSET) and AORIG or AO from A.
 *   - I = M >> 1 is the number of 2-row tiles; branch to .LL20 (outside
 *     this chunk) when there are none.
 */
2059.LL11:
2060#ifdef RT
2061	sll	K, BASE_SHIFT + 3, TEMP1
2062	sub	B, TEMP1, B
2063#endif
2064
2065#ifndef RT
2066	mov	C,  C1
2067	add	C,  LDC, C2
2068	add	C2, LDC, C3
2069	add	C3, LDC, C4
2070	add	C4, LDC, C5
2071	add	C5, LDC, C6
2072	add	C6, LDC, C7
2073	add	C7, LDC, C8
2074	add	C8, LDC, C
2075#else
2076	sub	C,  LDC, C8
2077	sub	C8, LDC, C7
2078	sub	C7, LDC, C6
2079	sub	C6, LDC, C5
2080	sub	C5, LDC, C4
2081	sub	C4, LDC, C3
2082	sub	C3, LDC, C2
2083	sub	C2, LDC, C1
	/* same expression as the line above, so C receives the same value as C1 */
2084	sub	C2, LDC, C
2085#endif
2086
2087#ifdef LN
2088	add	M, OFFSET, KK
2089#endif
2090
2091#ifdef LT
2092	mov	OFFSET, KK
2093#endif
2094
2095#if defined(LN) || defined(RT)
2096	mov	A, AORIG
2097#else
2098	mov	A, AO
2099#endif
2100
2101	sra	M, 1, I
2102	cmp	I, 0
2103	ble,pn	%icc, .LL20
2104	nop
2105	.align 4
2106
/*
 * .LL12: set up one 2-row by 8-column tile.
 *   - Position AO/BO (LT/RN: BO = B; LN/RT: KK steps into the 2-wide
 *     and 8-wide packed panels, after LN moves AORIG back by one
 *     2-wide panel of K steps).
 *   - Preload a1, a2, a5 from AO and b1..b9 from BO.
 *   - Clear the sixteen accumulators cc01..cc16 and prefetch the eight
 *     output columns C1..C8.
 *   - L = number of 8-step groups (KK >> 3 for LT/RN, (K - KK) >> 3
 *     otherwise); jump to .LL15 when L <= 0, else fall into .LL13.
 */
2107.LL12:
2108#if defined(LT) || defined(RN)
2109	mov	B, BO
2110#else
2111#ifdef LN
2112	sll	K,  BASE_SHIFT + 1, TEMP1
2113	sub	AORIG, TEMP1, AORIG
2114#endif
2115
2116	sll	KK, BASE_SHIFT + 1, TEMP1
2117	sll	KK, BASE_SHIFT + 3, TEMP2
2118
2119	add	AORIG, TEMP1, AO
2120	add	B,     TEMP2, BO
2121#endif
2122
	/* a5 holds AO[8], consumed by the fifth k-step of the unrolled loop .LL13 */
2123	LDF	[AO +  0 * SIZE], a1
2124	LDF	[AO +  1 * SIZE], a2
2125	LDF	[AO +  8 * SIZE], a5
2126
2127	LDF	[BO +  0 * SIZE], b1
2128
2129	LDF	[BO +  1 * SIZE], b2
2130	FCLR	(cc01)
2131	LDF	[BO +  2 * SIZE], b3
2132	FCLR	(cc05)
2133	LDF	[BO +  3 * SIZE], b4
2134	FCLR	(cc09)
2135	LDF	[BO +  4 * SIZE], b5
2136	FCLR	(cc13)
2137
2138	LDF	[BO +  5 * SIZE], b6
2139	FCLR	(cc02)
2140	LDF	[BO +  6 * SIZE], b7
2141	FCLR	(cc06)
2142	LDF	[BO +  7 * SIZE], b8
2143	FCLR	(cc10)
2144	LDF	[BO +  8 * SIZE], b9
2145	FCLR	(cc14)
2146
	/* prefetch the output columns; the offset alternates between 1 and 2 elements */
2147	prefetch [C1 + 1 * SIZE], 3
2148	FCLR	(cc03)
2149	prefetch [C2 + 2 * SIZE], 3
2150	FCLR	(cc07)
2151	prefetch [C3 + 1 * SIZE], 3
2152	FCLR	(cc11)
2153	prefetch [C4 + 2 * SIZE], 3
2154	FCLR	(cc15)
2155
2156	prefetch [C5 + 1 * SIZE], 3
2157	FCLR	(cc04)
2158	prefetch [C6 + 2 * SIZE], 3
2159	FCLR	(cc08)
2160	prefetch [C7 + 1 * SIZE], 3
2161	FCLR	(cc12)
2162	prefetch [C8 + 2 * SIZE], 3
2163	FCLR	(cc16)
2164
2165#if defined(LT) || defined(RN)
2166	sra	KK, 3, L
2167#else
2168	sub	K, KK, L
2169	sra	L,  3, L
2170#endif
2171	cmp	L,  0
2172	ble,pn	%icc, .LL15
2173	nop
2174	.align 4
2175
/*
 * .LL13: main accumulation loop for the 2x8 tile (cc01..cc16).
 *
 * The body is written out twice.  Each copy performs eight k-steps;
 * a k-step multiplies two AO values by eight BO values (16 FMADDs).
 * Each copy decrements L once, advances AO by 16 elements and BO by
 * 64, and reloads operands for later steps in between the FMADDs, so
 * the instruction order must be kept exactly.
 *
 * Control flow:
 *   - end of first copy:  ble .LL15 when L <= 0, otherwise fall through
 *   - end of second copy: bg  .LL13 when L  > 0, otherwise fall into .LL15
 * In both copies the "cmp L, 0" is issued well before the branch; no
 * instruction in between writes the condition codes.
 *
 * Register rotation: the row pair alternates between (a1,a2) and
 * (a3,a4); the fifth and seventh k-steps use a5 in place of a1.  The
 * first BO value of a step alternates between b1 and b9.
 *
 * NOTE(review): FMADD (x, y, z, z) is a macro defined outside this chunk;
 * presumably z = x * y + z.
 */
2176.LL13:
	/* ---- first copy, k-step 0: rows (a1,a2) ---- */
2177	FMADD	(aa1, bb1, cc01, cc01)
2178	FMADD	(aa2, bb1, cc02, cc02)
2179	FMADD	(aa1, bb2, cc03, cc03)
2180	FMADD	(aa2, bb2, cc04, cc04)
2181
2182	FMADD	(aa1, bb3, cc05, cc05)
2183	LDF	[BO + 16 * SIZE], b1
2184	FMADD	(aa2, bb3, cc06, cc06)
2185	LDF	[BO +  9 * SIZE], b2
2186
2187	FMADD	(aa1, bb4, cc07, cc07)
2188	LDF	[BO + 10 * SIZE], b3
2189	FMADD	(aa2, bb4, cc08, cc08)
2190	LDF	[BO + 11 * SIZE], b4
2191
2192	FMADD	(aa1, bb5, cc09, cc09)
2193	LDF	[AO +  2 * SIZE], a3
2194	FMADD	(aa2, bb5, cc10, cc10)
2195	LDF	[AO +  3 * SIZE], a4
2196
2197	FMADD	(aa1, bb6, cc11, cc11)
2198	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2199	FMADD	(aa2, bb6, cc12, cc12)
2200	nop
2201
2202	FMADD	(aa1, bb7, cc13, cc13)
2203	LDF	[BO + 12 * SIZE], b5
2204	FMADD	(aa2, bb7, cc14, cc14)
2205	LDF	[BO + 13 * SIZE], b6
2206
2207	FMADD	(aa1, bb8, cc15, cc15)
2208	LDF	[BO + 14 * SIZE], b7
2209	FMADD	(aa2, bb8, cc16, cc16)
2210	LDF	[BO + 15 * SIZE], b8
2211
	/* k-step 1: rows (a3,a4) */
2212	FMADD	(aa3, bb9, cc01, cc01)
2213	FMADD	(aa4, bb9, cc02, cc02)
2214	FMADD	(aa3, bb2, cc03, cc03)
2215	FMADD	(aa4, bb2, cc04, cc04)
2216
2217	FMADD	(aa3, bb3, cc05, cc05)
2218	LDF	[BO + 24 * SIZE], b9
2219	FMADD	(aa4, bb3, cc06, cc06)
2220	LDF	[BO + 17 * SIZE], b2
2221
2222	FMADD	(aa3, bb4, cc07, cc07)
2223	LDF	[BO + 18 * SIZE], b3
2224	FMADD	(aa4, bb4, cc08, cc08)
2225	LDF	[BO + 19 * SIZE], b4
2226
2227	FMADD	(aa3, bb5, cc09, cc09)
2228	LDF	[AO +  4 * SIZE], a1
2229	FMADD	(aa4, bb5, cc10, cc10)
2230	LDF	[AO +  5 * SIZE], a2
2231
2232	FMADD	(aa3, bb6, cc11, cc11)
	/* count this 8-step group */
2233	add	L, -1, L
2234	FMADD	(aa4, bb6, cc12, cc12)
2235	nop
2236
2237	FMADD	(aa3, bb7, cc13, cc13)
2238	LDF	[BO + 20 * SIZE], b5
2239	FMADD	(aa4, bb7, cc14, cc14)
2240	LDF	[BO + 21 * SIZE], b6
2241
2242	FMADD	(aa3, bb8, cc15, cc15)
2243	LDF	[BO + 22 * SIZE], b7
2244	FMADD	(aa4, bb8, cc16, cc16)
2245	LDF	[BO + 23 * SIZE], b8
2246
	/* k-step 2: rows (a1,a2) */
2247	FMADD	(aa1, bb1, cc01, cc01)
2248	FMADD	(aa2, bb1, cc02, cc02)
2249	FMADD	(aa1, bb2, cc03, cc03)
2250	FMADD	(aa2, bb2, cc04, cc04)
2251
2252	FMADD	(aa1, bb3, cc05, cc05)
2253	LDF	[BO + 32 * SIZE], b1
2254	FMADD	(aa2, bb3, cc06, cc06)
2255	LDF	[BO + 25 * SIZE], b2
2256
2257	FMADD	(aa1, bb4, cc07, cc07)
2258	LDF	[BO + 26 * SIZE], b3
2259	FMADD	(aa2, bb4, cc08, cc08)
2260	LDF	[BO + 27 * SIZE], b4
2261
2262	FMADD	(aa1, bb5, cc09, cc09)
2263	LDF	[AO +  6 * SIZE], a3
2264	FMADD	(aa2, bb5, cc10, cc10)
2265	LDF	[AO +  7 * SIZE], a4
2266
2267	FMADD	(aa1, bb6, cc11, cc11)
2268	nop
2269	FMADD	(aa2, bb6, cc12, cc12)
2270	nop
2271
2272	FMADD	(aa1, bb7, cc13, cc13)
2273	LDF	[BO + 28 * SIZE], b5
2274	FMADD	(aa2, bb7, cc14, cc14)
2275	LDF	[BO + 29 * SIZE], b6
2276
2277	FMADD	(aa1, bb8, cc15, cc15)
2278	LDF	[BO + 30 * SIZE], b7
2279	FMADD	(aa2, bb8, cc16, cc16)
2280	LDF	[BO + 31 * SIZE], b8
2281
	/* k-step 3: rows (a3,a4); a1 is reloaded with AO[16], i.e. AO[0] after the advance below */
2282	FMADD	(aa3, bb9, cc01, cc01)
2283	FMADD	(aa4, bb9, cc02, cc02)
2284	FMADD	(aa3, bb2, cc03, cc03)
2285	FMADD	(aa4, bb2, cc04, cc04)
2286
2287	FMADD	(aa3, bb3, cc05, cc05)
2288	LDF	[BO + 40 * SIZE], b9
2289	FMADD	(aa4, bb3, cc06, cc06)
2290	LDF	[BO + 33 * SIZE], b2
2291
2292	FMADD	(aa3, bb4, cc07, cc07)
2293	LDF	[BO + 34 * SIZE], b3
2294	FMADD	(aa4, bb4, cc08, cc08)
2295	LDF	[BO + 35 * SIZE], b4
2296
2297	FMADD	(aa3, bb5, cc09, cc09)
2298	LDF	[AO + 16 * SIZE], a1  /****/
2299	FMADD	(aa4, bb5, cc10, cc10)
2300	LDF	[AO +  9 * SIZE], a2
2301
2302	FMADD	(aa3, bb6, cc11, cc11)
2303	nop
2304	FMADD	(aa4, bb6, cc12, cc12)
2305	nop
2306
2307	FMADD	(aa3, bb7, cc13, cc13)
2308	LDF	[BO + 36 * SIZE], b5
2309	FMADD	(aa4, bb7, cc14, cc14)
2310	LDF	[BO + 37 * SIZE], b6
2311
2312	FMADD	(aa3, bb8, cc15, cc15)
2313	LDF	[BO + 38 * SIZE], b7
2314	FMADD	(aa4, bb8, cc16, cc16)
2315	LDF	[BO + 39 * SIZE], b8
2316
	/* k-step 4: rows (a5,a2) -- a5 was preloaded with AO[8] */
2317	FMADD	(aa5, bb1, cc01, cc01)
2318	FMADD	(aa2, bb1, cc02, cc02)
2319	FMADD	(aa5, bb2, cc03, cc03)
2320	FMADD	(aa2, bb2, cc04, cc04)
2321
2322	FMADD	(aa5, bb3, cc05, cc05)
2323	LDF	[BO + 48 * SIZE], b1
2324	FMADD	(aa2, bb3, cc06, cc06)
2325	LDF	[BO + 41 * SIZE], b2
2326
2327	FMADD	(aa5, bb4, cc07, cc07)
2328	LDF	[BO + 42 * SIZE], b3
2329	FMADD	(aa2, bb4, cc08, cc08)
2330	LDF	[BO + 43 * SIZE], b4
2331
2332	FMADD	(aa5, bb5, cc09, cc09)
2333	LDF	[AO + 10 * SIZE], a3
2334	FMADD	(aa2, bb5, cc10, cc10)
2335	LDF	[AO + 11 * SIZE], a4
2336
2337	FMADD	(aa5, bb6, cc11, cc11)
2338	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
2339	FMADD	(aa2, bb6, cc12, cc12)
2340	nop
2341
2342	FMADD	(aa5, bb7, cc13, cc13)
2343	LDF	[BO + 44 * SIZE], b5
2344	FMADD	(aa2, bb7, cc14, cc14)
2345	LDF	[BO + 45 * SIZE], b6
2346
2347	FMADD	(aa5, bb8, cc15, cc15)
2348	LDF	[BO + 46 * SIZE], b7
2349	FMADD	(aa2, bb8, cc16, cc16)
2350	LDF	[BO + 47 * SIZE], b8
2351
	/* k-step 5: rows (a3,a4) */
2352	FMADD	(aa3, bb9, cc01, cc01)
2353	FMADD	(aa4, bb9, cc02, cc02)
2354	FMADD	(aa3, bb2, cc03, cc03)
2355	FMADD	(aa4, bb2, cc04, cc04)
2356
2357	FMADD	(aa3, bb3, cc05, cc05)
2358	LDF	[BO + 56 * SIZE], b9
2359	FMADD	(aa4, bb3, cc06, cc06)
2360	LDF	[BO + 49 * SIZE], b2
2361
2362	FMADD	(aa3, bb4, cc07, cc07)
2363	LDF	[BO + 50 * SIZE], b3
2364	FMADD	(aa4, bb4, cc08, cc08)
2365	LDF	[BO + 51 * SIZE], b4
2366
2367	FMADD	(aa3, bb5, cc09, cc09)
2368	LDF	[AO + 12 * SIZE], a5
2369	FMADD	(aa4, bb5, cc10, cc10)
2370	LDF	[AO + 13 * SIZE], a2
2371
2372	FMADD	(aa3, bb6, cc11, cc11)
	/* sets %icc for the ble at the end of this copy */
2373	cmp	L, 0
2374	FMADD	(aa4, bb6, cc12, cc12)
2375	nop
2376
2377	FMADD	(aa3, bb7, cc13, cc13)
2378	LDF	[BO + 52 * SIZE], b5
2379	FMADD	(aa4, bb7, cc14, cc14)
2380	LDF	[BO + 53 * SIZE], b6
2381
2382	FMADD	(aa3, bb8, cc15, cc15)
2383	LDF	[BO + 54 * SIZE], b7
2384	FMADD	(aa4, bb8, cc16, cc16)
2385	LDF	[BO + 55 * SIZE], b8
2386
	/* k-step 6: rows (a5,a2); AO and BO are advanced in the middle of this step */
2387	FMADD	(aa5, bb1, cc01, cc01)
2388	FMADD	(aa2, bb1, cc02, cc02)
2389	FMADD	(aa5, bb2, cc03, cc03)
2390	FMADD	(aa2, bb2, cc04, cc04)
2391
2392	FMADD	(aa5, bb3, cc05, cc05)
2393	LDF	[BO + 64 * SIZE], b1
2394	FMADD	(aa2, bb3, cc06, cc06)
2395	LDF	[BO + 57 * SIZE], b2
2396
2397	FMADD	(aa5, bb4, cc07, cc07)
2398	LDF	[BO + 58 * SIZE], b3
2399	FMADD	(aa2, bb4, cc08, cc08)
2400	LDF	[BO + 59 * SIZE], b4
2401
2402	FMADD	(aa5, bb5, cc09, cc09)
2403	LDF	[AO + 14 * SIZE], a3
2404	FMADD	(aa2, bb5, cc10, cc10)
2405	LDF	[AO + 15 * SIZE], a4
2406
2407	FMADD	(aa5, bb6, cc11, cc11)
2408	add	BO, 64 * SIZE, BO
2409	FMADD	(aa2, bb6, cc12, cc12)
2410	add	AO, 16 * SIZE, AO
2411
	/* from here on offsets are relative to the advanced BO/AO */
2412	FMADD	(aa5, bb7, cc13, cc13)
2413	LDF	[BO -  4 * SIZE], b5
2414	FMADD	(aa2, bb7, cc14, cc14)
2415	LDF	[BO -  3 * SIZE], b6
2416
2417	FMADD	(aa5, bb8, cc15, cc15)
2418	LDF	[BO -  2 * SIZE], b7
2419	FMADD	(aa2, bb8, cc16, cc16)
2420	LDF	[BO -  1 * SIZE], b8
2421
	/* k-step 7: rows (a3,a4); reloads set up the same register state as the preload in .LL12 */
2422	FMADD	(aa3, bb9, cc01, cc01)
2423	FMADD	(aa4, bb9, cc02, cc02)
2424	FMADD	(aa3, bb2, cc03, cc03)
2425	FMADD	(aa4, bb2, cc04, cc04)
2426
2427	FMADD	(aa3, bb3, cc05, cc05)
2428	LDF	[BO +  8 * SIZE], b9
2429	FMADD	(aa4, bb3, cc06, cc06)
2430	LDF	[BO +  1 * SIZE], b2
2431
2432	FMADD	(aa3, bb4, cc07, cc07)
2433	LDF	[BO +  2 * SIZE], b3
2434	FMADD	(aa4, bb4, cc08, cc08)
2435	LDF	[BO +  3 * SIZE], b4
2436
2437	FMADD	(aa3, bb5, cc09, cc09)
2438	LDF	[AO +  8 * SIZE], a5  /****/
2439	FMADD	(aa4, bb5, cc10, cc10)
2440	LDF	[AO +  1 * SIZE], a2
2441
2442	FMADD	(aa3, bb6, cc11, cc11)
2443	FMADD	(aa4, bb6, cc12, cc12)
2444
2445	FMADD	(aa3, bb7, cc13, cc13)
2446	LDF	[BO +  4 * SIZE], b5
2447	FMADD	(aa4, bb7, cc14, cc14)
2448	LDF	[BO +  5 * SIZE], b6
2449
2450	FMADD	(aa3, bb8, cc15, cc15)
2451	LDF	[BO +  6 * SIZE], b7
2452	FMADD	(aa4, bb8, cc16, cc16)
	/* exit after the first copy when no groups remain; the b8 load is in the delay slot */
2453	ble,pn	%icc, .LL15
2454	LDF	[BO +  7 * SIZE], b8
2455
	/* ---- second copy, k-step 0: rows (a1,a2) ---- */
2456	FMADD	(aa1, bb1, cc01, cc01)
2457	FMADD	(aa2, bb1, cc02, cc02)
2458	FMADD	(aa1, bb2, cc03, cc03)
2459	FMADD	(aa2, bb2, cc04, cc04)
2460
2461	FMADD	(aa1, bb3, cc05, cc05)
2462	LDF	[BO + 16 * SIZE], b1
2463	FMADD	(aa2, bb3, cc06, cc06)
2464	LDF	[BO +  9 * SIZE], b2
2465
2466	FMADD	(aa1, bb4, cc07, cc07)
2467	LDF	[BO + 10 * SIZE], b3
2468	FMADD	(aa2, bb4, cc08, cc08)
2469	LDF	[BO + 11 * SIZE], b4
2470
2471	FMADD	(aa1, bb5, cc09, cc09)
2472	LDF	[AO +  2 * SIZE], a3
2473	FMADD	(aa2, bb5, cc10, cc10)
2474	LDF	[AO +  3 * SIZE], a4
2475
2476	FMADD	(aa1, bb6, cc11, cc11)
2477	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2478	FMADD	(aa2, bb6, cc12, cc12)
2479	nop
2480
2481	FMADD	(aa1, bb7, cc13, cc13)
2482	LDF	[BO + 12 * SIZE], b5
2483	FMADD	(aa2, bb7, cc14, cc14)
2484	LDF	[BO + 13 * SIZE], b6
2485
2486	FMADD	(aa1, bb8, cc15, cc15)
2487	LDF	[BO + 14 * SIZE], b7
2488	FMADD	(aa2, bb8, cc16, cc16)
2489	LDF	[BO + 15 * SIZE], b8
2490
	/* k-step 1: rows (a3,a4) */
2491	FMADD	(aa3, bb9, cc01, cc01)
2492	FMADD	(aa4, bb9, cc02, cc02)
2493	FMADD	(aa3, bb2, cc03, cc03)
2494	FMADD	(aa4, bb2, cc04, cc04)
2495
2496	FMADD	(aa3, bb3, cc05, cc05)
2497	LDF	[BO + 24 * SIZE], b9
2498	FMADD	(aa4, bb3, cc06, cc06)
2499	LDF	[BO + 17 * SIZE], b2
2500
2501	FMADD	(aa3, bb4, cc07, cc07)
2502	LDF	[BO + 18 * SIZE], b3
2503	FMADD	(aa4, bb4, cc08, cc08)
2504	LDF	[BO + 19 * SIZE], b4
2505
2506	FMADD	(aa3, bb5, cc09, cc09)
2507	LDF	[AO +  4 * SIZE], a1
2508	FMADD	(aa4, bb5, cc10, cc10)
2509	LDF	[AO +  5 * SIZE], a2
2510
2511	FMADD	(aa3, bb6, cc11, cc11)
	/* count this 8-step group */
2512	add	L, -1, L
2513	FMADD	(aa4, bb6, cc12, cc12)
2514	nop
2515
2516	FMADD	(aa3, bb7, cc13, cc13)
2517	LDF	[BO + 20 * SIZE], b5
2518	FMADD	(aa4, bb7, cc14, cc14)
2519	LDF	[BO + 21 * SIZE], b6
2520
2521	FMADD	(aa3, bb8, cc15, cc15)
2522	LDF	[BO + 22 * SIZE], b7
2523	FMADD	(aa4, bb8, cc16, cc16)
2524	LDF	[BO + 23 * SIZE], b8
2525
	/* k-step 2: rows (a1,a2) */
2526	FMADD	(aa1, bb1, cc01, cc01)
2527	FMADD	(aa2, bb1, cc02, cc02)
2528	FMADD	(aa1, bb2, cc03, cc03)
2529	FMADD	(aa2, bb2, cc04, cc04)
2530
2531	FMADD	(aa1, bb3, cc05, cc05)
2532	LDF	[BO + 32 * SIZE], b1
2533	FMADD	(aa2, bb3, cc06, cc06)
2534	LDF	[BO + 25 * SIZE], b2
2535
2536	FMADD	(aa1, bb4, cc07, cc07)
2537	LDF	[BO + 26 * SIZE], b3
2538	FMADD	(aa2, bb4, cc08, cc08)
2539	LDF	[BO + 27 * SIZE], b4
2540
2541	FMADD	(aa1, bb5, cc09, cc09)
2542	LDF	[AO +  6 * SIZE], a3
2543	FMADD	(aa2, bb5, cc10, cc10)
2544	LDF	[AO +  7 * SIZE], a4
2545
2546	FMADD	(aa1, bb6, cc11, cc11)
2547	nop
2548	FMADD	(aa2, bb6, cc12, cc12)
2549	nop
2550
2551	FMADD	(aa1, bb7, cc13, cc13)
2552	LDF	[BO + 28 * SIZE], b5
2553	FMADD	(aa2, bb7, cc14, cc14)
2554	LDF	[BO + 29 * SIZE], b6
2555
2556	FMADD	(aa1, bb8, cc15, cc15)
2557	LDF	[BO + 30 * SIZE], b7
2558	FMADD	(aa2, bb8, cc16, cc16)
2559	LDF	[BO + 31 * SIZE], b8
2560
	/* k-step 3: rows (a3,a4) */
2561	FMADD	(aa3, bb9, cc01, cc01)
2562	FMADD	(aa4, bb9, cc02, cc02)
2563	FMADD	(aa3, bb2, cc03, cc03)
2564	FMADD	(aa4, bb2, cc04, cc04)
2565
2566	FMADD	(aa3, bb3, cc05, cc05)
2567	LDF	[BO + 40 * SIZE], b9
2568	FMADD	(aa4, bb3, cc06, cc06)
2569	LDF	[BO + 33 * SIZE], b2
2570
2571	FMADD	(aa3, bb4, cc07, cc07)
2572	LDF	[BO + 34 * SIZE], b3
2573	FMADD	(aa4, bb4, cc08, cc08)
2574	LDF	[BO + 35 * SIZE], b4
2575
2576	FMADD	(aa3, bb5, cc09, cc09)
2577	LDF	[AO + 16 * SIZE], a1  /****/
2578	FMADD	(aa4, bb5, cc10, cc10)
2579	LDF	[AO +  9 * SIZE], a2
2580
2581	FMADD	(aa3, bb6, cc11, cc11)
2582	nop
2583	FMADD	(aa4, bb6, cc12, cc12)
2584	nop
2585
2586	FMADD	(aa3, bb7, cc13, cc13)
2587	LDF	[BO + 36 * SIZE], b5
2588	FMADD	(aa4, bb7, cc14, cc14)
2589	LDF	[BO + 37 * SIZE], b6
2590
2591	FMADD	(aa3, bb8, cc15, cc15)
2592	LDF	[BO + 38 * SIZE], b7
2593	FMADD	(aa4, bb8, cc16, cc16)
2594	LDF	[BO + 39 * SIZE], b8
2595
	/* k-step 4: rows (a5,a2) */
2596	FMADD	(aa5, bb1, cc01, cc01)
2597	FMADD	(aa2, bb1, cc02, cc02)
2598	FMADD	(aa5, bb2, cc03, cc03)
2599	FMADD	(aa2, bb2, cc04, cc04)
2600
2601	FMADD	(aa5, bb3, cc05, cc05)
2602	LDF	[BO + 48 * SIZE], b1
2603	FMADD	(aa2, bb3, cc06, cc06)
2604	LDF	[BO + 41 * SIZE], b2
2605
2606	FMADD	(aa5, bb4, cc07, cc07)
2607	LDF	[BO + 42 * SIZE], b3
2608	FMADD	(aa2, bb4, cc08, cc08)
2609	LDF	[BO + 43 * SIZE], b4
2610
2611	FMADD	(aa5, bb5, cc09, cc09)
2612	LDF	[AO + 10 * SIZE], a3
2613	FMADD	(aa2, bb5, cc10, cc10)
2614	LDF	[AO + 11 * SIZE], a4
2615
2616	FMADD	(aa5, bb6, cc11, cc11)
2617	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
2618	FMADD	(aa2, bb6, cc12, cc12)
2619	nop
2620
2621	FMADD	(aa5, bb7, cc13, cc13)
2622	LDF	[BO + 44 * SIZE], b5
2623	FMADD	(aa2, bb7, cc14, cc14)
2624	LDF	[BO + 45 * SIZE], b6
2625
2626	FMADD	(aa5, bb8, cc15, cc15)
2627	LDF	[BO + 46 * SIZE], b7
2628	FMADD	(aa2, bb8, cc16, cc16)
2629	LDF	[BO + 47 * SIZE], b8
2630
	/* k-step 5: rows (a3,a4) */
2631	FMADD	(aa3, bb9, cc01, cc01)
2632	FMADD	(aa4, bb9, cc02, cc02)
2633	FMADD	(aa3, bb2, cc03, cc03)
2634	FMADD	(aa4, bb2, cc04, cc04)
2635
2636	FMADD	(aa3, bb3, cc05, cc05)
2637	LDF	[BO + 56 * SIZE], b9
2638	FMADD	(aa4, bb3, cc06, cc06)
2639	LDF	[BO + 49 * SIZE], b2
2640
2641	FMADD	(aa3, bb4, cc07, cc07)
2642	LDF	[BO + 50 * SIZE], b3
2643	FMADD	(aa4, bb4, cc08, cc08)
2644	LDF	[BO + 51 * SIZE], b4
2645
2646	FMADD	(aa3, bb5, cc09, cc09)
2647	LDF	[AO + 12 * SIZE], a5
2648	FMADD	(aa4, bb5, cc10, cc10)
2649	LDF	[AO + 13 * SIZE], a2
2650
2651	FMADD	(aa3, bb6, cc11, cc11)
	/* sets %icc for the bg at the end of this copy */
2652	cmp	L, 0
2653	FMADD	(aa4, bb6, cc12, cc12)
2654	nop
2655
2656	FMADD	(aa3, bb7, cc13, cc13)
2657	LDF	[BO + 52 * SIZE], b5
2658	FMADD	(aa4, bb7, cc14, cc14)
2659	LDF	[BO + 53 * SIZE], b6
2660
2661	FMADD	(aa3, bb8, cc15, cc15)
2662	LDF	[BO + 54 * SIZE], b7
2663	FMADD	(aa4, bb8, cc16, cc16)
2664	LDF	[BO + 55 * SIZE], b8
2665
	/* k-step 6: rows (a5,a2); AO and BO are advanced in the middle of this step */
2666	FMADD	(aa5, bb1, cc01, cc01)
2667	FMADD	(aa2, bb1, cc02, cc02)
2668	FMADD	(aa5, bb2, cc03, cc03)
2669	FMADD	(aa2, bb2, cc04, cc04)
2670
2671	FMADD	(aa5, bb3, cc05, cc05)
2672	LDF	[BO + 64 * SIZE], b1
2673	FMADD	(aa2, bb3, cc06, cc06)
2674	LDF	[BO + 57 * SIZE], b2
2675
2676	FMADD	(aa5, bb4, cc07, cc07)
2677	LDF	[BO + 58 * SIZE], b3
2678	FMADD	(aa2, bb4, cc08, cc08)
2679	LDF	[BO + 59 * SIZE], b4
2680
2681	FMADD	(aa5, bb5, cc09, cc09)
2682	LDF	[AO + 14 * SIZE], a3
2683	FMADD	(aa2, bb5, cc10, cc10)
2684	LDF	[AO + 15 * SIZE], a4
2685
2686	FMADD	(aa5, bb6, cc11, cc11)
2687	add	BO, 64 * SIZE, BO
2688	FMADD	(aa2, bb6, cc12, cc12)
2689	add	AO, 16 * SIZE, AO
2690
2691	FMADD	(aa5, bb7, cc13, cc13)
2692	LDF	[BO -  4 * SIZE], b5
2693	FMADD	(aa2, bb7, cc14, cc14)
2694	LDF	[BO -  3 * SIZE], b6
2695
2696	FMADD	(aa5, bb8, cc15, cc15)
2697	LDF	[BO -  2 * SIZE], b7
2698	FMADD	(aa2, bb8, cc16, cc16)
2699	LDF	[BO -  1 * SIZE], b8
2700
	/* k-step 7: rows (a3,a4) */
2701	FMADD	(aa3, bb9, cc01, cc01)
2702	FMADD	(aa4, bb9, cc02, cc02)
2703	FMADD	(aa3, bb2, cc03, cc03)
2704	FMADD	(aa4, bb2, cc04, cc04)
2705
2706	FMADD	(aa3, bb3, cc05, cc05)
2707	LDF	[BO +  8 * SIZE], b9
2708	FMADD	(aa4, bb3, cc06, cc06)
2709	LDF	[BO +  1 * SIZE], b2
2710
2711	FMADD	(aa3, bb4, cc07, cc07)
2712	LDF	[BO +  2 * SIZE], b3
2713	FMADD	(aa4, bb4, cc08, cc08)
2714	LDF	[BO +  3 * SIZE], b4
2715
2716	FMADD	(aa3, bb5, cc09, cc09)
2717	LDF	[AO +  8 * SIZE], a5  /****/
2718	FMADD	(aa4, bb5, cc10, cc10)
2719	LDF	[AO +  1 * SIZE], a2
2720
2721	FMADD	(aa3, bb6, cc11, cc11)
2722	FMADD	(aa4, bb6, cc12, cc12)
2723
2724	FMADD	(aa3, bb7, cc13, cc13)
2725	LDF	[BO +  4 * SIZE], b5
2726	FMADD	(aa4, bb7, cc14, cc14)
2727	LDF	[BO +  5 * SIZE], b6
2728
2729	FMADD	(aa3, bb8, cc15, cc15)
2730	LDF	[BO +  6 * SIZE], b7
2731	FMADD	(aa4, bb8, cc16, cc16)
	/* loop while groups remain; the b8 load is in the delay slot */
2732	bg,pt	%icc, .LL13
2733	LDF	[BO +  7 * SIZE], b8
2734	.align 4
2735
/*
 * .LL15: compute the number of k-steps left over after the 8-step
 * groups (KK & 7 for LT/RN, (K - KK) & 7 otherwise) and skip to the
 * solve stage .LL18 when there are none.  The branch has the annul bit
 * set and its delay slot holds only a nop.
 */
2736.LL15:
2737#if defined(LT) || defined(RN)
2738	and	KK, 7, L
2739#else
2740	sub	K, KK, L
2741	and	L,  7, L
2742#endif
2743	cmp	L,  0
2744	ble,a,pn %icc, .LL18
2745	nop
2746	.align 4
2747
/*
 * .LL17: remainder loop for the 2x8 tile, one k-step per iteration.
 * Multiplies the row pair (a1,a2) by b1..b8 into cc01..cc16, reloads
 * b1..b8 and a1/a2 for the next step, and advances AO by 2 elements
 * and BO by 8.  Repeats while L > 0.
 *
 * b1..b6 are reloaded at offsets 8..13 before BO is advanced; b7/b8
 * are reloaded at offsets 6/7 after the advance, which addresses the
 * same next group of eight values.
 */
2748.LL17:
2749	FMADD	(aa1, bb1, cc01, cc01)
2750	add	L, -1, L
2751	FMADD	(aa2, bb1, cc02, cc02)
2752	nop
2753
2754	FMADD	(aa1, bb2, cc03, cc03)
2755	LDF	[BO +  8 * SIZE], b1
2756	FMADD	(aa2, bb2, cc04, cc04)
2757	LDF	[BO +  9 * SIZE], b2
2758
2759	FMADD	(aa1, bb3, cc05, cc05)
	/* sets %icc for the bg at the bottom of the loop */
2760	cmp	L, 0
2761	FMADD	(aa2, bb3, cc06, cc06)
2762	nop
2763
2764	FMADD	(aa1, bb4, cc07, cc07)
2765	LDF	[BO + 10 * SIZE], b3
2766	FMADD	(aa2, bb4, cc08, cc08)
2767	LDF	[BO + 11 * SIZE], b4
2768
2769	FMADD	(aa1, bb5, cc09, cc09)
2770	nop
2771	FMADD	(aa2, bb5, cc10, cc10)
2772	nop
2773
2774	FMADD	(aa1, bb6, cc11, cc11)
2775	LDF	[BO + 12 * SIZE], b5
2776	FMADD	(aa2, bb6, cc12, cc12)
2777	LDF	[BO + 13 * SIZE], b6
2778
2779	FMADD	(aa1, bb7, cc13, cc13)
2780	add	AO, 2 * SIZE, AO
2781	FMADD	(aa2, bb7, cc14, cc14)
2782	add	BO, 8 * SIZE, BO
2783
	/* a1 and a2 are each reloaded immediately after their final use in this step */
2784	FMADD	(aa1, bb8, cc15, cc15)
2785	LDF	[AO +  0 * SIZE], a1
2786	FMADD	(aa2, bb8, cc16, cc16)
2787	LDF	[AO +  1 * SIZE], a2
2788
2789	LDF	[BO +  6 * SIZE], b7
	/* the b8 reload sits in the branch delay slot */
2790	bg,pt	%icc, .LL17
2791	LDF	[BO +  7 * SIZE], b8
2792	nop
2793	.align 4
2794
2795.LL18:
2796#if defined(LN) || defined(RT)
2797#ifdef LN
2798	sub	KK, 2, TEMP1
2799#else
2800	sub	KK, 8, TEMP1
2801#endif
2802	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2803	sll	TEMP1, BASE_SHIFT + 3, TEMP1
2804
2805	add	AORIG, TEMP2, AO
2806	add	B,     TEMP1, BO
2807#endif
2808
2809#if defined(LN) || defined(LT)
2810	LDF	[BO +  0 * SIZE], a1
2811	LDF	[BO +  1 * SIZE], a2
2812	LDF	[BO +  2 * SIZE], a3
2813	LDF	[BO +  3 * SIZE], a4
2814
2815	LDF	[BO +  4 * SIZE], b1
2816	LDF	[BO +  5 * SIZE], b2
2817	LDF	[BO +  6 * SIZE], b3
2818	LDF	[BO +  7 * SIZE], b4
2819
2820	FSUB	a1, c01, c01
2821	FSUB	a2, c03, c03
2822	FSUB	a3, c05, c05
2823	FSUB	a4, c07, c07
2824
2825	FSUB	b1, c09, c09
2826	FSUB	b2, c11, c11
2827	FSUB	b3, c13, c13
2828	FSUB	b4, c15, c15
2829
2830	LDF	[BO +  8 * SIZE], a1
2831	LDF	[BO +  9 * SIZE], a2
2832	LDF	[BO + 10 * SIZE], a3
2833	LDF	[BO + 11 * SIZE], a4
2834
2835	LDF	[BO + 12 * SIZE], b1
2836	LDF	[BO + 13 * SIZE], b2
2837	LDF	[BO + 14 * SIZE], b3
2838	LDF	[BO + 15 * SIZE], b4
2839
2840	FSUB	a1, c02, c02
2841	FSUB	a2, c04, c04
2842	FSUB	a3, c06, c06
2843	FSUB	a4, c08, c08
2844
2845	FSUB	b1, c10, c10
2846	FSUB	b2, c12, c12
2847	FSUB	b3, c14, c14
2848	FSUB	b4, c16, c16
2849#else
2850	LDF	[AO +  0 * SIZE], a1
2851	LDF	[AO +  1 * SIZE], a2
2852	LDF	[AO +  2 * SIZE], a3
2853	LDF	[AO +  3 * SIZE], a4
2854
2855	LDF	[AO +  4 * SIZE], b1
2856	LDF	[AO +  5 * SIZE], b2
2857	LDF	[AO +  6 * SIZE], b3
2858	LDF	[AO +  7 * SIZE], b4
2859
2860	FSUB	a1, c01, c01
2861	FSUB	a2, c02, c02
2862	FSUB	a3, c03, c03
2863	FSUB	a4, c04, c04
2864
2865	FSUB	b1, c05, c05
2866	FSUB	b2, c06, c06
2867	FSUB	b3, c07, c07
2868	FSUB	b4, c08, c08
2869
2870	LDF	[AO +  8 * SIZE], a1
2871	LDF	[AO +  9 * SIZE], a2
2872	LDF	[AO + 10 * SIZE], a3
2873	LDF	[AO + 11 * SIZE], a4
2874
2875	LDF	[AO + 12 * SIZE], b1
2876	LDF	[AO + 13 * SIZE], b2
2877	LDF	[AO + 14 * SIZE], b3
2878	LDF	[AO + 15 * SIZE], b4
2879
2880	FSUB	a1, c09, c09
2881	FSUB	a2, c10, c10
2882	FSUB	a3, c11, c11
2883	FSUB	a4, c12, c12
2884
2885	FSUB	b1, c13, c13
2886	FSUB	b2, c14, c14
2887	FSUB	b3, c15, c15
2888	FSUB	b4, c16, c16
2889#endif
2890
2891#ifdef LN
2892	LDF	[AO +  3 * SIZE], a1
2893	LDF	[AO +  2 * SIZE], a2
2894	LDF	[AO +  0 * SIZE], a3
2895
2896	FMUL	a1, c02, c02
2897	FMUL	a1, c04, c04
2898	FMUL	a1, c06, c06
2899	FMUL	a1, c08, c08
2900	FMUL	a1, c10, c10
2901	FMUL	a1, c12, c12
2902	FMUL	a1, c14, c14
2903	FMUL	a1, c16, c16
2904
2905	FNMSUB	(aa2, cc02, cc01, cc01)
2906	FNMSUB	(aa2, cc04, cc03, cc03)
2907	FNMSUB	(aa2, cc06, cc05, cc05)
2908	FNMSUB	(aa2, cc08, cc07, cc07)
2909	FNMSUB	(aa2, cc10, cc09, cc09)
2910	FNMSUB	(aa2, cc12, cc11, cc11)
2911	FNMSUB	(aa2, cc14, cc13, cc13)
2912	FNMSUB	(aa2, cc16, cc15, cc15)
2913
2914	FMUL	a3, c01, c01
2915	FMUL	a3, c03, c03
2916	FMUL	a3, c05, c05
2917	FMUL	a3, c07, c07
2918	FMUL	a3, c09, c09
2919	FMUL	a3, c11, c11
2920	FMUL	a3, c13, c13
2921	FMUL	a3, c15, c15
2922#endif
2923
2924#ifdef LT
2925	LDF	[AO +  0 * SIZE], a1
2926	LDF	[AO +  1 * SIZE], a2
2927	LDF	[AO +  3 * SIZE], a3
2928
2929	FMUL	a1, c01, c01
2930	FMUL	a1, c03, c03
2931	FMUL	a1, c05, c05
2932	FMUL	a1, c07, c07
2933	FMUL	a1, c09, c09
2934	FMUL	a1, c11, c11
2935	FMUL	a1, c13, c13
2936	FMUL	a1, c15, c15
2937
2938	FNMSUB	(aa2, cc01, cc02, cc02)
2939	FNMSUB	(aa2, cc03, cc04, cc04)
2940	FNMSUB	(aa2, cc05, cc06, cc06)
2941	FNMSUB	(aa2, cc07, cc08, cc08)
2942	FNMSUB	(aa2, cc09, cc10, cc10)
2943	FNMSUB	(aa2, cc11, cc12, cc12)
2944	FNMSUB	(aa2, cc13, cc14, cc14)
2945	FNMSUB	(aa2, cc15, cc16, cc16)
2946
2947	FMUL	a3, c02, c02
2948	FMUL	a3, c04, c04
2949	FMUL	a3, c06, c06
2950	FMUL	a3, c08, c08
2951	FMUL	a3, c10, c10
2952	FMUL	a3, c12, c12
2953	FMUL	a3, c14, c14
2954	FMUL	a3, c16, c16
2955#endif
2956
2957#ifdef RN
2958	LDF	[BO +  0 * SIZE], a1
2959	LDF	[BO +  1 * SIZE], a2
2960	LDF	[BO +  2 * SIZE], a3
2961	LDF	[BO +  3 * SIZE], a4
2962	LDF	[BO +  4 * SIZE], b1
2963	LDF	[BO +  5 * SIZE], b2
2964	LDF	[BO +  6 * SIZE], b3
2965	LDF	[BO +  7 * SIZE], b4
2966
2967	FMUL	a1, c01, c01
2968	FMUL	a1, c02, c02
2969
2970	FNMSUB	(aa2, cc01, cc03, cc03)
2971	FNMSUB	(aa2, cc02, cc04, cc04)
2972	FNMSUB	(aa3, cc01, cc05, cc05)
2973	FNMSUB	(aa3, cc02, cc06, cc06)
2974	FNMSUB	(aa4, cc01, cc07, cc07)
2975	FNMSUB	(aa4, cc02, cc08, cc08)
2976	FNMSUB	(bb1, cc01, cc09, cc09)
2977	FNMSUB	(bb1, cc02, cc10, cc10)
2978	FNMSUB	(bb2, cc01, cc11, cc11)
2979	FNMSUB	(bb2, cc02, cc12, cc12)
2980	FNMSUB	(bb3, cc01, cc13, cc13)
2981	FNMSUB	(bb3, cc02, cc14, cc14)
2982	FNMSUB	(bb4, cc01, cc15, cc15)
2983	FNMSUB	(bb4, cc02, cc16, cc16)
2984
2985	LDF	[BO +  9 * SIZE], a1
2986	LDF	[BO + 10 * SIZE], a2
2987	LDF	[BO + 11 * SIZE], a3
2988	LDF	[BO + 12 * SIZE], a4
2989	LDF	[BO + 13 * SIZE], b1
2990	LDF	[BO + 14 * SIZE], b2
2991	LDF	[BO + 15 * SIZE], b3
2992
2993	FMUL	a1, c03, c03
2994	FMUL	a1, c04, c04
2995
2996	FNMSUB	(aa2, cc03, cc05, cc05)
2997	FNMSUB	(aa2, cc04, cc06, cc06)
2998	FNMSUB	(aa3, cc03, cc07, cc07)
2999	FNMSUB	(aa3, cc04, cc08, cc08)
3000	FNMSUB	(aa4, cc03, cc09, cc09)
3001	FNMSUB	(aa4, cc04, cc10, cc10)
3002	FNMSUB	(bb1, cc03, cc11, cc11)
3003	FNMSUB	(bb1, cc04, cc12, cc12)
3004	FNMSUB	(bb2, cc03, cc13, cc13)
3005	FNMSUB	(bb2, cc04, cc14, cc14)
3006	FNMSUB	(bb3, cc03, cc15, cc15)
3007	FNMSUB	(bb3, cc04, cc16, cc16)
3008
3009	LDF	[BO + 18 * SIZE], a1
3010	LDF	[BO + 19 * SIZE], a2
3011	LDF	[BO + 20 * SIZE], a3
3012	LDF	[BO + 21 * SIZE], a4
3013	LDF	[BO + 22 * SIZE], b1
3014	LDF	[BO + 23 * SIZE], b2
3015
3016	FMUL	a1, c05, c05
3017	FMUL	a1, c06, c06
3018
3019	FNMSUB	(aa2, cc05, cc07, cc07)
3020	FNMSUB	(aa2, cc06, cc08, cc08)
3021	FNMSUB	(aa3, cc05, cc09, cc09)
3022	FNMSUB	(aa3, cc06, cc10, cc10)
3023	FNMSUB	(aa4, cc05, cc11, cc11)
3024	FNMSUB	(aa4, cc06, cc12, cc12)
3025	FNMSUB	(bb1, cc05, cc13, cc13)
3026	FNMSUB	(bb1, cc06, cc14, cc14)
3027	FNMSUB	(bb2, cc05, cc15, cc15)
3028	FNMSUB	(bb2, cc06, cc16, cc16)
3029
3030	LDF	[BO + 27 * SIZE], a1
3031	LDF	[BO + 28 * SIZE], a2
3032	LDF	[BO + 29 * SIZE], a3
3033	LDF	[BO + 30 * SIZE], a4
3034	LDF	[BO + 31 * SIZE], b1
3035
3036	FMUL	a1, c07, c07
3037	FMUL	a1, c08, c08
3038
3039	FNMSUB	(aa2, cc07, cc09, cc09)
3040	FNMSUB	(aa2, cc08, cc10, cc10)
3041	FNMSUB	(aa3, cc07, cc11, cc11)
3042	FNMSUB	(aa3, cc08, cc12, cc12)
3043	FNMSUB	(aa4, cc07, cc13, cc13)
3044	FNMSUB	(aa4, cc08, cc14, cc14)
3045	FNMSUB	(bb1, cc07, cc15, cc15)
3046	FNMSUB	(bb1, cc08, cc16, cc16)
3047
3048	LDF	[BO + 36 * SIZE], a1
3049	LDF	[BO + 37 * SIZE], a2
3050	LDF	[BO + 38 * SIZE], a3
3051	LDF	[BO + 39 * SIZE], a4
3052
3053	FMUL	a1, c09, c09
3054	FMUL	a1, c10, c10
3055
3056	FNMSUB	(aa2, cc09, cc11, cc11)
3057	FNMSUB	(aa2, cc10, cc12, cc12)
3058	FNMSUB	(aa3, cc09, cc13, cc13)
3059	FNMSUB	(aa3, cc10, cc14, cc14)
3060	FNMSUB	(aa4, cc09, cc15, cc15)
3061	FNMSUB	(aa4, cc10, cc16, cc16)
3062
3063	LDF	[BO + 45 * SIZE], a1
3064	LDF	[BO + 46 * SIZE], a2
3065	LDF	[BO + 47 * SIZE], a3
3066
3067	FMUL	a1, c11, c11
3068	FMUL	a1, c12, c12
3069
3070	FNMSUB	(aa2, cc11, cc13, cc13)
3071	FNMSUB	(aa2, cc12, cc14, cc14)
3072	FNMSUB	(aa3, cc11, cc15, cc15)
3073	FNMSUB	(aa3, cc12, cc16, cc16)
3074
3075	LDF	[BO + 54 * SIZE], a1
3076	LDF	[BO + 55 * SIZE], a2
3077
3078	FMUL	a1, c13, c13
3079	FMUL	a1, c14, c14
3080
3081	FNMSUB	(aa2, cc13, cc15, cc15)
3082	FNMSUB	(aa2, cc14, cc16, cc16)
3083
3084	LDF	[BO + 63 * SIZE], a1
3085
3086	FMUL	a1, c15, c15
3087	FMUL	a1, c16, c16
3088#endif
3089
3090#ifdef RT
3091	LDF	[BO + 63 * SIZE], a1
3092	LDF	[BO + 62 * SIZE], a2
3093	LDF	[BO + 61 * SIZE], a3
3094	LDF	[BO + 60 * SIZE], a4
3095	LDF	[BO + 59 * SIZE], b1
3096	LDF	[BO + 58 * SIZE], b2
3097	LDF	[BO + 57 * SIZE], b3
3098	LDF	[BO + 56 * SIZE], b4
3099
3100	FMUL	a1, c16, c16
3101	FMUL	a1, c15, c15
3102
3103	FNMSUB	(aa2, cc16, cc14, cc14)
3104	FNMSUB	(aa2, cc15, cc13, cc13)
3105	FNMSUB	(aa3, cc16, cc12, cc12)
3106	FNMSUB	(aa3, cc15, cc11, cc11)
3107	FNMSUB	(aa4, cc16, cc10, cc10)
3108	FNMSUB	(aa4, cc15, cc09, cc09)
3109	FNMSUB	(bb1, cc16, cc08, cc08)
3110	FNMSUB	(bb1, cc15, cc07, cc07)
3111	FNMSUB	(bb2, cc16, cc06, cc06)
3112	FNMSUB	(bb2, cc15, cc05, cc05)
3113	FNMSUB	(bb3, cc16, cc04, cc04)
3114	FNMSUB	(bb3, cc15, cc03, cc03)
3115	FNMSUB	(bb4, cc16, cc02, cc02)
3116	FNMSUB	(bb4, cc15, cc01, cc01)
3117
3118	LDF	[BO + 54 * SIZE], a1
3119	LDF	[BO + 53 * SIZE], a2
3120	LDF	[BO + 52 * SIZE], a3
3121	LDF	[BO + 51 * SIZE], a4
3122	LDF	[BO + 50 * SIZE], b1
3123	LDF	[BO + 49 * SIZE], b2
3124	LDF	[BO + 48 * SIZE], b3
3125
3126	FMUL	a1, c14, c14
3127	FMUL	a1, c13, c13
3128
3129	FNMSUB	(aa2, cc14, cc12, cc12)
3130	FNMSUB	(aa2, cc13, cc11, cc11)
3131	FNMSUB	(aa3, cc14, cc10, cc10)
3132	FNMSUB	(aa3, cc13, cc09, cc09)
3133	FNMSUB	(aa4, cc14, cc08, cc08)
3134	FNMSUB	(aa4, cc13, cc07, cc07)
3135	FNMSUB	(bb1, cc14, cc06, cc06)
3136	FNMSUB	(bb1, cc13, cc05, cc05)
3137	FNMSUB	(bb2, cc14, cc04, cc04)
3138	FNMSUB	(bb2, cc13, cc03, cc03)
3139	FNMSUB	(bb3, cc14, cc02, cc02)
3140	FNMSUB	(bb3, cc13, cc01, cc01)
3141
3142	LDF	[BO + 45 * SIZE], a1
3143	LDF	[BO + 44 * SIZE], a2
3144	LDF	[BO + 43 * SIZE], a3
3145	LDF	[BO + 42 * SIZE], a4
3146	LDF	[BO + 41 * SIZE], b1
3147	LDF	[BO + 40 * SIZE], b2
3148
3149	FMUL	a1, c12, c12
3150	FMUL	a1, c11, c11
3151
3152	FNMSUB	(aa2, cc12, cc10, cc10)
3153	FNMSUB	(aa2, cc11, cc09, cc09)
3154	FNMSUB	(aa3, cc12, cc08, cc08)
3155	FNMSUB	(aa3, cc11, cc07, cc07)
3156	FNMSUB	(aa4, cc12, cc06, cc06)
3157	FNMSUB	(aa4, cc11, cc05, cc05)
3158	FNMSUB	(bb1, cc12, cc04, cc04)
3159	FNMSUB	(bb1, cc11, cc03, cc03)
3160	FNMSUB	(bb2, cc12, cc02, cc02)
3161	FNMSUB	(bb2, cc11, cc01, cc01)
3162
3163	LDF	[BO + 36 * SIZE], a1
3164	LDF	[BO + 35 * SIZE], a2
3165	LDF	[BO + 34 * SIZE], a3
3166	LDF	[BO + 33 * SIZE], a4
3167	LDF	[BO + 32 * SIZE], b1
3168
3169	FMUL	a1, c10, c10
3170	FMUL	a1, c09, c09
3171
3172	FNMSUB	(aa2, cc10, cc08, cc08)
3173	FNMSUB	(aa2, cc09, cc07, cc07)
3174	FNMSUB	(aa3, cc10, cc06, cc06)
3175	FNMSUB	(aa3, cc09, cc05, cc05)
3176	FNMSUB	(aa4, cc10, cc04, cc04)
3177	FNMSUB	(aa4, cc09, cc03, cc03)
3178	FNMSUB	(bb1, cc10, cc02, cc02)
3179	FNMSUB	(bb1, cc09, cc01, cc01)
3180
3181	LDF	[BO + 27 * SIZE], a1
3182	LDF	[BO + 26 * SIZE], a2
3183	LDF	[BO + 25 * SIZE], a3
3184	LDF	[BO + 24 * SIZE], a4
3185
3186	FMUL	a1, c08, c08
3187	FMUL	a1, c07, c07
3188
3189	FNMSUB	(aa2, cc08, cc06, cc06)
3190	FNMSUB	(aa2, cc07, cc05, cc05)
3191	FNMSUB	(aa3, cc08, cc04, cc04)
3192	FNMSUB	(aa3, cc07, cc03, cc03)
3193	FNMSUB	(aa4, cc08, cc02, cc02)
3194	FNMSUB	(aa4, cc07, cc01, cc01)
3195
3196	LDF	[BO + 18 * SIZE], a1
3197	LDF	[BO + 17 * SIZE], a2
3198	LDF	[BO + 16 * SIZE], a3
3199
3200	FMUL	a1, c06, c06
3201	FMUL	a1, c05, c05
3202
3203	FNMSUB	(aa2, cc06, cc04, cc04)
3204	FNMSUB	(aa2, cc05, cc03, cc03)
3205	FNMSUB	(aa3, cc06, cc02, cc02)
3206	FNMSUB	(aa3, cc05, cc01, cc01)
3207
3208	LDF	[BO +  9 * SIZE], a1
3209	LDF	[BO +  8 * SIZE], a2
3210
3211	FMUL	a1, c04, c04
3212	FMUL	a1, c03, c03
3213
3214	FNMSUB	(aa2, cc04, cc02, cc02)
3215	FNMSUB	(aa2, cc03, cc01, cc01)
3216
3217	LDF	[BO +  0 * SIZE], a1
3218
3219	FMUL	a1, c02, c02
3220	FMUL	a1, c01, c01
3221#endif
3222
3223#ifdef LN
3224	add	C1, -2 * SIZE, C1
3225	add	C2, -2 * SIZE, C2
3226	add	C3, -2 * SIZE, C3
3227	add	C4, -2 * SIZE, C4
3228	add	C5, -2 * SIZE, C5
3229	add	C6, -2 * SIZE, C6
3230	add	C7, -2 * SIZE, C7
3231	add	C8, -2 * SIZE, C8
3232#endif
3233
3234#if defined(LN) || defined(LT)
3235	STF	c01, [BO +  0 * SIZE]
3236	STF	c03, [BO +  1 * SIZE]
3237	STF	c05, [BO +  2 * SIZE]
3238	STF	c07, [BO +  3 * SIZE]
3239
3240	STF	c09, [BO +  4 * SIZE]
3241	STF	c11, [BO +  5 * SIZE]
3242	STF	c13, [BO +  6 * SIZE]
3243	STF	c15, [BO +  7 * SIZE]
3244
3245	STF	c02, [BO +  8 * SIZE]
3246	STF	c04, [BO +  9 * SIZE]
3247	STF	c06, [BO + 10 * SIZE]
3248	STF	c08, [BO + 11 * SIZE]
3249
3250	STF	c10, [BO + 12 * SIZE]
3251	STF	c12, [BO + 13 * SIZE]
3252	STF	c14, [BO + 14 * SIZE]
3253	STF	c16, [BO + 15 * SIZE]
3254#else
3255	STF	c01, [AO +  0 * SIZE]
3256	STF	c02, [AO +  1 * SIZE]
3257	STF	c03, [AO +  2 * SIZE]
3258	STF	c04, [AO +  3 * SIZE]
3259
3260	STF	c05, [AO +  4 * SIZE]
3261	STF	c06, [AO +  5 * SIZE]
3262	STF	c07, [AO +  6 * SIZE]
3263	STF	c08, [AO +  7 * SIZE]
3264
3265	STF	c09, [AO +  8 * SIZE]
3266	STF	c10, [AO +  9 * SIZE]
3267	STF	c11, [AO + 10 * SIZE]
3268	STF	c12, [AO + 11 * SIZE]
3269
3270	STF	c13, [AO + 12 * SIZE]
3271	STF	c14, [AO + 13 * SIZE]
3272	STF	c15, [AO + 14 * SIZE]
3273	STF	c16, [AO + 15 * SIZE]
3274#endif
3275
3276	STF	c01, [C1 + 0 * SIZE]
3277	STF	c02, [C1 + 1 * SIZE]
3278	STF	c03, [C2 + 0 * SIZE]
3279	STF	c04, [C2 + 1 * SIZE]
3280
3281	STF	c05, [C3 + 0 * SIZE]
3282	STF	c06, [C3 + 1 * SIZE]
3283	STF	c07, [C4 + 0 * SIZE]
3284	STF	c08, [C4 + 1 * SIZE]
3285
3286	STF	c09, [C5 + 0 * SIZE]
3287	STF	c10, [C5 + 1 * SIZE]
3288	STF	c11, [C6 + 0 * SIZE]
3289	STF	c12, [C6 + 1 * SIZE]
3290
3291	STF	c13, [C7 + 0 * SIZE]
3292	STF	c14, [C7 + 1 * SIZE]
3293	STF	c15, [C8 + 0 * SIZE]
3294	STF	c16, [C8 + 1 * SIZE]
3295
3296#ifndef LN
3297	add	C1, 2 * SIZE, C1
3298	add	C2, 2 * SIZE, C2
3299	add	C3, 2 * SIZE, C3
3300	add	C4, 2 * SIZE, C4
3301	add	C5, 2 * SIZE, C5
3302	add	C6, 2 * SIZE, C6
3303	add	C7, 2 * SIZE, C7
3304	add	C8, 2 * SIZE, C8
3305#endif
3306
3307#ifdef RT
3308	sll	K, BASE_SHIFT + 1, TEMP1
3309	add	AORIG, TEMP1, AORIG
3310#endif
3311
3312#if defined(LT) || defined(RN)
3313	sub	K, KK, TEMP1
3314	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3315	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3316	add	AO, TEMP2, AO
3317	add	BO, TEMP1, BO
3318#endif
3319
3320#ifdef LT
3321	add	KK, 2, KK
3322#endif
3323
3324#ifdef LN
3325	sub	KK, 2, KK
3326#endif
3327
3328	add	I, -1, I
3329	cmp	I, 0
3330	bg,pt	%icc, .LL12
3331	nop
3332	.align 4
3333
/* .LL20: set-up for the leftover "M & 1" case.
 * Branches to .LL29 when the low bit of M is clear.  Otherwise it
 * positions AO/BO, preloads a1..a4 and b1..b9, clears the eight
 * accumulators cc01, cc03, ..., cc15 and computes the trip count L of
 * the unrolled loop at .LL23. */
3334.LL20:
3335	and	M, 1, I
3336	cmp	I, 0
3337	ble,pn	%icc, .LL29
3338	nop
3339
/* LT/RN: BO restarts at B (AO is left as it is).
 * Otherwise: AO = AORIG + (KK << BASE_SHIFT) and
 * BO = B + (KK << (BASE_SHIFT + 3)); for LN, AORIG is first moved back
 * by K << BASE_SHIFT. */
3340#if defined(LT) || defined(RN)
3341	mov	B, BO
3342#else
3343#ifdef LN
3344	sll	K,  BASE_SHIFT + 0, TEMP1
3345	sub	AORIG, TEMP1, AORIG
3346#endif
3347
3348	sll	KK, BASE_SHIFT + 0, TEMP1
3349	sll	KK, BASE_SHIFT + 3, TEMP2
3350
3351	add	AORIG, TEMP1, AO
3352	add	B,     TEMP2, BO
3353#endif
3354
/* Preload four consecutive AO elements: one per step of the unrolled loop. */
3355	LDF	[AO +  0 * SIZE], a1
3356	LDF	[AO +  1 * SIZE], a2
3357	LDF	[AO +  2 * SIZE], a3
3358	LDF	[AO +  3 * SIZE], a4
3359
/* Preload BO[0..7] into b1..b8, interleaved with accumulator clears.
 * FCLR is a macro defined outside this chunk; presumably it zeroes the
 * named register - TODO confirm. */
3360	LDF	[BO +  0 * SIZE], b1
3361	FCLR	(cc01)
3362	LDF	[BO +  1 * SIZE], b2
3363	FCLR	(cc03)
3364	LDF	[BO +  2 * SIZE], b3
3365	FCLR	(cc05)
3366	LDF	[BO +  3 * SIZE], b4
3367	FCLR	(cc07)
3368	LDF	[BO +  4 * SIZE], b5
3369	FCLR	(cc09)
3370	LDF	[BO +  5 * SIZE], b6
3371	FCLR	(cc11)
3372	LDF	[BO +  6 * SIZE], b7
3373	FCLR	(cc13)
3374	LDF	[BO +  7 * SIZE], b8
3375	FCLR	(cc15)
3376
/* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
3377#if defined(LT) || defined(RN)
3378	sra	KK, 2, L
3379#else
3380	sub	K, KK, L
3381	sra	L,  2, L
3382#endif
3383	cmp	L,  0
3384	ble,pn	%icc, .LL25
/* Delay slot of a non-annulled branch: b9 = BO[8] is loaded on both paths. */
3385	LDF	[BO +  8 * SIZE], b9
3386	.align 4
3387
/* .LL23: accumulation loop, unrolled over four consecutive AO elements
 * (a1..a4) and 32 consecutive BO elements per iteration.
 * Each group of eight FMADDs pairs one AO value with eight BO values and
 * accumulates into cc01, cc03, ..., cc15.  The LDF after each FMADD
 * refills the b register just consumed with the value needed by a later
 * group.  b1 and b9 alternate as the first BO operand of successive
 * groups.
 * FMADD is a macro defined outside this chunk; presumably
 * FMADD(a, b, c, d) computes d = a * b + c - TODO confirm.
 * L is the remaining iteration count and is decremented at the top. */
3388.LL23:
3389	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3390	add	L, -1, L
3391
/* Step 0: a1 with BO[0..7]; b1 is reloaded from BO[16] for step 2 and
 * b2..b8 from BO[9..15] for step 1. */
3392	FMADD	(aa1, bb1, cc01, cc01)
3393	LDF	[BO + 16 * SIZE], b1
3394	FMADD	(aa1, bb2, cc03, cc03)
3395	LDF	[BO +  9 * SIZE], b2
3396
3397	FMADD	(aa1, bb3, cc05, cc05)
3398	LDF	[BO + 10 * SIZE], b3
3399	FMADD	(aa1, bb4, cc07, cc07)
3400	LDF	[BO + 11 * SIZE], b4
3401
3402	FMADD	(aa1, bb5, cc09, cc09)
3403	LDF	[BO + 12 * SIZE], b5
3404	FMADD	(aa1, bb6, cc11, cc11)
3405	LDF	[BO + 13 * SIZE], b6
3406
3407	FMADD	(aa1, bb7, cc13, cc13)
3408	LDF	[BO + 14 * SIZE], b7
3409	FMADD	(aa1, bb8, cc15, cc15)
3410	LDF	[BO + 15 * SIZE], b8
3411
/* Step 1: a2 with BO[8..15] (b9 holds BO[8]); b9 is reloaded from BO[24]
 * for step 3 and b2..b8 from BO[17..23] for step 2. */
3412	FMADD	(aa2, bb9, cc01, cc01)
3413	LDF	[BO + 24 * SIZE], b9
3414	FMADD	(aa2, bb2, cc03, cc03)
3415	LDF	[BO + 17 * SIZE], b2
3416
3417	FMADD	(aa2, bb3, cc05, cc05)
3418	LDF	[BO + 18 * SIZE], b3
3419	FMADD	(aa2, bb4, cc07, cc07)
3420	LDF	[BO + 19 * SIZE], b4
3421
3422	FMADD	(aa2, bb5, cc09, cc09)
3423	LDF	[BO + 20 * SIZE], b5
3424	FMADD	(aa2, bb6, cc11, cc11)
3425	LDF	[BO + 21 * SIZE], b6
3426
3427	FMADD	(aa2, bb7, cc13, cc13)
3428	LDF	[BO + 22 * SIZE], b7
3429	FMADD	(aa2, bb8, cc15, cc15)
3430	LDF	[BO + 23 * SIZE], b8
3431
/* a1/a2 are no longer needed in this iteration: refill them from AO[4..5]
 * for the next iteration (or for the tail loop at .LL27). */
3432	LDF	[AO +  4 * SIZE], a1
3433	LDF	[AO +  5 * SIZE], a2
3434
/* Step 2: a3 with BO[16..23]; b1 is reloaded from BO[32], which is BO[0]
 * after the pointer bump at the bottom of the loop. */
3435	FMADD	(aa3, bb1, cc01, cc01)
3436	LDF	[BO + 32 * SIZE], b1
3437	FMADD	(aa3, bb2, cc03, cc03)
3438	LDF	[BO + 25 * SIZE], b2
3439
3440	FMADD	(aa3, bb3, cc05, cc05)
3441	LDF	[BO + 26 * SIZE], b3
3442	FMADD	(aa3, bb4, cc07, cc07)
3443	LDF	[BO + 27 * SIZE], b4
3444
3445	FMADD	(aa3, bb5, cc09, cc09)
3446	LDF	[BO + 28 * SIZE], b5
3447	FMADD	(aa3, bb6, cc11, cc11)
3448	LDF	[BO + 29 * SIZE], b6
3449
3450	FMADD	(aa3, bb7, cc13, cc13)
3451	LDF	[BO + 30 * SIZE], b7
3452	FMADD	(aa3, bb8, cc15, cc15)
3453	LDF	[BO + 31 * SIZE], b8
3454
/* Step 3: a4 with BO[24..31]; the reloads fetch BO[33..40], i.e. the
 * values b2..b8 and b9 must hold at the top of the next iteration. */
3455	FMADD	(aa4, bb9, cc01, cc01)
3456	LDF	[BO + 40 * SIZE], b9
3457	FMADD	(aa4, bb2, cc03, cc03)
3458	LDF	[BO + 33 * SIZE], b2
3459
3460	FMADD	(aa4, bb3, cc05, cc05)
3461	LDF	[BO + 34 * SIZE], b3
3462	FMADD	(aa4, bb4, cc07, cc07)
3463	LDF	[BO + 35 * SIZE], b4
3464
3465	FMADD	(aa4, bb5, cc09, cc09)
3466	LDF	[BO + 36 * SIZE], b5
3467	FMADD	(aa4, bb6, cc11, cc11)
3468	LDF	[BO + 37 * SIZE], b6
3469
3470	FMADD	(aa4, bb7, cc13, cc13)
3471	LDF	[BO + 38 * SIZE], b7
3472	FMADD	(aa4, bb8, cc15, cc15)
3473	LDF	[BO + 39 * SIZE], b8
3474
3475	LDF	[AO +  6 * SIZE], a3
3476	LDF	[AO +  7 * SIZE], a4
3477
/* Advance AO by 4 elements; BO advances by 32 elements in the branch
 * delay slot, so the bump happens whether or not the loop repeats. */
3478	add	AO,  4 * SIZE, AO
3479	cmp	L, 0
3480	bg,pt	%icc, .LL23
3481	add	BO, 32 * SIZE, BO
3482	.align 4
3483
/* .LL25: compute the number of steps left over from the 4x unrolled loop.
 * L = KK & 3 for LT/RN, (K - KK) & 3 otherwise.  When L <= 0 the
 * single-step loop at .LL27 is skipped and control goes to .LL28. */
3484.LL25:
3485#if defined(LT) || defined(RN)
3486	and	KK, 3, L
3487#else
3488	sub	K, KK, L
3489	and	L,  3, L
3490#endif
3491	cmp	L,  0
/* Annulled branch: the delay-slot instruction (a nop here) is executed
 * only when the branch is taken. */
3492	ble,a,pn %icc, .LL28
3493	nop
3494	.align 4
3495
/* .LL27: tail accumulation loop, one step per iteration.
 * Uses a1 and b1..b8 (already loaded on entry), accumulates into cc01,
 * cc03, ..., cc15, and reloads b1..b8 from BO[8..15] and a1 from AO[1]
 * for the next step.  L counts the remaining steps.
 * FMADD is a macro defined outside this chunk; presumably
 * FMADD(a, b, c, d) computes d = a * b + c - TODO confirm. */
3496.LL27:
3497	FMADD	(aa1, bb1, cc01, cc01)
3498	LDF	[BO +  8 * SIZE], b1
3499	FMADD	(aa1, bb2, cc03, cc03)
3500	LDF	[BO +  9 * SIZE], b2
3501
3502	FMADD	(aa1, bb3, cc05, cc05)
3503	LDF	[BO + 10 * SIZE], b3
3504	FMADD	(aa1, bb4, cc07, cc07)
3505	LDF	[BO + 11 * SIZE], b4
3506
3507	FMADD	(aa1, bb5, cc09, cc09)
3508	LDF	[BO + 12 * SIZE], b5
3509	FMADD	(aa1, bb6, cc11, cc11)
3510	LDF	[BO + 13 * SIZE], b6
3511
3512	FMADD	(aa1, bb7, cc13, cc13)
3513	LDF	[BO + 14 * SIZE], b7
3514	FMADD	(aa1, bb8, cc15, cc15)
3515	LDF	[BO + 15 * SIZE], b8
3516
/* Next AO element is fetched before AO itself is advanced by one. */
3517	LDF	[AO +  1 * SIZE], a1
3518	add	AO, 1 * SIZE, AO
3519
3520	add	L, -1, L
3521	cmp	L, 0
3522	bg,pt	%icc, .LL27
/* Delay slot: BO advances by 8 elements on every pass, taken or not. */
3523	add	BO, 8 * SIZE, BO
3524	.align 4
3525
/* .LL28: post-processing of the eight accumulators c01, c03, ..., c15.
 * Sequence:
 *   1. (LN/RT) re-position AO and BO from AORIG/B and KK;
 *   2. load eight stored values (from BO for LN/LT, from AO for RN/RT)
 *      and combine them with the accumulators via FSUB;
 *   3. apply the variant-specific scaling / substitution;
 *   4. store the results back to the packed buffer (BO or AO) and
 *      through the output pointers C1..C8;
 *   5. update AORIG / AO / BO / KK for the next block.
 * The cNN and ccNN spellings (and aN/aaN, bN/bbN) are defined outside
 * this chunk; presumably they name the same registers in the two operand
 * notations used by plain instructions and by the FMADD/FNMSUB macros -
 * TODO confirm. */
3526.LL28:
/* LN/RT: TEMP1 = KK - 1 (LN) or KK - 8 (RT);
 * AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (BASE_SHIFT + 3)). */
3527#if defined(LN) || defined(RT)
3528#ifdef LN
3529	sub	KK, 1, TEMP1
3530#else
3531	sub	KK, 8, TEMP1
3532#endif
3533	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3534	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3535
3536	add	AORIG, TEMP2, AO
3537	add	B,     TEMP1, BO
3538#endif
3539
/* Load the eight stored values and subtract.  FSUB is a macro defined
 * outside this chunk; if it maps to the SPARC fsub form (rd = rs1 - rs2)
 * then each line computes cNN = loaded - cNN - TODO confirm.
 * LN/LT read from BO[0..7]; RN/RT read from AO[0..7]. */
3540#if defined(LN) || defined(LT)
3541	LDF	[BO +  0 * SIZE], a1
3542	LDF	[BO +  1 * SIZE], a2
3543	LDF	[BO +  2 * SIZE], a3
3544	LDF	[BO +  3 * SIZE], a4
3545
3546	LDF	[BO +  4 * SIZE], b1
3547	LDF	[BO +  5 * SIZE], b2
3548	LDF	[BO +  6 * SIZE], b3
3549	LDF	[BO +  7 * SIZE], b4
3550
3551	FSUB	a1, c01, c01
3552	FSUB	a2, c03, c03
3553	FSUB	a3, c05, c05
3554	FSUB	a4, c07, c07
3555
3556	FSUB	b1, c09, c09
3557	FSUB	b2, c11, c11
3558	FSUB	b3, c13, c13
3559	FSUB	b4, c15, c15
3560#else
3561	LDF	[AO +  0 * SIZE], a1
3562	LDF	[AO +  1 * SIZE], a2
3563	LDF	[AO +  2 * SIZE], a3
3564	LDF	[AO +  3 * SIZE], a4
3565
3566	LDF	[AO +  4 * SIZE], b1
3567	LDF	[AO +  5 * SIZE], b2
3568	LDF	[AO +  6 * SIZE], b3
3569	LDF	[AO +  7 * SIZE], b4
3570
3571	FSUB	a1, c01, c01
3572	FSUB	a2, c03, c03
3573	FSUB	a3, c05, c05
3574	FSUB	a4, c07, c07
3575
3576	FSUB	b1, c09, c09
3577	FSUB	b2, c11, c11
3578	FSUB	b3, c13, c13
3579	FSUB	b4, c15, c15
3580#endif
3581
/* LN/LT: every accumulator is scaled by the single element AO[0].
 * The scaling is a multiply, so AO[0] is presumably stored as a
 * reciprocal by the packing code - TODO confirm. */
3582#if defined(LN) || defined(LT)
3583	LDF	[AO +  0 * SIZE], a1
3584
3585	FMUL	a1, c01, c01
3586	FMUL	a1, c03, c03
3587	FMUL	a1, c05, c05
3588	FMUL	a1, c07, c07
3589	FMUL	a1, c09, c09
3590	FMUL	a1, c11, c11
3591	FMUL	a1, c13, c13
3592	FMUL	a1, c15, c15
3593#endif
3594
/* RN: forward pass over an 8x8 coefficient block in BO with stride 8.
 * For j = 0..7: scale the j-th accumulator by BO[9*j], then update each
 * later accumulator with BO[9*j + i] times the freshly scaled value.
 * FNMSUB is a macro defined outside this chunk; presumably
 * FNMSUB(a, b, c, d) computes d = c - a * b - TODO confirm. */
3595#ifdef RN
3596	LDF	[BO +  0 * SIZE], a1
3597	LDF	[BO +  1 * SIZE], a2
3598	LDF	[BO +  2 * SIZE], a3
3599	LDF	[BO +  3 * SIZE], a4
3600	LDF	[BO +  4 * SIZE], b1
3601	LDF	[BO +  5 * SIZE], b2
3602	LDF	[BO +  6 * SIZE], b3
3603	LDF	[BO +  7 * SIZE], b4
3604
3605	FMUL	a1, c01, c01
3606
3607	FNMSUB	(aa2, cc01, cc03, cc03)
3608	FNMSUB	(aa3, cc01, cc05, cc05)
3609	FNMSUB	(aa4, cc01, cc07, cc07)
3610	FNMSUB	(bb1, cc01, cc09, cc09)
3611	FNMSUB	(bb2, cc01, cc11, cc11)
3612	FNMSUB	(bb3, cc01, cc13, cc13)
3613	FNMSUB	(bb4, cc01, cc15, cc15)
3614
3615	LDF	[BO +  9 * SIZE], a1
3616	LDF	[BO + 10 * SIZE], a2
3617	LDF	[BO + 11 * SIZE], a3
3618	LDF	[BO + 12 * SIZE], a4
3619	LDF	[BO + 13 * SIZE], b1
3620	LDF	[BO + 14 * SIZE], b2
3621	LDF	[BO + 15 * SIZE], b3
3622
3623	FMUL	a1, c03, c03
3624
3625	FNMSUB	(aa2, cc03, cc05, cc05)
3626	FNMSUB	(aa3, cc03, cc07, cc07)
3627	FNMSUB	(aa4, cc03, cc09, cc09)
3628	FNMSUB	(bb1, cc03, cc11, cc11)
3629	FNMSUB	(bb2, cc03, cc13, cc13)
3630	FNMSUB	(bb3, cc03, cc15, cc15)
3631
3632	LDF	[BO + 18 * SIZE], a1
3633	LDF	[BO + 19 * SIZE], a2
3634	LDF	[BO + 20 * SIZE], a3
3635	LDF	[BO + 21 * SIZE], a4
3636	LDF	[BO + 22 * SIZE], b1
3637	LDF	[BO + 23 * SIZE], b2
3638
3639	FMUL	a1, c05, c05
3640
3641	FNMSUB	(aa2, cc05, cc07, cc07)
3642	FNMSUB	(aa3, cc05, cc09, cc09)
3643	FNMSUB	(aa4, cc05, cc11, cc11)
3644	FNMSUB	(bb1, cc05, cc13, cc13)
3645	FNMSUB	(bb2, cc05, cc15, cc15)
3646
3647	LDF	[BO + 27 * SIZE], a1
3648	LDF	[BO + 28 * SIZE], a2
3649	LDF	[BO + 29 * SIZE], a3
3650	LDF	[BO + 30 * SIZE], a4
3651	LDF	[BO + 31 * SIZE], b1
3652
3653	FMUL	a1, c07, c07
3654
3655	FNMSUB	(aa2, cc07, cc09, cc09)
3656	FNMSUB	(aa3, cc07, cc11, cc11)
3657	FNMSUB	(aa4, cc07, cc13, cc13)
3658	FNMSUB	(bb1, cc07, cc15, cc15)
3659
3660	LDF	[BO + 36 * SIZE], a1
3661	LDF	[BO + 37 * SIZE], a2
3662	LDF	[BO + 38 * SIZE], a3
3663	LDF	[BO + 39 * SIZE], a4
3664
3665	FMUL	a1, c09, c09
3666
3667	FNMSUB	(aa2, cc09, cc11, cc11)
3668	FNMSUB	(aa3, cc09, cc13, cc13)
3669	FNMSUB	(aa4, cc09, cc15, cc15)
3670
3671	LDF	[BO + 45 * SIZE], a1
3672	LDF	[BO + 46 * SIZE], a2
3673	LDF	[BO + 47 * SIZE], a3
3674
3675	FMUL	a1, c11, c11
3676
3677	FNMSUB	(aa2, cc11, cc13, cc13)
3678	FNMSUB	(aa3, cc11, cc15, cc15)
3679
3680	LDF	[BO + 54 * SIZE], a1
3681	LDF	[BO + 55 * SIZE], a2
3682
3683	FMUL	a1, c13, c13
3684
3685	FNMSUB	(aa2, cc13, cc15, cc15)
3686
3687	LDF	[BO + 63 * SIZE], a1
3688
3689	FMUL	a1, c15, c15
3690#endif
3691
/* RT: the mirror image of the RN pass.  Starts at BO[63] and walks the
 * same stride-8 block backwards: for j = 7..0 scale the j-th accumulator
 * by BO[9*j], then update each earlier accumulator i < j using
 * BO[8*j + i].  FNMSUB is a macro defined outside this chunk; presumably
 * FNMSUB(a, b, c, d) computes d = c - a * b - TODO confirm. */
3692#ifdef RT
3693	LDF	[BO + 63 * SIZE], a1
3694	LDF	[BO + 62 * SIZE], a2
3695	LDF	[BO + 61 * SIZE], a3
3696	LDF	[BO + 60 * SIZE], a4
3697	LDF	[BO + 59 * SIZE], b1
3698	LDF	[BO + 58 * SIZE], b2
3699	LDF	[BO + 57 * SIZE], b3
3700	LDF	[BO + 56 * SIZE], b4
3701
3702	FMUL	a1, c15, c15
3703
3704	FNMSUB	(aa2, cc15, cc13, cc13)
3705	FNMSUB	(aa3, cc15, cc11, cc11)
3706	FNMSUB	(aa4, cc15, cc09, cc09)
3707	FNMSUB	(bb1, cc15, cc07, cc07)
3708	FNMSUB	(bb2, cc15, cc05, cc05)
3709	FNMSUB	(bb3, cc15, cc03, cc03)
3710	FNMSUB	(bb4, cc15, cc01, cc01)
3711
3712	LDF	[BO + 54 * SIZE], a1
3713	LDF	[BO + 53 * SIZE], a2
3714	LDF	[BO + 52 * SIZE], a3
3715	LDF	[BO + 51 * SIZE], a4
3716	LDF	[BO + 50 * SIZE], b1
3717	LDF	[BO + 49 * SIZE], b2
3718	LDF	[BO + 48 * SIZE], b3
3719
3720	FMUL	a1, c13, c13
3721
3722	FNMSUB	(aa2, cc13, cc11, cc11)
3723	FNMSUB	(aa3, cc13, cc09, cc09)
3724	FNMSUB	(aa4, cc13, cc07, cc07)
3725	FNMSUB	(bb1, cc13, cc05, cc05)
3726	FNMSUB	(bb2, cc13, cc03, cc03)
3727	FNMSUB	(bb3, cc13, cc01, cc01)
3728
3729	LDF	[BO + 45 * SIZE], a1
3730	LDF	[BO + 44 * SIZE], a2
3731	LDF	[BO + 43 * SIZE], a3
3732	LDF	[BO + 42 * SIZE], a4
3733	LDF	[BO + 41 * SIZE], b1
3734	LDF	[BO + 40 * SIZE], b2
3735
3736	FMUL	a1, c11, c11
3737
3738	FNMSUB	(aa2, cc11, cc09, cc09)
3739	FNMSUB	(aa3, cc11, cc07, cc07)
3740	FNMSUB	(aa4, cc11, cc05, cc05)
3741	FNMSUB	(bb1, cc11, cc03, cc03)
3742	FNMSUB	(bb2, cc11, cc01, cc01)
3743
3744	LDF	[BO + 36 * SIZE], a1
3745	LDF	[BO + 35 * SIZE], a2
3746	LDF	[BO + 34 * SIZE], a3
3747	LDF	[BO + 33 * SIZE], a4
3748	LDF	[BO + 32 * SIZE], b1
3749
3750	FMUL	a1, c09, c09
3751
3752	FNMSUB	(aa2, cc09, cc07, cc07)
3753	FNMSUB	(aa3, cc09, cc05, cc05)
3754	FNMSUB	(aa4, cc09, cc03, cc03)
3755	FNMSUB	(bb1, cc09, cc01, cc01)
3756
3757	LDF	[BO + 27 * SIZE], a1
3758	LDF	[BO + 26 * SIZE], a2
3759	LDF	[BO + 25 * SIZE], a3
3760	LDF	[BO + 24 * SIZE], a4
3761
3762	FMUL	a1, c07, c07
3763
3764	FNMSUB	(aa2, cc07, cc05, cc05)
3765	FNMSUB	(aa3, cc07, cc03, cc03)
3766	FNMSUB	(aa4, cc07, cc01, cc01)
3767
3768	LDF	[BO + 18 * SIZE], a1
3769	LDF	[BO + 17 * SIZE], a2
3770	LDF	[BO + 16 * SIZE], a3
3771
3772	FMUL	a1, c05, c05
3773
3774	FNMSUB	(aa2, cc05, cc03, cc03)
3775	FNMSUB	(aa3, cc05, cc01, cc01)
3776
3777	LDF	[BO +  9 * SIZE], a1
3778	LDF	[BO +  8 * SIZE], a2
3779
3780	FMUL	a1, c03, c03
3781
3782	FNMSUB	(aa2, cc03, cc01, cc01)
3783
3784	LDF	[BO +  0 * SIZE], a1
3785
3786	FMUL	a1, c01, c01
3787#endif
3788
/* LN: step each output pointer back by one element before storing. */
3789#ifdef LN
3790	add	C1, -1 * SIZE, C1
3791	add	C2, -1 * SIZE, C2
3792	add	C3, -1 * SIZE, C3
3793	add	C4, -1 * SIZE, C4
3794	add	C5, -1 * SIZE, C5
3795	add	C6, -1 * SIZE, C6
3796	add	C7, -1 * SIZE, C7
3797	add	C8, -1 * SIZE, C8
3798#endif
3799
/* Write the results back over the packed values that were read above:
 * BO[0..7] for LN/LT, AO[0..7] for RN/RT. */
3800#if defined(LN) || defined(LT)
3801	STF	c01, [BO +  0 * SIZE]
3802	STF	c03, [BO +  1 * SIZE]
3803	STF	c05, [BO +  2 * SIZE]
3804	STF	c07, [BO +  3 * SIZE]
3805
3806	STF	c09, [BO +  4 * SIZE]
3807	STF	c11, [BO +  5 * SIZE]
3808	STF	c13, [BO +  6 * SIZE]
3809	STF	c15, [BO +  7 * SIZE]
3810#else
3811	STF	c01, [AO +  0 * SIZE]
3812	STF	c03, [AO +  1 * SIZE]
3813	STF	c05, [AO +  2 * SIZE]
3814	STF	c07, [AO +  3 * SIZE]
3815
3816	STF	c09, [AO +  4 * SIZE]
3817	STF	c11, [AO +  5 * SIZE]
3818	STF	c13, [AO +  6 * SIZE]
3819	STF	c15, [AO +  7 * SIZE]
3820#endif
3821
/* One result goes out through each of C1..C8.  No post-increment of
 * C1..C8 follows in this block. */
3822	STF	c01, [C1 + 0 * SIZE]
3823	STF	c03, [C2 + 0 * SIZE]
3824	STF	c05, [C3 + 0 * SIZE]
3825	STF	c07, [C4 + 0 * SIZE]
3826
3827	STF	c09, [C5 + 0 * SIZE]
3828	STF	c11, [C6 + 0 * SIZE]
3829	STF	c13, [C7 + 0 * SIZE]
3830	STF	c15, [C8 + 0 * SIZE]
3831
/* RT: AORIG moves forward by K << BASE_SHIFT. */
3832#ifdef RT
3833	sll	K, BASE_SHIFT + 0, TEMP1
3834	add	AORIG, TEMP1, AORIG
3835#endif
3836
/* LT/RN: skip the remaining (K - KK) steps in both packed buffers:
 * AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (BASE_SHIFT + 3). */
3837#if defined(LT) || defined(RN)
3838	sub	K, KK, TEMP1
3839	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3840	sll	TEMP1, BASE_SHIFT + 3, TEMP1
3841	add	AO, TEMP2, AO
3842	add	BO, TEMP1, BO
3843#endif
3844
/* KK moves by one: up for LT, down for LN. */
3845#ifdef LT
3846	add	KK, 1, KK
3847#endif
3848
3849#ifdef LN
3850	sub	KK, 1, KK
3851#endif
3852	.align 4
3853
/* .LL29: end of one J iteration.
 * Updates B and KK according to the variant, decrements J and loops back
 * to .LL11 (defined outside this chunk) while J > 0; otherwise falls
 * through to the exit path at .LL999. */
3854.LL29:
/* LN: B advances by K << (BASE_SHIFT + 3). */
3855#ifdef LN
3856	sll	K, BASE_SHIFT + 3, TEMP1
3857	add	B, TEMP1, B
3858#endif
3859
/* LT/RN: B takes over the current value of BO. */
3860#if defined(LT) || defined(RN)
3861	mov	BO, B
3862#endif
3863
/* KK moves by 8: up for RN, down for RT. */
3864#ifdef RN
3865	add	KK, 8, KK
3866#endif
3867
3868#ifdef RT
3869	sub	KK, 8, KK
3870#endif
3871
3872	add	J, -1, J
3873	cmp	J, 0
3874	bg,pt	%icc, .LL11
3875	nop
3876	.align 4
3877
/* .LL999: common exit path.
 * With TRMMKERNEL defined, %g1..%g4 are reloaded from stack slots at
 * STACK_START (32-bit words at +8..+20 when __64BIT__ is not defined,
 * 64-bit words at +32..+56 otherwise); presumably these were saved by
 * the prologue outside this chunk - TODO confirm.
 * The routine then returns to %i7 + 8 with 0 in the result register. */
3878.LL999:
3879#ifdef TRMMKERNEL
3880#ifndef __64BIT__
3881	ld	[%sp + STACK_START +  8], %g1
3882	ld	[%sp + STACK_START + 12], %g2
3883	ld	[%sp + STACK_START + 16], %g3
3884	ld	[%sp + STACK_START + 20], %g4
3885#else
3886	ldx	[%sp + STACK_START + 32], %g1
3887	ldx	[%sp + STACK_START + 40], %g2
3888	ldx	[%sp + STACK_START + 48], %g3
3889	ldx	[%sp + STACK_START + 56], %g4
3890#endif
3891#endif
3892
/* SPARC V9 "return" restores the caller's register window before the
 * delay slot executes, so the clr writes the caller-visible %o0. */
3893	return	%i7 + 8
3894	clr	%o0
3895
/* EPILOGUE is a macro defined outside this chunk. */
3896	EPILOGUE
3897