1/*********************************************************************/
2/* Copyright 2005-2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define APREFETCHSIZE 24
43#define APREFETCH_CATEGORY 0
44
45#define M	%i0
46#define N	%i1
47#define K	%i2
48
49#if defined(DOUBLE) && !defined(__64BIT__)
50#define A	%i5
51#define B	%i4
52#else
53#define A	%i4
54#define B	%i5
55#endif
56
57#define C	%o4
58#define LDC	%o5
59
60#define AO	%l0
61#define BO	%l1
62#define I	%l2
63#define J	%l3
64#define L	%l4
65
66#define C1	%o0
67#define C2	%o1
68#define C3	%o2
69#define C4	%o3
70
71#define C5	%l5
72#define	C6	%l6
73#define C7	%l7
74#define C8	%i3
75
76#define OFFSET	%g1
77#define	KK	%g2
78#define TEMP1	%g3
79#define TEMP2	%g4
80#define AORIG	%o7
81
82#ifdef DOUBLE
83#define c01	%f0
84#define c02	%f2
85#define c03	%f4
86#define c04	%f6
87#define c05	%f8
88#define c06	%f10
89#define c07	%f12
90#define c08	%f14
91#define c09	%f16
92#define c10	%f18
93#define c11	%f20
94#define c12	%f22
95#define c13	%f24
96#define c14	%f26
97#define c15	%f28
98#define c16	%f30
99
100#define a1	%f32
101#define a2	%f34
102#define a3	%f36
103#define a4	%f38
104#define a5	%f40
105
106#define b1	%f42
107#define b2	%f44
108#define b3	%f46
109#define b4	%f48
110#define b5	%f50
111#define b6	%f52
112#define b7	%f54
113#define b8	%f56
114#define b9	%f58
115
116#define cc01	0
117#define cc02	2
118#define cc03	4
119#define cc04	6
120#define cc05	8
121#define cc06	10
122#define cc07	12
123#define cc08	14
124#define cc09	16
125#define cc10	18
126#define cc11	20
127#define cc12	22
128#define cc13	24
129#define cc14	26
130#define cc15	28
131#define cc16	30
132
133#define aa1	 1
134#define aa2	 3
135#define aa3	 5
136#define aa4	 7
137#define aa5	 9
138
139#define bb1	11
140#define bb2	13
141#define bb3	15
142#define bb4	17
143#define bb5	19
144#define bb6	21
145#define bb7	23
146#define bb8	25
147#define bb9	27
148
149#else
150#define c01	%f0
151#define c02	%f1
152#define c03	%f2
153#define c04	%f3
154#define c05	%f4
155#define c06	%f5
156#define c07	%f6
157#define c08	%f7
158#define c09	%f8
159#define c10	%f9
160#define c11	%f10
161#define c12	%f11
162#define c13	%f12
163#define c14	%f13
164#define c15	%f14
165#define c16	%f15
166
167#define a1	%f16
168#define a2	%f17
169#define a3	%f18
170#define a4	%f19
171#define a5	%f20
172
173#define b1	%f21
174#define b2	%f22
175#define b3	%f23
176#define b4	%f24
177#define b5	%f25
178#define b6	%f26
179#define b7	%f27
180#define b8	%f28
181#define b9	%f29
182
183#define cc01	0
184#define cc02	1
185#define cc03	2
186#define cc04	3
187#define cc05	4
188#define cc06	5
189#define cc07	6
190#define cc08	7
191#define cc09	8
192#define cc10	9
193#define cc11	10
194#define cc12	11
195#define cc13	12
196#define cc14	13
197#define cc15	14
198#define cc16	15
199
200#define aa1	16
201#define aa2	17
202#define aa3	18
203#define aa4	19
204#define aa5	20
205
206#define bb1	21
207#define bb2	22
208#define bb3	23
209#define bb4	24
210#define bb5	25
211#define bb6	26
212#define bb7	27
213#define bb8	28
214#define bb9	29
215
216#endif
217
218        .register %g2, #scratch
219        .register %g3, #scratch
220
221	PROLOGUE
222	SAVESP
223	nop
224
225#ifndef __64BIT__
226
227#ifdef DOUBLE
228	ld	[%sp + STACK_START + 28], B
229	ld	[%sp + STACK_START + 32], C
230	ld	[%sp + STACK_START + 36], LDC
231	ld	[%sp + STACK_START + 40], OFFSET
232#else
233	ld	[%sp + STACK_START + 28], C
234	ld	[%sp + STACK_START + 32], LDC
235	ld	[%sp + STACK_START + 36], OFFSET
236#endif
237
238	st	%g1, [%sp + STACK_START +  8]
239	st	%g2, [%sp + STACK_START + 12]
240	st	%g3, [%sp + STACK_START + 16]
241	st	%g4, [%sp + STACK_START + 20]
242#else
243
244	ldx	[%sp+  STACK_START + 56], C
245	ldx	[%sp+  STACK_START + 64], LDC
246	ldx	[%sp+  STACK_START + 72], OFFSET
247
248	stx	%g1, [%sp + STACK_START + 32]
249	stx	%g2, [%sp + STACK_START + 40]
250	stx	%g3, [%sp + STACK_START + 48]
251	stx	%g4, [%sp + STACK_START + 56]
252#endif
253
254#if defined(TRMMKERNEL) && !defined(LEFT)
255	neg	OFFSET, KK
256#endif
257
258	sll	LDC, BASE_SHIFT, LDC
259
260#ifdef LN
261	smul	M, K, TEMP1
262	sll	TEMP1, BASE_SHIFT, TEMP1
263	add	A, TEMP1, A
264
265	sll	M, BASE_SHIFT, TEMP1
266	add	C, TEMP1, C
267#endif
268
269#ifdef RN
270	neg	OFFSET, KK
271#endif
272
273#ifdef RT
274	smul	N, K, TEMP1
275	sll	TEMP1, BASE_SHIFT, TEMP1
276	add	B, TEMP1, B
277
278	smul	N, LDC, TEMP1
279	add	C, TEMP1, C
280
281	sub	N, OFFSET, KK
282#endif
283
284	sra	N, 3, J
285	cmp	J, 0
286	ble,pn	%icc, .LL30
287	nop
288	.align 4
289
290.LL11:
291#ifdef RT
292	sll	K, BASE_SHIFT + 3, TEMP1
293	sub	B, TEMP1, B
294#endif
295
296#ifndef RT
297	mov	C,  C1
298	add	C,  LDC, C2
299	add	C2, LDC, C3
300	add	C3, LDC, C4
301	add	C4, LDC, C5
302	add	C5, LDC, C6
303	add	C6, LDC, C7
304	add	C7, LDC, C8
305	add	C8, LDC, C
306#else
307	sub	C,  LDC, C8
308	sub	C8, LDC, C7
309	sub	C7, LDC, C6
310	sub	C6, LDC, C5
311	sub	C5, LDC, C4
312	sub	C4, LDC, C3
313	sub	C3, LDC, C2
314	sub	C2, LDC, C1
315	sub	C2, LDC, C
316#endif
317
318#ifdef LN
319	add	M, OFFSET, KK
320#endif
321
322#ifdef LT
323	mov	OFFSET, KK
324#endif
325
326#if defined(LN) || defined(RT)
327	mov	A, AORIG
328#else
329	mov	A, AO
330#endif
331
332	and	M, 1, I
333	cmp	I, 0
334	ble,pn	%icc, .LL20
335	nop
336
337#if defined(LT) || defined(RN)
338	mov	B, BO
339#else
340#ifdef LN
341	sll	K,  BASE_SHIFT + 0, TEMP1
342	sub	AORIG, TEMP1, AORIG
343#endif
344
345	sll	KK, BASE_SHIFT + 0, TEMP1
346	sll	KK, BASE_SHIFT + 3, TEMP2
347
348	add	AORIG, TEMP1, AO
349	add	B,     TEMP2, BO
350#endif
351
352	LDF	[AO +  0 * SIZE], a1
353	LDF	[AO +  1 * SIZE], a2
354	LDF	[AO +  2 * SIZE], a3
355	LDF	[AO +  3 * SIZE], a4
356
357	LDF	[BO +  0 * SIZE], b1
358	FCLR	(cc01)
359	LDF	[BO +  1 * SIZE], b2
360	FCLR	(cc03)
361	LDF	[BO +  2 * SIZE], b3
362	FCLR	(cc05)
363	LDF	[BO +  3 * SIZE], b4
364	FCLR	(cc07)
365	LDF	[BO +  4 * SIZE], b5
366	FCLR	(cc09)
367	LDF	[BO +  5 * SIZE], b6
368	FCLR	(cc11)
369	LDF	[BO +  6 * SIZE], b7
370	FCLR	(cc13)
371	LDF	[BO +  7 * SIZE], b8
372	FCLR	(cc15)
373
374#if defined(LT) || defined(RN)
375	sra	KK, 2, L
376#else
377	sub	K, KK, L
378	sra	L,  2, L
379#endif
380	cmp	L,  0
381	ble,pn	%icc, .LL25
382	LDF	[BO +  8 * SIZE], b9
383	.align 4
384
385.LL23:
386	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
387	add	L, -1, L
388
389	FMADD	(aa1, bb1, cc01, cc01)
390	LDF	[BO + 16 * SIZE], b1
391	FMADD	(aa1, bb2, cc03, cc03)
392	LDF	[BO +  9 * SIZE], b2
393
394	FMADD	(aa1, bb3, cc05, cc05)
395	LDF	[BO + 10 * SIZE], b3
396	FMADD	(aa1, bb4, cc07, cc07)
397	LDF	[BO + 11 * SIZE], b4
398
399	FMADD	(aa1, bb5, cc09, cc09)
400	LDF	[BO + 12 * SIZE], b5
401	FMADD	(aa1, bb6, cc11, cc11)
402	LDF	[BO + 13 * SIZE], b6
403
404	FMADD	(aa1, bb7, cc13, cc13)
405	LDF	[BO + 14 * SIZE], b7
406	FMADD	(aa1, bb8, cc15, cc15)
407	LDF	[BO + 15 * SIZE], b8
408
409	FMADD	(aa2, bb9, cc01, cc01)
410	LDF	[BO + 24 * SIZE], b9
411	FMADD	(aa2, bb2, cc03, cc03)
412	LDF	[BO + 17 * SIZE], b2
413
414	FMADD	(aa2, bb3, cc05, cc05)
415	LDF	[BO + 18 * SIZE], b3
416	FMADD	(aa2, bb4, cc07, cc07)
417	LDF	[BO + 19 * SIZE], b4
418
419	FMADD	(aa2, bb5, cc09, cc09)
420	LDF	[BO + 20 * SIZE], b5
421	FMADD	(aa2, bb6, cc11, cc11)
422	LDF	[BO + 21 * SIZE], b6
423
424	FMADD	(aa2, bb7, cc13, cc13)
425	LDF	[BO + 22 * SIZE], b7
426	FMADD	(aa2, bb8, cc15, cc15)
427	LDF	[BO + 23 * SIZE], b8
428
429	LDF	[AO +  4 * SIZE], a1
430	LDF	[AO +  5 * SIZE], a2
431
432	FMADD	(aa3, bb1, cc01, cc01)
433	LDF	[BO + 32 * SIZE], b1
434	FMADD	(aa3, bb2, cc03, cc03)
435	LDF	[BO + 25 * SIZE], b2
436
437	FMADD	(aa3, bb3, cc05, cc05)
438	LDF	[BO + 26 * SIZE], b3
439	FMADD	(aa3, bb4, cc07, cc07)
440	LDF	[BO + 27 * SIZE], b4
441
442	FMADD	(aa3, bb5, cc09, cc09)
443	LDF	[BO + 28 * SIZE], b5
444	FMADD	(aa3, bb6, cc11, cc11)
445	LDF	[BO + 29 * SIZE], b6
446
447	FMADD	(aa3, bb7, cc13, cc13)
448	LDF	[BO + 30 * SIZE], b7
449	FMADD	(aa3, bb8, cc15, cc15)
450	LDF	[BO + 31 * SIZE], b8
451
452	FMADD	(aa4, bb9, cc01, cc01)
453	LDF	[BO + 40 * SIZE], b9
454	FMADD	(aa4, bb2, cc03, cc03)
455	LDF	[BO + 33 * SIZE], b2
456
457	FMADD	(aa4, bb3, cc05, cc05)
458	LDF	[BO + 34 * SIZE], b3
459	FMADD	(aa4, bb4, cc07, cc07)
460	LDF	[BO + 35 * SIZE], b4
461
462	FMADD	(aa4, bb5, cc09, cc09)
463	LDF	[BO + 36 * SIZE], b5
464	FMADD	(aa4, bb6, cc11, cc11)
465	LDF	[BO + 37 * SIZE], b6
466
467	FMADD	(aa4, bb7, cc13, cc13)
468	LDF	[BO + 38 * SIZE], b7
469	FMADD	(aa4, bb8, cc15, cc15)
470	LDF	[BO + 39 * SIZE], b8
471
472	LDF	[AO +  6 * SIZE], a3
473	LDF	[AO +  7 * SIZE], a4
474
475	add	AO,  4 * SIZE, AO
476	cmp	L, 0
477	bg,pt	%icc, .LL23
478	add	BO, 32 * SIZE, BO
479	.align 4
480
481.LL25:
482#if defined(LT) || defined(RN)
483	and	KK, 3, L
484#else
485	sub	K, KK, L
486	and	L,  3, L
487#endif
488	cmp	L,  0
489	ble,a,pn %icc, .LL28
490	nop
491	.align 4
492
493.LL27:
494	FMADD	(aa1, bb1, cc01, cc01)
495	LDF	[BO +  8 * SIZE], b1
496	FMADD	(aa1, bb2, cc03, cc03)
497	LDF	[BO +  9 * SIZE], b2
498
499	FMADD	(aa1, bb3, cc05, cc05)
500	LDF	[BO + 10 * SIZE], b3
501	FMADD	(aa1, bb4, cc07, cc07)
502	LDF	[BO + 11 * SIZE], b4
503
504	FMADD	(aa1, bb5, cc09, cc09)
505	LDF	[BO + 12 * SIZE], b5
506	FMADD	(aa1, bb6, cc11, cc11)
507	LDF	[BO + 13 * SIZE], b6
508
509	FMADD	(aa1, bb7, cc13, cc13)
510	LDF	[BO + 14 * SIZE], b7
511	FMADD	(aa1, bb8, cc15, cc15)
512	LDF	[BO + 15 * SIZE], b8
513
514	LDF	[AO +  1 * SIZE], a1
515	add	AO, 1 * SIZE, AO
516
517	add	L, -1, L
518	cmp	L, 0
519	bg,pt	%icc, .LL27
520	add	BO, 8 * SIZE, BO
521	.align 4
522
523.LL28:
524#if defined(LN) || defined(RT)
525#ifdef LN
526	sub	KK, 1, TEMP1
527#else
528	sub	KK, 8, TEMP1
529#endif
530	sll	TEMP1, BASE_SHIFT + 0, TEMP2
531	sll	TEMP1, BASE_SHIFT + 3, TEMP1
532
533	add	AORIG, TEMP2, AO
534	add	B,     TEMP1, BO
535#endif
536
537#if defined(LN) || defined(LT)
538	LDF	[BO +  0 * SIZE], a1
539	LDF	[BO +  1 * SIZE], a2
540	LDF	[BO +  2 * SIZE], a3
541	LDF	[BO +  3 * SIZE], a4
542
543	LDF	[BO +  4 * SIZE], b1
544	LDF	[BO +  5 * SIZE], b2
545	LDF	[BO +  6 * SIZE], b3
546	LDF	[BO +  7 * SIZE], b4
547
548	FSUB	a1, c01, c01
549	FSUB	a2, c03, c03
550	FSUB	a3, c05, c05
551	FSUB	a4, c07, c07
552
553	FSUB	b1, c09, c09
554	FSUB	b2, c11, c11
555	FSUB	b3, c13, c13
556	FSUB	b4, c15, c15
557#else
558	LDF	[AO +  0 * SIZE], a1
559	LDF	[AO +  1 * SIZE], a2
560	LDF	[AO +  2 * SIZE], a3
561	LDF	[AO +  3 * SIZE], a4
562
563	LDF	[AO +  4 * SIZE], b1
564	LDF	[AO +  5 * SIZE], b2
565	LDF	[AO +  6 * SIZE], b3
566	LDF	[AO +  7 * SIZE], b4
567
568	FSUB	a1, c01, c01
569	FSUB	a2, c03, c03
570	FSUB	a3, c05, c05
571	FSUB	a4, c07, c07
572
573	FSUB	b1, c09, c09
574	FSUB	b2, c11, c11
575	FSUB	b3, c13, c13
576	FSUB	b4, c15, c15
577#endif
578
579#if defined(LN) || defined(LT)
580	LDF	[AO +  0 * SIZE], a1
581
582	FMUL	a1, c01, c01
583	FMUL	a1, c03, c03
584	FMUL	a1, c05, c05
585	FMUL	a1, c07, c07
586	FMUL	a1, c09, c09
587	FMUL	a1, c11, c11
588	FMUL	a1, c13, c13
589	FMUL	a1, c15, c15
590#endif
591
592#ifdef RN
593	LDF	[BO +  0 * SIZE], a1
594	LDF	[BO +  1 * SIZE], a2
595	LDF	[BO +  2 * SIZE], a3
596	LDF	[BO +  3 * SIZE], a4
597	LDF	[BO +  4 * SIZE], b1
598	LDF	[BO +  5 * SIZE], b2
599	LDF	[BO +  6 * SIZE], b3
600	LDF	[BO +  7 * SIZE], b4
601
602	FMUL	a1, c01, c01
603
604	FNMSUB	(aa2, cc01, cc03, cc03)
605	FNMSUB	(aa3, cc01, cc05, cc05)
606	FNMSUB	(aa4, cc01, cc07, cc07)
607	FNMSUB	(bb1, cc01, cc09, cc09)
608	FNMSUB	(bb2, cc01, cc11, cc11)
609	FNMSUB	(bb3, cc01, cc13, cc13)
610	FNMSUB	(bb4, cc01, cc15, cc15)
611
612	LDF	[BO +  9 * SIZE], a1
613	LDF	[BO + 10 * SIZE], a2
614	LDF	[BO + 11 * SIZE], a3
615	LDF	[BO + 12 * SIZE], a4
616	LDF	[BO + 13 * SIZE], b1
617	LDF	[BO + 14 * SIZE], b2
618	LDF	[BO + 15 * SIZE], b3
619
620	FMUL	a1, c03, c03
621
622	FNMSUB	(aa2, cc03, cc05, cc05)
623	FNMSUB	(aa3, cc03, cc07, cc07)
624	FNMSUB	(aa4, cc03, cc09, cc09)
625	FNMSUB	(bb1, cc03, cc11, cc11)
626	FNMSUB	(bb2, cc03, cc13, cc13)
627	FNMSUB	(bb3, cc03, cc15, cc15)
628
629	LDF	[BO + 18 * SIZE], a1
630	LDF	[BO + 19 * SIZE], a2
631	LDF	[BO + 20 * SIZE], a3
632	LDF	[BO + 21 * SIZE], a4
633	LDF	[BO + 22 * SIZE], b1
634	LDF	[BO + 23 * SIZE], b2
635
636	FMUL	a1, c05, c05
637
638	FNMSUB	(aa2, cc05, cc07, cc07)
639	FNMSUB	(aa3, cc05, cc09, cc09)
640	FNMSUB	(aa4, cc05, cc11, cc11)
641	FNMSUB	(bb1, cc05, cc13, cc13)
642	FNMSUB	(bb2, cc05, cc15, cc15)
643
644	LDF	[BO + 27 * SIZE], a1
645	LDF	[BO + 28 * SIZE], a2
646	LDF	[BO + 29 * SIZE], a3
647	LDF	[BO + 30 * SIZE], a4
648	LDF	[BO + 31 * SIZE], b1
649
650	FMUL	a1, c07, c07
651
652	FNMSUB	(aa2, cc07, cc09, cc09)
653	FNMSUB	(aa3, cc07, cc11, cc11)
654	FNMSUB	(aa4, cc07, cc13, cc13)
655	FNMSUB	(bb1, cc07, cc15, cc15)
656
657	LDF	[BO + 36 * SIZE], a1
658	LDF	[BO + 37 * SIZE], a2
659	LDF	[BO + 38 * SIZE], a3
660	LDF	[BO + 39 * SIZE], a4
661
662	FMUL	a1, c09, c09
663
664	FNMSUB	(aa2, cc09, cc11, cc11)
665	FNMSUB	(aa3, cc09, cc13, cc13)
666	FNMSUB	(aa4, cc09, cc15, cc15)
667
668	LDF	[BO + 45 * SIZE], a1
669	LDF	[BO + 46 * SIZE], a2
670	LDF	[BO + 47 * SIZE], a3
671
672	FMUL	a1, c11, c11
673
674	FNMSUB	(aa2, cc11, cc13, cc13)
675	FNMSUB	(aa3, cc11, cc15, cc15)
676
677	LDF	[BO + 54 * SIZE], a1
678	LDF	[BO + 55 * SIZE], a2
679
680	FMUL	a1, c13, c13
681
682	FNMSUB	(aa2, cc13, cc15, cc15)
683
684	LDF	[BO + 63 * SIZE], a1
685
686	FMUL	a1, c15, c15
687#endif
688
689#ifdef RT
690	LDF	[BO + 63 * SIZE], a1
691	LDF	[BO + 62 * SIZE], a2
692	LDF	[BO + 61 * SIZE], a3
693	LDF	[BO + 60 * SIZE], a4
694	LDF	[BO + 59 * SIZE], b1
695	LDF	[BO + 58 * SIZE], b2
696	LDF	[BO + 57 * SIZE], b3
697	LDF	[BO + 56 * SIZE], b4
698
699	FMUL	a1, c15, c15
700
701	FNMSUB	(aa2, cc15, cc13, cc13)
702	FNMSUB	(aa3, cc15, cc11, cc11)
703	FNMSUB	(aa4, cc15, cc09, cc09)
704	FNMSUB	(bb1, cc15, cc07, cc07)
705	FNMSUB	(bb2, cc15, cc05, cc05)
706	FNMSUB	(bb3, cc15, cc03, cc03)
707	FNMSUB	(bb4, cc15, cc01, cc01)
708
709	LDF	[BO + 54 * SIZE], a1
710	LDF	[BO + 53 * SIZE], a2
711	LDF	[BO + 52 * SIZE], a3
712	LDF	[BO + 51 * SIZE], a4
713	LDF	[BO + 50 * SIZE], b1
714	LDF	[BO + 49 * SIZE], b2
715	LDF	[BO + 48 * SIZE], b3
716
717	FMUL	a1, c13, c13
718
719	FNMSUB	(aa2, cc13, cc11, cc11)
720	FNMSUB	(aa3, cc13, cc09, cc09)
721	FNMSUB	(aa4, cc13, cc07, cc07)
722	FNMSUB	(bb1, cc13, cc05, cc05)
723	FNMSUB	(bb2, cc13, cc03, cc03)
724	FNMSUB	(bb3, cc13, cc01, cc01)
725
726	LDF	[BO + 45 * SIZE], a1
727	LDF	[BO + 44 * SIZE], a2
728	LDF	[BO + 43 * SIZE], a3
729	LDF	[BO + 42 * SIZE], a4
730	LDF	[BO + 41 * SIZE], b1
731	LDF	[BO + 40 * SIZE], b2
732
733	FMUL	a1, c11, c11
734
735	FNMSUB	(aa2, cc11, cc09, cc09)
736	FNMSUB	(aa3, cc11, cc07, cc07)
737	FNMSUB	(aa4, cc11, cc05, cc05)
738	FNMSUB	(bb1, cc11, cc03, cc03)
739	FNMSUB	(bb2, cc11, cc01, cc01)
740
741	LDF	[BO + 36 * SIZE], a1
742	LDF	[BO + 35 * SIZE], a2
743	LDF	[BO + 34 * SIZE], a3
744	LDF	[BO + 33 * SIZE], a4
745	LDF	[BO + 32 * SIZE], b1
746
747	FMUL	a1, c09, c09
748
749	FNMSUB	(aa2, cc09, cc07, cc07)
750	FNMSUB	(aa3, cc09, cc05, cc05)
751	FNMSUB	(aa4, cc09, cc03, cc03)
752	FNMSUB	(bb1, cc09, cc01, cc01)
753
754	LDF	[BO + 27 * SIZE], a1
755	LDF	[BO + 26 * SIZE], a2
756	LDF	[BO + 25 * SIZE], a3
757	LDF	[BO + 24 * SIZE], a4
758
759	FMUL	a1, c07, c07
760
761	FNMSUB	(aa2, cc07, cc05, cc05)
762	FNMSUB	(aa3, cc07, cc03, cc03)
763	FNMSUB	(aa4, cc07, cc01, cc01)
764
765	LDF	[BO + 18 * SIZE], a1
766	LDF	[BO + 17 * SIZE], a2
767	LDF	[BO + 16 * SIZE], a3
768
769	FMUL	a1, c05, c05
770
771	FNMSUB	(aa2, cc05, cc03, cc03)
772	FNMSUB	(aa3, cc05, cc01, cc01)
773
774	LDF	[BO +  9 * SIZE], a1
775	LDF	[BO +  8 * SIZE], a2
776
777	FMUL	a1, c03, c03
778
779	FNMSUB	(aa2, cc03, cc01, cc01)
780
781	LDF	[BO +  0 * SIZE], a1
782
783	FMUL	a1, c01, c01
784#endif
785
786#ifdef LN
787	add	C1, -1 * SIZE, C1
788	add	C2, -1 * SIZE, C2
789	add	C3, -1 * SIZE, C3
790	add	C4, -1 * SIZE, C4
791	add	C5, -1 * SIZE, C5
792	add	C6, -1 * SIZE, C6
793	add	C7, -1 * SIZE, C7
794	add	C8, -1 * SIZE, C8
795#endif
796
797#if defined(LN) || defined(LT)
798	STF	c01, [BO +  0 * SIZE]
799	STF	c03, [BO +  1 * SIZE]
800	STF	c05, [BO +  2 * SIZE]
801	STF	c07, [BO +  3 * SIZE]
802
803	STF	c09, [BO +  4 * SIZE]
804	STF	c11, [BO +  5 * SIZE]
805	STF	c13, [BO +  6 * SIZE]
806	STF	c15, [BO +  7 * SIZE]
807#else
808	STF	c01, [AO +  0 * SIZE]
809	STF	c03, [AO +  1 * SIZE]
810	STF	c05, [AO +  2 * SIZE]
811	STF	c07, [AO +  3 * SIZE]
812
813	STF	c09, [AO +  4 * SIZE]
814	STF	c11, [AO +  5 * SIZE]
815	STF	c13, [AO +  6 * SIZE]
816	STF	c15, [AO +  7 * SIZE]
817#endif
818
819	STF	c01, [C1 + 0 * SIZE]
820	STF	c03, [C2 + 0 * SIZE]
821	STF	c05, [C3 + 0 * SIZE]
822	STF	c07, [C4 + 0 * SIZE]
823
824	STF	c09, [C5 + 0 * SIZE]
825	STF	c11, [C6 + 0 * SIZE]
826	STF	c13, [C7 + 0 * SIZE]
827	STF	c15, [C8 + 0 * SIZE]
828
829#ifdef RT
830	sll	K, BASE_SHIFT + 0, TEMP1
831	add	AORIG, TEMP1, AORIG
832#endif
833
834#if defined(LT) || defined(RN)
835	sub	K, KK, TEMP1
836	sll	TEMP1, BASE_SHIFT + 0, TEMP2
837	sll	TEMP1, BASE_SHIFT + 3, TEMP1
838	add	AO, TEMP2, AO
839	add	BO, TEMP1, BO
840#endif
841
842#ifdef LT
843	add	KK, 1, KK
844#endif
845
846#ifdef LN
847	sub	KK, 1, KK
848#endif
849	.align 4
850
851.LL20:
852	sra	M, 1, I
853	cmp	I, 0
854	ble,pn	%icc, .LL29
855	nop
856	.align 4
857
858.LL12:
859#if defined(LT) || defined(RN)
860	mov	B, BO
861#else
862#ifdef LN
863	sll	K,  BASE_SHIFT + 1, TEMP1
864	sub	AORIG, TEMP1, AORIG
865#endif
866
867	sll	KK, BASE_SHIFT + 1, TEMP1
868	sll	KK, BASE_SHIFT + 3, TEMP2
869
870	add	AORIG, TEMP1, AO
871	add	B,     TEMP2, BO
872#endif
873
874	LDF	[AO +  0 * SIZE], a1
875	LDF	[AO +  1 * SIZE], a2
876	LDF	[AO +  8 * SIZE], a5
877
878	LDF	[BO +  0 * SIZE], b1
879
880	LDF	[BO +  1 * SIZE], b2
881	FCLR	(cc01)
882	LDF	[BO +  2 * SIZE], b3
883	FCLR	(cc05)
884	LDF	[BO +  3 * SIZE], b4
885	FCLR	(cc09)
886	LDF	[BO +  4 * SIZE], b5
887	FCLR	(cc13)
888
889	LDF	[BO +  5 * SIZE], b6
890	FCLR	(cc02)
891	LDF	[BO +  6 * SIZE], b7
892	FCLR	(cc06)
893	LDF	[BO +  7 * SIZE], b8
894	FCLR	(cc10)
895	LDF	[BO +  8 * SIZE], b9
896	FCLR	(cc14)
897
898	prefetch [C1 + 1 * SIZE], 3
899	FCLR	(cc03)
900	prefetch [C2 + 2 * SIZE], 3
901	FCLR	(cc07)
902	prefetch [C3 + 1 * SIZE], 3
903	FCLR	(cc11)
904	prefetch [C4 + 2 * SIZE], 3
905	FCLR	(cc15)
906
907	prefetch [C5 + 1 * SIZE], 3
908	FCLR	(cc04)
909	prefetch [C6 + 2 * SIZE], 3
910	FCLR	(cc08)
911	prefetch [C7 + 1 * SIZE], 3
912	FCLR	(cc12)
913	prefetch [C8 + 2 * SIZE], 3
914	FCLR	(cc16)
915
916#if defined(LT) || defined(RN)
917	sra	KK, 3, L
918#else
919	sub	K, KK, L
920	sra	L,  3, L
921#endif
922	cmp	L,  0
923	ble,pn	%icc, .LL15
924	nop
925	.align 4
926
927.LL13:
928	FMADD	(aa1, bb1, cc01, cc01)
929	FMADD	(aa2, bb1, cc02, cc02)
930	FMADD	(aa1, bb2, cc03, cc03)
931	FMADD	(aa2, bb2, cc04, cc04)
932
933	FMADD	(aa1, bb3, cc05, cc05)
934	LDF	[BO + 16 * SIZE], b1
935	FMADD	(aa2, bb3, cc06, cc06)
936	LDF	[BO +  9 * SIZE], b2
937
938	FMADD	(aa1, bb4, cc07, cc07)
939	LDF	[BO + 10 * SIZE], b3
940	FMADD	(aa2, bb4, cc08, cc08)
941	LDF	[BO + 11 * SIZE], b4
942
943	FMADD	(aa1, bb5, cc09, cc09)
944	LDF	[AO +  2 * SIZE], a3
945	FMADD	(aa2, bb5, cc10, cc10)
946	LDF	[AO +  3 * SIZE], a4
947
948	FMADD	(aa1, bb6, cc11, cc11)
949	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
950	FMADD	(aa2, bb6, cc12, cc12)
951	nop
952
953	FMADD	(aa1, bb7, cc13, cc13)
954	LDF	[BO + 12 * SIZE], b5
955	FMADD	(aa2, bb7, cc14, cc14)
956	LDF	[BO + 13 * SIZE], b6
957
958	FMADD	(aa1, bb8, cc15, cc15)
959	LDF	[BO + 14 * SIZE], b7
960	FMADD	(aa2, bb8, cc16, cc16)
961	LDF	[BO + 15 * SIZE], b8
962
963	FMADD	(aa3, bb9, cc01, cc01)
964	FMADD	(aa4, bb9, cc02, cc02)
965	FMADD	(aa3, bb2, cc03, cc03)
966	FMADD	(aa4, bb2, cc04, cc04)
967
968	FMADD	(aa3, bb3, cc05, cc05)
969	LDF	[BO + 24 * SIZE], b9
970	FMADD	(aa4, bb3, cc06, cc06)
971	LDF	[BO + 17 * SIZE], b2
972
973	FMADD	(aa3, bb4, cc07, cc07)
974	LDF	[BO + 18 * SIZE], b3
975	FMADD	(aa4, bb4, cc08, cc08)
976	LDF	[BO + 19 * SIZE], b4
977
978	FMADD	(aa3, bb5, cc09, cc09)
979	LDF	[AO +  4 * SIZE], a1
980	FMADD	(aa4, bb5, cc10, cc10)
981	LDF	[AO +  5 * SIZE], a2
982
983	FMADD	(aa3, bb6, cc11, cc11)
984	add	L, -1, L
985	FMADD	(aa4, bb6, cc12, cc12)
986	nop
987
988	FMADD	(aa3, bb7, cc13, cc13)
989	LDF	[BO + 20 * SIZE], b5
990	FMADD	(aa4, bb7, cc14, cc14)
991	LDF	[BO + 21 * SIZE], b6
992
993	FMADD	(aa3, bb8, cc15, cc15)
994	LDF	[BO + 22 * SIZE], b7
995	FMADD	(aa4, bb8, cc16, cc16)
996	LDF	[BO + 23 * SIZE], b8
997
998	FMADD	(aa1, bb1, cc01, cc01)
999	FMADD	(aa2, bb1, cc02, cc02)
1000	FMADD	(aa1, bb2, cc03, cc03)
1001	FMADD	(aa2, bb2, cc04, cc04)
1002
1003	FMADD	(aa1, bb3, cc05, cc05)
1004	LDF	[BO + 32 * SIZE], b1
1005	FMADD	(aa2, bb3, cc06, cc06)
1006	LDF	[BO + 25 * SIZE], b2
1007
1008	FMADD	(aa1, bb4, cc07, cc07)
1009	LDF	[BO + 26 * SIZE], b3
1010	FMADD	(aa2, bb4, cc08, cc08)
1011	LDF	[BO + 27 * SIZE], b4
1012
1013	FMADD	(aa1, bb5, cc09, cc09)
1014	LDF	[AO +  6 * SIZE], a3
1015	FMADD	(aa2, bb5, cc10, cc10)
1016	LDF	[AO +  7 * SIZE], a4
1017
1018	FMADD	(aa1, bb6, cc11, cc11)
1019	nop
1020	FMADD	(aa2, bb6, cc12, cc12)
1021	nop
1022
1023	FMADD	(aa1, bb7, cc13, cc13)
1024	LDF	[BO + 28 * SIZE], b5
1025	FMADD	(aa2, bb7, cc14, cc14)
1026	LDF	[BO + 29 * SIZE], b6
1027
1028	FMADD	(aa1, bb8, cc15, cc15)
1029	LDF	[BO + 30 * SIZE], b7
1030	FMADD	(aa2, bb8, cc16, cc16)
1031	LDF	[BO + 31 * SIZE], b8
1032
1033	FMADD	(aa3, bb9, cc01, cc01)
1034	FMADD	(aa4, bb9, cc02, cc02)
1035	FMADD	(aa3, bb2, cc03, cc03)
1036	FMADD	(aa4, bb2, cc04, cc04)
1037
1038	FMADD	(aa3, bb3, cc05, cc05)
1039	LDF	[BO + 40 * SIZE], b9
1040	FMADD	(aa4, bb3, cc06, cc06)
1041	LDF	[BO + 33 * SIZE], b2
1042
1043	FMADD	(aa3, bb4, cc07, cc07)
1044	LDF	[BO + 34 * SIZE], b3
1045	FMADD	(aa4, bb4, cc08, cc08)
1046	LDF	[BO + 35 * SIZE], b4
1047
1048	FMADD	(aa3, bb5, cc09, cc09)
1049	LDF	[AO + 16 * SIZE], a1  /****/
1050	FMADD	(aa4, bb5, cc10, cc10)
1051	LDF	[AO +  9 * SIZE], a2
1052
1053	FMADD	(aa3, bb6, cc11, cc11)
1054	nop
1055	FMADD	(aa4, bb6, cc12, cc12)
1056	nop
1057
1058	FMADD	(aa3, bb7, cc13, cc13)
1059	LDF	[BO + 36 * SIZE], b5
1060	FMADD	(aa4, bb7, cc14, cc14)
1061	LDF	[BO + 37 * SIZE], b6
1062
1063	FMADD	(aa3, bb8, cc15, cc15)
1064	LDF	[BO + 38 * SIZE], b7
1065	FMADD	(aa4, bb8, cc16, cc16)
1066	LDF	[BO + 39 * SIZE], b8
1067
1068	FMADD	(aa5, bb1, cc01, cc01)
1069	FMADD	(aa2, bb1, cc02, cc02)
1070	FMADD	(aa5, bb2, cc03, cc03)
1071	FMADD	(aa2, bb2, cc04, cc04)
1072
1073	FMADD	(aa5, bb3, cc05, cc05)
1074	LDF	[BO + 48 * SIZE], b1
1075	FMADD	(aa2, bb3, cc06, cc06)
1076	LDF	[BO + 41 * SIZE], b2
1077
1078	FMADD	(aa5, bb4, cc07, cc07)
1079	LDF	[BO + 42 * SIZE], b3
1080	FMADD	(aa2, bb4, cc08, cc08)
1081	LDF	[BO + 43 * SIZE], b4
1082
1083	FMADD	(aa5, bb5, cc09, cc09)
1084	LDF	[AO + 10 * SIZE], a3
1085	FMADD	(aa2, bb5, cc10, cc10)
1086	LDF	[AO + 11 * SIZE], a4
1087
1088	FMADD	(aa5, bb6, cc11, cc11)
1089	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
1090	FMADD	(aa2, bb6, cc12, cc12)
1091	nop
1092
1093	FMADD	(aa5, bb7, cc13, cc13)
1094	LDF	[BO + 44 * SIZE], b5
1095	FMADD	(aa2, bb7, cc14, cc14)
1096	LDF	[BO + 45 * SIZE], b6
1097
1098	FMADD	(aa5, bb8, cc15, cc15)
1099	LDF	[BO + 46 * SIZE], b7
1100	FMADD	(aa2, bb8, cc16, cc16)
1101	LDF	[BO + 47 * SIZE], b8
1102
1103	FMADD	(aa3, bb9, cc01, cc01)
1104	FMADD	(aa4, bb9, cc02, cc02)
1105	FMADD	(aa3, bb2, cc03, cc03)
1106	FMADD	(aa4, bb2, cc04, cc04)
1107
1108	FMADD	(aa3, bb3, cc05, cc05)
1109	LDF	[BO + 56 * SIZE], b9
1110	FMADD	(aa4, bb3, cc06, cc06)
1111	LDF	[BO + 49 * SIZE], b2
1112
1113	FMADD	(aa3, bb4, cc07, cc07)
1114	LDF	[BO + 50 * SIZE], b3
1115	FMADD	(aa4, bb4, cc08, cc08)
1116	LDF	[BO + 51 * SIZE], b4
1117
1118	FMADD	(aa3, bb5, cc09, cc09)
1119	LDF	[AO + 12 * SIZE], a5
1120	FMADD	(aa4, bb5, cc10, cc10)
1121	LDF	[AO + 13 * SIZE], a2
1122
1123	FMADD	(aa3, bb6, cc11, cc11)
1124	cmp	L, 0
1125	FMADD	(aa4, bb6, cc12, cc12)
1126	nop
1127
1128	FMADD	(aa3, bb7, cc13, cc13)
1129	LDF	[BO + 52 * SIZE], b5
1130	FMADD	(aa4, bb7, cc14, cc14)
1131	LDF	[BO + 53 * SIZE], b6
1132
1133	FMADD	(aa3, bb8, cc15, cc15)
1134	LDF	[BO + 54 * SIZE], b7
1135	FMADD	(aa4, bb8, cc16, cc16)
1136	LDF	[BO + 55 * SIZE], b8
1137
1138	FMADD	(aa5, bb1, cc01, cc01)
1139	FMADD	(aa2, bb1, cc02, cc02)
1140	FMADD	(aa5, bb2, cc03, cc03)
1141	FMADD	(aa2, bb2, cc04, cc04)
1142
1143	FMADD	(aa5, bb3, cc05, cc05)
1144	LDF	[BO + 64 * SIZE], b1
1145	FMADD	(aa2, bb3, cc06, cc06)
1146	LDF	[BO + 57 * SIZE], b2
1147
1148	FMADD	(aa5, bb4, cc07, cc07)
1149	LDF	[BO + 58 * SIZE], b3
1150	FMADD	(aa2, bb4, cc08, cc08)
1151	LDF	[BO + 59 * SIZE], b4
1152
1153	FMADD	(aa5, bb5, cc09, cc09)
1154	LDF	[AO + 14 * SIZE], a3
1155	FMADD	(aa2, bb5, cc10, cc10)
1156	LDF	[AO + 15 * SIZE], a4
1157
1158	FMADD	(aa5, bb6, cc11, cc11)
1159	add	BO, 64 * SIZE, BO
1160	FMADD	(aa2, bb6, cc12, cc12)
1161	add	AO, 16 * SIZE, AO
1162
1163	FMADD	(aa5, bb7, cc13, cc13)
1164	LDF	[BO -  4 * SIZE], b5
1165	FMADD	(aa2, bb7, cc14, cc14)
1166	LDF	[BO -  3 * SIZE], b6
1167
1168	FMADD	(aa5, bb8, cc15, cc15)
1169	LDF	[BO -  2 * SIZE], b7
1170	FMADD	(aa2, bb8, cc16, cc16)
1171	LDF	[BO -  1 * SIZE], b8
1172
1173	FMADD	(aa3, bb9, cc01, cc01)
1174	FMADD	(aa4, bb9, cc02, cc02)
1175	FMADD	(aa3, bb2, cc03, cc03)
1176	FMADD	(aa4, bb2, cc04, cc04)
1177
1178	FMADD	(aa3, bb3, cc05, cc05)
1179	LDF	[BO +  8 * SIZE], b9
1180	FMADD	(aa4, bb3, cc06, cc06)
1181	LDF	[BO +  1 * SIZE], b2
1182
1183	FMADD	(aa3, bb4, cc07, cc07)
1184	LDF	[BO +  2 * SIZE], b3
1185	FMADD	(aa4, bb4, cc08, cc08)
1186	LDF	[BO +  3 * SIZE], b4
1187
1188	FMADD	(aa3, bb5, cc09, cc09)
1189	LDF	[AO +  8 * SIZE], a5  /****/
1190	FMADD	(aa4, bb5, cc10, cc10)
1191	LDF	[AO +  1 * SIZE], a2
1192
1193	FMADD	(aa3, bb6, cc11, cc11)
1194	FMADD	(aa4, bb6, cc12, cc12)
1195
1196	FMADD	(aa3, bb7, cc13, cc13)
1197	LDF	[BO +  4 * SIZE], b5
1198	FMADD	(aa4, bb7, cc14, cc14)
1199	LDF	[BO +  5 * SIZE], b6
1200
1201	FMADD	(aa3, bb8, cc15, cc15)
1202	LDF	[BO +  6 * SIZE], b7
1203	FMADD	(aa4, bb8, cc16, cc16)
1204	ble,pn	%icc, .LL15
1205	LDF	[BO +  7 * SIZE], b8
1206
1207	FMADD	(aa1, bb1, cc01, cc01)
1208	FMADD	(aa2, bb1, cc02, cc02)
1209	FMADD	(aa1, bb2, cc03, cc03)
1210	FMADD	(aa2, bb2, cc04, cc04)
1211
1212	FMADD	(aa1, bb3, cc05, cc05)
1213	LDF	[BO + 16 * SIZE], b1
1214	FMADD	(aa2, bb3, cc06, cc06)
1215	LDF	[BO +  9 * SIZE], b2
1216
1217	FMADD	(aa1, bb4, cc07, cc07)
1218	LDF	[BO + 10 * SIZE], b3
1219	FMADD	(aa2, bb4, cc08, cc08)
1220	LDF	[BO + 11 * SIZE], b4
1221
1222	FMADD	(aa1, bb5, cc09, cc09)
1223	LDF	[AO +  2 * SIZE], a3
1224	FMADD	(aa2, bb5, cc10, cc10)
1225	LDF	[AO +  3 * SIZE], a4
1226
1227	FMADD	(aa1, bb6, cc11, cc11)
1228	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1229	FMADD	(aa2, bb6, cc12, cc12)
1230	nop
1231
1232	FMADD	(aa1, bb7, cc13, cc13)
1233	LDF	[BO + 12 * SIZE], b5
1234	FMADD	(aa2, bb7, cc14, cc14)
1235	LDF	[BO + 13 * SIZE], b6
1236
1237	FMADD	(aa1, bb8, cc15, cc15)
1238	LDF	[BO + 14 * SIZE], b7
1239	FMADD	(aa2, bb8, cc16, cc16)
1240	LDF	[BO + 15 * SIZE], b8
1241
1242	FMADD	(aa3, bb9, cc01, cc01)
1243	FMADD	(aa4, bb9, cc02, cc02)
1244	FMADD	(aa3, bb2, cc03, cc03)
1245	FMADD	(aa4, bb2, cc04, cc04)
1246
1247	FMADD	(aa3, bb3, cc05, cc05)
1248	LDF	[BO + 24 * SIZE], b9
1249	FMADD	(aa4, bb3, cc06, cc06)
1250	LDF	[BO + 17 * SIZE], b2
1251
1252	FMADD	(aa3, bb4, cc07, cc07)
1253	LDF	[BO + 18 * SIZE], b3
1254	FMADD	(aa4, bb4, cc08, cc08)
1255	LDF	[BO + 19 * SIZE], b4
1256
1257	FMADD	(aa3, bb5, cc09, cc09)
1258	LDF	[AO +  4 * SIZE], a1
1259	FMADD	(aa4, bb5, cc10, cc10)
1260	LDF	[AO +  5 * SIZE], a2
1261
1262	FMADD	(aa3, bb6, cc11, cc11)
1263	add	L, -1, L
1264	FMADD	(aa4, bb6, cc12, cc12)
1265	nop
1266
1267	FMADD	(aa3, bb7, cc13, cc13)
1268	LDF	[BO + 20 * SIZE], b5
1269	FMADD	(aa4, bb7, cc14, cc14)
1270	LDF	[BO + 21 * SIZE], b6
1271
1272	FMADD	(aa3, bb8, cc15, cc15)
1273	LDF	[BO + 22 * SIZE], b7
1274	FMADD	(aa4, bb8, cc16, cc16)
1275	LDF	[BO + 23 * SIZE], b8
1276
1277	FMADD	(aa1, bb1, cc01, cc01)
1278	FMADD	(aa2, bb1, cc02, cc02)
1279	FMADD	(aa1, bb2, cc03, cc03)
1280	FMADD	(aa2, bb2, cc04, cc04)
1281
1282	FMADD	(aa1, bb3, cc05, cc05)
1283	LDF	[BO + 32 * SIZE], b1
1284	FMADD	(aa2, bb3, cc06, cc06)
1285	LDF	[BO + 25 * SIZE], b2
1286
1287	FMADD	(aa1, bb4, cc07, cc07)
1288	LDF	[BO + 26 * SIZE], b3
1289	FMADD	(aa2, bb4, cc08, cc08)
1290	LDF	[BO + 27 * SIZE], b4
1291
1292	FMADD	(aa1, bb5, cc09, cc09)
1293	LDF	[AO +  6 * SIZE], a3
1294	FMADD	(aa2, bb5, cc10, cc10)
1295	LDF	[AO +  7 * SIZE], a4
1296
1297	FMADD	(aa1, bb6, cc11, cc11)
1298	nop
1299	FMADD	(aa2, bb6, cc12, cc12)
1300	nop
1301
1302	FMADD	(aa1, bb7, cc13, cc13)
1303	LDF	[BO + 28 * SIZE], b5
1304	FMADD	(aa2, bb7, cc14, cc14)
1305	LDF	[BO + 29 * SIZE], b6
1306
1307	FMADD	(aa1, bb8, cc15, cc15)
1308	LDF	[BO + 30 * SIZE], b7
1309	FMADD	(aa2, bb8, cc16, cc16)
1310	LDF	[BO + 31 * SIZE], b8
1311
1312	FMADD	(aa3, bb9, cc01, cc01)
1313	FMADD	(aa4, bb9, cc02, cc02)
1314	FMADD	(aa3, bb2, cc03, cc03)
1315	FMADD	(aa4, bb2, cc04, cc04)
1316
1317	FMADD	(aa3, bb3, cc05, cc05)
1318	LDF	[BO + 40 * SIZE], b9
1319	FMADD	(aa4, bb3, cc06, cc06)
1320	LDF	[BO + 33 * SIZE], b2
1321
1322	FMADD	(aa3, bb4, cc07, cc07)
1323	LDF	[BO + 34 * SIZE], b3
1324	FMADD	(aa4, bb4, cc08, cc08)
1325	LDF	[BO + 35 * SIZE], b4
1326
1327	FMADD	(aa3, bb5, cc09, cc09)
1328	LDF	[AO + 16 * SIZE], a1  /****/
1329	FMADD	(aa4, bb5, cc10, cc10)
1330	LDF	[AO +  9 * SIZE], a2
1331
1332	FMADD	(aa3, bb6, cc11, cc11)
1333	nop
1334	FMADD	(aa4, bb6, cc12, cc12)
1335	nop
1336
1337	FMADD	(aa3, bb7, cc13, cc13)
1338	LDF	[BO + 36 * SIZE], b5
1339	FMADD	(aa4, bb7, cc14, cc14)
1340	LDF	[BO + 37 * SIZE], b6
1341
1342	FMADD	(aa3, bb8, cc15, cc15)
1343	LDF	[BO + 38 * SIZE], b7
1344	FMADD	(aa4, bb8, cc16, cc16)
1345	LDF	[BO + 39 * SIZE], b8
1346
1347	FMADD	(aa5, bb1, cc01, cc01)
1348	FMADD	(aa2, bb1, cc02, cc02)
1349	FMADD	(aa5, bb2, cc03, cc03)
1350	FMADD	(aa2, bb2, cc04, cc04)
1351
1352	FMADD	(aa5, bb3, cc05, cc05)
1353	LDF	[BO + 48 * SIZE], b1
1354	FMADD	(aa2, bb3, cc06, cc06)
1355	LDF	[BO + 41 * SIZE], b2
1356
1357	FMADD	(aa5, bb4, cc07, cc07)
1358	LDF	[BO + 42 * SIZE], b3
1359	FMADD	(aa2, bb4, cc08, cc08)
1360	LDF	[BO + 43 * SIZE], b4
1361
1362	FMADD	(aa5, bb5, cc09, cc09)
1363	LDF	[AO + 10 * SIZE], a3
1364	FMADD	(aa2, bb5, cc10, cc10)
1365	LDF	[AO + 11 * SIZE], a4
1366
1367	FMADD	(aa5, bb6, cc11, cc11)
1368	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
1369	FMADD	(aa2, bb6, cc12, cc12)
1370	nop
1371
1372	FMADD	(aa5, bb7, cc13, cc13)
1373	LDF	[BO + 44 * SIZE], b5
1374	FMADD	(aa2, bb7, cc14, cc14)
1375	LDF	[BO + 45 * SIZE], b6
1376
1377	FMADD	(aa5, bb8, cc15, cc15)
1378	LDF	[BO + 46 * SIZE], b7
1379	FMADD	(aa2, bb8, cc16, cc16)
1380	LDF	[BO + 47 * SIZE], b8
1381
1382	FMADD	(aa3, bb9, cc01, cc01)
1383	FMADD	(aa4, bb9, cc02, cc02)
1384	FMADD	(aa3, bb2, cc03, cc03)
1385	FMADD	(aa4, bb2, cc04, cc04)
1386
1387	FMADD	(aa3, bb3, cc05, cc05)
1388	LDF	[BO + 56 * SIZE], b9
1389	FMADD	(aa4, bb3, cc06, cc06)
1390	LDF	[BO + 49 * SIZE], b2
1391
1392	FMADD	(aa3, bb4, cc07, cc07)
1393	LDF	[BO + 50 * SIZE], b3
1394	FMADD	(aa4, bb4, cc08, cc08)
1395	LDF	[BO + 51 * SIZE], b4
1396
1397	FMADD	(aa3, bb5, cc09, cc09)
1398	LDF	[AO + 12 * SIZE], a5
1399	FMADD	(aa4, bb5, cc10, cc10)
1400	LDF	[AO + 13 * SIZE], a2
1401
1402	FMADD	(aa3, bb6, cc11, cc11)
1403	cmp	L, 0
1404	FMADD	(aa4, bb6, cc12, cc12)
1405	nop
1406
1407	FMADD	(aa3, bb7, cc13, cc13)
1408	LDF	[BO + 52 * SIZE], b5
1409	FMADD	(aa4, bb7, cc14, cc14)
1410	LDF	[BO + 53 * SIZE], b6
1411
1412	FMADD	(aa3, bb8, cc15, cc15)
1413	LDF	[BO + 54 * SIZE], b7
1414	FMADD	(aa4, bb8, cc16, cc16)
1415	LDF	[BO + 55 * SIZE], b8
1416
1417	FMADD	(aa5, bb1, cc01, cc01)
1418	FMADD	(aa2, bb1, cc02, cc02)
1419	FMADD	(aa5, bb2, cc03, cc03)
1420	FMADD	(aa2, bb2, cc04, cc04)
1421
1422	FMADD	(aa5, bb3, cc05, cc05)
1423	LDF	[BO + 64 * SIZE], b1
1424	FMADD	(aa2, bb3, cc06, cc06)
1425	LDF	[BO + 57 * SIZE], b2
1426
1427	FMADD	(aa5, bb4, cc07, cc07)
1428	LDF	[BO + 58 * SIZE], b3
1429	FMADD	(aa2, bb4, cc08, cc08)
1430	LDF	[BO + 59 * SIZE], b4
1431
1432	FMADD	(aa5, bb5, cc09, cc09)
1433	LDF	[AO + 14 * SIZE], a3
1434	FMADD	(aa2, bb5, cc10, cc10)
1435	LDF	[AO + 15 * SIZE], a4
1436
1437	FMADD	(aa5, bb6, cc11, cc11)
1438	add	BO, 64 * SIZE, BO
1439	FMADD	(aa2, bb6, cc12, cc12)
1440	add	AO, 16 * SIZE, AO
1441
1442	FMADD	(aa5, bb7, cc13, cc13)
1443	LDF	[BO -  4 * SIZE], b5
1444	FMADD	(aa2, bb7, cc14, cc14)
1445	LDF	[BO -  3 * SIZE], b6
1446
1447	FMADD	(aa5, bb8, cc15, cc15)
1448	LDF	[BO -  2 * SIZE], b7
1449	FMADD	(aa2, bb8, cc16, cc16)
1450	LDF	[BO -  1 * SIZE], b8
1451
1452	FMADD	(aa3, bb9, cc01, cc01)
1453	FMADD	(aa4, bb9, cc02, cc02)
1454	FMADD	(aa3, bb2, cc03, cc03)
1455	FMADD	(aa4, bb2, cc04, cc04)
1456
1457	FMADD	(aa3, bb3, cc05, cc05)
1458	LDF	[BO +  8 * SIZE], b9
1459	FMADD	(aa4, bb3, cc06, cc06)
1460	LDF	[BO +  1 * SIZE], b2
1461
1462	FMADD	(aa3, bb4, cc07, cc07)
1463	LDF	[BO +  2 * SIZE], b3
1464	FMADD	(aa4, bb4, cc08, cc08)
1465	LDF	[BO +  3 * SIZE], b4
1466
1467	FMADD	(aa3, bb5, cc09, cc09)
1468	LDF	[AO +  8 * SIZE], a5  /****/
1469	FMADD	(aa4, bb5, cc10, cc10)
1470	LDF	[AO +  1 * SIZE], a2
1471
1472	FMADD	(aa3, bb6, cc11, cc11)
1473	FMADD	(aa4, bb6, cc12, cc12)
1474
1475	FMADD	(aa3, bb7, cc13, cc13)
1476	LDF	[BO +  4 * SIZE], b5
1477	FMADD	(aa4, bb7, cc14, cc14)
1478	LDF	[BO +  5 * SIZE], b6
1479
1480	FMADD	(aa3, bb8, cc15, cc15)
1481	LDF	[BO +  6 * SIZE], b7
1482	FMADD	(aa4, bb8, cc16, cc16)
1483	bg,pt	%icc, .LL13
1484	LDF	[BO +  7 * SIZE], b8
1485	.align 4
1486
1487.LL15:
1488#if defined(LT) || defined(RN)
1489	and	KK, 7, L
1490#else
1491	sub	K, KK, L
1492	and	L,  7, L
1493#endif
1494	cmp	L,  0
1495	ble,a,pn %icc, .LL18
1496	nop
1497	.align 4
1498
1499.LL17:
1500	FMADD	(aa1, bb1, cc01, cc01)
1501	add	L, -1, L
1502	FMADD	(aa2, bb1, cc02, cc02)
1503	nop
1504
1505	FMADD	(aa1, bb2, cc03, cc03)
1506	LDF	[BO +  8 * SIZE], b1
1507	FMADD	(aa2, bb2, cc04, cc04)
1508	LDF	[BO +  9 * SIZE], b2
1509
1510	FMADD	(aa1, bb3, cc05, cc05)
1511	cmp	L, 0
1512	FMADD	(aa2, bb3, cc06, cc06)
1513	nop
1514
1515	FMADD	(aa1, bb4, cc07, cc07)
1516	LDF	[BO + 10 * SIZE], b3
1517	FMADD	(aa2, bb4, cc08, cc08)
1518	LDF	[BO + 11 * SIZE], b4
1519
1520	FMADD	(aa1, bb5, cc09, cc09)
1521	nop
1522	FMADD	(aa2, bb5, cc10, cc10)
1523	nop
1524
1525	FMADD	(aa1, bb6, cc11, cc11)
1526	LDF	[BO + 12 * SIZE], b5
1527	FMADD	(aa2, bb6, cc12, cc12)
1528	LDF	[BO + 13 * SIZE], b6
1529
1530	FMADD	(aa1, bb7, cc13, cc13)
1531	add	AO, 2 * SIZE, AO
1532	FMADD	(aa2, bb7, cc14, cc14)
1533	add	BO, 8 * SIZE, BO
1534
1535	FMADD	(aa1, bb8, cc15, cc15)
1536	LDF	[AO +  0 * SIZE], a1
1537	FMADD	(aa2, bb8, cc16, cc16)
1538	LDF	[AO +  1 * SIZE], a2
1539
1540	LDF	[BO +  6 * SIZE], b7
1541	bg,pt	%icc, .LL17
1542	LDF	[BO +  7 * SIZE], b8
1543	nop
1544	.align 4
1545
1546.LL18:
1547#if defined(LN) || defined(RT)
1548#ifdef LN
1549	sub	KK, 2, TEMP1
1550#else
1551	sub	KK, 8, TEMP1
1552#endif
1553	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1554	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1555
1556	add	AORIG, TEMP2, AO
1557	add	B,     TEMP1, BO
1558#endif
1559
1560#if defined(LN) || defined(LT)
1561	LDF	[BO +  0 * SIZE], a1
1562	LDF	[BO +  1 * SIZE], a2
1563	LDF	[BO +  2 * SIZE], a3
1564	LDF	[BO +  3 * SIZE], a4
1565
1566	LDF	[BO +  4 * SIZE], b1
1567	LDF	[BO +  5 * SIZE], b2
1568	LDF	[BO +  6 * SIZE], b3
1569	LDF	[BO +  7 * SIZE], b4
1570
1571	FSUB	a1, c01, c01
1572	FSUB	a2, c03, c03
1573	FSUB	a3, c05, c05
1574	FSUB	a4, c07, c07
1575
1576	FSUB	b1, c09, c09
1577	FSUB	b2, c11, c11
1578	FSUB	b3, c13, c13
1579	FSUB	b4, c15, c15
1580
1581	LDF	[BO +  8 * SIZE], a1
1582	LDF	[BO +  9 * SIZE], a2
1583	LDF	[BO + 10 * SIZE], a3
1584	LDF	[BO + 11 * SIZE], a4
1585
1586	LDF	[BO + 12 * SIZE], b1
1587	LDF	[BO + 13 * SIZE], b2
1588	LDF	[BO + 14 * SIZE], b3
1589	LDF	[BO + 15 * SIZE], b4
1590
1591	FSUB	a1, c02, c02
1592	FSUB	a2, c04, c04
1593	FSUB	a3, c06, c06
1594	FSUB	a4, c08, c08
1595
1596	FSUB	b1, c10, c10
1597	FSUB	b2, c12, c12
1598	FSUB	b3, c14, c14
1599	FSUB	b4, c16, c16
1600#else
1601	LDF	[AO +  0 * SIZE], a1
1602	LDF	[AO +  1 * SIZE], a2
1603	LDF	[AO +  2 * SIZE], a3
1604	LDF	[AO +  3 * SIZE], a4
1605
1606	LDF	[AO +  4 * SIZE], b1
1607	LDF	[AO +  5 * SIZE], b2
1608	LDF	[AO +  6 * SIZE], b3
1609	LDF	[AO +  7 * SIZE], b4
1610
1611	FSUB	a1, c01, c01
1612	FSUB	a2, c02, c02
1613	FSUB	a3, c03, c03
1614	FSUB	a4, c04, c04
1615
1616	FSUB	b1, c05, c05
1617	FSUB	b2, c06, c06
1618	FSUB	b3, c07, c07
1619	FSUB	b4, c08, c08
1620
1621	LDF	[AO +  8 * SIZE], a1
1622	LDF	[AO +  9 * SIZE], a2
1623	LDF	[AO + 10 * SIZE], a3
1624	LDF	[AO + 11 * SIZE], a4
1625
1626	LDF	[AO + 12 * SIZE], b1
1627	LDF	[AO + 13 * SIZE], b2
1628	LDF	[AO + 14 * SIZE], b3
1629	LDF	[AO + 15 * SIZE], b4
1630
1631	FSUB	a1, c09, c09
1632	FSUB	a2, c10, c10
1633	FSUB	a3, c11, c11
1634	FSUB	a4, c12, c12
1635
1636	FSUB	b1, c13, c13
1637	FSUB	b2, c14, c14
1638	FSUB	b3, c15, c15
1639	FSUB	b4, c16, c16
1640#endif
1641
1642#ifdef LN
1643	LDF	[AO +  3 * SIZE], a1
1644	LDF	[AO +  2 * SIZE], a2
1645	LDF	[AO +  0 * SIZE], a3
1646
1647	FMUL	a1, c02, c02
1648	FMUL	a1, c04, c04
1649	FMUL	a1, c06, c06
1650	FMUL	a1, c08, c08
1651	FMUL	a1, c10, c10
1652	FMUL	a1, c12, c12
1653	FMUL	a1, c14, c14
1654	FMUL	a1, c16, c16
1655
1656	FNMSUB	(aa2, cc02, cc01, cc01)
1657	FNMSUB	(aa2, cc04, cc03, cc03)
1658	FNMSUB	(aa2, cc06, cc05, cc05)
1659	FNMSUB	(aa2, cc08, cc07, cc07)
1660	FNMSUB	(aa2, cc10, cc09, cc09)
1661	FNMSUB	(aa2, cc12, cc11, cc11)
1662	FNMSUB	(aa2, cc14, cc13, cc13)
1663	FNMSUB	(aa2, cc16, cc15, cc15)
1664
1665	FMUL	a3, c01, c01
1666	FMUL	a3, c03, c03
1667	FMUL	a3, c05, c05
1668	FMUL	a3, c07, c07
1669	FMUL	a3, c09, c09
1670	FMUL	a3, c11, c11
1671	FMUL	a3, c13, c13
1672	FMUL	a3, c15, c15
1673#endif
1674
1675#ifdef LT
1676	LDF	[AO +  0 * SIZE], a1
1677	LDF	[AO +  1 * SIZE], a2
1678	LDF	[AO +  3 * SIZE], a3
1679
1680	FMUL	a1, c01, c01
1681	FMUL	a1, c03, c03
1682	FMUL	a1, c05, c05
1683	FMUL	a1, c07, c07
1684	FMUL	a1, c09, c09
1685	FMUL	a1, c11, c11
1686	FMUL	a1, c13, c13
1687	FMUL	a1, c15, c15
1688
1689	FNMSUB	(aa2, cc01, cc02, cc02)
1690	FNMSUB	(aa2, cc03, cc04, cc04)
1691	FNMSUB	(aa2, cc05, cc06, cc06)
1692	FNMSUB	(aa2, cc07, cc08, cc08)
1693	FNMSUB	(aa2, cc09, cc10, cc10)
1694	FNMSUB	(aa2, cc11, cc12, cc12)
1695	FNMSUB	(aa2, cc13, cc14, cc14)
1696	FNMSUB	(aa2, cc15, cc16, cc16)
1697
1698	FMUL	a3, c02, c02
1699	FMUL	a3, c04, c04
1700	FMUL	a3, c06, c06
1701	FMUL	a3, c08, c08
1702	FMUL	a3, c10, c10
1703	FMUL	a3, c12, c12
1704	FMUL	a3, c14, c14
1705	FMUL	a3, c16, c16
1706#endif
1707
1708#ifdef RN
1709	LDF	[BO +  0 * SIZE], a1
1710	LDF	[BO +  1 * SIZE], a2
1711	LDF	[BO +  2 * SIZE], a3
1712	LDF	[BO +  3 * SIZE], a4
1713	LDF	[BO +  4 * SIZE], b1
1714	LDF	[BO +  5 * SIZE], b2
1715	LDF	[BO +  6 * SIZE], b3
1716	LDF	[BO +  7 * SIZE], b4
1717
1718	FMUL	a1, c01, c01
1719	FMUL	a1, c02, c02
1720
1721	FNMSUB	(aa2, cc01, cc03, cc03)
1722	FNMSUB	(aa2, cc02, cc04, cc04)
1723	FNMSUB	(aa3, cc01, cc05, cc05)
1724	FNMSUB	(aa3, cc02, cc06, cc06)
1725	FNMSUB	(aa4, cc01, cc07, cc07)
1726	FNMSUB	(aa4, cc02, cc08, cc08)
1727	FNMSUB	(bb1, cc01, cc09, cc09)
1728	FNMSUB	(bb1, cc02, cc10, cc10)
1729	FNMSUB	(bb2, cc01, cc11, cc11)
1730	FNMSUB	(bb2, cc02, cc12, cc12)
1731	FNMSUB	(bb3, cc01, cc13, cc13)
1732	FNMSUB	(bb3, cc02, cc14, cc14)
1733	FNMSUB	(bb4, cc01, cc15, cc15)
1734	FNMSUB	(bb4, cc02, cc16, cc16)
1735
1736	LDF	[BO +  9 * SIZE], a1
1737	LDF	[BO + 10 * SIZE], a2
1738	LDF	[BO + 11 * SIZE], a3
1739	LDF	[BO + 12 * SIZE], a4
1740	LDF	[BO + 13 * SIZE], b1
1741	LDF	[BO + 14 * SIZE], b2
1742	LDF	[BO + 15 * SIZE], b3
1743
1744	FMUL	a1, c03, c03
1745	FMUL	a1, c04, c04
1746
1747	FNMSUB	(aa2, cc03, cc05, cc05)
1748	FNMSUB	(aa2, cc04, cc06, cc06)
1749	FNMSUB	(aa3, cc03, cc07, cc07)
1750	FNMSUB	(aa3, cc04, cc08, cc08)
1751	FNMSUB	(aa4, cc03, cc09, cc09)
1752	FNMSUB	(aa4, cc04, cc10, cc10)
1753	FNMSUB	(bb1, cc03, cc11, cc11)
1754	FNMSUB	(bb1, cc04, cc12, cc12)
1755	FNMSUB	(bb2, cc03, cc13, cc13)
1756	FNMSUB	(bb2, cc04, cc14, cc14)
1757	FNMSUB	(bb3, cc03, cc15, cc15)
1758	FNMSUB	(bb3, cc04, cc16, cc16)
1759
1760	LDF	[BO + 18 * SIZE], a1
1761	LDF	[BO + 19 * SIZE], a2
1762	LDF	[BO + 20 * SIZE], a3
1763	LDF	[BO + 21 * SIZE], a4
1764	LDF	[BO + 22 * SIZE], b1
1765	LDF	[BO + 23 * SIZE], b2
1766
1767	FMUL	a1, c05, c05
1768	FMUL	a1, c06, c06
1769
1770	FNMSUB	(aa2, cc05, cc07, cc07)
1771	FNMSUB	(aa2, cc06, cc08, cc08)
1772	FNMSUB	(aa3, cc05, cc09, cc09)
1773	FNMSUB	(aa3, cc06, cc10, cc10)
1774	FNMSUB	(aa4, cc05, cc11, cc11)
1775	FNMSUB	(aa4, cc06, cc12, cc12)
1776	FNMSUB	(bb1, cc05, cc13, cc13)
1777	FNMSUB	(bb1, cc06, cc14, cc14)
1778	FNMSUB	(bb2, cc05, cc15, cc15)
1779	FNMSUB	(bb2, cc06, cc16, cc16)
1780
1781	LDF	[BO + 27 * SIZE], a1
1782	LDF	[BO + 28 * SIZE], a2
1783	LDF	[BO + 29 * SIZE], a3
1784	LDF	[BO + 30 * SIZE], a4
1785	LDF	[BO + 31 * SIZE], b1
1786
1787	FMUL	a1, c07, c07
1788	FMUL	a1, c08, c08
1789
1790	FNMSUB	(aa2, cc07, cc09, cc09)
1791	FNMSUB	(aa2, cc08, cc10, cc10)
1792	FNMSUB	(aa3, cc07, cc11, cc11)
1793	FNMSUB	(aa3, cc08, cc12, cc12)
1794	FNMSUB	(aa4, cc07, cc13, cc13)
1795	FNMSUB	(aa4, cc08, cc14, cc14)
1796	FNMSUB	(bb1, cc07, cc15, cc15)
1797	FNMSUB	(bb1, cc08, cc16, cc16)
1798
1799	LDF	[BO + 36 * SIZE], a1
1800	LDF	[BO + 37 * SIZE], a2
1801	LDF	[BO + 38 * SIZE], a3
1802	LDF	[BO + 39 * SIZE], a4
1803
1804	FMUL	a1, c09, c09
1805	FMUL	a1, c10, c10
1806
1807	FNMSUB	(aa2, cc09, cc11, cc11)
1808	FNMSUB	(aa2, cc10, cc12, cc12)
1809	FNMSUB	(aa3, cc09, cc13, cc13)
1810	FNMSUB	(aa3, cc10, cc14, cc14)
1811	FNMSUB	(aa4, cc09, cc15, cc15)
1812	FNMSUB	(aa4, cc10, cc16, cc16)
1813
1814	LDF	[BO + 45 * SIZE], a1
1815	LDF	[BO + 46 * SIZE], a2
1816	LDF	[BO + 47 * SIZE], a3
1817
1818	FMUL	a1, c11, c11
1819	FMUL	a1, c12, c12
1820
1821	FNMSUB	(aa2, cc11, cc13, cc13)
1822	FNMSUB	(aa2, cc12, cc14, cc14)
1823	FNMSUB	(aa3, cc11, cc15, cc15)
1824	FNMSUB	(aa3, cc12, cc16, cc16)
1825
1826	LDF	[BO + 54 * SIZE], a1
1827	LDF	[BO + 55 * SIZE], a2
1828
1829	FMUL	a1, c13, c13
1830	FMUL	a1, c14, c14
1831
1832	FNMSUB	(aa2, cc13, cc15, cc15)
1833	FNMSUB	(aa2, cc14, cc16, cc16)
1834
1835	LDF	[BO + 63 * SIZE], a1
1836
1837	FMUL	a1, c15, c15
1838	FMUL	a1, c16, c16
1839#endif
1840
1841#ifdef RT
1842	LDF	[BO + 63 * SIZE], a1
1843	LDF	[BO + 62 * SIZE], a2
1844	LDF	[BO + 61 * SIZE], a3
1845	LDF	[BO + 60 * SIZE], a4
1846	LDF	[BO + 59 * SIZE], b1
1847	LDF	[BO + 58 * SIZE], b2
1848	LDF	[BO + 57 * SIZE], b3
1849	LDF	[BO + 56 * SIZE], b4
1850
1851	FMUL	a1, c16, c16
1852	FMUL	a1, c15, c15
1853
1854	FNMSUB	(aa2, cc16, cc14, cc14)
1855	FNMSUB	(aa2, cc15, cc13, cc13)
1856	FNMSUB	(aa3, cc16, cc12, cc12)
1857	FNMSUB	(aa3, cc15, cc11, cc11)
1858	FNMSUB	(aa4, cc16, cc10, cc10)
1859	FNMSUB	(aa4, cc15, cc09, cc09)
1860	FNMSUB	(bb1, cc16, cc08, cc08)
1861	FNMSUB	(bb1, cc15, cc07, cc07)
1862	FNMSUB	(bb2, cc16, cc06, cc06)
1863	FNMSUB	(bb2, cc15, cc05, cc05)
1864	FNMSUB	(bb3, cc16, cc04, cc04)
1865	FNMSUB	(bb3, cc15, cc03, cc03)
1866	FNMSUB	(bb4, cc16, cc02, cc02)
1867	FNMSUB	(bb4, cc15, cc01, cc01)
1868
1869	LDF	[BO + 54 * SIZE], a1
1870	LDF	[BO + 53 * SIZE], a2
1871	LDF	[BO + 52 * SIZE], a3
1872	LDF	[BO + 51 * SIZE], a4
1873	LDF	[BO + 50 * SIZE], b1
1874	LDF	[BO + 49 * SIZE], b2
1875	LDF	[BO + 48 * SIZE], b3
1876
1877	FMUL	a1, c14, c14
1878	FMUL	a1, c13, c13
1879
1880	FNMSUB	(aa2, cc14, cc12, cc12)
1881	FNMSUB	(aa2, cc13, cc11, cc11)
1882	FNMSUB	(aa3, cc14, cc10, cc10)
1883	FNMSUB	(aa3, cc13, cc09, cc09)
1884	FNMSUB	(aa4, cc14, cc08, cc08)
1885	FNMSUB	(aa4, cc13, cc07, cc07)
1886	FNMSUB	(bb1, cc14, cc06, cc06)
1887	FNMSUB	(bb1, cc13, cc05, cc05)
1888	FNMSUB	(bb2, cc14, cc04, cc04)
1889	FNMSUB	(bb2, cc13, cc03, cc03)
1890	FNMSUB	(bb3, cc14, cc02, cc02)
1891	FNMSUB	(bb3, cc13, cc01, cc01)
1892
1893	LDF	[BO + 45 * SIZE], a1
1894	LDF	[BO + 44 * SIZE], a2
1895	LDF	[BO + 43 * SIZE], a3
1896	LDF	[BO + 42 * SIZE], a4
1897	LDF	[BO + 41 * SIZE], b1
1898	LDF	[BO + 40 * SIZE], b2
1899
1900	FMUL	a1, c12, c12
1901	FMUL	a1, c11, c11
1902
1903	FNMSUB	(aa2, cc12, cc10, cc10)
1904	FNMSUB	(aa2, cc11, cc09, cc09)
1905	FNMSUB	(aa3, cc12, cc08, cc08)
1906	FNMSUB	(aa3, cc11, cc07, cc07)
1907	FNMSUB	(aa4, cc12, cc06, cc06)
1908	FNMSUB	(aa4, cc11, cc05, cc05)
1909	FNMSUB	(bb1, cc12, cc04, cc04)
1910	FNMSUB	(bb1, cc11, cc03, cc03)
1911	FNMSUB	(bb2, cc12, cc02, cc02)
1912	FNMSUB	(bb2, cc11, cc01, cc01)
1913
1914	LDF	[BO + 36 * SIZE], a1
1915	LDF	[BO + 35 * SIZE], a2
1916	LDF	[BO + 34 * SIZE], a3
1917	LDF	[BO + 33 * SIZE], a4
1918	LDF	[BO + 32 * SIZE], b1
1919
1920	FMUL	a1, c10, c10
1921	FMUL	a1, c09, c09
1922
1923	FNMSUB	(aa2, cc10, cc08, cc08)
1924	FNMSUB	(aa2, cc09, cc07, cc07)
1925	FNMSUB	(aa3, cc10, cc06, cc06)
1926	FNMSUB	(aa3, cc09, cc05, cc05)
1927	FNMSUB	(aa4, cc10, cc04, cc04)
1928	FNMSUB	(aa4, cc09, cc03, cc03)
1929	FNMSUB	(bb1, cc10, cc02, cc02)
1930	FNMSUB	(bb1, cc09, cc01, cc01)
1931
1932	LDF	[BO + 27 * SIZE], a1
1933	LDF	[BO + 26 * SIZE], a2
1934	LDF	[BO + 25 * SIZE], a3
1935	LDF	[BO + 24 * SIZE], a4
1936
1937	FMUL	a1, c08, c08
1938	FMUL	a1, c07, c07
1939
1940	FNMSUB	(aa2, cc08, cc06, cc06)
1941	FNMSUB	(aa2, cc07, cc05, cc05)
1942	FNMSUB	(aa3, cc08, cc04, cc04)
1943	FNMSUB	(aa3, cc07, cc03, cc03)
1944	FNMSUB	(aa4, cc08, cc02, cc02)
1945	FNMSUB	(aa4, cc07, cc01, cc01)
1946
1947	LDF	[BO + 18 * SIZE], a1
1948	LDF	[BO + 17 * SIZE], a2
1949	LDF	[BO + 16 * SIZE], a3
1950
1951	FMUL	a1, c06, c06
1952	FMUL	a1, c05, c05
1953
1954	FNMSUB	(aa2, cc06, cc04, cc04)
1955	FNMSUB	(aa2, cc05, cc03, cc03)
1956	FNMSUB	(aa3, cc06, cc02, cc02)
1957	FNMSUB	(aa3, cc05, cc01, cc01)
1958
1959	LDF	[BO +  9 * SIZE], a1
1960	LDF	[BO +  8 * SIZE], a2
1961
1962	FMUL	a1, c04, c04
1963	FMUL	a1, c03, c03
1964
1965	FNMSUB	(aa2, cc04, cc02, cc02)
1966	FNMSUB	(aa2, cc03, cc01, cc01)
1967
1968	LDF	[BO +  0 * SIZE], a1
1969
1970	FMUL	a1, c02, c02
1971	FMUL	a1, c01, c01
1972#endif
1973
1974#ifdef LN
1975	add	C1, -2 * SIZE, C1
1976	add	C2, -2 * SIZE, C2
1977	add	C3, -2 * SIZE, C3
1978	add	C4, -2 * SIZE, C4
1979	add	C5, -2 * SIZE, C5
1980	add	C6, -2 * SIZE, C6
1981	add	C7, -2 * SIZE, C7
1982	add	C8, -2 * SIZE, C8
1983#endif
1984
1985#if defined(LN) || defined(LT)
1986	STF	c01, [BO +  0 * SIZE]
1987	STF	c03, [BO +  1 * SIZE]
1988	STF	c05, [BO +  2 * SIZE]
1989	STF	c07, [BO +  3 * SIZE]
1990
1991	STF	c09, [BO +  4 * SIZE]
1992	STF	c11, [BO +  5 * SIZE]
1993	STF	c13, [BO +  6 * SIZE]
1994	STF	c15, [BO +  7 * SIZE]
1995
1996	STF	c02, [BO +  8 * SIZE]
1997	STF	c04, [BO +  9 * SIZE]
1998	STF	c06, [BO + 10 * SIZE]
1999	STF	c08, [BO + 11 * SIZE]
2000
2001	STF	c10, [BO + 12 * SIZE]
2002	STF	c12, [BO + 13 * SIZE]
2003	STF	c14, [BO + 14 * SIZE]
2004	STF	c16, [BO + 15 * SIZE]
2005#else
2006	STF	c01, [AO +  0 * SIZE]
2007	STF	c02, [AO +  1 * SIZE]
2008	STF	c03, [AO +  2 * SIZE]
2009	STF	c04, [AO +  3 * SIZE]
2010
2011	STF	c05, [AO +  4 * SIZE]
2012	STF	c06, [AO +  5 * SIZE]
2013	STF	c07, [AO +  6 * SIZE]
2014	STF	c08, [AO +  7 * SIZE]
2015
2016	STF	c09, [AO +  8 * SIZE]
2017	STF	c10, [AO +  9 * SIZE]
2018	STF	c11, [AO + 10 * SIZE]
2019	STF	c12, [AO + 11 * SIZE]
2020
2021	STF	c13, [AO + 12 * SIZE]
2022	STF	c14, [AO + 13 * SIZE]
2023	STF	c15, [AO + 14 * SIZE]
2024	STF	c16, [AO + 15 * SIZE]
2025#endif
2026
2027	STF	c01, [C1 + 0 * SIZE]
2028	STF	c02, [C1 + 1 * SIZE]
2029	STF	c03, [C2 + 0 * SIZE]
2030	STF	c04, [C2 + 1 * SIZE]
2031
2032	STF	c05, [C3 + 0 * SIZE]
2033	STF	c06, [C3 + 1 * SIZE]
2034	STF	c07, [C4 + 0 * SIZE]
2035	STF	c08, [C4 + 1 * SIZE]
2036
2037	STF	c09, [C5 + 0 * SIZE]
2038	STF	c10, [C5 + 1 * SIZE]
2039	STF	c11, [C6 + 0 * SIZE]
2040	STF	c12, [C6 + 1 * SIZE]
2041
2042	STF	c13, [C7 + 0 * SIZE]
2043	STF	c14, [C7 + 1 * SIZE]
2044	STF	c15, [C8 + 0 * SIZE]
2045	STF	c16, [C8 + 1 * SIZE]
2046
2047#ifndef LN
2048	add	C1, 2 * SIZE, C1
2049	add	C2, 2 * SIZE, C2
2050	add	C3, 2 * SIZE, C3
2051	add	C4, 2 * SIZE, C4
2052	add	C5, 2 * SIZE, C5
2053	add	C6, 2 * SIZE, C6
2054	add	C7, 2 * SIZE, C7
2055	add	C8, 2 * SIZE, C8
2056#endif
2057
2058#ifdef RT
2059	sll	K, BASE_SHIFT + 1, TEMP1
2060	add	AORIG, TEMP1, AORIG
2061#endif
2062
2063#if defined(LT) || defined(RN)
2064	sub	K, KK, TEMP1
2065	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2066	sll	TEMP1, BASE_SHIFT + 3, TEMP1
2067	add	AO, TEMP2, AO
2068	add	BO, TEMP1, BO
2069#endif
2070
2071#ifdef LT
2072	add	KK, 2, KK
2073#endif
2074
2075#ifdef LN
2076	sub	KK, 2, KK
2077#endif
2078
2079	add	I, -1, I
2080	cmp	I, 0
2081	bg,pt	%icc, .LL12
2082	nop
2083	.align 4
2084
2085.LL29:
2086#ifdef LN
2087	sll	K, BASE_SHIFT + 3, TEMP1
2088	add	B, TEMP1, B
2089#endif
2090
2091#if defined(LT) || defined(RN)
2092	mov	BO, B
2093#endif
2094
2095#ifdef RN
2096	add	KK, 8, KK
2097#endif
2098
2099#ifdef RT
2100	sub	KK, 8, KK
2101#endif
2102
2103	add	J, -1, J
2104	cmp	J, 0
2105	bg,pt	%icc, .LL11
2106	nop
2107	.align 4
2108
2109.LL30:
2110	and	N, 4, J
2111	cmp	J, 0
2112	ble,pn	%icc, .LL50
2113	nop
2114
2115#ifdef RT
2116	sll	K, BASE_SHIFT + 2, TEMP1
2117	sub	B, TEMP1, B
2118#endif
2119
2120#ifndef RT
2121	mov	C,  C1
2122	add	C,  LDC, C2
2123	add	C2, LDC, C3
2124	add	C3, LDC, C4
2125	add	C4, LDC, C
2126#else
2127	sub	C,  LDC, C4
2128	sub	C4, LDC, C3
2129	sub	C3, LDC, C2
2130	sub	C2, LDC, C1
2131	sub	C2, LDC, C
2132#endif
2133
2134#ifdef LN
2135	add	M, OFFSET, KK
2136#endif
2137
2138#ifdef LT
2139	mov	OFFSET, KK
2140#endif
2141
2142#if defined(LN) || defined(RT)
2143	mov	A, AORIG
2144#else
2145	mov	A, AO
2146#endif
2147
2148	and	M, 1, I
2149	cmp	I, 0
2150	ble,pn	%icc, .LL40
2151	nop
2152
2153#if defined(LT) || defined(RN)
2154	mov	B, BO
2155#else
2156#ifdef LN
2157	sll	K,  BASE_SHIFT + 0, TEMP1
2158	sub	AORIG, TEMP1, AORIG
2159#endif
2160
2161	sll	KK, BASE_SHIFT + 0, TEMP1
2162	sll	KK, BASE_SHIFT + 2, TEMP2
2163
2164	add	AORIG, TEMP1, AO
2165	add	B,     TEMP2, BO
2166#endif
2167
2168	LDF	[AO +  0 * SIZE], a1
2169	LDF	[AO +  1 * SIZE], a2
2170	LDF	[AO +  2 * SIZE], a3
2171	LDF	[AO +  3 * SIZE], a4
2172
2173	LDF	[BO +  0 * SIZE], b1
2174	LDF	[BO +  1 * SIZE], b2
2175	LDF	[BO +  2 * SIZE], b3
2176	LDF	[BO +  3 * SIZE], b4
2177	LDF	[BO +  4 * SIZE], b5
2178	LDF	[BO +  5 * SIZE], b6
2179	FCLR	(cc01)
2180	LDF	[BO +  6 * SIZE], b7
2181	FCLR	(cc03)
2182	LDF	[BO +  7 * SIZE], b8
2183	FCLR	(cc05)
2184	LDF	[BO +  8 * SIZE], b9
2185	FCLR	(cc07)
2186
2187#if defined(LT) || defined(RN)
2188	sra	KK, 2, L
2189#else
2190	sub	K, KK, L
2191	sra	L,  2, L
2192#endif
2193	cmp	L,  0
2194	ble,pn	%icc, .LL45
2195	nop
2196
2197.LL43:
2198	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2199	add	L, -1, L
2200
2201	FMADD	(aa1, bb1, cc01, cc01)
2202	LDF	[BO + 16 * SIZE], b1
2203	FMADD	(aa1, bb2, cc03, cc03)
2204	LDF	[BO +  9 * SIZE], b2
2205	FMADD	(aa1, bb3, cc05, cc05)
2206	LDF	[BO + 10 * SIZE], b3
2207	FMADD	(aa1, bb4, cc07, cc07)
2208	LDF	[BO + 11 * SIZE], b4
2209
2210	LDF	[AO +  4 * SIZE], a1
2211	cmp	L, 0
2212
2213	FMADD	(aa2, bb5, cc01, cc01)
2214	LDF	[BO + 12 * SIZE], b5
2215	FMADD	(aa2, bb6, cc03, cc03)
2216	LDF	[BO + 13 * SIZE], b6
2217	FMADD	(aa2, bb7, cc05, cc05)
2218	LDF	[BO + 14 * SIZE], b7
2219	FMADD	(aa2, bb8, cc07, cc07)
2220	LDF	[BO + 15 * SIZE], b8
2221
2222	LDF	[AO +  5 * SIZE], a2
2223	add	AO,  4 * SIZE, AO
2224
2225	FMADD	(aa3, bb9, cc01, cc01)
2226	LDF	[BO + 24 * SIZE], b9
2227	FMADD	(aa3, bb2, cc03, cc03)
2228	LDF	[BO + 17 * SIZE], b2
2229	FMADD	(aa3, bb3, cc05, cc05)
2230	LDF	[BO + 18 * SIZE], b3
2231	FMADD	(aa3, bb4, cc07, cc07)
2232	LDF	[BO + 19 * SIZE], b4
2233
2234	LDF	[AO +  2 * SIZE], a3
2235	add	BO, 16 * SIZE, BO
2236
2237	FMADD	(aa4, bb5, cc01, cc01)
2238	LDF	[BO +  4 * SIZE], b5
2239	FMADD	(aa4, bb6, cc03, cc03)
2240	LDF	[BO +  5 * SIZE], b6
2241	FMADD	(aa4, bb7, cc05, cc05)
2242	LDF	[BO +  6 * SIZE], b7
2243	FMADD	(aa4, bb8, cc07, cc07)
2244	LDF	[BO +  7 * SIZE], b8
2245
2246	bg,pt	%icc, .LL43
2247	LDF	[AO +  3 * SIZE], a4
2248	.align 4
2249
2250.LL45:
2251#if defined(LT) || defined(RN)
2252	and	KK, 3, L
2253#else
2254	sub	K, KK, L
2255	and	L,  3, L
2256#endif
2257	cmp	L,  0
2258	ble,a,pn %icc, .LL48
2259	nop
2260	.align 4
2261
2262.LL47:
2263	FMADD	(aa1, bb1, cc01, cc01)
2264	LDF	[BO + 4 * SIZE], b1
2265	add	L, -1, L
2266	FMADD	(aa1, bb2, cc03, cc03)
2267	LDF	[BO + 5 * SIZE], b2
2268	add	AO, 1 * SIZE, AO
2269
2270	FMADD	(aa1, bb3, cc05, cc05)
2271	LDF	[BO + 6 * SIZE], b3
2272	cmp	L, 0
2273	FMADD	(aa1, bb4, cc07, cc07)
2274	LDF	[BO + 7 * SIZE], b4
2275	add	BO, 4 * SIZE, BO
2276
2277	bg,pt	%icc, .LL47
2278	LDF	[AO + 0 * SIZE], a1
2279	.align 4
2280
2281.LL48:
2282#if defined(LN) || defined(RT)
2283#ifdef LN
2284	sub	KK, 1, TEMP1
2285#else
2286	sub	KK, 4, TEMP1
2287#endif
2288	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2289	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2290
2291	add	AORIG, TEMP2, AO
2292	add	B,     TEMP1, BO
2293#endif
2294
2295#if defined(LN) || defined(LT)
2296	LDF	[BO +  0 * SIZE], a1
2297	LDF	[BO +  1 * SIZE], a2
2298	LDF	[BO +  2 * SIZE], a3
2299	LDF	[BO +  3 * SIZE], a4
2300
2301	FSUB	a1, c01, c01
2302	FSUB	a2, c03, c03
2303	FSUB	a3, c05, c05
2304	FSUB	a4, c07, c07
2305#else
2306	LDF	[AO +  0 * SIZE], a1
2307	LDF	[AO +  1 * SIZE], a2
2308	LDF	[AO +  2 * SIZE], a3
2309	LDF	[AO +  3 * SIZE], a4
2310
2311	FSUB	a1, c01, c01
2312	FSUB	a2, c03, c03
2313	FSUB	a3, c05, c05
2314	FSUB	a4, c07, c07
2315#endif
2316
2317#if defined(LN) || defined(LT)
2318	LDF	[AO +  0 * SIZE], a1
2319
2320	FMUL	a1, c01, c01
2321	FMUL	a1, c03, c03
2322	FMUL	a1, c05, c05
2323	FMUL	a1, c07, c07
2324#endif
2325
2326#ifdef RN
2327	LDF	[BO +  0 * SIZE], a1
2328	LDF	[BO +  1 * SIZE], a2
2329	LDF	[BO +  2 * SIZE], a3
2330	LDF	[BO +  3 * SIZE], a4
2331
2332	FMUL	a1, c01, c01
2333
2334	FNMSUB	(aa2, cc01, cc03, cc03)
2335	FNMSUB	(aa3, cc01, cc05, cc05)
2336	FNMSUB	(aa4, cc01, cc07, cc07)
2337
2338	LDF	[BO +  5 * SIZE], a1
2339	LDF	[BO +  6 * SIZE], a2
2340	LDF	[BO +  7 * SIZE], a3
2341
2342	FMUL	a1, c03, c03
2343
2344	FNMSUB	(aa2, cc03, cc05, cc05)
2345	FNMSUB	(aa3, cc03, cc07, cc07)
2346
2347	LDF	[BO + 10 * SIZE], a1
2348	LDF	[BO + 11 * SIZE], a2
2349
2350	FMUL	a1, c05, c05
2351
2352	FNMSUB	(aa2, cc05, cc07, cc07)
2353
2354	LDF	[BO + 15 * SIZE], a1
2355
2356	FMUL	a1, c07, c07
2357#endif
2358
2359#ifdef RT
2360	LDF	[BO + 15 * SIZE], a1
2361	LDF	[BO + 14 * SIZE], a2
2362	LDF	[BO + 13 * SIZE], a3
2363	LDF	[BO + 12 * SIZE], a4
2364
2365	FMUL	a1, c07, c07
2366
2367	FNMSUB	(aa2, cc07, cc05, cc05)
2368	FNMSUB	(aa3, cc07, cc03, cc03)
2369	FNMSUB	(aa4, cc07, cc01, cc01)
2370
2371	LDF	[BO + 10 * SIZE], a1
2372	LDF	[BO +  9 * SIZE], a2
2373	LDF	[BO +  8 * SIZE], a3
2374
2375	FMUL	a1, c05, c05
2376
2377	FNMSUB	(aa2, cc05, cc03, cc03)
2378	FNMSUB	(aa3, cc05, cc01, cc01)
2379
2380	LDF	[BO +  5 * SIZE], a1
2381	LDF	[BO +  4 * SIZE], a2
2382
2383	FMUL	a1, c03, c03
2384
2385	FNMSUB	(aa2, cc03, cc01, cc01)
2386
2387	LDF	[BO +  0 * SIZE], a1
2388
2389	FMUL	a1, c01, c01
2390#endif
2391
2392#ifdef LN
2393	add	C1, -1 * SIZE, C1
2394	add	C2, -1 * SIZE, C2
2395	add	C3, -1 * SIZE, C3
2396	add	C4, -1 * SIZE, C4
2397#endif
2398
2399#if defined(LN) || defined(LT)
2400	STF	c01, [BO +  0 * SIZE]
2401	STF	c03, [BO +  1 * SIZE]
2402	STF	c05, [BO +  2 * SIZE]
2403	STF	c07, [BO +  3 * SIZE]
2404#else
2405	STF	c01, [AO +  0 * SIZE]
2406	STF	c03, [AO +  1 * SIZE]
2407	STF	c05, [AO +  2 * SIZE]
2408	STF	c07, [AO +  3 * SIZE]
2409#endif
2410
2411	STF	c01, [C1 + 0 * SIZE]
2412	STF	c03, [C2 + 0 * SIZE]
2413	STF	c05, [C3 + 0 * SIZE]
2414	STF	c07, [C4 + 0 * SIZE]
2415
2416#ifdef RT
2417	sll	K, BASE_SHIFT + 0, TEMP1
2418	add	AORIG, TEMP1, AORIG
2419#endif
2420
2421#if defined(LT) || defined(RN)
2422	sub	K, KK, TEMP1
2423	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2424	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2425	add	AO, TEMP2, AO
2426	add	BO, TEMP1, BO
2427#endif
2428
2429#ifdef LT
2430	add	KK, 1, KK
2431#endif
2432
2433#ifdef LN
2434	sub	KK, 1, KK
2435#endif
2436	.align 4
2437
2438.LL40:
2439	sra	M, 1, I
2440	cmp	I, 0
2441	ble,pn	%icc, .LL49
2442	nop
2443	.align 4
2444
2445.LL32:
2446#if defined(LT) || defined(RN)
2447	mov	B, BO
2448#else
2449#ifdef LN
2450	sll	K,  BASE_SHIFT + 1, TEMP1
2451	sub	AORIG, TEMP1, AORIG
2452#endif
2453
2454	sll	KK, BASE_SHIFT + 1, TEMP1
2455	sll	KK, BASE_SHIFT + 2, TEMP2
2456
2457	add	AORIG, TEMP1, AO
2458	add	B,     TEMP2, BO
2459#endif
2460
2461	LDF	[AO +  0 * SIZE], a1
2462	LDF	[AO +  1 * SIZE], a2
2463
2464	LDF	[BO +  0 * SIZE], b1
2465	LDF	[BO +  1 * SIZE], b2
2466	LDF	[BO +  2 * SIZE], b3
2467	LDF	[BO +  3 * SIZE], b4
2468	LDF	[BO +  4 * SIZE], b5
2469
2470	LDF	[BO +  5 * SIZE], b6
2471	FCLR	(cc01)
2472	LDF	[BO +  6 * SIZE], b7
2473	FCLR	(cc02)
2474	LDF	[BO +  7 * SIZE], b8
2475	FCLR	(cc03)
2476	LDF	[BO +  8 * SIZE], b9
2477	FCLR	(cc04)
2478
2479	prefetch [C1 + 2 * SIZE], 3
2480	FCLR	(cc05)
2481	prefetch [C2 + 2 * SIZE], 3
2482	FCLR	(cc06)
2483	prefetch [C3 + 2 * SIZE], 3
2484	FCLR	(cc07)
2485	prefetch [C4 + 2 * SIZE], 3
2486	FCLR	(cc08)
2487
2488#if defined(LT) || defined(RN)
2489	sra	KK, 2, L
2490#else
2491	sub	K, KK, L
2492	sra	L,  2, L
2493#endif
2494	cmp	L,  0
2495	ble,pn	%icc, .LL35
2496	nop
2497	.align 4
2498
2499.LL33:
2500	FMADD	(aa1, bb1, cc01, cc01)
2501	LDF	[AO +  2 * SIZE], a3
2502	FMADD	(aa2, bb1, cc02, cc02)
2503	LDF	[AO +  3 * SIZE], a4
2504
2505	FMADD	(aa1, bb2, cc03, cc03)
2506	LDF	[BO + 16 * SIZE], b1
2507	FMADD	(aa2, bb2, cc04, cc04)
2508	LDF	[BO +  9 * SIZE], b2
2509
2510	FMADD	(aa1, bb3, cc05, cc05)
2511	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2512	FMADD	(aa2, bb3, cc06, cc06)
2513	add	L, -1, L
2514
2515	FMADD	(aa1, bb4, cc07, cc07)
2516	LDF	[BO + 10 * SIZE], b3
2517	FMADD	(aa2, bb4, cc08, cc08)
2518	LDF	[BO + 11 * SIZE], b4
2519
2520	FMADD	(aa3, bb5, cc01, cc01)
2521	LDF	[AO +  4 * SIZE], a1
2522	FMADD	(aa4, bb5, cc02, cc02)
2523	LDF	[AO +  5 * SIZE], a2
2524
2525	FMADD	(aa3, bb6, cc03, cc03)
2526	LDF	[BO + 12 * SIZE], b5
2527	FMADD	(aa4, bb6, cc04, cc04)
2528	LDF	[BO + 13 * SIZE], b6
2529
2530	FMADD	(aa3, bb7, cc05, cc05)
2531	cmp	L, 0
2532	FMADD	(aa4, bb7, cc06, cc06)
2533	add	AO,  8 * SIZE, AO
2534
2535	FMADD	(aa3, bb8, cc07, cc07)
2536	LDF	[BO + 14 * SIZE], b7
2537	FMADD	(aa4, bb8, cc08, cc08)
2538	LDF	[BO + 15 * SIZE], b8
2539
2540	FMADD	(aa1, bb9, cc01, cc01)
2541	LDF	[AO -  2 * SIZE], a3
2542	FMADD	(aa2, bb9, cc02, cc02)
2543	LDF	[AO -  1 * SIZE], a4
2544
2545	FMADD	(aa1, bb2, cc03, cc03)
2546	LDF	[BO + 24 * SIZE], b9
2547	FMADD	(aa2, bb2, cc04, cc04)
2548	LDF	[BO + 17 * SIZE], b2
2549
2550	FMADD	(aa1, bb3, cc05, cc05)
2551	add	BO, 16 * SIZE, BO
2552	FMADD	(aa2, bb3, cc06, cc06)
2553	nop
2554
2555	FMADD	(aa1, bb4, cc07, cc07)
2556	LDF	[BO +  2 * SIZE], b3
2557	FMADD	(aa2, bb4, cc08, cc08)
2558	LDF	[BO +  3 * SIZE], b4
2559
2560	FMADD	(aa3, bb5, cc01, cc01)
2561	LDF	[AO +  0 * SIZE], a1
2562	FMADD	(aa4, bb5, cc02, cc02)
2563	LDF	[AO +  1 * SIZE], a2
2564	FMADD	(aa3, bb6, cc03, cc03)
2565	LDF	[BO +  4 * SIZE], b5
2566	FMADD	(aa4, bb6, cc04, cc04)
2567	LDF	[BO +  5 * SIZE], b6
2568
2569	FMADD	(aa3, bb7, cc05, cc05)
2570	nop
2571	FMADD	(aa4, bb7, cc06, cc06)
2572	LDF	[BO +  6 * SIZE], b7
2573
2574	FMADD	(aa3, bb8, cc07, cc07)
2575	FMADD	(aa4, bb8, cc08, cc08)
2576	bg,pt	%icc, .LL33
2577	LDF	[BO +  7 * SIZE], b8
2578	.align 4
2579
2580.LL35:
2581#if defined(LT) || defined(RN)
2582	and	KK, 3, L
2583#else
2584	sub	K, KK, L
2585	and	L,  3, L
2586#endif
2587	cmp	L,  0
2588	ble,a,pn %icc, .LL38
2589	nop
2590	.align 4
2591
2592.LL37:
2593	FMADD	(aa1, bb1, cc01, cc01)
2594	add	L, -1, L
2595	FMADD	(aa2, bb1, cc02, cc02)
2596	LDF	[BO + 4 * SIZE], b1
2597
2598	FMADD	(aa1, bb2, cc03, cc03)
2599	add	AO, 2 * SIZE, AO
2600	FMADD	(aa2, bb2, cc04, cc04)
2601	LDF	[BO + 5 * SIZE], b2
2602
2603	FMADD	(aa1, bb3, cc05, cc05)
2604	cmp	L, 0
2605	FMADD	(aa2, bb3, cc06, cc06)
2606	LDF	[BO + 6 * SIZE], b3
2607
2608	FMADD	(aa1, bb4, cc07, cc07)
2609	LDF	[AO + 0 * SIZE], a1
2610	FMADD	(aa2, bb4, cc08, cc08)
2611	LDF	[AO + 1 * SIZE], a2
2612
2613	LDF	[BO + 7 * SIZE], b4
2614	bg,pt	%icc, .LL37
2615	add	BO, 4 * SIZE, BO
2616	.align 4
2617
2618.LL38:
2619#if defined(LN) || defined(RT)
2620#ifdef LN
2621	sub	KK, 2, TEMP1
2622#else
2623	sub	KK, 4, TEMP1
2624#endif
2625	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2626	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2627
2628	add	AORIG, TEMP2, AO
2629	add	B,     TEMP1, BO
2630#endif
2631
2632#if defined(LN) || defined(LT)
2633	LDF	[BO +  0 * SIZE], a1
2634	LDF	[BO +  1 * SIZE], a2
2635	LDF	[BO +  2 * SIZE], a3
2636	LDF	[BO +  3 * SIZE], a4
2637
2638	LDF	[BO +  4 * SIZE], b1
2639	LDF	[BO +  5 * SIZE], b2
2640	LDF	[BO +  6 * SIZE], b3
2641	LDF	[BO +  7 * SIZE], b4
2642
2643	FSUB	a1, c01, c01
2644	FSUB	a2, c03, c03
2645	FSUB	a3, c05, c05
2646	FSUB	a4, c07, c07
2647
2648	FSUB	b1, c02, c02
2649	FSUB	b2, c04, c04
2650	FSUB	b3, c06, c06
2651	FSUB	b4, c08, c08
2652#else
2653	LDF	[AO +  0 * SIZE], a1
2654	LDF	[AO +  1 * SIZE], a2
2655	LDF	[AO +  2 * SIZE], a3
2656	LDF	[AO +  3 * SIZE], a4
2657
2658	LDF	[AO +  4 * SIZE], b1
2659	LDF	[AO +  5 * SIZE], b2
2660	LDF	[AO +  6 * SIZE], b3
2661	LDF	[AO +  7 * SIZE], b4
2662
2663	FSUB	a1, c01, c01
2664	FSUB	a2, c02, c02
2665	FSUB	a3, c03, c03
2666	FSUB	a4, c04, c04
2667
2668	FSUB	b1, c05, c05
2669	FSUB	b2, c06, c06
2670	FSUB	b3, c07, c07
2671	FSUB	b4, c08, c08
2672
2673#endif
2674
2675#ifdef LN
2676	LDF	[AO +  3 * SIZE], a1
2677	LDF	[AO +  2 * SIZE], a2
2678	LDF	[AO +  0 * SIZE], a3
2679
2680	FMUL	a1, c02, c02
2681	FMUL	a1, c04, c04
2682	FMUL	a1, c06, c06
2683	FMUL	a1, c08, c08
2684
2685	FNMSUB	(aa2, cc02, cc01, cc01)
2686	FNMSUB	(aa2, cc04, cc03, cc03)
2687	FNMSUB	(aa2, cc06, cc05, cc05)
2688	FNMSUB	(aa2, cc08, cc07, cc07)
2689
2690	FMUL	a3, c01, c01
2691	FMUL	a3, c03, c03
2692	FMUL	a3, c05, c05
2693	FMUL	a3, c07, c07
2694#endif
2695
2696#ifdef LT
2697	LDF	[AO +  0 * SIZE], a1
2698	LDF	[AO +  1 * SIZE], a2
2699	LDF	[AO +  3 * SIZE], a3
2700
2701	FMUL	a1, c01, c01
2702	FMUL	a1, c03, c03
2703	FMUL	a1, c05, c05
2704	FMUL	a1, c07, c07
2705
2706	FNMSUB	(aa2, cc01, cc02, cc02)
2707	FNMSUB	(aa2, cc03, cc04, cc04)
2708	FNMSUB	(aa2, cc05, cc06, cc06)
2709	FNMSUB	(aa2, cc07, cc08, cc08)
2710
2711	FMUL	a3, c02, c02
2712	FMUL	a3, c04, c04
2713	FMUL	a3, c06, c06
2714	FMUL	a3, c08, c08
2715#endif
2716
2717#ifdef RN
2718	LDF	[BO +  0 * SIZE], a1
2719	LDF	[BO +  1 * SIZE], a2
2720	LDF	[BO +  2 * SIZE], a3
2721	LDF	[BO +  3 * SIZE], a4
2722
2723	FMUL	a1, c01, c01
2724	FMUL	a1, c02, c02
2725
2726	FNMSUB	(aa2, cc01, cc03, cc03)
2727	FNMSUB	(aa2, cc02, cc04, cc04)
2728	FNMSUB	(aa3, cc01, cc05, cc05)
2729	FNMSUB	(aa3, cc02, cc06, cc06)
2730	FNMSUB	(aa4, cc01, cc07, cc07)
2731	FNMSUB	(aa4, cc02, cc08, cc08)
2732
2733	LDF	[BO +  5 * SIZE], a1
2734	LDF	[BO +  6 * SIZE], a2
2735	LDF	[BO +  7 * SIZE], a3
2736
2737	FMUL	a1, c03, c03
2738	FMUL	a1, c04, c04
2739
2740	FNMSUB	(aa2, cc03, cc05, cc05)
2741	FNMSUB	(aa2, cc04, cc06, cc06)
2742	FNMSUB	(aa3, cc03, cc07, cc07)
2743	FNMSUB	(aa3, cc04, cc08, cc08)
2744
2745	LDF	[BO + 10 * SIZE], a1
2746	LDF	[BO + 11 * SIZE], a2
2747
2748	FMUL	a1, c05, c05
2749	FMUL	a1, c06, c06
2750
2751	FNMSUB	(aa2, cc05, cc07, cc07)
2752	FNMSUB	(aa2, cc06, cc08, cc08)
2753
2754	LDF	[BO + 15 * SIZE], a1
2755
2756	FMUL	a1, c07, c07
2757	FMUL	a1, c08, c08
2758#endif
2759
2760#ifdef RT
2761	LDF	[BO + 15 * SIZE], a1
2762	LDF	[BO + 14 * SIZE], a2
2763	LDF	[BO + 13 * SIZE], a3
2764	LDF	[BO + 12 * SIZE], a4
2765
2766	FMUL	a1, c08, c08
2767	FMUL	a1, c07, c07
2768
2769	FNMSUB	(aa2, cc08, cc06, cc06)
2770	FNMSUB	(aa2, cc07, cc05, cc05)
2771	FNMSUB	(aa3, cc08, cc04, cc04)
2772	FNMSUB	(aa3, cc07, cc03, cc03)
2773	FNMSUB	(aa4, cc08, cc02, cc02)
2774	FNMSUB	(aa4, cc07, cc01, cc01)
2775
2776	LDF	[BO + 10 * SIZE], a1
2777	LDF	[BO +  9 * SIZE], a2
2778	LDF	[BO +  8 * SIZE], a3
2779
2780	FMUL	a1, c06, c06
2781	FMUL	a1, c05, c05
2782
2783	FNMSUB	(aa2, cc06, cc04, cc04)
2784	FNMSUB	(aa2, cc05, cc03, cc03)
2785	FNMSUB	(aa3, cc06, cc02, cc02)
2786	FNMSUB	(aa3, cc05, cc01, cc01)
2787
2788	LDF	[BO +  5 * SIZE], a1
2789	LDF	[BO +  4 * SIZE], a2
2790
2791	FMUL	a1, c04, c04
2792	FMUL	a1, c03, c03
2793
2794	FNMSUB	(aa2, cc04, cc02, cc02)
2795	FNMSUB	(aa2, cc03, cc01, cc01)
2796
2797	LDF	[BO +  0 * SIZE], a1
2798
2799	FMUL	a1, c02, c02
2800	FMUL	a1, c01, c01
2801#endif
2802
2803#ifdef LN
2804	add	C1, -2 * SIZE, C1
2805	add	C2, -2 * SIZE, C2
2806	add	C3, -2 * SIZE, C3
2807	add	C4, -2 * SIZE, C4
2808#endif
2809
2810#if defined(LN) || defined(LT)
2811	STF	c01, [BO +  0 * SIZE]
2812	STF	c03, [BO +  1 * SIZE]
2813	STF	c05, [BO +  2 * SIZE]
2814	STF	c07, [BO +  3 * SIZE]
2815
2816	STF	c02, [BO +  4 * SIZE]
2817	STF	c04, [BO +  5 * SIZE]
2818	STF	c06, [BO +  6 * SIZE]
2819	STF	c08, [BO +  7 * SIZE]
2820#else
2821	STF	c01, [AO +  0 * SIZE]
2822	STF	c02, [AO +  1 * SIZE]
2823	STF	c03, [AO +  2 * SIZE]
2824	STF	c04, [AO +  3 * SIZE]
2825
2826	STF	c05, [AO +  4 * SIZE]
2827	STF	c06, [AO +  5 * SIZE]
2828	STF	c07, [AO +  6 * SIZE]
2829	STF	c08, [AO +  7 * SIZE]
2830#endif
2831
2832	STF	c01, [C1 + 0 * SIZE]
2833	STF	c02, [C1 + 1 * SIZE]
2834	STF	c03, [C2 + 0 * SIZE]
2835	STF	c04, [C2 + 1 * SIZE]
2836
2837	STF	c05, [C3 + 0 * SIZE]
2838	STF	c06, [C3 + 1 * SIZE]
2839	STF	c07, [C4 + 0 * SIZE]
2840	STF	c08, [C4 + 1 * SIZE]
2841
2842#ifndef LN
2843	add	C1, 2 * SIZE, C1
2844	add	C2, 2 * SIZE, C2
2845	add	C3, 2 * SIZE, C3
2846	add	C4, 2 * SIZE, C4
2847#endif
2848
2849#ifdef RT
2850	sll	K, BASE_SHIFT + 1, TEMP1
2851	add	AORIG, TEMP1, AORIG
2852#endif
2853
2854#if defined(LT) || defined(RN)
2855	sub	K, KK, TEMP1
2856	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2857	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2858	add	AO, TEMP2, AO
2859	add	BO, TEMP1, BO
2860#endif
2861
2862#ifdef LT
2863	add	KK, 2, KK
2864#endif
2865
2866#ifdef LN
2867	sub	KK, 2, KK
2868#endif
2869
2870	add	I, -1, I
2871	cmp	I, 0
2872	bg,pt	%icc, .LL32
2873	nop
2874
2875.LL49:
2876#ifdef LN
2877	sll	K, BASE_SHIFT + 2, TEMP1
2878	add	B, TEMP1, B
2879#endif
2880
2881#if defined(LT) || defined(RN)
2882	mov	BO, B
2883#endif
2884
2885#ifdef RN
2886	add	KK, 4, KK
2887#endif
2888
2889#ifdef RT
2890	sub	KK, 4, KK
2891#endif
2892	.align 4
2893
2894.LL50:
2895	and	N, 2, J
2896	cmp	J, 0
2897	ble,pn	%icc, .LL70
2898	nop
2899
2900#ifdef RT
2901	sll	K, BASE_SHIFT + 1, TEMP1
2902	sub	B, TEMP1, B
2903#endif
2904
2905#ifndef RT
2906	mov	C,  C1
2907	add	C,  LDC, C2
2908	add	C2, LDC, C
2909#else
2910	sub	C,  LDC, C2
2911	sub	C2, LDC, C1
2912	sub	C2, LDC, C
2913#endif
2914
2915#ifdef LN
2916	add	M, OFFSET, KK
2917#endif
2918
2919#ifdef LT
2920	mov	OFFSET, KK
2921#endif
2922
2923#if defined(LN) || defined(RT)
2924	mov	A, AORIG
2925#else
2926	mov	A, AO
2927#endif
2928
2929	and	M, 1, I
2930	cmp	I, 0
2931	ble,pn	%icc, .LL60
2932	nop
2933
2934#if defined(LT) || defined(RN)
2935	mov	B, BO
2936#else
2937#ifdef LN
2938	sll	K,  BASE_SHIFT + 0, TEMP1
2939	sub	AORIG, TEMP1, AORIG
2940#endif
2941
2942	sll	KK, BASE_SHIFT + 0, TEMP1
2943	sll	KK, BASE_SHIFT + 1, TEMP2
2944
2945	add	AORIG, TEMP1, AO
2946	add	B,     TEMP2, BO
2947#endif
2948
2949	LDF	[AO +  0 * SIZE], a1
2950	LDF	[AO +  1 * SIZE], a2
2951	LDF	[AO +  2 * SIZE], a3
2952	LDF	[AO +  3 * SIZE], a4
2953
2954	LDF	[BO +  0 * SIZE], b1
2955	LDF	[BO +  1 * SIZE], b2
2956	LDF	[BO +  2 * SIZE], b3
2957	LDF	[BO +  3 * SIZE], b4
2958	LDF	[BO +  4 * SIZE], b5
2959	LDF	[BO +  5 * SIZE], b6
2960	LDF	[BO +  6 * SIZE], b7
2961	FCLR	(cc01)
2962	LDF	[BO +  7 * SIZE], b8
2963	FCLR	(cc03)
2964
2965#if defined(LT) || defined(RN)
2966	sra	KK, 2, L
2967#else
2968	sub	K, KK, L
2969	sra	L,  2, L
2970#endif
2971	cmp	L,  0
2972	ble,pn	%icc, .LL65
2973	nop
2974	.align 4
2975
2976.LL63:
2977	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2978	add	L, -1, L
2979
2980	FMADD	(aa1, bb1, cc01, cc01)
2981	LDF	[BO +  8 * SIZE], b1
2982	FMADD	(aa1, bb2, cc03, cc03)
2983	LDF	[BO +  9 * SIZE], b2
2984
2985	LDF	[AO +  4 * SIZE], a1
2986	cmp	L, 0
2987
2988	FMADD	(aa2, bb3, cc01, cc01)
2989	LDF	[BO + 10 * SIZE], b3
2990	FMADD	(aa2, bb4, cc03, cc03)
2991	LDF	[BO + 11 * SIZE], b4
2992
2993	LDF	[AO +  5 * SIZE], a2
2994	add	AO,  4 * SIZE, AO
2995
2996	FMADD	(aa3, bb5, cc01, cc01)
2997	LDF	[BO + 12 * SIZE], b5
2998	FMADD	(aa3, bb6, cc03, cc03)
2999	LDF	[BO + 13 * SIZE], b6
3000
3001	LDF	[AO +  2 * SIZE], a3
3002	add	BO,  8 * SIZE, BO
3003
3004	FMADD	(aa4, bb7, cc01, cc01)
3005	LDF	[BO +  6 * SIZE], b7
3006	FMADD	(aa4, bb8, cc03, cc03)
3007	LDF	[BO + 7 * SIZE], b8
3008
3009	bg,pt	%icc, .LL63
3010	LDF	[AO +  3 * SIZE], a4
3011	.align 4
3012
3013.LL65:
3014#if defined(LT) || defined(RN)
3015	and	KK, 3, L
3016#else
3017	sub	K, KK, L
3018	and	L,  3, L
3019#endif
3020	cmp	L,  0
3021	ble,a,pn %icc, .LL68
3022	nop
3023	.align 4
3024
3025.LL67:
3026	FMADD	(aa1, bb1, cc01, cc01)
3027	LDF	[BO + 2 * SIZE], b1
3028	FMADD	(aa1, bb2, cc03, cc03)
3029	LDF	[BO + 3 * SIZE], b2
3030
3031	LDF	[AO + 1 * SIZE], a1
3032	add	L, -1, L
3033	add	AO, 1 * SIZE, AO
3034	cmp	L, 0
3035
3036	bg,pt	%icc, .LL67
3037	add	BO, 2 * SIZE, BO
3038	.align 4
3039
3040.LL68:
3041#if defined(LN) || defined(RT)
3042#ifdef LN
3043	sub	KK, 1, TEMP1
3044#else
3045	sub	KK, 2, TEMP1
3046#endif
3047	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3048	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3049
3050	add	AORIG, TEMP2, AO
3051	add	B,     TEMP1, BO
3052#endif
3053
3054#if defined(LN) || defined(LT)
3055	LDF	[BO +  0 * SIZE], a1
3056	LDF	[BO +  1 * SIZE], a2
3057
3058	FSUB	a1, c01, c01
3059	FSUB	a2, c03, c03
3060#else
3061	LDF	[AO +  0 * SIZE], a1
3062	LDF	[AO +  1 * SIZE], a2
3063
3064	FSUB	a1, c01, c01
3065	FSUB	a2, c03, c03
3066#endif
3067
3068#if defined(LN) || defined(LT)
3069	LDF	[AO +  0 * SIZE], a1
3070
3071	FMUL	a1, c01, c01
3072	FMUL	a1, c03, c03
3073#endif
3074
3075#ifdef RN
3076	LDF	[BO +  0 * SIZE], a1
3077	LDF	[BO +  1 * SIZE], a2
3078
3079	FMUL	a1, c01, c01
3080
3081	FNMSUB	(aa2, cc01, cc03, cc03)
3082
3083	LDF	[BO +  3 * SIZE], a1
3084
3085	FMUL	a1, c03, c03
3086#endif
3087
3088#ifdef RT
3089	LDF	[BO +  3 * SIZE], a1
3090	LDF	[BO +  2 * SIZE], a2
3091
3092	FMUL	a1, c03, c03
3093
3094	FNMSUB	(aa2, cc03, cc01, cc01)
3095
3096	LDF	[BO +  0 * SIZE], a1
3097
3098	FMUL	a1, c01, c01
3099#endif
3100
3101#ifdef LN
3102	add	C1, -1 * SIZE, C1
3103	add	C2, -1 * SIZE, C2
3104#endif
3105
3106#if defined(LN) || defined(LT)
3107	STF	c01, [BO +  0 * SIZE]
3108	STF	c03, [BO +  1 * SIZE]
3109#else
3110	STF	c01, [AO +  0 * SIZE]
3111	STF	c03, [AO +  1 * SIZE]
3112#endif
3113
3114	STF	c01, [C1 + 0 * SIZE]
3115	STF	c03, [C2 + 0 * SIZE]
3116
3117#ifdef RT
3118	sll	K, BASE_SHIFT + 0, TEMP1
3119	add	AORIG, TEMP1, AORIG
3120#endif
3121
3122#if defined(LT) || defined(RN)
3123	sub	K, KK, TEMP1
3124	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3125	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3126	add	AO, TEMP2, AO
3127	add	BO, TEMP1, BO
3128#endif
3129
3130#ifdef LT
3131	add	KK, 1, KK
3132#endif
3133
3134#ifdef LN
3135	sub	KK, 1, KK
3136#endif
3137	.align 4
3138
3139.LL60:
3140	sra	M, 1, I
3141	cmp	I, 0
3142	ble,pn	%icc, .LL69
3143	nop
3144	.align 4
3145
3146.LL52:
3147#if defined(LT) || defined(RN)
3148	mov	B, BO
3149#else
3150#ifdef LN
3151	sll	K,  BASE_SHIFT + 1, TEMP1
3152	sub	AORIG, TEMP1, AORIG
3153#endif
3154
3155	sll	KK, BASE_SHIFT + 1, TEMP1
3156	sll	KK, BASE_SHIFT + 1, TEMP2
3157
3158	add	AORIG, TEMP1, AO
3159	add	B,     TEMP2, BO
3160#endif
3161
3162	LDF	[AO +  0 * SIZE], a1
3163	LDF	[AO +  1 * SIZE], a2
3164	LDF	[AO +  2 * SIZE], a3
3165	LDF	[AO +  3 * SIZE], a4
3166
3167	LDF	[BO +  0 * SIZE], b1
3168	LDF	[BO +  1 * SIZE], b2
3169	LDF	[BO +  2 * SIZE], b3
3170	FCLR	(cc01)
3171	LDF	[BO +  3 * SIZE], b4
3172	FCLR	(cc02)
3173
3174	LDF	[BO +  4 * SIZE], b5
3175	FCLR	(cc03)
3176	LDF	[BO +  5 * SIZE], b6
3177	FCLR	(cc04)
3178	LDF	[BO +  6 * SIZE], b7
3179	FCLR	(cc05)
3180	LDF	[BO +  7 * SIZE], b8
3181	FCLR	(cc06)
3182
3183	prefetch [C1 + 2 * SIZE], 3
3184	FCLR	(cc07)
3185	prefetch [C2 + 2 * SIZE], 3
3186	FCLR	(cc08)
3187
3188#if defined(LT) || defined(RN)
3189	sra	KK, 2, L
3190#else
3191	sub	K, KK, L
3192	sra	L,  2, L
3193#endif
3194	cmp	L,  0
3195	ble,pn	%icc, .LL55
3196	nop
3197	.align 4
3198
3199.LL53:
3200	FMADD	(aa1, bb1, cc01, cc01)
3201	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3202	FMADD	(aa2, bb1, cc02, cc02)
3203	LDF	[BO +  8 * SIZE], b1
3204
3205	FMADD	(aa1, bb2, cc03, cc03)
3206	LDF	[AO +  4 * SIZE], a1
3207	FMADD	(aa2, bb2, cc04, cc04)
3208	LDF	[AO +  5 * SIZE], a2
3209
3210	FMADD	(aa3, bb3, cc01, cc01)
3211	LDF	[BO +  9 * SIZE], b2
3212	FMADD	(aa4, bb3, cc02, cc02)
3213	LDF	[BO + 10 * SIZE], b3
3214
3215	FMADD	(aa3, bb4, cc03, cc03)
3216	LDF	[AO +  6 * SIZE], a3
3217	FMADD	(aa4, bb4, cc04, cc04)
3218	LDF	[AO +  7 * SIZE], a4
3219
3220	FMADD	(aa1, bb5, cc01, cc01)
3221	LDF	[BO + 11 * SIZE], b4
3222	FMADD	(aa2, bb5, cc02, cc02)
3223	LDF	[BO + 12 * SIZE], b5
3224
3225	FMADD	(aa1, bb6, cc03, cc03)
3226	LDF	[AO +  8 * SIZE], a1
3227	FMADD	(aa2, bb6, cc04, cc04)
3228	LDF	[AO +  9 * SIZE], a2
3229
3230	FMADD	(aa3, bb7, cc01, cc01)
3231	LDF	[BO + 13 * SIZE], b6
3232
3233	FMADD	(aa4, bb7, cc02, cc02)
3234	LDF	[BO + 14 * SIZE], b7
3235
3236	FMADD	(aa3, bb8, cc03, cc03)
3237	LDF	[AO + 10 * SIZE], a3
3238	FMADD	(aa4, bb8, cc04, cc04)
3239	LDF	[AO + 11 * SIZE], a4
3240
3241	add	AO,  8 * SIZE, AO
3242	add	L, -1, L
3243	add	BO,  8 * SIZE, BO
3244	cmp	L, 0
3245
3246	bg,pt	%icc, .LL53
3247	LDF	[BO +  7 * SIZE], b8
3248	.align 4
3249
3250.LL55:
3251#if defined(LT) || defined(RN)
3252	and	KK, 3, L
3253#else
3254	sub	K, KK, L
3255	and	L,  3, L
3256#endif
3257	cmp	L,  0
3258	ble,a,pn %icc, .LL58
3259	nop
3260	.align 4
3261
3262.LL57:
3263	FMADD	(aa1, bb1, cc01, cc01)
3264	add	L, -1, L
3265	FMADD	(aa2, bb1, cc02, cc02)
3266	LDF	[BO + 2 * SIZE], b1
3267
3268	FMADD	(aa1, bb2, cc03, cc03)
3269	LDF	[AO + 2 * SIZE], a1
3270	FMADD	(aa2, bb2, cc04, cc04)
3271	LDF	[AO + 3 * SIZE], a2
3272
3273	add	AO, 2 * SIZE, AO
3274	cmp	L, 0
3275	add	BO, 2 * SIZE, BO
3276	bg,pt	%icc, .LL57
3277	LDF	[BO + 1 * SIZE], b2
3278	.align 4
3279
3280.LL58:
3281#if defined(LN) || defined(RT)
3282#ifdef LN
3283	sub	KK, 2, TEMP1
3284#else
3285	sub	KK, 2, TEMP1
3286#endif
3287	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3288	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3289
3290	add	AORIG, TEMP2, AO
3291	add	B,     TEMP1, BO
3292#endif
3293
3294#if defined(LN) || defined(LT)
3295	LDF	[BO +  0 * SIZE], a1
3296	LDF	[BO +  1 * SIZE], a2
3297	LDF	[BO +  2 * SIZE], a3
3298	LDF	[BO +  3 * SIZE], a4
3299
3300	FSUB	a1, c01, c01
3301	FSUB	a2, c03, c03
3302	FSUB	a3, c02, c02
3303	FSUB	a4, c04, c04
3304#else
3305	LDF	[AO +  0 * SIZE], a1
3306	LDF	[AO +  1 * SIZE], a2
3307	LDF	[AO +  2 * SIZE], a3
3308	LDF	[AO +  3 * SIZE], a4
3309
3310	FSUB	a1, c01, c01
3311	FSUB	a2, c02, c02
3312	FSUB	a3, c03, c03
3313	FSUB	a4, c04, c04
3314#endif
3315
3316#ifdef LN
3317	LDF	[AO +  3 * SIZE], a1
3318	LDF	[AO +  2 * SIZE], a2
3319	LDF	[AO +  0 * SIZE], a3
3320
3321	FMUL	a1, c02, c02
3322	FMUL	a1, c04, c04
3323
3324	FNMSUB	(aa2, cc02, cc01, cc01)
3325	FNMSUB	(aa2, cc04, cc03, cc03)
3326
3327	FMUL	a3, c01, c01
3328	FMUL	a3, c03, c03
3329#endif
3330
3331#ifdef LT
3332	LDF	[AO +  0 * SIZE], a1
3333	LDF	[AO +  1 * SIZE], a2
3334	LDF	[AO +  3 * SIZE], a3
3335
3336	FMUL	a1, c01, c01
3337	FMUL	a1, c03, c03
3338
3339	FNMSUB	(aa2, cc01, cc02, cc02)
3340	FNMSUB	(aa2, cc03, cc04, cc04)
3341
3342	FMUL	a3, c02, c02
3343	FMUL	a3, c04, c04
3344#endif
3345
3346#ifdef RN
3347	LDF	[BO +  0 * SIZE], a1
3348	LDF	[BO +  1 * SIZE], a2
3349
3350	FMUL	a1, c01, c01
3351	FMUL	a1, c02, c02
3352
3353	FNMSUB	(aa2, cc01, cc03, cc03)
3354	FNMSUB	(aa2, cc02, cc04, cc04)
3355
3356	LDF	[BO +  3 * SIZE], a1
3357
3358	FMUL	a1, c03, c03
3359	FMUL	a1, c04, c04
3360#endif
3361
3362#ifdef RT
3363	LDF	[BO +  3 * SIZE], a1
3364	LDF	[BO +  2 * SIZE], a2
3365
3366	FMUL	a1, c04, c04
3367	FMUL	a1, c03, c03
3368
3369	FNMSUB	(aa2, cc04, cc02, cc02)
3370	FNMSUB	(aa2, cc03, cc01, cc01)
3371
3372	LDF	[BO +  0 * SIZE], a1
3373
3374	FMUL	a1, c02, c02
3375	FMUL	a1, c01, c01
3376#endif
3377
3378#ifdef LN
3379	add	C1, -2 * SIZE, C1
3380	add	C2, -2 * SIZE, C2
3381#endif
3382
3383#if defined(LN) || defined(LT)
3384	STF	c01, [BO +  0 * SIZE]
3385	STF	c03, [BO +  1 * SIZE]
3386	STF	c02, [BO +  2 * SIZE]
3387	STF	c04, [BO +  3 * SIZE]
3388#else
3389	STF	c01, [AO +  0 * SIZE]
3390	STF	c02, [AO +  1 * SIZE]
3391	STF	c03, [AO +  2 * SIZE]
3392	STF	c04, [AO +  3 * SIZE]
3393#endif
3394
3395	STF	c01, [C1 + 0 * SIZE]
3396	STF	c02, [C1 + 1 * SIZE]
3397	STF	c03, [C2 + 0 * SIZE]
3398	STF	c04, [C2 + 1 * SIZE]
3399
3400#ifndef LN
3401	add	C1, 2 * SIZE, C1
3402	add	C2, 2 * SIZE, C2
3403#endif
3404
3405#ifdef RT
3406	sll	K, BASE_SHIFT + 1, TEMP1
3407	add	AORIG, TEMP1, AORIG
3408#endif
3409
3410#if defined(LT) || defined(RN)
3411	sub	K, KK, TEMP1
3412	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3413	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3414	add	AO, TEMP2, AO
3415	add	BO, TEMP1, BO
3416#endif
3417
3418#ifdef LT
3419	add	KK, 2, KK
3420#endif
3421
3422#ifdef LN
3423	sub	KK, 2, KK
3424#endif
3425
3426	add	I, -1, I
3427	cmp	I, 0
3428	bg,pt	%icc, .LL52
3429	nop
3430	.align 4
3431
3432.LL69:
3433#ifdef LN
3434	sll	K, BASE_SHIFT + 1, TEMP1
3435	add	B, TEMP1, B
3436#endif
3437
3438#if defined(LT) || defined(RN)
3439	mov	BO, B
3440#endif
3441
3442#ifdef RN
3443	add	KK, 2, KK
3444#endif
3445
3446#ifdef RT
3447	sub	KK, 2, KK
3448#endif
3449	.align 4
3450
3451.LL70:
3452	and	N, 1, J
3453	cmp	J, 0
3454	ble,pn	%icc, .LL999
3455	nop
3456
3457#ifdef RT
3458	sll	K, BASE_SHIFT, TEMP1
3459	sub	B, TEMP1, B
3460#endif
3461
3462#ifndef RT
3463	mov	C,  C1
3464	add	C1, LDC, C
3465#else
3466	sub	C,  LDC, C1
3467	sub	C,  LDC, C
3468#endif
3469
3470#ifdef LN
3471	add	M, OFFSET, KK
3472#endif
3473
3474#ifdef LT
3475	mov	OFFSET, KK
3476#endif
3477
3478#if defined(LN) || defined(RT)
3479	mov	A, AORIG
3480#else
3481	mov	A, AO
3482#endif
3483
3484	and	M, 1, I
3485	cmp	I, 0
3486	ble,pn	%icc, .LL80
3487	nop
3488
3489#if defined(LT) || defined(RN)
3490	mov	B, BO
3491#else
3492#ifdef LN
3493	sll	K,  BASE_SHIFT + 0, TEMP1
3494	sub	AORIG, TEMP1, AORIG
3495#endif
3496
3497	sll	KK, BASE_SHIFT + 0, TEMP1
3498	sll	KK, BASE_SHIFT + 0, TEMP2
3499
3500	add	AORIG, TEMP1, AO
3501	add	B,     TEMP2, BO
3502#endif
3503
3504	LDF	[AO +  0 * SIZE], a1
3505	LDF	[BO +  0 * SIZE], b1
3506	LDF	[AO +  1 * SIZE], a2
3507	LDF	[BO +  1 * SIZE], b2
3508	LDF	[AO +  2 * SIZE], a3
3509	LDF	[BO +  2 * SIZE], b3
3510	LDF	[AO +  3 * SIZE], a4
3511	LDF	[BO +  3 * SIZE], b4
3512
3513#if defined(LT) || defined(RN)
3514	sra	KK, 2, L
3515#else
3516	sub	K, KK, L
3517	sra	L,  2, L
3518#endif
3519	cmp	L,  0
3520	ble,pn	%icc, .LL85
3521	FCLR	(cc01)
3522	.align 4
3523
3524.LL83:
3525	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3526	add	L, -1, L
3527
3528	FMADD	(aa1, bb1, cc01, cc01)
3529	LDF	[AO +  4 * SIZE], a1
3530	LDF	[BO +  4 * SIZE], b1
3531
3532	FMADD	(aa2, bb2, cc01, cc01)
3533	LDF	[AO +  5 * SIZE], a2
3534	LDF	[BO +  5 * SIZE], b2
3535
3536	FMADD	(aa3, bb3, cc01, cc01)
3537	LDF	[AO +  6 * SIZE], a3
3538	LDF	[BO +  6 * SIZE], b3
3539
3540	FMADD	(aa4, bb4, cc01, cc01)
3541	LDF	[AO +  7 * SIZE], a4
3542	LDF	[BO +  7 * SIZE], b4
3543
3544	add	AO,  4 * SIZE, AO
3545	cmp	L, 0
3546
3547	bg,pt	%icc, .LL83
3548	add	BO,  4 * SIZE, BO
3549	.align 4
3550
3551.LL85:
3552#if defined(LT) || defined(RN)
3553	and	KK, 3, L
3554#else
3555	sub	K, KK, L
3556	and	L,  3, L
3557#endif
3558	cmp	L,  0
3559	ble,a,pn %icc, .LL88
3560	nop
3561	.align 4
3562
3563.LL87:
3564	FMADD	(aa1, bb1, cc01, cc01)
3565	LDF	[AO + 1 * SIZE], a1
3566	LDF	[BO + 1 * SIZE], b1
3567
3568	add	AO, 1 * SIZE, AO
3569	add	L, -1, L
3570	cmp	L, 0
3571	bg,pt	%icc, .LL87
3572	add	BO, 1 * SIZE, BO
3573	.align 4
3574
3575.LL88:
3576#if defined(LN) || defined(RT)
3577#ifdef LN
3578	sub	KK, 1, TEMP1
3579#else
3580	sub	KK, 1, TEMP1
3581#endif
3582	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3583	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3584
3585	add	AORIG, TEMP2, AO
3586	add	B,     TEMP1, BO
3587#endif
3588
3589#if defined(LN) || defined(LT)
3590	LDF	[BO +  0 * SIZE], a1
3591
3592	FSUB	a1, c01, c01
3593#else
3594	LDF	[AO +  0 * SIZE], a1
3595
3596	FSUB	a1, c01, c01
3597#endif
3598
3599#if defined(LN) || defined(LT)
3600	LDF	[AO +  0 * SIZE], a1
3601
3602	FMUL	a1, c01, c01
3603#endif
3604
3605#if defined(RN) || defined(RT)
3606	LDF	[BO +  0 * SIZE], a1
3607
3608	FMUL	a1, c01, c01
3609#endif
3610
3611#ifdef LN
3612	add	C1, -1 * SIZE, C1
3613#endif
3614
3615#if defined(LN) || defined(LT)
3616	STF	c01, [BO +  0 * SIZE]
3617#else
3618	STF	c01, [AO +  0 * SIZE]
3619#endif
3620
3621	STF	c01, [C1 + 0 * SIZE]
3622
3623#ifdef RT
3624	sll	K, BASE_SHIFT + 0, TEMP1
3625	add	AORIG, TEMP1, AORIG
3626#endif
3627
3628#if defined(LT) || defined(RN)
3629	sub	K, KK, TEMP1
3630	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3631	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3632	add	AO, TEMP2, AO
3633	add	BO, TEMP1, BO
3634#endif
3635
3636#ifdef LT
3637	add	KK, 1, KK
3638#endif
3639
3640#ifdef LN
3641	sub	KK, 1, KK
3642#endif
3643	.align 4
3644
3645.LL80:
3646	sra	M, 1, I
3647	cmp	I, 0
3648	ble,pn	%icc, .LL89
3649	nop
3650	.align 4
3651
3652.LL72:
3653#if defined(LT) || defined(RN)
3654	mov	B, BO
3655#else
3656#ifdef LN
3657	sll	K,  BASE_SHIFT + 1, TEMP1
3658	sub	AORIG, TEMP1, AORIG
3659#endif
3660
3661	sll	KK, BASE_SHIFT + 1, TEMP1
3662	sll	KK, BASE_SHIFT + 0, TEMP2
3663
3664	add	AORIG, TEMP1, AO
3665	add	B,     TEMP2, BO
3666#endif
3667
3668	LDF	[AO +  0 * SIZE], a1
3669	LDF	[AO +  1 * SIZE], a2
3670	LDF	[AO +  2 * SIZE], a3
3671	LDF	[AO +  3 * SIZE], a4
3672
3673	LDF	[BO +  0 * SIZE], b1
3674	LDF	[BO +  1 * SIZE], b2
3675	LDF	[BO +  2 * SIZE], b3
3676	FCLR	(cc01)
3677	LDF	[BO +  3 * SIZE], b4
3678	FCLR	(cc02)
3679
3680	prefetch [C1 + 2 * SIZE], 3
3681
3682#if defined(LT) || defined(RN)
3683	sra	KK, 2, L
3684#else
3685	sub	K, KK, L
3686	sra	L,  2, L
3687#endif
3688	cmp	L,  0
3689	ble,pn	%icc, .LL75
3690	nop
3691
3692.LL73:
3693	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3694	add	L, -1, L
3695
3696	FMADD	(aa1, bb1, cc01, cc01)
3697	LDF	[AO +  4 * SIZE], a1
3698	FMADD	(aa2, bb1, cc02, cc02)
3699	LDF	[AO +  5 * SIZE], a2
3700
3701	LDF	[BO +  4 * SIZE], b1
3702	cmp	L, 0
3703
3704	FMADD	(aa3, bb2, cc01, cc01)
3705	LDF	[AO +  6 * SIZE], a3
3706	FMADD	(aa4, bb2, cc02, cc02)
3707	LDF	[AO +  7 * SIZE], a4
3708
3709	LDF	[BO +  5 * SIZE], b2
3710	add	BO,  4 * SIZE, BO
3711
3712	FMADD	(aa1, bb3, cc01, cc01)
3713	LDF	[AO +  8 * SIZE], a1
3714	FMADD	(aa2, bb3, cc02, cc02)
3715	LDF	[AO +  9 * SIZE], a2
3716
3717	LDF	[BO +  2 * SIZE], b3
3718	add	AO,  8 * SIZE, AO
3719
3720	FMADD	(aa3, bb4, cc01, cc01)
3721	LDF	[AO +  2 * SIZE], a3
3722	FMADD	(aa4, bb4, cc02, cc02)
3723	LDF	[AO +  3 * SIZE], a4
3724
3725	bg,pt	%icc, .LL73
3726	LDF	[BO +  3 * SIZE], b4
3727	.align 4
3728
3729.LL75:
3730#if defined(LT) || defined(RN)
3731	and	KK, 3, L
3732#else
3733	sub	K, KK, L
3734	and	L,  3, L
3735#endif
3736	cmp	L,  0
3737	ble,a,pn %icc, .LL78
3738	nop
3739	.align 4
3740
3741.LL77:
3742	FMADD	(aa1, bb1, cc01, cc01)
3743	LDF	[AO + 2 * SIZE], a1
3744	FMADD	(aa2, bb1, cc02, cc02)
3745	LDF	[AO + 3 * SIZE], a2
3746
3747	LDF	[BO + 1 * SIZE], b1
3748	add	L, -1, L
3749	add	AO, 2 * SIZE, AO
3750	cmp	L, 0
3751	bg,pt	%icc, .LL77
3752	add	BO, 1 * SIZE, BO
3753	.align 4
3754
3755.LL78:
3756#if defined(LN) || defined(RT)
3757#ifdef LN
3758	sub	KK, 2, TEMP1
3759#else
3760	sub	KK, 1, TEMP1
3761#endif
3762	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3763	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3764
3765	add	AORIG, TEMP2, AO
3766	add	B,     TEMP1, BO
3767#endif
3768
3769#if defined(LN) || defined(LT)
3770	LDF	[BO +  0 * SIZE], a1
3771	LDF	[BO +  1 * SIZE], a2
3772
3773	FSUB	a1, c01, c01
3774	FSUB	a2, c02, c02
3775#else
3776	LDF	[AO +  0 * SIZE], a1
3777	LDF	[AO +  1 * SIZE], a2
3778
3779	FSUB	a1, c01, c01
3780	FSUB	a2, c02, c02
3781#endif
3782
3783#ifdef LN
3784	LDF	[AO +  3 * SIZE], a1
3785	LDF	[AO +  2 * SIZE], a2
3786	LDF	[AO +  0 * SIZE], a3
3787
3788	FMUL	a1, c02, c02
3789
3790	FNMSUB	(aa2, cc02, cc01, cc01)
3791
3792	FMUL	a3, c01, c01
3793#endif
3794
3795#ifdef LT
3796	LDF	[AO +  0 * SIZE], a1
3797	LDF	[AO +  1 * SIZE], a2
3798	LDF	[AO +  3 * SIZE], a3
3799
3800	FMUL	a1, c01, c01
3801
3802	FNMSUB	(aa2, cc01, cc02, cc02)
3803
3804	FMUL	a3, c02, c02
3805#endif
3806
3807#if defined(RN) || defined(RT)
3808	LDF	[BO +  0 * SIZE], a1
3809
3810	FMUL	a1, c01, c01
3811	FMUL	a1, c02, c02
3812#endif
3813
3814#ifdef LN
3815	add	C1, -2 * SIZE, C1
3816#endif
3817
3818#if defined(LN) || defined(LT)
3819	STF	c01, [BO +  0 * SIZE]
3820	STF	c02, [BO +  1 * SIZE]
3821#else
3822	STF	c01, [AO +  0 * SIZE]
3823	STF	c02, [AO +  1 * SIZE]
3824#endif
3825
3826	STF	c01, [C1 + 0 * SIZE]
3827	STF	c02, [C1 + 1 * SIZE]
3828
3829#ifndef LN
3830	add	C1, 2 * SIZE, C1
3831#endif
3832
3833#ifdef RT
3834	sll	K, BASE_SHIFT + 1, TEMP1
3835	add	AORIG, TEMP1, AORIG
3836#endif
3837
3838#if defined(LT) || defined(RN)
3839	sub	K, KK, TEMP1
3840	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3841	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3842	add	AO, TEMP2, AO
3843	add	BO, TEMP1, BO
3844#endif
3845
3846#ifdef LT
3847	add	KK, 2, KK
3848#endif
3849
3850#ifdef LN
3851	sub	KK, 2, KK
3852#endif
3853
3854	add	I, -1, I
3855	cmp	I, 0
3856	bg,pt	%icc, .LL72
3857	nop
3858	.align 4
3859
3860.LL89:
3861#ifdef LN
3862	sll	K, BASE_SHIFT, TEMP1
3863	add	B, TEMP1, B
3864#endif
3865
3866#if defined(LT) || defined(RN)
3867	mov	BO, B
3868#endif
3869
3870#ifdef RN
3871	add	KK, 1, KK
3872#endif
3873
3874#ifdef RT
3875	sub	KK, 1, KK
3876#endif
3877	.align 4
3878
3879.LL999:
3880#ifdef TRMMKERNEL
3881#ifndef __64BIT__
3882	ld	[%sp + STACK_START +  8], %g1
3883	ld	[%sp + STACK_START + 12], %g2
3884	ld	[%sp + STACK_START + 16], %g3
3885	ld	[%sp + STACK_START + 20], %g4
3886#else
3887	ldx	[%sp + STACK_START + 32], %g1
3888	ldx	[%sp + STACK_START + 40], %g2
3889	ldx	[%sp + STACK_START + 48], %g3
3890	ldx	[%sp + STACK_START + 56], %g4
3891#endif
3892#endif
3893
3894	return	%i7 + 8
3895	clr	%o0
3896
3897	EPILOGUE
3898