1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2005. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Software-prefetch tuning for the A operand.  The inner loops issue
 *   prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY
 * (n = 0 or 8), so APREFETCHSIZE is a look-ahead distance counted in
 * elements and APREFETCH_CATEGORY is the second operand of the
 * prefetch instruction.
 */
25#define APREFETCHSIZE 24
26#define APREFETCH_CATEGORY 0
27
/*
 * Integer arguments taken from the first three input registers.
 * N is consumed in groups of eight (sra N, 3), M as one odd step
 * (and M, 1) followed by pairs (sra M, 1); K bounds the inner loops.
 */
28#define M	%i0
29#define N	%i1
30#define K	%i2
31
/*
 * Operand pointers.  In the 32-bit DOUBLE build the two registers are
 * swapped and the prologue reloads B from the caller's stack frame.
 * NOTE(review): presumably the 8-byte scalar argument shifts the pointer
 * arguments by one slot in that ABI -- confirm against the caller.
 */
32#if defined(DOUBLE) && !defined(__64BIT__)
33#define A	%i5
34#define B	%i4
35#else
36#define A	%i4
37#define B	%i5
38#endif
39
/*
 * C and LDC are loaded from the stack in the prologue; LDC is then
 * shifted left by BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT
 * comes from common.h).
 */
40#define C	%o4
41#define LDC	%o5
42
/*
 * AO/BO: working pointers that walk the A and B operands.
 * J: outer counter (N >> 3).  I: tile counter over M.
 * L: trip count of the unrolled inner loops.
 */
43#define AO	%l0
44#define BO	%l1
45#define I	%l2
46#define J	%l3
47#define L	%l4
48
/*
 * C1..C8 address eight consecutive LDC-strided slices of C
 * (each one LDC away from its neighbour).
 */
49#define C1	%o0
50#define C2	%o1
51#define C3	%o2
52#define C4	%o3
53
54#define C5	%l5
55#define	C6	%l6
56#define C7	%l7
57#define C8	%i3
58
/*
 * OFFSET: loaded from the stack in the prologue.
 * KK:     running offset derived from OFFSET (-OFFSET for RN,
 *         N - OFFSET for RT, M + OFFSET for LN, OFFSET for LT).
 * TEMP1/TEMP2: scratch for address arithmetic.
 * AORIG:  copy of A kept for the LN/RT variants, from which AO is
 *         recomputed for each tile.
 * %g1-%g4 are written to the stack frame by the prologue; presumably the
 * epilogue (outside this view) restores them -- confirm.
 */
59#define OFFSET	%g1
60#define	KK	%g2
61#define TEMP1	%g3
62#define TEMP2	%g4
63#define AORIG	%o7
64
/*
 * Floating-point register allocation.
 *   c01..c16  accumulators for the current tile of C
 *   a1..a5    values loaded from AO in the multiply loops
 *   b1..b9    values loaded from BO in the multiply loops
 * (a1..a4 / b1..b4 are reused as plain temporaries in the solve step.)
 *
 * ccNN / aaN / bbN name the same registers as bare numbers, which is the
 * form passed to the FMADD, FNMSUB and FCLR macros.
 */
65#ifdef DOUBLE
/* Double precision: even-numbered registers %f0..%f58. */
66#define c01	%f0
67#define c02	%f2
68#define c03	%f4
69#define c04	%f6
70#define c05	%f8
71#define c06	%f10
72#define c07	%f12
73#define c08	%f14
74#define c09	%f16
75#define c10	%f18
76#define c11	%f20
77#define c12	%f22
78#define c13	%f24
79#define c14	%f26
80#define c15	%f28
81#define c16	%f30
82
83#define a1	%f32
84#define a2	%f34
85#define a3	%f36
86#define a4	%f38
87#define a5	%f40
88
89#define b1	%f42
90#define b2	%f44
91#define b3	%f46
92#define b4	%f48
93#define b5	%f50
94#define b6	%f52
95#define b7	%f54
96#define b8	%f56
97#define b9	%f58
98
/*
 * Numeric forms.  %f0..%f30 map to 0..30; the registers from %f32 up
 * appear as odd numbers (%f32 -> 1, %f34 -> 3, ... %f58 -> 27).  This
 * matches the SPARC V9 5-bit encoding of double-precision registers,
 * where bit 5 of the register number is stored in bit 0 of the field.
 * NOTE(review): presumably the FMADD/FNMSUB/FCLR macros in common.h emit
 * raw instruction words and need the encoded value -- confirm there.
 */
99#define cc01	0
100#define cc02	2
101#define cc03	4
102#define cc04	6
103#define cc05	8
104#define cc06	10
105#define cc07	12
106#define cc08	14
107#define cc09	16
108#define cc10	18
109#define cc11	20
110#define cc12	22
111#define cc13	24
112#define cc14	26
113#define cc15	28
114#define cc16	30
115
116#define aa1	 1
117#define aa2	 3
118#define aa3	 5
119#define aa4	 7
120#define aa5	 9
121
122#define bb1	11
123#define bb2	13
124#define bb3	15
125#define bb4	17
126#define bb5	19
127#define bb6	21
128#define bb7	23
129#define bb8	25
130#define bb9	27
131
132#else
/* Single precision: consecutive registers %f0..%f29. */
133#define c01	%f0
134#define c02	%f1
135#define c03	%f2
136#define c04	%f3
137#define c05	%f4
138#define c06	%f5
139#define c07	%f6
140#define c08	%f7
141#define c09	%f8
142#define c10	%f9
143#define c11	%f10
144#define c12	%f11
145#define c13	%f12
146#define c14	%f13
147#define c15	%f14
148#define c16	%f15
149
150#define a1	%f16
151#define a2	%f17
152#define a3	%f18
153#define a4	%f19
154#define a5	%f20
155
156#define b1	%f21
157#define b2	%f22
158#define b3	%f23
159#define b4	%f24
160#define b5	%f25
161#define b6	%f26
162#define b7	%f27
163#define b8	%f28
164#define b9	%f29
165
/* Numeric forms: here each number equals the register index. */
166#define cc01	0
167#define cc02	1
168#define cc03	2
169#define cc04	3
170#define cc05	4
171#define cc06	5
172#define cc07	6
173#define cc08	7
174#define cc09	8
175#define cc10	9
176#define cc11	10
177#define cc12	11
178#define cc13	12
179#define cc14	13
180#define cc15	14
181#define cc16	15
182
183#define aa1	16
184#define aa2	17
185#define aa3	18
186#define aa4	19
187#define aa5	20
188
189#define bb1	21
190#define bb2	22
191#define bb3	23
192#define bb4	24
193#define bb5	25
194#define bb6	26
195#define bb7	27
196#define bb8	28
197#define bb9	29
198
199#endif
200
/*
 * KK (%g2) and TEMP1 (%g3) live in global registers; declare to the
 * assembler that this file uses both of them as scratch registers.
 */
201        .register %g2, #scratch
202        .register %g3, #scratch
203
204	PROLOGUE
205	SAVESP
206	nop
207
208#ifndef __64BIT__
209
210#ifdef DOUBLE
211	ld	[%sp + STACK_START + 28], B
212	ld	[%sp + STACK_START + 32], C
213	ld	[%sp + STACK_START + 36], LDC
214	ld	[%sp + STACK_START + 40], OFFSET
215#else
216	ld	[%sp + STACK_START + 28], C
217	ld	[%sp + STACK_START + 32], LDC
218	ld	[%sp + STACK_START + 36], OFFSET
219#endif
220
221	st	%g1, [%sp + STACK_START +  8]
222	st	%g2, [%sp + STACK_START + 12]
223	st	%g3, [%sp + STACK_START + 16]
224	st	%g4, [%sp + STACK_START + 20]
225#else
226
227	ldx	[%sp+  STACK_START + 56], C
228	ldx	[%sp+  STACK_START + 64], LDC
229	ldx	[%sp+  STACK_START + 72], OFFSET
230
231	stx	%g1, [%sp + STACK_START + 32]
232	stx	%g2, [%sp + STACK_START + 40]
233	stx	%g3, [%sp + STACK_START + 48]
234	stx	%g4, [%sp + STACK_START + 56]
235#endif
236
237#if defined(TRMMKERNEL) && !defined(LEFT)
238	neg	OFFSET, KK
239#endif
240
241	sll	LDC, BASE_SHIFT, LDC
242
243#ifdef LN
244	smul	M, K, TEMP1
245	sll	TEMP1, BASE_SHIFT, TEMP1
246	add	A, TEMP1, A
247
248	sll	M, BASE_SHIFT, TEMP1
249	add	C, TEMP1, C
250#endif
251
252#ifdef RN
253	neg	OFFSET, KK
254#endif
255
256#ifdef RT
257	smul	N, K, TEMP1
258	sll	TEMP1, BASE_SHIFT, TEMP1
259	add	B, TEMP1, B
260
261	smul	N, LDC, TEMP1
262	add	C, TEMP1, C
263
264	sub	N, OFFSET, KK
265#endif
266
267	sra	N, 3, J
268	cmp	J, 0
269	ble,pn	%icc, .LL30
270	nop
271	.align 4
272
273.LL11:
274#ifdef RT
275	sll	K, BASE_SHIFT + 3, TEMP1
276	sub	B, TEMP1, B
277#endif
278
279#ifndef RT
280	mov	C,  C1
281	add	C,  LDC, C2
282	add	C2, LDC, C3
283	add	C3, LDC, C4
284	add	C4, LDC, C5
285	add	C5, LDC, C6
286	add	C6, LDC, C7
287	add	C7, LDC, C8
288	add	C8, LDC, C
289#else
290	sub	C,  LDC, C8
291	sub	C8, LDC, C7
292	sub	C7, LDC, C6
293	sub	C6, LDC, C5
294	sub	C5, LDC, C4
295	sub	C4, LDC, C3
296	sub	C3, LDC, C2
297	sub	C2, LDC, C1
298	sub	C2, LDC, C
299#endif
300
301#ifdef LN
302	add	M, OFFSET, KK
303#endif
304
305#ifdef LT
306	mov	OFFSET, KK
307#endif
308
309#if defined(LN) || defined(RT)
310	mov	A, AORIG
311#else
312	mov	A, AO
313#endif
314
315	and	M, 1, I
316	cmp	I, 0
317	ble,pn	%icc, .LL20
318	nop
319
320#if defined(LT) || defined(RN)
321	mov	B, BO
322#else
323#ifdef LN
324	sll	K,  BASE_SHIFT + 0, TEMP1
325	sub	AORIG, TEMP1, AORIG
326#endif
327
328	sll	KK, BASE_SHIFT + 0, TEMP1
329	sll	KK, BASE_SHIFT + 3, TEMP2
330
331	add	AORIG, TEMP1, AO
332	add	B,     TEMP2, BO
333#endif
334
335	LDF	[AO +  0 * SIZE], a1
336	LDF	[AO +  1 * SIZE], a2
337	LDF	[AO +  2 * SIZE], a3
338	LDF	[AO +  3 * SIZE], a4
339
340	LDF	[BO +  0 * SIZE], b1
341	FCLR	(cc01)
342	LDF	[BO +  1 * SIZE], b2
343	FCLR	(cc03)
344	LDF	[BO +  2 * SIZE], b3
345	FCLR	(cc05)
346	LDF	[BO +  3 * SIZE], b4
347	FCLR	(cc07)
348	LDF	[BO +  4 * SIZE], b5
349	FCLR	(cc09)
350	LDF	[BO +  5 * SIZE], b6
351	FCLR	(cc11)
352	LDF	[BO +  6 * SIZE], b7
353	FCLR	(cc13)
354	LDF	[BO +  7 * SIZE], b8
355	FCLR	(cc15)
356
357#if defined(LT) || defined(RN)
358	sra	KK, 2, L
359#else
360	sub	K, KK, L
361	sra	L,  2, L
362#endif
363	cmp	L,  0
364	ble,pn	%icc, .LL25
365	LDF	[BO +  8 * SIZE], b9
366	.align 4
367
368.LL23:
369	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
370	add	L, -1, L
371
372	FMADD	(aa1, bb1, cc01, cc01)
373	LDF	[BO + 16 * SIZE], b1
374	FMADD	(aa1, bb2, cc03, cc03)
375	LDF	[BO +  9 * SIZE], b2
376
377	FMADD	(aa1, bb3, cc05, cc05)
378	LDF	[BO + 10 * SIZE], b3
379	FMADD	(aa1, bb4, cc07, cc07)
380	LDF	[BO + 11 * SIZE], b4
381
382	FMADD	(aa1, bb5, cc09, cc09)
383	LDF	[BO + 12 * SIZE], b5
384	FMADD	(aa1, bb6, cc11, cc11)
385	LDF	[BO + 13 * SIZE], b6
386
387	FMADD	(aa1, bb7, cc13, cc13)
388	LDF	[BO + 14 * SIZE], b7
389	FMADD	(aa1, bb8, cc15, cc15)
390	LDF	[BO + 15 * SIZE], b8
391
392	FMADD	(aa2, bb9, cc01, cc01)
393	LDF	[BO + 24 * SIZE], b9
394	FMADD	(aa2, bb2, cc03, cc03)
395	LDF	[BO + 17 * SIZE], b2
396
397	FMADD	(aa2, bb3, cc05, cc05)
398	LDF	[BO + 18 * SIZE], b3
399	FMADD	(aa2, bb4, cc07, cc07)
400	LDF	[BO + 19 * SIZE], b4
401
402	FMADD	(aa2, bb5, cc09, cc09)
403	LDF	[BO + 20 * SIZE], b5
404	FMADD	(aa2, bb6, cc11, cc11)
405	LDF	[BO + 21 * SIZE], b6
406
407	FMADD	(aa2, bb7, cc13, cc13)
408	LDF	[BO + 22 * SIZE], b7
409	FMADD	(aa2, bb8, cc15, cc15)
410	LDF	[BO + 23 * SIZE], b8
411
412	LDF	[AO +  4 * SIZE], a1
413	LDF	[AO +  5 * SIZE], a2
414
415	FMADD	(aa3, bb1, cc01, cc01)
416	LDF	[BO + 32 * SIZE], b1
417	FMADD	(aa3, bb2, cc03, cc03)
418	LDF	[BO + 25 * SIZE], b2
419
420	FMADD	(aa3, bb3, cc05, cc05)
421	LDF	[BO + 26 * SIZE], b3
422	FMADD	(aa3, bb4, cc07, cc07)
423	LDF	[BO + 27 * SIZE], b4
424
425	FMADD	(aa3, bb5, cc09, cc09)
426	LDF	[BO + 28 * SIZE], b5
427	FMADD	(aa3, bb6, cc11, cc11)
428	LDF	[BO + 29 * SIZE], b6
429
430	FMADD	(aa3, bb7, cc13, cc13)
431	LDF	[BO + 30 * SIZE], b7
432	FMADD	(aa3, bb8, cc15, cc15)
433	LDF	[BO + 31 * SIZE], b8
434
435	FMADD	(aa4, bb9, cc01, cc01)
436	LDF	[BO + 40 * SIZE], b9
437	FMADD	(aa4, bb2, cc03, cc03)
438	LDF	[BO + 33 * SIZE], b2
439
440	FMADD	(aa4, bb3, cc05, cc05)
441	LDF	[BO + 34 * SIZE], b3
442	FMADD	(aa4, bb4, cc07, cc07)
443	LDF	[BO + 35 * SIZE], b4
444
445	FMADD	(aa4, bb5, cc09, cc09)
446	LDF	[BO + 36 * SIZE], b5
447	FMADD	(aa4, bb6, cc11, cc11)
448	LDF	[BO + 37 * SIZE], b6
449
450	FMADD	(aa4, bb7, cc13, cc13)
451	LDF	[BO + 38 * SIZE], b7
452	FMADD	(aa4, bb8, cc15, cc15)
453	LDF	[BO + 39 * SIZE], b8
454
455	LDF	[AO +  6 * SIZE], a3
456	LDF	[AO +  7 * SIZE], a4
457
458	add	AO,  4 * SIZE, AO
459	cmp	L, 0
460	bg,pt	%icc, .LL23
461	add	BO, 32 * SIZE, BO
462	.align 4
463
464.LL25:
465#if defined(LT) || defined(RN)
466	and	KK, 3, L
467#else
468	sub	K, KK, L
469	and	L,  3, L
470#endif
471	cmp	L,  0
472	ble,a,pn %icc, .LL28
473	nop
474	.align 4
475
476.LL27:
477	FMADD	(aa1, bb1, cc01, cc01)
478	LDF	[BO +  8 * SIZE], b1
479	FMADD	(aa1, bb2, cc03, cc03)
480	LDF	[BO +  9 * SIZE], b2
481
482	FMADD	(aa1, bb3, cc05, cc05)
483	LDF	[BO + 10 * SIZE], b3
484	FMADD	(aa1, bb4, cc07, cc07)
485	LDF	[BO + 11 * SIZE], b4
486
487	FMADD	(aa1, bb5, cc09, cc09)
488	LDF	[BO + 12 * SIZE], b5
489	FMADD	(aa1, bb6, cc11, cc11)
490	LDF	[BO + 13 * SIZE], b6
491
492	FMADD	(aa1, bb7, cc13, cc13)
493	LDF	[BO + 14 * SIZE], b7
494	FMADD	(aa1, bb8, cc15, cc15)
495	LDF	[BO + 15 * SIZE], b8
496
497	LDF	[AO +  1 * SIZE], a1
498	add	AO, 1 * SIZE, AO
499
500	add	L, -1, L
501	cmp	L, 0
502	bg,pt	%icc, .LL27
503	add	BO, 8 * SIZE, BO
504	.align 4
505
506.LL28:
507#if defined(LN) || defined(RT)
508#ifdef LN
509	sub	KK, 1, TEMP1
510#else
511	sub	KK, 8, TEMP1
512#endif
513	sll	TEMP1, BASE_SHIFT + 0, TEMP2
514	sll	TEMP1, BASE_SHIFT + 3, TEMP1
515
516	add	AORIG, TEMP2, AO
517	add	B,     TEMP1, BO
518#endif
519
520#if defined(LN) || defined(LT)
521	LDF	[BO +  0 * SIZE], a1
522	LDF	[BO +  1 * SIZE], a2
523	LDF	[BO +  2 * SIZE], a3
524	LDF	[BO +  3 * SIZE], a4
525
526	LDF	[BO +  4 * SIZE], b1
527	LDF	[BO +  5 * SIZE], b2
528	LDF	[BO +  6 * SIZE], b3
529	LDF	[BO +  7 * SIZE], b4
530
531	FSUB	a1, c01, c01
532	FSUB	a2, c03, c03
533	FSUB	a3, c05, c05
534	FSUB	a4, c07, c07
535
536	FSUB	b1, c09, c09
537	FSUB	b2, c11, c11
538	FSUB	b3, c13, c13
539	FSUB	b4, c15, c15
540#else
541	LDF	[AO +  0 * SIZE], a1
542	LDF	[AO +  1 * SIZE], a2
543	LDF	[AO +  2 * SIZE], a3
544	LDF	[AO +  3 * SIZE], a4
545
546	LDF	[AO +  4 * SIZE], b1
547	LDF	[AO +  5 * SIZE], b2
548	LDF	[AO +  6 * SIZE], b3
549	LDF	[AO +  7 * SIZE], b4
550
551	FSUB	a1, c01, c01
552	FSUB	a2, c03, c03
553	FSUB	a3, c05, c05
554	FSUB	a4, c07, c07
555
556	FSUB	b1, c09, c09
557	FSUB	b2, c11, c11
558	FSUB	b3, c13, c13
559	FSUB	b4, c15, c15
560#endif
561
562#if defined(LN) || defined(LT)
563	LDF	[AO +  0 * SIZE], a1
564
565	FMUL	a1, c01, c01
566	FMUL	a1, c03, c03
567	FMUL	a1, c05, c05
568	FMUL	a1, c07, c07
569	FMUL	a1, c09, c09
570	FMUL	a1, c11, c11
571	FMUL	a1, c13, c13
572	FMUL	a1, c15, c15
573#endif
574
575#ifdef RN
576	LDF	[BO +  0 * SIZE], a1
577	LDF	[BO +  1 * SIZE], a2
578	LDF	[BO +  2 * SIZE], a3
579	LDF	[BO +  3 * SIZE], a4
580	LDF	[BO +  4 * SIZE], b1
581	LDF	[BO +  5 * SIZE], b2
582	LDF	[BO +  6 * SIZE], b3
583	LDF	[BO +  7 * SIZE], b4
584
585	FMUL	a1, c01, c01
586
587	FNMSUB	(aa2, cc01, cc03, cc03)
588	FNMSUB	(aa3, cc01, cc05, cc05)
589	FNMSUB	(aa4, cc01, cc07, cc07)
590	FNMSUB	(bb1, cc01, cc09, cc09)
591	FNMSUB	(bb2, cc01, cc11, cc11)
592	FNMSUB	(bb3, cc01, cc13, cc13)
593	FNMSUB	(bb4, cc01, cc15, cc15)
594
595	LDF	[BO +  9 * SIZE], a1
596	LDF	[BO + 10 * SIZE], a2
597	LDF	[BO + 11 * SIZE], a3
598	LDF	[BO + 12 * SIZE], a4
599	LDF	[BO + 13 * SIZE], b1
600	LDF	[BO + 14 * SIZE], b2
601	LDF	[BO + 15 * SIZE], b3
602
603	FMUL	a1, c03, c03
604
605	FNMSUB	(aa2, cc03, cc05, cc05)
606	FNMSUB	(aa3, cc03, cc07, cc07)
607	FNMSUB	(aa4, cc03, cc09, cc09)
608	FNMSUB	(bb1, cc03, cc11, cc11)
609	FNMSUB	(bb2, cc03, cc13, cc13)
610	FNMSUB	(bb3, cc03, cc15, cc15)
611
612	LDF	[BO + 18 * SIZE], a1
613	LDF	[BO + 19 * SIZE], a2
614	LDF	[BO + 20 * SIZE], a3
615	LDF	[BO + 21 * SIZE], a4
616	LDF	[BO + 22 * SIZE], b1
617	LDF	[BO + 23 * SIZE], b2
618
619	FMUL	a1, c05, c05
620
621	FNMSUB	(aa2, cc05, cc07, cc07)
622	FNMSUB	(aa3, cc05, cc09, cc09)
623	FNMSUB	(aa4, cc05, cc11, cc11)
624	FNMSUB	(bb1, cc05, cc13, cc13)
625	FNMSUB	(bb2, cc05, cc15, cc15)
626
627	LDF	[BO + 27 * SIZE], a1
628	LDF	[BO + 28 * SIZE], a2
629	LDF	[BO + 29 * SIZE], a3
630	LDF	[BO + 30 * SIZE], a4
631	LDF	[BO + 31 * SIZE], b1
632
633	FMUL	a1, c07, c07
634
635	FNMSUB	(aa2, cc07, cc09, cc09)
636	FNMSUB	(aa3, cc07, cc11, cc11)
637	FNMSUB	(aa4, cc07, cc13, cc13)
638	FNMSUB	(bb1, cc07, cc15, cc15)
639
640	LDF	[BO + 36 * SIZE], a1
641	LDF	[BO + 37 * SIZE], a2
642	LDF	[BO + 38 * SIZE], a3
643	LDF	[BO + 39 * SIZE], a4
644
645	FMUL	a1, c09, c09
646
647	FNMSUB	(aa2, cc09, cc11, cc11)
648	FNMSUB	(aa3, cc09, cc13, cc13)
649	FNMSUB	(aa4, cc09, cc15, cc15)
650
651	LDF	[BO + 45 * SIZE], a1
652	LDF	[BO + 46 * SIZE], a2
653	LDF	[BO + 47 * SIZE], a3
654
655	FMUL	a1, c11, c11
656
657	FNMSUB	(aa2, cc11, cc13, cc13)
658	FNMSUB	(aa3, cc11, cc15, cc15)
659
660	LDF	[BO + 54 * SIZE], a1
661	LDF	[BO + 55 * SIZE], a2
662
663	FMUL	a1, c13, c13
664
665	FNMSUB	(aa2, cc13, cc15, cc15)
666
667	LDF	[BO + 63 * SIZE], a1
668
669	FMUL	a1, c15, c15
670#endif
671
672#ifdef RT
673	LDF	[BO + 63 * SIZE], a1
674	LDF	[BO + 62 * SIZE], a2
675	LDF	[BO + 61 * SIZE], a3
676	LDF	[BO + 60 * SIZE], a4
677	LDF	[BO + 59 * SIZE], b1
678	LDF	[BO + 58 * SIZE], b2
679	LDF	[BO + 57 * SIZE], b3
680	LDF	[BO + 56 * SIZE], b4
681
682	FMUL	a1, c15, c15
683
684	FNMSUB	(aa2, cc15, cc13, cc13)
685	FNMSUB	(aa3, cc15, cc11, cc11)
686	FNMSUB	(aa4, cc15, cc09, cc09)
687	FNMSUB	(bb1, cc15, cc07, cc07)
688	FNMSUB	(bb2, cc15, cc05, cc05)
689	FNMSUB	(bb3, cc15, cc03, cc03)
690	FNMSUB	(bb4, cc15, cc01, cc01)
691
692	LDF	[BO + 54 * SIZE], a1
693	LDF	[BO + 53 * SIZE], a2
694	LDF	[BO + 52 * SIZE], a3
695	LDF	[BO + 51 * SIZE], a4
696	LDF	[BO + 50 * SIZE], b1
697	LDF	[BO + 49 * SIZE], b2
698	LDF	[BO + 48 * SIZE], b3
699
700	FMUL	a1, c13, c13
701
702	FNMSUB	(aa2, cc13, cc11, cc11)
703	FNMSUB	(aa3, cc13, cc09, cc09)
704	FNMSUB	(aa4, cc13, cc07, cc07)
705	FNMSUB	(bb1, cc13, cc05, cc05)
706	FNMSUB	(bb2, cc13, cc03, cc03)
707	FNMSUB	(bb3, cc13, cc01, cc01)
708
709	LDF	[BO + 45 * SIZE], a1
710	LDF	[BO + 44 * SIZE], a2
711	LDF	[BO + 43 * SIZE], a3
712	LDF	[BO + 42 * SIZE], a4
713	LDF	[BO + 41 * SIZE], b1
714	LDF	[BO + 40 * SIZE], b2
715
716	FMUL	a1, c11, c11
717
718	FNMSUB	(aa2, cc11, cc09, cc09)
719	FNMSUB	(aa3, cc11, cc07, cc07)
720	FNMSUB	(aa4, cc11, cc05, cc05)
721	FNMSUB	(bb1, cc11, cc03, cc03)
722	FNMSUB	(bb2, cc11, cc01, cc01)
723
724	LDF	[BO + 36 * SIZE], a1
725	LDF	[BO + 35 * SIZE], a2
726	LDF	[BO + 34 * SIZE], a3
727	LDF	[BO + 33 * SIZE], a4
728	LDF	[BO + 32 * SIZE], b1
729
730	FMUL	a1, c09, c09
731
732	FNMSUB	(aa2, cc09, cc07, cc07)
733	FNMSUB	(aa3, cc09, cc05, cc05)
734	FNMSUB	(aa4, cc09, cc03, cc03)
735	FNMSUB	(bb1, cc09, cc01, cc01)
736
737	LDF	[BO + 27 * SIZE], a1
738	LDF	[BO + 26 * SIZE], a2
739	LDF	[BO + 25 * SIZE], a3
740	LDF	[BO + 24 * SIZE], a4
741
742	FMUL	a1, c07, c07
743
744	FNMSUB	(aa2, cc07, cc05, cc05)
745	FNMSUB	(aa3, cc07, cc03, cc03)
746	FNMSUB	(aa4, cc07, cc01, cc01)
747
748	LDF	[BO + 18 * SIZE], a1
749	LDF	[BO + 17 * SIZE], a2
750	LDF	[BO + 16 * SIZE], a3
751
752	FMUL	a1, c05, c05
753
754	FNMSUB	(aa2, cc05, cc03, cc03)
755	FNMSUB	(aa3, cc05, cc01, cc01)
756
757	LDF	[BO +  9 * SIZE], a1
758	LDF	[BO +  8 * SIZE], a2
759
760	FMUL	a1, c03, c03
761
762	FNMSUB	(aa2, cc03, cc01, cc01)
763
764	LDF	[BO +  0 * SIZE], a1
765
766	FMUL	a1, c01, c01
767#endif
768
769#ifdef LN
770	add	C1, -1 * SIZE, C1
771	add	C2, -1 * SIZE, C2
772	add	C3, -1 * SIZE, C3
773	add	C4, -1 * SIZE, C4
774	add	C5, -1 * SIZE, C5
775	add	C6, -1 * SIZE, C6
776	add	C7, -1 * SIZE, C7
777	add	C8, -1 * SIZE, C8
778#endif
779
780#if defined(LN) || defined(LT)
781	STF	c01, [BO +  0 * SIZE]
782	STF	c03, [BO +  1 * SIZE]
783	STF	c05, [BO +  2 * SIZE]
784	STF	c07, [BO +  3 * SIZE]
785
786	STF	c09, [BO +  4 * SIZE]
787	STF	c11, [BO +  5 * SIZE]
788	STF	c13, [BO +  6 * SIZE]
789	STF	c15, [BO +  7 * SIZE]
790#else
791	STF	c01, [AO +  0 * SIZE]
792	STF	c03, [AO +  1 * SIZE]
793	STF	c05, [AO +  2 * SIZE]
794	STF	c07, [AO +  3 * SIZE]
795
796	STF	c09, [AO +  4 * SIZE]
797	STF	c11, [AO +  5 * SIZE]
798	STF	c13, [AO +  6 * SIZE]
799	STF	c15, [AO +  7 * SIZE]
800#endif
801
802	STF	c01, [C1 + 0 * SIZE]
803	STF	c03, [C2 + 0 * SIZE]
804	STF	c05, [C3 + 0 * SIZE]
805	STF	c07, [C4 + 0 * SIZE]
806
807	STF	c09, [C5 + 0 * SIZE]
808	STF	c11, [C6 + 0 * SIZE]
809	STF	c13, [C7 + 0 * SIZE]
810	STF	c15, [C8 + 0 * SIZE]
811
812#ifdef RT
813	sll	K, BASE_SHIFT + 0, TEMP1
814	add	AORIG, TEMP1, AORIG
815#endif
816
817#if defined(LT) || defined(RN)
818	sub	K, KK, TEMP1
819	sll	TEMP1, BASE_SHIFT + 0, TEMP2
820	sll	TEMP1, BASE_SHIFT + 3, TEMP1
821	add	AO, TEMP2, AO
822	add	BO, TEMP1, BO
823#endif
824
825#ifdef LT
826	add	KK, 1, KK
827#endif
828
829#ifdef LN
830	sub	KK, 1, KK
831#endif
832	.align 4
833
834.LL20:
835	sra	M, 1, I
836	cmp	I, 0
837	ble,pn	%icc, .LL29
838	nop
839	.align 4
840
841.LL12:
842#if defined(LT) || defined(RN)
843	mov	B, BO
844#else
845#ifdef LN
846	sll	K,  BASE_SHIFT + 1, TEMP1
847	sub	AORIG, TEMP1, AORIG
848#endif
849
850	sll	KK, BASE_SHIFT + 1, TEMP1
851	sll	KK, BASE_SHIFT + 3, TEMP2
852
853	add	AORIG, TEMP1, AO
854	add	B,     TEMP2, BO
855#endif
856
857	LDF	[AO +  0 * SIZE], a1
858	LDF	[AO +  1 * SIZE], a2
859	LDF	[AO +  8 * SIZE], a5
860
861	LDF	[BO +  0 * SIZE], b1
862
863	LDF	[BO +  1 * SIZE], b2
864	FCLR	(cc01)
865	LDF	[BO +  2 * SIZE], b3
866	FCLR	(cc05)
867	LDF	[BO +  3 * SIZE], b4
868	FCLR	(cc09)
869	LDF	[BO +  4 * SIZE], b5
870	FCLR	(cc13)
871
872	LDF	[BO +  5 * SIZE], b6
873	FCLR	(cc02)
874	LDF	[BO +  6 * SIZE], b7
875	FCLR	(cc06)
876	LDF	[BO +  7 * SIZE], b8
877	FCLR	(cc10)
878	LDF	[BO +  8 * SIZE], b9
879	FCLR	(cc14)
880
881	prefetch [C1 + 1 * SIZE], 3
882	FCLR	(cc03)
883	prefetch [C2 + 2 * SIZE], 3
884	FCLR	(cc07)
885	prefetch [C3 + 1 * SIZE], 3
886	FCLR	(cc11)
887	prefetch [C4 + 2 * SIZE], 3
888	FCLR	(cc15)
889
890	prefetch [C5 + 1 * SIZE], 3
891	FCLR	(cc04)
892	prefetch [C6 + 2 * SIZE], 3
893	FCLR	(cc08)
894	prefetch [C7 + 1 * SIZE], 3
895	FCLR	(cc12)
896	prefetch [C8 + 2 * SIZE], 3
897	FCLR	(cc16)
898
899#if defined(LT) || defined(RN)
900	sra	KK, 3, L
901#else
902	sub	K, KK, L
903	sra	L,  3, L
904#endif
905	cmp	L,  0
906	ble,pn	%icc, .LL15
907	nop
908	.align 4
909
910.LL13:
911	FMADD	(aa1, bb1, cc01, cc01)
912	FMADD	(aa2, bb1, cc02, cc02)
913	FMADD	(aa1, bb2, cc03, cc03)
914	FMADD	(aa2, bb2, cc04, cc04)
915
916	FMADD	(aa1, bb3, cc05, cc05)
917	LDF	[BO + 16 * SIZE], b1
918	FMADD	(aa2, bb3, cc06, cc06)
919	LDF	[BO +  9 * SIZE], b2
920
921	FMADD	(aa1, bb4, cc07, cc07)
922	LDF	[BO + 10 * SIZE], b3
923	FMADD	(aa2, bb4, cc08, cc08)
924	LDF	[BO + 11 * SIZE], b4
925
926	FMADD	(aa1, bb5, cc09, cc09)
927	LDF	[AO +  2 * SIZE], a3
928	FMADD	(aa2, bb5, cc10, cc10)
929	LDF	[AO +  3 * SIZE], a4
930
931	FMADD	(aa1, bb6, cc11, cc11)
932	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
933	FMADD	(aa2, bb6, cc12, cc12)
934	nop
935
936	FMADD	(aa1, bb7, cc13, cc13)
937	LDF	[BO + 12 * SIZE], b5
938	FMADD	(aa2, bb7, cc14, cc14)
939	LDF	[BO + 13 * SIZE], b6
940
941	FMADD	(aa1, bb8, cc15, cc15)
942	LDF	[BO + 14 * SIZE], b7
943	FMADD	(aa2, bb8, cc16, cc16)
944	LDF	[BO + 15 * SIZE], b8
945
946	FMADD	(aa3, bb9, cc01, cc01)
947	FMADD	(aa4, bb9, cc02, cc02)
948	FMADD	(aa3, bb2, cc03, cc03)
949	FMADD	(aa4, bb2, cc04, cc04)
950
951	FMADD	(aa3, bb3, cc05, cc05)
952	LDF	[BO + 24 * SIZE], b9
953	FMADD	(aa4, bb3, cc06, cc06)
954	LDF	[BO + 17 * SIZE], b2
955
956	FMADD	(aa3, bb4, cc07, cc07)
957	LDF	[BO + 18 * SIZE], b3
958	FMADD	(aa4, bb4, cc08, cc08)
959	LDF	[BO + 19 * SIZE], b4
960
961	FMADD	(aa3, bb5, cc09, cc09)
962	LDF	[AO +  4 * SIZE], a1
963	FMADD	(aa4, bb5, cc10, cc10)
964	LDF	[AO +  5 * SIZE], a2
965
966	FMADD	(aa3, bb6, cc11, cc11)
967	add	L, -1, L
968	FMADD	(aa4, bb6, cc12, cc12)
969	nop
970
971	FMADD	(aa3, bb7, cc13, cc13)
972	LDF	[BO + 20 * SIZE], b5
973	FMADD	(aa4, bb7, cc14, cc14)
974	LDF	[BO + 21 * SIZE], b6
975
976	FMADD	(aa3, bb8, cc15, cc15)
977	LDF	[BO + 22 * SIZE], b7
978	FMADD	(aa4, bb8, cc16, cc16)
979	LDF	[BO + 23 * SIZE], b8
980
981	FMADD	(aa1, bb1, cc01, cc01)
982	FMADD	(aa2, bb1, cc02, cc02)
983	FMADD	(aa1, bb2, cc03, cc03)
984	FMADD	(aa2, bb2, cc04, cc04)
985
986	FMADD	(aa1, bb3, cc05, cc05)
987	LDF	[BO + 32 * SIZE], b1
988	FMADD	(aa2, bb3, cc06, cc06)
989	LDF	[BO + 25 * SIZE], b2
990
991	FMADD	(aa1, bb4, cc07, cc07)
992	LDF	[BO + 26 * SIZE], b3
993	FMADD	(aa2, bb4, cc08, cc08)
994	LDF	[BO + 27 * SIZE], b4
995
996	FMADD	(aa1, bb5, cc09, cc09)
997	LDF	[AO +  6 * SIZE], a3
998	FMADD	(aa2, bb5, cc10, cc10)
999	LDF	[AO +  7 * SIZE], a4
1000
1001	FMADD	(aa1, bb6, cc11, cc11)
1002	nop
1003	FMADD	(aa2, bb6, cc12, cc12)
1004	nop
1005
1006	FMADD	(aa1, bb7, cc13, cc13)
1007	LDF	[BO + 28 * SIZE], b5
1008	FMADD	(aa2, bb7, cc14, cc14)
1009	LDF	[BO + 29 * SIZE], b6
1010
1011	FMADD	(aa1, bb8, cc15, cc15)
1012	LDF	[BO + 30 * SIZE], b7
1013	FMADD	(aa2, bb8, cc16, cc16)
1014	LDF	[BO + 31 * SIZE], b8
1015
1016	FMADD	(aa3, bb9, cc01, cc01)
1017	FMADD	(aa4, bb9, cc02, cc02)
1018	FMADD	(aa3, bb2, cc03, cc03)
1019	FMADD	(aa4, bb2, cc04, cc04)
1020
1021	FMADD	(aa3, bb3, cc05, cc05)
1022	LDF	[BO + 40 * SIZE], b9
1023	FMADD	(aa4, bb3, cc06, cc06)
1024	LDF	[BO + 33 * SIZE], b2
1025
1026	FMADD	(aa3, bb4, cc07, cc07)
1027	LDF	[BO + 34 * SIZE], b3
1028	FMADD	(aa4, bb4, cc08, cc08)
1029	LDF	[BO + 35 * SIZE], b4
1030
1031	FMADD	(aa3, bb5, cc09, cc09)
1032	LDF	[AO + 16 * SIZE], a1  /****/
1033	FMADD	(aa4, bb5, cc10, cc10)
1034	LDF	[AO +  9 * SIZE], a2
1035
1036	FMADD	(aa3, bb6, cc11, cc11)
1037	nop
1038	FMADD	(aa4, bb6, cc12, cc12)
1039	nop
1040
1041	FMADD	(aa3, bb7, cc13, cc13)
1042	LDF	[BO + 36 * SIZE], b5
1043	FMADD	(aa4, bb7, cc14, cc14)
1044	LDF	[BO + 37 * SIZE], b6
1045
1046	FMADD	(aa3, bb8, cc15, cc15)
1047	LDF	[BO + 38 * SIZE], b7
1048	FMADD	(aa4, bb8, cc16, cc16)
1049	LDF	[BO + 39 * SIZE], b8
1050
1051	FMADD	(aa5, bb1, cc01, cc01)
1052	FMADD	(aa2, bb1, cc02, cc02)
1053	FMADD	(aa5, bb2, cc03, cc03)
1054	FMADD	(aa2, bb2, cc04, cc04)
1055
1056	FMADD	(aa5, bb3, cc05, cc05)
1057	LDF	[BO + 48 * SIZE], b1
1058	FMADD	(aa2, bb3, cc06, cc06)
1059	LDF	[BO + 41 * SIZE], b2
1060
1061	FMADD	(aa5, bb4, cc07, cc07)
1062	LDF	[BO + 42 * SIZE], b3
1063	FMADD	(aa2, bb4, cc08, cc08)
1064	LDF	[BO + 43 * SIZE], b4
1065
1066	FMADD	(aa5, bb5, cc09, cc09)
1067	LDF	[AO + 10 * SIZE], a3
1068	FMADD	(aa2, bb5, cc10, cc10)
1069	LDF	[AO + 11 * SIZE], a4
1070
1071	FMADD	(aa5, bb6, cc11, cc11)
1072	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
1073	FMADD	(aa2, bb6, cc12, cc12)
1074	nop
1075
1076	FMADD	(aa5, bb7, cc13, cc13)
1077	LDF	[BO + 44 * SIZE], b5
1078	FMADD	(aa2, bb7, cc14, cc14)
1079	LDF	[BO + 45 * SIZE], b6
1080
1081	FMADD	(aa5, bb8, cc15, cc15)
1082	LDF	[BO + 46 * SIZE], b7
1083	FMADD	(aa2, bb8, cc16, cc16)
1084	LDF	[BO + 47 * SIZE], b8
1085
1086	FMADD	(aa3, bb9, cc01, cc01)
1087	FMADD	(aa4, bb9, cc02, cc02)
1088	FMADD	(aa3, bb2, cc03, cc03)
1089	FMADD	(aa4, bb2, cc04, cc04)
1090
1091	FMADD	(aa3, bb3, cc05, cc05)
1092	LDF	[BO + 56 * SIZE], b9
1093	FMADD	(aa4, bb3, cc06, cc06)
1094	LDF	[BO + 49 * SIZE], b2
1095
1096	FMADD	(aa3, bb4, cc07, cc07)
1097	LDF	[BO + 50 * SIZE], b3
1098	FMADD	(aa4, bb4, cc08, cc08)
1099	LDF	[BO + 51 * SIZE], b4
1100
1101	FMADD	(aa3, bb5, cc09, cc09)
1102	LDF	[AO + 12 * SIZE], a5
1103	FMADD	(aa4, bb5, cc10, cc10)
1104	LDF	[AO + 13 * SIZE], a2
1105
1106	FMADD	(aa3, bb6, cc11, cc11)
1107	cmp	L, 0
1108	FMADD	(aa4, bb6, cc12, cc12)
1109	nop
1110
1111	FMADD	(aa3, bb7, cc13, cc13)
1112	LDF	[BO + 52 * SIZE], b5
1113	FMADD	(aa4, bb7, cc14, cc14)
1114	LDF	[BO + 53 * SIZE], b6
1115
1116	FMADD	(aa3, bb8, cc15, cc15)
1117	LDF	[BO + 54 * SIZE], b7
1118	FMADD	(aa4, bb8, cc16, cc16)
1119	LDF	[BO + 55 * SIZE], b8
1120
1121	FMADD	(aa5, bb1, cc01, cc01)
1122	FMADD	(aa2, bb1, cc02, cc02)
1123	FMADD	(aa5, bb2, cc03, cc03)
1124	FMADD	(aa2, bb2, cc04, cc04)
1125
1126	FMADD	(aa5, bb3, cc05, cc05)
1127	LDF	[BO + 64 * SIZE], b1
1128	FMADD	(aa2, bb3, cc06, cc06)
1129	LDF	[BO + 57 * SIZE], b2
1130
1131	FMADD	(aa5, bb4, cc07, cc07)
1132	LDF	[BO + 58 * SIZE], b3
1133	FMADD	(aa2, bb4, cc08, cc08)
1134	LDF	[BO + 59 * SIZE], b4
1135
1136	FMADD	(aa5, bb5, cc09, cc09)
1137	LDF	[AO + 14 * SIZE], a3
1138	FMADD	(aa2, bb5, cc10, cc10)
1139	LDF	[AO + 15 * SIZE], a4
1140
1141	FMADD	(aa5, bb6, cc11, cc11)
1142	add	BO, 64 * SIZE, BO
1143	FMADD	(aa2, bb6, cc12, cc12)
1144	add	AO, 16 * SIZE, AO
1145
1146	FMADD	(aa5, bb7, cc13, cc13)
1147	LDF	[BO -  4 * SIZE], b5
1148	FMADD	(aa2, bb7, cc14, cc14)
1149	LDF	[BO -  3 * SIZE], b6
1150
1151	FMADD	(aa5, bb8, cc15, cc15)
1152	LDF	[BO -  2 * SIZE], b7
1153	FMADD	(aa2, bb8, cc16, cc16)
1154	LDF	[BO -  1 * SIZE], b8
1155
1156	FMADD	(aa3, bb9, cc01, cc01)
1157	FMADD	(aa4, bb9, cc02, cc02)
1158	FMADD	(aa3, bb2, cc03, cc03)
1159	FMADD	(aa4, bb2, cc04, cc04)
1160
1161	FMADD	(aa3, bb3, cc05, cc05)
1162	LDF	[BO +  8 * SIZE], b9
1163	FMADD	(aa4, bb3, cc06, cc06)
1164	LDF	[BO +  1 * SIZE], b2
1165
1166	FMADD	(aa3, bb4, cc07, cc07)
1167	LDF	[BO +  2 * SIZE], b3
1168	FMADD	(aa4, bb4, cc08, cc08)
1169	LDF	[BO +  3 * SIZE], b4
1170
1171	FMADD	(aa3, bb5, cc09, cc09)
1172	LDF	[AO +  8 * SIZE], a5  /****/
1173	FMADD	(aa4, bb5, cc10, cc10)
1174	LDF	[AO +  1 * SIZE], a2
1175
1176	FMADD	(aa3, bb6, cc11, cc11)
1177	FMADD	(aa4, bb6, cc12, cc12)
1178
1179	FMADD	(aa3, bb7, cc13, cc13)
1180	LDF	[BO +  4 * SIZE], b5
1181	FMADD	(aa4, bb7, cc14, cc14)
1182	LDF	[BO +  5 * SIZE], b6
1183
1184	FMADD	(aa3, bb8, cc15, cc15)
1185	LDF	[BO +  6 * SIZE], b7
1186	FMADD	(aa4, bb8, cc16, cc16)
1187	ble,pn	%icc, .LL15
1188	LDF	[BO +  7 * SIZE], b8
1189
1190	FMADD	(aa1, bb1, cc01, cc01)
1191	FMADD	(aa2, bb1, cc02, cc02)
1192	FMADD	(aa1, bb2, cc03, cc03)
1193	FMADD	(aa2, bb2, cc04, cc04)
1194
1195	FMADD	(aa1, bb3, cc05, cc05)
1196	LDF	[BO + 16 * SIZE], b1
1197	FMADD	(aa2, bb3, cc06, cc06)
1198	LDF	[BO +  9 * SIZE], b2
1199
1200	FMADD	(aa1, bb4, cc07, cc07)
1201	LDF	[BO + 10 * SIZE], b3
1202	FMADD	(aa2, bb4, cc08, cc08)
1203	LDF	[BO + 11 * SIZE], b4
1204
1205	FMADD	(aa1, bb5, cc09, cc09)
1206	LDF	[AO +  2 * SIZE], a3
1207	FMADD	(aa2, bb5, cc10, cc10)
1208	LDF	[AO +  3 * SIZE], a4
1209
1210	FMADD	(aa1, bb6, cc11, cc11)
1211	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1212	FMADD	(aa2, bb6, cc12, cc12)
1213	nop
1214
1215	FMADD	(aa1, bb7, cc13, cc13)
1216	LDF	[BO + 12 * SIZE], b5
1217	FMADD	(aa2, bb7, cc14, cc14)
1218	LDF	[BO + 13 * SIZE], b6
1219
1220	FMADD	(aa1, bb8, cc15, cc15)
1221	LDF	[BO + 14 * SIZE], b7
1222	FMADD	(aa2, bb8, cc16, cc16)
1223	LDF	[BO + 15 * SIZE], b8
1224
1225	FMADD	(aa3, bb9, cc01, cc01)
1226	FMADD	(aa4, bb9, cc02, cc02)
1227	FMADD	(aa3, bb2, cc03, cc03)
1228	FMADD	(aa4, bb2, cc04, cc04)
1229
1230	FMADD	(aa3, bb3, cc05, cc05)
1231	LDF	[BO + 24 * SIZE], b9
1232	FMADD	(aa4, bb3, cc06, cc06)
1233	LDF	[BO + 17 * SIZE], b2
1234
1235	FMADD	(aa3, bb4, cc07, cc07)
1236	LDF	[BO + 18 * SIZE], b3
1237	FMADD	(aa4, bb4, cc08, cc08)
1238	LDF	[BO + 19 * SIZE], b4
1239
1240	FMADD	(aa3, bb5, cc09, cc09)
1241	LDF	[AO +  4 * SIZE], a1
1242	FMADD	(aa4, bb5, cc10, cc10)
1243	LDF	[AO +  5 * SIZE], a2
1244
1245	FMADD	(aa3, bb6, cc11, cc11)
1246	add	L, -1, L
1247	FMADD	(aa4, bb6, cc12, cc12)
1248	nop
1249
1250	FMADD	(aa3, bb7, cc13, cc13)
1251	LDF	[BO + 20 * SIZE], b5
1252	FMADD	(aa4, bb7, cc14, cc14)
1253	LDF	[BO + 21 * SIZE], b6
1254
1255	FMADD	(aa3, bb8, cc15, cc15)
1256	LDF	[BO + 22 * SIZE], b7
1257	FMADD	(aa4, bb8, cc16, cc16)
1258	LDF	[BO + 23 * SIZE], b8
1259
1260	FMADD	(aa1, bb1, cc01, cc01)
1261	FMADD	(aa2, bb1, cc02, cc02)
1262	FMADD	(aa1, bb2, cc03, cc03)
1263	FMADD	(aa2, bb2, cc04, cc04)
1264
1265	FMADD	(aa1, bb3, cc05, cc05)
1266	LDF	[BO + 32 * SIZE], b1
1267	FMADD	(aa2, bb3, cc06, cc06)
1268	LDF	[BO + 25 * SIZE], b2
1269
1270	FMADD	(aa1, bb4, cc07, cc07)
1271	LDF	[BO + 26 * SIZE], b3
1272	FMADD	(aa2, bb4, cc08, cc08)
1273	LDF	[BO + 27 * SIZE], b4
1274
1275	FMADD	(aa1, bb5, cc09, cc09)
1276	LDF	[AO +  6 * SIZE], a3
1277	FMADD	(aa2, bb5, cc10, cc10)
1278	LDF	[AO +  7 * SIZE], a4
1279
1280	FMADD	(aa1, bb6, cc11, cc11)
1281	nop
1282	FMADD	(aa2, bb6, cc12, cc12)
1283	nop
1284
1285	FMADD	(aa1, bb7, cc13, cc13)
1286	LDF	[BO + 28 * SIZE], b5
1287	FMADD	(aa2, bb7, cc14, cc14)
1288	LDF	[BO + 29 * SIZE], b6
1289
1290	FMADD	(aa1, bb8, cc15, cc15)
1291	LDF	[BO + 30 * SIZE], b7
1292	FMADD	(aa2, bb8, cc16, cc16)
1293	LDF	[BO + 31 * SIZE], b8
1294
1295	FMADD	(aa3, bb9, cc01, cc01)
1296	FMADD	(aa4, bb9, cc02, cc02)
1297	FMADD	(aa3, bb2, cc03, cc03)
1298	FMADD	(aa4, bb2, cc04, cc04)
1299
1300	FMADD	(aa3, bb3, cc05, cc05)
1301	LDF	[BO + 40 * SIZE], b9
1302	FMADD	(aa4, bb3, cc06, cc06)
1303	LDF	[BO + 33 * SIZE], b2
1304
1305	FMADD	(aa3, bb4, cc07, cc07)
1306	LDF	[BO + 34 * SIZE], b3
1307	FMADD	(aa4, bb4, cc08, cc08)
1308	LDF	[BO + 35 * SIZE], b4
1309
1310	FMADD	(aa3, bb5, cc09, cc09)
1311	LDF	[AO + 16 * SIZE], a1  /****/
1312	FMADD	(aa4, bb5, cc10, cc10)
1313	LDF	[AO +  9 * SIZE], a2
1314
1315	FMADD	(aa3, bb6, cc11, cc11)
1316	nop
1317	FMADD	(aa4, bb6, cc12, cc12)
1318	nop
1319
1320	FMADD	(aa3, bb7, cc13, cc13)
1321	LDF	[BO + 36 * SIZE], b5
1322	FMADD	(aa4, bb7, cc14, cc14)
1323	LDF	[BO + 37 * SIZE], b6
1324
1325	FMADD	(aa3, bb8, cc15, cc15)
1326	LDF	[BO + 38 * SIZE], b7
1327	FMADD	(aa4, bb8, cc16, cc16)
1328	LDF	[BO + 39 * SIZE], b8
1329
1330	FMADD	(aa5, bb1, cc01, cc01)
1331	FMADD	(aa2, bb1, cc02, cc02)
1332	FMADD	(aa5, bb2, cc03, cc03)
1333	FMADD	(aa2, bb2, cc04, cc04)
1334
1335	FMADD	(aa5, bb3, cc05, cc05)
1336	LDF	[BO + 48 * SIZE], b1
1337	FMADD	(aa2, bb3, cc06, cc06)
1338	LDF	[BO + 41 * SIZE], b2
1339
1340	FMADD	(aa5, bb4, cc07, cc07)
1341	LDF	[BO + 42 * SIZE], b3
1342	FMADD	(aa2, bb4, cc08, cc08)
1343	LDF	[BO + 43 * SIZE], b4
1344
1345	FMADD	(aa5, bb5, cc09, cc09)
1346	LDF	[AO + 10 * SIZE], a3
1347	FMADD	(aa2, bb5, cc10, cc10)
1348	LDF	[AO + 11 * SIZE], a4
1349
1350	FMADD	(aa5, bb6, cc11, cc11)
1351	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
1352	FMADD	(aa2, bb6, cc12, cc12)
1353	nop
1354
1355	FMADD	(aa5, bb7, cc13, cc13)
1356	LDF	[BO + 44 * SIZE], b5
1357	FMADD	(aa2, bb7, cc14, cc14)
1358	LDF	[BO + 45 * SIZE], b6
1359
1360	FMADD	(aa5, bb8, cc15, cc15)
1361	LDF	[BO + 46 * SIZE], b7
1362	FMADD	(aa2, bb8, cc16, cc16)
1363	LDF	[BO + 47 * SIZE], b8
1364
1365	FMADD	(aa3, bb9, cc01, cc01)
1366	FMADD	(aa4, bb9, cc02, cc02)
1367	FMADD	(aa3, bb2, cc03, cc03)
1368	FMADD	(aa4, bb2, cc04, cc04)
1369
1370	FMADD	(aa3, bb3, cc05, cc05)
1371	LDF	[BO + 56 * SIZE], b9
1372	FMADD	(aa4, bb3, cc06, cc06)
1373	LDF	[BO + 49 * SIZE], b2
1374
1375	FMADD	(aa3, bb4, cc07, cc07)
1376	LDF	[BO + 50 * SIZE], b3
1377	FMADD	(aa4, bb4, cc08, cc08)
1378	LDF	[BO + 51 * SIZE], b4
1379
1380	FMADD	(aa3, bb5, cc09, cc09)
1381	LDF	[AO + 12 * SIZE], a5
1382	FMADD	(aa4, bb5, cc10, cc10)
1383	LDF	[AO + 13 * SIZE], a2
1384
1385	FMADD	(aa3, bb6, cc11, cc11)
1386	cmp	L, 0
1387	FMADD	(aa4, bb6, cc12, cc12)
1388	nop
1389
1390	FMADD	(aa3, bb7, cc13, cc13)
1391	LDF	[BO + 52 * SIZE], b5
1392	FMADD	(aa4, bb7, cc14, cc14)
1393	LDF	[BO + 53 * SIZE], b6
1394
1395	FMADD	(aa3, bb8, cc15, cc15)
1396	LDF	[BO + 54 * SIZE], b7
1397	FMADD	(aa4, bb8, cc16, cc16)
1398	LDF	[BO + 55 * SIZE], b8
1399
1400	FMADD	(aa5, bb1, cc01, cc01)
1401	FMADD	(aa2, bb1, cc02, cc02)
1402	FMADD	(aa5, bb2, cc03, cc03)
1403	FMADD	(aa2, bb2, cc04, cc04)
1404
1405	FMADD	(aa5, bb3, cc05, cc05)
1406	LDF	[BO + 64 * SIZE], b1
1407	FMADD	(aa2, bb3, cc06, cc06)
1408	LDF	[BO + 57 * SIZE], b2
1409
1410	FMADD	(aa5, bb4, cc07, cc07)
1411	LDF	[BO + 58 * SIZE], b3
1412	FMADD	(aa2, bb4, cc08, cc08)
1413	LDF	[BO + 59 * SIZE], b4
1414
1415	FMADD	(aa5, bb5, cc09, cc09)
1416	LDF	[AO + 14 * SIZE], a3
1417	FMADD	(aa2, bb5, cc10, cc10)
1418	LDF	[AO + 15 * SIZE], a4
1419
1420	FMADD	(aa5, bb6, cc11, cc11)
1421	add	BO, 64 * SIZE, BO
1422	FMADD	(aa2, bb6, cc12, cc12)
1423	add	AO, 16 * SIZE, AO
1424
1425	FMADD	(aa5, bb7, cc13, cc13)
1426	LDF	[BO -  4 * SIZE], b5
1427	FMADD	(aa2, bb7, cc14, cc14)
1428	LDF	[BO -  3 * SIZE], b6
1429
1430	FMADD	(aa5, bb8, cc15, cc15)
1431	LDF	[BO -  2 * SIZE], b7
1432	FMADD	(aa2, bb8, cc16, cc16)
1433	LDF	[BO -  1 * SIZE], b8
1434
1435	FMADD	(aa3, bb9, cc01, cc01)
1436	FMADD	(aa4, bb9, cc02, cc02)
1437	FMADD	(aa3, bb2, cc03, cc03)
1438	FMADD	(aa4, bb2, cc04, cc04)
1439
1440	FMADD	(aa3, bb3, cc05, cc05)
1441	LDF	[BO +  8 * SIZE], b9
1442	FMADD	(aa4, bb3, cc06, cc06)
1443	LDF	[BO +  1 * SIZE], b2
1444
1445	FMADD	(aa3, bb4, cc07, cc07)
1446	LDF	[BO +  2 * SIZE], b3
1447	FMADD	(aa4, bb4, cc08, cc08)
1448	LDF	[BO +  3 * SIZE], b4
1449
1450	FMADD	(aa3, bb5, cc09, cc09)
1451	LDF	[AO +  8 * SIZE], a5  /****/
1452	FMADD	(aa4, bb5, cc10, cc10)
1453	LDF	[AO +  1 * SIZE], a2
1454
1455	FMADD	(aa3, bb6, cc11, cc11)
1456	FMADD	(aa4, bb6, cc12, cc12)
1457
1458	FMADD	(aa3, bb7, cc13, cc13)
1459	LDF	[BO +  4 * SIZE], b5
1460	FMADD	(aa4, bb7, cc14, cc14)
1461	LDF	[BO +  5 * SIZE], b6
1462
1463	FMADD	(aa3, bb8, cc15, cc15)
1464	LDF	[BO +  6 * SIZE], b7
1465	FMADD	(aa4, bb8, cc16, cc16)
1466	bg,pt	%icc, .LL13
1467	LDF	[BO +  7 * SIZE], b8
1468	.align 4
1469
/*
 * .LL15: set up the remainder of the K loop for the 2x8 tile.
 * The unrolled loop above advances BO by 64 * SIZE and AO by 16 * SIZE per
 * pass (8 k-steps), so the low three bits of the trip count are left over:
 *   LT / RN : L = KK & 7
 *   LN / RT : L = (K - KK) & 7
 * When L <= 0 control goes directly to .LL18.
 */
.LL15:
#if defined(LT) || defined(RN)
	and	KK, 7, L
#else
	sub	K, KK, L
	and	L,  7, L
#endif
	cmp	L,  0
	/* annulled branch: the nop in the delay slot only runs when taken */
	ble,a,pn %icc, .LL18
	nop
	.align 4
1481
/*
 * .LL17: remainder loop for the 2x8 tile, one k-step per iteration.
 * Each pass issues 16 FMADDs (a1 and a2 against b1..b8, accumulating into
 * c01..c16), advances AO by 2 elements and BO by 8 elements, and reloads
 * a1, a2 and b1..b8 for the next step.  Loads and integer instructions are
 * interleaved with the FMADDs, so the statement order matters.
 * NOTE(review): FMADD is a macro defined outside this chunk; presumably
 * FMADD (x, y, z, w) computes w = x * y + z - confirm.
 */
.LL17:
	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	nop

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  8 * SIZE], b1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	/* sets the condition codes tested by the bg at the bottom of the loop;
	   the plain add instructions in between do not write them */
	cmp	L, 0
	FMADD	(aa2, bb3, cc06, cc06)
	nop

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD	(aa1, bb5, cc09, cc09)
	nop
	FMADD	(aa2, bb5, cc10, cc10)
	nop

	FMADD	(aa1, bb6, cc11, cc11)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa2, bb6, cc12, cc12)
	LDF	[BO + 13 * SIZE], b6

	FMADD	(aa1, bb7, cc13, cc13)
	add	AO, 2 * SIZE, AO
	FMADD	(aa2, bb7, cc14, cc14)
	add	BO, 8 * SIZE, BO

	FMADD	(aa1, bb8, cc15, cc15)
	/* AO was already advanced: offsets 0 and 1 are the next k-step */
	LDF	[AO +  0 * SIZE], a1
	FMADD	(aa2, bb8, cc16, cc16)
	LDF	[AO +  1 * SIZE], a2

	/* BO was already advanced: offsets 6 and 7 belong to the next row of B */
	LDF	[BO +  6 * SIZE], b7
	bg,pt	%icc, .LL17
	/* delay slot: executed whether or not the branch is taken */
	LDF	[BO +  7 * SIZE], b8
	nop
	.align 4
1528
/*
 * .LL18: start of the solve phase for the 2x8 tile.
 * For LN / RT the packed pointers are repositioned before the solve:
 *   TEMP1 = KK - 2 (LN)  or  KK - 8 (RT)
 *   AO    = AORIG + (TEMP1 << (BASE_SHIFT + 1))   two elements per k-step
 *   BO    = B     + (TEMP1 << (BASE_SHIFT + 3))   eight elements per k-step
 * NOTE(review): BASE_SHIFT is defined outside this chunk; presumably it is
 * log2 of the element size in bytes - confirm.
 */
.LL18:
#if defined(LN) || defined(RT)
#ifdef LN
	sub	KK, 2, TEMP1
#else
	sub	KK, 8, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 3, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif
1542
/*
 * Load the 16 right-hand-side values of the 2x8 tile and replace each
 * accumulator c by (loaded value - c).
 *   LN / LT : values come from BO; offsets 0..7 pair with the odd
 *             accumulators c01, c03, .., c15 and offsets 8..15 with the even
 *             accumulators c02, c04, .., c16.
 *   RN / RT : values come from AO; offsets 0..15 pair with c01..c16 in order.
 * The same layouts are used by the stores that write the solved tile back.
 * NOTE(review): FSUB is a macro defined outside this chunk; this assumes the
 * SPARC operand order (rs1, rs2, rd), i.e. rd = rs1 - rs2 - confirm.
 */
#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	LDF	[BO +  4 * SIZE], b1
	LDF	[BO +  5 * SIZE], b2
	LDF	[BO +  6 * SIZE], b3
	LDF	[BO +  7 * SIZE], b4

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03
	FSUB	a3, c05, c05
	FSUB	a4, c07, c07

	FSUB	b1, c09, c09
	FSUB	b2, c11, c11
	FSUB	b3, c13, c13
	FSUB	b4, c15, c15

	/* second half: the even accumulators */
	LDF	[BO +  8 * SIZE], a1
	LDF	[BO +  9 * SIZE], a2
	LDF	[BO + 10 * SIZE], a3
	LDF	[BO + 11 * SIZE], a4

	LDF	[BO + 12 * SIZE], b1
	LDF	[BO + 13 * SIZE], b2
	LDF	[BO + 14 * SIZE], b3
	LDF	[BO + 15 * SIZE], b4

	FSUB	a1, c02, c02
	FSUB	a2, c04, c04
	FSUB	a3, c06, c06
	FSUB	a4, c08, c08

	FSUB	b1, c10, c10
	FSUB	b2, c12, c12
	FSUB	b3, c14, c14
	FSUB	b4, c16, c16
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[AO +  4 * SIZE], b1
	LDF	[AO +  5 * SIZE], b2
	LDF	[AO +  6 * SIZE], b3
	LDF	[AO +  7 * SIZE], b4

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
	FSUB	a3, c03, c03
	FSUB	a4, c04, c04

	FSUB	b1, c05, c05
	FSUB	b2, c06, c06
	FSUB	b3, c07, c07
	FSUB	b4, c08, c08

	/* second half: c09..c16 */
	LDF	[AO +  8 * SIZE], a1
	LDF	[AO +  9 * SIZE], a2
	LDF	[AO + 10 * SIZE], a3
	LDF	[AO + 11 * SIZE], a4

	LDF	[AO + 12 * SIZE], b1
	LDF	[AO + 13 * SIZE], b2
	LDF	[AO + 14 * SIZE], b3
	LDF	[AO + 15 * SIZE], b4

	FSUB	a1, c09, c09
	FSUB	a2, c10, c10
	FSUB	a3, c11, c11
	FSUB	a4, c12, c12

	FSUB	b1, c13, c13
	FSUB	b2, c14, c14
	FSUB	b3, c15, c15
	FSUB	b4, c16, c16
#endif
1624
/*
 * LN solve for the 2x8 tile, using three entries of the packed 2x2 block of
 * A at AO + 3, AO + 2 and AO + 0.  For each of the eight columns:
 *   c_even *= A[3];  c_odd -= A[2] * c_even;  c_odd *= A[0]
 * The diagonal entries are applied with a multiply, so the packed data
 * presumably holds reciprocals of the diagonal - confirm against the packer.
 * NOTE(review): FNMSUB is a macro defined outside this chunk; presumably
 * FNMSUB (x, y, z, w) computes w = z - x * y - confirm.
 */
#ifdef LN
	LDF	[AO +  3 * SIZE], a1
	LDF	[AO +  2 * SIZE], a2
	LDF	[AO +  0 * SIZE], a3

	FMUL	a1, c02, c02
	FMUL	a1, c04, c04
	FMUL	a1, c06, c06
	FMUL	a1, c08, c08
	FMUL	a1, c10, c10
	FMUL	a1, c12, c12
	FMUL	a1, c14, c14
	FMUL	a1, c16, c16

	/* eliminate the solved even row from the odd row */
	FNMSUB	(aa2, cc02, cc01, cc01)
	FNMSUB	(aa2, cc04, cc03, cc03)
	FNMSUB	(aa2, cc06, cc05, cc05)
	FNMSUB	(aa2, cc08, cc07, cc07)
	FNMSUB	(aa2, cc10, cc09, cc09)
	FNMSUB	(aa2, cc12, cc11, cc11)
	FNMSUB	(aa2, cc14, cc13, cc13)
	FNMSUB	(aa2, cc16, cc15, cc15)

	FMUL	a3, c01, c01
	FMUL	a3, c03, c03
	FMUL	a3, c05, c05
	FMUL	a3, c07, c07
	FMUL	a3, c09, c09
	FMUL	a3, c11, c11
	FMUL	a3, c13, c13
	FMUL	a3, c15, c15
#endif
1657
/*
 * LT solve for the 2x8 tile, using three entries of the packed 2x2 block of
 * A at AO + 0, AO + 1 and AO + 3.  For each of the eight columns:
 *   c_odd *= A[0];  c_even -= A[1] * c_odd;  c_even *= A[3]
 * This is the mirror image of the LN branch (odd row solved first).
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#ifdef LT
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  3 * SIZE], a3

	FMUL	a1, c01, c01
	FMUL	a1, c03, c03
	FMUL	a1, c05, c05
	FMUL	a1, c07, c07
	FMUL	a1, c09, c09
	FMUL	a1, c11, c11
	FMUL	a1, c13, c13
	FMUL	a1, c15, c15

	/* eliminate the solved odd row from the even row */
	FNMSUB	(aa2, cc01, cc02, cc02)
	FNMSUB	(aa2, cc03, cc04, cc04)
	FNMSUB	(aa2, cc05, cc06, cc06)
	FNMSUB	(aa2, cc07, cc08, cc08)
	FNMSUB	(aa2, cc09, cc10, cc10)
	FNMSUB	(aa2, cc11, cc12, cc12)
	FNMSUB	(aa2, cc13, cc14, cc14)
	FNMSUB	(aa2, cc15, cc16, cc16)

	FMUL	a3, c02, c02
	FMUL	a3, c04, c04
	FMUL	a3, c06, c06
	FMUL	a3, c08, c08
	FMUL	a3, c10, c10
	FMUL	a3, c12, c12
	FMUL	a3, c14, c14
	FMUL	a3, c16, c16
#endif
1690
/*
 * RN solve for the 2x8 tile: substitution across the eight column pairs
 * (c01,c02), (c03,c04), .., (c15,c16) in ascending order, using an 8x8 block
 * read from BO with a row stride of 8 elements.
 * Step i (i = 0..7) reads row i starting at its diagonal, BO + 9 * i:
 *   - multiply pair i by the diagonal entry,
 *   - subtract (entry j) * (pair i) from every later pair j > i.
 * Only entries on or to the right of the diagonal of each row are read.
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#ifdef RN
	/* row 0: offsets 0..7 */
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4
	LDF	[BO +  4 * SIZE], b1
	LDF	[BO +  5 * SIZE], b2
	LDF	[BO +  6 * SIZE], b3
	LDF	[BO +  7 * SIZE], b4

	FMUL	a1, c01, c01
	FMUL	a1, c02, c02

	FNMSUB	(aa2, cc01, cc03, cc03)
	FNMSUB	(aa2, cc02, cc04, cc04)
	FNMSUB	(aa3, cc01, cc05, cc05)
	FNMSUB	(aa3, cc02, cc06, cc06)
	FNMSUB	(aa4, cc01, cc07, cc07)
	FNMSUB	(aa4, cc02, cc08, cc08)
	FNMSUB	(bb1, cc01, cc09, cc09)
	FNMSUB	(bb1, cc02, cc10, cc10)
	FNMSUB	(bb2, cc01, cc11, cc11)
	FNMSUB	(bb2, cc02, cc12, cc12)
	FNMSUB	(bb3, cc01, cc13, cc13)
	FNMSUB	(bb3, cc02, cc14, cc14)
	FNMSUB	(bb4, cc01, cc15, cc15)
	FNMSUB	(bb4, cc02, cc16, cc16)

	/* row 1: diagonal at offset 9 */
	LDF	[BO +  9 * SIZE], a1
	LDF	[BO + 10 * SIZE], a2
	LDF	[BO + 11 * SIZE], a3
	LDF	[BO + 12 * SIZE], a4
	LDF	[BO + 13 * SIZE], b1
	LDF	[BO + 14 * SIZE], b2
	LDF	[BO + 15 * SIZE], b3

	FMUL	a1, c03, c03
	FMUL	a1, c04, c04

	FNMSUB	(aa2, cc03, cc05, cc05)
	FNMSUB	(aa2, cc04, cc06, cc06)
	FNMSUB	(aa3, cc03, cc07, cc07)
	FNMSUB	(aa3, cc04, cc08, cc08)
	FNMSUB	(aa4, cc03, cc09, cc09)
	FNMSUB	(aa4, cc04, cc10, cc10)
	FNMSUB	(bb1, cc03, cc11, cc11)
	FNMSUB	(bb1, cc04, cc12, cc12)
	FNMSUB	(bb2, cc03, cc13, cc13)
	FNMSUB	(bb2, cc04, cc14, cc14)
	FNMSUB	(bb3, cc03, cc15, cc15)
	FNMSUB	(bb3, cc04, cc16, cc16)

	/* row 2: diagonal at offset 18 */
	LDF	[BO + 18 * SIZE], a1
	LDF	[BO + 19 * SIZE], a2
	LDF	[BO + 20 * SIZE], a3
	LDF	[BO + 21 * SIZE], a4
	LDF	[BO + 22 * SIZE], b1
	LDF	[BO + 23 * SIZE], b2

	FMUL	a1, c05, c05
	FMUL	a1, c06, c06

	FNMSUB	(aa2, cc05, cc07, cc07)
	FNMSUB	(aa2, cc06, cc08, cc08)
	FNMSUB	(aa3, cc05, cc09, cc09)
	FNMSUB	(aa3, cc06, cc10, cc10)
	FNMSUB	(aa4, cc05, cc11, cc11)
	FNMSUB	(aa4, cc06, cc12, cc12)
	FNMSUB	(bb1, cc05, cc13, cc13)
	FNMSUB	(bb1, cc06, cc14, cc14)
	FNMSUB	(bb2, cc05, cc15, cc15)
	FNMSUB	(bb2, cc06, cc16, cc16)

	/* row 3: diagonal at offset 27 */
	LDF	[BO + 27 * SIZE], a1
	LDF	[BO + 28 * SIZE], a2
	LDF	[BO + 29 * SIZE], a3
	LDF	[BO + 30 * SIZE], a4
	LDF	[BO + 31 * SIZE], b1

	FMUL	a1, c07, c07
	FMUL	a1, c08, c08

	FNMSUB	(aa2, cc07, cc09, cc09)
	FNMSUB	(aa2, cc08, cc10, cc10)
	FNMSUB	(aa3, cc07, cc11, cc11)
	FNMSUB	(aa3, cc08, cc12, cc12)
	FNMSUB	(aa4, cc07, cc13, cc13)
	FNMSUB	(aa4, cc08, cc14, cc14)
	FNMSUB	(bb1, cc07, cc15, cc15)
	FNMSUB	(bb1, cc08, cc16, cc16)

	/* row 4: diagonal at offset 36 */
	LDF	[BO + 36 * SIZE], a1
	LDF	[BO + 37 * SIZE], a2
	LDF	[BO + 38 * SIZE], a3
	LDF	[BO + 39 * SIZE], a4

	FMUL	a1, c09, c09
	FMUL	a1, c10, c10

	FNMSUB	(aa2, cc09, cc11, cc11)
	FNMSUB	(aa2, cc10, cc12, cc12)
	FNMSUB	(aa3, cc09, cc13, cc13)
	FNMSUB	(aa3, cc10, cc14, cc14)
	FNMSUB	(aa4, cc09, cc15, cc15)
	FNMSUB	(aa4, cc10, cc16, cc16)

	/* row 5: diagonal at offset 45 */
	LDF	[BO + 45 * SIZE], a1
	LDF	[BO + 46 * SIZE], a2
	LDF	[BO + 47 * SIZE], a3

	FMUL	a1, c11, c11
	FMUL	a1, c12, c12

	FNMSUB	(aa2, cc11, cc13, cc13)
	FNMSUB	(aa2, cc12, cc14, cc14)
	FNMSUB	(aa3, cc11, cc15, cc15)
	FNMSUB	(aa3, cc12, cc16, cc16)

	/* row 6: diagonal at offset 54 */
	LDF	[BO + 54 * SIZE], a1
	LDF	[BO + 55 * SIZE], a2

	FMUL	a1, c13, c13
	FMUL	a1, c14, c14

	FNMSUB	(aa2, cc13, cc15, cc15)
	FNMSUB	(aa2, cc14, cc16, cc16)

	/* row 7: only the diagonal at offset 63 remains */
	LDF	[BO + 63 * SIZE], a1

	FMUL	a1, c15, c15
	FMUL	a1, c16, c16
#endif
1823
/*
 * RT solve for the 2x8 tile: substitution across the eight column pairs in
 * descending order, (c15,c16) first and (c01,c02) last, using an 8x8 block
 * read from BO with a row stride of 8 elements.
 * Step i (i = 7..0) reads row i from its diagonal, BO + 9 * i, down to the
 * start of the row, BO + 8 * i:
 *   - multiply pair i by the diagonal entry,
 *   - subtract (entry j) * (pair i) from every earlier pair j < i.
 * Only entries on or to the left of the diagonal of each row are read.
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#ifdef RT
	/* row 7: offsets 63 down to 56 */
	LDF	[BO + 63 * SIZE], a1
	LDF	[BO + 62 * SIZE], a2
	LDF	[BO + 61 * SIZE], a3
	LDF	[BO + 60 * SIZE], a4
	LDF	[BO + 59 * SIZE], b1
	LDF	[BO + 58 * SIZE], b2
	LDF	[BO + 57 * SIZE], b3
	LDF	[BO + 56 * SIZE], b4

	FMUL	a1, c16, c16
	FMUL	a1, c15, c15

	FNMSUB	(aa2, cc16, cc14, cc14)
	FNMSUB	(aa2, cc15, cc13, cc13)
	FNMSUB	(aa3, cc16, cc12, cc12)
	FNMSUB	(aa3, cc15, cc11, cc11)
	FNMSUB	(aa4, cc16, cc10, cc10)
	FNMSUB	(aa4, cc15, cc09, cc09)
	FNMSUB	(bb1, cc16, cc08, cc08)
	FNMSUB	(bb1, cc15, cc07, cc07)
	FNMSUB	(bb2, cc16, cc06, cc06)
	FNMSUB	(bb2, cc15, cc05, cc05)
	FNMSUB	(bb3, cc16, cc04, cc04)
	FNMSUB	(bb3, cc15, cc03, cc03)
	FNMSUB	(bb4, cc16, cc02, cc02)
	FNMSUB	(bb4, cc15, cc01, cc01)

	/* row 6: diagonal at offset 54 */
	LDF	[BO + 54 * SIZE], a1
	LDF	[BO + 53 * SIZE], a2
	LDF	[BO + 52 * SIZE], a3
	LDF	[BO + 51 * SIZE], a4
	LDF	[BO + 50 * SIZE], b1
	LDF	[BO + 49 * SIZE], b2
	LDF	[BO + 48 * SIZE], b3

	FMUL	a1, c14, c14
	FMUL	a1, c13, c13

	FNMSUB	(aa2, cc14, cc12, cc12)
	FNMSUB	(aa2, cc13, cc11, cc11)
	FNMSUB	(aa3, cc14, cc10, cc10)
	FNMSUB	(aa3, cc13, cc09, cc09)
	FNMSUB	(aa4, cc14, cc08, cc08)
	FNMSUB	(aa4, cc13, cc07, cc07)
	FNMSUB	(bb1, cc14, cc06, cc06)
	FNMSUB	(bb1, cc13, cc05, cc05)
	FNMSUB	(bb2, cc14, cc04, cc04)
	FNMSUB	(bb2, cc13, cc03, cc03)
	FNMSUB	(bb3, cc14, cc02, cc02)
	FNMSUB	(bb3, cc13, cc01, cc01)

	/* row 5: diagonal at offset 45 */
	LDF	[BO + 45 * SIZE], a1
	LDF	[BO + 44 * SIZE], a2
	LDF	[BO + 43 * SIZE], a3
	LDF	[BO + 42 * SIZE], a4
	LDF	[BO + 41 * SIZE], b1
	LDF	[BO + 40 * SIZE], b2

	FMUL	a1, c12, c12
	FMUL	a1, c11, c11

	FNMSUB	(aa2, cc12, cc10, cc10)
	FNMSUB	(aa2, cc11, cc09, cc09)
	FNMSUB	(aa3, cc12, cc08, cc08)
	FNMSUB	(aa3, cc11, cc07, cc07)
	FNMSUB	(aa4, cc12, cc06, cc06)
	FNMSUB	(aa4, cc11, cc05, cc05)
	FNMSUB	(bb1, cc12, cc04, cc04)
	FNMSUB	(bb1, cc11, cc03, cc03)
	FNMSUB	(bb2, cc12, cc02, cc02)
	FNMSUB	(bb2, cc11, cc01, cc01)

	/* row 4: diagonal at offset 36 */
	LDF	[BO + 36 * SIZE], a1
	LDF	[BO + 35 * SIZE], a2
	LDF	[BO + 34 * SIZE], a3
	LDF	[BO + 33 * SIZE], a4
	LDF	[BO + 32 * SIZE], b1

	FMUL	a1, c10, c10
	FMUL	a1, c09, c09

	FNMSUB	(aa2, cc10, cc08, cc08)
	FNMSUB	(aa2, cc09, cc07, cc07)
	FNMSUB	(aa3, cc10, cc06, cc06)
	FNMSUB	(aa3, cc09, cc05, cc05)
	FNMSUB	(aa4, cc10, cc04, cc04)
	FNMSUB	(aa4, cc09, cc03, cc03)
	FNMSUB	(bb1, cc10, cc02, cc02)
	FNMSUB	(bb1, cc09, cc01, cc01)

	/* row 3: diagonal at offset 27 */
	LDF	[BO + 27 * SIZE], a1
	LDF	[BO + 26 * SIZE], a2
	LDF	[BO + 25 * SIZE], a3
	LDF	[BO + 24 * SIZE], a4

	FMUL	a1, c08, c08
	FMUL	a1, c07, c07

	FNMSUB	(aa2, cc08, cc06, cc06)
	FNMSUB	(aa2, cc07, cc05, cc05)
	FNMSUB	(aa3, cc08, cc04, cc04)
	FNMSUB	(aa3, cc07, cc03, cc03)
	FNMSUB	(aa4, cc08, cc02, cc02)
	FNMSUB	(aa4, cc07, cc01, cc01)

	/* row 2: diagonal at offset 18 */
	LDF	[BO + 18 * SIZE], a1
	LDF	[BO + 17 * SIZE], a2
	LDF	[BO + 16 * SIZE], a3

	FMUL	a1, c06, c06
	FMUL	a1, c05, c05

	FNMSUB	(aa2, cc06, cc04, cc04)
	FNMSUB	(aa2, cc05, cc03, cc03)
	FNMSUB	(aa3, cc06, cc02, cc02)
	FNMSUB	(aa3, cc05, cc01, cc01)

	/* row 1: diagonal at offset 9 */
	LDF	[BO +  9 * SIZE], a1
	LDF	[BO +  8 * SIZE], a2

	FMUL	a1, c04, c04
	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc04, cc02, cc02)
	FNMSUB	(aa2, cc03, cc01, cc01)

	/* row 0: only the diagonal at offset 0 remains */
	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c02, c02
	FMUL	a1, c01, c01
#endif
1956
/*
 * Write the solved 2x8 tile back.
 *   1. LN only: step the eight column pointers C1..C8 back by two elements
 *      before storing.
 *   2. Store the tile into the packed buffer: through BO for LN / LT (odd
 *      accumulators at offsets 0..7, even ones at 8..15), through AO
 *      otherwise (c01..c16 in order).  These layouts mirror the loads done
 *      before the solve.
 *   3. Store the tile into C: column n receives c(2n-1) and c(2n) at
 *      offsets 0 and 1 of Cn.
 *   4. All variants except LN: advance C1..C8 by two elements.
 */
#ifdef LN
	add	C1, -2 * SIZE, C1
	add	C2, -2 * SIZE, C2
	add	C3, -2 * SIZE, C3
	add	C4, -2 * SIZE, C4
	add	C5, -2 * SIZE, C5
	add	C6, -2 * SIZE, C6
	add	C7, -2 * SIZE, C7
	add	C8, -2 * SIZE, C8
#endif

#if defined(LN) || defined(LT)
	STF	c01, [BO +  0 * SIZE]
	STF	c03, [BO +  1 * SIZE]
	STF	c05, [BO +  2 * SIZE]
	STF	c07, [BO +  3 * SIZE]

	STF	c09, [BO +  4 * SIZE]
	STF	c11, [BO +  5 * SIZE]
	STF	c13, [BO +  6 * SIZE]
	STF	c15, [BO +  7 * SIZE]

	STF	c02, [BO +  8 * SIZE]
	STF	c04, [BO +  9 * SIZE]
	STF	c06, [BO + 10 * SIZE]
	STF	c08, [BO + 11 * SIZE]

	STF	c10, [BO + 12 * SIZE]
	STF	c12, [BO + 13 * SIZE]
	STF	c14, [BO + 14 * SIZE]
	STF	c16, [BO + 15 * SIZE]
#else
	STF	c01, [AO +  0 * SIZE]
	STF	c02, [AO +  1 * SIZE]
	STF	c03, [AO +  2 * SIZE]
	STF	c04, [AO +  3 * SIZE]

	STF	c05, [AO +  4 * SIZE]
	STF	c06, [AO +  5 * SIZE]
	STF	c07, [AO +  6 * SIZE]
	STF	c08, [AO +  7 * SIZE]

	STF	c09, [AO +  8 * SIZE]
	STF	c10, [AO +  9 * SIZE]
	STF	c11, [AO + 10 * SIZE]
	STF	c12, [AO + 11 * SIZE]

	STF	c13, [AO + 12 * SIZE]
	STF	c14, [AO + 13 * SIZE]
	STF	c15, [AO + 14 * SIZE]
	STF	c16, [AO + 15 * SIZE]
#endif

	/* result tile into the eight columns of C */
	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C2 + 0 * SIZE]
	STF	c04, [C2 + 1 * SIZE]

	STF	c05, [C3 + 0 * SIZE]
	STF	c06, [C3 + 1 * SIZE]
	STF	c07, [C4 + 0 * SIZE]
	STF	c08, [C4 + 1 * SIZE]

	STF	c09, [C5 + 0 * SIZE]
	STF	c10, [C5 + 1 * SIZE]
	STF	c11, [C6 + 0 * SIZE]
	STF	c12, [C6 + 1 * SIZE]

	STF	c13, [C7 + 0 * SIZE]
	STF	c14, [C7 + 1 * SIZE]
	STF	c15, [C8 + 0 * SIZE]
	STF	c16, [C8 + 1 * SIZE]

#ifndef LN
	add	C1, 2 * SIZE, C1
	add	C2, 2 * SIZE, C2
	add	C3, 2 * SIZE, C3
	add	C4, 2 * SIZE, C4
	add	C5, 2 * SIZE, C5
	add	C6, 2 * SIZE, C6
	add	C7, 2 * SIZE, C7
	add	C8, 2 * SIZE, C8
#endif
2040
/*
 * Bookkeeping after one 2x8 tile, then loop over the remaining row pairs.
 *   RT      : AORIG += K << (BASE_SHIFT + 1)          (two elements per k)
 *   LT / RN : skip the K - KK steps that were not consumed:
 *             AO += (K - KK) << (BASE_SHIFT + 1)
 *             BO += (K - KK) << (BASE_SHIFT + 3)
 *   LT      : KK += 2
 *   LN      : KK -= 2
 * I counts the tiles still to do; control returns to .LL12 while I > 0.
 */
#ifdef RT
	sll	K, BASE_SHIFT + 1, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 3, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 2, KK
#endif

#ifdef LN
	sub	KK, 2, KK
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL12
	/* delay slot */
	nop
	.align 4
2067
/*
 * .LL29: end of one 8-column panel.
 *   LN      : B += K << (BASE_SHIFT + 3)   (eight elements per k-step)
 *   LT / RN : B = BO                        (BO already points past the panel)
 *   RN      : KK += 8
 *   RT      : KK -= 8
 * J counts the panels still to do; control returns to .LL11 while J > 0.
 */
.LL29:
#ifdef LN
	sll	K, BASE_SHIFT + 3, TEMP1
	add	B, TEMP1, B
#endif

#if defined(LT) || defined(RN)
	mov	BO, B
#endif

#ifdef RN
	add	KK, 8, KK
#endif

#ifdef RT
	sub	KK, 8, KK
#endif

	add	J, -1, J
	cmp	J, 0
	bg,pt	%icc, .LL11
	/* delay slot */
	nop
	.align 4
2091
/*
 * .LL30: entry to the 4-column panel, taken when bit 2 of N is set
 * (J = N & 4); otherwise control skips to .LL50.
 *   RT     : B -= K << (BASE_SHIFT + 2)     (four elements per k-step)
 *   not RT : C1..C4 = C, C + LDC, C + 2*LDC, C + 3*LDC and C += 4*LDC
 *   RT     : C4..C1 = C - LDC, .., C - 4*LDC; C is then set to C2 - LDC,
 *            which is the same address as C1
 *   LN     : KK = M + OFFSET
 *   LT     : KK = OFFSET
 *   LN/RT  : AORIG = A;  otherwise AO = A
 * The odd row is handled first: I = M & 1, and when it is zero control
 * skips to .LL40.
 * NOTE(review): LDC is added directly to addresses, so it is presumably
 * already scaled to bytes before this point - confirm.
 */
.LL30:
	and	N, 4, J
	cmp	J, 0
	ble,pn	%icc, .LL50
	nop

#ifdef RT
	sll	K, BASE_SHIFT + 2, TEMP1
	sub	B, TEMP1, B
#endif

#ifndef RT
	mov	C,  C1
	add	C,  LDC, C2
	add	C2, LDC, C3
	add	C3, LDC, C4
	add	C4, LDC, C
#else
	sub	C,  LDC, C4
	sub	C4, LDC, C3
	sub	C3, LDC, C2
	sub	C2, LDC, C1
	/* same value as C1: C now points at the first column of this panel */
	sub	C2, LDC, C
#endif

#ifdef LN
	add	M, OFFSET, KK
#endif

#ifdef LT
	mov	OFFSET, KK
#endif

#if defined(LN) || defined(RT)
	mov	A, AORIG
#else
	mov	A, AO
#endif

	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL40
	nop
2135
/*
 * Set up the 1x4 tile (single leftover row, four columns).
 *   LT / RN : BO = B
 *   LN      : AORIG -= K << BASE_SHIFT          (one element per k-step)
 *   LN / RT : AO = AORIG + (KK << BASE_SHIFT)
 *             BO = B     + (KK << (BASE_SHIFT + 2))
 * Then preload a1..a4 (four k-steps of A) and b1..b9, clear the four
 * accumulators c01, c03, c05, c07, and compute the trip count of the
 * 4-way unrolled loop: L = KK >> 2 (LT / RN) or (K - KK) >> 2 (LN / RT).
 * When L <= 0 control skips to .LL45.
 */
#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K,  BASE_SHIFT + 0, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	sll	KK, BASE_SHIFT + 0, TEMP1
	sll	KK, BASE_SHIFT + 2, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc01)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc03)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc05)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc07)

#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL45
	nop
2179
/*
 * .LL43: 1x4 inner loop, four k-steps per iteration.
 * Step 0 uses a1 with b1..b4, step 1 uses a2 with b5..b8, step 2 uses a3
 * with b9, b2, b3, b4, and step 3 uses a4 with b5..b8; each step adds into
 * c01, c03, c05, c07.  Every operand register is reloaded for its next use
 * right after it is consumed, so the statement order is part of the
 * algorithm.  Per iteration AO advances by 4 elements and BO by 16.
 * NOTE(review): FMADD (x, y, z, w) is presumably w = x * y + z; the macro is
 * defined outside this chunk - confirm.
 */
.LL43:
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 16 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  9 * SIZE], b2
	FMADD	(aa1, bb3, cc05, cc05)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 11 * SIZE], b4

	LDF	[AO +  4 * SIZE], a1
	/* sets the condition codes tested by the bg at the bottom of the loop */
	cmp	L, 0

	FMADD	(aa2, bb5, cc01, cc01)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa2, bb6, cc03, cc03)
	LDF	[BO + 13 * SIZE], b6
	FMADD	(aa2, bb7, cc05, cc05)
	LDF	[BO + 14 * SIZE], b7
	FMADD	(aa2, bb8, cc07, cc07)
	LDF	[BO + 15 * SIZE], b8

	LDF	[AO +  5 * SIZE], a2
	add	AO,  4 * SIZE, AO

	FMADD	(aa3, bb9, cc01, cc01)
	LDF	[BO + 24 * SIZE], b9
	FMADD	(aa3, bb2, cc03, cc03)
	LDF	[BO + 17 * SIZE], b2
	FMADD	(aa3, bb3, cc05, cc05)
	LDF	[BO + 18 * SIZE], b3
	FMADD	(aa3, bb4, cc07, cc07)
	LDF	[BO + 19 * SIZE], b4

	/* AO was already advanced: offset 2 is element 2 of the next group */
	LDF	[AO +  2 * SIZE], a3
	add	BO, 16 * SIZE, BO

	FMADD	(aa4, bb5, cc01, cc01)
	LDF	[BO +  4 * SIZE], b5
	FMADD	(aa4, bb6, cc03, cc03)
	LDF	[BO +  5 * SIZE], b6
	FMADD	(aa4, bb7, cc05, cc05)
	LDF	[BO +  6 * SIZE], b7
	FMADD	(aa4, bb8, cc07, cc07)
	LDF	[BO +  7 * SIZE], b8

	bg,pt	%icc, .LL43
	/* delay slot: executed whether or not the branch is taken */
	LDF	[AO +  3 * SIZE], a4
	.align 4
2232
/*
 * .LL45: remainder count for the 1x4 tile after the 4-way unrolled loop:
 * L = KK & 3 (LT / RN) or (K - KK) & 3 (LN / RT).  When L <= 0 control goes
 * directly to .LL48.
 */
.LL45:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	/* annulled branch: the nop in the delay slot only runs when taken */
	ble,a,pn %icc, .LL48
	nop
	.align 4
2244
/*
 * .LL47: 1x4 remainder loop, one k-step per iteration.
 * a1 is multiplied with b1..b4 into c01, c03, c05, c07; b1..b4 are reloaded
 * from the next row (offsets 4..7 before BO is advanced by 4) and a1 from
 * the next element of A (AO has already been advanced by 1).
 */
.LL47:
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 4 * SIZE], b1
	add	L, -1, L
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 5 * SIZE], b2
	add	AO, 1 * SIZE, AO

	FMADD	(aa1, bb3, cc05, cc05)
	LDF	[BO + 6 * SIZE], b3
	/* sets the condition codes tested by the bg below */
	cmp	L, 0
	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 7 * SIZE], b4
	add	BO, 4 * SIZE, BO

	bg,pt	%icc, .LL47
	/* delay slot: executed whether or not the branch is taken */
	LDF	[AO + 0 * SIZE], a1
	.align 4
2263
/*
 * .LL48: start of the solve phase for the 1x4 tile.
 * For LN / RT the packed pointers are repositioned first:
 *   TEMP1 = KK - 1 (LN)  or  KK - 4 (RT)
 *   AO    = AORIG + (TEMP1 << BASE_SHIFT)
 *   BO    = B     + (TEMP1 << (BASE_SHIFT + 2))
 * Then four right-hand-side values are loaded (from BO for LN / LT, from AO
 * otherwise) and each accumulator is replaced by (loaded value - c).
 * NOTE(review): this assumes FSUB follows the SPARC operand order
 * (rs1, rs2, rd), i.e. rd = rs1 - rs2 - confirm.
 */
.LL48:
#if defined(LN) || defined(RT)
#ifdef LN
	sub	KK, 1, TEMP1
#else
	sub	KK, 4, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 0, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif

#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03
	FSUB	a3, c05, c05
	FSUB	a4, c07, c07
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03
	FSUB	a3, c05, c05
	FSUB	a4, c07, c07
#endif
2299
/*
 * Solve step for the 1x4 tile.
 *   LN / LT : the A block is a single entry; every accumulator is multiplied
 *             by the value at AO + 0.
 *   RN      : substitution over the four columns in ascending order using a
 *             4x4 block in BO with row stride 4; row i is read starting at
 *             its diagonal, BO + 5 * i.
 *   RT      : the same in descending order; row i is read from its diagonal
 *             BO + 5 * i down to the start of the row, BO + 4 * i.
 * Diagonal entries are applied with a multiply, so the packed data
 * presumably holds reciprocals of the diagonal - confirm against the packer.
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#if defined(LN) || defined(LT)
	LDF	[AO +  0 * SIZE], a1

	FMUL	a1, c01, c01
	FMUL	a1, c03, c03
	FMUL	a1, c05, c05
	FMUL	a1, c07, c07
#endif

#ifdef RN
	/* row 0: offsets 0..3 */
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	FMUL	a1, c01, c01

	FNMSUB	(aa2, cc01, cc03, cc03)
	FNMSUB	(aa3, cc01, cc05, cc05)
	FNMSUB	(aa4, cc01, cc07, cc07)

	/* row 1: diagonal at offset 5 */
	LDF	[BO +  5 * SIZE], a1
	LDF	[BO +  6 * SIZE], a2
	LDF	[BO +  7 * SIZE], a3

	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc03, cc05, cc05)
	FNMSUB	(aa3, cc03, cc07, cc07)

	/* row 2: diagonal at offset 10 */
	LDF	[BO + 10 * SIZE], a1
	LDF	[BO + 11 * SIZE], a2

	FMUL	a1, c05, c05

	FNMSUB	(aa2, cc05, cc07, cc07)

	/* row 3: only the diagonal at offset 15 remains */
	LDF	[BO + 15 * SIZE], a1

	FMUL	a1, c07, c07
#endif

#ifdef RT
	/* row 3: offsets 15 down to 12 */
	LDF	[BO + 15 * SIZE], a1
	LDF	[BO + 14 * SIZE], a2
	LDF	[BO + 13 * SIZE], a3
	LDF	[BO + 12 * SIZE], a4

	FMUL	a1, c07, c07

	FNMSUB	(aa2, cc07, cc05, cc05)
	FNMSUB	(aa3, cc07, cc03, cc03)
	FNMSUB	(aa4, cc07, cc01, cc01)

	/* row 2: diagonal at offset 10 */
	LDF	[BO + 10 * SIZE], a1
	LDF	[BO +  9 * SIZE], a2
	LDF	[BO +  8 * SIZE], a3

	FMUL	a1, c05, c05

	FNMSUB	(aa2, cc05, cc03, cc03)
	FNMSUB	(aa3, cc05, cc01, cc01)

	/* row 1: diagonal at offset 5 */
	LDF	[BO +  5 * SIZE], a1
	LDF	[BO +  4 * SIZE], a2

	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc03, cc01, cc01)

	/* row 0: only the diagonal at offset 0 remains */
	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c01, c01
#endif
2374
/*
 * Write the solved 1x4 tile back and update the bookkeeping.
 *   LN      : step C1..C4 back by one element before storing
 *   LN / LT : store c01, c03, c05, c07 to BO + 0..3; otherwise to AO + 0..3
 *   always  : store one element into each of the four C columns
 *   RT      : AORIG += K << BASE_SHIFT
 *   LT / RN : AO += (K - KK) << BASE_SHIFT
 *             BO += (K - KK) << (BASE_SHIFT + 2)
 *   LT      : KK += 1
 *   LN      : KK -= 1
 * C1..C4 are not advanced here for the non-LN variants; whether that is
 * required depends on code outside this chunk.
 */
#ifdef LN
	add	C1, -1 * SIZE, C1
	add	C2, -1 * SIZE, C2
	add	C3, -1 * SIZE, C3
	add	C4, -1 * SIZE, C4
#endif

#if defined(LN) || defined(LT)
	STF	c01, [BO +  0 * SIZE]
	STF	c03, [BO +  1 * SIZE]
	STF	c05, [BO +  2 * SIZE]
	STF	c07, [BO +  3 * SIZE]
#else
	STF	c01, [AO +  0 * SIZE]
	STF	c03, [AO +  1 * SIZE]
	STF	c05, [AO +  2 * SIZE]
	STF	c07, [AO +  3 * SIZE]
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c03, [C2 + 0 * SIZE]
	STF	c05, [C3 + 0 * SIZE]
	STF	c07, [C4 + 0 * SIZE]

#ifdef RT
	sll	K, BASE_SHIFT + 0, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT + 0, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 1, KK
#endif

#ifdef LN
	sub	KK, 1, KK
#endif
	.align 4
2420
/*
 * .LL40: row-pair loop of the 4-column panel.  I = M >> 1 is the number of
 * 2x4 tiles; when it is zero control skips to .LL49.
 */
.LL40:
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL49
	nop
	.align 4
2427
/*
 * .LL32: set up one 2x4 tile.
 *   LT / RN : BO = B
 *   LN      : AORIG -= K << (BASE_SHIFT + 1)      (two elements per k-step)
 *   LN / RT : AO = AORIG + (KK << (BASE_SHIFT + 1))
 *             BO = B     + (KK << (BASE_SHIFT + 2))
 * Then preload a1, a2 and b1..b9, clear the eight accumulators c01..c08,
 * prefetch the four destination columns, and compute the trip count of the
 * 4-way unrolled loop: L = KK >> 2 (LT / RN) or (K - KK) >> 2 (LN / RT).
 * When L <= 0 control skips to .LL35.
 */
.LL32:
#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K,  BASE_SHIFT + 1, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 2, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5

	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc01)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc02)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc03)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc04)

	/* prefetch the C columns that the tile will be stored to */
	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc05)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc06)
	prefetch [C3 + 2 * SIZE], 3
	FCLR	(cc07)
	prefetch [C4 + 2 * SIZE], 3
	FCLR	(cc08)

#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL35
	nop
	.align 4
2481
/*
 * .LL33: 2x4 inner loop, four k-steps per iteration.
 * Steps alternate between the register pairs (a1, a2) and (a3, a4) for A,
 * and between b1..b4 / b5..b8 for B, with b9 standing in for b1 on the
 * third step.  Each step adds eight products into c01..c08.  Operands are
 * reloaded as soon as they are free, so the statement order is part of the
 * algorithm.  Per iteration AO advances by 8 elements and BO by 16.
 * NOTE(review): FMADD (x, y, z, w) is presumably w = x * y + z; the macro is
 * defined outside this chunk - confirm.
 */
.LL33:
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO +  2 * SIZE], a3
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO +  3 * SIZE], a4

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 16 * SIZE], b1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD	(aa2, bb3, cc06, cc06)
	add	L, -1, L

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	/* second k-step: a3, a4 with b5..b8 */
	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa4, bb5, cc02, cc02)
	LDF	[AO +  5 * SIZE], a2

	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa4, bb6, cc04, cc04)
	LDF	[BO + 13 * SIZE], b6

	FMADD	(aa3, bb7, cc05, cc05)
	/* sets the condition codes tested by the bg at the bottom of the loop */
	cmp	L, 0
	FMADD	(aa4, bb7, cc06, cc06)
	add	AO,  8 * SIZE, AO

	FMADD	(aa3, bb8, cc07, cc07)
	LDF	[BO + 14 * SIZE], b7
	FMADD	(aa4, bb8, cc08, cc08)
	LDF	[BO + 15 * SIZE], b8

	/* third k-step: a1, a2 with b9, b2, b3, b4; AO was already advanced,
	   so the negative offsets address the fourth k-step of this group */
	FMADD	(aa1, bb9, cc01, cc01)
	LDF	[AO -  2 * SIZE], a3
	FMADD	(aa2, bb9, cc02, cc02)
	LDF	[AO -  1 * SIZE], a4

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 24 * SIZE], b9
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO + 17 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	add	BO, 16 * SIZE, BO
	FMADD	(aa2, bb3, cc06, cc06)
	nop

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO +  2 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO +  3 * SIZE], b4

	/* fourth k-step: a3, a4 with b5..b8 */
	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[AO +  0 * SIZE], a1
	FMADD	(aa4, bb5, cc02, cc02)
	LDF	[AO +  1 * SIZE], a2
	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO +  4 * SIZE], b5
	FMADD	(aa4, bb6, cc04, cc04)
	LDF	[BO +  5 * SIZE], b6

	FMADD	(aa3, bb7, cc05, cc05)
	nop
	FMADD	(aa4, bb7, cc06, cc06)
	LDF	[BO +  6 * SIZE], b7

	FMADD	(aa3, bb8, cc07, cc07)
	FMADD	(aa4, bb8, cc08, cc08)
	bg,pt	%icc, .LL33
	/* delay slot: executed whether or not the branch is taken */
	LDF	[BO +  7 * SIZE], b8
	.align 4
2562
/*
 * .LL35: remainder count for the 2x4 tile after the 4-way unrolled loop:
 * L = KK & 3 (LT / RN) or (K - KK) & 3 (LN / RT).  When L <= 0 control goes
 * directly to .LL38.
 */
.LL35:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	/* annulled branch: the nop in the delay slot only runs when taken */
	ble,a,pn %icc, .LL38
	nop
	.align 4
2574
/*
 * .LL37: 2x4 remainder loop, one k-step per iteration.
 * a1 and a2 are multiplied with b1..b4 into c01..c08.  b1..b4 are reloaded
 * from offsets 4..7 (BO is only advanced in the branch delay slot), and
 * a1, a2 from offsets 0 and 1 after AO has been advanced by 2.
 */
.LL37:
	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO + 4 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	add	AO, 2 * SIZE, AO
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO + 5 * SIZE], b2

	FMADD	(aa1, bb3, cc05, cc05)
	/* sets the condition codes tested by the bg below */
	cmp	L, 0
	FMADD	(aa2, bb3, cc06, cc06)
	LDF	[BO + 6 * SIZE], b3

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[AO + 0 * SIZE], a1
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[AO + 1 * SIZE], a2

	LDF	[BO + 7 * SIZE], b4
	bg,pt	%icc, .LL37
	/* delay slot: BO is advanced whether or not the branch is taken */
	add	BO, 4 * SIZE, BO
	.align 4
2600
/*
 * .LL38: start of the solve phase for the 2x4 tile.
 * For LN / RT the packed pointers are repositioned first:
 *   TEMP1 = KK - 2 (LN)  or  KK - 4 (RT)
 *   AO    = AORIG + (TEMP1 << (BASE_SHIFT + 1))
 *   BO    = B     + (TEMP1 << (BASE_SHIFT + 2))
 * Then eight right-hand-side values are loaded and each accumulator is
 * replaced by (loaded value - c):
 *   LN / LT : from BO; offsets 0..3 pair with c01, c03, c05, c07 and
 *             offsets 4..7 with c02, c04, c06, c08
 *   RN / RT : from AO; offsets 0..7 pair with c01..c08 in order
 * NOTE(review): this assumes FSUB follows the SPARC operand order
 * (rs1, rs2, rd), i.e. rd = rs1 - rs2 - confirm.
 */
.LL38:
#if defined(LN) || defined(RT)
#ifdef LN
	sub	KK, 2, TEMP1
#else
	sub	KK, 4, TEMP1
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif

#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	LDF	[BO +  4 * SIZE], b1
	LDF	[BO +  5 * SIZE], b2
	LDF	[BO +  6 * SIZE], b3
	LDF	[BO +  7 * SIZE], b4

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03
	FSUB	a3, c05, c05
	FSUB	a4, c07, c07

	FSUB	b1, c02, c02
	FSUB	b2, c04, c04
	FSUB	b3, c06, c06
	FSUB	b4, c08, c08
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[AO +  4 * SIZE], b1
	LDF	[AO +  5 * SIZE], b2
	LDF	[AO +  6 * SIZE], b3
	LDF	[AO +  7 * SIZE], b4

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
	FSUB	a3, c03, c03
	FSUB	a4, c04, c04

	FSUB	b1, c05, c05
	FSUB	b2, c06, c06
	FSUB	b3, c07, c07
	FSUB	b4, c08, c08

#endif
2657
/*
 * Left-side solve for the 2x4 tile, using three entries of the packed 2x2
 * block of A.  For each of the four columns:
 *   LN : c_even *= A[3];  c_odd  -= A[2] * c_even;  c_odd  *= A[0]
 *   LT : c_odd  *= A[0];  c_even -= A[1] * c_odd;   c_even *= A[3]
 * Diagonal entries are applied with a multiply, so the packed data
 * presumably holds reciprocals of the diagonal - confirm against the packer.
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#ifdef LN
	LDF	[AO +  3 * SIZE], a1
	LDF	[AO +  2 * SIZE], a2
	LDF	[AO +  0 * SIZE], a3

	FMUL	a1, c02, c02
	FMUL	a1, c04, c04
	FMUL	a1, c06, c06
	FMUL	a1, c08, c08

	FNMSUB	(aa2, cc02, cc01, cc01)
	FNMSUB	(aa2, cc04, cc03, cc03)
	FNMSUB	(aa2, cc06, cc05, cc05)
	FNMSUB	(aa2, cc08, cc07, cc07)

	FMUL	a3, c01, c01
	FMUL	a3, c03, c03
	FMUL	a3, c05, c05
	FMUL	a3, c07, c07
#endif

#ifdef LT
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  3 * SIZE], a3

	FMUL	a1, c01, c01
	FMUL	a1, c03, c03
	FMUL	a1, c05, c05
	FMUL	a1, c07, c07

	FNMSUB	(aa2, cc01, cc02, cc02)
	FNMSUB	(aa2, cc03, cc04, cc04)
	FNMSUB	(aa2, cc05, cc06, cc06)
	FNMSUB	(aa2, cc07, cc08, cc08)

	FMUL	a3, c02, c02
	FMUL	a3, c04, c04
	FMUL	a3, c06, c06
	FMUL	a3, c08, c08
#endif
2699
/*
 * Right-side solve for the 2x4 tile over the column pairs (c01,c02),
 * (c03,c04), (c05,c06), (c07,c08), using a 4x4 block in BO with row
 * stride 4.
 *   RN : ascending order; row i is read starting at its diagonal BO + 5 * i.
 *        Pair i is multiplied by the diagonal entry, then entry j times
 *        pair i is subtracted from every later pair j > i.
 *   RT : descending order; row i is read from its diagonal BO + 5 * i down
 *        to the start of the row BO + 4 * i, updating the earlier pairs.
 * NOTE(review): FNMSUB (x, y, z, w) is presumably w = z - x * y; the macro is
 * defined outside this chunk - confirm.
 */
#ifdef RN
	/* row 0: offsets 0..3 */
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	FMUL	a1, c01, c01
	FMUL	a1, c02, c02

	FNMSUB	(aa2, cc01, cc03, cc03)
	FNMSUB	(aa2, cc02, cc04, cc04)
	FNMSUB	(aa3, cc01, cc05, cc05)
	FNMSUB	(aa3, cc02, cc06, cc06)
	FNMSUB	(aa4, cc01, cc07, cc07)
	FNMSUB	(aa4, cc02, cc08, cc08)

	/* row 1: diagonal at offset 5 */
	LDF	[BO +  5 * SIZE], a1
	LDF	[BO +  6 * SIZE], a2
	LDF	[BO +  7 * SIZE], a3

	FMUL	a1, c03, c03
	FMUL	a1, c04, c04

	FNMSUB	(aa2, cc03, cc05, cc05)
	FNMSUB	(aa2, cc04, cc06, cc06)
	FNMSUB	(aa3, cc03, cc07, cc07)
	FNMSUB	(aa3, cc04, cc08, cc08)

	/* row 2: diagonal at offset 10 */
	LDF	[BO + 10 * SIZE], a1
	LDF	[BO + 11 * SIZE], a2

	FMUL	a1, c05, c05
	FMUL	a1, c06, c06

	FNMSUB	(aa2, cc05, cc07, cc07)
	FNMSUB	(aa2, cc06, cc08, cc08)

	/* row 3: only the diagonal at offset 15 remains */
	LDF	[BO + 15 * SIZE], a1

	FMUL	a1, c07, c07
	FMUL	a1, c08, c08
#endif

#ifdef RT
	/* row 3: offsets 15 down to 12 */
	LDF	[BO + 15 * SIZE], a1
	LDF	[BO + 14 * SIZE], a2
	LDF	[BO + 13 * SIZE], a3
	LDF	[BO + 12 * SIZE], a4

	FMUL	a1, c08, c08
	FMUL	a1, c07, c07

	FNMSUB	(aa2, cc08, cc06, cc06)
	FNMSUB	(aa2, cc07, cc05, cc05)
	FNMSUB	(aa3, cc08, cc04, cc04)
	FNMSUB	(aa3, cc07, cc03, cc03)
	FNMSUB	(aa4, cc08, cc02, cc02)
	FNMSUB	(aa4, cc07, cc01, cc01)

	/* row 2: diagonal at offset 10 */
	LDF	[BO + 10 * SIZE], a1
	LDF	[BO +  9 * SIZE], a2
	LDF	[BO +  8 * SIZE], a3

	FMUL	a1, c06, c06
	FMUL	a1, c05, c05

	FNMSUB	(aa2, cc06, cc04, cc04)
	FNMSUB	(aa2, cc05, cc03, cc03)
	FNMSUB	(aa3, cc06, cc02, cc02)
	FNMSUB	(aa3, cc05, cc01, cc01)

	/* row 1: diagonal at offset 5 */
	LDF	[BO +  5 * SIZE], a1
	LDF	[BO +  4 * SIZE], a2

	FMUL	a1, c04, c04
	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc04, cc02, cc02)
	FNMSUB	(aa2, cc03, cc01, cc01)

	/* row 0: only the diagonal at offset 0 remains */
	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c02, c02
	FMUL	a1, c01, c01
#endif
2785
2786#ifdef LN
2787	add	C1, -2 * SIZE, C1
2788	add	C2, -2 * SIZE, C2
2789	add	C3, -2 * SIZE, C3
2790	add	C4, -2 * SIZE, C4
2791#endif
2792
2793#if defined(LN) || defined(LT)
2794	STF	c01, [BO +  0 * SIZE]
2795	STF	c03, [BO +  1 * SIZE]
2796	STF	c05, [BO +  2 * SIZE]
2797	STF	c07, [BO +  3 * SIZE]
2798
2799	STF	c02, [BO +  4 * SIZE]
2800	STF	c04, [BO +  5 * SIZE]
2801	STF	c06, [BO +  6 * SIZE]
2802	STF	c08, [BO +  7 * SIZE]
2803#else
2804	STF	c01, [AO +  0 * SIZE]
2805	STF	c02, [AO +  1 * SIZE]
2806	STF	c03, [AO +  2 * SIZE]
2807	STF	c04, [AO +  3 * SIZE]
2808
2809	STF	c05, [AO +  4 * SIZE]
2810	STF	c06, [AO +  5 * SIZE]
2811	STF	c07, [AO +  6 * SIZE]
2812	STF	c08, [AO +  7 * SIZE]
2813#endif
2814
2815	STF	c01, [C1 + 0 * SIZE]
2816	STF	c02, [C1 + 1 * SIZE]
2817	STF	c03, [C2 + 0 * SIZE]
2818	STF	c04, [C2 + 1 * SIZE]
2819
2820	STF	c05, [C3 + 0 * SIZE]
2821	STF	c06, [C3 + 1 * SIZE]
2822	STF	c07, [C4 + 0 * SIZE]
2823	STF	c08, [C4 + 1 * SIZE]
2824
2825#ifndef LN
2826	add	C1, 2 * SIZE, C1
2827	add	C2, 2 * SIZE, C2
2828	add	C3, 2 * SIZE, C3
2829	add	C4, 2 * SIZE, C4
2830#endif
2831
2832#ifdef RT
2833	sll	K, BASE_SHIFT + 1, TEMP1
2834	add	AORIG, TEMP1, AORIG
2835#endif
2836
2837#if defined(LT) || defined(RN)
2838	sub	K, KK, TEMP1
2839	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2840	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2841	add	AO, TEMP2, AO
2842	add	BO, TEMP1, BO
2843#endif
2844
2845#ifdef LT
2846	add	KK, 2, KK
2847#endif
2848
2849#ifdef LN
2850	sub	KK, 2, KK
2851#endif
2852
2853	add	I, -1, I
2854	cmp	I, 0
2855	bg,pt	%icc, .LL32
2856	nop
2857
/*
 * .LL49: fall-through target of the `bg .LL32` tile loop just above (it may
 * also be branched to from code outside this excerpt).  Performs the
 * per-variant bookkeeping of B and KK before the N & 2 section at .LL50.
 * NOTE(review): LN/LT/RN/RT, KK, TEMP1 and BASE_SHIFT are defined outside
 * this excerpt; the reading "step by a four-column panel" is inferred from
 * the shift/step amounts used here -- confirm against their definitions.
 */
2858.LL49:
2859#ifdef LN
2860	sll	K, BASE_SHIFT + 2, TEMP1	/* TEMP1 = K << (BASE_SHIFT + 2) */
2861	add	B, TEMP1, B	/* LN: step B forward by that amount */
2862#endif
2863
2864#if defined(LT) || defined(RN)
2865	mov	BO, B	/* LT/RN: B continues from where BO stopped */
2866#endif
2867
2868#ifdef RN
2869	add	KK, 4, KK	/* RN: KK grows by 4 */
2870#endif
2871
2872#ifdef RT
2873	sub	KK, 4, KK	/* RT: KK shrinks by 4 */
2874#endif
2875	.align 4
2876
/*
 * .LL50: two-column panel, executed only when bit 1 of N is set (otherwise
 * control skips to .LL70).  This block
 *   - sets up the column pointers C1/C2 and moves C past the two columns
 *     (forward by 2*LDC unless RT, backward by 2*LDC for RT),
 *   - initialises KK (LN: M + OFFSET, LT: OFFSET) and AORIG or AO from A,
 *   - and, when M is odd, processes one 1x2 tile using the accumulators
 *     cc01 (stored to C1[0]) and cc03 (stored to C2[0]).
 * NOTE(review): LN/LT/RN/RT, KK, OFFSET, AORIG, TEMP1/TEMP2, BASE_SHIFT, SIZE
 * and the LDF/STF/FMUL/FSUB/FMADD/FNMSUB/FCLR macros are defined outside this
 * excerpt; remarks about their meaning are inferred from how they are used
 * here -- confirm against their definitions.
 */
2877.LL50:
2878	and	N, 2, J
2879	cmp	J, 0
2880	ble,pn	%icc, .LL70	/* no two-column panel: go to the N & 1 section */
2881	nop
2882
2883#ifdef RT
2884	sll	K, BASE_SHIFT + 1, TEMP1
2885	sub	B, TEMP1, B	/* RT: move B back by K << (BASE_SHIFT + 1) */
2886#endif
2887
2888#ifndef RT
2889	mov	C,  C1	/* C1 = C */
2890	add	C,  LDC, C2	/* C2 = C + LDC */
2891	add	C2, LDC, C	/* C  = C + 2*LDC for the next panel */
2892#else
2893	sub	C,  LDC, C2	/* C2 = C - LDC */
2894	sub	C2, LDC, C1	/* C1 = C - 2*LDC */
2895	sub	C2, LDC, C	/* C  = C - 2*LDC (same address as C1) */
2896#endif
2897
2898#ifdef LN
2899	add	M, OFFSET, KK
2900#endif
2901
2902#ifdef LT
2903	mov	OFFSET, KK
2904#endif
2905
2906#if defined(LN) || defined(RT)
2907	mov	A, AORIG
2908#else
2909	mov	A, AO
2910#endif
2911
2912	and	M, 1, I
2913	cmp	I, 0
2914	ble,pn	%icc, .LL60	/* M even: no single-row tile, go to the two-row tiles */
2915	nop
2916
/* ---- 1x2 tile: position AO/BO for the accumulation ---- */
2917#if defined(LT) || defined(RN)
2918	mov	B, BO
2919#else
2920#ifdef LN
2921	sll	K,  BASE_SHIFT + 0, TEMP1
2922	sub	AORIG, TEMP1, AORIG	/* LN: step AORIG back by K << BASE_SHIFT */
2923#endif
2924
2925	sll	KK, BASE_SHIFT + 0, TEMP1
2926	sll	KK, BASE_SHIFT + 1, TEMP2
2927
2928	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << BASE_SHIFT) */
2929	add	B,     TEMP2, BO	/* BO = B + (KK << (BASE_SHIFT + 1)) */
2930#endif
2931
/* Preload the operands of the first unrolled iteration: 4 values from AO, 8 from BO. */
2932	LDF	[AO +  0 * SIZE], a1
2933	LDF	[AO +  1 * SIZE], a2
2934	LDF	[AO +  2 * SIZE], a3
2935	LDF	[AO +  3 * SIZE], a4
2936
2937	LDF	[BO +  0 * SIZE], b1
2938	LDF	[BO +  1 * SIZE], b2
2939	LDF	[BO +  2 * SIZE], b3
2940	LDF	[BO +  3 * SIZE], b4
2941	LDF	[BO +  4 * SIZE], b5
2942	LDF	[BO +  5 * SIZE], b6
2943	LDF	[BO +  6 * SIZE], b7
2944	FCLR	(cc01)	/* presumably zeroes the accumulator -- macro not visible here */
2945	LDF	[BO +  7 * SIZE], b8
2946	FCLR	(cc03)
2947
/* L = number of 4x-unrolled iterations: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
2948#if defined(LT) || defined(RN)
2949	sra	KK, 2, L
2950#else
2951	sub	K, KK, L
2952	sra	L,  2, L
2953#endif
2954	cmp	L,  0
2955	ble,pn	%icc, .LL65
2956	nop
2957	.align 4
2958
/*
 * .LL63: unrolled accumulation loop.  Each pass issues 8 FMADDs (four steps,
 * each updating cc01 and cc03), advances AO by 4*SIZE and BO by 8*SIZE, and
 * reloads a1..a4 / b1..b8 for the next pass.  The pointer bumps happen in the
 * middle of the pass, which is why the later loads use smaller offsets.
 * The loop condition is set by `cmp L, 0` early in the pass; the load after
 * the branch sits in its delay slot.
 */
2959.LL63:
2960	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2961	add	L, -1, L
2962
2963	FMADD	(aa1, bb1, cc01, cc01)
2964	LDF	[BO +  8 * SIZE], b1
2965	FMADD	(aa1, bb2, cc03, cc03)
2966	LDF	[BO +  9 * SIZE], b2
2967
2968	LDF	[AO +  4 * SIZE], a1
2969	cmp	L, 0
2970
2971	FMADD	(aa2, bb3, cc01, cc01)
2972	LDF	[BO + 10 * SIZE], b3
2973	FMADD	(aa2, bb4, cc03, cc03)
2974	LDF	[BO + 11 * SIZE], b4
2975
2976	LDF	[AO +  5 * SIZE], a2
2977	add	AO,  4 * SIZE, AO
2978
2979	FMADD	(aa3, bb5, cc01, cc01)
2980	LDF	[BO + 12 * SIZE], b5
2981	FMADD	(aa3, bb6, cc03, cc03)
2982	LDF	[BO + 13 * SIZE], b6
2983
2984	LDF	[AO +  2 * SIZE], a3
2985	add	BO,  8 * SIZE, BO
2986
2987	FMADD	(aa4, bb7, cc01, cc01)
2988	LDF	[BO +  6 * SIZE], b7
2989	FMADD	(aa4, bb8, cc03, cc03)
2990	LDF	[BO + 7 * SIZE], b8
2991
2992	bg,pt	%icc, .LL63
2993	LDF	[AO +  3 * SIZE], a4	/* delay slot */
2994	.align 4
2995
/* .LL65: remainder count L = (KK or K - KK) & 3; skip the tail loop when it is zero. */
2996.LL65:
2997#if defined(LT) || defined(RN)
2998	and	KK, 3, L
2999#else
3000	sub	K, KK, L
3001	and	L,  3, L
3002#endif
3003	cmp	L,  0
3004	ble,a,pn %icc, .LL68
3005	nop
3006	.align 4
3007
/* .LL67: tail loop, one step per pass (AO += 1*SIZE, BO += 2*SIZE; the BO bump is in the delay slot). */
3008.LL67:
3009	FMADD	(aa1, bb1, cc01, cc01)
3010	LDF	[BO + 2 * SIZE], b1
3011	FMADD	(aa1, bb2, cc03, cc03)
3012	LDF	[BO + 3 * SIZE], b2
3013
3014	LDF	[AO + 1 * SIZE], a1
3015	add	L, -1, L
3016	add	AO, 1 * SIZE, AO
3017	cmp	L, 0
3018
3019	bg,pt	%icc, .LL67
3020	add	BO, 2 * SIZE, BO
3021	.align 4
3022
/*
 * .LL68: solve and write-back for the 1x2 tile.
 * For LN/RT, AO and BO are first re-pointed relative to AORIG/B using
 * TEMP1 = KK - 1 (LN) or KK - 2 (RT).
 */
3023.LL68:
3024#if defined(LN) || defined(RT)
3025#ifdef LN
3026	sub	KK, 1, TEMP1
3027#else
3028	sub	KK, 2, TEMP1
3029#endif
3030	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3031	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3032
3033	add	AORIG, TEMP2, AO
3034	add	B,     TEMP1, BO
3035#endif
3036
/*
 * Combine the stored values with the accumulated products.  The two values
 * come from BO for LN/LT and from AO otherwise.  Assuming FSUB follows the
 * SPARC fsub "rs1, rs2, rd" operand order, this is c = stored - c.
 */
3037#if defined(LN) || defined(LT)
3038	LDF	[BO +  0 * SIZE], a1
3039	LDF	[BO +  1 * SIZE], a2
3040
3041	FSUB	a1, c01, c01
3042	FSUB	a2, c03, c03
3043#else
3044	LDF	[AO +  0 * SIZE], a1
3045	LDF	[AO +  1 * SIZE], a2
3046
3047	FSUB	a1, c01, c01
3048	FSUB	a2, c03, c03
3049#endif
3050
/*
 * LN/LT: scale both results by AO[0].  A multiply is used rather than a
 * divide -- presumably the packed diagonal entry is stored inverted; TODO confirm.
 */
3051#if defined(LN) || defined(LT)
3052	LDF	[AO +  0 * SIZE], a1
3053
3054	FMUL	a1, c01, c01
3055	FMUL	a1, c03, c03
3056#endif
3057
/* RN: forward pass over the 2x2 block at BO: scale c01 by BO[0], update c03 with BO[1], scale c03 by BO[3]. */
3058#ifdef RN
3059	LDF	[BO +  0 * SIZE], a1
3060	LDF	[BO +  1 * SIZE], a2
3061
3062	FMUL	a1, c01, c01
3063
3064	FNMSUB	(aa2, cc01, cc03, cc03)	/* presumably c03 -= a2 * c01 -- macro not visible here */
3065
3066	LDF	[BO +  3 * SIZE], a1
3067
3068	FMUL	a1, c03, c03
3069#endif
3070
/* RT: same 2x2 block traversed in the opposite order: BO[3], BO[2], then BO[0]. */
3071#ifdef RT
3072	LDF	[BO +  3 * SIZE], a1
3073	LDF	[BO +  2 * SIZE], a2
3074
3075	FMUL	a1, c03, c03
3076
3077	FNMSUB	(aa2, cc03, cc01, cc01)
3078
3079	LDF	[BO +  0 * SIZE], a1
3080
3081	FMUL	a1, c01, c01
3082#endif
3083
3084#ifdef LN
3085	add	C1, -1 * SIZE, C1	/* LN: step the output pointers back one element before storing */
3086	add	C2, -1 * SIZE, C2
3087#endif
3088
/* Write the solved values back into the packed buffer they were read from ... */
3089#if defined(LN) || defined(LT)
3090	STF	c01, [BO +  0 * SIZE]
3091	STF	c03, [BO +  1 * SIZE]
3092#else
3093	STF	c01, [AO +  0 * SIZE]
3094	STF	c03, [AO +  1 * SIZE]
3095#endif
3096
/*
 * ... and into the two output columns.
 * NOTE(review): unlike the two-row tile at .LL52, C1/C2 are not advanced
 * after the store for the non-LN variants, although .LL60 below continues
 * from the same C1/C2.  Presumably this file is only built for LN -- confirm.
 */
3097	STF	c01, [C1 + 0 * SIZE]
3098	STF	c03, [C2 + 0 * SIZE]
3099
3100#ifdef RT
3101	sll	K, BASE_SHIFT + 0, TEMP1
3102	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << BASE_SHIFT */
3103#endif
3104
/* LT/RN: skip the remaining (K - KK) steps in both packed buffers (1 element per step in AO, 2 in BO). */
3105#if defined(LT) || defined(RN)
3106	sub	K, KK, TEMP1
3107	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3108	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3109	add	AO, TEMP2, AO
3110	add	BO, TEMP1, BO
3111#endif
3112
3113#ifdef LT
3114	add	KK, 1, KK
3115#endif
3116
3117#ifdef LN
3118	sub	KK, 1, KK
3119#endif
3120	.align 4
3121
/*
 * .LL60 / .LL52: two-row tiles of the two-column panel.  I = M >> 1 tiles
 * are processed by the loop at .LL52; with I <= 0 control goes straight to
 * .LL69.  Each tile accumulates into cc01..cc04 (cc01/cc02 are stored to
 * C1[0..1], cc03/cc04 to C2[0..1]), is solved according to LN/LT/RN/RT, and
 * is written both to the packed buffer and to C.  .LL69 at the end does the
 * per-panel bookkeeping of B and KK.
 * NOTE(review): LN/LT/RN/RT, KK, AORIG, TEMP1/TEMP2, BASE_SHIFT, SIZE and the
 * LDF/STF/FMUL/FSUB/FMADD/FNMSUB/FCLR macros are defined outside this
 * excerpt; remarks about their meaning are inferred from usage here.
 */
3122.LL60:
3123	sra	M, 1, I
3124	cmp	I, 0
3125	ble,pn	%icc, .LL69
3126	nop
3127	.align 4
3128
/* .LL52: top of the per-tile loop; position AO/BO for the accumulation. */
3129.LL52:
3130#if defined(LT) || defined(RN)
3131	mov	B, BO
3132#else
3133#ifdef LN
3134	sll	K,  BASE_SHIFT + 1, TEMP1
3135	sub	AORIG, TEMP1, AORIG	/* LN: step AORIG back by K << (BASE_SHIFT + 1) */
3136#endif
3137
3138	sll	KK, BASE_SHIFT + 1, TEMP1
3139	sll	KK, BASE_SHIFT + 1, TEMP2
3140
3141	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << (BASE_SHIFT + 1)) */
3142	add	B,     TEMP2, BO	/* BO = B     + (KK << (BASE_SHIFT + 1)) */
3143#endif
3144
/* Preload a1..a4 and b1..b8, interleaved with clearing the accumulators. */
3145	LDF	[AO +  0 * SIZE], a1
3146	LDF	[AO +  1 * SIZE], a2
3147	LDF	[AO +  2 * SIZE], a3
3148	LDF	[AO +  3 * SIZE], a4
3149
3150	LDF	[BO +  0 * SIZE], b1
3151	LDF	[BO +  1 * SIZE], b2
3152	LDF	[BO +  2 * SIZE], b3
3153	FCLR	(cc01)
3154	LDF	[BO +  3 * SIZE], b4
3155	FCLR	(cc02)
3156
3157	LDF	[BO +  4 * SIZE], b5
3158	FCLR	(cc03)
3159	LDF	[BO +  5 * SIZE], b6
3160	FCLR	(cc04)
3161	LDF	[BO +  6 * SIZE], b7
3162	FCLR	(cc05)	/* cc05..cc08 are cleared here but not used by the FMADDs of this tile */
3163	LDF	[BO +  7 * SIZE], b8
3164	FCLR	(cc06)
3165
3166	prefetch [C1 + 2 * SIZE], 3
3167	FCLR	(cc07)
3168	prefetch [C2 + 2 * SIZE], 3
3169	FCLR	(cc08)
3170
/* L = number of 4x-unrolled iterations: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
3171#if defined(LT) || defined(RN)
3172	sra	KK, 2, L
3173#else
3174	sub	K, KK, L
3175	sra	L,  2, L
3176#endif
3177	cmp	L,  0
3178	ble,pn	%icc, .LL55
3179	nop
3180	.align 4
3181
/*
 * .LL53: unrolled accumulation loop.  Each pass issues 16 FMADDs (four steps
 * of a 2x2 update), advances AO and BO by 8*SIZE each, and reloads the
 * operands for the next pass; loads are interleaved with the FMADDs.  The
 * final b8 load sits in the branch delay slot, after BO has been advanced.
 */
3182.LL53:
3183	FMADD	(aa1, bb1, cc01, cc01)
3184	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3185	FMADD	(aa2, bb1, cc02, cc02)
3186	LDF	[BO +  8 * SIZE], b1
3187
3188	FMADD	(aa1, bb2, cc03, cc03)
3189	LDF	[AO +  4 * SIZE], a1
3190	FMADD	(aa2, bb2, cc04, cc04)
3191	LDF	[AO +  5 * SIZE], a2
3192
3193	FMADD	(aa3, bb3, cc01, cc01)
3194	LDF	[BO +  9 * SIZE], b2
3195	FMADD	(aa4, bb3, cc02, cc02)
3196	LDF	[BO + 10 * SIZE], b3
3197
3198	FMADD	(aa3, bb4, cc03, cc03)
3199	LDF	[AO +  6 * SIZE], a3
3200	FMADD	(aa4, bb4, cc04, cc04)
3201	LDF	[AO +  7 * SIZE], a4
3202
3203	FMADD	(aa1, bb5, cc01, cc01)
3204	LDF	[BO + 11 * SIZE], b4
3205	FMADD	(aa2, bb5, cc02, cc02)
3206	LDF	[BO + 12 * SIZE], b5
3207
3208	FMADD	(aa1, bb6, cc03, cc03)
3209	LDF	[AO +  8 * SIZE], a1
3210	FMADD	(aa2, bb6, cc04, cc04)
3211	LDF	[AO +  9 * SIZE], a2
3212
3213	FMADD	(aa3, bb7, cc01, cc01)
3214	LDF	[BO + 13 * SIZE], b6
3215
3216	FMADD	(aa4, bb7, cc02, cc02)
3217	LDF	[BO + 14 * SIZE], b7
3218
3219	FMADD	(aa3, bb8, cc03, cc03)
3220	LDF	[AO + 10 * SIZE], a3
3221	FMADD	(aa4, bb8, cc04, cc04)
3222	LDF	[AO + 11 * SIZE], a4
3223
3224	add	AO,  8 * SIZE, AO
3225	add	L, -1, L
3226	add	BO,  8 * SIZE, BO
3227	cmp	L, 0
3228
3229	bg,pt	%icc, .LL53
3230	LDF	[BO +  7 * SIZE], b8	/* delay slot */
3231	.align 4
3232
/* .LL55: remainder count L = (KK or K - KK) & 3; skip the tail loop when it is zero. */
3233.LL55:
3234#if defined(LT) || defined(RN)
3235	and	KK, 3, L
3236#else
3237	sub	K, KK, L
3238	and	L,  3, L
3239#endif
3240	cmp	L,  0
3241	ble,a,pn %icc, .LL58
3242	nop
3243	.align 4
3244
/* .LL57: tail loop, one 2x2 step per pass (AO += 2*SIZE, BO += 2*SIZE; b2 is reloaded in the delay slot). */
3245.LL57:
3246	FMADD	(aa1, bb1, cc01, cc01)
3247	add	L, -1, L
3248	FMADD	(aa2, bb1, cc02, cc02)
3249	LDF	[BO + 2 * SIZE], b1
3250
3251	FMADD	(aa1, bb2, cc03, cc03)
3252	LDF	[AO + 2 * SIZE], a1
3253	FMADD	(aa2, bb2, cc04, cc04)
3254	LDF	[AO + 3 * SIZE], a2
3255
3256	add	AO, 2 * SIZE, AO
3257	cmp	L, 0
3258	add	BO, 2 * SIZE, BO
3259	bg,pt	%icc, .LL57
3260	LDF	[BO + 1 * SIZE], b2
3261	.align 4
3262
/*
 * .LL58: solve and write-back for the 2x2 tile.
 * For LN/RT, AO and BO are re-pointed relative to AORIG/B using
 * TEMP1 = KK - 2 (both branches of the inner #ifdef are identical here).
 */
3263.LL58:
3264#if defined(LN) || defined(RT)
3265#ifdef LN
3266	sub	KK, 2, TEMP1
3267#else
3268	sub	KK, 2, TEMP1
3269#endif
3270	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3271	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3272
3273	add	AORIG, TEMP2, AO
3274	add	B,     TEMP1, BO
3275#endif
3276
/*
 * Combine the stored values with the accumulated products.  The order in
 * the BO buffer (LN/LT) is c01, c03, c02, c04; in the AO buffer (RN/RT) it
 * is c01, c02, c03, c04.  Assuming FSUB follows the SPARC fsub
 * "rs1, rs2, rd" operand order, each line is c = stored - c.
 */
3277#if defined(LN) || defined(LT)
3278	LDF	[BO +  0 * SIZE], a1
3279	LDF	[BO +  1 * SIZE], a2
3280	LDF	[BO +  2 * SIZE], a3
3281	LDF	[BO +  3 * SIZE], a4
3282
3283	FSUB	a1, c01, c01
3284	FSUB	a2, c03, c03
3285	FSUB	a3, c02, c02
3286	FSUB	a4, c04, c04
3287#else
3288	LDF	[AO +  0 * SIZE], a1
3289	LDF	[AO +  1 * SIZE], a2
3290	LDF	[AO +  2 * SIZE], a3
3291	LDF	[AO +  3 * SIZE], a4
3292
3293	FSUB	a1, c01, c01
3294	FSUB	a2, c02, c02
3295	FSUB	a3, c03, c03
3296	FSUB	a4, c04, c04
3297#endif
3298
/*
 * LN: backward pass over the 2x2 block at AO: scale row 2 (c02, c04) by
 * AO[3], update row 1 (c01, c03) with AO[2], then scale row 1 by AO[0].
 * Multiplies are used for the diagonal entries -- presumably they are stored
 * inverted; TODO confirm.
 */
3299#ifdef LN
3300	LDF	[AO +  3 * SIZE], a1
3301	LDF	[AO +  2 * SIZE], a2
3302	LDF	[AO +  0 * SIZE], a3
3303
3304	FMUL	a1, c02, c02
3305	FMUL	a1, c04, c04
3306
3307	FNMSUB	(aa2, cc02, cc01, cc01)	/* presumably c01 -= a2 * c02 -- macro not visible here */
3308	FNMSUB	(aa2, cc04, cc03, cc03)
3309
3310	FMUL	a3, c01, c01
3311	FMUL	a3, c03, c03
3312#endif
3313
/* LT: forward pass over the same block: AO[0] scales row 1, AO[1] updates row 2, AO[3] scales row 2. */
3314#ifdef LT
3315	LDF	[AO +  0 * SIZE], a1
3316	LDF	[AO +  1 * SIZE], a2
3317	LDF	[AO +  3 * SIZE], a3
3318
3319	FMUL	a1, c01, c01
3320	FMUL	a1, c03, c03
3321
3322	FNMSUB	(aa2, cc01, cc02, cc02)
3323	FNMSUB	(aa2, cc03, cc04, cc04)
3324
3325	FMUL	a3, c02, c02
3326	FMUL	a3, c04, c04
3327#endif
3328
/* RN: forward pass over the 2x2 block at BO: BO[0] scales column 1 (c01, c02), BO[1] updates column 2, BO[3] scales column 2 (c03, c04). */
3329#ifdef RN
3330	LDF	[BO +  0 * SIZE], a1
3331	LDF	[BO +  1 * SIZE], a2
3332
3333	FMUL	a1, c01, c01
3334	FMUL	a1, c02, c02
3335
3336	FNMSUB	(aa2, cc01, cc03, cc03)
3337	FNMSUB	(aa2, cc02, cc04, cc04)
3338
3339	LDF	[BO +  3 * SIZE], a1
3340
3341	FMUL	a1, c03, c03
3342	FMUL	a1, c04, c04
3343#endif
3344
/* RT: backward pass over the same block: BO[3] scales column 2, BO[2] updates column 1, BO[0] scales column 1. */
3345#ifdef RT
3346	LDF	[BO +  3 * SIZE], a1
3347	LDF	[BO +  2 * SIZE], a2
3348
3349	FMUL	a1, c04, c04
3350	FMUL	a1, c03, c03
3351
3352	FNMSUB	(aa2, cc04, cc02, cc02)
3353	FNMSUB	(aa2, cc03, cc01, cc01)
3354
3355	LDF	[BO +  0 * SIZE], a1
3356
3357	FMUL	a1, c02, c02
3358	FMUL	a1, c01, c01
3359#endif
3360
3361#ifdef LN
3362	add	C1, -2 * SIZE, C1	/* LN: step the output pointers back two elements before storing */
3363	add	C2, -2 * SIZE, C2
3364#endif
3365
/* Write the solved values back into the packed buffer, in the same order they were read from it above. */
3366#if defined(LN) || defined(LT)
3367	STF	c01, [BO +  0 * SIZE]
3368	STF	c03, [BO +  1 * SIZE]
3369	STF	c02, [BO +  2 * SIZE]
3370	STF	c04, [BO +  3 * SIZE]
3371#else
3372	STF	c01, [AO +  0 * SIZE]
3373	STF	c02, [AO +  1 * SIZE]
3374	STF	c03, [AO +  2 * SIZE]
3375	STF	c04, [AO +  3 * SIZE]
3376#endif
3377
/* Store the tile to C: c01/c02 to column C1, c03/c04 to column C2. */
3378	STF	c01, [C1 + 0 * SIZE]
3379	STF	c02, [C1 + 1 * SIZE]
3380	STF	c03, [C2 + 0 * SIZE]
3381	STF	c04, [C2 + 1 * SIZE]
3382
3383#ifndef LN
3384	add	C1, 2 * SIZE, C1	/* non-LN: advance the output pointers past the tile */
3385	add	C2, 2 * SIZE, C2
3386#endif
3387
3388#ifdef RT
3389	sll	K, BASE_SHIFT + 1, TEMP1
3390	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << (BASE_SHIFT + 1) */
3391#endif
3392
/* LT/RN: skip the remaining (K - KK) steps in both packed buffers (2 elements per step in each). */
3393#if defined(LT) || defined(RN)
3394	sub	K, KK, TEMP1
3395	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3396	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3397	add	AO, TEMP2, AO
3398	add	BO, TEMP1, BO
3399#endif
3400
3401#ifdef LT
3402	add	KK, 2, KK
3403#endif
3404
3405#ifdef LN
3406	sub	KK, 2, KK
3407#endif
3408
3409	add	I, -1, I
3410	cmp	I, 0
3411	bg,pt	%icc, .LL52	/* next two-row tile */
3412	nop
3413	.align 4
3414
/* .LL69: per-variant bookkeeping of B and KK after the two-column panel. */
3415.LL69:
3416#ifdef LN
3417	sll	K, BASE_SHIFT + 1, TEMP1
3418	add	B, TEMP1, B	/* LN: B += K << (BASE_SHIFT + 1) */
3419#endif
3420
3421#if defined(LT) || defined(RN)
3422	mov	BO, B	/* LT/RN: B continues from where BO stopped */
3423#endif
3424
3425#ifdef RN
3426	add	KK, 2, KK
3427#endif
3428
3429#ifdef RT
3430	sub	KK, 2, KK
3431#endif
3432	.align 4
3433
/*
 * .LL70: single-column panel, executed only when bit 0 of N is set
 * (otherwise control goes to the exit at .LL999).  This block
 *   - sets up the column pointer C1 and moves C by one column
 *     (C + LDC unless RT, C - LDC for RT),
 *   - initialises KK (LN: M + OFFSET, LT: OFFSET) and AORIG or AO from A,
 *   - and, when M is odd, processes one 1x1 tile using accumulator cc01.
 * NOTE(review): LN/LT/RN/RT, KK, OFFSET, AORIG, TEMP1/TEMP2, BASE_SHIFT, SIZE
 * and the LDF/STF/FMUL/FSUB/FMADD/FCLR macros are defined outside this
 * excerpt; remarks about their meaning are inferred from usage here.
 */
3434.LL70:
3435	and	N, 1, J
3436	cmp	J, 0
3437	ble,pn	%icc, .LL999
3438	nop
3439
3440#ifdef RT
3441	sll	K, BASE_SHIFT, TEMP1
3442	sub	B, TEMP1, B	/* RT: move B back by K << BASE_SHIFT */
3443#endif
3444
3445#ifndef RT
3446	mov	C,  C1	/* C1 = C */
3447	add	C1, LDC, C	/* C  = C + LDC */
3448#else
3449	sub	C,  LDC, C1	/* C1 = C - LDC */
3450	sub	C,  LDC, C	/* C  = C - LDC */
3451#endif
3452
3453#ifdef LN
3454	add	M, OFFSET, KK
3455#endif
3456
3457#ifdef LT
3458	mov	OFFSET, KK
3459#endif
3460
3461#if defined(LN) || defined(RT)
3462	mov	A, AORIG
3463#else
3464	mov	A, AO
3465#endif
3466
3467	and	M, 1, I
3468	cmp	I, 0
3469	ble,pn	%icc, .LL80	/* M even: no single-row tile */
3470	nop
3471
/* ---- 1x1 tile: position AO/BO for the accumulation ---- */
3472#if defined(LT) || defined(RN)
3473	mov	B, BO
3474#else
3475#ifdef LN
3476	sll	K,  BASE_SHIFT + 0, TEMP1
3477	sub	AORIG, TEMP1, AORIG	/* LN: step AORIG back by K << BASE_SHIFT */
3478#endif
3479
3480	sll	KK, BASE_SHIFT + 0, TEMP1
3481	sll	KK, BASE_SHIFT + 0, TEMP2
3482
3483	add	AORIG, TEMP1, AO
3484	add	B,     TEMP2, BO
3485#endif
3486
/* Preload four values from each of AO and BO for the first unrolled pass. */
3487	LDF	[AO +  0 * SIZE], a1
3488	LDF	[BO +  0 * SIZE], b1
3489	LDF	[AO +  1 * SIZE], a2
3490	LDF	[BO +  1 * SIZE], b2
3491	LDF	[AO +  2 * SIZE], a3
3492	LDF	[BO +  2 * SIZE], b3
3493	LDF	[AO +  3 * SIZE], a4
3494	LDF	[BO +  3 * SIZE], b4
3495
/* L = number of 4x-unrolled iterations: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
3496#if defined(LT) || defined(RN)
3497	sra	KK, 2, L
3498#else
3499	sub	K, KK, L
3500	sra	L,  2, L
3501#endif
3502	cmp	L,  0
3503	ble,pn	%icc, .LL85
3504	FCLR	(cc01)	/* delay slot without annul: the accumulator is cleared on both paths */
3505	.align 4
3506
/* .LL83: unrolled loop, 4 FMADDs into cc01 per pass; AO and BO each advance by 4*SIZE (BO in the delay slot). */
3507.LL83:
3508	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3509	add	L, -1, L
3510
3511	FMADD	(aa1, bb1, cc01, cc01)
3512	LDF	[AO +  4 * SIZE], a1
3513	LDF	[BO +  4 * SIZE], b1
3514
3515	FMADD	(aa2, bb2, cc01, cc01)
3516	LDF	[AO +  5 * SIZE], a2
3517	LDF	[BO +  5 * SIZE], b2
3518
3519	FMADD	(aa3, bb3, cc01, cc01)
3520	LDF	[AO +  6 * SIZE], a3
3521	LDF	[BO +  6 * SIZE], b3
3522
3523	FMADD	(aa4, bb4, cc01, cc01)
3524	LDF	[AO +  7 * SIZE], a4
3525	LDF	[BO +  7 * SIZE], b4
3526
3527	add	AO,  4 * SIZE, AO
3528	cmp	L, 0
3529
3530	bg,pt	%icc, .LL83
3531	add	BO,  4 * SIZE, BO
3532	.align 4
3533
/* .LL85: remainder count L = (KK or K - KK) & 3; skip the tail loop when it is zero. */
3534.LL85:
3535#if defined(LT) || defined(RN)
3536	and	KK, 3, L
3537#else
3538	sub	K, KK, L
3539	and	L,  3, L
3540#endif
3541	cmp	L,  0
3542	ble,a,pn %icc, .LL88
3543	nop
3544	.align 4
3545
/* .LL87: tail loop, one FMADD per pass (AO += 1*SIZE, BO += 1*SIZE). */
3546.LL87:
3547	FMADD	(aa1, bb1, cc01, cc01)
3548	LDF	[AO + 1 * SIZE], a1
3549	LDF	[BO + 1 * SIZE], b1
3550
3551	add	AO, 1 * SIZE, AO
3552	add	L, -1, L
3553	cmp	L, 0
3554	bg,pt	%icc, .LL87
3555	add	BO, 1 * SIZE, BO
3556	.align 4
3557
/*
 * .LL88: solve and write-back for the 1x1 tile.
 * For LN/RT, AO and BO are re-pointed relative to AORIG/B using
 * TEMP1 = KK - 1 (both branches of the inner #ifdef are identical here).
 */
3558.LL88:
3559#if defined(LN) || defined(RT)
3560#ifdef LN
3561	sub	KK, 1, TEMP1
3562#else
3563	sub	KK, 1, TEMP1
3564#endif
3565	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3566	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3567
3568	add	AORIG, TEMP2, AO
3569	add	B,     TEMP1, BO
3570#endif
3571
/* Combine the stored value (from BO for LN/LT, from AO otherwise) with the accumulated product; assuming SPARC fsub operand order this is c01 = stored - c01. */
3572#if defined(LN) || defined(LT)
3573	LDF	[BO +  0 * SIZE], a1
3574
3575	FSUB	a1, c01, c01
3576#else
3577	LDF	[AO +  0 * SIZE], a1
3578
3579	FSUB	a1, c01, c01
3580#endif
3581
/* Scale by the single entry of the other buffer: AO[0] for LN/LT, BO[0] for RN/RT.  A multiply is used -- presumably the entry is stored inverted; TODO confirm. */
3582#if defined(LN) || defined(LT)
3583	LDF	[AO +  0 * SIZE], a1
3584
3585	FMUL	a1, c01, c01
3586#endif
3587
3588#if defined(RN) || defined(RT)
3589	LDF	[BO +  0 * SIZE], a1
3590
3591	FMUL	a1, c01, c01
3592#endif
3593
3594#ifdef LN
3595	add	C1, -1 * SIZE, C1	/* LN: step the output pointer back one element before storing */
3596#endif
3597
/* Write the result back to the packed buffer it was read from, then to C. */
3598#if defined(LN) || defined(LT)
3599	STF	c01, [BO +  0 * SIZE]
3600#else
3601	STF	c01, [AO +  0 * SIZE]
3602#endif
3603
/* NOTE(review): C1 is not advanced after this store for the non-LN variants, although .LL80 below continues from the same C1 -- presumably this file is only built for LN; confirm. */
3604	STF	c01, [C1 + 0 * SIZE]
3605
3606#ifdef RT
3607	sll	K, BASE_SHIFT + 0, TEMP1
3608	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << BASE_SHIFT */
3609#endif
3610
/* LT/RN: skip the remaining (K - KK) steps in both packed buffers (1 element per step in each). */
3611#if defined(LT) || defined(RN)
3612	sub	K, KK, TEMP1
3613	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3614	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3615	add	AO, TEMP2, AO
3616	add	BO, TEMP1, BO
3617#endif
3618
3619#ifdef LT
3620	add	KK, 1, KK
3621#endif
3622
3623#ifdef LN
3624	sub	KK, 1, KK
3625#endif
3626	.align 4
3627
/*
 * .LL80 / .LL72: two-row tiles of the single-column panel.  I = M >> 1 tiles
 * are processed by the loop at .LL72; with I <= 0 control goes straight to
 * .LL89.  Each tile accumulates into cc01 and cc02 (stored to C1[0] and
 * C1[1]), is solved according to LN/LT/RN/RT, and is written both to the
 * packed buffer and to C.  .LL89 at the end does the per-panel bookkeeping
 * of B and KK before falling into the exit at .LL999.
 * NOTE(review): LN/LT/RN/RT, KK, AORIG, TEMP1/TEMP2, BASE_SHIFT, SIZE and the
 * LDF/STF/FMUL/FSUB/FMADD/FNMSUB/FCLR macros are defined outside this
 * excerpt; remarks about their meaning are inferred from usage here.
 */
3628.LL80:
3629	sra	M, 1, I
3630	cmp	I, 0
3631	ble,pn	%icc, .LL89
3632	nop
3633	.align 4
3634
/* .LL72: top of the per-tile loop; position AO/BO for the accumulation. */
3635.LL72:
3636#if defined(LT) || defined(RN)
3637	mov	B, BO
3638#else
3639#ifdef LN
3640	sll	K,  BASE_SHIFT + 1, TEMP1
3641	sub	AORIG, TEMP1, AORIG	/* LN: step AORIG back by K << (BASE_SHIFT + 1) */
3642#endif
3643
3644	sll	KK, BASE_SHIFT + 1, TEMP1
3645	sll	KK, BASE_SHIFT + 0, TEMP2
3646
3647	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << (BASE_SHIFT + 1)) */
3648	add	B,     TEMP2, BO	/* BO = B     + (KK << BASE_SHIFT) */
3649#endif
3650
/* Preload a1..a4 and b1..b4, interleaved with clearing the two accumulators. */
3651	LDF	[AO +  0 * SIZE], a1
3652	LDF	[AO +  1 * SIZE], a2
3653	LDF	[AO +  2 * SIZE], a3
3654	LDF	[AO +  3 * SIZE], a4
3655
3656	LDF	[BO +  0 * SIZE], b1
3657	LDF	[BO +  1 * SIZE], b2
3658	LDF	[BO +  2 * SIZE], b3
3659	FCLR	(cc01)
3660	LDF	[BO +  3 * SIZE], b4
3661	FCLR	(cc02)
3662
3663	prefetch [C1 + 2 * SIZE], 3
3664
/* L = number of 4x-unrolled iterations: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
3665#if defined(LT) || defined(RN)
3666	sra	KK, 2, L
3667#else
3668	sub	K, KK, L
3669	sra	L,  2, L
3670#endif
3671	cmp	L,  0
3672	ble,pn	%icc, .LL75
3673	nop
3674
/*
 * .LL73: unrolled accumulation loop.  Each pass issues 8 FMADDs (four steps
 * of a 2x1 update), advances AO by 8*SIZE and BO by 4*SIZE, and reloads the
 * operands for the next pass.  The pointer bumps happen in the middle of the
 * pass, which is why the later loads use smaller offsets.  The loop
 * condition comes from the `cmp L, 0` early in the pass; b4 is reloaded in
 * the branch delay slot.
 */
3675.LL73:
3676	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3677	add	L, -1, L
3678
3679	FMADD	(aa1, bb1, cc01, cc01)
3680	LDF	[AO +  4 * SIZE], a1
3681	FMADD	(aa2, bb1, cc02, cc02)
3682	LDF	[AO +  5 * SIZE], a2
3683
3684	LDF	[BO +  4 * SIZE], b1
3685	cmp	L, 0
3686
3687	FMADD	(aa3, bb2, cc01, cc01)
3688	LDF	[AO +  6 * SIZE], a3
3689	FMADD	(aa4, bb2, cc02, cc02)
3690	LDF	[AO +  7 * SIZE], a4
3691
3692	LDF	[BO +  5 * SIZE], b2
3693	add	BO,  4 * SIZE, BO
3694
3695	FMADD	(aa1, bb3, cc01, cc01)
3696	LDF	[AO +  8 * SIZE], a1
3697	FMADD	(aa2, bb3, cc02, cc02)
3698	LDF	[AO +  9 * SIZE], a2
3699
3700	LDF	[BO +  2 * SIZE], b3
3701	add	AO,  8 * SIZE, AO
3702
3703	FMADD	(aa3, bb4, cc01, cc01)
3704	LDF	[AO +  2 * SIZE], a3
3705	FMADD	(aa4, bb4, cc02, cc02)
3706	LDF	[AO +  3 * SIZE], a4
3707
3708	bg,pt	%icc, .LL73
3709	LDF	[BO +  3 * SIZE], b4	/* delay slot */
3710	.align 4
3711
/* .LL75: remainder count L = (KK or K - KK) & 3; skip the tail loop when it is zero. */
3712.LL75:
3713#if defined(LT) || defined(RN)
3714	and	KK, 3, L
3715#else
3716	sub	K, KK, L
3717	and	L,  3, L
3718#endif
3719	cmp	L,  0
3720	ble,a,pn %icc, .LL78
3721	nop
3722	.align 4
3723
/* .LL77: tail loop, one 2x1 step per pass (AO += 2*SIZE, BO += 1*SIZE; the BO bump is in the delay slot). */
3724.LL77:
3725	FMADD	(aa1, bb1, cc01, cc01)
3726	LDF	[AO + 2 * SIZE], a1
3727	FMADD	(aa2, bb1, cc02, cc02)
3728	LDF	[AO + 3 * SIZE], a2
3729
3730	LDF	[BO + 1 * SIZE], b1
3731	add	L, -1, L
3732	add	AO, 2 * SIZE, AO
3733	cmp	L, 0
3734	bg,pt	%icc, .LL77
3735	add	BO, 1 * SIZE, BO
3736	.align 4
3737
/*
 * .LL78: solve and write-back for the 2x1 tile.
 * For LN/RT, AO and BO are re-pointed relative to AORIG/B using
 * TEMP1 = KK - 2 (LN) or KK - 1 (RT).
 */
3738.LL78:
3739#if defined(LN) || defined(RT)
3740#ifdef LN
3741	sub	KK, 2, TEMP1
3742#else
3743	sub	KK, 1, TEMP1
3744#endif
3745	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3746	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3747
3748	add	AORIG, TEMP2, AO
3749	add	B,     TEMP1, BO
3750#endif
3751
/* Combine the stored values (from BO for LN/LT, from AO otherwise) with the accumulated products; assuming SPARC fsub operand order each line is c = stored - c. */
3752#if defined(LN) || defined(LT)
3753	LDF	[BO +  0 * SIZE], a1
3754	LDF	[BO +  1 * SIZE], a2
3755
3756	FSUB	a1, c01, c01
3757	FSUB	a2, c02, c02
3758#else
3759	LDF	[AO +  0 * SIZE], a1
3760	LDF	[AO +  1 * SIZE], a2
3761
3762	FSUB	a1, c01, c01
3763	FSUB	a2, c02, c02
3764#endif
3765
/*
 * LN: backward pass over the 2x2 block at AO: scale c02 by AO[3], update c01
 * with AO[2], scale c01 by AO[0].  Multiplies are used for the diagonal
 * entries -- presumably they are stored inverted; TODO confirm.
 */
3766#ifdef LN
3767	LDF	[AO +  3 * SIZE], a1
3768	LDF	[AO +  2 * SIZE], a2
3769	LDF	[AO +  0 * SIZE], a3
3770
3771	FMUL	a1, c02, c02
3772
3773	FNMSUB	(aa2, cc02, cc01, cc01)	/* presumably c01 -= a2 * c02 -- macro not visible here */
3774
3775	FMUL	a3, c01, c01
3776#endif
3777
/* LT: forward pass over the same block: AO[0] scales c01, AO[1] updates c02, AO[3] scales c02. */
3778#ifdef LT
3779	LDF	[AO +  0 * SIZE], a1
3780	LDF	[AO +  1 * SIZE], a2
3781	LDF	[AO +  3 * SIZE], a3
3782
3783	FMUL	a1, c01, c01
3784
3785	FNMSUB	(aa2, cc01, cc02, cc02)
3786
3787	FMUL	a3, c02, c02
3788#endif
3789
/* RN/RT: a single entry BO[0] scales both results. */
3790#if defined(RN) || defined(RT)
3791	LDF	[BO +  0 * SIZE], a1
3792
3793	FMUL	a1, c01, c01
3794	FMUL	a1, c02, c02
3795#endif
3796
3797#ifdef LN
3798	add	C1, -2 * SIZE, C1	/* LN: step the output pointer back two elements before storing */
3799#endif
3800
/* Write the solved values back to the packed buffer they were read from, then to C. */
3801#if defined(LN) || defined(LT)
3802	STF	c01, [BO +  0 * SIZE]
3803	STF	c02, [BO +  1 * SIZE]
3804#else
3805	STF	c01, [AO +  0 * SIZE]
3806	STF	c02, [AO +  1 * SIZE]
3807#endif
3808
3809	STF	c01, [C1 + 0 * SIZE]
3810	STF	c02, [C1 + 1 * SIZE]
3811
3812#ifndef LN
3813	add	C1, 2 * SIZE, C1	/* non-LN: advance the output pointer past the tile */
3814#endif
3815
3816#ifdef RT
3817	sll	K, BASE_SHIFT + 1, TEMP1
3818	add	AORIG, TEMP1, AORIG	/* RT: AORIG += K << (BASE_SHIFT + 1) */
3819#endif
3820
/* LT/RN: skip the remaining (K - KK) steps in both packed buffers (2 elements per step in AO, 1 in BO). */
3821#if defined(LT) || defined(RN)
3822	sub	K, KK, TEMP1
3823	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3824	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3825	add	AO, TEMP2, AO
3826	add	BO, TEMP1, BO
3827#endif
3828
3829#ifdef LT
3830	add	KK, 2, KK
3831#endif
3832
3833#ifdef LN
3834	sub	KK, 2, KK
3835#endif
3836
3837	add	I, -1, I
3838	cmp	I, 0
3839	bg,pt	%icc, .LL72	/* next two-row tile */
3840	nop
3841	.align 4
3842
/* .LL89: per-variant bookkeeping of B and KK after the single-column panel. */
3843.LL89:
3844#ifdef LN
3845	sll	K, BASE_SHIFT, TEMP1
3846	add	B, TEMP1, B	/* LN: B += K << BASE_SHIFT */
3847#endif
3848
3849#if defined(LT) || defined(RN)
3850	mov	BO, B	/* LT/RN: B continues from where BO stopped */
3851#endif
3852
3853#ifdef RN
3854	add	KK, 1, KK
3855#endif
3856
3857#ifdef RT
3858	sub	KK, 1, KK
3859#endif
3860	.align 4
3861
/*
 * .LL999: common exit of the routine.
 * When TRMMKERNEL is defined, %g1..%g4 are reloaded from four stack slots
 * relative to %sp + STACK_START (32-bit loads at offsets 8..20 without
 * __64BIT__, 64-bit loads at offsets 32..56 with it) -- presumably the
 * values were saved there by the prologue, which is outside this excerpt.
 * The routine then returns with `return %i7 + 8`; the `clr %o0` in its delay
 * slot runs after the register window has been restored, so it clears the
 * caller's %o0, i.e. the routine returns 0.
 */
3862.LL999:
3863#ifdef TRMMKERNEL
3864#ifndef __64BIT__
3865	ld	[%sp + STACK_START +  8], %g1
3866	ld	[%sp + STACK_START + 12], %g2
3867	ld	[%sp + STACK_START + 16], %g3
3868	ld	[%sp + STACK_START + 20], %g4
3869#else
3870	ldx	[%sp + STACK_START + 32], %g1
3871	ldx	[%sp + STACK_START + 40], %g2
3872	ldx	[%sp + STACK_START + 48], %g3
3873	ldx	[%sp + STACK_START + 56], %g4
3874#endif
3875#endif
3876
3877	return	%i7 + 8
3878	clr	%o0	/* delay slot: return value 0 */
3879
3880	EPILOGUE	/* macro defined outside this excerpt */
3881