1/*********************************************************************/
2/* Copyright 2005-2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Software-prefetch tuning for the A panel.  The unrolled inner loop issues
 *   prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY
 * so APREFETCHSIZE is a look-ahead distance counted in elements and
 * APREFETCH_CATEGORY is the value passed as the prefetch function operand. */
#define APREFETCHSIZE 24
#define APREFETCH_CATEGORY 0

/* Incoming integer arguments.  The code below walks N in groups of 8
 * (sra N, 3) and M in groups of 2 (sra M, 1); K scales the strides used to
 * step through the A and B panels. */
#define M	%i0
#define N	%i1
#define K	%i2

/* Base pointers of the A and B panels.  For a 32-bit DOUBLE build the two
 * registers are swapped and the prologue loads B from the stack rather than
 * receiving it in a register.
 * NOTE(review): presumably because a double-precision scalar argument takes
 * two integer argument registers in the 32-bit ABI -- confirm against the
 * caller's prototype. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A	%i5
#define B	%i4
#else
#define A	%i4
#define B	%i5
#endif

/* C base pointer and its leading dimension; both are loaded from the stack in
 * the prologue, and LDC is then scaled by the element size (sll LDC, BASE_SHIFT). */
#define C	%o4
#define LDC	%o5

/* AO/BO: running pointers into the current A and B panels.
 * I: counter derived from M >> 1,  J: counter derived from N >> 3,
 * L: trip count of the inner k-loops. */
#define AO	%l0
#define BO	%l1
#define I	%l2
#define J	%l3
#define L	%l4

/* C1..C8: addresses of eight consecutive columns of C, LDC apart. */
#define C1	%o0
#define C2	%o1
#define C3	%o2
#define C4	%o3

#define C5	%l5
#define	C6	%l6
#define C7	%l7
#define C8	%i3

/* OFFSET: argument loaded from the stack in the prologue.
 * KK:     position counter derived from OFFSET (and M or N, depending on the
 *         LN/LT/RN/RT variant being built).
 * TEMP1/TEMP2: scratch for address arithmetic.
 * AORIG:  copy of A kept for the LN and RT variants, from which AO is rebuilt.
 * These live in %g1-%g4 and %o7; the prologue stores %g1-%g4 to the stack. */
#define OFFSET	%g1
#define	KK	%g2
#define TEMP1	%g3
#define TEMP2	%g4
#define AORIG	%o7
81
/* Floating-point register allocation.
 *
 *   c01..c16  sixteen accumulators (cleared with FCLR, updated with FMADD)
 *   a1..a5    operands loaded from the A panel
 *   b1..b9    operands loaded from the B panel
 *
 * Each register has two names: the assembler name (c01, a1, b1, ...) used by
 * ordinary instructions such as LDF/FSUB/FMUL, and a bare number (cc01, aa1,
 * bb1, ...) that is handed to the FMADD/FNMSUB/FCLR macros.
 * NOTE(review): the numbers look like the 5-bit register-field encodings the
 * macros splice into hand-assembled opcodes -- the macros are defined outside
 * this file, confirm there before changing either table.  The two tables must
 * be kept in step: ccNN/aaN/bbN has to denote the same register as cNN/aN/bN. */
#ifdef DOUBLE
/* Double precision: even-numbered registers %f0-%f58. */
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define a1	%f32
#define a2	%f34
#define a3	%f36
#define a4	%f38
#define a5	%f40

#define b1	%f42
#define b2	%f44
#define b3	%f46
#define b4	%f48
#define b5	%f50
#define b6	%f52
#define b7	%f54
#define b8	%f56
#define b9	%f58

/* Numeric codes for %f0-%f30: identical to the register number. */
#define cc01	0
#define cc02	2
#define cc03	4
#define cc04	6
#define cc05	8
#define cc06	10
#define cc07	12
#define cc08	14
#define cc09	16
#define cc10	18
#define cc11	20
#define cc12	22
#define cc13	24
#define cc14	26
#define cc15	28
#define cc16	30

/* Numeric codes for %f32-%f58: odd values 1..27, i.e. (register number - 31).
 * NOTE(review): consistent with the SPARC V9 rule that bit 5 of a double
 * register number is stored in bit 0 of the 5-bit field -- confirm. */
#define aa1	 1
#define aa2	 3
#define aa3	 5
#define aa4	 7
#define aa5	 9

#define bb1	11
#define bb2	13
#define bb3	15
#define bb4	17
#define bb5	19
#define bb6	21
#define bb7	23
#define bb8	25
#define bb9	27

#else
/* Single precision: consecutive registers %f0-%f29. */
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define a1	%f16
#define a2	%f17
#define a3	%f18
#define a4	%f19
#define a5	%f20

#define b1	%f21
#define b2	%f22
#define b3	%f23
#define b4	%f24
#define b5	%f25
#define b6	%f26
#define b7	%f27
#define b8	%f28
#define b9	%f29

/* Numeric codes: identical to the register number for every register here. */
#define cc01	0
#define cc02	1
#define cc03	2
#define cc04	3
#define cc05	4
#define cc06	5
#define cc07	6
#define cc08	7
#define cc09	8
#define cc10	9
#define cc11	10
#define cc12	11
#define cc13	12
#define cc14	13
#define cc15	14
#define cc16	15

#define aa1	16
#define aa2	17
#define aa3	18
#define aa4	19
#define aa5	20

#define bb1	21
#define bb2	22
#define bb3	23
#define bb4	24
#define bb5	25
#define bb6	26
#define bb7	27
#define bb8	28
#define bb9	29

#endif
217
        /* Tell the assembler that this file uses %g2 and %g3 as scratch
         * registers (they back KK and TEMP1). */
        .register %g2, #scratch
        .register %g3, #scratch

	/* Routine entry.  PROLOGUE and SAVESP are macros from common.h. */
	PROLOGUE
	SAVESP
	nop

	/* Fetch the arguments that arrive on the stack, then store %g1-%g4 to
	 * stack slots.
	 * NOTE(review): OFFSET is an alias for %g1 and has already been loaded
	 * when %g1 is stored, so that slot receives OFFSET, not the value %g1
	 * had on entry -- check what the epilogue (outside this view) expects. */
#ifndef __64BIT__

#ifdef DOUBLE
	/* 32-bit, double precision: B is passed on the stack as well. */
	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
	ld	[%sp + STACK_START + 40], OFFSET
#else
	/* 32-bit, single precision. */
	ld	[%sp + STACK_START + 28], C
	ld	[%sp + STACK_START + 32], LDC
	ld	[%sp + STACK_START + 36], OFFSET
#endif
	st	%g1, [%sp + STACK_START +  8]
	st	%g2, [%sp + STACK_START + 12]
	st	%g3, [%sp + STACK_START + 16]
	st	%g4, [%sp + STACK_START + 20]
#else

	/* 64-bit: 8-byte argument slots. */
	ldx	[%sp+  STACK_START + 56], C
	ldx	[%sp+  STACK_START + 64], LDC
	ldx	[%sp+  STACK_START + 72], OFFSET

	/* NOTE(review): the %g4 slot (+56) is the same offset C was read from
	 * above; harmless here because C is already in a register. */
	stx	%g1, [%sp + STACK_START + 32]
	stx	%g2, [%sp + STACK_START + 40]
	stx	%g3, [%sp + STACK_START + 48]
	stx	%g4, [%sp + STACK_START + 56]
#endif
252
/* One-time setup before the column-panel loop.  Which parts are assembled
 * depends on the variant selected by LN / LT / RN / RT. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	OFFSET, KK
#endif

	/* Scale LDC by the element size (1 << BASE_SHIFT) so it can be added
	 * directly to column addresses. */
	sll	LDC, BASE_SHIFT, LDC

#ifdef LN
	/* LN starts from the far end: A += M * K elements, C += M elements. */
	smul	M, K, TEMP1
	sll	TEMP1, BASE_SHIFT, TEMP1
	add	A, TEMP1, A

	sll	M, BASE_SHIFT, TEMP1
	add	C, TEMP1, C
#endif

#ifdef RN
	/* RN: KK = -OFFSET. */
	neg	OFFSET, KK
#endif

#ifdef RT
	/* RT starts from the far end: B += N * K elements, C += N columns
	 * (LDC is already scaled, so no further shift), KK = N - OFFSET. */
	smul	N, K, TEMP1
	sll	TEMP1, BASE_SHIFT, TEMP1
	add	B, TEMP1, B

	smul	N, LDC, TEMP1
	add	C, TEMP1, C

	sub	N, OFFSET, KK
#endif

	/* J = N / 8 = number of 8-column panels.  With none to process, jump to
	 * .LL30 (outside this view); the nop fills the branch delay slot. */
	sra	N, 3, J
	cmp	J, 0
	ble,pn	%icc, .LL30
	nop
	.align 4
288
/* .LL11: set-up for one panel of eight columns of C. */
.LL11:
#ifdef RT
	/* RT walks B backwards: step back by one 8-column panel (8 * K elements). */
	sll	K, BASE_SHIFT + 3, TEMP1
	sub	B, TEMP1, B
#endif

#ifndef RT
	/* C1..C8 = C + 0..7 * LDC, then advance C past the eight columns. */
	mov	C,  C1
	add	C,  LDC, C2
	add	C2, LDC, C3
	add	C3, LDC, C4
	add	C4, LDC, C5
	add	C5, LDC, C6
	add	C6, LDC, C7
	add	C7, LDC, C8
	add	C8, LDC, C
#else
	/* RT: columns are assigned downwards from C (C8 = C - LDC, ...,
	 * C1 = C - 8 * LDC).  The last line leaves C = C2 - LDC, which is the
	 * same address as C1, i.e. C has moved back by eight columns. */
	sub	C,  LDC, C8
	sub	C8, LDC, C7
	sub	C7, LDC, C6
	sub	C6, LDC, C5
	sub	C5, LDC, C4
	sub	C4, LDC, C3
	sub	C3, LDC, C2
	sub	C2, LDC, C1
	sub	C2, LDC, C
#endif

#ifdef LN
	/* LN: KK restarts at M + OFFSET for every panel. */
	add	M, OFFSET, KK
#endif

#ifdef LT
	/* LT: KK restarts at OFFSET for every panel. */
	mov	OFFSET, KK
#endif

	/* LN and RT recompute AO from AORIG for each tile; the other variants
	 * simply stream AO forward from A. */
#if defined(LN) || defined(RT)
	mov	A, AORIG
#else
	mov	A, AO
#endif

	/* I = M / 2 = number of 2-row tiles in this panel.  With none, jump to
	 * .LL20 (outside this view); the nop fills the branch delay slot. */
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL20
	nop
	.align 4
336
/* .LL12: set-up for one tile of 2 rows by 8 columns.  Positions AO/BO,
 * preloads the first operands, clears the sixteen accumulators and prefetches
 * the eight C columns, then falls through into the unrolled k-loop at .LL13
 * (or jumps to the remainder handling at .LL15 when fewer than eight k-steps
 * are needed). */
.LL12:
#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	/* LN walks A backwards: step AORIG back by one 2-row panel (2 * K elements). */
	sll	K,  BASE_SHIFT + 1, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	/* Skip the first KK k-steps of both panels:
	 * AO = AORIG + KK * 2 elements, BO = B + KK * 8 elements. */
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 3, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	/* Preload operands for the first k-step.  a5 (AO + 8) and b9 (BO + 8)
	 * are loaded ahead of time for later steps of the unrolled loop. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  8 * SIZE], a5

	LDF	[BO +  0 * SIZE], b1

	/* The remaining loads and the C prefetches are interleaved with FCLR,
	 * which clears one accumulator each. */
	LDF	[BO +  1 * SIZE], b2
	FCLR	(cc01)
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc05)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc09)
	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc13)

	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc02)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc06)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc10)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc14)

	prefetch [C1 + 1 * SIZE], 3
	FCLR	(cc03)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc07)
	prefetch [C3 + 1 * SIZE], 3
	FCLR	(cc11)
	prefetch [C4 + 2 * SIZE], 3
	FCLR	(cc15)

	prefetch [C5 + 1 * SIZE], 3
	FCLR	(cc04)
	prefetch [C6 + 2 * SIZE], 3
	FCLR	(cc08)
	prefetch [C7 + 1 * SIZE], 3
	FCLR	(cc12)
	prefetch [C8 + 2 * SIZE], 3
	FCLR	(cc16)

	/* L = number of groups of eight k-steps: KK / 8 for LT and RN,
	 * (K - KK) / 8 for LN and RT. */
#if defined(LT) || defined(RN)
	sra	KK, 3, L
#else
	sub	K, KK, L
	sra	L,  3, L
#endif
	/* No full group: go straight to the remainder at .LL15 (nop = delay slot). */
	cmp	L,  0
	ble,pn	%icc, .LL15
	nop
	.align 4
405
406.LL13:
407	FMADD	(aa1, bb1, cc01, cc01)
408	FMADD	(aa2, bb1, cc02, cc02)
409	FMADD	(aa1, bb2, cc03, cc03)
410	FMADD	(aa2, bb2, cc04, cc04)
411
412	FMADD	(aa1, bb3, cc05, cc05)
413	LDF	[BO + 16 * SIZE], b1
414	FMADD	(aa2, bb3, cc06, cc06)
415	LDF	[BO +  9 * SIZE], b2
416
417	FMADD	(aa1, bb4, cc07, cc07)
418	LDF	[BO + 10 * SIZE], b3
419	FMADD	(aa2, bb4, cc08, cc08)
420	LDF	[BO + 11 * SIZE], b4
421
422	FMADD	(aa1, bb5, cc09, cc09)
423	LDF	[AO +  2 * SIZE], a3
424	FMADD	(aa2, bb5, cc10, cc10)
425	LDF	[AO +  3 * SIZE], a4
426
427	FMADD	(aa1, bb6, cc11, cc11)
428	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
429	FMADD	(aa2, bb6, cc12, cc12)
430	nop
431
432	FMADD	(aa1, bb7, cc13, cc13)
433	LDF	[BO + 12 * SIZE], b5
434	FMADD	(aa2, bb7, cc14, cc14)
435	LDF	[BO + 13 * SIZE], b6
436
437	FMADD	(aa1, bb8, cc15, cc15)
438	LDF	[BO + 14 * SIZE], b7
439	FMADD	(aa2, bb8, cc16, cc16)
440	LDF	[BO + 15 * SIZE], b8
441
442	FMADD	(aa3, bb9, cc01, cc01)
443	FMADD	(aa4, bb9, cc02, cc02)
444	FMADD	(aa3, bb2, cc03, cc03)
445	FMADD	(aa4, bb2, cc04, cc04)
446
447	FMADD	(aa3, bb3, cc05, cc05)
448	LDF	[BO + 24 * SIZE], b9
449	FMADD	(aa4, bb3, cc06, cc06)
450	LDF	[BO + 17 * SIZE], b2
451
452	FMADD	(aa3, bb4, cc07, cc07)
453	LDF	[BO + 18 * SIZE], b3
454	FMADD	(aa4, bb4, cc08, cc08)
455	LDF	[BO + 19 * SIZE], b4
456
457	FMADD	(aa3, bb5, cc09, cc09)
458	LDF	[AO +  4 * SIZE], a1
459	FMADD	(aa4, bb5, cc10, cc10)
460	LDF	[AO +  5 * SIZE], a2
461
462	FMADD	(aa3, bb6, cc11, cc11)
463	add	L, -1, L
464	FMADD	(aa4, bb6, cc12, cc12)
465	nop
466
467	FMADD	(aa3, bb7, cc13, cc13)
468	LDF	[BO + 20 * SIZE], b5
469	FMADD	(aa4, bb7, cc14, cc14)
470	LDF	[BO + 21 * SIZE], b6
471
472	FMADD	(aa3, bb8, cc15, cc15)
473	LDF	[BO + 22 * SIZE], b7
474	FMADD	(aa4, bb8, cc16, cc16)
475	LDF	[BO + 23 * SIZE], b8
476
477	FMADD	(aa1, bb1, cc01, cc01)
478	FMADD	(aa2, bb1, cc02, cc02)
479	FMADD	(aa1, bb2, cc03, cc03)
480	FMADD	(aa2, bb2, cc04, cc04)
481
482	FMADD	(aa1, bb3, cc05, cc05)
483	LDF	[BO + 32 * SIZE], b1
484	FMADD	(aa2, bb3, cc06, cc06)
485	LDF	[BO + 25 * SIZE], b2
486
487	FMADD	(aa1, bb4, cc07, cc07)
488	LDF	[BO + 26 * SIZE], b3
489	FMADD	(aa2, bb4, cc08, cc08)
490	LDF	[BO + 27 * SIZE], b4
491
492	FMADD	(aa1, bb5, cc09, cc09)
493	LDF	[AO +  6 * SIZE], a3
494	FMADD	(aa2, bb5, cc10, cc10)
495	LDF	[AO +  7 * SIZE], a4
496
497	FMADD	(aa1, bb6, cc11, cc11)
498	nop
499	FMADD	(aa2, bb6, cc12, cc12)
500	nop
501
502	FMADD	(aa1, bb7, cc13, cc13)
503	LDF	[BO + 28 * SIZE], b5
504	FMADD	(aa2, bb7, cc14, cc14)
505	LDF	[BO + 29 * SIZE], b6
506
507	FMADD	(aa1, bb8, cc15, cc15)
508	LDF	[BO + 30 * SIZE], b7
509	FMADD	(aa2, bb8, cc16, cc16)
510	LDF	[BO + 31 * SIZE], b8
511
512	FMADD	(aa3, bb9, cc01, cc01)
513	FMADD	(aa4, bb9, cc02, cc02)
514	FMADD	(aa3, bb2, cc03, cc03)
515	FMADD	(aa4, bb2, cc04, cc04)
516
517	FMADD	(aa3, bb3, cc05, cc05)
518	LDF	[BO + 40 * SIZE], b9
519	FMADD	(aa4, bb3, cc06, cc06)
520	LDF	[BO + 33 * SIZE], b2
521
522	FMADD	(aa3, bb4, cc07, cc07)
523	LDF	[BO + 34 * SIZE], b3
524	FMADD	(aa4, bb4, cc08, cc08)
525	LDF	[BO + 35 * SIZE], b4
526
527	FMADD	(aa3, bb5, cc09, cc09)
528	LDF	[AO + 16 * SIZE], a1  /****/
529	FMADD	(aa4, bb5, cc10, cc10)
530	LDF	[AO +  9 * SIZE], a2
531
532	FMADD	(aa3, bb6, cc11, cc11)
533	nop
534	FMADD	(aa4, bb6, cc12, cc12)
535	nop
536
537	FMADD	(aa3, bb7, cc13, cc13)
538	LDF	[BO + 36 * SIZE], b5
539	FMADD	(aa4, bb7, cc14, cc14)
540	LDF	[BO + 37 * SIZE], b6
541
542	FMADD	(aa3, bb8, cc15, cc15)
543	LDF	[BO + 38 * SIZE], b7
544	FMADD	(aa4, bb8, cc16, cc16)
545	LDF	[BO + 39 * SIZE], b8
546
547	FMADD	(aa5, bb1, cc01, cc01)
548	FMADD	(aa2, bb1, cc02, cc02)
549	FMADD	(aa5, bb2, cc03, cc03)
550	FMADD	(aa2, bb2, cc04, cc04)
551
552	FMADD	(aa5, bb3, cc05, cc05)
553	LDF	[BO + 48 * SIZE], b1
554	FMADD	(aa2, bb3, cc06, cc06)
555	LDF	[BO + 41 * SIZE], b2
556
557	FMADD	(aa5, bb4, cc07, cc07)
558	LDF	[BO + 42 * SIZE], b3
559	FMADD	(aa2, bb4, cc08, cc08)
560	LDF	[BO + 43 * SIZE], b4
561
562	FMADD	(aa5, bb5, cc09, cc09)
563	LDF	[AO + 10 * SIZE], a3
564	FMADD	(aa2, bb5, cc10, cc10)
565	LDF	[AO + 11 * SIZE], a4
566
567	FMADD	(aa5, bb6, cc11, cc11)
568	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
569	FMADD	(aa2, bb6, cc12, cc12)
570	nop
571
572	FMADD	(aa5, bb7, cc13, cc13)
573	LDF	[BO + 44 * SIZE], b5
574	FMADD	(aa2, bb7, cc14, cc14)
575	LDF	[BO + 45 * SIZE], b6
576
577	FMADD	(aa5, bb8, cc15, cc15)
578	LDF	[BO + 46 * SIZE], b7
579	FMADD	(aa2, bb8, cc16, cc16)
580	LDF	[BO + 47 * SIZE], b8
581
582	FMADD	(aa3, bb9, cc01, cc01)
583	FMADD	(aa4, bb9, cc02, cc02)
584	FMADD	(aa3, bb2, cc03, cc03)
585	FMADD	(aa4, bb2, cc04, cc04)
586
587	FMADD	(aa3, bb3, cc05, cc05)
588	LDF	[BO + 56 * SIZE], b9
589	FMADD	(aa4, bb3, cc06, cc06)
590	LDF	[BO + 49 * SIZE], b2
591
592	FMADD	(aa3, bb4, cc07, cc07)
593	LDF	[BO + 50 * SIZE], b3
594	FMADD	(aa4, bb4, cc08, cc08)
595	LDF	[BO + 51 * SIZE], b4
596
597	FMADD	(aa3, bb5, cc09, cc09)
598	LDF	[AO + 12 * SIZE], a5
599	FMADD	(aa4, bb5, cc10, cc10)
600	LDF	[AO + 13 * SIZE], a2
601
602	FMADD	(aa3, bb6, cc11, cc11)
603	cmp	L, 0
604	FMADD	(aa4, bb6, cc12, cc12)
605	nop
606
607	FMADD	(aa3, bb7, cc13, cc13)
608	LDF	[BO + 52 * SIZE], b5
609	FMADD	(aa4, bb7, cc14, cc14)
610	LDF	[BO + 53 * SIZE], b6
611
612	FMADD	(aa3, bb8, cc15, cc15)
613	LDF	[BO + 54 * SIZE], b7
614	FMADD	(aa4, bb8, cc16, cc16)
615	LDF	[BO + 55 * SIZE], b8
616
617	FMADD	(aa5, bb1, cc01, cc01)
618	FMADD	(aa2, bb1, cc02, cc02)
619	FMADD	(aa5, bb2, cc03, cc03)
620	FMADD	(aa2, bb2, cc04, cc04)
621
622	FMADD	(aa5, bb3, cc05, cc05)
623	LDF	[BO + 64 * SIZE], b1
624	FMADD	(aa2, bb3, cc06, cc06)
625	LDF	[BO + 57 * SIZE], b2
626
627	FMADD	(aa5, bb4, cc07, cc07)
628	LDF	[BO + 58 * SIZE], b3
629	FMADD	(aa2, bb4, cc08, cc08)
630	LDF	[BO + 59 * SIZE], b4
631
632	FMADD	(aa5, bb5, cc09, cc09)
633	LDF	[AO + 14 * SIZE], a3
634	FMADD	(aa2, bb5, cc10, cc10)
635	LDF	[AO + 15 * SIZE], a4
636
637	FMADD	(aa5, bb6, cc11, cc11)
638	add	BO, 64 * SIZE, BO
639	FMADD	(aa2, bb6, cc12, cc12)
640	add	AO, 16 * SIZE, AO
641
642	FMADD	(aa5, bb7, cc13, cc13)
643	LDF	[BO -  4 * SIZE], b5
644	FMADD	(aa2, bb7, cc14, cc14)
645	LDF	[BO -  3 * SIZE], b6
646
647	FMADD	(aa5, bb8, cc15, cc15)
648	LDF	[BO -  2 * SIZE], b7
649	FMADD	(aa2, bb8, cc16, cc16)
650	LDF	[BO -  1 * SIZE], b8
651
652	FMADD	(aa3, bb9, cc01, cc01)
653	FMADD	(aa4, bb9, cc02, cc02)
654	FMADD	(aa3, bb2, cc03, cc03)
655	FMADD	(aa4, bb2, cc04, cc04)
656
657	FMADD	(aa3, bb3, cc05, cc05)
658	LDF	[BO +  8 * SIZE], b9
659	FMADD	(aa4, bb3, cc06, cc06)
660	LDF	[BO +  1 * SIZE], b2
661
662	FMADD	(aa3, bb4, cc07, cc07)
663	LDF	[BO +  2 * SIZE], b3
664	FMADD	(aa4, bb4, cc08, cc08)
665	LDF	[BO +  3 * SIZE], b4
666
667	FMADD	(aa3, bb5, cc09, cc09)
668	LDF	[AO +  8 * SIZE], a5  /****/
669	FMADD	(aa4, bb5, cc10, cc10)
670	LDF	[AO +  1 * SIZE], a2
671
672	FMADD	(aa3, bb6, cc11, cc11)
673	FMADD	(aa4, bb6, cc12, cc12)
674
675	FMADD	(aa3, bb7, cc13, cc13)
676	LDF	[BO +  4 * SIZE], b5
677	FMADD	(aa4, bb7, cc14, cc14)
678	LDF	[BO +  5 * SIZE], b6
679
680	FMADD	(aa3, bb8, cc15, cc15)
681	LDF	[BO +  6 * SIZE], b7
682	FMADD	(aa4, bb8, cc16, cc16)
683	ble,pn	%icc, .LL15
684	LDF	[BO +  7 * SIZE], b8
685
686	FMADD	(aa1, bb1, cc01, cc01)
687	FMADD	(aa2, bb1, cc02, cc02)
688	FMADD	(aa1, bb2, cc03, cc03)
689	FMADD	(aa2, bb2, cc04, cc04)
690
691	FMADD	(aa1, bb3, cc05, cc05)
692	LDF	[BO + 16 * SIZE], b1
693	FMADD	(aa2, bb3, cc06, cc06)
694	LDF	[BO +  9 * SIZE], b2
695
696	FMADD	(aa1, bb4, cc07, cc07)
697	LDF	[BO + 10 * SIZE], b3
698	FMADD	(aa2, bb4, cc08, cc08)
699	LDF	[BO + 11 * SIZE], b4
700
701	FMADD	(aa1, bb5, cc09, cc09)
702	LDF	[AO +  2 * SIZE], a3
703	FMADD	(aa2, bb5, cc10, cc10)
704	LDF	[AO +  3 * SIZE], a4
705
706	FMADD	(aa1, bb6, cc11, cc11)
707	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
708	FMADD	(aa2, bb6, cc12, cc12)
709	nop
710
711	FMADD	(aa1, bb7, cc13, cc13)
712	LDF	[BO + 12 * SIZE], b5
713	FMADD	(aa2, bb7, cc14, cc14)
714	LDF	[BO + 13 * SIZE], b6
715
716	FMADD	(aa1, bb8, cc15, cc15)
717	LDF	[BO + 14 * SIZE], b7
718	FMADD	(aa2, bb8, cc16, cc16)
719	LDF	[BO + 15 * SIZE], b8
720
721	FMADD	(aa3, bb9, cc01, cc01)
722	FMADD	(aa4, bb9, cc02, cc02)
723	FMADD	(aa3, bb2, cc03, cc03)
724	FMADD	(aa4, bb2, cc04, cc04)
725
726	FMADD	(aa3, bb3, cc05, cc05)
727	LDF	[BO + 24 * SIZE], b9
728	FMADD	(aa4, bb3, cc06, cc06)
729	LDF	[BO + 17 * SIZE], b2
730
731	FMADD	(aa3, bb4, cc07, cc07)
732	LDF	[BO + 18 * SIZE], b3
733	FMADD	(aa4, bb4, cc08, cc08)
734	LDF	[BO + 19 * SIZE], b4
735
736	FMADD	(aa3, bb5, cc09, cc09)
737	LDF	[AO +  4 * SIZE], a1
738	FMADD	(aa4, bb5, cc10, cc10)
739	LDF	[AO +  5 * SIZE], a2
740
741	FMADD	(aa3, bb6, cc11, cc11)
742	add	L, -1, L
743	FMADD	(aa4, bb6, cc12, cc12)
744	nop
745
746	FMADD	(aa3, bb7, cc13, cc13)
747	LDF	[BO + 20 * SIZE], b5
748	FMADD	(aa4, bb7, cc14, cc14)
749	LDF	[BO + 21 * SIZE], b6
750
751	FMADD	(aa3, bb8, cc15, cc15)
752	LDF	[BO + 22 * SIZE], b7
753	FMADD	(aa4, bb8, cc16, cc16)
754	LDF	[BO + 23 * SIZE], b8
755
756	FMADD	(aa1, bb1, cc01, cc01)
757	FMADD	(aa2, bb1, cc02, cc02)
758	FMADD	(aa1, bb2, cc03, cc03)
759	FMADD	(aa2, bb2, cc04, cc04)
760
761	FMADD	(aa1, bb3, cc05, cc05)
762	LDF	[BO + 32 * SIZE], b1
763	FMADD	(aa2, bb3, cc06, cc06)
764	LDF	[BO + 25 * SIZE], b2
765
766	FMADD	(aa1, bb4, cc07, cc07)
767	LDF	[BO + 26 * SIZE], b3
768	FMADD	(aa2, bb4, cc08, cc08)
769	LDF	[BO + 27 * SIZE], b4
770
771	FMADD	(aa1, bb5, cc09, cc09)
772	LDF	[AO +  6 * SIZE], a3
773	FMADD	(aa2, bb5, cc10, cc10)
774	LDF	[AO +  7 * SIZE], a4
775
776	FMADD	(aa1, bb6, cc11, cc11)
777	nop
778	FMADD	(aa2, bb6, cc12, cc12)
779	nop
780
781	FMADD	(aa1, bb7, cc13, cc13)
782	LDF	[BO + 28 * SIZE], b5
783	FMADD	(aa2, bb7, cc14, cc14)
784	LDF	[BO + 29 * SIZE], b6
785
786	FMADD	(aa1, bb8, cc15, cc15)
787	LDF	[BO + 30 * SIZE], b7
788	FMADD	(aa2, bb8, cc16, cc16)
789	LDF	[BO + 31 * SIZE], b8
790
791	FMADD	(aa3, bb9, cc01, cc01)
792	FMADD	(aa4, bb9, cc02, cc02)
793	FMADD	(aa3, bb2, cc03, cc03)
794	FMADD	(aa4, bb2, cc04, cc04)
795
796	FMADD	(aa3, bb3, cc05, cc05)
797	LDF	[BO + 40 * SIZE], b9
798	FMADD	(aa4, bb3, cc06, cc06)
799	LDF	[BO + 33 * SIZE], b2
800
801	FMADD	(aa3, bb4, cc07, cc07)
802	LDF	[BO + 34 * SIZE], b3
803	FMADD	(aa4, bb4, cc08, cc08)
804	LDF	[BO + 35 * SIZE], b4
805
806	FMADD	(aa3, bb5, cc09, cc09)
807	LDF	[AO + 16 * SIZE], a1  /****/
808	FMADD	(aa4, bb5, cc10, cc10)
809	LDF	[AO +  9 * SIZE], a2
810
811	FMADD	(aa3, bb6, cc11, cc11)
812	nop
813	FMADD	(aa4, bb6, cc12, cc12)
814	nop
815
816	FMADD	(aa3, bb7, cc13, cc13)
817	LDF	[BO + 36 * SIZE], b5
818	FMADD	(aa4, bb7, cc14, cc14)
819	LDF	[BO + 37 * SIZE], b6
820
821	FMADD	(aa3, bb8, cc15, cc15)
822	LDF	[BO + 38 * SIZE], b7
823	FMADD	(aa4, bb8, cc16, cc16)
824	LDF	[BO + 39 * SIZE], b8
825
826	FMADD	(aa5, bb1, cc01, cc01)
827	FMADD	(aa2, bb1, cc02, cc02)
828	FMADD	(aa5, bb2, cc03, cc03)
829	FMADD	(aa2, bb2, cc04, cc04)
830
831	FMADD	(aa5, bb3, cc05, cc05)
832	LDF	[BO + 48 * SIZE], b1
833	FMADD	(aa2, bb3, cc06, cc06)
834	LDF	[BO + 41 * SIZE], b2
835
836	FMADD	(aa5, bb4, cc07, cc07)
837	LDF	[BO + 42 * SIZE], b3
838	FMADD	(aa2, bb4, cc08, cc08)
839	LDF	[BO + 43 * SIZE], b4
840
841	FMADD	(aa5, bb5, cc09, cc09)
842	LDF	[AO + 10 * SIZE], a3
843	FMADD	(aa2, bb5, cc10, cc10)
844	LDF	[AO + 11 * SIZE], a4
845
846	FMADD	(aa5, bb6, cc11, cc11)
847	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
848	FMADD	(aa2, bb6, cc12, cc12)
849	nop
850
851	FMADD	(aa5, bb7, cc13, cc13)
852	LDF	[BO + 44 * SIZE], b5
853	FMADD	(aa2, bb7, cc14, cc14)
854	LDF	[BO + 45 * SIZE], b6
855
856	FMADD	(aa5, bb8, cc15, cc15)
857	LDF	[BO + 46 * SIZE], b7
858	FMADD	(aa2, bb8, cc16, cc16)
859	LDF	[BO + 47 * SIZE], b8
860
861	FMADD	(aa3, bb9, cc01, cc01)
862	FMADD	(aa4, bb9, cc02, cc02)
863	FMADD	(aa3, bb2, cc03, cc03)
864	FMADD	(aa4, bb2, cc04, cc04)
865
866	FMADD	(aa3, bb3, cc05, cc05)
867	LDF	[BO + 56 * SIZE], b9
868	FMADD	(aa4, bb3, cc06, cc06)
869	LDF	[BO + 49 * SIZE], b2
870
871	FMADD	(aa3, bb4, cc07, cc07)
872	LDF	[BO + 50 * SIZE], b3
873	FMADD	(aa4, bb4, cc08, cc08)
874	LDF	[BO + 51 * SIZE], b4
875
876	FMADD	(aa3, bb5, cc09, cc09)
877	LDF	[AO + 12 * SIZE], a5
878	FMADD	(aa4, bb5, cc10, cc10)
879	LDF	[AO + 13 * SIZE], a2
880
881	FMADD	(aa3, bb6, cc11, cc11)
882	cmp	L, 0
883	FMADD	(aa4, bb6, cc12, cc12)
884	nop
885
886	FMADD	(aa3, bb7, cc13, cc13)
887	LDF	[BO + 52 * SIZE], b5
888	FMADD	(aa4, bb7, cc14, cc14)
889	LDF	[BO + 53 * SIZE], b6
890
891	FMADD	(aa3, bb8, cc15, cc15)
892	LDF	[BO + 54 * SIZE], b7
893	FMADD	(aa4, bb8, cc16, cc16)
894	LDF	[BO + 55 * SIZE], b8
895
896	FMADD	(aa5, bb1, cc01, cc01)
897	FMADD	(aa2, bb1, cc02, cc02)
898	FMADD	(aa5, bb2, cc03, cc03)
899	FMADD	(aa2, bb2, cc04, cc04)
900
901	FMADD	(aa5, bb3, cc05, cc05)
902	LDF	[BO + 64 * SIZE], b1
903	FMADD	(aa2, bb3, cc06, cc06)
904	LDF	[BO + 57 * SIZE], b2
905
906	FMADD	(aa5, bb4, cc07, cc07)
907	LDF	[BO + 58 * SIZE], b3
908	FMADD	(aa2, bb4, cc08, cc08)
909	LDF	[BO + 59 * SIZE], b4
910
911	FMADD	(aa5, bb5, cc09, cc09)
912	LDF	[AO + 14 * SIZE], a3
913	FMADD	(aa2, bb5, cc10, cc10)
914	LDF	[AO + 15 * SIZE], a4
915
916	FMADD	(aa5, bb6, cc11, cc11)
917	add	BO, 64 * SIZE, BO
918	FMADD	(aa2, bb6, cc12, cc12)
919	add	AO, 16 * SIZE, AO
920
921	FMADD	(aa5, bb7, cc13, cc13)
922	LDF	[BO -  4 * SIZE], b5
923	FMADD	(aa2, bb7, cc14, cc14)
924	LDF	[BO -  3 * SIZE], b6
925
926	FMADD	(aa5, bb8, cc15, cc15)
927	LDF	[BO -  2 * SIZE], b7
928	FMADD	(aa2, bb8, cc16, cc16)
929	LDF	[BO -  1 * SIZE], b8
930
931	FMADD	(aa3, bb9, cc01, cc01)
932	FMADD	(aa4, bb9, cc02, cc02)
933	FMADD	(aa3, bb2, cc03, cc03)
934	FMADD	(aa4, bb2, cc04, cc04)
935
936	FMADD	(aa3, bb3, cc05, cc05)
937	LDF	[BO +  8 * SIZE], b9
938	FMADD	(aa4, bb3, cc06, cc06)
939	LDF	[BO +  1 * SIZE], b2
940
941	FMADD	(aa3, bb4, cc07, cc07)
942	LDF	[BO +  2 * SIZE], b3
943	FMADD	(aa4, bb4, cc08, cc08)
944	LDF	[BO +  3 * SIZE], b4
945
946	FMADD	(aa3, bb5, cc09, cc09)
947	LDF	[AO +  8 * SIZE], a5  /****/
948	FMADD	(aa4, bb5, cc10, cc10)
949	LDF	[AO +  1 * SIZE], a2
950
951	FMADD	(aa3, bb6, cc11, cc11)
952	FMADD	(aa4, bb6, cc12, cc12)
953
954	FMADD	(aa3, bb7, cc13, cc13)
955	LDF	[BO +  4 * SIZE], b5
956	FMADD	(aa4, bb7, cc14, cc14)
957	LDF	[BO +  5 * SIZE], b6
958
959	FMADD	(aa3, bb8, cc15, cc15)
960	LDF	[BO +  6 * SIZE], b7
961	FMADD	(aa4, bb8, cc16, cc16)
962	bg,pt	%icc, .LL13
963	LDF	[BO +  7 * SIZE], b8
964	.align 4
965
/* .LL15: reached when the unrolled loop at .LL13 is finished or was skipped.
 * Computes the number of k-steps left over after the groups of eight. */
.LL15:
	/* L = KK mod 8 for LT and RN, (K - KK) mod 8 for LN and RT. */
#if defined(LT) || defined(RN)
	and	KK, 7, L
#else
	sub	K, KK, L
	and	L,  7, L
#endif
	/* Nothing left: skip the remainder loop and go to .LL18.  The branch has
	 * the annul bit set; its delay slot holds only a nop. */
	cmp	L,  0
	ble,a,pn %icc, .LL18
	nop
	.align 4
977
/* .LL17: remainder k-loop, one k-step per iteration (L iterations).
 * Each pass issues sixteen FMADDs -- a1 and a2 against b1..b8 -- into
 * cc01..cc16, while reloading every operand for the next step.  Each operand
 * register is reloaded only after its last use in the current step.
 * NOTE(review): FMADD (x, y, z, w) is a macro defined outside this file;
 * presumably w = x * y + z -- confirm. */
.LL17:
	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	nop

	/* Loads up to the pointer bump below address the NEXT step relative to
	 * the current BO, hence offsets 8..13. */
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  8 * SIZE], b1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	/* Set the condition codes for the loop branch early; no later
	 * instruction in this loop body is an integer cc-setting form. */
	FMADD	(aa1, bb3, cc05, cc05)
	cmp	L, 0
	FMADD	(aa2, bb3, cc06, cc06)
	nop

	FMADD	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD	(aa1, bb5, cc09, cc09)
	nop
	FMADD	(aa2, bb5, cc10, cc10)
	nop

	FMADD	(aa1, bb6, cc11, cc11)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa2, bb6, cc12, cc12)
	LDF	[BO + 13 * SIZE], b6

	/* Advance to the next k-step: 2 elements of A, 8 elements of B. */
	FMADD	(aa1, bb7, cc13, cc13)
	add	AO, 2 * SIZE, AO
	FMADD	(aa2, bb7, cc14, cc14)
	add	BO, 8 * SIZE, BO

	/* From here on the offsets are relative to the advanced pointers. */
	FMADD	(aa1, bb8, cc15, cc15)
	LDF	[AO +  0 * SIZE], a1
	FMADD	(aa2, bb8, cc16, cc16)
	LDF	[AO +  1 * SIZE], a2

	/* Loop while L > 0 (flags from the cmp above); the b8 load sits in the
	 * branch delay slot and executes on both the taken and fall-through path. */
	LDF	[BO +  6 * SIZE], b7
	bg,pt	%icc, .LL17
	LDF	[BO +  7 * SIZE], b8
	nop
	.align 4
1024
1025.LL18:
1026#if defined(LN) || defined(RT)
1027#ifdef LN
1028	sub	KK, 2, TEMP1
1029#else
1030	sub	KK, 8, TEMP1
1031#endif
1032	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1033	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1034
1035	add	AORIG, TEMP2, AO
1036	add	B,     TEMP1, BO
1037#endif
1038
1039#if defined(LN) || defined(LT)
1040	LDF	[BO +  0 * SIZE], a1
1041	LDF	[BO +  1 * SIZE], a2
1042	LDF	[BO +  2 * SIZE], a3
1043	LDF	[BO +  3 * SIZE], a4
1044
1045	LDF	[BO +  4 * SIZE], b1
1046	LDF	[BO +  5 * SIZE], b2
1047	LDF	[BO +  6 * SIZE], b3
1048	LDF	[BO +  7 * SIZE], b4
1049
1050	FSUB	a1, c01, c01
1051	FSUB	a2, c03, c03
1052	FSUB	a3, c05, c05
1053	FSUB	a4, c07, c07
1054
1055	FSUB	b1, c09, c09
1056	FSUB	b2, c11, c11
1057	FSUB	b3, c13, c13
1058	FSUB	b4, c15, c15
1059
1060	LDF	[BO +  8 * SIZE], a1
1061	LDF	[BO +  9 * SIZE], a2
1062	LDF	[BO + 10 * SIZE], a3
1063	LDF	[BO + 11 * SIZE], a4
1064
1065	LDF	[BO + 12 * SIZE], b1
1066	LDF	[BO + 13 * SIZE], b2
1067	LDF	[BO + 14 * SIZE], b3
1068	LDF	[BO + 15 * SIZE], b4
1069
1070	FSUB	a1, c02, c02
1071	FSUB	a2, c04, c04
1072	FSUB	a3, c06, c06
1073	FSUB	a4, c08, c08
1074
1075	FSUB	b1, c10, c10
1076	FSUB	b2, c12, c12
1077	FSUB	b3, c14, c14
1078	FSUB	b4, c16, c16
1079#else
1080	LDF	[AO +  0 * SIZE], a1
1081	LDF	[AO +  1 * SIZE], a2
1082	LDF	[AO +  2 * SIZE], a3
1083	LDF	[AO +  3 * SIZE], a4
1084
1085	LDF	[AO +  4 * SIZE], b1
1086	LDF	[AO +  5 * SIZE], b2
1087	LDF	[AO +  6 * SIZE], b3
1088	LDF	[AO +  7 * SIZE], b4
1089
1090	FSUB	a1, c01, c01
1091	FSUB	a2, c02, c02
1092	FSUB	a3, c03, c03
1093	FSUB	a4, c04, c04
1094
1095	FSUB	b1, c05, c05
1096	FSUB	b2, c06, c06
1097	FSUB	b3, c07, c07
1098	FSUB	b4, c08, c08
1099
1100	LDF	[AO +  8 * SIZE], a1
1101	LDF	[AO +  9 * SIZE], a2
1102	LDF	[AO + 10 * SIZE], a3
1103	LDF	[AO + 11 * SIZE], a4
1104
1105	LDF	[AO + 12 * SIZE], b1
1106	LDF	[AO + 13 * SIZE], b2
1107	LDF	[AO + 14 * SIZE], b3
1108	LDF	[AO + 15 * SIZE], b4
1109
1110	FSUB	a1, c09, c09
1111	FSUB	a2, c10, c10
1112	FSUB	a3, c11, c11
1113	FSUB	a4, c12, c12
1114
1115	FSUB	b1, c13, c13
1116	FSUB	b2, c14, c14
1117	FSUB	b3, c15, c15
1118	FSUB	b4, c16, c16
1119#endif
1120
1121#ifdef LN
1122	LDF	[AO +  3 * SIZE], a1
1123	LDF	[AO +  2 * SIZE], a2
1124	LDF	[AO +  0 * SIZE], a3
1125
1126	FMUL	a1, c02, c02
1127	FMUL	a1, c04, c04
1128	FMUL	a1, c06, c06
1129	FMUL	a1, c08, c08
1130	FMUL	a1, c10, c10
1131	FMUL	a1, c12, c12
1132	FMUL	a1, c14, c14
1133	FMUL	a1, c16, c16
1134
1135	FNMSUB	(aa2, cc02, cc01, cc01)
1136	FNMSUB	(aa2, cc04, cc03, cc03)
1137	FNMSUB	(aa2, cc06, cc05, cc05)
1138	FNMSUB	(aa2, cc08, cc07, cc07)
1139	FNMSUB	(aa2, cc10, cc09, cc09)
1140	FNMSUB	(aa2, cc12, cc11, cc11)
1141	FNMSUB	(aa2, cc14, cc13, cc13)
1142	FNMSUB	(aa2, cc16, cc15, cc15)
1143
1144	FMUL	a3, c01, c01
1145	FMUL	a3, c03, c03
1146	FMUL	a3, c05, c05
1147	FMUL	a3, c07, c07
1148	FMUL	a3, c09, c09
1149	FMUL	a3, c11, c11
1150	FMUL	a3, c13, c13
1151	FMUL	a3, c15, c15
1152#endif
1153
1154#ifdef LT
1155	LDF	[AO +  0 * SIZE], a1
1156	LDF	[AO +  1 * SIZE], a2
1157	LDF	[AO +  3 * SIZE], a3
1158
1159	FMUL	a1, c01, c01
1160	FMUL	a1, c03, c03
1161	FMUL	a1, c05, c05
1162	FMUL	a1, c07, c07
1163	FMUL	a1, c09, c09
1164	FMUL	a1, c11, c11
1165	FMUL	a1, c13, c13
1166	FMUL	a1, c15, c15
1167
1168	FNMSUB	(aa2, cc01, cc02, cc02)
1169	FNMSUB	(aa2, cc03, cc04, cc04)
1170	FNMSUB	(aa2, cc05, cc06, cc06)
1171	FNMSUB	(aa2, cc07, cc08, cc08)
1172	FNMSUB	(aa2, cc09, cc10, cc10)
1173	FNMSUB	(aa2, cc11, cc12, cc12)
1174	FNMSUB	(aa2, cc13, cc14, cc14)
1175	FNMSUB	(aa2, cc15, cc16, cc16)
1176
1177	FMUL	a3, c02, c02
1178	FMUL	a3, c04, c04
1179	FMUL	a3, c06, c06
1180	FMUL	a3, c08, c08
1181	FMUL	a3, c10, c10
1182	FMUL	a3, c12, c12
1183	FMUL	a3, c14, c14
1184	FMUL	a3, c16, c16
1185#endif
1186
1187#ifdef RN
1188	LDF	[BO +  0 * SIZE], a1
1189	LDF	[BO +  1 * SIZE], a2
1190	LDF	[BO +  2 * SIZE], a3
1191	LDF	[BO +  3 * SIZE], a4
1192	LDF	[BO +  4 * SIZE], b1
1193	LDF	[BO +  5 * SIZE], b2
1194	LDF	[BO +  6 * SIZE], b3
1195	LDF	[BO +  7 * SIZE], b4
1196
1197	FMUL	a1, c01, c01
1198	FMUL	a1, c02, c02
1199
1200	FNMSUB	(aa2, cc01, cc03, cc03)
1201	FNMSUB	(aa2, cc02, cc04, cc04)
1202	FNMSUB	(aa3, cc01, cc05, cc05)
1203	FNMSUB	(aa3, cc02, cc06, cc06)
1204	FNMSUB	(aa4, cc01, cc07, cc07)
1205	FNMSUB	(aa4, cc02, cc08, cc08)
1206	FNMSUB	(bb1, cc01, cc09, cc09)
1207	FNMSUB	(bb1, cc02, cc10, cc10)
1208	FNMSUB	(bb2, cc01, cc11, cc11)
1209	FNMSUB	(bb2, cc02, cc12, cc12)
1210	FNMSUB	(bb3, cc01, cc13, cc13)
1211	FNMSUB	(bb3, cc02, cc14, cc14)
1212	FNMSUB	(bb4, cc01, cc15, cc15)
1213	FNMSUB	(bb4, cc02, cc16, cc16)
1214
1215	LDF	[BO +  9 * SIZE], a1
1216	LDF	[BO + 10 * SIZE], a2
1217	LDF	[BO + 11 * SIZE], a3
1218	LDF	[BO + 12 * SIZE], a4
1219	LDF	[BO + 13 * SIZE], b1
1220	LDF	[BO + 14 * SIZE], b2
1221	LDF	[BO + 15 * SIZE], b3
1222
1223	FMUL	a1, c03, c03
1224	FMUL	a1, c04, c04
1225
1226	FNMSUB	(aa2, cc03, cc05, cc05)
1227	FNMSUB	(aa2, cc04, cc06, cc06)
1228	FNMSUB	(aa3, cc03, cc07, cc07)
1229	FNMSUB	(aa3, cc04, cc08, cc08)
1230	FNMSUB	(aa4, cc03, cc09, cc09)
1231	FNMSUB	(aa4, cc04, cc10, cc10)
1232	FNMSUB	(bb1, cc03, cc11, cc11)
1233	FNMSUB	(bb1, cc04, cc12, cc12)
1234	FNMSUB	(bb2, cc03, cc13, cc13)
1235	FNMSUB	(bb2, cc04, cc14, cc14)
1236	FNMSUB	(bb3, cc03, cc15, cc15)
1237	FNMSUB	(bb3, cc04, cc16, cc16)
1238
1239	LDF	[BO + 18 * SIZE], a1
1240	LDF	[BO + 19 * SIZE], a2
1241	LDF	[BO + 20 * SIZE], a3
1242	LDF	[BO + 21 * SIZE], a4
1243	LDF	[BO + 22 * SIZE], b1
1244	LDF	[BO + 23 * SIZE], b2
1245
1246	FMUL	a1, c05, c05
1247	FMUL	a1, c06, c06
1248
1249	FNMSUB	(aa2, cc05, cc07, cc07)
1250	FNMSUB	(aa2, cc06, cc08, cc08)
1251	FNMSUB	(aa3, cc05, cc09, cc09)
1252	FNMSUB	(aa3, cc06, cc10, cc10)
1253	FNMSUB	(aa4, cc05, cc11, cc11)
1254	FNMSUB	(aa4, cc06, cc12, cc12)
1255	FNMSUB	(bb1, cc05, cc13, cc13)
1256	FNMSUB	(bb1, cc06, cc14, cc14)
1257	FNMSUB	(bb2, cc05, cc15, cc15)
1258	FNMSUB	(bb2, cc06, cc16, cc16)
1259
1260	LDF	[BO + 27 * SIZE], a1
1261	LDF	[BO + 28 * SIZE], a2
1262	LDF	[BO + 29 * SIZE], a3
1263	LDF	[BO + 30 * SIZE], a4
1264	LDF	[BO + 31 * SIZE], b1
1265
1266	FMUL	a1, c07, c07
1267	FMUL	a1, c08, c08
1268
1269	FNMSUB	(aa2, cc07, cc09, cc09)
1270	FNMSUB	(aa2, cc08, cc10, cc10)
1271	FNMSUB	(aa3, cc07, cc11, cc11)
1272	FNMSUB	(aa3, cc08, cc12, cc12)
1273	FNMSUB	(aa4, cc07, cc13, cc13)
1274	FNMSUB	(aa4, cc08, cc14, cc14)
1275	FNMSUB	(bb1, cc07, cc15, cc15)
1276	FNMSUB	(bb1, cc08, cc16, cc16)
1277
1278	LDF	[BO + 36 * SIZE], a1
1279	LDF	[BO + 37 * SIZE], a2
1280	LDF	[BO + 38 * SIZE], a3
1281	LDF	[BO + 39 * SIZE], a4
1282
1283	FMUL	a1, c09, c09
1284	FMUL	a1, c10, c10
1285
1286	FNMSUB	(aa2, cc09, cc11, cc11)
1287	FNMSUB	(aa2, cc10, cc12, cc12)
1288	FNMSUB	(aa3, cc09, cc13, cc13)
1289	FNMSUB	(aa3, cc10, cc14, cc14)
1290	FNMSUB	(aa4, cc09, cc15, cc15)
1291	FNMSUB	(aa4, cc10, cc16, cc16)
1292
1293	LDF	[BO + 45 * SIZE], a1
1294	LDF	[BO + 46 * SIZE], a2
1295	LDF	[BO + 47 * SIZE], a3
1296
1297	FMUL	a1, c11, c11
1298	FMUL	a1, c12, c12
1299
1300	FNMSUB	(aa2, cc11, cc13, cc13)
1301	FNMSUB	(aa2, cc12, cc14, cc14)
1302	FNMSUB	(aa3, cc11, cc15, cc15)
1303	FNMSUB	(aa3, cc12, cc16, cc16)
1304
1305	LDF	[BO + 54 * SIZE], a1
1306	LDF	[BO + 55 * SIZE], a2
1307
1308	FMUL	a1, c13, c13
1309	FMUL	a1, c14, c14
1310
1311	FNMSUB	(aa2, cc13, cc15, cc15)
1312	FNMSUB	(aa2, cc14, cc16, cc16)
1313
1314	LDF	[BO + 63 * SIZE], a1
1315
1316	FMUL	a1, c15, c15
1317	FMUL	a1, c16, c16
1318#endif
1319
1320#ifdef RT
1321	LDF	[BO + 63 * SIZE], a1
1322	LDF	[BO + 62 * SIZE], a2
1323	LDF	[BO + 61 * SIZE], a3
1324	LDF	[BO + 60 * SIZE], a4
1325	LDF	[BO + 59 * SIZE], b1
1326	LDF	[BO + 58 * SIZE], b2
1327	LDF	[BO + 57 * SIZE], b3
1328	LDF	[BO + 56 * SIZE], b4
1329
1330	FMUL	a1, c16, c16
1331	FMUL	a1, c15, c15
1332
1333	FNMSUB	(aa2, cc16, cc14, cc14)
1334	FNMSUB	(aa2, cc15, cc13, cc13)
1335	FNMSUB	(aa3, cc16, cc12, cc12)
1336	FNMSUB	(aa3, cc15, cc11, cc11)
1337	FNMSUB	(aa4, cc16, cc10, cc10)
1338	FNMSUB	(aa4, cc15, cc09, cc09)
1339	FNMSUB	(bb1, cc16, cc08, cc08)
1340	FNMSUB	(bb1, cc15, cc07, cc07)
1341	FNMSUB	(bb2, cc16, cc06, cc06)
1342	FNMSUB	(bb2, cc15, cc05, cc05)
1343	FNMSUB	(bb3, cc16, cc04, cc04)
1344	FNMSUB	(bb3, cc15, cc03, cc03)
1345	FNMSUB	(bb4, cc16, cc02, cc02)
1346	FNMSUB	(bb4, cc15, cc01, cc01)
1347
1348	LDF	[BO + 54 * SIZE], a1
1349	LDF	[BO + 53 * SIZE], a2
1350	LDF	[BO + 52 * SIZE], a3
1351	LDF	[BO + 51 * SIZE], a4
1352	LDF	[BO + 50 * SIZE], b1
1353	LDF	[BO + 49 * SIZE], b2
1354	LDF	[BO + 48 * SIZE], b3
1355
1356	FMUL	a1, c14, c14
1357	FMUL	a1, c13, c13
1358
1359	FNMSUB	(aa2, cc14, cc12, cc12)
1360	FNMSUB	(aa2, cc13, cc11, cc11)
1361	FNMSUB	(aa3, cc14, cc10, cc10)
1362	FNMSUB	(aa3, cc13, cc09, cc09)
1363	FNMSUB	(aa4, cc14, cc08, cc08)
1364	FNMSUB	(aa4, cc13, cc07, cc07)
1365	FNMSUB	(bb1, cc14, cc06, cc06)
1366	FNMSUB	(bb1, cc13, cc05, cc05)
1367	FNMSUB	(bb2, cc14, cc04, cc04)
1368	FNMSUB	(bb2, cc13, cc03, cc03)
1369	FNMSUB	(bb3, cc14, cc02, cc02)
1370	FNMSUB	(bb3, cc13, cc01, cc01)
1371
1372	LDF	[BO + 45 * SIZE], a1
1373	LDF	[BO + 44 * SIZE], a2
1374	LDF	[BO + 43 * SIZE], a3
1375	LDF	[BO + 42 * SIZE], a4
1376	LDF	[BO + 41 * SIZE], b1
1377	LDF	[BO + 40 * SIZE], b2
1378
1379	FMUL	a1, c12, c12
1380	FMUL	a1, c11, c11
1381
1382	FNMSUB	(aa2, cc12, cc10, cc10)
1383	FNMSUB	(aa2, cc11, cc09, cc09)
1384	FNMSUB	(aa3, cc12, cc08, cc08)
1385	FNMSUB	(aa3, cc11, cc07, cc07)
1386	FNMSUB	(aa4, cc12, cc06, cc06)
1387	FNMSUB	(aa4, cc11, cc05, cc05)
1388	FNMSUB	(bb1, cc12, cc04, cc04)
1389	FNMSUB	(bb1, cc11, cc03, cc03)
1390	FNMSUB	(bb2, cc12, cc02, cc02)
1391	FNMSUB	(bb2, cc11, cc01, cc01)
1392
1393	LDF	[BO + 36 * SIZE], a1
1394	LDF	[BO + 35 * SIZE], a2
1395	LDF	[BO + 34 * SIZE], a3
1396	LDF	[BO + 33 * SIZE], a4
1397	LDF	[BO + 32 * SIZE], b1
1398
1399	FMUL	a1, c10, c10
1400	FMUL	a1, c09, c09
1401
1402	FNMSUB	(aa2, cc10, cc08, cc08)
1403	FNMSUB	(aa2, cc09, cc07, cc07)
1404	FNMSUB	(aa3, cc10, cc06, cc06)
1405	FNMSUB	(aa3, cc09, cc05, cc05)
1406	FNMSUB	(aa4, cc10, cc04, cc04)
1407	FNMSUB	(aa4, cc09, cc03, cc03)
1408	FNMSUB	(bb1, cc10, cc02, cc02)
1409	FNMSUB	(bb1, cc09, cc01, cc01)
1410
1411	LDF	[BO + 27 * SIZE], a1
1412	LDF	[BO + 26 * SIZE], a2
1413	LDF	[BO + 25 * SIZE], a3
1414	LDF	[BO + 24 * SIZE], a4
1415
1416	FMUL	a1, c08, c08
1417	FMUL	a1, c07, c07
1418
1419	FNMSUB	(aa2, cc08, cc06, cc06)
1420	FNMSUB	(aa2, cc07, cc05, cc05)
1421	FNMSUB	(aa3, cc08, cc04, cc04)
1422	FNMSUB	(aa3, cc07, cc03, cc03)
1423	FNMSUB	(aa4, cc08, cc02, cc02)
1424	FNMSUB	(aa4, cc07, cc01, cc01)
1425
1426	LDF	[BO + 18 * SIZE], a1
1427	LDF	[BO + 17 * SIZE], a2
1428	LDF	[BO + 16 * SIZE], a3
1429
1430	FMUL	a1, c06, c06
1431	FMUL	a1, c05, c05
1432
1433	FNMSUB	(aa2, cc06, cc04, cc04)
1434	FNMSUB	(aa2, cc05, cc03, cc03)
1435	FNMSUB	(aa3, cc06, cc02, cc02)
1436	FNMSUB	(aa3, cc05, cc01, cc01)
1437
1438	LDF	[BO +  9 * SIZE], a1
1439	LDF	[BO +  8 * SIZE], a2
1440
1441	FMUL	a1, c04, c04
1442	FMUL	a1, c03, c03
1443
1444	FNMSUB	(aa2, cc04, cc02, cc02)
1445	FNMSUB	(aa2, cc03, cc01, cc01)
1446
1447	LDF	[BO +  0 * SIZE], a1
1448
1449	FMUL	a1, c02, c02
1450	FMUL	a1, c01, c01
1451#endif
1452
1453#ifdef LN
1454	add	C1, -2 * SIZE, C1
1455	add	C2, -2 * SIZE, C2
1456	add	C3, -2 * SIZE, C3
1457	add	C4, -2 * SIZE, C4
1458	add	C5, -2 * SIZE, C5
1459	add	C6, -2 * SIZE, C6
1460	add	C7, -2 * SIZE, C7
1461	add	C8, -2 * SIZE, C8
1462#endif
1463
1464#if defined(LN) || defined(LT)
1465	STF	c01, [BO +  0 * SIZE]
1466	STF	c03, [BO +  1 * SIZE]
1467	STF	c05, [BO +  2 * SIZE]
1468	STF	c07, [BO +  3 * SIZE]
1469
1470	STF	c09, [BO +  4 * SIZE]
1471	STF	c11, [BO +  5 * SIZE]
1472	STF	c13, [BO +  6 * SIZE]
1473	STF	c15, [BO +  7 * SIZE]
1474
1475	STF	c02, [BO +  8 * SIZE]
1476	STF	c04, [BO +  9 * SIZE]
1477	STF	c06, [BO + 10 * SIZE]
1478	STF	c08, [BO + 11 * SIZE]
1479
1480	STF	c10, [BO + 12 * SIZE]
1481	STF	c12, [BO + 13 * SIZE]
1482	STF	c14, [BO + 14 * SIZE]
1483	STF	c16, [BO + 15 * SIZE]
1484#else
1485	STF	c01, [AO +  0 * SIZE]
1486	STF	c02, [AO +  1 * SIZE]
1487	STF	c03, [AO +  2 * SIZE]
1488	STF	c04, [AO +  3 * SIZE]
1489
1490	STF	c05, [AO +  4 * SIZE]
1491	STF	c06, [AO +  5 * SIZE]
1492	STF	c07, [AO +  6 * SIZE]
1493	STF	c08, [AO +  7 * SIZE]
1494
1495	STF	c09, [AO +  8 * SIZE]
1496	STF	c10, [AO +  9 * SIZE]
1497	STF	c11, [AO + 10 * SIZE]
1498	STF	c12, [AO + 11 * SIZE]
1499
1500	STF	c13, [AO + 12 * SIZE]
1501	STF	c14, [AO + 13 * SIZE]
1502	STF	c15, [AO + 14 * SIZE]
1503	STF	c16, [AO + 15 * SIZE]
1504#endif
1505
1506	STF	c01, [C1 + 0 * SIZE]
1507	STF	c02, [C1 + 1 * SIZE]
1508	STF	c03, [C2 + 0 * SIZE]
1509	STF	c04, [C2 + 1 * SIZE]
1510
1511	STF	c05, [C3 + 0 * SIZE]
1512	STF	c06, [C3 + 1 * SIZE]
1513	STF	c07, [C4 + 0 * SIZE]
1514	STF	c08, [C4 + 1 * SIZE]
1515
1516	STF	c09, [C5 + 0 * SIZE]
1517	STF	c10, [C5 + 1 * SIZE]
1518	STF	c11, [C6 + 0 * SIZE]
1519	STF	c12, [C6 + 1 * SIZE]
1520
1521	STF	c13, [C7 + 0 * SIZE]
1522	STF	c14, [C7 + 1 * SIZE]
1523	STF	c15, [C8 + 0 * SIZE]
1524	STF	c16, [C8 + 1 * SIZE]
1525
1526#ifndef LN
1527	add	C1, 2 * SIZE, C1
1528	add	C2, 2 * SIZE, C2
1529	add	C3, 2 * SIZE, C3
1530	add	C4, 2 * SIZE, C4
1531	add	C5, 2 * SIZE, C5
1532	add	C6, 2 * SIZE, C6
1533	add	C7, 2 * SIZE, C7
1534	add	C8, 2 * SIZE, C8
1535#endif
1536
1537#ifdef RT
1538	sll	K, BASE_SHIFT + 1, TEMP1
1539	add	AORIG, TEMP1, AORIG
1540#endif
1541
1542#if defined(LT) || defined(RN)
1543	sub	K, KK, TEMP1
1544	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1545	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1546	add	AO, TEMP2, AO
1547	add	BO, TEMP1, BO
1548#endif
1549
1550#ifdef LT
1551	add	KK, 2, KK
1552#endif
1553
1554#ifdef LN
1555	sub	KK, 2, KK
1556#endif
1557
1558	add	I, -1, I
1559	cmp	I, 0
1560	bg,pt	%icc, .LL12
1561	nop
1562	.align 4
1563
/* .LL20: set-up for the odd leftover in M (M & 1) of the pass that uses the
   eight pointers C1..C8.  Branches to .LL29 when M is even.
   NOTE(review): LDF/STF/FCLR/FMADD/FNMSUB and the aaN/bbN/ccNN names are
   macros defined outside this chunk; their exact expansion is not visible
   here. */
1564.LL20:
1565	and	M, 1, I
1566	cmp	I, 0
1567	ble,pn	%icc, .LL29
1568	nop
1569
	/* LT/RN: BO restarts at B.
	   LN/RT: AO = AORIG + (KK << BASE_SHIFT),
	          BO = B + (KK << (BASE_SHIFT + 3));
	   for LN, AORIG is first moved back by K << BASE_SHIFT. */
1570#if defined(LT) || defined(RN)
1571	mov	B, BO
1572#else
1573#ifdef LN
1574	sll	K,  BASE_SHIFT + 0, TEMP1
1575	sub	AORIG, TEMP1, AORIG
1576#endif
1577
1578	sll	KK, BASE_SHIFT + 0, TEMP1
1579	sll	KK, BASE_SHIFT + 3, TEMP2
1580
1581	add	AORIG, TEMP1, AO
1582	add	B,     TEMP2, BO
1583#endif
1584
	/* Preload four values from AO and eight from BO, clearing the eight
	   odd-numbered accumulators (cc01..cc15) in between the loads. */
1585	LDF	[AO +  0 * SIZE], a1
1586	LDF	[AO +  1 * SIZE], a2
1587	LDF	[AO +  2 * SIZE], a3
1588	LDF	[AO +  3 * SIZE], a4
1589
1590	LDF	[BO +  0 * SIZE], b1
1591	FCLR	(cc01)
1592	LDF	[BO +  1 * SIZE], b2
1593	FCLR	(cc03)
1594	LDF	[BO +  2 * SIZE], b3
1595	FCLR	(cc05)
1596	LDF	[BO +  3 * SIZE], b4
1597	FCLR	(cc07)
1598	LDF	[BO +  4 * SIZE], b5
1599	FCLR	(cc09)
1600	LDF	[BO +  5 * SIZE], b6
1601	FCLR	(cc11)
1602	LDF	[BO +  6 * SIZE], b7
1603	FCLR	(cc13)
1604	LDF	[BO +  7 * SIZE], b8
1605	FCLR	(cc15)
1606
	/* L = number of 4-step passes of .LL23: KK >> 2 for LT/RN,
	   (K - KK) >> 2 otherwise. */
1607#if defined(LT) || defined(RN)
1608	sra	KK, 2, L
1609#else
1610	sub	K, KK, L
1611	sra	L,  2, L
1612#endif
1613	cmp	L,  0
1614	ble,pn	%icc, .LL25
	/* Delay slot (branch is not annulled, so this load runs on both
	   paths): b9 is the ninth BO value, used by .LL23. */
1615	LDF	[BO +  8 * SIZE], b9
1616	.align 4
1617
/* .LL23: accumulation loop, unrolled four steps per pass.  Each pass
   multiplies a1, a2, a3, a4 in turn by eight BO values and adds into
   cc01..cc15 (odd), then advances AO by 4 * SIZE and BO by 32 * SIZE.
   Loads for the following step are interleaved with the FMADDs.  The first
   product of each step alternates between b1 and b9, presumably so that
   the next value can be fetched while the current one is still in use. */
1618.LL23:
1619	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1620	add	L, -1, L
1621
	/* step 0: a1 with BO[0..7] */
1622	FMADD	(aa1, bb1, cc01, cc01)
1623	LDF	[BO + 16 * SIZE], b1
1624	FMADD	(aa1, bb2, cc03, cc03)
1625	LDF	[BO +  9 * SIZE], b2
1626
1627	FMADD	(aa1, bb3, cc05, cc05)
1628	LDF	[BO + 10 * SIZE], b3
1629	FMADD	(aa1, bb4, cc07, cc07)
1630	LDF	[BO + 11 * SIZE], b4
1631
1632	FMADD	(aa1, bb5, cc09, cc09)
1633	LDF	[BO + 12 * SIZE], b5
1634	FMADD	(aa1, bb6, cc11, cc11)
1635	LDF	[BO + 13 * SIZE], b6
1636
1637	FMADD	(aa1, bb7, cc13, cc13)
1638	LDF	[BO + 14 * SIZE], b7
1639	FMADD	(aa1, bb8, cc15, cc15)
1640	LDF	[BO + 15 * SIZE], b8
1641
	/* step 1: a2 with BO[8..15] (b9 holds BO[8]) */
1642	FMADD	(aa2, bb9, cc01, cc01)
1643	LDF	[BO + 24 * SIZE], b9
1644	FMADD	(aa2, bb2, cc03, cc03)
1645	LDF	[BO + 17 * SIZE], b2
1646
1647	FMADD	(aa2, bb3, cc05, cc05)
1648	LDF	[BO + 18 * SIZE], b3
1649	FMADD	(aa2, bb4, cc07, cc07)
1650	LDF	[BO + 19 * SIZE], b4
1651
1652	FMADD	(aa2, bb5, cc09, cc09)
1653	LDF	[BO + 20 * SIZE], b5
1654	FMADD	(aa2, bb6, cc11, cc11)
1655	LDF	[BO + 21 * SIZE], b6
1656
1657	FMADD	(aa2, bb7, cc13, cc13)
1658	LDF	[BO + 22 * SIZE], b7
1659	FMADD	(aa2, bb8, cc15, cc15)
1660	LDF	[BO + 23 * SIZE], b8
1661
	/* a1/a2 are free now: reload them for the next pass */
1662	LDF	[AO +  4 * SIZE], a1
1663	LDF	[AO +  5 * SIZE], a2
1664
	/* step 2: a3 with BO[16..23] */
1665	FMADD	(aa3, bb1, cc01, cc01)
1666	LDF	[BO + 32 * SIZE], b1
1667	FMADD	(aa3, bb2, cc03, cc03)
1668	LDF	[BO + 25 * SIZE], b2
1669
1670	FMADD	(aa3, bb3, cc05, cc05)
1671	LDF	[BO + 26 * SIZE], b3
1672	FMADD	(aa3, bb4, cc07, cc07)
1673	LDF	[BO + 27 * SIZE], b4
1674
1675	FMADD	(aa3, bb5, cc09, cc09)
1676	LDF	[BO + 28 * SIZE], b5
1677	FMADD	(aa3, bb6, cc11, cc11)
1678	LDF	[BO + 29 * SIZE], b6
1679
1680	FMADD	(aa3, bb7, cc13, cc13)
1681	LDF	[BO + 30 * SIZE], b7
1682	FMADD	(aa3, bb8, cc15, cc15)
1683	LDF	[BO + 31 * SIZE], b8
1684
	/* step 3: a4 with BO[24..31]; the loads fetch BO[32..40] for the
	   next pass */
1685	FMADD	(aa4, bb9, cc01, cc01)
1686	LDF	[BO + 40 * SIZE], b9
1687	FMADD	(aa4, bb2, cc03, cc03)
1688	LDF	[BO + 33 * SIZE], b2
1689
1690	FMADD	(aa4, bb3, cc05, cc05)
1691	LDF	[BO + 34 * SIZE], b3
1692	FMADD	(aa4, bb4, cc07, cc07)
1693	LDF	[BO + 35 * SIZE], b4
1694
1695	FMADD	(aa4, bb5, cc09, cc09)
1696	LDF	[BO + 36 * SIZE], b5
1697	FMADD	(aa4, bb6, cc11, cc11)
1698	LDF	[BO + 37 * SIZE], b6
1699
1700	FMADD	(aa4, bb7, cc13, cc13)
1701	LDF	[BO + 38 * SIZE], b7
1702	FMADD	(aa4, bb8, cc15, cc15)
1703	LDF	[BO + 39 * SIZE], b8
1704
1705	LDF	[AO +  6 * SIZE], a3
1706	LDF	[AO +  7 * SIZE], a4
1707
1708	add	AO,  4 * SIZE, AO
1709	cmp	L, 0
1710	bg,pt	%icc, .LL23
	/* Delay slot: the BO advance executes whether or not the branch is
	   taken. */
1711	add	BO, 32 * SIZE, BO
1712	.align 4
1713
/* .LL25: compute the number of single steps left over after the 4-step
   loop: L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT).  Skips to .LL28 when
   there are none. */
1714.LL25:
1715#if defined(LT) || defined(RN)
1716	and	KK, 3, L
1717#else
1718	sub	K, KK, L
1719	and	L,  3, L
1720#endif
1721	cmp	L,  0
1722	ble,a,pn %icc, .LL28
1723	nop
1724	.align 4
1725
/* .LL27: remainder loop, one step per iteration.  a1 is multiplied by the
   eight current BO values (b1..b8) into cc01..cc15 (odd) while the next
   eight BO values and the next AO value are loaded.  AO advances by
   1 * SIZE and BO by 8 * SIZE per iteration. */
1726.LL27:
1727	FMADD	(aa1, bb1, cc01, cc01)
1728	LDF	[BO +  8 * SIZE], b1
1729	FMADD	(aa1, bb2, cc03, cc03)
1730	LDF	[BO +  9 * SIZE], b2
1731
1732	FMADD	(aa1, bb3, cc05, cc05)
1733	LDF	[BO + 10 * SIZE], b3
1734	FMADD	(aa1, bb4, cc07, cc07)
1735	LDF	[BO + 11 * SIZE], b4
1736
1737	FMADD	(aa1, bb5, cc09, cc09)
1738	LDF	[BO + 12 * SIZE], b5
1739	FMADD	(aa1, bb6, cc11, cc11)
1740	LDF	[BO + 13 * SIZE], b6
1741
1742	FMADD	(aa1, bb7, cc13, cc13)
1743	LDF	[BO + 14 * SIZE], b7
1744	FMADD	(aa1, bb8, cc15, cc15)
1745	LDF	[BO + 15 * SIZE], b8
1746
1747	LDF	[AO +  1 * SIZE], a1
1748	add	AO, 1 * SIZE, AO
1749
1750	add	L, -1, L
1751	cmp	L, 0
1752	bg,pt	%icc, .LL27
	/* Delay slot: BO advance, executed on every iteration including the
	   last. */
1753	add	BO, 8 * SIZE, BO
1754	.align 4
1755
/* .LL28: post-accumulation step for the single-element case with eight
   accumulators (cc01..cc15, odd).  Sequence: reposition AO/BO (LN/RT
   only), combine the accumulators with eight stored values, apply the
   per-variant multiply/update sequence, write the results back to the
   packed buffer and to C1..C8, then advance the pointers and KK.
   NOTE(review): presumably this is the triangular-solve step of a TRSM
   kernel; that is inferred from the LN/LT/RN/RT variants, not shown in
   this chunk. */
1756.LL28:
	/* LN/RT: TEMP1 = KK - 1 (LN) or KK - 8 (RT);
	   AO = AORIG + (TEMP1 << BASE_SHIFT),
	   BO = B + (TEMP1 << (BASE_SHIFT + 3)). */
1757#if defined(LN) || defined(RT)
1758#ifdef LN
1759	sub	KK, 1, TEMP1
1760#else
1761	sub	KK, 8, TEMP1
1762#endif
1763	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1764	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1765
1766	add	AORIG, TEMP2, AO
1767	add	B,     TEMP1, BO
1768#endif
1769
	/* Load eight stored values (from BO for LN/LT, from AO for RN/RT) and
	   FSUB them with the accumulators.
	   NOTE(review): FSUB is a macro defined outside this chunk; presumably
	   "FSUB x, c, c" yields c = x - c -- confirm against its definition. */
1770#if defined(LN) || defined(LT)
1771	LDF	[BO +  0 * SIZE], a1
1772	LDF	[BO +  1 * SIZE], a2
1773	LDF	[BO +  2 * SIZE], a3
1774	LDF	[BO +  3 * SIZE], a4
1775
1776	LDF	[BO +  4 * SIZE], b1
1777	LDF	[BO +  5 * SIZE], b2
1778	LDF	[BO +  6 * SIZE], b3
1779	LDF	[BO +  7 * SIZE], b4
1780
1781	FSUB	a1, c01, c01
1782	FSUB	a2, c03, c03
1783	FSUB	a3, c05, c05
1784	FSUB	a4, c07, c07
1785
1786	FSUB	b1, c09, c09
1787	FSUB	b2, c11, c11
1788	FSUB	b3, c13, c13
1789	FSUB	b4, c15, c15
1790#else
1791	LDF	[AO +  0 * SIZE], a1
1792	LDF	[AO +  1 * SIZE], a2
1793	LDF	[AO +  2 * SIZE], a3
1794	LDF	[AO +  3 * SIZE], a4
1795
1796	LDF	[AO +  4 * SIZE], b1
1797	LDF	[AO +  5 * SIZE], b2
1798	LDF	[AO +  6 * SIZE], b3
1799	LDF	[AO +  7 * SIZE], b4
1800
1801	FSUB	a1, c01, c01
1802	FSUB	a2, c03, c03
1803	FSUB	a3, c05, c05
1804	FSUB	a4, c07, c07
1805
1806	FSUB	b1, c09, c09
1807	FSUB	b2, c11, c11
1808	FSUB	b3, c13, c13
1809	FSUB	b4, c15, c15
1810#endif
1811
	/* LN/LT: scale all eight results by the single value at AO[0]. */
1812#if defined(LN) || defined(LT)
1813	LDF	[AO +  0 * SIZE], a1
1814
1815	FMUL	a1, c01, c01
1816	FMUL	a1, c03, c03
1817	FMUL	a1, c05, c05
1818	FMUL	a1, c07, c07
1819	FMUL	a1, c09, c09
1820	FMUL	a1, c11, c11
1821	FMUL	a1, c13, c13
1822	FMUL	a1, c15, c15
1823#endif
1824
	/* RN: forward pass over the 64 values at BO, going c01 -> c15.  Each
	   stage scales one result by BO[9 * i] and then updates the
	   later results with the following entries of that group
	   (BO[9 * i + 1 .. 8 * i + 7]).
	   NOTE(review): the scaling uses FMUL, so the BO[9 * i] entries are
	   presumably stored pre-inverted -- confirm against the packing
	   code. */
1825#ifdef RN
1826	LDF	[BO +  0 * SIZE], a1
1827	LDF	[BO +  1 * SIZE], a2
1828	LDF	[BO +  2 * SIZE], a3
1829	LDF	[BO +  3 * SIZE], a4
1830	LDF	[BO +  4 * SIZE], b1
1831	LDF	[BO +  5 * SIZE], b2
1832	LDF	[BO +  6 * SIZE], b3
1833	LDF	[BO +  7 * SIZE], b4
1834
1835	FMUL	a1, c01, c01
1836
1837	FNMSUB	(aa2, cc01, cc03, cc03)
1838	FNMSUB	(aa3, cc01, cc05, cc05)
1839	FNMSUB	(aa4, cc01, cc07, cc07)
1840	FNMSUB	(bb1, cc01, cc09, cc09)
1841	FNMSUB	(bb2, cc01, cc11, cc11)
1842	FNMSUB	(bb3, cc01, cc13, cc13)
1843	FNMSUB	(bb4, cc01, cc15, cc15)
1844
1845	LDF	[BO +  9 * SIZE], a1
1846	LDF	[BO + 10 * SIZE], a2
1847	LDF	[BO + 11 * SIZE], a3
1848	LDF	[BO + 12 * SIZE], a4
1849	LDF	[BO + 13 * SIZE], b1
1850	LDF	[BO + 14 * SIZE], b2
1851	LDF	[BO + 15 * SIZE], b3
1852
1853	FMUL	a1, c03, c03
1854
1855	FNMSUB	(aa2, cc03, cc05, cc05)
1856	FNMSUB	(aa3, cc03, cc07, cc07)
1857	FNMSUB	(aa4, cc03, cc09, cc09)
1858	FNMSUB	(bb1, cc03, cc11, cc11)
1859	FNMSUB	(bb2, cc03, cc13, cc13)
1860	FNMSUB	(bb3, cc03, cc15, cc15)
1861
1862	LDF	[BO + 18 * SIZE], a1
1863	LDF	[BO + 19 * SIZE], a2
1864	LDF	[BO + 20 * SIZE], a3
1865	LDF	[BO + 21 * SIZE], a4
1866	LDF	[BO + 22 * SIZE], b1
1867	LDF	[BO + 23 * SIZE], b2
1868
1869	FMUL	a1, c05, c05
1870
1871	FNMSUB	(aa2, cc05, cc07, cc07)
1872	FNMSUB	(aa3, cc05, cc09, cc09)
1873	FNMSUB	(aa4, cc05, cc11, cc11)
1874	FNMSUB	(bb1, cc05, cc13, cc13)
1875	FNMSUB	(bb2, cc05, cc15, cc15)
1876
1877	LDF	[BO + 27 * SIZE], a1
1878	LDF	[BO + 28 * SIZE], a2
1879	LDF	[BO + 29 * SIZE], a3
1880	LDF	[BO + 30 * SIZE], a4
1881	LDF	[BO + 31 * SIZE], b1
1882
1883	FMUL	a1, c07, c07
1884
1885	FNMSUB	(aa2, cc07, cc09, cc09)
1886	FNMSUB	(aa3, cc07, cc11, cc11)
1887	FNMSUB	(aa4, cc07, cc13, cc13)
1888	FNMSUB	(bb1, cc07, cc15, cc15)
1889
1890	LDF	[BO + 36 * SIZE], a1
1891	LDF	[BO + 37 * SIZE], a2
1892	LDF	[BO + 38 * SIZE], a3
1893	LDF	[BO + 39 * SIZE], a4
1894
1895	FMUL	a1, c09, c09
1896
1897	FNMSUB	(aa2, cc09, cc11, cc11)
1898	FNMSUB	(aa3, cc09, cc13, cc13)
1899	FNMSUB	(aa4, cc09, cc15, cc15)
1900
1901	LDF	[BO + 45 * SIZE], a1
1902	LDF	[BO + 46 * SIZE], a2
1903	LDF	[BO + 47 * SIZE], a3
1904
1905	FMUL	a1, c11, c11
1906
1907	FNMSUB	(aa2, cc11, cc13, cc13)
1908	FNMSUB	(aa3, cc11, cc15, cc15)
1909
1910	LDF	[BO + 54 * SIZE], a1
1911	LDF	[BO + 55 * SIZE], a2
1912
1913	FMUL	a1, c13, c13
1914
1915	FNMSUB	(aa2, cc13, cc15, cc15)
1916
1917	LDF	[BO + 63 * SIZE], a1
1918
1919	FMUL	a1, c15, c15
1920#endif
1921
	/* RT: the same scheme run backwards, c15 -> c01, starting at BO[63]
	   and reading each group of BO entries in descending order. */
1922#ifdef RT
1923	LDF	[BO + 63 * SIZE], a1
1924	LDF	[BO + 62 * SIZE], a2
1925	LDF	[BO + 61 * SIZE], a3
1926	LDF	[BO + 60 * SIZE], a4
1927	LDF	[BO + 59 * SIZE], b1
1928	LDF	[BO + 58 * SIZE], b2
1929	LDF	[BO + 57 * SIZE], b3
1930	LDF	[BO + 56 * SIZE], b4
1931
1932	FMUL	a1, c15, c15
1933
1934	FNMSUB	(aa2, cc15, cc13, cc13)
1935	FNMSUB	(aa3, cc15, cc11, cc11)
1936	FNMSUB	(aa4, cc15, cc09, cc09)
1937	FNMSUB	(bb1, cc15, cc07, cc07)
1938	FNMSUB	(bb2, cc15, cc05, cc05)
1939	FNMSUB	(bb3, cc15, cc03, cc03)
1940	FNMSUB	(bb4, cc15, cc01, cc01)
1941
1942	LDF	[BO + 54 * SIZE], a1
1943	LDF	[BO + 53 * SIZE], a2
1944	LDF	[BO + 52 * SIZE], a3
1945	LDF	[BO + 51 * SIZE], a4
1946	LDF	[BO + 50 * SIZE], b1
1947	LDF	[BO + 49 * SIZE], b2
1948	LDF	[BO + 48 * SIZE], b3
1949
1950	FMUL	a1, c13, c13
1951
1952	FNMSUB	(aa2, cc13, cc11, cc11)
1953	FNMSUB	(aa3, cc13, cc09, cc09)
1954	FNMSUB	(aa4, cc13, cc07, cc07)
1955	FNMSUB	(bb1, cc13, cc05, cc05)
1956	FNMSUB	(bb2, cc13, cc03, cc03)
1957	FNMSUB	(bb3, cc13, cc01, cc01)
1958
1959	LDF	[BO + 45 * SIZE], a1
1960	LDF	[BO + 44 * SIZE], a2
1961	LDF	[BO + 43 * SIZE], a3
1962	LDF	[BO + 42 * SIZE], a4
1963	LDF	[BO + 41 * SIZE], b1
1964	LDF	[BO + 40 * SIZE], b2
1965
1966	FMUL	a1, c11, c11
1967
1968	FNMSUB	(aa2, cc11, cc09, cc09)
1969	FNMSUB	(aa3, cc11, cc07, cc07)
1970	FNMSUB	(aa4, cc11, cc05, cc05)
1971	FNMSUB	(bb1, cc11, cc03, cc03)
1972	FNMSUB	(bb2, cc11, cc01, cc01)
1973
1974	LDF	[BO + 36 * SIZE], a1
1975	LDF	[BO + 35 * SIZE], a2
1976	LDF	[BO + 34 * SIZE], a3
1977	LDF	[BO + 33 * SIZE], a4
1978	LDF	[BO + 32 * SIZE], b1
1979
1980	FMUL	a1, c09, c09
1981
1982	FNMSUB	(aa2, cc09, cc07, cc07)
1983	FNMSUB	(aa3, cc09, cc05, cc05)
1984	FNMSUB	(aa4, cc09, cc03, cc03)
1985	FNMSUB	(bb1, cc09, cc01, cc01)
1986
1987	LDF	[BO + 27 * SIZE], a1
1988	LDF	[BO + 26 * SIZE], a2
1989	LDF	[BO + 25 * SIZE], a3
1990	LDF	[BO + 24 * SIZE], a4
1991
1992	FMUL	a1, c07, c07
1993
1994	FNMSUB	(aa2, cc07, cc05, cc05)
1995	FNMSUB	(aa3, cc07, cc03, cc03)
1996	FNMSUB	(aa4, cc07, cc01, cc01)
1997
1998	LDF	[BO + 18 * SIZE], a1
1999	LDF	[BO + 17 * SIZE], a2
2000	LDF	[BO + 16 * SIZE], a3
2001
2002	FMUL	a1, c05, c05
2003
2004	FNMSUB	(aa2, cc05, cc03, cc03)
2005	FNMSUB	(aa3, cc05, cc01, cc01)
2006
2007	LDF	[BO +  9 * SIZE], a1
2008	LDF	[BO +  8 * SIZE], a2
2009
2010	FMUL	a1, c03, c03
2011
2012	FNMSUB	(aa2, cc03, cc01, cc01)
2013
2014	LDF	[BO +  0 * SIZE], a1
2015
2016	FMUL	a1, c01, c01
2017#endif
2018
	/* LN: step each C pointer back one element before storing. */
2019#ifdef LN
2020	add	C1, -1 * SIZE, C1
2021	add	C2, -1 * SIZE, C2
2022	add	C3, -1 * SIZE, C3
2023	add	C4, -1 * SIZE, C4
2024	add	C5, -1 * SIZE, C5
2025	add	C6, -1 * SIZE, C6
2026	add	C7, -1 * SIZE, C7
2027	add	C8, -1 * SIZE, C8
2028#endif
2029
	/* Write the results back to the buffer they were loaded from above:
	   BO for LN/LT, AO for RN/RT. */
2030#if defined(LN) || defined(LT)
2031	STF	c01, [BO +  0 * SIZE]
2032	STF	c03, [BO +  1 * SIZE]
2033	STF	c05, [BO +  2 * SIZE]
2034	STF	c07, [BO +  3 * SIZE]
2035
2036	STF	c09, [BO +  4 * SIZE]
2037	STF	c11, [BO +  5 * SIZE]
2038	STF	c13, [BO +  6 * SIZE]
2039	STF	c15, [BO +  7 * SIZE]
2040#else
2041	STF	c01, [AO +  0 * SIZE]
2042	STF	c03, [AO +  1 * SIZE]
2043	STF	c05, [AO +  2 * SIZE]
2044	STF	c07, [AO +  3 * SIZE]
2045
2046	STF	c09, [AO +  4 * SIZE]
2047	STF	c11, [AO +  5 * SIZE]
2048	STF	c13, [AO +  6 * SIZE]
2049	STF	c15, [AO +  7 * SIZE]
2050#endif
2051
	/* One result per C pointer.  Unlike the two-element case, the C
	   pointers are not advanced afterwards here. */
2052	STF	c01, [C1 + 0 * SIZE]
2053	STF	c03, [C2 + 0 * SIZE]
2054	STF	c05, [C3 + 0 * SIZE]
2055	STF	c07, [C4 + 0 * SIZE]
2056
2057	STF	c09, [C5 + 0 * SIZE]
2058	STF	c11, [C6 + 0 * SIZE]
2059	STF	c13, [C7 + 0 * SIZE]
2060	STF	c15, [C8 + 0 * SIZE]
2061
	/* RT: advance AORIG by K << BASE_SHIFT. */
2062#ifdef RT
2063	sll	K, BASE_SHIFT + 0, TEMP1
2064	add	AORIG, TEMP1, AORIG
2065#endif
2066
	/* LT/RN: skip the remaining (K - KK) steps in both buffers:
	   AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (BASE_SHIFT + 3). */
2067#if defined(LT) || defined(RN)
2068	sub	K, KK, TEMP1
2069	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2070	sll	TEMP1, BASE_SHIFT + 3, TEMP1
2071	add	AO, TEMP2, AO
2072	add	BO, TEMP1, BO
2073#endif
2074
2075#ifdef LT
2076	add	KK, 1, KK
2077#endif
2078
2079#ifdef LN
2080	sub	KK, 1, KK
2081#endif
2082	.align 4
2083
/* .LL29: end of one pass of the J loop that uses C1..C8.  Updates B and KK
   according to the variant, decrements J and loops back to .LL11 (defined
   outside this chunk) while J > 0. */
2084.LL29:
	/* LN: B += K << (BASE_SHIFT + 3). */
2085#ifdef LN
2086	sll	K, BASE_SHIFT + 3, TEMP1
2087	add	B, TEMP1, B
2088#endif
2089
	/* LT/RN: B takes over the current BO position. */
2090#if defined(LT) || defined(RN)
2091	mov	BO, B
2092#endif
2093
2094#ifdef RN
2095	add	KK, 8, KK
2096#endif
2097
2098#ifdef RT
2099	sub	KK, 8, KK
2100#endif
2101
2102	add	J, -1, J
2103	cmp	J, 0
2104	bg,pt	%icc, .LL11
2105	nop
2106	.align 4
2107
/* .LL30: entry to the pass that uses four C pointers (C1..C4).  Runs only
   when bit 2 of N is set (N & 4); otherwise branches to .LL50 (outside
   this chunk). */
2108.LL30:
2109	and	N, 4, J
2110	cmp	J, 0
2111	ble,pn	%icc, .LL50
2112	nop
2113
	/* RT: move B back by K << (BASE_SHIFT + 2). */
2114#ifdef RT
2115	sll	K, BASE_SHIFT + 2, TEMP1
2116	sub	B, TEMP1, B
2117#endif
2118
	/* Non-RT: C1..C4 = C, C + LDC, C + 2*LDC, C + 3*LDC, and C moves
	   forward by 4*LDC.
	   RT: C4..C1 are laid out going backwards from C in steps of LDC; the
	   final "sub C2, LDC, C" gives C the same value as C1. */
2119#ifndef RT
2120	mov	C,  C1
2121	add	C,  LDC, C2
2122	add	C2, LDC, C3
2123	add	C3, LDC, C4
2124	add	C4, LDC, C
2125#else
2126	sub	C,  LDC, C4
2127	sub	C4, LDC, C3
2128	sub	C3, LDC, C2
2129	sub	C2, LDC, C1
2130	sub	C2, LDC, C
2131#endif
2132
	/* Initial KK: M + OFFSET for LN, OFFSET for LT.  RN/RT do not set KK
	   here. */
2133#ifdef LN
2134	add	M, OFFSET, KK
2135#endif
2136
2137#ifdef LT
2138	mov	OFFSET, KK
2139#endif
2140
2141#if defined(LN) || defined(RT)
2142	mov	A, AORIG
2143#else
2144	mov	A, AO
2145#endif
2146
	/* I = M >> 1: number of two-element iterations (.LL32).  With none,
	   go straight to the odd-leftover handling at .LL40. */
2147	sra	M, 1, I
2148	cmp	I, 0
2149	ble,pn	%icc, .LL40
2150	nop
2151	.align 4
2152
/* .LL32: top of the I loop for the two-element, four-C-pointer case.  Sets
   AO/BO, preloads operands, clears accumulators cc01..cc08 and computes
   the pass count L for the unrolled loop at .LL33. */
2153.LL32:
	/* LT/RN: BO restarts at B.
	   LN/RT: AO = AORIG + (KK << (BASE_SHIFT + 1)),
	          BO = B + (KK << (BASE_SHIFT + 2));
	   for LN, AORIG is first moved back by K << (BASE_SHIFT + 1). */
2154#if defined(LT) || defined(RN)
2155	mov	B, BO
2156#else
2157#ifdef LN
2158	sll	K,  BASE_SHIFT + 1, TEMP1
2159	sub	AORIG, TEMP1, AORIG
2160#endif
2161
2162	sll	KK, BASE_SHIFT + 1, TEMP1
2163	sll	KK, BASE_SHIFT + 2, TEMP2
2164
2165	add	AORIG, TEMP1, AO
2166	add	B,     TEMP2, BO
2167#endif
2168
2169	LDF	[AO +  0 * SIZE], a1
2170	LDF	[AO +  1 * SIZE], a2
2171
2172	LDF	[BO +  0 * SIZE], b1
2173	LDF	[BO +  1 * SIZE], b2
2174	LDF	[BO +  2 * SIZE], b3
2175	LDF	[BO +  3 * SIZE], b4
2176	LDF	[BO +  4 * SIZE], b5
2177
2178	LDF	[BO +  5 * SIZE], b6
2179	FCLR	(cc01)
2180	LDF	[BO +  6 * SIZE], b7
2181	FCLR	(cc02)
2182	LDF	[BO +  7 * SIZE], b8
2183	FCLR	(cc03)
2184	LDF	[BO +  8 * SIZE], b9
2185	FCLR	(cc04)
2186
	/* Prefetch the destination locations in C1..C4 while clearing the
	   remaining accumulators. */
2187	prefetch [C1 + 2 * SIZE], 3
2188	FCLR	(cc05)
2189	prefetch [C2 + 2 * SIZE], 3
2190	FCLR	(cc06)
2191	prefetch [C3 + 2 * SIZE], 3
2192	FCLR	(cc07)
2193	prefetch [C4 + 2 * SIZE], 3
2194	FCLR	(cc08)
2195
	/* L = KK >> 2 (LT/RN) or (K - KK) >> 2 (LN/RT). */
2196#if defined(LT) || defined(RN)
2197	sra	KK, 2, L
2198#else
2199	sub	K, KK, L
2200	sra	L,  2, L
2201#endif
2202	cmp	L,  0
2203	ble,pn	%icc, .LL35
2204	nop
2205	.align 4
2206
/* .LL33: accumulation loop for the two-element, four-C-pointer case,
   unrolled four steps per pass.  Each step multiplies two AO values by
   four BO values into cc01..cc08.  Per pass, AO advances by 8 * SIZE and
   BO by 16 * SIZE; both adds sit in the middle of the body, so loads after
   them use rebased offsets.  The loop counter is decremented and compared
   early; the condition codes set by "cmp L, 0" are consumed by the bg at
   the end of the body. */
2207.LL33:
	/* step 0: a1/a2 with b1..b4 */
2208	FMADD	(aa1, bb1, cc01, cc01)
2209	LDF	[AO +  2 * SIZE], a3
2210	FMADD	(aa2, bb1, cc02, cc02)
2211	LDF	[AO +  3 * SIZE], a4
2212
2213	FMADD	(aa1, bb2, cc03, cc03)
2214	LDF	[BO + 16 * SIZE], b1
2215	FMADD	(aa2, bb2, cc04, cc04)
2216	LDF	[BO +  9 * SIZE], b2
2217
2218	FMADD	(aa1, bb3, cc05, cc05)
2219	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2220	FMADD	(aa2, bb3, cc06, cc06)
2221	add	L, -1, L
2222
2223	FMADD	(aa1, bb4, cc07, cc07)
2224	LDF	[BO + 10 * SIZE], b3
2225	FMADD	(aa2, bb4, cc08, cc08)
2226	LDF	[BO + 11 * SIZE], b4
2227
	/* step 1: a3/a4 with b5..b8 */
2228	FMADD	(aa3, bb5, cc01, cc01)
2229	LDF	[AO +  4 * SIZE], a1
2230	FMADD	(aa4, bb5, cc02, cc02)
2231	LDF	[AO +  5 * SIZE], a2
2232
2233	FMADD	(aa3, bb6, cc03, cc03)
2234	LDF	[BO + 12 * SIZE], b5
2235	FMADD	(aa4, bb6, cc04, cc04)
2236	LDF	[BO + 13 * SIZE], b6
2237
2238	FMADD	(aa3, bb7, cc05, cc05)
2239	cmp	L, 0
2240	FMADD	(aa4, bb7, cc06, cc06)
2241	add	AO,  8 * SIZE, AO
2242
2243	FMADD	(aa3, bb8, cc07, cc07)
2244	LDF	[BO + 14 * SIZE], b7
2245	FMADD	(aa4, bb8, cc08, cc08)
2246	LDF	[BO + 15 * SIZE], b8
2247
	/* step 2: a1/a2 with b9, b2..b4.  AO has already been advanced, so
	   the negative offsets below address the last two values of this
	   pass. */
2248	FMADD	(aa1, bb9, cc01, cc01)
2249	LDF	[AO -  2 * SIZE], a3
2250	FMADD	(aa2, bb9, cc02, cc02)
2251	LDF	[AO -  1 * SIZE], a4
2252
2253	FMADD	(aa1, bb2, cc03, cc03)
2254	LDF	[BO + 24 * SIZE], b9
2255	FMADD	(aa2, bb2, cc04, cc04)
2256	LDF	[BO + 17 * SIZE], b2
2257
2258	FMADD	(aa1, bb3, cc05, cc05)
2259	add	BO, 16 * SIZE, BO
2260	FMADD	(aa2, bb3, cc06, cc06)
2261	nop
2262
2263	FMADD	(aa1, bb4, cc07, cc07)
2264	LDF	[BO +  2 * SIZE], b3
2265	FMADD	(aa2, bb4, cc08, cc08)
2266	LDF	[BO +  3 * SIZE], b4
2267
	/* step 3: a3/a4 with b5..b8; the loads fetch operands for the next
	   pass relative to the advanced AO/BO. */
2268	FMADD	(aa3, bb5, cc01, cc01)
2269	LDF	[AO +  0 * SIZE], a1
2270	FMADD	(aa4, bb5, cc02, cc02)
2271	LDF	[AO +  1 * SIZE], a2
2272	FMADD	(aa3, bb6, cc03, cc03)
2273	LDF	[BO +  4 * SIZE], b5
2274	FMADD	(aa4, bb6, cc04, cc04)
2275	LDF	[BO +  5 * SIZE], b6
2276
2277	FMADD	(aa3, bb7, cc05, cc05)
2278	nop
2279	FMADD	(aa4, bb7, cc06, cc06)
2280	LDF	[BO +  6 * SIZE], b7
2281
2282	FMADD	(aa3, bb8, cc07, cc07)
2283	FMADD	(aa4, bb8, cc08, cc08)
2284	bg,pt	%icc, .LL33
	/* Delay slot: last preload for the next pass. */
2285	LDF	[BO +  7 * SIZE], b8
2286	.align 4
2287
/* .LL35: number of leftover single steps after .LL33:
   L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT).  Skips to .LL38 when there
   are none. */
2288.LL35:
2289#if defined(LT) || defined(RN)
2290	and	KK, 3, L
2291#else
2292	sub	K, KK, L
2293	and	L,  3, L
2294#endif
2295	cmp	L,  0
2296	ble,a,pn %icc, .LL38
2297	nop
2298	.align 4
2299
/* .LL37: remainder loop for the two-element, four-C-pointer case, one step
   per iteration: a1/a2 times b1..b4 into cc01..cc08.  AO advances by
   2 * SIZE mid-body (so the a1/a2 reloads use offsets 0 and 1), and BO
   advances by 4 * SIZE in the branch delay slot (so the b1..b4 reloads use
   offsets 4..7 of the old BO). */
2300.LL37:
2301	FMADD	(aa1, bb1, cc01, cc01)
2302	add	L, -1, L
2303	FMADD	(aa2, bb1, cc02, cc02)
2304	LDF	[BO + 4 * SIZE], b1
2305
2306	FMADD	(aa1, bb2, cc03, cc03)
2307	add	AO, 2 * SIZE, AO
2308	FMADD	(aa2, bb2, cc04, cc04)
2309	LDF	[BO + 5 * SIZE], b2
2310
2311	FMADD	(aa1, bb3, cc05, cc05)
2312	cmp	L, 0
2313	FMADD	(aa2, bb3, cc06, cc06)
2314	LDF	[BO + 6 * SIZE], b3
2315
2316	FMADD	(aa1, bb4, cc07, cc07)
2317	LDF	[AO + 0 * SIZE], a1
2318	FMADD	(aa2, bb4, cc08, cc08)
2319	LDF	[AO + 1 * SIZE], a2
2320
2321	LDF	[BO + 7 * SIZE], b4
2322	bg,pt	%icc, .LL37
2323	add	BO, 4 * SIZE, BO
2324	.align 4
2325
/* .LL38: post-accumulation step for the two-element, four-C-pointer case
   (accumulators cc01..cc08).  Sequence: reposition AO/BO (LN/RT only),
   combine the accumulators with eight stored values, apply the
   per-variant multiply/update sequence, write the results to the packed
   buffer and to C1..C4, advance pointers and KK, then loop to .LL32 while
   I > 0.
   NOTE(review): presumably the triangular-solve step of a TRSM kernel;
   inferred from the LN/LT/RN/RT variants, not shown in this chunk. */
2326.LL38:
	/* LN/RT: TEMP1 = KK - 2 (LN) or KK - 4 (RT);
	   AO = AORIG + (TEMP1 << (BASE_SHIFT + 1)),
	   BO = B + (TEMP1 << (BASE_SHIFT + 2)). */
2327#if defined(LN) || defined(RT)
2328#ifdef LN
2329	sub	KK, 2, TEMP1
2330#else
2331	sub	KK, 4, TEMP1
2332#endif
2333	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2334	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2335
2336	add	AORIG, TEMP2, AO
2337	add	B,     TEMP1, BO
2338#endif
2339
	/* Load eight stored values and FSUB them with the accumulators.  The
	   pairing differs by variant: from BO (LN/LT) the order is
	   c01,c03,c05,c07 then c02,c04,c06,c08; from AO (RN/RT) it is
	   c01..c08 in sequence.
	   NOTE(review): FSUB is a macro defined outside this chunk; presumably
	   "FSUB x, c, c" yields c = x - c -- confirm against its definition. */
2340#if defined(LN) || defined(LT)
2341	LDF	[BO +  0 * SIZE], a1
2342	LDF	[BO +  1 * SIZE], a2
2343	LDF	[BO +  2 * SIZE], a3
2344	LDF	[BO +  3 * SIZE], a4
2345
2346	LDF	[BO +  4 * SIZE], b1
2347	LDF	[BO +  5 * SIZE], b2
2348	LDF	[BO +  6 * SIZE], b3
2349	LDF	[BO +  7 * SIZE], b4
2350
2351	FSUB	a1, c01, c01
2352	FSUB	a2, c03, c03
2353	FSUB	a3, c05, c05
2354	FSUB	a4, c07, c07
2355
2356	FSUB	b1, c02, c02
2357	FSUB	b2, c04, c04
2358	FSUB	b3, c06, c06
2359	FSUB	b4, c08, c08
2360#else
2361	LDF	[AO +  0 * SIZE], a1
2362	LDF	[AO +  1 * SIZE], a2
2363	LDF	[AO +  2 * SIZE], a3
2364	LDF	[AO +  3 * SIZE], a4
2365
2366	LDF	[AO +  4 * SIZE], b1
2367	LDF	[AO +  5 * SIZE], b2
2368	LDF	[AO +  6 * SIZE], b3
2369	LDF	[AO +  7 * SIZE], b4
2370
2371	FSUB	a1, c01, c01
2372	FSUB	a2, c02, c02
2373	FSUB	a3, c03, c03
2374	FSUB	a4, c04, c04
2375
2376	FSUB	b1, c05, c05
2377	FSUB	b2, c06, c06
2378	FSUB	b3, c07, c07
2379	FSUB	b4, c08, c08
2380
2381#endif
2382
	/* LN: uses AO[3], AO[2], AO[0].  Scale the even accumulators by AO[3],
	   update the odd ones with AO[2], then scale the odd ones by AO[0]. */
2383#ifdef LN
2384	LDF	[AO +  3 * SIZE], a1
2385	LDF	[AO +  2 * SIZE], a2
2386	LDF	[AO +  0 * SIZE], a3
2387
2388	FMUL	a1, c02, c02
2389	FMUL	a1, c04, c04
2390	FMUL	a1, c06, c06
2391	FMUL	a1, c08, c08
2392
2393	FNMSUB	(aa2, cc02, cc01, cc01)
2394	FNMSUB	(aa2, cc04, cc03, cc03)
2395	FNMSUB	(aa2, cc06, cc05, cc05)
2396	FNMSUB	(aa2, cc08, cc07, cc07)
2397
2398	FMUL	a3, c01, c01
2399	FMUL	a3, c03, c03
2400	FMUL	a3, c05, c05
2401	FMUL	a3, c07, c07
2402#endif
2403
	/* LT: uses AO[0], AO[1], AO[3].  Scale the odd accumulators by AO[0],
	   update the even ones with AO[1], then scale the even ones by
	   AO[3]. */
2404#ifdef LT
2405	LDF	[AO +  0 * SIZE], a1
2406	LDF	[AO +  1 * SIZE], a2
2407	LDF	[AO +  3 * SIZE], a3
2408
2409	FMUL	a1, c01, c01
2410	FMUL	a1, c03, c03
2411	FMUL	a1, c05, c05
2412	FMUL	a1, c07, c07
2413
2414	FNMSUB	(aa2, cc01, cc02, cc02)
2415	FNMSUB	(aa2, cc03, cc04, cc04)
2416	FNMSUB	(aa2, cc05, cc06, cc06)
2417	FNMSUB	(aa2, cc07, cc08, cc08)
2418
2419	FMUL	a3, c02, c02
2420	FMUL	a3, c04, c04
2421	FMUL	a3, c06, c06
2422	FMUL	a3, c08, c08
2423#endif
2424
	/* RN: forward pass over the 16 values at BO, working on accumulator
	   pairs (c01,c02) -> (c07,c08).  Each stage scales one pair by
	   BO[5 * i] and updates the later pairs with the following entries
	   of that group. */
2425#ifdef RN
2426	LDF	[BO +  0 * SIZE], a1
2427	LDF	[BO +  1 * SIZE], a2
2428	LDF	[BO +  2 * SIZE], a3
2429	LDF	[BO +  3 * SIZE], a4
2430
2431	FMUL	a1, c01, c01
2432	FMUL	a1, c02, c02
2433
2434	FNMSUB	(aa2, cc01, cc03, cc03)
2435	FNMSUB	(aa2, cc02, cc04, cc04)
2436	FNMSUB	(aa3, cc01, cc05, cc05)
2437	FNMSUB	(aa3, cc02, cc06, cc06)
2438	FNMSUB	(aa4, cc01, cc07, cc07)
2439	FNMSUB	(aa4, cc02, cc08, cc08)
2440
2441	LDF	[BO +  5 * SIZE], a1
2442	LDF	[BO +  6 * SIZE], a2
2443	LDF	[BO +  7 * SIZE], a3
2444
2445	FMUL	a1, c03, c03
2446	FMUL	a1, c04, c04
2447
2448	FNMSUB	(aa2, cc03, cc05, cc05)
2449	FNMSUB	(aa2, cc04, cc06, cc06)
2450	FNMSUB	(aa3, cc03, cc07, cc07)
2451	FNMSUB	(aa3, cc04, cc08, cc08)
2452
2453	LDF	[BO + 10 * SIZE], a1
2454	LDF	[BO + 11 * SIZE], a2
2455
2456	FMUL	a1, c05, c05
2457	FMUL	a1, c06, c06
2458
2459	FNMSUB	(aa2, cc05, cc07, cc07)
2460	FNMSUB	(aa2, cc06, cc08, cc08)
2461
2462	LDF	[BO + 15 * SIZE], a1
2463
2464	FMUL	a1, c07, c07
2465	FMUL	a1, c08, c08
2466#endif
2467
	/* RT: the same scheme run backwards, (c07,c08) -> (c01,c02), starting
	   at BO[15] and reading each group in descending order. */
2468#ifdef RT
2469	LDF	[BO + 15 * SIZE], a1
2470	LDF	[BO + 14 * SIZE], a2
2471	LDF	[BO + 13 * SIZE], a3
2472	LDF	[BO + 12 * SIZE], a4
2473
2474	FMUL	a1, c08, c08
2475	FMUL	a1, c07, c07
2476
2477	FNMSUB	(aa2, cc08, cc06, cc06)
2478	FNMSUB	(aa2, cc07, cc05, cc05)
2479	FNMSUB	(aa3, cc08, cc04, cc04)
2480	FNMSUB	(aa3, cc07, cc03, cc03)
2481	FNMSUB	(aa4, cc08, cc02, cc02)
2482	FNMSUB	(aa4, cc07, cc01, cc01)
2483
2484	LDF	[BO + 10 * SIZE], a1
2485	LDF	[BO +  9 * SIZE], a2
2486	LDF	[BO +  8 * SIZE], a3
2487
2488	FMUL	a1, c06, c06
2489	FMUL	a1, c05, c05
2490
2491	FNMSUB	(aa2, cc06, cc04, cc04)
2492	FNMSUB	(aa2, cc05, cc03, cc03)
2493	FNMSUB	(aa3, cc06, cc02, cc02)
2494	FNMSUB	(aa3, cc05, cc01, cc01)
2495
2496	LDF	[BO +  5 * SIZE], a1
2497	LDF	[BO +  4 * SIZE], a2
2498
2499	FMUL	a1, c04, c04
2500	FMUL	a1, c03, c03
2501
2502	FNMSUB	(aa2, cc04, cc02, cc02)
2503	FNMSUB	(aa2, cc03, cc01, cc01)
2504
2505	LDF	[BO +  0 * SIZE], a1
2506
2507	FMUL	a1, c02, c02
2508	FMUL	a1, c01, c01
2509#endif
2510
	/* LN: step each C pointer back two elements before storing. */
2511#ifdef LN
2512	add	C1, -2 * SIZE, C1
2513	add	C2, -2 * SIZE, C2
2514	add	C3, -2 * SIZE, C3
2515	add	C4, -2 * SIZE, C4
2516#endif
2517
	/* Write the results back to the buffer they were loaded from above,
	   using the same ordering as the loads: BO for LN/LT, AO for RN/RT. */
2518#if defined(LN) || defined(LT)
2519	STF	c01, [BO +  0 * SIZE]
2520	STF	c03, [BO +  1 * SIZE]
2521	STF	c05, [BO +  2 * SIZE]
2522	STF	c07, [BO +  3 * SIZE]
2523
2524	STF	c02, [BO +  4 * SIZE]
2525	STF	c04, [BO +  5 * SIZE]
2526	STF	c06, [BO +  6 * SIZE]
2527	STF	c08, [BO +  7 * SIZE]
2528#else
2529	STF	c01, [AO +  0 * SIZE]
2530	STF	c02, [AO +  1 * SIZE]
2531	STF	c03, [AO +  2 * SIZE]
2532	STF	c04, [AO +  3 * SIZE]
2533
2534	STF	c05, [AO +  4 * SIZE]
2535	STF	c06, [AO +  5 * SIZE]
2536	STF	c07, [AO +  6 * SIZE]
2537	STF	c08, [AO +  7 * SIZE]
2538#endif
2539
	/* Two results per C pointer. */
2540	STF	c01, [C1 + 0 * SIZE]
2541	STF	c02, [C1 + 1 * SIZE]
2542	STF	c03, [C2 + 0 * SIZE]
2543	STF	c04, [C2 + 1 * SIZE]
2544
2545	STF	c05, [C3 + 0 * SIZE]
2546	STF	c06, [C3 + 1 * SIZE]
2547	STF	c07, [C4 + 0 * SIZE]
2548	STF	c08, [C4 + 1 * SIZE]
2549
	/* All variants except LN move the C pointers forward past the two
	   elements just written (LN already stepped back before the store). */
2550#ifndef LN
2551	add	C1, 2 * SIZE, C1
2552	add	C2, 2 * SIZE, C2
2553	add	C3, 2 * SIZE, C3
2554	add	C4, 2 * SIZE, C4
2555#endif
2556
	/* RT: advance AORIG by K << (BASE_SHIFT + 1). */
2557#ifdef RT
2558	sll	K, BASE_SHIFT + 1, TEMP1
2559	add	AORIG, TEMP1, AORIG
2560#endif
2561
	/* LT/RN: skip the remaining (K - KK) steps in both buffers:
	   AO += (K - KK) << (BASE_SHIFT + 1),
	   BO += (K - KK) << (BASE_SHIFT + 2). */
2562#if defined(LT) || defined(RN)
2563	sub	K, KK, TEMP1
2564	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2565	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2566	add	AO, TEMP2, AO
2567	add	BO, TEMP1, BO
2568#endif
2569
2570#ifdef LT
2571	add	KK, 2, KK
2572#endif
2573
2574#ifdef LN
2575	sub	KK, 2, KK
2576#endif
2577
2578	add	I, -1, I
2579	cmp	I, 0
2580	bg,pt	%icc, .LL32
2581	nop
2582
/* .LL40: set-up for the odd leftover in M (M & 1) of the four-C-pointer
   pass.  Branches to .LL49 (outside this chunk) when M is even. */
2583.LL40:
2584	and	M, 1, I
2585	cmp	I, 0
2586	ble,pn	%icc, .LL49
2587	nop
2588
	/* LT/RN: BO restarts at B.
	   LN/RT: AO = AORIG + (KK << BASE_SHIFT),
	          BO = B + (KK << (BASE_SHIFT + 2));
	   for LN, AORIG is first moved back by K << BASE_SHIFT. */
2589#if defined(LT) || defined(RN)
2590	mov	B, BO
2591#else
2592#ifdef LN
2593	sll	K,  BASE_SHIFT + 0, TEMP1
2594	sub	AORIG, TEMP1, AORIG
2595#endif
2596
2597	sll	KK, BASE_SHIFT + 0, TEMP1
2598	sll	KK, BASE_SHIFT + 2, TEMP2
2599
2600	add	AORIG, TEMP1, AO
2601	add	B,     TEMP2, BO
2602#endif
2603
	/* Preload four AO values and nine BO values; clear the four
	   accumulators cc01, cc03, cc05, cc07. */
2604	LDF	[AO +  0 * SIZE], a1
2605	LDF	[AO +  1 * SIZE], a2
2606	LDF	[AO +  2 * SIZE], a3
2607	LDF	[AO +  3 * SIZE], a4
2608
2609	LDF	[BO +  0 * SIZE], b1
2610	LDF	[BO +  1 * SIZE], b2
2611	LDF	[BO +  2 * SIZE], b3
2612	LDF	[BO +  3 * SIZE], b4
2613	LDF	[BO +  4 * SIZE], b5
2614	LDF	[BO +  5 * SIZE], b6
2615	FCLR	(cc01)
2616	LDF	[BO +  6 * SIZE], b7
2617	FCLR	(cc03)
2618	LDF	[BO +  7 * SIZE], b8
2619	FCLR	(cc05)
2620	LDF	[BO +  8 * SIZE], b9
2621	FCLR	(cc07)
2622
	/* L = KK >> 2 (LT/RN) or (K - KK) >> 2 (LN/RT): passes of .LL43. */
2623#if defined(LT) || defined(RN)
2624	sra	KK, 2, L
2625#else
2626	sub	K, KK, L
2627	sra	L,  2, L
2628#endif
2629	cmp	L,  0
2630	ble,pn	%icc, .LL45
2631	nop
2632
/* .LL43: accumulation loop for the single-element, four-C-pointer case,
   unrolled four steps per pass.  a1..a4 are each multiplied by four BO
   values into cc01/cc03/cc05/cc07.  Per pass, AO advances by 4 * SIZE and
   BO by 16 * SIZE; both adds sit mid-body, so the loads that follow them
   use rebased offsets.  "cmp L, 0" is issued early and its condition codes
   are consumed by the bg at the end. */
2633.LL43:
2634	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2635	add	L, -1, L
2636
	/* step 0: a1 with b1..b4 */
2637	FMADD	(aa1, bb1, cc01, cc01)
2638	LDF	[BO + 16 * SIZE], b1
2639	FMADD	(aa1, bb2, cc03, cc03)
2640	LDF	[BO +  9 * SIZE], b2
2641	FMADD	(aa1, bb3, cc05, cc05)
2642	LDF	[BO + 10 * SIZE], b3
2643	FMADD	(aa1, bb4, cc07, cc07)
2644	LDF	[BO + 11 * SIZE], b4
2645
2646	LDF	[AO +  4 * SIZE], a1
2647	cmp	L, 0
2648
	/* step 1: a2 with b5..b8 */
2649	FMADD	(aa2, bb5, cc01, cc01)
2650	LDF	[BO + 12 * SIZE], b5
2651	FMADD	(aa2, bb6, cc03, cc03)
2652	LDF	[BO + 13 * SIZE], b6
2653	FMADD	(aa2, bb7, cc05, cc05)
2654	LDF	[BO + 14 * SIZE], b7
2655	FMADD	(aa2, bb8, cc07, cc07)
2656	LDF	[BO + 15 * SIZE], b8
2657
2658	LDF	[AO +  5 * SIZE], a2
2659	add	AO,  4 * SIZE, AO
2660
	/* step 2: a3 with b9, b2..b4 */
2661	FMADD	(aa3, bb9, cc01, cc01)
2662	LDF	[BO + 24 * SIZE], b9
2663	FMADD	(aa3, bb2, cc03, cc03)
2664	LDF	[BO + 17 * SIZE], b2
2665	FMADD	(aa3, bb3, cc05, cc05)
2666	LDF	[BO + 18 * SIZE], b3
2667	FMADD	(aa3, bb4, cc07, cc07)
2668	LDF	[BO + 19 * SIZE], b4
2669
2670	LDF	[AO +  2 * SIZE], a3
2671	add	BO, 16 * SIZE, BO
2672
	/* step 3: a4 with b5..b8; the loads fetch operands for the next
	   pass relative to the advanced BO. */
2673	FMADD	(aa4, bb5, cc01, cc01)
2674	LDF	[BO +  4 * SIZE], b5
2675	FMADD	(aa4, bb6, cc03, cc03)
2676	LDF	[BO +  5 * SIZE], b6
2677	FMADD	(aa4, bb7, cc05, cc05)
2678	LDF	[BO +  6 * SIZE], b7
2679	FMADD	(aa4, bb8, cc07, cc07)
2680	LDF	[BO +  7 * SIZE], b8
2681
2682	bg,pt	%icc, .LL43
	/* Delay slot: reload a4 for the next pass. */
2683	LDF	[AO +  3 * SIZE], a4
2684	.align 4
2685
/* .LL45: number of leftover single steps after .LL43:
   L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT).  Skips to .LL48 when there
   are none. */
2686.LL45:
2687#if defined(LT) || defined(RN)
2688	and	KK, 3, L
2689#else
2690	sub	K, KK, L
2691	and	L,  3, L
2692#endif
2693	cmp	L,  0
2694	ble,a,pn %icc, .LL48
2695	nop
2696	.align 4
2697
/* .LL47: remainder loop for the single-element, four-C-pointer case, one
   step per iteration: a1 times b1..b4 into cc01/cc03/cc05/cc07.  AO
   advances by 1 * SIZE and BO by 4 * SIZE inside the body; the next a1 is
   loaded in the branch delay slot from the already-advanced AO. */
2698.LL47:
2699	FMADD	(aa1, bb1, cc01, cc01)
2700	LDF	[BO + 4 * SIZE], b1
2701	add	L, -1, L
2702	FMADD	(aa1, bb2, cc03, cc03)
2703	LDF	[BO + 5 * SIZE], b2
2704	add	AO, 1 * SIZE, AO
2705
2706	FMADD	(aa1, bb3, cc05, cc05)
2707	LDF	[BO + 6 * SIZE], b3
2708	cmp	L, 0
2709	FMADD	(aa1, bb4, cc07, cc07)
2710	LDF	[BO + 7 * SIZE], b4
2711	add	BO, 4 * SIZE, BO
2712
2713	bg,pt	%icc, .LL47
2714	LDF	[AO + 0 * SIZE], a1
2715	.align 4
2716
/* .LL48: finish the 1-row x 4-column tile.  The steps below are selected at
 * preprocessing time by LN / LT / RN / RT:
 *   1. LN/RT only: re-point AO and BO at an offset of KK - 1 (LN) or
 *      KK - 4 (RT) k-steps from AORIG and B (1 element per k-step in A,
 *      4 elements per k-step in B).
 *   2. Load four values from the packed buffer (BO for LN/LT, AO for RN/RT)
 *      and combine them with the accumulators through FSUB.
 *   3. Apply the substitution using the coefficients stored at AO or BO.
 *   4. Write the results back to the packed buffer and to C1..C4.
 *   5. Advance AORIG / AO / BO / KK for the next tile.
 * FSUB, FMUL and FNMSUB are macros defined outside this chunk; presumably
 * FSUB a, c, c computes c = a - c and FNMSUB (a, b, c, d) computes
 * d = c - a * b -- confirm against their definitions.  BASE_SHIFT is
 * presumably log2 of the element size in bytes -- confirm.
 */
2717.LL48:
2718#if defined(LN) || defined(RT)
2719#ifdef LN
2720	sub	KK, 1, TEMP1
2721#else
2722	sub	KK, 4, TEMP1
2723#endif
/* TEMP2 = byte offset into A (x1 element), TEMP1 = byte offset into B (x4). */
2724	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2725	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2726
2727	add	AORIG, TEMP2, AO
2728	add	B,     TEMP1, BO
2729#endif
2730
2731#if defined(LN) || defined(LT)
2732	LDF	[BO +  0 * SIZE], a1
2733	LDF	[BO +  1 * SIZE], a2
2734	LDF	[BO +  2 * SIZE], a3
2735	LDF	[BO +  3 * SIZE], a4
2736
2737	FSUB	a1, c01, c01
2738	FSUB	a2, c03, c03
2739	FSUB	a3, c05, c05
2740	FSUB	a4, c07, c07
2741#else
2742	LDF	[AO +  0 * SIZE], a1
2743	LDF	[AO +  1 * SIZE], a2
2744	LDF	[AO +  2 * SIZE], a3
2745	LDF	[AO +  3 * SIZE], a4
2746
2747	FSUB	a1, c01, c01
2748	FSUB	a2, c03, c03
2749	FSUB	a3, c05, c05
2750	FSUB	a4, c07, c07
2751#endif
2752
/* LN/LT: only one row is present, so the step is a single scaling by the
 * coefficient at AO + 0.  It is applied with FMUL, so it is presumably
 * stored pre-inverted -- confirm against the packing routine. */
2753#if defined(LN) || defined(LT)
2754	LDF	[AO +  0 * SIZE], a1
2755
2756	FMUL	a1, c01, c01
2757	FMUL	a1, c03, c03
2758	FMUL	a1, c05, c05
2759	FMUL	a1, c07, c07
2760#endif
2761
/* RN: sweep from column 1 to column 4 using BO entries 0..3, 5..7, 10..11
 * and 15, i.e. the upper triangle of a 4x4 block indexed row by row. */
2762#ifdef RN
2763	LDF	[BO +  0 * SIZE], a1
2764	LDF	[BO +  1 * SIZE], a2
2765	LDF	[BO +  2 * SIZE], a3
2766	LDF	[BO +  3 * SIZE], a4
2767
2768	FMUL	a1, c01, c01
2769
2770	FNMSUB	(aa2, cc01, cc03, cc03)
2771	FNMSUB	(aa3, cc01, cc05, cc05)
2772	FNMSUB	(aa4, cc01, cc07, cc07)
2773
2774	LDF	[BO +  5 * SIZE], a1
2775	LDF	[BO +  6 * SIZE], a2
2776	LDF	[BO +  7 * SIZE], a3
2777
2778	FMUL	a1, c03, c03
2779
2780	FNMSUB	(aa2, cc03, cc05, cc05)
2781	FNMSUB	(aa3, cc03, cc07, cc07)
2782
2783	LDF	[BO + 10 * SIZE], a1
2784	LDF	[BO + 11 * SIZE], a2
2785
2786	FMUL	a1, c05, c05
2787
2788	FNMSUB	(aa2, cc05, cc07, cc07)
2789
2790	LDF	[BO + 15 * SIZE], a1
2791
2792	FMUL	a1, c07, c07
2793#endif
2794
/* RT: sweep from column 4 back to column 1 using BO entries 15..12, 10..8,
 * 5..4 and 0, i.e. the lower triangle of a 4x4 block indexed row by row. */
2795#ifdef RT
2796	LDF	[BO + 15 * SIZE], a1
2797	LDF	[BO + 14 * SIZE], a2
2798	LDF	[BO + 13 * SIZE], a3
2799	LDF	[BO + 12 * SIZE], a4
2800
2801	FMUL	a1, c07, c07
2802
2803	FNMSUB	(aa2, cc07, cc05, cc05)
2804	FNMSUB	(aa3, cc07, cc03, cc03)
2805	FNMSUB	(aa4, cc07, cc01, cc01)
2806
2807	LDF	[BO + 10 * SIZE], a1
2808	LDF	[BO +  9 * SIZE], a2
2809	LDF	[BO +  8 * SIZE], a3
2810
2811	FMUL	a1, c05, c05
2812
2813	FNMSUB	(aa2, cc05, cc03, cc03)
2814	FNMSUB	(aa3, cc05, cc01, cc01)
2815
2816	LDF	[BO +  5 * SIZE], a1
2817	LDF	[BO +  4 * SIZE], a2
2818
2819	FMUL	a1, c03, c03
2820
2821	FNMSUB	(aa2, cc03, cc01, cc01)
2822
2823	LDF	[BO +  0 * SIZE], a1
2824
2825	FMUL	a1, c01, c01
2826#endif
2827
/* LN: step the four column pointers back by one element before storing. */
2828#ifdef LN
2829	add	C1, -1 * SIZE, C1
2830	add	C2, -1 * SIZE, C2
2831	add	C3, -1 * SIZE, C3
2832	add	C4, -1 * SIZE, C4
2833#endif
2834
/* Write the results back into the packed buffer they were loaded from. */
2835#if defined(LN) || defined(LT)
2836	STF	c01, [BO +  0 * SIZE]
2837	STF	c03, [BO +  1 * SIZE]
2838	STF	c05, [BO +  2 * SIZE]
2839	STF	c07, [BO +  3 * SIZE]
2840#else
2841	STF	c01, [AO +  0 * SIZE]
2842	STF	c03, [AO +  1 * SIZE]
2843	STF	c05, [AO +  2 * SIZE]
2844	STF	c07, [AO +  3 * SIZE]
2845#endif
2846
/* One result per column goes to the output pointers C1..C4. */
2847	STF	c01, [C1 + 0 * SIZE]
2848	STF	c03, [C2 + 0 * SIZE]
2849	STF	c05, [C3 + 0 * SIZE]
2850	STF	c07, [C4 + 0 * SIZE]
2851
/* RT: move AORIG forward by K elements (one row of the A panel). */
2852#ifdef RT
2853	sll	K, BASE_SHIFT + 0, TEMP1
2854	add	AORIG, TEMP1, AORIG
2855#endif
2856
/* LT/RN: advance AO by K - KK elements and BO by 4 * (K - KK) elements. */
2857#if defined(LT) || defined(RN)
2858	sub	K, KK, TEMP1
2859	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2860	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2861	add	AO, TEMP2, AO
2862	add	BO, TEMP1, BO
2863#endif
2864
2865#ifdef LT
2866	add	KK, 1, KK
2867#endif
2868
2869#ifdef LN
2870	sub	KK, 1, KK
2871#endif
2872	.align 4
2873
/* .LL49: bookkeeping at the end of the 4-column panel.
 *   LN:    B advances by 4 * K elements.
 *   LT/RN: B takes over the running pointer BO.
 *   RN:    KK grows by 4;  RT: KK shrinks by 4. */
2874.LL49:
2875#ifdef LN
2876	sll	K, BASE_SHIFT + 2, TEMP1
2877	add	B, TEMP1, B
2878#endif
2879
2880#if defined(LT) || defined(RN)
2881	mov	BO, B
2882#endif
2883
2884#ifdef RN
2885	add	KK, 4, KK
2886#endif
2887
2888#ifdef RT
2889	sub	KK, 4, KK
2890#endif
2891	.align 4
2892
/* .LL50: start of the 2-column panel.  It is processed only when bit 1 of N
 * is set; otherwise control goes to .LL70.  This block sets up the column
 * pointers C1 / C2, the starting KK, the A pointer, and the count I = M >> 1
 * of 2-row tiles (skipping to .LL60 when there are none). */
2893.LL50:
2894	and	N, 2, J
2895	cmp	J, 0
2896	ble,pn	%icc, .LL70
2897	nop
2898
/* RT: step B back by 2 * K elements before using it. */
2899#ifdef RT
2900	sll	K, BASE_SHIFT + 1, TEMP1
2901	sub	B, TEMP1, B
2902#endif
2903
/* Non-RT: C1 = C, C2 = C + LDC, then C += 2 * LDC.
 * RT:     C2 = C - LDC, C1 = C - 2 * LDC, then C -= 2 * LDC. */
2904#ifndef RT
2905	mov	C,  C1
2906	add	C,  LDC, C2
2907	add	C2, LDC, C
2908#else
2909	sub	C,  LDC, C2
2910	sub	C2, LDC, C1
2911	sub	C2, LDC, C
2912#endif
2913
2914#ifdef LN
2915	add	M, OFFSET, KK
2916#endif
2917
2918#ifdef LT
2919	mov	OFFSET, KK
2920#endif
2921
2922#if defined(LN) || defined(RT)
2923	mov	A, AORIG
2924#else
2925	mov	A, AO
2926#endif
2927
2928	sra	M, 1, I
2929	cmp	I, 0
2930	ble,pn	%icc, .LL60
2931	nop
2932	.align 4
2933
/* .LL52: preamble of one 2-row x 2-column tile (loop head for counter I).
 * Positions AO / BO, preloads a1..a4 and b1..b8 for the first unrolled pass,
 * clears the accumulators and computes the number L of 4x-unrolled passes:
 * KK >> 2 for LT/RN, (K - KK) >> 2 otherwise.  With L <= 0 the unrolled loop
 * is skipped and control goes to .LL55. */
2934.LL52:
2935#if defined(LT) || defined(RN)
2936	mov	B, BO
2937#else
2938#ifdef LN
2939	sll	K,  BASE_SHIFT + 1, TEMP1
2940	sub	AORIG, TEMP1, AORIG
2941#endif
2942
/* Both panels hold two elements per k-step here, hence the same shift. */
2943	sll	KK, BASE_SHIFT + 1, TEMP1
2944	sll	KK, BASE_SHIFT + 1, TEMP2
2945
2946	add	AORIG, TEMP1, AO
2947	add	B,     TEMP2, BO
2948#endif
2949
2950	LDF	[AO +  0 * SIZE], a1
2951	LDF	[AO +  1 * SIZE], a2
2952	LDF	[AO +  2 * SIZE], a3
2953	LDF	[AO +  3 * SIZE], a4
2954
/* cc05..cc08 are cleared as well, although the 2x2 loops that follow only
 * accumulate into cc01..cc04. */
2955	LDF	[BO +  0 * SIZE], b1
2956	LDF	[BO +  1 * SIZE], b2
2957	LDF	[BO +  2 * SIZE], b3
2958	FCLR	(cc01)
2959	LDF	[BO +  3 * SIZE], b4
2960	FCLR	(cc02)
2961
2962	LDF	[BO +  4 * SIZE], b5
2963	FCLR	(cc03)
2964	LDF	[BO +  5 * SIZE], b6
2965	FCLR	(cc04)
2966	LDF	[BO +  6 * SIZE], b7
2967	FCLR	(cc05)
2968	LDF	[BO +  7 * SIZE], b8
2969	FCLR	(cc06)
2970
/* Prefetch the two output columns; function code 3 is the SPARC V9
 * "one write" prefetch variant. */
2971	prefetch [C1 + 2 * SIZE], 3
2972	FCLR	(cc07)
2973	prefetch [C2 + 2 * SIZE], 3
2974	FCLR	(cc08)
2975
2976#if defined(LT) || defined(RN)
2977	sra	KK, 2, L
2978#else
2979	sub	K, KK, L
2980	sra	L,  2, L
2981#endif
2982	cmp	L,  0
2983	ble,pn	%icc, .LL55
2984	nop
2985	.align 4
2986
/* .LL53: main K loop of the 2x2 tile, four k-steps per pass.
 * Accumulators: cc01 / cc02 take the products with the first B element of a
 * k-step, cc03 / cc04 those with the second; the odd-numbered accumulator of
 * each pair is fed by a1 / a3 and the even one by a2 / a4.  Operands for the
 * following k-steps are reloaded between the FMADDs.  Per pass AO and BO
 * both advance by 8 elements and L is decremented. */
2987.LL53:
/* k-step 0: (a1, a2) with (b1, b2). */
2988	FMADD	(aa1, bb1, cc01, cc01)
2989	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2990	FMADD	(aa2, bb1, cc02, cc02)
2991	LDF	[BO +  8 * SIZE], b1
2992
2993	FMADD	(aa1, bb2, cc03, cc03)
2994	LDF	[AO +  4 * SIZE], a1
2995	FMADD	(aa2, bb2, cc04, cc04)
2996	LDF	[AO +  5 * SIZE], a2
2997
/* k-step 1: (a3, a4) with (b3, b4). */
2998	FMADD	(aa3, bb3, cc01, cc01)
2999	LDF	[BO +  9 * SIZE], b2
3000	FMADD	(aa4, bb3, cc02, cc02)
3001	LDF	[BO + 10 * SIZE], b3
3002
3003	FMADD	(aa3, bb4, cc03, cc03)
3004	LDF	[AO +  6 * SIZE], a3
3005	FMADD	(aa4, bb4, cc04, cc04)
3006	LDF	[AO +  7 * SIZE], a4
3007
/* k-step 2: reloaded (a1, a2) with (b5, b6). */
3008	FMADD	(aa1, bb5, cc01, cc01)
3009	LDF	[BO + 11 * SIZE], b4
3010	FMADD	(aa2, bb5, cc02, cc02)
3011	LDF	[BO + 12 * SIZE], b5
3012
3013	FMADD	(aa1, bb6, cc03, cc03)
3014	LDF	[AO +  8 * SIZE], a1
3015	FMADD	(aa2, bb6, cc04, cc04)
3016	LDF	[AO +  9 * SIZE], a2
3017
/* k-step 3: reloaded (a3, a4) with (b7, b8). */
3018	FMADD	(aa3, bb7, cc01, cc01)
3019	LDF	[BO + 13 * SIZE], b6
3020
3021	FMADD	(aa4, bb7, cc02, cc02)
3022	LDF	[BO + 14 * SIZE], b7
3023
3024	FMADD	(aa3, bb8, cc03, cc03)
3025	LDF	[AO + 10 * SIZE], a3
3026	FMADD	(aa4, bb8, cc04, cc04)
3027	LDF	[AO + 11 * SIZE], a4
3028
3029	add	AO,  8 * SIZE, AO
3030	add	L, -1, L
3031	add	BO,  8 * SIZE, BO
3032	cmp	L, 0
3033
/* Delay slot: b8 is the last operand to be refilled, from the already
 * advanced BO (offset 7 now equals offset 15 of the old pointer). */
3034	bg,pt	%icc, .LL53
3035	LDF	[BO +  7 * SIZE], b8
3036	.align 4
3037
/* .LL55: leftover k-steps of the 2x2 tile: L = KK & 3 for LT/RN,
 * L = (K - KK) & 3 otherwise.  With L <= 0 the tail loop is skipped. */
3038.LL55:
3039#if defined(LT) || defined(RN)
3040	and	KK, 3, L
3041#else
3042	sub	K, KK, L
3043	and	L,  3, L
3044#endif
3045	cmp	L,  0
/* Annulled branch: the delay-slot instruction is only a nop. */
3046	ble,a,pn %icc, .LL58
3047	nop
3048	.align 4
3049
/* .LL57: tail K loop of the 2x2 tile, one k-step per pass.
 * (a1, a2) are combined with (b1, b2) into cc01..cc04 and then reloaded.
 * Per pass AO and BO both advance by 2 elements. */
3050.LL57:
3051	FMADD	(aa1, bb1, cc01, cc01)
3052	add	L, -1, L
3053	FMADD	(aa2, bb1, cc02, cc02)
3054	LDF	[BO + 2 * SIZE], b1
3055
3056	FMADD	(aa1, bb2, cc03, cc03)
3057	LDF	[AO + 2 * SIZE], a1
3058	FMADD	(aa2, bb2, cc04, cc04)
3059	LDF	[AO + 3 * SIZE], a2
3060
3061	add	AO, 2 * SIZE, AO
3062	cmp	L, 0
3063	add	BO, 2 * SIZE, BO
/* Delay slot: b2 is reloaded relative to the already advanced BO. */
3064	bg,pt	%icc, .LL57
3065	LDF	[BO + 1 * SIZE], b2
3066	.align 4
3067
/* .LL58: finish one 2-row x 2-column tile and loop back to .LL52 while
 * 2-row tiles remain.  Same sequence as for the other tile shapes:
 * reposition AO / BO (LN/RT), combine the packed values with the
 * accumulators through FSUB, apply the 2x2 substitution selected by
 * LN / LT / RN / RT, store to the packed buffer and to C1 / C2, then advance
 * the pointers and KK.
 * FSUB, FMUL and FNMSUB are macros defined outside this chunk; presumably
 * FSUB a, c, c computes c = a - c and FNMSUB (a, b, c, d) computes
 * d = c - a * b -- confirm against their definitions.
 */
3068.LL58:
3069#if defined(LN) || defined(RT)
/* Both arms are identical here: the tile is 2 x 2, so the offset is KK - 2
 * whichever side is being solved. */
3070#ifdef LN
3071	sub	KK, 2, TEMP1
3072#else
3073	sub	KK, 2, TEMP1
3074#endif
3075	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3076	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3077
3078	add	AORIG, TEMP2, AO
3079	add	B,     TEMP1, BO
3080#endif
3081
/* The B-side buffer holds the tile as c01, c03, c02, c04 while the A-side
 * buffer holds it as c01, c02, c03, c04; the FSUB targets follow that. */
3082#if defined(LN) || defined(LT)
3083	LDF	[BO +  0 * SIZE], a1
3084	LDF	[BO +  1 * SIZE], a2
3085	LDF	[BO +  2 * SIZE], a3
3086	LDF	[BO +  3 * SIZE], a4
3087
3088	FSUB	a1, c01, c01
3089	FSUB	a2, c03, c03
3090	FSUB	a3, c02, c02
3091	FSUB	a4, c04, c04
3092#else
3093	LDF	[AO +  0 * SIZE], a1
3094	LDF	[AO +  1 * SIZE], a2
3095	LDF	[AO +  2 * SIZE], a3
3096	LDF	[AO +  3 * SIZE], a4
3097
3098	FSUB	a1, c01, c01
3099	FSUB	a2, c02, c02
3100	FSUB	a3, c03, c03
3101	FSUB	a4, c04, c04
3102#endif
3103
/* LN: second row first (coefficient AO + 3), eliminate it from the first
 * row with AO + 2, then scale the first row with AO + 0. */
3104#ifdef LN
3105	LDF	[AO +  3 * SIZE], a1
3106	LDF	[AO +  2 * SIZE], a2
3107	LDF	[AO +  0 * SIZE], a3
3108
3109	FMUL	a1, c02, c02
3110	FMUL	a1, c04, c04
3111
3112	FNMSUB	(aa2, cc02, cc01, cc01)
3113	FNMSUB	(aa2, cc04, cc03, cc03)
3114
3115	FMUL	a3, c01, c01
3116	FMUL	a3, c03, c03
3117#endif
3118
/* LT: first row first (AO + 0), eliminate it from the second row with
 * AO + 1, then scale the second row with AO + 3. */
3119#ifdef LT
3120	LDF	[AO +  0 * SIZE], a1
3121	LDF	[AO +  1 * SIZE], a2
3122	LDF	[AO +  3 * SIZE], a3
3123
3124	FMUL	a1, c01, c01
3125	FMUL	a1, c03, c03
3126
3127	FNMSUB	(aa2, cc01, cc02, cc02)
3128	FNMSUB	(aa2, cc03, cc04, cc04)
3129
3130	FMUL	a3, c02, c02
3131	FMUL	a3, c04, c04
3132#endif
3133
/* RN: first column first (BO + 0), eliminate it from the second column with
 * BO + 1, then scale the second column with BO + 3. */
3134#ifdef RN
3135	LDF	[BO +  0 * SIZE], a1
3136	LDF	[BO +  1 * SIZE], a2
3137
3138	FMUL	a1, c01, c01
3139	FMUL	a1, c02, c02
3140
3141	FNMSUB	(aa2, cc01, cc03, cc03)
3142	FNMSUB	(aa2, cc02, cc04, cc04)
3143
3144	LDF	[BO +  3 * SIZE], a1
3145
3146	FMUL	a1, c03, c03
3147	FMUL	a1, c04, c04
3148#endif
3149
/* RT: second column first (BO + 3), eliminate it from the first column with
 * BO + 2, then scale the first column with BO + 0. */
3150#ifdef RT
3151	LDF	[BO +  3 * SIZE], a1
3152	LDF	[BO +  2 * SIZE], a2
3153
3154	FMUL	a1, c04, c04
3155	FMUL	a1, c03, c03
3156
3157	FNMSUB	(aa2, cc04, cc02, cc02)
3158	FNMSUB	(aa2, cc03, cc01, cc01)
3159
3160	LDF	[BO +  0 * SIZE], a1
3161
3162	FMUL	a1, c02, c02
3163	FMUL	a1, c01, c01
3164#endif
3165
/* LN: step the column pointers back by two elements before storing. */
3166#ifdef LN
3167	add	C1, -2 * SIZE, C1
3168	add	C2, -2 * SIZE, C2
3169#endif
3170
3171#if defined(LN) || defined(LT)
3172	STF	c01, [BO +  0 * SIZE]
3173	STF	c03, [BO +  1 * SIZE]
3174	STF	c02, [BO +  2 * SIZE]
3175	STF	c04, [BO +  3 * SIZE]
3176#else
3177	STF	c01, [AO +  0 * SIZE]
3178	STF	c02, [AO +  1 * SIZE]
3179	STF	c03, [AO +  2 * SIZE]
3180	STF	c04, [AO +  3 * SIZE]
3181#endif
3182
/* c01 / c02 go to column C1, c03 / c04 to column C2. */
3183	STF	c01, [C1 + 0 * SIZE]
3184	STF	c02, [C1 + 1 * SIZE]
3185	STF	c03, [C2 + 0 * SIZE]
3186	STF	c04, [C2 + 1 * SIZE]
3187
/* Every variant except LN walks the column pointers forward by two. */
3188#ifndef LN
3189	add	C1, 2 * SIZE, C1
3190	add	C2, 2 * SIZE, C2
3191#endif
3192
3193#ifdef RT
3194	sll	K, BASE_SHIFT + 1, TEMP1
3195	add	AORIG, TEMP1, AORIG
3196#endif
3197
/* LT/RN: advance AO and BO by 2 * (K - KK) elements each. */
3198#if defined(LT) || defined(RN)
3199	sub	K, KK, TEMP1
3200	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3201	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3202	add	AO, TEMP2, AO
3203	add	BO, TEMP1, BO
3204#endif
3205
3206#ifdef LT
3207	add	KK, 2, KK
3208#endif
3209
3210#ifdef LN
3211	sub	KK, 2, KK
3212#endif
3213
/* Next 2-row tile of this column pair while I > 0. */
3214	add	I, -1, I
3215	cmp	I, 0
3216	bg,pt	%icc, .LL52
3217	nop
3218	.align 4
3219
/* .LL60: preamble of the single leftover row of the 2-column panel
 * (taken only when bit 0 of M is set, otherwise control goes to .LL69).
 * Positions AO / BO, preloads a1..a4 and b1..b8, clears cc01 / cc03 and
 * computes the number L of 4x-unrolled passes: KK >> 2 for LT/RN,
 * (K - KK) >> 2 otherwise.  With L <= 0 control goes to .LL65. */
3220.LL60:
3221	and	M, 1, I
3222	cmp	I, 0
3223	ble,pn	%icc, .LL69
3224	nop
3225
3226#if defined(LT) || defined(RN)
3227	mov	B, BO
3228#else
3229#ifdef LN
3230	sll	K,  BASE_SHIFT + 0, TEMP1
3231	sub	AORIG, TEMP1, AORIG
3232#endif
3233
/* One A element and two B elements per k-step, hence the different shifts. */
3234	sll	KK, BASE_SHIFT + 0, TEMP1
3235	sll	KK, BASE_SHIFT + 1, TEMP2
3236
3237	add	AORIG, TEMP1, AO
3238	add	B,     TEMP2, BO
3239#endif
3240
3241	LDF	[AO +  0 * SIZE], a1
3242	LDF	[AO +  1 * SIZE], a2
3243	LDF	[AO +  2 * SIZE], a3
3244	LDF	[AO +  3 * SIZE], a4
3245
3246	LDF	[BO +  0 * SIZE], b1
3247	LDF	[BO +  1 * SIZE], b2
3248	LDF	[BO +  2 * SIZE], b3
3249	LDF	[BO +  3 * SIZE], b4
3250	LDF	[BO +  4 * SIZE], b5
3251	LDF	[BO +  5 * SIZE], b6
3252	LDF	[BO +  6 * SIZE], b7
3253	FCLR	(cc01)
3254	LDF	[BO +  7 * SIZE], b8
3255	FCLR	(cc03)
3256
3257#if defined(LT) || defined(RN)
3258	sra	KK, 2, L
3259#else
3260	sub	K, KK, L
3261	sra	L,  2, L
3262#endif
3263	cmp	L,  0
3264	ble,pn	%icc, .LL65
3265	nop
3266	.align 4
3267
/* .LL63: main K loop of the 1-row x 2-column tile, four k-steps per pass.
 * a1..a4 hold the A element of k-steps 0..3 and (b1,b2) .. (b7,b8) the
 * matching B pairs; products accumulate into cc01 (first column) and cc03
 * (second column).  Per pass AO advances by 4 elements and BO by 8. */
3268.LL63:
3269	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3270	add	L, -1, L
3271
3272	FMADD	(aa1, bb1, cc01, cc01)
3273	LDF	[BO +  8 * SIZE], b1
3274	FMADD	(aa1, bb2, cc03, cc03)
3275	LDF	[BO +  9 * SIZE], b2
3276
/* Condition codes for the bg at the bottom of the loop are set here. */
3277	LDF	[AO +  4 * SIZE], a1
3278	cmp	L, 0
3279
3280	FMADD	(aa2, bb3, cc01, cc01)
3281	LDF	[BO + 10 * SIZE], b3
3282	FMADD	(aa2, bb4, cc03, cc03)
3283	LDF	[BO + 11 * SIZE], b4
3284
3285	LDF	[AO +  5 * SIZE], a2
3286	add	AO,  4 * SIZE, AO
3287
3288	FMADD	(aa3, bb5, cc01, cc01)
3289	LDF	[BO + 12 * SIZE], b5
3290	FMADD	(aa3, bb6, cc03, cc03)
3291	LDF	[BO + 13 * SIZE], b6
3292
/* AO and BO are advanced mid-pass; later offsets are relative to the new
 * pointers. */
3293	LDF	[AO +  2 * SIZE], a3
3294	add	BO,  8 * SIZE, BO
3295
3296	FMADD	(aa4, bb7, cc01, cc01)
3297	LDF	[BO +  6 * SIZE], b7
3298	FMADD	(aa4, bb8, cc03, cc03)
3299	LDF	[BO + 7 * SIZE], b8
3300
/* Delay slot: a4 reload, executed on both paths of the branch. */
3301	bg,pt	%icc, .LL63
3302	LDF	[AO +  3 * SIZE], a4
3303	.align 4
3304
/* .LL65: leftover k-steps of the 1x2 tile: L = KK & 3 for LT/RN,
 * L = (K - KK) & 3 otherwise.  With L <= 0 the tail loop is skipped. */
3305.LL65:
3306#if defined(LT) || defined(RN)
3307	and	KK, 3, L
3308#else
3309	sub	K, KK, L
3310	and	L,  3, L
3311#endif
3312	cmp	L,  0
/* Annulled branch: the delay-slot instruction is only a nop. */
3313	ble,a,pn %icc, .LL68
3314	nop
3315	.align 4
3316
/* .LL67: tail K loop of the 1x2 tile, one k-step per pass.
 * a1 is combined with (b1, b2) into cc01 / cc03, then all three operands are
 * reloaded.  Per pass AO advances by 1 element and BO by 2 elements. */
3317.LL67:
3318	FMADD	(aa1, bb1, cc01, cc01)
3319	LDF	[BO + 2 * SIZE], b1
3320	FMADD	(aa1, bb2, cc03, cc03)
3321	LDF	[BO + 3 * SIZE], b2
3322
3323	LDF	[AO + 1 * SIZE], a1
3324	add	L, -1, L
3325	add	AO, 1 * SIZE, AO
3326	cmp	L, 0
3327
/* Delay slot: the BO update runs on both paths of the branch. */
3328	bg,pt	%icc, .LL67
3329	add	BO, 2 * SIZE, BO
3330	.align 4
3331
/* .LL68: finish the 1-row x 2-column tile: reposition AO / BO (LN/RT),
 * combine the packed values with cc01 / cc03 through FSUB, apply the
 * substitution selected by LN / LT / RN / RT, store to the packed buffer and
 * to C1 / C2, then advance the pointers and KK.
 * FSUB, FMUL and FNMSUB are macros defined outside this chunk; presumably
 * FSUB a, c, c computes c = a - c and FNMSUB (a, b, c, d) computes
 * d = c - a * b -- confirm against their definitions.
 */
3332.LL68:
3333#if defined(LN) || defined(RT)
/* Offset in k-steps: KK - 1 for LN (one row), KK - 2 for RT (two columns). */
3334#ifdef LN
3335	sub	KK, 1, TEMP1
3336#else
3337	sub	KK, 2, TEMP1
3338#endif
3339	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3340	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3341
3342	add	AORIG, TEMP2, AO
3343	add	B,     TEMP1, BO
3344#endif
3345
3346#if defined(LN) || defined(LT)
3347	LDF	[BO +  0 * SIZE], a1
3348	LDF	[BO +  1 * SIZE], a2
3349
3350	FSUB	a1, c01, c01
3351	FSUB	a2, c03, c03
3352#else
3353	LDF	[AO +  0 * SIZE], a1
3354	LDF	[AO +  1 * SIZE], a2
3355
3356	FSUB	a1, c01, c01
3357	FSUB	a2, c03, c03
3358#endif
3359
/* LN/LT: a single row, so both results are scaled by the value at AO + 0. */
3360#if defined(LN) || defined(LT)
3361	LDF	[AO +  0 * SIZE], a1
3362
3363	FMUL	a1, c01, c01
3364	FMUL	a1, c03, c03
3365#endif
3366
/* RN: first column (BO + 0), eliminate with BO + 1, then second column
 * (BO + 3). */
3367#ifdef RN
3368	LDF	[BO +  0 * SIZE], a1
3369	LDF	[BO +  1 * SIZE], a2
3370
3371	FMUL	a1, c01, c01
3372
3373	FNMSUB	(aa2, cc01, cc03, cc03)
3374
3375	LDF	[BO +  3 * SIZE], a1
3376
3377	FMUL	a1, c03, c03
3378#endif
3379
/* RT: second column (BO + 3), eliminate with BO + 2, then first column
 * (BO + 0). */
3380#ifdef RT
3381	LDF	[BO +  3 * SIZE], a1
3382	LDF	[BO +  2 * SIZE], a2
3383
3384	FMUL	a1, c03, c03
3385
3386	FNMSUB	(aa2, cc03, cc01, cc01)
3387
3388	LDF	[BO +  0 * SIZE], a1
3389
3390	FMUL	a1, c01, c01
3391#endif
3392
3393#ifdef LN
3394	add	C1, -1 * SIZE, C1
3395	add	C2, -1 * SIZE, C2
3396#endif
3397
3398#if defined(LN) || defined(LT)
3399	STF	c01, [BO +  0 * SIZE]
3400	STF	c03, [BO +  1 * SIZE]
3401#else
3402	STF	c01, [AO +  0 * SIZE]
3403	STF	c03, [AO +  1 * SIZE]
3404#endif
3405
3406	STF	c01, [C1 + 0 * SIZE]
3407	STF	c03, [C2 + 0 * SIZE]
3408
3409#ifdef RT
3410	sll	K, BASE_SHIFT + 0, TEMP1
3411	add	AORIG, TEMP1, AORIG
3412#endif
3413
/* LT/RN: advance AO by K - KK elements and BO by 2 * (K - KK) elements. */
3414#if defined(LT) || defined(RN)
3415	sub	K, KK, TEMP1
3416	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3417	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3418	add	AO, TEMP2, AO
3419	add	BO, TEMP1, BO
3420#endif
3421
3422#ifdef LT
3423	add	KK, 1, KK
3424#endif
3425
3426#ifdef LN
3427	sub	KK, 1, KK
3428#endif
3429	.align 4
3430
/* .LL69: bookkeeping at the end of the 2-column panel.
 *   LN:    B advances by 2 * K elements.
 *   LT/RN: B takes over the running pointer BO.
 *   RN:    KK grows by 2;  RT: KK shrinks by 2. */
3431.LL69:
3432#ifdef LN
3433	sll	K, BASE_SHIFT + 1, TEMP1
3434	add	B, TEMP1, B
3435#endif
3436
3437#if defined(LT) || defined(RN)
3438	mov	BO, B
3439#endif
3440
3441#ifdef RN
3442	add	KK, 2, KK
3443#endif
3444
3445#ifdef RT
3446	sub	KK, 2, KK
3447#endif
3448	.align 4
3449
/* .LL70: start of the single-column panel.  It is processed only when bit 0
 * of N is set; otherwise control goes to the exit at .LL999.  This block
 * sets up the column pointer C1, the starting KK, the A pointer, and the
 * count I = M >> 1 of 2-row tiles (skipping to .LL80 when there are none). */
3450.LL70:
3451	and	N, 1, J
3452	cmp	J, 0
3453	ble,pn	%icc, .LL999
3454	nop
3455
/* RT: step B back by K elements before using it. */
3456#ifdef RT
3457	sll	K, BASE_SHIFT, TEMP1
3458	sub	B, TEMP1, B
3459#endif
3460
/* Non-RT: C1 = C, then C += LDC.   RT: C1 = C - LDC, then C -= LDC. */
3461#ifndef RT
3462	mov	C,  C1
3463	add	C1, LDC, C
3464#else
3465	sub	C,  LDC, C1
3466	sub	C,  LDC, C
3467#endif
3468
3469#ifdef LN
3470	add	M, OFFSET, KK
3471#endif
3472
3473#ifdef LT
3474	mov	OFFSET, KK
3475#endif
3476
3477#if defined(LN) || defined(RT)
3478	mov	A, AORIG
3479#else
3480	mov	A, AO
3481#endif
3482
3483	sra	M, 1, I
3484	cmp	I, 0
3485	ble,pn	%icc, .LL80
3486	nop
3487	.align 4
3488
/* .LL72: preamble of one 2-row x 1-column tile (loop head for counter I).
 * Positions AO / BO, preloads a1..a4 and b1..b4, clears cc01 / cc02 and
 * computes the number L of 4x-unrolled passes: KK >> 2 for LT/RN,
 * (K - KK) >> 2 otherwise.  With L <= 0 control goes to .LL75. */
3489.LL72:
3490#if defined(LT) || defined(RN)
3491	mov	B, BO
3492#else
3493#ifdef LN
3494	sll	K,  BASE_SHIFT + 1, TEMP1
3495	sub	AORIG, TEMP1, AORIG
3496#endif
3497
/* Two A elements and one B element per k-step, hence the different shifts. */
3498	sll	KK, BASE_SHIFT + 1, TEMP1
3499	sll	KK, BASE_SHIFT + 0, TEMP2
3500
3501	add	AORIG, TEMP1, AO
3502	add	B,     TEMP2, BO
3503#endif
3504
3505	LDF	[AO +  0 * SIZE], a1
3506	LDF	[AO +  1 * SIZE], a2
3507	LDF	[AO +  2 * SIZE], a3
3508	LDF	[AO +  3 * SIZE], a4
3509
3510	LDF	[BO +  0 * SIZE], b1
3511	LDF	[BO +  1 * SIZE], b2
3512	LDF	[BO +  2 * SIZE], b3
3513	FCLR	(cc01)
3514	LDF	[BO +  3 * SIZE], b4
3515	FCLR	(cc02)
3516
/* Prefetch the output column; function code 3 is the SPARC V9 "one write"
 * prefetch variant. */
3517	prefetch [C1 + 2 * SIZE], 3
3518
3519#if defined(LT) || defined(RN)
3520	sra	KK, 2, L
3521#else
3522	sub	K, KK, L
3523	sra	L,  2, L
3524#endif
3525	cmp	L,  0
3526	ble,pn	%icc, .LL75
3527	nop
3528
/* .LL73: main K loop of the 2-row x 1-column tile, four k-steps per pass.
 * (a1, a2) serve the even k-steps and (a3, a4) the odd ones; b1..b4 hold the
 * B element of k-steps 0..3.  Products accumulate into cc01 (first row) and
 * cc02 (second row).  Per pass AO advances by 8 elements and BO by 4.
 * NOTE(review): unlike the other unrolled loops in this chunk, no .align 4
 * directly precedes this label -- confirm whether that is intentional. */
3529.LL73:
3530	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3531	add	L, -1, L
3532
3533	FMADD	(aa1, bb1, cc01, cc01)
3534	LDF	[AO +  4 * SIZE], a1
3535	FMADD	(aa2, bb1, cc02, cc02)
3536	LDF	[AO +  5 * SIZE], a2
3537
/* Condition codes for the bg at the bottom of the loop are set here. */
3538	LDF	[BO +  4 * SIZE], b1
3539	cmp	L, 0
3540
3541	FMADD	(aa3, bb2, cc01, cc01)
3542	LDF	[AO +  6 * SIZE], a3
3543	FMADD	(aa4, bb2, cc02, cc02)
3544	LDF	[AO +  7 * SIZE], a4
3545
3546	LDF	[BO +  5 * SIZE], b2
3547	add	BO,  4 * SIZE, BO
3548
3549	FMADD	(aa1, bb3, cc01, cc01)
3550	LDF	[AO +  8 * SIZE], a1
3551	FMADD	(aa2, bb3, cc02, cc02)
3552	LDF	[AO +  9 * SIZE], a2
3553
/* BO and AO are advanced mid-pass; later offsets are relative to the new
 * pointers. */
3554	LDF	[BO +  2 * SIZE], b3
3555	add	AO,  8 * SIZE, AO
3556
3557	FMADD	(aa3, bb4, cc01, cc01)
3558	LDF	[AO +  2 * SIZE], a3
3559	FMADD	(aa4, bb4, cc02, cc02)
3560	LDF	[AO +  3 * SIZE], a4
3561
/* Delay slot: b4 reload, executed on both paths of the branch. */
3562	bg,pt	%icc, .LL73
3563	LDF	[BO +  3 * SIZE], b4
3564	.align 4
3565
/* .LL75: leftover k-steps of the 2x1 tile: L = KK & 3 for LT/RN,
 * L = (K - KK) & 3 otherwise.  With L <= 0 the tail loop is skipped. */
3566.LL75:
3567#if defined(LT) || defined(RN)
3568	and	KK, 3, L
3569#else
3570	sub	K, KK, L
3571	and	L,  3, L
3572#endif
3573	cmp	L,  0
/* Annulled branch: the delay-slot instruction is only a nop. */
3574	ble,a,pn %icc, .LL78
3575	nop
3576	.align 4
3577
/* .LL77: tail K loop of the 2x1 tile, one k-step per pass.
 * (a1, a2) are combined with b1 into cc01 / cc02, then all three operands
 * are reloaded.  Per pass AO advances by 2 elements and BO by 1 element. */
3578.LL77:
3579	FMADD	(aa1, bb1, cc01, cc01)
3580	LDF	[AO + 2 * SIZE], a1
3581	FMADD	(aa2, bb1, cc02, cc02)
3582	LDF	[AO + 3 * SIZE], a2
3583
3584	LDF	[BO + 1 * SIZE], b1
3585	add	L, -1, L
3586	add	AO, 2 * SIZE, AO
3587	cmp	L, 0
/* Delay slot: the BO update runs on both paths of the branch. */
3588	bg,pt	%icc, .LL77
3589	add	BO, 1 * SIZE, BO
3590	.align 4
3591
/* .LL78: finish one 2-row x 1-column tile and loop back to .LL72 while
 * 2-row tiles remain: reposition AO / BO (LN/RT), combine the packed values
 * with cc01 / cc02 through FSUB, apply the substitution selected by
 * LN / LT / RN / RT, store to the packed buffer and to C1, then advance the
 * pointers and KK.
 * FSUB, FMUL and FNMSUB are macros defined outside this chunk; presumably
 * FSUB a, c, c computes c = a - c and FNMSUB (a, b, c, d) computes
 * d = c - a * b -- confirm against their definitions.
 */
3592.LL78:
3593#if defined(LN) || defined(RT)
/* Offset in k-steps: KK - 2 for LN (two rows), KK - 1 for RT (one column). */
3594#ifdef LN
3595	sub	KK, 2, TEMP1
3596#else
3597	sub	KK, 1, TEMP1
3598#endif
3599	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3600	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3601
3602	add	AORIG, TEMP2, AO
3603	add	B,     TEMP1, BO
3604#endif
3605
3606#if defined(LN) || defined(LT)
3607	LDF	[BO +  0 * SIZE], a1
3608	LDF	[BO +  1 * SIZE], a2
3609
3610	FSUB	a1, c01, c01
3611	FSUB	a2, c02, c02
3612#else
3613	LDF	[AO +  0 * SIZE], a1
3614	LDF	[AO +  1 * SIZE], a2
3615
3616	FSUB	a1, c01, c01
3617	FSUB	a2, c02, c02
3618#endif
3619
/* LN: second row first (AO + 3), eliminate it from the first row with
 * AO + 2, then scale the first row with AO + 0. */
3620#ifdef LN
3621	LDF	[AO +  3 * SIZE], a1
3622	LDF	[AO +  2 * SIZE], a2
3623	LDF	[AO +  0 * SIZE], a3
3624
3625	FMUL	a1, c02, c02
3626
3627	FNMSUB	(aa2, cc02, cc01, cc01)
3628
3629	FMUL	a3, c01, c01
3630#endif
3631
/* LT: first row first (AO + 0), eliminate it from the second row with
 * AO + 1, then scale the second row with AO + 3. */
3632#ifdef LT
3633	LDF	[AO +  0 * SIZE], a1
3634	LDF	[AO +  1 * SIZE], a2
3635	LDF	[AO +  3 * SIZE], a3
3636
3637	FMUL	a1, c01, c01
3638
3639	FNMSUB	(aa2, cc01, cc02, cc02)
3640
3641	FMUL	a3, c02, c02
3642#endif
3643
/* RN/RT: a single column, so both results are scaled by the value at
 * BO + 0. */
3644#if defined(RN) || defined(RT)
3645	LDF	[BO +  0 * SIZE], a1
3646
3647	FMUL	a1, c01, c01
3648	FMUL	a1, c02, c02
3649#endif
3650
3651#ifdef LN
3652	add	C1, -2 * SIZE, C1
3653#endif
3654
3655#if defined(LN) || defined(LT)
3656	STF	c01, [BO +  0 * SIZE]
3657	STF	c02, [BO +  1 * SIZE]
3658#else
3659	STF	c01, [AO +  0 * SIZE]
3660	STF	c02, [AO +  1 * SIZE]
3661#endif
3662
3663	STF	c01, [C1 + 0 * SIZE]
3664	STF	c02, [C1 + 1 * SIZE]
3665
/* Every variant except LN walks the column pointer forward by two. */
3666#ifndef LN
3667	add	C1, 2 * SIZE, C1
3668#endif
3669
3670#ifdef RT
3671	sll	K, BASE_SHIFT + 1, TEMP1
3672	add	AORIG, TEMP1, AORIG
3673#endif
3674
/* LT/RN: advance AO by 2 * (K - KK) elements and BO by K - KK elements. */
3675#if defined(LT) || defined(RN)
3676	sub	K, KK, TEMP1
3677	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3678	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3679	add	AO, TEMP2, AO
3680	add	BO, TEMP1, BO
3681#endif
3682
3683#ifdef LT
3684	add	KK, 2, KK
3685#endif
3686
3687#ifdef LN
3688	sub	KK, 2, KK
3689#endif
3690
/* Next 2-row tile of this column while I > 0. */
3691	add	I, -1, I
3692	cmp	I, 0
3693	bg,pt	%icc, .LL72
3694	nop
3695	.align 4
3696
/* .LL80: preamble of the final 1-row x 1-column tile (taken only when bit 0
 * of M is set, otherwise control goes to .LL89).  Positions AO / BO,
 * preloads a1..a4 and b1..b4, clears cc01 and computes the number L of
 * 4x-unrolled passes: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise.
 * With L <= 0 control goes to .LL85. */
3697.LL80:
3698	and	M, 1, I
3699	cmp	I, 0
3700	ble,pn	%icc, .LL89
3701	nop
3702
3703#if defined(LT) || defined(RN)
3704	mov	B, BO
3705#else
3706#ifdef LN
3707	sll	K,  BASE_SHIFT + 0, TEMP1
3708	sub	AORIG, TEMP1, AORIG
3709#endif
3710
3711	sll	KK, BASE_SHIFT + 0, TEMP1
3712	sll	KK, BASE_SHIFT + 0, TEMP2
3713
3714	add	AORIG, TEMP1, AO
3715	add	B,     TEMP2, BO
3716#endif
3717
3718	LDF	[AO +  0 * SIZE], a1
3719	LDF	[BO +  0 * SIZE], b1
3720	LDF	[AO +  1 * SIZE], a2
3721	LDF	[BO +  1 * SIZE], b2
3722	LDF	[AO +  2 * SIZE], a3
3723	LDF	[BO +  2 * SIZE], b3
3724	LDF	[AO +  3 * SIZE], a4
3725	LDF	[BO +  3 * SIZE], b4
3726
3727#if defined(LT) || defined(RN)
3728	sra	KK, 2, L
3729#else
3730	sub	K, KK, L
3731	sra	L,  2, L
3732#endif
3733	cmp	L,  0
/* The accumulator is cleared in the delay slot of a non-annulled branch, so
 * it is zeroed whether or not the unrolled loop is entered. */
3734	ble,pn	%icc, .LL85
3735	FCLR	(cc01)
3736	.align 4
3737
/* .LL83: main K loop of the 1x1 tile, four k-steps per pass.
 * The four products a1*b1 .. a4*b4 all accumulate into cc01, and each
 * operand is reloaded right after it is consumed.  Per pass AO and BO both
 * advance by 4 elements. */
3738.LL83:
3739	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3740	add	L, -1, L
3741
3742	FMADD	(aa1, bb1, cc01, cc01)
3743	LDF	[AO +  4 * SIZE], a1
3744	LDF	[BO +  4 * SIZE], b1
3745
3746	FMADD	(aa2, bb2, cc01, cc01)
3747	LDF	[AO +  5 * SIZE], a2
3748	LDF	[BO +  5 * SIZE], b2
3749
3750	FMADD	(aa3, bb3, cc01, cc01)
3751	LDF	[AO +  6 * SIZE], a3
3752	LDF	[BO +  6 * SIZE], b3
3753
3754	FMADD	(aa4, bb4, cc01, cc01)
3755	LDF	[AO +  7 * SIZE], a4
3756	LDF	[BO +  7 * SIZE], b4
3757
3758	add	AO,  4 * SIZE, AO
3759	cmp	L, 0
3760
/* Delay slot: the BO update runs on both paths of the branch. */
3761	bg,pt	%icc, .LL83
3762	add	BO,  4 * SIZE, BO
3763	.align 4
3764
/* .LL85: leftover k-steps of the 1x1 tile: L = KK & 3 for LT/RN,
 * L = (K - KK) & 3 otherwise.  With L <= 0 the tail loop is skipped. */
3765.LL85:
3766#if defined(LT) || defined(RN)
3767	and	KK, 3, L
3768#else
3769	sub	K, KK, L
3770	and	L,  3, L
3771#endif
3772	cmp	L,  0
/* Annulled branch: the delay-slot instruction is only a nop. */
3773	ble,a,pn %icc, .LL88
3774	nop
3775	.align 4
3776
/* .LL87: tail K loop of the 1x1 tile, one k-step per pass: a1 * b1 is
 * accumulated into cc01 and both operands are reloaded.  Per pass AO and BO
 * both advance by 1 element. */
3777.LL87:
3778	FMADD	(aa1, bb1, cc01, cc01)
3779	LDF	[AO + 1 * SIZE], a1
3780	LDF	[BO + 1 * SIZE], b1
3781
3782	add	AO, 1 * SIZE, AO
3783	add	L, -1, L
3784	cmp	L, 0
/* Delay slot: the BO update runs on both paths of the branch. */
3785	bg,pt	%icc, .LL87
3786	add	BO, 1 * SIZE, BO
3787	.align 4
3788
/* .LL88: finish the 1x1 tile: reposition AO / BO (LN/RT), combine the packed
 * value with cc01 through FSUB, scale it by the single coefficient, store it
 * to the packed buffer and to C1, then advance the pointers and KK.
 * FSUB and FMUL are macros defined outside this chunk; presumably
 * FSUB a, c, c computes c = a - c -- confirm against its definition.
 */
3789.LL88:
3790#if defined(LN) || defined(RT)
/* Both arms are identical here: the tile is 1 x 1, so the offset is KK - 1
 * whichever side is being solved. */
3791#ifdef LN
3792	sub	KK, 1, TEMP1
3793#else
3794	sub	KK, 1, TEMP1
3795#endif
3796	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3797	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3798
3799	add	AORIG, TEMP2, AO
3800	add	B,     TEMP1, BO
3801#endif
3802
3803#if defined(LN) || defined(LT)
3804	LDF	[BO +  0 * SIZE], a1
3805
3806	FSUB	a1, c01, c01
3807#else
3808	LDF	[AO +  0 * SIZE], a1
3809
3810	FSUB	a1, c01, c01
3811#endif
3812
/* The coefficient comes from AO for LN/LT and from BO for RN/RT. */
3813#if defined(LN) || defined(LT)
3814	LDF	[AO +  0 * SIZE], a1
3815
3816	FMUL	a1, c01, c01
3817#endif
3818
3819#if defined(RN) || defined(RT)
3820	LDF	[BO +  0 * SIZE], a1
3821
3822	FMUL	a1, c01, c01
3823#endif
3824
3825#ifdef LN
3826	add	C1, -1 * SIZE, C1
3827#endif
3828
3829#if defined(LN) || defined(LT)
3830	STF	c01, [BO +  0 * SIZE]
3831#else
3832	STF	c01, [AO +  0 * SIZE]
3833#endif
3834
3835	STF	c01, [C1 + 0 * SIZE]
3836
3837#ifdef RT
3838	sll	K, BASE_SHIFT + 0, TEMP1
3839	add	AORIG, TEMP1, AORIG
3840#endif
3841
/* LT/RN: advance AO and BO by K - KK elements each. */
3842#if defined(LT) || defined(RN)
3843	sub	K, KK, TEMP1
3844	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3845	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3846	add	AO, TEMP2, AO
3847	add	BO, TEMP1, BO
3848#endif
3849
3850#ifdef LT
3851	add	KK, 1, KK
3852#endif
3853
3854#ifdef LN
3855	sub	KK, 1, KK
3856#endif
3857	.align 4
3858
/* .LL89: bookkeeping at the end of the single-column panel.
 *   LN:    B advances by K elements.
 *   LT/RN: B takes over the running pointer BO.
 *   RN:    KK grows by 1;  RT: KK shrinks by 1. */
3859.LL89:
3860#ifdef LN
3861	sll	K, BASE_SHIFT, TEMP1
3862	add	B, TEMP1, B
3863#endif
3864
3865#if defined(LT) || defined(RN)
3866	mov	BO, B
3867#endif
3868
3869#ifdef RN
3870	add	KK, 1, KK
3871#endif
3872
3873#ifdef RT
3874	sub	KK, 1, KK
3875#endif
3876	.align 4
3877
/* .LL999: common exit of the routine.
 * When TRMMKERNEL is defined, %g1..%g4 are reloaded from the stack frame
 * (32-bit loads at STACK_START + 8..20, or 64-bit loads at
 * STACK_START + 32..56 when __64BIT__ is defined); presumably these slots
 * were filled by the prologue, which is outside this chunk -- confirm.
 * The routine then returns to the caller with a result of 0. */
3878.LL999:
3879#ifdef TRMMKERNEL
3880#ifndef __64BIT__
3881	ld	[%sp + STACK_START +  8], %g1
3882	ld	[%sp + STACK_START + 12], %g2
3883	ld	[%sp + STACK_START + 16], %g3
3884	ld	[%sp + STACK_START + 20], %g4
3885#else
3886	ldx	[%sp + STACK_START + 32], %g1
3887	ldx	[%sp + STACK_START + 40], %g2
3888	ldx	[%sp + STACK_START + 48], %g3
3889	ldx	[%sp + STACK_START + 56], %g4
3890#endif
3891#endif
3892
/* SPARC V9 "return" restores the caller's register window and jumps to
 * %i7 + 8; the clr in its delay slot therefore executes in the caller's
 * window and zeroes the caller-visible %o0. */
3893	return	%i7 + 8
3894	clr	%o0
3895
/* EPILOGUE is a macro defined outside this chunk. */
3896	EPILOGUE
3897