1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2005. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/* Software-prefetch tuning for the A panel in the unrolled inner loop:      */
/* APREFETCHSIZE is the look-ahead distance in elements (it is multiplied by */
/* SIZE where it is used), APREFETCH_CATEGORY is the function-code operand   */
/* handed to the prefetch instruction.                                       */
25#define APREFETCHSIZE 24
26#define APREFETCH_CATEGORY 0
27
/* Integer register roles used by the kernel.                               */
/* Problem dimensions and panel/matrix base pointers (incoming registers).  */
/* B, C are always reloaded from the stack by the prologue; A is reloaded   */
/* only in the 32-bit DOUBLE build.                                         */
28#define M	%i0
29#define N	%i1
30#define K	%i2
31#define A	%i5
32#define B	%i3
33#define C	%i4
34
/* LDC: column stride of C (scaled to bytes by ZBASE_SHIFT in the prologue). */
/* AO/BO: running pointers into the A and B panels.                          */
/* I, J, L: down-counters for the row loop, column-panel loop and k loop.    */
35#define LDC	%o0
36#define AO	%o1
37#define BO	%o2
38#define I	%o3
39#define J	%o4
40#define L	%o5
41
/* Pointers to the (up to four) columns of C handled by the current panel. */
42#define C1	%l0
43#define C2	%l1
44#define C3	%l2
45#define C4	%l3
46
/* OFFSET: offset argument loaded from the stack; KK: running depth counter */
/* derived from it; TEMP1/TEMP2: scratch for address arithmetic;            */
/* AORIG: base of the current A panel in the LN/RT variants.                */
47#define OFFSET	%l4
48#define	KK	%l5
49#define TEMP1	%l6
50#define TEMP2	%l7
51#define AORIG	%o7
52
/* Floating-point register allocation. Every register has two names:        */
/*   c01..c16, a1..a5, b1..b9   - %f operands, used with LDF/STF/FADD/FSUB/  */
/*                                FMUL;                                      */
/*   cc01..cc16, aa1..aa5, bb1..bb9 - bare numbers, used with the FCLR and   */
/*                                FMADD/FNMSUB/FMSUB macros.                 */
/* The two tables must be kept in step with each other.                     */
53#ifdef DOUBLE
/* Double precision: accumulators occupy the even registers %f0..%f30. */
54#define c01	%f0
55#define c02	%f2
56#define c03	%f4
57#define c04	%f6
58#define c05	%f8
59#define c06	%f10
60#define c07	%f12
61#define c08	%f14
62#define c09	%f16
63#define c10	%f18
64#define c11	%f20
65#define c12	%f22
66#define c13	%f24
67#define c14	%f26
68#define c15	%f28
69#define c16	%f30
70
/* Operands loaded from the A panel. */
71#define a1	%f32
72#define a2	%f34
73#define a3	%f36
74#define a4	%f38
75#define a5	%f40
76
/* Operands loaded from the B panel. */
77#define b1	%f42
78#define b2	%f44
79#define b3	%f46
80#define b4	%f48
81#define b5	%f50
82#define b6	%f52
83#define b7	%f54
84#define b8	%f56
85#define b9	%f58
86
/* Numeric aliases of c01..c16 (same numbers as the %f names above). */
87#define cc01	0
88#define cc02	2
89#define cc03	4
90#define cc04	6
91#define cc05	8
92#define cc06	10
93#define cc07	12
94#define cc08	14
95#define cc09	16
96#define cc10	18
97#define cc11	20
98#define cc12	22
99#define cc13	24
100#define cc14	26
101#define cc15	28
102#define cc16	30
103
/* Numeric aliases of a1..a5 (%f32..%f40). The odd values 1,3,5,... are      */
/* consistent with the SPARC V9 5-bit encoding of double registers above    */
/* %f31, where bit 5 of the register number is carried in bit 0.            */
104#define aa1	 1
105#define aa2	 3
106#define aa3	 5
107#define aa4	 7
108#define aa5	 9
109
/* Numeric aliases of b1..b9 (%f42..%f58), same encoding as above. */
110#define bb1	11
111#define bb2	13
112#define bb3	15
113#define bb4	17
114#define bb5	19
115#define bb6	21
116#define bb7	23
117#define bb8	25
118#define bb9	27
119#else
/* Single precision: consecutive registers, accumulators in %f0..%f15. */
120#define c01	%f0
121#define c02	%f1
122#define c03	%f2
123#define c04	%f3
124#define c05	%f4
125#define c06	%f5
126#define c07	%f6
127#define c08	%f7
128#define c09	%f8
129#define c10	%f9
130#define c11	%f10
131#define c12	%f11
132#define c13	%f12
133#define c14	%f13
134#define c15	%f14
135#define c16	%f15
136
/* Operands loaded from the A panel. */
137#define a1	%f16
138#define a2	%f17
139#define a3	%f18
140#define a4	%f19
141#define a5	%f20
142
/* Operands loaded from the B panel. */
143#define b1	%f21
144#define b2	%f22
145#define b3	%f23
146#define b4	%f24
147#define b5	%f25
148#define b6	%f26
149#define b7	%f27
150#define b8	%f28
151#define b9	%f29
152
/* Numeric aliases: identical to the register numbers in this build. */
153#define cc01	0
154#define cc02	1
155#define cc03	2
156#define cc04	3
157#define cc05	4
158#define cc06	5
159#define cc07	6
160#define cc08	7
161#define cc09	8
162#define cc10	9
163#define cc11	10
164#define cc12	11
165#define cc13	12
166#define cc14	13
167#define cc15	14
168#define cc16	15
169
170#define aa1	16
171#define aa2	17
172#define aa3	18
173#define aa4	19
174#define aa5	20
175
176#define bb1	21
177#define bb2	22
178#define bb3	23
179#define bb4	24
180#define bb5	25
181#define bb6	26
182#define bb7	27
183#define bb8	28
184#define bb9	29
185#endif
186
/* Sign selection for the four partial products of the multiply-accumulate  */
/* loops. In those loops FMADD1 pairs (a-first, b-first), FMADD2 pairs       */
/* (a-second, b-first), FMADD3 pairs (a-first, b-second) and FMADD4 pairs    */
/* (a-second, b-second) of each two-value element; the partial sums are      */
/* merged afterwards with FADD (1st+4th, 2nd+3rd accumulator of each group). */
/* Presumably first/second are the real/imaginary parts - confirm against    */
/* the packing routines.                                                     */
/* FMADD / FNMSUB themselves are defined outside this file (common.h);       */
/* FNMSUB is assumed to subtract the product from the accumulator.           */
/*   no CONJ          : only the (second, second) product is subtracted      */
/*   CONJ with LN/LT  : only the (a-second, b-first) product is subtracted   */
/*   CONJ with RN/RT  : only the (a-first, b-second) product is subtracted   */
/* NOTE: with CONJ defined but none of LN/LT/RN/RT, FMADD1..4 stay undefined. */
187#ifndef CONJ
188#define FMADD1	FMADD
189#define FMADD2	FMADD
190#define FMADD3	FMADD
191#define FMADD4	FNMSUB
192#else
193#if defined(LN) || defined(LT)
194#define FMADD1	FMADD
195#define FMADD2	FNMSUB
196#define FMADD3	FMADD
197#define FMADD4	FMADD
198#endif
199#if defined(RN) || defined(RT)
200#define FMADD1	FMADD
201#define FMADD2	FMADD
202#define FMADD3	FNMSUB
203#define FMADD4	FMADD
204#endif
205#endif
206
/* Kernel entry: argument pickup and one-time pointer biasing.               */
/* Tell the assembler that %g2/%g3 may be used as scratch registers.         */
207        .register %g2, #scratch
208        .register %g3, #scratch
209
/* PROLOGUE and SAVESP come from common.h; presumably they emit the entry   */
/* label and open the stack frame / register window - confirm there.        */
210	PROLOGUE
211	SAVESP
212
/* Fetch the arguments that are passed on the stack. The offsets differ     */
/* between the 32-bit double, 32-bit single and 64-bit builds. A is loaded  */
/* from the stack only in the 32-bit DOUBLE build; otherwise it is used as  */
/* it arrives in its register.                                              */
213#ifndef __64BIT__
214#ifdef DOUBLE
215	ld	[%sp + STACK_START + 32], A
216	ld	[%sp + STACK_START + 36], B
217	ld	[%sp + STACK_START + 40], C
218	ld	[%sp + STACK_START + 44], LDC
219	ld	[%sp + STACK_START + 48], OFFSET
220#else
221	ld	[%sp + STACK_START + 28], B
222	ld	[%sp + STACK_START + 32], C
223	ld	[%sp + STACK_START + 36], LDC
224	ld	[%sp + STACK_START + 40], OFFSET
225#endif
226#else
227	ldx	[%sp + STACK_START + 56], B
228	ldx	[%sp + STACK_START + 64], C
229	ldx	[%sp + STACK_START + 72], LDC
230	ldx	[%sp + STACK_START + 80], OFFSET
231#endif
232
	/* Nothing to do when M <= 0: branch to .LL999 (defined later in the  */
	/* file, presumably the common exit).                                 */
233	cmp	M, 0
234	ble,pn	%icc, .LL999
235	nop
236
	/* Scale LDC by ZBASE_SHIFT (presumably elements -> bytes). */
237	sll	LDC, ZBASE_SHIFT, LDC
238
239#ifdef LN
	/* LN walks backwards: move A past M*K elements and C past M elements; */
	/* the row loop steps both back before each use.                       */
240	smul	M, K, TEMP1
241	sll	TEMP1, ZBASE_SHIFT, TEMP1
242	add	A, TEMP1, A
243
244	sll	M, ZBASE_SHIFT, TEMP1
245	add	C, TEMP1, C
246#endif
247
248#ifdef RN
	/* RN starts with KK = -OFFSET and grows it by the panel width per panel. */
249	neg	OFFSET, KK
250#endif
251
252#ifdef RT
	/* RT walks the columns backwards: move B past N*K elements and C past */
	/* N columns, and start with KK = N - OFFSET.                          */
253	smul	N, K, TEMP1
254	sll	TEMP1, ZBASE_SHIFT, TEMP1
255	add	B, TEMP1, B
256
257	smul	N, LDC, TEMP1
258	add	C, TEMP1, C
259
260	sub	N, OFFSET, KK
261#endif
262
	/* J = N / 4: number of four-column panels; skip to .LL20 if none. */
263	sra	N, 2, J
264	cmp	J, 0
265	ble,pn	%icc, .LL20
266	nop
267	.align 4
268
/* .LL11: top of the loop over four-column panels (J iterations). */
269.LL11:
270#ifdef RT
	/* RT: step B back by one four-column panel (K * 4 elements). */
271	sll	K, ZBASE_SHIFT + 2, TEMP1
272	sub	B, TEMP1, B
273#endif
274
275#ifndef RT
	/* Forward order: C1..C4 are four consecutive columns starting at C, */
	/* and C is advanced past them for the next panel.                   */
276	mov	C,  C1
277	add	C,  LDC, C2
278	add	C2, LDC, C3
279	add	C3, LDC, C4
280	add	C4, LDC, C
281#else
	/* RT: the four columns just below C; C ends up equal to C1. */
282	sub	C,  LDC, C4
283	sub	C4, LDC, C3
284	sub	C3, LDC, C2
285	sub	C2, LDC, C1
286	sub	C2, LDC, C
287#endif
288
	/* Left-side variants restart KK for every panel. */
289#ifdef LN
290	add	M, OFFSET, KK
291#endif
292
293#ifdef LT
294	mov	OFFSET, KK
295#endif
296
	/* LN/RT recompute AO from AORIG for every row; LT/RN let AO run on. */
297#if defined(LN) || defined(RT)
298	mov	A, AORIG
299#else
300	mov	A, AO
301#endif
302
	/* I counts the M rows of this panel, one row per .LL12 iteration. */
303	mov	M, I
304	.align 4
305
/* .LL12: per-row setup for a 1 x 4 tile: position AO/BO, preload the first */
/* operands, clear the 16 accumulators and compute the unrolled trip count. */
306.LL12:
307#if defined(LT) || defined(RN)
	/* LT/RN: B panel is read from its start; AO continues where it was. */
308	mov	B, BO
309#else
310#ifdef LN
	/* LN: step AORIG back by one row of the A panel (K elements). */
311	sll	K,  ZBASE_SHIFT, TEMP1
312	sub	AORIG, TEMP1, AORIG
313#endif
314
	/* LN/RT: start both panels KK steps in (1 element per step in A, */
	/* 4 elements per step in B).                                     */
315	sll	KK, ZBASE_SHIFT + 0, TEMP1
316	sll	KK, ZBASE_SHIFT + 2, TEMP2
317
318	add	AORIG, TEMP1, AO
319	add	B,     TEMP2, BO
320#endif
321
	/* Preload operands, interleaved with clearing the accumulators via    */
	/* FCLR (macro from common.h, presumably zeroes the register).         */
	/* a5 (AO + 8) and b9 (BO + 8) are values for later k-steps that the   */
	/* pipelined loop at .LL13 expects to be in flight already.            */
322	LDF	[AO +  0 * SIZE], a1
323	FCLR	(cc01)
324	LDF	[AO +  1 * SIZE], a2
325	FCLR	(cc05)
326	LDF	[AO +  8 * SIZE], a5
327	FCLR	(cc09)
328	LDF	[BO +  0 * SIZE], b1
329	FCLR	(cc13)
330
331	LDF	[BO +  1 * SIZE], b2
332	FCLR	(cc02)
333	LDF	[BO +  2 * SIZE], b3
334	FCLR	(cc06)
335	LDF	[BO +  3 * SIZE], b4
336	FCLR	(cc10)
337	LDF	[BO +  4 * SIZE], b5
338	FCLR	(cc14)
339
340	LDF	[BO +  5 * SIZE], b6
341	FCLR	(cc03)
342	LDF	[BO +  6 * SIZE], b7
343	FCLR	(cc07)
344	LDF	[BO +  7 * SIZE], b8
345	FCLR	(cc11)
346	LDF	[BO +  8 * SIZE], b9
347	FCLR	(cc15)
348
	/* Prefetch the four destination locations in C (function code 3). */
349	prefetch [C1 + 1 * SIZE], 3
350	FCLR	(cc04)
351	prefetch [C2 + 2 * SIZE], 3
352	FCLR	(cc08)
353	prefetch [C3 + 1 * SIZE], 3
354	FCLR	(cc12)
355	prefetch [C4 + 2 * SIZE], 3
356	FCLR	(cc16)
357
	/* L = number of 8-step groups: KK / 8 for LT/RN, (K - KK) / 8 otherwise. */
358#if defined(LT) || defined(RN)
359	sra	KK, 3, L
360#else
361	sub	K, KK, L
362	sra	L,  3, L
363#endif
	/* No full group: go straight to the remainder handling at .LL15. */
364	cmp	L,  0
365	ble,pn	%icc, .LL15
366	nop
367	.align 4
368
/* .LL13: main multiply-accumulate loop, hand-unrolled and software-         */
/* pipelined. The text below contains the same sequence of 8 k-steps twice.  */
/* Each k-step issues 16 FMADDn: one two-value element from AO against four  */
/* two-value elements from BO, accumulating into cc01..cc16. Each 8-step     */
/* sequence decrements L once, advances AO by 16*SIZE and BO by 64*SIZE.     */
/* Loads for later k-steps are interleaved with the arithmetic (registers    */
/* a1/a3/a5 and b1/b9 alternate), so the statement order is significant.     */
/* The first copy exits to .LL15 when L has reached 0; the second copy       */
/* branches back to .LL13 while L > 0.                                       */
369.LL13:
370	FMADD1	(aa1, bb1, cc01, cc01)
371	FMADD2	(aa2, bb1, cc02, cc02)
372	FMADD3	(aa1, bb2, cc03, cc03)
373	FMADD4	(aa2, bb2, cc04, cc04)
374
375	FMADD1	(aa1, bb3, cc05, cc05)
376	LDF	[BO + 16 * SIZE], b1
377	FMADD2	(aa2, bb3, cc06, cc06)
378	LDF	[BO +  9 * SIZE], b2
379
380	FMADD3	(aa1, bb4, cc07, cc07)
381	LDF	[BO + 10 * SIZE], b3
382	FMADD4	(aa2, bb4, cc08, cc08)
383	LDF	[BO + 11 * SIZE], b4
384
385	FMADD1	(aa1, bb5, cc09, cc09)
386	LDF	[AO +  2 * SIZE], a3
387	FMADD2	(aa2, bb5, cc10, cc10)
388	LDF	[AO +  3 * SIZE], a4
389
390	FMADD3	(aa1, bb6, cc11, cc11)
391	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
392	FMADD4	(aa2, bb6, cc12, cc12)
393	nop
394
395	FMADD1	(aa1, bb7, cc13, cc13)
396	LDF	[BO + 12 * SIZE], b5
397	FMADD2	(aa2, bb7, cc14, cc14)
398	LDF	[BO + 13 * SIZE], b6
399
400	FMADD3	(aa1, bb8, cc15, cc15)
401	LDF	[BO + 14 * SIZE], b7
402	FMADD4	(aa2, bb8, cc16, cc16)
403	LDF	[BO + 15 * SIZE], b8
404
405	FMADD1	(aa3, bb9, cc01, cc01)
406	FMADD2	(aa4, bb9, cc02, cc02)
407	FMADD3	(aa3, bb2, cc03, cc03)
408	FMADD4	(aa4, bb2, cc04, cc04)
409
410	FMADD1	(aa3, bb3, cc05, cc05)
411	LDF	[BO + 24 * SIZE], b9
412	FMADD2	(aa4, bb3, cc06, cc06)
413	LDF	[BO + 17 * SIZE], b2
414
415	FMADD3	(aa3, bb4, cc07, cc07)
416	LDF	[BO + 18 * SIZE], b3
417	FMADD4	(aa4, bb4, cc08, cc08)
418	LDF	[BO + 19 * SIZE], b4
419
420	FMADD1	(aa3, bb5, cc09, cc09)
421	LDF	[AO +  4 * SIZE], a1
422	FMADD2	(aa4, bb5, cc10, cc10)
423	LDF	[AO +  5 * SIZE], a2
424
425	FMADD3	(aa3, bb6, cc11, cc11)
	/* One 8-step group is consumed by this copy of the sequence. */
426	add	L, -1, L
427	FMADD4	(aa4, bb6, cc12, cc12)
428	nop
429
430	FMADD1	(aa3, bb7, cc13, cc13)
431	LDF	[BO + 20 * SIZE], b5
432	FMADD2	(aa4, bb7, cc14, cc14)
433	LDF	[BO + 21 * SIZE], b6
434
435	FMADD3	(aa3, bb8, cc15, cc15)
436	LDF	[BO + 22 * SIZE], b7
437	FMADD4	(aa4, bb8, cc16, cc16)
438	LDF	[BO + 23 * SIZE], b8
439
440	FMADD1	(aa1, bb1, cc01, cc01)
441	FMADD2	(aa2, bb1, cc02, cc02)
442	FMADD3	(aa1, bb2, cc03, cc03)
443	FMADD4	(aa2, bb2, cc04, cc04)
444
445	FMADD1	(aa1, bb3, cc05, cc05)
446	LDF	[BO + 32 * SIZE], b1
447	FMADD2	(aa2, bb3, cc06, cc06)
448	LDF	[BO + 25 * SIZE], b2
449
450	FMADD3	(aa1, bb4, cc07, cc07)
451	LDF	[BO + 26 * SIZE], b3
452	FMADD4	(aa2, bb4, cc08, cc08)
453	LDF	[BO + 27 * SIZE], b4
454
455	FMADD1	(aa1, bb5, cc09, cc09)
456	LDF	[AO +  6 * SIZE], a3
457	FMADD2	(aa2, bb5, cc10, cc10)
458	LDF	[AO +  7 * SIZE], a4
459
460	FMADD3	(aa1, bb6, cc11, cc11)
461	nop
462	FMADD4	(aa2, bb6, cc12, cc12)
463	nop
464
465	FMADD1	(aa1, bb7, cc13, cc13)
466	LDF	[BO + 28 * SIZE], b5
467	FMADD2	(aa2, bb7, cc14, cc14)
468	LDF	[BO + 29 * SIZE], b6
469
470	FMADD3	(aa1, bb8, cc15, cc15)
471	LDF	[BO + 30 * SIZE], b7
472	FMADD4	(aa2, bb8, cc16, cc16)
473	LDF	[BO + 31 * SIZE], b8
474
475	FMADD1	(aa3, bb9, cc01, cc01)
476	FMADD2	(aa4, bb9, cc02, cc02)
477	FMADD3	(aa3, bb2, cc03, cc03)
478	FMADD4	(aa4, bb2, cc04, cc04)
479
480	FMADD1	(aa3, bb3, cc05, cc05)
481	LDF	[BO + 40 * SIZE], b9
482	FMADD2	(aa4, bb3, cc06, cc06)
483	LDF	[BO + 33 * SIZE], b2
484
485	FMADD3	(aa3, bb4, cc07, cc07)
486	LDF	[BO + 34 * SIZE], b3
487	FMADD4	(aa4, bb4, cc08, cc08)
488	LDF	[BO + 35 * SIZE], b4
489
490	FMADD1	(aa3, bb5, cc09, cc09)
	/* a1 is loaded 16 elements ahead: it is first consumed after AO has */
	/* been advanced, i.e. by the next 8-step sequence.                  */
491	LDF	[AO + 16 * SIZE], a1  /****/
492	FMADD2	(aa4, bb5, cc10, cc10)
493	LDF	[AO +  9 * SIZE], a2
494
495	FMADD3	(aa3, bb6, cc11, cc11)
496	nop
497	FMADD4	(aa4, bb6, cc12, cc12)
498	nop
499
500	FMADD1	(aa3, bb7, cc13, cc13)
501	LDF	[BO + 36 * SIZE], b5
502	FMADD2	(aa4, bb7, cc14, cc14)
503	LDF	[BO + 37 * SIZE], b6
504
505	FMADD3	(aa3, bb8, cc15, cc15)
506	LDF	[BO + 38 * SIZE], b7
507	FMADD4	(aa4, bb8, cc16, cc16)
508	LDF	[BO + 39 * SIZE], b8
509
	/* From here a5 (loaded from AO + 8 earlier) stands in for a1. */
510	FMADD1	(aa5, bb1, cc01, cc01)
511	FMADD2	(aa2, bb1, cc02, cc02)
512	FMADD3	(aa5, bb2, cc03, cc03)
513	FMADD4	(aa2, bb2, cc04, cc04)
514
515	FMADD1	(aa5, bb3, cc05, cc05)
516	LDF	[BO + 48 * SIZE], b1
517	FMADD2	(aa2, bb3, cc06, cc06)
518	LDF	[BO + 41 * SIZE], b2
519
520	FMADD3	(aa5, bb4, cc07, cc07)
521	LDF	[BO + 42 * SIZE], b3
522	FMADD4	(aa2, bb4, cc08, cc08)
523	LDF	[BO + 43 * SIZE], b4
524
525	FMADD1	(aa5, bb5, cc09, cc09)
526	LDF	[AO + 10 * SIZE], a3
527	FMADD2	(aa2, bb5, cc10, cc10)
528	LDF	[AO + 11 * SIZE], a4
529
530	FMADD3	(aa5, bb6, cc11, cc11)
531	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
532	FMADD4	(aa2, bb6, cc12, cc12)
533	nop
534
535	FMADD1	(aa5, bb7, cc13, cc13)
536	LDF	[BO + 44 * SIZE], b5
537	FMADD2	(aa2, bb7, cc14, cc14)
538	LDF	[BO + 45 * SIZE], b6
539
540	FMADD3	(aa5, bb8, cc15, cc15)
541	LDF	[BO + 46 * SIZE], b7
542	FMADD4	(aa2, bb8, cc16, cc16)
543	LDF	[BO + 47 * SIZE], b8
544
545	FMADD1	(aa3, bb9, cc01, cc01)
546	FMADD2	(aa4, bb9, cc02, cc02)
547	FMADD3	(aa3, bb2, cc03, cc03)
548	FMADD4	(aa4, bb2, cc04, cc04)
549
550	FMADD1	(aa3, bb3, cc05, cc05)
551	LDF	[BO + 56 * SIZE], b9
552	FMADD2	(aa4, bb3, cc06, cc06)
553	LDF	[BO + 49 * SIZE], b2
554
555	FMADD3	(aa3, bb4, cc07, cc07)
556	LDF	[BO + 50 * SIZE], b3
557	FMADD4	(aa4, bb4, cc08, cc08)
558	LDF	[BO + 51 * SIZE], b4
559
560	FMADD1	(aa3, bb5, cc09, cc09)
561	LDF	[AO + 12 * SIZE], a5
562	FMADD2	(aa4, bb5, cc10, cc10)
563	LDF	[AO + 13 * SIZE], a2
564
565	FMADD3	(aa3, bb6, cc11, cc11)
	/* Set the condition codes early; they are consumed by the ble at the */
	/* end of this sequence (nothing in between alters them).            */
566	cmp	L, 0
567	FMADD4	(aa4, bb6, cc12, cc12)
568	nop
569
570	FMADD1	(aa3, bb7, cc13, cc13)
571	LDF	[BO + 52 * SIZE], b5
572	FMADD2	(aa4, bb7, cc14, cc14)
573	LDF	[BO + 53 * SIZE], b6
574
575	FMADD3	(aa3, bb8, cc15, cc15)
576	LDF	[BO + 54 * SIZE], b7
577	FMADD4	(aa4, bb8, cc16, cc16)
578	LDF	[BO + 55 * SIZE], b8
579
580	FMADD1	(aa5, bb1, cc01, cc01)
581	FMADD2	(aa2, bb1, cc02, cc02)
582	FMADD3	(aa5, bb2, cc03, cc03)
583	FMADD4	(aa2, bb2, cc04, cc04)
584
585	FMADD1	(aa5, bb3, cc05, cc05)
586	LDF	[BO + 64 * SIZE], b1
587	FMADD2	(aa2, bb3, cc06, cc06)
588	LDF	[BO + 57 * SIZE], b2
589
590	FMADD3	(aa5, bb4, cc07, cc07)
591	LDF	[BO + 58 * SIZE], b3
592	FMADD4	(aa2, bb4, cc08, cc08)
593	LDF	[BO + 59 * SIZE], b4
594
595	FMADD1	(aa5, bb5, cc09, cc09)
596	LDF	[AO + 14 * SIZE], a3
597	FMADD2	(aa2, bb5, cc10, cc10)
598	LDF	[AO + 15 * SIZE], a4
599
600	FMADD3	(aa5, bb6, cc11, cc11)
	/* Advance both panel pointers by 8 k-steps; the loads that follow use */
	/* offsets relative to the new BO/AO.                                  */
601	add	BO, 64 * SIZE, BO
602	FMADD4	(aa2, bb6, cc12, cc12)
603	add	AO, 16 * SIZE, AO
604
605	FMADD1	(aa5, bb7, cc13, cc13)
606	LDF	[BO -  4 * SIZE], b5
607	FMADD2	(aa2, bb7, cc14, cc14)
608	LDF	[BO -  3 * SIZE], b6
609
610	FMADD3	(aa5, bb8, cc15, cc15)
611	LDF	[BO -  2 * SIZE], b7
612	FMADD4	(aa2, bb8, cc16, cc16)
613	LDF	[BO -  1 * SIZE], b8
614
615	FMADD1	(aa3, bb9, cc01, cc01)
616	FMADD2	(aa4, bb9, cc02, cc02)
617	FMADD3	(aa3, bb2, cc03, cc03)
618	FMADD4	(aa4, bb2, cc04, cc04)
619
620	FMADD1	(aa3, bb3, cc05, cc05)
621	LDF	[BO +  8 * SIZE], b9
622	FMADD2	(aa4, bb3, cc06, cc06)
623	LDF	[BO +  1 * SIZE], b2
624
625	FMADD3	(aa3, bb4, cc07, cc07)
626	LDF	[BO +  2 * SIZE], b3
627	FMADD4	(aa4, bb4, cc08, cc08)
628	LDF	[BO +  3 * SIZE], b4
629
630	FMADD1	(aa3, bb5, cc09, cc09)
631	LDF	[AO +  8 * SIZE], a5  /****/
632	FMADD2	(aa4, bb5, cc10, cc10)
633	LDF	[AO +  1 * SIZE], a2
634
635	FMADD3	(aa3, bb6, cc11, cc11)
636	FMADD4	(aa4, bb6, cc12, cc12)
637
638	FMADD1	(aa3, bb7, cc13, cc13)
639	LDF	[BO +  4 * SIZE], b5
640	FMADD2	(aa4, bb7, cc14, cc14)
641	LDF	[BO +  5 * SIZE], b6
642
643	FMADD3	(aa3, bb8, cc15, cc15)
644	LDF	[BO +  6 * SIZE], b7
645	FMADD4	(aa4, bb8, cc16, cc16)
	/* Mid-loop exit: leave when no 8-step group remains. The load of b8 */
	/* sits in the branch delay slot.                                     */
646	ble,pn	%icc, .LL15
647	LDF	[BO +  7 * SIZE], b8
648
	/* Second copy of the 8-step sequence. */
649	FMADD1	(aa1, bb1, cc01, cc01)
650	FMADD2	(aa2, bb1, cc02, cc02)
651	FMADD3	(aa1, bb2, cc03, cc03)
652	FMADD4	(aa2, bb2, cc04, cc04)
653
654	FMADD1	(aa1, bb3, cc05, cc05)
655	LDF	[BO + 16 * SIZE], b1
656	FMADD2	(aa2, bb3, cc06, cc06)
657	LDF	[BO +  9 * SIZE], b2
658
659	FMADD3	(aa1, bb4, cc07, cc07)
660	LDF	[BO + 10 * SIZE], b3
661	FMADD4	(aa2, bb4, cc08, cc08)
662	LDF	[BO + 11 * SIZE], b4
663
664	FMADD1	(aa1, bb5, cc09, cc09)
665	LDF	[AO +  2 * SIZE], a3
666	FMADD2	(aa2, bb5, cc10, cc10)
667	LDF	[AO +  3 * SIZE], a4
668
669	FMADD3	(aa1, bb6, cc11, cc11)
670	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
671	FMADD4	(aa2, bb6, cc12, cc12)
672	nop
673
674	FMADD1	(aa1, bb7, cc13, cc13)
675	LDF	[BO + 12 * SIZE], b5
676	FMADD2	(aa2, bb7, cc14, cc14)
677	LDF	[BO + 13 * SIZE], b6
678
679	FMADD3	(aa1, bb8, cc15, cc15)
680	LDF	[BO + 14 * SIZE], b7
681	FMADD4	(aa2, bb8, cc16, cc16)
682	LDF	[BO + 15 * SIZE], b8
683
684	FMADD1	(aa3, bb9, cc01, cc01)
685	FMADD2	(aa4, bb9, cc02, cc02)
686	FMADD3	(aa3, bb2, cc03, cc03)
687	FMADD4	(aa4, bb2, cc04, cc04)
688
689	FMADD1	(aa3, bb3, cc05, cc05)
690	LDF	[BO + 24 * SIZE], b9
691	FMADD2	(aa4, bb3, cc06, cc06)
692	LDF	[BO + 17 * SIZE], b2
693
694	FMADD3	(aa3, bb4, cc07, cc07)
695	LDF	[BO + 18 * SIZE], b3
696	FMADD4	(aa4, bb4, cc08, cc08)
697	LDF	[BO + 19 * SIZE], b4
698
699	FMADD1	(aa3, bb5, cc09, cc09)
700	LDF	[AO +  4 * SIZE], a1
701	FMADD2	(aa4, bb5, cc10, cc10)
702	LDF	[AO +  5 * SIZE], a2
703
704	FMADD3	(aa3, bb6, cc11, cc11)
705	add	L, -1, L
706	FMADD4	(aa4, bb6, cc12, cc12)
707	nop
708
709	FMADD1	(aa3, bb7, cc13, cc13)
710	LDF	[BO + 20 * SIZE], b5
711	FMADD2	(aa4, bb7, cc14, cc14)
712	LDF	[BO + 21 * SIZE], b6
713
714	FMADD3	(aa3, bb8, cc15, cc15)
715	LDF	[BO + 22 * SIZE], b7
716	FMADD4	(aa4, bb8, cc16, cc16)
717	LDF	[BO + 23 * SIZE], b8
718
719	FMADD1	(aa1, bb1, cc01, cc01)
720	FMADD2	(aa2, bb1, cc02, cc02)
721	FMADD3	(aa1, bb2, cc03, cc03)
722	FMADD4	(aa2, bb2, cc04, cc04)
723
724	FMADD1	(aa1, bb3, cc05, cc05)
725	LDF	[BO + 32 * SIZE], b1
726	FMADD2	(aa2, bb3, cc06, cc06)
727	LDF	[BO + 25 * SIZE], b2
728
729	FMADD3	(aa1, bb4, cc07, cc07)
730	LDF	[BO + 26 * SIZE], b3
731	FMADD4	(aa2, bb4, cc08, cc08)
732	LDF	[BO + 27 * SIZE], b4
733
734	FMADD1	(aa1, bb5, cc09, cc09)
735	LDF	[AO +  6 * SIZE], a3
736	FMADD2	(aa2, bb5, cc10, cc10)
737	LDF	[AO +  7 * SIZE], a4
738
739	FMADD3	(aa1, bb6, cc11, cc11)
740	nop
741	FMADD4	(aa2, bb6, cc12, cc12)
742	nop
743
744	FMADD1	(aa1, bb7, cc13, cc13)
745	LDF	[BO + 28 * SIZE], b5
746	FMADD2	(aa2, bb7, cc14, cc14)
747	LDF	[BO + 29 * SIZE], b6
748
749	FMADD3	(aa1, bb8, cc15, cc15)
750	LDF	[BO + 30 * SIZE], b7
751	FMADD4	(aa2, bb8, cc16, cc16)
752	LDF	[BO + 31 * SIZE], b8
753
754	FMADD1	(aa3, bb9, cc01, cc01)
755	FMADD2	(aa4, bb9, cc02, cc02)
756	FMADD3	(aa3, bb2, cc03, cc03)
757	FMADD4	(aa4, bb2, cc04, cc04)
758
759	FMADD1	(aa3, bb3, cc05, cc05)
760	LDF	[BO + 40 * SIZE], b9
761	FMADD2	(aa4, bb3, cc06, cc06)
762	LDF	[BO + 33 * SIZE], b2
763
764	FMADD3	(aa3, bb4, cc07, cc07)
765	LDF	[BO + 34 * SIZE], b3
766	FMADD4	(aa4, bb4, cc08, cc08)
767	LDF	[BO + 35 * SIZE], b4
768
769	FMADD1	(aa3, bb5, cc09, cc09)
770	LDF	[AO + 16 * SIZE], a1  /****/
771	FMADD2	(aa4, bb5, cc10, cc10)
772	LDF	[AO +  9 * SIZE], a2
773
774	FMADD3	(aa3, bb6, cc11, cc11)
775	nop
776	FMADD4	(aa4, bb6, cc12, cc12)
777	nop
778
779	FMADD1	(aa3, bb7, cc13, cc13)
780	LDF	[BO + 36 * SIZE], b5
781	FMADD2	(aa4, bb7, cc14, cc14)
782	LDF	[BO + 37 * SIZE], b6
783
784	FMADD3	(aa3, bb8, cc15, cc15)
785	LDF	[BO + 38 * SIZE], b7
786	FMADD4	(aa4, bb8, cc16, cc16)
787	LDF	[BO + 39 * SIZE], b8
788
789	FMADD1	(aa5, bb1, cc01, cc01)
790	FMADD2	(aa2, bb1, cc02, cc02)
791	FMADD3	(aa5, bb2, cc03, cc03)
792	FMADD4	(aa2, bb2, cc04, cc04)
793
794	FMADD1	(aa5, bb3, cc05, cc05)
795	LDF	[BO + 48 * SIZE], b1
796	FMADD2	(aa2, bb3, cc06, cc06)
797	LDF	[BO + 41 * SIZE], b2
798
799	FMADD3	(aa5, bb4, cc07, cc07)
800	LDF	[BO + 42 * SIZE], b3
801	FMADD4	(aa2, bb4, cc08, cc08)
802	LDF	[BO + 43 * SIZE], b4
803
804	FMADD1	(aa5, bb5, cc09, cc09)
805	LDF	[AO + 10 * SIZE], a3
806	FMADD2	(aa2, bb5, cc10, cc10)
807	LDF	[AO + 11 * SIZE], a4
808
809	FMADD3	(aa5, bb6, cc11, cc11)
810	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
811	FMADD4	(aa2, bb6, cc12, cc12)
812	nop
813
814	FMADD1	(aa5, bb7, cc13, cc13)
815	LDF	[BO + 44 * SIZE], b5
816	FMADD2	(aa2, bb7, cc14, cc14)
817	LDF	[BO + 45 * SIZE], b6
818
819	FMADD3	(aa5, bb8, cc15, cc15)
820	LDF	[BO + 46 * SIZE], b7
821	FMADD4	(aa2, bb8, cc16, cc16)
822	LDF	[BO + 47 * SIZE], b8
823
824	FMADD1	(aa3, bb9, cc01, cc01)
825	FMADD2	(aa4, bb9, cc02, cc02)
826	FMADD3	(aa3, bb2, cc03, cc03)
827	FMADD4	(aa4, bb2, cc04, cc04)
828
829	FMADD1	(aa3, bb3, cc05, cc05)
830	LDF	[BO + 56 * SIZE], b9
831	FMADD2	(aa4, bb3, cc06, cc06)
832	LDF	[BO + 49 * SIZE], b2
833
834	FMADD3	(aa3, bb4, cc07, cc07)
835	LDF	[BO + 50 * SIZE], b3
836	FMADD4	(aa4, bb4, cc08, cc08)
837	LDF	[BO + 51 * SIZE], b4
838
839	FMADD1	(aa3, bb5, cc09, cc09)
840	LDF	[AO + 12 * SIZE], a5
841	FMADD2	(aa4, bb5, cc10, cc10)
842	LDF	[AO + 13 * SIZE], a2
843
844	FMADD3	(aa3, bb6, cc11, cc11)
845	cmp	L, 0
846	FMADD4	(aa4, bb6, cc12, cc12)
847	nop
848
849	FMADD1	(aa3, bb7, cc13, cc13)
850	LDF	[BO + 52 * SIZE], b5
851	FMADD2	(aa4, bb7, cc14, cc14)
852	LDF	[BO + 53 * SIZE], b6
853
854	FMADD3	(aa3, bb8, cc15, cc15)
855	LDF	[BO + 54 * SIZE], b7
856	FMADD4	(aa4, bb8, cc16, cc16)
857	LDF	[BO + 55 * SIZE], b8
858
859	FMADD1	(aa5, bb1, cc01, cc01)
860	FMADD2	(aa2, bb1, cc02, cc02)
861	FMADD3	(aa5, bb2, cc03, cc03)
862	FMADD4	(aa2, bb2, cc04, cc04)
863
864	FMADD1	(aa5, bb3, cc05, cc05)
865	LDF	[BO + 64 * SIZE], b1
866	FMADD2	(aa2, bb3, cc06, cc06)
867	LDF	[BO + 57 * SIZE], b2
868
869	FMADD3	(aa5, bb4, cc07, cc07)
870	LDF	[BO + 58 * SIZE], b3
871	FMADD4	(aa2, bb4, cc08, cc08)
872	LDF	[BO + 59 * SIZE], b4
873
874	FMADD1	(aa5, bb5, cc09, cc09)
875	LDF	[AO + 14 * SIZE], a3
876	FMADD2	(aa2, bb5, cc10, cc10)
877	LDF	[AO + 15 * SIZE], a4
878
879	FMADD3	(aa5, bb6, cc11, cc11)
880	add	BO, 64 * SIZE, BO
881	FMADD4	(aa2, bb6, cc12, cc12)
882	add	AO, 16 * SIZE, AO
883
884	FMADD1	(aa5, bb7, cc13, cc13)
885	LDF	[BO -  4 * SIZE], b5
886	FMADD2	(aa2, bb7, cc14, cc14)
887	LDF	[BO -  3 * SIZE], b6
888
889	FMADD3	(aa5, bb8, cc15, cc15)
890	LDF	[BO -  2 * SIZE], b7
891	FMADD4	(aa2, bb8, cc16, cc16)
892	LDF	[BO -  1 * SIZE], b8
893
894	FMADD1	(aa3, bb9, cc01, cc01)
895	FMADD2	(aa4, bb9, cc02, cc02)
896	FMADD3	(aa3, bb2, cc03, cc03)
897	FMADD4	(aa4, bb2, cc04, cc04)
898
899	FMADD1	(aa3, bb3, cc05, cc05)
900	LDF	[BO +  8 * SIZE], b9
901	FMADD2	(aa4, bb3, cc06, cc06)
902	LDF	[BO +  1 * SIZE], b2
903
904	FMADD3	(aa3, bb4, cc07, cc07)
905	LDF	[BO +  2 * SIZE], b3
906	FMADD4	(aa4, bb4, cc08, cc08)
907	LDF	[BO +  3 * SIZE], b4
908
909	FMADD1	(aa3, bb5, cc09, cc09)
910	LDF	[AO +  8 * SIZE], a5  /****/
911	FMADD2	(aa4, bb5, cc10, cc10)
912	LDF	[AO +  1 * SIZE], a2
913
914	FMADD3	(aa3, bb6, cc11, cc11)
915	FMADD4	(aa4, bb6, cc12, cc12)
916
917	FMADD1	(aa3, bb7, cc13, cc13)
918	LDF	[BO +  4 * SIZE], b5
919	FMADD2	(aa4, bb7, cc14, cc14)
920	LDF	[BO +  5 * SIZE], b6
921
922	FMADD3	(aa3, bb8, cc15, cc15)
923	LDF	[BO +  6 * SIZE], b7
924	FMADD4	(aa4, bb8, cc16, cc16)
	/* Loop back while 8-step groups remain; b8 is loaded in the delay slot. */
925	bg,pt	%icc, .LL13
926	LDF	[BO +  7 * SIZE], b8
927	.align 4
928
/* .LL15: compute the number of k-steps left over after the 8-step groups   */
/* (KK mod 8 for LT/RN, (K - KK) mod 8 otherwise) and skip the remainder     */
/* loop when there are none.                                                 */
929.LL15:
930#if defined(LT) || defined(RN)
931	and	KK, 7, L
932#else
933	sub	K, KK, L
934	and	L,  7, L
935#endif
936	cmp	L,  0
	/* Annulled branch; the delay slot only holds a nop. */
937	ble,a,pn %icc, .LL18
938	nop
939	.align 4
940
/* .LL17: remainder loop, one k-step per iteration. Performs the same 16     */
/* FMADDn as a k-step of .LL13 using a1/a2 and b1..b8, reloads those         */
/* registers for the next step, and advances AO by 2*SIZE and BO by 8*SIZE.  */
941.LL17:
942	FMADD1	(aa1, bb1, cc01, cc01)
943	add	L, -1, L
944	FMADD2	(aa2, bb1, cc02, cc02)
945	nop
946
947	FMADD3	(aa1, bb2, cc03, cc03)
948	LDF	[BO +  8 * SIZE], b1
949	FMADD4	(aa2, bb2, cc04, cc04)
950	LDF	[BO +  9 * SIZE], b2
951
952	FMADD1	(aa1, bb3, cc05, cc05)
	/* Condition codes for the loop branch at the bottom. */
953	cmp	L, 0
954	FMADD2	(aa2, bb3, cc06, cc06)
955	nop
956
957	FMADD3	(aa1, bb4, cc07, cc07)
958	LDF	[BO + 10 * SIZE], b3
959	FMADD4	(aa2, bb4, cc08, cc08)
960	LDF	[BO + 11 * SIZE], b4
961
962	FMADD1	(aa1, bb5, cc09, cc09)
963	nop
964	FMADD2	(aa2, bb5, cc10, cc10)
965	nop
966
967	FMADD3	(aa1, bb6, cc11, cc11)
968	LDF	[BO + 12 * SIZE], b5
969	FMADD4	(aa2, bb6, cc12, cc12)
970	LDF	[BO + 13 * SIZE], b6
971
972	FMADD1	(aa1, bb7, cc13, cc13)
	/* Pointers move here; the loads below are relative to the new values. */
973	add	AO, 2 * SIZE, AO
974	FMADD2	(aa2, bb7, cc14, cc14)
975	add	BO, 8 * SIZE, BO
976
977	FMADD3	(aa1, bb8, cc15, cc15)
978	LDF	[AO +  0 * SIZE], a1
979	FMADD4	(aa2, bb8, cc16, cc16)
980	LDF	[AO +  1 * SIZE], a2
981
982	LDF	[BO +  6 * SIZE], b7
	/* Repeat while L > 0; b8 is loaded in the branch delay slot. */
983	bg,pt	%icc, .LL17
984	LDF	[BO +  7 * SIZE], b8
985	nop
986	.align 4
987
/* .LL18: finish the product for this 1 x 4 tile and form the right-hand     */
/* side of the triangular solve.                                             */
988.LL18:
	/* Merge the four partial sums of each column into two values:      */
	/* (1st + 4th) and (2nd + 3rd) accumulator of every group of four.  */
989	FADD	c01, c04, c01
990	FADD	c02, c03, c02
991	FADD	c05, c08, c05
992	FADD	c06, c07, c06
993
994	FADD	c09, c12, c09
995	FADD	c10, c11, c10
996	FADD	c13, c16, c13
997	FADD	c14, c15, c14
998
999#if defined(LN) || defined(RT)
	/* LN/RT: point AO/BO at the tile being solved, which sits just      */
	/* before position KK: 1 step back for LN, 4 steps back for RT.      */
1000#ifdef LN
1001	sub	KK, 1, TEMP1
1002#else
1003	sub	KK, 4, TEMP1
1004#endif
1005	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1006	sll	TEMP1, ZBASE_SHIFT + 2, TEMP1
1007
1008	add	AORIG, TEMP2, AO
1009	add	B,     TEMP1, BO
1010#endif
1011
	/* Load the 8 current values of the tile from the packed buffer:    */
	/* from BO for the left-side variants, from AO for the right-side.  */
1012#if defined(LN) || defined(LT)
1013	LDF	[BO +  0 * SIZE], a1
1014	LDF	[BO +  1 * SIZE], a2
1015	LDF	[BO +  2 * SIZE], a3
1016	LDF	[BO +  3 * SIZE], a4
1017
1018	LDF	[BO +  4 * SIZE], b1
1019	LDF	[BO +  5 * SIZE], b2
1020	LDF	[BO +  6 * SIZE], b3
1021	LDF	[BO +  7 * SIZE], b4
1022#else
1023	LDF	[AO +  0 * SIZE], a1
1024	LDF	[AO +  1 * SIZE], a2
1025	LDF	[AO +  2 * SIZE], a3
1026	LDF	[AO +  3 * SIZE], a4
1027
1028	LDF	[AO +  4 * SIZE], b1
1029	LDF	[AO +  5 * SIZE], b2
1030	LDF	[AO +  6 * SIZE], b3
1031	LDF	[AO +  7 * SIZE], b4
1032#endif
1033
	/* Subtract the accumulated product from the loaded values (FSUB is  */
	/* defined in common.h; assumed to compute dest = first - second).   */
1034	FSUB	a1, c01, c01
1035	FSUB	a2, c02, c02
1036	FSUB	a3, c05, c05
1037	FSUB	a4, c06, c06
1038
1039	FSUB	b1, c09, c09
1040	FSUB	b2, c10, c10
1041	FSUB	b3, c13, c13
1042	FSUB	b4, c14, c14
1043
/* Left-side solve (LN/LT): the tile has one row, so each of the four        */
/* two-value results is multiplied by the single two-value element at AO.    */
/* Presumably that element is the pre-inverted diagonal entry of A - confirm */
/* against the packing routine.                                              */
1044#if defined(LN) || defined(LT)
1045	LDF	[AO +  0 * SIZE], a1
1046	LDF	[AO +  1 * SIZE], a2
1047
	/* First halves of the products: a1 * first value, a2 * first value. */
1048	FMUL	a1, c01, b1
1049	FMUL	a2, c01, b2
1050	FMUL	a1, c05, b3
1051	FMUL	a2, c05, b4
1052	FMUL	a1, c09, b5
1053	FMUL	a2, c09, b6
1054	FMUL	a1, c13, b7
1055	FMUL	a2, c13, b8
1056
	/* Second halves, folded in with fused operations. The CONJ build     */
	/* flips the sign of the a2 terms relative to the non-CONJ build.     */
1057#ifndef CONJ
1058	FNMSUB	(aa2, cc02, bb1, cc01)
1059	FMADD	(aa1, cc02, bb2, cc02)
1060	FNMSUB	(aa2, cc06, bb3, cc05)
1061	FMADD	(aa1, cc06, bb4, cc06)
1062	FNMSUB	(aa2, cc10, bb5, cc09)
1063	FMADD	(aa1, cc10, bb6, cc10)
1064	FNMSUB	(aa2, cc14, bb7, cc13)
1065	FMADD	(aa1, cc14, bb8, cc14)
1066#else
1067	FMADD	(aa2, cc02, bb1, cc01)
1068	FMSUB	(aa1, cc02, bb2, cc02)
1069	FMADD	(aa2, cc06, bb3, cc05)
1070	FMSUB	(aa1, cc06, bb4, cc06)
1071	FMADD	(aa2, cc10, bb5, cc09)
1072	FMSUB	(aa1, cc10, bb6, cc10)
1073	FMADD	(aa2, cc14, bb7, cc13)
1074	FMSUB	(aa1, cc14, bb8, cc14)
1075#endif
1076#endif
1077
/* Right-side solve, forward order (RN). The 4 x 4 block at BO is read with  */
/* a row stride of 8*SIZE; diagonal elements sit at offsets 0, 10, 20, 30.   */
/* For each column in turn: multiply its result by the diagonal element,     */
/* then eliminate it from the columns to its right using the rest of the     */
/* row. Presumably the diagonal entries are stored pre-inverted - confirm    */
/* against the packing routine.                                              */
1078#ifdef RN
	/* Row 0: diagonal (b1,b2) and the three off-diagonal elements. */
1079	LDF	[BO +  0 * SIZE], b1
1080	LDF	[BO +  1 * SIZE], b2
1081	LDF	[BO +  2 * SIZE], b3
1082	LDF	[BO +  3 * SIZE], b4
1083	LDF	[BO +  4 * SIZE], b5
1084	LDF	[BO +  5 * SIZE], b6
1085	LDF	[BO +  6 * SIZE], b7
1086	LDF	[BO +  7 * SIZE], b8
1087
	/* Column 1 result (c01,c02) times the diagonal element. */
1088	FMUL	b1, c01, a1
1089	FMUL	b2, c01, a2
1090
1091#ifndef CONJ
1092	FNMSUB	(bb2, cc02, aa1, cc01)
1093	FMADD	(bb1, cc02, aa2, cc02)
1094#else
1095	FMADD	(bb2, cc02, aa1, cc01)
1096	FMSUB	(bb1, cc02, aa2, cc02)
1097#endif
1098
	/* Eliminate column 1 from columns 2..4: first the terms with the    */
	/* first value of each B element (same sign in both builds) ...      */
1099	FNMSUB	(bb3, cc01, cc05, cc05)
1100	FNMSUB	(bb3, cc02, cc06, cc06)
1101	FNMSUB	(bb5, cc01, cc09, cc09)
1102	FNMSUB	(bb5, cc02, cc10, cc10)
1103	FNMSUB	(bb7, cc01, cc13, cc13)
1104	FNMSUB	(bb7, cc02, cc14, cc14)
1105
	/* ... then the terms with the second value, whose signs depend on CONJ. */
1106#ifndef CONJ
1107	FMADD	(bb4, cc02, cc05, cc05)
1108	FNMSUB	(bb4, cc01, cc06, cc06)
1109	FMADD	(bb6, cc02, cc09, cc09)
1110	FNMSUB	(bb6, cc01, cc10, cc10)
1111	FMADD	(bb8, cc02, cc13, cc13)
1112	FNMSUB	(bb8, cc01, cc14, cc14)
1113#else
1114	FNMSUB	(bb4, cc02, cc05, cc05)
1115	FMADD	(bb4, cc01, cc06, cc06)
1116	FNMSUB	(bb6, cc02, cc09, cc09)
1117	FMADD	(bb6, cc01, cc10, cc10)
1118	FNMSUB	(bb8, cc02, cc13, cc13)
1119	FMADD	(bb8, cc01, cc14, cc14)
1120#endif
1121
	/* Row 1: diagonal at offset 10 and two off-diagonal elements. */
1122	LDF	[BO + 10 * SIZE], b1
1123	LDF	[BO + 11 * SIZE], b2
1124	LDF	[BO + 12 * SIZE], b3
1125	LDF	[BO + 13 * SIZE], b4
1126	LDF	[BO + 14 * SIZE], b5
1127	LDF	[BO + 15 * SIZE], b6
1128
	/* Column 2 result (c05,c06) times the diagonal element. */
1129	FMUL	b1, c05, a1
1130	FMUL	b2, c05, a2
1131
1132#ifndef CONJ
1133	FNMSUB	(bb2, cc06, aa1, cc05)
1134	FMADD	(bb1, cc06, aa2, cc06)
1135#else
1136	FMADD	(bb2, cc06, aa1, cc05)
1137	FMSUB	(bb1, cc06, aa2, cc06)
1138#endif
1139
	/* Eliminate column 2 from columns 3..4. */
1140	FNMSUB	(bb3, cc05, cc09, cc09)
1141	FNMSUB	(bb3, cc06, cc10, cc10)
1142	FNMSUB	(bb5, cc05, cc13, cc13)
1143	FNMSUB	(bb5, cc06, cc14, cc14)
1144
1145#ifndef CONJ
1146	FMADD	(bb4, cc06, cc09, cc09)
1147	FNMSUB	(bb4, cc05, cc10, cc10)
1148	FMADD	(bb6, cc06, cc13, cc13)
1149	FNMSUB	(bb6, cc05, cc14, cc14)
1150#else
1151	FNMSUB	(bb4, cc06, cc09, cc09)
1152	FMADD	(bb4, cc05, cc10, cc10)
1153	FNMSUB	(bb6, cc06, cc13, cc13)
1154	FMADD	(bb6, cc05, cc14, cc14)
1155#endif
1156
	/* Row 2: diagonal at offset 20 and one off-diagonal element. */
1157	LDF	[BO + 20 * SIZE], b1
1158	LDF	[BO + 21 * SIZE], b2
1159	LDF	[BO + 22 * SIZE], b3
1160	LDF	[BO + 23 * SIZE], b4
1161
	/* Column 3 result (c09,c10) times the diagonal element. */
1162	FMUL	b1, c09, a1
1163	FMUL	b2, c09, a2
1164
1165#ifndef CONJ
1166	FNMSUB	(bb2, cc10, aa1, cc09)
1167	FMADD	(bb1, cc10, aa2, cc10)
1168#else
1169	FMADD	(bb2, cc10, aa1, cc09)
1170	FMSUB	(bb1, cc10, aa2, cc10)
1171#endif
1172
	/* Eliminate column 3 from column 4. */
1173	FNMSUB	(bb3, cc09, cc13, cc13)
1174	FNMSUB	(bb3, cc10, cc14, cc14)
1175
1176#ifndef CONJ
1177	FMADD	(bb4, cc10, cc13, cc13)
1178	FNMSUB	(bb4, cc09, cc14, cc14)
1179#else
1180	FNMSUB	(bb4, cc10, cc13, cc13)
1181	FMADD	(bb4, cc09, cc14, cc14)
1182#endif
1183
	/* Row 3: only the diagonal element at offset 30. */
1184	LDF	[BO + 30 * SIZE], b1
1185	LDF	[BO + 31 * SIZE], b2
1186
	/* Column 4 result (c13,c14) times the diagonal element. */
1187	FMUL	b1, c13, a1
1188	FMUL	b2, c13, a2
1189
1190#ifndef CONJ
1191	FNMSUB	(bb2, cc14, aa1, cc13)
1192	FMADD	(bb1, cc14, aa2, cc14)
1193#else
1194	FMADD	(bb2, cc14, aa1, cc13)
1195	FMSUB	(bb1, cc14, aa2, cc14)
1196#endif
1197#endif
1198
/* Right-side solve, backward order (RT). Mirror image of the RN case: the   */
/* 4 x 4 block at BO is walked from the last row (diagonal at offset 30) up  */
/* to the first (diagonal at offset 0), and each solved column is eliminated */
/* from the columns to its left. Presumably the diagonal entries are stored  */
/* pre-inverted - confirm against the packing routine.                       */
1199#ifdef RT
	/* Row 3: diagonal (b1,b2) at offset 30, then the elements to its left. */
1200	LDF	[BO + 30 * SIZE], b1
1201	LDF	[BO + 31 * SIZE], b2
1202	LDF	[BO + 28 * SIZE], b3
1203	LDF	[BO + 29 * SIZE], b4
1204	LDF	[BO + 26 * SIZE], b5
1205	LDF	[BO + 27 * SIZE], b6
1206	LDF	[BO + 24 * SIZE], b7
1207	LDF	[BO + 25 * SIZE], b8
1208
	/* Column 4 result (c13,c14) times the diagonal element. */
1209	FMUL	b1, c13, a1
1210	FMUL	b2, c13, a2
1211
1212#ifndef CONJ
1213	FNMSUB	(bb2, cc14, aa1, cc13)
1214	FMADD	(bb1, cc14, aa2, cc14)
1215#else
1216	FMADD	(bb2, cc14, aa1, cc13)
1217	FMSUB	(bb1, cc14, aa2, cc14)
1218#endif
1219
	/* Eliminate column 4 from columns 3, 2 and 1. */
1220	FNMSUB	(bb3, cc13, cc09, cc09)
1221	FNMSUB	(bb3, cc14, cc10, cc10)
1222	FNMSUB	(bb5, cc13, cc05, cc05)
1223	FNMSUB	(bb5, cc14, cc06, cc06)
1224	FNMSUB	(bb7, cc13, cc01, cc01)
1225	FNMSUB	(bb7, cc14, cc02, cc02)
1226
1227#ifndef CONJ
1228	FMADD	(bb4, cc14, cc09, cc09)
1229	FNMSUB	(bb4, cc13, cc10, cc10)
1230	FMADD	(bb6, cc14, cc05, cc05)
1231	FNMSUB	(bb6, cc13, cc06, cc06)
1232	FMADD	(bb8, cc14, cc01, cc01)
1233	FNMSUB	(bb8, cc13, cc02, cc02)
1234#else
1235	FNMSUB	(bb4, cc14, cc09, cc09)
1236	FMADD	(bb4, cc13, cc10, cc10)
1237	FNMSUB	(bb6, cc14, cc05, cc05)
1238	FMADD	(bb6, cc13, cc06, cc06)
1239	FNMSUB	(bb8, cc14, cc01, cc01)
1240	FMADD	(bb8, cc13, cc02, cc02)
1241#endif
1242
	/* Row 2: diagonal at offset 20, then the elements to its left. */
1243	LDF	[BO + 20 * SIZE], b1
1244	LDF	[BO + 21 * SIZE], b2
1245	LDF	[BO + 18 * SIZE], b3
1246	LDF	[BO + 19 * SIZE], b4
1247	LDF	[BO + 16 * SIZE], b5
1248	LDF	[BO + 17 * SIZE], b6
1249
	/* Column 3 result (c09,c10) times the diagonal element. */
1250	FMUL	b1, c09, a1
1251	FMUL	b2, c09, a2
1252
1253#ifndef CONJ
1254	FNMSUB	(bb2, cc10, aa1, cc09)
1255	FMADD	(bb1, cc10, aa2, cc10)
1256#else
1257	FMADD	(bb2, cc10, aa1, cc09)
1258	FMSUB	(bb1, cc10, aa2, cc10)
1259#endif
1260
	/* Eliminate column 3 from columns 2 and 1. */
1261	FNMSUB	(bb3, cc09, cc05, cc05)
1262	FNMSUB	(bb3, cc10, cc06, cc06)
1263	FNMSUB	(bb5, cc09, cc01, cc01)
1264	FNMSUB	(bb5, cc10, cc02, cc02)
1265
1266#ifndef CONJ
1267	FMADD	(bb4, cc10, cc05, cc05)
1268	FNMSUB	(bb4, cc09, cc06, cc06)
1269	FMADD	(bb6, cc10, cc01, cc01)
1270	FNMSUB	(bb6, cc09, cc02, cc02)
1271#else
1272	FNMSUB	(bb4, cc10, cc05, cc05)
1273	FMADD	(bb4, cc09, cc06, cc06)
1274	FNMSUB	(bb6, cc10, cc01, cc01)
1275	FMADD	(bb6, cc09, cc02, cc02)
1276#endif
1277
	/* Row 1: diagonal at offset 10 and the element to its left. */
1278	LDF	[BO + 10 * SIZE], b1
1279	LDF	[BO + 11 * SIZE], b2
1280	LDF	[BO +  8 * SIZE], b3
1281	LDF	[BO +  9 * SIZE], b4
1282
	/* Column 2 result (c05,c06) times the diagonal element. */
1283	FMUL	b1, c05, a1
1284	FMUL	b2, c05, a2
1285
1286#ifndef CONJ
1287	FNMSUB	(bb2, cc06, aa1, cc05)
1288	FMADD	(bb1, cc06, aa2, cc06)
1289#else
1290	FMADD	(bb2, cc06, aa1, cc05)
1291	FMSUB	(bb1, cc06, aa2, cc06)
1292#endif
1293
	/* Eliminate column 2 from column 1. */
1294	FNMSUB	(bb3, cc05, cc01, cc01)
1295	FNMSUB	(bb3, cc06, cc02, cc02)
1296
1297#ifndef CONJ
1298	FMADD	(bb4, cc06, cc01, cc01)
1299	FNMSUB	(bb4, cc05, cc02, cc02)
1300#else
1301	FNMSUB	(bb4, cc06, cc01, cc01)
1302	FMADD	(bb4, cc05, cc02, cc02)
1303#endif
1304
	/* Row 0: only the diagonal element at offset 0. */
1305	LDF	[BO +  0 * SIZE], b1
1306	LDF	[BO +  1 * SIZE], b2
1307
	/* Column 1 result (c01,c02) times the diagonal element. */
1308	FMUL	b1, c01, a1
1309	FMUL	b2, c01, a2
1310
1311#ifndef CONJ
1312	FNMSUB	(bb2, cc02, aa1, cc01)
1313	FMADD	(bb1, cc02, aa2, cc02)
1314#else
1315	FMADD	(bb2, cc02, aa1, cc01)
1316	FMSUB	(bb1, cc02, aa2, cc02)
1317#endif
1318#endif
1319
/* Write the solved 1 x 4 tile back, advance the pointers and close the row  */
/* loop (.LL12) and the four-column panel loop (.LL11).                      */
1320#ifdef LN
	/* LN moves up through C: step the column pointers back one element  */
	/* (2*SIZE) before storing.                                          */
1321	add	C1, -2 * SIZE, C1
1322	add	C2, -2 * SIZE, C2
1323	add	C3, -2 * SIZE, C3
1324	add	C4, -2 * SIZE, C4
1325#endif
1326
	/* Store the results into the packed buffer they were loaded from     */
	/* (BO for the left-side variants, AO for the right-side variants).   */
1327#if defined(LN) || defined(LT)
1328	STF	c01, [BO +  0 * SIZE]
1329	STF	c02, [BO +  1 * SIZE]
1330	STF	c05, [BO +  2 * SIZE]
1331	STF	c06, [BO +  3 * SIZE]
1332
1333	STF	c09, [BO +  4 * SIZE]
1334	STF	c10, [BO +  5 * SIZE]
1335	STF	c13, [BO +  6 * SIZE]
1336	STF	c14, [BO +  7 * SIZE]
1337#else
1338	STF	c01, [AO +  0 * SIZE]
1339	STF	c02, [AO +  1 * SIZE]
1340	STF	c05, [AO +  2 * SIZE]
1341	STF	c06, [AO +  3 * SIZE]
1342
1343	STF	c09, [AO +  4 * SIZE]
1344	STF	c10, [AO +  5 * SIZE]
1345	STF	c13, [AO +  6 * SIZE]
1346	STF	c14, [AO +  7 * SIZE]
1347#endif
1348
	/* Store the same results into the four columns of C. */
1349	STF	c01, [C1 + 0 * SIZE]
1350	STF	c02, [C1 + 1 * SIZE]
1351	STF	c05, [C2 + 0 * SIZE]
1352	STF	c06, [C2 + 1 * SIZE]
1353
1354	STF	c09, [C3 + 0 * SIZE]
1355	STF	c10, [C3 + 1 * SIZE]
1356	STF	c13, [C4 + 0 * SIZE]
1357	STF	c14, [C4 + 1 * SIZE]
1358
	/* All variants except LN move down through C after the store. */
1359#ifndef LN
1360	add	C1, 2 * SIZE, C1
1361	add	C2, 2 * SIZE, C2
1362	add	C3, 2 * SIZE, C3
1363	add	C4, 2 * SIZE, C4
1364#endif
1365
1366#ifdef RT
	/* RT: next row of the A panel (K elements further on). */
1367	sll	K, ZBASE_SHIFT, TEMP1
1368	add	AORIG, TEMP1, AORIG
1369#endif
1370
1371#if defined(LT) || defined(RN)
	/* LT/RN: skip the K - KK steps of both panels that were not used    */
	/* by this tile (1 element per step in A, 4 per step in B).          */
1372	sub	K, KK, TEMP1
1373	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1374	sll	TEMP1, ZBASE_SHIFT + 2, TEMP1
1375	add	AO, TEMP2, AO
1376	add	BO, TEMP1, BO
1377#endif
1378
	/* Left-side variants move KK by one row per tile. */
1379#ifdef LT
1380	add	KK, 1, KK
1381#endif
1382
1383#ifdef LN
1384	sub	KK, 1, KK
1385#endif
1386
	/* Next row of the panel. */
1387	add	I, -1, I
1388	cmp	I, 0
1389	bg,pt	%icc, .LL12
1390	nop
1391
	/* Panel finished: move B to the next four-column panel. */
1392#ifdef LN
1393	sll	K, ZBASE_SHIFT + 2, TEMP1
1394	add	B, TEMP1, B
1395#endif
1396
1397#if defined(LT) || defined(RN)
1398	mov	BO, B
1399#endif
1400
	/* Right-side variants move KK by the panel width (4) per panel. */
1401#ifdef RN
1402	add	KK, 4, KK
1403#endif
1404
1405#ifdef RT
1406	sub	KK, 4, KK
1407#endif
1408
	/* Next four-column panel. */
1409	add	J, -1, J
1410	cmp	J, 0
1411	bg,pt	%icc, .LL11
1412	nop
1413	.align 4
1414
/* Two-column pass: entered only when bit 1 of N is set (N & 2 != 0),     */
/* otherwise control goes straight to .LL30.  This part positions B and   */
/* the C column pointers, seeds KK and the A cursor, and sets I = M for   */
/* the row loop at .LL22.  LN/LT/RN/RT select the build variant.          */
1415.LL20:
1416	and	N, 2, J
1417	cmp	J, 0
1418	ble,pn	%icc, .LL30
1419	nop
1420
/* RT: move B back by K << (ZBASE_SHIFT + 1) before it is used. */
1421#ifdef RT
1422	sll	K, ZBASE_SHIFT + 1, TEMP1
1423	sub	B, TEMP1, B
1424#endif
1425
/* Non-RT: C1 = C, C2 = C + LDC, then C advances by 2 * LDC.            */
/* RT:     C2 = C - LDC, C1 = C - 2 * LDC, and C is left equal to C1.   */
1426#ifndef RT
1427	mov	C,  C1
1428	add	C,  LDC, C2
1429	add	C2, LDC, C
1430#else
1431	sub	C,  LDC, C2
1432	sub	C2, LDC, C1
1433	sub	C2, LDC, C
1434#endif
1435
/* KK start value: M + OFFSET for LN, OFFSET for LT (RN/RT keep the current KK). */
1436#ifdef LN
1437	add	M, OFFSET, KK
1438#endif
1439
1440#ifdef LT
1441	mov	OFFSET, KK
1442#endif
1443
/* LN/RT track the A position in AORIG and derive AO from it per row; */
/* LT/RN walk AO directly.                                            */
1444#if defined(LN) || defined(RT)
1445	mov	A, AORIG
1446#else
1447	mov	A, AO
1448#endif
1449
/* Row counter for the loop at .LL22. */
1450	mov	M, I
1451	.align 4
1452
/* Head of the row loop for the two-column pass: select AO/BO for this    */
/* row, preload the first operands, reset the eight accumulators          */
/* cc01..cc08 via FCLR and compute the unrolled trip count L.             */
1453.LL22:
1454#if defined(LT) || defined(RN)
1455	mov	B, BO
1456#else
/* LN first steps AORIG back by K << ZBASE_SHIFT. */
1457#ifdef LN
1458	sll	K,  ZBASE_SHIFT, TEMP1
1459	sub	AORIG, TEMP1, AORIG
1460#endif
1461
/* LN/RT: AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (ZBASE_SHIFT + 1)). */
1462	sll	KK, ZBASE_SHIFT + 0, TEMP1
1463	sll	KK, ZBASE_SHIFT + 1, TEMP2
1464
1465	add	AORIG, TEMP1, AO
1466	add	B,     TEMP2, BO
1467#endif
1468
/* Preload a1/a2 from AO[0..1] and b1..b9 from BO[0..8]; the loop at      */
/* .LL23 expects these registers to be live on entry.                     */
1469	LDF	[AO +  0 * SIZE], a1
1470	LDF	[AO +  1 * SIZE], a2
1471
1472	LDF	[BO +  0 * SIZE], b1
1473	LDF	[BO +  1 * SIZE], b2
1474	LDF	[BO +  2 * SIZE], b3
1475	LDF	[BO +  3 * SIZE], b4
1476	LDF	[BO +  4 * SIZE], b5
1477	FCLR	(cc01)
1478
1479	LDF	[BO +  5 * SIZE], b6
1480	FCLR	(cc02)
1481	LDF	[BO +  6 * SIZE], b7
1482	FCLR	(cc03)
1483	LDF	[BO +  7 * SIZE], b8
1484	FCLR	(cc04)
1485	LDF	[BO +  8 * SIZE], b9
1486	FCLR	(cc05)
1487
/* Prefetch the two C locations stored to at the end of this row          */
/* (function code 3; presumably the SPARC V9 write hint - confirm).       */
1488	prefetch [C1 + 2 * SIZE], 3
1489	FCLR	(cc06)
1490	prefetch [C2 + 2 * SIZE], 3
1491	FCLR	(cc07)
1492
/* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 for LN/RT. */
1493#if defined(LT) || defined(RN)
1494	sra	KK, 2, L
1495#else
1496	sub	K, KK, L
1497	sra	L,  2, L
1498#endif
1499	cmp	L,  0
1500	ble,pn	%icc, .LL25
/* Delay slot of a non-annulled branch: cc08 is cleared on both paths. */
1501	FCLR	(cc08)
1502	.align 4
1503
/* Unrolled inner loop of the two-column pass.  Each trip performs four   */
/* accumulation steps and consumes 8 * SIZE of AO and 16 * SIZE of BO     */
/* (see the two pointer adds below).  Loads for later steps are           */
/* interleaved with the FMADDn macros, so instruction order matters.      */
/* Accumulators: cc01..cc04 pair the A values with b1/b2 (and b5/b6,      */
/* b9/b2), cc05..cc08 pair them with b3/b4 (and b7/b8).                   */
1504.LL23:
/* Step 0: a1/a2 with b1..b4. */
1505	FMADD1	(aa1, bb1, cc01, cc01)
1506	LDF	[AO +  2 * SIZE], a3
1507	FMADD2	(aa2, bb1, cc02, cc02)
1508	LDF	[AO +  3 * SIZE], a4
1509
1510	FMADD3	(aa1, bb2, cc03, cc03)
/* b1 is reloaded for the NEXT trip (offset 16); step 2 uses b9 instead. */
1511	LDF	[BO + 16 * SIZE], b1
1512	FMADD4	(aa2, bb2, cc04, cc04)
1513	LDF	[BO +  9 * SIZE], b2
1514
1515	FMADD1	(aa1, bb3, cc05, cc05)
1516	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1517	FMADD2	(aa2, bb3, cc06, cc06)
/* Trip counter; the matching cmp is issued further down. */
1518	add	L, -1, L
1519
1520	FMADD3	(aa1, bb4, cc07, cc07)
1521	LDF	[BO + 10 * SIZE], b3
1522	FMADD4	(aa2, bb4, cc08, cc08)
1523	LDF	[BO + 11 * SIZE], b4
1524
/* Step 1: a3/a4 with b5..b8. */
1525	FMADD1	(aa3, bb5, cc01, cc01)
1526	LDF	[AO +  4 * SIZE], a1
1527	FMADD2	(aa4, bb5, cc02, cc02)
1528	LDF	[AO +  5 * SIZE], a2
1529
1530	FMADD3	(aa3, bb6, cc03, cc03)
1531	LDF	[BO + 12 * SIZE], b5
1532	FMADD4	(aa4, bb6, cc04, cc04)
1533	LDF	[BO + 13 * SIZE], b6
1534
1535	FMADD1	(aa3, bb7, cc05, cc05)
/* Sets %icc for the bg at the end of the loop; only FP macros, loads,    */
/* plain add and nop follow (assuming the FMADDn macros leave %icc        */
/* untouched).                                                            */
1536	cmp	L, 0
1537	FMADD2	(aa4, bb7, cc06, cc06)
/* AO advances here; later AO offsets in this trip are relative to the new AO. */
1538	add	AO,  8 * SIZE, AO
1539
1540	FMADD3	(aa3, bb8, cc07, cc07)
1541	LDF	[BO + 14 * SIZE], b7
1542	FMADD4	(aa4, bb8, cc08, cc08)
1543	LDF	[BO + 15 * SIZE], b8
1544
/* Step 2: a1/a2 with b9, b2, b3, b4. */
1545	FMADD1	(aa1, bb9, cc01, cc01)
1546	LDF	[AO -  2 * SIZE], a3
1547	FMADD2	(aa2, bb9, cc02, cc02)
1548	LDF	[AO -  1 * SIZE], a4
1549
1550	FMADD3	(aa1, bb2, cc03, cc03)
1551	LDF	[BO + 24 * SIZE], b9
1552	FMADD4	(aa2, bb2, cc04, cc04)
1553	LDF	[BO + 17 * SIZE], b2
1554
1555	FMADD1	(aa1, bb3, cc05, cc05)
/* BO advances here; the BO offsets below are relative to the new BO. */
1556	add	BO, 16 * SIZE, BO
1557	FMADD2	(aa2, bb3, cc06, cc06)
1558	nop
1559
1560	FMADD3	(aa1, bb4, cc07, cc07)
1561	LDF	[BO +  2 * SIZE], b3
1562	FMADD4	(aa2, bb4, cc08, cc08)
1563	LDF	[BO +  3 * SIZE], b4
1564
/* Step 3: a3/a4 with b5..b8, while reloading operands for the next trip. */
1565	FMADD1	(aa3, bb5, cc01, cc01)
1566	LDF	[AO +  0 * SIZE], a1
1567	FMADD2	(aa4, bb5, cc02, cc02)
1568	LDF	[AO +  1 * SIZE], a2
1569	FMADD3	(aa3, bb6, cc03, cc03)
1570	LDF	[BO +  4 * SIZE], b5
1571	FMADD4	(aa4, bb6, cc04, cc04)
1572	LDF	[BO +  5 * SIZE], b6
1573
1574	FMADD1	(aa3, bb7, cc05, cc05)
1575	nop
1576	FMADD2	(aa4, bb7, cc06, cc06)
1577	LDF	[BO +  6 * SIZE], b7
1578
1579	FMADD3	(aa3, bb8, cc07, cc07)
1580	FMADD4	(aa4, bb8, cc08, cc08)
1581	bg,pt	%icc, .LL23
/* Delay slot: b8 reload runs whether or not the branch is taken. */
1582	LDF	[BO +  7 * SIZE], b8
1583	.align 4
1584
/* Remainder count for the two-column pass: L = KK & 3 (LT/RN) or         */
/* (K - KK) & 3 (LN/RT).  With no remainder, skip the loop at .LL27.      */
1585.LL25:
1586#if defined(LT) || defined(RN)
1587	and	KK, 3, L
1588#else
1589	sub	K, KK, L
1590	and	L,  3, L
1591#endif
1592	cmp	L,  0
1593	ble,a,pn %icc, .LL28
1594	nop
1595	.align 4
1596
/* Remainder loop of the two-column pass: one accumulation step per trip, */
/* advancing AO by 2 * SIZE and BO by 4 * SIZE.  Uses a1/a2 and b1..b4,   */
/* which are reloaded inside the trip for the following one.              */
1597.LL27:
1598	FMADD1	(aa1, bb1, cc01, cc01)
1599	add	L, -1, L
1600	FMADD2	(aa2, bb1, cc02, cc02)
1601	LDF	[BO + 4 * SIZE], b1
1602
1603	FMADD3	(aa1, bb2, cc03, cc03)
/* AO advances here, so the a1/a2 reloads below use offsets 0 and 1. */
1604	add	AO, 2 * SIZE, AO
1605	FMADD4	(aa2, bb2, cc04, cc04)
1606	LDF	[BO + 5 * SIZE], b2
1607
1608	FMADD1	(aa1, bb3, cc05, cc05)
/* Condition codes consumed by the bg below. */
1609	cmp	L, 0
1610	FMADD2	(aa2, bb3, cc06, cc06)
1611	LDF	[BO + 6 * SIZE], b3
1612
1613	FMADD3	(aa1, bb4, cc07, cc07)
1614	LDF	[AO + 0 * SIZE], a1
1615	FMADD4	(aa2, bb4, cc08, cc08)
1616	LDF	[AO + 1 * SIZE], a2
1617
1618	LDF	[BO + 7 * SIZE], b4
1619	bg,pt	%icc, .LL27
/* Delay slot: BO advances on every trip, including the last. */
1620	add	BO, 4 * SIZE, BO
1621	.align 4
1622
/* Row epilogue of the two-column pass:                                   */
/*   1. fold the eight accumulators into two (first, second) pairs,       */
/*      (c01, c02) and (c05, c06);                                        */
/*   2. subtract them from four values read from the packed buffer;       */
/*   3. apply the variant-specific update (LN/LT use AO, RN/RT use BO);   */
/*   4. write the results back to the packed buffer and to C1/C2;         */
/*   5. advance the pointers and KK, then loop on I.                      */
/* NOTE(review): this looks like the solve step of a complex triangular   */
/* solve with pre-inverted diagonal entries (only multiplies are used) -  */
/* confirm against the packing routine.                                   */
1623.LL28:
1624	FADD	c01, c04, c01
1625	FADD	c02, c03, c02
1626	FADD	c05, c08, c05
1627	FADD	c06, c07, c06
1628
/* LN/RT: re-aim AO/BO at entry KK - 1 (LN) or KK - 2 (RT):               */
/* AO = AORIG + (idx << ZBASE_SHIFT), BO = B + (idx << (ZBASE_SHIFT + 1)). */
1629#if defined(LN) || defined(RT)
1630#ifdef LN
1631	sub	KK, 1, TEMP1
1632#else
1633	sub	KK, 2, TEMP1
1634#endif
1635	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1636	sll	TEMP1, ZBASE_SHIFT + 1, TEMP1
1637
1638	add	AORIG, TEMP2, AO
1639	add	B,     TEMP1, BO
1640#endif
1641
/* Fetch the four stored values: from BO for LN/LT, from AO for RN/RT. */
1642#if defined(LN) || defined(LT)
1643	LDF	[BO +  0 * SIZE], a1
1644	LDF	[BO +  1 * SIZE], a2
1645	LDF	[BO +  2 * SIZE], a3
1646	LDF	[BO +  3 * SIZE], a4
1647#else
1648	LDF	[AO +  0 * SIZE], a1
1649	LDF	[AO +  1 * SIZE], a2
1650	LDF	[AO +  2 * SIZE], a3
1651	LDF	[AO +  3 * SIZE], a4
1652#endif
1653
/* c = stored value - accumulated sum (assuming FSUB follows the SPARC    */
/* rs1 - rs2 -> rd operand order).                                        */
1654	FSUB	a1, c01, c01
1655	FSUB	a2, c02, c02
1656	FSUB	a3, c05, c05
1657	FSUB	a4, c06, c06
1658
/* LN/LT: combine both (c01, c02) and (c05, c06) with the pair at         */
/* AO[0..1]; the CONJ build uses the opposite-sign fused forms.           */
1659#if defined(LN) || defined(LT)
1660	LDF	[AO +  0 * SIZE], a1
1661	LDF	[AO +  1 * SIZE], a2
1662
1663	FMUL	a1, c01, b1
1664	FMUL	a2, c01, b2
1665	FMUL	a1, c05, b3
1666	FMUL	a2, c05, b4
1667
1668#ifndef CONJ
1669	FNMSUB	(aa2, cc02, bb1, cc01)
1670	FMADD	(aa1, cc02, bb2, cc02)
1671	FNMSUB	(aa2, cc06, bb3, cc05)
1672	FMADD	(aa1, cc06, bb4, cc06)
1673#else
1674	FMADD	(aa2, cc02, bb1, cc01)
1675	FMSUB	(aa1, cc02, bb2, cc02)
1676	FMADD	(aa2, cc06, bb3, cc05)
1677	FMSUB	(aa1, cc06, bb4, cc06)
1678#endif
1679#endif
1680
/* RN: forward order.  Scale (c01, c02) by BO[0..1], eliminate it from    */
/* (c05, c06) using BO[2..3], then scale (c05, c06) by BO[6..7].          */
1681#ifdef RN
1682	LDF	[BO +  0 * SIZE], b1
1683	LDF	[BO +  1 * SIZE], b2
1684	LDF	[BO +  2 * SIZE], b3
1685	LDF	[BO +  3 * SIZE], b4
1686
1687	FMUL	b1, c01, a1
1688	FMUL	b2, c01, a2
1689
1690#ifndef CONJ
1691	FNMSUB	(bb2, cc02, aa1, cc01)
1692	FMADD	(bb1, cc02, aa2, cc02)
1693#else
1694	FMADD	(bb2, cc02, aa1, cc01)
1695	FMSUB	(bb1, cc02, aa2, cc02)
1696#endif
1697
1698	FNMSUB	(bb3, cc01, cc05, cc05)
1699	FNMSUB	(bb3, cc02, cc06, cc06)
1700
1701#ifndef CONJ
1702	FMADD	(bb4, cc02, cc05, cc05)
1703	FNMSUB	(bb4, cc01, cc06, cc06)
1704#else
1705	FNMSUB	(bb4, cc02, cc05, cc05)
1706	FMADD	(bb4, cc01, cc06, cc06)
1707#endif
1708
1709	LDF	[BO +  6 * SIZE], b1
1710	LDF	[BO +  7 * SIZE], b2
1711
1712	FMUL	b1, c05, a1
1713	FMUL	b2, c05, a2
1714
1715#ifndef CONJ
1716	FNMSUB	(bb2, cc06, aa1, cc05)
1717	FMADD	(bb1, cc06, aa2, cc06)
1718#else
1719	FMADD	(bb2, cc06, aa1, cc05)
1720	FMSUB	(bb1, cc06, aa2, cc06)
1721#endif
1722#endif
1723
/* RT: reverse order.  Scale (c05, c06) by BO[6..7], eliminate it from    */
/* (c01, c02) using BO[4..5], then scale (c01, c02) by BO[0..1].          */
1724#ifdef RT
1725	LDF	[BO +  6 * SIZE], b1
1726	LDF	[BO +  7 * SIZE], b2
1727	LDF	[BO +  4 * SIZE], b3
1728	LDF	[BO +  5 * SIZE], b4
1729
1730	FMUL	b1, c05, a1
1731	FMUL	b2, c05, a2
1732
1733#ifndef CONJ
1734	FNMSUB	(bb2, cc06, aa1, cc05)
1735	FMADD	(bb1, cc06, aa2, cc06)
1736#else
1737	FMADD	(bb2, cc06, aa1, cc05)
1738	FMSUB	(bb1, cc06, aa2, cc06)
1739#endif
1740
1741	FNMSUB	(bb3, cc05, cc01, cc01)
1742	FNMSUB	(bb3, cc06, cc02, cc02)
1743
1744#ifndef CONJ
1745	FMADD	(bb4, cc06, cc01, cc01)
1746	FNMSUB	(bb4, cc05, cc02, cc02)
1747#else
1748	FNMSUB	(bb4, cc06, cc01, cc01)
1749	FMADD	(bb4, cc05, cc02, cc02)
1750#endif
1751
1752	LDF	[BO +  0 * SIZE], b1
1753	LDF	[BO +  1 * SIZE], b2
1754
1755	FMUL	b1, c01, a1
1756	FMUL	b2, c01, a2
1757
1758#ifndef CONJ
1759	FNMSUB	(bb2, cc02, aa1, cc01)
1760	FMADD	(bb1, cc02, aa2, cc02)
1761#else
1762	FMADD	(bb2, cc02, aa1, cc01)
1763	FMSUB	(bb1, cc02, aa2, cc02)
1764#endif
1765#endif
1766
/* LN walks C backwards: step C1/C2 back before storing. */
1767#ifdef LN
1768	add	C1, -2 * SIZE, C1
1769	add	C2, -2 * SIZE, C2
1770#endif
1771
/* Write the results back into the buffer they were read from ... */
1772#if defined(LN) || defined(LT)
1773	STF	c01, [BO +  0 * SIZE]
1774	STF	c02, [BO +  1 * SIZE]
1775	STF	c05, [BO +  2 * SIZE]
1776	STF	c06, [BO +  3 * SIZE]
1777#else
1778	STF	c01, [AO +  0 * SIZE]
1779	STF	c02, [AO +  1 * SIZE]
1780	STF	c05, [AO +  2 * SIZE]
1781	STF	c06, [AO +  3 * SIZE]
1782#endif
1783
/* ... and into the two C columns. */
1784	STF	c01, [C1 + 0 * SIZE]
1785	STF	c02, [C1 + 1 * SIZE]
1786	STF	c05, [C2 + 0 * SIZE]
1787	STF	c06, [C2 + 1 * SIZE]
1788
/* All other variants walk C forwards: step C1/C2 on after storing. */
1789#ifndef LN
1790	add	C1, 2 * SIZE, C1
1791	add	C2, 2 * SIZE, C2
1792#endif
1793
/* RT: AORIG += K << ZBASE_SHIFT for the next row. */
1794#ifdef RT
1795	sll	K, ZBASE_SHIFT, TEMP1
1796	add	AORIG, TEMP1, AORIG
1797#endif
1798
/* LT/RN: skip the remaining K - KK entries of this row in AO            */
/* (<< ZBASE_SHIFT) and BO (<< (ZBASE_SHIFT + 1)).                        */
1799#if defined(LT) || defined(RN)
1800	sub	K, KK, TEMP1
1801	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1802	sll	TEMP1, ZBASE_SHIFT + 1, TEMP1
1803	add	AO, TEMP2, AO
1804	add	BO, TEMP1, BO
1805#endif
1806
/* KK moves by one per row: up for LT, down for LN. */
1807#ifdef LT
1808	add	KK, 1, KK
1809#endif
1810
1811#ifdef LN
1812	sub	KK, 1, KK
1813#endif
1814
/* Next row while I > 0. */
1815	add	I, -1, I
1816	cmp	I, 0
1817	bg,pt	%icc, .LL22
1818	nop
1819
/* End of the two-column pass: LN steps B forward past this panel        */
/* (K << (ZBASE_SHIFT + 1)); LT/RN take B from the final BO.              */
1820#ifdef LN
1821	sll	K, ZBASE_SHIFT + 1, TEMP1
1822	add	B, TEMP1, B
1823#endif
1824
1825#if defined(LT) || defined(RN)
1826	mov	BO, B
1827#endif
1828
/* KK moves by the two columns just processed: up for RN, down for RT. */
1829#ifdef RN
1830	add	KK, 2, KK
1831#endif
1832
1833#ifdef RT
1834	sub	KK, 2, KK
1835#endif
1836	.align 4
1837
/* One-column pass: entered only when bit 0 of N is set (N & 1 != 0),     */
/* otherwise the routine returns via .LL999.  Mirrors the setup at .LL20  */
/* with a single C column pointer (C1).                                   */
1838.LL30:
1839	and	N, 1, J
1840	cmp	J, 0
1841	ble,pn	%icc, .LL999
1842	nop
1843
/* RT: move B back by K << ZBASE_SHIFT before it is used. */
1844#ifdef RT
1845	sll	K, ZBASE_SHIFT, TEMP1
1846	sub	B, TEMP1, B
1847#endif
1848
/* Non-RT: C1 = C, then C += LDC.  RT: C1 = C - LDC and C = C - LDC. */
1849#ifndef RT
1850	mov	C,  C1
1851	add	C,  LDC, C
1852#else
1853	sub	C,  LDC, C1
1854	sub	C,  LDC, C
1855#endif
1856
/* KK start value: M + OFFSET for LN, OFFSET for LT (RN/RT keep the current KK). */
1857#ifdef LN
1858	add	M, OFFSET, KK
1859#endif
1860
1861#ifdef LT
1862	mov	OFFSET, KK
1863#endif
1864
/* LN/RT derive AO from AORIG per row; LT/RN walk AO directly. */
1865#if defined(LN) || defined(RT)
1866	mov	A, AORIG
1867#else
1868	mov	A, AO
1869#endif
1870
/* Row counter for the loop at .LL32. */
1871	mov	M, I
1872	.align 4
1873
/* Head of the row loop for the one-column pass: select AO/BO, preload    */
/* a1..a4 and b1..b8, reset the accumulators via FCLR and compute the     */
/* unrolled trip count L.                                                 */
1874.LL32:
1875#if defined(LT) || defined(RN)
1876	mov	B, BO
1877#else
/* LN first steps AORIG back by K << ZBASE_SHIFT. */
1878#ifdef LN
1879	sll	K,  ZBASE_SHIFT, TEMP1
1880	sub	AORIG, TEMP1, AORIG
1881#endif
1882
/* LN/RT: AO and BO are both offset by KK << ZBASE_SHIFT in this pass. */
1883	sll	KK, ZBASE_SHIFT + 0, TEMP1
1884
1885	add	AORIG, TEMP1, AO
1886	add	B,     TEMP1, BO
1887#endif
1888
1889	LDF	[AO +  0 * SIZE], a1
1890	LDF	[AO +  1 * SIZE], a2
1891	LDF	[AO +  2 * SIZE], a3
1892	LDF	[AO +  3 * SIZE], a4
1893
1894	LDF	[BO +  0 * SIZE], b1
1895	LDF	[BO +  1 * SIZE], b2
1896	LDF	[BO +  2 * SIZE], b3
1897	FCLR	(cc01)
1898	LDF	[BO +  3 * SIZE], b4
1899	FCLR	(cc02)
1900
1901	LDF	[BO +  4 * SIZE], b5
1902	FCLR	(cc03)
1903	LDF	[BO +  5 * SIZE], b6
1904	FCLR	(cc04)
1905	LDF	[BO +  6 * SIZE], b7
1906	FCLR	(cc05)
1907	LDF	[BO +  7 * SIZE], b8
1908	FCLR	(cc06)
1909
/* Prefetch the C location stored to at the end of this row. */
1910	prefetch [C1 + 2 * SIZE], 3
1911	FCLR	(cc07)
1912
/* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 for LN/RT. */
1913#if defined(LT) || defined(RN)
1914	sra	KK, 2, L
1915#else
1916	sub	K, KK, L
1917	sra	L,  2, L
1918#endif
1919	cmp	L,  0
1920	ble,pn	%icc, .LL35
/* Delay slot of a non-annulled branch: cc08 is cleared on both paths. */
1921	FCLR	(cc08)
1922	.align 4
1923
/* Unrolled inner loop of the one-column pass.  Each trip performs four   */
/* accumulation steps into cc01..cc04 and consumes 8 * SIZE of both AO    */
/* and BO.  Operands alternate between a1/a2 and a3/a4, with the B        */
/* pairs (b1,b2), (b3,b4), (b5,b6), (b7,b8); every register is reloaded   */
/* for the next trip as soon as its last use has issued.                  */
1924.LL33:
/* Step 0: a1/a2 with b1/b2. */
1925	FMADD1	(aa1, bb1, cc01, cc01)
1926	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1927	FMADD2	(aa2, bb1, cc02, cc02)
1928	LDF	[BO +  8 * SIZE], b1
1929
1930	FMADD3	(aa1, bb2, cc03, cc03)
1931	LDF	[AO +  4 * SIZE], a1
1932	FMADD4	(aa2, bb2, cc04, cc04)
1933	LDF	[AO +  5 * SIZE], a2
1934
/* Step 1: a3/a4 with b3/b4. */
1935	FMADD1	(aa3, bb3, cc01, cc01)
1936	LDF	[BO +  9 * SIZE], b2
1937	FMADD2	(aa4, bb3, cc02, cc02)
1938	LDF	[BO + 10 * SIZE], b3
1939
1940	FMADD3	(aa3, bb4, cc03, cc03)
1941	LDF	[AO +  6 * SIZE], a3
1942	FMADD4	(aa4, bb4, cc04, cc04)
1943	LDF	[AO +  7 * SIZE], a4
1944
/* Step 2: a1/a2 (from AO + 4, 5) with b5/b6. */
1945	FMADD1	(aa1, bb5, cc01, cc01)
1946	LDF	[BO + 11 * SIZE], b4
1947	FMADD2	(aa2, bb5, cc02, cc02)
1948	LDF	[BO + 12 * SIZE], b5
1949
1950	FMADD3	(aa1, bb6, cc03, cc03)
1951	LDF	[AO +  8 * SIZE], a1
1952	FMADD4	(aa2, bb6, cc04, cc04)
1953	LDF	[AO +  9 * SIZE], a2
1954
/* Step 3: a3/a4 (from AO + 6, 7) with b7/b8. */
1955	FMADD1	(aa3, bb7, cc01, cc01)
1956	LDF	[BO + 13 * SIZE], b6
1957
1958	FMADD2	(aa4, bb7, cc02, cc02)
1959	LDF	[BO + 14 * SIZE], b7
1960
1961	FMADD3	(aa3, bb8, cc03, cc03)
1962	LDF	[AO + 10 * SIZE], a3
1963	FMADD4	(aa4, bb8, cc04, cc04)
1964	LDF	[AO + 11 * SIZE], a4
1965
1966	add	AO,  8 * SIZE, AO
1967	add	L, -1, L
1968	add	BO,  8 * SIZE, BO
1969	cmp	L, 0
1970
1971	bg,pt	%icc, .LL33
/* Delay slot: BO has already advanced, so offset 7 is the next trip's b8. */
1972	LDF	[BO +  7 * SIZE], b8
1973	.align 4
1974
/* Remainder count for the one-column pass: L = KK & 3 (LT/RN) or         */
/* (K - KK) & 3 (LN/RT).  With no remainder, skip the loop at .LL37.      */
1975.LL35:
1976#if defined(LT) || defined(RN)
1977	and	KK, 3, L
1978#else
1979	sub	K, KK, L
1980	and	L,  3, L
1981#endif
1982	cmp	L,  0
1983	ble,a,pn %icc, .LL38
1984	nop
1985	.align 4
1986
/* Remainder loop of the one-column pass: one accumulation step per trip  */
/* using a1/a2 and b1/b2, advancing both AO and BO by 2 * SIZE.           */
1987.LL37:
1988	FMADD1	(aa1, bb1, cc01, cc01)
1989	add	L, -1, L
1990	FMADD2	(aa2, bb1, cc02, cc02)
1991	LDF	[BO + 2 * SIZE], b1
1992
1993	FMADD3	(aa1, bb2, cc03, cc03)
1994	LDF	[AO + 2 * SIZE], a1
1995	FMADD4	(aa2, bb2, cc04, cc04)
1996	LDF	[AO + 3 * SIZE], a2
1997
1998	add	AO, 2 * SIZE, AO
1999	cmp	L, 0
2000	add	BO, 2 * SIZE, BO
2001	bg,pt	%icc, .LL37
/* Delay slot: BO has already advanced, so offset 1 is the next trip's b2. */
2002	LDF	[BO + 1 * SIZE], b2
2003	.align 4
2004
/* Row epilogue of the one-column pass: fold the four accumulators into   */
/* the pair (c01, c02), subtract it from the stored pair, combine with    */
/* the pair from the other operand, write back to the packed buffer and   */
/* to C1, then advance the pointers and KK and loop on I.                 */
/* NOTE(review): presumably a complex multiply by a pre-inverted diagonal */
/* entry - confirm against the packing routine.                           */
2005.LL38:
2006	FADD	c01, c04, c01
2007	FADD	c02, c03, c02
2008
/* LN/RT: re-aim AO and BO at entry KK - 1 (same offset for both). */
2009#if defined(LN) || defined(RT)
2010	sub	KK, 1, TEMP1
2011
2012	sll	TEMP1, ZBASE_SHIFT, TEMP1
2013
2014	add	AORIG, TEMP1, AO
2015	add	B,     TEMP1, BO
2016#endif
2017
/* Stored pair: from BO for LN/LT, from AO for RN/RT. */
2018#if defined(LN) || defined(LT)
2019	LDF	[BO +  0 * SIZE], a1
2020	LDF	[BO +  1 * SIZE], a2
2021#else
2022	LDF	[AO +  0 * SIZE], a1
2023	LDF	[AO +  1 * SIZE], a2
2024#endif
2025
/* c = stored value - accumulated sum (assuming FSUB follows the SPARC    */
/* rs1 - rs2 -> rd operand order).                                        */
2026	FSUB	a1, c01, c01
2027	FSUB	a2, c02, c02
2028
/* Multiplier pair: from AO for LN/LT, from BO for RN/RT. */
2029#if defined(LN) || defined(LT)
2030	LDF	[AO +  0 * SIZE], a1
2031	LDF	[AO +  1 * SIZE], a2
2032#else
2033	LDF	[BO +  0 * SIZE], a1
2034	LDF	[BO +  1 * SIZE], a2
2035#endif
2036
2037	FMUL	a1, c01, b1
2038	FMUL	a2, c01, b2
2039
/* The CONJ build uses the opposite-sign fused forms. */
2040#ifndef CONJ
2041	FNMSUB	(aa2, cc02, bb1, cc01)
2042	FMADD	(aa1, cc02, bb2, cc02)
2043#else
2044	FMADD	(aa2, cc02, bb1, cc01)
2045	FMSUB	(aa1, cc02, bb2, cc02)
2046#endif
2047
/* LN walks C backwards: step C1 back before storing. */
2048#ifdef LN
2049	add	C1, -2 * SIZE, C1
2050#endif
2051
/* Write the result back into the buffer the stored pair came from ... */
2052#if defined(LN) || defined(LT)
2053	STF	c01, [BO +  0 * SIZE]
2054	STF	c02, [BO +  1 * SIZE]
2055#else
2056	STF	c01, [AO +  0 * SIZE]
2057	STF	c02, [AO +  1 * SIZE]
2058#endif
2059
/* ... and into C. */
2060	STF	c01, [C1 + 0 * SIZE]
2061	STF	c02, [C1 + 1 * SIZE]
2062
2063#ifndef LN
2064	add	C1, 2 * SIZE, C1
2065#endif
2066
/* RT: AORIG += K << ZBASE_SHIFT for the next row. */
2067#ifdef RT
2068	sll	K, ZBASE_SHIFT, TEMP1
2069	add	AORIG, TEMP1, AORIG
2070#endif
2071
/* LT/RN: skip the remaining K - KK entries of this row in AO and BO. */
2072#if defined(LT) || defined(RN)
2073	sub	K, KK, TEMP1
2074	sll	TEMP1, ZBASE_SHIFT, TEMP1
2075	add	AO, TEMP1, AO
2076	add	BO, TEMP1, BO
2077#endif
2078
/* KK moves by one per row: up for LT, down for LN. */
2079#ifdef LT
2080	add	KK, 1, KK
2081#endif
2082
2083#ifdef LN
2084	sub	KK, 1, KK
2085#endif
2086
/* Next row while I > 0. */
2087	add	I, -1, I
2088	cmp	I, 0
2089	bg,pt	%icc, .LL32
2090	nop
2091
/* End of the one-column pass: LN steps B forward by K << ZBASE_SHIFT;    */
/* LT/RN take B from the final BO.                                        */
2092#ifdef LN
2093	sll	K, ZBASE_SHIFT, TEMP1
2094	add	B, TEMP1, B
2095#endif
2096
2097#if defined(LT) || defined(RN)
2098	mov	BO, B
2099#endif
2100
/* KK moves by the one column just processed: up for RN, down for RT. */
2101#ifdef RN
2102	add	KK, 1, KK
2103#endif
2104
2105#ifdef RT
2106	sub	KK, 1, KK
2107#endif
2108	.align 4
2109
/* Common exit.  `clr %o0` sits in the delay slot of `return`; on SPARC   */
/* V9 that slot executes after the register window has been restored, so  */
/* it zeroes the caller's %o0 (presumably the routine's return value of   */
/* 0 - confirm against the calling convention in use).                    */
2110.LL999:
2111	return	%i7 + 8
2112	clr	%o0
2113
2114	EPILOGUE
2115