1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2005. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
25#define APREFETCHSIZE 24
26#define APREFETCH_CATEGORY 0
27
28#define M	%i0
29#define N	%i1
30#define K	%i2
31
32#if defined(DOUBLE) && !defined(__64BIT__)
33#define A	%i5
34#define B	%i4
35#else
36#define A	%i4
37#define B	%i5
38#endif
39
40#define C	%o4
41#define LDC	%o5
42
43#define AO	%l0
44#define BO	%l1
45#define I	%l2
46#define J	%l3
47#define L	%l4
48
49#define C1	%o0
50#define C2	%o1
51#define C3	%o2
52#define C4	%o3
53
54#define C5	%l5
55#define	C6	%l6
56#define C7	%l7
57#define C8	%i3
58
59#define OFFSET	%g1
60#define	KK	%g2
61#define TEMP1	%g3
62#define TEMP2	%g4
63#define AORIG	%o7
64
65#ifdef DOUBLE
66#define c01	%f0
67#define c02	%f2
68#define c03	%f4
69#define c04	%f6
70#define c05	%f8
71#define c06	%f10
72#define c07	%f12
73#define c08	%f14
74#define c09	%f16
75#define c10	%f18
76#define c11	%f20
77#define c12	%f22
78#define c13	%f24
79#define c14	%f26
80#define c15	%f28
81#define c16	%f30
82
83#define a1	%f32
84#define a2	%f34
85#define a3	%f36
86#define a4	%f38
87#define a5	%f40
88
89#define b1	%f42
90#define b2	%f44
91#define b3	%f46
92#define b4	%f48
93#define b5	%f50
94#define b6	%f52
95#define b7	%f54
96#define b8	%f56
97#define b9	%f58
98
99#define cc01	0
100#define cc02	2
101#define cc03	4
102#define cc04	6
103#define cc05	8
104#define cc06	10
105#define cc07	12
106#define cc08	14
107#define cc09	16
108#define cc10	18
109#define cc11	20
110#define cc12	22
111#define cc13	24
112#define cc14	26
113#define cc15	28
114#define cc16	30
115
116#define aa1	 1
117#define aa2	 3
118#define aa3	 5
119#define aa4	 7
120#define aa5	 9
121
122#define bb1	11
123#define bb2	13
124#define bb3	15
125#define bb4	17
126#define bb5	19
127#define bb6	21
128#define bb7	23
129#define bb8	25
130#define bb9	27
131
132#else
133#define c01	%f0
134#define c02	%f1
135#define c03	%f2
136#define c04	%f3
137#define c05	%f4
138#define c06	%f5
139#define c07	%f6
140#define c08	%f7
141#define c09	%f8
142#define c10	%f9
143#define c11	%f10
144#define c12	%f11
145#define c13	%f12
146#define c14	%f13
147#define c15	%f14
148#define c16	%f15
149
150#define a1	%f16
151#define a2	%f17
152#define a3	%f18
153#define a4	%f19
154#define a5	%f20
155
156#define b1	%f21
157#define b2	%f22
158#define b3	%f23
159#define b4	%f24
160#define b5	%f25
161#define b6	%f26
162#define b7	%f27
163#define b8	%f28
164#define b9	%f29
165
166#define cc01	0
167#define cc02	1
168#define cc03	2
169#define cc04	3
170#define cc05	4
171#define cc06	5
172#define cc07	6
173#define cc08	7
174#define cc09	8
175#define cc10	9
176#define cc11	10
177#define cc12	11
178#define cc13	12
179#define cc14	13
180#define cc15	14
181#define cc16	15
182
183#define aa1	16
184#define aa2	17
185#define aa3	18
186#define aa4	19
187#define aa5	20
188
189#define bb1	21
190#define bb2	22
191#define bb3	23
192#define bb4	24
193#define bb5	25
194#define bb6	26
195#define bb7	27
196#define bb8	28
197#define bb9	29
198
199#endif
200
201        .register %g2, #scratch
202        .register %g3, #scratch
203
204	PROLOGUE
205	SAVESP
206	nop
207
208#ifndef __64BIT__
209
210#ifdef DOUBLE
211	ld	[%sp + STACK_START + 28], B
212	ld	[%sp + STACK_START + 32], C
213	ld	[%sp + STACK_START + 36], LDC
214	ld	[%sp + STACK_START + 40], OFFSET
215#else
216	ld	[%sp + STACK_START + 28], C
217	ld	[%sp + STACK_START + 32], LDC
218	ld	[%sp + STACK_START + 36], OFFSET
219#endif
220	st	%g1, [%sp + STACK_START +  8]
221	st	%g2, [%sp + STACK_START + 12]
222	st	%g3, [%sp + STACK_START + 16]
223	st	%g4, [%sp + STACK_START + 20]
224#else
225
226	ldx	[%sp+  STACK_START + 56], C
227	ldx	[%sp+  STACK_START + 64], LDC
228	ldx	[%sp+  STACK_START + 72], OFFSET
229
230	stx	%g1, [%sp + STACK_START + 32]
231	stx	%g2, [%sp + STACK_START + 40]
232	stx	%g3, [%sp + STACK_START + 48]
233	stx	%g4, [%sp + STACK_START + 56]
234#endif
235
236#if defined(TRMMKERNEL) && !defined(LEFT)
237	neg	OFFSET, KK
238#endif
239
240	sll	LDC, BASE_SHIFT, LDC
241
242#ifdef LN
243	smul	M, K, TEMP1
244	sll	TEMP1, BASE_SHIFT, TEMP1
245	add	A, TEMP1, A
246
247	sll	M, BASE_SHIFT, TEMP1
248	add	C, TEMP1, C
249#endif
250
251#ifdef RN
252	neg	OFFSET, KK
253#endif
254
255#ifdef RT
256	smul	N, K, TEMP1
257	sll	TEMP1, BASE_SHIFT, TEMP1
258	add	B, TEMP1, B
259
260	smul	N, LDC, TEMP1
261	add	C, TEMP1, C
262
263	sub	N, OFFSET, KK
264#endif
265
266	sra	N, 3, J
267	cmp	J, 0
268	ble,pn	%icc, .LL30
269	nop
270	.align 4
271
272.LL11:
273#ifdef RT
274	sll	K, BASE_SHIFT + 3, TEMP1
275	sub	B, TEMP1, B
276#endif
277
278#ifndef RT
279	mov	C,  C1
280	add	C,  LDC, C2
281	add	C2, LDC, C3
282	add	C3, LDC, C4
283	add	C4, LDC, C5
284	add	C5, LDC, C6
285	add	C6, LDC, C7
286	add	C7, LDC, C8
287	add	C8, LDC, C
288#else
289	sub	C,  LDC, C8
290	sub	C8, LDC, C7
291	sub	C7, LDC, C6
292	sub	C6, LDC, C5
293	sub	C5, LDC, C4
294	sub	C4, LDC, C3
295	sub	C3, LDC, C2
296	sub	C2, LDC, C1
297	sub	C2, LDC, C
298#endif
299
300#ifdef LN
301	add	M, OFFSET, KK
302#endif
303
304#ifdef LT
305	mov	OFFSET, KK
306#endif
307
308#if defined(LN) || defined(RT)
309	mov	A, AORIG
310#else
311	mov	A, AO
312#endif
313
314	sra	M, 1, I
315	cmp	I, 0
316	ble,pn	%icc, .LL20
317	nop
318	.align 4
319
320.LL12:
321#if defined(LT) || defined(RN)
322	mov	B, BO
323#else
324#ifdef LN
325	sll	K,  BASE_SHIFT + 1, TEMP1
326	sub	AORIG, TEMP1, AORIG
327#endif
328
329	sll	KK, BASE_SHIFT + 1, TEMP1
330	sll	KK, BASE_SHIFT + 3, TEMP2
331
332	add	AORIG, TEMP1, AO
333	add	B,     TEMP2, BO
334#endif
335
336	LDF	[AO +  0 * SIZE], a1
337	LDF	[AO +  1 * SIZE], a2
338	LDF	[AO +  8 * SIZE], a5
339
340	LDF	[BO +  0 * SIZE], b1
341
342	LDF	[BO +  1 * SIZE], b2
343	FCLR	(cc01)
344	LDF	[BO +  2 * SIZE], b3
345	FCLR	(cc05)
346	LDF	[BO +  3 * SIZE], b4
347	FCLR	(cc09)
348	LDF	[BO +  4 * SIZE], b5
349	FCLR	(cc13)
350
351	LDF	[BO +  5 * SIZE], b6
352	FCLR	(cc02)
353	LDF	[BO +  6 * SIZE], b7
354	FCLR	(cc06)
355	LDF	[BO +  7 * SIZE], b8
356	FCLR	(cc10)
357	LDF	[BO +  8 * SIZE], b9
358	FCLR	(cc14)
359
360	prefetch [C1 + 1 * SIZE], 3
361	FCLR	(cc03)
362	prefetch [C2 + 2 * SIZE], 3
363	FCLR	(cc07)
364	prefetch [C3 + 1 * SIZE], 3
365	FCLR	(cc11)
366	prefetch [C4 + 2 * SIZE], 3
367	FCLR	(cc15)
368
369	prefetch [C5 + 1 * SIZE], 3
370	FCLR	(cc04)
371	prefetch [C6 + 2 * SIZE], 3
372	FCLR	(cc08)
373	prefetch [C7 + 1 * SIZE], 3
374	FCLR	(cc12)
375	prefetch [C8 + 2 * SIZE], 3
376	FCLR	(cc16)
377
378#if defined(LT) || defined(RN)
379	sra	KK, 3, L
380#else
381	sub	K, KK, L
382	sra	L,  3, L
383#endif
384	cmp	L,  0
385	ble,pn	%icc, .LL15
386	nop
387	.align 4
388
389.LL13:
390	FMADD	(aa1, bb1, cc01, cc01)
391	FMADD	(aa2, bb1, cc02, cc02)
392	FMADD	(aa1, bb2, cc03, cc03)
393	FMADD	(aa2, bb2, cc04, cc04)
394
395	FMADD	(aa1, bb3, cc05, cc05)
396	LDF	[BO + 16 * SIZE], b1
397	FMADD	(aa2, bb3, cc06, cc06)
398	LDF	[BO +  9 * SIZE], b2
399
400	FMADD	(aa1, bb4, cc07, cc07)
401	LDF	[BO + 10 * SIZE], b3
402	FMADD	(aa2, bb4, cc08, cc08)
403	LDF	[BO + 11 * SIZE], b4
404
405	FMADD	(aa1, bb5, cc09, cc09)
406	LDF	[AO +  2 * SIZE], a3
407	FMADD	(aa2, bb5, cc10, cc10)
408	LDF	[AO +  3 * SIZE], a4
409
410	FMADD	(aa1, bb6, cc11, cc11)
411	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
412	FMADD	(aa2, bb6, cc12, cc12)
413	nop
414
415	FMADD	(aa1, bb7, cc13, cc13)
416	LDF	[BO + 12 * SIZE], b5
417	FMADD	(aa2, bb7, cc14, cc14)
418	LDF	[BO + 13 * SIZE], b6
419
420	FMADD	(aa1, bb8, cc15, cc15)
421	LDF	[BO + 14 * SIZE], b7
422	FMADD	(aa2, bb8, cc16, cc16)
423	LDF	[BO + 15 * SIZE], b8
424
425	FMADD	(aa3, bb9, cc01, cc01)
426	FMADD	(aa4, bb9, cc02, cc02)
427	FMADD	(aa3, bb2, cc03, cc03)
428	FMADD	(aa4, bb2, cc04, cc04)
429
430	FMADD	(aa3, bb3, cc05, cc05)
431	LDF	[BO + 24 * SIZE], b9
432	FMADD	(aa4, bb3, cc06, cc06)
433	LDF	[BO + 17 * SIZE], b2
434
435	FMADD	(aa3, bb4, cc07, cc07)
436	LDF	[BO + 18 * SIZE], b3
437	FMADD	(aa4, bb4, cc08, cc08)
438	LDF	[BO + 19 * SIZE], b4
439
440	FMADD	(aa3, bb5, cc09, cc09)
441	LDF	[AO +  4 * SIZE], a1
442	FMADD	(aa4, bb5, cc10, cc10)
443	LDF	[AO +  5 * SIZE], a2
444
445	FMADD	(aa3, bb6, cc11, cc11)
446	add	L, -1, L
447	FMADD	(aa4, bb6, cc12, cc12)
448	nop
449
450	FMADD	(aa3, bb7, cc13, cc13)
451	LDF	[BO + 20 * SIZE], b5
452	FMADD	(aa4, bb7, cc14, cc14)
453	LDF	[BO + 21 * SIZE], b6
454
455	FMADD	(aa3, bb8, cc15, cc15)
456	LDF	[BO + 22 * SIZE], b7
457	FMADD	(aa4, bb8, cc16, cc16)
458	LDF	[BO + 23 * SIZE], b8
459
460	FMADD	(aa1, bb1, cc01, cc01)
461	FMADD	(aa2, bb1, cc02, cc02)
462	FMADD	(aa1, bb2, cc03, cc03)
463	FMADD	(aa2, bb2, cc04, cc04)
464
465	FMADD	(aa1, bb3, cc05, cc05)
466	LDF	[BO + 32 * SIZE], b1
467	FMADD	(aa2, bb3, cc06, cc06)
468	LDF	[BO + 25 * SIZE], b2
469
470	FMADD	(aa1, bb4, cc07, cc07)
471	LDF	[BO + 26 * SIZE], b3
472	FMADD	(aa2, bb4, cc08, cc08)
473	LDF	[BO + 27 * SIZE], b4
474
475	FMADD	(aa1, bb5, cc09, cc09)
476	LDF	[AO +  6 * SIZE], a3
477	FMADD	(aa2, bb5, cc10, cc10)
478	LDF	[AO +  7 * SIZE], a4
479
480	FMADD	(aa1, bb6, cc11, cc11)
481	nop
482	FMADD	(aa2, bb6, cc12, cc12)
483	nop
484
485	FMADD	(aa1, bb7, cc13, cc13)
486	LDF	[BO + 28 * SIZE], b5
487	FMADD	(aa2, bb7, cc14, cc14)
488	LDF	[BO + 29 * SIZE], b6
489
490	FMADD	(aa1, bb8, cc15, cc15)
491	LDF	[BO + 30 * SIZE], b7
492	FMADD	(aa2, bb8, cc16, cc16)
493	LDF	[BO + 31 * SIZE], b8
494
495	FMADD	(aa3, bb9, cc01, cc01)
496	FMADD	(aa4, bb9, cc02, cc02)
497	FMADD	(aa3, bb2, cc03, cc03)
498	FMADD	(aa4, bb2, cc04, cc04)
499
500	FMADD	(aa3, bb3, cc05, cc05)
501	LDF	[BO + 40 * SIZE], b9
502	FMADD	(aa4, bb3, cc06, cc06)
503	LDF	[BO + 33 * SIZE], b2
504
505	FMADD	(aa3, bb4, cc07, cc07)
506	LDF	[BO + 34 * SIZE], b3
507	FMADD	(aa4, bb4, cc08, cc08)
508	LDF	[BO + 35 * SIZE], b4
509
510	FMADD	(aa3, bb5, cc09, cc09)
511	LDF	[AO + 16 * SIZE], a1  /****/
512	FMADD	(aa4, bb5, cc10, cc10)
513	LDF	[AO +  9 * SIZE], a2
514
515	FMADD	(aa3, bb6, cc11, cc11)
516	nop
517	FMADD	(aa4, bb6, cc12, cc12)
518	nop
519
520	FMADD	(aa3, bb7, cc13, cc13)
521	LDF	[BO + 36 * SIZE], b5
522	FMADD	(aa4, bb7, cc14, cc14)
523	LDF	[BO + 37 * SIZE], b6
524
525	FMADD	(aa3, bb8, cc15, cc15)
526	LDF	[BO + 38 * SIZE], b7
527	FMADD	(aa4, bb8, cc16, cc16)
528	LDF	[BO + 39 * SIZE], b8
529
530	FMADD	(aa5, bb1, cc01, cc01)
531	FMADD	(aa2, bb1, cc02, cc02)
532	FMADD	(aa5, bb2, cc03, cc03)
533	FMADD	(aa2, bb2, cc04, cc04)
534
535	FMADD	(aa5, bb3, cc05, cc05)
536	LDF	[BO + 48 * SIZE], b1
537	FMADD	(aa2, bb3, cc06, cc06)
538	LDF	[BO + 41 * SIZE], b2
539
540	FMADD	(aa5, bb4, cc07, cc07)
541	LDF	[BO + 42 * SIZE], b3
542	FMADD	(aa2, bb4, cc08, cc08)
543	LDF	[BO + 43 * SIZE], b4
544
545	FMADD	(aa5, bb5, cc09, cc09)
546	LDF	[AO + 10 * SIZE], a3
547	FMADD	(aa2, bb5, cc10, cc10)
548	LDF	[AO + 11 * SIZE], a4
549
550	FMADD	(aa5, bb6, cc11, cc11)
551	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
552	FMADD	(aa2, bb6, cc12, cc12)
553	nop
554
555	FMADD	(aa5, bb7, cc13, cc13)
556	LDF	[BO + 44 * SIZE], b5
557	FMADD	(aa2, bb7, cc14, cc14)
558	LDF	[BO + 45 * SIZE], b6
559
560	FMADD	(aa5, bb8, cc15, cc15)
561	LDF	[BO + 46 * SIZE], b7
562	FMADD	(aa2, bb8, cc16, cc16)
563	LDF	[BO + 47 * SIZE], b8
564
565	FMADD	(aa3, bb9, cc01, cc01)
566	FMADD	(aa4, bb9, cc02, cc02)
567	FMADD	(aa3, bb2, cc03, cc03)
568	FMADD	(aa4, bb2, cc04, cc04)
569
570	FMADD	(aa3, bb3, cc05, cc05)
571	LDF	[BO + 56 * SIZE], b9
572	FMADD	(aa4, bb3, cc06, cc06)
573	LDF	[BO + 49 * SIZE], b2
574
575	FMADD	(aa3, bb4, cc07, cc07)
576	LDF	[BO + 50 * SIZE], b3
577	FMADD	(aa4, bb4, cc08, cc08)
578	LDF	[BO + 51 * SIZE], b4
579
580	FMADD	(aa3, bb5, cc09, cc09)
581	LDF	[AO + 12 * SIZE], a5
582	FMADD	(aa4, bb5, cc10, cc10)
583	LDF	[AO + 13 * SIZE], a2
584
585	FMADD	(aa3, bb6, cc11, cc11)
586	cmp	L, 0
587	FMADD	(aa4, bb6, cc12, cc12)
588	nop
589
590	FMADD	(aa3, bb7, cc13, cc13)
591	LDF	[BO + 52 * SIZE], b5
592	FMADD	(aa4, bb7, cc14, cc14)
593	LDF	[BO + 53 * SIZE], b6
594
595	FMADD	(aa3, bb8, cc15, cc15)
596	LDF	[BO + 54 * SIZE], b7
597	FMADD	(aa4, bb8, cc16, cc16)
598	LDF	[BO + 55 * SIZE], b8
599
600	FMADD	(aa5, bb1, cc01, cc01)
601	FMADD	(aa2, bb1, cc02, cc02)
602	FMADD	(aa5, bb2, cc03, cc03)
603	FMADD	(aa2, bb2, cc04, cc04)
604
605	FMADD	(aa5, bb3, cc05, cc05)
606	LDF	[BO + 64 * SIZE], b1
607	FMADD	(aa2, bb3, cc06, cc06)
608	LDF	[BO + 57 * SIZE], b2
609
610	FMADD	(aa5, bb4, cc07, cc07)
611	LDF	[BO + 58 * SIZE], b3
612	FMADD	(aa2, bb4, cc08, cc08)
613	LDF	[BO + 59 * SIZE], b4
614
615	FMADD	(aa5, bb5, cc09, cc09)
616	LDF	[AO + 14 * SIZE], a3
617	FMADD	(aa2, bb5, cc10, cc10)
618	LDF	[AO + 15 * SIZE], a4
619
620	FMADD	(aa5, bb6, cc11, cc11)
621	add	BO, 64 * SIZE, BO
622	FMADD	(aa2, bb6, cc12, cc12)
623	add	AO, 16 * SIZE, AO
624
625	FMADD	(aa5, bb7, cc13, cc13)
626	LDF	[BO -  4 * SIZE], b5
627	FMADD	(aa2, bb7, cc14, cc14)
628	LDF	[BO -  3 * SIZE], b6
629
630	FMADD	(aa5, bb8, cc15, cc15)
631	LDF	[BO -  2 * SIZE], b7
632	FMADD	(aa2, bb8, cc16, cc16)
633	LDF	[BO -  1 * SIZE], b8
634
635	FMADD	(aa3, bb9, cc01, cc01)
636	FMADD	(aa4, bb9, cc02, cc02)
637	FMADD	(aa3, bb2, cc03, cc03)
638	FMADD	(aa4, bb2, cc04, cc04)
639
640	FMADD	(aa3, bb3, cc05, cc05)
641	LDF	[BO +  8 * SIZE], b9
642	FMADD	(aa4, bb3, cc06, cc06)
643	LDF	[BO +  1 * SIZE], b2
644
645	FMADD	(aa3, bb4, cc07, cc07)
646	LDF	[BO +  2 * SIZE], b3
647	FMADD	(aa4, bb4, cc08, cc08)
648	LDF	[BO +  3 * SIZE], b4
649
650	FMADD	(aa3, bb5, cc09, cc09)
651	LDF	[AO +  8 * SIZE], a5  /****/
652	FMADD	(aa4, bb5, cc10, cc10)
653	LDF	[AO +  1 * SIZE], a2
654
655	FMADD	(aa3, bb6, cc11, cc11)
656	FMADD	(aa4, bb6, cc12, cc12)
657
658	FMADD	(aa3, bb7, cc13, cc13)
659	LDF	[BO +  4 * SIZE], b5
660	FMADD	(aa4, bb7, cc14, cc14)
661	LDF	[BO +  5 * SIZE], b6
662
663	FMADD	(aa3, bb8, cc15, cc15)
664	LDF	[BO +  6 * SIZE], b7
665	FMADD	(aa4, bb8, cc16, cc16)
666	ble,pn	%icc, .LL15
667	LDF	[BO +  7 * SIZE], b8
668
669	FMADD	(aa1, bb1, cc01, cc01)
670	FMADD	(aa2, bb1, cc02, cc02)
671	FMADD	(aa1, bb2, cc03, cc03)
672	FMADD	(aa2, bb2, cc04, cc04)
673
674	FMADD	(aa1, bb3, cc05, cc05)
675	LDF	[BO + 16 * SIZE], b1
676	FMADD	(aa2, bb3, cc06, cc06)
677	LDF	[BO +  9 * SIZE], b2
678
679	FMADD	(aa1, bb4, cc07, cc07)
680	LDF	[BO + 10 * SIZE], b3
681	FMADD	(aa2, bb4, cc08, cc08)
682	LDF	[BO + 11 * SIZE], b4
683
684	FMADD	(aa1, bb5, cc09, cc09)
685	LDF	[AO +  2 * SIZE], a3
686	FMADD	(aa2, bb5, cc10, cc10)
687	LDF	[AO +  3 * SIZE], a4
688
689	FMADD	(aa1, bb6, cc11, cc11)
690	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
691	FMADD	(aa2, bb6, cc12, cc12)
692	nop
693
694	FMADD	(aa1, bb7, cc13, cc13)
695	LDF	[BO + 12 * SIZE], b5
696	FMADD	(aa2, bb7, cc14, cc14)
697	LDF	[BO + 13 * SIZE], b6
698
699	FMADD	(aa1, bb8, cc15, cc15)
700	LDF	[BO + 14 * SIZE], b7
701	FMADD	(aa2, bb8, cc16, cc16)
702	LDF	[BO + 15 * SIZE], b8
703
704	FMADD	(aa3, bb9, cc01, cc01)
705	FMADD	(aa4, bb9, cc02, cc02)
706	FMADD	(aa3, bb2, cc03, cc03)
707	FMADD	(aa4, bb2, cc04, cc04)
708
709	FMADD	(aa3, bb3, cc05, cc05)
710	LDF	[BO + 24 * SIZE], b9
711	FMADD	(aa4, bb3, cc06, cc06)
712	LDF	[BO + 17 * SIZE], b2
713
714	FMADD	(aa3, bb4, cc07, cc07)
715	LDF	[BO + 18 * SIZE], b3
716	FMADD	(aa4, bb4, cc08, cc08)
717	LDF	[BO + 19 * SIZE], b4
718
719	FMADD	(aa3, bb5, cc09, cc09)
720	LDF	[AO +  4 * SIZE], a1
721	FMADD	(aa4, bb5, cc10, cc10)
722	LDF	[AO +  5 * SIZE], a2
723
724	FMADD	(aa3, bb6, cc11, cc11)
725	add	L, -1, L
726	FMADD	(aa4, bb6, cc12, cc12)
727	nop
728
729	FMADD	(aa3, bb7, cc13, cc13)
730	LDF	[BO + 20 * SIZE], b5
731	FMADD	(aa4, bb7, cc14, cc14)
732	LDF	[BO + 21 * SIZE], b6
733
734	FMADD	(aa3, bb8, cc15, cc15)
735	LDF	[BO + 22 * SIZE], b7
736	FMADD	(aa4, bb8, cc16, cc16)
737	LDF	[BO + 23 * SIZE], b8
738
739	FMADD	(aa1, bb1, cc01, cc01)
740	FMADD	(aa2, bb1, cc02, cc02)
741	FMADD	(aa1, bb2, cc03, cc03)
742	FMADD	(aa2, bb2, cc04, cc04)
743
744	FMADD	(aa1, bb3, cc05, cc05)
745	LDF	[BO + 32 * SIZE], b1
746	FMADD	(aa2, bb3, cc06, cc06)
747	LDF	[BO + 25 * SIZE], b2
748
749	FMADD	(aa1, bb4, cc07, cc07)
750	LDF	[BO + 26 * SIZE], b3
751	FMADD	(aa2, bb4, cc08, cc08)
752	LDF	[BO + 27 * SIZE], b4
753
754	FMADD	(aa1, bb5, cc09, cc09)
755	LDF	[AO +  6 * SIZE], a3
756	FMADD	(aa2, bb5, cc10, cc10)
757	LDF	[AO +  7 * SIZE], a4
758
759	FMADD	(aa1, bb6, cc11, cc11)
760	nop
761	FMADD	(aa2, bb6, cc12, cc12)
762	nop
763
764	FMADD	(aa1, bb7, cc13, cc13)
765	LDF	[BO + 28 * SIZE], b5
766	FMADD	(aa2, bb7, cc14, cc14)
767	LDF	[BO + 29 * SIZE], b6
768
769	FMADD	(aa1, bb8, cc15, cc15)
770	LDF	[BO + 30 * SIZE], b7
771	FMADD	(aa2, bb8, cc16, cc16)
772	LDF	[BO + 31 * SIZE], b8
773
774	FMADD	(aa3, bb9, cc01, cc01)
775	FMADD	(aa4, bb9, cc02, cc02)
776	FMADD	(aa3, bb2, cc03, cc03)
777	FMADD	(aa4, bb2, cc04, cc04)
778
779	FMADD	(aa3, bb3, cc05, cc05)
780	LDF	[BO + 40 * SIZE], b9
781	FMADD	(aa4, bb3, cc06, cc06)
782	LDF	[BO + 33 * SIZE], b2
783
784	FMADD	(aa3, bb4, cc07, cc07)
785	LDF	[BO + 34 * SIZE], b3
786	FMADD	(aa4, bb4, cc08, cc08)
787	LDF	[BO + 35 * SIZE], b4
788
789	FMADD	(aa3, bb5, cc09, cc09)
790	LDF	[AO + 16 * SIZE], a1  /****/
791	FMADD	(aa4, bb5, cc10, cc10)
792	LDF	[AO +  9 * SIZE], a2
793
794	FMADD	(aa3, bb6, cc11, cc11)
795	nop
796	FMADD	(aa4, bb6, cc12, cc12)
797	nop
798
799	FMADD	(aa3, bb7, cc13, cc13)
800	LDF	[BO + 36 * SIZE], b5
801	FMADD	(aa4, bb7, cc14, cc14)
802	LDF	[BO + 37 * SIZE], b6
803
804	FMADD	(aa3, bb8, cc15, cc15)
805	LDF	[BO + 38 * SIZE], b7
806	FMADD	(aa4, bb8, cc16, cc16)
807	LDF	[BO + 39 * SIZE], b8
808
809	FMADD	(aa5, bb1, cc01, cc01)
810	FMADD	(aa2, bb1, cc02, cc02)
811	FMADD	(aa5, bb2, cc03, cc03)
812	FMADD	(aa2, bb2, cc04, cc04)
813
814	FMADD	(aa5, bb3, cc05, cc05)
815	LDF	[BO + 48 * SIZE], b1
816	FMADD	(aa2, bb3, cc06, cc06)
817	LDF	[BO + 41 * SIZE], b2
818
819	FMADD	(aa5, bb4, cc07, cc07)
820	LDF	[BO + 42 * SIZE], b3
821	FMADD	(aa2, bb4, cc08, cc08)
822	LDF	[BO + 43 * SIZE], b4
823
824	FMADD	(aa5, bb5, cc09, cc09)
825	LDF	[AO + 10 * SIZE], a3
826	FMADD	(aa2, bb5, cc10, cc10)
827	LDF	[AO + 11 * SIZE], a4
828
829	FMADD	(aa5, bb6, cc11, cc11)
830	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
831	FMADD	(aa2, bb6, cc12, cc12)
832	nop
833
834	FMADD	(aa5, bb7, cc13, cc13)
835	LDF	[BO + 44 * SIZE], b5
836	FMADD	(aa2, bb7, cc14, cc14)
837	LDF	[BO + 45 * SIZE], b6
838
839	FMADD	(aa5, bb8, cc15, cc15)
840	LDF	[BO + 46 * SIZE], b7
841	FMADD	(aa2, bb8, cc16, cc16)
842	LDF	[BO + 47 * SIZE], b8
843
844	FMADD	(aa3, bb9, cc01, cc01)
845	FMADD	(aa4, bb9, cc02, cc02)
846	FMADD	(aa3, bb2, cc03, cc03)
847	FMADD	(aa4, bb2, cc04, cc04)
848
849	FMADD	(aa3, bb3, cc05, cc05)
850	LDF	[BO + 56 * SIZE], b9
851	FMADD	(aa4, bb3, cc06, cc06)
852	LDF	[BO + 49 * SIZE], b2
853
854	FMADD	(aa3, bb4, cc07, cc07)
855	LDF	[BO + 50 * SIZE], b3
856	FMADD	(aa4, bb4, cc08, cc08)
857	LDF	[BO + 51 * SIZE], b4
858
859	FMADD	(aa3, bb5, cc09, cc09)
860	LDF	[AO + 12 * SIZE], a5
861	FMADD	(aa4, bb5, cc10, cc10)
862	LDF	[AO + 13 * SIZE], a2
863
864	FMADD	(aa3, bb6, cc11, cc11)
865	cmp	L, 0
866	FMADD	(aa4, bb6, cc12, cc12)
867	nop
868
869	FMADD	(aa3, bb7, cc13, cc13)
870	LDF	[BO + 52 * SIZE], b5
871	FMADD	(aa4, bb7, cc14, cc14)
872	LDF	[BO + 53 * SIZE], b6
873
874	FMADD	(aa3, bb8, cc15, cc15)
875	LDF	[BO + 54 * SIZE], b7
876	FMADD	(aa4, bb8, cc16, cc16)
877	LDF	[BO + 55 * SIZE], b8
878
879	FMADD	(aa5, bb1, cc01, cc01)
880	FMADD	(aa2, bb1, cc02, cc02)
881	FMADD	(aa5, bb2, cc03, cc03)
882	FMADD	(aa2, bb2, cc04, cc04)
883
884	FMADD	(aa5, bb3, cc05, cc05)
885	LDF	[BO + 64 * SIZE], b1
886	FMADD	(aa2, bb3, cc06, cc06)
887	LDF	[BO + 57 * SIZE], b2
888
889	FMADD	(aa5, bb4, cc07, cc07)
890	LDF	[BO + 58 * SIZE], b3
891	FMADD	(aa2, bb4, cc08, cc08)
892	LDF	[BO + 59 * SIZE], b4
893
894	FMADD	(aa5, bb5, cc09, cc09)
895	LDF	[AO + 14 * SIZE], a3
896	FMADD	(aa2, bb5, cc10, cc10)
897	LDF	[AO + 15 * SIZE], a4
898
899	FMADD	(aa5, bb6, cc11, cc11)
900	add	BO, 64 * SIZE, BO
901	FMADD	(aa2, bb6, cc12, cc12)
902	add	AO, 16 * SIZE, AO
903
904	FMADD	(aa5, bb7, cc13, cc13)
905	LDF	[BO -  4 * SIZE], b5
906	FMADD	(aa2, bb7, cc14, cc14)
907	LDF	[BO -  3 * SIZE], b6
908
909	FMADD	(aa5, bb8, cc15, cc15)
910	LDF	[BO -  2 * SIZE], b7
911	FMADD	(aa2, bb8, cc16, cc16)
912	LDF	[BO -  1 * SIZE], b8
913
914	FMADD	(aa3, bb9, cc01, cc01)
915	FMADD	(aa4, bb9, cc02, cc02)
916	FMADD	(aa3, bb2, cc03, cc03)
917	FMADD	(aa4, bb2, cc04, cc04)
918
919	FMADD	(aa3, bb3, cc05, cc05)
920	LDF	[BO +  8 * SIZE], b9
921	FMADD	(aa4, bb3, cc06, cc06)
922	LDF	[BO +  1 * SIZE], b2
923
924	FMADD	(aa3, bb4, cc07, cc07)
925	LDF	[BO +  2 * SIZE], b3
926	FMADD	(aa4, bb4, cc08, cc08)
927	LDF	[BO +  3 * SIZE], b4
928
929	FMADD	(aa3, bb5, cc09, cc09)
930	LDF	[AO +  8 * SIZE], a5  /****/
931	FMADD	(aa4, bb5, cc10, cc10)
932	LDF	[AO +  1 * SIZE], a2
933
934	FMADD	(aa3, bb6, cc11, cc11)
935	FMADD	(aa4, bb6, cc12, cc12)
936
937	FMADD	(aa3, bb7, cc13, cc13)
938	LDF	[BO +  4 * SIZE], b5
939	FMADD	(aa4, bb7, cc14, cc14)
940	LDF	[BO +  5 * SIZE], b6
941
942	FMADD	(aa3, bb8, cc15, cc15)
943	LDF	[BO +  6 * SIZE], b7
944	FMADD	(aa4, bb8, cc16, cc16)
945	bg,pt	%icc, .LL13
946	LDF	[BO +  7 * SIZE], b8
947	.align 4
948
949.LL15:
950#if defined(LT) || defined(RN)
951	and	KK, 7, L
952#else
953	sub	K, KK, L
954	and	L,  7, L
955#endif
956	cmp	L,  0
957	ble,a,pn %icc, .LL18
958	nop
959	.align 4
960
961.LL17:
962	FMADD	(aa1, bb1, cc01, cc01)
963	add	L, -1, L
964	FMADD	(aa2, bb1, cc02, cc02)
965	nop
966
967	FMADD	(aa1, bb2, cc03, cc03)
968	LDF	[BO +  8 * SIZE], b1
969	FMADD	(aa2, bb2, cc04, cc04)
970	LDF	[BO +  9 * SIZE], b2
971
972	FMADD	(aa1, bb3, cc05, cc05)
973	cmp	L, 0
974	FMADD	(aa2, bb3, cc06, cc06)
975	nop
976
977	FMADD	(aa1, bb4, cc07, cc07)
978	LDF	[BO + 10 * SIZE], b3
979	FMADD	(aa2, bb4, cc08, cc08)
980	LDF	[BO + 11 * SIZE], b4
981
982	FMADD	(aa1, bb5, cc09, cc09)
983	nop
984	FMADD	(aa2, bb5, cc10, cc10)
985	nop
986
987	FMADD	(aa1, bb6, cc11, cc11)
988	LDF	[BO + 12 * SIZE], b5
989	FMADD	(aa2, bb6, cc12, cc12)
990	LDF	[BO + 13 * SIZE], b6
991
992	FMADD	(aa1, bb7, cc13, cc13)
993	add	AO, 2 * SIZE, AO
994	FMADD	(aa2, bb7, cc14, cc14)
995	add	BO, 8 * SIZE, BO
996
997	FMADD	(aa1, bb8, cc15, cc15)
998	LDF	[AO +  0 * SIZE], a1
999	FMADD	(aa2, bb8, cc16, cc16)
1000	LDF	[AO +  1 * SIZE], a2
1001
1002	LDF	[BO +  6 * SIZE], b7
1003	bg,pt	%icc, .LL17
1004	LDF	[BO +  7 * SIZE], b8
1005	nop
1006	.align 4
1007
1008.LL18:
1009#if defined(LN) || defined(RT)
1010#ifdef LN
1011	sub	KK, 2, TEMP1
1012#else
1013	sub	KK, 8, TEMP1
1014#endif
1015	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1016	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1017
1018	add	AORIG, TEMP2, AO
1019	add	B,     TEMP1, BO
1020#endif
1021
1022#if defined(LN) || defined(LT)
1023	LDF	[BO +  0 * SIZE], a1
1024	LDF	[BO +  1 * SIZE], a2
1025	LDF	[BO +  2 * SIZE], a3
1026	LDF	[BO +  3 * SIZE], a4
1027
1028	LDF	[BO +  4 * SIZE], b1
1029	LDF	[BO +  5 * SIZE], b2
1030	LDF	[BO +  6 * SIZE], b3
1031	LDF	[BO +  7 * SIZE], b4
1032
1033	FSUB	a1, c01, c01
1034	FSUB	a2, c03, c03
1035	FSUB	a3, c05, c05
1036	FSUB	a4, c07, c07
1037
1038	FSUB	b1, c09, c09
1039	FSUB	b2, c11, c11
1040	FSUB	b3, c13, c13
1041	FSUB	b4, c15, c15
1042
1043	LDF	[BO +  8 * SIZE], a1
1044	LDF	[BO +  9 * SIZE], a2
1045	LDF	[BO + 10 * SIZE], a3
1046	LDF	[BO + 11 * SIZE], a4
1047
1048	LDF	[BO + 12 * SIZE], b1
1049	LDF	[BO + 13 * SIZE], b2
1050	LDF	[BO + 14 * SIZE], b3
1051	LDF	[BO + 15 * SIZE], b4
1052
1053	FSUB	a1, c02, c02
1054	FSUB	a2, c04, c04
1055	FSUB	a3, c06, c06
1056	FSUB	a4, c08, c08
1057
1058	FSUB	b1, c10, c10
1059	FSUB	b2, c12, c12
1060	FSUB	b3, c14, c14
1061	FSUB	b4, c16, c16
1062#else
1063	LDF	[AO +  0 * SIZE], a1
1064	LDF	[AO +  1 * SIZE], a2
1065	LDF	[AO +  2 * SIZE], a3
1066	LDF	[AO +  3 * SIZE], a4
1067
1068	LDF	[AO +  4 * SIZE], b1
1069	LDF	[AO +  5 * SIZE], b2
1070	LDF	[AO +  6 * SIZE], b3
1071	LDF	[AO +  7 * SIZE], b4
1072
1073	FSUB	a1, c01, c01
1074	FSUB	a2, c02, c02
1075	FSUB	a3, c03, c03
1076	FSUB	a4, c04, c04
1077
1078	FSUB	b1, c05, c05
1079	FSUB	b2, c06, c06
1080	FSUB	b3, c07, c07
1081	FSUB	b4, c08, c08
1082
1083	LDF	[AO +  8 * SIZE], a1
1084	LDF	[AO +  9 * SIZE], a2
1085	LDF	[AO + 10 * SIZE], a3
1086	LDF	[AO + 11 * SIZE], a4
1087
1088	LDF	[AO + 12 * SIZE], b1
1089	LDF	[AO + 13 * SIZE], b2
1090	LDF	[AO + 14 * SIZE], b3
1091	LDF	[AO + 15 * SIZE], b4
1092
1093	FSUB	a1, c09, c09
1094	FSUB	a2, c10, c10
1095	FSUB	a3, c11, c11
1096	FSUB	a4, c12, c12
1097
1098	FSUB	b1, c13, c13
1099	FSUB	b2, c14, c14
1100	FSUB	b3, c15, c15
1101	FSUB	b4, c16, c16
1102#endif
1103
1104#ifdef LN
1105	LDF	[AO +  3 * SIZE], a1
1106	LDF	[AO +  2 * SIZE], a2
1107	LDF	[AO +  0 * SIZE], a3
1108
1109	FMUL	a1, c02, c02
1110	FMUL	a1, c04, c04
1111	FMUL	a1, c06, c06
1112	FMUL	a1, c08, c08
1113	FMUL	a1, c10, c10
1114	FMUL	a1, c12, c12
1115	FMUL	a1, c14, c14
1116	FMUL	a1, c16, c16
1117
1118	FNMSUB	(aa2, cc02, cc01, cc01)
1119	FNMSUB	(aa2, cc04, cc03, cc03)
1120	FNMSUB	(aa2, cc06, cc05, cc05)
1121	FNMSUB	(aa2, cc08, cc07, cc07)
1122	FNMSUB	(aa2, cc10, cc09, cc09)
1123	FNMSUB	(aa2, cc12, cc11, cc11)
1124	FNMSUB	(aa2, cc14, cc13, cc13)
1125	FNMSUB	(aa2, cc16, cc15, cc15)
1126
1127	FMUL	a3, c01, c01
1128	FMUL	a3, c03, c03
1129	FMUL	a3, c05, c05
1130	FMUL	a3, c07, c07
1131	FMUL	a3, c09, c09
1132	FMUL	a3, c11, c11
1133	FMUL	a3, c13, c13
1134	FMUL	a3, c15, c15
1135#endif
1136
1137#ifdef LT
1138	LDF	[AO +  0 * SIZE], a1
1139	LDF	[AO +  1 * SIZE], a2
1140	LDF	[AO +  3 * SIZE], a3
1141
1142	FMUL	a1, c01, c01
1143	FMUL	a1, c03, c03
1144	FMUL	a1, c05, c05
1145	FMUL	a1, c07, c07
1146	FMUL	a1, c09, c09
1147	FMUL	a1, c11, c11
1148	FMUL	a1, c13, c13
1149	FMUL	a1, c15, c15
1150
1151	FNMSUB	(aa2, cc01, cc02, cc02)
1152	FNMSUB	(aa2, cc03, cc04, cc04)
1153	FNMSUB	(aa2, cc05, cc06, cc06)
1154	FNMSUB	(aa2, cc07, cc08, cc08)
1155	FNMSUB	(aa2, cc09, cc10, cc10)
1156	FNMSUB	(aa2, cc11, cc12, cc12)
1157	FNMSUB	(aa2, cc13, cc14, cc14)
1158	FNMSUB	(aa2, cc15, cc16, cc16)
1159
1160	FMUL	a3, c02, c02
1161	FMUL	a3, c04, c04
1162	FMUL	a3, c06, c06
1163	FMUL	a3, c08, c08
1164	FMUL	a3, c10, c10
1165	FMUL	a3, c12, c12
1166	FMUL	a3, c14, c14
1167	FMUL	a3, c16, c16
1168#endif
1169
1170#ifdef RN
1171	LDF	[BO +  0 * SIZE], a1
1172	LDF	[BO +  1 * SIZE], a2
1173	LDF	[BO +  2 * SIZE], a3
1174	LDF	[BO +  3 * SIZE], a4
1175	LDF	[BO +  4 * SIZE], b1
1176	LDF	[BO +  5 * SIZE], b2
1177	LDF	[BO +  6 * SIZE], b3
1178	LDF	[BO +  7 * SIZE], b4
1179
1180	FMUL	a1, c01, c01
1181	FMUL	a1, c02, c02
1182
1183	FNMSUB	(aa2, cc01, cc03, cc03)
1184	FNMSUB	(aa2, cc02, cc04, cc04)
1185	FNMSUB	(aa3, cc01, cc05, cc05)
1186	FNMSUB	(aa3, cc02, cc06, cc06)
1187	FNMSUB	(aa4, cc01, cc07, cc07)
1188	FNMSUB	(aa4, cc02, cc08, cc08)
1189	FNMSUB	(bb1, cc01, cc09, cc09)
1190	FNMSUB	(bb1, cc02, cc10, cc10)
1191	FNMSUB	(bb2, cc01, cc11, cc11)
1192	FNMSUB	(bb2, cc02, cc12, cc12)
1193	FNMSUB	(bb3, cc01, cc13, cc13)
1194	FNMSUB	(bb3, cc02, cc14, cc14)
1195	FNMSUB	(bb4, cc01, cc15, cc15)
1196	FNMSUB	(bb4, cc02, cc16, cc16)
1197
1198	LDF	[BO +  9 * SIZE], a1
1199	LDF	[BO + 10 * SIZE], a2
1200	LDF	[BO + 11 * SIZE], a3
1201	LDF	[BO + 12 * SIZE], a4
1202	LDF	[BO + 13 * SIZE], b1
1203	LDF	[BO + 14 * SIZE], b2
1204	LDF	[BO + 15 * SIZE], b3
1205
1206	FMUL	a1, c03, c03
1207	FMUL	a1, c04, c04
1208
1209	FNMSUB	(aa2, cc03, cc05, cc05)
1210	FNMSUB	(aa2, cc04, cc06, cc06)
1211	FNMSUB	(aa3, cc03, cc07, cc07)
1212	FNMSUB	(aa3, cc04, cc08, cc08)
1213	FNMSUB	(aa4, cc03, cc09, cc09)
1214	FNMSUB	(aa4, cc04, cc10, cc10)
1215	FNMSUB	(bb1, cc03, cc11, cc11)
1216	FNMSUB	(bb1, cc04, cc12, cc12)
1217	FNMSUB	(bb2, cc03, cc13, cc13)
1218	FNMSUB	(bb2, cc04, cc14, cc14)
1219	FNMSUB	(bb3, cc03, cc15, cc15)
1220	FNMSUB	(bb3, cc04, cc16, cc16)
1221
1222	LDF	[BO + 18 * SIZE], a1
1223	LDF	[BO + 19 * SIZE], a2
1224	LDF	[BO + 20 * SIZE], a3
1225	LDF	[BO + 21 * SIZE], a4
1226	LDF	[BO + 22 * SIZE], b1
1227	LDF	[BO + 23 * SIZE], b2
1228
1229	FMUL	a1, c05, c05
1230	FMUL	a1, c06, c06
1231
1232	FNMSUB	(aa2, cc05, cc07, cc07)
1233	FNMSUB	(aa2, cc06, cc08, cc08)
1234	FNMSUB	(aa3, cc05, cc09, cc09)
1235	FNMSUB	(aa3, cc06, cc10, cc10)
1236	FNMSUB	(aa4, cc05, cc11, cc11)
1237	FNMSUB	(aa4, cc06, cc12, cc12)
1238	FNMSUB	(bb1, cc05, cc13, cc13)
1239	FNMSUB	(bb1, cc06, cc14, cc14)
1240	FNMSUB	(bb2, cc05, cc15, cc15)
1241	FNMSUB	(bb2, cc06, cc16, cc16)
1242
1243	LDF	[BO + 27 * SIZE], a1
1244	LDF	[BO + 28 * SIZE], a2
1245	LDF	[BO + 29 * SIZE], a3
1246	LDF	[BO + 30 * SIZE], a4
1247	LDF	[BO + 31 * SIZE], b1
1248
1249	FMUL	a1, c07, c07
1250	FMUL	a1, c08, c08
1251
1252	FNMSUB	(aa2, cc07, cc09, cc09)
1253	FNMSUB	(aa2, cc08, cc10, cc10)
1254	FNMSUB	(aa3, cc07, cc11, cc11)
1255	FNMSUB	(aa3, cc08, cc12, cc12)
1256	FNMSUB	(aa4, cc07, cc13, cc13)
1257	FNMSUB	(aa4, cc08, cc14, cc14)
1258	FNMSUB	(bb1, cc07, cc15, cc15)
1259	FNMSUB	(bb1, cc08, cc16, cc16)
1260
1261	LDF	[BO + 36 * SIZE], a1
1262	LDF	[BO + 37 * SIZE], a2
1263	LDF	[BO + 38 * SIZE], a3
1264	LDF	[BO + 39 * SIZE], a4
1265
1266	FMUL	a1, c09, c09
1267	FMUL	a1, c10, c10
1268
1269	FNMSUB	(aa2, cc09, cc11, cc11)
1270	FNMSUB	(aa2, cc10, cc12, cc12)
1271	FNMSUB	(aa3, cc09, cc13, cc13)
1272	FNMSUB	(aa3, cc10, cc14, cc14)
1273	FNMSUB	(aa4, cc09, cc15, cc15)
1274	FNMSUB	(aa4, cc10, cc16, cc16)
1275
1276	LDF	[BO + 45 * SIZE], a1
1277	LDF	[BO + 46 * SIZE], a2
1278	LDF	[BO + 47 * SIZE], a3
1279
1280	FMUL	a1, c11, c11
1281	FMUL	a1, c12, c12
1282
1283	FNMSUB	(aa2, cc11, cc13, cc13)
1284	FNMSUB	(aa2, cc12, cc14, cc14)
1285	FNMSUB	(aa3, cc11, cc15, cc15)
1286	FNMSUB	(aa3, cc12, cc16, cc16)
1287
1288	LDF	[BO + 54 * SIZE], a1
1289	LDF	[BO + 55 * SIZE], a2
1290
1291	FMUL	a1, c13, c13
1292	FMUL	a1, c14, c14
1293
1294	FNMSUB	(aa2, cc13, cc15, cc15)
1295	FNMSUB	(aa2, cc14, cc16, cc16)
1296
1297	LDF	[BO + 63 * SIZE], a1
1298
1299	FMUL	a1, c15, c15
1300	FMUL	a1, c16, c16
1301#endif
1302
1303#ifdef RT
1304	LDF	[BO + 63 * SIZE], a1
1305	LDF	[BO + 62 * SIZE], a2
1306	LDF	[BO + 61 * SIZE], a3
1307	LDF	[BO + 60 * SIZE], a4
1308	LDF	[BO + 59 * SIZE], b1
1309	LDF	[BO + 58 * SIZE], b2
1310	LDF	[BO + 57 * SIZE], b3
1311	LDF	[BO + 56 * SIZE], b4
1312
1313	FMUL	a1, c16, c16
1314	FMUL	a1, c15, c15
1315
1316	FNMSUB	(aa2, cc16, cc14, cc14)
1317	FNMSUB	(aa2, cc15, cc13, cc13)
1318	FNMSUB	(aa3, cc16, cc12, cc12)
1319	FNMSUB	(aa3, cc15, cc11, cc11)
1320	FNMSUB	(aa4, cc16, cc10, cc10)
1321	FNMSUB	(aa4, cc15, cc09, cc09)
1322	FNMSUB	(bb1, cc16, cc08, cc08)
1323	FNMSUB	(bb1, cc15, cc07, cc07)
1324	FNMSUB	(bb2, cc16, cc06, cc06)
1325	FNMSUB	(bb2, cc15, cc05, cc05)
1326	FNMSUB	(bb3, cc16, cc04, cc04)
1327	FNMSUB	(bb3, cc15, cc03, cc03)
1328	FNMSUB	(bb4, cc16, cc02, cc02)
1329	FNMSUB	(bb4, cc15, cc01, cc01)
1330
1331	LDF	[BO + 54 * SIZE], a1
1332	LDF	[BO + 53 * SIZE], a2
1333	LDF	[BO + 52 * SIZE], a3
1334	LDF	[BO + 51 * SIZE], a4
1335	LDF	[BO + 50 * SIZE], b1
1336	LDF	[BO + 49 * SIZE], b2
1337	LDF	[BO + 48 * SIZE], b3
1338
1339	FMUL	a1, c14, c14
1340	FMUL	a1, c13, c13
1341
1342	FNMSUB	(aa2, cc14, cc12, cc12)
1343	FNMSUB	(aa2, cc13, cc11, cc11)
1344	FNMSUB	(aa3, cc14, cc10, cc10)
1345	FNMSUB	(aa3, cc13, cc09, cc09)
1346	FNMSUB	(aa4, cc14, cc08, cc08)
1347	FNMSUB	(aa4, cc13, cc07, cc07)
1348	FNMSUB	(bb1, cc14, cc06, cc06)
1349	FNMSUB	(bb1, cc13, cc05, cc05)
1350	FNMSUB	(bb2, cc14, cc04, cc04)
1351	FNMSUB	(bb2, cc13, cc03, cc03)
1352	FNMSUB	(bb3, cc14, cc02, cc02)
1353	FNMSUB	(bb3, cc13, cc01, cc01)
1354
1355	LDF	[BO + 45 * SIZE], a1
1356	LDF	[BO + 44 * SIZE], a2
1357	LDF	[BO + 43 * SIZE], a3
1358	LDF	[BO + 42 * SIZE], a4
1359	LDF	[BO + 41 * SIZE], b1
1360	LDF	[BO + 40 * SIZE], b2
1361
1362	FMUL	a1, c12, c12
1363	FMUL	a1, c11, c11
1364
1365	FNMSUB	(aa2, cc12, cc10, cc10)
1366	FNMSUB	(aa2, cc11, cc09, cc09)
1367	FNMSUB	(aa3, cc12, cc08, cc08)
1368	FNMSUB	(aa3, cc11, cc07, cc07)
1369	FNMSUB	(aa4, cc12, cc06, cc06)
1370	FNMSUB	(aa4, cc11, cc05, cc05)
1371	FNMSUB	(bb1, cc12, cc04, cc04)
1372	FNMSUB	(bb1, cc11, cc03, cc03)
1373	FNMSUB	(bb2, cc12, cc02, cc02)
1374	FNMSUB	(bb2, cc11, cc01, cc01)
1375
1376	LDF	[BO + 36 * SIZE], a1
1377	LDF	[BO + 35 * SIZE], a2
1378	LDF	[BO + 34 * SIZE], a3
1379	LDF	[BO + 33 * SIZE], a4
1380	LDF	[BO + 32 * SIZE], b1
1381
1382	FMUL	a1, c10, c10
1383	FMUL	a1, c09, c09
1384
1385	FNMSUB	(aa2, cc10, cc08, cc08)
1386	FNMSUB	(aa2, cc09, cc07, cc07)
1387	FNMSUB	(aa3, cc10, cc06, cc06)
1388	FNMSUB	(aa3, cc09, cc05, cc05)
1389	FNMSUB	(aa4, cc10, cc04, cc04)
1390	FNMSUB	(aa4, cc09, cc03, cc03)
1391	FNMSUB	(bb1, cc10, cc02, cc02)
1392	FNMSUB	(bb1, cc09, cc01, cc01)
1393
1394	LDF	[BO + 27 * SIZE], a1
1395	LDF	[BO + 26 * SIZE], a2
1396	LDF	[BO + 25 * SIZE], a3
1397	LDF	[BO + 24 * SIZE], a4
1398
1399	FMUL	a1, c08, c08
1400	FMUL	a1, c07, c07
1401
1402	FNMSUB	(aa2, cc08, cc06, cc06)
1403	FNMSUB	(aa2, cc07, cc05, cc05)
1404	FNMSUB	(aa3, cc08, cc04, cc04)
1405	FNMSUB	(aa3, cc07, cc03, cc03)
1406	FNMSUB	(aa4, cc08, cc02, cc02)
1407	FNMSUB	(aa4, cc07, cc01, cc01)
1408
1409	LDF	[BO + 18 * SIZE], a1
1410	LDF	[BO + 17 * SIZE], a2
1411	LDF	[BO + 16 * SIZE], a3
1412
1413	FMUL	a1, c06, c06
1414	FMUL	a1, c05, c05
1415
1416	FNMSUB	(aa2, cc06, cc04, cc04)
1417	FNMSUB	(aa2, cc05, cc03, cc03)
1418	FNMSUB	(aa3, cc06, cc02, cc02)
1419	FNMSUB	(aa3, cc05, cc01, cc01)
1420
1421	LDF	[BO +  9 * SIZE], a1
1422	LDF	[BO +  8 * SIZE], a2
1423
1424	FMUL	a1, c04, c04
1425	FMUL	a1, c03, c03
1426
1427	FNMSUB	(aa2, cc04, cc02, cc02)
1428	FNMSUB	(aa2, cc03, cc01, cc01)
1429
1430	LDF	[BO +  0 * SIZE], a1
1431
1432	FMUL	a1, c02, c02
1433	FMUL	a1, c01, c01
1434#endif
1435
1436#ifdef LN
1437	add	C1, -2 * SIZE, C1
1438	add	C2, -2 * SIZE, C2
1439	add	C3, -2 * SIZE, C3
1440	add	C4, -2 * SIZE, C4
1441	add	C5, -2 * SIZE, C5
1442	add	C6, -2 * SIZE, C6
1443	add	C7, -2 * SIZE, C7
1444	add	C8, -2 * SIZE, C8
1445#endif
1446
1447#if defined(LN) || defined(LT)
1448	STF	c01, [BO +  0 * SIZE]
1449	STF	c03, [BO +  1 * SIZE]
1450	STF	c05, [BO +  2 * SIZE]
1451	STF	c07, [BO +  3 * SIZE]
1452
1453	STF	c09, [BO +  4 * SIZE]
1454	STF	c11, [BO +  5 * SIZE]
1455	STF	c13, [BO +  6 * SIZE]
1456	STF	c15, [BO +  7 * SIZE]
1457
1458	STF	c02, [BO +  8 * SIZE]
1459	STF	c04, [BO +  9 * SIZE]
1460	STF	c06, [BO + 10 * SIZE]
1461	STF	c08, [BO + 11 * SIZE]
1462
1463	STF	c10, [BO + 12 * SIZE]
1464	STF	c12, [BO + 13 * SIZE]
1465	STF	c14, [BO + 14 * SIZE]
1466	STF	c16, [BO + 15 * SIZE]
1467#else
1468	STF	c01, [AO +  0 * SIZE]
1469	STF	c02, [AO +  1 * SIZE]
1470	STF	c03, [AO +  2 * SIZE]
1471	STF	c04, [AO +  3 * SIZE]
1472
1473	STF	c05, [AO +  4 * SIZE]
1474	STF	c06, [AO +  5 * SIZE]
1475	STF	c07, [AO +  6 * SIZE]
1476	STF	c08, [AO +  7 * SIZE]
1477
1478	STF	c09, [AO +  8 * SIZE]
1479	STF	c10, [AO +  9 * SIZE]
1480	STF	c11, [AO + 10 * SIZE]
1481	STF	c12, [AO + 11 * SIZE]
1482
1483	STF	c13, [AO + 12 * SIZE]
1484	STF	c14, [AO + 13 * SIZE]
1485	STF	c15, [AO + 14 * SIZE]
1486	STF	c16, [AO + 15 * SIZE]
1487#endif
1488
1489	STF	c01, [C1 + 0 * SIZE]
1490	STF	c02, [C1 + 1 * SIZE]
1491	STF	c03, [C2 + 0 * SIZE]
1492	STF	c04, [C2 + 1 * SIZE]
1493
1494	STF	c05, [C3 + 0 * SIZE]
1495	STF	c06, [C3 + 1 * SIZE]
1496	STF	c07, [C4 + 0 * SIZE]
1497	STF	c08, [C4 + 1 * SIZE]
1498
1499	STF	c09, [C5 + 0 * SIZE]
1500	STF	c10, [C5 + 1 * SIZE]
1501	STF	c11, [C6 + 0 * SIZE]
1502	STF	c12, [C6 + 1 * SIZE]
1503
1504	STF	c13, [C7 + 0 * SIZE]
1505	STF	c14, [C7 + 1 * SIZE]
1506	STF	c15, [C8 + 0 * SIZE]
1507	STF	c16, [C8 + 1 * SIZE]
1508
1509#ifndef LN
1510	add	C1, 2 * SIZE, C1
1511	add	C2, 2 * SIZE, C2
1512	add	C3, 2 * SIZE, C3
1513	add	C4, 2 * SIZE, C4
1514	add	C5, 2 * SIZE, C5
1515	add	C6, 2 * SIZE, C6
1516	add	C7, 2 * SIZE, C7
1517	add	C8, 2 * SIZE, C8
1518#endif
1519
1520#ifdef RT
1521	sll	K, BASE_SHIFT + 1, TEMP1
1522	add	AORIG, TEMP1, AORIG
1523#endif
1524
1525#if defined(LT) || defined(RN)
1526	sub	K, KK, TEMP1
1527	sll	TEMP1, BASE_SHIFT + 1, TEMP2
1528	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1529	add	AO, TEMP2, AO
1530	add	BO, TEMP1, BO
1531#endif
1532
1533#ifdef LT
1534	add	KK, 2, KK
1535#endif
1536
1537#ifdef LN
1538	sub	KK, 2, KK
1539#endif
1540
1541	add	I, -1, I
1542	cmp	I, 0
1543	bg,pt	%icc, .LL12
1544	nop
1545	.align 4
1546
1547.LL20:
1548	and	M, 1, I
1549	cmp	I, 0
1550	ble,pn	%icc, .LL29
1551	nop
1552
1553#if defined(LT) || defined(RN)
1554	mov	B, BO
1555#else
1556#ifdef LN
1557	sll	K,  BASE_SHIFT + 0, TEMP1
1558	sub	AORIG, TEMP1, AORIG
1559#endif
1560
1561	sll	KK, BASE_SHIFT + 0, TEMP1
1562	sll	KK, BASE_SHIFT + 3, TEMP2
1563
1564	add	AORIG, TEMP1, AO
1565	add	B,     TEMP2, BO
1566#endif
1567
1568	LDF	[AO +  0 * SIZE], a1
1569	LDF	[AO +  1 * SIZE], a2
1570	LDF	[AO +  2 * SIZE], a3
1571	LDF	[AO +  3 * SIZE], a4
1572
1573	LDF	[BO +  0 * SIZE], b1
1574	FCLR	(cc01)
1575	LDF	[BO +  1 * SIZE], b2
1576	FCLR	(cc03)
1577	LDF	[BO +  2 * SIZE], b3
1578	FCLR	(cc05)
1579	LDF	[BO +  3 * SIZE], b4
1580	FCLR	(cc07)
1581	LDF	[BO +  4 * SIZE], b5
1582	FCLR	(cc09)
1583	LDF	[BO +  5 * SIZE], b6
1584	FCLR	(cc11)
1585	LDF	[BO +  6 * SIZE], b7
1586	FCLR	(cc13)
1587	LDF	[BO +  7 * SIZE], b8
1588	FCLR	(cc15)
1589
1590#if defined(LT) || defined(RN)
1591	sra	KK, 2, L
1592#else
1593	sub	K, KK, L
1594	sra	L,  2, L
1595#endif
1596	cmp	L,  0
1597	ble,pn	%icc, .LL25
1598	LDF	[BO +  8 * SIZE], b9
1599	.align 4
1600
1601.LL23:
1602	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1603	add	L, -1, L
1604
1605	FMADD	(aa1, bb1, cc01, cc01)
1606	LDF	[BO + 16 * SIZE], b1
1607	FMADD	(aa1, bb2, cc03, cc03)
1608	LDF	[BO +  9 * SIZE], b2
1609
1610	FMADD	(aa1, bb3, cc05, cc05)
1611	LDF	[BO + 10 * SIZE], b3
1612	FMADD	(aa1, bb4, cc07, cc07)
1613	LDF	[BO + 11 * SIZE], b4
1614
1615	FMADD	(aa1, bb5, cc09, cc09)
1616	LDF	[BO + 12 * SIZE], b5
1617	FMADD	(aa1, bb6, cc11, cc11)
1618	LDF	[BO + 13 * SIZE], b6
1619
1620	FMADD	(aa1, bb7, cc13, cc13)
1621	LDF	[BO + 14 * SIZE], b7
1622	FMADD	(aa1, bb8, cc15, cc15)
1623	LDF	[BO + 15 * SIZE], b8
1624
1625	FMADD	(aa2, bb9, cc01, cc01)
1626	LDF	[BO + 24 * SIZE], b9
1627	FMADD	(aa2, bb2, cc03, cc03)
1628	LDF	[BO + 17 * SIZE], b2
1629
1630	FMADD	(aa2, bb3, cc05, cc05)
1631	LDF	[BO + 18 * SIZE], b3
1632	FMADD	(aa2, bb4, cc07, cc07)
1633	LDF	[BO + 19 * SIZE], b4
1634
1635	FMADD	(aa2, bb5, cc09, cc09)
1636	LDF	[BO + 20 * SIZE], b5
1637	FMADD	(aa2, bb6, cc11, cc11)
1638	LDF	[BO + 21 * SIZE], b6
1639
1640	FMADD	(aa2, bb7, cc13, cc13)
1641	LDF	[BO + 22 * SIZE], b7
1642	FMADD	(aa2, bb8, cc15, cc15)
1643	LDF	[BO + 23 * SIZE], b8
1644
1645	LDF	[AO +  4 * SIZE], a1
1646	LDF	[AO +  5 * SIZE], a2
1647
1648	FMADD	(aa3, bb1, cc01, cc01)
1649	LDF	[BO + 32 * SIZE], b1
1650	FMADD	(aa3, bb2, cc03, cc03)
1651	LDF	[BO + 25 * SIZE], b2
1652
1653	FMADD	(aa3, bb3, cc05, cc05)
1654	LDF	[BO + 26 * SIZE], b3
1655	FMADD	(aa3, bb4, cc07, cc07)
1656	LDF	[BO + 27 * SIZE], b4
1657
1658	FMADD	(aa3, bb5, cc09, cc09)
1659	LDF	[BO + 28 * SIZE], b5
1660	FMADD	(aa3, bb6, cc11, cc11)
1661	LDF	[BO + 29 * SIZE], b6
1662
1663	FMADD	(aa3, bb7, cc13, cc13)
1664	LDF	[BO + 30 * SIZE], b7
1665	FMADD	(aa3, bb8, cc15, cc15)
1666	LDF	[BO + 31 * SIZE], b8
1667
1668	FMADD	(aa4, bb9, cc01, cc01)
1669	LDF	[BO + 40 * SIZE], b9
1670	FMADD	(aa4, bb2, cc03, cc03)
1671	LDF	[BO + 33 * SIZE], b2
1672
1673	FMADD	(aa4, bb3, cc05, cc05)
1674	LDF	[BO + 34 * SIZE], b3
1675	FMADD	(aa4, bb4, cc07, cc07)
1676	LDF	[BO + 35 * SIZE], b4
1677
1678	FMADD	(aa4, bb5, cc09, cc09)
1679	LDF	[BO + 36 * SIZE], b5
1680	FMADD	(aa4, bb6, cc11, cc11)
1681	LDF	[BO + 37 * SIZE], b6
1682
1683	FMADD	(aa4, bb7, cc13, cc13)
1684	LDF	[BO + 38 * SIZE], b7
1685	FMADD	(aa4, bb8, cc15, cc15)
1686	LDF	[BO + 39 * SIZE], b8
1687
1688	LDF	[AO +  6 * SIZE], a3
1689	LDF	[AO +  7 * SIZE], a4
1690
1691	add	AO,  4 * SIZE, AO
1692	cmp	L, 0
1693	bg,pt	%icc, .LL23
1694	add	BO, 32 * SIZE, BO
1695	.align 4
1696
1697.LL25:
1698#if defined(LT) || defined(RN)
1699	and	KK, 3, L
1700#else
1701	sub	K, KK, L
1702	and	L,  3, L
1703#endif
1704	cmp	L,  0
1705	ble,a,pn %icc, .LL28
1706	nop
1707	.align 4
1708
1709.LL27:
1710	FMADD	(aa1, bb1, cc01, cc01)
1711	LDF	[BO +  8 * SIZE], b1
1712	FMADD	(aa1, bb2, cc03, cc03)
1713	LDF	[BO +  9 * SIZE], b2
1714
1715	FMADD	(aa1, bb3, cc05, cc05)
1716	LDF	[BO + 10 * SIZE], b3
1717	FMADD	(aa1, bb4, cc07, cc07)
1718	LDF	[BO + 11 * SIZE], b4
1719
1720	FMADD	(aa1, bb5, cc09, cc09)
1721	LDF	[BO + 12 * SIZE], b5
1722	FMADD	(aa1, bb6, cc11, cc11)
1723	LDF	[BO + 13 * SIZE], b6
1724
1725	FMADD	(aa1, bb7, cc13, cc13)
1726	LDF	[BO + 14 * SIZE], b7
1727	FMADD	(aa1, bb8, cc15, cc15)
1728	LDF	[BO + 15 * SIZE], b8
1729
1730	LDF	[AO +  1 * SIZE], a1
1731	add	AO, 1 * SIZE, AO
1732
1733	add	L, -1, L
1734	cmp	L, 0
1735	bg,pt	%icc, .LL27
1736	add	BO, 8 * SIZE, BO
1737	.align 4
1738
1739.LL28:
1740#if defined(LN) || defined(RT)
1741#ifdef LN
1742	sub	KK, 1, TEMP1
1743#else
1744	sub	KK, 8, TEMP1
1745#endif
1746	sll	TEMP1, BASE_SHIFT + 0, TEMP2
1747	sll	TEMP1, BASE_SHIFT + 3, TEMP1
1748
1749	add	AORIG, TEMP2, AO
1750	add	B,     TEMP1, BO
1751#endif
1752
1753#if defined(LN) || defined(LT)
1754	LDF	[BO +  0 * SIZE], a1
1755	LDF	[BO +  1 * SIZE], a2
1756	LDF	[BO +  2 * SIZE], a3
1757	LDF	[BO +  3 * SIZE], a4
1758
1759	LDF	[BO +  4 * SIZE], b1
1760	LDF	[BO +  5 * SIZE], b2
1761	LDF	[BO +  6 * SIZE], b3
1762	LDF	[BO +  7 * SIZE], b4
1763
1764	FSUB	a1, c01, c01
1765	FSUB	a2, c03, c03
1766	FSUB	a3, c05, c05
1767	FSUB	a4, c07, c07
1768
1769	FSUB	b1, c09, c09
1770	FSUB	b2, c11, c11
1771	FSUB	b3, c13, c13
1772	FSUB	b4, c15, c15
1773#else
1774	LDF	[AO +  0 * SIZE], a1
1775	LDF	[AO +  1 * SIZE], a2
1776	LDF	[AO +  2 * SIZE], a3
1777	LDF	[AO +  3 * SIZE], a4
1778
1779	LDF	[AO +  4 * SIZE], b1
1780	LDF	[AO +  5 * SIZE], b2
1781	LDF	[AO +  6 * SIZE], b3
1782	LDF	[AO +  7 * SIZE], b4
1783
1784	FSUB	a1, c01, c01
1785	FSUB	a2, c03, c03
1786	FSUB	a3, c05, c05
1787	FSUB	a4, c07, c07
1788
1789	FSUB	b1, c09, c09
1790	FSUB	b2, c11, c11
1791	FSUB	b3, c13, c13
1792	FSUB	b4, c15, c15
1793#endif
1794
1795#if defined(LN) || defined(LT)
1796	LDF	[AO +  0 * SIZE], a1
1797
1798	FMUL	a1, c01, c01
1799	FMUL	a1, c03, c03
1800	FMUL	a1, c05, c05
1801	FMUL	a1, c07, c07
1802	FMUL	a1, c09, c09
1803	FMUL	a1, c11, c11
1804	FMUL	a1, c13, c13
1805	FMUL	a1, c15, c15
1806#endif
1807
1808#ifdef RN
1809	LDF	[BO +  0 * SIZE], a1
1810	LDF	[BO +  1 * SIZE], a2
1811	LDF	[BO +  2 * SIZE], a3
1812	LDF	[BO +  3 * SIZE], a4
1813	LDF	[BO +  4 * SIZE], b1
1814	LDF	[BO +  5 * SIZE], b2
1815	LDF	[BO +  6 * SIZE], b3
1816	LDF	[BO +  7 * SIZE], b4
1817
1818	FMUL	a1, c01, c01
1819
1820	FNMSUB	(aa2, cc01, cc03, cc03)
1821	FNMSUB	(aa3, cc01, cc05, cc05)
1822	FNMSUB	(aa4, cc01, cc07, cc07)
1823	FNMSUB	(bb1, cc01, cc09, cc09)
1824	FNMSUB	(bb2, cc01, cc11, cc11)
1825	FNMSUB	(bb3, cc01, cc13, cc13)
1826	FNMSUB	(bb4, cc01, cc15, cc15)
1827
1828	LDF	[BO +  9 * SIZE], a1
1829	LDF	[BO + 10 * SIZE], a2
1830	LDF	[BO + 11 * SIZE], a3
1831	LDF	[BO + 12 * SIZE], a4
1832	LDF	[BO + 13 * SIZE], b1
1833	LDF	[BO + 14 * SIZE], b2
1834	LDF	[BO + 15 * SIZE], b3
1835
1836	FMUL	a1, c03, c03
1837
1838	FNMSUB	(aa2, cc03, cc05, cc05)
1839	FNMSUB	(aa3, cc03, cc07, cc07)
1840	FNMSUB	(aa4, cc03, cc09, cc09)
1841	FNMSUB	(bb1, cc03, cc11, cc11)
1842	FNMSUB	(bb2, cc03, cc13, cc13)
1843	FNMSUB	(bb3, cc03, cc15, cc15)
1844
1845	LDF	[BO + 18 * SIZE], a1
1846	LDF	[BO + 19 * SIZE], a2
1847	LDF	[BO + 20 * SIZE], a3
1848	LDF	[BO + 21 * SIZE], a4
1849	LDF	[BO + 22 * SIZE], b1
1850	LDF	[BO + 23 * SIZE], b2
1851
1852	FMUL	a1, c05, c05
1853
1854	FNMSUB	(aa2, cc05, cc07, cc07)
1855	FNMSUB	(aa3, cc05, cc09, cc09)
1856	FNMSUB	(aa4, cc05, cc11, cc11)
1857	FNMSUB	(bb1, cc05, cc13, cc13)
1858	FNMSUB	(bb2, cc05, cc15, cc15)
1859
1860	LDF	[BO + 27 * SIZE], a1
1861	LDF	[BO + 28 * SIZE], a2
1862	LDF	[BO + 29 * SIZE], a3
1863	LDF	[BO + 30 * SIZE], a4
1864	LDF	[BO + 31 * SIZE], b1
1865
1866	FMUL	a1, c07, c07
1867
1868	FNMSUB	(aa2, cc07, cc09, cc09)
1869	FNMSUB	(aa3, cc07, cc11, cc11)
1870	FNMSUB	(aa4, cc07, cc13, cc13)
1871	FNMSUB	(bb1, cc07, cc15, cc15)
1872
1873	LDF	[BO + 36 * SIZE], a1
1874	LDF	[BO + 37 * SIZE], a2
1875	LDF	[BO + 38 * SIZE], a3
1876	LDF	[BO + 39 * SIZE], a4
1877
1878	FMUL	a1, c09, c09
1879
1880	FNMSUB	(aa2, cc09, cc11, cc11)
1881	FNMSUB	(aa3, cc09, cc13, cc13)
1882	FNMSUB	(aa4, cc09, cc15, cc15)
1883
1884	LDF	[BO + 45 * SIZE], a1
1885	LDF	[BO + 46 * SIZE], a2
1886	LDF	[BO + 47 * SIZE], a3
1887
1888	FMUL	a1, c11, c11
1889
1890	FNMSUB	(aa2, cc11, cc13, cc13)
1891	FNMSUB	(aa3, cc11, cc15, cc15)
1892
1893	LDF	[BO + 54 * SIZE], a1
1894	LDF	[BO + 55 * SIZE], a2
1895
1896	FMUL	a1, c13, c13
1897
1898	FNMSUB	(aa2, cc13, cc15, cc15)
1899
1900	LDF	[BO + 63 * SIZE], a1
1901
1902	FMUL	a1, c15, c15
1903#endif
1904
1905#ifdef RT
1906	LDF	[BO + 63 * SIZE], a1
1907	LDF	[BO + 62 * SIZE], a2
1908	LDF	[BO + 61 * SIZE], a3
1909	LDF	[BO + 60 * SIZE], a4
1910	LDF	[BO + 59 * SIZE], b1
1911	LDF	[BO + 58 * SIZE], b2
1912	LDF	[BO + 57 * SIZE], b3
1913	LDF	[BO + 56 * SIZE], b4
1914
1915	FMUL	a1, c15, c15
1916
1917	FNMSUB	(aa2, cc15, cc13, cc13)
1918	FNMSUB	(aa3, cc15, cc11, cc11)
1919	FNMSUB	(aa4, cc15, cc09, cc09)
1920	FNMSUB	(bb1, cc15, cc07, cc07)
1921	FNMSUB	(bb2, cc15, cc05, cc05)
1922	FNMSUB	(bb3, cc15, cc03, cc03)
1923	FNMSUB	(bb4, cc15, cc01, cc01)
1924
1925	LDF	[BO + 54 * SIZE], a1
1926	LDF	[BO + 53 * SIZE], a2
1927	LDF	[BO + 52 * SIZE], a3
1928	LDF	[BO + 51 * SIZE], a4
1929	LDF	[BO + 50 * SIZE], b1
1930	LDF	[BO + 49 * SIZE], b2
1931	LDF	[BO + 48 * SIZE], b3
1932
1933	FMUL	a1, c13, c13
1934
1935	FNMSUB	(aa2, cc13, cc11, cc11)
1936	FNMSUB	(aa3, cc13, cc09, cc09)
1937	FNMSUB	(aa4, cc13, cc07, cc07)
1938	FNMSUB	(bb1, cc13, cc05, cc05)
1939	FNMSUB	(bb2, cc13, cc03, cc03)
1940	FNMSUB	(bb3, cc13, cc01, cc01)
1941
1942	LDF	[BO + 45 * SIZE], a1
1943	LDF	[BO + 44 * SIZE], a2
1944	LDF	[BO + 43 * SIZE], a3
1945	LDF	[BO + 42 * SIZE], a4
1946	LDF	[BO + 41 * SIZE], b1
1947	LDF	[BO + 40 * SIZE], b2
1948
1949	FMUL	a1, c11, c11
1950
1951	FNMSUB	(aa2, cc11, cc09, cc09)
1952	FNMSUB	(aa3, cc11, cc07, cc07)
1953	FNMSUB	(aa4, cc11, cc05, cc05)
1954	FNMSUB	(bb1, cc11, cc03, cc03)
1955	FNMSUB	(bb2, cc11, cc01, cc01)
1956
1957	LDF	[BO + 36 * SIZE], a1
1958	LDF	[BO + 35 * SIZE], a2
1959	LDF	[BO + 34 * SIZE], a3
1960	LDF	[BO + 33 * SIZE], a4
1961	LDF	[BO + 32 * SIZE], b1
1962
1963	FMUL	a1, c09, c09
1964
1965	FNMSUB	(aa2, cc09, cc07, cc07)
1966	FNMSUB	(aa3, cc09, cc05, cc05)
1967	FNMSUB	(aa4, cc09, cc03, cc03)
1968	FNMSUB	(bb1, cc09, cc01, cc01)
1969
1970	LDF	[BO + 27 * SIZE], a1
1971	LDF	[BO + 26 * SIZE], a2
1972	LDF	[BO + 25 * SIZE], a3
1973	LDF	[BO + 24 * SIZE], a4
1974
1975	FMUL	a1, c07, c07
1976
1977	FNMSUB	(aa2, cc07, cc05, cc05)
1978	FNMSUB	(aa3, cc07, cc03, cc03)
1979	FNMSUB	(aa4, cc07, cc01, cc01)
1980
1981	LDF	[BO + 18 * SIZE], a1
1982	LDF	[BO + 17 * SIZE], a2
1983	LDF	[BO + 16 * SIZE], a3
1984
1985	FMUL	a1, c05, c05
1986
1987	FNMSUB	(aa2, cc05, cc03, cc03)
1988	FNMSUB	(aa3, cc05, cc01, cc01)
1989
1990	LDF	[BO +  9 * SIZE], a1
1991	LDF	[BO +  8 * SIZE], a2
1992
1993	FMUL	a1, c03, c03
1994
1995	FNMSUB	(aa2, cc03, cc01, cc01)
1996
1997	LDF	[BO +  0 * SIZE], a1
1998
1999	FMUL	a1, c01, c01
2000#endif
2001
2002#ifdef LN
2003	add	C1, -1 * SIZE, C1
2004	add	C2, -1 * SIZE, C2
2005	add	C3, -1 * SIZE, C3
2006	add	C4, -1 * SIZE, C4
2007	add	C5, -1 * SIZE, C5
2008	add	C6, -1 * SIZE, C6
2009	add	C7, -1 * SIZE, C7
2010	add	C8, -1 * SIZE, C8
2011#endif
2012
2013#if defined(LN) || defined(LT)
2014	STF	c01, [BO +  0 * SIZE]
2015	STF	c03, [BO +  1 * SIZE]
2016	STF	c05, [BO +  2 * SIZE]
2017	STF	c07, [BO +  3 * SIZE]
2018
2019	STF	c09, [BO +  4 * SIZE]
2020	STF	c11, [BO +  5 * SIZE]
2021	STF	c13, [BO +  6 * SIZE]
2022	STF	c15, [BO +  7 * SIZE]
2023#else
2024	STF	c01, [AO +  0 * SIZE]
2025	STF	c03, [AO +  1 * SIZE]
2026	STF	c05, [AO +  2 * SIZE]
2027	STF	c07, [AO +  3 * SIZE]
2028
2029	STF	c09, [AO +  4 * SIZE]
2030	STF	c11, [AO +  5 * SIZE]
2031	STF	c13, [AO +  6 * SIZE]
2032	STF	c15, [AO +  7 * SIZE]
2033#endif
2034
2035	STF	c01, [C1 + 0 * SIZE]
2036	STF	c03, [C2 + 0 * SIZE]
2037	STF	c05, [C3 + 0 * SIZE]
2038	STF	c07, [C4 + 0 * SIZE]
2039
2040	STF	c09, [C5 + 0 * SIZE]
2041	STF	c11, [C6 + 0 * SIZE]
2042	STF	c13, [C7 + 0 * SIZE]
2043	STF	c15, [C8 + 0 * SIZE]
2044
2045#ifdef RT
2046	sll	K, BASE_SHIFT + 0, TEMP1
2047	add	AORIG, TEMP1, AORIG
2048#endif
2049
2050#if defined(LT) || defined(RN)
2051	sub	K, KK, TEMP1
2052	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2053	sll	TEMP1, BASE_SHIFT + 3, TEMP1
2054	add	AO, TEMP2, AO
2055	add	BO, TEMP1, BO
2056#endif
2057
2058#ifdef LT
2059	add	KK, 1, KK
2060#endif
2061
2062#ifdef LN
2063	sub	KK, 1, KK
2064#endif
2065	.align 4
2066
2067.LL29:
2068#ifdef LN
2069	sll	K, BASE_SHIFT + 3, TEMP1
2070	add	B, TEMP1, B
2071#endif
2072
2073#if defined(LT) || defined(RN)
2074	mov	BO, B
2075#endif
2076
2077#ifdef RN
2078	add	KK, 8, KK
2079#endif
2080
2081#ifdef RT
2082	sub	KK, 8, KK
2083#endif
2084
2085	add	J, -1, J
2086	cmp	J, 0
2087	bg,pt	%icc, .LL11
2088	nop
2089	.align 4
2090
2091.LL30:
2092	and	N, 4, J
2093	cmp	J, 0
2094	ble,pn	%icc, .LL50
2095	nop
2096
2097#ifdef RT
2098	sll	K, BASE_SHIFT + 2, TEMP1
2099	sub	B, TEMP1, B
2100#endif
2101
2102#ifndef RT
2103	mov	C,  C1
2104	add	C,  LDC, C2
2105	add	C2, LDC, C3
2106	add	C3, LDC, C4
2107	add	C4, LDC, C
2108#else
2109	sub	C,  LDC, C4
2110	sub	C4, LDC, C3
2111	sub	C3, LDC, C2
2112	sub	C2, LDC, C1
2113	sub	C2, LDC, C
2114#endif
2115
2116#ifdef LN
2117	add	M, OFFSET, KK
2118#endif
2119
2120#ifdef LT
2121	mov	OFFSET, KK
2122#endif
2123
2124#if defined(LN) || defined(RT)
2125	mov	A, AORIG
2126#else
2127	mov	A, AO
2128#endif
2129
2130	sra	M, 1, I
2131	cmp	I, 0
2132	ble,pn	%icc, .LL40
2133	nop
2134	.align 4
2135
2136.LL32:
2137#if defined(LT) || defined(RN)
2138	mov	B, BO
2139#else
2140#ifdef LN
2141	sll	K,  BASE_SHIFT + 1, TEMP1
2142	sub	AORIG, TEMP1, AORIG
2143#endif
2144
2145	sll	KK, BASE_SHIFT + 1, TEMP1
2146	sll	KK, BASE_SHIFT + 2, TEMP2
2147
2148	add	AORIG, TEMP1, AO
2149	add	B,     TEMP2, BO
2150#endif
2151
2152	LDF	[AO +  0 * SIZE], a1
2153	LDF	[AO +  1 * SIZE], a2
2154
2155	LDF	[BO +  0 * SIZE], b1
2156	LDF	[BO +  1 * SIZE], b2
2157	LDF	[BO +  2 * SIZE], b3
2158	LDF	[BO +  3 * SIZE], b4
2159	LDF	[BO +  4 * SIZE], b5
2160
2161	LDF	[BO +  5 * SIZE], b6
2162	FCLR	(cc01)
2163	LDF	[BO +  6 * SIZE], b7
2164	FCLR	(cc02)
2165	LDF	[BO +  7 * SIZE], b8
2166	FCLR	(cc03)
2167	LDF	[BO +  8 * SIZE], b9
2168	FCLR	(cc04)
2169
2170	prefetch [C1 + 2 * SIZE], 3
2171	FCLR	(cc05)
2172	prefetch [C2 + 2 * SIZE], 3
2173	FCLR	(cc06)
2174	prefetch [C3 + 2 * SIZE], 3
2175	FCLR	(cc07)
2176	prefetch [C4 + 2 * SIZE], 3
2177	FCLR	(cc08)
2178
2179#if defined(LT) || defined(RN)
2180	sra	KK, 2, L
2181#else
2182	sub	K, KK, L
2183	sra	L,  2, L
2184#endif
2185	cmp	L,  0
2186	ble,pn	%icc, .LL35
2187	nop
2188	.align 4
2189
2190.LL33:
2191	FMADD	(aa1, bb1, cc01, cc01)
2192	LDF	[AO +  2 * SIZE], a3
2193	FMADD	(aa2, bb1, cc02, cc02)
2194	LDF	[AO +  3 * SIZE], a4
2195
2196	FMADD	(aa1, bb2, cc03, cc03)
2197	LDF	[BO + 16 * SIZE], b1
2198	FMADD	(aa2, bb2, cc04, cc04)
2199	LDF	[BO +  9 * SIZE], b2
2200
2201	FMADD	(aa1, bb3, cc05, cc05)
2202	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2203	FMADD	(aa2, bb3, cc06, cc06)
2204	add	L, -1, L
2205
2206	FMADD	(aa1, bb4, cc07, cc07)
2207	LDF	[BO + 10 * SIZE], b3
2208	FMADD	(aa2, bb4, cc08, cc08)
2209	LDF	[BO + 11 * SIZE], b4
2210
2211	FMADD	(aa3, bb5, cc01, cc01)
2212	LDF	[AO +  4 * SIZE], a1
2213	FMADD	(aa4, bb5, cc02, cc02)
2214	LDF	[AO +  5 * SIZE], a2
2215
2216	FMADD	(aa3, bb6, cc03, cc03)
2217	LDF	[BO + 12 * SIZE], b5
2218	FMADD	(aa4, bb6, cc04, cc04)
2219	LDF	[BO + 13 * SIZE], b6
2220
2221	FMADD	(aa3, bb7, cc05, cc05)
2222	cmp	L, 0
2223	FMADD	(aa4, bb7, cc06, cc06)
2224	add	AO,  8 * SIZE, AO
2225
2226	FMADD	(aa3, bb8, cc07, cc07)
2227	LDF	[BO + 14 * SIZE], b7
2228	FMADD	(aa4, bb8, cc08, cc08)
2229	LDF	[BO + 15 * SIZE], b8
2230
2231	FMADD	(aa1, bb9, cc01, cc01)
2232	LDF	[AO -  2 * SIZE], a3
2233	FMADD	(aa2, bb9, cc02, cc02)
2234	LDF	[AO -  1 * SIZE], a4
2235
2236	FMADD	(aa1, bb2, cc03, cc03)
2237	LDF	[BO + 24 * SIZE], b9
2238	FMADD	(aa2, bb2, cc04, cc04)
2239	LDF	[BO + 17 * SIZE], b2
2240
2241	FMADD	(aa1, bb3, cc05, cc05)
2242	add	BO, 16 * SIZE, BO
2243	FMADD	(aa2, bb3, cc06, cc06)
2244	nop
2245
2246	FMADD	(aa1, bb4, cc07, cc07)
2247	LDF	[BO +  2 * SIZE], b3
2248	FMADD	(aa2, bb4, cc08, cc08)
2249	LDF	[BO +  3 * SIZE], b4
2250
2251	FMADD	(aa3, bb5, cc01, cc01)
2252	LDF	[AO +  0 * SIZE], a1
2253	FMADD	(aa4, bb5, cc02, cc02)
2254	LDF	[AO +  1 * SIZE], a2
2255	FMADD	(aa3, bb6, cc03, cc03)
2256	LDF	[BO +  4 * SIZE], b5
2257	FMADD	(aa4, bb6, cc04, cc04)
2258	LDF	[BO +  5 * SIZE], b6
2259
2260	FMADD	(aa3, bb7, cc05, cc05)
2261	nop
2262	FMADD	(aa4, bb7, cc06, cc06)
2263	LDF	[BO +  6 * SIZE], b7
2264
2265	FMADD	(aa3, bb8, cc07, cc07)
2266	FMADD	(aa4, bb8, cc08, cc08)
2267	bg,pt	%icc, .LL33
2268	LDF	[BO +  7 * SIZE], b8
2269	.align 4
2270
2271.LL35:
2272#if defined(LT) || defined(RN)
2273	and	KK, 3, L
2274#else
2275	sub	K, KK, L
2276	and	L,  3, L
2277#endif
2278	cmp	L,  0
2279	ble,a,pn %icc, .LL38
2280	nop
2281	.align 4
2282
2283.LL37:
2284	FMADD	(aa1, bb1, cc01, cc01)
2285	add	L, -1, L
2286	FMADD	(aa2, bb1, cc02, cc02)
2287	LDF	[BO + 4 * SIZE], b1
2288
2289	FMADD	(aa1, bb2, cc03, cc03)
2290	add	AO, 2 * SIZE, AO
2291	FMADD	(aa2, bb2, cc04, cc04)
2292	LDF	[BO + 5 * SIZE], b2
2293
2294	FMADD	(aa1, bb3, cc05, cc05)
2295	cmp	L, 0
2296	FMADD	(aa2, bb3, cc06, cc06)
2297	LDF	[BO + 6 * SIZE], b3
2298
2299	FMADD	(aa1, bb4, cc07, cc07)
2300	LDF	[AO + 0 * SIZE], a1
2301	FMADD	(aa2, bb4, cc08, cc08)
2302	LDF	[AO + 1 * SIZE], a2
2303
2304	LDF	[BO + 7 * SIZE], b4
2305	bg,pt	%icc, .LL37
2306	add	BO, 4 * SIZE, BO
2307	.align 4
2308
2309.LL38:
2310#if defined(LN) || defined(RT)
2311#ifdef LN
2312	sub	KK, 2, TEMP1
2313#else
2314	sub	KK, 4, TEMP1
2315#endif
2316	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2317	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2318
2319	add	AORIG, TEMP2, AO
2320	add	B,     TEMP1, BO
2321#endif
2322
2323#if defined(LN) || defined(LT)
2324	LDF	[BO +  0 * SIZE], a1
2325	LDF	[BO +  1 * SIZE], a2
2326	LDF	[BO +  2 * SIZE], a3
2327	LDF	[BO +  3 * SIZE], a4
2328
2329	LDF	[BO +  4 * SIZE], b1
2330	LDF	[BO +  5 * SIZE], b2
2331	LDF	[BO +  6 * SIZE], b3
2332	LDF	[BO +  7 * SIZE], b4
2333
2334	FSUB	a1, c01, c01
2335	FSUB	a2, c03, c03
2336	FSUB	a3, c05, c05
2337	FSUB	a4, c07, c07
2338
2339	FSUB	b1, c02, c02
2340	FSUB	b2, c04, c04
2341	FSUB	b3, c06, c06
2342	FSUB	b4, c08, c08
2343#else
2344	LDF	[AO +  0 * SIZE], a1
2345	LDF	[AO +  1 * SIZE], a2
2346	LDF	[AO +  2 * SIZE], a3
2347	LDF	[AO +  3 * SIZE], a4
2348
2349	LDF	[AO +  4 * SIZE], b1
2350	LDF	[AO +  5 * SIZE], b2
2351	LDF	[AO +  6 * SIZE], b3
2352	LDF	[AO +  7 * SIZE], b4
2353
2354	FSUB	a1, c01, c01
2355	FSUB	a2, c02, c02
2356	FSUB	a3, c03, c03
2357	FSUB	a4, c04, c04
2358
2359	FSUB	b1, c05, c05
2360	FSUB	b2, c06, c06
2361	FSUB	b3, c07, c07
2362	FSUB	b4, c08, c08
2363
2364#endif
2365
2366#ifdef LN
2367	LDF	[AO +  3 * SIZE], a1
2368	LDF	[AO +  2 * SIZE], a2
2369	LDF	[AO +  0 * SIZE], a3
2370
2371	FMUL	a1, c02, c02
2372	FMUL	a1, c04, c04
2373	FMUL	a1, c06, c06
2374	FMUL	a1, c08, c08
2375
2376	FNMSUB	(aa2, cc02, cc01, cc01)
2377	FNMSUB	(aa2, cc04, cc03, cc03)
2378	FNMSUB	(aa2, cc06, cc05, cc05)
2379	FNMSUB	(aa2, cc08, cc07, cc07)
2380
2381	FMUL	a3, c01, c01
2382	FMUL	a3, c03, c03
2383	FMUL	a3, c05, c05
2384	FMUL	a3, c07, c07
2385#endif
2386
2387#ifdef LT
2388	LDF	[AO +  0 * SIZE], a1
2389	LDF	[AO +  1 * SIZE], a2
2390	LDF	[AO +  3 * SIZE], a3
2391
2392	FMUL	a1, c01, c01
2393	FMUL	a1, c03, c03
2394	FMUL	a1, c05, c05
2395	FMUL	a1, c07, c07
2396
2397	FNMSUB	(aa2, cc01, cc02, cc02)
2398	FNMSUB	(aa2, cc03, cc04, cc04)
2399	FNMSUB	(aa2, cc05, cc06, cc06)
2400	FNMSUB	(aa2, cc07, cc08, cc08)
2401
2402	FMUL	a3, c02, c02
2403	FMUL	a3, c04, c04
2404	FMUL	a3, c06, c06
2405	FMUL	a3, c08, c08
2406#endif
2407
2408#ifdef RN
2409	LDF	[BO +  0 * SIZE], a1
2410	LDF	[BO +  1 * SIZE], a2
2411	LDF	[BO +  2 * SIZE], a3
2412	LDF	[BO +  3 * SIZE], a4
2413
2414	FMUL	a1, c01, c01
2415	FMUL	a1, c02, c02
2416
2417	FNMSUB	(aa2, cc01, cc03, cc03)
2418	FNMSUB	(aa2, cc02, cc04, cc04)
2419	FNMSUB	(aa3, cc01, cc05, cc05)
2420	FNMSUB	(aa3, cc02, cc06, cc06)
2421	FNMSUB	(aa4, cc01, cc07, cc07)
2422	FNMSUB	(aa4, cc02, cc08, cc08)
2423
2424	LDF	[BO +  5 * SIZE], a1
2425	LDF	[BO +  6 * SIZE], a2
2426	LDF	[BO +  7 * SIZE], a3
2427
2428	FMUL	a1, c03, c03
2429	FMUL	a1, c04, c04
2430
2431	FNMSUB	(aa2, cc03, cc05, cc05)
2432	FNMSUB	(aa2, cc04, cc06, cc06)
2433	FNMSUB	(aa3, cc03, cc07, cc07)
2434	FNMSUB	(aa3, cc04, cc08, cc08)
2435
2436	LDF	[BO + 10 * SIZE], a1
2437	LDF	[BO + 11 * SIZE], a2
2438
2439	FMUL	a1, c05, c05
2440	FMUL	a1, c06, c06
2441
2442	FNMSUB	(aa2, cc05, cc07, cc07)
2443	FNMSUB	(aa2, cc06, cc08, cc08)
2444
2445	LDF	[BO + 15 * SIZE], a1
2446
2447	FMUL	a1, c07, c07
2448	FMUL	a1, c08, c08
2449#endif
2450
2451#ifdef RT
2452	LDF	[BO + 15 * SIZE], a1
2453	LDF	[BO + 14 * SIZE], a2
2454	LDF	[BO + 13 * SIZE], a3
2455	LDF	[BO + 12 * SIZE], a4
2456
2457	FMUL	a1, c08, c08
2458	FMUL	a1, c07, c07
2459
2460	FNMSUB	(aa2, cc08, cc06, cc06)
2461	FNMSUB	(aa2, cc07, cc05, cc05)
2462	FNMSUB	(aa3, cc08, cc04, cc04)
2463	FNMSUB	(aa3, cc07, cc03, cc03)
2464	FNMSUB	(aa4, cc08, cc02, cc02)
2465	FNMSUB	(aa4, cc07, cc01, cc01)
2466
2467	LDF	[BO + 10 * SIZE], a1
2468	LDF	[BO +  9 * SIZE], a2
2469	LDF	[BO +  8 * SIZE], a3
2470
2471	FMUL	a1, c06, c06
2472	FMUL	a1, c05, c05
2473
2474	FNMSUB	(aa2, cc06, cc04, cc04)
2475	FNMSUB	(aa2, cc05, cc03, cc03)
2476	FNMSUB	(aa3, cc06, cc02, cc02)
2477	FNMSUB	(aa3, cc05, cc01, cc01)
2478
2479	LDF	[BO +  5 * SIZE], a1
2480	LDF	[BO +  4 * SIZE], a2
2481
2482	FMUL	a1, c04, c04
2483	FMUL	a1, c03, c03
2484
2485	FNMSUB	(aa2, cc04, cc02, cc02)
2486	FNMSUB	(aa2, cc03, cc01, cc01)
2487
2488	LDF	[BO +  0 * SIZE], a1
2489
2490	FMUL	a1, c02, c02
2491	FMUL	a1, c01, c01
2492#endif
2493
2494#ifdef LN
2495	add	C1, -2 * SIZE, C1
2496	add	C2, -2 * SIZE, C2
2497	add	C3, -2 * SIZE, C3
2498	add	C4, -2 * SIZE, C4
2499#endif
2500
2501#if defined(LN) || defined(LT)
2502	STF	c01, [BO +  0 * SIZE]
2503	STF	c03, [BO +  1 * SIZE]
2504	STF	c05, [BO +  2 * SIZE]
2505	STF	c07, [BO +  3 * SIZE]
2506
2507	STF	c02, [BO +  4 * SIZE]
2508	STF	c04, [BO +  5 * SIZE]
2509	STF	c06, [BO +  6 * SIZE]
2510	STF	c08, [BO +  7 * SIZE]
2511#else
2512	STF	c01, [AO +  0 * SIZE]
2513	STF	c02, [AO +  1 * SIZE]
2514	STF	c03, [AO +  2 * SIZE]
2515	STF	c04, [AO +  3 * SIZE]
2516
2517	STF	c05, [AO +  4 * SIZE]
2518	STF	c06, [AO +  5 * SIZE]
2519	STF	c07, [AO +  6 * SIZE]
2520	STF	c08, [AO +  7 * SIZE]
2521#endif
2522
2523	STF	c01, [C1 + 0 * SIZE]
2524	STF	c02, [C1 + 1 * SIZE]
2525	STF	c03, [C2 + 0 * SIZE]
2526	STF	c04, [C2 + 1 * SIZE]
2527
2528	STF	c05, [C3 + 0 * SIZE]
2529	STF	c06, [C3 + 1 * SIZE]
2530	STF	c07, [C4 + 0 * SIZE]
2531	STF	c08, [C4 + 1 * SIZE]
2532
2533#ifndef LN
2534	add	C1, 2 * SIZE, C1
2535	add	C2, 2 * SIZE, C2
2536	add	C3, 2 * SIZE, C3
2537	add	C4, 2 * SIZE, C4
2538#endif
2539
2540#ifdef RT
2541	sll	K, BASE_SHIFT + 1, TEMP1
2542	add	AORIG, TEMP1, AORIG
2543#endif
2544
2545#if defined(LT) || defined(RN)
2546	sub	K, KK, TEMP1
2547	sll	TEMP1, BASE_SHIFT + 1, TEMP2
2548	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2549	add	AO, TEMP2, AO
2550	add	BO, TEMP1, BO
2551#endif
2552
2553#ifdef LT
2554	add	KK, 2, KK
2555#endif
2556
2557#ifdef LN
2558	sub	KK, 2, KK
2559#endif
2560
2561	add	I, -1, I
2562	cmp	I, 0
2563	bg,pt	%icc, .LL32
2564	nop
2565
2566.LL40:
2567	and	M, 1, I
2568	cmp	I, 0
2569	ble,pn	%icc, .LL49
2570	nop
2571
2572#if defined(LT) || defined(RN)
2573	mov	B, BO
2574#else
2575#ifdef LN
2576	sll	K,  BASE_SHIFT + 0, TEMP1
2577	sub	AORIG, TEMP1, AORIG
2578#endif
2579
2580	sll	KK, BASE_SHIFT + 0, TEMP1
2581	sll	KK, BASE_SHIFT + 2, TEMP2
2582
2583	add	AORIG, TEMP1, AO
2584	add	B,     TEMP2, BO
2585#endif
2586
2587	LDF	[AO +  0 * SIZE], a1
2588	LDF	[AO +  1 * SIZE], a2
2589	LDF	[AO +  2 * SIZE], a3
2590	LDF	[AO +  3 * SIZE], a4
2591
2592	LDF	[BO +  0 * SIZE], b1
2593	LDF	[BO +  1 * SIZE], b2
2594	LDF	[BO +  2 * SIZE], b3
2595	LDF	[BO +  3 * SIZE], b4
2596	LDF	[BO +  4 * SIZE], b5
2597	LDF	[BO +  5 * SIZE], b6
2598	FCLR	(cc01)
2599	LDF	[BO +  6 * SIZE], b7
2600	FCLR	(cc03)
2601	LDF	[BO +  7 * SIZE], b8
2602	FCLR	(cc05)
2603	LDF	[BO +  8 * SIZE], b9
2604	FCLR	(cc07)
2605
2606#if defined(LT) || defined(RN)
2607	sra	KK, 2, L
2608#else
2609	sub	K, KK, L
2610	sra	L,  2, L
2611#endif
2612	cmp	L,  0
2613	ble,pn	%icc, .LL45
2614	nop
2615
2616.LL43:
2617	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2618	add	L, -1, L
2619
2620	FMADD	(aa1, bb1, cc01, cc01)
2621	LDF	[BO + 16 * SIZE], b1
2622	FMADD	(aa1, bb2, cc03, cc03)
2623	LDF	[BO +  9 * SIZE], b2
2624	FMADD	(aa1, bb3, cc05, cc05)
2625	LDF	[BO + 10 * SIZE], b3
2626	FMADD	(aa1, bb4, cc07, cc07)
2627	LDF	[BO + 11 * SIZE], b4
2628
2629	LDF	[AO +  4 * SIZE], a1
2630	cmp	L, 0
2631
2632	FMADD	(aa2, bb5, cc01, cc01)
2633	LDF	[BO + 12 * SIZE], b5
2634	FMADD	(aa2, bb6, cc03, cc03)
2635	LDF	[BO + 13 * SIZE], b6
2636	FMADD	(aa2, bb7, cc05, cc05)
2637	LDF	[BO + 14 * SIZE], b7
2638	FMADD	(aa2, bb8, cc07, cc07)
2639	LDF	[BO + 15 * SIZE], b8
2640
2641	LDF	[AO +  5 * SIZE], a2
2642	add	AO,  4 * SIZE, AO
2643
2644	FMADD	(aa3, bb9, cc01, cc01)
2645	LDF	[BO + 24 * SIZE], b9
2646	FMADD	(aa3, bb2, cc03, cc03)
2647	LDF	[BO + 17 * SIZE], b2
2648	FMADD	(aa3, bb3, cc05, cc05)
2649	LDF	[BO + 18 * SIZE], b3
2650	FMADD	(aa3, bb4, cc07, cc07)
2651	LDF	[BO + 19 * SIZE], b4
2652
2653	LDF	[AO +  2 * SIZE], a3
2654	add	BO, 16 * SIZE, BO
2655
2656	FMADD	(aa4, bb5, cc01, cc01)
2657	LDF	[BO +  4 * SIZE], b5
2658	FMADD	(aa4, bb6, cc03, cc03)
2659	LDF	[BO +  5 * SIZE], b6
2660	FMADD	(aa4, bb7, cc05, cc05)
2661	LDF	[BO +  6 * SIZE], b7
2662	FMADD	(aa4, bb8, cc07, cc07)
2663	LDF	[BO +  7 * SIZE], b8
2664
2665	bg,pt	%icc, .LL43
2666	LDF	[AO +  3 * SIZE], a4
2667	.align 4
2668
2669.LL45:
2670#if defined(LT) || defined(RN)
2671	and	KK, 3, L
2672#else
2673	sub	K, KK, L
2674	and	L,  3, L
2675#endif
2676	cmp	L,  0
2677	ble,a,pn %icc, .LL48
2678	nop
2679	.align 4
2680
2681.LL47:
2682	FMADD	(aa1, bb1, cc01, cc01)
2683	LDF	[BO + 4 * SIZE], b1
2684	add	L, -1, L
2685	FMADD	(aa1, bb2, cc03, cc03)
2686	LDF	[BO + 5 * SIZE], b2
2687	add	AO, 1 * SIZE, AO
2688
2689	FMADD	(aa1, bb3, cc05, cc05)
2690	LDF	[BO + 6 * SIZE], b3
2691	cmp	L, 0
2692	FMADD	(aa1, bb4, cc07, cc07)
2693	LDF	[BO + 7 * SIZE], b4
2694	add	BO, 4 * SIZE, BO
2695
2696	bg,pt	%icc, .LL47
2697	LDF	[AO + 0 * SIZE], a1
2698	.align 4
2699
2700.LL48:
2701#if defined(LN) || defined(RT)
2702#ifdef LN
2703	sub	KK, 1, TEMP1
2704#else
2705	sub	KK, 4, TEMP1
2706#endif
2707	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2708	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2709
2710	add	AORIG, TEMP2, AO
2711	add	B,     TEMP1, BO
2712#endif
2713
2714#if defined(LN) || defined(LT)
2715	LDF	[BO +  0 * SIZE], a1
2716	LDF	[BO +  1 * SIZE], a2
2717	LDF	[BO +  2 * SIZE], a3
2718	LDF	[BO +  3 * SIZE], a4
2719
2720	FSUB	a1, c01, c01
2721	FSUB	a2, c03, c03
2722	FSUB	a3, c05, c05
2723	FSUB	a4, c07, c07
2724#else
2725	LDF	[AO +  0 * SIZE], a1
2726	LDF	[AO +  1 * SIZE], a2
2727	LDF	[AO +  2 * SIZE], a3
2728	LDF	[AO +  3 * SIZE], a4
2729
2730	FSUB	a1, c01, c01
2731	FSUB	a2, c03, c03
2732	FSUB	a3, c05, c05
2733	FSUB	a4, c07, c07
2734#endif
2735
2736#if defined(LN) || defined(LT)
2737	LDF	[AO +  0 * SIZE], a1
2738
2739	FMUL	a1, c01, c01
2740	FMUL	a1, c03, c03
2741	FMUL	a1, c05, c05
2742	FMUL	a1, c07, c07
2743#endif
2744
2745#ifdef RN
2746	LDF	[BO +  0 * SIZE], a1
2747	LDF	[BO +  1 * SIZE], a2
2748	LDF	[BO +  2 * SIZE], a3
2749	LDF	[BO +  3 * SIZE], a4
2750
2751	FMUL	a1, c01, c01
2752
2753	FNMSUB	(aa2, cc01, cc03, cc03)
2754	FNMSUB	(aa3, cc01, cc05, cc05)
2755	FNMSUB	(aa4, cc01, cc07, cc07)
2756
2757	LDF	[BO +  5 * SIZE], a1
2758	LDF	[BO +  6 * SIZE], a2
2759	LDF	[BO +  7 * SIZE], a3
2760
2761	FMUL	a1, c03, c03
2762
2763	FNMSUB	(aa2, cc03, cc05, cc05)
2764	FNMSUB	(aa3, cc03, cc07, cc07)
2765
2766	LDF	[BO + 10 * SIZE], a1
2767	LDF	[BO + 11 * SIZE], a2
2768
2769	FMUL	a1, c05, c05
2770
2771	FNMSUB	(aa2, cc05, cc07, cc07)
2772
2773	LDF	[BO + 15 * SIZE], a1
2774
2775	FMUL	a1, c07, c07
2776#endif
2777
2778#ifdef RT
2779	LDF	[BO + 15 * SIZE], a1
2780	LDF	[BO + 14 * SIZE], a2
2781	LDF	[BO + 13 * SIZE], a3
2782	LDF	[BO + 12 * SIZE], a4
2783
2784	FMUL	a1, c07, c07
2785
2786	FNMSUB	(aa2, cc07, cc05, cc05)
2787	FNMSUB	(aa3, cc07, cc03, cc03)
2788	FNMSUB	(aa4, cc07, cc01, cc01)
2789
2790	LDF	[BO + 10 * SIZE], a1
2791	LDF	[BO +  9 * SIZE], a2
2792	LDF	[BO +  8 * SIZE], a3
2793
2794	FMUL	a1, c05, c05
2795
2796	FNMSUB	(aa2, cc05, cc03, cc03)
2797	FNMSUB	(aa3, cc05, cc01, cc01)
2798
2799	LDF	[BO +  5 * SIZE], a1
2800	LDF	[BO +  4 * SIZE], a2
2801
2802	FMUL	a1, c03, c03
2803
2804	FNMSUB	(aa2, cc03, cc01, cc01)
2805
2806	LDF	[BO +  0 * SIZE], a1
2807
2808	FMUL	a1, c01, c01
2809#endif
2810
2811#ifdef LN
2812	add	C1, -1 * SIZE, C1
2813	add	C2, -1 * SIZE, C2
2814	add	C3, -1 * SIZE, C3
2815	add	C4, -1 * SIZE, C4
2816#endif
2817
2818#if defined(LN) || defined(LT)
2819	STF	c01, [BO +  0 * SIZE]
2820	STF	c03, [BO +  1 * SIZE]
2821	STF	c05, [BO +  2 * SIZE]
2822	STF	c07, [BO +  3 * SIZE]
2823#else
2824	STF	c01, [AO +  0 * SIZE]
2825	STF	c03, [AO +  1 * SIZE]
2826	STF	c05, [AO +  2 * SIZE]
2827	STF	c07, [AO +  3 * SIZE]
2828#endif
2829
2830	STF	c01, [C1 + 0 * SIZE]
2831	STF	c03, [C2 + 0 * SIZE]
2832	STF	c05, [C3 + 0 * SIZE]
2833	STF	c07, [C4 + 0 * SIZE]
2834
2835#ifdef RT
2836	sll	K, BASE_SHIFT + 0, TEMP1
2837	add	AORIG, TEMP1, AORIG
2838#endif
2839
2840#if defined(LT) || defined(RN)
2841	sub	K, KK, TEMP1
2842	sll	TEMP1, BASE_SHIFT + 0, TEMP2
2843	sll	TEMP1, BASE_SHIFT + 2, TEMP1
2844	add	AO, TEMP2, AO
2845	add	BO, TEMP1, BO
2846#endif
2847
2848#ifdef LT
2849	add	KK, 1, KK
2850#endif
2851
2852#ifdef LN
2853	sub	KK, 1, KK
2854#endif
2855	.align 4
2856
2857.LL49:
2858#ifdef LN
2859	sll	K, BASE_SHIFT + 2, TEMP1
2860	add	B, TEMP1, B
2861#endif
2862
2863#if defined(LT) || defined(RN)
2864	mov	BO, B
2865#endif
2866
2867#ifdef RN
2868	add	KK, 4, KK
2869#endif
2870
2871#ifdef RT
2872	sub	KK, 4, KK
2873#endif
2874	.align 4
2875
2876.LL50:
2877	and	N, 2, J
2878	cmp	J, 0
2879	ble,pn	%icc, .LL70
2880	nop
2881
2882#ifdef RT
2883	sll	K, BASE_SHIFT + 1, TEMP1
2884	sub	B, TEMP1, B
2885#endif
2886
2887#ifndef RT
2888	mov	C,  C1
2889	add	C,  LDC, C2
2890	add	C2, LDC, C
2891#else
2892	sub	C,  LDC, C2
2893	sub	C2, LDC, C1
2894	sub	C2, LDC, C
2895#endif
2896
2897#ifdef LN
2898	add	M, OFFSET, KK
2899#endif
2900
2901#ifdef LT
2902	mov	OFFSET, KK
2903#endif
2904
2905#if defined(LN) || defined(RT)
2906	mov	A, AORIG
2907#else
2908	mov	A, AO
2909#endif
2910
2911	sra	M, 1, I
2912	cmp	I, 0
2913	ble,pn	%icc, .LL60
2914	nop
2915	.align 4
2916
2917.LL52:
2918#if defined(LT) || defined(RN)
2919	mov	B, BO
2920#else
2921#ifdef LN
2922	sll	K,  BASE_SHIFT + 1, TEMP1
2923	sub	AORIG, TEMP1, AORIG
2924#endif
2925
2926	sll	KK, BASE_SHIFT + 1, TEMP1
2927	sll	KK, BASE_SHIFT + 1, TEMP2
2928
2929	add	AORIG, TEMP1, AO
2930	add	B,     TEMP2, BO
2931#endif
2932
2933	LDF	[AO +  0 * SIZE], a1
2934	LDF	[AO +  1 * SIZE], a2
2935	LDF	[AO +  2 * SIZE], a3
2936	LDF	[AO +  3 * SIZE], a4
2937
2938	LDF	[BO +  0 * SIZE], b1
2939	LDF	[BO +  1 * SIZE], b2
2940	LDF	[BO +  2 * SIZE], b3
2941	FCLR	(cc01)
2942	LDF	[BO +  3 * SIZE], b4
2943	FCLR	(cc02)
2944
2945	LDF	[BO +  4 * SIZE], b5
2946	FCLR	(cc03)
2947	LDF	[BO +  5 * SIZE], b6
2948	FCLR	(cc04)
2949	LDF	[BO +  6 * SIZE], b7
2950	FCLR	(cc05)
2951	LDF	[BO +  7 * SIZE], b8
2952	FCLR	(cc06)
2953
2954	prefetch [C1 + 2 * SIZE], 3
2955	FCLR	(cc07)
2956	prefetch [C2 + 2 * SIZE], 3
2957	FCLR	(cc08)
2958
2959#if defined(LT) || defined(RN)
2960	sra	KK, 2, L
2961#else
2962	sub	K, KK, L
2963	sra	L,  2, L
2964#endif
2965	cmp	L,  0
2966	ble,pn	%icc, .LL55
2967	nop
2968	.align 4
2969
2970.LL53:
2971	FMADD	(aa1, bb1, cc01, cc01)
2972	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
2973	FMADD	(aa2, bb1, cc02, cc02)
2974	LDF	[BO +  8 * SIZE], b1
2975
2976	FMADD	(aa1, bb2, cc03, cc03)
2977	LDF	[AO +  4 * SIZE], a1
2978	FMADD	(aa2, bb2, cc04, cc04)
2979	LDF	[AO +  5 * SIZE], a2
2980
2981	FMADD	(aa3, bb3, cc01, cc01)
2982	LDF	[BO +  9 * SIZE], b2
2983	FMADD	(aa4, bb3, cc02, cc02)
2984	LDF	[BO + 10 * SIZE], b3
2985
2986	FMADD	(aa3, bb4, cc03, cc03)
2987	LDF	[AO +  6 * SIZE], a3
2988	FMADD	(aa4, bb4, cc04, cc04)
2989	LDF	[AO +  7 * SIZE], a4
2990
2991	FMADD	(aa1, bb5, cc01, cc01)
2992	LDF	[BO + 11 * SIZE], b4
2993	FMADD	(aa2, bb5, cc02, cc02)
2994	LDF	[BO + 12 * SIZE], b5
2995
2996	FMADD	(aa1, bb6, cc03, cc03)
2997	LDF	[AO +  8 * SIZE], a1
2998	FMADD	(aa2, bb6, cc04, cc04)
2999	LDF	[AO +  9 * SIZE], a2
3000
3001	FMADD	(aa3, bb7, cc01, cc01)
3002	LDF	[BO + 13 * SIZE], b6
3003
3004	FMADD	(aa4, bb7, cc02, cc02)
3005	LDF	[BO + 14 * SIZE], b7
3006
3007	FMADD	(aa3, bb8, cc03, cc03)
3008	LDF	[AO + 10 * SIZE], a3
3009	FMADD	(aa4, bb8, cc04, cc04)
3010	LDF	[AO + 11 * SIZE], a4
3011
3012	add	AO,  8 * SIZE, AO
3013	add	L, -1, L
3014	add	BO,  8 * SIZE, BO
3015	cmp	L, 0
3016
3017	bg,pt	%icc, .LL53
3018	LDF	[BO +  7 * SIZE], b8
3019	.align 4
3020
3021.LL55:
3022#if defined(LT) || defined(RN)
3023	and	KK, 3, L
3024#else
3025	sub	K, KK, L
3026	and	L,  3, L
3027#endif
3028	cmp	L,  0
3029	ble,a,pn %icc, .LL58
3030	nop
3031	.align 4
3032
3033.LL57:
3034	FMADD	(aa1, bb1, cc01, cc01)
3035	add	L, -1, L
3036	FMADD	(aa2, bb1, cc02, cc02)
3037	LDF	[BO + 2 * SIZE], b1
3038
3039	FMADD	(aa1, bb2, cc03, cc03)
3040	LDF	[AO + 2 * SIZE], a1
3041	FMADD	(aa2, bb2, cc04, cc04)
3042	LDF	[AO + 3 * SIZE], a2
3043
3044	add	AO, 2 * SIZE, AO
3045	cmp	L, 0
3046	add	BO, 2 * SIZE, BO
3047	bg,pt	%icc, .LL57
3048	LDF	[BO + 1 * SIZE], b2
3049	.align 4
3050
3051.LL58:
3052#if defined(LN) || defined(RT)
3053#ifdef LN
3054	sub	KK, 2, TEMP1
3055#else
3056	sub	KK, 2, TEMP1
3057#endif
3058	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3059	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3060
3061	add	AORIG, TEMP2, AO
3062	add	B,     TEMP1, BO
3063#endif
3064
3065#if defined(LN) || defined(LT)
3066	LDF	[BO +  0 * SIZE], a1
3067	LDF	[BO +  1 * SIZE], a2
3068	LDF	[BO +  2 * SIZE], a3
3069	LDF	[BO +  3 * SIZE], a4
3070
3071	FSUB	a1, c01, c01
3072	FSUB	a2, c03, c03
3073	FSUB	a3, c02, c02
3074	FSUB	a4, c04, c04
3075#else
3076	LDF	[AO +  0 * SIZE], a1
3077	LDF	[AO +  1 * SIZE], a2
3078	LDF	[AO +  2 * SIZE], a3
3079	LDF	[AO +  3 * SIZE], a4
3080
3081	FSUB	a1, c01, c01
3082	FSUB	a2, c02, c02
3083	FSUB	a3, c03, c03
3084	FSUB	a4, c04, c04
3085#endif
3086
3087#ifdef LN
3088	LDF	[AO +  3 * SIZE], a1
3089	LDF	[AO +  2 * SIZE], a2
3090	LDF	[AO +  0 * SIZE], a3
3091
3092	FMUL	a1, c02, c02
3093	FMUL	a1, c04, c04
3094
3095	FNMSUB	(aa2, cc02, cc01, cc01)
3096	FNMSUB	(aa2, cc04, cc03, cc03)
3097
3098	FMUL	a3, c01, c01
3099	FMUL	a3, c03, c03
3100#endif
3101
3102#ifdef LT
3103	LDF	[AO +  0 * SIZE], a1
3104	LDF	[AO +  1 * SIZE], a2
3105	LDF	[AO +  3 * SIZE], a3
3106
3107	FMUL	a1, c01, c01
3108	FMUL	a1, c03, c03
3109
3110	FNMSUB	(aa2, cc01, cc02, cc02)
3111	FNMSUB	(aa2, cc03, cc04, cc04)
3112
3113	FMUL	a3, c02, c02
3114	FMUL	a3, c04, c04
3115#endif
3116
3117#ifdef RN
3118	LDF	[BO +  0 * SIZE], a1
3119	LDF	[BO +  1 * SIZE], a2
3120
3121	FMUL	a1, c01, c01
3122	FMUL	a1, c02, c02
3123
3124	FNMSUB	(aa2, cc01, cc03, cc03)
3125	FNMSUB	(aa2, cc02, cc04, cc04)
3126
3127	LDF	[BO +  3 * SIZE], a1
3128
3129	FMUL	a1, c03, c03
3130	FMUL	a1, c04, c04
3131#endif
3132
3133#ifdef RT
3134	LDF	[BO +  3 * SIZE], a1
3135	LDF	[BO +  2 * SIZE], a2
3136
3137	FMUL	a1, c04, c04
3138	FMUL	a1, c03, c03
3139
3140	FNMSUB	(aa2, cc04, cc02, cc02)
3141	FNMSUB	(aa2, cc03, cc01, cc01)
3142
3143	LDF	[BO +  0 * SIZE], a1
3144
3145	FMUL	a1, c02, c02
3146	FMUL	a1, c01, c01
3147#endif
3148
3149#ifdef LN
3150	add	C1, -2 * SIZE, C1
3151	add	C2, -2 * SIZE, C2
3152#endif
3153
3154#if defined(LN) || defined(LT)
3155	STF	c01, [BO +  0 * SIZE]
3156	STF	c03, [BO +  1 * SIZE]
3157	STF	c02, [BO +  2 * SIZE]
3158	STF	c04, [BO +  3 * SIZE]
3159#else
3160	STF	c01, [AO +  0 * SIZE]
3161	STF	c02, [AO +  1 * SIZE]
3162	STF	c03, [AO +  2 * SIZE]
3163	STF	c04, [AO +  3 * SIZE]
3164#endif
3165
3166	STF	c01, [C1 + 0 * SIZE]
3167	STF	c02, [C1 + 1 * SIZE]
3168	STF	c03, [C2 + 0 * SIZE]
3169	STF	c04, [C2 + 1 * SIZE]
3170
3171#ifndef LN
3172	add	C1, 2 * SIZE, C1
3173	add	C2, 2 * SIZE, C2
3174#endif
3175
3176#ifdef RT
3177	sll	K, BASE_SHIFT + 1, TEMP1
3178	add	AORIG, TEMP1, AORIG
3179#endif
3180
3181#if defined(LT) || defined(RN)
3182	sub	K, KK, TEMP1
3183	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3184	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3185	add	AO, TEMP2, AO
3186	add	BO, TEMP1, BO
3187#endif
3188
3189#ifdef LT
3190	add	KK, 2, KK
3191#endif
3192
3193#ifdef LN
3194	sub	KK, 2, KK
3195#endif
3196
3197	add	I, -1, I
3198	cmp	I, 0
3199	bg,pt	%icc, .LL52
3200	nop
3201	.align 4
3202
3203.LL60:
3204	and	M, 1, I
3205	cmp	I, 0
3206	ble,pn	%icc, .LL69
3207	nop
3208
3209#if defined(LT) || defined(RN)
3210	mov	B, BO
3211#else
3212#ifdef LN
3213	sll	K,  BASE_SHIFT + 0, TEMP1
3214	sub	AORIG, TEMP1, AORIG
3215#endif
3216
3217	sll	KK, BASE_SHIFT + 0, TEMP1
3218	sll	KK, BASE_SHIFT + 1, TEMP2
3219
3220	add	AORIG, TEMP1, AO
3221	add	B,     TEMP2, BO
3222#endif
3223
3224	LDF	[AO +  0 * SIZE], a1
3225	LDF	[AO +  1 * SIZE], a2
3226	LDF	[AO +  2 * SIZE], a3
3227	LDF	[AO +  3 * SIZE], a4
3228
3229	LDF	[BO +  0 * SIZE], b1
3230	LDF	[BO +  1 * SIZE], b2
3231	LDF	[BO +  2 * SIZE], b3
3232	LDF	[BO +  3 * SIZE], b4
3233	LDF	[BO +  4 * SIZE], b5
3234	LDF	[BO +  5 * SIZE], b6
3235	LDF	[BO +  6 * SIZE], b7
3236	FCLR	(cc01)
3237	LDF	[BO +  7 * SIZE], b8
3238	FCLR	(cc03)
3239
3240#if defined(LT) || defined(RN)
3241	sra	KK, 2, L
3242#else
3243	sub	K, KK, L
3244	sra	L,  2, L
3245#endif
3246	cmp	L,  0
3247	ble,pn	%icc, .LL65
3248	nop
3249	.align 4
3250
3251.LL63:
3252	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3253	add	L, -1, L
3254
3255	FMADD	(aa1, bb1, cc01, cc01)
3256	LDF	[BO +  8 * SIZE], b1
3257	FMADD	(aa1, bb2, cc03, cc03)
3258	LDF	[BO +  9 * SIZE], b2
3259
3260	LDF	[AO +  4 * SIZE], a1
3261	cmp	L, 0
3262
3263	FMADD	(aa2, bb3, cc01, cc01)
3264	LDF	[BO + 10 * SIZE], b3
3265	FMADD	(aa2, bb4, cc03, cc03)
3266	LDF	[BO + 11 * SIZE], b4
3267
3268	LDF	[AO +  5 * SIZE], a2
3269	add	AO,  4 * SIZE, AO
3270
3271	FMADD	(aa3, bb5, cc01, cc01)
3272	LDF	[BO + 12 * SIZE], b5
3273	FMADD	(aa3, bb6, cc03, cc03)
3274	LDF	[BO + 13 * SIZE], b6
3275
3276	LDF	[AO +  2 * SIZE], a3
3277	add	BO,  8 * SIZE, BO
3278
3279	FMADD	(aa4, bb7, cc01, cc01)
3280	LDF	[BO +  6 * SIZE], b7
3281	FMADD	(aa4, bb8, cc03, cc03)
3282	LDF	[BO + 7 * SIZE], b8
3283
3284	bg,pt	%icc, .LL63
3285	LDF	[AO +  3 * SIZE], a4
3286	.align 4
3287
3288.LL65:
3289#if defined(LT) || defined(RN)
3290	and	KK, 3, L
3291#else
3292	sub	K, KK, L
3293	and	L,  3, L
3294#endif
3295	cmp	L,  0
3296	ble,a,pn %icc, .LL68
3297	nop
3298	.align 4
3299
3300.LL67:
3301	FMADD	(aa1, bb1, cc01, cc01)
3302	LDF	[BO + 2 * SIZE], b1
3303	FMADD	(aa1, bb2, cc03, cc03)
3304	LDF	[BO + 3 * SIZE], b2
3305
3306	LDF	[AO + 1 * SIZE], a1
3307	add	L, -1, L
3308	add	AO, 1 * SIZE, AO
3309	cmp	L, 0
3310
3311	bg,pt	%icc, .LL67
3312	add	BO, 2 * SIZE, BO
3313	.align 4
3314
3315.LL68:
3316#if defined(LN) || defined(RT)
3317#ifdef LN
3318	sub	KK, 1, TEMP1
3319#else
3320	sub	KK, 2, TEMP1
3321#endif
3322	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3323	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3324
3325	add	AORIG, TEMP2, AO
3326	add	B,     TEMP1, BO
3327#endif
3328
3329#if defined(LN) || defined(LT)
3330	LDF	[BO +  0 * SIZE], a1
3331	LDF	[BO +  1 * SIZE], a2
3332
3333	FSUB	a1, c01, c01
3334	FSUB	a2, c03, c03
3335#else
3336	LDF	[AO +  0 * SIZE], a1
3337	LDF	[AO +  1 * SIZE], a2
3338
3339	FSUB	a1, c01, c01
3340	FSUB	a2, c03, c03
3341#endif
3342
3343#if defined(LN) || defined(LT)
3344	LDF	[AO +  0 * SIZE], a1
3345
3346	FMUL	a1, c01, c01
3347	FMUL	a1, c03, c03
3348#endif
3349
3350#ifdef RN
3351	LDF	[BO +  0 * SIZE], a1
3352	LDF	[BO +  1 * SIZE], a2
3353
3354	FMUL	a1, c01, c01
3355
3356	FNMSUB	(aa2, cc01, cc03, cc03)
3357
3358	LDF	[BO +  3 * SIZE], a1
3359
3360	FMUL	a1, c03, c03
3361#endif
3362
3363#ifdef RT
3364	LDF	[BO +  3 * SIZE], a1
3365	LDF	[BO +  2 * SIZE], a2
3366
3367	FMUL	a1, c03, c03
3368
3369	FNMSUB	(aa2, cc03, cc01, cc01)
3370
3371	LDF	[BO +  0 * SIZE], a1
3372
3373	FMUL	a1, c01, c01
3374#endif
3375
3376#ifdef LN
3377	add	C1, -1 * SIZE, C1
3378	add	C2, -1 * SIZE, C2
3379#endif
3380
3381#if defined(LN) || defined(LT)
3382	STF	c01, [BO +  0 * SIZE]
3383	STF	c03, [BO +  1 * SIZE]
3384#else
3385	STF	c01, [AO +  0 * SIZE]
3386	STF	c03, [AO +  1 * SIZE]
3387#endif
3388
3389	STF	c01, [C1 + 0 * SIZE]
3390	STF	c03, [C2 + 0 * SIZE]
3391
3392#ifdef RT
3393	sll	K, BASE_SHIFT + 0, TEMP1
3394	add	AORIG, TEMP1, AORIG
3395#endif
3396
3397#if defined(LT) || defined(RN)
3398	sub	K, KK, TEMP1
3399	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3400	sll	TEMP1, BASE_SHIFT + 1, TEMP1
3401	add	AO, TEMP2, AO
3402	add	BO, TEMP1, BO
3403#endif
3404
3405#ifdef LT
3406	add	KK, 1, KK
3407#endif
3408
3409#ifdef LN
3410	sub	KK, 1, KK
3411#endif
3412	.align 4
3413
3414.LL69:
3415#ifdef LN
3416	sll	K, BASE_SHIFT + 1, TEMP1
3417	add	B, TEMP1, B
3418#endif
3419
3420#if defined(LT) || defined(RN)
3421	mov	BO, B
3422#endif
3423
3424#ifdef RN
3425	add	KK, 2, KK
3426#endif
3427
3428#ifdef RT
3429	sub	KK, 2, KK
3430#endif
3431	.align 4
3432
3433.LL70:
3434	and	N, 1, J
3435	cmp	J, 0
3436	ble,pn	%icc, .LL999
3437	nop
3438
3439#ifdef RT
3440	sll	K, BASE_SHIFT, TEMP1
3441	sub	B, TEMP1, B
3442#endif
3443
3444#ifndef RT
3445	mov	C,  C1
3446	add	C1, LDC, C
3447#else
3448	sub	C,  LDC, C1
3449	sub	C,  LDC, C
3450#endif
3451
3452#ifdef LN
3453	add	M, OFFSET, KK
3454#endif
3455
3456#ifdef LT
3457	mov	OFFSET, KK
3458#endif
3459
3460#if defined(LN) || defined(RT)
3461	mov	A, AORIG
3462#else
3463	mov	A, AO
3464#endif
3465
3466	sra	M, 1, I
3467	cmp	I, 0
3468	ble,pn	%icc, .LL80
3469	nop
3470	.align 4
3471
3472.LL72:
3473#if defined(LT) || defined(RN)
3474	mov	B, BO
3475#else
3476#ifdef LN
3477	sll	K,  BASE_SHIFT + 1, TEMP1
3478	sub	AORIG, TEMP1, AORIG
3479#endif
3480
3481	sll	KK, BASE_SHIFT + 1, TEMP1
3482	sll	KK, BASE_SHIFT + 0, TEMP2
3483
3484	add	AORIG, TEMP1, AO
3485	add	B,     TEMP2, BO
3486#endif
3487
3488	LDF	[AO +  0 * SIZE], a1
3489	LDF	[AO +  1 * SIZE], a2
3490	LDF	[AO +  2 * SIZE], a3
3491	LDF	[AO +  3 * SIZE], a4
3492
3493	LDF	[BO +  0 * SIZE], b1
3494	LDF	[BO +  1 * SIZE], b2
3495	LDF	[BO +  2 * SIZE], b3
3496	FCLR	(cc01)
3497	LDF	[BO +  3 * SIZE], b4
3498	FCLR	(cc02)
3499
3500	prefetch [C1 + 2 * SIZE], 3
3501
3502#if defined(LT) || defined(RN)
3503	sra	KK, 2, L
3504#else
3505	sub	K, KK, L
3506	sra	L,  2, L
3507#endif
3508	cmp	L,  0
3509	ble,pn	%icc, .LL75
3510	nop
3511
3512.LL73:
3513	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3514	add	L, -1, L
3515
3516	FMADD	(aa1, bb1, cc01, cc01)
3517	LDF	[AO +  4 * SIZE], a1
3518	FMADD	(aa2, bb1, cc02, cc02)
3519	LDF	[AO +  5 * SIZE], a2
3520
3521	LDF	[BO +  4 * SIZE], b1
3522	cmp	L, 0
3523
3524	FMADD	(aa3, bb2, cc01, cc01)
3525	LDF	[AO +  6 * SIZE], a3
3526	FMADD	(aa4, bb2, cc02, cc02)
3527	LDF	[AO +  7 * SIZE], a4
3528
3529	LDF	[BO +  5 * SIZE], b2
3530	add	BO,  4 * SIZE, BO
3531
3532	FMADD	(aa1, bb3, cc01, cc01)
3533	LDF	[AO +  8 * SIZE], a1
3534	FMADD	(aa2, bb3, cc02, cc02)
3535	LDF	[AO +  9 * SIZE], a2
3536
3537	LDF	[BO +  2 * SIZE], b3
3538	add	AO,  8 * SIZE, AO
3539
3540	FMADD	(aa3, bb4, cc01, cc01)
3541	LDF	[AO +  2 * SIZE], a3
3542	FMADD	(aa4, bb4, cc02, cc02)
3543	LDF	[AO +  3 * SIZE], a4
3544
3545	bg,pt	%icc, .LL73
3546	LDF	[BO +  3 * SIZE], b4
3547	.align 4
3548
3549.LL75:
3550#if defined(LT) || defined(RN)
3551	and	KK, 3, L
3552#else
3553	sub	K, KK, L
3554	and	L,  3, L
3555#endif
3556	cmp	L,  0
3557	ble,a,pn %icc, .LL78
3558	nop
3559	.align 4
3560
3561.LL77:
3562	FMADD	(aa1, bb1, cc01, cc01)
3563	LDF	[AO + 2 * SIZE], a1
3564	FMADD	(aa2, bb1, cc02, cc02)
3565	LDF	[AO + 3 * SIZE], a2
3566
3567	LDF	[BO + 1 * SIZE], b1
3568	add	L, -1, L
3569	add	AO, 2 * SIZE, AO
3570	cmp	L, 0
3571	bg,pt	%icc, .LL77
3572	add	BO, 1 * SIZE, BO
3573	.align 4
3574
3575.LL78:
3576#if defined(LN) || defined(RT)
3577#ifdef LN
3578	sub	KK, 2, TEMP1
3579#else
3580	sub	KK, 1, TEMP1
3581#endif
3582	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3583	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3584
3585	add	AORIG, TEMP2, AO
3586	add	B,     TEMP1, BO
3587#endif
3588
3589#if defined(LN) || defined(LT)
3590	LDF	[BO +  0 * SIZE], a1
3591	LDF	[BO +  1 * SIZE], a2
3592
3593	FSUB	a1, c01, c01
3594	FSUB	a2, c02, c02
3595#else
3596	LDF	[AO +  0 * SIZE], a1
3597	LDF	[AO +  1 * SIZE], a2
3598
3599	FSUB	a1, c01, c01
3600	FSUB	a2, c02, c02
3601#endif
3602
3603#ifdef LN
3604	LDF	[AO +  3 * SIZE], a1
3605	LDF	[AO +  2 * SIZE], a2
3606	LDF	[AO +  0 * SIZE], a3
3607
3608	FMUL	a1, c02, c02
3609
3610	FNMSUB	(aa2, cc02, cc01, cc01)
3611
3612	FMUL	a3, c01, c01
3613#endif
3614
3615#ifdef LT
3616	LDF	[AO +  0 * SIZE], a1
3617	LDF	[AO +  1 * SIZE], a2
3618	LDF	[AO +  3 * SIZE], a3
3619
3620	FMUL	a1, c01, c01
3621
3622	FNMSUB	(aa2, cc01, cc02, cc02)
3623
3624	FMUL	a3, c02, c02
3625#endif
3626
3627#if defined(RN) || defined(RT)
3628	LDF	[BO +  0 * SIZE], a1
3629
3630	FMUL	a1, c01, c01
3631	FMUL	a1, c02, c02
3632#endif
3633
3634#ifdef LN
3635	add	C1, -2 * SIZE, C1
3636#endif
3637
3638#if defined(LN) || defined(LT)
3639	STF	c01, [BO +  0 * SIZE]
3640	STF	c02, [BO +  1 * SIZE]
3641#else
3642	STF	c01, [AO +  0 * SIZE]
3643	STF	c02, [AO +  1 * SIZE]
3644#endif
3645
3646	STF	c01, [C1 + 0 * SIZE]
3647	STF	c02, [C1 + 1 * SIZE]
3648
3649#ifndef LN
3650	add	C1, 2 * SIZE, C1
3651#endif
3652
3653#ifdef RT
3654	sll	K, BASE_SHIFT + 1, TEMP1
3655	add	AORIG, TEMP1, AORIG
3656#endif
3657
3658#if defined(LT) || defined(RN)
3659	sub	K, KK, TEMP1
3660	sll	TEMP1, BASE_SHIFT + 1, TEMP2
3661	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3662	add	AO, TEMP2, AO
3663	add	BO, TEMP1, BO
3664#endif
3665
3666#ifdef LT
3667	add	KK, 2, KK
3668#endif
3669
3670#ifdef LN
3671	sub	KK, 2, KK
3672#endif
3673
3674	add	I, -1, I
3675	cmp	I, 0
3676	bg,pt	%icc, .LL72
3677	nop
3678	.align 4
3679
3680.LL80:
3681	and	M, 1, I
3682	cmp	I, 0
3683	ble,pn	%icc, .LL89
3684	nop
3685
3686#if defined(LT) || defined(RN)
3687	mov	B, BO
3688#else
3689#ifdef LN
3690	sll	K,  BASE_SHIFT + 0, TEMP1
3691	sub	AORIG, TEMP1, AORIG
3692#endif
3693
3694	sll	KK, BASE_SHIFT + 0, TEMP1
3695	sll	KK, BASE_SHIFT + 0, TEMP2
3696
3697	add	AORIG, TEMP1, AO
3698	add	B,     TEMP2, BO
3699#endif
3700
3701	LDF	[AO +  0 * SIZE], a1
3702	LDF	[BO +  0 * SIZE], b1
3703	LDF	[AO +  1 * SIZE], a2
3704	LDF	[BO +  1 * SIZE], b2
3705	LDF	[AO +  2 * SIZE], a3
3706	LDF	[BO +  2 * SIZE], b3
3707	LDF	[AO +  3 * SIZE], a4
3708	LDF	[BO +  3 * SIZE], b4
3709
3710#if defined(LT) || defined(RN)
3711	sra	KK, 2, L
3712#else
3713	sub	K, KK, L
3714	sra	L,  2, L
3715#endif
3716	cmp	L,  0
3717	ble,pn	%icc, .LL85
3718	FCLR	(cc01)
3719	.align 4
3720
3721.LL83:
3722	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
3723	add	L, -1, L
3724
3725	FMADD	(aa1, bb1, cc01, cc01)
3726	LDF	[AO +  4 * SIZE], a1
3727	LDF	[BO +  4 * SIZE], b1
3728
3729	FMADD	(aa2, bb2, cc01, cc01)
3730	LDF	[AO +  5 * SIZE], a2
3731	LDF	[BO +  5 * SIZE], b2
3732
3733	FMADD	(aa3, bb3, cc01, cc01)
3734	LDF	[AO +  6 * SIZE], a3
3735	LDF	[BO +  6 * SIZE], b3
3736
3737	FMADD	(aa4, bb4, cc01, cc01)
3738	LDF	[AO +  7 * SIZE], a4
3739	LDF	[BO +  7 * SIZE], b4
3740
3741	add	AO,  4 * SIZE, AO
3742	cmp	L, 0
3743
3744	bg,pt	%icc, .LL83
3745	add	BO,  4 * SIZE, BO
3746	.align 4
3747
3748.LL85:
3749#if defined(LT) || defined(RN)
3750	and	KK, 3, L
3751#else
3752	sub	K, KK, L
3753	and	L,  3, L
3754#endif
3755	cmp	L,  0
3756	ble,a,pn %icc, .LL88
3757	nop
3758	.align 4
3759
3760.LL87:
3761	FMADD	(aa1, bb1, cc01, cc01)
3762	LDF	[AO + 1 * SIZE], a1
3763	LDF	[BO + 1 * SIZE], b1
3764
3765	add	AO, 1 * SIZE, AO
3766	add	L, -1, L
3767	cmp	L, 0
3768	bg,pt	%icc, .LL87
3769	add	BO, 1 * SIZE, BO
3770	.align 4
3771
3772.LL88:
3773#if defined(LN) || defined(RT)
3774#ifdef LN
3775	sub	KK, 1, TEMP1
3776#else
3777	sub	KK, 1, TEMP1
3778#endif
3779	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3780	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3781
3782	add	AORIG, TEMP2, AO
3783	add	B,     TEMP1, BO
3784#endif
3785
3786#if defined(LN) || defined(LT)
3787	LDF	[BO +  0 * SIZE], a1
3788
3789	FSUB	a1, c01, c01
3790#else
3791	LDF	[AO +  0 * SIZE], a1
3792
3793	FSUB	a1, c01, c01
3794#endif
3795
3796#if defined(LN) || defined(LT)
3797	LDF	[AO +  0 * SIZE], a1
3798
3799	FMUL	a1, c01, c01
3800#endif
3801
3802#if defined(RN) || defined(RT)
3803	LDF	[BO +  0 * SIZE], a1
3804
3805	FMUL	a1, c01, c01
3806#endif
3807
3808#ifdef LN
3809	add	C1, -1 * SIZE, C1
3810#endif
3811
3812#if defined(LN) || defined(LT)
3813	STF	c01, [BO +  0 * SIZE]
3814#else
3815	STF	c01, [AO +  0 * SIZE]
3816#endif
3817
3818	STF	c01, [C1 + 0 * SIZE]
3819
3820#ifdef RT
3821	sll	K, BASE_SHIFT + 0, TEMP1
3822	add	AORIG, TEMP1, AORIG
3823#endif
3824
3825#if defined(LT) || defined(RN)
3826	sub	K, KK, TEMP1
3827	sll	TEMP1, BASE_SHIFT + 0, TEMP2
3828	sll	TEMP1, BASE_SHIFT + 0, TEMP1
3829	add	AO, TEMP2, AO
3830	add	BO, TEMP1, BO
3831#endif
3832
3833#ifdef LT
3834	add	KK, 1, KK
3835#endif
3836
3837#ifdef LN
3838	sub	KK, 1, KK
3839#endif
3840	.align 4
3841
3842.LL89:
3843#ifdef LN
3844	sll	K, BASE_SHIFT, TEMP1
3845	add	B, TEMP1, B
3846#endif
3847
3848#if defined(LT) || defined(RN)
3849	mov	BO, B
3850#endif
3851
3852#ifdef RN
3853	add	KK, 1, KK
3854#endif
3855
3856#ifdef RT
3857	sub	KK, 1, KK
3858#endif
3859	.align 4
3860
3861.LL999:
3862#ifdef TRMMKERNEL
3863#ifndef __64BIT__
3864	ld	[%sp + STACK_START +  8], %g1
3865	ld	[%sp + STACK_START + 12], %g2
3866	ld	[%sp + STACK_START + 16], %g3
3867	ld	[%sp + STACK_START + 20], %g4
3868#else
3869	ldx	[%sp + STACK_START + 32], %g1
3870	ldx	[%sp + STACK_START + 40], %g2
3871	ldx	[%sp + STACK_START + 48], %g3
3872	ldx	[%sp + STACK_START + 56], %g4
3873#endif
3874#endif
3875
3876	return	%i7 + 8
3877	clr	%o0
3878
3879	EPILOGUE
3880