1/*********************************************************************/
2/* Copyright 2005-2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Prefetch tuning for the A operand.  The unrolled loop issues
 *     prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY
 * so APREFETCHSIZE is a look-ahead distance counted in units of SIZE
 * and APREFETCH_CATEGORY is the second operand of that instruction.
 */
42#define APREFETCHSIZE 24
43#define APREFETCH_CATEGORY 0
44
/*
 * Arguments that arrive in the %i registers.
 *   M, N, K : dimension values.  M is tested against zero on entry and
 *             copied into I; N >> 2 seeds J; the inner trip count L is
 *             derived from KK (LT/RN builds) or from K - KK (LN/RT).
 *             Presumably rows / columns / depth of the packed panels -
 *             confirm against the caller.
 *   A, B, C : base addresses.  B and C (and A as well in the 32-bit
 *             DOUBLE build) are reloaded from the stack at entry.
 */
45#define M	%i0
46#define N	%i1
47#define K	%i2
48#define A	%i5
49#define B	%i3
50#define C	%i4
51
/*
 * Working registers in the %o window.
 *   LDC    : loaded from the stack, shifted left by ZBASE_SHIFT, then
 *            used as the distance between C1, C2, C3 and C4.
 *   AO, BO : addresses used by the LDF loads in the compute loops;
 *            both are advanced as operands are consumed.
 *   I      : initialised from M at .LL11 (presumably the row counter;
 *            its decrement is not in the visible part of the file).
 *   J      : N >> 2; the four-wide section is skipped when J <= 0.
 *   L      : trip counter of the unrolled loop (.LL13) and of the
 *            remainder loop (.LL17).
 */
52#define LDC	%o0
53#define AO	%o1
54#define BO	%o2
55#define I	%o3
56#define J	%o4
57#define L	%o5
58
/*
 * C1..C4 : four output addresses spaced LDC apart; the solved values
 *          are stored through them after each solve step.
 */
59#define C1	%l0
60#define C2	%l1
61#define C3	%l2
62#define C4	%l3
63
/*
 *   OFFSET       : loaded from the stack at entry; KK is derived from it
 *                  (neg OFFSET for RN, N - OFFSET for RT, M + OFFSET for
 *                  LN, plain copy for LT).
 *   TEMP1, TEMP2 : scratch for scaled index / address arithmetic.
 *   AORIG        : copy of A kept by LN and RT builds; AO is recomputed
 *                  from it before the loops and again before the solve.
 *                  NOTE(review): %o7 is the register a `call` would
 *                  overwrite - fine only while this routine makes no
 *                  calls; confirm against the part of the file not
 *                  shown here.
 */
64#define OFFSET	%l4
65#define	KK	%l5
66#define TEMP1	%l6
67#define TEMP2	%l7
68#define AORIG	%o7
69
/*
 * Floating-point register allocation.
 *
 * Each register is named twice:
 *   - c01..c16, a1..a5, b1..b9 are the %f register names, used by the
 *     ordinary instructions (LDF, STF, FADD, FSUB, FMUL);
 *   - cc01..cc16, aa1..aa5, bb1..bb9 are plain numbers naming the same
 *     registers, used as arguments of the parenthesised macros
 *     (FCLR, FMADD, FNMSUB, FMSUB).
 *
 * Roles as used by the visible code:
 *   c01..c16 : accumulators - cleared with FCLR before the loops and
 *              used as FMADDn destinations inside them.
 *   a1..a5   : values loaded from AO in the loops.
 *   b1..b9   : values loaded from BO in the loops.
 *   The a/b registers are reused as temporaries in the solve phase.
 *
 * DOUBLE build: every second register, %f0..%f58.
 */
70#ifdef DOUBLE
71#define c01	%f0
72#define c02	%f2
73#define c03	%f4
74#define c04	%f6
75#define c05	%f8
76#define c06	%f10
77#define c07	%f12
78#define c08	%f14
79#define c09	%f16
80#define c10	%f18
81#define c11	%f20
82#define c12	%f22
83#define c13	%f24
84#define c14	%f26
85#define c15	%f28
86#define c16	%f30
87
88#define a1	%f32
89#define a2	%f34
90#define a3	%f36
91#define a4	%f38
92#define a5	%f40
93
94#define b1	%f42
95#define b2	%f44
96#define b3	%f46
97#define b4	%f48
98#define b5	%f50
99#define b6	%f52
100#define b7	%f54
101#define b8	%f56
102#define b9	%f58
103
/* Numeric aliases for c01..c16: identical to the %f index (0..30). */
104#define cc01	0
105#define cc02	2
106#define cc03	4
107#define cc04	6
108#define cc05	8
109#define cc06	10
110#define cc07	12
111#define cc08	14
112#define cc09	16
113#define cc10	18
114#define cc11	20
115#define cc12	22
116#define cc13	24
117#define cc14	26
118#define cc15	28
119#define cc16	30
120
/*
 * Numeric aliases for a1..a5 (%f32..%f40) and b1..b9 (%f42..%f58).
 * These are odd numbers 1..27 rather than the %f index.  Presumably
 * this is the 5-bit instruction-field encoding of double registers
 * above %f31 (bit 5 of the index folded into bit 0), which the macros
 * in common.h would need to emit the opcode directly - confirm against
 * the FMADD/FCLR macro definitions.
 */
121#define aa1	 1
122#define aa2	 3
123#define aa3	 5
124#define aa4	 7
125#define aa5	 9
126
127#define bb1	11
128#define bb2	13
129#define bb3	15
130#define bb4	17
131#define bb5	19
132#define bb6	21
133#define bb7	23
134#define bb8	25
135#define bb9	27
136#else
/* Single-precision build: consecutive registers %f0..%f29. */
137#define c01	%f0
138#define c02	%f1
139#define c03	%f2
140#define c04	%f3
141#define c05	%f4
142#define c06	%f5
143#define c07	%f6
144#define c08	%f7
145#define c09	%f8
146#define c10	%f9
147#define c11	%f10
148#define c12	%f11
149#define c13	%f12
150#define c14	%f13
151#define c15	%f14
152#define c16	%f15
153
154#define a1	%f16
155#define a2	%f17
156#define a3	%f18
157#define a4	%f19
158#define a5	%f20
159
160#define b1	%f21
161#define b2	%f22
162#define b3	%f23
163#define b4	%f24
164#define b5	%f25
165#define b6	%f26
166#define b7	%f27
167#define b8	%f28
168#define b9	%f29
169
/* Numeric aliases: in this branch each equals the %f index directly. */
170#define cc01	0
171#define cc02	1
172#define cc03	2
173#define cc04	3
174#define cc05	4
175#define cc06	5
176#define cc07	6
177#define cc08	7
178#define cc09	8
179#define cc10	9
180#define cc11	10
181#define cc12	11
182#define cc13	12
183#define cc14	13
184#define cc15	14
185#define cc16	15
186
187#define aa1	16
188#define aa2	17
189#define aa3	18
190#define aa4	19
191#define aa5	20
192
193#define bb1	21
194#define bb2	22
195#define bb3	23
196#define bb4	24
197#define bb5	25
198#define bb6	26
199#define bb7	27
200#define bb8	28
201#define bb9	29
202#endif
203
/*
 * Select the multiply-accumulate flavour for the four partial products
 * formed at each loop step.  In the compute loops the macros are used
 * as
 *     FMADD1 (a_lo, b_lo, c, c)     e.g. (aa1, bb1, cc01, cc01)
 *     FMADD2 (a_hi, b_lo, c, c)     e.g. (aa2, bb1, cc02, cc02)
 *     FMADD3 (a_lo, b_hi, c, c)     e.g. (aa1, bb2, cc03, cc03)
 *     FMADD4 (a_hi, b_hi, c, c)     e.g. (aa2, bb2, cc04, cc04)
 * and after the loops the FMADD1/FMADD4 accumulators are added
 * together, as are the FMADD2/FMADD3 accumulators (c01 + c04 and
 * c02 + c03, and likewise for the other three groups).
 *
 * Exactly one of the four is mapped to FNMSUB, the rest to FMADD:
 *   no CONJ            -> FMADD4
 *   CONJ with LN or LT -> FMADD2
 *   CONJ with RN or RT -> FMADD3
 * FMADD and FNMSUB themselves are not defined in this file (presumably
 * they come from common.h; FNMSUB is presumably the variant that
 * subtracts the product - confirm there).
 *
 * If CONJ is defined and none of LN/LT/RN/RT is, FMADD1..FMADD4 are
 * left undefined by this block.
 */
204#ifndef CONJ
205#define FMADD1	FMADD
206#define FMADD2	FMADD
207#define FMADD3	FMADD
208#define FMADD4	FNMSUB
209#else
210#if defined(LN) || defined(LT)
211#define FMADD1	FMADD
212#define FMADD2	FNMSUB
213#define FMADD3	FMADD
214#define FMADD4	FMADD
215#endif
216#if defined(RN) || defined(RT)
217#define FMADD1	FMADD
218#define FMADD2	FMADD
219#define FMADD3	FNMSUB
220#define FMADD4	FMADD
221#endif
222#endif
223
224        .register %g2, #scratch
225        .register %g3, #scratch
226
227	PROLOGUE
228	SAVESP
229
230#ifndef __64BIT__
231#ifdef DOUBLE
232	ld	[%sp + STACK_START + 32], A
233	ld	[%sp + STACK_START + 36], B
234	ld	[%sp + STACK_START + 40], C
235	ld	[%sp + STACK_START + 44], LDC
236	ld	[%sp + STACK_START + 48], OFFSET
237#else
238	ld	[%sp + STACK_START + 28], B
239	ld	[%sp + STACK_START + 32], C
240	ld	[%sp + STACK_START + 36], LDC
241	ld	[%sp + STACK_START + 40], OFFSET
242#endif
243#else
244	ldx	[%sp + STACK_START + 56], B
245	ldx	[%sp + STACK_START + 64], C
246	ldx	[%sp + STACK_START + 72], LDC
247	ldx	[%sp + STACK_START + 80], OFFSET
248#endif
249
250	cmp	M, 0
251	ble,pn	%icc, .LL999
252	nop
253
254	sll	LDC, ZBASE_SHIFT, LDC
255
256#ifdef LN
257	smul	M, K, TEMP1
258	sll	TEMP1, ZBASE_SHIFT, TEMP1
259	add	A, TEMP1, A
260
261	sll	M, ZBASE_SHIFT, TEMP1
262	add	C, TEMP1, C
263#endif
264
265#ifdef RN
266	neg	OFFSET, KK
267#endif
268
269#ifdef RT
270	smul	N, K, TEMP1
271	sll	TEMP1, ZBASE_SHIFT, TEMP1
272	add	B, TEMP1, B
273
274	smul	N, LDC, TEMP1
275	add	C, TEMP1, C
276
277	sub	N, OFFSET, KK
278#endif
279
280	sra	N, 2, J
281	cmp	J, 0
282	ble,pn	%icc, .LL20
283	nop
284	.align 4
285
286.LL11:
287#ifdef RT
288	sll	K, ZBASE_SHIFT + 2, TEMP1
289	sub	B, TEMP1, B
290#endif
291
292#ifndef RT
293	mov	C,  C1
294	add	C,  LDC, C2
295	add	C2, LDC, C3
296	add	C3, LDC, C4
297	add	C4, LDC, C
298#else
299	sub	C,  LDC, C4
300	sub	C4, LDC, C3
301	sub	C3, LDC, C2
302	sub	C2, LDC, C1
303	sub	C2, LDC, C
304#endif
305
306#ifdef LN
307	add	M, OFFSET, KK
308#endif
309
310#ifdef LT
311	mov	OFFSET, KK
312#endif
313
314#if defined(LN) || defined(RT)
315	mov	A, AORIG
316#else
317	mov	A, AO
318#endif
319
320	mov	M, I
321	.align 4
322
323.LL12:
324#if defined(LT) || defined(RN)
325	mov	B, BO
326#else
327#ifdef LN
328	sll	K,  ZBASE_SHIFT, TEMP1
329	sub	AORIG, TEMP1, AORIG
330#endif
331
332	sll	KK, ZBASE_SHIFT + 0, TEMP1
333	sll	KK, ZBASE_SHIFT + 2, TEMP2
334
335	add	AORIG, TEMP1, AO
336	add	B,     TEMP2, BO
337#endif
338
339	LDF	[AO +  0 * SIZE], a1
340	FCLR	(cc01)
341	LDF	[AO +  1 * SIZE], a2
342	FCLR	(cc05)
343	LDF	[AO +  8 * SIZE], a5
344	FCLR	(cc09)
345	LDF	[BO +  0 * SIZE], b1
346	FCLR	(cc13)
347
348	LDF	[BO +  1 * SIZE], b2
349	FCLR	(cc02)
350	LDF	[BO +  2 * SIZE], b3
351	FCLR	(cc06)
352	LDF	[BO +  3 * SIZE], b4
353	FCLR	(cc10)
354	LDF	[BO +  4 * SIZE], b5
355	FCLR	(cc14)
356
357	LDF	[BO +  5 * SIZE], b6
358	FCLR	(cc03)
359	LDF	[BO +  6 * SIZE], b7
360	FCLR	(cc07)
361	LDF	[BO +  7 * SIZE], b8
362	FCLR	(cc11)
363	LDF	[BO +  8 * SIZE], b9
364	FCLR	(cc15)
365
366	prefetch [C1 + 1 * SIZE], 3
367	FCLR	(cc04)
368	prefetch [C2 + 2 * SIZE], 3
369	FCLR	(cc08)
370	prefetch [C3 + 1 * SIZE], 3
371	FCLR	(cc12)
372	prefetch [C4 + 2 * SIZE], 3
373	FCLR	(cc16)
374
375#if defined(LT) || defined(RN)
376	sra	KK, 3, L
377#else
378	sub	K, KK, L
379	sra	L,  3, L
380#endif
381	cmp	L,  0
382	ble,pn	%icc, .LL15
383	nop
384	.align 4
385
386.LL13:
387	FMADD1	(aa1, bb1, cc01, cc01)
388	FMADD2	(aa2, bb1, cc02, cc02)
389	FMADD3	(aa1, bb2, cc03, cc03)
390	FMADD4	(aa2, bb2, cc04, cc04)
391
392	FMADD1	(aa1, bb3, cc05, cc05)
393	LDF	[BO + 16 * SIZE], b1
394	FMADD2	(aa2, bb3, cc06, cc06)
395	LDF	[BO +  9 * SIZE], b2
396
397	FMADD3	(aa1, bb4, cc07, cc07)
398	LDF	[BO + 10 * SIZE], b3
399	FMADD4	(aa2, bb4, cc08, cc08)
400	LDF	[BO + 11 * SIZE], b4
401
402	FMADD1	(aa1, bb5, cc09, cc09)
403	LDF	[AO +  2 * SIZE], a3
404	FMADD2	(aa2, bb5, cc10, cc10)
405	LDF	[AO +  3 * SIZE], a4
406
407	FMADD3	(aa1, bb6, cc11, cc11)
408	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
409	FMADD4	(aa2, bb6, cc12, cc12)
410	nop
411
412	FMADD1	(aa1, bb7, cc13, cc13)
413	LDF	[BO + 12 * SIZE], b5
414	FMADD2	(aa2, bb7, cc14, cc14)
415	LDF	[BO + 13 * SIZE], b6
416
417	FMADD3	(aa1, bb8, cc15, cc15)
418	LDF	[BO + 14 * SIZE], b7
419	FMADD4	(aa2, bb8, cc16, cc16)
420	LDF	[BO + 15 * SIZE], b8
421
422	FMADD1	(aa3, bb9, cc01, cc01)
423	FMADD2	(aa4, bb9, cc02, cc02)
424	FMADD3	(aa3, bb2, cc03, cc03)
425	FMADD4	(aa4, bb2, cc04, cc04)
426
427	FMADD1	(aa3, bb3, cc05, cc05)
428	LDF	[BO + 24 * SIZE], b9
429	FMADD2	(aa4, bb3, cc06, cc06)
430	LDF	[BO + 17 * SIZE], b2
431
432	FMADD3	(aa3, bb4, cc07, cc07)
433	LDF	[BO + 18 * SIZE], b3
434	FMADD4	(aa4, bb4, cc08, cc08)
435	LDF	[BO + 19 * SIZE], b4
436
437	FMADD1	(aa3, bb5, cc09, cc09)
438	LDF	[AO +  4 * SIZE], a1
439	FMADD2	(aa4, bb5, cc10, cc10)
440	LDF	[AO +  5 * SIZE], a2
441
442	FMADD3	(aa3, bb6, cc11, cc11)
443	add	L, -1, L
444	FMADD4	(aa4, bb6, cc12, cc12)
445	nop
446
447	FMADD1	(aa3, bb7, cc13, cc13)
448	LDF	[BO + 20 * SIZE], b5
449	FMADD2	(aa4, bb7, cc14, cc14)
450	LDF	[BO + 21 * SIZE], b6
451
452	FMADD3	(aa3, bb8, cc15, cc15)
453	LDF	[BO + 22 * SIZE], b7
454	FMADD4	(aa4, bb8, cc16, cc16)
455	LDF	[BO + 23 * SIZE], b8
456
457	FMADD1	(aa1, bb1, cc01, cc01)
458	FMADD2	(aa2, bb1, cc02, cc02)
459	FMADD3	(aa1, bb2, cc03, cc03)
460	FMADD4	(aa2, bb2, cc04, cc04)
461
462	FMADD1	(aa1, bb3, cc05, cc05)
463	LDF	[BO + 32 * SIZE], b1
464	FMADD2	(aa2, bb3, cc06, cc06)
465	LDF	[BO + 25 * SIZE], b2
466
467	FMADD3	(aa1, bb4, cc07, cc07)
468	LDF	[BO + 26 * SIZE], b3
469	FMADD4	(aa2, bb4, cc08, cc08)
470	LDF	[BO + 27 * SIZE], b4
471
472	FMADD1	(aa1, bb5, cc09, cc09)
473	LDF	[AO +  6 * SIZE], a3
474	FMADD2	(aa2, bb5, cc10, cc10)
475	LDF	[AO +  7 * SIZE], a4
476
477	FMADD3	(aa1, bb6, cc11, cc11)
478	nop
479	FMADD4	(aa2, bb6, cc12, cc12)
480	nop
481
482	FMADD1	(aa1, bb7, cc13, cc13)
483	LDF	[BO + 28 * SIZE], b5
484	FMADD2	(aa2, bb7, cc14, cc14)
485	LDF	[BO + 29 * SIZE], b6
486
487	FMADD3	(aa1, bb8, cc15, cc15)
488	LDF	[BO + 30 * SIZE], b7
489	FMADD4	(aa2, bb8, cc16, cc16)
490	LDF	[BO + 31 * SIZE], b8
491
492	FMADD1	(aa3, bb9, cc01, cc01)
493	FMADD2	(aa4, bb9, cc02, cc02)
494	FMADD3	(aa3, bb2, cc03, cc03)
495	FMADD4	(aa4, bb2, cc04, cc04)
496
497	FMADD1	(aa3, bb3, cc05, cc05)
498	LDF	[BO + 40 * SIZE], b9
499	FMADD2	(aa4, bb3, cc06, cc06)
500	LDF	[BO + 33 * SIZE], b2
501
502	FMADD3	(aa3, bb4, cc07, cc07)
503	LDF	[BO + 34 * SIZE], b3
504	FMADD4	(aa4, bb4, cc08, cc08)
505	LDF	[BO + 35 * SIZE], b4
506
507	FMADD1	(aa3, bb5, cc09, cc09)
508	LDF	[AO + 16 * SIZE], a1  /****/
509	FMADD2	(aa4, bb5, cc10, cc10)
510	LDF	[AO +  9 * SIZE], a2
511
512	FMADD3	(aa3, bb6, cc11, cc11)
513	nop
514	FMADD4	(aa4, bb6, cc12, cc12)
515	nop
516
517	FMADD1	(aa3, bb7, cc13, cc13)
518	LDF	[BO + 36 * SIZE], b5
519	FMADD2	(aa4, bb7, cc14, cc14)
520	LDF	[BO + 37 * SIZE], b6
521
522	FMADD3	(aa3, bb8, cc15, cc15)
523	LDF	[BO + 38 * SIZE], b7
524	FMADD4	(aa4, bb8, cc16, cc16)
525	LDF	[BO + 39 * SIZE], b8
526
527	FMADD1	(aa5, bb1, cc01, cc01)
528	FMADD2	(aa2, bb1, cc02, cc02)
529	FMADD3	(aa5, bb2, cc03, cc03)
530	FMADD4	(aa2, bb2, cc04, cc04)
531
532	FMADD1	(aa5, bb3, cc05, cc05)
533	LDF	[BO + 48 * SIZE], b1
534	FMADD2	(aa2, bb3, cc06, cc06)
535	LDF	[BO + 41 * SIZE], b2
536
537	FMADD3	(aa5, bb4, cc07, cc07)
538	LDF	[BO + 42 * SIZE], b3
539	FMADD4	(aa2, bb4, cc08, cc08)
540	LDF	[BO + 43 * SIZE], b4
541
542	FMADD1	(aa5, bb5, cc09, cc09)
543	LDF	[AO + 10 * SIZE], a3
544	FMADD2	(aa2, bb5, cc10, cc10)
545	LDF	[AO + 11 * SIZE], a4
546
547	FMADD3	(aa5, bb6, cc11, cc11)
548	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
549	FMADD4	(aa2, bb6, cc12, cc12)
550	nop
551
552	FMADD1	(aa5, bb7, cc13, cc13)
553	LDF	[BO + 44 * SIZE], b5
554	FMADD2	(aa2, bb7, cc14, cc14)
555	LDF	[BO + 45 * SIZE], b6
556
557	FMADD3	(aa5, bb8, cc15, cc15)
558	LDF	[BO + 46 * SIZE], b7
559	FMADD4	(aa2, bb8, cc16, cc16)
560	LDF	[BO + 47 * SIZE], b8
561
562	FMADD1	(aa3, bb9, cc01, cc01)
563	FMADD2	(aa4, bb9, cc02, cc02)
564	FMADD3	(aa3, bb2, cc03, cc03)
565	FMADD4	(aa4, bb2, cc04, cc04)
566
567	FMADD1	(aa3, bb3, cc05, cc05)
568	LDF	[BO + 56 * SIZE], b9
569	FMADD2	(aa4, bb3, cc06, cc06)
570	LDF	[BO + 49 * SIZE], b2
571
572	FMADD3	(aa3, bb4, cc07, cc07)
573	LDF	[BO + 50 * SIZE], b3
574	FMADD4	(aa4, bb4, cc08, cc08)
575	LDF	[BO + 51 * SIZE], b4
576
577	FMADD1	(aa3, bb5, cc09, cc09)
578	LDF	[AO + 12 * SIZE], a5
579	FMADD2	(aa4, bb5, cc10, cc10)
580	LDF	[AO + 13 * SIZE], a2
581
582	FMADD3	(aa3, bb6, cc11, cc11)
583	cmp	L, 0
584	FMADD4	(aa4, bb6, cc12, cc12)
585	nop
586
587	FMADD1	(aa3, bb7, cc13, cc13)
588	LDF	[BO + 52 * SIZE], b5
589	FMADD2	(aa4, bb7, cc14, cc14)
590	LDF	[BO + 53 * SIZE], b6
591
592	FMADD3	(aa3, bb8, cc15, cc15)
593	LDF	[BO + 54 * SIZE], b7
594	FMADD4	(aa4, bb8, cc16, cc16)
595	LDF	[BO + 55 * SIZE], b8
596
597	FMADD1	(aa5, bb1, cc01, cc01)
598	FMADD2	(aa2, bb1, cc02, cc02)
599	FMADD3	(aa5, bb2, cc03, cc03)
600	FMADD4	(aa2, bb2, cc04, cc04)
601
602	FMADD1	(aa5, bb3, cc05, cc05)
603	LDF	[BO + 64 * SIZE], b1
604	FMADD2	(aa2, bb3, cc06, cc06)
605	LDF	[BO + 57 * SIZE], b2
606
607	FMADD3	(aa5, bb4, cc07, cc07)
608	LDF	[BO + 58 * SIZE], b3
609	FMADD4	(aa2, bb4, cc08, cc08)
610	LDF	[BO + 59 * SIZE], b4
611
612	FMADD1	(aa5, bb5, cc09, cc09)
613	LDF	[AO + 14 * SIZE], a3
614	FMADD2	(aa2, bb5, cc10, cc10)
615	LDF	[AO + 15 * SIZE], a4
616
617	FMADD3	(aa5, bb6, cc11, cc11)
618	add	BO, 64 * SIZE, BO
619	FMADD4	(aa2, bb6, cc12, cc12)
620	add	AO, 16 * SIZE, AO
621
622	FMADD1	(aa5, bb7, cc13, cc13)
623	LDF	[BO -  4 * SIZE], b5
624	FMADD2	(aa2, bb7, cc14, cc14)
625	LDF	[BO -  3 * SIZE], b6
626
627	FMADD3	(aa5, bb8, cc15, cc15)
628	LDF	[BO -  2 * SIZE], b7
629	FMADD4	(aa2, bb8, cc16, cc16)
630	LDF	[BO -  1 * SIZE], b8
631
632	FMADD1	(aa3, bb9, cc01, cc01)
633	FMADD2	(aa4, bb9, cc02, cc02)
634	FMADD3	(aa3, bb2, cc03, cc03)
635	FMADD4	(aa4, bb2, cc04, cc04)
636
637	FMADD1	(aa3, bb3, cc05, cc05)
638	LDF	[BO +  8 * SIZE], b9
639	FMADD2	(aa4, bb3, cc06, cc06)
640	LDF	[BO +  1 * SIZE], b2
641
642	FMADD3	(aa3, bb4, cc07, cc07)
643	LDF	[BO +  2 * SIZE], b3
644	FMADD4	(aa4, bb4, cc08, cc08)
645	LDF	[BO +  3 * SIZE], b4
646
647	FMADD1	(aa3, bb5, cc09, cc09)
648	LDF	[AO +  8 * SIZE], a5  /****/
649	FMADD2	(aa4, bb5, cc10, cc10)
650	LDF	[AO +  1 * SIZE], a2
651
652	FMADD3	(aa3, bb6, cc11, cc11)
653	FMADD4	(aa4, bb6, cc12, cc12)
654
655	FMADD1	(aa3, bb7, cc13, cc13)
656	LDF	[BO +  4 * SIZE], b5
657	FMADD2	(aa4, bb7, cc14, cc14)
658	LDF	[BO +  5 * SIZE], b6
659
660	FMADD3	(aa3, bb8, cc15, cc15)
661	LDF	[BO +  6 * SIZE], b7
662	FMADD4	(aa4, bb8, cc16, cc16)
663	ble,pn	%icc, .LL15
664	LDF	[BO +  7 * SIZE], b8
665
666	FMADD1	(aa1, bb1, cc01, cc01)
667	FMADD2	(aa2, bb1, cc02, cc02)
668	FMADD3	(aa1, bb2, cc03, cc03)
669	FMADD4	(aa2, bb2, cc04, cc04)
670
671	FMADD1	(aa1, bb3, cc05, cc05)
672	LDF	[BO + 16 * SIZE], b1
673	FMADD2	(aa2, bb3, cc06, cc06)
674	LDF	[BO +  9 * SIZE], b2
675
676	FMADD3	(aa1, bb4, cc07, cc07)
677	LDF	[BO + 10 * SIZE], b3
678	FMADD4	(aa2, bb4, cc08, cc08)
679	LDF	[BO + 11 * SIZE], b4
680
681	FMADD1	(aa1, bb5, cc09, cc09)
682	LDF	[AO +  2 * SIZE], a3
683	FMADD2	(aa2, bb5, cc10, cc10)
684	LDF	[AO +  3 * SIZE], a4
685
686	FMADD3	(aa1, bb6, cc11, cc11)
687	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
688	FMADD4	(aa2, bb6, cc12, cc12)
689	nop
690
691	FMADD1	(aa1, bb7, cc13, cc13)
692	LDF	[BO + 12 * SIZE], b5
693	FMADD2	(aa2, bb7, cc14, cc14)
694	LDF	[BO + 13 * SIZE], b6
695
696	FMADD3	(aa1, bb8, cc15, cc15)
697	LDF	[BO + 14 * SIZE], b7
698	FMADD4	(aa2, bb8, cc16, cc16)
699	LDF	[BO + 15 * SIZE], b8
700
701	FMADD1	(aa3, bb9, cc01, cc01)
702	FMADD2	(aa4, bb9, cc02, cc02)
703	FMADD3	(aa3, bb2, cc03, cc03)
704	FMADD4	(aa4, bb2, cc04, cc04)
705
706	FMADD1	(aa3, bb3, cc05, cc05)
707	LDF	[BO + 24 * SIZE], b9
708	FMADD2	(aa4, bb3, cc06, cc06)
709	LDF	[BO + 17 * SIZE], b2
710
711	FMADD3	(aa3, bb4, cc07, cc07)
712	LDF	[BO + 18 * SIZE], b3
713	FMADD4	(aa4, bb4, cc08, cc08)
714	LDF	[BO + 19 * SIZE], b4
715
716	FMADD1	(aa3, bb5, cc09, cc09)
717	LDF	[AO +  4 * SIZE], a1
718	FMADD2	(aa4, bb5, cc10, cc10)
719	LDF	[AO +  5 * SIZE], a2
720
721	FMADD3	(aa3, bb6, cc11, cc11)
722	add	L, -1, L
723	FMADD4	(aa4, bb6, cc12, cc12)
724	nop
725
726	FMADD1	(aa3, bb7, cc13, cc13)
727	LDF	[BO + 20 * SIZE], b5
728	FMADD2	(aa4, bb7, cc14, cc14)
729	LDF	[BO + 21 * SIZE], b6
730
731	FMADD3	(aa3, bb8, cc15, cc15)
732	LDF	[BO + 22 * SIZE], b7
733	FMADD4	(aa4, bb8, cc16, cc16)
734	LDF	[BO + 23 * SIZE], b8
735
736	FMADD1	(aa1, bb1, cc01, cc01)
737	FMADD2	(aa2, bb1, cc02, cc02)
738	FMADD3	(aa1, bb2, cc03, cc03)
739	FMADD4	(aa2, bb2, cc04, cc04)
740
741	FMADD1	(aa1, bb3, cc05, cc05)
742	LDF	[BO + 32 * SIZE], b1
743	FMADD2	(aa2, bb3, cc06, cc06)
744	LDF	[BO + 25 * SIZE], b2
745
746	FMADD3	(aa1, bb4, cc07, cc07)
747	LDF	[BO + 26 * SIZE], b3
748	FMADD4	(aa2, bb4, cc08, cc08)
749	LDF	[BO + 27 * SIZE], b4
750
751	FMADD1	(aa1, bb5, cc09, cc09)
752	LDF	[AO +  6 * SIZE], a3
753	FMADD2	(aa2, bb5, cc10, cc10)
754	LDF	[AO +  7 * SIZE], a4
755
756	FMADD3	(aa1, bb6, cc11, cc11)
757	nop
758	FMADD4	(aa2, bb6, cc12, cc12)
759	nop
760
761	FMADD1	(aa1, bb7, cc13, cc13)
762	LDF	[BO + 28 * SIZE], b5
763	FMADD2	(aa2, bb7, cc14, cc14)
764	LDF	[BO + 29 * SIZE], b6
765
766	FMADD3	(aa1, bb8, cc15, cc15)
767	LDF	[BO + 30 * SIZE], b7
768	FMADD4	(aa2, bb8, cc16, cc16)
769	LDF	[BO + 31 * SIZE], b8
770
771	FMADD1	(aa3, bb9, cc01, cc01)
772	FMADD2	(aa4, bb9, cc02, cc02)
773	FMADD3	(aa3, bb2, cc03, cc03)
774	FMADD4	(aa4, bb2, cc04, cc04)
775
776	FMADD1	(aa3, bb3, cc05, cc05)
777	LDF	[BO + 40 * SIZE], b9
778	FMADD2	(aa4, bb3, cc06, cc06)
779	LDF	[BO + 33 * SIZE], b2
780
781	FMADD3	(aa3, bb4, cc07, cc07)
782	LDF	[BO + 34 * SIZE], b3
783	FMADD4	(aa4, bb4, cc08, cc08)
784	LDF	[BO + 35 * SIZE], b4
785
786	FMADD1	(aa3, bb5, cc09, cc09)
787	LDF	[AO + 16 * SIZE], a1  /****/
788	FMADD2	(aa4, bb5, cc10, cc10)
789	LDF	[AO +  9 * SIZE], a2
790
791	FMADD3	(aa3, bb6, cc11, cc11)
792	nop
793	FMADD4	(aa4, bb6, cc12, cc12)
794	nop
795
796	FMADD1	(aa3, bb7, cc13, cc13)
797	LDF	[BO + 36 * SIZE], b5
798	FMADD2	(aa4, bb7, cc14, cc14)
799	LDF	[BO + 37 * SIZE], b6
800
801	FMADD3	(aa3, bb8, cc15, cc15)
802	LDF	[BO + 38 * SIZE], b7
803	FMADD4	(aa4, bb8, cc16, cc16)
804	LDF	[BO + 39 * SIZE], b8
805
806	FMADD1	(aa5, bb1, cc01, cc01)
807	FMADD2	(aa2, bb1, cc02, cc02)
808	FMADD3	(aa5, bb2, cc03, cc03)
809	FMADD4	(aa2, bb2, cc04, cc04)
810
811	FMADD1	(aa5, bb3, cc05, cc05)
812	LDF	[BO + 48 * SIZE], b1
813	FMADD2	(aa2, bb3, cc06, cc06)
814	LDF	[BO + 41 * SIZE], b2
815
816	FMADD3	(aa5, bb4, cc07, cc07)
817	LDF	[BO + 42 * SIZE], b3
818	FMADD4	(aa2, bb4, cc08, cc08)
819	LDF	[BO + 43 * SIZE], b4
820
821	FMADD1	(aa5, bb5, cc09, cc09)
822	LDF	[AO + 10 * SIZE], a3
823	FMADD2	(aa2, bb5, cc10, cc10)
824	LDF	[AO + 11 * SIZE], a4
825
826	FMADD3	(aa5, bb6, cc11, cc11)
827	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
828	FMADD4	(aa2, bb6, cc12, cc12)
829	nop
830
831	FMADD1	(aa5, bb7, cc13, cc13)
832	LDF	[BO + 44 * SIZE], b5
833	FMADD2	(aa2, bb7, cc14, cc14)
834	LDF	[BO + 45 * SIZE], b6
835
836	FMADD3	(aa5, bb8, cc15, cc15)
837	LDF	[BO + 46 * SIZE], b7
838	FMADD4	(aa2, bb8, cc16, cc16)
839	LDF	[BO + 47 * SIZE], b8
840
841	FMADD1	(aa3, bb9, cc01, cc01)
842	FMADD2	(aa4, bb9, cc02, cc02)
843	FMADD3	(aa3, bb2, cc03, cc03)
844	FMADD4	(aa4, bb2, cc04, cc04)
845
846	FMADD1	(aa3, bb3, cc05, cc05)
847	LDF	[BO + 56 * SIZE], b9
848	FMADD2	(aa4, bb3, cc06, cc06)
849	LDF	[BO + 49 * SIZE], b2
850
851	FMADD3	(aa3, bb4, cc07, cc07)
852	LDF	[BO + 50 * SIZE], b3
853	FMADD4	(aa4, bb4, cc08, cc08)
854	LDF	[BO + 51 * SIZE], b4
855
856	FMADD1	(aa3, bb5, cc09, cc09)
857	LDF	[AO + 12 * SIZE], a5
858	FMADD2	(aa4, bb5, cc10, cc10)
859	LDF	[AO + 13 * SIZE], a2
860
861	FMADD3	(aa3, bb6, cc11, cc11)
862	cmp	L, 0
863	FMADD4	(aa4, bb6, cc12, cc12)
864	nop
865
866	FMADD1	(aa3, bb7, cc13, cc13)
867	LDF	[BO + 52 * SIZE], b5
868	FMADD2	(aa4, bb7, cc14, cc14)
869	LDF	[BO + 53 * SIZE], b6
870
871	FMADD3	(aa3, bb8, cc15, cc15)
872	LDF	[BO + 54 * SIZE], b7
873	FMADD4	(aa4, bb8, cc16, cc16)
874	LDF	[BO + 55 * SIZE], b8
875
876	FMADD1	(aa5, bb1, cc01, cc01)
877	FMADD2	(aa2, bb1, cc02, cc02)
878	FMADD3	(aa5, bb2, cc03, cc03)
879	FMADD4	(aa2, bb2, cc04, cc04)
880
881	FMADD1	(aa5, bb3, cc05, cc05)
882	LDF	[BO + 64 * SIZE], b1
883	FMADD2	(aa2, bb3, cc06, cc06)
884	LDF	[BO + 57 * SIZE], b2
885
886	FMADD3	(aa5, bb4, cc07, cc07)
887	LDF	[BO + 58 * SIZE], b3
888	FMADD4	(aa2, bb4, cc08, cc08)
889	LDF	[BO + 59 * SIZE], b4
890
891	FMADD1	(aa5, bb5, cc09, cc09)
892	LDF	[AO + 14 * SIZE], a3
893	FMADD2	(aa2, bb5, cc10, cc10)
894	LDF	[AO + 15 * SIZE], a4
895
896	FMADD3	(aa5, bb6, cc11, cc11)
897	add	BO, 64 * SIZE, BO
898	FMADD4	(aa2, bb6, cc12, cc12)
899	add	AO, 16 * SIZE, AO
900
901	FMADD1	(aa5, bb7, cc13, cc13)
902	LDF	[BO -  4 * SIZE], b5
903	FMADD2	(aa2, bb7, cc14, cc14)
904	LDF	[BO -  3 * SIZE], b6
905
906	FMADD3	(aa5, bb8, cc15, cc15)
907	LDF	[BO -  2 * SIZE], b7
908	FMADD4	(aa2, bb8, cc16, cc16)
909	LDF	[BO -  1 * SIZE], b8
910
911	FMADD1	(aa3, bb9, cc01, cc01)
912	FMADD2	(aa4, bb9, cc02, cc02)
913	FMADD3	(aa3, bb2, cc03, cc03)
914	FMADD4	(aa4, bb2, cc04, cc04)
915
916	FMADD1	(aa3, bb3, cc05, cc05)
917	LDF	[BO +  8 * SIZE], b9
918	FMADD2	(aa4, bb3, cc06, cc06)
919	LDF	[BO +  1 * SIZE], b2
920
921	FMADD3	(aa3, bb4, cc07, cc07)
922	LDF	[BO +  2 * SIZE], b3
923	FMADD4	(aa4, bb4, cc08, cc08)
924	LDF	[BO +  3 * SIZE], b4
925
926	FMADD1	(aa3, bb5, cc09, cc09)
927	LDF	[AO +  8 * SIZE], a5  /****/
928	FMADD2	(aa4, bb5, cc10, cc10)
929	LDF	[AO +  1 * SIZE], a2
930
931	FMADD3	(aa3, bb6, cc11, cc11)
932	FMADD4	(aa4, bb6, cc12, cc12)
933
934	FMADD1	(aa3, bb7, cc13, cc13)
935	LDF	[BO +  4 * SIZE], b5
936	FMADD2	(aa4, bb7, cc14, cc14)
937	LDF	[BO +  5 * SIZE], b6
938
939	FMADD3	(aa3, bb8, cc15, cc15)
940	LDF	[BO +  6 * SIZE], b7
941	FMADD4	(aa4, bb8, cc16, cc16)
942	bg,pt	%icc, .LL13
943	LDF	[BO +  7 * SIZE], b8
944	.align 4
945
946.LL15:
947#if defined(LT) || defined(RN)
948	and	KK, 7, L
949#else
950	sub	K, KK, L
951	and	L,  7, L
952#endif
953	cmp	L,  0
954	ble,a,pn %icc, .LL18
955	nop
956	.align 4
957
958.LL17:
959	FMADD1	(aa1, bb1, cc01, cc01)
960	add	L, -1, L
961	FMADD2	(aa2, bb1, cc02, cc02)
962	nop
963
964	FMADD3	(aa1, bb2, cc03, cc03)
965	LDF	[BO +  8 * SIZE], b1
966	FMADD4	(aa2, bb2, cc04, cc04)
967	LDF	[BO +  9 * SIZE], b2
968
969	FMADD1	(aa1, bb3, cc05, cc05)
970	cmp	L, 0
971	FMADD2	(aa2, bb3, cc06, cc06)
972	nop
973
974	FMADD3	(aa1, bb4, cc07, cc07)
975	LDF	[BO + 10 * SIZE], b3
976	FMADD4	(aa2, bb4, cc08, cc08)
977	LDF	[BO + 11 * SIZE], b4
978
979	FMADD1	(aa1, bb5, cc09, cc09)
980	nop
981	FMADD2	(aa2, bb5, cc10, cc10)
982	nop
983
984	FMADD3	(aa1, bb6, cc11, cc11)
985	LDF	[BO + 12 * SIZE], b5
986	FMADD4	(aa2, bb6, cc12, cc12)
987	LDF	[BO + 13 * SIZE], b6
988
989	FMADD1	(aa1, bb7, cc13, cc13)
990	add	AO, 2 * SIZE, AO
991	FMADD2	(aa2, bb7, cc14, cc14)
992	add	BO, 8 * SIZE, BO
993
994	FMADD3	(aa1, bb8, cc15, cc15)
995	LDF	[AO +  0 * SIZE], a1
996	FMADD4	(aa2, bb8, cc16, cc16)
997	LDF	[AO +  1 * SIZE], a2
998
999	LDF	[BO +  6 * SIZE], b7
1000	bg,pt	%icc, .LL17
1001	LDF	[BO +  7 * SIZE], b8
1002	nop
1003	.align 4
1004
1005.LL18:
1006	FADD	c01, c04, c01
1007	FADD	c02, c03, c02
1008	FADD	c05, c08, c05
1009	FADD	c06, c07, c06
1010
1011	FADD	c09, c12, c09
1012	FADD	c10, c11, c10
1013	FADD	c13, c16, c13
1014	FADD	c14, c15, c14
1015
1016#if defined(LN) || defined(RT)
1017#ifdef LN
1018	sub	KK, 1, TEMP1
1019#else
1020	sub	KK, 4, TEMP1
1021#endif
1022	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1023	sll	TEMP1, ZBASE_SHIFT + 2, TEMP1
1024
1025	add	AORIG, TEMP2, AO
1026	add	B,     TEMP1, BO
1027#endif
1028
1029#if defined(LN) || defined(LT)
1030	LDF	[BO +  0 * SIZE], a1
1031	LDF	[BO +  1 * SIZE], a2
1032	LDF	[BO +  2 * SIZE], a3
1033	LDF	[BO +  3 * SIZE], a4
1034
1035	LDF	[BO +  4 * SIZE], b1
1036	LDF	[BO +  5 * SIZE], b2
1037	LDF	[BO +  6 * SIZE], b3
1038	LDF	[BO +  7 * SIZE], b4
1039#else
1040	LDF	[AO +  0 * SIZE], a1
1041	LDF	[AO +  1 * SIZE], a2
1042	LDF	[AO +  2 * SIZE], a3
1043	LDF	[AO +  3 * SIZE], a4
1044
1045	LDF	[AO +  4 * SIZE], b1
1046	LDF	[AO +  5 * SIZE], b2
1047	LDF	[AO +  6 * SIZE], b3
1048	LDF	[AO +  7 * SIZE], b4
1049#endif
1050
1051	FSUB	a1, c01, c01
1052	FSUB	a2, c02, c02
1053	FSUB	a3, c05, c05
1054	FSUB	a4, c06, c06
1055
1056	FSUB	b1, c09, c09
1057	FSUB	b2, c10, c10
1058	FSUB	b3, c13, c13
1059	FSUB	b4, c14, c14
1060
1061#if defined(LN) || defined(LT)
1062	LDF	[AO +  0 * SIZE], a1
1063	LDF	[AO +  1 * SIZE], a2
1064
1065	FMUL	a1, c01, b1
1066	FMUL	a2, c01, b2
1067	FMUL	a1, c05, b3
1068	FMUL	a2, c05, b4
1069	FMUL	a1, c09, b5
1070	FMUL	a2, c09, b6
1071	FMUL	a1, c13, b7
1072	FMUL	a2, c13, b8
1073
1074#ifndef CONJ
1075	FNMSUB	(aa2, cc02, bb1, cc01)
1076	FMADD	(aa1, cc02, bb2, cc02)
1077	FNMSUB	(aa2, cc06, bb3, cc05)
1078	FMADD	(aa1, cc06, bb4, cc06)
1079	FNMSUB	(aa2, cc10, bb5, cc09)
1080	FMADD	(aa1, cc10, bb6, cc10)
1081	FNMSUB	(aa2, cc14, bb7, cc13)
1082	FMADD	(aa1, cc14, bb8, cc14)
1083#else
1084	FMADD	(aa2, cc02, bb1, cc01)
1085	FMSUB	(aa1, cc02, bb2, cc02)
1086	FMADD	(aa2, cc06, bb3, cc05)
1087	FMSUB	(aa1, cc06, bb4, cc06)
1088	FMADD	(aa2, cc10, bb5, cc09)
1089	FMSUB	(aa1, cc10, bb6, cc10)
1090	FMADD	(aa2, cc14, bb7, cc13)
1091	FMSUB	(aa1, cc14, bb8, cc14)
1092#endif
1093#endif
1094
1095#ifdef RN
1096	LDF	[BO +  0 * SIZE], b1
1097	LDF	[BO +  1 * SIZE], b2
1098	LDF	[BO +  2 * SIZE], b3
1099	LDF	[BO +  3 * SIZE], b4
1100	LDF	[BO +  4 * SIZE], b5
1101	LDF	[BO +  5 * SIZE], b6
1102	LDF	[BO +  6 * SIZE], b7
1103	LDF	[BO +  7 * SIZE], b8
1104
1105	FMUL	b1, c01, a1
1106	FMUL	b2, c01, a2
1107
1108#ifndef CONJ
1109	FNMSUB	(bb2, cc02, aa1, cc01)
1110	FMADD	(bb1, cc02, aa2, cc02)
1111#else
1112	FMADD	(bb2, cc02, aa1, cc01)
1113	FMSUB	(bb1, cc02, aa2, cc02)
1114#endif
1115
1116	FNMSUB	(bb3, cc01, cc05, cc05)
1117	FNMSUB	(bb3, cc02, cc06, cc06)
1118	FNMSUB	(bb5, cc01, cc09, cc09)
1119	FNMSUB	(bb5, cc02, cc10, cc10)
1120	FNMSUB	(bb7, cc01, cc13, cc13)
1121	FNMSUB	(bb7, cc02, cc14, cc14)
1122
1123#ifndef CONJ
1124	FMADD	(bb4, cc02, cc05, cc05)
1125	FNMSUB	(bb4, cc01, cc06, cc06)
1126	FMADD	(bb6, cc02, cc09, cc09)
1127	FNMSUB	(bb6, cc01, cc10, cc10)
1128	FMADD	(bb8, cc02, cc13, cc13)
1129	FNMSUB	(bb8, cc01, cc14, cc14)
1130#else
1131	FNMSUB	(bb4, cc02, cc05, cc05)
1132	FMADD	(bb4, cc01, cc06, cc06)
1133	FNMSUB	(bb6, cc02, cc09, cc09)
1134	FMADD	(bb6, cc01, cc10, cc10)
1135	FNMSUB	(bb8, cc02, cc13, cc13)
1136	FMADD	(bb8, cc01, cc14, cc14)
1137#endif
1138
1139	LDF	[BO + 10 * SIZE], b1
1140	LDF	[BO + 11 * SIZE], b2
1141	LDF	[BO + 12 * SIZE], b3
1142	LDF	[BO + 13 * SIZE], b4
1143	LDF	[BO + 14 * SIZE], b5
1144	LDF	[BO + 15 * SIZE], b6
1145
1146	FMUL	b1, c05, a1
1147	FMUL	b2, c05, a2
1148
1149#ifndef CONJ
1150	FNMSUB	(bb2, cc06, aa1, cc05)
1151	FMADD	(bb1, cc06, aa2, cc06)
1152#else
1153	FMADD	(bb2, cc06, aa1, cc05)
1154	FMSUB	(bb1, cc06, aa2, cc06)
1155#endif
1156
1157	FNMSUB	(bb3, cc05, cc09, cc09)
1158	FNMSUB	(bb3, cc06, cc10, cc10)
1159	FNMSUB	(bb5, cc05, cc13, cc13)
1160	FNMSUB	(bb5, cc06, cc14, cc14)
1161
1162#ifndef CONJ
1163	FMADD	(bb4, cc06, cc09, cc09)
1164	FNMSUB	(bb4, cc05, cc10, cc10)
1165	FMADD	(bb6, cc06, cc13, cc13)
1166	FNMSUB	(bb6, cc05, cc14, cc14)
1167#else
1168	FNMSUB	(bb4, cc06, cc09, cc09)
1169	FMADD	(bb4, cc05, cc10, cc10)
1170	FNMSUB	(bb6, cc06, cc13, cc13)
1171	FMADD	(bb6, cc05, cc14, cc14)
1172#endif
1173
1174	LDF	[BO + 20 * SIZE], b1
1175	LDF	[BO + 21 * SIZE], b2
1176	LDF	[BO + 22 * SIZE], b3
1177	LDF	[BO + 23 * SIZE], b4
1178
1179	FMUL	b1, c09, a1
1180	FMUL	b2, c09, a2
1181
1182#ifndef CONJ
1183	FNMSUB	(bb2, cc10, aa1, cc09)
1184	FMADD	(bb1, cc10, aa2, cc10)
1185#else
1186	FMADD	(bb2, cc10, aa1, cc09)
1187	FMSUB	(bb1, cc10, aa2, cc10)
1188#endif
1189
1190	FNMSUB	(bb3, cc09, cc13, cc13)
1191	FNMSUB	(bb3, cc10, cc14, cc14)
1192
1193#ifndef CONJ
1194	FMADD	(bb4, cc10, cc13, cc13)
1195	FNMSUB	(bb4, cc09, cc14, cc14)
1196#else
1197	FNMSUB	(bb4, cc10, cc13, cc13)
1198	FMADD	(bb4, cc09, cc14, cc14)
1199#endif
1200
1201	LDF	[BO + 30 * SIZE], b1
1202	LDF	[BO + 31 * SIZE], b2
1203
1204	FMUL	b1, c13, a1
1205	FMUL	b2, c13, a2
1206
1207#ifndef CONJ
1208	FNMSUB	(bb2, cc14, aa1, cc13)
1209	FMADD	(bb1, cc14, aa2, cc14)
1210#else
1211	FMADD	(bb2, cc14, aa1, cc13)
1212	FMSUB	(bb1, cc14, aa2, cc14)
1213#endif
1214#endif
1215
1216#ifdef RT
1217	LDF	[BO + 30 * SIZE], b1
1218	LDF	[BO + 31 * SIZE], b2
1219	LDF	[BO + 28 * SIZE], b3
1220	LDF	[BO + 29 * SIZE], b4
1221	LDF	[BO + 26 * SIZE], b5
1222	LDF	[BO + 27 * SIZE], b6
1223	LDF	[BO + 24 * SIZE], b7
1224	LDF	[BO + 25 * SIZE], b8
1225
1226	FMUL	b1, c13, a1
1227	FMUL	b2, c13, a2
1228
1229#ifndef CONJ
1230	FNMSUB	(bb2, cc14, aa1, cc13)
1231	FMADD	(bb1, cc14, aa2, cc14)
1232#else
1233	FMADD	(bb2, cc14, aa1, cc13)
1234	FMSUB	(bb1, cc14, aa2, cc14)
1235#endif
1236
1237	FNMSUB	(bb3, cc13, cc09, cc09)
1238	FNMSUB	(bb3, cc14, cc10, cc10)
1239	FNMSUB	(bb5, cc13, cc05, cc05)
1240	FNMSUB	(bb5, cc14, cc06, cc06)
1241	FNMSUB	(bb7, cc13, cc01, cc01)
1242	FNMSUB	(bb7, cc14, cc02, cc02)
1243
1244#ifndef CONJ
1245	FMADD	(bb4, cc14, cc09, cc09)
1246	FNMSUB	(bb4, cc13, cc10, cc10)
1247	FMADD	(bb6, cc14, cc05, cc05)
1248	FNMSUB	(bb6, cc13, cc06, cc06)
1249	FMADD	(bb8, cc14, cc01, cc01)
1250	FNMSUB	(bb8, cc13, cc02, cc02)
1251#else
1252	FNMSUB	(bb4, cc14, cc09, cc09)
1253	FMADD	(bb4, cc13, cc10, cc10)
1254	FNMSUB	(bb6, cc14, cc05, cc05)
1255	FMADD	(bb6, cc13, cc06, cc06)
1256	FNMSUB	(bb8, cc14, cc01, cc01)
1257	FMADD	(bb8, cc13, cc02, cc02)
1258#endif
1259
1260	LDF	[BO + 20 * SIZE], b1
1261	LDF	[BO + 21 * SIZE], b2
1262	LDF	[BO + 18 * SIZE], b3
1263	LDF	[BO + 19 * SIZE], b4
1264	LDF	[BO + 16 * SIZE], b5
1265	LDF	[BO + 17 * SIZE], b6
1266
1267	FMUL	b1, c09, a1
1268	FMUL	b2, c09, a2
1269
1270#ifndef CONJ
1271	FNMSUB	(bb2, cc10, aa1, cc09)
1272	FMADD	(bb1, cc10, aa2, cc10)
1273#else
1274	FMADD	(bb2, cc10, aa1, cc09)
1275	FMSUB	(bb1, cc10, aa2, cc10)
1276#endif
1277
1278	FNMSUB	(bb3, cc09, cc05, cc05)
1279	FNMSUB	(bb3, cc10, cc06, cc06)
1280	FNMSUB	(bb5, cc09, cc01, cc01)
1281	FNMSUB	(bb5, cc10, cc02, cc02)
1282
1283#ifndef CONJ
1284	FMADD	(bb4, cc10, cc05, cc05)
1285	FNMSUB	(bb4, cc09, cc06, cc06)
1286	FMADD	(bb6, cc10, cc01, cc01)
1287	FNMSUB	(bb6, cc09, cc02, cc02)
1288#else
1289	FNMSUB	(bb4, cc10, cc05, cc05)
1290	FMADD	(bb4, cc09, cc06, cc06)
1291	FNMSUB	(bb6, cc10, cc01, cc01)
1292	FMADD	(bb6, cc09, cc02, cc02)
1293#endif
1294
1295	LDF	[BO + 10 * SIZE], b1
1296	LDF	[BO + 11 * SIZE], b2
1297	LDF	[BO +  8 * SIZE], b3
1298	LDF	[BO +  9 * SIZE], b4
1299
1300	FMUL	b1, c05, a1
1301	FMUL	b2, c05, a2
1302
1303#ifndef CONJ
1304	FNMSUB	(bb2, cc06, aa1, cc05)
1305	FMADD	(bb1, cc06, aa2, cc06)
1306#else
1307	FMADD	(bb2, cc06, aa1, cc05)
1308	FMSUB	(bb1, cc06, aa2, cc06)
1309#endif
1310
1311	FNMSUB	(bb3, cc05, cc01, cc01)
1312	FNMSUB	(bb3, cc06, cc02, cc02)
1313
1314#ifndef CONJ
1315	FMADD	(bb4, cc06, cc01, cc01)
1316	FNMSUB	(bb4, cc05, cc02, cc02)
1317#else
1318	FNMSUB	(bb4, cc06, cc01, cc01)
1319	FMADD	(bb4, cc05, cc02, cc02)
1320#endif
1321
1322	LDF	[BO +  0 * SIZE], b1
1323	LDF	[BO +  1 * SIZE], b2
1324
1325	FMUL	b1, c01, a1
1326	FMUL	b2, c01, a2
1327
1328#ifndef CONJ
1329	FNMSUB	(bb2, cc02, aa1, cc01)
1330	FMADD	(bb1, cc02, aa2, cc02)
1331#else
1332	FMADD	(bb2, cc02, aa1, cc01)
1333	FMSUB	(bb1, cc02, aa2, cc02)
1334#endif
1335#endif
1336
1337#ifdef LN
1338	add	C1, -2 * SIZE, C1
1339	add	C2, -2 * SIZE, C2
1340	add	C3, -2 * SIZE, C3
1341	add	C4, -2 * SIZE, C4
1342#endif
1343
1344#if defined(LN) || defined(LT)
1345	STF	c01, [BO +  0 * SIZE]
1346	STF	c02, [BO +  1 * SIZE]
1347	STF	c05, [BO +  2 * SIZE]
1348	STF	c06, [BO +  3 * SIZE]
1349
1350	STF	c09, [BO +  4 * SIZE]
1351	STF	c10, [BO +  5 * SIZE]
1352	STF	c13, [BO +  6 * SIZE]
1353	STF	c14, [BO +  7 * SIZE]
1354#else
1355	STF	c01, [AO +  0 * SIZE]
1356	STF	c02, [AO +  1 * SIZE]
1357	STF	c05, [AO +  2 * SIZE]
1358	STF	c06, [AO +  3 * SIZE]
1359
1360	STF	c09, [AO +  4 * SIZE]
1361	STF	c10, [AO +  5 * SIZE]
1362	STF	c13, [AO +  6 * SIZE]
1363	STF	c14, [AO +  7 * SIZE]
1364#endif
1365
1366	STF	c01, [C1 + 0 * SIZE]
1367	STF	c02, [C1 + 1 * SIZE]
1368	STF	c05, [C2 + 0 * SIZE]
1369	STF	c06, [C2 + 1 * SIZE]
1370
1371	STF	c09, [C3 + 0 * SIZE]
1372	STF	c10, [C3 + 1 * SIZE]
1373	STF	c13, [C4 + 0 * SIZE]
1374	STF	c14, [C4 + 1 * SIZE]
1375
1376#ifndef LN
1377	add	C1, 2 * SIZE, C1
1378	add	C2, 2 * SIZE, C2
1379	add	C3, 2 * SIZE, C3
1380	add	C4, 2 * SIZE, C4
1381#endif
1382
1383#ifdef RT
1384	sll	K, ZBASE_SHIFT, TEMP1
1385	add	AORIG, TEMP1, AORIG
1386#endif
1387
1388#if defined(LT) || defined(RN)
1389	sub	K, KK, TEMP1
1390	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1391	sll	TEMP1, ZBASE_SHIFT + 2, TEMP1
1392	add	AO, TEMP2, AO
1393	add	BO, TEMP1, BO
1394#endif
1395
1396#ifdef LT
1397	add	KK, 1, KK
1398#endif
1399
1400#ifdef LN
1401	sub	KK, 1, KK
1402#endif
1403
1404	add	I, -1, I
1405	cmp	I, 0
1406	bg,pt	%icc, .LL12
1407	nop
1408
1409#ifdef LN
1410	sll	K, ZBASE_SHIFT + 2, TEMP1
1411	add	B, TEMP1, B
1412#endif
1413
1414#if defined(LT) || defined(RN)
1415	mov	BO, B
1416#endif
1417
1418#ifdef RN
1419	add	KK, 4, KK
1420#endif
1421
1422#ifdef RT
1423	sub	KK, 4, KK
1424#endif
1425
1426	add	J, -1, J
1427	cmp	J, 0
1428	bg,pt	%icc, .LL11
1429	nop
1430	.align 4
1431
/* .LL20: two-column remainder of N.                                  */
/* Tests (N & 2); when the result is zero control skips to .LL30.     */
/* Otherwise this sets up the column pointers C1/C2, the KK counter   */
/* and the A pointer, copies M into the row counter I and falls       */
/* through into .LL22.                                                */
/* NOTE(review): ZBASE_SHIFT and SIZE are defined outside this chunk; */
/* presumably ZBASE_SHIFT is log2 of the byte size of one complex     */
/* element - confirm.                                                 */
1432.LL20:
1433	and	N, 2, J
1434	cmp	J, 0
1435	ble,pn	%icc, .LL30
/* Branch delay slot. */
1436	nop
1437
/* RT: B = B - (K << (ZBASE_SHIFT + 1)). */
1438#ifdef RT
1439	sll	K, ZBASE_SHIFT + 1, TEMP1
1440	sub	B, TEMP1, B
1441#endif
1442
/* Not RT: C1 = C, C2 = C + LDC, then C = C2 + LDC.                   */
/* RT:     C2 = C - LDC, C1 = C2 - LDC, then C = C2 - LDC (== C1).    */
1443#ifndef RT
1444	mov	C,  C1
1445	add	C,  LDC, C2
1446	add	C2, LDC, C
1447#else
1448	sub	C,  LDC, C2
1449	sub	C2, LDC, C1
1450	sub	C2, LDC, C
1451#endif
1452
/* LN: KK = M + OFFSET. */
1453#ifdef LN
1454	add	M, OFFSET, KK
1455#endif
1456
/* LT: KK = OFFSET. */
1457#ifdef LT
1458	mov	OFFSET, KK
1459#endif
1460
/* LN/RT keep the start of A in AORIG; LT/RN load it straight into AO. */
1461#if defined(LN) || defined(RT)
1462	mov	A, AORIG
1463#else
1464	mov	A, AO
1465#endif
1466
/* I = M: row counter consumed by the loop that ends in .LL28. */
1467	mov	M, I
1468	.align 4
1469
/* .LL22: head of the row loop for the two-column case; re-entered    */
/* from the bottom of .LL28 while I > 0.  Positions AO/BO, preloads   */
/* a1-a2 and b1-b9, clears accumulators cc01-cc08 and derives the     */
/* trip count L of the unrolled loop .LL23.                           */
1470.LL22:
/* LT/RN: BO restarts at B. */
1471#if defined(LT) || defined(RN)
1472	mov	B, BO
1473#else
/* LN only: AORIG = AORIG - (K << ZBASE_SHIFT). */
1474#ifdef LN
1475	sll	K,  ZBASE_SHIFT, TEMP1
1476	sub	AORIG, TEMP1, AORIG
1477#endif
1478
/* LN/RT: AO = AORIG + (KK << ZBASE_SHIFT),                           */
/*        BO = B     + (KK << (ZBASE_SHIFT + 1)).                     */
1479	sll	KK, ZBASE_SHIFT + 0, TEMP1
1480	sll	KK, ZBASE_SHIFT + 1, TEMP2
1481
1482	add	AORIG, TEMP1, AO
1483	add	B,     TEMP2, BO
1484#endif
1485
/* Preload the operands of the first multiply-add group: AO[0..1]     */
/* into a1-a2 and BO[0..8] into b1-b9.  The accumulator clears are    */
/* interleaved with the loads.                                        */
1486	LDF	[AO +  0 * SIZE], a1
1487	LDF	[AO +  1 * SIZE], a2
1488
1489	LDF	[BO +  0 * SIZE], b1
1490	LDF	[BO +  1 * SIZE], b2
1491	LDF	[BO +  2 * SIZE], b3
1492	LDF	[BO +  3 * SIZE], b4
1493	LDF	[BO +  4 * SIZE], b5
1494	FCLR	(cc01)
1495
1496	LDF	[BO +  5 * SIZE], b6
1497	FCLR	(cc02)
1498	LDF	[BO +  6 * SIZE], b7
1499	FCLR	(cc03)
1500	LDF	[BO +  7 * SIZE], b8
1501	FCLR	(cc04)
1502	LDF	[BO +  8 * SIZE], b9
1503	FCLR	(cc05)
1504
/* Prefetch the two C locations that .LL28 stores to. */
1505	prefetch [C1 + 2 * SIZE], 3
1506	FCLR	(cc06)
1507	prefetch [C2 + 2 * SIZE], 3
1508	FCLR	(cc07)
1509
/* L = KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
1510#if defined(LT) || defined(RN)
1511	sra	KK, 2, L
1512#else
1513	sub	K, KK, L
1514	sra	L,  2, L
1515#endif
1516	cmp	L,  0
1517	ble,pn	%icc, .LL25
/* Delay slot of a non-annulled branch: cc08 is cleared on both paths. */
1518	FCLR	(cc08)
1519	.align 4
1520
/* .LL23: unrolled accumulation loop for the two-column case, run L   */
/* times.  One pass issues four groups of eight multiply-adds, reads  */
/* AO[0..7] and BO[0..15], and advances AO by 8 * SIZE and BO by      */
/* 16 * SIZE.  Loads for later groups are interleaved with the        */
/* multiply-adds of the current one.                                  */
/* Accumulator usage: cc01/cc02 take products with b1, b5 and b9,     */
/* cc03/cc04 with b2 and b6, cc05/cc06 with b3 and b7, cc07/cc08      */
/* with b4 and b8.                                                    */
/* NOTE(review): FMADD1-FMADD4 and the aaN/bbN/ccNN operand names are */
/* defined outside this chunk; presumably they alias aN/bN/cNN and    */
/* the four variants differ in sign according to CONJ - confirm.      */
1521.LL23:
/* Group 1: (a1, a2) with b1-b4. */
1522	FMADD1	(aa1, bb1, cc01, cc01)
1523	LDF	[AO +  2 * SIZE], a3
1524	FMADD2	(aa2, bb1, cc02, cc02)
1525	LDF	[AO +  3 * SIZE], a4
1526
1527	FMADD3	(aa1, bb2, cc03, cc03)
/* b1 for the next pass: BO + 16 becomes BO + 0 after the bump below. */
1528	LDF	[BO + 16 * SIZE], b1
1529	FMADD4	(aa2, bb2, cc04, cc04)
1530	LDF	[BO +  9 * SIZE], b2
1531
1532	FMADD1	(aa1, bb3, cc05, cc05)
1533	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1534	FMADD2	(aa2, bb3, cc06, cc06)
/* Trip counter decrement; the test against zero follows in group 2. */
1535	add	L, -1, L
1536
1537	FMADD3	(aa1, bb4, cc07, cc07)
1538	LDF	[BO + 10 * SIZE], b3
1539	FMADD4	(aa2, bb4, cc08, cc08)
1540	LDF	[BO + 11 * SIZE], b4
1541
/* Group 2: (a3, a4) with b5-b8. */
1542	FMADD1	(aa3, bb5, cc01, cc01)
1543	LDF	[AO +  4 * SIZE], a1
1544	FMADD2	(aa4, bb5, cc02, cc02)
1545	LDF	[AO +  5 * SIZE], a2
1546
1547	FMADD3	(aa3, bb6, cc03, cc03)
1548	LDF	[BO + 12 * SIZE], b5
1549	FMADD4	(aa4, bb6, cc04, cc04)
1550	LDF	[BO + 13 * SIZE], b6
1551
1552	FMADD1	(aa3, bb7, cc05, cc05)
/* Sets %icc for the bg at the bottom of the loop; the add       */
/* instructions that follow are the non-cc forms.                */
1553	cmp	L, 0
1554	FMADD2	(aa4, bb7, cc06, cc06)
/* AO advances here, so the remaining A loads of this pass use        */
/* negative or small offsets relative to the new AO.                  */
1555	add	AO,  8 * SIZE, AO
1556
1557	FMADD3	(aa3, bb8, cc07, cc07)
1558	LDF	[BO + 14 * SIZE], b7
1559	FMADD4	(aa4, bb8, cc08, cc08)
1560	LDF	[BO + 15 * SIZE], b8
1561
/* Group 3: (a1, a2) with b9, b2, b3, b4. */
1562	FMADD1	(aa1, bb9, cc01, cc01)
1563	LDF	[AO -  2 * SIZE], a3
1564	FMADD2	(aa2, bb9, cc02, cc02)
1565	LDF	[AO -  1 * SIZE], a4
1566
1567	FMADD3	(aa1, bb2, cc03, cc03)
/* b9 for the next pass: BO + 24 becomes BO + 8 after the bump below. */
1568	LDF	[BO + 24 * SIZE], b9
1569	FMADD4	(aa2, bb2, cc04, cc04)
1570	LDF	[BO + 17 * SIZE], b2
1571
1572	FMADD1	(aa1, bb3, cc05, cc05)
/* BO advances here; later B loads are relative to the new BO. */
1573	add	BO, 16 * SIZE, BO
1574	FMADD2	(aa2, bb3, cc06, cc06)
1575	nop
1576
1577	FMADD3	(aa1, bb4, cc07, cc07)
1578	LDF	[BO +  2 * SIZE], b3
1579	FMADD4	(aa2, bb4, cc08, cc08)
1580	LDF	[BO +  3 * SIZE], b4
1581
/* Group 4: (a3, a4) with b5-b8; reloads a1-a2 and b5-b8 for the      */
/* next pass (or for .LL27).                                          */
1582	FMADD1	(aa3, bb5, cc01, cc01)
1583	LDF	[AO +  0 * SIZE], a1
1584	FMADD2	(aa4, bb5, cc02, cc02)
1585	LDF	[AO +  1 * SIZE], a2
1586	FMADD3	(aa3, bb6, cc03, cc03)
1587	LDF	[BO +  4 * SIZE], b5
1588	FMADD4	(aa4, bb6, cc04, cc04)
1589	LDF	[BO +  5 * SIZE], b6
1590
1591	FMADD1	(aa3, bb7, cc05, cc05)
1592	nop
1593	FMADD2	(aa4, bb7, cc06, cc06)
1594	LDF	[BO +  6 * SIZE], b7
1595
1596	FMADD3	(aa3, bb8, cc07, cc07)
1597	FMADD4	(aa4, bb8, cc08, cc08)
/* Repeat while L > 0 (condition codes from the cmp in group 2). */
1598	bg,pt	%icc, .LL23
/* Delay slot: reload b8, executed whether or not the branch is taken. */
1599	LDF	[BO +  7 * SIZE], b8
1600	.align 4
1601
/* .LL25: trip count for the steps left over from the 4x unrolled     */
/* loop .LL23.  L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when    */
/* L <= 0 the leftover loop .LL27 is bypassed by jumping to .LL28.    */
1602.LL25:
1603#if defined(LT) || defined(RN)
1604	and	KK, 3, L
1605#else
1606	sub	K, KK, L
1607	and	L,  3, L
1608#endif
1609	cmp	L,  0
1610	ble,a,pn %icc, .LL28
/* Delay slot of an annulled branch; it holds only a nop. */
1611	nop
1612	.align 4
1613
/* .LL27: leftover accumulation loop for the two-column case, run L   */
/* times.  One pass issues a single group of eight multiply-adds      */
/* using (a1, a2) and b1-b4, then advances AO by 2 * SIZE and BO by   */
/* 4 * SIZE while reloading the same registers for the next pass.     */
1614.LL27:
1615	FMADD1	(aa1, bb1, cc01, cc01)
1616	add	L, -1, L
1617	FMADD2	(aa2, bb1, cc02, cc02)
1618	LDF	[BO + 4 * SIZE], b1
1619
1620	FMADD3	(aa1, bb2, cc03, cc03)
/* AO advances here; the a1/a2 reloads below use the new AO. */
1621	add	AO, 2 * SIZE, AO
1622	FMADD4	(aa2, bb2, cc04, cc04)
1623	LDF	[BO + 5 * SIZE], b2
1624
1625	FMADD1	(aa1, bb3, cc05, cc05)
/* Sets %icc for the bg at the bottom of the loop. */
1626	cmp	L, 0
1627	FMADD2	(aa2, bb3, cc06, cc06)
1628	LDF	[BO + 6 * SIZE], b3
1629
1630	FMADD3	(aa1, bb4, cc07, cc07)
1631	LDF	[AO + 0 * SIZE], a1
1632	FMADD4	(aa2, bb4, cc08, cc08)
1633	LDF	[AO + 1 * SIZE], a2
1634
1635	LDF	[BO + 7 * SIZE], b4
1636	bg,pt	%icc, .LL27
/* Delay slot: BO advances on every pass, including the last one. */
1637	add	BO, 4 * SIZE, BO
1638	.align 4
1639
/* .LL28: finish one row of the two-column case.  Folds the partial   */
/* accumulators, subtracts them from the values read from the packed  */
/* buffer, applies the solve step selected by LN/LT/RN/RT, stores the */
/* result to the packed buffer and to C1/C2, then updates pointers    */
/* and counters and loops back to .LL22 while I > 0.                  */
/* NOTE(review): FMADD/FMSUB/FNMSUB are macros defined outside this   */
/* chunk; the formulas in the comments below assume                   */
/* FMADD(a,b,c,d): d = a*b + c, FMSUB: d = a*b - c and                */
/* FNMSUB: d = c - a*b - confirm against the macro definitions.       */
1640.LL28:
/* c01 += c04, c02 += c03, c05 += c08, c06 += c07. */
1641	FADD	c01, c04, c01
1642	FADD	c02, c03, c02
1643	FADD	c05, c08, c05
1644	FADD	c06, c07, c06
1645
/* LN/RT: reposition AO and BO from AORIG/B using KK - 1 (LN) or      */
/* KK - 2 (RT): AO = AORIG + (t << ZBASE_SHIFT),                      */
/*              BO = B     + (t << (ZBASE_SHIFT + 1)).                */
1646#if defined(LN) || defined(RT)
1647#ifdef LN
1648	sub	KK, 1, TEMP1
1649#else
1650	sub	KK, 2, TEMP1
1651#endif
1652	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1653	sll	TEMP1, ZBASE_SHIFT + 1, TEMP1
1654
1655	add	AORIG, TEMP2, AO
1656	add	B,     TEMP1, BO
1657#endif
1658
/* Load the four values the sums are subtracted from: from BO for     */
/* LN/LT, from AO for RN/RT.                                          */
1659#if defined(LN) || defined(LT)
1660	LDF	[BO +  0 * SIZE], a1
1661	LDF	[BO +  1 * SIZE], a2
1662	LDF	[BO +  2 * SIZE], a3
1663	LDF	[BO +  3 * SIZE], a4
1664#else
1665	LDF	[AO +  0 * SIZE], a1
1666	LDF	[AO +  1 * SIZE], a2
1667	LDF	[AO +  2 * SIZE], a3
1668	LDF	[AO +  3 * SIZE], a4
1669#endif
1670
/* c01 = a1 - c01, c02 = a2 - c02, c05 = a3 - c05, c06 = a4 - c06. */
1671	FSUB	a1, c01, c01
1672	FSUB	a2, c02, c02
1673	FSUB	a3, c05, c05
1674	FSUB	a4, c06, c06
1675
/* LN/LT: scale (c01, c02) and (c05, c06) by the pair (a1, a2) read   */
/* from AO[0..1].  Presumably a complex multiply (by the conjugate    */
/* when CONJ is defined) - see the NOTE at the top of this block.     */
1676#if defined(LN) || defined(LT)
1677	LDF	[AO +  0 * SIZE], a1
1678	LDF	[AO +  1 * SIZE], a2
1679
1680	FMUL	a1, c01, b1
1681	FMUL	a2, c01, b2
1682	FMUL	a1, c05, b3
1683	FMUL	a2, c05, b4
1684
1685#ifndef CONJ
1686	FNMSUB	(aa2, cc02, bb1, cc01)
1687	FMADD	(aa1, cc02, bb2, cc02)
1688	FNMSUB	(aa2, cc06, bb3, cc05)
1689	FMADD	(aa1, cc06, bb4, cc06)
1690#else
1691	FMADD	(aa2, cc02, bb1, cc01)
1692	FMSUB	(aa1, cc02, bb2, cc02)
1693	FMADD	(aa2, cc06, bb3, cc05)
1694	FMSUB	(aa1, cc06, bb4, cc06)
1695#endif
1696#endif
1697
/* RN: walk the 2x2 block at BO forwards.  BO[0..1] scales            */
/* (c01, c02), BO[2..3] updates (c05, c06) from (c01, c02), and       */
/* BO[6..7] then scales (c05, c06).                                   */
1698#ifdef RN
1699	LDF	[BO +  0 * SIZE], b1
1700	LDF	[BO +  1 * SIZE], b2
1701	LDF	[BO +  2 * SIZE], b3
1702	LDF	[BO +  3 * SIZE], b4
1703
1704	FMUL	b1, c01, a1
1705	FMUL	b2, c01, a2
1706
1707#ifndef CONJ
1708	FNMSUB	(bb2, cc02, aa1, cc01)
1709	FMADD	(bb1, cc02, aa2, cc02)
1710#else
1711	FMADD	(bb2, cc02, aa1, cc01)
1712	FMSUB	(bb1, cc02, aa2, cc02)
1713#endif
1714
1715	FNMSUB	(bb3, cc01, cc05, cc05)
1716	FNMSUB	(bb3, cc02, cc06, cc06)
1717
1718#ifndef CONJ
1719	FMADD	(bb4, cc02, cc05, cc05)
1720	FNMSUB	(bb4, cc01, cc06, cc06)
1721#else
1722	FNMSUB	(bb4, cc02, cc05, cc05)
1723	FMADD	(bb4, cc01, cc06, cc06)
1724#endif
1725
1726	LDF	[BO +  6 * SIZE], b1
1727	LDF	[BO +  7 * SIZE], b2
1728
1729	FMUL	b1, c05, a1
1730	FMUL	b2, c05, a2
1731
1732#ifndef CONJ
1733	FNMSUB	(bb2, cc06, aa1, cc05)
1734	FMADD	(bb1, cc06, aa2, cc06)
1735#else
1736	FMADD	(bb2, cc06, aa1, cc05)
1737	FMSUB	(bb1, cc06, aa2, cc06)
1738#endif
1739#endif
1740
/* RT: walk the same 2x2 block backwards.  BO[6..7] scales            */
/* (c05, c06), BO[4..5] updates (c01, c02) from (c05, c06), and       */
/* BO[0..1] then scales (c01, c02).                                   */
1741#ifdef RT
1742	LDF	[BO +  6 * SIZE], b1
1743	LDF	[BO +  7 * SIZE], b2
1744	LDF	[BO +  4 * SIZE], b3
1745	LDF	[BO +  5 * SIZE], b4
1746
1747	FMUL	b1, c05, a1
1748	FMUL	b2, c05, a2
1749
1750#ifndef CONJ
1751	FNMSUB	(bb2, cc06, aa1, cc05)
1752	FMADD	(bb1, cc06, aa2, cc06)
1753#else
1754	FMADD	(bb2, cc06, aa1, cc05)
1755	FMSUB	(bb1, cc06, aa2, cc06)
1756#endif
1757
1758	FNMSUB	(bb3, cc05, cc01, cc01)
1759	FNMSUB	(bb3, cc06, cc02, cc02)
1760
1761#ifndef CONJ
1762	FMADD	(bb4, cc06, cc01, cc01)
1763	FNMSUB	(bb4, cc05, cc02, cc02)
1764#else
1765	FNMSUB	(bb4, cc06, cc01, cc01)
1766	FMADD	(bb4, cc05, cc02, cc02)
1767#endif
1768
1769	LDF	[BO +  0 * SIZE], b1
1770	LDF	[BO +  1 * SIZE], b2
1771
1772	FMUL	b1, c01, a1
1773	FMUL	b2, c01, a2
1774
1775#ifndef CONJ
1776	FNMSUB	(bb2, cc02, aa1, cc01)
1777	FMADD	(bb1, cc02, aa2, cc02)
1778#else
1779	FMADD	(bb2, cc02, aa1, cc01)
1780	FMSUB	(bb1, cc02, aa2, cc02)
1781#endif
1782#endif
1783
/* LN moves the C pointers back by 2 * SIZE before the stores. */
1784#ifdef LN
1785	add	C1, -2 * SIZE, C1
1786	add	C2, -2 * SIZE, C2
1787#endif
1788
/* Write the four results back to the buffer they were loaded from    */
/* above: BO for LN/LT, AO for RN/RT.                                 */
1789#if defined(LN) || defined(LT)
1790	STF	c01, [BO +  0 * SIZE]
1791	STF	c02, [BO +  1 * SIZE]
1792	STF	c05, [BO +  2 * SIZE]
1793	STF	c06, [BO +  3 * SIZE]
1794#else
1795	STF	c01, [AO +  0 * SIZE]
1796	STF	c02, [AO +  1 * SIZE]
1797	STF	c05, [AO +  2 * SIZE]
1798	STF	c06, [AO +  3 * SIZE]
1799#endif
1800
/* Store the same results to C: (c01, c02) via C1, (c05, c06) via C2. */
1801	STF	c01, [C1 + 0 * SIZE]
1802	STF	c02, [C1 + 1 * SIZE]
1803	STF	c05, [C2 + 0 * SIZE]
1804	STF	c06, [C2 + 1 * SIZE]
1805
/* All modes except LN move the C pointers forward after the stores. */
1806#ifndef LN
1807	add	C1, 2 * SIZE, C1
1808	add	C2, 2 * SIZE, C2
1809#endif
1810
/* RT: AORIG = AORIG + (K << ZBASE_SHIFT). */
1811#ifdef RT
1812	sll	K, ZBASE_SHIFT, TEMP1
1813	add	AORIG, TEMP1, AORIG
1814#endif
1815
/* LT/RN: with t = K - KK, AO += t << ZBASE_SHIFT and                 */
/* BO += t << (ZBASE_SHIFT + 1).                                      */
1816#if defined(LT) || defined(RN)
1817	sub	K, KK, TEMP1
1818	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
1819	sll	TEMP1, ZBASE_SHIFT + 1, TEMP1
1820	add	AO, TEMP2, AO
1821	add	BO, TEMP1, BO
1822#endif
1823
/* Per-row KK update: +1 for LT, -1 for LN. */
1824#ifdef LT
1825	add	KK, 1, KK
1826#endif
1827
1828#ifdef LN
1829	sub	KK, 1, KK
1830#endif
1831
/* Row loop control: I = I - 1, repeat from .LL22 while I > 0. */
1832	add	I, -1, I
1833	cmp	I, 0
1834	bg,pt	%icc, .LL22
1835	nop
1836
/* Row loop finished: update B and KK before falling into .LL30. */
/* LN: B = B + (K << (ZBASE_SHIFT + 1)). */
1837#ifdef LN
1838	sll	K, ZBASE_SHIFT + 1, TEMP1
1839	add	B, TEMP1, B
1840#endif
1841
/* LT/RN: B continues from where BO ended. */
1842#if defined(LT) || defined(RN)
1843	mov	BO, B
1844#endif
1845
/* Per-column-pair KK update: +2 for RN, -2 for RT. */
1846#ifdef RN
1847	add	KK, 2, KK
1848#endif
1849
1850#ifdef RT
1851	sub	KK, 2, KK
1852#endif
1853	.align 4
1854
/* .LL30: single-column remainder of N.                               */
/* Tests (N & 1); when the result is zero control jumps to the exit   */
/* at .LL999.  Otherwise this sets up C1, KK and the A pointer,       */
/* copies M into the row counter I and falls through into .LL32.      */
1855.LL30:
1856	and	N, 1, J
1857	cmp	J, 0
1858	ble,pn	%icc, .LL999
/* Branch delay slot. */
1859	nop
1860
/* RT: B = B - (K << ZBASE_SHIFT). */
1861#ifdef RT
1862	sll	K, ZBASE_SHIFT, TEMP1
1863	sub	B, TEMP1, B
1864#endif
1865
/* Not RT: C1 = C, then C = C + LDC.                                  */
/* RT:     C1 = C - LDC, then C = C - LDC.                            */
1866#ifndef RT
1867	mov	C,  C1
1868	add	C,  LDC, C
1869#else
1870	sub	C,  LDC, C1
1871	sub	C,  LDC, C
1872#endif
1873
/* LN: KK = M + OFFSET. */
1874#ifdef LN
1875	add	M, OFFSET, KK
1876#endif
1877
/* LT: KK = OFFSET. */
1878#ifdef LT
1879	mov	OFFSET, KK
1880#endif
1881
/* LN/RT keep the start of A in AORIG; LT/RN load it straight into AO. */
1882#if defined(LN) || defined(RT)
1883	mov	A, AORIG
1884#else
1885	mov	A, AO
1886#endif
1887
/* I = M: row counter consumed by the loop that ends in .LL38. */
1888	mov	M, I
1889	.align 4
1890
/* .LL32: head of the row loop for the single-column case; re-entered */
/* from the bottom of .LL38 while I > 0.  Positions AO/BO, preloads   */
/* a1-a4 and b1-b8, clears accumulators cc01-cc08 and derives the     */
/* trip count L of the unrolled loop .LL33.                           */
1891.LL32:
/* LT/RN: BO restarts at B. */
1892#if defined(LT) || defined(RN)
1893	mov	B, BO
1894#else
/* LN only: AORIG = AORIG - (K << ZBASE_SHIFT). */
1895#ifdef LN
1896	sll	K,  ZBASE_SHIFT, TEMP1
1897	sub	AORIG, TEMP1, AORIG
1898#endif
1899
/* LN/RT: AO and BO are both offset by KK << ZBASE_SHIFT. */
1900	sll	KK, ZBASE_SHIFT + 0, TEMP1
1901
1902	add	AORIG, TEMP1, AO
1903	add	B,     TEMP1, BO
1904#endif
1905
/* Preload AO[0..3] into a1-a4 and BO[0..7] into b1-b8, with the      */
/* accumulator clears interleaved between the loads.                  */
1906	LDF	[AO +  0 * SIZE], a1
1907	LDF	[AO +  1 * SIZE], a2
1908	LDF	[AO +  2 * SIZE], a3
1909	LDF	[AO +  3 * SIZE], a4
1910
1911	LDF	[BO +  0 * SIZE], b1
1912	LDF	[BO +  1 * SIZE], b2
1913	LDF	[BO +  2 * SIZE], b3
1914	FCLR	(cc01)
1915	LDF	[BO +  3 * SIZE], b4
1916	FCLR	(cc02)
1917
1918	LDF	[BO +  4 * SIZE], b5
1919	FCLR	(cc03)
1920	LDF	[BO +  5 * SIZE], b6
1921	FCLR	(cc04)
1922	LDF	[BO +  6 * SIZE], b7
1923	FCLR	(cc05)
1924	LDF	[BO +  7 * SIZE], b8
1925	FCLR	(cc06)
1926
/* Prefetch the C location that .LL38 stores to. */
1927	prefetch [C1 + 2 * SIZE], 3
1928	FCLR	(cc07)
1929
/* L = KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
1930#if defined(LT) || defined(RN)
1931	sra	KK, 2, L
1932#else
1933	sub	K, KK, L
1934	sra	L,  2, L
1935#endif
1936	cmp	L,  0
1937	ble,pn	%icc, .LL35
/* Delay slot of a non-annulled branch: cc08 is cleared on both paths. */
1938	FCLR	(cc08)
1939	.align 4
1940
/* .LL33: unrolled accumulation loop for the single-column case, run  */
/* L times.  One pass issues four groups of four multiply-adds into   */
/* cc01-cc04, reads AO[0..7] and BO[0..7], and advances both AO and   */
/* BO by 8 * SIZE.  The groups alternate between the register pairs   */
/* (a1, a2) and (a3, a4); each pair is reloaded while the other is in */
/* use.                                                               */
1941.LL33:
/* Group 1: (a1, a2) with b1, b2. */
1942	FMADD1	(aa1, bb1, cc01, cc01)
1943	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
1944	FMADD2	(aa2, bb1, cc02, cc02)
1945	LDF	[BO +  8 * SIZE], b1
1946
1947	FMADD3	(aa1, bb2, cc03, cc03)
1948	LDF	[AO +  4 * SIZE], a1
1949	FMADD4	(aa2, bb2, cc04, cc04)
1950	LDF	[AO +  5 * SIZE], a2
1951
/* Group 2: (a3, a4) with b3, b4. */
1952	FMADD1	(aa3, bb3, cc01, cc01)
1953	LDF	[BO +  9 * SIZE], b2
1954	FMADD2	(aa4, bb3, cc02, cc02)
1955	LDF	[BO + 10 * SIZE], b3
1956
1957	FMADD3	(aa3, bb4, cc03, cc03)
1958	LDF	[AO +  6 * SIZE], a3
1959	FMADD4	(aa4, bb4, cc04, cc04)
1960	LDF	[AO +  7 * SIZE], a4
1961
/* Group 3: (a1, a2) with b5, b6. */
1962	FMADD1	(aa1, bb5, cc01, cc01)
1963	LDF	[BO + 11 * SIZE], b4
1964	FMADD2	(aa2, bb5, cc02, cc02)
1965	LDF	[BO + 12 * SIZE], b5
1966
1967	FMADD3	(aa1, bb6, cc03, cc03)
1968	LDF	[AO +  8 * SIZE], a1
1969	FMADD4	(aa2, bb6, cc04, cc04)
1970	LDF	[AO +  9 * SIZE], a2
1971
/* Group 4: (a3, a4) with b7, b8. */
1972	FMADD1	(aa3, bb7, cc01, cc01)
1973	LDF	[BO + 13 * SIZE], b6
1974
1975	FMADD2	(aa4, bb7, cc02, cc02)
1976	LDF	[BO + 14 * SIZE], b7
1977
1978	FMADD3	(aa3, bb8, cc03, cc03)
1979	LDF	[AO + 10 * SIZE], a3
1980	FMADD4	(aa4, bb8, cc04, cc04)
1981	LDF	[AO + 11 * SIZE], a4
1982
/* Advance both pointers, decrement L and test it for the branch. */
1983	add	AO,  8 * SIZE, AO
1984	add	L, -1, L
1985	add	BO,  8 * SIZE, BO
1986	cmp	L, 0
1987
1988	bg,pt	%icc, .LL33
/* Delay slot: reload b8 relative to the already advanced BO. */
1989	LDF	[BO +  7 * SIZE], b8
1990	.align 4
1991
/* .LL35: trip count for the steps left over from the 4x unrolled     */
/* loop .LL33.  L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when    */
/* L <= 0 the leftover loop .LL37 is bypassed by jumping to .LL38.    */
1992.LL35:
1993#if defined(LT) || defined(RN)
1994	and	KK, 3, L
1995#else
1996	sub	K, KK, L
1997	and	L,  3, L
1998#endif
1999	cmp	L,  0
2000	ble,a,pn %icc, .LL38
/* Delay slot of an annulled branch; it holds only a nop. */
2001	nop
2002	.align 4
2003
/* .LL37: leftover accumulation loop for the single-column case, run  */
/* L times.  One pass issues four multiply-adds using (a1, a2) and    */
/* (b1, b2), advances AO and BO by 2 * SIZE each, and reloads the     */
/* same four registers for the next pass.                             */
2004.LL37:
2005	FMADD1	(aa1, bb1, cc01, cc01)
2006	add	L, -1, L
2007	FMADD2	(aa2, bb1, cc02, cc02)
2008	LDF	[BO + 2 * SIZE], b1
2009
2010	FMADD3	(aa1, bb2, cc03, cc03)
2011	LDF	[AO + 2 * SIZE], a1
2012	FMADD4	(aa2, bb2, cc04, cc04)
2013	LDF	[AO + 3 * SIZE], a2
2014
2015	add	AO, 2 * SIZE, AO
2016	cmp	L, 0
2017	add	BO, 2 * SIZE, BO
2018	bg,pt	%icc, .LL37
/* Delay slot: reload b2 relative to the already advanced BO. */
2019	LDF	[BO + 1 * SIZE], b2
2020	.align 4
2021
/* .LL38: finish one row of the single-column case.  Folds the        */
/* partial accumulators, subtracts them from the pair read from the   */
/* packed buffer, scales the result, stores it to the packed buffer   */
/* and to C1, then updates pointers and counters and loops back to    */
/* .LL32 while I > 0.                                                 */
/* NOTE(review): FMADD/FMSUB/FNMSUB are macros defined outside this   */
/* chunk; presumably the scaling below is a complex multiply (by the  */
/* conjugate when CONJ is defined) - confirm against the macros.      */
2022.LL38:
/* c01 += c04, c02 += c03. */
2023	FADD	c01, c04, c01
2024	FADD	c02, c03, c02
2025
/* LN/RT: AO = AORIG + t and BO = B + t,                              */
/* where t = (KK - 1) << ZBASE_SHIFT.                                 */
2026#if defined(LN) || defined(RT)
2027	sub	KK, 1, TEMP1
2028
2029	sll	TEMP1, ZBASE_SHIFT, TEMP1
2030
2031	add	AORIG, TEMP1, AO
2032	add	B,     TEMP1, BO
2033#endif
2034
/* Load the pair the sums are subtracted from: from BO for LN/LT,     */
/* from AO for RN/RT.                                                 */
2035#if defined(LN) || defined(LT)
2036	LDF	[BO +  0 * SIZE], a1
2037	LDF	[BO +  1 * SIZE], a2
2038#else
2039	LDF	[AO +  0 * SIZE], a1
2040	LDF	[AO +  1 * SIZE], a2
2041#endif
2042
/* c01 = a1 - c01, c02 = a2 - c02. */
2043	FSUB	a1, c01, c01
2044	FSUB	a2, c02, c02
2045
/* Load the scaling pair from the other buffer: AO for LN/LT, BO for  */
/* RN/RT.                                                             */
2046#if defined(LN) || defined(LT)
2047	LDF	[AO +  0 * SIZE], a1
2048	LDF	[AO +  1 * SIZE], a2
2049#else
2050	LDF	[BO +  0 * SIZE], a1
2051	LDF	[BO +  1 * SIZE], a2
2052#endif
2053
/* Scale (c01, c02) by (a1, a2); b1 and b2 hold the intermediate      */
/* products with c01.                                                 */
2054	FMUL	a1, c01, b1
2055	FMUL	a2, c01, b2
2056
2057#ifndef CONJ
2058	FNMSUB	(aa2, cc02, bb1, cc01)
2059	FMADD	(aa1, cc02, bb2, cc02)
2060#else
2061	FMADD	(aa2, cc02, bb1, cc01)
2062	FMSUB	(aa1, cc02, bb2, cc02)
2063#endif
2064
/* LN moves C1 back by 2 * SIZE before the stores. */
2065#ifdef LN
2066	add	C1, -2 * SIZE, C1
2067#endif
2068
/* Write the result back to the buffer the subtracted-from pair was   */
/* loaded from: BO for LN/LT, AO for RN/RT.                           */
2069#if defined(LN) || defined(LT)
2070	STF	c01, [BO +  0 * SIZE]
2071	STF	c02, [BO +  1 * SIZE]
2072#else
2073	STF	c01, [AO +  0 * SIZE]
2074	STF	c02, [AO +  1 * SIZE]
2075#endif
2076
/* Store the same result to C through C1. */
2077	STF	c01, [C1 + 0 * SIZE]
2078	STF	c02, [C1 + 1 * SIZE]
2079
/* All modes except LN move C1 forward after the stores. */
2080#ifndef LN
2081	add	C1, 2 * SIZE, C1
2082#endif
2083
/* RT: AORIG = AORIG + (K << ZBASE_SHIFT). */
2084#ifdef RT
2085	sll	K, ZBASE_SHIFT, TEMP1
2086	add	AORIG, TEMP1, AORIG
2087#endif
2088
/* LT/RN: advance AO and BO by (K - KK) << ZBASE_SHIFT. */
2089#if defined(LT) || defined(RN)
2090	sub	K, KK, TEMP1
2091	sll	TEMP1, ZBASE_SHIFT, TEMP1
2092	add	AO, TEMP1, AO
2093	add	BO, TEMP1, BO
2094#endif
2095
/* Per-row KK update: +1 for LT, -1 for LN. */
2096#ifdef LT
2097	add	KK, 1, KK
2098#endif
2099
2100#ifdef LN
2101	sub	KK, 1, KK
2102#endif
2103
/* Row loop control: I = I - 1, repeat from .LL32 while I > 0. */
2104	add	I, -1, I
2105	cmp	I, 0
2106	bg,pt	%icc, .LL32
2107	nop
2108
/* Row loop finished: update B and KK before falling into .LL999. */
/* LN: B = B + (K << ZBASE_SHIFT). */
2109#ifdef LN
2110	sll	K, ZBASE_SHIFT, TEMP1
2111	add	B, TEMP1, B
2112#endif
2113
/* LT/RN: B continues from where BO ended. */
2114#if defined(LT) || defined(RN)
2115	mov	BO, B
2116#endif
2117
/* Per-column KK update: +1 for RN, -1 for RT. */
2118#ifdef RN
2119	add	KK, 1, KK
2120#endif
2121
2122#ifdef RT
2123	sub	KK, 1, KK
2124#endif
2125	.align 4
2126
/* .LL999: common exit.  The SPARC V9 `return` instruction restores   */
/* the caller's register window and jumps to %i7 + 8; the instruction */
/* in its delay slot runs in the restored window, so `clr %o0` zeroes */
/* the caller's %o0, i.e. the routine returns 0.                      */
/* EPILOGUE is a macro defined outside this chunk.                    */
2127.LL999:
2128	return	%i7 + 8
2129	clr	%o0
2130
2131	EPILOGUE
2132