1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Integer register aliases used throughout this kernel.             */
/* M, N, K, A, B and C name the SPARC "in" registers %i0-%i5.  The   */
/* entry code reloads B and C (and, in the 32-bit DOUBLE build, A)   */
/* from the stack.                                                   */
/* NOTE(review): presumably M/N/K are the problem dimensions and     */
/* A/B/C the operand pointers - confirm against the caller.          */
42#define M	%i0
43#define N	%i1
44#define K	%i2
45#define A	%i5
46#define B	%i3
47#define C	%i4
48
/* "out" registers used as working state: LDC and OFFSET are loaded  */
/* from the stack at entry (LDC is then shifted by ZBASE_SHIFT),     */
/* AO and BO are the running addresses used by the LDF/STF           */
/* instructions, and I, J and L are loop counters.                   */
49#define LDC	%o0
50#define AO	%o1
51#define BO	%o2
52#define I	%o3
53#define J	%o4
54#define L	%o5
55
/* C1 is set to C and C2 to C + LDC for each pair of columns.        */
56#define C1	%l0
57#define C2	%l1
58
/* KK is derived from OFFSET (and M or N) per build variant; TEMP1   */
/* and TEMP2 are scratch; AORIG holds a copy of A in LN/RT builds.   */
59#define OFFSET	%l2
60#define	KK	%l3
61#define TEMP1	%l4
62#define TEMP2	%l5
63#define AORIG	%l6
64
/* Floating-point register aliases.                                  */
/*   c01-c16 : accumulators                                          */
/*   t1-t4   : product temporaries                                   */
/*   a1-a5   : operands loaded through AO                            */
/*   b1-b5   : operands loaded through BO                            */
/*   FZERO   : register cleared once at entry with FCLR and used as  */
/*             the source for zeroing accumulators and temporaries   */
/* The DOUBLE build uses even-numbered registers %f0-%f62.           */
65#ifdef DOUBLE
66#define c01	%f0
67#define c02	%f2
68#define c03	%f4
69#define c04	%f6
70#define c05	%f8
71#define c06	%f10
72#define c07	%f12
73#define c08	%f14
74#define c09	%f16
75#define c10	%f18
76#define c11	%f20
77#define c12	%f22
78#define c13	%f24
79#define c14	%f26
80#define c15	%f28
81#define c16	%f30
82
83#define t1	%f32
84#define	t2 	%f34
85#define t3	%f36
86#define	t4 	%f38
87
88#define a1	%f40
89#define a2	%f42
90#define a3	%f44
91#define a4	%f46
92#define a5	%f62
93
94#define b1	%f48
95#define b2	%f50
96#define b3	%f52
97#define b4	%f54
98#define b5	%f56
99
/* NOTE(review): the entry code clears this register with FCLR(27);  */
/* presumably FCLR takes the 5-bit encoded register number, under    */
/* which %f58 is 27 - confirm against the FCLR definition.           */
100#define FZERO	%f58
101
102#else
/* Single-precision build: consecutive registers %f0-%f31.           */
103#define c01	%f0
104#define c02	%f1
105#define c03	%f2
106#define c04	%f3
107#define c05	%f4
108#define c06	%f5
109#define c07	%f6
110#define c08	%f7
111#define c09	%f8
112#define c10	%f9
113#define c11	%f10
114#define c12	%f11
115#define c13	%f12
116#define c14	%f13
117#define c15	%f14
118#define c16	%f15
119
120#define t1	%f16
121#define	t2 	%f17
122#define t3	%f18
123#define	t4 	%f19
124
125#define a1	%f20
126#define a2	%f21
127#define a3	%f22
128#define a4	%f23
129#define a5	%f31
130
131#define b1	%f24
132#define b2	%f25
133#define b3	%f26
134#define b4	%f27
135#define b5	%f28
136
137#define FZERO	%f29
138#endif
139
/* t5-t8 are extra temporaries that share registers with the         */
/* accumulators c13-c16, so the two sets must not be live together.  */
140#define	t5	c13
141#define	t6	c14
142#define	t7	c15
143#define	t8	c16
144
/* FADD1-FADD4 select add or subtract for the four partial products  */
/* accumulated in the kernels.                                       */
/*   no CONJ           : FADD4 subtracts, the others add             */
/*   CONJ with LN / LT : FADD2 subtracts, the others add             */
/*   CONJ with RN / RT : FADD3 subtracts, the others add             */
/* With CONJ defined and none of LN, LT, RN, RT defined, FADD1-FADD4 */
/* are left undefined by this block.                                 */
145#ifndef CONJ
146#define FADD1	FADD
147#define FADD2	FADD
148#define FADD3	FADD
149#define FADD4	FSUB
150#else
151
152#if defined(LN) || defined(LT)
153#define FADD1	FADD
154#define FADD2	FSUB
155#define FADD3	FADD
156#define FADD4	FADD
157#endif
158
159#if defined(RN) || defined(RT)
160#define FADD1	FADD
161#define FADD2	FADD
162#define FADD3	FSUB
163#define FADD4	FADD
164#endif
165#endif
166
/* Prefetch distance, in elements (it is multiplied by SIZE where    */
/* used), ahead of AO and BO in the unrolled loops.                  */
167#define APREFETCHSIZE 40
168#define BPREFETCHSIZE 40
169
/* Second operand given to the prefetch instruction for AO and BO.   */
170#define APREFETCH_CATEGORY 0
171#define BPREFETCH_CATEGORY 0
172
/* Routine entry.  PROLOGUE and SAVESP are macros defined outside    */
/* this file (presumably in common.h - confirm what they expand to). */
173	PROLOGUE
174	SAVESP
175
/* Load the arguments that are read from the stack.  The offsets and */
/* the set of arguments differ per build: the 32-bit DOUBLE build    */
/* also loads A, the others load only B, C, LDC and OFFSET.          */
176#ifndef __64BIT__
177#ifdef DOUBLE
178	ld	[%sp + STACK_START + 32], A
179	ld	[%sp + STACK_START + 36], B
180	ld	[%sp + STACK_START + 40], C
181	ld	[%sp + STACK_START + 44], LDC
182	ld	[%sp + STACK_START + 48], OFFSET
183#else
184	ld	[%sp + STACK_START + 28], B
185	ld	[%sp + STACK_START + 32], C
186	ld	[%sp + STACK_START + 36], LDC
187	ld	[%sp + STACK_START + 40], OFFSET
188#endif
189#else
190	ldx	[%sp+  STACK_START + 56], B
191	ldx	[%sp+  STACK_START + 64], C
192	ldx	[%sp+  STACK_START + 72], LDC
193	ldx	[%sp+  STACK_START + 80], OFFSET
194#endif
195
/* Clear the register aliased as FZERO (%f58 in the DOUBLE build,    */
/* %f29 otherwise).                                                  */
/* NOTE(review): 27 is presumably the encoded number of %f58 -       */
/* confirm against the FCLR definition.                              */
196#ifdef DOUBLE
197       FCLR(27)
198#else
199       FCLR(29)
200#endif
201
/* LDC <<= ZBASE_SHIFT (presumably elements to bytes - confirm).     */
202	sll	LDC, ZBASE_SHIFT, LDC
203
/* LN: advance A by (M * K) << ZBASE_SHIFT and C by M << ZBASE_SHIFT;*/
/* the LN paths later step AORIG, C1 and C2 backwards from there.    */
204#ifdef LN
205	smul	M, K, TEMP1
206	sll	TEMP1, ZBASE_SHIFT, TEMP1
207	add	A, TEMP1, A
208
209	sll	M, ZBASE_SHIFT, TEMP1
210	add	C, TEMP1, C
211#endif
212
/* RN: KK = -OFFSET.                                                 */
213#ifdef RN
214	neg	OFFSET, KK
215#endif
216
/* RT: advance B by (N * K) << ZBASE_SHIFT and C by N * LDC (LDC is  */
/* already shifted), then KK = N - OFFSET.  The column loops step B  */
/* and C backwards from there.                                       */
217#ifdef RT
218	smul	N, K, TEMP1
219	sll	TEMP1, ZBASE_SHIFT, TEMP1
220	add	B, TEMP1, B
221
222	smul	N, LDC, TEMP1
223	add	C, TEMP1, C
224
225	sub	N, OFFSET, KK
226#endif
227
/* J = N >> 1 is the number of column pairs; when it is not positive */
/* go straight to .LL100.  The nop fills the branch delay slot.      */
228	sra	N, 1, J
229	cmp	J, 0
230	ble,pn	%icc, .LL100
231	nop
232
/* .LL11: top of the loop over column pairs (counter J).             */
233.LL11:
/* RT: step B back by K << (1 + ZBASE_SHIFT) and C back by 2 * LDC.  */
234#ifdef RT
235	sll	K, 1 + ZBASE_SHIFT, TEMP1
236	sub	B, TEMP1, B
237
238	add	LDC, LDC, TEMP1
239	sub	C, TEMP1, C
240#endif
241
/* C1 and C2 address the two columns handled in this iteration.      */
242	mov	C, C1
243	add	C, LDC, C2
244
/* Reset KK for this pair: M + OFFSET for LN, OFFSET for LT.         */
245#ifdef LN
246	add	M, OFFSET, KK
247#endif
248
249#ifdef LT
250	mov	OFFSET, KK
251#endif
252
/* LN / RT keep the base of A in AORIG and derive AO from it per     */
/* block; the other variants walk AO directly starting at A.         */
253#if defined(LN) || defined(RT)
254	mov	A, AORIG
255#else
256	mov	A, AO
257#endif
258
/* Every variant except RT moves C forward past this column pair.    */
259#ifndef RT
260	add	C2, LDC, C
261#endif
262
/* I = M & 1: when it is zero skip the single-row block and go to    */
/* .LL50.  The nop fills the branch delay slot.                      */
263	and	M, 1, I
264	cmp	I, 0
265	ble,pn	%icc, .LL50
266	nop
267
268#if defined(LT) || defined(RN)
269	sra	KK, 2, L
270
271	mov	B, BO
272	cmp	L,  0
273#else
274
275#ifdef LN
276	sll	K,  0 + ZBASE_SHIFT, TEMP1
277	sub	AORIG, TEMP1, AORIG
278#endif
279
280	sll	KK, 0 + ZBASE_SHIFT, TEMP1
281	sll	KK, 1 + ZBASE_SHIFT, TEMP2
282
283	add	AORIG, TEMP1, AO
284	add	B,     TEMP2, BO
285
286	sub	K, KK, TEMP1
287
288	sra	TEMP1, 2, L
289	cmp	L,  0
290#endif
291
292	FMOV	FZERO, c02
293	FMOV	FZERO, t1
294	FMOV	FZERO, c04
295
296	LDF	[AO + 0 * SIZE], a1
297	FMOV	FZERO, t2
298	LDF	[BO + 0 * SIZE], b1
299	FMOV	FZERO, c06
300	LDF	[AO + 1 * SIZE], a2
301	FMOV	FZERO, t3
302	LDF	[BO + 1 * SIZE], b2
303	FMOV	FZERO, c08
304	LDF	[AO + 2 * SIZE], a3
305	FMOV	FZERO, t4
306	LDF	[BO + 2 * SIZE], b3
307	FMOV	FZERO, c01
308	LDF	[AO + 3 * SIZE], a4
309	FMOV	FZERO, c03
310	LDF	[BO + 3 * SIZE], b4
311	FMOV	FZERO, c05
312
313	ble,pn	%icc, .LL55
314	FMOV	FZERO, c07
315
316.LL52:
317	FADD2	c02, t1, c02
318	add	AO,  8 * SIZE, AO
319	prefetch [AO + APREFETCHSIZE * SIZE], 0
320
321	FMUL	a1, b1, t1
322	add	BO, 16 * SIZE, BO
323
324	FADD4	c04, t2, c04
325	add	L, -1, L
326	FMUL	a1, b2, t2
327
328	FADD2	c06, t3, c06
329	cmp	L, 0
330	FMUL	a1, b3, t3
331
332	FADD4	c08, t4, c08
333	FMUL	a1, b4, t4
334	LDF	[AO -  4 * SIZE], a1
335
336	FADD1	c01, t1, c01
337	FMUL	a2, b1, t1
338	LDF	[BO - 12 * SIZE], b1
339	FADD3	c03, t2, c03
340	FMUL	a2, b2, t2
341	LDF	[BO - 11 * SIZE], b2
342
343	FADD1	c05, t3, c05
344	FMUL	a2, b3, t3
345	LDF	[BO - 10 * SIZE], b3
346	FADD3	c07, t4, c07
347	FMUL	a2, b4, t4
348	LDF	[BO -  9 * SIZE], b4
349
350	FADD2	c02, t1, c02
351	FMUL	a3, b1, t1
352	LDF	[AO -  3 * SIZE], a2
353	FADD4	c04, t2, c04
354	FMUL	a3, b2, t2
355
356	FADD2	c06, t3, c06
357	FMUL	a3, b3, t3
358	FADD4	c08, t4, c08
359	FMUL	a3, b4, t4
360	LDF	[AO -  2 * SIZE], a3
361
362	FADD1	c01, t1, c01
363	FMUL	a4, b1, t1
364	LDF	[BO -  8 * SIZE], b1
365	FADD3	c03, t2, c03
366	FMUL	a4, b2, t2
367	LDF	[BO -  7 * SIZE], b2
368
369	FADD1	c05, t3, c05
370	FMUL	a4, b3, t3
371	LDF	[BO -  6 * SIZE], b3
372	FADD3	c07, t4, c07
373	FMUL	a4, b4, t4
374	LDF	[BO -  5 * SIZE], b4
375
376	FADD2	c02, t1, c02
377	FMUL	a1, b1, t1
378	LDF	[AO -  1 * SIZE], a4
379	FADD4	c04, t2, c04
380	FMUL	a1, b2, t2
381
382	FADD2	c06, t3, c06
383	FMUL	a1, b3, t3
384	FADD4	c08, t4, c08
385	FMUL	a1, b4, t4
386	LDF	[AO +  0 * SIZE], a1
387
388	FADD1	c01, t1, c01
389	FMUL	a2, b1, t1
390	LDF	[BO -  4 * SIZE], b1
391
392	FADD3	c03, t2, c03
393	FMUL	a2, b2, t2
394	LDF	[BO -  3 * SIZE], b2
395
396	FADD1	c05, t3, c05
397	FMUL	a2, b3, t3
398	LDF	[BO -  2 * SIZE], b3
399	FADD3	c07, t4, c07
400	FMUL	a2, b4, t4
401	LDF	[BO -  1 * SIZE], b4
402
403	FADD2	c02, t1, c02
404	FMUL	a3, b1, t1
405	LDF	[AO +  1 * SIZE], a2
406	FADD4	c04, t2, c04
407	FMUL	a3, b2, t2
408
409	FADD2	c06, t3, c06
410	FMUL	a3, b3, t3
411	FADD4	c08, t4, c08
412	FMUL	a3, b4, t4
413	LDF	[AO +  2 * SIZE], a3
414
415	FADD1	c01, t1, c01
416	FMUL	a4, b1, t1
417	LDF	[BO +  0 * SIZE], b1
418	FADD3	c03, t2, c03
419	FMUL	a4, b2, t2
420	LDF	[BO +  1 * SIZE], b2
421
422	FADD1	c05, t3, c05
423	FMUL	a4, b3, t3
424	LDF	[BO +  2 * SIZE], b3
425	FADD3	c07, t4, c07
426	FMUL	a4, b4, t4
427	LDF	[BO +  3 * SIZE], b4
428
429	bg,pt	%icc, .LL52
430	LDF	[AO +  3 * SIZE], a4
431
432.LL55:
433#if defined(LT) || defined(RN)
434	and	KK,  3, L
435#else
436	and	TEMP1, 3, L
437#endif
438	cmp	L,  0
439	ble,a,pn %icc, .LL59
440	nop
441
442.LL56:
443	FADD2	c02, t1, c02
444	add	AO, 2 * SIZE, AO
445	FMUL	a1, b1, t1
446	add	L, -1, L
447
448	add	BO, 4 * SIZE, BO
449	FADD4	c04, t2, c04
450	cmp	L, 0
451	FMUL	a1, b2, t2
452
453	FADD2	c06, t3, c06
454	FMUL	a1, b3, t3
455	FADD4	c08, t4, c08
456	FMUL	a1, b4, t4
457	LDF	[AO + 0 * SIZE], a1
458
459	FADD1	c01, t1, c01
460	FMUL	a2, b1, t1
461	LDF	[BO + 0 * SIZE], b1
462	FADD3	c03, t2, c03
463	FMUL	a2, b2, t2
464	LDF	[BO + 1 * SIZE], b2
465
466	FADD1	c05, t3, c05
467	FMUL	a2, b3, t3
468	LDF	[BO + 2 * SIZE], b3
469	FADD3	c07, t4, c07
470	FMUL	a2, b4, t4
471	LDF	[BO + 3 * SIZE], b4
472
473	bg,pt	%icc, .LL56
474	LDF	[AO + 1 * SIZE], a2
475
476.LL59:
477#if defined(LN) || defined(RT)
478#ifdef LN
479	sub	KK, 1, TEMP1
480#else
481	sub	KK, 2, TEMP1
482#endif
483	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP2
484	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
485	add	AORIG, TEMP2, AO
486	add	B,     TEMP1, BO
487#endif
488
489	FADD2	c02, t1, c02
490	FADD4	c04, t2, c04
491	FADD2	c06, t3, c06
492	FADD4	c08, t4, c08
493
494	FADD	c01, c04, c01
495	FADD	c02, c03, c02
496	FADD	c05, c08, c05
497	FADD	c06, c07, c06
498
499#if defined(LN) || defined(LT)
500	LDF	[BO +  0 * SIZE], a1
501	LDF	[BO +  1 * SIZE], a2
502	LDF	[BO +  2 * SIZE], a3
503	LDF	[BO +  3 * SIZE], a4
504
505	FSUB	a1, c01, c01
506	FSUB	a2, c02, c02
507	FSUB	a3, c05, c05
508	FSUB	a4, c06, c06
509
510#else
511	LDF	[AO +  0 * SIZE], a1
512	LDF	[AO +  1 * SIZE], a2
513	LDF	[AO +  2 * SIZE], a3
514	LDF	[AO +  3 * SIZE], a4
515
516	FSUB	a1, c01, c01
517	FSUB	a2, c02, c02
518	FSUB	a3, c05, c05
519	FSUB	a4, c06, c06
520#endif
521
522#ifdef LN
523	LDF	[AO +  0 * SIZE], a1
524	LDF	[AO +  1 * SIZE], a2
525
526	FMUL	a1, c01, t1
527	FMUL	a2, c02, t2
528	FMUL	a1, c02, t3
529	FMUL	a2, c01, t4
530
531	FMUL	a1, c05, t5
532	FMUL	a2, c06, t6
533	FMUL	a1, c06, t7
534	FMUL	a2, c05, t8
535
536	FADD4	t1, t2, c01
537	FADD2	t3, t4, c02
538	FADD4	t5, t6, c05
539	FADD2	t7, t8, c06
540#endif
541
542#ifdef LT
543	LDF	[AO +  0 * SIZE], a1
544	LDF	[AO +  1 * SIZE], a2
545
546	FMUL	a1, c01, t1
547	FMUL	a2, c02, t2
548	FMUL	a1, c02, t3
549	FMUL	a2, c01, t4
550
551	FMUL	a1, c05, t5
552	FMUL	a2, c06, t6
553	FMUL	a1, c06, t7
554	FMUL	a2, c05, t8
555
556	FADD4	t1, t2, c01
557	FADD2	t3, t4, c02
558	FADD4	t5, t6, c05
559	FADD2	t7, t8, c06
560#endif
561
562#ifdef RN
563	LDF	[BO +  0 * SIZE], a1
564	LDF	[BO +  1 * SIZE], a2
565	LDF	[BO +  2 * SIZE], a3
566	LDF	[BO +  3 * SIZE], a4
567	LDF	[BO +  6 * SIZE], b1
568	LDF	[BO +  7 * SIZE], b2
569
570	FMUL	a1, c01, t1
571	FMUL	a2, c02, t2
572	FMUL	a1, c02, t3
573	FMUL	a2, c01, t4
574
575	FADD4	t1, t2, c01
576	FADD3	t3, t4, c02
577
578	FMUL	a3, c01, t1
579	FMUL	a3, c02, t2
580	FMUL	a4, c02, t3
581	FMUL	a4, c01, t4
582
583	FSUB	c05, t1, c05
584	FSUB	c06, t2, c06
585	FADD3	c05, t3, c05
586	FADD4	c06, t4, c06
587
588	FMUL	b1, c05, t1
589	FMUL	b2, c06, t2
590	FMUL	b1, c06, t3
591	FMUL	b2, c05, t4
592
593	FADD4	t1, t2, c05
594	FADD3	t3, t4, c06
595#endif
596
597#ifdef RT
598	LDF	[BO +  6 * SIZE], a1
599	LDF	[BO +  7 * SIZE], a2
600	LDF	[BO +  4 * SIZE], a3
601	LDF	[BO +  5 * SIZE], a4
602	LDF	[BO +  0 * SIZE], b1
603	LDF	[BO +  1 * SIZE], b2
604
605	FMUL	a1, c05, t1
606	FMUL	a2, c06, t2
607	FMUL	a1, c06, t3
608	FMUL	a2, c05, t4
609
610	FADD4	t1, t2, c05
611	FADD3	t3, t4, c06
612
613	FMUL	a3, c05, t1
614	FMUL	a3, c06, t2
615	FMUL	a4, c06, t3
616	FMUL	a4, c05, t4
617
618	FSUB	c01, t1, c01
619	FSUB	c02, t2, c02
620	FADD3	c01, t3, c01
621	FADD4	c02, t4, c02
622
623	FMUL	b1, c01, t1
624	FMUL	b2, c02, t2
625	FMUL	b1, c02, t3
626	FMUL	b2, c01, t4
627
628	FADD4	t1, t2, c01
629	FADD3	t3, t4, c02
630#endif
631
632#ifdef LN
633	add	C1, -2 * SIZE, C1
634	add	C2, -2 * SIZE, C2
635#endif
636
637#if defined(LN) || defined(LT)
638	STF	c01, [BO +  0 * SIZE]
639	STF	c02, [BO +  1 * SIZE]
640	STF	c05, [BO +  2 * SIZE]
641	STF	c06, [BO +  3 * SIZE]
642#else
643	STF	c01, [AO +  0 * SIZE]
644	STF	c02, [AO +  1 * SIZE]
645	STF	c05, [AO +  2 * SIZE]
646	STF	c06, [AO +  3 * SIZE]
647#endif
648
649	STF	c01, [C1 + 0 * SIZE]
650	STF	c02, [C1 + 1 * SIZE]
651	STF	c05, [C2 + 0 * SIZE]
652	STF	c06, [C2 + 1 * SIZE]
653
654	FMOV	FZERO, t1
655	FMOV	FZERO, t2
656	FMOV	FZERO, t3
657	FMOV	FZERO, t4
658
659#ifndef LN
660	add	C1, 2 * SIZE, C1
661	add	C2, 2 * SIZE, C2
662#endif
663
664#ifdef RT
665	sll	K, 0 + ZBASE_SHIFT, TEMP1
666	add	AORIG, TEMP1, AORIG
667#endif
668
669#if defined(LT) || defined(RN)
670	sub	K, KK, TEMP1
671	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP2
672	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
673	add	AO, TEMP2, AO
674	add	BO, TEMP1, BO
675#endif
676
677#ifdef LT
678	add	KK, 1, KK
679#endif
680
681#ifdef LN
682	sub	KK, 1, KK
683#endif
684
685.LL50:
686	sra	M, 1, I
687	cmp	I, 0
688	ble,pn	%icc, .LL99
689	nop
690
691.LL21:
692#if defined(LT) || defined(RN)
693	sra	KK, 2, L
694
695	mov	B, BO
696	cmp	L,  0
697#else
698
699#ifdef LN
700	sll	K,  1 + ZBASE_SHIFT, TEMP1
701	sub	AORIG, TEMP1, AORIG
702#endif
703
704	sll	KK, 1 + ZBASE_SHIFT, TEMP1
705
706	add	AORIG, TEMP1, AO
707	add	B,     TEMP1, BO
708
709	sub	K, KK, TEMP1
710
711	sra	TEMP1, 2, L
712	cmp	L,  0
713#endif
714
715	FMOV	FZERO, t1
716	FMOV	FZERO, t2
717	FMOV	FZERO, t3
718	FMOV	FZERO, t4
719
720	FMOV	FZERO, c01
721	FMOV	FZERO, c02
722
723	LDF	[AO + 0 * SIZE], a1
724	FMOV	FZERO, c03
725	LDF	[BO + 0 * SIZE], b1
726	FMOV	FZERO, c04
727
728	LDF	[AO + 1 * SIZE], a2
729	FMOV	FZERO, c05
730	LDF	[BO + 1 * SIZE], b2
731	FMOV	FZERO, c06
732
733	LDF	[AO + 2 * SIZE], a3
734	FMOV	FZERO, c07
735	LDF	[BO + 2 * SIZE], b3
736	FMOV	FZERO, c08
737
738	LDF	[AO + 3 * SIZE], a4
739	FMOV	FZERO, c09
740	LDF	[BO + 3 * SIZE], b4
741	FMOV	FZERO, c10
742
743	LDF	[BO +  4 * SIZE], b5
744	FMOV	FZERO, c11
745	LDF	[AO +  4 * SIZE], a5
746	FMOV	FZERO, c12
747
748#ifdef LN
749	prefetch [C1 - 3 * SIZE], 3
750	FMOV	FZERO, c13
751	prefetch [C2 - 3 * SIZE], 3
752	FMOV	FZERO, c14
753#else
754	prefetch [C1 + 3 * SIZE], 3
755	FMOV	FZERO, c13
756	prefetch [C2 + 3 * SIZE], 3
757	FMOV	FZERO, c14
758#endif
759
760	FMOV	FZERO, c15
761	ble,pn	%icc, .LL25
762	FMOV	FZERO, c16
763
764.LL22:
765	FADD2	c04, t1, c04
766	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
767	FMUL	a1, b1, t1
768	nop
769
770	FADD4	c08, t2, c08
771	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
772	FMUL	a1, b2, t2
773	add	AO, 16 * SIZE, AO
774
775	FADD2	c12, t3, c12
776	LDF	[AO - 13 * SIZE], a4
777	FMUL	a1, b3, t3
778	add	BO, 16 * SIZE, BO
779
780	FADD4	c16, t4, c16
781	nop
782	FMUL	a1, b4, t4
783	LDF	[AO -  8 * SIZE], a1
784
785	FADD1	c01, t1, c01
786	nop
787	FMUL	a2, b1, t1
788	nop
789
790	FADD3	c05, t2, c05
791	nop
792	FMUL	a2, b2, t2
793	nop
794
795	FADD1	c09, t3, c09
796	nop
797	FMUL	a2, b3, t3
798	nop
799
800	FADD3	c13, t4, c13
801	add	L, -1, L
802	FMUL	a2, b4, t4
803	LDF	[AO - 11 * SIZE], a2
804
805	FADD2	c02, t1, c02
806	nop
807	FMUL	a3, b1, t1
808	nop
809
810	FADD4	c06, t2, c06
811	nop
812	FMUL	a3, b2, t2
813	nop
814
815	FADD2	c10, t3, c10
816	nop
817	FMUL	a3, b3, t3
818	nop
819
820	FADD4	c14, t4, c14
821	nop
822	FMUL	a3, b4, t4
823	LDF	[AO - 10 * SIZE], a3
824
825	FADD1	c03, t1, c03
826	nop
827	FMUL	a4, b1, t1
828	LDF	[BO -  8 * SIZE], b1
829
830	FADD3	c07, t2, c07
831	nop
832	FMUL	a4, b2, t2
833	LDF	[BO - 11 * SIZE], b2
834
835	FADD1	c11, t3, c11
836	nop
837	FMUL	a4, b3, t3
838	LDF	[BO - 10 * SIZE], b3
839
840	FADD3	c15, t4, c15
841	nop
842	FMUL	a4, b4, t4
843	LDF	[BO -  9 * SIZE], b4
844
845	FADD2	c04, t1, c04
846	nop
847	FMUL	a5, b5, t1
848	LDF	[AO -  9 * SIZE], a4
849
850	FADD4	c08, t2, c08
851	nop
852	FMUL	a5, b2, t2
853	nop
854
855	FADD2	c12, t3, c12
856	nop
857	FMUL	a5, b3, t3
858	nop
859
860	FADD4	c16, t4, c16
861	nop
862	FMUL	a5, b4, t4
863	LDF	[AO - 4 * SIZE], a5
864
865	FADD1	c01, t1, c01
866	nop
867	FMUL	a2, b5, t1
868	nop
869
870	FADD3	c05, t2, c05
871	nop
872	FMUL	a2, b2, t2
873	nop
874
875	FADD1	c09, t3, c09
876	nop
877	FMUL	a2, b3, t3
878	nop
879
880	FADD3	c13, t4, c13
881	nop
882	FMUL	a2, b4, t4
883	LDF	[AO -  7 * SIZE], a2
884
885	FADD2	c02, t1, c02
886	nop
887	FMUL	a3, b5, t1
888	nop
889
890	FADD4	c06, t2, c06
891	nop
892	FMUL	a3, b2, t2
893	nop
894
895	FADD2	c10, t3, c10
896	nop
897	FMUL	a3, b3, t3
898	nop
899
900	FADD4	c14, t4, c14
901	nop
902	FMUL	a3, b4, t4
903	LDF	[AO -  6 * SIZE], a3
904
905	FADD1	c03, t1, c03
906	nop
907	FMUL	a4, b5, t1
908	LDF	[BO - 4 * SIZE], b5
909
910	FADD3	c07, t2, c07
911	nop
912	FMUL	a4, b2, t2
913	LDF	[BO -  7 * SIZE], b2
914
915	FADD1	c11, t3, c11
916	nop
917	FMUL	a4, b3, t3
918	LDF	[BO -  6 * SIZE], b3
919
920	FADD3	c15, t4, c15
921	nop
922	FMUL	a4, b4, t4
923	LDF	[BO -  5 * SIZE], b4
924
925	FADD2	c04, t1, c04
926	nop
927	FMUL	a1, b1, t1
928	LDF	[AO -  5 * SIZE], a4
929
930	FADD4	c08, t2, c08
931	nop
932	FMUL	a1, b2, t2
933	nop
934
935	FADD2	c12, t3, c12
936	nop
937	FMUL	a1, b3, t3
938	nop
939
940	FADD4	c16, t4, c16
941	nop
942	FMUL	a1, b4, t4
943	LDF	[AO -  0 * SIZE], a1
944
945	FADD1	c01, t1, c01
946	nop
947	FMUL	a2, b1, t1
948	nop
949
950#ifdef DOUBLE
951	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
952#else
953	nop
954#endif
955	FADD3	c05, t2, c05
956	nop
957	FMUL	a2, b2, t2
958
959	FADD1	c09, t3, c09
960	nop
961	FMUL	a2, b3, t3
962	nop
963
964	FADD3	c13, t4, c13
965	nop
966	FMUL	a2, b4, t4
967	nop
968
969	FADD2	c02, t1, c02
970	nop
971	FMUL	a3, b1, t1
972	LDF	[AO - 3 * SIZE], a2
973
974	FADD4	c06, t2, c06
975#ifdef DOUBLE
976	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
977#else
978	nop
979#endif
980	FMUL	a3, b2, t2
981	nop
982
983	FADD2	c10, t3, c10
984	nop
985	FMUL	a3, b3, t3
986	nop
987
988	FADD4	c14, t4, c14
989	nop
990	FMUL	a3, b4, t4
991	LDF	[AO - 2 * SIZE], a3
992
993	FADD1	c03, t1, c03
994	nop
995	FMUL	a4, b1, t1
996	LDF	[BO -  0 * SIZE], b1
997
998	FADD3	c07, t2, c07
999	nop
1000	FMUL	a4, b2, t2
1001	LDF	[BO - 3 * SIZE], b2
1002
1003	FADD1	c11, t3, c11
1004	nop
1005	FMUL	a4, b3, t3
1006	LDF	[BO - 2 * SIZE], b3
1007
1008	FADD3	c15, t4, c15
1009	nop
1010	FMUL	a4, b4, t4
1011	LDF	[BO - 1 * SIZE], b4
1012
1013	FADD2	c04, t1, c04
1014	nop
1015	FMUL	a5, b5, t1
1016	LDF	[AO - 1 * SIZE], a4
1017
1018	FADD4	c08, t2, c08
1019	FMUL	a5, b2, t2
1020	FADD2	c12, t3, c12
1021	FMUL	a5, b3, t3
1022
1023	FADD4	c16, t4, c16
1024	nop
1025	FMUL	a5, b4, t4
1026	LDF	[AO +  4 * SIZE], a5
1027
1028	FADD1	c01, t1, c01
1029	nop
1030	FMUL	a2, b5, t1
1031	nop
1032
1033	FADD3	c05, t2, c05
1034	nop
1035	FMUL	a2, b2, t2
1036	nop
1037
1038	FADD1	c09, t3, c09
1039	nop
1040	FMUL	a2, b3, t3
1041	nop
1042
1043	FADD3	c13, t4, c13
1044	nop
1045	FMUL	a2, b4, t4
1046	LDF	[AO +  1 * SIZE], a2
1047
1048	FADD2	c02, t1, c02
1049	nop
1050	FMUL	a3, b5, t1
1051	nop
1052
1053	FADD4	c06, t2, c06
1054	nop
1055	FMUL	a3, b2, t2
1056	nop
1057
1058	FADD2	c10, t3, c10
1059	nop
1060	FMUL	a3, b3, t3
1061	nop
1062
1063	FADD4	c14, t4, c14
1064	nop
1065	FMUL	a3, b4, t4
1066	LDF	[AO +  2 * SIZE], a3
1067
1068	FADD1	c03, t1, c03
1069	cmp	L, 0
1070	FMUL	a4, b5, t1
1071	LDF	[BO +  4 * SIZE], b5
1072
1073	FADD3	c07, t2, c07
1074	nop
1075	FMUL	a4, b2, t2
1076	LDF	[BO +  1 * SIZE], b2
1077
1078	FADD1	c11, t3, c11
1079	nop
1080	FMUL	a4, b3, t3
1081	LDF	[BO +  2 * SIZE], b3
1082
1083	FADD3	c15, t4, c15
1084	FMUL	a4, b4, t4
1085	bg,pt	%icc, .LL22
1086	LDF	[BO +  3 * SIZE], b4
1087
1088.LL25:
1089#if defined(LT) || defined(RN)
1090	and	KK,  3, L
1091#else
1092	and	TEMP1, 3, L
1093#endif
1094	cmp	L,  0
1095	ble,pn %icc, .LL29
1096	nop
1097
1098.LL26:
1099	FADD2	c04, t1, c04
1100	LDF	[AO +  3 * SIZE], a4
1101	FMUL	a1, b1, t1
1102	add	AO, 4 * SIZE, AO
1103
1104	FADD4	c08, t2, c08
1105	add	BO, 4 * SIZE, BO
1106	FMUL	a1, b2, t2
1107	add	L, -1, L
1108
1109	FADD2	c12, t3, c12
1110	nop
1111	FMUL	a1, b3, t3
1112	cmp	L, 0
1113
1114	FADD4	c16, t4, c16
1115	nop
1116	FMUL	a1, b4, t4
1117	LDF	[AO + 0 * SIZE], a1
1118
1119	FADD1	c01, t1, c01
1120	nop
1121	FMUL	a2, b1, t1
1122	nop
1123
1124	FADD3	c05, t2, c05
1125	nop
1126	FMUL	a2, b2, t2
1127	nop
1128
1129	FADD1	c09, t3, c09
1130	nop
1131	FMUL	a2, b3, t3
1132	nop
1133
1134	FADD3	c13, t4, c13
1135	nop
1136	FMUL	a2, b4, t4
1137	LDF	[AO + 1 * SIZE], a2
1138
1139	FADD2	c02, t1, c02
1140	nop
1141	FMUL	a3, b1, t1
1142	nop
1143
1144	FADD4	c06, t2, c06
1145	nop
1146	FMUL	a3, b2, t2
1147	nop
1148
1149	FADD2	c10, t3, c10
1150	nop
1151	FMUL	a3, b3, t3
1152	nop
1153
1154	FADD4	c14, t4, c14
1155	nop
1156	FMUL	a3, b4, t4
1157	LDF	[AO + 2 * SIZE], a3
1158
1159	FADD1	c03, t1, c03
1160	nop
1161	FMUL	a4, b1, t1
1162	LDF	[BO + 0 * SIZE], b1
1163
1164	FADD3	c07, t2, c07
1165	nop
1166	FMUL	a4, b2, t2
1167	LDF	[BO + 1 * SIZE], b2
1168
1169	FADD1	c11, t3, c11
1170	nop
1171	FMUL	a4, b3, t3
1172	LDF	[BO + 2 * SIZE], b3
1173
1174	FADD3	c15, t4, c15
1175	FMUL	a4, b4, t4
1176	bg,pt	%icc, .LL26
1177	LDF	[BO + 3 * SIZE], b4
1178
1179.LL29:
1180#if defined(LN) || defined(RT)
1181	sub	KK, 2, TEMP1
1182	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
1183	add	AORIG, TEMP1, AO
1184	add	B,     TEMP1, BO
1185#endif
1186
1187	FADD2	c04, t1, c04
1188	FADD4	c08, t2, c08
1189	FADD2	c12, t3, c12
1190	FADD4	c16, t4, c16
1191
1192	FADD	c01, c06, c01
1193	FADD	c02, c05, c02
1194	FADD	c03, c08, c03
1195	FADD	c04, c07, c04
1196
1197	FADD	c09, c14, c09
1198	FADD	c10, c13, c10
1199	FADD	c11, c16, c11
1200	FADD	c12, c15, c12
1201
1202#if defined(LN) || defined(LT)
1203	LDF	[BO +  0 * SIZE], a1
1204	LDF	[BO +  1 * SIZE], a2
1205	LDF	[BO +  2 * SIZE], a3
1206	LDF	[BO +  3 * SIZE], a4
1207
1208	LDF	[BO +  4 * SIZE], b1
1209	LDF	[BO +  5 * SIZE], b2
1210	LDF	[BO +  6 * SIZE], b3
1211	LDF	[BO +  7 * SIZE], b4
1212
1213	FSUB	a1, c01, c01
1214	FSUB	a2, c02, c02
1215	FSUB	a3, c09, c09
1216	FSUB	a4, c10, c10
1217
1218	FSUB	b1, c03, c03
1219	FSUB	b2, c04, c04
1220	FSUB	b3, c11, c11
1221	FSUB	b4, c12, c12
1222#else
1223	LDF	[AO +  0 * SIZE], a1
1224	LDF	[AO +  1 * SIZE], a2
1225	LDF	[AO +  2 * SIZE], a3
1226	LDF	[AO +  3 * SIZE], a4
1227
1228	LDF	[AO +  4 * SIZE], b1
1229	LDF	[AO +  5 * SIZE], b2
1230	LDF	[AO +  6 * SIZE], b3
1231	LDF	[AO +  7 * SIZE], b4
1232
1233	FSUB	a1, c01, c01
1234	FSUB	a2, c02, c02
1235	FSUB	a3, c03, c03
1236	FSUB	a4, c04, c04
1237
1238	FSUB	b1, c09, c09
1239	FSUB	b2, c10, c10
1240	FSUB	b3, c11, c11
1241	FSUB	b4, c12, c12
1242#endif
1243
1244#ifdef LN
1245	LDF	[AO +  6 * SIZE], a1
1246	LDF	[AO +  7 * SIZE], a2
1247	LDF	[AO +  4 * SIZE], a3
1248	LDF	[AO +  5 * SIZE], a4
1249	LDF	[AO +  0 * SIZE], b1
1250	LDF	[AO +  1 * SIZE], b2
1251
1252	FMUL	a1, c03, t1
1253	FMUL	a2, c04, t2
1254	FMUL	a1, c04, t3
1255	FMUL	a2, c03, t4
1256
1257	FMUL	a1, c11, t5
1258	FMUL	a2, c12, t6
1259	FMUL	a1, c12, t7
1260	FMUL	a2, c11, t8
1261
1262	FADD4	t1, t2, c03
1263	FADD2	t3, t4, c04
1264	FADD4	t5, t6, c11
1265	FADD2	t7, t8, c12
1266
1267	FMUL	a3, c03, t1
1268	FMUL	a3, c04, t2
1269	FMUL	a3, c11, t3
1270	FMUL	a3, c12, t4
1271
1272	FMUL	a4, c04, t5
1273	FMUL	a4, c03, t6
1274	FMUL	a4, c12, t7
1275	FMUL	a4, c11, t8
1276
1277	FSUB	c01, t1, c01
1278	FSUB	c02, t2, c02
1279	FSUB	c09, t3, c09
1280	FSUB	c10, t4, c10
1281
1282	FADD2	c01, t5, c01
1283	FADD4	c02, t6, c02
1284	FADD2	c09, t7, c09
1285	FADD4	c10, t8, c10
1286
1287	FMUL	b1, c01, t1
1288	FMUL	b2, c02, t2
1289	FMUL	b1, c02, t3
1290	FMUL	b2, c01, t4
1291
1292	FMUL	b1, c09, t5
1293	FMUL	b2, c10, t6
1294	FMUL	b1, c10, t7
1295	FMUL	b2, c09, t8
1296
1297	FADD4	t1, t2, c01
1298	FADD2	t3, t4, c02
1299	FADD4	t5, t6, c09
1300	FADD2	t7, t8, c10
1301#endif
1302
1303#ifdef LT
1304	LDF	[AO +  0 * SIZE], a1
1305	LDF	[AO +  1 * SIZE], a2
1306	LDF	[AO +  2 * SIZE], a3
1307	LDF	[AO +  3 * SIZE], a4
1308	LDF	[AO +  6 * SIZE], b1
1309	LDF	[AO +  7 * SIZE], b2
1310
1311	FMUL	a1, c01, t1
1312	FMUL	a2, c02, t2
1313	FMUL	a1, c02, t3
1314	FMUL	a2, c01, t4
1315
1316	FMUL	a1, c09, t5
1317	FMUL	a2, c10, t6
1318	FMUL	a1, c10, t7
1319	FMUL	a2, c09, t8
1320
1321	FADD4	t1, t2, c01
1322	FADD2	t3, t4, c02
1323	FADD4	t5, t6, c09
1324	FADD2	t7, t8, c10
1325
1326	FMUL	a3, c01, t1
1327	FMUL	a3, c02, t2
1328	FMUL	a3, c09, t3
1329	FMUL	a3, c10, t4
1330
1331	FMUL	a4, c02, t5
1332	FMUL	a4, c01, t6
1333	FMUL	a4, c10, t7
1334	FMUL	a4, c09, t8
1335
1336	FSUB	c03, t1, c03
1337	FSUB	c04, t2, c04
1338	FSUB	c11, t3, c11
1339	FSUB	c12, t4, c12
1340
1341	FADD2	c03, t5, c03
1342	FADD4	c04, t6, c04
1343	FADD2	c11, t7, c11
1344	FADD4	c12, t8, c12
1345
1346	FMUL	b1, c03, t1
1347	FMUL	b2, c04, t2
1348	FMUL	b1, c04, t3
1349	FMUL	b2, c03, t4
1350
1351	FMUL	b1, c11, t5
1352	FMUL	b2, c12, t6
1353	FMUL	b1, c12, t7
1354	FMUL	b2, c11, t8
1355
1356	FADD4	t1, t2, c03
1357	FADD2	t3, t4, c04
1358	FADD4	t5, t6, c11
1359	FADD2	t7, t8, c12
1360#endif
1361
1362#ifdef RN
1363	LDF	[BO +  0 * SIZE], a1
1364	LDF	[BO +  1 * SIZE], a2
1365	LDF	[BO +  2 * SIZE], a3
1366	LDF	[BO +  3 * SIZE], a4
1367	LDF	[BO +  6 * SIZE], b1
1368	LDF	[BO +  7 * SIZE], b2
1369
1370	FMUL	a1, c01, t1
1371	FMUL	a2, c02, t2
1372	FMUL	a1, c02, t3
1373	FMUL	a2, c01, t4
1374
1375	FMUL	a1, c03, t5
1376	FMUL	a2, c04, t6
1377	FMUL	a1, c04, t7
1378	FMUL	a2, c03, t8
1379
1380	FADD4	t1, t2, c01
1381	FADD3	t3, t4, c02
1382	FADD4	t5, t6, c03
1383	FADD3	t7, t8, c04
1384
1385	FMUL	a3, c01, t1
1386	FMUL	a3, c02, t2
1387	FMUL	a3, c03, t3
1388	FMUL	a3, c04, t4
1389
1390	FMUL	a4, c02, t5
1391	FMUL	a4, c01, t6
1392	FMUL	a4, c04, t7
1393	FMUL	a4, c03, t8
1394
1395	FSUB	c09, t1, c09
1396	FSUB	c10, t2, c10
1397	FSUB	c11, t3, c11
1398	FSUB	c12, t4, c12
1399
1400	FADD3	c09, t5, c09
1401	FADD4	c10, t6, c10
1402	FADD3	c11, t7, c11
1403	FADD4	c12, t8, c12
1404
1405	FMUL	b1, c09, t1
1406	FMUL	b2, c10, t2
1407	FMUL	b1, c10, t3
1408	FMUL	b2, c09, t4
1409
1410	FMUL	b1, c11, t5
1411	FMUL	b2, c12, t6
1412	FMUL	b1, c12, t7
1413	FMUL	b2, c11, t8
1414
1415	FADD4	t1, t2, c09
1416	FADD3	t3, t4, c10
1417	FADD4	t5, t6, c11
1418	FADD3	t7, t8, c12
1419#endif
1420
1421#ifdef RT
1422	LDF	[BO +  6 * SIZE], a1
1423	LDF	[BO +  7 * SIZE], a2
1424	LDF	[BO +  4 * SIZE], a3
1425	LDF	[BO +  5 * SIZE], a4
1426	LDF	[BO +  0 * SIZE], b1
1427	LDF	[BO +  1 * SIZE], b2
1428
1429	FMUL	a1, c09, t1
1430	FMUL	a2, c10, t2
1431	FMUL	a1, c10, t3
1432	FMUL	a2, c09, t4
1433
1434	FMUL	a1, c11, t5
1435	FMUL	a2, c12, t6
1436	FMUL	a1, c12, t7
1437	FMUL	a2, c11, t8
1438
1439	FADD4	t1, t2, c09
1440	FADD3	t3, t4, c10
1441	FADD4	t5, t6, c11
1442	FADD3	t7, t8, c12
1443
1444	FMUL	a3, c09, t1
1445	FMUL	a3, c10, t2
1446	FMUL	a3, c11, t3
1447	FMUL	a3, c12, t4
1448
1449	FMUL	a4, c10, t5
1450	FMUL	a4, c09, t6
1451	FMUL	a4, c12, t7
1452	FMUL	a4, c11, t8
1453
1454	FSUB	c01, t1, c01
1455	FSUB	c02, t2, c02
1456	FSUB	c03, t3, c03
1457	FSUB	c04, t4, c04
1458
1459	FADD3	c01, t5, c01
1460	FADD4	c02, t6, c02
1461	FADD3	c03, t7, c03
1462	FADD4	c04, t8, c04
1463
1464	FMUL	b1, c01, t1
1465	FMUL	b2, c02, t2
1466	FMUL	b1, c02, t3
1467	FMUL	b2, c01, t4
1468
1469	FMUL	b1, c03, t5
1470	FMUL	b2, c04, t6
1471	FMUL	b1, c04, t7
1472	FMUL	b2, c03, t8
1473
1474	FADD4	t1, t2, c01
1475	FADD3	t3, t4, c02
1476	FADD4	t5, t6, c03
1477	FADD3	t7, t8, c04
1478#endif
1479
1480#ifdef LN
1481	add	C1, -4 * SIZE, C1
1482	add	C2, -4 * SIZE, C2
1483#endif
1484
1485#if defined(LN) || defined(LT)
1486	STF	c01, [BO +  0 * SIZE]
1487	STF	c02, [BO +  1 * SIZE]
1488	STF	c09, [BO +  2 * SIZE]
1489	STF	c10, [BO +  3 * SIZE]
1490
1491	STF	c03, [BO +  4 * SIZE]
1492	STF	c04, [BO +  5 * SIZE]
1493	STF	c11, [BO +  6 * SIZE]
1494	STF	c12, [BO +  7 * SIZE]
1495#else
1496	STF	c01, [AO +  0 * SIZE]
1497	STF	c02, [AO +  1 * SIZE]
1498	STF	c03, [AO +  2 * SIZE]
1499	STF	c04, [AO +  3 * SIZE]
1500
1501	STF	c09, [AO +  4 * SIZE]
1502	STF	c10, [AO +  5 * SIZE]
1503	STF	c11, [AO +  6 * SIZE]
1504	STF	c12, [AO +  7 * SIZE]
1505#endif
1506
1507	STF	c01, [C1 + 0 * SIZE]
1508	STF	c02, [C1 + 1 * SIZE]
1509	STF	c03, [C1 + 2 * SIZE]
1510	STF	c04, [C1 + 3 * SIZE]
1511
1512	STF	c09, [C2 + 0 * SIZE]
1513	STF	c10, [C2 + 1 * SIZE]
1514	STF	c11, [C2 + 2 * SIZE]
1515	STF	c12, [C2 + 3 * SIZE]
1516
1517#ifndef LN
1518	add	C1, 4 * SIZE, C1
1519	add	C2, 4 * SIZE, C2
1520#endif
1521
1522#ifdef RT
1523	sll	K, 1 + ZBASE_SHIFT, TEMP1
1524	add	AORIG, TEMP1, AORIG
1525#endif
1526
1527#if defined(LT) || defined(RN)
1528	sub	K, KK, TEMP1
1529	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
1530	add	AO, TEMP1, AO
1531	add	BO, TEMP1, BO
1532#endif
1533
1534#ifdef LT
1535	add	KK, 2, KK
1536#endif
1537
1538#ifdef LN
1539	sub	KK, 2, KK
1540#endif
1541
1542	add	I, -1, I
1543	cmp	I, 0
1544	bg,pt	%icc, .LL21
1545	nop
1546
/* .LL99: end of one column-pair iteration.                          */
1547.LL99:
/* LN: advance B by K << (1 + ZBASE_SHIFT).                          */
1548#ifdef LN
1549	sll	K, 1 + ZBASE_SHIFT, TEMP1
1550	add	B, TEMP1, B
1551#endif
1552
/* LT / RN: B continues from where BO ended up.                      */
1553#if defined(LT) || defined(RN)
1554	mov	BO, B
1555#endif
1556
/* RN / RT: move KK by the two columns just processed.               */
1557#ifdef RN
1558	add	KK, 2, KK
1559#endif
1560
1561#ifdef RT
1562	sub	KK, 2, KK
1563#endif
1564
/* Decrement J and repeat from .LL11 while it is still positive.     */
/* The nop fills the branch delay slot.                              */
1565	add	J, -1, J
1566	cmp	J, 0
1567	bg,pt	%icc, .LL11
1568	nop
1569
/* .LL100: handle the remaining single column when N is odd.         */
/* J = N & 1; when it is zero branch to .LL999.                      */
1570.LL100:
1571	and	N, 1, J
1572
1573	cmp	J, 0
1574	ble,pn	%icc, .LL999
1575	nop
1576
/* RT: step B back by K << ZBASE_SHIFT and C back by LDC.            */
1577#ifdef RT
1578	sll	K, 0 + ZBASE_SHIFT, TEMP1
1579	sub	B, TEMP1, B
1580
1581	sub	C, LDC, C
1582#endif
1583
/* C1 addresses the single column handled here.                      */
1584	mov	C, C1
1585
/* Reset KK: M + OFFSET for LN, OFFSET for LT.                       */
1586#ifdef LN
1587	add	M, OFFSET, KK
1588#endif
1589
1590#ifdef LT
1591	mov	OFFSET, KK
1592#endif
1593
/* LN / RT keep the base of A in AORIG; the others start AO at A.    */
1594#if defined(LN) || defined(RT)
1595	mov	A, AORIG
1596#else
1597	mov	A, AO
1598#endif
1599
/* Every variant except RT moves C forward past this column.         */
1600#ifndef RT
1601	add	C, LDC, C
1602#endif
1603
/* I = M & 1: when it is zero skip the single-row block and go to    */
/* .LL150.  The nop fills the branch delay slot.                     */
1604	and	M, 1, I
1605	cmp	I, 0
1606	ble,pn	%icc, .LL150
1607	nop
1608
1609#if defined(LT) || defined(RN)
1610	sra	KK, 2, L
1611
1612	mov	B, BO
1613	cmp	L,  0
1614#else
1615
1616#ifdef LN
1617	sll	K,  0 + ZBASE_SHIFT, TEMP1
1618	sub	AORIG, TEMP1, AORIG
1619#endif
1620
1621	sll	KK, 0 + ZBASE_SHIFT, TEMP1
1622	add	AORIG, TEMP1, AO
1623	add	B,     TEMP1, BO
1624
1625	sub	K, KK, TEMP1
1626
1627	sra	TEMP1, 2, L
1628	cmp	L,  0
1629#endif
1630
1631	LDF	[AO + 0 * SIZE], a1
1632	FMOV	FZERO, c01
1633	LDF	[BO + 0 * SIZE], b1
1634	FMOV	FZERO, t1
1635
1636	LDF	[AO + 1 * SIZE], a2
1637	FMOV	FZERO, c02
1638	LDF	[BO + 1 * SIZE], b2
1639	FMOV	FZERO, t2
1640
1641	LDF	[AO + 2 * SIZE], a3
1642	FMOV	FZERO, c03
1643	LDF	[BO + 2 * SIZE], b3
1644	FMOV	FZERO, t3
1645
1646	LDF	[AO + 3 * SIZE], a4
1647	FMOV	FZERO, c04
1648	LDF	[BO + 3 * SIZE], b4
1649	FMOV	FZERO, t4
1650
1651	ble,pn	%icc, .LL155
1652	nop
1653
/* Unrolled main loop of the M & 1 tile.  Each pass decrements L once and
   advances both AO and BO by 8 * SIZE, issuing sixteen FMULs in four groups:
   (a1,a2) x (b1,b2), then (a3,a4) x (b3,b4), and the same two groups again
   on freshly loaded values.
   The loop is software pipelined: each FMUL writes t1-t4, and the FADDn at
   the head of the next group folds that temporary into c01-c04.  The last
   four products are therefore still pending in t1-t4 on exit.
   FADD1/FADD2/FADD3/FADD4 are macros defined outside this chunk; their
   add/subtract sense is not visible here.
   BO is advanced near the top, so later B loads use offsets -4..+3; AO is
   advanced at the bottom, so A loads use offsets +4..+11. */
1654.LL152:
1655	FADD1	c01, t1, c01
1656	add	L, -1, L
1657	FMUL	a1, b1, t1
1658	prefetch [AO + APREFETCHSIZE * SIZE], 0
1659
1660	FADD3	c02, t2, c02
1661	add	BO,  8 * SIZE, BO
1662	FMUL	a1, b2, t2
1663	LDF	[AO + 4 * SIZE], a1
1664
1665	FADD2	c03, t3, c03
/* Sets %icc for the bg at the bottom of the loop; nothing between here and
   that branch is an integer compare. */
1666	cmp	L, 0
1667	FMUL	a2, b1, t3
1668	LDF	[BO - 4 * SIZE], b1
1669
1670	FADD4	c04, t4, c04
1671	nop
1672	FMUL	a2, b2, t4
1673	LDF	[AO + 5 * SIZE], a2
1674
1675	FADD1	c01, t1, c01
1676	nop
1677	FMUL	a3, b3, t1
1678	LDF	[BO - 3 * SIZE], b2
1679
1680	FADD3	c02, t2, c02
1681	nop
1682	FMUL	a3, b4, t2
1683	LDF	[AO + 6 * SIZE], a3
1684
1685	FADD2	c03, t3, c03
1686	nop
1687	FMUL	a4, b3, t3
1688	LDF	[BO - 2 * SIZE], b3
1689
1690	FADD4	c04, t4, c04
1691	nop
1692	FMUL	a4, b4, t4
1693	LDF	[AO + 7 * SIZE], a4
1694
1695	FADD1	c01, t1, c01
1696	nop
1697	FMUL	a1, b1, t1
1698	LDF	[BO - 1 * SIZE], b4
1699
1700	FADD3	c02, t2, c02
1701	FMUL	a1, b2, t2
1702	LDF	[AO +  8 * SIZE], a1
1703
1704	FADD2	c03, t3, c03
1705	FMUL	a2, b1, t3
1706	LDF	[BO +  0 * SIZE], b1
1707
1708	FADD4	c04, t4, c04
1709	FMUL	a2, b2, t4
1710	LDF	[AO +  9 * SIZE], a2
1711
1712	FADD1	c01, t1, c01
1713	FMUL	a3, b3, t1
1714	LDF	[BO +  1 * SIZE], b2
1715
1716	FADD3	c02, t2, c02
1717	FMUL	a3, b4, t2
1718	LDF	[AO + 10 * SIZE], a3
1719
1720	FADD2	c03, t3, c03
1721	FMUL	a4, b3, t3
1722	LDF	[BO +  2 * SIZE], b3
1723
1724	FADD4	c04, t4, c04
1725	FMUL	a4, b4, t4
1726	LDF	[AO + 11 * SIZE], a4
1727
/* After this advance the values just loaded from AO + 8..11 correspond to
   AO + 0..3, matching the preload layout expected at loop entry. */
1728	add	AO,  8 * SIZE, AO
1729	bg,pt	%icc, .LL152
/* Branch delay slot: the final b4 load is issued whether or not the loop
   repeats. */
1730	LDF	[BO +  3 * SIZE], b4
1731
/* Remainder count for the M & 1 tile: L = (KK or TEMP1) & 3, i.e. the passes
   not covered by the unrolled loop above (TEMP1 still holds K - KK from the
   tile setup in the LN/RT case).  If L <= 0, go directly to the drain at
   .LL159.  The branch carries the annul bit, but its delay slot is a nop,
   so annulment has no visible effect. */
1732.LL155:
1733#if defined(LT) || defined(RN)
1734	and	KK,  3, L
1735#else
1736	and	TEMP1, 3, L
1737#endif
1738	cmp	L,  0
1739	ble,a,pn %icc, .LL159
1740	nop
1741
/* Remainder loop of the M & 1 tile: one step per pass.  AO and BO each
   advance by 2 * SIZE, the four products of (a1,a2) with (b1,b2) are staged
   in t1-t4, and a1/a2/b1/b2 are reloaded from the advanced pointers.
   As in .LL152, each FADDn folds the product of the previous step, so the
   last products are still pending in t1-t4 on exit. */
1742.LL156:
1743	FADD1	c01, t1, c01
1744	add	AO, 2 * SIZE, AO
1745	FMUL	a1, b1, t1
1746	add	BO, 2 * SIZE, BO
1747	FADD3	c02, t2, c02
1748	add	L, -1, L
1749	FMUL	a1, b2, t2
1750	LDF	[AO + 0 * SIZE], a1
1751	FADD2	c03, t3, c03
1752	FMUL	a2, b1, t3
1753	LDF	[BO + 0 * SIZE], b1
/* %icc for the loop-back branch below. */
1754	cmp	L, 0
1755	FADD4	c04, t4, c04
1756	FMUL	a2, b2, t4
1757	LDF	[BO + 1 * SIZE], b2
1758
1759	bg,pt	%icc, .LL156
/* Branch delay slot: a2 is reloaded on both the taken and fall-through
   path. */
1760	LDF	[AO + 1 * SIZE], a2
1761
/* Finish the M & 1 tile: drain the pipelined products, apply the solve step
   selected by LN/LT/RN/RT, store the result, and update the pointers. */
1762.LL159:
/* Fold the products still pending in t1-t4 into c01-c04. */
1763	FADD1	c01, t1, c01
1764	FADD3	c02, t2, c02
1765	FADD2	c03, t3, c03
1766	FADD4	c04, t4, c04
1767
/* Reduce the four partial sums to the two result values c01 and c02. */
1768	FADD	c01, c04, c01
1769	FADD	c02, c03, c02
1770
/* LN/RT: re-point AO and BO at offset (KK - 1) << ZBASE_SHIFT from AORIG
   and B. */
1771#if defined(LN) || defined(RT)
1772	sub	KK, 1, TEMP1
1773
1774	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
1775	add	AORIG, TEMP1, AO
1776	add	B,     TEMP1, BO
1777#endif
1778
/* Load the stored pair (from BO for LN/LT, from AO for RN/RT) and replace
   the accumulators with (stored value) - (accumulated sum); assumes FSUB
   follows the SPARC x, y, dst = x - y operand order (macro defined outside
   this chunk). */
1779#if defined(LN) || defined(LT)
1780	LDF	[BO +  0 * SIZE], a1
1781	LDF	[BO +  1 * SIZE], a2
1782
1783	FSUB	a1, c01, c01
1784	FSUB	a2, c02, c02
1785#else
1786	LDF	[AO +  0 * SIZE], a1
1787	LDF	[AO +  1 * SIZE], a2
1788
1789	FSUB	a1, c01, c01
1790	FSUB	a2, c02, c02
1791#endif
1792
/* Each variant below multiplies (c01, c02) by the two-element value at the
   other pointer (AO for LN/LT, BO for RN/RT) using four products combined
   as FADD4(t1, t2) and FADD2 or FADD3(t3, t4).
   NOTE(review): this looks like a complex multiply by a pre-inverted
   diagonal element; the packing routine is not visible here -- confirm. */
1793#ifdef LN
1794	LDF	[AO +  0 * SIZE], a1
1795	LDF	[AO +  1 * SIZE], a2
1796
1797	FMUL	a1, c01, t1
1798	FMUL	a2, c02, t2
1799	FMUL	a1, c02, t3
1800	FMUL	a2, c01, t4
1801
1802	FADD4	t1, t2, c01
1803	FADD2	t3, t4, c02
1804#endif
1805
1806#ifdef LT
1807	LDF	[AO +  0 * SIZE], a1
1808	LDF	[AO +  1 * SIZE], a2
1809
1810	FMUL	a1, c01, t1
1811	FMUL	a2, c02, t2
1812	FMUL	a1, c02, t3
1813	FMUL	a2, c01, t4
1814
1815	FADD4	t1, t2, c01
1816	FADD2	t3, t4, c02
1817#endif
1818
1819#ifdef RN
1820	LDF	[BO +  0 * SIZE], a1
1821	LDF	[BO +  1 * SIZE], a2
1822
1823	FMUL	a1, c01, t1
1824	FMUL	a2, c02, t2
1825	FMUL	a1, c02, t3
1826	FMUL	a2, c01, t4
1827
1828	FADD4	t1, t2, c01
1829	FADD3	t3, t4, c02
1830#endif
1831
1832#ifdef RT
1833	LDF	[BO +  0 * SIZE], a1
1834	LDF	[BO +  1 * SIZE], a2
1835
1836	FMUL	a1, c01, t1
1837	FMUL	a2, c02, t2
1838	FMUL	a1, c02, t3
1839	FMUL	a2, c01, t4
1840
1841	FADD4	t1, t2, c01
1842	FADD3	t3, t4, c02
1843#endif
1844
/* LN walks C1 downwards: step back by one pair before storing. */
1845#ifdef LN
1846	add	C1, -2 * SIZE, C1
1847#endif
1848
/* Write the solved pair back to the buffer it was read from (BO for LN/LT,
   AO for RN/RT) and to the output at C1. */
1849#if defined(LN) || defined(LT)
1850	STF	c01, [BO +  0 * SIZE]
1851	STF	c02, [BO +  1 * SIZE]
1852#else
1853	STF	c01, [AO +  0 * SIZE]
1854	STF	c02, [AO +  1 * SIZE]
1855#endif
1856
1857	STF	c01, [C1 + 0 * SIZE]
1858	STF	c02, [C1 + 1 * SIZE]
1859
/* Clear the product temporaries: the next tile's first FADDn reads t1-t4
   before any FMUL has written them. */
1860	FMOV	FZERO, t1
1861	FMOV	FZERO, t2
1862	FMOV	FZERO, t3
1863	FMOV	FZERO, t4
1864
/* All variants except LN walk C1 upwards: step past the pair just stored. */
1865#ifndef LN
1866	add	C1, 2 * SIZE, C1
1867#endif
1868
/* RT: advance AORIG by K << ZBASE_SHIFT. */
1869#ifdef RT
1870	sll	K, 0 + ZBASE_SHIFT, TEMP1
1871	add	AORIG, TEMP1, AORIG
1872#endif
1873
/* LT/RN: skip the (K - KK) entries not consumed by this tile in both AO
   and BO. */
1874#if defined(LT) || defined(RN)
1875	sub	K, KK, TEMP1
1876	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
1877	add	AO, TEMP1, AO
1878	add	BO, TEMP1, BO
1879#endif
1880
/* KK moves by the tile height (1): up for LT, down for LN. */
1881#ifdef LT
1882	add	KK, 1, KK
1883#endif
1884
1885#ifdef LN
1886	sub	KK, 1, KK
1887#endif
1888
/* Two-row tiles of the single remaining column: I = M >> 1 is the tile
   count; with no tiles (I <= 0) skip to the column epilogue at .LL199. */
1889.LL150:
1890	sra	M, 1, I
1891	cmp	I, 0
1892	ble,pn	%icc, .LL199
1893	nop
1894
1895
/* Per-tile setup for the two-row tile; re-entered once per tile from the
   branch at the end of .LL129.
   LT/RN: BO restarts at B and L = KK >> 2.
   LN/RT: (LN first steps AORIG back by K << (1 + ZBASE_SHIFT)) AO is placed
   KK << (1 + ZBASE_SHIFT) past AORIG, BO is placed KK << ZBASE_SHIFT past B,
   TEMP1 = K - KK and L = TEMP1 >> 2.  TEMP1 is read again at .LL125.
   The cmp result is consumed by the ble at the end of the preload; this
   relies on LDF/FMOV/prefetch not modifying %icc. */
1896.LL121:
1897#if defined(LT) || defined(RN)
1898	sra	KK, 2, L
1899
1900	mov	B, BO
1901	cmp	L,  0
1902#else
1903
1904#ifdef LN
1905	sll	K,  1 + ZBASE_SHIFT, TEMP1
1906	sub	AORIG, TEMP1, AORIG
1907#endif
1908
1909	sll	KK, 1 + ZBASE_SHIFT, TEMP1
1910	sll	KK, 0 + ZBASE_SHIFT, TEMP2
1911
1912	add	AORIG, TEMP1, AO
1913	add	B,     TEMP2, BO
1914
1915	sub	K, KK, TEMP1
1916	sra	TEMP1, 2, L
1917	cmp	L,  0
1918#endif
1919
/* Clear the eight accumulators c01-c08 and the temporaries t1-t4,
   interleaved with the preload of a1-a4 from AO[0..3] and b1-b4 from
   BO[0..3]. */
1920	FMOV	FZERO, c03
1921
1922	LDF	[AO + 0 * SIZE], a1
1923	FMOV	FZERO, t1
1924	LDF	[BO + 0 * SIZE], b1
1925	FMOV	FZERO, c07
1926
1927	LDF	[AO + 1 * SIZE], a2
1928	FMOV	FZERO, t2
1929	LDF	[BO + 1 * SIZE], b2
1930	FMOV	FZERO, c04
1931
1932	LDF	[AO + 2 * SIZE], a3
1933	FMOV	FZERO, t3
1934	LDF	[BO + 2 * SIZE], b3
1935	FMOV	FZERO, c08
1936
1937	LDF	[AO + 3 * SIZE], a4
1938	FMOV	FZERO, t4
1939	LDF	[BO + 3 * SIZE], b4
1940	FMOV	FZERO, c01
1941
/* Prefetch the output location: below C1 for LN (which stores at
   C1 - 4 * SIZE), above it otherwise. */
1942#ifdef LN
1943	prefetch [C1 - 3 * SIZE], 3
1944#else
1945	prefetch [C1 + 3 * SIZE], 3
1946#endif
1947	FMOV	FZERO, c05
1948	FMOV	FZERO, c02
1949
/* L <= 0: no unrolled passes, go to the remainder.  The delay-slot FMOV
   clears c06 on both paths. */
1950	ble,pn	%icc, .LL125
1951	FMOV	FZERO, c06
1952
/* Unrolled main loop of the two-row tile.  Each pass decrements L once,
   advances AO by 16 * SIZE and BO by 8 * SIZE, and issues four groups of
   eight FMULs: a1-a4 against (b1,b2), then a1-a4 against (b3,b4), with a1-a4
   reloaded between groups.
   Products are staged in t1-t4, and each FADDn folds the temporary written
   eight instructions earlier into one of c01-c08; the final four products
   are still pending on exit and are drained at .LL129.
   Both pointers are advanced near the top of the body, so subsequent loads
   use negative offsets for data belonging to this pass and offsets 0..3 for
   the preload of the next pass.
   The nops are left exactly as in the original schedule. */
1953.LL122:
1954	FADD1	c03, t1, c03
1955	add	L, -1, L
1956	FMUL	a1, b1, t1
1957	prefetch [AO + APREFETCHSIZE * SIZE], 0
1958
1959	FADD3	c07, t2, c07
1960	add	BO,  8 * SIZE, BO
1961	FMUL	a1, b2, t2
1962	LDF	[AO + 4 * SIZE], a1
1963
1964	FADD2	c04, t3, c04
/* From here on AO offsets are relative to the advanced pointer: -11 is the
   old +5, and so on. */
1965	add	AO, 16 * SIZE, AO
1966	FMUL	a2, b1, t3
/* %icc for the bg at the bottom of the loop. */
1967	cmp	L,  0
1968
1969	FADD4	c08, t4, c08
1970	nop
1971	FMUL	a2, b2, t4
1972	LDF	[AO - 11 * SIZE], a2
1973
1974	FADD1	c01, t1, c01
1975	nop
1976	FMUL	a3, b1, t1
1977	nop
1978
1979	FADD3	c05, t2, c05
1980	nop
1981	FMUL	a3, b2, t2
1982	LDF	[AO - 10 * SIZE], a3
1983
1984	FADD2	c02, t3, c02
1985	nop
1986	FMUL	a4, b1, t3
1987	LDF	[BO -  4 * SIZE], b1
1988
1989	FADD4	c06, t4, c06
1990	nop
1991	FMUL	a4, b2, t4
1992	LDF	[BO -  3 * SIZE], b2
1993
1994	FADD1	c03, t1, c03
1995	nop
1996	FMUL	a1, b3, t1
1997	LDF	[AO -  9 * SIZE], a4
1998
1999	FADD3	c07, t2, c07
2000	nop
2001	FMUL	a1, b4, t2
2002	LDF	[AO -  8 * SIZE], a1
2003
2004	FADD2	c04, t3, c04
2005	nop
2006	FMUL	a2, b3, t3
2007	nop
2008
2009	FADD4	c08, t4, c08
2010	nop
2011	FMUL	a2, b4, t4
2012	LDF	[AO -  7 * SIZE], a2
2013
2014	FADD1	c01, t1, c01
2015	nop
2016	FMUL	a3, b3, t1
2017	nop
2018
2019	FADD3	c05, t2, c05
2020	nop
2021	FMUL	a3, b4, t2
2022	LDF	[AO -  6 * SIZE], a3
2023
2024	FADD2	c02, t3, c02
2025	nop
2026	FMUL	a4, b3, t3
2027	LDF	[BO -  2 * SIZE], b3
2028
2029	FADD4	c06, t4, c06
2030	nop
2031	FMUL	a4, b4, t4
2032	LDF	[BO -  1 * SIZE], b4
2033
2034	FADD1	c03, t1, c03
2035	nop
2036	FMUL	a1, b1, t1
2037	LDF	[AO -  5 * SIZE], a4
2038
2039	FADD3	c07, t2, c07
2040	nop
2041	FMUL	a1, b2, t2
2042	LDF	[AO -  4 * SIZE], a1
2043
2044	FADD2	c04, t3, c04
2045	nop
2046	FMUL	a2, b1, t3
2047	nop
2048
2049	FADD4	c08, t4, c08
2050	nop
2051	FMUL	a2, b2, t4
2052	LDF	[AO -  3 * SIZE], a2
2053
2054	FADD1	c01, t1, c01
2055	nop
2056	FMUL	a3, b1, t1
2057	nop
2058
2059	FADD3	c05, t2, c05
2060	nop
2061	FMUL	a3, b2, t2
2062	LDF	[AO -  2 * SIZE], a3
2063
2064	FADD2	c02, t3, c02
2065	nop
2066	FMUL	a4, b1, t3
2067	LDF	[BO +  0 * SIZE], b1
2068
2069	FADD4	c06, t4, c06
2070	nop
2071	FMUL	a4, b2, t4
2072	LDF	[BO +  1 * SIZE], b2
2073
2074	FADD1	c03, t1, c03
2075	nop
2076	FMUL	a1, b3, t1
2077	LDF	[AO -  1 * SIZE], a4
2078
2079	FADD3	c07, t2, c07
2080	nop
2081	FMUL	a1, b4, t2
2082	LDF	[AO +  0 * SIZE], a1
2083
2084	FADD2	c04, t3, c04
2085	nop
2086	FMUL	a2, b3, t3
2087	nop
2088
2089	FADD4	c08, t4, c08
2090	nop
2091	FMUL	a2, b4, t4
2092	LDF	[AO +  1 * SIZE], a2
2093
2094	FADD1	c01, t1, c01
2095	nop
2096	FMUL	a3, b3, t1
2097	nop
2098
2099	FADD3	c05, t2, c05
2100	nop
2101	FMUL	a3, b4, t2
2102	LDF	[AO +  2 * SIZE], a3
2103
2104	FADD2	c02, t3, c02
2105	nop
2106	FMUL	a4, b3, t3
2107	LDF	[BO +  2 * SIZE], b3
2108
2109	FADD4	c06, t4, c06
2110	FMUL	a4, b4, t4
2111	LDF	[AO +  3 * SIZE], a4
2112
2113	bg,pt	%icc, .LL122
/* Branch delay slot: the final b4 load is issued whether or not the loop
   repeats. */
2114	LDF	[BO +  3 * SIZE], b4
2115
/* Remainder count for the two-row tile: L = (KK or TEMP1) & 3 (TEMP1 still
   holds K - KK from .LL121 in the LN/RT case).  If L <= 0, go directly to
   the drain at .LL129.  The annulled delay slot holds only a nop. */
2116.LL125:
2117#if defined(LT) || defined(RN)
2118	and	KK,  3, L
2119#else
2120	and	TEMP1, 3, L
2121#endif
2122	cmp	L,  0
2123	ble,a,pn %icc, .LL129
2124	nop
2125
/* Remainder loop of the two-row tile: one step per pass.  AO advances by
   4 * SIZE and BO by 2 * SIZE; a1-a4 are multiplied by (b1,b2), giving eight
   products staged through t1-t4 into c01-c08, and all six inputs are
   reloaded from the advanced pointers.  b1 and b2 are only overwritten
   after their last use in this pass (the a4 products). */
2126.LL126:
2127	FADD1	c03, t1, c03
2128	add	AO, 4 * SIZE, AO
2129	FMUL	a1, b1, t1
2130	add	BO, 2 * SIZE, BO
2131
2132	FADD3	c07, t2, c07
2133	add	L, -1, L
2134	FMUL	a1, b2, t2
2135	LDF	[AO + 0 * SIZE], a1
2136
2137	FADD2	c04, t3, c04
/* %icc for the loop-back branch below. */
2138	cmp	L, 0
2139	FMUL	a2, b1, t3
2140
2141	FADD4	c08, t4, c08
2142	FMUL	a2, b2, t4
2143	LDF	[AO + 1 * SIZE], a2
2144
2145	FADD1	c01, t1, c01
2146	FMUL	a3, b1, t1
2147	FADD3	c05, t2, c05
2148	FMUL	a3, b2, t2
2149	LDF	[AO + 2 * SIZE], a3
2150
2151	FADD2	c02, t3, c02
2152	FMUL	a4, b1, t3
2153	LDF	[BO + 0 * SIZE], b1
2154	FADD4	c06, t4, c06
2155	FMUL	a4, b2, t4
2156	LDF	[BO + 1 * SIZE], b2
2157	bg,pt	%icc, .LL126
/* Branch delay slot: a4 is reloaded on both the taken and fall-through
   path. */
2158	LDF	[AO + 3 * SIZE], a4
2159
/* Finish one two-row tile: drain the pipelined products, reduce c01-c08 to
   the four results c01-c04, apply the solve step for the selected variant,
   store, update pointers and KK, then loop back to .LL121 while I > 0. */
2160.LL129:
/* Fold the products still pending in t1-t4. */
2161	FADD1	c03, t1, c03
2162	FADD3	c07, t2, c07
2163	FADD2	c04, t3, c04
2164	FADD4	c08, t4, c08
2165
/* Pairwise reduction: (c01, c02) is the first pair, (c03, c04) the second. */
2166	FADD	c01, c06, c01
2167	FADD	c02, c05, c02
2168	FADD	c03, c08, c03
2169	FADD	c04, c07, c04
2170
/* LN/RT: re-point AO and BO using KK - 2 (LN) or KK - 1 (RT); AO is scaled
   by 1 + ZBASE_SHIFT and BO by ZBASE_SHIFT. */
2171#if defined(LN) || defined(RT)
2172#ifdef LN
2173	sub	KK, 2, TEMP1
2174#else
2175	sub	KK, 1, TEMP1
2176#endif
2177	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2
2178	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
2179	add	AORIG, TEMP2, AO
2180	add	B,     TEMP1, BO
2181#endif
2182
/* Load the four stored values (from BO for LN/LT, from AO for RN/RT) and
   replace each accumulator with (stored value) - (accumulated sum); assumes
   FSUB follows the SPARC x, y, dst = x - y operand order (macro defined
   outside this chunk). */
2183#if defined(LN) || defined(LT)
2184	LDF	[BO +  0 * SIZE], a1
2185	LDF	[BO +  1 * SIZE], a2
2186	LDF	[BO +  2 * SIZE], a3
2187	LDF	[BO +  3 * SIZE], a4
2188
2189	FSUB	a1, c01, c01
2190	FSUB	a2, c02, c02
2191	FSUB	a3, c03, c03
2192	FSUB	a4, c04, c04
2193#else
2194	LDF	[AO +  0 * SIZE], a1
2195	LDF	[AO +  1 * SIZE], a2
2196	LDF	[AO +  2 * SIZE], a3
2197	LDF	[AO +  3 * SIZE], a4
2198
2199	FSUB	a1, c01, c01
2200	FSUB	a2, c02, c02
2201	FSUB	a3, c03, c03
2202	FSUB	a4, c04, c04
2203#endif
2204
/* LN: 2x2 step processed from the second pair back to the first.
   (c03, c04) is scaled by the pair at AO[6..7]; its product with the pair at
   AO[4..5] is then removed from (c01, c02), which is finally scaled by the
   pair at AO[0..1].
   NOTE(review): the scaling pairs look like pre-inverted diagonal entries of
   the packed triangular block -- confirm against the packing routine. */
2205#ifdef LN
2206	LDF	[AO +  6 * SIZE], a1
2207	LDF	[AO +  7 * SIZE], a2
2208	LDF	[AO +  4 * SIZE], a3
2209	LDF	[AO +  5 * SIZE], a4
2210	LDF	[AO +  0 * SIZE], b1
2211	LDF	[AO +  1 * SIZE], b2
2212
2213	FMUL	a1, c03, t1
2214	FMUL	a2, c04, t2
2215	FMUL	a1, c04, t3
2216	FMUL	a2, c03, t4
2217
2218	FADD4	t1, t2, c03
2219	FADD2	t3, t4, c04
2220
2221	FMUL	a3, c03, t1
2222	FMUL	a3, c04, t2
2223
2224	FMUL	a4, c04, t5
2225	FMUL	a4, c03, t6
2226
2227	FSUB	c01, t1, c01
2228	FSUB	c02, t2, c02
2229
2230	FADD2	c01, t5, c01
2231	FADD4	c02, t6, c02
2232
2233	FMUL	b1, c01, t1
2234	FMUL	b2, c02, t2
2235	FMUL	b1, c02, t3
2236	FMUL	b2, c01, t4
2237
2238	FADD4	t1, t2, c01
2239	FADD2	t3, t4, c02
2240#endif
2241
/* LT: the mirror image of LN.  (c01, c02) is scaled by the pair at
   AO[0..1]; its product with the pair at AO[2..3] is removed from
   (c03, c04), which is then scaled by the pair at AO[6..7]. */
2242#ifdef LT
2243	LDF	[AO +  0 * SIZE], a1
2244	LDF	[AO +  1 * SIZE], a2
2245	LDF	[AO +  2 * SIZE], a3
2246	LDF	[AO +  3 * SIZE], a4
2247	LDF	[AO +  6 * SIZE], b1
2248	LDF	[AO +  7 * SIZE], b2
2249
2250	FMUL	a1, c01, t1
2251	FMUL	a2, c02, t2
2252	FMUL	a1, c02, t3
2253	FMUL	a2, c01, t4
2254
2255	FADD4	t1, t2, c01
2256	FADD2	t3, t4, c02
2257
2258	FMUL	a3, c01, t1
2259	FMUL	a3, c02, t2
2260	FMUL	a4, c02, t5
2261	FMUL	a4, c01, t6
2262
2263	FSUB	c03, t1, c03
2264	FSUB	c04, t2, c04
2265	FADD2	c03, t5, c03
2266	FADD4	c04, t6, c04
2267
2268	FMUL	b1, c03, t1
2269	FMUL	b2, c04, t2
2270	FMUL	b1, c04, t3
2271	FMUL	b2, c03, t4
2272
2273	FADD4	t1, t2, c03
2274	FADD2	t3, t4, c04
2275#endif
2276
/* RN and RT: both result pairs are scaled by the same single pair at
   BO[0..1]; the two variants are instruction-for-instruction identical
   here. */
2277#ifdef RN
2278	LDF	[BO +  0 * SIZE], a1
2279	LDF	[BO +  1 * SIZE], a2
2280
2281	FMUL	a1, c01, t1
2282	FMUL	a2, c02, t2
2283	FMUL	a1, c02, t3
2284	FMUL	a2, c01, t4
2285
2286	FMUL	a1, c03, t5
2287	FMUL	a2, c04, t6
2288	FMUL	a1, c04, t7
2289	FMUL	a2, c03, t8
2290
2291	FADD4	t1, t2, c01
2292	FADD3	t3, t4, c02
2293	FADD4	t5, t6, c03
2294	FADD3	t7, t8, c04
2295#endif
2296
2297#ifdef RT
2298	LDF	[BO +  0 * SIZE], a1
2299	LDF	[BO +  1 * SIZE], a2
2300
2301	FMUL	a1, c01, t1
2302	FMUL	a2, c02, t2
2303	FMUL	a1, c02, t3
2304	FMUL	a2, c01, t4
2305
2306	FMUL	a1, c03, t5
2307	FMUL	a2, c04, t6
2308	FMUL	a1, c04, t7
2309	FMUL	a2, c03, t8
2310
2311	FADD4	t1, t2, c01
2312	FADD3	t3, t4, c02
2313	FADD4	t5, t6, c03
2314	FADD3	t7, t8, c04
2315#endif
2316
/* LN walks C1 downwards: step back by two pairs before storing. */
2317#ifdef LN
2318	add	C1, -4 * SIZE, C1
2319#endif
2320
/* Write the solved values back to the buffer they were read from (BO for
   LN/LT, AO for RN/RT) and to the output at C1. */
2321#if defined(LN) || defined(LT)
2322	STF	c01, [BO +  0 * SIZE]
2323	STF	c02, [BO +  1 * SIZE]
2324	STF	c03, [BO +  2 * SIZE]
2325	STF	c04, [BO +  3 * SIZE]
2326#else
2327	STF	c01, [AO +  0 * SIZE]
2328	STF	c02, [AO +  1 * SIZE]
2329	STF	c03, [AO +  2 * SIZE]
2330	STF	c04, [AO +  3 * SIZE]
2331#endif
2332
2333	STF	c01, [C1 + 0 * SIZE]
2334	STF	c02, [C1 + 1 * SIZE]
2335	STF	c03, [C1 + 2 * SIZE]
2336	STF	c04, [C1 + 3 * SIZE]
2337
/* Clear t1-t4 (.LL121 clears them again on re-entry). */
2338	FMOV	FZERO, t1
2339	FMOV	FZERO, t2
2340	FMOV	FZERO, t3
2341	FMOV	FZERO, t4
2342
/* All variants except LN walk C1 upwards: step past the values just
   stored. */
2343#ifndef LN
2344	add	C1, 4 * SIZE, C1
2345#endif
2346
/* RT: advance AORIG by K << (1 + ZBASE_SHIFT). */
2347#ifdef RT
2348	sll	K, 1 + ZBASE_SHIFT, TEMP1
2349	add	AORIG, TEMP1, AORIG
2350#endif
2351
/* LT/RN: skip the (K - KK) entries not consumed by this tile; AO uses the
   two-row stride, BO the one-column stride. */
2352#if defined(LT) || defined(RN)
2353	sub	K, KK, TEMP1
2354	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2
2355	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
2356	add	AO, TEMP2, AO
2357	add	BO, TEMP1, BO
2358#endif
2359
/* KK moves by the tile height (2): up for LT, down for LN. */
2360#ifdef LT
2361	add	KK, 2, KK
2362#endif
2363
2364#ifdef LN
2365	sub	KK, 2, KK
2366#endif
2367
/* Next tile.  The delay-slot FMOV clears c03 whether or not the branch is
   taken; .LL121 also clears c03 as its first FMOV. */
2368	add	I, -1, I
2369	cmp	I, 0
2370	bg,pt	%icc, .LL121
2371	FMOV	FZERO, c03
2372
/* Epilogue for the single remaining column: same bookkeeping shape as the
   end of the two-column loop, with one-column strides.
   LN advances B by K << ZBASE_SHIFT; LT/RN take B from the running BO;
   RN/RT move KK by 1 (up for RN, down for RT). */
2373.LL199:
2374#ifdef LN
2375	sll	K, 0 + ZBASE_SHIFT, TEMP1
2376	add	B, TEMP1, B
2377#endif
2378
2379#if defined(LT) || defined(RN)
2380	mov	BO, B
2381#endif
2382
2383#ifdef RN
2384	add	KK, 1, KK
2385#endif
2386
2387#ifdef RT
2388	sub	KK, 1, KK
2389#endif
2390
/* Common exit, also reached directly from .LL100 when N & 1 is zero.
   `return %i7 + 8` transfers control back past the call site, and
   `clr %o0` sits in its delay slot.
   NOTE(review): per SPARC V9 the delay instruction of RETURN runs after the
   register window is restored, so this presumably zeroes the caller-visible
   return register -- confirm against the PROLOGUE macro and ABI in use.
   EPILOGUE is a macro defined outside this chunk. */
2391.LL999:
2392	return	%i7 + 8
2393	clr	%o0
2394
2395	EPILOGUE
2396