1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* NOTE(review): every line of this copy starts with a decimal line number  */
/* fused to its text (looks like an extraction artifact); presumably it has */
/* to be stripped before the file can be preprocessed -- confirm.           */
/* Dimensions and operand pointers are aliases of ARG1..ARG6. ARGn, SIZE,   */
/* BASE_SHIFT, FLD and FST are not defined in this file.                    */
42#define M	ARG1
43#define N	ARG2
44#define K	ARG3
45#define A	ARG4
46#define B	ARG5
47#define C	ARG6
/* LDC is loaded from the caller's stack and then scaled by BASE_SHIFT. */
48#define LDC	%r10
49
/* I counts row tiles, J counts column pairs; AO/BO/CO are the running */
/* pointers into A, B and C for the current tile.                      */
50#define I	%r12
51#define J	%r13
52#define AO	%r14
53#define BO	%r15
54#define	CO	%rbp
55
/* KK is the running offset counter; AORIG is a local slot in this frame. */
56#define KK	%r11
57#define AORIG	 48(%rsp)
58
/* Frame size: slots 0..40 hold saved registers, slot 48 holds AORIG. */
59#define STACKSIZE 64
60
/* Caller stack slots, addressed after the frame has been allocated. */
/* ALPHA is defined here but never referenced in this file.          */
61#define ALPHA	 8 + STACKSIZE(%rsp)
62#define OFFSET	32 + STACKSIZE(%rsp)
63
/* Prefetch mnemonics. PREFETCHW is defined but not used below; the C */
/* prefetches are spelled out literally under HAVE_3DNOW / HAVE_SSE.  */
64#ifdef OPTERON
65#define PREFETCH	prefetch
66#define PREFETCHW	prefetchw
67#else
68#define PREFETCH	prefetcht0
69#define PREFETCHW	prefetcht0
70#endif
71
/* Prefetch distance ahead of AO, in elements (multiplied by SIZE at use). */
72#define PREFETCHSIZE (5 + 4 * 10)
73
/* Kernel entry. Exactly one of LN / LT / RN / RT is expected to be defined  */
/* (presumably by the build system -- confirm); each selects a different     */
/* traversal direction and solve sequence below. All arithmetic is x87.      */
74	PROLOGUE
75	PROFCODE
76
77#ifdef WINDOWS_ABI
/* emms marks all x87 registers empty before the stack is used. */
78	emms
79#endif
80
/* Allocate the frame and save the callee-saved registers used as aliases. */
81	subq	$STACKSIZE, %rsp
82	movq	%rbx,  0(%rsp)
83	movq	%rbp,  8(%rsp)
84	movq	%r12, 16(%rsp)
85	movq	%r13, 24(%rsp)
86	movq	%r14, 32(%rsp)
87	movq	%r15, 40(%rsp)
88
/* Leading dimension of C comes from the caller's stack. */
89	movq	24 + STACKSIZE(%rsp), LDC
90
91#if defined(TRMMKERNEL) && !defined(LEFT)
92	movq	OFFSET, %rax
93	negq	%rax
94	movq	%rax, KK
95#endif
96
/* Bias A and B by 8 elements; every later access uses offsets that start */
/* at -8 * SIZE relative to AO / BO.                                       */
97	addq	$8 * SIZE, A
98	addq	$8 * SIZE, B
99
/* Convert LDC from elements to bytes. */
100	salq	$BASE_SHIFT, LDC
101
102#ifdef LN
/* LN: C += M elements, A += M * K elements (both in bytes). */
103       movq	M, %rax
104       salq	$BASE_SHIFT, %rax
105       addq	%rax, C
106       imulq	K, %rax
107       addq	%rax, A
108#endif
109
110#ifdef RT
/* RT: B += N * K elements, C += N * LDC bytes. */
111       movq	N, %rax
112       salq	$BASE_SHIFT, %rax
113       imulq	K, %rax
114       addq	%rax, B
115
116       movq	N,   %rax
117       imulq	LDC, %rax
118       addq	%rax, C
119#endif
120
121#ifdef RN
/* RN: KK = -OFFSET. */
122       movq	OFFSET, %rax
123       negq	%rax
124       movq	%rax, KK
125#endif
126
127#ifdef RT
/* RT: KK = N - OFFSET. */
128       movq	N, %rax
129       subq	OFFSET, %rax
130       movq	%rax, KK
131#endif
132
/* The single-column pass runs first and only when N is odd. */
133	movq	N,  %rax
134	testq	$1, %rax
135	je	.L30
136
/* Set-up for the single-column pass (reached only when N is odd). */
137#if defined(LT) || defined(RN)
138	movq	A, AO
139#else
/* LN/RT: keep the A base in AORIG; AO is recomputed from it for each tile. */
140	movq	A, %rax
141	movq	%rax, AORIG
142#endif
143
144#ifdef RT
/* RT: step B back by one column of K elements. */
145	movq	K, %rax
146	salq	$0 + BASE_SHIFT, %rax
147	subq	%rax, B
148#endif
149
/* CO = C column for this pass. RT steps C back first; the other variants */
/* advance C past the column after taking CO.                             */
150#ifdef RT
151	subq	LDC, C
152#endif
153	movq	C, CO
154#ifndef RT
155	addq	LDC, C
156#endif
157
158#ifdef LN
/* LN: KK = OFFSET + M. */
159	movq	OFFSET, %rax
160	addq	M, %rax
161	movq	%rax, KK
162#endif
163
164#ifdef LT
/* LT: KK = OFFSET. */
165	movq	OFFSET, %rax
166	movq	%rax, KK
167#endif
168
/* I = M >> 1 row pairs; skip to the odd-row tail when there are none. */
169	movq	M,  I
170	sarq	$1, I
171	je	.L40
172	ALIGN_4
173
/* .L31: 2x1 tile (two rows, one column); executed I = M >> 1 times. */
174.L31:
175#ifdef LN
/* LN: step AORIG back by a 2 x K panel. */
176       movq	K, %rax
177       salq	$1 + BASE_SHIFT, %rax
178       subq	%rax, AORIG
179#endif
180
181#if defined(LN) || defined(RT)
/* LN/RT: AO = AORIG + 2 * KK elements, BO = B + KK elements. */
182	movq	KK, %rax
183	salq	$BASE_SHIFT, %rax
184	movq	AORIG, AO
185	leaq	(AO, %rax, 2), AO
186	leaq	(B,  %rax, 1), BO
187#else
188	movq	B, BO
189#endif
190
/* Two zeroed accumulators: st0 for the AO[-8] row, st1 for the AO[-7] row. */
191	fldz
192	fldz
193
194#if   defined(HAVE_3DNOW)
195	prefetchw	2 * SIZE(CO)
196#elif defined(HAVE_SSE)
197	prefetchnta	2 * SIZE(CO)
198#endif
199
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled by 4. */
200#if defined(LT) || defined(RN)
201	movq	KK, %rax
202#else
203	movq	K,  %rax
204	subq	KK, %rax
205#endif
206	sarq	$2, %rax
207 	je	.L35
208	ALIGN_4
209
/* .L32: four k steps per iteration. Each step loads one B element, */
/* multiplies it by two A elements and adds into st0 / st1.         */
210.L32:
211	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
212
213	FLD	 -8 * SIZE(BO)
214	FLD	 -8 * SIZE(AO)
215	fmul	 %st(1), %st
216	faddp	 %st, %st(2)
217
/* fmulp consumes the B copy, so the stack is back to two entries after */
/* the following faddp.                                                 */
218	FLD	 -7 * SIZE(AO)
219	fmulp	 %st, %st(1)
220	faddp	 %st, %st(2)
221
222	FLD	 -7 * SIZE(BO)
223	FLD	 -6 * SIZE(AO)
224	fmul	 %st(1), %st
225	faddp	 %st, %st(2)
226
227	FLD	 -5 * SIZE(AO)
228	fmulp	 %st, %st(1)
229	faddp	 %st, %st(2)
230
231	FLD	 -6 * SIZE(BO)
232	FLD	 -4 * SIZE(AO)
233	fmul	 %st(1), %st
234	faddp	 %st, %st(2)
235
236	FLD	 -3 * SIZE(AO)
237	fmulp	 %st, %st(1)
238	faddp	 %st, %st(2)
239
240	FLD	 -5 * SIZE(BO)
241	FLD	 -2 * SIZE(AO)
242	fmul	 %st(1), %st
243	faddp	 %st, %st(2)
244
245	FLD	 -1 * SIZE(AO)
246	fmulp	 %st, %st(1)
247	faddp	 %st, %st(2)
248
/* Consumed 4 k steps: 8 A elements and 4 B elements. */
249	addq	$8 * SIZE,AO
250	addq	$4 * SIZE,BO
251
252	decq	%rax
253	jne	.L32
254	ALIGN_4
255
/* .L35: remaining (count & 3) k steps, one per iteration of .L36. */
256.L35:
257#if defined(LT) || defined(RN)
258	movq	KK, %rax
259#else
260	movq	K,  %rax
261	subq	KK, %rax
262#endif
263	and	$3,  %rax
264	je	.L38
265	ALIGN_4
266
267.L36:
268	FLD	 -8 * SIZE(BO)
269
270	FLD	 -8 * SIZE(AO)
271	fmul	 %st(1), %st
272	faddp	 %st, %st(2)
273
274	FLD	 -7 * SIZE(AO)
275	fmulp	 %st, %st(1)
276	faddp	 %st, %st(2)
277
278	addq	$2 * SIZE,AO
279	addq	$1 * SIZE,BO
280
281	decq	%rax
282	jne	 .L36
283	ALIGN_4
284
/* .L38: solve step for the 2x1 tile. */
285.L38:
286#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at the tile's own block: index KK - 2 (LN) or */
/* KK - 1 (RT), scaled by 2 elements for A and 1 element for B.        */
287	movq	KK, %rax
288#ifdef LN
289	subq	$2, %rax
290#else
291	subq	$1, %rax
292#endif
293
294	salq	$BASE_SHIFT, %rax
295
296	movq	AORIG, AO
297	leaq	(AO, %rax, 2), AO
298	leaq	(B,  %rax, 1), BO
299#endif
300
/* Combine each accumulator with the stored right-hand side via fsubp:   */
/* read from BO for LN/LT, from AO for RN/RT. Presumably yields rhs - acc */
/* (GNU as has special fsub/fsubr operand handling) -- confirm.          */
301#if defined(LN) || defined(LT)
302	FLD	-8 * SIZE(BO)
303	fsubp	%st, %st(1)
304	FLD	-7 * SIZE(BO)
305	fsubp	%st, %st(2)
306#else
307	FLD	-8 * SIZE(AO)
308	fsubp	%st, %st(1)
309	FLD	-7 * SIZE(AO)
310	fsubp	%st, %st(2)
311#endif
312
/* Diagonal entries are multiplied in, never divided; presumably they are */
/* stored pre-inverted by the packing code -- confirm.                    */
313#ifdef LN
/* LN: scale st1 by AO[-5], remove AO[-6] * st1 from st0, scale st0 by AO[-8]. */
314       FLD	-5 * SIZE(AO)
315       fmulp	%st, %st(2)
316
317       FLD	-6 * SIZE(AO)
318       fmul	%st(2), %st
319
320       fsubrp	%st, %st(1)
321       FLD	-8 * SIZE(AO)
322       fmulp	%st, %st(1)
323#endif
324
325#ifdef LT
/* LT: scale st0 by AO[-8], remove AO[-7] * st0 from st1, scale st1 by AO[-5]. */
326       FLD	-8 * SIZE(AO)
327       fmulp	%st, %st(1)
328
329       FLD	-7 * SIZE(AO)
330       fmul	%st(1), %st
331
332       fsubrp	%st, %st(2)
333
334       FLD	-5 * SIZE(AO)
335       fmulp	%st, %st(2)
336#endif
337
338#ifdef RN
/* RN: single column, so both values are just scaled by BO[-8]. */
339       FLD	-8 * SIZE(BO)
340       fmul	%st, %st(1)
341       fmulp	%st, %st(2)
342#endif
343
344#ifdef RT
/* RT: same scaling by BO[-8]. */
345       FLD	-8 * SIZE(BO)
346       fmul	%st, %st(1)
347       fmulp	%st, %st(2)
348#endif
349
350#ifdef LN
/* LN walks C backwards: step CO back before storing. */
351	subq	$2 * SIZE, CO
352#endif
353
/* Write the results back into the packed panel (BO for LN/LT, AO for */
/* RN/RT). Each value is duplicated with fld %st first because FST    */
/* appears to pop (macro defined elsewhere -- confirm).               */
354#if defined(LN) || defined(LT)
355	fld	%st
356	FST	-8 * SIZE(BO)
357	fxch	%st(1)
358	fld	%st
359	FST	-7 * SIZE(BO)
360#else
361	fld	%st
362	FST	-8 * SIZE(AO)
363	fxch	%st(1)
364	fld	%st
365	FST	-7 * SIZE(AO)
366#endif
367
/* After the fxch above st0 holds the second-row value, so C[1] is stored */
/* first, then C[0]; the x87 stack is empty afterwards.                   */
368	FST	1 * SIZE(CO)
369	FST	0 * SIZE(CO)
370
371#ifndef LN
372	addq	$2 * SIZE, CO
373#endif
374
375#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps of both panels. */
376	movq	K,  %rax
377	subq	KK, %rax
378	salq	$BASE_SHIFT, %rax
379	leaq	(AO, %rax, 2), AO
380	leaq	(BO, %rax, 1), BO
381#endif
382
383#ifdef LN
384	subq	$2, KK
385#endif
386
387#ifdef LT
388	addq	$2, KK
389#endif
390
391#ifdef RT
/* RT: advance AORIG by a 2 x K panel. */
392       movq	K, %rax
393       salq	$1 + BASE_SHIFT, %rax
394       addq	%rax, AORIG
395#endif
396
397	decq	I
398	jne	.L31
399	ALIGN_4
400
/* .L40: 1x1 tail tile of the single-column pass; runs only when M is odd. */
401.L40:
402	movq	 M, %rax
403	andq	$1, %rax
404	je	.L49
405	ALIGN_4
406
407.L41:
408#ifdef LN
/* LN: step AORIG back by a 1 x K panel. */
409       movq	K, %rax
410       salq	$0 + BASE_SHIFT, %rax
411       subq	%rax, AORIG
412#endif
413
414#if defined(LN) || defined(RT)
/* LN/RT: AO = AORIG + KK elements, BO = B + KK elements. */
415	movq	KK, %rax
416	salq	$BASE_SHIFT, %rax
417	movq	AORIG, AO
418	leaq	(AO, %rax, 1), AO
419	leaq	(B,  %rax, 1), BO
420#else
421	movq	B, BO
422#endif
423
/* Single accumulator in st0. */
424	fldz
425
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled by 4. */
426#if defined(LT) || defined(RN)
427	movq	KK, %rax
428#else
429	movq	K,  %rax
430	subq	KK, %rax
431#endif
432	sarq	$2, %rax
433 	je	.L45
434	ALIGN_4
435
/* .L42: four k steps per iteration, each adding AO[i] * BO[i] into st0. */
436.L42:
437	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
438
439	FLD	 -8 * SIZE(AO)
440	FLD	 -8 * SIZE(BO)
441	fmulp	 %st, %st(1)
442	faddp	 %st, %st(1)
443
444	FLD	 -7 * SIZE(AO)
445	FLD	 -7 * SIZE(BO)
446	fmulp	 %st, %st(1)
447	faddp	 %st, %st(1)
448
449	FLD	 -6 * SIZE(AO)
450	FLD	 -6 * SIZE(BO)
451	fmulp	 %st, %st(1)
452	faddp	 %st, %st(1)
453
454	FLD	 -5 * SIZE(AO)
455	FLD	 -5 * SIZE(BO)
456	fmulp	 %st, %st(1)
457	faddp	 %st, %st(1)
458
459	addq	$4 * SIZE,AO
460	addq	$4 * SIZE,BO
461
462	decq	%rax
463	jne	.L42
464	ALIGN_4
465
/* .L45: remaining (count & 3) k steps, one per iteration of .L46. */
466.L45:
467#if defined(LT) || defined(RN)
468	movq	KK, %rax
469#else
470	movq	K,  %rax
471	subq	KK, %rax
472#endif
473	and	$3,  %rax
474	je	.L48
475	ALIGN_4
476
477.L46:
478	FLD	 -8 * SIZE(AO)
479
480	FLD	 -8 * SIZE(BO)
481	fmulp	 %st, %st(1)
482	faddp	 %st, %st(1)
483
484	addq	$1 * SIZE,AO
485	addq	$1 * SIZE,BO
486
487	decq	%rax
488	jne	 .L46
489	ALIGN_4
490
/* .L48: solve step for the 1x1 tile. */
491.L48:
492#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at index KK - 1 (both preprocessor branches */
/* subtract 1 here).                                                 */
493	movq	KK, %rax
494#ifdef LN
495	subq	$1, %rax
496#else
497	subq	$1, %rax
498#endif
499
500	salq	$BASE_SHIFT, %rax
501
502	movq	AORIG, AO
503	leaq	(AO, %rax, 1), AO
504	leaq	(B,  %rax, 1), BO
505#endif
506
/* Combine the accumulator with the stored right-hand side via fsubp: */
/* read from BO for LN/LT, from AO for RN/RT.                         */
507#if defined(LN) || defined(LT)
508	FLD	-8 * SIZE(BO)
509	fsubp	%st, %st(1)
510#else
511	FLD	-8 * SIZE(AO)
512	fsubp	%st, %st(1)
513#endif
514
/* Scale by the single triangular entry: AO[-8] for LN/LT, BO[-8] for RN/RT. */
515#ifdef LN
516       FLD	-8 * SIZE(AO)
517       fmulp	%st, %st(1)
518#endif
519
520#ifdef LT
521       FLD	-8 * SIZE(AO)
522       fmulp	%st, %st(1)
523#endif
524
525#ifdef RN
526       FLD	-8 * SIZE(BO)
527       fmulp	%st, %st(1)
528#endif
529
530#ifdef RT
531       FLD	-8 * SIZE(BO)
532       fmulp	%st, %st(1)
533#endif
534
535#ifdef LN
/* LN walks C backwards: step CO back before storing. */
536	subq	$1 * SIZE, CO
537#endif
538
/* Write the result back into the packed panel, keeping a copy for C. */
539#if defined(LN) || defined(LT)
540	fld	%st
541	FST	-8 * SIZE(BO)
542#else
543	fld	%st
544	FST	-8 * SIZE(AO)
545#endif
546
547	FST	0 * SIZE(CO)
548
549#ifndef LN
550	addq	$1 * SIZE, CO
551#endif
552
553#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps of both panels. */
554	movq	K,  %rax
555	subq	KK, %rax
556	salq	$BASE_SHIFT, %rax
557	leaq	(AO, %rax, 1), AO
558	leaq	(BO, %rax, 1), BO
559#endif
560
561#ifdef LN
562	subq	$1, KK
563#endif
564
565#ifdef LT
566	addq	$1, KK
567#endif
568
569#ifdef RT
/* RT: advance AORIG by a 1 x K panel. */
570       movq	K, %rax
571       salq	$0 + BASE_SHIFT, %rax
572       addq	%rax, AORIG
573#endif
574	ALIGN_4
575
/* .L49: end of the single-column pass; update B and KK for the next pass. */
576.L49:
577#ifdef LN
/* LN: advance B by one column of K elements. */
578       movq	K, %rax
579       salq	$BASE_SHIFT, %rax
580       leaq	(B, %rax, 1), B
581#endif
582
583#if defined(LT) || defined(RN)
/* LT/RN: BO already points past the consumed column. */
584	movq	BO, B
585#endif
586
587#ifdef RN
588	addq	$1, KK
589#endif
590
591#ifdef RT
592	subq	$1, KK
593#endif
594	ALIGN_4
595
/* .L30: two-column passes; J = N >> 1. movq leaves the flags untouched, */
/* so the je below still tests the result of sarq.                       */
596.L30:
597	movq	N,   %rax
598	sarq	$1,  %rax
599	movq	%rax, J
600	je	.L999
601	ALIGN_4
602
/* .L01: per-column-pair set-up, repeated J times. */
603.L01:
604#if defined(LT) || defined(RN)
605	movq	A, AO
606#else
/* LN/RT: keep the A base in AORIG; AO is recomputed from it for each tile. */
607	movq	A, %rax
608	movq	%rax, AORIG
609#endif
610
611#ifdef RT
/* RT: step B back by two columns of K elements. */
612	movq	K, %rax
613	salq	$1 + BASE_SHIFT, %rax
614	subq	%rax, B
615#endif
616
/* rax = 2 * LDC bytes, the stride of a column pair in C. */
617	lea	(, LDC, 2), %rax
618
/* CO = C column pair for this pass. RT steps C back first; the other */
/* variants advance C afterwards.                                     */
619#ifdef RT
620	subq	%rax, C
621#endif
622	movq	C, CO
623#ifndef RT
624	addq	%rax, C
625#endif
626
627#ifdef LN
/* LN: KK = OFFSET + M. */
628	movq	OFFSET, %rax
629	addq	M, %rax
630	movq	%rax, KK
631#endif
632
633#ifdef LT
/* LT: KK = OFFSET. */
634	movq	OFFSET, %rax
635	movq	%rax, KK
636#endif
637
/* I = M >> 1 row pairs; skip to the odd-row tail when there are none. */
638	movq	M,  I
639	sarq	$1, I
640	je	.L20
641	ALIGN_4
642
/* .L11: 2x2 tile (two rows, two columns); executed I = M >> 1 times. */
643.L11:
644#ifdef LN
/* LN: step AORIG back by a 2 x K panel. */
645       movq	K, %rax
646       salq	$1 + BASE_SHIFT, %rax
647       subq	%rax, AORIG
648#endif
649
650#if defined(LN) || defined(RT)
/* LN/RT: AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
651	movq	KK, %rax
652	salq	$BASE_SHIFT, %rax
653	movq	AORIG, AO
654	leaq	(AO, %rax, 2), AO
655	leaq	(B,  %rax, 2), BO
656#else
657	movq	B, BO
658#endif
659
/* Four zeroed accumulators. With A0,A1 / B0,B1 the two elements read  */
/* from AO / BO per k step, the loops below maintain:                  */
/*   st0 += A0*B0, st1 += A0*B1, st2 += A1*B0, st3 += A1*B1.           */
660	fldz
661	fldz
662	fldz
663	fldz
664
665#if   defined(HAVE_3DNOW)
666	prefetchw	2 * SIZE(CO)
667 	prefetchw	2 * SIZE(CO, LDC, 1)
668#elif defined(HAVE_SSE)
669	prefetchnta	2 * SIZE(CO)
670 	prefetchnta	2 * SIZE(CO, LDC, 1)
671#endif
672
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled by 4. */
673#if defined(LT) || defined(RN)
674	movq	KK, %rax
675#else
676	movq	K,  %rax
677	subq	KK, %rax
678#endif
679	sarq	$2, %rax
680 	je	.L15
681	ALIGN_4
682
/* .L12: four k steps per iteration. Each step peaks at 8 live x87     */
/* entries (4 accumulators + 4 temporaries), i.e. the whole x87 stack, */
/* so the instruction order here must not be disturbed.                */
683.L12:
684	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
685
/* Load A0, B0; a copy of A0 times B0 is added into st0. */
686	FLD	 -8 * SIZE(AO)
687
688	FLD	 -8 * SIZE(BO)
689	fld	 %st(1)
690	fmul	 %st(1), %st
691	faddp	 %st, %st(3)
692
/* Load B1: the A0 slot becomes A0*B1. */
693	FLD	 -7 * SIZE(BO)
694	fmul	 %st, %st(2)
695
/* Load A1: the B0 slot becomes A1*B0 and the B1 slot becomes A1*B1. */
696	FLD	 -7 * SIZE(AO)
697	fmul	 %st, %st(2)
698	fmulp	 %st, %st(1)
699
/* Fold the three products into st3, st2 and st1 (in that order). */
700	faddp	 %st, %st(6)
701	faddp	 %st, %st(4)
702	faddp	 %st, %st(2)
703
704	FLD	 -6 * SIZE(AO)
705
706	FLD	 -6 * SIZE(BO)
707	fld	 %st(1)
708	fmul	 %st(1), %st
709	faddp	 %st, %st(3)
710
711	FLD	 -5 * SIZE(BO)
712	fmul	 %st, %st(2)
713
714	FLD	 -5 * SIZE(AO)
715	fmul	 %st, %st(2)
716	fmulp	 %st, %st(1)
717
718	faddp	 %st, %st(6)
719	faddp	 %st, %st(4)
720	faddp	 %st, %st(2)
721
722	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
723
724	FLD	 -4 * SIZE(AO)
725
726	FLD	 -4 * SIZE(BO)
727	fld	 %st(1)
728	fmul	 %st(1), %st
729	faddp	 %st, %st(3)
730
731	FLD	 -3 * SIZE(BO)
732	fmul	 %st, %st(2)
733
734	FLD	 -3 * SIZE(AO)
735	fmul	 %st, %st(2)
736	fmulp	 %st, %st(1)
737
738	faddp	 %st, %st(6)
739	faddp	 %st, %st(4)
740	faddp	 %st, %st(2)
741
742	FLD	 -2 * SIZE(AO)
743
744	FLD	 -2 * SIZE(BO)
745	fld	 %st(1)
746	fmul	 %st(1), %st
747	faddp	 %st, %st(3)
748
749	FLD	 -1 * SIZE(BO)
750	fmul	 %st, %st(2)
751
752	FLD	 -1 * SIZE(AO)
753	fmul	 %st, %st(2)
754	fmulp	 %st, %st(1)
755
756	faddp	 %st, %st(6)
757	faddp	 %st, %st(4)
758	faddp	 %st, %st(2)
759
/* Consumed 4 k steps: 8 A elements and 8 B elements. */
760	addq	$8 * SIZE,AO
761	addq	$8 * SIZE,BO
762
763	decq	%rax
764	jne	.L12
765	ALIGN_4
766
/* .L15: remaining (count & 3) k steps, one per iteration of .L16. */
767.L15:
768#if defined(LT) || defined(RN)
769	movq	KK, %rax
770#else
771	movq	K,  %rax
772	subq	KK, %rax
773#endif
774	and	$3,  %rax
775	je	.L18
776	ALIGN_4
777
778.L16:
779	FLD	 -8 * SIZE(AO)
780
781	FLD	 -8 * SIZE(BO)
782	fld	 %st(1)
783	fmul	 %st(1), %st
784	faddp	 %st, %st(3)
785
786	FLD	 -7 * SIZE(BO)
787	fmul	 %st, %st(2)
788
789	FLD	 -7 * SIZE(AO)
790	fmul	 %st, %st(2)
791	fmulp	 %st, %st(1)
792
793	faddp	 %st, %st(6)
794	faddp	 %st, %st(4)
795	faddp	 %st, %st(2)
796
797	addq	$2 * SIZE,AO
798	addq	$2 * SIZE,BO
799
800	decq	%rax
801	jne	 .L16
802	ALIGN_4
803
/* .L18: solve step for the 2x2 tile. */
804.L18:
805#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at index KK - 2 (both preprocessor branches */
/* subtract 2 here), scaled by 2 elements for A and for B.           */
806	movq	KK, %rax
807#ifdef LN
808	subq	$2, %rax
809#else
810	subq	$2, %rax
811#endif
812
813	salq	$BASE_SHIFT, %rax
814
815	movq	AORIG, AO
816	leaq	(AO, %rax, 2), AO
817	leaq	(B,  %rax, 2), BO
818#endif
819
/* Combine the accumulators with the stored right-hand side via fsubp.  */
/* LN/LT read BO[-8..-5] into st0,st1,st2,st3 in order; RN/RT read      */
/* AO[-8..-5] into st0,st2,st1,st3 (the two packings order the 2x2      */
/* block differently).                                                  */
820#if defined(LN) || defined(LT)
821	FLD	-8 * SIZE(BO)
822	fsubp	%st, %st(1)
823	FLD	-7 * SIZE(BO)
824	fsubp	%st, %st(2)
825	FLD	-6 * SIZE(BO)
826	fsubp	%st, %st(3)
827	FLD	-5 * SIZE(BO)
828	fsubp	%st, %st(4)
829#else
830	FLD	-8 * SIZE(AO)
831	fsubp	%st, %st(1)
832	FLD	-7 * SIZE(AO)
833	fsubp	%st, %st(3)
834	FLD	-6 * SIZE(AO)
835	fsubp	%st, %st(2)
836	FLD	-5 * SIZE(AO)
837	fsubp	%st, %st(4)
838#endif
839
/* Diagonal entries are multiplied in, never divided; presumably they are */
/* stored pre-inverted by the packing code -- confirm.                    */
840#ifdef LN
/* LN: scale st2,st3 by AO[-5]; remove AO[-6]*st2 from st0 and */
/* AO[-6]*st3 from st1; then scale st0,st1 by AO[-8].          */
841       FLD	-5 * SIZE(AO)
842       fmul	%st, %st(3)
843       fmulp	%st, %st(4)
844
845       FLD	-6 * SIZE(AO)
846       fmul	%st(3), %st
847       FLD	-6 * SIZE(AO)
848       fmul	%st(5), %st
849
850       fsubrp	%st, %st(3)
851       fsubrp	%st, %st(1)
852
853       FLD	-8 * SIZE(AO)
854       fmul	%st, %st(1)
855       fmulp	%st, %st(2)
856#endif
857
858#ifdef LT
/* LT: scale st0,st1 by AO[-8]; remove AO[-7]*st0 from st2 and */
/* AO[-7]*st1 from st3; then scale st2,st3 by AO[-5].          */
859       FLD	-8 * SIZE(AO)
860       fmul	%st, %st(1)
861       fmulp	%st, %st(2)
862
863       FLD	-7 * SIZE(AO)
864       fmul	%st(1), %st
865       FLD	-7 * SIZE(AO)
866       fmul	%st(3), %st
867
868       fsubrp	%st, %st(5)
869       fsubrp	%st, %st(3)
870
871       FLD	-5 * SIZE(AO)
872       fmul	%st, %st(3)
873       fmulp	%st, %st(4)
874#endif
875
876#ifdef RN
/* RN: scale st0,st2 by BO[-8]; remove BO[-7]*st0 from st1 and */
/* BO[-7]*st2 from st3; then scale st1,st3 by BO[-5].          */
877       FLD	-8 * SIZE(BO)
878       fmul	%st, %st(1)
879       fmulp	%st, %st(3)
880
881       FLD	-7 * SIZE(BO)
882       fmul	%st(1), %st
883       FLD	-7 * SIZE(BO)
884       fmul	%st(4), %st
885
886       fsubrp	%st, %st(5)
887       fsubrp	%st, %st(2)
888
889       FLD	-5 * SIZE(BO)
890       fmul	%st, %st(2)
891       fmulp	%st, %st(4)
892#endif
893
894#ifdef RT
/* RT: scale st1,st3 by BO[-5]; remove BO[-6]*st1 from st0 and */
/* BO[-6]*st3 from st2; then scale st0,st2 by BO[-8].          */
895       FLD	-5 * SIZE(BO)
896       fmul	%st, %st(2)
897       fmulp	%st, %st(4)
898
899       FLD	-6 * SIZE(BO)
900       fmul	%st(2), %st
901       FLD	-6 * SIZE(BO)
902       fmul	%st(5), %st
903
904       fsubrp	%st, %st(4)
905       fsubrp	%st, %st(1)
906
907       FLD	-8 * SIZE(BO)
908       fmul	%st, %st(1)
909       fmulp	%st, %st(3)
910#endif
911
912#ifdef LN
/* LN walks C backwards: step CO back before storing. */
913	subq	$2 * SIZE, CO
914#endif
915
/* Write the four results back into the packed panel and into C. The */
/* fxch sequence rotates each value to st0 in turn; the order of the */
/* final C stores follows the resulting stack order, which differs   */
/* between the two branches.                                         */
916#if defined(LN) || defined(LT)
917	fld	%st
918	FST	-8 * SIZE(BO)
919	fxch	%st(1)
920	fld	%st
921	FST	-7 * SIZE(BO)
922	fxch	%st(2)
923	fld	%st
924	FST	-6 * SIZE(BO)
925	fxch	%st(3)
926	fld	%st
927	FST	-5 * SIZE(BO)
928
929	FST	1 * SIZE(CO, LDC)
930	FST	0 * SIZE(CO)
931	FST	0 * SIZE(CO, LDC)
932	FST	1 * SIZE(CO)
933#else
934	fld	%st
935	FST	-8 * SIZE(AO)
936	fxch	%st(2)
937	fld	%st
938	FST	-7 * SIZE(AO)
939	fxch	%st(1)
940	fld	%st
941	FST	-6 * SIZE(AO)
942	fxch	%st(3)
943	fld	%st
944	FST	-5 * SIZE(AO)
945
946	FST	1 * SIZE(CO, LDC)
947	FST	1 * SIZE(CO)
948	FST	0 * SIZE(CO)
949	FST	0 * SIZE(CO, LDC)
950#endif
951
952#ifndef LN
953	addq	$2 * SIZE, CO
954#endif
955
956#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps of both panels. */
957	movq	K,  %rax
958	subq	KK, %rax
959	salq	$BASE_SHIFT, %rax
960	leaq	(AO, %rax, 2), AO
961	leaq	(BO, %rax, 2), BO
962#endif
963
964#ifdef LN
965	subq	$2, KK
966#endif
967
968#ifdef LT
969	addq	$2, KK
970#endif
971
972#ifdef RT
/* RT: advance AORIG by a 2 x K panel. */
973       movq	K, %rax
974       salq	$1 + BASE_SHIFT, %rax
975       addq	%rax, AORIG
976#endif
977
978	decq	I
979	jne	.L11
980	ALIGN_4
981
/* .L20: 1x2 tail tile of a two-column pass; runs only when M is odd. */
982.L20:
983	movq	 M, %rax
984	andq	$1, %rax
985	je	.L29
986	ALIGN_4
987
988.L21:
989#ifdef LN
/* LN: step AORIG back by a 1 x K panel. */
990       movq	K, %rax
991       salq	$0 + BASE_SHIFT, %rax
992       subq	%rax, AORIG
993#endif
994
995#if defined(LN) || defined(RT)
/* LN/RT: AO = AORIG + KK elements, BO = B + 2 * KK elements. */
996	movq	KK, %rax
997	salq	$BASE_SHIFT, %rax
998	movq	AORIG, AO
999	leaq	(AO, %rax, 1), AO
1000	leaq	(B,  %rax, 2), BO
1001#else
1002	movq	B, BO
1003#endif
1004
/* Two zeroed accumulators: st0 for the first B column, st1 for the second. */
1005	fldz
1006	fldz
1007
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled by 4. */
1008#if defined(LT) || defined(RN)
1009	movq	KK, %rax
1010#else
1011	movq	K,  %rax
1012	subq	KK, %rax
1013#endif
1014	sarq	$2, %rax
1015 	je	.L25
1016	ALIGN_4
1017
/* .L22: four k steps per iteration. Each step loads one A element, */
/* multiplies it by two B elements and adds into st0 / st1.         */
1018.L22:
1019	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
1020
1021	FLD	 -8 * SIZE(AO)
1022
1023	FLD	 -8 * SIZE(BO)
1024	fmul	 %st(1), %st
1025	faddp	 %st, %st(2)
1026
/* fmulp consumes the A copy, so the stack is back to two entries after */
/* the following faddp.                                                 */
1027	FLD	 -7 * SIZE(BO)
1028	fmulp	 %st, %st(1)
1029	faddp	 %st, %st(2)
1030
1031	FLD	 -7 * SIZE(AO)
1032
1033	FLD	 -6 * SIZE(BO)
1034	fmul	 %st(1), %st
1035	faddp	 %st, %st(2)
1036
1037	FLD	 -5 * SIZE(BO)
1038	fmulp	 %st, %st(1)
1039	faddp	 %st, %st(2)
1040
1041	FLD	 -6 * SIZE(AO)
1042
1043	FLD	 -4 * SIZE(BO)
1044	fmul	 %st(1), %st
1045	faddp	 %st, %st(2)
1046
1047	FLD	 -3 * SIZE(BO)
1048	fmulp	 %st, %st(1)
1049	faddp	 %st, %st(2)
1050
1051	FLD	 -5 * SIZE(AO)
1052
1053	FLD	 -2 * SIZE(BO)
1054	fmul	 %st(1), %st
1055	faddp	 %st, %st(2)
1056
1057	FLD	 -1 * SIZE(BO)
1058	fmulp	 %st, %st(1)
1059	faddp	 %st, %st(2)
1060
/* Consumed 4 k steps: 4 A elements and 8 B elements. */
1061	addq	$4 * SIZE,AO
1062	addq	$8 * SIZE,BO
1063
1064	decq	%rax
1065	jne	.L22
1066	ALIGN_4
1067
/* .L25: remaining (count & 3) k steps, one per iteration of .L26. */
1068.L25:
1069#if defined(LT) || defined(RN)
1070	movq	KK, %rax
1071#else
1072	movq	K,  %rax
1073	subq	KK, %rax
1074#endif
1075	and	$3,  %rax
1076	je	.L28
1077	ALIGN_4
1078
1079.L26:
1080	FLD	 -8 * SIZE(AO)
1081
1082	FLD	 -8 * SIZE(BO)
1083	fmul	 %st(1), %st
1084	faddp	 %st, %st(2)
1085
1086	FLD	 -7 * SIZE(BO)
1087	fmulp	 %st, %st(1)
1088	faddp	 %st, %st(2)
1089
1090	addq	$1 * SIZE,AO
1091	addq	$2 * SIZE,BO
1092
1093	decq	%rax
1094	jne	 .L26
1095	ALIGN_4
1096
/* .L28: solve step for the 1x2 tile. */
1097.L28:
1098#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at index KK - 1 (LN) or KK - 2 (RT), scaled by */
/* 1 element for A and 2 elements for B.                                */
1099	movq	KK, %rax
1100#ifdef LN
1101	subq	$1, %rax
1102#else
1103	subq	$2, %rax
1104#endif
1105
1106	salq	$BASE_SHIFT, %rax
1107
1108	movq	AORIG, AO
1109	leaq	(AO, %rax, 1), AO
1110	leaq	(B,  %rax, 2), BO
1111#endif
1112
/* Combine the accumulators with the stored right-hand side via fsubp: */
/* read from BO for LN/LT, from AO for RN/RT.                          */
1113#if defined(LN) || defined(LT)
1114	FLD	-8 * SIZE(BO)
1115	fsubp	%st, %st(1)
1116	FLD	-7 * SIZE(BO)
1117	fsubp	%st, %st(2)
1118#else
1119	FLD	-8 * SIZE(AO)
1120	fsubp	%st, %st(1)
1121	FLD	-7 * SIZE(AO)
1122	fsubp	%st, %st(2)
1123#endif
1124
1125#if defined(LN) || defined(LT)
/* LN/LT: single row, so both values are just scaled by AO[-8]. */
1126       FLD	-8 * SIZE(AO)
1127       fmul	%st, %st(1)
1128       fmulp	%st, %st(2)
1129#endif
1130
1131#ifdef RN
/* RN: scale st0 by BO[-8], remove BO[-7] * st0 from st1, scale st1 by BO[-5]. */
1132       FLD	-8 * SIZE(BO)
1133       fmulp	%st, %st(1)
1134
1135       FLD	-7 * SIZE(BO)
1136       fmul	%st(1), %st
1137
1138       fsubrp	%st, %st(2)
1139
1140       FLD	-5 * SIZE(BO)
1141       fmulp	%st, %st(2)
1142#endif
1143
1144#ifdef RT
/* RT: scale st1 by BO[-5], remove BO[-6] * st1 from st0, scale st0 by BO[-8]. */
1145       FLD	-5 * SIZE(BO)
1146       fmulp	%st, %st(2)
1147
1148       FLD	-6 * SIZE(BO)
1149       fmul	%st(2), %st
1150
1151       fsubrp	%st, %st(1)
1152
1153       FLD	-8 * SIZE(BO)
1154       fmulp	%st, %st(1)
1155#endif
1156
1157#ifdef LN
/* LN walks C backwards: step CO back before storing. */
1158	subq	$1 * SIZE, CO
1159#endif
1160
/* Write the results back into the packed panel, keeping copies for C. */
1161#if defined(LN) || defined(LT)
1162	fld	%st
1163	FST	-8 * SIZE(BO)
1164	fxch	%st(1)
1165	fld	%st
1166	FST	-7 * SIZE(BO)
1167#else
1168	fld	%st
1169	FST	-8 * SIZE(AO)
1170	fxch	%st(1)
1171	fld	%st
1172	FST	-7 * SIZE(AO)
1173#endif
1174
/* After the fxch st0 holds the second-column value, so CO + LDC is */
/* stored first, then CO.                                           */
1175	FST	0 * SIZE(CO, LDC)
1176	FST	0 * SIZE(CO)
1177
1178#ifndef LN
1179	addq	$1 * SIZE, CO
1180#endif
1181
1182#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps of both panels. */
1183	movq	K,  %rax
1184	subq	KK, %rax
1185	salq	$BASE_SHIFT, %rax
1186	leaq	(AO, %rax, 1), AO
1187	leaq	(BO, %rax, 2), BO
1188#endif
1189
1190#ifdef LN
1191	subq	$1, KK
1192#endif
1193
1194#ifdef LT
1195	addq	$1, KK
1196#endif
1197
1198#ifdef RT
/* RT: advance AORIG by a 1 x K panel. */
1199       movq	K, %rax
1200       salq	$0 + BASE_SHIFT, %rax
1201       addq	%rax, AORIG
1202#endif
1203	ALIGN_4
1204
/* .L29: end of one two-column pass; update B and KK, then loop on J. */
1205.L29:
1206#ifdef LN
/* LN: advance B by two columns of K elements. */
1207       movq	K, %rax
1208       salq	$BASE_SHIFT, %rax
1209       leaq	(B, %rax, 2), B
1210#endif
1211
1212#if defined(LT) || defined(RN)
/* LT/RN: BO already points past the consumed column pair. */
1213	movq	BO, B
1214#endif
1215
1216#ifdef RN
1217	addq	$2, KK
1218#endif
1219
1220#ifdef RT
1221	subq	$2, KK
1222#endif
1223
1224	decq	J
1225	jne	.L01
1226	ALIGN_4
1227
/* .L999: common exit. Restore the registers saved at entry from frame */
/* slots 0..40, release the frame and return.                          */
1228.L999:
1229	movq	  0(%rsp), %rbx
1230	movq	  8(%rsp), %rbp
1231	movq	 16(%rsp), %r12
1232	movq	 24(%rsp), %r13
1233	movq	 32(%rsp), %r14
1234	movq	 40(%rsp), %r15
1235	addq	$STACKSIZE, %rsp
1236	ret
1237
1238	EPILOGUE
1239