/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* ASSEMBLER is defined before common.h is pulled in (presumably so the
   header exposes its assembly-side macros; confirm in common.h). */
#define ASSEMBLER
#include "common.h"

/* Incoming arguments.  ARG1..ARG6 are supplied by common.h. */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
/* LDC is not a register argument; the prologue loads it from the
   caller's stack into this register. */
#define LDC	%r10

/* Loop counters (I, J) and working pointers into A, B and C. */
#define I	%r12
#define J	%r13
#define AO	%r14
#define BO	%r15
#define CO	%rbp

/* Running offset used to position AO/BO and to size the K loops. */
#define KK	%r11
/* Local spill slot, placed just above the six saved registers. */
#define AORIG	 48(%rsp)

/* Bytes the prologue reserves on the stack. */
#define STACKSIZE 64

/* Stack-passed arguments, addressed after %rsp has been lowered by
   STACKSIZE.  ALPHA is defined here but not referenced in this file. */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

/* Prefetch mnemonics.  PREFETCHW is defined but only PREFETCH is used
   by the loops in this file. */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance ahead of AO, in elements. */
#define PREFETCHSIZE (5 + 4 * 10)

74	PROLOGUE
75	PROFCODE
76
77	subq	$STACKSIZE, %rsp
78	movq	%rbx,  0(%rsp)
79	movq	%rbp,  8(%rsp)
80	movq	%r12, 16(%rsp)
81	movq	%r13, 24(%rsp)
82	movq	%r14, 32(%rsp)
83	movq	%r15, 40(%rsp)
84
85	movq	24 + STACKSIZE(%rsp), LDC
86
87#if defined(TRMMKERNEL) && !defined(LEFT)
88	movq	OFFSET, %rax
89	negq	%rax
90	movq	%rax, KK
91#endif
92
93	addq	$8 * SIZE, A
94	addq	$8 * SIZE, B
95
96	salq	$BASE_SHIFT, LDC
97
98#ifdef LN
99       movq	M, %rax
100       salq	$BASE_SHIFT, %rax
101       addq	%rax, C
102       imulq	K, %rax
103       addq	%rax, A
104#endif
105
106#ifdef RT
107       movq	N, %rax
108       salq	$BASE_SHIFT, %rax
109       imulq	K, %rax
110       addq	%rax, B
111
112       movq	N,   %rax
113       imulq	LDC, %rax
114       addq	%rax, C
115#endif
116
117#ifdef RN
118       movq	OFFSET, %rax
119       negq	%rax
120       movq	%rax, KK
121#endif
122
123#ifdef RT
124       movq	N, %rax
125       subq	OFFSET, %rax
126       movq	%rax, KK
127#endif
128
129	movq	N,  %rax
130	testq	$1, %rax
131	je	.L30
132
133#if defined(LT) || defined(RN)
134	movq	A, AO
135#else
136	movq	A, %rax
137	movq	%rax, AORIG
138#endif
139
140#ifdef RT
141	movq	K, %rax
142	salq	$0 + BASE_SHIFT, %rax
143	subq	%rax, B
144#endif
145
146#ifdef RT
147	subq	LDC, C
148#endif
149	movq	C, CO
150#ifndef RT
151	addq	LDC, C
152#endif
153
154#ifdef LN
155	movq	OFFSET, %rax
156	addq	M, %rax
157	movq	%rax, KK
158#endif
159
160#ifdef LT
161	movq	OFFSET, %rax
162	movq	%rax, KK
163#endif
164
165	movq	M,  I
166	sarq	$1, I
167	je	.L40
168	ALIGN_4
169
/*
 * .L31: tile of two elements along M by one column.
 *
 * Two x87 accumulators are kept on the register stack:
 *   st(0) <- sum of AO[2k]   * BO[k]
 *   st(1) <- sum of AO[2k+1] * BO[k]
 * FLD / FST are macros from common.h; the stack bookkeeping below only
 * balances if FST stores and pops (assumed - confirm in common.h).
 * The fsubp / fsubrp forms rely on the GNU assembler's AT&T operand
 * convention for the "%st, %st(i)" encodings.
 */
.L31:
#ifdef LN
	/* Step AORIG back by one tile (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Zero both accumulators. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 1 */
	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 2 */
	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 3 */
	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
	/* Remainder of the trip count (count & 3), one k per iteration. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3,  %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at the block being solved: KK - 2 for LN (tile
	   height), KK - 1 for RT (column count). */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Combine each accumulator with the stored value it updates:
	   read from BO for LN/LT and from AO for RN/RT. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* Triangular solve.  Diagonal entries are applied with fmul, so
	   they are presumably stored pre-inverted by the packing code
	   (TODO confirm). */
#ifdef LN
	/* x1 *= AO[-5]; x0 -= AO[-6] * x1; x0 *= AO[-8] */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	/* x0 *= AO[-8]; x1 -= AO[-7] * x0; x1 *= AO[-5] */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#if defined(RN) || defined(RT)
	/* Single column: scale both results by BO[-8]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write the results back into the packed buffer; fld %st makes a
	   copy so each value is still on the stack for the store to C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch, x1 is on top: store it first, then x0. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries of this tile in AO and BO. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance AORIG by one tile (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/*
 * .L40 / .L41: one leftover element along M (M odd) by one column.
 * A single accumulator lives in st(0).  FST is assumed to store and
 * pop (macro from common.h); the stack only balances under that
 * assumption.
 */
.L40:
	movq	 M, %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	/* Step AORIG back by K elements. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz

	/* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$4 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L42
	ALIGN_4

.L45:
	/* Remainder of the trip count (count & 3). */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3,  %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$1 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L46
	ALIGN_4

.L48:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at element KK - 1 (the same step for LN and RT
	   because the tile is 1x1). */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Combine the accumulator with the stored value it updates. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* Scale by the single diagonal entry: taken from AO for LN/LT and
	   from BO for RN/RT (applied with fmul, so presumably stored
	   pre-inverted - TODO confirm). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#if defined(RN) || defined(RT)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Duplicate the result, write one copy to the packed buffer... */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	/* ...and the other to C. */
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries in AO and BO. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* Advance AORIG by K elements. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L49: close the single-column pass - advance B and KK as each
 * variant requires.
 */
.L49:
#ifdef LN
	/* B += K elements (one packed column). */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	/* BO was walked through B by the tiles above; continue from it. */
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/*
 * .L30: two-column passes.  J = N / 2; nothing left to do when zero.
 */
.L30:
	movq	N,   %rax
	sarq	$1,  %rax
	movq	%rax, J
	je	.L999
	ALIGN_4

/*
 * .L01: set up one pass over a pair of columns.
 */
.L01:
	/* LT/RN read A forwards through AO; LN/RT keep the base of A in
	   the AORIG stack slot and derive AO from it for each tile. */
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by 2 * K elements (two packed columns). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* %rax = 2 * LDC: the byte distance across two columns of C. */
	leaq	(, LDC, 2), %rax

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M / 2 two-element tiles; skip to the one-element tail if none. */
	movq	M,  I
	sarq	$1, I
	je	.L20
	ALIGN_4

/*
 * .L11: tile of two elements along M by two columns.
 *
 * Four x87 accumulators are kept on the register stack.  Writing a0,a1
 * for the AO pair and b0,b1 for the BO pair of each k step:
 *   st(0) <- sum a0*b0     st(1) <- sum a0*b1
 *   st(2) <- sum a1*b0     st(3) <- sum a1*b1
 * FLD / FST are macros from common.h; the stack bookkeeping below only
 * balances if FST stores and pops (assumed - confirm in common.h).
 * The fsubp / fsubrp forms rely on the GNU assembler's AT&T operand
 * convention for the "%st, %st(i)" encodings.
 */
.L11:
#ifdef LN
	/* Step AORIG back by one tile (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Zero the four accumulators. */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0: a0*b0 is added first, then a1*b1, a1*b0 and a0*b1 are
	   folded into their accumulators by the three faddp. */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* k + 1 */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* k + 2 */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* k + 3 */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L12
	ALIGN_4

.L15:
	/* Remainder of the trip count (count & 3), one k per iteration. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3,  %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at the 2x2 block being solved (KK - 2 for both
	   LN and RT, since the tile is two wide in each direction). */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine each accumulator with the stored value it updates.  The
	   BO buffer (LN/LT) is read in accumulator order; the AO buffer
	   (RN/RT) pairs its second and third entries with st(2) and st(1)
	   respectively. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/* Triangular solve on the 2x2 block.  Below, x0..x3 name the
	   values in st(0)..st(3).  Diagonal entries are applied with fmul,
	   so they are presumably stored pre-inverted (TODO confirm). */
#ifdef LN
	/* x2,x3 *= AO[-5]; x0 -= AO[-6]*x2; x1 -= AO[-6]*x3;
	   x0,x1 *= AO[-8] */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* x0,x1 *= AO[-8]; x2 -= AO[-7]*x0; x3 -= AO[-7]*x1;
	   x2,x3 *= AO[-5] */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* x0,x2 *= BO[-8]; x1 -= BO[-7]*x0; x3 -= BO[-7]*x2;
	   x1,x3 *= BO[-5] */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* x1,x3 *= BO[-5]; x0 -= BO[-6]*x1; x2 -= BO[-6]*x3;
	   x0,x2 *= BO[-8] */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write the four results back into the packed buffer (each fld %st
	   makes a copy to store), then drain the stack into C.  The fxch
	   sequence fixes the order in which values are popped, which is why
	   the two variants list the stores to C differently. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries of this tile in AO and BO. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance AORIG by one tile (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/*
 * .L20 / .L21: one leftover element along M (M odd) by two columns.
 *
 * Two x87 accumulators are kept on the register stack:
 *   st(0) <- sum of AO[k] * BO[2k]
 *   st(1) <- sum of AO[k] * BO[2k+1]
 * FST is assumed to store and pop (macro from common.h).  The fsubp /
 * fsubrp forms rely on the GNU assembler's AT&T operand convention for
 * the "%st, %st(i)" encodings.
 */
.L20:
	movq	 M, %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* Step AORIG back by K elements. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Zero both accumulators. */
	fldz
	fldz

	/* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 1 */
	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 2 */
	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 3 */
	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
	/* Remainder of the trip count (count & 3), one k per iteration. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3,  %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at the block being solved: KK - 1 for LN (tile
	   height), KK - 2 for RT (column count). */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine each accumulator with the stored value it updates:
	   read from BO for LN/LT and from AO for RN/RT. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* Triangular solve.  Diagonal entries are applied with fmul, so
	   they are presumably stored pre-inverted (TODO confirm). */
#if defined(LN) || defined(LT)
	/* One element along M: scale both results by AO[-8]. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* x0 *= BO[-8]; x1 -= BO[-7] * x0; x1 *= BO[-5] */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* x1 *= BO[-5]; x0 -= BO[-6] * x1; x0 *= BO[-8] */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Write the results back into the packed buffer; fld %st makes a
	   copy so each value is still on the stack for the store to C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch, x1 (second column) is on top: store it first. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries in AO and BO. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* Advance AORIG by K elements. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L29: close one two-column pass - advance B and KK as each variant
 * requires, then loop while column pairs remain.
 */
.L29:
#ifdef LN
	/* B += 2 * K elements (two packed columns). */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	/* BO was walked through B by the tiles above; continue from it. */
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/*
 * .L999: common exit.  Restore the registers saved by the prologue,
 * release the STACKSIZE-byte frame and return.
 */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
