1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
/*
 * Build-time aliases for the kernel in this file.  ARG1..ARG6 and the
 * other upper-case helpers used later (SIZE, BASE_SHIFT, FLD, FST,
 * PROLOGUE, ...) are not defined here; they are expected to come from
 * common.h.
 */
22#define ASSEMBLER
23#include "common.h"
24
/* The first six arguments, by position. */
25#define M	ARG1
26#define N	ARG2
27#define K	ARG3
28#define A	ARG4
29#define B	ARG5
30#define C	ARG6
/* Filled from the caller's stack and shifted by BASE_SHIFT in the prologue. */
31#define LDC	%r10
32
/* I and J are loop counters derived from M and N; AO, BO and CO are the */
/* working pointers that walk through A, B and C.                        */
33#define I	%r12
34#define J	%r13
35#define AO	%r14
36#define BO	%r15
37#define	CO	%rbp
38
/* KK is the running offset that decides the inner-loop trip count      */
/* (KK or K - KK, depending on the variant).  AORIG is a local stack    */
/* slot that holds the A panel base in the variants that do not use     */
/* AO = A directly.                                                     */
39#define KK	%r11
40#define AORIG	 48(%rsp)
41
/* Local frame size: offsets 0..40 hold saved registers, 48 is AORIG. */
42#define STACKSIZE 64
43
/* Stack-passed arguments, addressed above the local frame.  ALPHA is */
/* defined but not referenced by the code visible in this file.       */
44#define ALPHA	 8 + STACKSIZE(%rsp)
45#define OFFSET	32 + STACKSIZE(%rsp)
46
/* Prefetch mnemonics per target.  Only PREFETCH is used below. */
47#ifdef OPTERON
48#define PREFETCH	prefetch
49#define PREFETCHW	prefetchw
50#else
51#define PREFETCH	prefetcht0
52#define PREFETCHW	prefetcht0
53#endif
54
/* Prefetch distance, in elements, ahead of AO. */
55#define PREFETCHSIZE (5 + 4 * 10)
56
/*
 * Entry.  Build a STACKSIZE-byte frame, save rbx, rbp and r12-r15, then
 * pre-adjust A, B, C and KK for the variant selected at build time
 * (LN / LT / RN / RT).  Finally dispatch on the low bit of N: a single
 * leftover column is processed first, column pairs afterwards (.L30).
 */
57	PROLOGUE
58	PROFCODE
59
60	subq	$STACKSIZE, %rsp
61	movq	%rbx,  0(%rsp)
62	movq	%rbp,  8(%rsp)
63	movq	%r12, 16(%rsp)
64	movq	%r13, 24(%rsp)
65	movq	%r14, 32(%rsp)
66	movq	%r15, 40(%rsp)
67
/* LDC is passed on the stack, 24 bytes above the local frame. */
68	movq	24 + STACKSIZE(%rsp), LDC
69
70#if defined(TRMMKERNEL) && !defined(LEFT)
/* KK = -OFFSET */
71	movq	OFFSET, %rax
72	negq	%rax
73	movq	%rax, KK
74#endif
75
/* Bias A and B by 8 elements; every later access uses displacements */
/* starting at -8 * SIZE to compensate.                              */
76	addq	$8 * SIZE, A
77	addq	$8 * SIZE, B
78
/* Scale LDC by 2^BASE_SHIFT (presumably elements -> bytes; confirm in common.h). */
79	salq	$BASE_SHIFT, LDC
80
81#ifdef LN
/* C += M scaled elements; A += M * K scaled elements. */
82       movq	M, %rax
83       salq	$BASE_SHIFT, %rax
84       addq	%rax, C
85       imulq	K, %rax
86       addq	%rax, A
87#endif
88
89#ifdef RT
/* B += N * K scaled elements; C += N * LDC.  The RT path then steps */
/* both pointers backwards before each column block.                 */
90       movq	N, %rax
91       salq	$BASE_SHIFT, %rax
92       imulq	K, %rax
93       addq	%rax, B
94
95       movq	N,   %rax
96       imulq	LDC, %rax
97       addq	%rax, C
98#endif
99
100#ifdef RN
/* KK = -OFFSET */
101       movq	OFFSET, %rax
102       negq	%rax
103       movq	%rax, KK
104#endif
105
106#ifdef RT
/* KK = N - OFFSET */
107       movq	N, %rax
108       subq	OFFSET, %rax
109       movq	%rax, KK
110#endif
111
/* If bit 0 of N is clear there is no leftover single column. */
112	movq	N,  %rax
113	testq	$1, %rax
114	je	.L30
115
/*
 * Set-up for the single leftover column (reached when bit 0 of N is set).
 * Selects the A base, steps B / C for the RT variant, initialises KK for
 * the left-side variants and computes the number of 2-row tiles in I.
 */
116#if defined(LT) || defined(RN)
117	movq	A, AO
118#else
/* LN / RT: keep the A base in the AORIG stack slot instead. */
119	movq	A, %rax
120	movq	%rax, AORIG
121#endif
122
123#ifdef RT
/* B -= K scaled elements (one column of B). */
124	movq	K, %rax
125	salq	$0 + BASE_SHIFT, %rax
126	subq	%rax, B
127#endif
128
/* CO = column to write; C moves back one column before (RT) or */
/* forward one column after (all other variants).               */
129#ifdef RT
130	subq	LDC, C
131#endif
132	movq	C, CO
133#ifndef RT
134	addq	LDC, C
135#endif
136
137#ifdef LN
/* KK = OFFSET + M */
138	movq	OFFSET, %rax
139	addq	M, %rax
140	movq	%rax, KK
141#endif
142
143#ifdef LT
/* KK = OFFSET */
144	movq	OFFSET, %rax
145	movq	%rax, KK
146#endif
147
/* I = M / 2; skip the 2-row tiles when that is zero. */
148	movq	M,  I
149	sarq	$1, I
150	je	.L40
151	ALIGN_4
152
/*
 * .L31: start of one 2-row by 1-column tile.  Positions AO / BO, clears
 * two x87 accumulators and computes the unrolled trip count.
 */
153.L31:
154#ifdef LN
/* AORIG -= 2 * K scaled elements (step back one 2-row panel of A). */
155       movq	K, %rax
156       salq	$1 + BASE_SHIFT, %rax
157       subq	%rax, AORIG
158#endif
159
160#if defined(LN) || defined(RT)
/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
161	movq	KK, %rax
162	salq	$BASE_SHIFT, %rax
163	movq	AORIG, AO
164	leaq	(AO, %rax, 2), AO
165	leaq	(B,  %rax, 1), BO
166#else
167	movq	B, BO
168#endif
169
/* Two zeroed accumulators on the x87 stack, one per row of the tile. */
170	fldz
171	fldz
172
173#if   defined(HAVE_3DNOW)
174	prefetchw	2 * SIZE(CO)
175#elif defined(HAVE_SSE)
176	prefetchnta	2 * SIZE(CO)
177#endif
178
/* Inner length: KK for LT / RN, K - KK for LN / RT. */
179#if defined(LT) || defined(RN)
180	movq	KK, %rax
181#else
182	movq	K,  %rax
183	subq	KK, %rax
184#endif
/* The main loop consumes four k-steps per pass; skip it when length / 4 == 0. */
185	sarq	$2, %rax
186 	je	.L35
187	ALIGN_4
188
/*
 * .L32: 2x1 inner-product loop, four k-steps per pass.
 * x87 stack between steps (top first): c0 c1.
 * Each step pushes one B value b and two A values a0, a1 and performs
 * c0 += a0 * b, c1 += a1 * b; b is consumed by the fmulp of the second
 * product.  (Assumes the common.h macro FLD pushes one value.)
 */
189.L32:
190	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
191
/* k-step 0 */
192	FLD	 -8 * SIZE(BO)
193	FLD	 -8 * SIZE(AO)
194	fmul	 %st(1), %st
195	faddp	 %st, %st(2)
196
197	FLD	 -7 * SIZE(AO)
198	fmulp	 %st, %st(1)
199	faddp	 %st, %st(2)
200
/* k-step 1 */
201	FLD	 -7 * SIZE(BO)
202	FLD	 -6 * SIZE(AO)
203	fmul	 %st(1), %st
204	faddp	 %st, %st(2)
205
206	FLD	 -5 * SIZE(AO)
207	fmulp	 %st, %st(1)
208	faddp	 %st, %st(2)
209
/* k-step 2 */
210	FLD	 -6 * SIZE(BO)
211	FLD	 -4 * SIZE(AO)
212	fmul	 %st(1), %st
213	faddp	 %st, %st(2)
214
215	FLD	 -3 * SIZE(AO)
216	fmulp	 %st, %st(1)
217	faddp	 %st, %st(2)
218
/* k-step 3 */
219	FLD	 -5 * SIZE(BO)
220	FLD	 -2 * SIZE(AO)
221	fmul	 %st(1), %st
222	faddp	 %st, %st(2)
223
224	FLD	 -1 * SIZE(AO)
225	fmulp	 %st, %st(1)
226	faddp	 %st, %st(2)
227
/* Consumed 8 elements of A (2 rows x 4 steps) and 4 elements of B. */
228	addq	$8 * SIZE,AO
229	addq	$4 * SIZE,BO
230
231	decq	%rax
232	jne	.L32
233	ALIGN_4
234
/*
 * .L35 / .L36: remainder of the 2x1 inner product, one k-step per pass,
 * for the (length mod 4) steps the unrolled loop did not cover.
 */
235.L35:
/* Recompute the inner length: KK for LT / RN, K - KK for LN / RT. */
236#if defined(LT) || defined(RN)
237	movq	KK, %rax
238#else
239	movq	K,  %rax
240	subq	KK, %rax
241#endif
242	and	$3,  %rax
243	je	.L38
244	ALIGN_4
245
246.L36:
/* Push b; c0 += AO[-8] * b; c1 += AO[-7] * b (b consumed by fmulp). */
247	FLD	 -8 * SIZE(BO)
248
249	FLD	 -8 * SIZE(AO)
250	fmul	 %st(1), %st
251	faddp	 %st, %st(2)
252
253	FLD	 -7 * SIZE(AO)
254	fmulp	 %st, %st(1)
255	faddp	 %st, %st(2)
256
257	addq	$2 * SIZE,AO
258	addq	$1 * SIZE,BO
259
260	decq	%rax
261	jne	 .L36
262	ALIGN_4
263
/*
 * .L38: solve and write back the 2-row by 1-column tile.
 * On entry the x87 stack holds the two accumulated products, top first:
 * c0 (built from the first A element of each pair) and c1 (the second).
 * Stack comments assume the common.h macros FLD push and FST store-and-pop.
 * NOTE(review): the fsubp / fsubrp lines rely on GNU as's handling of the
 * AT&T fsub / fsubr register forms -- confirm the operand direction with
 * the assembler in use before touching them.
 * NOTE(review): diagonal entries are multiplied in rather than divided by,
 * so the packed triangle presumably stores reciprocals -- confirm against
 * the packing routine.
 */
264.L38:
265#if defined(LN) || defined(RT)
/* Re-point AO / BO at the packed entries of this tile: offset KK - 2 */
/* (LN) or KK - 1 (RT) elements from the panel bases.                 */
266	movq	KK, %rax
267#ifdef LN
268	subq	$2, %rax
269#else
270	subq	$1, %rax
271#endif
272
273	salq	$BASE_SHIFT, %rax
274
275	movq	AORIG, AO
276	leaq	(AO, %rax, 2), AO
277	leaq	(B,  %rax, 1), BO
278#endif
279
/* Combine each accumulator with its packed right-hand-side value by   */
/* subtraction: from BO for the left-side variants, from AO otherwise. */
280#if defined(LN) || defined(LT)
281	FLD	-8 * SIZE(BO)
282	fsubp	%st, %st(1)
283	FLD	-7 * SIZE(BO)
284	fsubp	%st, %st(2)
285#else
286	FLD	-8 * SIZE(AO)
287	fsubp	%st, %st(1)
288	FLD	-7 * SIZE(AO)
289	fsubp	%st, %st(2)
290#endif
291
292#ifdef LN
/* c1 *= AO[-5]; c0 is reduced by AO[-6] * c1; c0 *= AO[-8]. */
293       FLD	-5 * SIZE(AO)
294       fmulp	%st, %st(2)
295
296       FLD	-6 * SIZE(AO)
297       fmul	%st(2), %st
298
299       fsubrp	%st, %st(1)
300       FLD	-8 * SIZE(AO)
301       fmulp	%st, %st(1)
302#endif
303
304#ifdef LT
/* c0 *= AO[-8]; c1 is reduced by AO[-7] * c0; c1 *= AO[-5]. */
305       FLD	-8 * SIZE(AO)
306       fmulp	%st, %st(1)
307
308       FLD	-7 * SIZE(AO)
309       fmul	%st(1), %st
310
311       fsubrp	%st, %st(2)
312
313       FLD	-5 * SIZE(AO)
314       fmulp	%st, %st(2)
315#endif
316
317#ifdef RN
/* Single column: scale both rows by BO[-8]. */
318       FLD	-8 * SIZE(BO)
319       fmul	%st, %st(1)
320       fmulp	%st, %st(2)
321#endif
322
323#ifdef RT
/* Single column: scale both rows by BO[-8]. */
324       FLD	-8 * SIZE(BO)
325       fmul	%st, %st(1)
326       fmulp	%st, %st(2)
327#endif
328
329#ifdef LN
/* LN walks C backwards: step CO down before storing. */
330	subq	$2 * SIZE, CO
331#endif
332
/* Write the results back into the packed buffer.  Each `fld %st` makes */
/* a copy for FST to consume; fxch brings c1 to the top, so afterwards  */
/* the stack is (top first) c1 c0.                                      */
333#if defined(LN) || defined(LT)
334	fld	%st
335	FST	-8 * SIZE(BO)
336	fxch	%st(1)
337	fld	%st
338	FST	-7 * SIZE(BO)
339#else
340	fld	%st
341	FST	-8 * SIZE(AO)
342	fxch	%st(1)
343	fld	%st
344	FST	-7 * SIZE(AO)
345#endif
346
/* Store to C: c1 to element 1, then c0 to element 0; the stack is empty again. */
347	FST	1 * SIZE(CO)
348	FST	0 * SIZE(CO)
349
350#ifndef LN
351	addq	$2 * SIZE, CO
352#endif
353
354#if defined(LT) || defined(RN)
/* Skip the remaining K - KK k-steps of this panel in AO (2 rows) and BO (1 column). */
355	movq	K,  %rax
356	subq	KK, %rax
357	salq	$BASE_SHIFT, %rax
358	leaq	(AO, %rax, 2), AO
359	leaq	(BO, %rax, 1), BO
360#endif
361
362#ifdef LN
363	subq	$2, KK
364#endif
365
366#ifdef LT
367	addq	$2, KK
368#endif
369
370#ifdef RT
/* AORIG += 2 * K scaled elements (next 2-row panel of A). */
371       movq	K, %rax
372       salq	$1 + BASE_SHIFT, %rax
373       addq	%rax, AORIG
374#endif
375
/* Next 2-row tile of this column. */
376	decq	I
377	jne	.L31
378	ALIGN_4
379
/*
 * .L40 / .L41: leftover single row of the single column (1x1 tile),
 * taken only when bit 0 of M is set.  Positions AO / BO, clears one
 * accumulator and computes the unrolled trip count.
 */
380.L40:
381	movq	 M, %rax
382	andq	$1, %rax
383	je	.L49
384	ALIGN_4
385
386.L41:
387#ifdef LN
/* AORIG -= K scaled elements (step back one 1-row panel of A). */
388       movq	K, %rax
389       salq	$0 + BASE_SHIFT, %rax
390       subq	%rax, AORIG
391#endif
392
393#if defined(LN) || defined(RT)
/* AO = AORIG + KK elements, BO = B + KK elements. */
394	movq	KK, %rax
395	salq	$BASE_SHIFT, %rax
396	movq	AORIG, AO
397	leaq	(AO, %rax, 1), AO
398	leaq	(B,  %rax, 1), BO
399#else
400	movq	B, BO
401#endif
402
/* One zeroed accumulator. */
403	fldz
404
/* Inner length: KK for LT / RN, K - KK for LN / RT. */
405#if defined(LT) || defined(RN)
406	movq	KK, %rax
407#else
408	movq	K,  %rax
409	subq	KK, %rax
410#endif
411	sarq	$2, %rax
412 	je	.L45
413	ALIGN_4
414
/*
 * .L42 / .L45 / .L46: 1x1 inner product.  The single accumulator c stays
 * at the bottom of the x87 stack; every step pushes a and b, multiplies
 * them and adds the product into c.  .L42 runs four steps per pass,
 * .L46 handles the (length mod 4) leftover steps one at a time.
 */
415.L42:
416	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
417
418	FLD	 -8 * SIZE(AO)
419	FLD	 -8 * SIZE(BO)
420	fmulp	 %st, %st(1)
421	faddp	 %st, %st(1)
422
423	FLD	 -7 * SIZE(AO)
424	FLD	 -7 * SIZE(BO)
425	fmulp	 %st, %st(1)
426	faddp	 %st, %st(1)
427
428	FLD	 -6 * SIZE(AO)
429	FLD	 -6 * SIZE(BO)
430	fmulp	 %st, %st(1)
431	faddp	 %st, %st(1)
432
433	FLD	 -5 * SIZE(AO)
434	FLD	 -5 * SIZE(BO)
435	fmulp	 %st, %st(1)
436	faddp	 %st, %st(1)
437
438	addq	$4 * SIZE,AO
439	addq	$4 * SIZE,BO
440
441	decq	%rax
442	jne	.L42
443	ALIGN_4
444
445.L45:
/* Recompute the inner length and keep only its low two bits. */
446#if defined(LT) || defined(RN)
447	movq	KK, %rax
448#else
449	movq	K,  %rax
450	subq	KK, %rax
451#endif
452	and	$3,  %rax
453	je	.L48
454	ALIGN_4
455
456.L46:
457	FLD	 -8 * SIZE(AO)
458
459	FLD	 -8 * SIZE(BO)
460	fmulp	 %st, %st(1)
461	faddp	 %st, %st(1)
462
463	addq	$1 * SIZE,AO
464	addq	$1 * SIZE,BO
465
466	decq	%rax
467	jne	 .L46
468	ALIGN_4
469
/*
 * .L48: solve and write back the 1x1 tile.  The x87 stack holds the one
 * accumulated product c.  Stack comments assume the common.h macros FLD
 * push and FST store-and-pop.
 * NOTE(review): the fsubp line relies on GNU as's handling of the AT&T
 * fsub / fsubr register forms -- confirm operand direction before editing.
 */
470.L48:
471#if defined(LN) || defined(RT)
/* Re-point AO / BO at offset KK - 1 elements; both branches of the */
/* inner #ifdef subtract the same constant for this tile shape.     */
472	movq	KK, %rax
473#ifdef LN
474	subq	$1, %rax
475#else
476	subq	$1, %rax
477#endif
478
479	salq	$BASE_SHIFT, %rax
480
481	movq	AORIG, AO
482	leaq	(AO, %rax, 1), AO
483	leaq	(B,  %rax, 1), BO
484#endif
485
/* Combine c with the packed right-hand-side value by subtraction. */
486#if defined(LN) || defined(LT)
487	FLD	-8 * SIZE(BO)
488	fsubp	%st, %st(1)
489#else
490	FLD	-8 * SIZE(AO)
491	fsubp	%st, %st(1)
492#endif
493
/* Scale by the single diagonal entry: taken from AO for the left-side */
/* variants and from BO for the right-side ones.                       */
494#ifdef LN
495       FLD	-8 * SIZE(AO)
496       fmulp	%st, %st(1)
497#endif
498
499#ifdef LT
500       FLD	-8 * SIZE(AO)
501       fmulp	%st, %st(1)
502#endif
503
504#ifdef RN
505       FLD	-8 * SIZE(BO)
506       fmulp	%st, %st(1)
507#endif
508
509#ifdef RT
510       FLD	-8 * SIZE(BO)
511       fmulp	%st, %st(1)
512#endif
513
514#ifdef LN
/* LN walks C backwards: step CO down before storing. */
515	subq	$1 * SIZE, CO
516#endif
517
/* Copy the result into the packed buffer (the duplicate is consumed by FST). */
518#if defined(LN) || defined(LT)
519	fld	%st
520	FST	-8 * SIZE(BO)
521#else
522	fld	%st
523	FST	-8 * SIZE(AO)
524#endif
525
/* Store to C; the x87 stack is empty again. */
526	FST	0 * SIZE(CO)
527
528#ifndef LN
529	addq	$1 * SIZE, CO
530#endif
531
532#if defined(LT) || defined(RN)
/* Skip the remaining K - KK k-steps of this panel in AO and BO. */
533	movq	K,  %rax
534	subq	KK, %rax
535	salq	$BASE_SHIFT, %rax
536	leaq	(AO, %rax, 1), AO
537	leaq	(BO, %rax, 1), BO
538#endif
539
540#ifdef LN
541	subq	$1, KK
542#endif
543
544#ifdef LT
545	addq	$1, KK
546#endif
547
548#ifdef RT
/* AORIG += K scaled elements. */
549       movq	K, %rax
550       salq	$0 + BASE_SHIFT, %rax
551       addq	%rax, AORIG
552#endif
553	ALIGN_4
554
/*
 * .L49: end of the single-column pass.  Advances B past the column that
 * was just processed and updates KK for the right-side variants.
 */
555.L49:
556#ifdef LN
/* B += K scaled elements (one column). */
557       movq	K, %rax
558       salq	$BASE_SHIFT, %rax
559       leaq	(B, %rax, 1), B
560#endif
561
562#if defined(LT) || defined(RN)
/* BO already points past this column's panel; adopt it as the new B. */
563	movq	BO, B
564#endif
565
566#ifdef RN
567	addq	$1, KK
568#endif
569
570#ifdef RT
571	subq	$1, KK
572#endif
573	ALIGN_4
574
/*
 * .L30 / .L01: loop over pairs of columns.  J = N / 2 pairs; each pass of
 * .L01 selects the A base, steps B / C for the RT variant, initialises KK
 * for the left-side variants and computes the number of 2-row tiles in I.
 */
575.L30:
576	movq	N,   %rax
577	sarq	$1,  %rax
/* movq leaves the flags alone, so the je below still tests the sarq result. */
578	movq	%rax, J
579	je	.L999
580	ALIGN_4
581
582.L01:
583#if defined(LT) || defined(RN)
584	movq	A, AO
585#else
/* LN / RT: keep the A base in the AORIG stack slot instead. */
586	movq	A, %rax
587	movq	%rax, AORIG
588#endif
589
590#ifdef RT
/* B -= 2 * K scaled elements (two columns of B). */
591	movq	K, %rax
592	salq	$1 + BASE_SHIFT, %rax
593	subq	%rax, B
594#endif
595
/* rax = 2 * LDC, the stride of a column pair in C. */
596	lea	(, LDC, 2), %rax
597
/* CO = column pair to write; C moves back before (RT) or forward after (others). */
598#ifdef RT
599	subq	%rax, C
600#endif
601	movq	C, CO
602#ifndef RT
603	addq	%rax, C
604#endif
605
606#ifdef LN
/* KK = OFFSET + M */
607	movq	OFFSET, %rax
608	addq	M, %rax
609	movq	%rax, KK
610#endif
611
612#ifdef LT
/* KK = OFFSET */
613	movq	OFFSET, %rax
614	movq	%rax, KK
615#endif
616
/* I = M / 2; skip the 2-row tiles when that is zero. */
617	movq	M,  I
618	sarq	$1, I
619	je	.L20
620	ALIGN_4
621
/*
 * .L11: start of one 2-row by 2-column tile.  Positions AO / BO, clears
 * four x87 accumulators, prefetches the two destination columns and
 * computes the unrolled trip count.
 */
622.L11:
623#ifdef LN
/* AORIG -= 2 * K scaled elements (step back one 2-row panel of A). */
624       movq	K, %rax
625       salq	$1 + BASE_SHIFT, %rax
626       subq	%rax, AORIG
627#endif
628
629#if defined(LN) || defined(RT)
/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
630	movq	KK, %rax
631	salq	$BASE_SHIFT, %rax
632	movq	AORIG, AO
633	leaq	(AO, %rax, 2), AO
634	leaq	(B,  %rax, 2), BO
635#else
636	movq	B, BO
637#endif
638
/* Four zeroed accumulators, one per element of the 2x2 tile. */
639	fldz
640	fldz
641	fldz
642	fldz
643
644#if   defined(HAVE_3DNOW)
645	prefetchw	2 * SIZE(CO)
646 	prefetchw	2 * SIZE(CO, LDC, 1)
647#elif defined(HAVE_SSE)
648	prefetchnta	2 * SIZE(CO)
649 	prefetchnta	2 * SIZE(CO, LDC, 1)
650#endif
651
/* Inner length: KK for LT / RN, K - KK for LN / RT. */
652#if defined(LT) || defined(RN)
653	movq	KK, %rax
654#else
655	movq	K,  %rax
656	subq	KK, %rax
657#endif
/* The main loop consumes four k-steps per pass; skip it when length / 4 == 0. */
658	sarq	$2, %rax
659 	je	.L15
660	ALIGN_4
661
/*
 * .L12: 2x2 inner-product loop, four k-steps per pass.
 * Accumulators between steps (top first): c00 c01 c10 c11, where cXY
 * collects a_X * b_Y (a0/a1 = the two A values of a step, b0/b1 = the
 * two B values).  Assumes the common.h macro FLD pushes one value.
 * One step:
 *   push a0, push b0, push a copy of a0 -> c00 += a0 * b0
 *   push b1; the a0 slot becomes a0 * b1
 *   push a1 (the stack is now 8 deep, i.e. completely full);
 *   the b0 slot becomes a1 * b0 and the b1 slot a1 * b1 (a1 is popped)
 *   three faddp fold the products into c11, c10 and c01 in that order.
 * The instruction order is what keeps the stack from overflowing; do not
 * reorder without re-deriving the depths.
 */
662.L12:
663	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
664
/* k-step 0 */
665	FLD	 -8 * SIZE(AO)
666
667	FLD	 -8 * SIZE(BO)
668	fld	 %st(1)
669	fmul	 %st(1), %st
670	faddp	 %st, %st(3)
671
672	FLD	 -7 * SIZE(BO)
673	fmul	 %st, %st(2)
674
675	FLD	 -7 * SIZE(AO)
676	fmul	 %st, %st(2)
677	fmulp	 %st, %st(1)
678
679	faddp	 %st, %st(6)
680	faddp	 %st, %st(4)
681	faddp	 %st, %st(2)
682
/* k-step 1 */
683	FLD	 -6 * SIZE(AO)
684
685	FLD	 -6 * SIZE(BO)
686	fld	 %st(1)
687	fmul	 %st(1), %st
688	faddp	 %st, %st(3)
689
690	FLD	 -5 * SIZE(BO)
691	fmul	 %st, %st(2)
692
693	FLD	 -5 * SIZE(AO)
694	fmul	 %st, %st(2)
695	fmulp	 %st, %st(1)
696
697	faddp	 %st, %st(6)
698	faddp	 %st, %st(4)
699	faddp	 %st, %st(2)
700
701	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
702
/* k-step 2 */
703	FLD	 -4 * SIZE(AO)
704
705	FLD	 -4 * SIZE(BO)
706	fld	 %st(1)
707	fmul	 %st(1), %st
708	faddp	 %st, %st(3)
709
710	FLD	 -3 * SIZE(BO)
711	fmul	 %st, %st(2)
712
713	FLD	 -3 * SIZE(AO)
714	fmul	 %st, %st(2)
715	fmulp	 %st, %st(1)
716
717	faddp	 %st, %st(6)
718	faddp	 %st, %st(4)
719	faddp	 %st, %st(2)
720
/* k-step 3 */
721	FLD	 -2 * SIZE(AO)
722
723	FLD	 -2 * SIZE(BO)
724	fld	 %st(1)
725	fmul	 %st(1), %st
726	faddp	 %st, %st(3)
727
728	FLD	 -1 * SIZE(BO)
729	fmul	 %st, %st(2)
730
731	FLD	 -1 * SIZE(AO)
732	fmul	 %st, %st(2)
733	fmulp	 %st, %st(1)
734
735	faddp	 %st, %st(6)
736	faddp	 %st, %st(4)
737	faddp	 %st, %st(2)
738
/* Consumed 8 elements of A and 8 elements of B (2 each x 4 steps). */
739	addq	$8 * SIZE,AO
740	addq	$8 * SIZE,BO
741
742	decq	%rax
743	jne	.L12
744	ALIGN_4
745
/*
 * .L15 / .L16: remainder of the 2x2 inner product, one k-step per pass,
 * for the (length mod 4) steps the unrolled loop did not cover.  The
 * step body is the same sequence used in the unrolled loop: accumulators
 * (top first) c00 c01 c10 c11, with cXY += a_X * b_Y.
 */
746.L15:
/* Recompute the inner length: KK for LT / RN, K - KK for LN / RT. */
747#if defined(LT) || defined(RN)
748	movq	KK, %rax
749#else
750	movq	K,  %rax
751	subq	KK, %rax
752#endif
753	and	$3,  %rax
754	je	.L18
755	ALIGN_4
756
757.L16:
/* push a0, b0, copy of a0 -> c00 += a0 * b0 */
758	FLD	 -8 * SIZE(AO)
759
760	FLD	 -8 * SIZE(BO)
761	fld	 %st(1)
762	fmul	 %st(1), %st
763	faddp	 %st, %st(3)
764
/* push b1; the a0 slot becomes a0 * b1 */
765	FLD	 -7 * SIZE(BO)
766	fmul	 %st, %st(2)
767
/* push a1; the b0 slot becomes a1 * b0, the b1 slot a1 * b1 */
768	FLD	 -7 * SIZE(AO)
769	fmul	 %st, %st(2)
770	fmulp	 %st, %st(1)
771
/* fold the three pending products into c11, c10, c01 */
772	faddp	 %st, %st(6)
773	faddp	 %st, %st(4)
774	faddp	 %st, %st(2)
775
776	addq	$2 * SIZE,AO
777	addq	$2 * SIZE,BO
778
779	decq	%rax
780	jne	 .L16
781	ALIGN_4
782
/*
 * .L18: solve and write back the 2-row by 2-column tile.
 * On entry the x87 stack holds (top first) c00 c01 c10 c11, where the
 * first index follows the A value (row) and the second the B value
 * (column) used in the inner product.  Stack comments assume the
 * common.h macros FLD push and FST store-and-pop.
 * NOTE(review): the fsubp / fsubrp lines rely on GNU as's handling of the
 * AT&T fsub / fsubr register forms -- confirm the operand direction with
 * the assembler in use before touching them.
 * NOTE(review): diagonal entries are multiplied in rather than divided by,
 * so the packed triangle presumably stores reciprocals -- confirm against
 * the packing routine.
 */
783.L18:
784#if defined(LN) || defined(RT)
/* Re-point AO / BO at offset KK - 2 elements; both branches of the */
/* inner #ifdef subtract the same constant for this tile shape.     */
785	movq	KK, %rax
786#ifdef LN
787	subq	$2, %rax
788#else
789	subq	$2, %rax
790#endif
791
792	salq	$BASE_SHIFT, %rax
793
794	movq	AORIG, AO
795	leaq	(AO, %rax, 2), AO
796	leaq	(B,  %rax, 2), BO
797#endif
798
/* Combine every accumulator with its packed right-hand-side value by */
/* subtraction.  BO holds them in the order c00 c01 c10 c11; AO holds */
/* them in the order c00 c10 c01 c11, hence the st(3) / st(2) swap.   */
799#if defined(LN) || defined(LT)
800	FLD	-8 * SIZE(BO)
801	fsubp	%st, %st(1)
802	FLD	-7 * SIZE(BO)
803	fsubp	%st, %st(2)
804	FLD	-6 * SIZE(BO)
805	fsubp	%st, %st(3)
806	FLD	-5 * SIZE(BO)
807	fsubp	%st, %st(4)
808#else
809	FLD	-8 * SIZE(AO)
810	fsubp	%st, %st(1)
811	FLD	-7 * SIZE(AO)
812	fsubp	%st, %st(3)
813	FLD	-6 * SIZE(AO)
814	fsubp	%st, %st(2)
815	FLD	-5 * SIZE(AO)
816	fsubp	%st, %st(4)
817#endif
818
819#ifdef LN
/* Row 1 first: c10, c11 *= AO[-5]. */
820       FLD	-5 * SIZE(AO)
821       fmul	%st, %st(3)
822       fmulp	%st, %st(4)
823
/* Eliminate row 1 from row 0 with AO[-6]: pushes AO[-6] * c10 and */
/* AO[-6] * c11, then reduces c01 and c00 by them.                 */
824       FLD	-6 * SIZE(AO)
825       fmul	%st(3), %st
826       FLD	-6 * SIZE(AO)
827       fmul	%st(5), %st
828
829       fsubrp	%st, %st(3)
830       fsubrp	%st, %st(1)
831
/* Row 0: c00, c01 *= AO[-8]. */
832       FLD	-8 * SIZE(AO)
833       fmul	%st, %st(1)
834       fmulp	%st, %st(2)
835#endif
836
837#ifdef LT
/* Row 0 first: c00, c01 *= AO[-8]. */
838       FLD	-8 * SIZE(AO)
839       fmul	%st, %st(1)
840       fmulp	%st, %st(2)
841
/* Eliminate row 0 from row 1 with AO[-7]: pushes AO[-7] * c00 and */
/* AO[-7] * c01, then reduces c11 and c10 by them.                 */
842       FLD	-7 * SIZE(AO)
843       fmul	%st(1), %st
844       FLD	-7 * SIZE(AO)
845       fmul	%st(3), %st
846
847       fsubrp	%st, %st(5)
848       fsubrp	%st, %st(3)
849
/* Row 1: c10, c11 *= AO[-5]. */
850       FLD	-5 * SIZE(AO)
851       fmul	%st, %st(3)
852       fmulp	%st, %st(4)
853#endif
854
855#ifdef RN
/* Column 0 first: c00, c10 *= BO[-8]. */
856       FLD	-8 * SIZE(BO)
857       fmul	%st, %st(1)
858       fmulp	%st, %st(3)
859
/* Eliminate column 0 from column 1 with BO[-7]: pushes BO[-7] * c00 */
/* and BO[-7] * c10, then reduces c11 and c01 by them.               */
860       FLD	-7 * SIZE(BO)
861       fmul	%st(1), %st
862       FLD	-7 * SIZE(BO)
863       fmul	%st(4), %st
864
865       fsubrp	%st, %st(5)
866       fsubrp	%st, %st(2)
867
/* Column 1: c01, c11 *= BO[-5]. */
868       FLD	-5 * SIZE(BO)
869       fmul	%st, %st(2)
870       fmulp	%st, %st(4)
871#endif
872
873#ifdef RT
/* Column 1 first: c01, c11 *= BO[-5]. */
874       FLD	-5 * SIZE(BO)
875       fmul	%st, %st(2)
876       fmulp	%st, %st(4)
877
/* Eliminate column 1 from column 0 with BO[-6]: pushes BO[-6] * c01 */
/* and BO[-6] * c11, then reduces c10 and c00 by them.               */
878       FLD	-6 * SIZE(BO)
879       fmul	%st(2), %st
880       FLD	-6 * SIZE(BO)
881       fmul	%st(5), %st
882
883       fsubrp	%st, %st(4)
884       fsubrp	%st, %st(1)
885
/* Column 0: c00, c10 *= BO[-8]. */
886       FLD	-8 * SIZE(BO)
887       fmul	%st, %st(1)
888       fmulp	%st, %st(3)
889#endif
890
891#ifdef LN
/* LN walks C backwards: step CO down before storing. */
892	subq	$2 * SIZE, CO
893#endif
894
/* Write the results back into the packed buffer, then into C.  Each    */
/* `fld %st` makes a copy for FST to consume; the fxch sequence rotates */
/* the next value to the top, which also fixes the order of the four    */
/* stores to C that follow.                                             */
895#if defined(LN) || defined(LT)
/* Packed order c00 c01 c10 c11; afterwards the stack is c11 c00 c01 c10. */
896	fld	%st
897	FST	-8 * SIZE(BO)
898	fxch	%st(1)
899	fld	%st
900	FST	-7 * SIZE(BO)
901	fxch	%st(2)
902	fld	%st
903	FST	-6 * SIZE(BO)
904	fxch	%st(3)
905	fld	%st
906	FST	-5 * SIZE(BO)
907
/* c11 -> (1, col 1), c00 -> (0, col 0), c01 -> (0, col 1), c10 -> (1, col 0). */
908	FST	1 * SIZE(CO, LDC)
909	FST	0 * SIZE(CO)
910	FST	0 * SIZE(CO, LDC)
911	FST	1 * SIZE(CO)
912#else
/* Packed order c00 c10 c01 c11; afterwards the stack is c11 c10 c00 c01. */
913	fld	%st
914	FST	-8 * SIZE(AO)
915	fxch	%st(2)
916	fld	%st
917	FST	-7 * SIZE(AO)
918	fxch	%st(1)
919	fld	%st
920	FST	-6 * SIZE(AO)
921	fxch	%st(3)
922	fld	%st
923	FST	-5 * SIZE(AO)
924
/* c11 -> (1, col 1), c10 -> (1, col 0), c00 -> (0, col 0), c01 -> (0, col 1). */
925	FST	1 * SIZE(CO, LDC)
926	FST	1 * SIZE(CO)
927	FST	0 * SIZE(CO)
928	FST	0 * SIZE(CO, LDC)
929#endif
930
931#ifndef LN
932	addq	$2 * SIZE, CO
933#endif
934
935#if defined(LT) || defined(RN)
/* Skip the remaining K - KK k-steps of this panel in AO and BO (2 wide each). */
936	movq	K,  %rax
937	subq	KK, %rax
938	salq	$BASE_SHIFT, %rax
939	leaq	(AO, %rax, 2), AO
940	leaq	(BO, %rax, 2), BO
941#endif
942
943#ifdef LN
944	subq	$2, KK
945#endif
946
947#ifdef LT
948	addq	$2, KK
949#endif
950
951#ifdef RT
/* AORIG += 2 * K scaled elements (next 2-row panel of A). */
952       movq	K, %rax
953       salq	$1 + BASE_SHIFT, %rax
954       addq	%rax, AORIG
955#endif
956
/* Next 2-row tile of this column pair. */
957	decq	I
958	jne	.L11
959	ALIGN_4
960
/*
 * .L20 / .L21: leftover single row of the column pair (1x2 tile), taken
 * only when bit 0 of M is set.  Positions AO / BO, clears two
 * accumulators and computes the unrolled trip count.
 */
961.L20:
962	movq	 M, %rax
963	andq	$1, %rax
964	je	.L29
965	ALIGN_4
966
967.L21:
968#ifdef LN
/* AORIG -= K scaled elements (step back one 1-row panel of A). */
969       movq	K, %rax
970       salq	$0 + BASE_SHIFT, %rax
971       subq	%rax, AORIG
972#endif
973
974#if defined(LN) || defined(RT)
/* AO = AORIG + KK elements, BO = B + 2 * KK elements. */
975	movq	KK, %rax
976	salq	$BASE_SHIFT, %rax
977	movq	AORIG, AO
978	leaq	(AO, %rax, 1), AO
979	leaq	(B,  %rax, 2), BO
980#else
981	movq	B, BO
982#endif
983
/* Two zeroed accumulators, one per column of the tile. */
984	fldz
985	fldz
986
/* Inner length: KK for LT / RN, K - KK for LN / RT. */
987#if defined(LT) || defined(RN)
988	movq	KK, %rax
989#else
990	movq	K,  %rax
991	subq	KK, %rax
992#endif
993	sarq	$2, %rax
994 	je	.L25
995	ALIGN_4
996
/*
 * .L22 / .L25 / .L26: 1x2 inner product.
 * x87 stack between steps (top first): c0 c1, one accumulator per column.
 * Each step pushes one A value a and two B values b0, b1 and performs
 * c0 += a * b0, c1 += a * b1; a is consumed by the fmulp of the second
 * product.  .L22 runs four steps per pass, .L26 handles the
 * (length mod 4) leftover steps one at a time.
 */
997.L22:
998	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
999
/* k-step 0 */
1000	FLD	 -8 * SIZE(AO)
1001
1002	FLD	 -8 * SIZE(BO)
1003	fmul	 %st(1), %st
1004	faddp	 %st, %st(2)
1005
1006	FLD	 -7 * SIZE(BO)
1007	fmulp	 %st, %st(1)
1008	faddp	 %st, %st(2)
1009
/* k-step 1 */
1010	FLD	 -7 * SIZE(AO)
1011
1012	FLD	 -6 * SIZE(BO)
1013	fmul	 %st(1), %st
1014	faddp	 %st, %st(2)
1015
1016	FLD	 -5 * SIZE(BO)
1017	fmulp	 %st, %st(1)
1018	faddp	 %st, %st(2)
1019
/* k-step 2 */
1020	FLD	 -6 * SIZE(AO)
1021
1022	FLD	 -4 * SIZE(BO)
1023	fmul	 %st(1), %st
1024	faddp	 %st, %st(2)
1025
1026	FLD	 -3 * SIZE(BO)
1027	fmulp	 %st, %st(1)
1028	faddp	 %st, %st(2)
1029
/* k-step 3 */
1030	FLD	 -5 * SIZE(AO)
1031
1032	FLD	 -2 * SIZE(BO)
1033	fmul	 %st(1), %st
1034	faddp	 %st, %st(2)
1035
1036	FLD	 -1 * SIZE(BO)
1037	fmulp	 %st, %st(1)
1038	faddp	 %st, %st(2)
1039
/* Consumed 4 elements of A and 8 elements of B. */
1040	addq	$4 * SIZE,AO
1041	addq	$8 * SIZE,BO
1042
1043	decq	%rax
1044	jne	.L22
1045	ALIGN_4
1046
1047.L25:
/* Recompute the inner length and keep only its low two bits. */
1048#if defined(LT) || defined(RN)
1049	movq	KK, %rax
1050#else
1051	movq	K,  %rax
1052	subq	KK, %rax
1053#endif
1054	and	$3,  %rax
1055	je	.L28
1056	ALIGN_4
1057
1058.L26:
1059	FLD	 -8 * SIZE(AO)
1060
1061	FLD	 -8 * SIZE(BO)
1062	fmul	 %st(1), %st
1063	faddp	 %st, %st(2)
1064
1065	FLD	 -7 * SIZE(BO)
1066	fmulp	 %st, %st(1)
1067	faddp	 %st, %st(2)
1068
1069	addq	$1 * SIZE,AO
1070	addq	$2 * SIZE,BO
1071
1072	decq	%rax
1073	jne	 .L26
1074	ALIGN_4
1075
/*
 * .L28: solve and write back the 1-row by 2-column tile.
 * On entry the x87 stack holds (top first) c0 c1, the accumulated
 * products for column 0 and column 1.  Stack comments assume the
 * common.h macros FLD push and FST store-and-pop.
 * NOTE(review): the fsubp / fsubrp lines rely on GNU as's handling of the
 * AT&T fsub / fsubr register forms -- confirm the operand direction with
 * the assembler in use before touching them.
 * NOTE(review): diagonal entries are multiplied in rather than divided by,
 * so the packed triangle presumably stores reciprocals -- confirm against
 * the packing routine.
 */
1076.L28:
1077#if defined(LN) || defined(RT)
/* Re-point AO / BO at the packed entries of this tile: offset KK - 1 */
/* (LN) or KK - 2 (RT) elements from the panel bases.                 */
1078	movq	KK, %rax
1079#ifdef LN
1080	subq	$1, %rax
1081#else
1082	subq	$2, %rax
1083#endif
1084
1085	salq	$BASE_SHIFT, %rax
1086
1087	movq	AORIG, AO
1088	leaq	(AO, %rax, 1), AO
1089	leaq	(B,  %rax, 2), BO
1090#endif
1091
/* Combine each accumulator with its packed right-hand-side value by   */
/* subtraction: from BO for the left-side variants, from AO otherwise. */
1092#if defined(LN) || defined(LT)
1093	FLD	-8 * SIZE(BO)
1094	fsubp	%st, %st(1)
1095	FLD	-7 * SIZE(BO)
1096	fsubp	%st, %st(2)
1097#else
1098	FLD	-8 * SIZE(AO)
1099	fsubp	%st, %st(1)
1100	FLD	-7 * SIZE(AO)
1101	fsubp	%st, %st(2)
1102#endif
1103
1104#if defined(LN) || defined(LT)
/* Single row: scale both columns by AO[-8]. */
1105       FLD	-8 * SIZE(AO)
1106       fmul	%st, %st(1)
1107       fmulp	%st, %st(2)
1108#endif
1109
1110#ifdef RN
/* c0 *= BO[-8]; c1 is reduced by BO[-7] * c0; c1 *= BO[-5]. */
1111       FLD	-8 * SIZE(BO)
1112       fmulp	%st, %st(1)
1113
1114       FLD	-7 * SIZE(BO)
1115       fmul	%st(1), %st
1116
1117       fsubrp	%st, %st(2)
1118
1119       FLD	-5 * SIZE(BO)
1120       fmulp	%st, %st(2)
1121#endif
1122
1123#ifdef RT
/* c1 *= BO[-5]; c0 is reduced by BO[-6] * c1; c0 *= BO[-8]. */
1124       FLD	-5 * SIZE(BO)
1125       fmulp	%st, %st(2)
1126
1127       FLD	-6 * SIZE(BO)
1128       fmul	%st(2), %st
1129
1130       fsubrp	%st, %st(1)
1131
1132       FLD	-8 * SIZE(BO)
1133       fmulp	%st, %st(1)
1134#endif
1135
1136#ifdef LN
/* LN walks C backwards: step CO down before storing. */
1137	subq	$1 * SIZE, CO
1138#endif
1139
/* Write the results back into the packed buffer.  Each `fld %st` makes */
/* a copy for FST to consume; fxch brings c1 to the top, so afterwards  */
/* the stack is (top first) c1 c0.                                      */
1140#if defined(LN) || defined(LT)
1141	fld	%st
1142	FST	-8 * SIZE(BO)
1143	fxch	%st(1)
1144	fld	%st
1145	FST	-7 * SIZE(BO)
1146#else
1147	fld	%st
1148	FST	-8 * SIZE(AO)
1149	fxch	%st(1)
1150	fld	%st
1151	FST	-7 * SIZE(AO)
1152#endif
1153
/* Store to C: c1 into the second column, then c0 into the first. */
1154	FST	0 * SIZE(CO, LDC)
1155	FST	0 * SIZE(CO)
1156
1157#ifndef LN
1158	addq	$1 * SIZE, CO
1159#endif
1160
1161#if defined(LT) || defined(RN)
/* Skip the remaining K - KK k-steps of this panel in AO (1 row) and BO (2 columns). */
1162	movq	K,  %rax
1163	subq	KK, %rax
1164	salq	$BASE_SHIFT, %rax
1165	leaq	(AO, %rax, 1), AO
1166	leaq	(BO, %rax, 2), BO
1167#endif
1168
1169#ifdef LN
1170	subq	$1, KK
1171#endif
1172
1173#ifdef LT
1174	addq	$1, KK
1175#endif
1176
1177#ifdef RT
/* AORIG += K scaled elements. */
1178       movq	K, %rax
1179       salq	$0 + BASE_SHIFT, %rax
1180       addq	%rax, AORIG
1181#endif
1182	ALIGN_4
1183
/*
 * .L29: end of one column pair.  Advances B past the two columns that
 * were just processed, updates KK for the right-side variants and loops
 * back to .L01 while column pairs remain in J.
 */
1184.L29:
1185#ifdef LN
/* B += 2 * K scaled elements (two columns). */
1186       movq	K, %rax
1187       salq	$BASE_SHIFT, %rax
1188       leaq	(B, %rax, 2), B
1189#endif
1190
1191#if defined(LT) || defined(RN)
/* BO already points past this pair's panel; adopt it as the new B. */
1192	movq	BO, B
1193#endif
1194
1195#ifdef RN
1196	addq	$2, KK
1197#endif
1198
1199#ifdef RT
1200	subq	$2, KK
1201#endif
1202
1203	decq	J
1204	jne	.L01
1205	ALIGN_4
1206
/*
 * .L999: common exit.  Reloads rbx, rbp and r12-r15 from the slots the
 * prologue stored them in, releases the STACKSIZE-byte frame and returns.
 */
1207.L999:
1208	movq	  0(%rsp), %rbx
1209	movq	  8(%rsp), %rbp
1210	movq	 16(%rsp), %r12
1211	movq	 24(%rsp), %r13
1212	movq	 32(%rsp), %r14
1213	movq	 40(%rsp), %r15
1214	addq	$STACKSIZE, %rsp
1215	ret
1216
1217	EPILOGUE
1218