/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * x87 triangular-solve kernel, 2x2 tiling, AT&T syntax, preprocessed by cpp.
 * One of LN / LT / RN / RT is expected to be defined by the build to select
 * the variant; the code below is conditional on those symbols.
 */
#define ASSEMBLER
#include "common.h"

/* Kernel arguments.  ARG1..ARG6 come from common.h (presumably the integer
   argument registers of the target ABI -- confirm against common.h). */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10		/* loaded from the caller stack in the prologue */

/* Loop state kept in callee-saved registers (saved in the prologue). */
#define I	%r12		/* countdown over 2-row tiles of the current panel */
#define J	%r13		/* countdown over 2-column panels */
#define AO	%r14		/* running pointer into A */
#define BO	%r15		/* running pointer into B */
#define	CO	%rbp		/* running pointer into C */

#define KK	%r11		/* running K offset, derived from OFFSET */
#define AORIG	 48(%rsp)	/* stack slot holding the A base for LN/RT */

/* Frame: register save slots at 0..40(%rsp), AORIG at 48(%rsp). */
#define STACKSIZE 64

/* Stack-passed arguments, addressed above the local frame.
   ALPHA is defined but not referenced in this file. */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

/* Prefetch mnemonics; PREFETCHW is defined but not referenced in this file. */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance, in elements ahead of AO. */
#define PREFETCHSIZE (5 + 4 * 10)

/*
 * Kernel entry.
 *
 * Reserves STACKSIZE bytes, saves rbx/rbp/r12-r15, fetches LDC from the
 * caller stack, and positions A, B, C and KK for the selected variant.
 * PROLOGUE / PROFCODE / ALIGN_4 are macros from common.h.
 */
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	/* Stack argument located between ALPHA and OFFSET. */
	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias both operand pointers by 8 elements; all operand loads below
	   use displacements starting at -8 * SIZE to compensate. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* Scale LDC by (1 << BASE_SHIFT), presumably elements -> bytes. */
	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* C += M elements; A += M * K elements (start from the far end). */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* B += N * K elements; C += N * LDC (start from the far end). */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N,   %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* J = N >> 1.  The je tests ZF produced by sarq; the intervening
	   movq does not modify flags. */
	movq	N,   %rax
	sarq	$1,  %rax
	movq	%rax, J
	je	.L30
	ALIGN_4

/*
 * .L01: set up one 2-column panel (reached J = N >> 1 times).
 * Establishes the A base (AO or AORIG), steps B and C for RT,
 * points CO at the panel in C, seeds KK for LN/LT and computes I = M >> 1.
 */
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* B -= 2 * K elements */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* rax = 2 * LDC, the byte span of two columns of C. */
	lea	(, LDC, 2), %rax

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M >> 1; skip the 2x2 tiles when there are none. */
	movq	M,  I
	sarq	$1, I
	je	.L20
	ALIGN_4

/*
 * .L11: process one 2x2 tile (two elements of A by two elements of B per
 * K step).
 *
 * x87 stack discipline (assumes FLD pushes one element and FST stores and
 * pops; both are macros from common.h -- the push/pop counts below only
 * balance under that assumption):
 *   after the four fldz, and again after every K step,
 *     st(0) = acc0 = sum a0*b0      st(1) = acc1 = sum a0*b1
 *     st(2) = acc2 = sum a1*b0      st(3) = acc3 = sum a1*b1
 *   where a0,a1 are consecutive elements at AO and b0,b1 at BO.
 */
.L11:
#ifdef LN
	/* AORIG -= 2 * K elements */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Zero the four accumulators. */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Trip count: KK for LT/RN, K - KK otherwise; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

/* .L12: four K steps per iteration. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)		/* a0 */

	FLD	 -8 * SIZE(BO)		/* b0 */
	fld	 %st(1)			/* copy of a0 */
	fmul	 %st(1), %st		/* a0 * b0 */
	faddp	 %st, %st(3)		/* acc0 += a0 * b0 */

	FLD	 -7 * SIZE(BO)		/* b1 */
	fmul	 %st, %st(2)		/* a0 slot becomes a0 * b1 */

	FLD	 -7 * SIZE(AO)		/* a1; the stack is eight deep here */
	fmul	 %st, %st(2)		/* b0 slot becomes a1 * b0 */
	fmulp	 %st, %st(1)		/* b1 slot becomes a1 * b1 */

	faddp	 %st, %st(6)		/* acc3 += a1 * b1 */
	faddp	 %st, %st(4)		/* acc2 += a1 * b0 */
	faddp	 %st, %st(2)		/* acc1 += a0 * b1 */

	/* Second K step: same pattern, elements -6 / -5. */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* Third K step: elements -4 / -3. */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* Fourth K step: elements -2 / -1. */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L12
	ALIGN_4

/* .L15: remaining (count & 3) K steps, one per iteration of .L16. */
.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L16
	ALIGN_4

/*
 * .L18: combine the accumulators with the packed right-hand side, apply the
 * 2x2 triangular solve, and write the result to the packed buffer and to C.
 */
.L18:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at element offset 2 * (KK - 2).
	   Both preprocessor branches below are identical. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Each FLD / fsubp pair replaces one accumulator with a difference
	   against the loaded value.  Intended result is (loaded - acc); that
	   relies on the GNU as AT&T handling of fsub/fsubr with two register
	   operands -- NOTE(review): confirm against the assembler manual. */
#if defined(LN) || defined(LT)
	/* Values come from BO in the order acc0, acc1, acc2, acc3. */
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	/* Values come from AO in the order acc0, acc2, acc1, acc3. */
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/* In the four solves below the factors are applied by multiplication
	   only; no division is performed here. */
#ifdef LN
	/* Scale acc2, acc3 by AO[-5]; eliminate with AO[-6]; scale acc0,
	   acc1 by AO[-8]. */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st		/* AO[-6] * acc2 */
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st		/* AO[-6] * acc3 */

	fsubrp	%st, %st(3)		/* intended: acc1 -= AO[-6] * acc3 */
	fsubrp	%st, %st(1)		/* intended: acc0 -= AO[-6] * acc2 */

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* Scale acc0, acc1 by AO[-8]; eliminate with AO[-7]; scale acc2,
	   acc3 by AO[-5]. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st		/* AO[-7] * acc0 */
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st		/* AO[-7] * acc1 */

	fsubrp	%st, %st(5)		/* intended: acc3 -= AO[-7] * acc1 */
	fsubrp	%st, %st(3)		/* intended: acc2 -= AO[-7] * acc0 */

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* Scale acc0, acc2 by BO[-8]; eliminate with BO[-7]; scale acc1,
	   acc3 by BO[-5]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st		/* BO[-7] * acc0 */
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st		/* BO[-7] * acc2 */

	fsubrp	%st, %st(5)		/* intended: acc3 -= BO[-7] * acc2 */
	fsubrp	%st, %st(2)		/* intended: acc1 -= BO[-7] * acc0 */

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* Scale acc1, acc3 by BO[-5]; eliminate with BO[-6]; scale acc0,
	   acc2 by BO[-8]. */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st		/* BO[-6] * acc1 */
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st		/* BO[-6] * acc3 */

	fsubrp	%st, %st(4)		/* intended: acc2 -= BO[-6] * acc3 */
	fsubrp	%st, %st(1)		/* intended: acc0 -= BO[-6] * acc1 */

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	/* LN walks C backwards: step CO before storing. */
	subq	$2 * SIZE, CO
#endif

	/* Write the solved values back to the packed buffer.  Each
	   "fld %st / FST" pair stores a copy and leaves the value on the
	   stack; the fxch instructions rotate the next value to the top. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)		/* acc0 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)		/* acc1 */
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)		/* acc2 */
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)		/* acc3 */

	/* Stack is now acc3, acc0, acc1, acc2: drain it into C. */
	FST	1 * SIZE(CO, LDC)	/* acc3 */
	FST	0 * SIZE(CO)		/* acc0 */
	FST	0 * SIZE(CO, LDC)	/* acc1 */
	FST	1 * SIZE(CO)		/* acc2 */
#else
	fld	%st
	FST	-8 * SIZE(AO)		/* acc0 */
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)		/* acc2 */
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)		/* acc1 */
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)		/* acc3 */

	/* Stack is now acc3, acc2, acc0, acc1: drain it into C. */
	FST	1 * SIZE(CO, LDC)	/* acc3 */
	FST	1 * SIZE(CO)		/* acc2 */
	FST	0 * SIZE(CO)		/* acc0 */
	FST	0 * SIZE(CO, LDC)	/* acc1 */
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) steps of this tile in both buffers. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* AORIG += 2 * K elements */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/*
 * .L20: when M is odd, process the remaining 1x2 tile (one element of A by
 * two elements of B per K step).
 *
 * x87 stack after the two fldz and after every K step (assuming FLD pushes
 * and FST stores-and-pops, as defined in common.h):
 *     st(0) = acc0 = sum a*b0      st(1) = acc1 = sum a*b1
 */
.L20:
	movq	 M, %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* AORIG -= K elements */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + 2 * KK elements */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	fldz
	fldz

	/* Trip count: KK for LT/RN, K - KK otherwise; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)		/* a */

	FLD	 -8 * SIZE(BO)		/* b0 */
	fmul	 %st(1), %st		/* a * b0 */
	faddp	 %st, %st(2)		/* acc0 += a * b0 */

	FLD	 -7 * SIZE(BO)		/* b1 */
	fmulp	 %st, %st(1)		/* a slot becomes a * b1 */
	faddp	 %st, %st(2)		/* acc1 += a * b1 */

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L22
	ALIGN_4

/* .L25: remaining (count & 3) K steps. */
.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L26
	ALIGN_4

/* .L28: subtract from the packed values, solve, store. */
.L28:
#if defined(LN) || defined(RT)
	/* Offset is KK - 1 for LN and KK - 2 otherwise. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Intended: acc := loaded - acc (depends on GNU as AT&T fsub/fsubr
	   register-form semantics -- NOTE(review): confirm). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* Single factor from A scales both accumulators. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* acc0 *= BO[-8]; acc1 -= BO[-7] * acc0 (intended); acc1 *= BO[-5] */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* acc1 *= BO[-5]; acc0 -= BO[-6] * acc1 (intended); acc0 *= BO[-8] */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Copy both results to the packed buffer; afterwards the stack holds
	   acc1 on top of acc0. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)	/* acc1 */
	FST	0 * SIZE(CO)		/* acc0 */

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) steps of this tile in both buffers. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* AORIG += K elements */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L29: end of a 2-column panel.  Advances B past the panel (LN: by 2 * K
 * elements; LT/RN: to wherever BO ended), steps KK by two for RN/RT, and
 * loops back to .L01 while J is non-zero.
 */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/*
 * .L30: when N is odd, set up the final single-column panel; otherwise jump
 * to the epilogue.  Mirrors the .L01 setup with a one-column stride
 * (B steps by K elements, C by LDC).
 */
.L30:
	movq	N,  %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* B -= K elements */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M >> 1; skip the 2x1 tiles when there are none. */
	movq	M,  I
	sarq	$1, I
	je	.L40
	ALIGN_4

/*
 * .L31: process one 2x1 tile (two elements of A by one element of B per
 * K step).
 *
 * x87 stack after the two fldz and after every K step (assuming FLD pushes
 * and FST stores-and-pops, as defined in common.h):
 *     st(0) = acc0 = sum a0*b      st(1) = acc1 = sum a1*b
 */
.L31:
#ifdef LN
	/* AORIG -= 2 * K elements */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + KK elements */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Trip count: KK for LT/RN, K - KK otherwise; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)		/* b */
	FLD	 -8 * SIZE(AO)		/* a0 */
	fmul	 %st(1), %st		/* a0 * b */
	faddp	 %st, %st(2)		/* acc0 += a0 * b */

	FLD	 -7 * SIZE(AO)		/* a1 */
	fmulp	 %st, %st(1)		/* b slot becomes a1 * b */
	faddp	 %st, %st(2)		/* acc1 += a1 * b */

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L32
	ALIGN_4

/* .L35: remaining (count & 3) K steps. */
.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L36
	ALIGN_4

/* .L38: subtract from the packed values, solve, store. */
.L38:
#if defined(LN) || defined(RT)
	/* Offset is KK - 2 for LN and KK - 1 otherwise. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Intended: acc := loaded - acc (depends on GNU as AT&T fsub/fsubr
	   register-form semantics -- NOTE(review): confirm). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	/* acc1 *= AO[-5]; acc0 -= AO[-6] * acc1 (intended); acc0 *= AO[-8] */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	/* acc0 *= AO[-8]; acc1 -= AO[-7] * acc0 (intended); acc1 *= AO[-5] */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

	/* RN and RT are identical here: one factor from B scales both
	   accumulators. */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Copy both results to the packed buffer; afterwards the stack holds
	   acc1 on top of acc0. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)		/* acc1 */
	FST	0 * SIZE(CO)		/* acc0 */

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) steps of this tile in both buffers. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* AORIG += 2 * K elements */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/*
 * .L40: when M is odd, process the remaining 1x1 tile of the single-column
 * panel.  One accumulator lives in st(0) (assuming FLD pushes and FST
 * stores-and-pops, as defined in common.h).
 */
.L40:
	movq	 M, %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	/* AORIG -= K elements */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + KK elements */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz

	/* Trip count: KK for LT/RN, K - KK otherwise; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)		/* a */
	FLD	 -8 * SIZE(BO)		/* b */
	fmulp	 %st, %st(1)		/* a * b */
	faddp	 %st, %st(1)		/* acc += a * b */

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$4 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L42
	ALIGN_4

/* .L45: remaining (count & 3) K steps. */
.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$1 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L46
	ALIGN_4

/* .L48: subtract from the packed value, scale, store. */
.L48:
#if defined(LN) || defined(RT)
	/* Offset is KK - 1; both preprocessor branches are identical. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Intended: acc := loaded - acc (depends on GNU as AT&T fsub/fsubr
	   register-form semantics -- NOTE(review): confirm). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* Scale by the single factor: from AO for LN/LT, from BO for RN/RT. */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Store a copy to the packed buffer, then the value itself to C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) steps of this tile in both buffers. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* AORIG += K elements */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L49: end of the single-column panel.  Advances B (LN: by K elements;
 * LT/RN: to wherever BO ended) and steps KK by one for RN/RT, then falls
 * through to the epilogue.
 */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/*
 * .L999: common exit.  Restores the six registers saved in the prologue
 * from the same slots, releases the frame and returns.
 */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
