1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Kernel arguments; the ARGn names are supplied by common.h. */
#define M		ARG1
#define N		ARG2
#define K		ARG3
#define A		ARG4
#define B		ARG5
#define C		ARG6
#define LDC		%r10

/* Loop counters and running pointers used by the tile loops. */
#define I		%r12
#define J		%r13
#define AO		%r14
#define BO		%r15
#define CO		%rbp

/* KK is the running offset counter; AORIG is a stack slot inside the
   local frame that holds the base of the current A panel. */
#define KK		%r11
#define AORIG		48(%rsp)

/* Bytes reserved on the stack at entry (six saved registers + AORIG). */
#define STACKSIZE	64

/* Stack-passed arguments, addressed past the local frame. */
#define ALPHA		 8 + STACKSIZE(%rsp)
#define OFFSET		32 + STACKSIZE(%rsp)

/* Prefetch mnemonics: the Opteron build uses the prefetch/prefetchw
   pair, every other build falls back to prefetcht0 for both. */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance ahead of AO, in elements. */
#define PREFETCHSIZE	(5 + 4 * 10)
73
/*
 * Kernel entry.
 *
 * Register arguments: M, N, K, A, B, C (ARG1..ARG6).
 * Stack arguments:    LDC at 24(%rsp) and OFFSET at 32(%rsp) on entry
 *                     (both addressed here relative to the new frame).
 *
 * Exactly one of LN / LT / RN / RT is expected to be defined by the
 * build; each selects a different traversal direction and KK start.
 * NOTE(review): the row-remainder-first ordering further down looks
 * like the LN flavour of this kernel - confirm against the build rules.
 */
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	emms
#endif

	/* Open the local frame and preserve the callee-saved registers. */
	subq	$STACKSIZE, %rsp
	movq	%r15, 40(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r12, 16(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%rbx,  0(%rsp)

	/* Fetch the stack-passed leading dimension and scale it by
	   BASE_SHIFT so it can be used directly as a byte stride. */
	movq	24 + STACKSIZE(%rsp), LDC
	salq	$BASE_SHIFT, LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	/* KK = -OFFSET */
	movq	OFFSET, KK
	negq	KK
#endif

	/* Bias A and B by eight elements: every load in the tile loops
	   uses displacements starting at -8 * SIZE. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

#ifdef LN
	/* Start from the far end of C's rows and of the A panel:
	   C += M << BASE_SHIFT,  A += (M << BASE_SHIFT) * K. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* Start from the far end of B and of C's columns:
	   B += (N << BASE_SHIFT) * K,  C += N * LDC. */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movq	OFFSET, KK
	negq	KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movq	N, KK
	subq	OFFSET, KK
#endif

	/* J = N / 2 column pairs; with none, go straight to the
	   single-column remainder. */
	movq	N, J
	sarq	$1, J
	je	.L30
	ALIGN_4
138
/* Head of the column-pair loop: set up A, B, C and KK for two columns. */
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	/* LN / RT rebuild AO from AORIG for every tile. */
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back over one two-column panel. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* %rax = byte width of two columns of C. */
	leaq	(, LDC, 2), %rax

#ifdef RT
	/* Walking backwards: move C first, then take the tile pointer. */
	subq	%rax, C
	movq	C, CO
#else
	/* Walking forwards: take the tile pointer, then advance C. */
	movq	C, CO
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, KK
	addq	M, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, KK
#endif

	/* An odd M leaves one row, handled before the row pairs. */
	movq	M, %rax
	andq	$1, %rax
	je	.L20
	ALIGN_4
178
/*
 * 1 x 2 tile: one row of A against two columns of B.
 * Two x87 accumulators are kept on the register stack; after each
 * update step st(0) holds the sum for the first column and st(1) the
 * sum for the second.  FLD / FST are load / store macros from
 * common.h; FST is expected to pop (the stack is empty at the end).
 */
.L21:
#ifdef LN
	/* Step AORIG back over one single-row panel. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Clear both accumulators. */
	fldz
	fldz

	/* Update length: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

/* Main update loop, unrolled four times. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 2 */
	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

/* Remainder of the update length (count modulo 4). */
.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L26
	ALIGN_4

/* Solve step: combine with the packed values, scale, write back. */
.L28:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the block being solved:
	   KK - 1 for LN, KK - 2 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	/* Fold in the two packed values held in B ... */
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)

	/* ... and scale both by the single coefficient from A. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#else
	/* Fold in the two packed values held in A. */
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef RN
	/* 2 x 2 solve with the B coefficients at -8, -7 and -5. */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* 2 x 2 solve with the B coefficients at -5, -6 and -8. */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Copy each result back into the packed buffer; the duplicate
	   made by fld keeps the value on the stack for the C store. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the exchange the second-column value is on top. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the rest of the panels: K - KK elements. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* Advance AORIG over one single-row panel. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4
395
/*
 * 2 x 2 tiles: I = M / 2 row pairs against the current column pair.
 * Four x87 accumulators are kept on the register stack.  With a0, a1
 * the two A values and b0, b1 the two B values of one update step,
 * the accumulators between steps are
 *     st(0) += a0 * b0    st(1) += a0 * b1
 *     st(2) += a1 * b0    st(3) += a1 * b1
 */
.L20:
	movq	M, I
	sarq	$1, I
	je	.L29
	ALIGN_4

.L11:
#ifdef LN
	/* Step AORIG back over one two-row panel. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Clear the four accumulators. */
	fldz
	fldz
	fldz
	fldz

	/* Touch the two C rows that this tile will write. */
#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Update length: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

/* Main update loop, unrolled four times. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-6 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* step 2 */
	FLD	-4 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-2 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L12
	ALIGN_4

/* Remainder of the update length (count modulo 4). */
.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L16
	ALIGN_4

/* Solve step: combine with the packed values, solve, write back. */
.L18:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the 2 x 2 block being solved (KK - 2 in
	   both the LN and the RT case). */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 2), BO
#endif

	/* Fold in the four packed values.  The B-side and A-side copies
	   are laid out differently, hence the different stack targets. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	/* A coefficients used in the order -5, -6, -8. */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* A coefficients used in the order -8, -7, -5. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* B coefficients used in the order -8, -7, -5. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* B coefficients used in the order -5, -6, -8. */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write the four results back to the packed buffer (fld keeps a
	   copy on the stack), then pop them into the 2 x 2 tile of C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the rest of the panels: K - KK elements. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance AORIG over one two-row panel. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4
740
/* Tail of the column-pair loop: advance B and KK, then iterate. */
.L29:
#ifdef LN
	/* B += 2 * K elements (one two-column panel). */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	/* BO already points just past the panel that was consumed. */
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4
763
/* Single remaining column (N odd): same setup as the column-pair loop
   head, but with a one-column stride. */
.L30:
	movq	N, %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	/* LN / RT rebuild AO from AORIG for every tile. */
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back over one single-column panel. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, B
#endif

#ifdef RT
	/* Walking backwards: move C first, then take the tile pointer. */
	subq	LDC, C
	movq	C, CO
#else
	/* Walking forwards: take the tile pointer, then advance C. */
	movq	C, CO
	addq	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, KK
	addq	M, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, KK
#endif

	/* An odd M leaves one row, handled before the row pairs. */
	movq	M, %rax
	andq	$1, %rax
	je	.L40
	ALIGN_4
805
/*
 * 1 x 1 tile: one row of A against the single remaining column of B.
 * A single x87 accumulator lives in st(0).
 */
.L41:
#ifdef LN
	/* Step AORIG back over one single-row panel. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Clear the accumulator. */
	fldz

	/* Update length: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

/* Main update loop, unrolled four times. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L42
	ALIGN_4

/* Remainder of the update length (count modulo 4). */
.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L46
	ALIGN_4

/* Solve step: combine with the packed value, scale, write back. */
.L48:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the element being solved (KK - 1 in both
	   the LN and the RT case). */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 1), BO
#endif

	/* Fold in the packed value. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* Scale by the single coefficient: taken from A for the
	   left-side variants and from B for the right-side variants. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#if defined(RN) || defined(RT)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Copy the result back into the packed buffer; fld keeps a copy
	   on the stack for the C store. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the rest of the panels: K - KK elements. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* Advance AORIG over one single-row panel. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4
974
/*
 * 2 x 1 tiles: I = M / 2 row pairs against the single remaining
 * column.  Two x87 accumulators are kept on the register stack; after
 * each update step st(0) holds the sum for the first row and st(1)
 * the sum for the second.
 */
.L40:
	movq	M, I
	sarq	$1, I
	je	.L49
	ALIGN_4

.L31:
#ifdef LN
	/* Step AORIG back over one two-row panel. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Clear both accumulators. */
	fldz
	fldz

	/* Touch the C row that this tile will write. */
#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Update length: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

/* Main update loop, unrolled four times. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 */
	FLD	-8 * SIZE(BO)
	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 2 */
	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

/* Remainder of the update length (count modulo 4). */
.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L36
	ALIGN_4

/* Solve step: combine with the packed values, solve, write back. */
.L38:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the block being solved:
	   KK - 2 for LN, KK - 1 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 1), BO
#endif

	/* Fold in the two packed values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	/* A coefficients used in the order -5, -6, -8. */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	/* A coefficients used in the order -8, -7, -5. */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

	/* Right-side variants: both values are scaled by the single B
	   coefficient. */
#if defined(RN) || defined(RT)
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Copy each result back into the packed buffer; the duplicate
	   made by fld keeps the value on the stack for the C store. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the exchange the second-row value is on top. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the rest of the panels: K - KK elements. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance AORIG over one two-row panel. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4
1207
/* Tail of the single-column pass: final B / KK bookkeeping. */
.L49:
#ifdef LN
	/* B += K elements (one single-column panel). */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	/* BO already points just past the panel that was consumed. */
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* Common exit: restore the callee-saved registers stored at entry,
   release the local frame and return. */
.L999:
	movq	40(%rsp), %r15
	movq	32(%rsp), %r14
	movq	24(%rsp), %r13
	movq	16(%rsp), %r12
	movq	 8(%rsp), %rbp
	movq	 0(%rsp), %rbx
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
1239