/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Build configuration, register roles and stack-frame layout.
 * ASSEMBLER is defined before common.h is included; common.h supplies
 * ARG1..ARG6, SIZE, BASE_SHIFT, FLD/FST, ALIGN_4, PROLOGUE, PROFCODE and
 * EPILOGUE (none of them are defined in this file).
 */
#define ASSEMBLER
#include "common.h"

/* Incoming arguments that arrive in the ARGn registers. */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
/* LDC is loaded from the caller's stack at entry (24 + STACKSIZE(%rsp)). */
#define LDC	%r10

/* I / J are the row-tile and column-pair loop counters; AO / BO / CO are
   the working pointers into the A panel, the B panel and C. */
#define I	%r12
#define J	%r13
#define AO	%r14
#define BO	%r15
#define	CO	%rbp

/* KK is the running offset index; AORIG is a frame slot that holds the
   A base pointer in the LN and RT variants. */
#define KK	%r11
#define AORIG	 48(%rsp)

/* Frame size.  Offsets 0..40 hold the six saved registers, 48 is AORIG. */
#define STACKSIZE 64

/* Caller-side stack arguments, addressed past the local frame.
   ALPHA is not referenced anywhere in the code visible in this file. */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

/* Prefetch mnemonics.  PREFETCHW is not referenced in the code visible in
   this file (the C prefetches spell the instruction out directly). */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance, in elements, ahead of AO inside the unrolled loops. */
#define PREFETCHSIZE (5 + 4 * 10)

/*
 * Kernel entry.
 *
 * Builds the stack frame, fetches LDC from the caller's stack, biases A and
 * B by 8 elements (every later panel access uses offsets -8 .. -1), scales
 * LDC, and initialises the pointers and KK for the variant selected by
 * LN / LT / RN / RT.  Then computes J = N >> 1 and either falls through to
 * the column-pair loop at .L01 or jumps to .L30 when J is zero.
 */
	PROLOGUE
	PROFCODE

	/* Allocate the frame and save the callee-saved registers used below. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	/* LDC is a stack argument. */
	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias both panel pointers so the loops can address with -8 .. -1. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* LDC <<= BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT comes
	   from common.h -- confirm there). */
	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* C += M << BASE_SHIFT;  A += (M << BASE_SHIFT) * K */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* B += (N << BASE_SHIFT) * K */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	/* C += N * LDC */
	movq	N,   %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* J = N >> 1, the number of two-column passes. */
	movq	N,   %rax
	sarq	$1,  %rax
	movq	%rax, J
	je	.L30		/* tests the sarq result; movq leaves flags alone */
	ALIGN_4

/*
 * .L01: set-up for one two-column pass (the pass is repeated J times).
 * Selects the A base (AO directly for LT/RN, the AORIG slot otherwise),
 * positions B and C for the RT variant, points CO at the two C columns of
 * this pass, resets KK for LN/LT, and computes I = M >> 1.
 */
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* B -= K << (1 + BASE_SHIFT) */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	lea	(, LDC, 2), %rax	/* rax = 2 * LDC */

	/* RT steps C back before using it; every other variant uses C first
	   and then steps it forward. */
#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M >> 1, the number of two-row tiles; go to .L20 when zero. */
	movq	M,  I
	sarq	$1, I
	je	.L20
	ALIGN_4

/*
 * .L11: one 2x2 tile (two rows, two columns); repeated I times.
 *
 * Phases:
 *   .L12 / .L16  accumulate a0*b0, a0*b1, a1*b0, a1*b1 over the inner
 *                length into four x87 accumulators (a0,a1 come from AO,
 *                b0,b1 from BO);
 *   .L18         combine the accumulators with the values stored in the
 *                packed panel, apply the 2x2 triangular step of the
 *                selected variant, write the results back to the panel
 *                and to C, then advance the pointers and KK.
 *
 * FLD / FST are macros from common.h.  The stack bookkeeping below only
 * balances if FLD pushes one value and FST stores and pops one value;
 * under that assumption the inner loop peaks at all eight x87 registers.
 */
.L11:
#ifdef LN
	/* AORIG -= K << (1 + BASE_SHIFT) */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * (KK << BASE_SHIFT);  BO = B + 2 * (KK << BASE_SHIFT) */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Four zeroed accumulators, one per element of the tile. */
	fldz
	fldz
	fldz
	fldz

	/* Touch the two C columns that will be written at the end. */
#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* rax = inner length: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* number of four-step iterations */
	je	.L15
	ALIGN_4

/* .L12: inner loop unrolled four times; consumes 8 elements of AO and BO. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 1 */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	/* fold the three remaining products into their accumulators */
	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* step 2 */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* step 3 */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* step 4 */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L12
	ALIGN_4

/* .L15: remainder count = inner length & 3. */
.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L18
	ALIGN_4

/* .L16: single-step remainder loop; same arithmetic as one step of .L12. */
.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L16
	ALIGN_4

/* .L18: solve step and write-back for the 2x2 tile. */
.L18:
#if defined(LN) || defined(RT)
	/* Recompute AO / BO from AORIG / B at index KK - 2.  Both arms of the
	   inner #ifdef subtract the same constant. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine each accumulator with the matching stored value: taken from
	   the B panel for LN/LT and from the A panel for RN/RT. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/* 2x2 triangular step.  Each variant scales by two entries of the
	   block (offsets -8 and -5) and eliminates with the third.
	   NOTE(review): scaling by multiplication suggests those two entries
	   are stored already inverted -- confirm against the packing code. */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	/* LN moves CO back before storing; the others advance it afterwards. */
	subq	$2 * SIZE, CO
#endif

	/* Write-back.  Each "fld %st" duplicates the top so the panel store
	   consumes the copy and the value itself stays for the C store; the
	   fxch instructions bring the next value to the top. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the rest of the panels: AO and BO += 2 * ((K - KK) << BASE_SHIFT) */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* AORIG += K << (1 + BASE_SHIFT) */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/*
 * .L20: leftover single row of a two-column pass (taken when M is odd).
 *
 * Same phases as the 2x2 tile, but with one A element per step and two
 * accumulators: .L22 / .L26 accumulate a*b0 and a*b1, .L28 applies the
 * variant's step and writes one element to each of the two C columns.
 * Falls through to .L29.
 */
.L20:
	movq	 M, %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* AORIG -= K << BASE_SHIFT */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + (KK << BASE_SHIFT);  BO = B + 2 * (KK << BASE_SHIFT) */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Two zeroed accumulators. */
	fldz
	fldz

	/* rax = inner length: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* number of four-step iterations */
	je	.L25
	ALIGN_4

/* .L22: unrolled by four; consumes 4 elements of AO and 8 of BO. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L22
	ALIGN_4

/* .L25: remainder count = inner length & 3. */
.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L28
	ALIGN_4

/* .L26: single-step remainder loop. */
.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L26
	ALIGN_4

/* .L28: solve step and write-back for the 1x2 tile. */
.L28:
#if defined(LN) || defined(RT)
	/* Recompute AO / BO at index KK - 1 (LN) or KK - 2 (RT). */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine the accumulators with the stored values (B panel for LN/LT,
	   A panel for RN/RT). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* LN/LT: scale both values by the single A entry at offset -8. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

	/* RN/RT: 2x2 step on the B block (entries at offsets -8, -7/-6, -5). */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Write both results back to the panel, keeping a copy of each for C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* AO += (K - KK) << BASE_SHIFT;  BO += 2 * ((K - KK) << BASE_SHIFT) */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* AORIG += K << BASE_SHIFT */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L29: end of one two-column pass.  Moves B to the next pair of columns
 * (LN: B += 2 * (K << BASE_SHIFT); LT/RN: B = BO, which the tiles have
 * already advanced), steps KK by 2 for RN/RT, and loops while J != 0.
 */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/*
 * .L30: set-up for the single trailing column, taken only when N is odd
 * (otherwise jumps straight to the exit at .L999).  Mirrors .L01 with
 * one-column strides: B moves by K elements and C by one LDC.
 */
.L30:
	movq	N,  %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* B -= K << BASE_SHIFT */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* RT steps C back before using it; the others use it, then advance. */
#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M >> 1, the number of two-row tiles; go to .L40 when zero. */
	movq	M,  I
	sarq	$1, I
	je	.L40
	ALIGN_4

/*
 * .L31: one 2x1 tile (two rows of the single trailing column); repeated
 * I times.
 *
 * .L32 / .L36 accumulate a0*b and a1*b into two x87 accumulators (one B
 * element and two A elements per step); .L38 applies the variant's step
 * and writes the two results to the panel and to C.
 */
.L31:
#ifdef LN
	/* AORIG -= K << (1 + BASE_SHIFT) */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * (KK << BASE_SHIFT);  BO = B + (KK << BASE_SHIFT) */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Two zeroed accumulators. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* rax = inner length: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* number of four-step iterations */
	je	.L35
	ALIGN_4

/* .L32: unrolled by four; consumes 8 elements of AO and 4 of BO. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L32
	ALIGN_4

/* .L35: remainder count = inner length & 3. */
.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L38
	ALIGN_4

/* .L36: single-step remainder loop. */
.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L36
	ALIGN_4

/* .L38: solve step and write-back for the 2x1 tile. */
.L38:
#if defined(LN) || defined(RT)
	/* Recompute AO / BO at index KK - 2 (LN) or KK - 1 (RT). */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Combine the accumulators with the stored values (B panel for LN/LT,
	   A panel for RN/RT). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* LN/LT: 2x2 step on the A block (entries at offsets -8, -7/-6, -5). */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

	/* RN/RT: scale both values by the single B entry at offset -8. */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write both results back to the panel, keeping a copy of each for C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* AO += 2 * ((K - KK) << BASE_SHIFT);  BO += (K - KK) << BASE_SHIFT */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* AORIG += K << (1 + BASE_SHIFT) */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/*
 * .L40: final 1x1 tile, taken when M is odd in the single-column pass.
 *
 * .L42 / .L46 accumulate a*b into one x87 accumulator; .L48 combines it
 * with the stored value, scales it by the entry at offset -8 of the A
 * panel (LN/LT) or B panel (RN/RT), and writes it to the panel and to C.
 * Falls through to .L49.
 */
.L40:
	movq	 M, %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	/* AORIG -= K << BASE_SHIFT */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + (KK << BASE_SHIFT);  BO = B + (KK << BASE_SHIFT) */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* One zeroed accumulator. */
	fldz

	/* rax = inner length: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* number of four-step iterations */
	je	.L45
	ALIGN_4

/* .L42: unrolled by four; consumes 4 elements of AO and of BO. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$4 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L42
	ALIGN_4

/* .L45: remainder count = inner length & 3. */
.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L48
	ALIGN_4

/* .L46: single-step remainder loop. */
.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$1 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L46
	ALIGN_4

/* .L48: solve step and write-back for the 1x1 tile. */
.L48:
#if defined(LN) || defined(RT)
	/* Recompute AO / BO at index KK - 1.  Both arms of the inner #ifdef
	   subtract the same constant. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Combine the accumulator with the stored value. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* Scale by the single triangular entry of the variant. */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

	/* Write the result back to the panel, keeping a copy for C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* AO and BO += (K - KK) << BASE_SHIFT */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	/* AORIG += K << BASE_SHIFT */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/*
 * .L49: end of the single-column pass.  Moves B past the column
 * (LN: B += K << BASE_SHIFT; LT/RN: B = BO) and steps KK by 1 for RN/RT,
 * then falls through to the exit.
 */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/*
 * .L999: common exit.  Restores the six registers saved at entry from
 * their frame slots, releases the frame and returns.
 */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
