1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
/* ASSEMBLER is defined before common.h is included; presumably it selects
   the assembler-side definitions in that header -- confirm in common.h. */
39#define ASSEMBLER
40#include "common.h"
41
/* Kernel arguments, aliased to ARG1..ARG6 (these macros are defined outside
   this file).  LDC lives in %r10; the prologue loads it from the stack. */
42#define M	ARG1
43#define N	ARG2
44#define K	ARG3
45#define A	ARG4
46#define B	ARG5
47#define C	ARG6
48#define LDC	%r10
49
/* Working registers: I and J are loop counters (decremented with decq);
   AO, BO and CO are the running pointers derived from A, B and C. */
50#define I	%r12
51#define J	%r13
52#define AO	%r14
53#define BO	%r15
54#define	CO	%rbp
55
/* KK is the running offset counter used to split the K range; AORIG is a
   spill slot in the local frame (offsets 0..40 hold the saved registers). */
56#define KK	%r11
57#define AORIG	 48(%rsp)
58
/* Number of bytes the prologue subtracts from %rsp. */
59#define STACKSIZE 64
60
/* Stack-passed arguments, addressed relative to %rsp after the frame has
   been allocated.  ALPHA is not referenced by the code visible in this
   kernel; OFFSET is read when KK is initialised. */
61#define ALPHA	 8 + STACKSIZE(%rsp)
62#define OFFSET	32 + STACKSIZE(%rsp)
63
/* Prefetch mnemonics: prefetch/prefetchw when OPTERON is defined, otherwise
   prefetcht0 for both. */
64#ifdef OPTERON
65#define PREFETCH	prefetch
66#define PREFETCHW	prefetchw
67#else
68#define PREFETCH	prefetcht0
69#define PREFETCHW	prefetcht0
70#endif
71
/* Prefetch distance in elements; it is multiplied by SIZE where used. */
72#define PREFETCHSIZE (5 + 4 * 10)
73
/* Function entry.  PROLOGUE and PROFCODE are macros defined outside this
   file.  Exactly one of LN / LT / RN / RT is expected to be defined by the
   build to select the solve variant -- NOTE(review): confirm in the build
   rules. */
74	PROLOGUE
75	PROFCODE
76
/* emms marks every x87 register as empty, so the fldz pushes below start
   from an empty register stack.  Only emitted for WINDOWS_ABI builds. */
77#ifdef WINDOWS_ABI
78	emms
79#endif
80
/* Allocate the local frame and save the callee-saved registers that are
   restored at .L999. */
81	subq	$STACKSIZE, %rsp
82	movq	%rbx,  0(%rsp)
83	movq	%rbp,  8(%rsp)
84	movq	%r12, 16(%rsp)
85	movq	%r13, 24(%rsp)
86	movq	%r14, 32(%rsp)
87	movq	%r15, 40(%rsp)
88
/* LDC is passed on the stack. */
89	movq	24 + STACKSIZE(%rsp), LDC
90
/* TRMM-style builds without LEFT: KK = -OFFSET. */
91#if defined(TRMMKERNEL) && !defined(LEFT)
92	movq	OFFSET, %rax
93	negq	%rax
94	movq	%rax, KK
95#endif
96
/* Bias A and B by 8 elements; every load/store below addresses them with
   displacements that start at -8 * SIZE. */
97	addq	$8 * SIZE, A
98	addq	$8 * SIZE, B
99
/* Scale LDC by 2^BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT is
   defined outside this file). */
100	salq	$BASE_SHIFT, LDC
101
/* LN: advance C by M scaled elements and A by M * K scaled elements, so
   the tiles are walked backwards from the end. */
102#ifdef LN
103       movq	M, %rax
104       salq	$BASE_SHIFT, %rax
105       addq	%rax, C
106       imulq	K, %rax
107       addq	%rax, A
108#endif
109
/* RT: advance B by N * K scaled elements and C by N * LDC. */
110#ifdef RT
111       movq	N, %rax
112       salq	$BASE_SHIFT, %rax
113       imulq	K, %rax
114       addq	%rax, B
115
116       movq	N,   %rax
117       imulq	LDC, %rax
118       addq	%rax, C
119#endif
120
/* RN: KK = -OFFSET. */
121#ifdef RN
122       movq	OFFSET, %rax
123       negq	%rax
124       movq	%rax, KK
125#endif
126
/* RT: KK = N - OFFSET. */
127#ifdef RT
128       movq	N, %rax
129       subq	OFFSET, %rax
130       movq	%rax, KK
131#endif
132
/* J = N >> 1 passes over column pairs.  The je tests the flags left by
   sarq (movq does not modify flags); with J == 0 go straight to .L30. */
133	movq	N,   %rax
134	sarq	$1,  %rax
135	movq	%rax, J
136	je	.L30
137	ALIGN_4
138
/* .L01: start of one pass over two LDC-strided columns of C (CO and
   CO + LDC).  Executed J = N >> 1 times. */
139.L01:
/* LT/RN walk A forward from its start; the other variants keep the base in
   AORIG and recompute AO per tile. */
140#if defined(LT) || defined(RN)
141	movq	A, AO
142#else
143	movq	A, %rax
144	movq	%rax, AORIG
145#endif
146
/* RT: step B back by 2 * K scaled elements. */
147#ifdef RT
148	movq	K, %rax
149	salq	$1 + BASE_SHIFT, %rax
150	subq	%rax, B
151#endif
152
/* %rax = 2 * LDC. */
153	lea	(, LDC, 2), %rax
154
/* RT moves C back before taking CO; all other variants take CO first and
   then advance C for the next pass. */
155#ifdef RT
156	subq	%rax, C
157#endif
158	movq	C, CO
159#ifndef RT
160	addq	%rax, C
161#endif
162
/* LN: KK = OFFSET + M. */
163#ifdef LN
164	movq	OFFSET, %rax
165	addq	M, %rax
166	movq	%rax, KK
167#endif
168
/* LT: KK = OFFSET. */
169#ifdef LT
170	movq	OFFSET, %rax
171	movq	%rax, KK
172#endif
173
/* I = M >> 1 two-row tiles; with none, skip to the single-row tail. */
174	movq	M,  I
175	sarq	$1, I
176	je	.L20
177	ALIGN_4
178
/* .L11: one 2x2 tile.  Four x87 accumulators are built from the packed A
   and B panels, combined with the packed right-hand side, run through the
   2x2 triangular solve of the selected variant, and written both back to
   the packed buffer and to C.
   FLD and FST are macros defined outside this file.  FST presumably
   expands to a store-and-pop form: the "fld %st" duplicate placed before
   each packed-buffer store only balances the register stack in that
   case -- confirm in common.h. */
179.L11:
/* LN: step AORIG back by 2 * K scaled elements. */
180#ifdef LN
181       movq	K, %rax
182       salq	$1 + BASE_SHIFT, %rax
183       subq	%rax, AORIG
184#endif
185
/* LN/RT: the product starts KK steps into the panels (2 elements of A and
   of B per step); LT/RN start at the beginning of B. */
186#if defined(LN) || defined(RT)
187	movq	KK, %rax
188	salq	$BASE_SHIFT, %rax
189	movq	AORIG, AO
190	leaq	(AO, %rax, 2), AO
191	leaq	(B,  %rax, 2), BO
192#else
193	movq	B, BO
194#endif
195
/* Four zeroed accumulators on the x87 stack. */
196	fldz
197	fldz
198	fldz
199	fldz
200
/* Prefetch the two lines of C this tile will store to. */
201#if   defined(HAVE_3DNOW)
202	prefetchw	2 * SIZE(CO)
203 	prefetchw	2 * SIZE(CO, LDC, 1)
204#elif defined(HAVE_SSE)
205	prefetchnta	2 * SIZE(CO)
206 	prefetchnta	2 * SIZE(CO, LDC, 1)
207#endif
208
/* Trip count of the product: KK for LT/RN, K - KK for LN/RT.  The main
   loop is unrolled four times, hence the shift by 2. */
209#if defined(LT) || defined(RN)
210	movq	KK, %rax
211#else
212	movq	K,  %rax
213	subq	KK, %rax
214#endif
215	sarq	$2, %rax
216 	je	.L15
217	ALIGN_4
218
/* .L12: four k-steps per iteration.  Each step loads two A values and two
   B values and adds the four pairwise products into the four accumulators;
   the register stack is back at depth four after every step. */
219.L12:
220	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
221
/* k-step 0: A[-8], A[-7] with B[-8], B[-7]. */
222	FLD	 -8 * SIZE(AO)
223
224	FLD	 -8 * SIZE(BO)
225	fld	 %st(1)
226	fmul	 %st(1), %st
227	faddp	 %st, %st(3)
228
229	FLD	 -7 * SIZE(BO)
230	fmul	 %st, %st(2)
231
232	FLD	 -7 * SIZE(AO)
233	fmul	 %st, %st(2)
234	fmulp	 %st, %st(1)
235
236	faddp	 %st, %st(6)
237	faddp	 %st, %st(4)
238	faddp	 %st, %st(2)
239
/* k-step 1: A[-6], A[-5] with B[-6], B[-5]. */
240	FLD	 -6 * SIZE(AO)
241
242	FLD	 -6 * SIZE(BO)
243	fld	 %st(1)
244	fmul	 %st(1), %st
245	faddp	 %st, %st(3)
246
247	FLD	 -5 * SIZE(BO)
248	fmul	 %st, %st(2)
249
250	FLD	 -5 * SIZE(AO)
251	fmul	 %st, %st(2)
252	fmulp	 %st, %st(1)
253
254	faddp	 %st, %st(6)
255	faddp	 %st, %st(4)
256	faddp	 %st, %st(2)
257
258	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
259
/* k-step 2: A[-4], A[-3] with B[-4], B[-3]. */
260	FLD	 -4 * SIZE(AO)
261
262	FLD	 -4 * SIZE(BO)
263	fld	 %st(1)
264	fmul	 %st(1), %st
265	faddp	 %st, %st(3)
266
267	FLD	 -3 * SIZE(BO)
268	fmul	 %st, %st(2)
269
270	FLD	 -3 * SIZE(AO)
271	fmul	 %st, %st(2)
272	fmulp	 %st, %st(1)
273
274	faddp	 %st, %st(6)
275	faddp	 %st, %st(4)
276	faddp	 %st, %st(2)
277
/* k-step 3: A[-2], A[-1] with B[-2], B[-1]. */
278	FLD	 -2 * SIZE(AO)
279
280	FLD	 -2 * SIZE(BO)
281	fld	 %st(1)
282	fmul	 %st(1), %st
283	faddp	 %st, %st(3)
284
285	FLD	 -1 * SIZE(BO)
286	fmul	 %st, %st(2)
287
288	FLD	 -1 * SIZE(AO)
289	fmul	 %st, %st(2)
290	fmulp	 %st, %st(1)
291
292	faddp	 %st, %st(6)
293	faddp	 %st, %st(4)
294	faddp	 %st, %st(2)
295
/* Four k-steps consumed: 8 elements of A and 8 of B. */
296	addq	$8 * SIZE,AO
297	addq	$8 * SIZE,BO
298
299	decq	%rax
300	jne	.L12
301	ALIGN_4
302
/* .L15: remainder of the trip count (count & 3), one k-step at a time. */
303.L15:
304#if defined(LT) || defined(RN)
305	movq	KK, %rax
306#else
307	movq	K,  %rax
308	subq	KK, %rax
309#endif
310	and	$3,  %rax
311	je	.L18
312	ALIGN_4
313
/* .L16: single k-step, same instruction pattern as one step of .L12. */
314.L16:
315	FLD	 -8 * SIZE(AO)
316
317	FLD	 -8 * SIZE(BO)
318	fld	 %st(1)
319	fmul	 %st(1), %st
320	faddp	 %st, %st(3)
321
322	FLD	 -7 * SIZE(BO)
323	fmul	 %st, %st(2)
324
325	FLD	 -7 * SIZE(AO)
326	fmul	 %st, %st(2)
327	fmulp	 %st, %st(1)
328
329	faddp	 %st, %st(6)
330	faddp	 %st, %st(4)
331	faddp	 %st, %st(2)
332
333	addq	$2 * SIZE,AO
334	addq	$2 * SIZE,BO
335
336	decq	%rax
337	jne	 .L16
338	ALIGN_4
339
/* .L18: solve step for the 2x2 tile. */
340.L18:
/* LN/RT: reposition AO and BO at step KK - 2 of the panels.  Both
   preprocessor arms subtract 2 because the tile is 2 wide in either
   direction. */
341#if defined(LN) || defined(RT)
342	movq	KK, %rax
343#ifdef LN
344	subq	$2, %rax
345#else
346	subq	$2, %rax
347#endif
348
349	salq	$BASE_SHIFT, %rax
350
351	movq	AORIG, AO
352	leaq	(AO, %rax, 2), AO
353	leaq	(B,  %rax, 2), BO
354#endif
355
/* Combine each accumulator with the matching packed right-hand-side value
   (from BO for LN/LT, from AO for RN/RT).  The intent is
   "packed value - accumulated product"; NOTE(review): the direction of
   fsubp/fsubrp with a register destination depends on the assembler's
   AT&T fsub/fsubr convention.  The RN/RT arm uses destinations 1,3,2,4
   because the packed A buffer interleaves the values differently from the
   accumulator order. */
356#if defined(LN) || defined(LT)
357	FLD	-8 * SIZE(BO)
358	fsubp	%st, %st(1)
359	FLD	-7 * SIZE(BO)
360	fsubp	%st, %st(2)
361	FLD	-6 * SIZE(BO)
362	fsubp	%st, %st(3)
363	FLD	-5 * SIZE(BO)
364	fsubp	%st, %st(4)
365#else
366	FLD	-8 * SIZE(AO)
367	fsubp	%st, %st(1)
368	FLD	-7 * SIZE(AO)
369	fsubp	%st, %st(3)
370	FLD	-6 * SIZE(AO)
371	fsubp	%st, %st(2)
372	FLD	-5 * SIZE(AO)
373	fsubp	%st, %st(4)
374#endif
375
/* 2x2 triangular solves.  Diagonal entries are applied by multiplication,
   not division -- presumably the packing routine stores their reciprocals;
   confirm against the pack code. */

/* LN: scale st(2),st(3) by A[-5]; eliminate into st(0),st(1) using A[-6];
   scale st(0),st(1) by A[-8]. */
376#ifdef LN
377       FLD	-5 * SIZE(AO)
378       fmul	%st, %st(3)
379       fmulp	%st, %st(4)
380
381       FLD	-6 * SIZE(AO)
382       fmul	%st(3), %st
383       FLD	-6 * SIZE(AO)
384       fmul	%st(5), %st
385
386       fsubrp	%st, %st(3)
387       fsubrp	%st, %st(1)
388
389       FLD	-8 * SIZE(AO)
390       fmul	%st, %st(1)
391       fmulp	%st, %st(2)
392#endif
393
/* LT: scale st(0),st(1) by A[-8]; eliminate into st(2),st(3) using A[-7];
   scale st(2),st(3) by A[-5]. */
394#ifdef LT
395       FLD	-8 * SIZE(AO)
396       fmul	%st, %st(1)
397       fmulp	%st, %st(2)
398
399       FLD	-7 * SIZE(AO)
400       fmul	%st(1), %st
401       FLD	-7 * SIZE(AO)
402       fmul	%st(3), %st
403
404       fsubrp	%st, %st(5)
405       fsubrp	%st, %st(3)
406
407       FLD	-5 * SIZE(AO)
408       fmul	%st, %st(3)
409       fmulp	%st, %st(4)
410#endif
411
/* RN: scale st(0),st(2) by B[-8]; eliminate into st(1),st(3) using B[-7];
   scale st(1),st(3) by B[-5]. */
412#ifdef RN
413       FLD	-8 * SIZE(BO)
414       fmul	%st, %st(1)
415       fmulp	%st, %st(3)
416
417       FLD	-7 * SIZE(BO)
418       fmul	%st(1), %st
419       FLD	-7 * SIZE(BO)
420       fmul	%st(4), %st
421
422       fsubrp	%st, %st(5)
423       fsubrp	%st, %st(2)
424
425       FLD	-5 * SIZE(BO)
426       fmul	%st, %st(2)
427       fmulp	%st, %st(4)
428#endif
429
/* RT: scale st(1),st(3) by B[-5]; eliminate into st(0),st(2) using B[-6];
   scale st(0),st(2) by B[-8]. */
430#ifdef RT
431       FLD	-5 * SIZE(BO)
432       fmul	%st, %st(2)
433       fmulp	%st, %st(4)
434
435       FLD	-6 * SIZE(BO)
436       fmul	%st(2), %st
437       FLD	-6 * SIZE(BO)
438       fmul	%st(5), %st
439
440       fsubrp	%st, %st(4)
441       fsubrp	%st, %st(1)
442
443       FLD	-8 * SIZE(BO)
444       fmul	%st, %st(1)
445       fmulp	%st, %st(3)
446#endif
447
/* LN walks C backwards: step CO back before storing. */
448#ifdef LN
449	subq	$2 * SIZE, CO
450#endif
451
/* Write the four results back to the packed buffer (each one duplicated
   with "fld %st" first, with fxch rotating the next value to the top),
   then store the four values left on the stack to C.  The order of the C
   stores follows the stack order left by the fxch sequence and differs
   between the two arms. */
452#if defined(LN) || defined(LT)
453	fld	%st
454	FST	-8 * SIZE(BO)
455	fxch	%st(1)
456	fld	%st
457	FST	-7 * SIZE(BO)
458	fxch	%st(2)
459	fld	%st
460	FST	-6 * SIZE(BO)
461	fxch	%st(3)
462	fld	%st
463	FST	-5 * SIZE(BO)
464
465	FST	1 * SIZE(CO, LDC)
466	FST	0 * SIZE(CO)
467	FST	0 * SIZE(CO, LDC)
468	FST	1 * SIZE(CO)
469#else
470	fld	%st
471	FST	-8 * SIZE(AO)
472	fxch	%st(2)
473	fld	%st
474	FST	-7 * SIZE(AO)
475	fxch	%st(1)
476	fld	%st
477	FST	-6 * SIZE(AO)
478	fxch	%st(3)
479	fld	%st
480	FST	-5 * SIZE(AO)
481
482	FST	1 * SIZE(CO, LDC)
483	FST	1 * SIZE(CO)
484	FST	0 * SIZE(CO)
485	FST	0 * SIZE(CO, LDC)
486#endif
487
/* All variants except LN advance CO past the two stored elements. */
488#ifndef LN
489	addq	$2 * SIZE, CO
490#endif
491
/* LT/RN: skip the remaining K - KK steps of both panels so AO and BO
   point at the start of the next tile. */
492#if defined(LT) || defined(RN)
493	movq	K,  %rax
494	subq	KK, %rax
495	salq	$BASE_SHIFT, %rax
496	leaq	(AO, %rax, 2), AO
497	leaq	(BO, %rax, 2), BO
498#endif
499
/* Left-side variants move KK by the tile height: LN down, LT up. */
500#ifdef LN
501	subq	$2, KK
502#endif
503
504#ifdef LT
505	addq	$2, KK
506#endif
507
/* RT: advance AORIG by 2 * K scaled elements to the next A panel. */
508#ifdef RT
509       movq	K, %rax
510       salq	$1 + BASE_SHIFT, %rax
511       addq	%rax, AORIG
512#endif
513
514	decq	I
515	jne	.L11
516	ALIGN_4
517
/* .L20: single-row tail of the column-pair pass; taken only when M is
   odd.  Handles one 1x2 tile (one element of A and two of B per k-step). */
518.L20:
519	movq	 M, %rax
520	andq	$1, %rax
521	je	.L29
522	ALIGN_4
523
524.L21:
/* LN: step AORIG back by K scaled elements. */
525#ifdef LN
526       movq	K, %rax
527       salq	$0 + BASE_SHIFT, %rax
528       subq	%rax, AORIG
529#endif
530
/* LN/RT: start KK steps into the panels (1 element of A, 2 of B per
   step); LT/RN start at the beginning of B. */
531#if defined(LN) || defined(RT)
532	movq	KK, %rax
533	salq	$BASE_SHIFT, %rax
534	movq	AORIG, AO
535	leaq	(AO, %rax, 1), AO
536	leaq	(B,  %rax, 2), BO
537#else
538	movq	B, BO
539#endif
540
/* Two zeroed accumulators. */
541	fldz
542	fldz
543
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop unrolled by 4. */
544#if defined(LT) || defined(RN)
545	movq	KK, %rax
546#else
547	movq	K,  %rax
548	subq	KK, %rax
549#endif
550	sarq	$2, %rax
551 	je	.L25
552	ALIGN_4
553
/* .L22: four k-steps per iteration.  Each step loads one A value and
   multiplies it by two B values, adding one product to each accumulator. */
554.L22:
555	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
556
557	FLD	 -8 * SIZE(AO)
558
559	FLD	 -8 * SIZE(BO)
560	fmul	 %st(1), %st
561	faddp	 %st, %st(2)
562
563	FLD	 -7 * SIZE(BO)
564	fmulp	 %st, %st(1)
565	faddp	 %st, %st(2)
566
567	FLD	 -7 * SIZE(AO)
568
569	FLD	 -6 * SIZE(BO)
570	fmul	 %st(1), %st
571	faddp	 %st, %st(2)
572
573	FLD	 -5 * SIZE(BO)
574	fmulp	 %st, %st(1)
575	faddp	 %st, %st(2)
576
577	FLD	 -6 * SIZE(AO)
578
579	FLD	 -4 * SIZE(BO)
580	fmul	 %st(1), %st
581	faddp	 %st, %st(2)
582
583	FLD	 -3 * SIZE(BO)
584	fmulp	 %st, %st(1)
585	faddp	 %st, %st(2)
586
587	FLD	 -5 * SIZE(AO)
588
589	FLD	 -2 * SIZE(BO)
590	fmul	 %st(1), %st
591	faddp	 %st, %st(2)
592
593	FLD	 -1 * SIZE(BO)
594	fmulp	 %st, %st(1)
595	faddp	 %st, %st(2)
596
/* Four k-steps consumed: 4 elements of A and 8 of B. */
597	addq	$4 * SIZE,AO
598	addq	$8 * SIZE,BO
599
600	decq	%rax
601	jne	.L22
602	ALIGN_4
603
/* .L25: remainder (count & 3), one k-step at a time. */
604.L25:
605#if defined(LT) || defined(RN)
606	movq	KK, %rax
607#else
608	movq	K,  %rax
609	subq	KK, %rax
610#endif
611	and	$3,  %rax
612	je	.L28
613	ALIGN_4
614
615.L26:
616	FLD	 -8 * SIZE(AO)
617
618	FLD	 -8 * SIZE(BO)
619	fmul	 %st(1), %st
620	faddp	 %st, %st(2)
621
622	FLD	 -7 * SIZE(BO)
623	fmulp	 %st, %st(1)
624	faddp	 %st, %st(2)
625
626	addq	$1 * SIZE,AO
627	addq	$2 * SIZE,BO
628
629	decq	%rax
630	jne	 .L26
631	ALIGN_4
632
/* .L28: solve step for the 1x2 tile. */
633.L28:
/* LN/RT: reposition at step KK - 1 (LN, tile is 1 row high) or KK - 2
   (RT, tile is 2 columns wide). */
634#if defined(LN) || defined(RT)
635	movq	KK, %rax
636#ifdef LN
637	subq	$1, %rax
638#else
639	subq	$2, %rax
640#endif
641
642	salq	$BASE_SHIFT, %rax
643
644	movq	AORIG, AO
645	leaq	(AO, %rax, 1), AO
646	leaq	(B,  %rax, 2), BO
647#endif
648
/* Combine the two accumulators with the packed right-hand-side values
   (BO for LN/LT, AO for RN/RT); intended as "packed value - product".
   NOTE(review): direction depends on the assembler's AT&T fsub/fsubr
   convention. */
649#if defined(LN) || defined(LT)
650	FLD	-8 * SIZE(BO)
651	fsubp	%st, %st(1)
652	FLD	-7 * SIZE(BO)
653	fsubp	%st, %st(2)
654#else
655	FLD	-8 * SIZE(AO)
656	fsubp	%st, %st(1)
657	FLD	-7 * SIZE(AO)
658	fsubp	%st, %st(2)
659#endif
660
/* Left-side variants: the A block is 1x1, so both values are scaled by
   A[-8] (multiplied, not divided -- presumably a stored reciprocal;
   confirm against the pack code). */
661#if defined(LN) || defined(LT)
662       FLD	-8 * SIZE(AO)
663       fmul	%st, %st(1)
664       fmulp	%st, %st(2)
665#endif
666
/* RN: scale st(0) by B[-8]; eliminate into st(1) using B[-7]; scale st(1)
   by B[-5]. */
667#ifdef RN
668       FLD	-8 * SIZE(BO)
669       fmulp	%st, %st(1)
670
671       FLD	-7 * SIZE(BO)
672       fmul	%st(1), %st
673
674       fsubrp	%st, %st(2)
675
676       FLD	-5 * SIZE(BO)
677       fmulp	%st, %st(2)
678#endif
679
/* RT: scale st(1) by B[-5]; eliminate into st(0) using B[-6]; scale st(0)
   by B[-8]. */
680#ifdef RT
681       FLD	-5 * SIZE(BO)
682       fmulp	%st, %st(2)
683
684       FLD	-6 * SIZE(BO)
685       fmul	%st(2), %st
686
687       fsubrp	%st, %st(1)
688
689       FLD	-8 * SIZE(BO)
690       fmulp	%st, %st(1)
691#endif
692
/* LN walks C backwards: step CO back before storing. */
693#ifdef LN
694	subq	$1 * SIZE, CO
695#endif
696
/* Write both results back to the packed buffer, duplicating each with
   "fld %st" so a copy stays on the stack for the C store. */
697#if defined(LN) || defined(LT)
698	fld	%st
699	FST	-8 * SIZE(BO)
700	fxch	%st(1)
701	fld	%st
702	FST	-7 * SIZE(BO)
703#else
704	fld	%st
705	FST	-8 * SIZE(AO)
706	fxch	%st(1)
707	fld	%st
708	FST	-7 * SIZE(AO)
709#endif
710
/* After the fxch the second result is on top, so CO + LDC is stored
   first, then CO. */
711	FST	0 * SIZE(CO, LDC)
712	FST	0 * SIZE(CO)
713
714#ifndef LN
715	addq	$1 * SIZE, CO
716#endif
717
/* LT/RN: skip the remaining K - KK steps of both panels. */
718#if defined(LT) || defined(RN)
719	movq	K,  %rax
720	subq	KK, %rax
721	salq	$BASE_SHIFT, %rax
722	leaq	(AO, %rax, 1), AO
723	leaq	(BO, %rax, 2), BO
724#endif
725
/* Left-side variants move KK by the tile height (1). */
726#ifdef LN
727	subq	$1, KK
728#endif
729
730#ifdef LT
731	addq	$1, KK
732#endif
733
/* RT: advance AORIG by K scaled elements. */
734#ifdef RT
735       movq	K, %rax
736       salq	$0 + BASE_SHIFT, %rax
737       addq	%rax, AORIG
738#endif
739	ALIGN_4
740
/* .L29: end of one column-pair pass; update B and KK, then loop on J. */
741.L29:
/* LN: advance B by 2 * K scaled elements to the next B panel. */
742#ifdef LN
743       movq	K, %rax
744       salq	$BASE_SHIFT, %rax
745       leaq	(B, %rax, 2), B
746#endif
747
/* LT/RN: BO already points past the panel just processed. */
748#if defined(LT) || defined(RN)
749	movq	BO, B
750#endif
751
/* Right-side variants move KK by the pass width (2): RN up, RT down. */
752#ifdef RN
753	addq	$2, KK
754#endif
755
756#ifdef RT
757	subq	$2, KK
758#endif
759
760	decq	J
761	jne	.L01
762	ALIGN_4
763
/* .L30: handle the last single column of C when N is odd; otherwise jump
   to the epilogue.  Same setup as .L01 but with a width of 1. */
764.L30:
765	movq	N,  %rax
766	testq	$1, %rax
767	je	.L999
768
/* LT/RN walk A forward from its start; the other variants keep the base in
   AORIG and recompute AO per tile. */
769#if defined(LT) || defined(RN)
770	movq	A, AO
771#else
772	movq	A, %rax
773	movq	%rax, AORIG
774#endif
775
/* RT: step B back by K scaled elements. */
776#ifdef RT
777	movq	K, %rax
778	salq	$0 + BASE_SHIFT, %rax
779	subq	%rax, B
780#endif
781
/* RT moves C back by LDC before taking CO; the other variants take CO
   first and then advance C. */
782#ifdef RT
783	subq	LDC, C
784#endif
785	movq	C, CO
786#ifndef RT
787	addq	LDC, C
788#endif
789
/* LN: KK = OFFSET + M. */
790#ifdef LN
791	movq	OFFSET, %rax
792	addq	M, %rax
793	movq	%rax, KK
794#endif
795
/* LT: KK = OFFSET. */
796#ifdef LT
797	movq	OFFSET, %rax
798	movq	%rax, KK
799#endif
800
/* I = M >> 1 two-row tiles; with none, skip to the single-row tail. */
801	movq	M,  I
802	sarq	$1, I
803	je	.L40
804	ALIGN_4
805
/* .L31: one 2x1 tile (two elements of A and one of B per k-step).
   FLD and FST are macros defined outside this file; FST presumably pops
   the register stack after storing -- confirm in common.h. */
806.L31:
/* LN: step AORIG back by 2 * K scaled elements. */
807#ifdef LN
808       movq	K, %rax
809       salq	$1 + BASE_SHIFT, %rax
810       subq	%rax, AORIG
811#endif
812
/* LN/RT: start KK steps into the panels (2 elements of A, 1 of B per
   step); LT/RN start at the beginning of B. */
813#if defined(LN) || defined(RT)
814	movq	KK, %rax
815	salq	$BASE_SHIFT, %rax
816	movq	AORIG, AO
817	leaq	(AO, %rax, 2), AO
818	leaq	(B,  %rax, 1), BO
819#else
820	movq	B, BO
821#endif
822
/* Two zeroed accumulators. */
823	fldz
824	fldz
825
826#if   defined(HAVE_3DNOW)
827	prefetchw	2 * SIZE(CO)
828#elif defined(HAVE_SSE)
829	prefetchnta	2 * SIZE(CO)
830#endif
831
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop unrolled by 4. */
832#if defined(LT) || defined(RN)
833	movq	KK, %rax
834#else
835	movq	K,  %rax
836	subq	KK, %rax
837#endif
838	sarq	$2, %rax
839 	je	.L35
840	ALIGN_4
841
/* .L32: four k-steps per iteration.  Each step loads one B value and
   multiplies it by two A values, adding one product to each accumulator. */
842.L32:
843	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
844
845	FLD	 -8 * SIZE(BO)
846	FLD	 -8 * SIZE(AO)
847	fmul	 %st(1), %st
848	faddp	 %st, %st(2)
849
850	FLD	 -7 * SIZE(AO)
851	fmulp	 %st, %st(1)
852	faddp	 %st, %st(2)
853
854	FLD	 -7 * SIZE(BO)
855	FLD	 -6 * SIZE(AO)
856	fmul	 %st(1), %st
857	faddp	 %st, %st(2)
858
859	FLD	 -5 * SIZE(AO)
860	fmulp	 %st, %st(1)
861	faddp	 %st, %st(2)
862
863	FLD	 -6 * SIZE(BO)
864	FLD	 -4 * SIZE(AO)
865	fmul	 %st(1), %st
866	faddp	 %st, %st(2)
867
868	FLD	 -3 * SIZE(AO)
869	fmulp	 %st, %st(1)
870	faddp	 %st, %st(2)
871
872	FLD	 -5 * SIZE(BO)
873	FLD	 -2 * SIZE(AO)
874	fmul	 %st(1), %st
875	faddp	 %st, %st(2)
876
877	FLD	 -1 * SIZE(AO)
878	fmulp	 %st, %st(1)
879	faddp	 %st, %st(2)
880
/* Four k-steps consumed: 8 elements of A and 4 of B. */
881	addq	$8 * SIZE,AO
882	addq	$4 * SIZE,BO
883
884	decq	%rax
885	jne	.L32
886	ALIGN_4
887
/* .L35: remainder (count & 3), one k-step at a time. */
888.L35:
889#if defined(LT) || defined(RN)
890	movq	KK, %rax
891#else
892	movq	K,  %rax
893	subq	KK, %rax
894#endif
895	and	$3,  %rax
896	je	.L38
897	ALIGN_4
898
899.L36:
900	FLD	 -8 * SIZE(BO)
901
902	FLD	 -8 * SIZE(AO)
903	fmul	 %st(1), %st
904	faddp	 %st, %st(2)
905
906	FLD	 -7 * SIZE(AO)
907	fmulp	 %st, %st(1)
908	faddp	 %st, %st(2)
909
910	addq	$2 * SIZE,AO
911	addq	$1 * SIZE,BO
912
913	decq	%rax
914	jne	 .L36
915	ALIGN_4
916
/* .L38: solve step for the 2x1 tile. */
917.L38:
/* LN/RT: reposition at step KK - 2 (LN, tile is 2 rows high) or KK - 1
   (RT, tile is 1 column wide). */
918#if defined(LN) || defined(RT)
919	movq	KK, %rax
920#ifdef LN
921	subq	$2, %rax
922#else
923	subq	$1, %rax
924#endif
925
926	salq	$BASE_SHIFT, %rax
927
928	movq	AORIG, AO
929	leaq	(AO, %rax, 2), AO
930	leaq	(B,  %rax, 1), BO
931#endif
932
/* Combine the two accumulators with the packed right-hand-side values
   (BO for LN/LT, AO for RN/RT); intended as "packed value - product".
   NOTE(review): direction depends on the assembler's AT&T fsub/fsubr
   convention. */
933#if defined(LN) || defined(LT)
934	FLD	-8 * SIZE(BO)
935	fsubp	%st, %st(1)
936	FLD	-7 * SIZE(BO)
937	fsubp	%st, %st(2)
938#else
939	FLD	-8 * SIZE(AO)
940	fsubp	%st, %st(1)
941	FLD	-7 * SIZE(AO)
942	fsubp	%st, %st(2)
943#endif
944
/* Diagonal entries are applied by multiplication -- presumably stored as
   reciprocals by the packing routine; confirm against the pack code. */

/* LN: scale st(1) by A[-5]; eliminate into st(0) using A[-6]; scale st(0)
   by A[-8]. */
945#ifdef LN
946       FLD	-5 * SIZE(AO)
947       fmulp	%st, %st(2)
948
949       FLD	-6 * SIZE(AO)
950       fmul	%st(2), %st
951
952       fsubrp	%st, %st(1)
953       FLD	-8 * SIZE(AO)
954       fmulp	%st, %st(1)
955#endif
956
/* LT: scale st(0) by A[-8]; eliminate into st(1) using A[-7]; scale st(1)
   by A[-5]. */
957#ifdef LT
958       FLD	-8 * SIZE(AO)
959       fmulp	%st, %st(1)
960
961       FLD	-7 * SIZE(AO)
962       fmul	%st(1), %st
963
964       fsubrp	%st, %st(2)
965
966       FLD	-5 * SIZE(AO)
967       fmulp	%st, %st(2)
968#endif
969
/* Right-side variants: the B block is 1x1, so both values are scaled by
   B[-8]. */
970#ifdef RN
971       FLD	-8 * SIZE(BO)
972       fmul	%st, %st(1)
973       fmulp	%st, %st(2)
974#endif
975
976#ifdef RT
977       FLD	-8 * SIZE(BO)
978       fmul	%st, %st(1)
979       fmulp	%st, %st(2)
980#endif
981
/* LN walks C backwards: step CO back before storing. */
982#ifdef LN
983	subq	$2 * SIZE, CO
984#endif
985
/* Write both results back to the packed buffer, duplicating each with
   "fld %st" so a copy stays on the stack for the C store. */
986#if defined(LN) || defined(LT)
987	fld	%st
988	FST	-8 * SIZE(BO)
989	fxch	%st(1)
990	fld	%st
991	FST	-7 * SIZE(BO)
992#else
993	fld	%st
994	FST	-8 * SIZE(AO)
995	fxch	%st(1)
996	fld	%st
997	FST	-7 * SIZE(AO)
998#endif
999
/* After the fxch the second result is on top, so element 1 of C is stored
   first, then element 0. */
1000	FST	1 * SIZE(CO)
1001	FST	0 * SIZE(CO)
1002
1003#ifndef LN
1004	addq	$2 * SIZE, CO
1005#endif
1006
/* LT/RN: skip the remaining K - KK steps of both panels. */
1007#if defined(LT) || defined(RN)
1008	movq	K,  %rax
1009	subq	KK, %rax
1010	salq	$BASE_SHIFT, %rax
1011	leaq	(AO, %rax, 2), AO
1012	leaq	(BO, %rax, 1), BO
1013#endif
1014
/* Left-side variants move KK by the tile height (2). */
1015#ifdef LN
1016	subq	$2, KK
1017#endif
1018
1019#ifdef LT
1020	addq	$2, KK
1021#endif
1022
/* RT: advance AORIG by 2 * K scaled elements. */
1023#ifdef RT
1024       movq	K, %rax
1025       salq	$1 + BASE_SHIFT, %rax
1026       addq	%rax, AORIG
1027#endif
1028
1029	decq	I
1030	jne	.L31
1031	ALIGN_4
1032
/* .L40: single-row tail of the single-column pass; taken only when M is
   odd.  Handles one 1x1 tile with a single accumulator. */
1033.L40:
1034	movq	 M, %rax
1035	andq	$1, %rax
1036	je	.L49
1037	ALIGN_4
1038
1039.L41:
/* LN: step AORIG back by K scaled elements. */
1040#ifdef LN
1041       movq	K, %rax
1042       salq	$0 + BASE_SHIFT, %rax
1043       subq	%rax, AORIG
1044#endif
1045
/* LN/RT: start KK steps into the panels (1 element each per step);
   LT/RN start at the beginning of B. */
1046#if defined(LN) || defined(RT)
1047	movq	KK, %rax
1048	salq	$BASE_SHIFT, %rax
1049	movq	AORIG, AO
1050	leaq	(AO, %rax, 1), AO
1051	leaq	(B,  %rax, 1), BO
1052#else
1053	movq	B, BO
1054#endif
1055
/* One zeroed accumulator. */
1056	fldz
1057
/* Trip count: KK for LT/RN, K - KK for LN/RT; main loop unrolled by 4. */
1058#if defined(LT) || defined(RN)
1059	movq	KK, %rax
1060#else
1061	movq	K,  %rax
1062	subq	KK, %rax
1063#endif
1064	sarq	$2, %rax
1065 	je	.L45
1066	ALIGN_4
1067
/* .L42: four k-steps per iteration; each adds A[i] * B[i] to the
   accumulator. */
1068.L42:
1069	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
1070
1071	FLD	 -8 * SIZE(AO)
1072	FLD	 -8 * SIZE(BO)
1073	fmulp	 %st, %st(1)
1074	faddp	 %st, %st(1)
1075
1076	FLD	 -7 * SIZE(AO)
1077	FLD	 -7 * SIZE(BO)
1078	fmulp	 %st, %st(1)
1079	faddp	 %st, %st(1)
1080
1081	FLD	 -6 * SIZE(AO)
1082	FLD	 -6 * SIZE(BO)
1083	fmulp	 %st, %st(1)
1084	faddp	 %st, %st(1)
1085
1086	FLD	 -5 * SIZE(AO)
1087	FLD	 -5 * SIZE(BO)
1088	fmulp	 %st, %st(1)
1089	faddp	 %st, %st(1)
1090
1091	addq	$4 * SIZE,AO
1092	addq	$4 * SIZE,BO
1093
1094	decq	%rax
1095	jne	.L42
1096	ALIGN_4
1097
/* .L45: remainder (count & 3), one k-step at a time. */
1098.L45:
1099#if defined(LT) || defined(RN)
1100	movq	KK, %rax
1101#else
1102	movq	K,  %rax
1103	subq	KK, %rax
1104#endif
1105	and	$3,  %rax
1106	je	.L48
1107	ALIGN_4
1108
1109.L46:
1110	FLD	 -8 * SIZE(AO)
1111
1112	FLD	 -8 * SIZE(BO)
1113	fmulp	 %st, %st(1)
1114	faddp	 %st, %st(1)
1115
1116	addq	$1 * SIZE,AO
1117	addq	$1 * SIZE,BO
1118
1119	decq	%rax
1120	jne	 .L46
1121	ALIGN_4
1122
/* .L48: solve step for the 1x1 tile. */
1123.L48:
/* LN/RT: reposition at step KK - 1 (both arms subtract 1: the tile is 1x1). */
1124#if defined(LN) || defined(RT)
1125	movq	KK, %rax
1126#ifdef LN
1127	subq	$1, %rax
1128#else
1129	subq	$1, %rax
1130#endif
1131
1132	salq	$BASE_SHIFT, %rax
1133
1134	movq	AORIG, AO
1135	leaq	(AO, %rax, 1), AO
1136	leaq	(B,  %rax, 1), BO
1137#endif
1138
/* Combine the accumulator with the packed right-hand-side value (BO for
   LN/LT, AO for RN/RT); intended as "packed value - product".
   NOTE(review): direction depends on the assembler's AT&T fsub/fsubr
   convention. */
1139#if defined(LN) || defined(LT)
1140	FLD	-8 * SIZE(BO)
1141	fsubp	%st, %st(1)
1142#else
1143	FLD	-8 * SIZE(AO)
1144	fsubp	%st, %st(1)
1145#endif
1146
/* 1x1 solve: multiply by the single diagonal entry (A[-8] for the left
   variants, B[-8] for the right variants) -- presumably a stored
   reciprocal; confirm against the pack code. */
1147#ifdef LN
1148       FLD	-8 * SIZE(AO)
1149       fmulp	%st, %st(1)
1150#endif
1151
1152#ifdef LT
1153       FLD	-8 * SIZE(AO)
1154       fmulp	%st, %st(1)
1155#endif
1156
1157#ifdef RN
1158       FLD	-8 * SIZE(BO)
1159       fmulp	%st, %st(1)
1160#endif
1161
1162#ifdef RT
1163       FLD	-8 * SIZE(BO)
1164       fmulp	%st, %st(1)
1165#endif
1166
/* LN walks C backwards: step CO back before storing. */
1167#ifdef LN
1168	subq	$1 * SIZE, CO
1169#endif
1170
/* Write the result back to the packed buffer (via a duplicate), then
   store the remaining copy to C. */
1171#if defined(LN) || defined(LT)
1172	fld	%st
1173	FST	-8 * SIZE(BO)
1174#else
1175	fld	%st
1176	FST	-8 * SIZE(AO)
1177#endif
1178
1179	FST	0 * SIZE(CO)
1180
1181#ifndef LN
1182	addq	$1 * SIZE, CO
1183#endif
1184
/* LT/RN: skip the remaining K - KK steps of both panels. */
1185#if defined(LT) || defined(RN)
1186	movq	K,  %rax
1187	subq	KK, %rax
1188	salq	$BASE_SHIFT, %rax
1189	leaq	(AO, %rax, 1), AO
1190	leaq	(BO, %rax, 1), BO
1191#endif
1192
/* Left-side variants move KK by the tile height (1). */
1193#ifdef LN
1194	subq	$1, KK
1195#endif
1196
1197#ifdef LT
1198	addq	$1, KK
1199#endif
1200
/* RT: advance AORIG by K scaled elements. */
1201#ifdef RT
1202       movq	K, %rax
1203       salq	$0 + BASE_SHIFT, %rax
1204       addq	%rax, AORIG
1205#endif
1206	ALIGN_4
1207
/* .L49: end of the single-column pass; final B and KK bookkeeping. */
1208.L49:
/* LN: advance B by K scaled elements. */
1209#ifdef LN
1210       movq	K, %rax
1211       salq	$BASE_SHIFT, %rax
1212       leaq	(B, %rax, 1), B
1213#endif
1214
/* LT/RN: BO already points past the panel just processed. */
1215#if defined(LT) || defined(RN)
1216	movq	BO, B
1217#endif
1218
/* Right-side variants move KK by the pass width (1). */
1219#ifdef RN
1220	addq	$1, KK
1221#endif
1222
1223#ifdef RT
1224	subq	$1, KK
1225#endif
1226	ALIGN_4
1227
/* .L999: common exit.  Restore the registers saved by the prologue from
   the same frame slots, release the frame and return. */
1228.L999:
1229	movq	  0(%rsp), %rbx
1230	movq	  8(%rsp), %rbp
1231	movq	 16(%rsp), %r12
1232	movq	 24(%rsp), %r13
1233	movq	 32(%rsp), %r14
1234	movq	 40(%rsp), %r15
1235	addq	$STACKSIZE, %rsp
1236	ret
1237
1238	EPILOGUE
1239