1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
25#ifdef OPTERON
26#define PREFETCH	prefetch
27#define PREFETCHW	prefetchw
28#else
29#define PREFETCH	prefetcht0
30#define PREFETCHW	prefetcht0
31#endif
32
33#define PREFETCHSIZE (5 + 4 * 10)
34#define STACK	16
35#define ARGS	16
36
37#define J	 0 + STACK(%esp)
38#define KK	 4 + STACK(%esp)
39#define KKK	 8 + STACK(%esp)
40#define AORIG	12 + STACK(%esp)
41
42#define M	 4 + STACK + ARGS(%esp)
43#define N	 8 + STACK + ARGS(%esp)
44#define K	12 + STACK + ARGS(%esp)
45#define ALPHA	16 + STACK + ARGS(%esp)
46#define A	32 + STACK + ARGS(%esp)
47#define ARG_B	36 + STACK + ARGS(%esp)
48#define C	40 + STACK + ARGS(%esp)
49#define ARG_LDC	44 + STACK + ARGS(%esp)
50#define OFFSET	48 + STACK + ARGS(%esp)
51
52#define I	%esi
53#define B	%ebx
54#define CO	%edi
55#define AO	%edx
56#define BO	%ecx
57#define LDC	%ebp
58
59#define PREFETCH_OFFSET 48
60
61	PROLOGUE
62
63	subl	$ARGS, %esp	# Generate Stack Frame
64
65	pushl	%ebp
66	pushl	%edi
67	pushl	%esi
68	pushl	%ebx
69
70	PROFCODE
71
72	movl	ARG_LDC, LDC
73	movl	ARG_B,   B
74	sall	$BASE_SHIFT, LDC
75
76	addl	$8 * SIZE, A
77	addl	$8 * SIZE, B
78
79
80#ifdef LN
81       movl	M, %eax
82       sall	$BASE_SHIFT, %eax
83       addl	%eax, C
84       imull	K, %eax
85       addl	%eax, A
86#endif
87
88#ifdef RT
89       movl	N, %eax
90       sall	$BASE_SHIFT, %eax
91       imull	K, %eax
92       addl	%eax, B
93
94       movl	N,    %eax
95       imull	%ebp, %eax
96       addl	%eax, C
97#endif
98
99#ifdef RN
100       movl	OFFSET, %eax
101       negl	%eax
102       movl	%eax, KK
103#endif
104
105#ifdef RT
106       movl	N, %eax
107       subl	OFFSET, %eax
108       movl	%eax, KK
109#endif
110
111	movl	N,  %eax
112	testl	$1, %eax
113	je	.L30
114
115#if defined(LT) || defined(RN)
116	movl	A, AO
117#else
118	movl	A, %eax
119	movl	%eax, AORIG
120#endif
121
122#ifdef RT
123	movl	K, %eax
124	sall	$0 + BASE_SHIFT, %eax
125	subl	%eax, B
126#endif
127
128#ifdef RT
129	subl	LDC, C
130#endif
131	movl	C, CO
132#ifndef RT
133	addl	LDC, C
134#endif
135
136#ifdef LN
137	movl	OFFSET, %eax
138	addl	M, %eax
139	movl	%eax, KK
140#endif
141
142#ifdef LT
143	movl	OFFSET, %eax
144	movl	%eax, KK
145#endif
146
147	movl	M,  I
148	sarl	$1, I
149	je	.L40
150	ALIGN_4
151
152.L31:
153#ifdef LN
154       movl	K, %eax
155       sall	$1 + BASE_SHIFT, %eax
156       subl	%eax, AORIG
157#endif
158
159#if defined(LN) || defined(RT)
160	movl	KK, %eax
161	sall	$BASE_SHIFT, %eax
162	movl	AORIG, AO
163	leal	(AO, %eax, 2), AO
164	leal	(B,  %eax, 1), BO
165#else
166	movl	B, BO
167#endif
168
169	fldz
170	fldz
171
172#if   defined(HAVE_3DNOW)
173	prefetchw	2 * SIZE(CO)
174#elif defined(HAVE_SSE)
175	prefetchnta	2 * SIZE(CO)
176#endif
177
178#if defined(LT) || defined(RN)
179	movl	KK, %eax
180#else
181	movl	K,  %eax
182	subl	KK, %eax
183#endif
184	sarl	$2, %eax
185 	je	.L35
186	ALIGN_4
187
188.L32:
189	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
190
191	FLD	 -8 * SIZE(BO)
192	FLD	 -8 * SIZE(AO)
193	fmul	 %st(1), %st
194	faddp	 %st, %st(2)
195
196	FLD	 -7 * SIZE(AO)
197	fmulp	 %st, %st(1)
198	faddp	 %st, %st(2)
199
200	FLD	 -7 * SIZE(BO)
201	FLD	 -6 * SIZE(AO)
202	fmul	 %st(1), %st
203	faddp	 %st, %st(2)
204
205	FLD	 -5 * SIZE(AO)
206	fmulp	 %st, %st(1)
207	faddp	 %st, %st(2)
208
209	FLD	 -6 * SIZE(BO)
210	FLD	 -4 * SIZE(AO)
211	fmul	 %st(1), %st
212	faddp	 %st, %st(2)
213
214	FLD	 -3 * SIZE(AO)
215	fmulp	 %st, %st(1)
216	faddp	 %st, %st(2)
217
218	FLD	 -5 * SIZE(BO)
219	FLD	 -2 * SIZE(AO)
220	fmul	 %st(1), %st
221	faddp	 %st, %st(2)
222
223	FLD	 -1 * SIZE(AO)
224	fmulp	 %st, %st(1)
225	faddp	 %st, %st(2)
226
227	addl	$8 * SIZE,AO
228	addl	$4 * SIZE,BO
229
230	decl	%eax
231	jne	.L32
232	ALIGN_4
233
234.L35:
235#if defined(LT) || defined(RN)
236	movl	KK, %eax
237#else
238	movl	K,  %eax
239	subl	KK, %eax
240#endif
241	and	$3,  %eax
242	je	.L38
243	ALIGN_4
244
245.L36:
246	FLD	 -8 * SIZE(BO)
247
248	FLD	 -8 * SIZE(AO)
249	fmul	 %st(1), %st
250	faddp	 %st, %st(2)
251
252	FLD	 -7 * SIZE(AO)
253	fmulp	 %st, %st(1)
254	faddp	 %st, %st(2)
255
256	addl	$2 * SIZE,AO
257	addl	$1 * SIZE,BO
258
259	decl	%eax
260	jne	 .L36
261	ALIGN_4
262
263.L38:
264#if defined(LN) || defined(RT)
265	movl	KK, %eax
266#ifdef LN
267	subl	$2, %eax
268#else
269	subl	$1, %eax
270#endif
271
272	sall	$BASE_SHIFT, %eax
273
274	movl	AORIG, AO
275	leal	(AO, %eax, 2), AO
276	leal	(B,  %eax, 1), BO
277#endif
278
279#if defined(LN) || defined(LT)
280	FLD	-8 * SIZE(BO)
281	fsubp	%st, %st(1)
282	FLD	-7 * SIZE(BO)
283	fsubp	%st, %st(2)
284#else
285	FLD	-8 * SIZE(AO)
286	fsubp	%st, %st(1)
287	FLD	-7 * SIZE(AO)
288	fsubp	%st, %st(2)
289#endif
290
291#ifdef LN
292       FLD	-5 * SIZE(AO)
293       fmulp	%st, %st(2)
294
295       FLD	-6 * SIZE(AO)
296       fmul	%st(2), %st
297
298       fsubrp	%st, %st(1)
299       FLD	-8 * SIZE(AO)
300       fmulp	%st, %st(1)
301#endif
302
303#ifdef LT
304       FLD	-8 * SIZE(AO)
305       fmulp	%st, %st(1)
306
307       FLD	-7 * SIZE(AO)
308       fmul	%st(1), %st
309
310       fsubrp	%st, %st(2)
311
312       FLD	-5 * SIZE(AO)
313       fmulp	%st, %st(2)
314#endif
315
316#ifdef RN
317       FLD	-8 * SIZE(BO)
318       fmul	%st, %st(1)
319       fmulp	%st, %st(2)
320#endif
321
322#ifdef RT
323       FLD	-8 * SIZE(BO)
324       fmul	%st, %st(1)
325       fmulp	%st, %st(2)
326#endif
327
328#ifdef LN
329	subl	$2 * SIZE, CO
330#endif
331
332#if defined(LN) || defined(LT)
333	fld	%st
334	FST	-8 * SIZE(BO)
335	fxch	%st(1)
336	fld	%st
337	FST	-7 * SIZE(BO)
338#else
339	fld	%st
340	FST	-8 * SIZE(AO)
341	fxch	%st(1)
342	fld	%st
343	FST	-7 * SIZE(AO)
344#endif
345
346	FST	1 * SIZE(CO)
347	FST	0 * SIZE(CO)
348
349#ifndef LN
350	addl	$2 * SIZE, CO
351#endif
352
353#if defined(LT) || defined(RN)
354	movl	K,  %eax
355	subl	KK, %eax
356	sall	$BASE_SHIFT, %eax
357	leal	(AO, %eax, 2), AO
358	leal	(BO, %eax, 1), BO
359#endif
360
361#ifdef LN
362	subl	$2, KK
363#endif
364
365#ifdef LT
366	addl	$2, KK
367#endif
368
369#ifdef RT
370       movl	K, %eax
371       sall	$1 + BASE_SHIFT, %eax
372       addl	%eax, AORIG
373#endif
374
375	decl	I
376	jne	.L31
377	ALIGN_4
378
379.L40:
380	movl	 M, %eax
381	andl	$1, %eax
382	je	.L49
383	ALIGN_4
384
385.L41:
386#ifdef LN
387       movl	K, %eax
388       sall	$0 + BASE_SHIFT, %eax
389       subl	%eax, AORIG
390#endif
391
392#if defined(LN) || defined(RT)
393	movl	KK, %eax
394	sall	$BASE_SHIFT, %eax
395	movl	AORIG, AO
396	leal	(AO, %eax, 1), AO
397	leal	(B,  %eax, 1), BO
398#else
399	movl	B, BO
400#endif
401
402	fldz
403
404#if defined(LT) || defined(RN)
405	movl	KK, %eax
406#else
407	movl	K,  %eax
408	subl	KK, %eax
409#endif
410	sarl	$2, %eax
411 	je	.L45
412	ALIGN_4
413
414.L42:
415	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
416
417	FLD	 -8 * SIZE(AO)
418	FLD	 -8 * SIZE(BO)
419	fmulp	 %st, %st(1)
420	faddp	 %st, %st(1)
421
422	FLD	 -7 * SIZE(AO)
423	FLD	 -7 * SIZE(BO)
424	fmulp	 %st, %st(1)
425	faddp	 %st, %st(1)
426
427	FLD	 -6 * SIZE(AO)
428	FLD	 -6 * SIZE(BO)
429	fmulp	 %st, %st(1)
430	faddp	 %st, %st(1)
431
432	FLD	 -5 * SIZE(AO)
433	FLD	 -5 * SIZE(BO)
434	fmulp	 %st, %st(1)
435	faddp	 %st, %st(1)
436
437	addl	$4 * SIZE,AO
438	addl	$4 * SIZE,BO
439
440	decl	%eax
441	jne	.L42
442	ALIGN_4
443
444.L45:
445#if defined(LT) || defined(RN)
446	movl	KK, %eax
447#else
448	movl	K,  %eax
449	subl	KK, %eax
450#endif
451	and	$3,  %eax
452	je	.L48
453	ALIGN_4
454
455.L46:
456	FLD	 -8 * SIZE(AO)
457
458	FLD	 -8 * SIZE(BO)
459	fmulp	 %st, %st(1)
460	faddp	 %st, %st(1)
461
462	addl	$1 * SIZE,AO
463	addl	$1 * SIZE,BO
464
465	decl	%eax
466	jne	 .L46
467	ALIGN_4
468
469.L48:
470#if defined(LN) || defined(RT)
471	movl	KK, %eax
472#ifdef LN
473	subl	$1, %eax
474#else
475	subl	$1, %eax
476#endif
477
478	sall	$BASE_SHIFT, %eax
479
480	movl	AORIG, AO
481	leal	(AO, %eax, 1), AO
482	leal	(B,  %eax, 1), BO
483#endif
484
485#if defined(LN) || defined(LT)
486	FLD	-8 * SIZE(BO)
487	fsubp	%st, %st(1)
488#else
489	FLD	-8 * SIZE(AO)
490	fsubp	%st, %st(1)
491#endif
492
493#ifdef LN
494       FLD	-8 * SIZE(AO)
495       fmulp	%st, %st(1)
496#endif
497
498#ifdef LT
499       FLD	-8 * SIZE(AO)
500       fmulp	%st, %st(1)
501#endif
502
503#ifdef RN
504       FLD	-8 * SIZE(BO)
505       fmulp	%st, %st(1)
506#endif
507
508#ifdef RT
509       FLD	-8 * SIZE(BO)
510       fmulp	%st, %st(1)
511#endif
512
513#ifdef LN
514	subl	$1 * SIZE, CO
515#endif
516
517#if defined(LN) || defined(LT)
518	fld	%st
519	FST	-8 * SIZE(BO)
520#else
521	fld	%st
522	FST	-8 * SIZE(AO)
523#endif
524
525	FST	0 * SIZE(CO)
526
527#ifndef LN
528	addl	$1 * SIZE, CO
529#endif
530
531#if defined(LT) || defined(RN)
532	movl	K,  %eax
533	subl	KK, %eax
534	sall	$BASE_SHIFT, %eax
535	leal	(AO, %eax, 1), AO
536	leal	(BO, %eax, 1), BO
537#endif
538
539#ifdef LN
540	subl	$1, KK
541#endif
542
543#ifdef LT
544	addl	$1, KK
545#endif
546
547#ifdef RT
548       movl	K, %eax
549       sall	$0 + BASE_SHIFT, %eax
550       addl	%eax, AORIG
551#endif
552	ALIGN_4
553
554.L49:
555#ifdef LN
556       movl	K, %eax
557       sall	$BASE_SHIFT, %eax
558       leal	(B, %eax, 1), B
559#endif
560
561#if defined(LT) || defined(RN)
562	movl	BO, B
563#endif
564
565#ifdef RN
566	addl	$1, KK
567#endif
568
569#ifdef RT
570	subl	$1, KK
571#endif
572	ALIGN_4
573
574.L30:
575	movl	N,   %eax
576	sarl	$1,  %eax
577	movl	%eax, J
578	je	.L999
579	ALIGN_4
580
581.L01:
582#if defined(LT) || defined(RN)
583	movl	A, AO
584#else
585	movl	A, %eax
586	movl	%eax, AORIG
587#endif
588
589#ifdef RT
590	movl	K, %eax
591	sall	$1 + BASE_SHIFT, %eax
592	subl	%eax, B
593#endif
594
595	lea	(, LDC, 2), %eax
596
597#ifdef RT
598	subl	%eax, C
599#endif
600	movl	C, CO
601#ifndef RT
602	addl	%eax, C
603#endif
604
605#ifdef LN
606	movl	OFFSET, %eax
607	addl	M, %eax
608	movl	%eax, KK
609#endif
610
611#ifdef LT
612	movl	OFFSET, %eax
613	movl	%eax, KK
614#endif
615
616	movl	M,  I
617	sarl	$1, I
618	je	.L20
619	ALIGN_4
620
621.L11:
622#ifdef LN
623       movl	K, %eax
624       sall	$1 + BASE_SHIFT, %eax
625       subl	%eax, AORIG
626#endif
627
628#if defined(LN) || defined(RT)
629	movl	KK, %eax
630	sall	$BASE_SHIFT, %eax
631	movl	AORIG, AO
632	leal	(AO, %eax, 2), AO
633	leal	(B,  %eax, 2), BO
634#else
635	movl	B, BO
636#endif
637
638	fldz
639	fldz
640	fldz
641	fldz
642
643#if   defined(HAVE_3DNOW)
644	prefetchw	2 * SIZE(CO)
645 	prefetchw	2 * SIZE(CO, LDC, 1)
646#elif defined(HAVE_SSE)
647	prefetchnta	2 * SIZE(CO)
648 	prefetchnta	2 * SIZE(CO, LDC, 1)
649#endif
650
651#if defined(LT) || defined(RN)
652	movl	KK, %eax
653#else
654	movl	K,  %eax
655	subl	KK, %eax
656#endif
657	sarl	$2, %eax
658 	je	.L15
659	ALIGN_4
660
661.L12:
662	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
663
664	FLD	 -8 * SIZE(AO)
665
666	FLD	 -8 * SIZE(BO)
667	fld	 %st(1)
668	fmul	 %st(1), %st
669	faddp	 %st, %st(3)
670
671	FLD	 -7 * SIZE(BO)
672	fmul	 %st, %st(2)
673
674	FLD	 -7 * SIZE(AO)
675	fmul	 %st, %st(2)
676	fmulp	 %st, %st(1)
677
678	faddp	 %st, %st(6)
679	faddp	 %st, %st(4)
680	faddp	 %st, %st(2)
681
682	FLD	 -6 * SIZE(AO)
683
684	FLD	 -6 * SIZE(BO)
685	fld	 %st(1)
686	fmul	 %st(1), %st
687	faddp	 %st, %st(3)
688
689	FLD	 -5 * SIZE(BO)
690	fmul	 %st, %st(2)
691
692	FLD	 -5 * SIZE(AO)
693	fmul	 %st, %st(2)
694	fmulp	 %st, %st(1)
695
696	faddp	 %st, %st(6)
697	faddp	 %st, %st(4)
698	faddp	 %st, %st(2)
699
700	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
701
702	FLD	 -4 * SIZE(AO)
703
704	FLD	 -4 * SIZE(BO)
705	fld	 %st(1)
706	fmul	 %st(1), %st
707	faddp	 %st, %st(3)
708
709	FLD	 -3 * SIZE(BO)
710	fmul	 %st, %st(2)
711
712	FLD	 -3 * SIZE(AO)
713	fmul	 %st, %st(2)
714	fmulp	 %st, %st(1)
715
716	faddp	 %st, %st(6)
717	faddp	 %st, %st(4)
718	faddp	 %st, %st(2)
719
720	FLD	 -2 * SIZE(AO)
721
722	FLD	 -2 * SIZE(BO)
723	fld	 %st(1)
724	fmul	 %st(1), %st
725	faddp	 %st, %st(3)
726
727	FLD	 -1 * SIZE(BO)
728	fmul	 %st, %st(2)
729
730	FLD	 -1 * SIZE(AO)
731	fmul	 %st, %st(2)
732	fmulp	 %st, %st(1)
733
734	faddp	 %st, %st(6)
735	faddp	 %st, %st(4)
736	faddp	 %st, %st(2)
737
738	addl	$8 * SIZE,AO
739	addl	$8 * SIZE,BO
740
741	decl	%eax
742	jne	.L12
743	ALIGN_4
744
745.L15:
746#if defined(LT) || defined(RN)
747	movl	KK, %eax
748#else
749	movl	K,  %eax
750	subl	KK, %eax
751#endif
752	and	$3,  %eax
753	je	.L18
754	ALIGN_4
755
756.L16:
757	FLD	 -8 * SIZE(AO)
758
759	FLD	 -8 * SIZE(BO)
760	fld	 %st(1)
761	fmul	 %st(1), %st
762	faddp	 %st, %st(3)
763
764	FLD	 -7 * SIZE(BO)
765	fmul	 %st, %st(2)
766
767	FLD	 -7 * SIZE(AO)
768	fmul	 %st, %st(2)
769	fmulp	 %st, %st(1)
770
771	faddp	 %st, %st(6)
772	faddp	 %st, %st(4)
773	faddp	 %st, %st(2)
774
775	addl	$2 * SIZE,AO
776	addl	$2 * SIZE,BO
777
778	decl	%eax
779	jne	 .L16
780	ALIGN_4
781
782.L18:
783#if defined(LN) || defined(RT)
784	movl	KK, %eax
785#ifdef LN
786	subl	$2, %eax
787#else
788	subl	$2, %eax
789#endif
790
791	sall	$BASE_SHIFT, %eax
792
793	movl	AORIG, AO
794	leal	(AO, %eax, 2), AO
795	leal	(B,  %eax, 2), BO
796#endif
797
798#if defined(LN) || defined(LT)
799	FLD	-8 * SIZE(BO)
800	fsubp	%st, %st(1)
801	FLD	-7 * SIZE(BO)
802	fsubp	%st, %st(2)
803	FLD	-6 * SIZE(BO)
804	fsubp	%st, %st(3)
805	FLD	-5 * SIZE(BO)
806	fsubp	%st, %st(4)
807#else
808	FLD	-8 * SIZE(AO)
809	fsubp	%st, %st(1)
810	FLD	-7 * SIZE(AO)
811	fsubp	%st, %st(3)
812	FLD	-6 * SIZE(AO)
813	fsubp	%st, %st(2)
814	FLD	-5 * SIZE(AO)
815	fsubp	%st, %st(4)
816#endif
817
818#ifdef LN
819       FLD	-5 * SIZE(AO)
820       fmul	%st, %st(3)
821       fmulp	%st, %st(4)
822
823       FLD	-6 * SIZE(AO)
824       fmul	%st(3), %st
825       FLD	-6 * SIZE(AO)
826       fmul	%st(5), %st
827
828       fsubrp	%st, %st(3)
829       fsubrp	%st, %st(1)
830
831       FLD	-8 * SIZE(AO)
832       fmul	%st, %st(1)
833       fmulp	%st, %st(2)
834#endif
835
836#ifdef LT
837       FLD	-8 * SIZE(AO)
838       fmul	%st, %st(1)
839       fmulp	%st, %st(2)
840
841       FLD	-7 * SIZE(AO)
842       fmul	%st(1), %st
843       FLD	-7 * SIZE(AO)
844       fmul	%st(3), %st
845
846       fsubrp	%st, %st(5)
847       fsubrp	%st, %st(3)
848
849       FLD	-5 * SIZE(AO)
850       fmul	%st, %st(3)
851       fmulp	%st, %st(4)
852#endif
853
854#ifdef RN
855       FLD	-8 * SIZE(BO)
856       fmul	%st, %st(1)
857       fmulp	%st, %st(3)
858
859       FLD	-7 * SIZE(BO)
860       fmul	%st(1), %st
861       FLD	-7 * SIZE(BO)
862       fmul	%st(4), %st
863
864       fsubrp	%st, %st(5)
865       fsubrp	%st, %st(2)
866
867       FLD	-5 * SIZE(BO)
868       fmul	%st, %st(2)
869       fmulp	%st, %st(4)
870#endif
871
872#ifdef RT
873       FLD	-5 * SIZE(BO)
874       fmul	%st, %st(2)
875       fmulp	%st, %st(4)
876
877       FLD	-6 * SIZE(BO)
878       fmul	%st(2), %st
879       FLD	-6 * SIZE(BO)
880       fmul	%st(5), %st
881
882       fsubrp	%st, %st(4)
883       fsubrp	%st, %st(1)
884
885       FLD	-8 * SIZE(BO)
886       fmul	%st, %st(1)
887       fmulp	%st, %st(3)
888#endif
889
890#ifdef LN
891	subl	$2 * SIZE, CO
892#endif
893
894#if defined(LN) || defined(LT)
895	fld	%st
896	FST	-8 * SIZE(BO)
897	fxch	%st(1)
898	fld	%st
899	FST	-7 * SIZE(BO)
900	fxch	%st(2)
901	fld	%st
902	FST	-6 * SIZE(BO)
903	fxch	%st(3)
904	fld	%st
905	FST	-5 * SIZE(BO)
906
907	FST	1 * SIZE(CO, LDC)
908	FST	0 * SIZE(CO)
909	FST	0 * SIZE(CO, LDC)
910	FST	1 * SIZE(CO)
911#else
912	fld	%st
913	FST	-8 * SIZE(AO)
914	fxch	%st(2)
915	fld	%st
916	FST	-7 * SIZE(AO)
917	fxch	%st(1)
918	fld	%st
919	FST	-6 * SIZE(AO)
920	fxch	%st(3)
921	fld	%st
922	FST	-5 * SIZE(AO)
923
924	FST	1 * SIZE(CO, LDC)
925	FST	1 * SIZE(CO)
926	FST	0 * SIZE(CO)
927	FST	0 * SIZE(CO, LDC)
928#endif
929
930#ifndef LN
931	addl	$2 * SIZE, CO
932#endif
933
934#if defined(LT) || defined(RN)
935	movl	K,  %eax
936	subl	KK, %eax
937	sall	$BASE_SHIFT, %eax
938	leal	(AO, %eax, 2), AO
939	leal	(BO, %eax, 2), BO
940#endif
941
942#ifdef LN
943	subl	$2, KK
944#endif
945
946#ifdef LT
947	addl	$2, KK
948#endif
949
950#ifdef RT
951       movl	K, %eax
952       sall	$1 + BASE_SHIFT, %eax
953       addl	%eax, AORIG
954#endif
955
956	decl	I
957	jne	.L11
958	ALIGN_4
959
960.L20:
961	movl	 M, %eax
962	andl	$1, %eax
963	je	.L29
964	ALIGN_4
965
966.L21:
967#ifdef LN
968       movl	K, %eax
969       sall	$0 + BASE_SHIFT, %eax
970       subl	%eax, AORIG
971#endif
972
973#if defined(LN) || defined(RT)
974	movl	KK, %eax
975	sall	$BASE_SHIFT, %eax
976	movl	AORIG, AO
977	leal	(AO, %eax, 1), AO
978	leal	(B,  %eax, 2), BO
979#else
980	movl	B, BO
981#endif
982
983	fldz
984	fldz
985
986#if defined(LT) || defined(RN)
987	movl	KK, %eax
988#else
989	movl	K,  %eax
990	subl	KK, %eax
991#endif
992	sarl	$2, %eax
993 	je	.L25
994	ALIGN_4
995
996.L22:
997	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
998
999	FLD	 -8 * SIZE(AO)
1000
1001	FLD	 -8 * SIZE(BO)
1002	fmul	 %st(1), %st
1003	faddp	 %st, %st(2)
1004
1005	FLD	 -7 * SIZE(BO)
1006	fmulp	 %st, %st(1)
1007	faddp	 %st, %st(2)
1008
1009	FLD	 -7 * SIZE(AO)
1010
1011	FLD	 -6 * SIZE(BO)
1012	fmul	 %st(1), %st
1013	faddp	 %st, %st(2)
1014
1015	FLD	 -5 * SIZE(BO)
1016	fmulp	 %st, %st(1)
1017	faddp	 %st, %st(2)
1018
1019	FLD	 -6 * SIZE(AO)
1020
1021	FLD	 -4 * SIZE(BO)
1022	fmul	 %st(1), %st
1023	faddp	 %st, %st(2)
1024
1025	FLD	 -3 * SIZE(BO)
1026	fmulp	 %st, %st(1)
1027	faddp	 %st, %st(2)
1028
1029	FLD	 -5 * SIZE(AO)
1030
1031	FLD	 -2 * SIZE(BO)
1032	fmul	 %st(1), %st
1033	faddp	 %st, %st(2)
1034
1035	FLD	 -1 * SIZE(BO)
1036	fmulp	 %st, %st(1)
1037	faddp	 %st, %st(2)
1038
1039	addl	$4 * SIZE,AO
1040	addl	$8 * SIZE,BO
1041
1042	decl	%eax
1043	jne	.L22
1044	ALIGN_4
1045
1046.L25:
1047#if defined(LT) || defined(RN)
1048	movl	KK, %eax
1049#else
1050	movl	K,  %eax
1051	subl	KK, %eax
1052#endif
1053	and	$3,  %eax
1054	je	.L28
1055	ALIGN_4
1056
1057.L26:
1058	FLD	 -8 * SIZE(AO)
1059
1060	FLD	 -8 * SIZE(BO)
1061	fmul	 %st(1), %st
1062	faddp	 %st, %st(2)
1063
1064	FLD	 -7 * SIZE(BO)
1065	fmulp	 %st, %st(1)
1066	faddp	 %st, %st(2)
1067
1068	addl	$1 * SIZE,AO
1069	addl	$2 * SIZE,BO
1070
1071	decl	%eax
1072	jne	 .L26
1073	ALIGN_4
1074
1075.L28:
1076#if defined(LN) || defined(RT)
1077	movl	KK, %eax
1078#ifdef LN
1079	subl	$1, %eax
1080#else
1081	subl	$2, %eax
1082#endif
1083
1084	sall	$BASE_SHIFT, %eax
1085
1086	movl	AORIG, AO
1087	leal	(AO, %eax, 1), AO
1088	leal	(B,  %eax, 2), BO
1089#endif
1090
1091#if defined(LN) || defined(LT)
1092	FLD	-8 * SIZE(BO)
1093	fsubp	%st, %st(1)
1094	FLD	-7 * SIZE(BO)
1095	fsubp	%st, %st(2)
1096#else
1097	FLD	-8 * SIZE(AO)
1098	fsubp	%st, %st(1)
1099	FLD	-7 * SIZE(AO)
1100	fsubp	%st, %st(2)
1101#endif
1102
1103#if defined(LN) || defined(LT)
1104       FLD	-8 * SIZE(AO)
1105       fmul	%st, %st(1)
1106       fmulp	%st, %st(2)
1107#endif
1108
1109#ifdef RN
1110       FLD	-8 * SIZE(BO)
1111       fmulp	%st, %st(1)
1112
1113       FLD	-7 * SIZE(BO)
1114       fmul	%st(1), %st
1115
1116       fsubrp	%st, %st(2)
1117
1118       FLD	-5 * SIZE(BO)
1119       fmulp	%st, %st(2)
1120#endif
1121
1122#ifdef RT
1123       FLD	-5 * SIZE(BO)
1124       fmulp	%st, %st(2)
1125
1126       FLD	-6 * SIZE(BO)
1127       fmul	%st(2), %st
1128
1129       fsubrp	%st, %st(1)
1130
1131       FLD	-8 * SIZE(BO)
1132       fmulp	%st, %st(1)
1133#endif
1134
1135#ifdef LN
1136	subl	$1 * SIZE, CO
1137#endif
1138
1139#if defined(LN) || defined(LT)
1140	fld	%st
1141	FST	-8 * SIZE(BO)
1142	fxch	%st(1)
1143	fld	%st
1144	FST	-7 * SIZE(BO)
1145#else
1146	fld	%st
1147	FST	-8 * SIZE(AO)
1148	fxch	%st(1)
1149	fld	%st
1150	FST	-7 * SIZE(AO)
1151#endif
1152
1153	FST	0 * SIZE(CO, LDC)
1154	FST	0 * SIZE(CO)
1155
1156#ifndef LN
1157	addl	$1 * SIZE, CO
1158#endif
1159
1160#if defined(LT) || defined(RN)
1161	movl	K,  %eax
1162	subl	KK, %eax
1163	sall	$BASE_SHIFT, %eax
1164	leal	(AO, %eax, 1), AO
1165	leal	(BO, %eax, 2), BO
1166#endif
1167
1168#ifdef LN
1169	subl	$1, KK
1170#endif
1171
1172#ifdef LT
1173	addl	$1, KK
1174#endif
1175
1176#ifdef RT
1177       movl	K, %eax
1178       sall	$0 + BASE_SHIFT, %eax
1179       addl	%eax, AORIG
1180#endif
1181	ALIGN_4
1182
1183.L29:
1184#ifdef LN
1185       movl	K, %eax
1186       sall	$BASE_SHIFT, %eax
1187       leal	(B, %eax, 2), B
1188#endif
1189
1190#if defined(LT) || defined(RN)
1191	movl	BO, B
1192#endif
1193
1194#ifdef RN
1195	addl	$2, KK
1196#endif
1197
1198#ifdef RT
1199	subl	$2, KK
1200#endif
1201
1202	decl	J
1203	jne	.L01
1204	ALIGN_4
1205
1206.L999:
1207	popl	%ebx
1208	popl	%esi
1209	popl	%edi
1210	popl	%ebp
1211	addl	$ARGS, %esp
1212	ret
1213
1214	EPILOGUE
1215