/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Build-time configuration, stack-frame layout and register aliases for
 * the x87 kernel that follows (32-bit x86, AT&T syntax, run through cpp).
 *
 * The decimal line numbers that had been fused onto the start of every
 * line were removed: with them in place none of the directives below was
 * recognised by the preprocessor.
 */
#define ASSEMBLER
#include "common.h"

/* Prefetch mnemonics: AMD forms when OPTERON is defined, prefetcht0 otherwise. */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance ahead of AO, in elements (multiplied by SIZE at use). */
#define PREFETCHSIZE (5 + 4 * 10)

/*
 * Frame layout after the prologue (subl $ARGS, then four pushl):
 *
 *   0 .. STACK-1          (%esp)  saved %ebx, %esi, %edi, %ebp
 *   STACK .. STACK+ARGS-1 (%esp)  local scratch (J, KK, AORIG)
 *   STACK+ARGS            (%esp)  return address
 *   STACK+ARGS+4 ...      (%esp)  caller-supplied arguments
 */
#define STACK	16
#define ARGS	16

/* Local scratch slots. */
#define J	 0 + STACK(%esp)	/* remaining column-pair iterations   */
#define KK	 4 + STACK(%esp)	/* current k offset of the solve      */
#define AORIG	 8 + STACK(%esp)	/* A panel base (LN / RT variants)    */

/*
 * Incoming arguments.  ALPHA is given a slot here but is not referenced
 * anywhere in this file.
 */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

/* Register roles for the whole kernel (%eax is the general scratch). */
#define I	%esi	/* row-pair loop counter                      */
#define B	%ebx	/* base of the current B panel                */
#define CO	%edi	/* write cursor into C                        */
#define AO	%edx	/* read cursor into A                         */
#define BO	%ecx	/* read cursor into B                         */
#define LDC	%ebp	/* C leading dimension, in bytes after setup  */

/* Not referenced by the code in this file. */
#define PREFETCH_OFFSET 48

/*
 * Kernel entry and one-time setup.
 *
 * The entry symbol is emitted by the PROLOGUE macro (common.h).  Exactly
 * one of LN / LT / RN / RT is expected to be defined by the build; each
 * selects the traversal direction and which operand is solved in place.
 *
 * The decimal line numbers previously fused onto each line were removed.
 */
	PROLOGUE

	subl	$ARGS, %esp	/* reserve local scratch: J, KK, AORIG */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_LDC, LDC
	movl	ARG_B,   B
	sall	$BASE_SHIFT, LDC	/* scale the leading dimension by 2^BASE_SHIFT */

	/*
	 * Bias both panel pointers by 8 elements so that every operand in the
	 * loops below is addressed with displacements -8*SIZE .. -1*SIZE.
	 * Note that A is updated in its argument slot on the stack.
	 */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B

#ifdef LN
	/* Backward row traversal: C += M elements, A += M*K elements. */
	movl	M, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* Backward column traversal: B += N*K elements, C += N*LDC bytes. */
	movl	N, %eax
	sall	$BASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N,    %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* J = N / 2 column pairs; with none, go straight to the odd column. */
	movl	N,   %eax
	sarl	$1,  %eax
	movl	%eax, J		/* movl leaves the flags from sarl intact */
	je	.L30
	ALIGN_4

/*
 * .L01 - head of the loop over pairs of columns (runs J times).
 * Resets the A cursor, positions B and C for this column pair, seeds KK
 * for the left-side variants and dispatches to the row loops.
 * (Line-number residue fused onto each line was removed.)
 */
.L01:
#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* step B back by one K x 2 panel */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

	leal	(, LDC, 2), %eax	/* byte width of two columns of C */

#ifdef RT
	subl	%eax, C		/* RT: move C back before use */
#endif
	movl	C, CO
#ifndef RT
	addl	%eax, C		/* others: advance C for the next pair */
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* I = M / 2 row pairs; with none, only the odd row is handled. */
	movl	M,  I
	sarl	$1, I
	je	.L20
	ALIGN_4

/*
 * .L11 - one 2x2 tile: two A values and two B values per k step.
 * Runs I = M / 2 times for the current column pair.
 *
 * Accumulators on the x87 stack between k steps (top first):
 *   st(0) = sum a0*b0   st(1) = sum a0*b1
 *   st(2) = sum a1*b0   st(3) = sum a1*b1
 * A k step pushes up to four temporaries, so all eight x87 registers are
 * in use at the peak; nothing else may be left on the stack on entry.
 *
 * Operand-order note: per the GNU as manual ("AT&T Syntax bugs"),
 * "fsubp %st, %st(i)" leaves st - st(i) in st(i) and "fsubrp %st, %st(i)"
 * leaves st(i) - st.  The code relies on this; do not swap the mnemonics.
 *
 * FLD / FST come from common.h; the stack bookkeeping below only works if
 * FST stores and pops - TODO confirm against common.h.
 *
 * (Line-number residue fused onto each line was removed.)
 */
.L11:
#ifdef LN
	/* step AORIG back by one 2 x K panel */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* start KK steps in: AO = AORIG + 2*KK, BO = B + 2*KK (elements) */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* four zeroed accumulators */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* k trip count: KK for LT/RN, K - KK for LN/RT; 4 steps per pass */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L15
	ALIGN_4

/* Main loop: four identical k steps, operands at -8/-7, -6/-5, -4/-3, -2/-1. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)		/* a0 */

	FLD	 -8 * SIZE(BO)		/* b0 */
	fld	 %st(1)
	fmul	 %st(1), %st		/* a0*b0 */
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)		/* b1 */
	fmul	 %st, %st(2)		/* a0 slot becomes a0*b1 */

	FLD	 -7 * SIZE(AO)		/* a1 */
	fmul	 %st, %st(2)		/* b0 slot becomes a1*b0 */
	fmulp	 %st, %st(1)		/* b1 slot becomes a1*b1 */

	faddp	 %st, %st(6)		/* a1*b1 */
	faddp	 %st, %st(4)		/* a1*b0 */
	faddp	 %st, %st(2)		/* a0*b1 */

	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$8 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L12
	ALIGN_4

/* Remainder: (trip count mod 4) single k steps. */
.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$2 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L16
	ALIGN_4

/* Solve the 2x2 tile and write it back. */
.L18:
#if defined(LN) || defined(RT)
	/*
	 * Re-point AO / BO at the 2x2 block KK - 2 steps into the panels.
	 * Both tile dimensions are 2 here, so LN and RT use the same offset
	 * (the original carried two identical #ifdef arms).
	 */
	movl	KK, %eax
	subl	$2, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/*
	 * Replace each accumulator by (stored value - accumulator).
	 * Left variants read the four values from B in stack order; right
	 * variants read them from A, where elements -7 and -6 map to st(2)
	 * and st(1) respectively.
	 * Afterwards: st(0)=x00 st(1)=x01 st(2)=x10 st(3)=x11, with the first
	 * index the A row and the second the B column.
	 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/*
	 * 2x2 triangular substitution.  The diagonal entries are applied by
	 * multiplication - presumably the packed panels hold reciprocals of
	 * the diagonal; verify against the packing routine.
	 */
#ifdef LN
	/* row 1 first, then eliminate it from row 0 */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)		/* x10 *= A[-5] */
	fmulp	%st, %st(4)		/* x11 *= A[-5] */

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st		/* A[-6] * x10 */
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st		/* A[-6] * x11 */

	fsubrp	%st, %st(3)		/* x01 -= A[-6] * x11 */
	fsubrp	%st, %st(1)		/* x00 -= A[-6] * x10 */

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)		/* x00 *= A[-8] */
	fmulp	%st, %st(2)		/* x01 *= A[-8] */
#endif

#ifdef LT
	/* row 0 first, then eliminate it from row 1 */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)		/* x00 *= A[-8] */
	fmulp	%st, %st(2)		/* x01 *= A[-8] */

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st		/* A[-7] * x00 */
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st		/* A[-7] * x01 */

	fsubrp	%st, %st(5)		/* x11 -= A[-7] * x01 */
	fsubrp	%st, %st(3)		/* x10 -= A[-7] * x00 */

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)		/* x10 *= A[-5] */
	fmulp	%st, %st(4)		/* x11 *= A[-5] */
#endif

#ifdef RN
	/* column 0 first, then eliminate it from column 1 */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)		/* x00 *= B[-8] */
	fmulp	%st, %st(3)		/* x10 *= B[-8] */

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st		/* B[-7] * x00 */
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st		/* B[-7] * x10 */

	fsubrp	%st, %st(5)		/* x11 -= B[-7] * x10 */
	fsubrp	%st, %st(2)		/* x01 -= B[-7] * x00 */

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)		/* x01 *= B[-5] */
	fmulp	%st, %st(4)		/* x11 *= B[-5] */
#endif

#ifdef RT
	/* column 1 first, then eliminate it from column 0 */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)		/* x01 *= B[-5] */
	fmulp	%st, %st(4)		/* x11 *= B[-5] */

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st		/* B[-6] * x01 */
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st		/* B[-6] * x11 */

	fsubrp	%st, %st(4)		/* x10 -= B[-6] * x11 */
	fsubrp	%st, %st(1)		/* x00 -= B[-6] * x01 */

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)		/* x00 *= B[-8] */
	fmulp	%st, %st(3)		/* x10 *= B[-8] */
#endif

#ifdef LN
	subl	$2 * SIZE, CO		/* LN writes C moving backwards */
#endif

	/*
	 * Write each result twice: a duplicate (fld %st) goes back into the
	 * packed panel, and the fxch rotation leaves the originals queued for
	 * the four stores to C that follow.
	 */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)		/* x00 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)		/* x01 */
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)		/* x10 */
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)		/* x11 */

	FST	1 * SIZE(CO, LDC)	/* x11 */
	FST	0 * SIZE(CO)		/* x00 */
	FST	0 * SIZE(CO, LDC)	/* x01 */
	FST	1 * SIZE(CO)		/* x10 */
#else
	fld	%st
	FST	-8 * SIZE(AO)		/* x00 */
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)		/* x10 */
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)		/* x01 */
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)		/* x11 */

	FST	1 * SIZE(CO, LDC)	/* x11 */
	FST	1 * SIZE(CO)		/* x10 */
	FST	0 * SIZE(CO)		/* x00 */
	FST	0 * SIZE(CO, LDC)	/* x01 */
#endif

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK steps that were not consumed by this tile */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	/* advance AORIG by one 2 x K panel */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L11
	ALIGN_4

/*
 * .L20 / .L21 - leftover single row (M odd) against the current column
 * pair: one A value and two B values per k step.
 *
 * Accumulators between k steps: st(0) = sum a*b0, st(1) = sum a*b1.
 *
 * Per the GNU as manual ("AT&T Syntax bugs"), "fsubp %st, %st(i)" leaves
 * st - st(i) in st(i) and "fsubrp %st, %st(i)" leaves st(i) - st.
 * FST is expected to store and pop - TODO confirm against common.h.
 * (Line-number residue fused onto each line was removed.)
 */
.L20:
	movl	 M, %eax
	andl	$1, %eax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* step AORIG back by one 1 x K panel */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* start KK steps in: AO = AORIG + KK, BO = B + 2*KK (elements) */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* two zeroed accumulators */
	fldz
	fldz

	/* k trip count: KK for LT/RN, K - KK for LN/RT; 4 steps per pass */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L25
	ALIGN_4

/* Main loop: four k steps; A advances by 1 and B by 2 elements per step. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)		/* a */

	FLD	 -8 * SIZE(BO)		/* b0 */
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)		/* b1; the multiply consumes a */
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$4 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L22
	ALIGN_4

/* Remainder: (trip count mod 4) single k steps. */
.L25:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$1 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L26
	ALIGN_4

/* Solve the 1x2 tile and write it back. */
.L28:
#if defined(LN) || defined(RT)
	/* block offset: KK - 1 for LN (one row), KK - 2 for RT (two columns) */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#endif

	/* accumulator := stored value - accumulator; then st(0)=x0, st(1)=x1 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/*
	 * Diagonal entries are applied by multiplication - presumably stored
	 * as reciprocals; verify against the packing routine.
	 */
#if defined(LN) || defined(LT)
	/* single A row: scale both results by A[-8] */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)		/* x0 *= B[-8] */

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st		/* B[-7] * x0 */

	fsubrp	%st, %st(2)		/* x1 -= B[-7] * x0 */

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)		/* x1 *= B[-5] */
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)		/* x1 *= B[-5] */

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st		/* B[-6] * x1 */

	fsubrp	%st, %st(1)		/* x0 -= B[-6] * x1 */

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)		/* x0 *= B[-8] */
#endif

#ifdef LN
	subl	$1 * SIZE, CO		/* LN writes C moving backwards */
#endif

	/* duplicates go back to the packed panel; originals are stored to C */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)		/* x0 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)		/* x1 */
#else
	fld	%st
	FST	-8 * SIZE(AO)		/* x0 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)		/* x1 */
#endif

	FST	0 * SIZE(CO, LDC)	/* x1 */
	FST	0 * SIZE(CO)		/* x0 */

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK steps that were not consumed by this tile */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	/* advance AORIG by one 1 x K panel */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/*
 * .L29 - tail of the column-pair loop: move B to the next panel, step KK
 * for the right-side variants and loop while J is non-zero.
 * (Line-number residue fused onto each line was removed.)
 */
.L29:
#ifdef LN
	/* B += 2*K elements */
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B		/* BO was left just past this panel by the tile code */
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J
	jne	.L01
	ALIGN_4

/*
 * .L30 - setup for the final single column when N is odd; otherwise exit.
 * Mirrors .L01 with one-column strides (K elements of B, LDC bytes of C).
 * (Line-number residue fused onto each line was removed.)
 */
.L30:
	movl	N,  %eax
	testl	$1, %eax
	je	.L999

#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* step B back by one K x 1 panel */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C		/* RT: move C back before use */
#endif
	movl	C, CO
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* I = M / 2 row pairs; with none, only the odd row is handled. */
	movl	M,  I
	sarl	$1, I
	je	.L40
	ALIGN_4

/*
 * .L31 - one 2x1 tile in the final column: two A values and one B value
 * per k step.  Runs I = M / 2 times.
 *
 * Accumulators between k steps: st(0) = sum a0*b, st(1) = sum a1*b.
 *
 * Per the GNU as manual ("AT&T Syntax bugs"), "fsubp %st, %st(i)" leaves
 * st - st(i) in st(i) and "fsubrp %st, %st(i)" leaves st(i) - st.
 * FST is expected to store and pop - TODO confirm against common.h.
 * (Line-number residue fused onto each line was removed.)
 */
.L31:
#ifdef LN
	/* step AORIG back by one 2 x K panel */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* start KK steps in: AO = AORIG + 2*KK, BO = B + KK (elements) */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	/* two zeroed accumulators */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* k trip count: KK for LT/RN, K - KK for LN/RT; 4 steps per pass */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L35
	ALIGN_4

/* Main loop: four k steps; A advances by 2 and B by 1 element per step. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)		/* b */
	FLD	 -8 * SIZE(AO)		/* a0 */
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)		/* a1; the multiply consumes b */
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$8 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L32
	ALIGN_4

/* Remainder: (trip count mod 4) single k steps. */
.L35:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$2 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L36
	ALIGN_4

/* Solve the 2x1 tile and write it back. */
.L38:
#if defined(LN) || defined(RT)
	/* block offset: KK - 2 for LN (two rows), KK - 1 for RT (one column) */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#endif

	/* accumulator := stored value - accumulator; then st(0)=x0, st(1)=x1 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/*
	 * Diagonal entries are applied by multiplication - presumably stored
	 * as reciprocals; verify against the packing routine.
	 */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)		/* x1 *= A[-5] */

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st		/* A[-6] * x1 */

	fsubrp	%st, %st(1)		/* x0 -= A[-6] * x1 */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)		/* x0 *= A[-8] */
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)		/* x0 *= A[-8] */

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st		/* A[-7] * x0 */

	fsubrp	%st, %st(2)		/* x1 -= A[-7] * x0 */

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)		/* x1 *= A[-5] */
#endif

#ifdef RN
	/* single B column: scale both results by B[-8] */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* single B column: scale both results by B[-8] */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subl	$2 * SIZE, CO		/* LN writes C moving backwards */
#endif

	/* duplicates go back to the packed panel; originals are stored to C */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)		/* x0 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)		/* x1 */
#else
	fld	%st
	FST	-8 * SIZE(AO)		/* x0 */
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)		/* x1 */
#endif

	FST	1 * SIZE(CO)		/* x1 */
	FST	0 * SIZE(CO)		/* x0 */

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK steps that were not consumed by this tile */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	/* advance AORIG by one 2 x K panel */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L31
	ALIGN_4

/*
 * .L40 / .L41 - final 1x1 tile (M odd and N odd): a plain dot product of
 * one A value and one B value per k step, accumulated in st(0).
 *
 * Per the GNU as manual ("AT&T Syntax bugs"), "fsubp %st, %st(1)" leaves
 * st - st(1) in st(1).
 * FST is expected to store and pop - TODO confirm against common.h.
 * (Line-number residue fused onto each line was removed.)
 */
.L40:
	movl	 M, %eax
	andl	$1, %eax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	/* step AORIG back by one 1 x K panel */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* start KK steps in: AO = AORIG + KK, BO = B + KK (elements) */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	/* one zeroed accumulator */
	fldz

	/* k trip count: KK for LT/RN, K - KK for LN/RT; 4 steps per pass */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L45
	ALIGN_4

/* Main loop: four k steps; A and B each advance by 1 element per step. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$4 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L42
	ALIGN_4

/* Remainder: (trip count mod 4) single k steps. */
.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$1 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L46
	ALIGN_4

/* Solve the 1x1 tile and write it back. */
.L48:
#if defined(LN) || defined(RT)
	/*
	 * Re-point AO / BO at the element KK - 1 steps into the panels.
	 * Both tile dimensions are 1 here, so LN and RT use the same offset
	 * (the original carried two identical #ifdef arms).
	 */
	movl	KK, %eax
	subl	$1, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#endif

	/* accumulator := stored value - accumulator */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/*
	 * Scale by the single diagonal entry (multiplication - presumably a
	 * stored reciprocal; verify against the packing routine).
	 */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO		/* LN writes C moving backwards */
#endif

	/* duplicate goes back to the packed panel; original is stored to C */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK steps that were not consumed by this tile */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	/* advance AORIG by one 1 x K panel */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/*
 * .L49 - bookkeeping after the single-column pass (B and KK are updated
 * the same way as in .L29, with one-column strides), then
 * .L999 - common exit: restore the callee-saved registers in reverse push
 * order, release the ARGS bytes of scratch and return.  %eax is not set
 * to any particular value before the ret.
 * (Line-number residue fused onto each line was removed.)
 */
.L49:
#ifdef LN
	/* B += K elements */
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE
