1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
/* Mark this unit as assembly for common.h (the header itself is not visible here). */
39#define ASSEMBLER
40#include "common.h"
41
/* Prefetch mnemonics: prefetch/prefetchw when OPTERON is defined, prefetcht0 for
   both otherwise. PREFETCHW is not referenced in the visible code. */
42#ifdef OPTERON
43#define PREFETCH	prefetch
44#define PREFETCHW	prefetchw
45#else
46#define PREFETCH	prefetcht0
47#define PREFETCHW	prefetcht0
48#endif
49
/* Look-ahead (in elements past AO) used by the PREFETCH lines in the unrolled loops. */
50#define PREFETCHSIZE (5 + 4 * 10)
/* STACK: bytes taken by the four pushl in the prologue.
   ARGS:  bytes subtracted from %esp in the prologue for the local slots below. */
51#define STACK	16
52#define ARGS	16
53
/* Local slots located just above the four saved registers.
   KKK is not referenced in the visible code. */
54#define J	 0 + STACK(%esp)
55#define KK	 4 + STACK(%esp)
56#define KKK	 8 + STACK(%esp)
57#define AORIG	12 + STACK(%esp)
58
/* Incoming stack arguments, addressed above the saved registers and the local area.
   The leading 4 presumably skips the return address -- confirm against the caller.
   ALPHA is not referenced in the visible code. */
59#define M	 4 + STACK + ARGS(%esp)
60#define N	 8 + STACK + ARGS(%esp)
61#define K	12 + STACK + ARGS(%esp)
62#define ALPHA	16 + STACK + ARGS(%esp)
63#define A	32 + STACK + ARGS(%esp)
64#define ARG_B	36 + STACK + ARGS(%esp)
65#define C	40 + STACK + ARGS(%esp)
66#define ARG_LDC	44 + STACK + ARGS(%esp)
67#define OFFSET	48 + STACK + ARGS(%esp)
68
/* Register roles: I = row-pair counter, B = packed B base, CO = current C pointer,
   AO/BO = running A/B pointers, LDC = column stride of C (scaled in the prologue). */
69#define I	%esi
70#define B	%ebx
71#define CO	%edi
72#define AO	%edx
73#define BO	%ecx
74#define LDC	%ebp
75
/* Not referenced in the visible code. */
76#define PREFETCH_OFFSET 48
77
/* Kernel entry: reserve ARGS bytes for the J/KK/KKK/AORIG slots, save
   ebp/edi/esi/ebx, load LDC and B into registers, then apply the per-variant
   (LN / RT / RN) start-up adjustments before entering the column-pair loop. */
78	PROLOGUE
79
80	subl	$ARGS, %esp	# Generate Stack Frame
81
82	pushl	%ebp
83	pushl	%edi
84	pushl	%esi
85	pushl	%ebx
86
87	PROFCODE
88
89	movl	ARG_LDC, LDC
90	movl	ARG_B,   B
/* LDC <<= BASE_SHIFT (BASE_SHIFT comes from common.h; presumably log2 of the
   element size, turning LDC into a byte stride -- confirm). */
91	sall	$BASE_SHIFT, LDC
92
/* Bias A (in its stack slot) and B by 8 elements; every later load and store
   through AO/BO uses displacements starting at -8 * SIZE. */
93	addl	$8 * SIZE, A
94	addl	$8 * SIZE, B
95
96
/* LN: C += M elements, A += M * K elements. */
97#ifdef LN
98       movl	M, %eax
99       sall	$BASE_SHIFT, %eax
100       addl	%eax, C
101       imull	K, %eax
102       addl	%eax, A
103#endif
104
/* RT: B += N * K elements, C += N * LDC (%ebp is the LDC register). */
105#ifdef RT
106       movl	N, %eax
107       sall	$BASE_SHIFT, %eax
108       imull	K, %eax
109       addl	%eax, B
110
111       movl	N,    %eax
112       imull	%ebp, %eax
113       addl	%eax, C
114#endif
115
/* RN: KK = -OFFSET. */
116#ifdef RN
117       movl	OFFSET, %eax
118       negl	%eax
119       movl	%eax, KK
120#endif
121
/* RT: KK = N - OFFSET. */
122#ifdef RT
123       movl	N, %eax
124       subl	OFFSET, %eax
125       movl	%eax, KK
126#endif
127
/* J = N >> 1 (number of column pairs). The je tests the flags left by sarl,
   since the intervening movl does not modify flags; zero pairs -> .L30. */
128	movl	N,   %eax
129	sarl	$1,  %eax
130	movl	%eax, J
131	je	.L30
132	ALIGN_4
133
/* Head of one column-pair iteration (two columns of C), repeated J times. */
134.L01:
/* LT/RN walk A forward from its base via AO; LN/RT keep a movable base in AORIG. */
135#if defined(LT) || defined(RN)
136	movl	A, AO
137#else
138	movl	A, %eax
139	movl	%eax, AORIG
140#endif
141
/* RT: step B back by 2 * K elements. */
142#ifdef RT
143	movl	K, %eax
144	sall	$1 + BASE_SHIFT, %eax
145	subl	%eax, B
146#endif
147
/* %eax = 2 * LDC, the stride of two columns of C. */
148	lea	(, LDC, 2), %eax
149
/* RT moves C back before taking CO; all other variants take CO first and then
   advance C past the two columns. */
150#ifdef RT
151	subl	%eax, C
152#endif
153	movl	C, CO
154#ifndef RT
155	addl	%eax, C
156#endif
157
/* LN: KK = OFFSET + M. */
158#ifdef LN
159	movl	OFFSET, %eax
160	addl	M, %eax
161	movl	%eax, KK
162#endif
163
/* LT: KK = OFFSET. */
164#ifdef LT
165	movl	OFFSET, %eax
166	movl	%eax, KK
167#endif
168
/* If M is even there is no single leftover row: skip the 1x2 tile. */
169	movl	 M, %eax
170	andl	$1, %eax
171	je	.L20
172	ALIGN_4
173
/* 1x2 tile (one row, two columns): set up AO/BO and accumulate the dot
   products. Comments assume FLD (macro from common.h) pushes one value on the
   x87 stack. After the loops st(0) = sum A[k]*B[k][0], st(1) = sum A[k]*B[k][1]. */
174.L21:
/* LN: AORIG -= K elements (one row of the packed A panel). */
175#ifdef LN
176       movl	K, %eax
177       sall	$0 + BASE_SHIFT, %eax
178       subl	%eax, AORIG
179#endif
180
/* LN/RT: AO = AORIG + KK elements, BO = B + 2 * KK elements; otherwise BO = B. */
181#if defined(LN) || defined(RT)
182	movl	KK, %eax
183	sall	$BASE_SHIFT, %eax
184	movl	AORIG, AO
185	leal	(AO, %eax, 1), AO
186	leal	(B,  %eax, 2), BO
187#else
188	movl	B, BO
189#endif
190
/* Two zeroed accumulators. */
191	fldz
192	fldz
193
/* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop is unrolled by 4. */
194#if defined(LT) || defined(RN)
195	movl	KK, %eax
196#else
197	movl	K,  %eax
198	subl	KK, %eax
199#endif
200	sarl	$2, %eax
201 	je	.L25
202	ALIGN_4
203
/* Unrolled x4: each step loads one A value, multiplies it by two B values and
   adds the products into the two accumulators (which sit at st(2) while the
   A value and a product are on top). */
204.L22:
205	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
206
207	FLD	 -8 * SIZE(AO)
208
209	FLD	 -8 * SIZE(BO)
210	fmul	 %st(1), %st
211	faddp	 %st, %st(2)
212
213	FLD	 -7 * SIZE(BO)
214	fmulp	 %st, %st(1)
215	faddp	 %st, %st(2)
216
217	FLD	 -7 * SIZE(AO)
218
219	FLD	 -6 * SIZE(BO)
220	fmul	 %st(1), %st
221	faddp	 %st, %st(2)
222
223	FLD	 -5 * SIZE(BO)
224	fmulp	 %st, %st(1)
225	faddp	 %st, %st(2)
226
227	FLD	 -6 * SIZE(AO)
228
229	FLD	 -4 * SIZE(BO)
230	fmul	 %st(1), %st
231	faddp	 %st, %st(2)
232
233	FLD	 -3 * SIZE(BO)
234	fmulp	 %st, %st(1)
235	faddp	 %st, %st(2)
236
237	FLD	 -5 * SIZE(AO)
238
239	FLD	 -2 * SIZE(BO)
240	fmul	 %st(1), %st
241	faddp	 %st, %st(2)
242
243	FLD	 -1 * SIZE(BO)
244	fmulp	 %st, %st(1)
245	faddp	 %st, %st(2)
246
/* Four k-steps consumed: 4 A elements and 8 B elements. */
247	addl	$4 * SIZE,AO
248	addl	$8 * SIZE,BO
249
250	decl	%eax
251	jne	.L22
252	ALIGN_4
253
/* Remainder: same trip count modulo 4, one k-step per iteration. */
254.L25:
255#if defined(LT) || defined(RN)
256	movl	KK, %eax
257#else
258	movl	K,  %eax
259	subl	KK, %eax
260#endif
261	and	$3,  %eax
262	je	.L28
263	ALIGN_4
264
265.L26:
266	FLD	 -8 * SIZE(AO)
267
268	FLD	 -8 * SIZE(BO)
269	fmul	 %st(1), %st
270	faddp	 %st, %st(2)
271
272	FLD	 -7 * SIZE(BO)
273	fmulp	 %st, %st(1)
274	faddp	 %st, %st(2)
275
276	addl	$1 * SIZE,AO
277	addl	$2 * SIZE,BO
278
279	decl	%eax
280	jne	 .L26
281	ALIGN_4
282
/* 1x2 tile: subtract the accumulated products from the packed right-hand side,
   apply the triangular solve for the active variant, write the two results
   back to the packed buffer and to C, then advance the pointers/counters.
   On entry the x87 stack holds exactly two accumulators:
     st(0) = column 0 sum, st(1) = column 1 sum.
   Comments assume FLD pushes and FST stores-and-pops (macros from common.h). */
283.L28:
/* LN/RT: re-point AO/BO at the tile being solved: (KK - 1) for LN, (KK - 2)
   for RT, scaled by 1 element for A and 2 elements for B. */
284#if defined(LN) || defined(RT)
285	movl	KK, %eax
286#ifdef LN
287	subl	$1, %eax
288#else
289	subl	$2, %eax
290#endif
291
292	sall	$BASE_SHIFT, %eax
293
294	movl	AORIG, AO
295	leal	(AO, %eax, 1), AO
296	leal	(B,  %eax, 2), BO
297#endif
298
/* Combine each accumulator with its packed value. After every FLD the two
   accumulators sit at st(1) and st(2); nothing deeper belongs to this kernel,
   so the second fsubp must target st(2) in both branches (the RN/RT branch
   previously named st(3), which is not a live accumulator here). */
299#if defined(LN) || defined(LT)
300	FLD	-8 * SIZE(BO)
301	fsubp	%st, %st(1)
302	FLD	-7 * SIZE(BO)
303	fsubp	%st, %st(2)
304#else
305	FLD	-8 * SIZE(AO)
306	fsubp	%st, %st(1)
307	FLD	-7 * SIZE(AO)
308	fsubp	%st, %st(2)
309#endif
310
/* LN/LT: the A tile is 1x1, so scale both values by AO[-8]. */
311#if defined(LN) || defined(LT)
312       FLD	-8 * SIZE(AO)
313       fmul	%st, %st(1)
314       fmulp	%st, %st(2)
315#endif
316
/* RN: x0 *= BO[-8]; x1 -= BO[-7] * x0; x1 *= BO[-5]. */
317#ifdef RN
318       FLD	-8 * SIZE(BO)
319       fmulp	%st, %st(1)
320
321       FLD	-7 * SIZE(BO)
322       fmul	%st(1), %st
323
324       fsubrp	%st, %st(2)
325
326       FLD	-5 * SIZE(BO)
327       fmulp	%st, %st(2)
328#endif
329
/* RT: x1 *= BO[-5]; x0 -= BO[-6] * x1; x0 *= BO[-8]. */
330#ifdef RT
331       FLD	-5 * SIZE(BO)
332       fmulp	%st, %st(2)
333
334       FLD	-6 * SIZE(BO)
335       fmul	%st(2), %st
336
337       fsubrp	%st, %st(1)
338
339       FLD	-8 * SIZE(BO)
340       fmulp	%st, %st(1)
341#endif
342
/* LN walks C backwards: step CO back one element before storing. */
343#ifdef LN
344	subl	$1 * SIZE, CO
345#endif
346
/* Write both results back to the packed buffer (duplicate, store-and-pop).
   Afterwards the stack is st(0) = x1, st(1) = x0. */
347#if defined(LN) || defined(LT)
348	fld	%st
349	FST	-8 * SIZE(BO)
350	fxch	%st(1)
351	fld	%st
352	FST	-7 * SIZE(BO)
353#else
354	fld	%st
355	FST	-8 * SIZE(AO)
356	fxch	%st(1)
357	fld	%st
358	FST	-7 * SIZE(AO)
359#endif
360
/* x1 -> second column (CO + LDC), x0 -> first column; the x87 stack is empty after this. */
361	FST	0 * SIZE(CO, LDC)
362	FST	0 * SIZE(CO)
363
364#ifndef LN
365	addl	$1 * SIZE, CO
366#endif
367
/* LT/RN: skip the remaining K - KK steps of the packed panels. */
368#if defined(LT) || defined(RN)
369	movl	K,  %eax
370	subl	KK, %eax
371	sall	$BASE_SHIFT, %eax
372	leal	(AO, %eax, 1), AO
373	leal	(BO, %eax, 2), BO
374#endif
375
376#ifdef LN
377	subl	$1, KK
378#endif
379
380#ifdef LT
381	addl	$1, KK
382#endif
383
/* RT: AORIG += K elements (one row of packed A). */
384#ifdef RT
385       movl	K, %eax
386       sall	$0 + BASE_SHIFT, %eax
387       addl	%eax, AORIG
388#endif
389	ALIGN_4
390
/* 2x2 tiles: loop over I = M >> 1 row pairs, set up AO/BO and accumulate.
   Comments assume FLD (macro from common.h) pushes one value. After the loops
   the x87 stack holds, top first:
     st(0) = sum a0*b0, st(1) = sum a0*b1, st(2) = sum a1*b0, st(3) = sum a1*b1
   where a0/a1 are the two A values and b0/b1 the two B values of a k-step. */
391.L20:
392	movl	M,  I
393	sarl	$1, I
394	je	.L29
395	ALIGN_4
396
397.L11:
/* LN: AORIG -= 2 * K elements (two rows of the packed A panel). */
398#ifdef LN
399       movl	K, %eax
400       sall	$1 + BASE_SHIFT, %eax
401       subl	%eax, AORIG
402#endif
403
/* LN/RT: AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements; otherwise BO = B. */
404#if defined(LN) || defined(RT)
405	movl	KK, %eax
406	sall	$BASE_SHIFT, %eax
407	movl	AORIG, AO
408	leal	(AO, %eax, 2), AO
409	leal	(B,  %eax, 2), BO
410#else
411	movl	B, BO
412#endif
413
/* Four zeroed accumulators. */
414	fldz
415	fldz
416	fldz
417	fldz
418
/* Prefetch the two destination columns of C when the build advertises 3DNow! or SSE. */
419#if   defined(HAVE_3DNOW)
420	prefetchw	2 * SIZE(CO)
421 	prefetchw	2 * SIZE(CO, LDC, 1)
422#elif defined(HAVE_SSE)
423	prefetchnta	2 * SIZE(CO)
424 	prefetchnta	2 * SIZE(CO, LDC, 1)
425#endif
426
/* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop is unrolled by 4. */
427#if defined(LT) || defined(RN)
428	movl	KK, %eax
429#else
430	movl	K,  %eax
431	subl	KK, %eax
432#endif
433	sarl	$2, %eax
434 	je	.L15
435	ALIGN_4
436
/* Unrolled x4. Each k-step forms the four products a0*b0, a0*b1, a1*b0, a1*b1
   and adds them into the four accumulators. At the deepest point (after the
   second A load) eight x87 registers are in use, i.e. the whole register stack. */
437.L12:
438	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
439
440	FLD	 -8 * SIZE(AO)
441
442	FLD	 -8 * SIZE(BO)
443	fld	 %st(1)
444	fmul	 %st(1), %st
445	faddp	 %st, %st(3)
446
447	FLD	 -7 * SIZE(BO)
448	fmul	 %st, %st(2)
449
450	FLD	 -7 * SIZE(AO)
451	fmul	 %st, %st(2)
452	fmulp	 %st, %st(1)
453
454	faddp	 %st, %st(6)
455	faddp	 %st, %st(4)
456	faddp	 %st, %st(2)
457
458	FLD	 -6 * SIZE(AO)
459
460	FLD	 -6 * SIZE(BO)
461	fld	 %st(1)
462	fmul	 %st(1), %st
463	faddp	 %st, %st(3)
464
465	FLD	 -5 * SIZE(BO)
466	fmul	 %st, %st(2)
467
468	FLD	 -5 * SIZE(AO)
469	fmul	 %st, %st(2)
470	fmulp	 %st, %st(1)
471
472	faddp	 %st, %st(6)
473	faddp	 %st, %st(4)
474	faddp	 %st, %st(2)
475
476	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
477
478	FLD	 -4 * SIZE(AO)
479
480	FLD	 -4 * SIZE(BO)
481	fld	 %st(1)
482	fmul	 %st(1), %st
483	faddp	 %st, %st(3)
484
485	FLD	 -3 * SIZE(BO)
486	fmul	 %st, %st(2)
487
488	FLD	 -3 * SIZE(AO)
489	fmul	 %st, %st(2)
490	fmulp	 %st, %st(1)
491
492	faddp	 %st, %st(6)
493	faddp	 %st, %st(4)
494	faddp	 %st, %st(2)
495
496	FLD	 -2 * SIZE(AO)
497
498	FLD	 -2 * SIZE(BO)
499	fld	 %st(1)
500	fmul	 %st(1), %st
501	faddp	 %st, %st(3)
502
503	FLD	 -1 * SIZE(BO)
504	fmul	 %st, %st(2)
505
506	FLD	 -1 * SIZE(AO)
507	fmul	 %st, %st(2)
508	fmulp	 %st, %st(1)
509
510	faddp	 %st, %st(6)
511	faddp	 %st, %st(4)
512	faddp	 %st, %st(2)
513
/* Four k-steps consumed: 8 A elements and 8 B elements. */
514	addl	$8 * SIZE,AO
515	addl	$8 * SIZE,BO
516
517	decl	%eax
518	jne	.L12
519	ALIGN_4
520
/* Remainder: same trip count modulo 4, one k-step per iteration. */
521.L15:
522#if defined(LT) || defined(RN)
523	movl	KK, %eax
524#else
525	movl	K,  %eax
526	subl	KK, %eax
527#endif
528	and	$3,  %eax
529	je	.L18
530	ALIGN_4
531
532.L16:
533	FLD	 -8 * SIZE(AO)
534
535	FLD	 -8 * SIZE(BO)
536	fld	 %st(1)
537	fmul	 %st(1), %st
538	faddp	 %st, %st(3)
539
540	FLD	 -7 * SIZE(BO)
541	fmul	 %st, %st(2)
542
543	FLD	 -7 * SIZE(AO)
544	fmul	 %st, %st(2)
545	fmulp	 %st, %st(1)
546
547	faddp	 %st, %st(6)
548	faddp	 %st, %st(4)
549	faddp	 %st, %st(2)
550
551	addl	$2 * SIZE,AO
552	addl	$2 * SIZE,BO
553
554	decl	%eax
555	jne	 .L16
556	ALIGN_4
557
/* 2x2 tile: subtract, solve, store, advance; then loop back to .L11.
   On entry the x87 stack is (top first)
     st(0) = x00, st(1) = x01, st(2) = x10, st(3) = x11
   where xRC is the accumulator for row R, column C of the tile.
   Comments assume FLD pushes and FST stores-and-pops (macros from common.h). */
558.L18:
/* LN/RT: re-point AO/BO at the tile being solved, (KK - 2) in both variants,
   scaled by 2 elements for each of A and B. */
559#if defined(LN) || defined(RT)
560	movl	KK, %eax
561#ifdef LN
562	subl	$2, %eax
563#else
564	subl	$2, %eax
565#endif
566
567	sall	$BASE_SHIFT, %eax
568
569	movl	AORIG, AO
570	leal	(AO, %eax, 2), AO
571	leal	(B,  %eax, 2), BO
572#endif
573
/* Combine each accumulator with its packed value. LN/LT read BO in the order
   x00, x01, x10, x11. RN/RT read AO in the order x00, x10, x01, x11, which is
   why the st(3)/st(2) targets are swapped relative to the first branch. */
574#if defined(LN) || defined(LT)
575	FLD	-8 * SIZE(BO)
576	fsubp	%st, %st(1)
577	FLD	-7 * SIZE(BO)
578	fsubp	%st, %st(2)
579	FLD	-6 * SIZE(BO)
580	fsubp	%st, %st(3)
581	FLD	-5 * SIZE(BO)
582	fsubp	%st, %st(4)
583#else
584	FLD	-8 * SIZE(AO)
585	fsubp	%st, %st(1)
586	FLD	-7 * SIZE(AO)
587	fsubp	%st, %st(3)
588	FLD	-6 * SIZE(AO)
589	fsubp	%st, %st(2)
590	FLD	-5 * SIZE(AO)
591	fsubp	%st, %st(4)
592#endif
593
/* LN: row 1 *= AO[-5]; row 0 -= AO[-6] * row 1; row 0 *= AO[-8]. */
594#ifdef LN
595       FLD	-5 * SIZE(AO)
596       fmul	%st, %st(3)
597       fmulp	%st, %st(4)
598
599       FLD	-6 * SIZE(AO)
600       fmul	%st(3), %st
601       FLD	-6 * SIZE(AO)
602       fmul	%st(5), %st
603
604       fsubrp	%st, %st(3)
605       fsubrp	%st, %st(1)
606
607       FLD	-8 * SIZE(AO)
608       fmul	%st, %st(1)
609       fmulp	%st, %st(2)
610#endif
611
/* LT: row 0 *= AO[-8]; row 1 -= AO[-7] * row 0; row 1 *= AO[-5]. */
612#ifdef LT
613       FLD	-8 * SIZE(AO)
614       fmul	%st, %st(1)
615       fmulp	%st, %st(2)
616
617       FLD	-7 * SIZE(AO)
618       fmul	%st(1), %st
619       FLD	-7 * SIZE(AO)
620       fmul	%st(3), %st
621
622       fsubrp	%st, %st(5)
623       fsubrp	%st, %st(3)
624
625       FLD	-5 * SIZE(AO)
626       fmul	%st, %st(3)
627       fmulp	%st, %st(4)
628#endif
629
/* RN: column 0 *= BO[-8]; column 1 -= BO[-7] * column 0; column 1 *= BO[-5]. */
630#ifdef RN
631       FLD	-8 * SIZE(BO)
632       fmul	%st, %st(1)
633       fmulp	%st, %st(3)
634
635       FLD	-7 * SIZE(BO)
636       fmul	%st(1), %st
637       FLD	-7 * SIZE(BO)
638       fmul	%st(4), %st
639
640       fsubrp	%st, %st(5)
641       fsubrp	%st, %st(2)
642
643       FLD	-5 * SIZE(BO)
644       fmul	%st, %st(2)
645       fmulp	%st, %st(4)
646#endif
647
/* RT: column 1 *= BO[-5]; column 0 -= BO[-6] * column 1; column 0 *= BO[-8]. */
648#ifdef RT
649       FLD	-5 * SIZE(BO)
650       fmul	%st, %st(2)
651       fmulp	%st, %st(4)
652
653       FLD	-6 * SIZE(BO)
654       fmul	%st(2), %st
655       FLD	-6 * SIZE(BO)
656       fmul	%st(5), %st
657
658       fsubrp	%st, %st(4)
659       fsubrp	%st, %st(1)
660
661       FLD	-8 * SIZE(BO)
662       fmul	%st, %st(1)
663       fmulp	%st, %st(3)
664#endif
665
/* LN walks C backwards: step CO back two elements before storing. */
666#ifdef LN
667	subl	$2 * SIZE, CO
668#endif
669
/* Write the four results back to the packed buffer (duplicate + store-and-pop,
   with fxch bringing the next value to the top), then pop them into C.
   The fxch sequence permutes the stack, so the order of the C stores differs
   between the two branches; the x87 stack is empty afterwards. */
670#if defined(LN) || defined(LT)
671	fld	%st
672	FST	-8 * SIZE(BO)
673	fxch	%st(1)
674	fld	%st
675	FST	-7 * SIZE(BO)
676	fxch	%st(2)
677	fld	%st
678	FST	-6 * SIZE(BO)
679	fxch	%st(3)
680	fld	%st
681	FST	-5 * SIZE(BO)
682
683	FST	1 * SIZE(CO, LDC)
684	FST	0 * SIZE(CO)
685	FST	0 * SIZE(CO, LDC)
686	FST	1 * SIZE(CO)
687#else
688	fld	%st
689	FST	-8 * SIZE(AO)
690	fxch	%st(2)
691	fld	%st
692	FST	-7 * SIZE(AO)
693	fxch	%st(1)
694	fld	%st
695	FST	-6 * SIZE(AO)
696	fxch	%st(3)
697	fld	%st
698	FST	-5 * SIZE(AO)
699
700	FST	1 * SIZE(CO, LDC)
701	FST	1 * SIZE(CO)
702	FST	0 * SIZE(CO)
703	FST	0 * SIZE(CO, LDC)
704#endif
705
706#ifndef LN
707	addl	$2 * SIZE, CO
708#endif
709
/* LT/RN: skip the remaining K - KK steps of both packed panels. */
710#if defined(LT) || defined(RN)
711	movl	K,  %eax
712	subl	KK, %eax
713	sall	$BASE_SHIFT, %eax
714	leal	(AO, %eax, 2), AO
715	leal	(BO, %eax, 2), BO
716#endif
717
718#ifdef LN
719	subl	$2, KK
720#endif
721
722#ifdef LT
723	addl	$2, KK
724#endif
725
/* RT: AORIG += 2 * K elements (two rows of packed A). */
726#ifdef RT
727       movl	K, %eax
728       sall	$1 + BASE_SHIFT, %eax
729       addl	%eax, AORIG
730#endif
731
/* Next row pair. */
732	decl	I
733	jne	.L11
734	ALIGN_4
735
/* End of one column-pair iteration: move B to the next panel, update KK for
   the right-side variants, and loop while J is non-zero. */
736.L29:
/* LN: B += 2 * K elements. */
737#ifdef LN
738       movl	K, %eax
739       sall	$BASE_SHIFT, %eax
740       leal	(B, %eax, 2), B
741#endif
742
/* LT/RN: BO was advanced past this panel by the tile code, so it is the new B. */
743#if defined(LT) || defined(RN)
744	movl	BO, B
745#endif
746
747#ifdef RN
748	addl	$2, KK
749#endif
750
751#ifdef RT
752	subl	$2, KK
753#endif
754
755	decl	J
756	jne	.L01
757	ALIGN_4
758
/* Leftover single column: runs only when N is odd, otherwise jumps straight
   to the epilogue. Mirrors the column-pair head, using a one-column stride. */
759.L30:
760	movl	N,  %eax
761	testl	$1, %eax
762	je	.L999
763
/* LT/RN walk A forward from its base via AO; LN/RT keep a movable base in AORIG. */
764#if defined(LT) || defined(RN)
765	movl	A, AO
766#else
767	movl	A, %eax
768	movl	%eax, AORIG
769#endif
770
/* RT: step B back by K elements (one column of the packed panel). */
771#ifdef RT
772	movl	K, %eax
773	sall	$0 + BASE_SHIFT, %eax
774	subl	%eax, B
775#endif
776
/* RT moves C back one column before taking CO; the other variants take CO and
   then advance C by one column. The advance uses LDC, matching the RT
   "subl LDC, C" above; %eax holds N or the A pointer at this point, not a
   column stride, so it must not be used here. */
777#ifdef RT
778	subl	LDC, C
779#endif
780	movl	C, CO
781#ifndef RT
782	addl	LDC, C
783#endif
784
/* LN: KK = OFFSET + M. */
785#ifdef LN
786	movl	OFFSET, %eax
787	addl	M, %eax
788	movl	%eax, KK
789#endif
790
/* LT: KK = OFFSET. */
791#ifdef LT
792	movl	OFFSET, %eax
793	movl	%eax, KK
794#endif
795
/* If M is even there is no single leftover row: skip the 1x1 tile. */
796	movl	 M, %eax
797	andl	$1, %eax
798	je	.L40
799	ALIGN_4
800
/* 1x1 tile: set up AO/BO and accumulate a single dot product in st(0).
   Comments assume FLD (macro from common.h) pushes one value. */
801.L41:
/* LN: AORIG -= K elements. */
802#ifdef LN
803       movl	K, %eax
804       sall	$0 + BASE_SHIFT, %eax
805       subl	%eax, AORIG
806#endif
807
/* LN/RT: AO = AORIG + KK elements, BO = B + KK elements; otherwise BO = B. */
808#if defined(LN) || defined(RT)
809	movl	KK, %eax
810	sall	$BASE_SHIFT, %eax
811	movl	AORIG, AO
812	leal	(AO, %eax, 1), AO
813	leal	(B,  %eax, 1), BO
814#else
815	movl	B, BO
816#endif
817
/* One zeroed accumulator. */
818	fldz
819
/* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop is unrolled by 4. */
820#if defined(LT) || defined(RN)
821	movl	KK, %eax
822#else
823	movl	K,  %eax
824	subl	KK, %eax
825#endif
826	sarl	$2, %eax
827 	je	.L45
828	ALIGN_4
829
/* Unrolled x4: load A and B, multiply, add into the accumulator. */
830.L42:
831	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
832
833	FLD	 -8 * SIZE(AO)
834	FLD	 -8 * SIZE(BO)
835	fmulp	 %st, %st(1)
836	faddp	 %st, %st(1)
837
838	FLD	 -7 * SIZE(AO)
839	FLD	 -7 * SIZE(BO)
840	fmulp	 %st, %st(1)
841	faddp	 %st, %st(1)
842
843	FLD	 -6 * SIZE(AO)
844	FLD	 -6 * SIZE(BO)
845	fmulp	 %st, %st(1)
846	faddp	 %st, %st(1)
847
848	FLD	 -5 * SIZE(AO)
849	FLD	 -5 * SIZE(BO)
850	fmulp	 %st, %st(1)
851	faddp	 %st, %st(1)
852
853	addl	$4 * SIZE,AO
854	addl	$4 * SIZE,BO
855
856	decl	%eax
857	jne	.L42
858	ALIGN_4
859
/* Remainder: same trip count modulo 4, one k-step per iteration. */
860.L45:
861#if defined(LT) || defined(RN)
862	movl	KK, %eax
863#else
864	movl	K,  %eax
865	subl	KK, %eax
866#endif
867	and	$3,  %eax
868	je	.L48
869	ALIGN_4
870
871.L46:
872	FLD	 -8 * SIZE(AO)
873
874	FLD	 -8 * SIZE(BO)
875	fmulp	 %st, %st(1)
876	faddp	 %st, %st(1)
877
878	addl	$1 * SIZE,AO
879	addl	$1 * SIZE,BO
880
881	decl	%eax
882	jne	 .L46
883	ALIGN_4
884
/* 1x1 tile: subtract, scale, store, advance. On entry st(0) is the single
   accumulator. Comments assume FLD pushes and FST stores-and-pops (macros
   from common.h). */
885.L48:
/* LN/RT: re-point AO/BO at element KK - 1 (both variants subtract 1). */
886#if defined(LN) || defined(RT)
887	movl	KK, %eax
888#ifdef LN
889	subl	$1, %eax
890#else
891	subl	$1, %eax
892#endif
893
894	sall	$BASE_SHIFT, %eax
895
896	movl	AORIG, AO
897	leal	(AO, %eax, 1), AO
898	leal	(B,  %eax, 1), BO
899#endif
900
/* Combine the accumulator with the packed value (from BO for LN/LT, AO for RN/RT). */
901#if defined(LN) || defined(LT)
902	FLD	-8 * SIZE(BO)
903	fsubp	%st, %st(1)
904#else
905	FLD	-8 * SIZE(AO)
906	fsubp	%st, %st(1)
907#endif
908
/* Every variant multiplies by one stored coefficient: AO[-8] for the left
   variants, BO[-8] for the right ones. */
909#ifdef LN
910       FLD	-8 * SIZE(AO)
911       fmulp	%st, %st(1)
912#endif
913
914#ifdef LT
915       FLD	-8 * SIZE(AO)
916       fmulp	%st, %st(1)
917#endif
918
919#ifdef RN
920       FLD	-8 * SIZE(BO)
921       fmulp	%st, %st(1)
922#endif
923
924#ifdef RT
925       FLD	-8 * SIZE(BO)
926       fmulp	%st, %st(1)
927#endif
928
/* LN walks C backwards: step CO back one element before storing. */
929#ifdef LN
930	subl	$1 * SIZE, CO
931#endif
932
/* Write the result back to the packed buffer (via a duplicate), then to C. */
933#if defined(LN) || defined(LT)
934	fld	%st
935	FST	-8 * SIZE(BO)
936#else
937	fld	%st
938	FST	-8 * SIZE(AO)
939#endif
940
941	FST	0 * SIZE(CO)
942
943#ifndef LN
944	addl	$1 * SIZE, CO
945#endif
946
/* LT/RN: skip the remaining K - KK steps of both packed panels. */
947#if defined(LT) || defined(RN)
948	movl	K,  %eax
949	subl	KK, %eax
950	sall	$BASE_SHIFT, %eax
951	leal	(AO, %eax, 1), AO
952	leal	(BO, %eax, 1), BO
953#endif
954
955#ifdef LN
956	subl	$1, KK
957#endif
958
959#ifdef LT
960	addl	$1, KK
961#endif
962
/* RT: AORIG += K elements. */
963#ifdef RT
964       movl	K, %eax
965       sall	$0 + BASE_SHIFT, %eax
966       addl	%eax, AORIG
967#endif
968	ALIGN_4
969
/* 2x1 tiles (two rows, one column): loop over I = M >> 1 row pairs, set up
   AO/BO and accumulate. Comments assume FLD (macro from common.h) pushes one
   value. After the loops st(0) = sum A[k][0]*B[k], st(1) = sum A[k][1]*B[k]. */
970.L40:
971	movl	M,  I
972	sarl	$1, I
973	je	.L49
974	ALIGN_4
975
976.L31:
/* LN: AORIG -= 2 * K elements. */
977#ifdef LN
978       movl	K, %eax
979       sall	$1 + BASE_SHIFT, %eax
980       subl	%eax, AORIG
981#endif
982
/* LN/RT: AO = AORIG + 2 * KK elements, BO = B + KK elements; otherwise BO = B. */
983#if defined(LN) || defined(RT)
984	movl	KK, %eax
985	sall	$BASE_SHIFT, %eax
986	movl	AORIG, AO
987	leal	(AO, %eax, 2), AO
988	leal	(B,  %eax, 1), BO
989#else
990	movl	B, BO
991#endif
992
/* Two zeroed accumulators. */
993	fldz
994	fldz
995
/* Prefetch the destination column of C when the build advertises 3DNow! or SSE. */
996#if   defined(HAVE_3DNOW)
997	prefetchw	2 * SIZE(CO)
998#elif defined(HAVE_SSE)
999	prefetchnta	2 * SIZE(CO)
1000#endif
1001
/* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop is unrolled by 4. */
1002#if defined(LT) || defined(RN)
1003	movl	KK, %eax
1004#else
1005	movl	K,  %eax
1006	subl	KK, %eax
1007#endif
1008	sarl	$2, %eax
1009 	je	.L35
1010	ALIGN_4
1011
/* Unrolled x4: each step loads one B value, multiplies it by two A values and
   adds the products into the two accumulators. */
1012.L32:
1013	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
1014
1015	FLD	 -8 * SIZE(BO)
1016	FLD	 -8 * SIZE(AO)
1017	fmul	 %st(1), %st
1018	faddp	 %st, %st(2)
1019
1020	FLD	 -7 * SIZE(AO)
1021	fmulp	 %st, %st(1)
1022	faddp	 %st, %st(2)
1023
1024	FLD	 -7 * SIZE(BO)
1025	FLD	 -6 * SIZE(AO)
1026	fmul	 %st(1), %st
1027	faddp	 %st, %st(2)
1028
1029	FLD	 -5 * SIZE(AO)
1030	fmulp	 %st, %st(1)
1031	faddp	 %st, %st(2)
1032
1033	FLD	 -6 * SIZE(BO)
1034	FLD	 -4 * SIZE(AO)
1035	fmul	 %st(1), %st
1036	faddp	 %st, %st(2)
1037
1038	FLD	 -3 * SIZE(AO)
1039	fmulp	 %st, %st(1)
1040	faddp	 %st, %st(2)
1041
1042	FLD	 -5 * SIZE(BO)
1043	FLD	 -2 * SIZE(AO)
1044	fmul	 %st(1), %st
1045	faddp	 %st, %st(2)
1046
1047	FLD	 -1 * SIZE(AO)
1048	fmulp	 %st, %st(1)
1049	faddp	 %st, %st(2)
1050
/* Four k-steps consumed: 8 A elements and 4 B elements. */
1051	addl	$8 * SIZE,AO
1052	addl	$4 * SIZE,BO
1053
1054	decl	%eax
1055	jne	.L32
1056	ALIGN_4
1057
/* Remainder: same trip count modulo 4, one k-step per iteration. */
1058.L35:
1059#if defined(LT) || defined(RN)
1060	movl	KK, %eax
1061#else
1062	movl	K,  %eax
1063	subl	KK, %eax
1064#endif
1065	and	$3,  %eax
1066	je	.L38
1067	ALIGN_4
1068
1069.L36:
1070	FLD	 -8 * SIZE(BO)
1071
1072	FLD	 -8 * SIZE(AO)
1073	fmul	 %st(1), %st
1074	faddp	 %st, %st(2)
1075
1076	FLD	 -7 * SIZE(AO)
1077	fmulp	 %st, %st(1)
1078	faddp	 %st, %st(2)
1079
1080	addl	$2 * SIZE,AO
1081	addl	$1 * SIZE,BO
1082
1083	decl	%eax
1084	jne	 .L36
1085	ALIGN_4
1086
/* 2x1 tile: subtract the accumulated products from the packed right-hand side,
   apply the triangular solve for the active variant, write the two results
   back to the packed buffer and to C, advance, then loop back to .L31.
   On entry the x87 stack holds exactly two accumulators:
     st(0) = row 0 sum, st(1) = row 1 sum.
   Comments assume FLD pushes and FST stores-and-pops (macros from common.h). */
1087.L38:
/* LN/RT: re-point AO/BO at the tile being solved: (KK - 2) for LN, (KK - 1)
   for RT, scaled by 2 elements for A and 1 element for B. */
1088#if defined(LN) || defined(RT)
1089	movl	KK, %eax
1090#ifdef LN
1091	subl	$2, %eax
1092#else
1093	subl	$1, %eax
1094#endif
1095
1096	sall	$BASE_SHIFT, %eax
1097
1098	movl	AORIG, AO
1099	leal	(AO, %eax, 2), AO
1100	leal	(B,  %eax, 1), BO
1101#endif
1102
/* Combine each accumulator with its packed value. After every FLD the two
   accumulators sit at st(1) and st(2); nothing deeper belongs to this kernel,
   so the second fsubp must target st(2) in both branches (the RN/RT branch
   previously named st(3), which is not a live accumulator here). */
1103#if defined(LN) || defined(LT)
1104	FLD	-8 * SIZE(BO)
1105	fsubp	%st, %st(1)
1106	FLD	-7 * SIZE(BO)
1107	fsubp	%st, %st(2)
1108#else
1109	FLD	-8 * SIZE(AO)
1110	fsubp	%st, %st(1)
1111	FLD	-7 * SIZE(AO)
1112	fsubp	%st, %st(2)
1113#endif
1114
/* LN: x1 *= AO[-5]; x0 -= AO[-6] * x1; x0 *= AO[-8]. */
1115#ifdef LN
1116       FLD	-5 * SIZE(AO)
1117       fmulp	%st, %st(2)
1118
1119       FLD	-6 * SIZE(AO)
1120       fmul	%st(2), %st
1121
1122       fsubrp	%st, %st(1)
1123       FLD	-8 * SIZE(AO)
1124       fmulp	%st, %st(1)
1125#endif
1126
/* LT: x0 *= AO[-8]; x1 -= AO[-7] * x0; x1 *= AO[-5]. */
1127#ifdef LT
1128       FLD	-8 * SIZE(AO)
1129       fmulp	%st, %st(1)
1130
1131       FLD	-7 * SIZE(AO)
1132       fmul	%st(1), %st
1133
1134       fsubrp	%st, %st(2)
1135
1136       FLD	-5 * SIZE(AO)
1137       fmulp	%st, %st(2)
1138#endif
1139
/* RN/RT: the B tile is 1x1, so scale both values by BO[-8]. */
1140#ifdef RN
1141       FLD	-8 * SIZE(BO)
1142       fmul	%st, %st(1)
1143       fmulp	%st, %st(2)
1144#endif
1145
1146#ifdef RT
1147       FLD	-8 * SIZE(BO)
1148       fmul	%st, %st(1)
1149       fmulp	%st, %st(2)
1150#endif
1151
/* LN walks C backwards: step CO back two elements before storing. */
1152#ifdef LN
1153	subl	$2 * SIZE, CO
1154#endif
1155
/* Write both results back to the packed buffer (duplicate, store-and-pop).
   Afterwards the stack is st(0) = x1, st(1) = x0. */
1156#if defined(LN) || defined(LT)
1157	fld	%st
1158	FST	-8 * SIZE(BO)
1159	fxch	%st(1)
1160	fld	%st
1161	FST	-7 * SIZE(BO)
1162#else
1163	fld	%st
1164	FST	-8 * SIZE(AO)
1165	fxch	%st(1)
1166	fld	%st
1167	FST	-7 * SIZE(AO)
1168#endif
1169
/* x1 -> CO[1], x0 -> CO[0]; the x87 stack is empty after this. */
1170	FST	1 * SIZE(CO)
1171	FST	0 * SIZE(CO)
1172
1173#ifndef LN
1174	addl	$2 * SIZE, CO
1175#endif
1176
/* LT/RN: skip the remaining K - KK steps of the packed panels. */
1177#if defined(LT) || defined(RN)
1178	movl	K,  %eax
1179	subl	KK, %eax
1180	sall	$BASE_SHIFT, %eax
1181	leal	(AO, %eax, 2), AO
1182	leal	(BO, %eax, 1), BO
1183#endif
1184
1185#ifdef LN
1186	subl	$2, KK
1187#endif
1188
1189#ifdef LT
1190	addl	$2, KK
1191#endif
1192
/* RT: AORIG += 2 * K elements (two rows of packed A). */
1193#ifdef RT
1194       movl	K, %eax
1195       sall	$1 + BASE_SHIFT, %eax
1196       addl	%eax, AORIG
1197#endif
1198
/* Next row pair. */
1199	decl	I
1200	jne	.L31
1201	ALIGN_4
1202
/* End of the single-column pass: same bookkeeping as the column-pair tail,
   with one-column strides. Nothing after this point reads B or KK in the
   visible code. */
1203.L49:
/* LN: B += K elements. */
1204#ifdef LN
1205       movl	K, %eax
1206       sall	$BASE_SHIFT, %eax
1207       leal	(B, %eax, 1), B
1208#endif
1209
1210#if defined(LT) || defined(RN)
1211	movl	BO, B
1212#endif
1213
1214#ifdef RN
1215	addl	$1, KK
1216#endif
1217
1218#ifdef RT
1219	subl	$1, KK
1220#endif
1221	ALIGN_4
1222
/* Common exit: restore the four saved registers in reverse push order,
   release the ARGS-byte local area, and return. */
1223.L999:
1224	popl	%ebx
1225	popl	%esi
1226	popl	%edi
1227	popl	%ebp
1228	addl	$ARGS, %esp
1229	ret
1230
1231	EPILOGUE
1232