1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Prefetch mnemonics: prefetch/prefetchw when OPTERON is defined,
   prefetcht0 for both names otherwise.  Only PREFETCH is used below. */
42#ifdef OPTERON
43#define PREFETCH	prefetch
44#define PREFETCHW	prefetchw
45#else
46#define PREFETCH	prefetcht0
47#define PREFETCHW	prefetcht0
48#endif
49
/* Prefetch distance (in elements) applied to the AO stream in the k-loops. */
50#define PREFETCHSIZE (5 + 4 * 10)
/* STACK: bytes occupied by the four register pushes in the prologue.
   ARGS : bytes of scratch space reserved by the prologue's subl. */
51#define STACK	16
52#define ARGS	16
53
/* Scratch slots located just above the saved registers:
   J     - remaining column-pair passes,
   KK    - running k offset used to size and position each tile's k-loop,
   AORIG - base A pointer for the current row block (LN/RT variants). */
54#define J	 0 + STACK(%esp)
55#define KK	 4 + STACK(%esp)
56#define AORIG	 8 + STACK(%esp)
57
/* Incoming stack arguments, addressed past the saved registers and the
   scratch area.  NOTE(review): ALPHA sits at +16 and A at +32, so alpha
   appears to occupy 16 bytes - confirm against the caller's prototype.
   ALPHA is not referenced by the code visible in this file. */
58#define M	 4 + STACK + ARGS(%esp)
59#define N	 8 + STACK + ARGS(%esp)
60#define K	12 + STACK + ARGS(%esp)
61#define ALPHA	16 + STACK + ARGS(%esp)
62#define A	32 + STACK + ARGS(%esp)
63#define ARG_B	36 + STACK + ARGS(%esp)
64#define C	40 + STACK + ARGS(%esp)
65#define ARG_LDC	44 + STACK + ARGS(%esp)
66#define OFFSET	48 + STACK + ARGS(%esp)
67
/* Register roles:
   I   - row-tile loop counter,
   B   - base of the packed B panel (loaded from ARG_B),
   CO  - current write position in C,
   AO  - read cursor in A,
   BO  - read cursor in B,
   LDC - column stride of C, scaled to bytes in the prologue. */
68#define I	%esi
69#define B	%ebx
70#define CO	%edi
71#define AO	%edx
72#define BO	%ecx
73#define LDC	%ebp
74
/* Not referenced by the code visible in this file. */
75#define PREFETCH_OFFSET 48
76
/*
 * Kernel entry.  Reserves the scratch area, saves the registers used as
 * I/B/CO/LDC, then biases the A, B and C pointers and initialises KK
 * according to the LN / LT / RN / RT build variant.
 */
77	PROLOGUE
78
79	subl	$ARGS, %esp	# Generate Stack Frame
80
	/* Restored in the reverse order at .L999. */
81	pushl	%ebp
82	pushl	%edi
83	pushl	%esi
84	pushl	%ebx
85
86	PROFCODE
87
	/* LDC is shifted by BASE_SHIFT (from common.h; presumably log2 of the
	   element size - confirm) so it can be added directly to C pointers. */
88	movl	ARG_LDC, LDC
89	movl	ARG_B,   B
90	sall	$BASE_SHIFT, LDC
91
	/* Bias A (the argument slot itself) and B by 8 elements; every later
	   access to AO/BO uses displacements of -8*SIZE .. -1*SIZE. */
92	addl	$8 * SIZE, A
93	addl	$8 * SIZE, B
94
	/* LN: C += M elements, A += M*K elements. */
95#ifdef LN
96       movl	M, %eax
97       sall	$BASE_SHIFT, %eax
98       addl	%eax, C
99       imull	K, %eax
100       addl	%eax, A
101#endif
102
	/* RT: B += N*K elements, C += N*LDC (LDC already in bytes, in %ebp). */
103#ifdef RT
104       movl	N, %eax
105       sall	$BASE_SHIFT, %eax
106       imull	K, %eax
107       addl	%eax, B
108
109       movl	N,    %eax
110       imull	%ebp, %eax
111       addl	%eax, C
112#endif
113
	/* RN: KK = -OFFSET. */
114#ifdef RN
115       movl	OFFSET, %eax
116       negl	%eax
117       movl	%eax, KK
118#endif
119
	/* RT: KK = N - OFFSET. */
120#ifdef RT
121       movl	N, %eax
122       subl	OFFSET, %eax
123       movl	%eax, KK
124#endif
125
	/* J = N >> 1 column pairs.  The je tests the flags left by sarl
	   (the movl store does not change them); no pairs -> .L30. */
126	movl	N,   %eax
127	sarl	$1,  %eax
128	movl	%eax, J
129	je	.L30
130	ALIGN_4
131
/*
 * .L01: start of one pass over a pair of columns (J counts the passes).
 * Sets the A cursor, the C pointer CO and KK for this pass, then enters
 * the loop over two-row tiles.
 */
132.L01:
	/* LT/RN read A forwards from its start; LN/RT keep the base in AORIG. */
133#if defined(LT) || defined(RN)
134	movl	A, AO
135#else
136	movl	A, %eax
137	movl	%eax, AORIG
138#endif
139
	/* RT: step B back by 2*K elements. */
140#ifdef RT
141	movl	K, %eax
142	sall	$1 + BASE_SHIFT, %eax
143	subl	%eax, B
144#endif
145
	/* %eax = 2 * LDC, the byte width of two columns of C. */
146	lea	(, LDC, 2), %eax
147
	/* RT moves C back before CO is taken; the other variants take CO
	   first and then advance C to the next column pair. */
148#ifdef RT
149	subl	%eax, C
150#endif
151	movl	C, CO
152#ifndef RT
153	addl	%eax, C
154#endif
155
	/* LN: KK = OFFSET + M. */
156#ifdef LN
157	movl	OFFSET, %eax
158	addl	M, %eax
159	movl	%eax, KK
160#endif
161
	/* LT: KK = OFFSET. */
162#ifdef LT
163	movl	OFFSET, %eax
164	movl	%eax, KK
165#endif
166
	/* I = M >> 1 two-row tiles; with none, go to the single-row case. */
167	movl	M,  I
168	sarl	$1, I
169	je	.L20
170	ALIGN_4
171
/*
 * .L11: set up one 2x2 tile - position AO/BO, clear the four x87
 * accumulators and compute the trip count of the unrolled k-loop.
 */
172.L11:
	/* LN: step AORIG back by 2*K elements. */
173#ifdef LN
174       movl	K, %eax
175       sall	$1 + BASE_SHIFT, %eax
176       subl	%eax, AORIG
177#endif
178
	/* LN/RT: AO = AORIG + 2*KK elements, BO = B + 2*KK elements.
	   LT/RN: AO keeps its running value, BO restarts at B. */
179#if defined(LN) || defined(RT)
180	movl	KK, %eax
181	sall	$BASE_SHIFT, %eax
182	movl	AORIG, AO
183	leal	(AO, %eax, 2), AO
184	leal	(B,  %eax, 2), BO
185#else
186	movl	B, BO
187#endif
188
	/* Four zeroed accumulators on the x87 stack. */
189	fldz
190	fldz
191	fldz
192	fldz
193
	/* Prefetch the two C columns this tile will write. */
194#if   defined(HAVE_3DNOW)
195	prefetchw	2 * SIZE(CO)
196 	prefetchw	2 * SIZE(CO, LDC, 1)
197#elif defined(HAVE_SSE)
198	prefetchnta	2 * SIZE(CO)
199 	prefetchnta	2 * SIZE(CO, LDC, 1)
200#endif
201
	/* k-loop length: KK for LT/RN, K - KK for LN/RT.  The unrolled loop
	   runs length >> 2 times; zero trips -> remainder handling at .L15. */
202#if defined(LT) || defined(RN)
203	movl	KK, %eax
204#else
205	movl	K,  %eax
206	subl	KK, %eax
207#endif
208	sarl	$2, %eax
209 	je	.L15
210	ALIGN_4
211
/*
 * .L12: k-loop of the 2x2 tile, unrolled four times.  Each step loads two
 * A values (a0, a1) and two B values (b0, b1) and adds the four products
 * to the accumulators.  At every step boundary the stack holds, from
 * st(0) to st(3), the sums of a0*b0, a0*b1, a1*b0 and a1*b1.
 */
212.L12:
213	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
214
	/* Step 1: a = AO[-8], AO[-7]; b = BO[-8], BO[-7]. */
215	FLD	 -8 * SIZE(AO)
216
	/* a0*b0 is formed on a copy of a0 and added to the first accumulator. */
217	FLD	 -8 * SIZE(BO)
218	fld	 %st(1)
219	fmul	 %st(1), %st
220	faddp	 %st, %st(3)
221
	/* The a0 slot becomes a0*b1. */
222	FLD	 -7 * SIZE(BO)
223	fmul	 %st, %st(2)
224
	/* The b0 slot becomes a1*b0, then a1 is folded into b1 giving a1*b1. */
225	FLD	 -7 * SIZE(AO)
226	fmul	 %st, %st(2)
227	fmulp	 %st, %st(1)
228
	/* Pop a1*b1, a1*b0, a0*b1 into the 4th, 3rd and 2nd accumulators. */
229	faddp	 %st, %st(6)
230	faddp	 %st, %st(4)
231	faddp	 %st, %st(2)
232
	/* Step 2: a = AO[-6], AO[-5]; b = BO[-6], BO[-5]. */
233	FLD	 -6 * SIZE(AO)
234
235	FLD	 -6 * SIZE(BO)
236	fld	 %st(1)
237	fmul	 %st(1), %st
238	faddp	 %st, %st(3)
239
240	FLD	 -5 * SIZE(BO)
241	fmul	 %st, %st(2)
242
243	FLD	 -5 * SIZE(AO)
244	fmul	 %st, %st(2)
245	fmulp	 %st, %st(1)
246
247	faddp	 %st, %st(6)
248	faddp	 %st, %st(4)
249	faddp	 %st, %st(2)
250
251	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)
252
	/* Step 3: a = AO[-4], AO[-3]; b = BO[-4], BO[-3]. */
253	FLD	 -4 * SIZE(AO)
254
255	FLD	 -4 * SIZE(BO)
256	fld	 %st(1)
257	fmul	 %st(1), %st
258	faddp	 %st, %st(3)
259
260	FLD	 -3 * SIZE(BO)
261	fmul	 %st, %st(2)
262
263	FLD	 -3 * SIZE(AO)
264	fmul	 %st, %st(2)
265	fmulp	 %st, %st(1)
266
267	faddp	 %st, %st(6)
268	faddp	 %st, %st(4)
269	faddp	 %st, %st(2)
270
	/* Step 4: a = AO[-2], AO[-1]; b = BO[-2], BO[-1]. */
271	FLD	 -2 * SIZE(AO)
272
273	FLD	 -2 * SIZE(BO)
274	fld	 %st(1)
275	fmul	 %st(1), %st
276	faddp	 %st, %st(3)
277
278	FLD	 -1 * SIZE(BO)
279	fmul	 %st, %st(2)
280
281	FLD	 -1 * SIZE(AO)
282	fmul	 %st, %st(2)
283	fmulp	 %st, %st(1)
284
285	faddp	 %st, %st(6)
286	faddp	 %st, %st(4)
287	faddp	 %st, %st(2)
288
	/* Four k steps consumed: 2 elements per step from each stream. */
289	addl	$8 * SIZE,AO
290	addl	$8 * SIZE,BO
291
292	decl	%eax
293	jne	.L12
294	ALIGN_4
295
/*
 * .L15/.L16: remainder of the 2x2 k-loop.  Recomputes the loop length and
 * runs (length & 3) single steps identical to one step of .L12.
 */
296.L15:
297#if defined(LT) || defined(RN)
298	movl	KK, %eax
299#else
300	movl	K,  %eax
301	subl	KK, %eax
302#endif
303	and	$3,  %eax
304	je	.L18
305	ALIGN_4
306
307.L16:
	/* One k step: a = AO[-8], AO[-7]; b = BO[-8], BO[-7]. */
308	FLD	 -8 * SIZE(AO)
309
310	FLD	 -8 * SIZE(BO)
311	fld	 %st(1)
312	fmul	 %st(1), %st
313	faddp	 %st, %st(3)
314
315	FLD	 -7 * SIZE(BO)
316	fmul	 %st, %st(2)
317
318	FLD	 -7 * SIZE(AO)
319	fmul	 %st, %st(2)
320	fmulp	 %st, %st(1)
321
322	faddp	 %st, %st(6)
323	faddp	 %st, %st(4)
324	faddp	 %st, %st(2)
325
326	addl	$2 * SIZE,AO
327	addl	$2 * SIZE,BO
328
329	decl	%eax
330	jne	 .L16
331	ALIGN_4
332
/*
 * .L18: finish one 2x2 tile.  Combines the accumulated products with the
 * stored right-hand side, applies the 2x2 triangular solve for the active
 * variant, writes the result back to the packed buffer and to C, and
 * advances the pointers and KK for the next tile.
 *
 * Accumulator naming used below (stack order on entry):
 *   acc0 = sum a0*b0, acc1 = sum a0*b1, acc2 = sum a1*b0, acc3 = sum a1*b1.
 *
 * NOTE(review): the pivots are applied with multiplies, so the diagonal
 * entries are presumably stored pre-inverted by the packing code - confirm.
 * NOTE(review): fsubp/fsubrp operand direction follows the GNU as AT&T
 * convention; the sequence is consistent with "rhs - acc" for fsubp and
 * "acc - product" for fsubrp - confirm with the assembler in use.
 */
333.L18:
	/* LN/RT: reposition AO/BO at AORIG/B + 2*(KK - 2) elements.  Both
	   branches of the inner #ifdef subtract 2 for this tile shape. */
334#if defined(LN) || defined(RT)
335	movl	KK, %eax
336#ifdef LN
337	subl	$2, %eax
338#else
339	subl	$2, %eax
340#endif
341
342	sall	$BASE_SHIFT, %eax
343
344	movl	AORIG, AO
345	leal	(AO, %eax, 2), AO
346	leal	(B,  %eax, 2), BO
347#endif
348
	/* LN/LT: BO[-8..-5] pair with acc0, acc1, acc2, acc3 in that order.
	   RN/RT: AO[-8..-5] pair with acc0, acc2, acc1, acc3 (note st(3)
	   before st(2)). */
349#if defined(LN) || defined(LT)
350	FLD	-8 * SIZE(BO)
351	fsubp	%st, %st(1)
352	FLD	-7 * SIZE(BO)
353	fsubp	%st, %st(2)
354	FLD	-6 * SIZE(BO)
355	fsubp	%st, %st(3)
356	FLD	-5 * SIZE(BO)
357	fsubp	%st, %st(4)
358#else
359	FLD	-8 * SIZE(AO)
360	fsubp	%st, %st(1)
361	FLD	-7 * SIZE(AO)
362	fsubp	%st, %st(3)
363	FLD	-6 * SIZE(AO)
364	fsubp	%st, %st(2)
365	FLD	-5 * SIZE(AO)
366	fsubp	%st, %st(4)
367#endif
368
	/* LN: scale acc2, acc3 by AO[-5]; eliminate with AO[-6]
	   (acc0 uses acc2, acc1 uses acc3); scale acc0, acc1 by AO[-8]. */
369#ifdef LN
370       FLD	-5 * SIZE(AO)
371       fmul	%st, %st(3)
372       fmulp	%st, %st(4)
373
374       FLD	-6 * SIZE(AO)
375       fmul	%st(3), %st
376       FLD	-6 * SIZE(AO)
377       fmul	%st(5), %st
378
379       fsubrp	%st, %st(3)
380       fsubrp	%st, %st(1)
381
382       FLD	-8 * SIZE(AO)
383       fmul	%st, %st(1)
384       fmulp	%st, %st(2)
385#endif
386
	/* LT: scale acc0, acc1 by AO[-8]; eliminate with AO[-7]
	   (acc2 uses acc0, acc3 uses acc1); scale acc2, acc3 by AO[-5]. */
387#ifdef LT
388       FLD	-8 * SIZE(AO)
389       fmul	%st, %st(1)
390       fmulp	%st, %st(2)
391
392       FLD	-7 * SIZE(AO)
393       fmul	%st(1), %st
394       FLD	-7 * SIZE(AO)
395       fmul	%st(3), %st
396
397       fsubrp	%st, %st(5)
398       fsubrp	%st, %st(3)
399
400       FLD	-5 * SIZE(AO)
401       fmul	%st, %st(3)
402       fmulp	%st, %st(4)
403#endif
404
	/* RN: scale acc0, acc2 by BO[-8]; eliminate with BO[-7]
	   (acc1 uses acc0, acc3 uses acc2); scale acc1, acc3 by BO[-5]. */
405#ifdef RN
406       FLD	-8 * SIZE(BO)
407       fmul	%st, %st(1)
408       fmulp	%st, %st(3)
409
410       FLD	-7 * SIZE(BO)
411       fmul	%st(1), %st
412       FLD	-7 * SIZE(BO)
413       fmul	%st(4), %st
414
415       fsubrp	%st, %st(5)
416       fsubrp	%st, %st(2)
417
418       FLD	-5 * SIZE(BO)
419       fmul	%st, %st(2)
420       fmulp	%st, %st(4)
421#endif
422
	/* RT: scale acc1, acc3 by BO[-5]; eliminate with BO[-6]
	   (acc0 uses acc1, acc2 uses acc3); scale acc0, acc2 by BO[-8]. */
423#ifdef RT
424       FLD	-5 * SIZE(BO)
425       fmul	%st, %st(2)
426       fmulp	%st, %st(4)
427
428       FLD	-6 * SIZE(BO)
429       fmul	%st(2), %st
430       FLD	-6 * SIZE(BO)
431       fmul	%st(5), %st
432
433       fsubrp	%st, %st(4)
434       fsubrp	%st, %st(1)
435
436       FLD	-8 * SIZE(BO)
437       fmul	%st, %st(1)
438       fmulp	%st, %st(3)
439#endif
440
	/* LN walks C backwards: step CO back before storing. */
441#ifdef LN
442	subl	$2 * SIZE, CO
443#endif
444
	/* Write-back.  Each "fld %st / FST" pair stores a copy to the packed
	   buffer while keeping the value on the stack (this relies on FST,
	   from common.h, popping - confirm); fxch rotates the next value to
	   the top.  The final four FSTs drain the stack into C:
	   acc0 -> 0(CO), acc1 -> 0(CO,LDC), acc2 -> 1(CO), acc3 -> 1(CO,LDC). */
445#if defined(LN) || defined(LT)
446	fld	%st
447	FST	-8 * SIZE(BO)
448	fxch	%st(1)
449	fld	%st
450	FST	-7 * SIZE(BO)
451	fxch	%st(2)
452	fld	%st
453	FST	-6 * SIZE(BO)
454	fxch	%st(3)
455	fld	%st
456	FST	-5 * SIZE(BO)
457
458	FST	1 * SIZE(CO, LDC)
459	FST	0 * SIZE(CO)
460	FST	0 * SIZE(CO, LDC)
461	FST	1 * SIZE(CO)
462#else
463	fld	%st
464	FST	-8 * SIZE(AO)
465	fxch	%st(2)
466	fld	%st
467	FST	-7 * SIZE(AO)
468	fxch	%st(1)
469	fld	%st
470	FST	-6 * SIZE(AO)
471	fxch	%st(3)
472	fld	%st
473	FST	-5 * SIZE(AO)
474
475	FST	1 * SIZE(CO, LDC)
476	FST	1 * SIZE(CO)
477	FST	0 * SIZE(CO)
478	FST	0 * SIZE(CO, LDC)
479#endif
480
481#ifndef LN
482	addl	$2 * SIZE, CO
483#endif
484
	/* LT/RN: skip the unread tail of both panels, 2*(K - KK) elements. */
485#if defined(LT) || defined(RN)
486	movl	K,  %eax
487	subl	KK, %eax
488	sall	$BASE_SHIFT, %eax
489	leal	(AO, %eax, 2), AO
490	leal	(BO, %eax, 2), BO
491#endif
492
	/* KK moves by the two rows just solved (down for LN, up for LT). */
493#ifdef LN
494	subl	$2, KK
495#endif
496
497#ifdef LT
498	addl	$2, KK
499#endif
500
	/* RT: advance AORIG by 2*K elements to the next row block. */
501#ifdef RT
502       movl	K, %eax
503       sall	$1 + BASE_SHIFT, %eax
504       addl	%eax, AORIG
505#endif
506
507	decl	I
508	jne	.L11
509	ALIGN_4
510
/*
 * .L20/.L21: if M is odd, set up the leftover 1x2 tile (one row, two
 * columns): position AO/BO, clear two accumulators, compute the trip
 * count of the unrolled k-loop.
 */
511.L20:
512	movl	 M, %eax
513	andl	$1, %eax
514	je	.L29
515	ALIGN_4
516
517.L21:
	/* LN: step AORIG back by K elements (one row). */
518#ifdef LN
519       movl	K, %eax
520       sall	$0 + BASE_SHIFT, %eax
521       subl	%eax, AORIG
522#endif
523
	/* LN/RT: AO = AORIG + KK elements, BO = B + 2*KK elements. */
524#if defined(LN) || defined(RT)
525	movl	KK, %eax
526	sall	$BASE_SHIFT, %eax
527	movl	AORIG, AO
528	leal	(AO, %eax, 1), AO
529	leal	(B,  %eax, 2), BO
530#else
531	movl	B, BO
532#endif
533
	/* Two zeroed accumulators. */
534	fldz
535	fldz
536
	/* k-loop length: KK for LT/RN, K - KK for LN/RT; unrolled by 4. */
537#if defined(LT) || defined(RN)
538	movl	KK, %eax
539#else
540	movl	K,  %eax
541	subl	KK, %eax
542#endif
543	sarl	$2, %eax
544 	je	.L25
545	ALIGN_4
546
/*
 * .L22: k-loop of the 1x2 tile, unrolled four times.  Each step loads one
 * A value a and two B values b0, b1, adding a*b0 to the first accumulator
 * and a*b1 to the second.
 */
547.L22:
548	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
549
	/* Step 1: a = AO[-8]; b = BO[-8], BO[-7]. */
550	FLD	 -8 * SIZE(AO)
551
552	FLD	 -8 * SIZE(BO)
553	fmul	 %st(1), %st
554	faddp	 %st, %st(2)
555
	/* fmulp consumes a, leaving a*b1 on top for the second accumulator. */
556	FLD	 -7 * SIZE(BO)
557	fmulp	 %st, %st(1)
558	faddp	 %st, %st(2)
559
	/* Step 2: a = AO[-7]; b = BO[-6], BO[-5]. */
560	FLD	 -7 * SIZE(AO)
561
562	FLD	 -6 * SIZE(BO)
563	fmul	 %st(1), %st
564	faddp	 %st, %st(2)
565
566	FLD	 -5 * SIZE(BO)
567	fmulp	 %st, %st(1)
568	faddp	 %st, %st(2)
569
	/* Step 3: a = AO[-6]; b = BO[-4], BO[-3]. */
570	FLD	 -6 * SIZE(AO)
571
572	FLD	 -4 * SIZE(BO)
573	fmul	 %st(1), %st
574	faddp	 %st, %st(2)
575
576	FLD	 -3 * SIZE(BO)
577	fmulp	 %st, %st(1)
578	faddp	 %st, %st(2)
579
	/* Step 4: a = AO[-5]; b = BO[-2], BO[-1]. */
580	FLD	 -5 * SIZE(AO)
581
582	FLD	 -2 * SIZE(BO)
583	fmul	 %st(1), %st
584	faddp	 %st, %st(2)
585
586	FLD	 -1 * SIZE(BO)
587	fmulp	 %st, %st(1)
588	faddp	 %st, %st(2)
589
	/* Four k steps: 1 element per step from A, 2 per step from B. */
590	addl	$4 * SIZE,AO
591	addl	$8 * SIZE,BO
592
593	decl	%eax
594	jne	.L22
595	ALIGN_4
596
/*
 * .L25/.L26: remainder of the 1x2 k-loop - (length & 3) single steps,
 * each identical to one step of .L22.
 */
597.L25:
598#if defined(LT) || defined(RN)
599	movl	KK, %eax
600#else
601	movl	K,  %eax
602	subl	KK, %eax
603#endif
604	and	$3,  %eax
605	je	.L28
606	ALIGN_4
607
608.L26:
	/* One k step: a = AO[-8]; b = BO[-8], BO[-7]. */
609	FLD	 -8 * SIZE(AO)
610
611	FLD	 -8 * SIZE(BO)
612	fmul	 %st(1), %st
613	faddp	 %st, %st(2)
614
615	FLD	 -7 * SIZE(BO)
616	fmulp	 %st, %st(1)
617	faddp	 %st, %st(2)
618
619	addl	$1 * SIZE,AO
620	addl	$2 * SIZE,BO
621
622	decl	%eax
623	jne	 .L26
624	ALIGN_4
625
/*
 * .L28: finish the 1x2 tile.  acc0 (column 0) and acc1 (column 1) are
 * combined with the stored right-hand side, solved, written back to the
 * packed buffer and to C, and the pointers / KK are advanced.
 * NOTE(review): pivots are applied by multiplication, so they are
 * presumably stored pre-inverted - confirm against the packing code.
 */
626.L28:
	/* LN/RT: rewind by this tile's extent in the solved dimension:
	   1 for LN (one row), 2 for RT (two columns). */
627#if defined(LN) || defined(RT)
628	movl	KK, %eax
629#ifdef LN
630	subl	$1, %eax
631#else
632	subl	$2, %eax
633#endif
634
635	sall	$BASE_SHIFT, %eax
636
637	movl	AORIG, AO
638	leal	(AO, %eax, 1), AO
639	leal	(B,  %eax, 2), BO
640#endif
641
	/* Right-hand side comes from BO (LN/LT) or AO (RN/RT);
	   element -8 pairs with acc0, element -7 with acc1. */
642#if defined(LN) || defined(LT)
643	FLD	-8 * SIZE(BO)
644	fsubp	%st, %st(1)
645	FLD	-7 * SIZE(BO)
646	fsubp	%st, %st(2)
647#else
648	FLD	-8 * SIZE(AO)
649	fsubp	%st, %st(1)
650	FLD	-7 * SIZE(AO)
651	fsubp	%st, %st(2)
652#endif
653
	/* LN/LT: single row, so both values are just scaled by AO[-8]. */
654#if defined(LN) || defined(LT)
655       FLD	-8 * SIZE(AO)
656       fmul	%st, %st(1)
657       fmulp	%st, %st(2)
658#endif
659
	/* RN: scale acc0 by BO[-8]; fold BO[-7]*acc0 into acc1;
	   scale acc1 by BO[-5]. */
660#ifdef RN
661       FLD	-8 * SIZE(BO)
662       fmulp	%st, %st(1)
663
664       FLD	-7 * SIZE(BO)
665       fmul	%st(1), %st
666
667       fsubrp	%st, %st(2)
668
669       FLD	-5 * SIZE(BO)
670       fmulp	%st, %st(2)
671#endif
672
	/* RT: scale acc1 by BO[-5]; fold BO[-6]*acc1 into acc0;
	   scale acc0 by BO[-8]. */
673#ifdef RT
674       FLD	-5 * SIZE(BO)
675       fmulp	%st, %st(2)
676
677       FLD	-6 * SIZE(BO)
678       fmul	%st(2), %st
679
680       fsubrp	%st, %st(1)
681
682       FLD	-8 * SIZE(BO)
683       fmulp	%st, %st(1)
684#endif
685
686#ifdef LN
687	subl	$1 * SIZE, CO
688#endif
689
	/* Store copies to the packed buffer; after the fxch the stack is
	   acc1, acc0, which the two FSTs below drain into C. */
690#if defined(LN) || defined(LT)
691	fld	%st
692	FST	-8 * SIZE(BO)
693	fxch	%st(1)
694	fld	%st
695	FST	-7 * SIZE(BO)
696#else
697	fld	%st
698	FST	-8 * SIZE(AO)
699	fxch	%st(1)
700	fld	%st
701	FST	-7 * SIZE(AO)
702#endif
703
	/* acc1 -> second column, acc0 -> first column. */
704	FST	0 * SIZE(CO, LDC)
705	FST	0 * SIZE(CO)
706
707#ifndef LN
708	addl	$1 * SIZE, CO
709#endif
710
	/* LT/RN: skip the unread tail: (K - KK) elements of A, twice that of B. */
711#if defined(LT) || defined(RN)
712	movl	K,  %eax
713	subl	KK, %eax
714	sall	$BASE_SHIFT, %eax
715	leal	(AO, %eax, 1), AO
716	leal	(BO, %eax, 2), BO
717#endif
718
719#ifdef LN
720	subl	$1, KK
721#endif
722
723#ifdef LT
724	addl	$1, KK
725#endif
726
	/* RT: advance AORIG by K elements. */
727#ifdef RT
728       movl	K, %eax
729       sall	$0 + BASE_SHIFT, %eax
730       addl	%eax, AORIG
731#endif
732	ALIGN_4
733
/*
 * .L29: end of one column-pair pass.  Moves B to the next panel, adjusts
 * KK for the right-side variants and loops while J passes remain.
 */
734.L29:
	/* LN: B += 2*K elements. */
735#ifdef LN
736       movl	K, %eax
737       sall	$BASE_SHIFT, %eax
738       leal	(B, %eax, 2), B
739#endif
740
	/* LT/RN: BO already points past this panel; adopt it as the new B. */
741#if defined(LT) || defined(RN)
742	movl	BO, B
743#endif
744
	/* KK moves by the two columns just solved (up for RN, down for RT). */
745#ifdef RN
746	addl	$2, KK
747#endif
748
749#ifdef RT
750	subl	$2, KK
751#endif
752
753	decl	J
754	jne	.L01
755	ALIGN_4
756
/*
 * .L30: handle the last single column when N is odd.  Mirrors the setup
 * at .L01 with one column instead of two.
 */
757.L30:
758	movl	N,  %eax
759	testl	$1, %eax
760	je	.L999
761
762#if defined(LT) || defined(RN)
763	movl	A, AO
764#else
765	movl	A, %eax
766	movl	%eax, AORIG
767#endif
768
	/* RT: step B back by K elements (one column's panel). */
769#ifdef RT
770	movl	K, %eax
771	sall	$0 + BASE_SHIFT, %eax
772	subl	%eax, B
773#endif
774
	/* RT moves C back one column before taking CO; the other variants
	   take CO first and then advance C. */
775#ifdef RT
776	subl	LDC, C
777#endif
778	movl	C, CO
779#ifndef RT
780	addl	LDC, C
781#endif
782
	/* LN: KK = OFFSET + M. */
783#ifdef LN
784	movl	OFFSET, %eax
785	addl	M, %eax
786	movl	%eax, KK
787#endif
788
	/* LT: KK = OFFSET. */
789#ifdef LT
790	movl	OFFSET, %eax
791	movl	%eax, KK
792#endif
793
	/* I = M >> 1 two-row tiles; with none, go to the single-row case. */
794	movl	M,  I
795	sarl	$1, I
796	je	.L40
797	ALIGN_4
798
/*
 * .L31: set up one 2x1 tile (two rows, one column): position AO/BO,
 * clear two accumulators, compute the trip count of the unrolled k-loop.
 */
799.L31:
	/* LN: step AORIG back by 2*K elements. */
800#ifdef LN
801       movl	K, %eax
802       sall	$1 + BASE_SHIFT, %eax
803       subl	%eax, AORIG
804#endif
805
	/* LN/RT: AO = AORIG + 2*KK elements, BO = B + KK elements. */
806#if defined(LN) || defined(RT)
807	movl	KK, %eax
808	sall	$BASE_SHIFT, %eax
809	movl	AORIG, AO
810	leal	(AO, %eax, 2), AO
811	leal	(B,  %eax, 1), BO
812#else
813	movl	B, BO
814#endif
815
	/* Two zeroed accumulators. */
816	fldz
817	fldz
818
819#if   defined(HAVE_3DNOW)
820	prefetchw	2 * SIZE(CO)
821#elif defined(HAVE_SSE)
822	prefetchnta	2 * SIZE(CO)
823#endif
824
	/* k-loop length: KK for LT/RN, K - KK for LN/RT; unrolled by 4. */
825#if defined(LT) || defined(RN)
826	movl	KK, %eax
827#else
828	movl	K,  %eax
829	subl	KK, %eax
830#endif
831	sarl	$2, %eax
832 	je	.L35
833	ALIGN_4
834
/*
 * .L32: k-loop of the 2x1 tile, unrolled four times.  Each step loads one
 * B value b and two A values a0, a1, adding a0*b to the first accumulator
 * and a1*b to the second.
 */
835.L32:
836	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
837
	/* Step 1: b = BO[-8]; a = AO[-8], AO[-7]. */
838	FLD	 -8 * SIZE(BO)
839	FLD	 -8 * SIZE(AO)
840	fmul	 %st(1), %st
841	faddp	 %st, %st(2)
842
	/* fmulp consumes b, leaving a1*b on top for the second accumulator. */
843	FLD	 -7 * SIZE(AO)
844	fmulp	 %st, %st(1)
845	faddp	 %st, %st(2)
846
	/* Step 2: b = BO[-7]; a = AO[-6], AO[-5]. */
847	FLD	 -7 * SIZE(BO)
848	FLD	 -6 * SIZE(AO)
849	fmul	 %st(1), %st
850	faddp	 %st, %st(2)
851
852	FLD	 -5 * SIZE(AO)
853	fmulp	 %st, %st(1)
854	faddp	 %st, %st(2)
855
	/* Step 3: b = BO[-6]; a = AO[-4], AO[-3]. */
856	FLD	 -6 * SIZE(BO)
857	FLD	 -4 * SIZE(AO)
858	fmul	 %st(1), %st
859	faddp	 %st, %st(2)
860
861	FLD	 -3 * SIZE(AO)
862	fmulp	 %st, %st(1)
863	faddp	 %st, %st(2)
864
	/* Step 4: b = BO[-5]; a = AO[-2], AO[-1]. */
865	FLD	 -5 * SIZE(BO)
866	FLD	 -2 * SIZE(AO)
867	fmul	 %st(1), %st
868	faddp	 %st, %st(2)
869
870	FLD	 -1 * SIZE(AO)
871	fmulp	 %st, %st(1)
872	faddp	 %st, %st(2)
873
	/* Four k steps: 2 elements per step from A, 1 per step from B. */
874	addl	$8 * SIZE,AO
875	addl	$4 * SIZE,BO
876
877	decl	%eax
878	jne	.L32
879	ALIGN_4
880
/*
 * .L35/.L36: remainder of the 2x1 k-loop - (length & 3) single steps,
 * each identical to one step of .L32.
 */
881.L35:
882#if defined(LT) || defined(RN)
883	movl	KK, %eax
884#else
885	movl	K,  %eax
886	subl	KK, %eax
887#endif
888	and	$3,  %eax
889	je	.L38
890	ALIGN_4
891
892.L36:
	/* One k step: b = BO[-8]; a = AO[-8], AO[-7]. */
893	FLD	 -8 * SIZE(BO)
894
895	FLD	 -8 * SIZE(AO)
896	fmul	 %st(1), %st
897	faddp	 %st, %st(2)
898
899	FLD	 -7 * SIZE(AO)
900	fmulp	 %st, %st(1)
901	faddp	 %st, %st(2)
902
903	addl	$2 * SIZE,AO
904	addl	$1 * SIZE,BO
905
906	decl	%eax
907	jne	 .L36
908	ALIGN_4
909
/*
 * .L38: finish the 2x1 tile.  acc0 (row 0) and acc1 (row 1) are combined
 * with the stored right-hand side, solved, written back to the packed
 * buffer and to C, and the pointers / KK are advanced.
 * NOTE(review): pivots are applied by multiplication, so they are
 * presumably stored pre-inverted - confirm against the packing code.
 */
910.L38:
	/* LN/RT: rewind by this tile's extent in the solved dimension:
	   2 for LN (two rows), 1 for RT (one column). */
911#if defined(LN) || defined(RT)
912	movl	KK, %eax
913#ifdef LN
914	subl	$2, %eax
915#else
916	subl	$1, %eax
917#endif
918
919	sall	$BASE_SHIFT, %eax
920
921	movl	AORIG, AO
922	leal	(AO, %eax, 2), AO
923	leal	(B,  %eax, 1), BO
924#endif
925
	/* Right-hand side comes from BO (LN/LT) or AO (RN/RT);
	   element -8 pairs with acc0, element -7 with acc1. */
926#if defined(LN) || defined(LT)
927	FLD	-8 * SIZE(BO)
928	fsubp	%st, %st(1)
929	FLD	-7 * SIZE(BO)
930	fsubp	%st, %st(2)
931#else
932	FLD	-8 * SIZE(AO)
933	fsubp	%st, %st(1)
934	FLD	-7 * SIZE(AO)
935	fsubp	%st, %st(2)
936#endif
937
	/* LN: scale acc1 by AO[-5]; fold AO[-6]*acc1 into acc0;
	   scale acc0 by AO[-8]. */
938#ifdef LN
939       FLD	-5 * SIZE(AO)
940       fmulp	%st, %st(2)
941
942       FLD	-6 * SIZE(AO)
943       fmul	%st(2), %st
944
945       fsubrp	%st, %st(1)
946       FLD	-8 * SIZE(AO)
947       fmulp	%st, %st(1)
948#endif
949
	/* LT: scale acc0 by AO[-8]; fold AO[-7]*acc0 into acc1;
	   scale acc1 by AO[-5]. */
950#ifdef LT
951       FLD	-8 * SIZE(AO)
952       fmulp	%st, %st(1)
953
954       FLD	-7 * SIZE(AO)
955       fmul	%st(1), %st
956
957       fsubrp	%st, %st(2)
958
959       FLD	-5 * SIZE(AO)
960       fmulp	%st, %st(2)
961#endif
962
	/* RN/RT: single column, so both values are just scaled by BO[-8]. */
963#ifdef RN
964       FLD	-8 * SIZE(BO)
965       fmul	%st, %st(1)
966       fmulp	%st, %st(2)
967#endif
968
969#ifdef RT
970       FLD	-8 * SIZE(BO)
971       fmul	%st, %st(1)
972       fmulp	%st, %st(2)
973#endif
974
975#ifdef LN
976	subl	$2 * SIZE, CO
977#endif
978
	/* Store copies to the packed buffer; after the fxch the stack is
	   acc1, acc0, which the two FSTs below drain into C. */
979#if defined(LN) || defined(LT)
980	fld	%st
981	FST	-8 * SIZE(BO)
982	fxch	%st(1)
983	fld	%st
984	FST	-7 * SIZE(BO)
985#else
986	fld	%st
987	FST	-8 * SIZE(AO)
988	fxch	%st(1)
989	fld	%st
990	FST	-7 * SIZE(AO)
991#endif
992
	/* acc1 -> second row, acc0 -> first row of the column. */
993	FST	1 * SIZE(CO)
994	FST	0 * SIZE(CO)
995
996#ifndef LN
997	addl	$2 * SIZE, CO
998#endif
999
	/* LT/RN: skip the unread tail: 2*(K - KK) elements of A, (K - KK) of B. */
1000#if defined(LT) || defined(RN)
1001	movl	K,  %eax
1002	subl	KK, %eax
1003	sall	$BASE_SHIFT, %eax
1004	leal	(AO, %eax, 2), AO
1005	leal	(BO, %eax, 1), BO
1006#endif
1007
1008#ifdef LN
1009	subl	$2, KK
1010#endif
1011
1012#ifdef LT
1013	addl	$2, KK
1014#endif
1015
	/* RT: advance AORIG by 2*K elements. */
1016#ifdef RT
1017       movl	K, %eax
1018       sall	$1 + BASE_SHIFT, %eax
1019       addl	%eax, AORIG
1020#endif
1021
1022	decl	I
1023	jne	.L31
1024	ALIGN_4
1025
/*
 * .L40/.L41: if M is odd, set up the final 1x1 tile of the single column:
 * position AO/BO, clear one accumulator, compute the k-loop trip count.
 */
1026.L40:
1027	movl	 M, %eax
1028	andl	$1, %eax
1029	je	.L49
1030	ALIGN_4
1031
1032.L41:
	/* LN: step AORIG back by K elements. */
1033#ifdef LN
1034       movl	K, %eax
1035       sall	$0 + BASE_SHIFT, %eax
1036       subl	%eax, AORIG
1037#endif
1038
	/* LN/RT: AO = AORIG + KK elements, BO = B + KK elements. */
1039#if defined(LN) || defined(RT)
1040	movl	KK, %eax
1041	sall	$BASE_SHIFT, %eax
1042	movl	AORIG, AO
1043	leal	(AO, %eax, 1), AO
1044	leal	(B,  %eax, 1), BO
1045#else
1046	movl	B, BO
1047#endif
1048
	/* Single zeroed accumulator. */
1049	fldz
1050
	/* k-loop length: KK for LT/RN, K - KK for LN/RT; unrolled by 4. */
1051#if defined(LT) || defined(RN)
1052	movl	KK, %eax
1053#else
1054	movl	K,  %eax
1055	subl	KK, %eax
1056#endif
1057	sarl	$2, %eax
1058 	je	.L45
1059	ALIGN_4
1060
/*
 * .L42: k-loop of the 1x1 tile, unrolled four times - a plain dot
 * product: each step adds AO[i] * BO[i] to the single accumulator.
 */
1061.L42:
1062	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
1063
1064	FLD	 -8 * SIZE(AO)
1065	FLD	 -8 * SIZE(BO)
1066	fmulp	 %st, %st(1)
1067	faddp	 %st, %st(1)
1068
1069	FLD	 -7 * SIZE(AO)
1070	FLD	 -7 * SIZE(BO)
1071	fmulp	 %st, %st(1)
1072	faddp	 %st, %st(1)
1073
1074	FLD	 -6 * SIZE(AO)
1075	FLD	 -6 * SIZE(BO)
1076	fmulp	 %st, %st(1)
1077	faddp	 %st, %st(1)
1078
1079	FLD	 -5 * SIZE(AO)
1080	FLD	 -5 * SIZE(BO)
1081	fmulp	 %st, %st(1)
1082	faddp	 %st, %st(1)
1083
	/* Four k steps: 1 element per step from each stream. */
1084	addl	$4 * SIZE,AO
1085	addl	$4 * SIZE,BO
1086
1087	decl	%eax
1088	jne	.L42
1089	ALIGN_4
1090
/*
 * .L45/.L46: remainder of the 1x1 k-loop - (length & 3) single
 * multiply-accumulate steps.
 */
1091.L45:
1092#if defined(LT) || defined(RN)
1093	movl	KK, %eax
1094#else
1095	movl	K,  %eax
1096	subl	KK, %eax
1097#endif
1098	and	$3,  %eax
1099	je	.L48
1100	ALIGN_4
1101
1102.L46:
1103	FLD	 -8 * SIZE(AO)
1104
1105	FLD	 -8 * SIZE(BO)
1106	fmulp	 %st, %st(1)
1107	faddp	 %st, %st(1)
1108
1109	addl	$1 * SIZE,AO
1110	addl	$1 * SIZE,BO
1111
1112	decl	%eax
1113	jne	 .L46
1114	ALIGN_4
1115
/*
 * .L48: finish the 1x1 tile.  The accumulator is combined with the stored
 * right-hand side, scaled by the single pivot, written back to the packed
 * buffer and to C, and the pointers / KK are advanced.
 * NOTE(review): the pivot is applied by multiplication, so it is
 * presumably stored pre-inverted - confirm against the packing code.
 */
1116.L48:
	/* LN/RT: rewind to element KK - 1 of both panels (both branches of
	   the inner #ifdef subtract 1 for this tile shape). */
1117#if defined(LN) || defined(RT)
1118	movl	KK, %eax
1119#ifdef LN
1120	subl	$1, %eax
1121#else
1122	subl	$1, %eax
1123#endif
1124
1125	sall	$BASE_SHIFT, %eax
1126
1127	movl	AORIG, AO
1128	leal	(AO, %eax, 1), AO
1129	leal	(B,  %eax, 1), BO
1130#endif
1131
	/* Right-hand side comes from BO (LN/LT) or AO (RN/RT). */
1132#if defined(LN) || defined(LT)
1133	FLD	-8 * SIZE(BO)
1134	fsubp	%st, %st(1)
1135#else
1136	FLD	-8 * SIZE(AO)
1137	fsubp	%st, %st(1)
1138#endif
1139
	/* Pivot comes from AO for the left variants, BO for the right ones. */
1140#ifdef LN
1141       FLD	-8 * SIZE(AO)
1142       fmulp	%st, %st(1)
1143#endif
1144
1145#ifdef LT
1146       FLD	-8 * SIZE(AO)
1147       fmulp	%st, %st(1)
1148#endif
1149
1150#ifdef RN
1151       FLD	-8 * SIZE(BO)
1152       fmulp	%st, %st(1)
1153#endif
1154
1155#ifdef RT
1156       FLD	-8 * SIZE(BO)
1157       fmulp	%st, %st(1)
1158#endif
1159
1160#ifdef LN
1161	subl	$1 * SIZE, CO
1162#endif
1163
	/* Store a copy to the packed buffer, then the value itself to C. */
1164#if defined(LN) || defined(LT)
1165	fld	%st
1166	FST	-8 * SIZE(BO)
1167#else
1168	fld	%st
1169	FST	-8 * SIZE(AO)
1170#endif
1171
1172	FST	0 * SIZE(CO)
1173
1174#ifndef LN
1175	addl	$1 * SIZE, CO
1176#endif
1177
	/* LT/RN: skip the unread tail, (K - KK) elements of each panel. */
1178#if defined(LT) || defined(RN)
1179	movl	K,  %eax
1180	subl	KK, %eax
1181	sall	$BASE_SHIFT, %eax
1182	leal	(AO, %eax, 1), AO
1183	leal	(BO, %eax, 1), BO
1184#endif
1185
1186#ifdef LN
1187	subl	$1, KK
1188#endif
1189
1190#ifdef LT
1191	addl	$1, KK
1192#endif
1193
	/* RT: advance AORIG by K elements. */
1194#ifdef RT
1195       movl	K, %eax
1196       sall	$0 + BASE_SHIFT, %eax
1197       addl	%eax, AORIG
1198#endif
1199	ALIGN_4
1200
/*
 * .L49: end of the single-column pass.  Applies the same B / KK
 * bookkeeping as .L29, scaled for one column, then falls through to the
 * epilogue.
 */
1201.L49:
	/* LN: B += K elements. */
1202#ifdef LN
1203       movl	K, %eax
1204       sall	$BASE_SHIFT, %eax
1205       leal	(B, %eax, 1), B
1206#endif
1207
	/* LT/RN: adopt the advanced BO as the new B. */
1208#if defined(LT) || defined(RN)
1209	movl	BO, B
1210#endif
1211
1212#ifdef RN
1213	addl	$1, KK
1214#endif
1215
1216#ifdef RT
1217	subl	$1, KK
1218#endif
1219	ALIGN_4
1220
/*
 * .L999: common exit.  Restores the registers pushed in the prologue (in
 * reverse push order), releases the ARGS scratch area and returns.
 * %eax is not set to any particular value before ret.
 */
1221.L999:
1222	popl	%ebx
1223	popl	%esi
1224	popl	%edi
1225	popl	%ebp
	/* Undo the prologue's "subl $ARGS, %esp". */
1226	addl	$ARGS, %esp
1227	ret
1228
1229	EPILOGUE
1230