1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Scratch-area layout: ARGS bytes are reserved on entry (subl $ARGS),
 * then four callee-saved registers are pushed (STACK = 16 bytes), so
 * while the body runs the scratch slots live at STACK(%esp). */
#define STACK	16
#define ARGS	16

#define J	 0 + STACK(%esp)	/* remaining column-pair counter   */
#define KK	 4 + STACK(%esp)	/* running offset (LN/LT/RN/RT)    */
#define KKK	 8 + STACK(%esp)	/* NOTE(review): unused in this file */
#define AORIG	12 + STACK(%esp)	/* A panel origin for backward walks */

/* Incoming cdecl arguments, addressed above the scratch area and the
 * saved registers.  In DOUBLE builds ALPHA occupies 8 bytes, shifting
 * every following argument by 4. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A	24 + STACK + ARGS(%esp)
#define B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)
#else
#define A	20 + STACK + ARGS(%esp)
#define B	24 + STACK + ARGS(%esp)
#define C	28 + STACK + ARGS(%esp)
#define LDC	32 + STACK + ARGS(%esp)
#define OFFSET	36 + STACK + ARGS(%esp)
#endif

/* Distance (in elements) ahead of the current position that the main
 * loop prefetches B (and, on 512KB-L2 parts, A). */
#define PREFETCH_OFFSET 48

/* The original #if defined(PENTIUM3) || defined(PENTIUMM) conditional
 * defined REP identically on both branches, so it was dead; a single
 * definition is equivalent.  The macro itself appears unused in this
 * file (bare `rep` filler prefixes are written directly below). */
#define REP rep

#define AA	%edx	/* current A micro-panel pointer */
#define BB	%ecx	/* current B micro-panel pointer */
78
	PROLOGUE

	subl	$ARGS, %esp	# Generate Stack Frame (scratch for J/KK/AORIG)

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	LDC, %ebp		# ldc			# MEMORY
	movl	B,   %ebx		# %ebx tracks the current B panel
	leal	(, %ebp, SIZE), %ebp	# ldc in bytes

#ifdef LN
	/* LN: start at the right/bottom edge - advance C past all M
	 * columns and A past the whole M*K panel, then walk backwards. */
       movl	M, %eax
       leal	(, %eax, SIZE), %eax
       addl	%eax, C			# C += M elements
       imull	K, %eax
       addl	%eax, A			# A += M * K elements
#endif

#ifdef RT
	/* RT: advance B past its N*K panel and C past all N columns;
	 * the j loop then steps backwards through them. */
       movl	N, %eax
       leal	(, %eax, SIZE), %eax
       imull	K, %eax
       addl	%eax, %ebx		# B += N * K elements

       movl	N,    %eax
       imull	%ebp, %eax
       addl	%eax, C			# C += N * ldc
#endif

#ifdef RN
	negl	KK			# NOTE(review): negates the KK slot in
					# place; assumes KK already holds OFFSET
					# - TODO confirm (sibling kernels compute
					# KK = -OFFSET from OFFSET explicitly)
#endif

#ifdef RT
       movl	N, %eax
       subl	OFFSET, %eax
       movl	%eax, KK		# KK = N - OFFSET
#endif

	/* Handle an odd trailing column (n & 1) first; the two-column
	 * main loop starts at .L8. */
	movl	N,  %eax		# n			# MEMORY
	andl	$1, %eax
	je	.L8

#if defined(LT) || defined(RN)
	movl	A, AA			# forward walk: AA starts at A
#else
	movl	A, %eax
	movl	%eax, AORIG		# backward walk: remember panel origin
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, %ebx		# step B back one K-length column
#endif

#ifdef RT
	subl	%ebp, C			# step C back one column
#endif
	movl	C, %edi			# c			# MEMORY
#ifndef RT
	addl	%ebp, C			# forward variants: C -> next column
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		# KK = OFFSET + M
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		# KK = OFFSET
#endif

	movl	M,  %esi		# m			# MEMORY
	sarl	$1, %esi		# m >> 1 = number of 2-row tiles
	je	.L36			# none: go handle the m&1 row
	ALIGN_4
163
.L46:
	/* 2x1 micro-tile: two rows of A against the single remaining B
	 * column.  FPU stack layout during the dot-product loops:
	 * %st = current b value (temp1), %st(1) = c0, %st(2) = c1. */
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one 2-row panel (2*K elem)
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA,   %eax, 2), AA	# AA = AORIG + 2*KK elements
	leal	(%ebx, %eax, 1), BB	# BB = B     +   KK elements
#else
	movl	%ebx, BB
#endif

	fldz				# c1 = 0
	fldz				# c0 = 0
	FLD	0 * SIZE(BB)		# temp1 = *(boffset + 0)

#if defined(LT) || defined(RN)
	movl	KK, %eax		# inner length = KK
#else
	movl	K,  %eax
	subl	KK, %eax		# inner length = K - KK
#endif
	sarl	$1, %eax		# unrolled by 2
	je	.L56
	ALIGN_4

.L57:
	/* Two k steps per pass: c0 += a[2k]*b[k]; c1 += a[2k+1]*b[k]. */
	FLD	0 * SIZE(AA)		# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a0 * b0

	FMUL	1 * SIZE(AA)		# temp1 *= a1
	faddp	%st, %st(2)		# c1 += a1 * b0
	FLD	1 * SIZE(BB)		# temp1 = *(boffset + 1)

	FLD	2 * SIZE(AA)		# temp2 = *(aoffset + 2)
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a2 * b1

	FMUL	3 * SIZE(AA)		# temp1 *= a3
	faddp	%st, %st(2)		# c1 += a3 * b1
	FLD	2 * SIZE(BB)		# temp1 = next b

	addl	$4 * SIZE,AA
	addl	$2 * SIZE,BB
	dec	%eax
	jne	.L57
	ALIGN_4

.L56:
	/* Remainder: one more k step when the inner length was odd. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$1, %eax
	je	.L45
	ALIGN_4

	FLD	0 * SIZE(AA)		# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a0 * b0

	FMUL	1 * SIZE(AA)		# temp1 *= a1
	faddp	%st, %st(2)		# c1 += a1 * b0
	FLD	3 * SIZE(BB)		# NOTE(review): reloads a b value only to
					# discard it at .L45; offset 3 looks odd
					# (loop body reloads from offset 2) - the
					# value is dead either way, confirm harmless

	addl	$2 * SIZE,AA
	addl	$1 * SIZE,BB
	ALIGN_4

.L45:
	ffreep	%st(0)			# drop the leftover temp1; stack = c0, c1

	/* Rewind AA/BB to the start of this tile's solve region. */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax		# two rows in this tile
#else
	subl	$1, %eax		# one column of B
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA,   %eax, 2), AA
	leal	(%ebx, %eax, 1), BB
#endif

	/* Load the packed right-hand sides and subtract the accumulated
	 * products: b_i - c_i becomes the value to be solved. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(2)
#endif

	/* Triangular substitution against the 2x2 diagonal block of A
	 * (LN: backward, LT: forward) or the 1x1 diagonal of B (RN/RT).
	 * Diagonal entries are multiplied, i.e. assumed pre-inverted -
	 * the usual trsm-kernel packing convention; confirm upstream. */
#ifdef LN
       FLD	3 * SIZE(AA)		# a11 (inverted diagonal)
       fmulp	%st, %st(2)		# x1 = r1 * a11

       FLD	2 * SIZE(AA)		# a10
       fmul	%st(2), %st

       fsubrp	%st, %st(1)		# r0 -= a10 * x1
       FLD	0 * SIZE(AA)		# a00 (inverted diagonal)
       fmulp	%st, %st(1)		# x0 = r0 * a00
#endif

#ifdef LT
       FLD	0 * SIZE(AA)		# a00 (inverted diagonal)
       fmulp	%st, %st(1)		# x0 = r0 * a00

       FLD	1 * SIZE(AA)		# a01
       fmul	%st(1), %st

       fsubrp	%st, %st(2)		# r1 -= a01 * x0

       FLD	3 * SIZE(AA)		# a11 (inverted diagonal)
       fmulp	%st, %st(2)		# x1 = r1 * a11
#endif

#ifdef RN
       FLD	0 * SIZE(BB)		# b00 (inverted diagonal)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)		# scale both rows
#endif

#ifdef RT
       FLD	0 * SIZE(BB)		# b00 (inverted diagonal)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)		# scale both rows
#endif

#ifdef LN
	subl	$2 * SIZE, %edi		# LN stores to the tile above
#endif

	/* Write the solved values back into the packed buffer ... */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(1)
	FSTU	1 * SIZE(AA)
#endif

	/* ... and into C (both rows of the single column). */
	FST	1 * SIZE(%edi)
	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$2 * SIZE, %edi
#endif

	/* Advance AA/BB past the untouched K-KK tail for forward walks. */
#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       addl	%eax, AORIG		# undo the per-tile step-back
#endif

	decl	%esi			# i --
	jne	.L46
	ALIGN_4
354
.L36:
	/* Trailing single row (m & 1) against the single B column:
	 * a plain 1x1 dot product followed by a scalar solve. */
	movl	M,  %eax		# m			# MEMORY
	andl	$1, %eax		# m & 1
	je	.L99

#ifdef LN
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one 1-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA,   %eax, 1), AA	# AA = AORIG + KK elements
	leal	(%ebx, %eax, 1), BB	# BB = B     + KK elements
#else
	movl	%ebx, BB
#endif

	fldz				# c0 = 0

#if defined(LT) || defined(RN)
	movl	KK, %eax		# inner length = KK
#else
	movl	K,  %eax
	subl	KK, %eax		# inner length = K - KK
#endif
	test	%eax, %eax
	jle	.L52			# nothing to accumulate
	ALIGN_3

.L51:
	FLD	(AA)
	FMUL	(BB)
	addl	$1 * SIZE,AA
	addl	$1 * SIZE,BB
	faddp	%st,%st(1)		# c0 += a * b
	decl	%eax
	jne	.L51
	ALIGN_4

.L52:

	/* Rewind AA/BB to this element's solve position. */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA,   %eax, 1), AA
	leal	(%ebx, %eax, 1), BB
#endif

	/* r = rhs - c0 ... */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
#endif

	/* ... then scale by the (pre-inverted) diagonal entry. */
#if defined(LN) || defined(LT)
       FMUL	0 * SIZE(AA)
#else
       FMUL	0 * SIZE(BB)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi
#endif

	/* Store back to the packed buffer and to C. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
#endif

	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       addl	%eax, AORIG
#endif
	ALIGN_4

.L99:
	/* Column done: advance the B base pointer past the consumed
	 * single column and update KK for the right-side variants. */
#ifdef LN
       movl	K, %eax
       leal	(%ebx, %eax, SIZE), %ebx	# B += K elements
#endif
#if defined(LT) || defined(RN)
	movl	BB, %ebx			# B = end of consumed panel
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4
485
.L8:
	/* Main loop over column pairs: j = n >> 1 iterations. */
	movl	N,   %eax		# j = (n >> 1)		# MEMORY
	sarl	$1,  %eax
	movl	%eax, J			# j = (n >> 1)		# MEMORY
	je	.End
	ALIGN_4

.L34:
	/* Per-column-pair setup, mirroring the single-column path. */
#if defined(LT) || defined(RN)
	movl	A, AA			# forward walk: AA starts at A
#else
	movl	A, %eax
	movl	%eax, AORIG		# backward walk: remember panel origin
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, %ebx		# step B back one 2*K panel
#endif
	lea	(, %ebp, 2), %eax	# %eax = 2 * ldc (bytes)

#ifdef RT
	subl	%eax, C			# step C back two columns
#endif
	movl	C, %edi
#ifndef RT
	addl	%eax, C			# forward variants: C -> next pair
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		# KK = OFFSET + M
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		# KK = OFFSET
#endif

	movl	M,  %esi
	sarl	$1, %esi		# number of 2-row tiles
	je	.L12			# none: go handle the m&1 row
	ALIGN_4
531
.MainHead:
	/* 2x2 micro-tile, software-pipelined x87 kernel.  Four
	 * accumulators live in %st(4)..%st(7); %st(0)..%st(3) rotate
	 * current/next a and b values via fxch.  The statement order is
	 * load/arith interleaving for the FPU pipeline - do not reorder. */
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one 2-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA,   %eax, 2), AA	# AA = AORIG + 2*KK elements
	leal	(%ebx, %eax, 2), BB	# BB = B     + 2*KK elements
#else
	movl	%ebx, BB
#endif

	fldz				# four accumulators = 0
	fldz
	fldz
	fldz

	FLD	 4 * SIZE(BB)		# b5
	FLD	 4 * SIZE(AA)		# a5
	FLD	 0 * SIZE(BB)		# b1
	FLD	 0 * SIZE(AA)		# a1

	/* Warm the two destination C columns. */
#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(%edi)
 	prefetchw	2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(%edi)
 	prefetchnta	2 * SIZE(%edi, %ebp, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax		# inner length = KK
#else
	movl	K,  %eax
	subl	KK, %eax		# inner length = K - KK
#endif
	sarl	$2, %eax		# unrolled by 4
 	je	.L16
	ALIGN_4

.MainLoop:
	/* Stream the B (and optionally A) panels ahead of use. */
#if defined(HAVE_3DNOW)
	prefetch	(PREFETCH_OFFSET) * SIZE(BB)
	nop
#elif defined(HAVE_SSE)
	prefetchnta	(PREFETCH_OFFSET) * SIZE(BB)
#if (L2_SIZE == 524288)
	prefetcht0	(PREFETCH_OFFSET) * SIZE(AA)
#endif
#endif

	/* k+0 / k+1: rank-1 updates of the 2x2 accumulator block. */
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(AA)

	fmul	%st, %st(1)
	FMUL	 3 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 3 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	 3 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 8 * SIZE(BB)		# preload next group's b
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 8 * SIZE(AA)		# preload next group's a
	fxch	%st(2)

#if !defined(HAVE_3DNOW) && defined(HAVE_SSE)  && defined(DOUBLE)
	prefetchnta	(PREFETCH_OFFSET + 4) * SIZE(BB)
#if (L2_SIZE == 524288)
	prefetcht0	(PREFETCH_OFFSET + 4) * SIZE(AA)
#endif
#endif

	/* k+2 / k+3: same pattern on the second half of the group. */
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 4 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 5 * SIZE(AA)
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	 6 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	 6 * SIZE(AA)

	fmul	%st, %st(3)
	FMUL	 7 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 6 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 7 * SIZE(AA)
	fmul	%st, %st(3)
	FMUL	 7 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	12 * SIZE(BB)		# preload for the next iteration
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	12 * SIZE(AA)
	fxch	%st(2)

	subl	$-8 * SIZE, BB		# BB += 8 elements (2 cols * 4 k)
	subl	$-8 * SIZE, AA		# AA += 8 elements (2 rows * 4 k)
	decl	%eax			# l --
	jne	.MainLoop
	ALIGN_4

.L16:
	/* Remainder: up to 3 leftover k steps. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	and	$3,  %eax
	je	.L21
	ALIGN_4
680	ALIGN_4
681
.SubLoop:
	/* One k step: same rank-1 update pattern as .MainLoop. */
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(AA)

	addl	$2 * SIZE,BB
	addl	$2 * SIZE,AA
	decl	%eax
	jne	 .SubLoop
	ALIGN_4

.L21:
	/* Drop the four pipeline temporaries; the stack now holds only
	 * the four accumulators c00..c11. */
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)

	/* Rewind AA/BB to this tile's solve region. */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax		# NOTE(review): both branches subtract 2
#else
	subl	$2, %eax		# (2 rows / 2 cols) - #ifdef is redundant
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA,   %eax, 2), AA
	leal	(%ebx, %eax, 2), BB
#endif

	/* Load the four packed right-hand sides and subtract the
	 * accumulators (note the transposed slot order on the AA side). */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
	FLD	2 * SIZE(BB)
	fsubp	%st, %st(3)
	FLD	3 * SIZE(BB)
	fsubp	%st, %st(4)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(3)
	FLD	2 * SIZE(AA)
	fsubp	%st, %st(2)
	FLD	3 * SIZE(AA)
	fsubp	%st, %st(4)
#endif

	/* 2x2 triangular substitution.  Diagonal entries (offsets 0 and
	 * 3) are multiplied, i.e. assumed pre-inverted; the off-diagonal
	 * entry (1 or 2) is multiplied and subtracted from the other row
	 * pair.  LN/RT run backward, LT/RN forward. */
#ifdef LN
       FLD	3 * SIZE(AA)		# a11^-1
       fmul	%st, %st(3)
       fmulp	%st, %st(4)

       FLD	2 * SIZE(AA)		# a10
       fmul	%st(3), %st
       FLD	2 * SIZE(AA)
       fmul	%st(5), %st

       fsubrp	%st, %st(3)
       fsubrp	%st, %st(1)

       FLD	0 * SIZE(AA)		# a00^-1
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

#ifdef LT
       FLD	0 * SIZE(AA)		# a00^-1
       fmul	%st, %st(1)
       fmulp	%st, %st(2)

       FLD	1 * SIZE(AA)		# a01
       fmul	%st(1), %st
       FLD	1 * SIZE(AA)
       fmul	%st(3), %st

       fsubrp	%st, %st(5)
       fsubrp	%st, %st(3)

       FLD	3 * SIZE(AA)		# a11^-1
       fmul	%st, %st(3)
       fmulp	%st, %st(4)
#endif

#ifdef RN
       FLD	0 * SIZE(BB)		# b00^-1
       fmul	%st, %st(1)
       fmulp	%st, %st(3)

       FLD	1 * SIZE(BB)		# b01
       fmul	%st(1), %st
       FLD	1 * SIZE(BB)
       fmul	%st(4), %st

       fsubrp	%st, %st(5)
       fsubrp	%st, %st(2)

       FLD	3 * SIZE(BB)		# b11^-1
       fmul	%st, %st(2)
       fmulp	%st, %st(4)
#endif

#ifdef RT
       FLD	3 * SIZE(BB)		# b11^-1
       fmul	%st, %st(2)
       fmulp	%st, %st(4)

       FLD	2 * SIZE(BB)		# b10
       fmul	%st(2), %st
       FLD	2 * SIZE(BB)
       fmul	%st(5), %st

       fsubrp	%st, %st(4)
       fsubrp	%st, %st(1)

       FLD	0 * SIZE(BB)		# b00^-1
       fmul	%st, %st(1)
       fmulp	%st, %st(3)
#endif

#ifdef LN
	subl	$2 * SIZE, %edi		# LN stores to the tile above
#endif

	/* Write the solved 2x2 block back into the packed buffer, then
	 * into both C columns (fxch order matches the slot shuffling). */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
	fxch	%st(2)
	FSTU	2 * SIZE(BB)
	fxch	%st(3)
	FSTU	3 * SIZE(BB)

	FST	1 * SIZE(%edi,%ebp)
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(2)
	FSTU	1 * SIZE(AA)
	fxch	%st(1)
	FSTU	2 * SIZE(AA)
	fxch	%st(3)
	FSTU	3 * SIZE(AA)

	FST	1 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
#endif

#ifndef LN
	addl	$2 * SIZE, %edi
#endif

	/* Advance AA/BB past the untouched K-KK tail for forward walks. */
#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       addl	%eax, AORIG		# undo the per-tile step-back
#endif

	decl	%esi			# i --
	jne	.MainHead
	ALIGN_4
880
.L12:
	/* Trailing single row (m & 1) against the column pair:
	 * two accumulators c0, c1 under the current a value (temp1). */
	movl	 M, %eax		# m			# MEMORY
	andl	$1, %eax
	je	.L27

#ifdef LN
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one 1-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA,   %eax, 1), AA	# AA = AORIG +   KK elements
	leal	(%ebx, %eax, 2), BB	# BB = B     + 2*KK elements
#else
	movl	%ebx, BB
#endif

	fldz				# c1 = 0
	fldz				# c0 = 0

	FLD	0 * SIZE(AA)		# temp1 = *(aoffset + 0)

#if defined(LT) || defined(RN)
	movl	KK, %eax		# inner length = KK
#else
	movl	K,  %eax
	subl	KK, %eax		# inner length = K - KK
#endif
	sarl	$1,%eax			# k >> 1		# MEMORY
	je	 .L54
	ALIGN_4

.L55:
	/* Two k steps per pass: c0 += a*b0, c1 += a*b1.  The bare `rep`
	 * prefixes are scheduling filler (cf. the REP macro above). */
	FLD	0 * SIZE(BB)		# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a0 * b0

	FMUL	1 * SIZE(BB)		# temp1 *= b1
	faddp	%st, %st(2)		# c1 += a0 * b1
	FLD	1 * SIZE(AA)		# temp1 = *(aoffset + 1)

	FLD	2 * SIZE(BB)		# temp2 = *(boffset + 2)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a1 * b2

	FMUL	3 * SIZE(BB)		# temp1 *= b3
	faddp	%st, %st(2)		# c1 += a1 * b3
	FLD	2 * SIZE(AA)		# temp1 = next a

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L55
	ALIGN_4

.L54:
	/* Remainder: one more k step when the inner length was odd. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$1,%eax			# k & 1
	je	.L33
	ALIGN_4

	FLD	0 * SIZE(BB)		# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)		# c0 += a0 * b0

	FMUL	1 * SIZE(BB)		# temp1 *= b1
	faddp	%st, %st(2)		# c1 += a0 * b1
	FLD	1 * SIZE(AA)		# dead load; freed at .L33

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	ALIGN_4
965
.L33:
	ffreep	%st(0)			# drop leftover temp1; stack = c0, c1

	/* Rewind AA/BB to this row's solve position. */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		# one row
#else
	subl	$2, %eax		# two columns of B
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA,   %eax, 1), AA
	leal	(%ebx, %eax, 2), BB
#endif

	/* r_i = rhs_i - c_i for both columns. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(2)
#endif

	/* Left variants: scale both columns by the single (pre-inverted)
	 * diagonal entry of A. */
#if defined(LN) || defined(LT)
       FLD	0 * SIZE(AA)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

	/* Right variants: 2x2 substitution against B's diagonal block
	 * (RN forward, RT backward); diagonals assumed pre-inverted. */
#ifdef RN
       FLD	0 * SIZE(BB)		# b00^-1
       fmulp	%st, %st(1)		# x0 = r0 * b00^-1

       FLD	1 * SIZE(BB)		# b01
       fmul	%st(1), %st

       fsubrp	%st, %st(2)		# r1 -= b01 * x0

       FLD	3 * SIZE(BB)		# b11^-1
       fmulp	%st, %st(2)		# x1 = r1 * b11^-1
#endif

#ifdef RT
       FLD	3 * SIZE(BB)		# b11^-1
       fmulp	%st, %st(2)		# x1 = r1 * b11^-1

       FLD	2 * SIZE(BB)		# b10
       fmul	%st(2), %st

       fsubrp	%st, %st(1)		# r0 -= b10 * x1

       FLD	0 * SIZE(BB)		# b00^-1
       fmulp	%st, %st(1)		# x0 = r0 * b00^-1
#endif

#ifdef LN
	subl	$1 * SIZE, %edi		# LN stores to the row above
#endif

	/* Write back to the packed buffer and to both C columns. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(1)
	FSTU	1 * SIZE(AA)
#endif

	FST	0 * SIZE(%edi,%ebp)
	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       addl	%eax, AORIG		# undo the per-row step-back
#endif
	ALIGN_4
1071
.L27:
	/* Column pair done: advance B past the consumed 2*K panel and
	 * update KK for the right-side variants, then loop on j. */
#ifdef LN
       movl	K, %eax
       leal	(    , %eax, SIZE), %eax
       leal	(%ebx, %eax, 2), %ebx	# B += 2 * K elements
#endif
#if defined(LT) || defined(RN)
	movl	BB, %ebx		# B = end of consumed panel
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j--			# MEMORY
	jne	.L34
	ALIGN_4

.End:
	/* Restore callee-saved registers and the scratch area. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE
1103