/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * 2x2 register-blocked x87 kernel for 32-bit x86 (AT&T syntax, run through
 * cpp).  For every tile of C it forms alpha * (A panel . B panel) and either
 * adds that to C or, when TRMMKERNEL is defined, stores it over C.
 *
 * FLD / FST / FMUL / FADD, SIZE, PROLOGUE / EPILOGUE, PROFCODE and ALIGN_n
 * are not defined in this file; they are expected to come from common.h.
 */
#define ASSEMBLER
#include "common.h"

/* Bytes taken by the four registers pushed in the prologue. */
#define STACK	16
/* Bytes of local scratch reserved with "subl $ARGS, %esp". */
#define ARGS	16

/*
 * Local scratch slots.  They sit in the ARGS area, directly above the
 * pushed registers:
 *   J   - remaining column pairs (outer loop counter)
 *   BX  - cursor into B used only for the prefetcht2 hints
 *   KK  - running offset, only used when TRMMKERNEL is defined
 *   KKK - inner-product length of the current tile, TRMMKERNEL only
 */
#define J	 0 + STACK(%esp)
#define BX	 4 + STACK(%esp)
#define KK	 8 + STACK(%esp)
#define KKK	12 + STACK(%esp)

/*
 * Incoming stack arguments.  The leading "4 +" steps over the return
 * address; STACK + ARGS steps over the pushed registers and the scratch.
 */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
/* ALPHA takes 8 bytes here, so the pointer arguments start at offset 24. */
#define A	24 + STACK + ARGS(%esp)
#define B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)
#else
/* ALPHA takes 4 bytes here, so the pointer arguments start at offset 20. */
#define A	20 + STACK + ARGS(%esp)
#define B	24 + STACK + ARGS(%esp)
#define C	28 + STACK + ARGS(%esp)
#define LDC	32 + STACK + ARGS(%esp)
#define OFFSET	36 + STACK + ARGS(%esp)
#endif

/* Distance, in elements, of the software prefetch issued in .MainLoop. */
#define PREFETCH_OFFSET 48

/*
 * Both branches define REP identically, and the code visible in this file
 * spells the prefix as a literal "rep" instead of using the macro.
 */
#if defined(PENTIUM3) || defined(PENTIUMM)
#define REP rep
#else
#define REP rep
#endif

/*
 * Entry: build the frame, save the callee-saved registers and set up the
 * loop over column pairs.
 *
 * Register roles established here and kept for the whole routine:
 *   %ebx = current position in B (start of the current column panel)
 *   %ebp = LDC scaled by SIZE, i.e. the byte stride between columns of C
 * J receives n >> 1, the number of column pairs.
 */
	PROLOGUE

	subl	$ARGS, %esp	# Generate Stack Frame

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Right-side TRMM starts with KK = -OFFSET. */
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK
#endif

	movl	N,   %eax		# j = (n >> 1)		# MEMORY
	movl	LDC, %ebp		# ldc			# MEMORY
	movl	B,   %ebx

	sarl	$1,  %eax
	leal	(, %ebp, SIZE), %ebp	# ldc in bytes
	leal	0(%ecx) , %ecx		# NOP
	movl	%eax, J			# j = (n >> 1)		# MEMORY
	test	%eax, %eax
	je	.L8			# if !(n >> 1) goto .L8
	ALIGN_4

/*
 * .L34 - top of the loop over column pairs (executed J times).
 * Reloads the per-column-pair state:
 *   BX   = %ebx (prefetch cursor restarts at the current B panel)
 *   %esi = m >> 1 (row pairs to process)
 *   %edx = A, %edi = C (C is advanced in memory at .L27)
 * Branches to .L12 when there is no complete row pair; the je consumes the
 * zero flag produced by the sarl.
 */
.L34:
#if defined(TRMMKERNEL) && defined(LEFT)
	/* Left-side TRMM restarts KK from OFFSET for every column pair. */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	%ebx, BX

	movl	M, %esi			# m			# MEMORY
	movl	A, %edx			# a			# MEMORY
	movl	C, %edi			# C			# MEMORY
	sarl	$1,   %esi		# i = (m >> 1)
	je	.L12
	ALIGN_4

/*
 * .MainHead - set-up for one 2x2 tile of C.
 *
 *   %ecx = working pointer into the B panel, %edx = working pointer into A.
 *   In the TRMM variants of the #else branch both pointers first skip KK
 *   k-steps (2 elements per k-step for each operand).
 *
 * The inner-product length ends up in %eax (and in KKK for TRMM builds):
 *   plain build                          : K
 *   TRMM, LEFT xor TRANSA                : K - KK
 *   remaining TRMM variants              : KK + 2
 *
 * x87 stack after the eight loads below (top first):
 *   st(0) = a[0]  st(1) = b[0]  st(2) = a[4]  st(3) = b[4]
 *   st(4) .. st(7) = the four tile accumulators, all zero
 *
 * Falls into .MainLoop with %eax = length >> 2, or jumps to .L16 when that
 * quotient is zero.
 */
.MainHead:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ebx, %eax, 2), %ecx
#endif

#ifdef HAVE_SSE
	/* Prefetch hints along B; how far BX advances depends on L2_SIZE. */
	movl	BX, %eax

	prefetcht2  0 * SIZE(%eax)
	prefetcht2  4 * SIZE(%eax)

#if   L2_SIZE > 262144

	subl	$-8 * SIZE, BX

#elif L2_SIZE > 131072

	prefetcht2  8 * SIZE(%eax)
	prefetcht2 12 * SIZE(%eax)


	subl	$-16 * SIZE, BX
#else
	prefetcht2 16 * SIZE(%eax)
	prefetcht2 20 * SIZE(%eax)
	prefetcht2 24 * SIZE(%eax)
	prefetcht2 28 * SIZE(%eax)

	subl	$-32 * SIZE, BX
#endif
#endif

	fldz
	fldz

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	fldz
	fldz

	FLD	 4 * SIZE(%ecx)		# b5
	FLD	 4 * SIZE(%edx)		# a5
	FLD	 0 * SIZE(%ecx)		# b1
	FLD	 0 * SIZE(%edx)		# a1

	/* Hint the two C columns that this tile writes at .L21. */
#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(%edi)
	prefetchw	2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(%edi)
	prefetchnta	2 * SIZE(%edi, %ebp, 1)
#endif
	sarl	$2, %eax		# unrolled iterations = length / 4
	je	.L16
	ALIGN_4

/*
 * .MainLoop - 2x2 inner product, four k-steps per iteration.
 *
 * Each k-step reads a[2k], a[2k+1] from %edx and b[2k], b[2k+1] from %ecx
 * and updates the accumulators held in the deepest four x87 registers:
 *   st(4) += a[2k]   * b[2k]
 *   st(5) += a[2k]   * b[2k+1]
 *   st(6) += a[2k+1] * b[2k]
 *   st(7) += a[2k+1] * b[2k+1]
 * (depths are those seen when four operand registers are on the stack).
 *
 * st(0)/st(1) carry the operands of the current half of the iteration and
 * st(2)/st(3) the operands preloaded for the other half; the "fxch %st(2)"
 * at the end of each half swaps the roles.  The stack stays exactly eight
 * deep throughout, so the order of fxch / faddp / FLD must not be changed.
 *
 * Both pointers advance by 8 elements per iteration; %eax counts iterations.
 */
.MainLoop:
#if defined(HAVE_3DNOW)
	prefetch	(PREFETCH_OFFSET) * SIZE(%ecx)
	nop
#elif defined(HAVE_SSE)
	prefetchnta	(PREFETCH_OFFSET) * SIZE(%ecx)
#ifdef CORE_KATMAI
	prefetcht0	(PREFETCH_OFFSET) * SIZE(%edx)
#endif
#endif

	/* k-step 0: a[0], a[1] against b[0], b[1] */
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(%edx)

	/* k-step 1: a[2], a[3] against b[2], b[3]; then preload element 8 */
	fmul	%st, %st(1)
	FMUL	 3 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 3 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 3 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 8 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 8 * SIZE(%edx)
	fxch	%st(2)			# bring the preloaded a[4] to the top

#if !defined(HAVE_3DNOW) && defined(HAVE_SSE)  && defined(DOUBLE)
	prefetchnta	(PREFETCH_OFFSET + 4) * SIZE(%ecx)
#ifdef CORE_KATMAI
	prefetcht0	(PREFETCH_OFFSET + 4) * SIZE(%edx)
#endif
#endif

	/* k-step 2: a[4], a[5] against b[4], b[5] (b operand sits in st(3)) */
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 4 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 5 * SIZE(%edx)
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	 6 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	 6 * SIZE(%edx)

	/* k-step 3: a[6], a[7] against b[6], b[7]; then preload element 12 */
	fmul	%st, %st(3)
	FMUL	 7 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 6 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 7 * SIZE(%edx)
	fmul	%st, %st(3)
	FMUL	 7 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	12 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	12 * SIZE(%edx)
	fxch	%st(2)			# restore the entry layout for the next pass

	subl	$-8 * SIZE, %ecx	# b += 8 elements
	subl	$-8 * SIZE, %edx	# a += 8 elements
	decl	%eax			# l --
	jne	.MainLoop
	ALIGN_4

/*
 * .L16 / .SubLoop - remainder of the 2x2 inner product.
 *
 * Reloads the inner-product length (K, or KKK for TRMM builds) and runs
 * (length & 3) single k-steps.  .SubLoop is the same code as k-step 0 of
 * .MainLoop and uses the same x87 stack layout: operands in st(0)/st(1),
 * accumulators in st(4)..st(7).  Each pass advances both pointers by 2
 * elements.
 */
.L16:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	and	$3,  %eax
	je	.L21
	ALIGN_4

.SubLoop:
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(%edx)

	addl	$2 * SIZE,%ecx
	addl	$2 * SIZE,%edx
	decl	%eax
	jne	 .SubLoop
	ALIGN_4

/*
 * .L21 - finish one 2x2 tile: scale by alpha and write it to C.
 *
 * The four ffreep instructions discard the operand registers (current and
 * preloaded a / b), leaving only the four accumulators.  After FLD ALPHA
 * every accumulator is multiplied by alpha; the final fmulp also pops
 * alpha.  The stores then drain the stack in the order
 *   C[0], C[0 + ldc], C[1], C[1 + ldc]
 * NOTE(review): this relies on FST being a store-and-pop macro (defined in
 * common.h, not visible here); otherwise the x87 stack would not balance.
 *
 * Afterwards the A/B pointers are moved past the unused part of the panels
 * (some TRMM variants), KK is bumped (left-side TRMM), %edi moves two rows
 * down and the row-pair loop continues at .MainHead.
 */
.L21:
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)

	FLD	ALPHA
	fmul	%st, %st(4)
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmulp	%st, %st(3)

#ifndef TRMMKERNEL
	FADD	0 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FADD	0 * SIZE(%edi,%ebp)
	FST	0 * SIZE(%edi,%ebp)
	FADD	1 * SIZE(%edi)
	FST	1 * SIZE(%edi)
	FADD	1 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi,%ebp)
#else
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
	FST	1 * SIZE(%edi,%ebp)
#endif


#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Skip the K - KKK k-steps that this tile did not consume. */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %edi		# next pair of rows in C
	/*
	 * NOTE(review): the bare "rep" prefixes around decl/jne are kept
	 * exactly as found; presumably padding or a tuning hint - confirm
	 * before removing.
	 */
	rep
	decl	%esi			# i --
	rep
	jne	.MainHead
	ALIGN_4

/*
 * .L12 - odd last row of the current column pair (1x2 tile), if m & 1.
 *
 * Pointer set-up mirrors .MainHead, except that A advances by one element
 * per k-step (one row) while B still advances by two (two columns).
 * The inner-product length goes to %eax (and KKK for TRMM builds):
 *   plain build            : K
 *   TRMM, LEFT xor TRANSA  : K - KK
 *   other TRMM, LEFT       : KK + 1
 *   other TRMM, not LEFT   : KK + 2
 *
 * x87 stack on exit (top first): st(0) = a[0], st(1), st(2) = accumulators
 * for the two columns, both zero.  %eax = length >> 1 selects .L55 or .L54.
 */
.L12:
	movl	 M, %eax		# m			# MEMORY
	andl	$1, %eax
	je	.L27

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ebx, %eax, 2), %ecx
#endif
	fldz
	fldz

	FLD	0 * SIZE(%edx)		# temp1 = *(aoffset + 0)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$1,%eax			# k >> 1		# MEMORY
	je	 .L54
	ALIGN_4

/*
 * .L55 - 1x2 inner product, two k-steps per iteration.
 *
 * x87 stack at the top of each k-step: st(0) = current a element,
 * st(1) = accumulator for column 0, st(2) = accumulator for column 1.
 * Per k-step:
 *   st(1) += a * b[2k]      (FLD b, fmul, faddp)
 *   st(2) += a * b[2k+1]    (FMUL overwrites a, faddp pops it)
 * and the next a element is loaded to restore the layout.
 * A advances by 2 elements and B by 4 elements per iteration.
 *
 * NOTE(review): the bare "rep" prefixes in front of fmul are kept exactly
 * as found; presumably padding or a tuning hint - confirm before removing.
 */
.L55:
	FLD	0 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	faddp	%st, %st(2)
	FLD	1 * SIZE(%edx)		# temp1 = *(aoffset + 0)

	FLD	2 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	faddp	%st, %st(2)
	FLD	2 * SIZE(%edx)		# temp1 = *(aoffset + 0)

	addl	$2 * SIZE, %edx
	addl	$4 * SIZE, %ecx
	decl	%eax
	jne	.L55
	ALIGN_4

/*
 * .L54 - remainder of the 1x2 inner product: one more k-step when the
 * inner-product length (K, or KKK for TRMM builds) is odd.
 *
 * Same stack layout and update as one k-step of .L55.  The final FLD only
 * refills st(0) so that the ffreep at .L33 always has an operand to drop;
 * the loaded value itself is never used.
 */
.L54:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$1,%eax			# k & 1
	je	.L33
	ALIGN_4

	FLD	0 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%ecx)		# temp2 = *(boffset + 0)
	faddp	%st, %st(2)
	FLD	1 * SIZE(%edx)		# temp1 = *(aoffset + 0)

	addl	$1 * SIZE, %edx
	addl	$2 * SIZE, %ecx
	ALIGN_4

/*
 * .L33 - finish the 1x2 tile: drop the pending a operand, scale both
 * accumulators by alpha (fmulp also pops alpha) and write them to
 * C[0] and C[0 + ldc].
 * NOTE(review): relies on FST being a store-and-pop macro from common.h.
 *
 * The TRMM-only tail then skips the unused K - KKK k-steps (one element
 * per k-step in A, two in B) and, for left-side TRMM, adds 1 to KK.
 */
.L33:
	ffreep	%st(0)
	FLD	ALPHA

	fmul	%st, %st(2)
	fmulp	%st, %st(1)

#ifndef TRMMKERNEL
	FADD	(%edi)
	FST	(%edi)
	FADD	(%edi,%ebp)
	FST	(%edi,%ebp)
#else
	FST	(%edi)
	FST	(%edi,%ebp)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

/*
 * .L27 - bottom of the column-pair loop.
 *
 *   KK += 2                  (right-side TRMM only)
 *   C  += 2 * ldc elements   (%ebp already holds ldc in bytes)
 *   %ebx = %ecx              (B continues where the last tile stopped)
 *   J--, loop back to .L34 while column pairs remain
 */
.L27:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	lea	(, %ebp, 2), %eax
	addl	%eax, C			# C + 2 * ldc		# MEMORY
	movl	%ecx, %ebx		# b			# MEMORY
	decl	J			# j--			# MEMORY
	jne	.L34
	ALIGN_4

/*
 * .L8 - odd last column (n & 1).  Nothing to do when n is even.
 *
 * Reloads %edi = C (already advanced past the column pairs), %edx = A and
 * %esi = m >> 1; %ebx still points at the B panel of this column.
 * Branches to .L36 when there is no complete row pair.
 */
.L8:
	movl	N,  %eax		# n			# MEMORY
	andl	$1, %eax
	je	.End

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, %edi			# c			# MEMORY
	movl	A, %edx			# a			# MEMORY

	movl	M,  %esi		# m			# MEMORY
	sarl	$1, %esi		# m >> 1
	je	.L36
	ALIGN_4

/*
 * .L46 - set-up for one 2x1 tile (two rows of the single last column).
 *
 * Pointer set-up mirrors .MainHead, except that B advances by one element
 * per k-step (one column) while A advances by two (two rows).
 * The inner-product length goes to %eax (and KKK for TRMM builds):
 *   plain build            : K
 *   TRMM, LEFT xor TRANSA  : K - KK
 *   other TRMM, LEFT       : KK + 2
 *   other TRMM, not LEFT   : KK + 1
 *
 * x87 stack on exit (top first): st(0) = b[0], st(1), st(2) = accumulators
 * for the two rows, both zero.  The je at the end tests the zero flag set
 * by "sarl $1, %eax"; the x87 loads in between leave EFLAGS untouched.
 */
.L46:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	fldz
	sarl	$1, %eax
	fldz
	FLD	0 * SIZE(%ecx)		# temp1 = *(boffset + 0)

	je	.L56
	ALIGN_4

/*
 * .L57 - 2x1 inner product, two k-steps per iteration.
 *
 * x87 stack at the top of each k-step: st(0) = current b element,
 * st(1) = accumulator for row 0, st(2) = accumulator for row 1.
 * Per k-step:
 *   st(1) += b * a[2k]      (FLD a, fmul, faddp)
 *   st(2) += b * a[2k+1]    (FMUL overwrites b, faddp pops it)
 * and the next b element is loaded to restore the layout.
 * A advances by 4 elements and B by 2 elements per iteration.
 */
.L57:
	FLD	0 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	faddp	%st, %st(2)
	FLD	1 * SIZE(%ecx)		# temp1 = *(boffset + 0)

	FLD	2 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	faddp	%st, %st(2)
	FLD	2 * SIZE(%ecx)		# temp1 = *(boffset + 0)

	addl	$4 * SIZE,%edx
	addl	$2 * SIZE,%ecx
	dec	%eax
	jne	.L57
	ALIGN_4

/*
 * .L56 - remainder of the 2x1 inner product: one more k-step when the
 * inner-product length (K, or KKK for TRMM builds) is odd.
 *
 * Same stack layout and update as one k-step of .L57.
 *
 * The final FLD only refills st(0) so that the ffreep at .L45 always has
 * an operand to drop; the loaded value is never used.  It reads the
 * element right after the one just consumed (1 * SIZE), matching the
 * refill pattern of .L57, rather than reaching three elements ahead.
 */
.L56:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$1, %eax
	je	.L45
	ALIGN_4

	FLD	0 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%edx)		# temp2 = *(aoffset + 0)
	faddp	%st, %st(2)
	FLD	1 * SIZE(%ecx)		# refill only, discarded at .L45

	addl	$2 * SIZE,%edx
	addl	$1 * SIZE,%ecx
	ALIGN_4

/*
 * .L45 - finish the 2x1 tile: drop the pending b operand, scale both
 * accumulators by alpha (fmulp also pops alpha) and write them to
 * C[0] and C[1].
 * NOTE(review): relies on FST being a store-and-pop macro from common.h.
 *
 * Then %edi moves two rows down, the TRMM-only tail skips the unused
 * K - KKK k-steps (two elements per k-step in A, one in B), KK grows by 2
 * for left-side TRMM, and the row-pair loop continues at .L46.
 */
.L45:
	ffreep	%st(0)
	FLD	ALPHA

	fmul	%st, %st(1)
	fmulp	%st, %st(2)

#ifndef TRMMKERNEL
	FADD	0 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FADD	1 * SIZE(%edi)
	FST	1 * SIZE(%edi)
#else
	FST	0 * SIZE(%edi)
	FST	1 * SIZE(%edi)
#endif

	addl	$2 * SIZE, %edi

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ecx, %eax, 1), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	decl	%esi			# i --
	jne	.L46
	ALIGN_4

/*
 * .L36 - last single element (1x1 tile) when both m and n are odd.
 *
 * Pointer set-up mirrors .MainHead with one element per k-step for both
 * A and B.  The inner-product length goes to %eax (and KKK for TRMM):
 *   plain build            : K
 *   TRMM, LEFT xor TRANSA  : K - KK
 *   other TRMM variants    : KK + 1
 * A single zero accumulator is pushed for the dot product at .L51.
 */
.L36:
	movl	M,  %eax		# m			# MEMORY
	andl	$1, %eax		# m & 1
	je	.End

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	fldz
	ALIGN_3

/*
 * .L51 - scalar dot product for the 1x1 tile, one k-step per iteration:
 *   st(0) += a[k] * b[k]
 * followed by scaling with alpha and the update (or plain store, for TRMM
 * builds) of the single C element.
 *
 * NOTE(review): the loop body runs before the count is tested, so it
 * assumes the length in %eax is at least 1; a zero length would wrap the
 * counter.  Presumably callers never pass K == 0 - confirm.
 * NOTE(review): relies on FST being a store-and-pop macro from common.h.
 */
.L51:
	FLD	(%edx)
	FMUL	(%ecx)
	addl	$1 * SIZE,%edx
	addl	$1 * SIZE,%ecx
	faddp	%st,%st(1)
	decl	%eax
	jne	.L51

	FMUL	ALPHA
#ifndef TRMMKERNEL
	FADD	(%edi)
	FST	(%edi)
#else
	FST	(%edi)
#endif
	ALIGN_4

/*
 * .End - common exit.  Restores the callee-saved registers in the reverse
 * order of the pushes in the prologue, releases the ARGS scratch area and
 * returns.
 */
.End:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE
