/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Preprocessor setup for the kernel below.
 *
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * Target: 32-bit x86, x87 FPU; all arguments are taken from the stack.
 */
#define ASSEMBLER
#include "common.h"

/* Prefetch mnemonics: the 3DNow!-style forms on the two listed AMD cores,
   prefetcht0 everywhere else. PREFETCHW is defined here but is not
   referenced by the kernel in this file. */
#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Read-ahead distance for the A panel, in elements (scaled by SIZE at use). */
#define PREFETCHSIZE (5 + 4 * 10)

/*
 * Stack frame as seen after the kernel prologue
 * (subl $ARGS, %esp followed by four pushl):
 *
 *   0 .. STACK-1              (%esp)  saved %ebx, %esi, %edi, %ebp
 *   STACK .. STACK+ARGS-1     (%esp)  locals: J, KK, KKK (12 of 16 bytes used)
 *   STACK+ARGS                (%esp)  return address
 *   STACK+ARGS+4 ...          (%esp)  caller-supplied arguments
 */
#define STACK	16
#define ARGS	16

/* Locals. */
#define J	 0 + STACK(%esp)	/* remaining column blocks of width 2 */
#define KK	 4 + STACK(%esp)	/* TRMM only: running k offset */
#define KKK	 8 + STACK(%esp)	/* TRMM only: k trip count of the current tile */

/* Incoming arguments. ALPHA occupies offsets 16..31, so A starts at 32;
   presumably ALPHA is a padded extended-precision value - confirm against
   the FLD definition in common.h. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

/* Register roles inside the kernel. %eax is the only scratch register. */
#define I	%esi	/* remaining row blocks of height 2 */
#define B	%ebx	/* base of the current B panel */
#define CO	%edi	/* output cursor into C */
#define AO	%edx	/* read cursor into A */
#define BO	%ecx	/* read cursor into B */
#define LDC	%ebp	/* stride between adjacent C columns, after scaling */

/* Not referenced by the kernel in this file. */
#define PREFETCH_OFFSET 48

/*
 * 2x2 register-blocked GEMM / TRMM micro-kernel, 32-bit x86, x87 FPU.
 *
 * Inputs (all on the stack, see the M .. OFFSET macros):
 *   M, N, K   tile-panel dimensions
 *   ALPHA     scale factor (used by the non-TRMM build only)
 *   A, ARG_B  panels read two elements per k step for a 2-wide tile and
 *             one element per k step for a 1-wide tile
 *   C, ARG_LDC  output and its column stride in elements
 *   OFFSET    TRMM only: initial value of KK
 *
 * Loop nest:
 *   .L01  for each pair of C columns      (J = N >> 1)
 *     .L11  2x2 tiles                     (I = M >> 1)
 *     .L21  one 1x2 tile if M is odd
 *   .L30  last single column if N is odd
 *     .L31  2x1 tiles                     (I = M >> 1)
 *     .L41  one 1x1 tile if M is odd
 * Each tile runs a k loop unrolled by four plus a remainder loop of K & 3.
 *
 * Result:
 *   non-TRMM build: C tile += ALPHA * (sum over k of a * b)
 *   TRMM build:     C tile  = sum over k of a * b
 *   NOTE(review): the TRMM build never reads ALPHA; presumably the caller
 *   applies the scale factor elsewhere - confirm against the driver.
 *
 * Registers: %ebx, %esi, %edi, %ebp are saved and restored; %eax, %ecx,
 * %edx and EFLAGS are clobbered. Every tile pushes and pops the x87
 * register stack in balance, assuming FST is a popping store (see .L18).
 *
 * FLD and FST are macros supplied by common.h.
 */
	PROLOGUE

	subl	$ARGS, %esp		/* room for the locals J, KK, KKK */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(TRMMKERNEL) && !defined(LEFT)
	movl	OFFSET, %eax		/* KK = -OFFSET */
	negl	%eax
	movl	%eax, KK
#endif

	movl	ARG_LDC, LDC
	movl	ARG_B,   B

	/* Bias both panel pointers by 8 elements; every load in the k loops
	   compensates with a displacement starting at -8 * SIZE. The biased
	   A is written back into the caller's argument slot. */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B

	/* Scale LDC by 2^BASE_SHIFT (presumably elements -> bytes). */
	sall	$BASE_SHIFT, LDC

	movl	N,   %eax
	sarl	$1,  %eax
	movl	%eax, J			/* movl leaves the flags of sarl intact */
	je	.L30			/* no full column pair */
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Column pair loop: two columns of C per iteration.                   */
/* ------------------------------------------------------------------ */
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax		/* restart KK for every column pair */
	movl	%eax, KK
#endif

	movl	A, AO			/* rewind A to its (biased) start */

	movl	C, CO			/* CO = this column pair */
	leal	(, LDC, 2), %eax
	addl	%eax, C			/* C  = next column pair */

	movl	M,  I
	sarl	$1, I
	je	.L20
	ALIGN_4

/* 2x2 tile ---------------------------------------------------------- */
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip the first KK k steps of both panels (2 elements per step). */
	movl	KK,   %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Four accumulators; from st0: c00 c10 c01 c11, where cij sums
	   a[i] * b[j]. */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
 	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
 	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* %eax = number of k steps for this tile (also kept in KKK for TRMM). */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$2, %eax		/* tile is 2 in both directions, so LEFT
					   and !LEFT add the same amount */
	movl	%eax, KKK
#endif
	sarl	$2, %eax		/* unrolled trip count = steps / 4 */
 	je	.L15
	ALIGN_4

.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0. Stack contents are listed from st0; "c" stands for the
	   four accumulators c00 c10 c01 c11. */
	FLD	 -8 * SIZE(AO)		/* a0 c */

	FLD	 -8 * SIZE(BO)		/* b0 a0 c */
	fld	 %st(1)			/* a0 b0 a0 c */
	fmul	 %st(1), %st		/* a0*b0 b0 a0 c */
	faddp	 %st, %st(3)		/* c00 += a0*b0 ; b0 a0 c */

	FLD	 -7 * SIZE(BO)		/* b1 b0 a0 c */
	fmul	 %st, %st(2)		/* b1 b0 a0*b1 c */

	FLD	 -7 * SIZE(AO)		/* a1 b1 b0 a0*b1 c : all 8 slots used */
	fmul	 %st, %st(2)		/* a1 b1 a1*b0 a0*b1 c */
	fmulp	 %st, %st(1)		/* a1*b1 a1*b0 a0*b1 c */

	faddp	 %st, %st(6)		/* c11 += a1*b1 */
	faddp	 %st, %st(3)		/* c10 += a1*b0 */
	faddp	 %st, %st(3)		/* c01 += a0*b1 ; stack is c again */

	/* k + 1: same sequence, next element pair. */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(3)
	faddp	 %st, %st(3)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* k + 2 */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(3)
	faddp	 %st, %st(3)

	/* k + 3 */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(3)
	faddp	 %st, %st(3)

	addl	$8 * SIZE,AO		/* 4 steps x 2 elements */
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L12
	ALIGN_4

.L15:
	/* Remainder: the low two bits of the step count. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$3,  %eax
	je	.L18
	ALIGN_4

.L16:
	/* One k step; identical to a single step of .L12. */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(3)
	faddp	 %st, %st(3)

	addl	$2 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L16
	ALIGN_4

.L18:
	/* Write the tile back. The stack only balances if FST pops st0;
	   presumably it expands to an fstp-family store - confirm in
	   common.h. */
#ifndef TRMMKERNEL
	FLD	ALPHA			/* alpha c00 c10 c01 c11 */

	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)		/* scaled c00 c10 c01 c11 */

	FLD	0 * SIZE(CO)		/* C(0,0) += c00 */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)

	FLD	1 * SIZE(CO)		/* C(1,0) += c10 */
	faddp	%st, %st(1)
	FST	1 * SIZE(CO)

	FLD	0 * SIZE(CO, LDC)	/* C(0,1) += c01 */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO, LDC)

	FLD	1 * SIZE(CO, LDC)	/* C(1,1) += c11 */
	faddp	%st, %st(1)
	FST	1 * SIZE(CO, LDC)
#else
	/* TRMM: plain overwrite, ALPHA is not applied here. */
	FST	0 * SIZE(CO)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO, LDC)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Only KKK of the K steps were consumed; step over the rest. */
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK			/* two more rows done */
#endif

	addl	$2 * SIZE, CO
	decl	I
	jne	.L11
	ALIGN_4

/* 1x2 tile (M odd) -------------------------------------------------- */
.L20:
	movl	 M, %eax
	andl	$1, %eax
	je	.L29
	ALIGN_4

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK steps: 1 element per step in A, 2 in B. */
	movl	KK,   %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	( B, %eax, 2), BO
#endif

	/* Two accumulators; from st0: c0 = sum a*b0, c1 = sum a*b1. */
	fldz
	fldz

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		/* tile height */
#else
	addl	$2, %eax		/* tile width */
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax
 	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	 -8 * SIZE(AO)		/* a c0 c1 */

	FLD	 -8 * SIZE(BO)		/* b0 a c0 c1 */
	fmul	 %st(1), %st		/* a*b0 a c0 c1 */
	faddp	 %st, %st(2)		/* c0 += a*b0 ; a c0 c1 */

	FLD	 -7 * SIZE(BO)		/* b1 a c0 c1 */
	fmulp	 %st, %st(1)		/* a*b1 c0 c1 */
	faddp	 %st, %st(2)		/* c1 += a*b1 ; c0 c1 */

	/* k + 1 */
	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 2 */
	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 3 */
	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$4 * SIZE,AO		/* 4 steps x 1 element */
	addl	$8 * SIZE,BO		/* 4 steps x 2 elements */

	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$3,  %eax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$1 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L26
	ALIGN_4

.L28:
#ifndef TRMMKERNEL
	FLD	ALPHA			/* alpha c0 c1 */

	fmul	%st, %st(1)
	fmulp	%st, %st(2)		/* scaled c0 c1 */

	FLD	0 * SIZE(CO)		/* C(0,0) += c0 */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)

	FLD	0 * SIZE(CO, LDC)	/* C(0,1) += c1 */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO, LDC)
#else
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$1 * SIZE, CO
	ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK			/* two more columns done */
#endif

	/* Continue B where the last tile's read cursor stopped.
	   NOTE(review): when M is 0 no tile ran and BO was never loaded by
	   this function; presumably callers never pass M == 0 - confirm. */
	movl	BO, B
	decl	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Last single column of C (N odd).                                    */
/* ------------------------------------------------------------------ */
.L30:
	movl	N,  %eax
	testl	$1, %eax
	je	.L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	A, AO

	movl	C, CO
	addl	LDC, C

	movl	M,  I
	sarl	$1, I
	je	.L40
	ALIGN_4

/* 2x1 tile ---------------------------------------------------------- */
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK steps: 2 elements per step in A, 1 in B. */
	movl	KK,   %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	( B, %eax, 1), BO
#endif

	/* Two accumulators; from st0: c0 = sum a0*b, c1 = sum a1*b. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax		/* tile height */
#else
	addl	$1, %eax		/* tile width */
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax
 	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	 -8 * SIZE(BO)		/* b c0 c1 */
	FLD	 -8 * SIZE(AO)		/* a0 b c0 c1 */
	fmul	 %st(1), %st		/* a0*b b c0 c1 */
	faddp	 %st, %st(2)		/* c0 += a0*b ; b c0 c1 */

	FLD	 -7 * SIZE(AO)		/* a1 b c0 c1 */
	fmulp	 %st, %st(1)		/* a1*b c0 c1 */
	faddp	 %st, %st(2)		/* c1 += a1*b ; c0 c1 */

	/* k + 1 */
	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 2 */
	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k + 3 */
	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$8 * SIZE,AO		/* 4 steps x 2 elements */
	addl	$4 * SIZE,BO		/* 4 steps x 1 element */

	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$3,  %eax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$2 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L36
	ALIGN_4

.L38:
#ifndef TRMMKERNEL
	FLD	ALPHA			/* alpha c0 c1 */

	fmul	%st, %st(1)
	fmulp	%st, %st(2)		/* scaled c0 c1 */

	FLD	0 * SIZE(CO)		/* C(0,0) += c0 */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)

	FLD	1 * SIZE(CO)		/* C(1,0) += c1 */
	faddp	%st, %st(1)
	FST	1 * SIZE(CO)
#else
	FST	0 * SIZE(CO)
	FST	1 * SIZE(CO)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, CO
	decl	I
	jne	.L31
	ALIGN_4

/* 1x1 tile (M odd) -------------------------------------------------- */
.L40:
	movl	 M, %eax
	andl	$1, %eax
	je	.L49
	ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK steps: 1 element per step in both panels. */
	movl	KK,   %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	( B, %eax, 1), BO
#endif

	fldz				/* single accumulator c = sum a*b */

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax		/* tile is 1 in both directions, so LEFT
					   and !LEFT add the same amount */
	movl	%eax, KKK
#endif
	sarl	$2, %eax
 	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* Four k steps: load a, load b, multiply, add into c. */
	FLD	 -8 * SIZE(AO)		/* a c */
	FLD	 -8 * SIZE(BO)		/* b a c */
	fmulp	 %st, %st(1)		/* a*b c */
	faddp	 %st, %st(1)		/* c += a*b */

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$4 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$3,  %eax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$1 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L46
	ALIGN_4

.L48:
#ifndef TRMMKERNEL
	FLD	ALPHA			/* alpha c */

	fmulp	%st, %st(1)		/* scaled c */

	FLD	0 * SIZE(CO)		/* C(0,0) += c */
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)
#else
	FST	0 * SIZE(CO)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$1 * SIZE, CO
	ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$1, KK
#endif

	movl	BO, B			/* same M == 0 caveat as at .L29 */
	ALIGN_4

/* Restore the saved registers in reverse push order and drop the locals. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE
