/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef movsd
#undef movsd
#endif
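
// NOTE: some targets below redefine movsd as movlps.  Unlike movsd,
// movlps does not clear the upper 64 bits of the destination register,
// which is why the "#ifdef movsd" guards later in this file zero the
// register with xorps before loading into it.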

#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd		movlps
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

#define STACKSIZE	16
#define ARGS	20

#define M		 4 + STACKSIZE+ARGS(%esp)
#define N		 8 + STACKSIZE+ARGS(%esp)
#define ALPHA		16 + STACKSIZE+ARGS(%esp)
#define A		20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
#define STACK_X		28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
#define Y		36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
#define BUFFER		44 + STACKSIZE+ARGS(%esp)

#define MMM	0+ARGS(%esp)
#define AA	4+ARGS(%esp)
#define XX	8+ARGS(%esp)

#define I	%eax
#define J	%ebx

#define INCX	J
#define INCY	%ecx

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

	PROLOGUE

	subl	$ARGS, %esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_X,    X
	movl	X, XX
	movl	A, J
	movl	J, AA				# backup A
	movl	M, J
	movl	J, MMM				# backup M
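
// x is staged through a fixed-size buffer, so m is processed in chunks:
// each pass through .L0t handles up to one buffer's worth of rows, and
// .L999 advances the A and X pointers before looping back.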
.L0t:
	xorl	J, J
	addl	$1, J
	sall	$22, J				# J = 2^22; 2^22 * sizeof(float) = 16MB buffer
	subl	$8, J				# don't use the last 8 floats of the buffer
	subl	J, MMM				# MMM = MMM - J
	movl	J, M
	jge	.L00t
	ALIGN_4

	movl	MMM, %eax
	addl	J, %eax
	jle	.L999x
	movl	%eax, M

.L00t:
	movl	AA, %eax
	movl	%eax, A				# restore A

	movl	XX, %eax
	movl	%eax, X

	movl	STACK_LDA, LDA
	movl	STACK_INCX, INCX
	movl	STACK_INCY, INCY

	leal	(,INCX, SIZE), INCX
	leal	(,INCY, SIZE), INCY
	leal	(,LDA,  SIZE), LDA

	subl	$-32 * SIZE, A

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

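// Gather x (stride INCX) into the contiguous buffer, 8 elements per
// iteration, so the main loops can use aligned 4-wide loads.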
	movl	BUFFER, Y1

	movl	M,  I
	sarl	$3, I
	jle	.L05
	ALIGN_4

.L02:
	movss	(X), %xmm0
	addl	INCX, X
	movss	(X), %xmm1
	addl	INCX, X

	unpcklps %xmm1, %xmm0

	movss	(X), %xmm2
	addl	INCX, X
	movss	(X), %xmm3
	addl	INCX, X

	unpcklps %xmm3, %xmm2

	movss	(X), %xmm4
	addl	INCX, X
	movss	(X), %xmm5
	addl	INCX, X

	unpcklps %xmm5, %xmm4

	movss	(X), %xmm6
	addl	INCX, X
	movss	(X), %xmm7
	addl	INCX, X

	unpcklps %xmm7, %xmm6

	movlps	%xmm0, 0 * SIZE(Y1)
	movlps	%xmm2, 2 * SIZE(Y1)
	movlps	%xmm4, 4 * SIZE(Y1)
	movlps	%xmm6, 6 * SIZE(Y1)

	addl	$8 * SIZE, Y1
	decl	I
	jg	.L02
	ALIGN_4

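// Copy the remaining M mod 8 elements of x one at a time.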
.L05:
	movl	M,  I
	andl	$7, I
	jle	.L10
	ALIGN_2

.L06:
	movss	(X), %xmm0
	addl	INCX, X
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	I
	jg	.L06
	ALIGN_4

// Pad the buffer with zeros so the vector loads in the tail code below
// never pick up stale data left over from a previous chunk.
	movl	M,  I
	movl	$8, J
	andl	$7, I
	xorps	%xmm0, %xmm0
	subl	I, J
	ALIGN_2
.L07:
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	J
	jg	.L07
	ALIGN_4

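// Main computation: process the columns of A two at a time.  For each
// pair, xmm0 and xmm1 accumulate the partial dot products of the packed
// x with column j and column j+1 respectively.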
.L10:
	movl	Y, Y1

	movl	N,  J
	sarl	$1, J
	jle	.L20
	ALIGN_3

.L11:
	movl	BUFFER, X
	addl	$32 * SIZE, X

	movl	A, A1
	leal	(A1, LDA, 2), %eax
	movl	%eax, A

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	movaps	-32 * SIZE(X), %xmm2
	movaps	-28 * SIZE(X), %xmm3

	movl	M,   I
	sarl	$4,  I
	jle	.L15

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	decl	I
	jle	.L13
	ALIGN_4

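// Inner loop: 16 rows per iteration, software pipelined -- each multiply
// uses values loaded on the previous round while the next loads are
// already in flight; .L13 drains the final round.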
.L12:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1, LDA)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movhps	-14 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-12 * SIZE(A1, LDA), %xmm7
	movhps	-10 * SIZE(A1, LDA), %xmm7

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L12
	ALIGN_4

.L13:
	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4

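// Tail: handle the remaining M mod 16 rows in blocks of 8, 4, 2 and 1.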
.L15:
	testl	$8, M
	jle	.L16

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L16:
	testl	$4, M
	jle	.L17

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4

	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movaps	%xmm3, %xmm2

	addl	$4 * SIZE, A1
	ALIGN_4

.L17:
	testl	$2, M
	jle	.L18

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4

#ifdef movsd
	xorps	%xmm5, %xmm5
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movhlps	%xmm2, %xmm2

	addl	$2 * SIZE, A1
	ALIGN_4

.L18:
	testl	$1, M
	jle	.L19

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	movss	-32 * SIZE(A1, LDA), %xmm5
	mulss	%xmm2, %xmm5
	addss	%xmm5, %xmm1
	ALIGN_4

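// Reduce the four partial sums in each accumulator to a scalar, scale by
// alpha, and accumulate into the two destination elements of y.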
.L19:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1

	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1
#else
	movhlps	%xmm0, %xmm2
	movhlps	%xmm1, %xmm3

	addps	%xmm2, %xmm0
	addps	%xmm3, %xmm1

	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0
	movaps	%xmm1, %xmm3
	shufps	$1, %xmm1, %xmm1

	addss	%xmm2, %xmm0
	addss	%xmm3, %xmm1
#endif

	movss	ALPHA, %xmm7

	mulss	%xmm7, %xmm0
	mulss	%xmm7, %xmm1

	addss	(Y1), %xmm0
	addss	(Y1, INCY), %xmm1

	movss	%xmm0, (Y1)
	movss	%xmm1, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	decl	J
	jg	.L11
	ALIGN_4

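// If N is odd, process the last column alone; this mirrors the
// two-column loop above with a single accumulator.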
.L20:
	testl	$1, N
	jle	.L999

	movl	BUFFER, X
	addl	$32 * SIZE, X

	movl	A, A1

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	movaps	-32 * SIZE(X), %xmm2
	movaps	-28 * SIZE(X), %xmm3

	movl	M,   I
	sarl	$4,  I
	jle	.L25

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	decl	I
	jle	.L23
	ALIGN_4

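// Single-column inner loop, again unrolled to 16 rows per iteration.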
.L22:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L22
	ALIGN_4

.L23:
	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4

.L25:
	testl	$8, M
	jle	.L26

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L26:
	testl	$4, M
	jle	.L27

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movaps	%xmm3, %xmm2

	addl	$4 * SIZE, A1
	ALIGN_4

.L27:
	testl	$2, M
	jle	.L28

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movhlps	%xmm2, %xmm2

	addl	$2 * SIZE, A1
	ALIGN_4

.L28:
	testl	$1, M
	jle	.L29

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	ALIGN_4

.L29:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#else
	movhlps	%xmm0, %xmm2

	addps	%xmm2, %xmm0

	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0

	addss	%xmm2, %xmm0
#endif

	movss	ALPHA, %xmm7

	mulss	%xmm7, %xmm0

	addss	(Y1), %xmm0

	movss	%xmm0, (Y1)
	ALIGN_4

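// Advance A and X past the rows handled in this chunk, then loop back
// for the next chunk of M.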
.L999:
	movl	M, J
	leal	(,J,SIZE), %eax
	addl	%eax, AA
	movl	STACK_INCX, INCX
	imull	INCX, %eax
	addl	%eax, XX
	jmp	.L0t
	ALIGN_4

.L999x:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE