/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

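/* Single-precision complex GEMV ("n" form) kernel for 32-bit x86/SSE.
   Each column pass accumulates alpha * x[j] * A(:, j) into a contiguous
   scratch buffer; a final pass adds the buffer into y with stride INCY.
   The CONJ / XCONJ macros select the conjugated variants of the update. */
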
#define ASSEMBLER
#include "common.h"

#ifdef movsd
#undef movsd
#endif

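/* Per-microarchitecture prefetch flavor and look-ahead distance (in
   floats; scaled by SIZE at the use site).  On cores where the SSE2
   xmm form of movsd is unavailable or unattractive, it is remapped to
   the equivalent movlps. */
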
#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd		movlps
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

#define STACKSIZE	16

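/* cdecl stack arguments, addressed past the return address (the
   constant 4) and the four registers pushed in the prologue
   (STACKSIZE bytes). */
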
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		20 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

#undef SUBPS

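/* SUBPS is a true subps only when the conjugations cancel (neither or
   both of CONJ and XCONJ); otherwise the sign flip is absorbed into
   an addps. */
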
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPS	   subps
#else
#define SUBPS	   addps
#endif

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	subl	$-32 * SIZE, A

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	xorps	%xmm7, %xmm7

	movl	M,  %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

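/* Zero the scratch buffer, 8 complex entries (16 floats) per pass;
   the trip count is rounded up so the remainder is covered. */
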
.L01:
	movaps	%xmm7,  0 * SIZE(Y1)
	movaps	%xmm7,  4 * SIZE(Y1)
	movaps	%xmm7,  8 * SIZE(Y1)
	movaps	%xmm7, 12 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

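/* Column loop: load x[j], scale it by alpha, then accumulate the
   scaled column A(:, j) into the buffer. */
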
.L10:
	movl	BUFFER, Y1
	addl	$32 * SIZE, Y1

	movl	A,  A1
	addl	LDA, A

	movsd	(X), %xmm7
	addl	INCX, X

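/* xmm5 = {+0.0, -0.0, +0.0, -0.0}: a sign-bit mask on the imaginary
   slots, built with SSE2 integer ops or via the stack on SSE-only
   cores. */
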
#ifdef HAVE_SSE2
	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
#else
	subl	$8, %esp
	movl	$0x00000000,  0(%esp)
	movl	$0x80000000,  4(%esp)
	movlps	(%esp), %xmm5
	addl	$8, %esp
	movlhps	%xmm5, %xmm5
#endif

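/* Broadcast the vector element: xmm6 = Re(x[j]) in all lanes,
   xmm7 = Im(x[j]) in all lanes. */
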
#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm7, %xmm6
	pshufd	$0x55, %xmm7, %xmm7
#else
	movaps	%xmm7, %xmm6
	shufps	$0x00, %xmm6, %xmm6
	shufps	$0x55, %xmm7, %xmm7
#endif

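/* xmm3 = {alpha_r, alpha_i, alpha_r, alpha_i}; xmm4 holds the same
   pairs swapped, {alpha_i, alpha_r, alpha_i, alpha_r}. */
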
#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm3
#else
	movsd	ALPHA_R, %xmm3

	movlhps %xmm3, %xmm3
#endif

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm3, %xmm4
#else
	movaps	%xmm3, %xmm4
	shufps	$0xb1, %xmm4, %xmm4
#endif

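/* Complex multiply t = alpha * x[j] (conjugating x under XCONJ), then
   re-broadcast: xmm6 = Re(t) in all lanes, xmm7 = Im(t) with
   alternating signs, so the column update below is a complex multiply. */
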
#ifndef XCONJ
	xorps	 %xmm5, %xmm7
#else
	xorps	 %xmm5, %xmm6
#endif

	mulps	 %xmm3, %xmm6
	mulps	 %xmm4, %xmm7

#ifndef XCONJ
	subps	 %xmm7, %xmm6
#else
	addps	 %xmm7, %xmm6
#endif

#ifdef HAVE_SSE2
	pshufd	 $0x55, %xmm6, %xmm7
	pshufd	 $0x00, %xmm6, %xmm6
#else
	movaps	 %xmm6, %xmm7
	shufps	 $0x55, %xmm7, %xmm7
	shufps	 $0x00, %xmm6, %xmm6
#endif

#ifndef CONJ
	xorps	 %xmm5, %xmm7
#else
	xorps	 %xmm5, %xmm6
#endif

	movaps	 -32 * SIZE(Y1), %xmm0
	movaps	 -28 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M,   I
	sarl	$3,  I
	jle	.L15

	movsd	 -32 * SIZE(A1), %xmm2
	movhps	 -30 * SIZE(A1), %xmm2
	movsd	 -28 * SIZE(A1), %xmm4
	movhps	 -26 * SIZE(A1), %xmm4

	decl	 I
	jle	 .L14
	ALIGN_3

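/* Main loop: 8 complex elements of the column per iteration; the A
   inputs for the next step are preloaded, with PREFETCH running ahead. */
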
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
	movsd	 -24 * SIZE(A1), %xmm2
	movhps	 -22 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm4,  %xmm5
#else
	movaps	 %xmm4, %xmm5
	shufps	 $0xb1, %xmm5,  %xmm5
#endif
	mulps	 %xmm6, %xmm4
	addps	 %xmm4, %xmm1
	movsd	 -20 * SIZE(A1), %xmm4
	movhps	 -18 * SIZE(A1), %xmm4

	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0
	movaps	 %xmm0, -32 * SIZE(Y1)
	movaps	 -24 * SIZE(Y1), %xmm0
	mulps	 %xmm7, %xmm5
	SUBPS	 %xmm5, %xmm1
	movaps	 %xmm1, -28 * SIZE(Y1)
	movaps	 -20 * SIZE(Y1), %xmm1

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
	movsd	 -16 * SIZE(A1), %xmm2
	movhps	 -14 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm4,  %xmm5
#else
	movaps	 %xmm4, %xmm5
	shufps	 $0xb1, %xmm5,  %xmm5
#endif
	mulps	 %xmm6, %xmm4
	addps	 %xmm4, %xmm1
	movsd	 -12 * SIZE(A1), %xmm4
	movhps	 -10 * SIZE(A1), %xmm4

	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0
	movaps	 %xmm0, -24 * SIZE(Y1)
	movaps	 -16 * SIZE(Y1), %xmm0
	mulps	 %xmm7, %xmm5
	SUBPS	 %xmm5, %xmm1
	movaps	 %xmm1, -20 * SIZE(Y1)
	movaps	 -12 * SIZE(Y1), %xmm1

	subl	 $-16 * SIZE, A1
	subl	 $-16 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

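/* Loop tail: finish the last, already preloaded block of 8 elements
   with no further A loads beyond it. */
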
.L14:
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
	movsd	 -24 * SIZE(A1), %xmm2
	movhps	 -22 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm4,  %xmm5
#else
	movaps	 %xmm4, %xmm5
	shufps	 $0xb1, %xmm5,  %xmm5
#endif
	mulps	 %xmm6, %xmm4
	addps	 %xmm4, %xmm1
	movsd	 -20 * SIZE(A1), %xmm4
	movhps	 -18 * SIZE(A1), %xmm4

	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0
	movaps	 %xmm0, -32 * SIZE(Y1)
	movaps	 -24 * SIZE(Y1), %xmm0
	mulps	 %xmm7, %xmm5
	SUBPS	 %xmm5, %xmm1
	movaps	 %xmm1, -28 * SIZE(Y1)
	movaps	 -20 * SIZE(Y1), %xmm1

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm4,  %xmm5
#else
	movaps	 %xmm4, %xmm5
	shufps	 $0xb1, %xmm5,  %xmm5
#endif
	mulps	 %xmm6, %xmm4
	addps	 %xmm4, %xmm1

	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0
	movaps	 %xmm0, -24 * SIZE(Y1)
	movaps	 -16 * SIZE(Y1), %xmm0
	mulps	 %xmm7, %xmm5
	SUBPS	 %xmm5, %xmm1
	movaps	 %xmm1, -20 * SIZE(Y1)
	movaps	 -12 * SIZE(Y1), %xmm1

	subl	 $-16 * SIZE, A1
	subl	 $-16 * SIZE, Y1
	ALIGN_3

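/* Remainder: 4 complex elements. */
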
.L15:
	testl	$4, M
	je	.L17

	movsd	 -32 * SIZE(A1), %xmm2
	movhps	 -30 * SIZE(A1), %xmm2
	movsd	 -28 * SIZE(A1), %xmm4
	movhps	 -26 * SIZE(A1), %xmm4

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm4,  %xmm5
#else
	movaps	 %xmm4, %xmm5
	shufps	 $0xb1, %xmm5,  %xmm5
#endif
	mulps	 %xmm6, %xmm4
	addps	 %xmm4, %xmm1

	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0
	movaps	 %xmm0, -32 * SIZE(Y1)
	movaps	 -24 * SIZE(Y1), %xmm0
	mulps	 %xmm7, %xmm5
	SUBPS	 %xmm5, %xmm1
	movaps	 %xmm1, -28 * SIZE(Y1)
	movaps	 -20 * SIZE(Y1), %xmm1

	addl	 $8 * SIZE, A1
	addl	 $8 * SIZE, Y1
	ALIGN_3

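/* Remainder: 2 complex elements. */
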
.L17:
	testl	$2, M
	je	.L18

	movsd	 -32 * SIZE(A1), %xmm2
	movhps	 -30 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0

	movaps	 %xmm0, -32 * SIZE(Y1)
	movaps	 %xmm1, %xmm0

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

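/* Remainder: 1 complex element.  When movsd is aliased to movlps, the
   register must be cleared first, since movlps leaves the upper half
   unchanged. */
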
.L18:
	testl	$1, M
	je	.L19

#ifdef	movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	 -32 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	 $0xb1, %xmm2,  %xmm3
#else
	movaps	 %xmm2, %xmm3
	shufps	 $0xb1, %xmm3,  %xmm3
#endif
	mulps	 %xmm6, %xmm2
	addps	 %xmm2, %xmm0
	mulps	 %xmm7, %xmm3
	SUBPS	 %xmm3, %xmm0

	movlps	 %xmm0, -32 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L10
	ALIGN_4

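/* Second pass: add the buffer back into y with stride INCY, two
   complex elements per xmm register, then the M % 8 tails. */
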
.L990:
	movl	Y,   Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$ZBASE_SHIFT, INCY

	movl	M,   %eax
	sarl	$3,  %eax
	jle	.L994
	ALIGN_3

.L992:
	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 8 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	12 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$16 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:
	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	 0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	(Y1), %xmm0

	addps	 0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	ALIGN_3

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE