/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

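/* Double-precision complex GEMV (no-transpose) kernel for 32-bit x86
   with SSE2.  As read from the code below (editor's notes, not the
   original author's): for each column j, the kernel accumulates
   y += (alpha * x[j]) * A[:,j], with CONJ/XCONJ selecting which
   operand is conjugated.  Partial sums go into a contiguous BUFFER
   first and are added into the strided y vector in a second pass. */
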
#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 3)
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 4)
#endif

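/* PREFETCHSIZE is the lookahead distance in units of SIZE (one
   double), presumably tuned empirically per core family.  On Opteron,
   movsd is remapped to movlps, which loads the same 8 bytes into the
   low half of an XMM register and was presumably the faster form on
   that core. */
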
#define STACKSIZE	16

#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)

#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

#undef SUBPD

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	   subpd
#else
#define SUBPD	   addpd
#endif

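/* Sign of the complex cross term: forming (a_r + i*a_i)(w_r + i*w_i)
   requires the a_i*w_i product subtracted from the real part, unless
   exactly one of CONJ/XCONJ flips that sign, in which case SUBPD
   becomes addpd. */
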
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	subl	$-16 * SIZE, A

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	pxor	%xmm7, %xmm7

	movl	M,  %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

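/* .L01: zero the accumulation buffer.  Each pass clears 8 complex
   (16 double) entries; the (M + 8) >> 3 trip count computed above
   over-clears slightly, so BUFFER is presumably padded to match. */
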
.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

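/* .L10: loop over the N columns (counted down in J).  Load x[j], build
   a sign mask in xmm5 that flips only the high double of a register,
   and compute w = alpha * x[j] (or its conjugated variant) below. */
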
.L10:
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A,  A1
	addl	LDA, A

	movsd	0 * SIZE(X), %xmm6
	movhpd	1 * SIZE(X), %xmm6
	addl	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,    %xmm5
	shufps	$0xc0, %xmm5, %xmm5

	pshufd	$0x4e, %xmm6, %xmm7

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm3
	movddup	ALPHA_I, %xmm4
#else
	movsd	ALPHA_R, %xmm3
	movsd	ALPHA_I, %xmm4

	unpcklpd %xmm3, %xmm3
	unpcklpd %xmm4, %xmm4
#endif

	xorpd	 %xmm5, %xmm7

	mulpd	 %xmm3, %xmm6
	mulpd	 %xmm4, %xmm7

#ifndef XCONJ
	subpd	 %xmm7, %xmm6
#else
	addpd	 %xmm7, %xmm6
#endif

	pshufd	 $0xee, %xmm6, %xmm7
	pshufd	 $0x44, %xmm6, %xmm6

#ifndef CONJ
	xorpd	 %xmm5, %xmm7
#else
	xorpd	 %xmm5, %xmm6
#endif
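
/* At this point, with w = alpha * x[j]: xmm6 carries w_r in both lanes
   and xmm7 carries w_i in both lanes, one lane sign-flipped according
   to CONJ.  Together with pshufd $0x4e (swap real/imag halves) and
   SUBPD, the loops below form the complex product w * A[i][j] two
   elements at a time. */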

	movapd	 -16 * SIZE(Y1), %xmm0
	movapd	 -14 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M,   I
	sarl	$2,  I
	jle	.L15

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm4
	movhpd	 -13 * SIZE(A1), %xmm4

	decl	 I
	jle	 .L14
	ALIGN_3

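/* .L13: main loop, 4 complex elements of the column per pass.  Loads
   for the next pass are interleaved with the multiply/accumulate and
   the stores to the buffer to hide latency; A1 and Y1 each advance by
   8 * SIZE. */
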
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	pshufd	 $0x4e, %xmm2,  %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm6, %xmm4
	addpd	 %xmm4, %xmm1
	movsd	 -10 * SIZE(A1), %xmm4
	movhpd	  -9 * SIZE(A1), %xmm4

	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0
	movapd	 %xmm0, -16 * SIZE(Y1)
	movapd	 -12 * SIZE(Y1), %xmm0
	mulpd	 %xmm7, %xmm5
	SUBPD	 %xmm5, %xmm1
	movapd	 %xmm1, -14 * SIZE(Y1)
	movapd	 -10 * SIZE(Y1), %xmm1

	pshufd	 $0x4e, %xmm2,  %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2
	movhpd	  -7 * SIZE(A1), %xmm2
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm6, %xmm4
	addpd	 %xmm4, %xmm1
	movsd	  -6 * SIZE(A1), %xmm4
	movhpd	  -5 * SIZE(A1), %xmm4

	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0
	movapd	 %xmm0, -12 * SIZE(Y1)
	movapd	  -8 * SIZE(Y1), %xmm0
	mulpd	 %xmm7, %xmm5
	SUBPD	 %xmm5, %xmm1
	movapd	 %xmm1, -10 * SIZE(Y1)
	movapd	  -6 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

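/* .L14: peeled final pass of the unrolled loop.  It finishes the last
   four elements without preloading data for a following iteration. */
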
.L14:
	pshufd	 $0x4e, %xmm2,  %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm6, %xmm4
	addpd	 %xmm4, %xmm1
	movsd	 -10 * SIZE(A1), %xmm4
	movhpd	  -9 * SIZE(A1), %xmm4

	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0
	movapd	 %xmm0, -16 * SIZE(Y1)
	movapd	 -12 * SIZE(Y1), %xmm0
	mulpd	 %xmm7, %xmm5
	SUBPD	 %xmm5, %xmm1
	movapd	 %xmm1, -14 * SIZE(Y1)
	movapd	 -10 * SIZE(Y1), %xmm1

	pshufd	 $0x4e, %xmm2,  %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm6, %xmm4
	addpd	 %xmm4, %xmm1

	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0
	movapd	 %xmm0, -12 * SIZE(Y1)
	movapd	  -8 * SIZE(Y1), %xmm0
	mulpd	 %xmm7, %xmm5
	SUBPD	 %xmm5, %xmm1
	movapd	 %xmm1, -10 * SIZE(Y1)
	movapd	  -6 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

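/* .L15: remainder when M % 4 >= 2, processing two complex elements. */
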
.L15:
	testl	$2, M
	je	.L17

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm4
	movhpd	 -13 * SIZE(A1), %xmm4

	pshufd	 $0x4e, %xmm2,  %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm6, %xmm4
	addpd	 %xmm4, %xmm1

	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0
	movapd	 %xmm0, -16 * SIZE(Y1)
	mulpd	 %xmm7, %xmm5
	SUBPD	 %xmm5, %xmm1
	movapd	 %xmm1, -14 * SIZE(Y1)

	movapd	 -12 * SIZE(Y1), %xmm0

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

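/* .L17: final odd element when M is odd. */
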
.L17:
	testl	$1, M
	je	.L19

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2

	pshufd	 $0x4e, %xmm2, %xmm3
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	mulpd	 %xmm7, %xmm3
	SUBPD	 %xmm3, %xmm0

	movapd	 %xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L10
	ALIGN_4

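/* .L990: second phase.  Walk the contiguous buffer (X) and add each
   complex entry into the strided output vector y (Y1, stride INCY),
   unrolled four elements per pass with 2- and 1-element tails at
   .L994 and .L996. */
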
.L990:
	movl	Y,   Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$ZBASE_SHIFT, INCY

	movl	M,   %eax
	sarl	$2,  %eax
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	4 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	6 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:
	testl	$2, M
	jle	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	ALIGN_3

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
