1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Prefetch tuning for Intel Atom (in-order core): non-temporal reads of A,
   a short prefetch distance of 6 cache lines' worth of doubles. */
#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0	/* NOTE(review): defined but not used below — confirm intended */
#define PREFETCHSIZE	(8 * 6)
#endif

/* Bytes pushed by the prologue (4 callee-saved registers, 4 bytes each). */
#define STACKSIZE	16

/* cdecl stack arguments, addressed relative to %esp after the prologue.
   ALPHA_R/ALPHA_I are 8-byte doubles; the rest are 4-byte ints/pointers. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)

/* Register roles.  INCY aliases J (%ebx): J is the column counter during the
   accumulation phase and is only reused as INCY in the write-back phase,
   after the column loop has finished. */
#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

/* ADD1..ADD4 select the signs of the four partial products of the complex
   multiply-accumulate  y += a * x  according to the conjugation mode:
     y_r  ADD1  a_r*x_r,  ADD3  a_i*x_i
     y_i  ADD2  a_r*x_i,  ADD4  a_i*x_r
   Plain case:  y_r += a_r*x_r - a_i*x_i,  y_i += a_r*x_i + a_i*x_r. */
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   subsd
#define ADD4	   addsd
#endif

/* conj(A) * x: imaginary part of A negated. */
#if  defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   addsd
#define ADD4	   subsd
#endif

/* A * conj(x): imaginary part of x negated. */
#if !defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   addsd
#define ADD4	   addsd
#endif

/* conj(A) * conj(x) = conj(A * x). */
#if  defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   subsd
#define ADD4	   subsd
#endif
100
/*---------------------------------------------------------------------
 * Double-complex GEMV-N kernel (scalar SSE2, IA-32, Atom tuning):
 *   y += alpha * A * x       (with CONJ/XCONJ sign variants)
 *
 * Phase 1: zero a contiguous BUFFER of 2*M doubles.
 * Phase 2: for each column j, compute t = alpha * x[j] (complex), then
 *          BUFFER += t * A(:,j)  with a 4-element software-pipelined loop.
 * Phase 3: add BUFFER into the strided output vector Y.
 *
 * ABI: cdecl; all arguments on the stack (see macros above).
 * Callee-saved %ebx/%esi/%edi/%ebp are preserved by the prologue/epilogue.
 *-------------------------------------------------------------------*/
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert element strides to byte strides (complex element = 2 doubles). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	/* Bias A by +16 elements so the inner loops can use the small negative
	   displacements -16..-7 (shorter encodings, matches Y1's bias below). */
	subl	$-16 * SIZE, A

	/* Quick exits for empty problems. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	pxor	%xmm7, %xmm7		/* zero source for the buffer clear */

	/* Clear (M+8)/8 * 16 doubles — at least the 2*M doubles the columns
	   will accumulate into, rounded up a full unrolled iteration. */
	movl	M,  %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

.L01:	/* ---- Phase 1: zero 16 doubles of BUFFER per iteration ---- */
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

.L10:	/* ---- Phase 2: column loop (J = N .. 1) ---- */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1		/* same +16-element bias as A */

	movl	A,  A1			/* A1 = current column */
	addl	LDA, A

	/* Load x[j] and advance X by the (byte) stride. */
	movsd	0 * SIZE(X), %xmm6	/* x_r */
	movsd	1 * SIZE(X), %xmm7	/* x_i */
	addl	INCX, X

	/* t = alpha * x[j]:  xmm6 = t_r, xmm7 = t_i (complex multiply). */
	movapd	%xmm6,   %xmm2
	mulsd	ALPHA_R, %xmm6
	mulsd	ALPHA_I, %xmm2
	movapd	%xmm7,   %xmm3
	mulsd	ALPHA_I, %xmm3
	mulsd	ALPHA_R, %xmm7

#ifndef XCONJ
	subsd	%xmm3,  %xmm6		/* t_r = a_r*x_r - a_i*x_i */
	addsd	%xmm2,  %xmm7		/* t_i = a_r*x_i + a_i*x_r */
#else
	addsd	%xmm3,  %xmm6		/* conj(x): flip the two signs */
	subsd	%xmm2,  %xmm7
#endif

	/* Preload the first buffer pair (y_r, y_i) for the pipeline. */
	movsd	 -16 * SIZE(Y1), %xmm0
	movsd	 -15 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M,   I
	sarl	$2,  I			/* I = M / 4 (4 complex elements per pass) */
	jle	.L15

	/* Pipeline warm-up: load a[0] and start its two products. */
	movsd	 -16 * SIZE(A1), %xmm2	/* a_r */
	movsd	 -15 * SIZE(A1), %xmm3	/* a_i */

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2		/* a_r * t_r */
	mulsd	%xmm7, %xmm4		/* a_r * t_i */

	decl	 I
	jle	 .L14			/* only one group: go straight to drain */
	ALIGN_3

.L13:	/* ---- steady state: 4 complex elements per iteration.
	   Per element: y_r ADD1 a_r*t_r ADD3 a_i*t_i;
	                y_i ADD2 a_r*t_i ADD4 a_i*t_r;
	   loads of the NEXT a-pair are interleaved to hide latency. ---- */
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)	/* store finished y pair, load next */
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2	/* begin NEXT iteration's first element */
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -7 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1		/* advance 4 complex elements */
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:	/* ---- pipeline drain: final group of 4, no look-ahead loads
	   past the 4th element ---- */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0	/* preload for the M%4 tails */
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

.L15:	/* ---- tail: 2 remaining complex elements if M & 2 ---- */
	testl	$2, M
	je	.L17

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

.L17:	/* ---- tail: 1 remaining complex element if M & 1 ---- */
	testl	$1, M
	je	.L19

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movlpd	 %xmm1, -15 * SIZE(Y1)
	ALIGN_3

.L19:	/* next column */
	decl	J
	jg	.L10
	ALIGN_4

.L990:	/* ---- Phase 3: Y += BUFFER, with stride INCY on Y.
	   J (%ebx) is free now, so it becomes INCY; X walks the buffer
	   and A1 is reused as the Y write pointer. ---- */
	movl	Y,   Y1
	movl	BUFFER, X
	movl	STACK_INCY, INCY

	movl	Y1, A1
	sall	$ZBASE_SHIFT, INCY

	movl	M,   %eax
	sarl	$2,  %eax		/* 4 complex elements per iteration */
	jle	.L994
	ALIGN_3

.L992:	/* read 4 strided y pairs, add 8 contiguous buffer doubles, store back */
	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm4
	movsd	1 * SIZE(Y1), %xmm5
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm6
	movsd	1 * SIZE(Y1), %xmm7
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm4, 0 * SIZE(A1)
	movlpd	%xmm5, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm6, 0 * SIZE(A1)
	movlpd	%xmm7, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:	/* tail: 2 remaining complex elements if M & 2 */
	testl	$2, M
	jle	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:	/* tail: 1 remaining complex element if M & 1 */
	testl	$1, M
	jle	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	ALIGN_3

.L999:	/* restore callee-saved registers and return */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
546