/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
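
/* This appears to be a complex single-precision symmetric (SYMV-style)
   matrix-vector kernel.  Each supported microarchitecture selects a
   prefetch instruction pair and a lookahead distance; PREFETCH/PREFETCHW/
   PREFETCHSIZE are presumably consumed by the unrolled inner loop, whose
   body is missing below. */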

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 14)
#endif
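
/* Stack/argument layout.  Under the System V ABI the first six integer
   arguments arrive in registers (ARG1..ARG6) and alpha in %xmm0/%xmm1, so
   only y, incy and the workspace pointer live on the stack.  Under the
   Windows x64 ABI everything past the fourth argument sits at a fixed
   stack offset, and alpha arrives in %xmm2/%xmm3 (copied to %xmm0/%xmm1
   in the prologue). */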

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_A		 40 + STACKSIZE(%rsp)
#define OLD_LDA		 48 + STACKSIZE(%rsp)
#define OLD_X		 56 + STACKSIZE(%rsp)
#define OLD_INCX	 64 + STACKSIZE(%rsp)
#define OLD_Y		 72 + STACKSIZE(%rsp)
#define OLD_INCY	 80 + STACKSIZE(%rsp)
#define OLD_BUFFER	 88 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG4
#define LDA	  ARG3
#define X	  %rdi
#define INCX	  %rsi
#endif
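
/* Register map.  A1/A2 walk two columns of A per outer iteration; XX and
   YY walk the packed vectors; IS is the processed-block index.  NEW_X
   aliases BUFFER (the packed alpha*x), and NEW_Y reuses X's register:
   once x has been packed, it points at whichever copy of y is updated. */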

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R  %xmm0
#define ALPHA_I  %xmm1

#define xsum1  %xmm0
#define xsum2  %xmm1
#define xsum3  %xmm2
#define xsum4  %xmm3

#define atemp1 %xmm4
#define atemp2 %xmm5
#define atemp3 %xmm6
#define atemp4 %xmm7

#define xtemp1 %xmm8
#define xtemp2 %xmm9
#define a1     %xmm10
#define a2     %xmm11

#define a3     %xmm12
#define yy1    %xmm13
#define xt1    %xmm14
#define xt2    %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,     A
	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999

	negq	IS
	addq	M, IS

	movq	IS,  TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

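	/* Build a sign mask {0, -0.f, 0, -0.f} in %xmm2 (a pxor with it
	   flips the imaginary lanes) and broadcast alpha into
	   ALPHA_R = {ar, -ai, ar, -ai} and ALPHA_I = {ai, ar, ai, ar},
	   the two shuffled forms needed to multiply packed complex values
	   with plain mulps/addps. */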
	pcmpeqb	%xmm3,  %xmm3
	xorpd	%xmm2,  %xmm2
	pslld	$31,    %xmm3
	unpckhps %xmm3, %xmm2

	shufps	 $0, ALPHA_R, ALPHA_R
	shufps	 $0, ALPHA_I, ALPHA_I
	movaps	 ALPHA_I, %xmm3

	unpcklps ALPHA_R, ALPHA_I
	unpcklps %xmm3,   ALPHA_R
	pxor	 %xmm2,   ALPHA_R

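	/* Pack alpha*x into BUFFER.  For every two elements of x, two
	   variants are stored: a conjugated copy {re, -im} at offset 0 and
	   a swapped copy {im, re} at offset 4 * SIZE, so the inner loops
	   can form complex products without reshuffling x per iteration. */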
	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3

.L01:
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm6
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4
	movsldup %xmm6, %xmm5
	movshdup %xmm6, %xmm6

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	mulps	ALPHA_I, %xmm5
	mulps	ALPHA_R, %xmm6

	addps	%xmm4, %xmm3
	addps	%xmm6, %xmm5

	movaps	%xmm3,  4 * SIZE(XX)
	movaps	%xmm5, 12 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5

	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm5

	movaps	%xmm3,  0 * SIZE(XX)
	movaps	%xmm5,  8 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	testq	$2, M
	jle	.L03

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movaps	%xmm3,  4 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movaps	%xmm3,  0 * SIZE(XX)

	subq	$-8 * SIZE, XX
	ALIGN_3

.L03:
	testq	$1, M
	jle	.L05

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movlps	%xmm3,  2 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movlps	%xmm3,  0 * SIZE(XX)

	subq	$-4 * SIZE, XX
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq   Y, NEW_Y

	addq   $512, XX
	andq   $-512, XX
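
	/* If y is already contiguous (INCY == 2 * SIZE) it is updated in
	   place; otherwise gather it into the 512-byte-aligned region just
	   carved out of BUFFER and point NEW_Y there. */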

	cmpq   $2 * SIZE, INCY
	je     .L10

	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm1
	addq	INCY, YY

	movaps	%xmm0, 0 * SIZE(XX)
	movaps	%xmm1, 4 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movlps	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	movq	 IS, I
	addq	 $2, I
	cmpq	 M,  I
	jg	 .L20
	ALIGN_3

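	/* Outer loop over column pairs of A.  atemp1/atemp2 hold the real
	   and sign-flipped imaginary parts of alpha*x for the first column
	   and atemp3/atemp4 for the second; xsum1..4 accumulate partial
	   sums that .L18 reduces into the diagonal-block update. */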
.L11:
	movq	A,  A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	leaq	(, IS, 4), I

	movsd	 0 * SIZE(NEW_X, I, SIZE), atemp2
	movhps	 4 * SIZE(NEW_X, I, SIZE), atemp2
	movsd	 2 * SIZE(NEW_X, I, SIZE), atemp4
	movhps	 6 * SIZE(NEW_X, I, SIZE), atemp4

	pshufd	 $0xcc, atemp2, atemp1
	pshufd	 $0x99, atemp2, atemp2
	pshufd	 $0xcc, atemp4, atemp3
	pshufd	 $0x99, atemp4, atemp4

	pxor	 xsum1, xsum1
	pxor	 xsum2, xsum2
	pxor	 xsum3, xsum3
	pxor	 xsum4, xsum4

	movq	 NEW_X, XX
	movq	 NEW_Y, YY

	movq	IS,  I
	sarq	$2,  I
	jle	.L15
	ALIGN_3

.L12:
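	/* NOTE: the 4x-unrolled body of this loop is absent; HALT expands
	   to hlt, which faults in user mode, so this path traps if it is
	   ever taken.  The two-row step under .L15 shows the intended
	   per-iteration pattern. */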
	HALT

	subq	 $-16 * SIZE, XX
	addq	 $  8 * SIZE, YY
	addq	 $  8 * SIZE, A1
	addq	 $  8 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

.L15:
	testq	$2, IS
	jle	.L18

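	/* Remainder: fold in two more rows of the column pair.  Each load
	   of A contributes to the xsum accumulators through the packed x
	   copies, and the pshufd $0xb1 cross term adds the corresponding
	   update to yy1. */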
	movsd	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	movaps	 0 * SIZE(XX), xtemp1
	movaps	 4 * SIZE(XX), xtemp2

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	movaps	 xtemp2, xt2
	mulps	 a1, xt1
	mulps	 a1, xt2
	addps	 xt1, xsum1
	addps	 xt2, xsum2

	pshufd	 $0xb1, a1, xt2
	mulps	 atemp1, a1
	mulps	 atemp2, xt2
	addps	 a1,  yy1
	addps	 xt2, yy1

	movsd	 0 * SIZE(A2), a1
	movhps	 2 * SIZE(A2), a1

	movaps	 xtemp1, xt1
	movaps	 xtemp2, xt2
	mulps	 a1, xt1
	mulps	 a1, xt2
	addps	 xt1, xsum3
	addps	 xt2, xsum4

	pshufd	 $0xb1, a1, xt2
	mulps	 atemp3, a1
	mulps	 atemp4, xt2
	addps	 a1,  yy1
	addps	 xt2, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $8 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

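	/* Close out the column pair: add the diagonal-block terms, reduce
	   the four accumulators pairwise with haddps, and fold the result
	   into y before advancing IS by two columns. */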
.L18:
	leaq	(, IS, 4), I

	movaps	 0 * SIZE(NEW_X, I, SIZE), atemp1
	movaps	 4 * SIZE(NEW_X, I, SIZE), atemp2

	movlps	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	movsd	 0 * SIZE(A1), a1
	movhps	 0 * SIZE(A2), a1

	movaps	 a1, a2
	mulps	 atemp1, a1
	mulps	 atemp2, a2
	addps	 a1, xsum1
	addps	 a2, xsum2

	movsd	 0 * SIZE(A2), a1
	movhps	 2 * SIZE(A2), a1

	movaps	 a1, a2
	mulps	 atemp1, a1
	mulps	 atemp2, a2
	addps	 a1, xsum3
	addps	 a2, xsum4

	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $2, IS

	movq	 IS, I
	addq	 $2, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

.L20:
	testq	$1, M
	jle	.L990

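	/* A single trailing column (odd M) is not handled here; the code
	   falls straight through to the store-back path. */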

.L990:
	cmpq   $2 * SIZE, INCY
	je     .L999
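
	/* If y was gathered into BUFFER, scatter the contiguous results
	   back to the caller's strided y. */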

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movaps	 0 * SIZE(NEW_Y), %xmm0
	movaps	 4 * SIZE(NEW_Y), %xmm1

	movlps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movlps	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movlps	0 * SIZE(NEW_Y), %xmm0
	addq	$2 * SIZE, NEW_Y

	movlps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L998
	ALIGN_3

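	/* Restore callee-saved registers and return.  Under the Windows
	   ABI this includes %rdi/%rsi and the nonvolatile %xmm6-%xmm15
	   saved in the prologue. */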
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE
