1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/*
 * Per-microarchitecture prefetch configuration.  PREFETCH/PREFETCHW select
 * the instruction flavor (AMD's prefetch/prefetchw vs. prefetcht0) and
 * PREFETCHSIZE the look-ahead distance used by the compute loops.
 */

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
/* Replace movsd with movlpd on Opteron (preferred 64-bit load form). */
#define movsd		movlpd
#endif

#if defined(BARCELONA)  || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 14)
#endif

/*
 * Argument plumbing.  System V AMD64: the first six integer args arrive in
 * registers (ARG1..ARG6) and y/incy/buffer come from the caller's stack.
 * Windows x64: only four register args, so A/LDA/X/INCX are also read from
 * the stack, and alpha arrives in %xmm2/%xmm3 (moved into place below).
 * Offsets are relative to %rsp AFTER the prologue's subq $STACKSIZE.
 */

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define	N	  ARG2
#define	A	  ARG3
#define LDA	  ARG4
#define	X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_A		 40 + STACKSIZE(%rsp)
#define OLD_LDA		 48 + STACKSIZE(%rsp)
#define OLD_X		 56 + STACKSIZE(%rsp)
#define OLD_INCX	 64 + STACKSIZE(%rsp)
#define OLD_Y		 72 + STACKSIZE(%rsp)
#define OLD_INCY	 80 + STACKSIZE(%rsp)
#define OLD_BUFFER	 88 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define	A	  ARG4
#define LDA	  ARG3
#define	X	  %rdi
#define INCX	  %rsi
#endif

/* GPR roles for the kernel proper. */
#define	Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15
/* NEW_X/NEW_Y alias BUFFER/X once the originals are no longer needed:
   they point at the packed copies the kernel actually reads and updates. */
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R  %xmm0
#define ALPHA_I  %xmm1

/* xsum1..4: per-column accumulators, reduced with haddps at block end.
   Note they share registers with ALPHA_R/ALPHA_I, which are dead by then. */
#define xsum1  %xmm0
#define xsum2  %xmm1
#define xsum3  %xmm2
#define xsum4  %xmm3

/* atemp1..4: swizzled broadcasts of the current block's x elements. */
#define atemp1 %xmm4
#define atemp2 %xmm5
#define atemp3 %xmm6
#define atemp4 %xmm7

/* xtemp1/2: the two pre-swizzled variants of packed x (see .L01). */
#define xtemp1 %xmm8
#define xtemp2 %xmm9
#define a1     %xmm10
#define a2     %xmm11

#define a3     %xmm12
#define yy1    %xmm13
#define	xt1    %xmm14
#define	xt2    %xmm15

/* movddup needs SSE3; otherwise emulate it with movlpd+movhpd. */
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif
159
	PROLOGUE
	PROFCODE

	/* Spill the callee-saved GPRs this kernel uses (see roles above). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* %rdi/%rsi and %xmm6-%xmm15 are callee-saved on Windows x64. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Fetch the args SysV would have passed in registers. */
	movq	OLD_A,     A
	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	/* alpha (re, im) arrives in %xmm2/%xmm3; move into ALPHA_R/ALPHA_I. */
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1
#endif

	movq	OLD_Y,     Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Element strides -> byte strides (ZBASE_SHIFT = complex element). */
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	/* Nothing to do for M <= 0. */
	testq	M, M
	jle	.L999

	/* NOTE(review): IS (%r15) is only SAVED above -- no instruction in
	   this file initializes it before this negq, so it still holds the
	   caller's %r15.  Finished kernels set IS before this point; this
	   looks like part of an unfinished stub (see also .L12 and .L20). */
	negq	IS
	addq	M, IS

	/* A += IS * LDA: skip the leading columns of the matrix. */
	movq	IS,  TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	/* Build %xmm2 = { +0.0, -0.0, +0.0, -0.0 }: XORing with it flips
	   the sign of the odd (imaginary-position) float lanes. */
	pcmpeqb	%xmm3,  %xmm3
	xorpd	%xmm2,  %xmm2
	pslld	$31,    %xmm3
	unpckhps %xmm3, %xmm2

	/* Broadcast alpha and interleave into the two constants used by the
	   packing loop's complex multiply:
	     ALPHA_I = { ai, ar, ai, ar }
	     ALPHA_R = { ar, -ai, ar, -ai }   (sign mask applied below) */
	shufps	 $0, ALPHA_R, ALPHA_R
	shufps	 $0, ALPHA_I, ALPHA_I
	movaps	 ALPHA_I, %xmm3

	unpcklps ALPHA_R, ALPHA_I
	unpcklps %xmm3,   ALPHA_R
	pxor	 %xmm2,   ALPHA_R

	movq	BUFFER, XX

	/* Pack alpha*x into BUFFER, four complex elements per iteration. */
	movq	M,  %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3
231
.L01:
	/* Gather four strided complex singles from X. */
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm6
	addq	INCX, X

	/* Duplicate even lanes (movsldup) and odd lanes (movshdup), then
	   combine with the interleaved alpha constants: a 2-element-wide
	   complex multiply alpha * x per register pair. */
	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4
	movsldup %xmm6, %xmm5
	movshdup %xmm6, %xmm6

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	mulps	ALPHA_I, %xmm5
	mulps	ALPHA_R, %xmm6

	addps	%xmm4, %xmm3
	addps	%xmm6, %xmm5

	/* Each pair of elements is stored TWICE: the product at +4/+12 and
	   a lane-swapped, odd-lane-negated variant at +0/+8.  The kernel
	   reads both forms (xtemp1/xtemp2) for its complex FMA. */
	movaps	%xmm3,  4 * SIZE(XX)
	movaps	%xmm5, 12 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3	/* swap lanes within each pair */
	shufps	$0xb1, %xmm5, %xmm5

	pxor	%xmm2, %xmm3		/* flip sign of the odd lanes */
	pxor	%xmm2, %xmm5

	movaps	%xmm3,  0 * SIZE(XX)
	movaps	%xmm5,  8 * SIZE(XX)

	subq	$-16 * SIZE, XX		/* XX += 16 floats */
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Same packing for a remaining pair of elements (M & 2). */
	testq	$2, M
	jle	.L03

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movaps	%xmm3,  4 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movaps	%xmm3,  0 * SIZE(XX)

	subq	$-8 * SIZE, XX
	ALIGN_3

.L03:
	/* Same packing for a final single element (M & 1). */
	testq	$1, M
	jle	.L05

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movlps	%xmm3,  2 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movlps	%xmm3,  0 * SIZE(XX)

	subq	$-4 * SIZE, XX
	ALIGN_3
321
.L05:
	/* now we don't need original X */
	movq   Y, NEW_Y

	/* Round XX up to the next 512-byte boundary: aligned scratch area
	   placed right after the packed x copy. */
	addq   $512, XX
	andq   $-512, XX

	/* Unit-stride y (INCY == one complex element) is updated in place;
	   otherwise gather y into the aligned scratch and work on the copy
	   (scattered back at .L990). */
	cmpq   $2 * SIZE, INCY
	je    .L10

	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	/* Gather four strided complex elements of y per iteration. */
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm1
	addq	INCY, YY

	movaps	%xmm0, 0 * SIZE(XX)
	movaps	%xmm1, 8 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	/* Gather the remaining 1..3 elements one at a time. */
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movlps	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3
374
.L10:
	/* Blocked sweep: iterate while a full two-column block remains,
	   i.e. while IS + 2 <= M. */
	movq	 IS, I
	addq	 $2, I
	cmpq	 M,  I
	jg	 .L20
	ALIGN_3

.L11:
	/* A1/A2 -> the two columns of the current block; A advances past. */
	movq	A,  A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	leaq	(, IS, 4), I		/* I = IS * 4 (floats per packed element) */

	/* Load the block's two packed x elements and splay them into the
	   four broadcast patterns (atemp1..4) used by the column update. */
	movsd	 0 * SIZE(NEW_X, I, SIZE), atemp2
	movhps	 4 * SIZE(NEW_X, I, SIZE), atemp2
	movsd	 2 * SIZE(NEW_X, I, SIZE), atemp4
	movhps	 6 * SIZE(NEW_X, I, SIZE), atemp4

	pshufd	 $0xcc, atemp2, atemp1
	pshufd	 $0x99, atemp2, atemp2
	pshufd	 $0xcc, atemp4, atemp3
	pshufd	 $0x99, atemp4, atemp4

	/* Clear the four dot-product accumulators for this block. */
	pxor		xsum1, xsum1
	pxor		xsum2, xsum2
	pxor		xsum3, xsum3
	pxor		xsum4, xsum4

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* IS/4 unrolled iterations of the (missing) inner kernel. */
	movq	IS,  I
	sarq	$2,  I
	jle	.L15
	ALIGN_3

.L12:
	/* NOTE(review): the unrolled 4-element compute body is ABSENT.
	   HALT expands to the privileged `hlt` instruction, which raises
	   #GP at CPL 3 -- reaching this loop crashes the process.  This
	   kernel is an unfinished stub; the .L15 tail below shows the
	   shape one iteration should take. */
	HALT

	subq	 $-16 * SIZE, XX
	addq	 $  8 * SIZE, YY
	addq	 $  8 * SIZE, A1
	addq	 $  8 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3
423
.L15:
	/* Tail of the column sweep: one remaining pair of rows (IS & 2). */
	testq	$2, IS
	jle	.L18

	movsd	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	/* xtemp1/xtemp2: the two pre-swizzled variants of the same two
	   packed x elements (see the packing loop at .L01). */
	movaps	 0 * SIZE(XX), xtemp1
	movaps	 4 * SIZE(XX), xtemp2

	/* Column 1: accumulate a1*xtemp into xsum1/xsum2 and add the
	   a1*atemp contribution into the y pair. */
	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	movaps	 xtemp2, xt2
	mulps	 a1, xt1
	mulps	 a1, xt2
	addps	 xt1, xsum1
	addps	 xt2, xsum2

	pshufd	 $0xb1, a1, xt2		/* swap lanes within each pair */
	mulps	 atemp1, a1
	mulps	 atemp2, xt2
	addps	 a1,  yy1
	addps	 xt2, yy1

	/* Column 2: same update against A2, into xsum3/xsum4. */
	movsd	 0 * SIZE(A2), a1
	movhps	 2 * SIZE(A2), a1

	movaps	 xtemp1, xt1
	movaps	 xtemp2, xt2
	mulps	 a1, xt1
	mulps	 a1, xt2
	addps	 xt1, xsum3
	addps	 xt2, xsum4

	pshufd	 $0xb1, a1, xt2
	mulps	 atemp1, a1
	mulps	 atemp2, xt2
	addps	  a1, yy1
	addps	 xt2, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $8 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L18:
	/* Diagonal 2x2 block: fold it into the accumulators, then reduce
	   the four partial sums horizontally and add them into y. */
	leaq	(, IS, 4), I

	movaps	 0 * SIZE(NEW_X, I, SIZE), atemp1
	movaps	 4 * SIZE(NEW_X, I, SIZE), atemp2

	movlps	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	/* NOTE(review): this load mixes A1[0..1] with A2[0..1], while the
	   next one reads A2[0..3] -- the asymmetry looks suspicious for a
	   diagonal block; verify against a finished kernel. */
	movsd	 0 * SIZE(A1), a1
	movhps	 0 * SIZE(A2), a1

	movaps	 a1, a2
	mulps	 atemp1, a1
	mulps	 atemp2, a2
	addps	 a1, xsum1
	addps	 a2, xsum2

	movsd	 0 * SIZE(A2), a1
	movhps	 2 * SIZE(A2), a1

	movaps	 a1, a2
	mulps	 atemp1, a1
	mulps	 atemp2, a2
	addps	 a1, xsum3
	addps	 a2, xsum4

	/* Pairwise horizontal reduction of the four accumulators. */
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $2, IS			/* two columns finished */

	/* Loop while another full two-column block fits. */
	movq	 IS, I
	addq	 $2, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3
518
.L20:
	/* NOTE(review): the odd-M tail is detected here but its handler
	   body is missing -- when M is odd the final column is never
	   processed (another sign this kernel is an unfinished stub).
	   Left as-is; only the epilogue is completed below. */
	testq	$1, M
	jle	.L990


.L990:
	/* If y was gathered into the aligned buffer (non-unit stride),
	   scatter the accumulated result back to the caller's y. */
	cmpq   $2 * SIZE, INCY
	je    .L999

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	/* Scatter four complex elements per iteration. */
	movaps	 0 * SIZE(NEW_Y), %xmm0
	movaps	 4 * SIZE(NEW_Y), %xmm1

	movlps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movlps	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	/* Scatter the remaining 1..3 elements. */
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movlps	0 * SIZE(NEW_Y), %xmm0
	addq	$2 * SIZE, NEW_Y

	movlps	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	/* Restore callee-saved GPRs (mirrors the prologue spills). */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* FIX: %rdi/%rsi and %xmm6-%xmm15 are callee-saved under the
	   Microsoft x64 ABI and were spilled in the prologue, but the
	   original epilogue never restored them -- returning with the
	   caller's registers clobbered.  Restore them here, mirroring
	   the save sequence exactly. */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE
578