1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* Prefetch configuration: PREFETCH* are only defined for the ATOM
   build, so the PREFETCH in the main loop is emitted for Atom cores
   only (prefetchnta for A; PREFETCHW is defined but unused here). */
#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

/* The prologue pushes four 4-byte registers, so every incoming stack
   argument sits STACKSIZE bytes further from %esp than at entry. */
#define STACKSIZE	16

/* Incoming arguments (32-bit stack calling convention).  ALPHA_R and
   ALPHA_I are 8-byte doubles; the remaining slots are 32-bit
   integers/pointers.  SIZE and ZBASE_SHIFT come from common.h. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)
44
/* Register role aliases.  Note that INCY shares %ebx with J: J is the
   column counter during the GEMV accumulation pass and %ebx is reused
   as the y byte stride in the final copy-back pass (.L990 onward). */
#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp
55
/* ADD1..ADD4 select add/sub so that the scalar complex multiply
 * performed in the loops,
 *     y_re ADD1= a_re*x_re,   y_re ADD3= a_im*x_im,
 *     y_im ADD2= a_re*x_im,   y_im ADD4= a_im*x_re,
 * realizes a*x, conj(a)*x, a*conj(x) or conj(a)*conj(x) depending on
 * the CONJ / XCONJ build flags. */

/* plain a*x:  re = ar*xr - ai*xi,  im = ar*xi + ai*xr */
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   subsd
#define ADD4	   addsd
#endif

/* conj(a)*x:  re = ar*xr + ai*xi,  im = ar*xi - ai*xr */
#if  defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   addsd
#define ADD4	   subsd
#endif

/* a*conj(x):  re = ar*xr + ai*xi,  im = -ar*xi + ai*xr */
#if !defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   addsd
#define ADD4	   addsd
#endif

/* conj(a)*conj(x):  re = ar*xr - ai*xi,  im = -ar*xi - ai*xr */
#if  defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   subsd
#define ADD4	   subsd
#endif
83
	/* Double-complex GEMV ("N" form) kernel: accumulates
	   alpha * A * x column by column into a contiguous BUFFER,
	   then adds the buffer into the strided vector y (.L990).
	   All arithmetic is scalar SSE2 double (movsd/mulsd). */
	PROLOGUE

	pushl	%ebp			/* save callee-saved registers */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	sall	$ZBASE_SHIFT, INCX	/* complex-element strides -> byte strides */
	sall	$ZBASE_SHIFT, LDA

	subl	$-16 * SIZE, A		/* bias A by +16*SIZE; loops address with -16*SIZE offsets */

	cmpl	$0, N			/* quick exit when either dimension is empty */
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1		/* Y1 -> accumulation buffer, about to be cleared */

	movl	N,  J			/* J = remaining columns */

	pxor	%xmm7, %xmm7		/* xmm7 = 0.0 used for the zero fill */

	/* Trip count for the clear loop: (M+8)>>3 iterations of
	   8 complex (16 double) stores — rounds up past M elements. */
	movl	M,  %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3
117
.L01:
	/* Zero the accumulation buffer, 16 doubles (8 complex
	   elements) per iteration; buffer is 16-byte aligned
	   (movapd stores). */
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3
131
.L10:
	/* Top of the per-column loop.  Y1 -> buffer (biased by
	   +16*SIZE to match the -16*SIZE addressing below),
	   A1 -> current column; advance A to the next column. */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A,  A1
	addl	LDA, A

	movsd	0 * SIZE(X), %xmm6	/* xmm6 = x[j] real */
	movsd	1 * SIZE(X), %xmm7	/* xmm7 = x[j] imag */
	addl	INCX, X

	/* Form the column scalar alpha * x[j]:
	   afterwards xmm6 = real part, xmm7 = imag part. */
	movapd	%xmm6,   %xmm2
	mulsd	ALPHA_R, %xmm6		/* xr*ar */
	mulsd	ALPHA_I, %xmm2		/* xr*ai */
	movapd	%xmm7,   %xmm3
	mulsd	ALPHA_I, %xmm3		/* xi*ai */
	mulsd	ALPHA_R, %xmm7		/* xi*ar */

#ifndef XCONJ
	subsd	%xmm3,  %xmm6		/* re = xr*ar - xi*ai */
	addsd	%xmm2,  %xmm7		/* im = xi*ar + xr*ai */
#else
	addsd	%xmm3,  %xmm6		/* conj(x) variant: re = xr*ar + xi*ai */
	subsd	%xmm2,  %xmm7		/*                  im = xi*ar - xr*ai */
#endif

	movsd	 -16 * SIZE(Y1), %xmm0	/* xmm0/xmm1 = first y accumulator pair */
	movsd	 -15 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M,   I
	sarl	$2,  I			/* I = M/4 groups of 4 complex elements */
	jle	.L15			/* fewer than 4 rows: remainder code only */

	movsd	 -16 * SIZE(A1), %xmm2	/* preload a[0] and start its products */
	movsd	 -15 * SIZE(A1), %xmm3	/* (software pipelining into .L13/.L14) */

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2		/* ar*xr */
	mulsd	%xmm7, %xmm4		/* ar*xi */

	decl	 I
	jle	 .L14			/* exactly one group left: drain directly */
	ALIGN_3
176
.L13:
	/* Pipelined main loop: 4 complex elements of the column per
	   iteration.  Pattern per element (a = ar+i*ai, scalar =
	   xmm6+i*xmm7): y_re ADD1= ar*xr, ADD3= ai*xi; y_im ADD2=
	   ar*xi, ADD4= ai*xr; buffer pairs are read-modify-written.
	   The next element's a is loaded while the current one is
	   being finished, so xmm2/xmm4 (and xmm3) stay one element
	   ahead of the accumulation. */
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	/* element 0 finish / element 1 start */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3		/* ai*xi */
	ADD1	%xmm2, %xmm0		/* y_re (+)= ar*xr */
	movsd	 -14 * SIZE(A1), %xmm2	/* load next ar */
	mulsd	%xmm6, %xmm5		/* ai*xr */
	ADD2	%xmm4, %xmm1		/* y_im (+)= ar*xi */

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0		/* y_re (+/-)= ai*xi */
	movsd	 -13 * SIZE(A1), %xmm3	/* load next ai */
	ADD4	%xmm5, %xmm1		/* y_im (+/-)= ai*xr */
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)	/* write back pair 0, fetch pair 1 */
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	/* element 1 finish / element 2 start */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	/* element 2 finish / element 3 start */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	/* element 3 finish / next iteration's element 0 start */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -7 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1		/* advance 4 complex elements */
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3
265
.L14:
	/* Last group of 4 complex elements: same pattern as .L13 but
	   the final element does NOT preload beyond the column, so no
	   out-of-bounds read of A occurs at the column's end. */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	  -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	/* final element: finish only, no further A loads */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0	/* preload accumulators for remainder code */
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3
341
.L15:
	/* Remainder: if M has bit 1 set, process two more complex
	   elements of the column (same multiply/accumulate pattern). */
	testl	$2, M
	je	.L17

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2		/* ar*xr */
	mulsd	%xmm7, %xmm4		/* ar*xi */

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3		/* ai*xi */
	ADD1	%xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2	/* load second element's ar */
	mulsd	%xmm6, %xmm5		/* ai*xr */
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -13 * SIZE(A1), %xmm3	/* second element's ai */
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	 %xmm0, -16 * SIZE(Y1)	/* commit first pair, fetch second */
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0	/* preload accumulators for the M&1 case */
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3
389
.L17:
	/* Remainder: if M is odd, process the final complex element. */
	testl	$1, M
	je	.L19

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2		/* ar*xr */
	mulsd	%xmm7, %xmm4		/* ar*xi */

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3		/* ai*xi */
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5		/* ai*xr */
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movlpd	 %xmm1, -15 * SIZE(Y1)
	ALIGN_3

.L19:
	/* Bottom of the column loop: next column while J > 0. */
	decl	J
	jg	.L10
	ALIGN_4
418
.L990:
	/* Copy-back pass: y[i] += buffer[i] for all M complex
	   elements.  X walks the contiguous buffer, Y1 reads y and A1
	   (a second copy of the y pointer) writes the sums back with
	   the same INCY stride.  J (%ebx) is free now and is reused
	   as INCY. */
	movl	Y,   Y1
	movl	BUFFER, X
	movl	STACK_INCY, INCY

	movl	Y1, A1
	sall	$ZBASE_SHIFT, INCY	/* complex-element stride -> byte stride */

	movl	M,   %eax
	sarl	$2,  %eax		/* 4 complex elements per iteration */
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y1), %xmm0	/* load 4 y pairs (strided) */
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm4
	movsd	1 * SIZE(Y1), %xmm5
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm6
	movsd	1 * SIZE(Y1), %xmm7
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0	/* add 4 buffer pairs (contiguous) */
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	movlpd	%xmm0, 0 * SIZE(A1)	/* store sums back to y via A1 */
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm4, 0 * SIZE(A1)
	movlpd	%xmm5, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm6, 0 * SIZE(A1)
	movlpd	%xmm7, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3
478
.L994:
	/* Copy-back remainder: two more elements when M has bit 1 set. */
	testl	$2, M
	jle	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:
	/* Copy-back remainder: the last element when M is odd. */
	testl	$1, M
	jle	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	ALIGN_3
520
.L999:
	/* Restore callee-saved registers (reverse push order) and return. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
529