1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* Per-microarchitecture prefetch tuning.
 * PREFETCH     : instruction used to prefetch the A-column data read in the
 *                inner loops.
 * PREFETCHW    : write-intent prefetch variant; defined for symmetry with
 *                sibling kernels but not referenced anywhere in this file.
 * PREFETCHSIZE : lookahead distance in elements (multiplied by SIZE at the
 *                use sites), tuned per CPU family.                          */
#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 3)
/* On Opteron every movsd below is assembled as movlps (same 64-bit load
 * into the low half of the register; presumably avoids an SSE2 movsd
 * penalty on this core — confirm against the original tuning notes).     */
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetch
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

/* NANO defines no PREFETCHW; harmless since PREFETCHW is unused here. */
#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 4)
#endif

/* Four callee-saved registers (ebp/edi/esi/ebx) are pushed in the
 * prologue, so caller stack arguments sit STACKSIZE = 16 bytes further
 * from %esp than at function entry.                                      */
#define STACKSIZE	16

/* Stack-argument offsets (cdecl, relative to %esp after the pushes).
 * NOTE(review): offset 12 is skipped and ALPHA is a 1-double at 16 —
 * presumably a dummy/padding argument precedes alpha; confirm against
 * the caller's prototype.                                                */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA		16 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

/* Register roles for the routine below. */
#define I	%eax
#define J	%ebx

#define INCX	%ecx
/* INCY aliases J: J (column counter) is dead by the time the final
 * buffer-to-Y copy loop needs INCY.                                      */
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp
84
	PROLOGUE

/* Double-precision GEMV-"N" style kernel (32-bit x86, SSE2):
 *
 *     y(1:m) += alpha * A(1:m, 1:n) * x(1:n)      (strided x and y)
 *
 * Strategy: alpha*A*x is accumulated into the contiguous, zero-initialized
 * scratch BUFFER (16-byte aligned, so movapd is safe on it), then BUFFER is
 * added into the strided output vector Y in a final pass.  Columns of A are
 * processed two at a time; each column pass walks 8 rows per unrolled
 * iteration with software pipelining (next iteration's loads are issued
 * between this iteration's multiply/adds).
 */

	pushl	%ebp			/* save cdecl callee-saved registers */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	leal	(,INCX, SIZE), INCX	/* scale element strides to bytes */
	leal	(,LDA,  SIZE), LDA

	/* Bias A by +16*SIZE so the inner loops can address rows with
	 * negative displacements (-16*SIZE .. -1); Y1 is biased the same
	 * way before each pass. */
	subl	$-16 * SIZE, A

	cmpl	$0, N			/* nothing to do if m <= 0 or n <= 0 */
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	pxor	%xmm7, %xmm7		/* xmm7 = {0.0, 0.0} */

	/* Zero the buffer: (M/16 + 1) * 16 doubles, i.e. rounded up past M
	 * so the unrolled accumulation loops may safely overrun. */
	movl	M,   %eax
	addl	$16, %eax
	sarl	$4,  %eax
	ALIGN_3

.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1		/* Y1 += 16 elements */
	decl	%eax
	jg	.L01
	ALIGN_3

/* ---- main column loop: two columns of A per iteration ---- */
.L10:
	movl	N,  J
	sarl	$1, J			/* J = n / 2 column pairs */
	jle	.L20
	ALIGN_3

.L11:

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1		/* re-bias buffer pointer (matches A bias) */

	movl	A,  A1			/* A1 = current column, A1+LDA = next */
	leal	(A1,  LDA, 2), %eax
	movl	%eax, A			/* advance A by two columns for next pass */

	/* xmm6 = {alpha*x[j], alpha*x[j]},  xmm7 = {alpha*x[j+1], alpha*x[j+1]} */
#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X
	movddup	(X), %xmm7
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm7
#else
	movsd	(X), %xmm6
	addl	INCX, X
	movsd	(X), %xmm7
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	mulsd	%xmm0, %xmm7

	unpcklpd %xmm6, %xmm6		/* broadcast low double to both lanes */
	unpcklpd %xmm7, %xmm7
#endif

	ALIGN_3

	movl	M,   I
	sarl	$3,  I			/* I = m / 8 unrolled row iterations */
	jle	.L15

	/* Pipeline prologue: preload first 4 rows of both columns
	 * (A may be unaligned, hence movsd/movhpd pairs) and the first
	 * 4 buffer elements (aligned, movapd). */
	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm3
	movhpd	 -13 * SIZE(A1), %xmm3

	movapd	 -16 * SIZE(Y1), %xmm0
	movapd	 -14 * SIZE(Y1), %xmm1

	movsd	 -16 * SIZE(A1, LDA), %xmm4
	movhpd	 -15 * SIZE(A1, LDA), %xmm4
	movsd	 -14 * SIZE(A1, LDA), %xmm5
	movhpd	 -13 * SIZE(A1, LDA), %xmm5

	decl	 I
	jle	 .L14			/* only one unrolled iteration: go drain */
	ALIGN_3

/* Steady state: buffer[i..i+7] += col_j[i..i+7]*xmm6 + col_j1[i..i+7]*xmm7,
 * with the loads for the following 8 rows interleaved. */
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	 -10 * SIZE(A1), %xmm3
	movhpd	  -9 * SIZE(A1), %xmm3

	mulpd	 %xmm7, %xmm4
	addpd	 %xmm4, %xmm0
	movsd	 -12 * SIZE(A1, LDA), %xmm4
	movhpd	 -11 * SIZE(A1, LDA), %xmm4

	movapd	 %xmm0,  -16 * SIZE(Y1)	/* store rows 0-1, load rows 4-5 */
	movapd	 -12 * SIZE(Y1), %xmm0

	mulpd	 %xmm7, %xmm5
	addpd	 %xmm5, %xmm1
	movsd	 -10 * SIZE(A1, LDA), %xmm5
	movhpd	  -9 * SIZE(A1, LDA), %xmm5

	movapd	 %xmm1,  -14 * SIZE(Y1)	/* store rows 2-3, load rows 6-7 */
	movapd	 -10 * SIZE(Y1), %xmm1

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2	/* preload for next loop iteration */
	movhpd	  -7 * SIZE(A1), %xmm2
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	  -6 * SIZE(A1), %xmm3
	movhpd	  -5 * SIZE(A1), %xmm3

	mulpd	 %xmm7, %xmm4
	addpd	 %xmm4, %xmm0
	movsd	  -8 * SIZE(A1, LDA), %xmm4
	movhpd	  -7 * SIZE(A1, LDA), %xmm4

	movapd	 %xmm0,  -12 * SIZE(Y1)
	movapd	  -8 * SIZE(Y1), %xmm0

	mulpd	 %xmm7, %xmm5
	addpd	 %xmm5, %xmm1
	movsd	  -6 * SIZE(A1, LDA), %xmm5
	movhpd	  -5 * SIZE(A1, LDA), %xmm5

	movapd	 %xmm1,  -10 * SIZE(Y1)
	movapd	  -6 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1		/* advance 8 rows */
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

/* Pipeline drain: finish the last 8 preloaded rows (no further preloads
 * beyond rows 4-7 of this group). */
.L14:
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	 -10 * SIZE(A1), %xmm3
	movhpd	  -9 * SIZE(A1), %xmm3

	mulpd	 %xmm7, %xmm4
	addpd	 %xmm4, %xmm0
	movsd	 -12 * SIZE(A1, LDA), %xmm4
	movhpd	 -11 * SIZE(A1, LDA), %xmm4

	movapd	 %xmm0,  -16 * SIZE(Y1)
	movapd	 -12 * SIZE(Y1), %xmm0

	mulpd	 %xmm7, %xmm5
	addpd	 %xmm5, %xmm1
	movsd	 -10 * SIZE(A1, LDA), %xmm5
	movhpd	  -9 * SIZE(A1, LDA), %xmm5

	movapd	 %xmm1,  -14 * SIZE(Y1)
	movapd	 -10 * SIZE(Y1), %xmm1

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1

	mulpd	 %xmm7, %xmm4
	addpd	 %xmm4, %xmm0
	movapd	 %xmm0,  -12 * SIZE(Y1)
	mulpd	 %xmm7, %xmm5
	addpd	 %xmm5, %xmm1
	movapd	 %xmm1,  -10 * SIZE(Y1)

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

/* ---- row remainders for the two-column pass ---- */
.L15:
	testl	$4, M			/* 4 leftover rows? */
	je	.L16

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm3
	movhpd	 -13 * SIZE(A1), %xmm3

	movapd	 -16 * SIZE(Y1), %xmm0
	movapd	 -14 * SIZE(Y1), %xmm1

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1

	movsd	 -16 * SIZE(A1, LDA), %xmm4
	movhpd	 -15 * SIZE(A1, LDA), %xmm4
	movsd	 -14 * SIZE(A1, LDA), %xmm5
	movhpd	 -13 * SIZE(A1, LDA), %xmm5

	mulpd	 %xmm7, %xmm4
	addpd	 %xmm4, %xmm0
	mulpd	 %xmm7, %xmm5
	addpd	 %xmm5, %xmm1

	movapd	 %xmm0,  -16 * SIZE(Y1)
	movapd	 %xmm1,  -14 * SIZE(Y1)

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

.L16:
	testl	$2, M			/* 2 leftover rows? */
	je	.L17

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -16 * SIZE(A1, LDA), %xmm3
	movhpd	 -15 * SIZE(A1, LDA), %xmm3

	movapd	 -16 * SIZE(Y1), %xmm0

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	mulpd	 %xmm7, %xmm3
	addpd	 %xmm3, %xmm0

	movapd	 %xmm0, -16 * SIZE(Y1)

	addl	 $2 * SIZE, A1
	addl	 $2 * SIZE, Y1
	ALIGN_3

.L17:
	testl	$1, M			/* 1 leftover row? (scalar tail) */
	je	.L19

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -16 * SIZE(A1, LDA), %xmm3

	movsd	 -16 * SIZE(Y1), %xmm0

	mulsd	 %xmm6, %xmm2
	addsd	 %xmm2, %xmm0
	mulsd	 %xmm7, %xmm3
	addsd	 %xmm3, %xmm0

	movsd	 %xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J			/* next column pair */
	jg	.L11
	ALIGN_4

/* ---- odd final column when n is odd: same structure, one column ---- */
.L20:
	testl	$1, N
	jle	.L990

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A,  A1

	/* xmm6 = {alpha*x[j], alpha*x[j]} */
#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
#else
	movsd	(X), %xmm6
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	unpcklpd %xmm6, %xmm6
#endif

	ALIGN_3

	movl	M,   I
	sarl	$3,  I			/* m / 8 unrolled iterations */
	jle	.L25

	movsd	 -16 * SIZE(A1), %xmm2	/* pipeline prologue */
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm3
	movhpd	 -13 * SIZE(A1), %xmm3

	movapd	 -16 * SIZE(Y1), %xmm0
	movapd	 -14 * SIZE(Y1), %xmm1
	decl	 I
	jle	 .L24
	ALIGN_3

/* Steady state: buffer[i..i+7] += col_j[i..i+7] * xmm6 */
.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2

	movapd	 %xmm0,  -16 * SIZE(Y1)
	movapd	 -12 * SIZE(Y1), %xmm0

	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	 -10 * SIZE(A1), %xmm3
	movhpd	  -9 * SIZE(A1), %xmm3

	movapd	 %xmm1,  -14 * SIZE(Y1)
	movapd	 -10 * SIZE(Y1), %xmm1

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2	/* preload for next iteration */
	movhpd	  -7 * SIZE(A1), %xmm2

	movapd	 %xmm0,  -12 * SIZE(Y1)
	movapd	  -8 * SIZE(Y1), %xmm0

	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	  -6 * SIZE(A1), %xmm3
	movhpd	  -5 * SIZE(A1), %xmm3

	movapd	 %xmm1,  -10 * SIZE(Y1)
	movapd	  -6 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1		/* advance 8 rows */
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L23
	ALIGN_3

/* Pipeline drain for the single-column loop. */
.L24:
	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	movhpd	 -11 * SIZE(A1), %xmm2
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movsd	 -10 * SIZE(A1), %xmm3
	movhpd	  -9 * SIZE(A1), %xmm3

	movapd	 %xmm0,  -16 * SIZE(Y1)
	movapd	 -12 * SIZE(Y1), %xmm0

	movapd	 %xmm1,  -14 * SIZE(Y1)
	movapd	 -10 * SIZE(Y1), %xmm1

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	movapd	 %xmm0,  -12 * SIZE(Y1)
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1
	movapd	 %xmm1,  -10 * SIZE(Y1)

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

/* ---- row remainders for the single-column pass ---- */
.L25:
	testl	$4, M
	je	.L26

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2
	movsd	 -14 * SIZE(A1), %xmm3
	movhpd	 -13 * SIZE(A1), %xmm3

	movapd	 -16 * SIZE(Y1), %xmm0
	movapd	 -14 * SIZE(Y1), %xmm1

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0
	mulpd	 %xmm6, %xmm3
	addpd	 %xmm3, %xmm1

	movapd	 %xmm0,  -16 * SIZE(Y1)
	movapd	 %xmm1,  -14 * SIZE(Y1)

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

.L26:
	testl	$2, M
	je	.L27

	movsd	 -16 * SIZE(A1), %xmm2
	movhpd	 -15 * SIZE(A1), %xmm2

	movapd	 -16 * SIZE(Y1), %xmm0

	mulpd	 %xmm6, %xmm2
	addpd	 %xmm2, %xmm0

	movapd	 %xmm0, -16 * SIZE(Y1)

	addl	 $2 * SIZE, A1
	addl	 $2 * SIZE, Y1
	ALIGN_3

.L27:
	testl	$1, M
	je	.L990

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -16 * SIZE(Y1), %xmm0

	mulsd	 %xmm6, %xmm2
	addsd	 %xmm2, %xmm0

	movsd	 %xmm0, -16 * SIZE(Y1)
	ALIGN_3

/* ---- final pass: y[0:m:incy] += buffer[0:m] ---- */
.L990:
	movl	Y,   Y1
	movl	BUFFER, X		/* X now walks the contiguous buffer */

	movl	STACK_INCY, INCY	/* INCY reuses J; column loops are done */
	sall	$BASE_SHIFT, INCY	/* scale incy to bytes */

	movl	M,   %eax
	sarl	$3,  %eax		/* 8 elements per iteration */
	jle	.L994
	ALIGN_3

.L992:
	/* Pack two strided y elements into one register, add the aligned
	 * buffer pair, store back strided.  Repeated 4x = 8 elements. */
	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	4 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	6 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

/* Remainder of the copy-back: 4, then 2, then 1 element. */
.L994:
	testl	$7, M
	jle	.L999

	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$2 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

	movsd	(Y1), %xmm0

	movsd	0 * SIZE(X), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

.L999:
	popl	%ebx			/* restore callee-saved registers */
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
670