/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

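/* SYMV kernel: y := alpha * A * x + y for a symmetric double-precision
 * matrix A, unrolled for SSE2 with an SSE3 (movddup/haddpd) path.  Each
 * panel of the stored triangle is used twice: once in the axpy direction
 * (yy += a * atemp) and once in the dot direction (xsum += a * xtemp),
 * so the triangle is read only once.
 *
 * A rough scalar sketch of the computation (an illustration, assuming
 * the upper-triangle variant; not part of the original source):
 *
 *     for (j = 0; j < m; j++) {
 *         double t = alpha * x[j];           // broadcast column factor
 *         double s = 0.0;                    // dot-direction accumulator
 *         for (i = 0; i < j; i++) {
 *             y[i] += t * a[i + j * lda];    // axpy direction
 *             s    += a[i + j * lda] * x[i]; // dot direction
 *         }
 *         y[j] += t * a[j + j * lda] + alpha * s;
 *     }
 *
 * alpha * x (and, when INCY != 1, y as well) is first packed into the
 * aligned BUFFER so the inner loops can use aligned movapd accesses. */
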
#define ASSEMBLER
#include "common.h"

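/* Per-microarchitecture prefetch tuning: the AMD cores use plain
   prefetch/prefetchw (write intent for the y stream), the others use
   prefetcht0; PREFETCHSIZE is the lookahead distance in elements. */
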
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

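/* Argument layout.  On the System V ABI the first six integer arguments
   (M, IS, A, LDA, X, INCX) arrive in registers and only Y, INCY and
   BUFFER are read from the caller's stack.  On the Windows x64 ABI only
   four arguments arrive in registers and alpha occupies a parameter
   slot (%xmm2), so LDA, X and INCX are reloaded from their stack home
   space in the prologue below. */
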
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define	IS	  ARG2
#define	A	  ARG3
#define LDA	  ARG4
#define	X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		 40 + STACKSIZE(%rsp)
#define OLD_X		 48 + STACKSIZE(%rsp)
#define OLD_INCX	 56 + STACKSIZE(%rsp)
#define OLD_Y		 64 + STACKSIZE(%rsp)
#define OLD_INCY	 72 + STACKSIZE(%rsp)
#define OLD_BUFFER	 80 + STACKSIZE(%rsp)

#define M	  ARG1
#define IS	  ARG2
#define	A	  ARG4
#define LDA	  ARG3
#define	X	  %rdi
#define INCX	  %rsi

#endif

#define	Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA  %xmm0

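/* XMM register roles: xtemp1/2 hold packed alpha*x, yy1/2 the y
   accumulators, atemp1-4 the four broadcast x values of the current
   panel, xsum1-4 the per-column dot products, a1-a3 matrix elements in
   flight, and xt1 a scratch product.  Note that ALPHA aliases xtemp1
   (%xmm0): alpha is fully consumed while packing x, before the main
   loops run. */
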
#define xtemp1 %xmm0
#define xtemp2 %xmm1
#define yy1    %xmm2
#define yy2    %xmm3

#define atemp1 %xmm4
#define atemp2 %xmm5
#define atemp3 %xmm6
#define atemp4 %xmm7

#define xsum1  %xmm8
#define xsum2  %xmm9
#define xsum3  %xmm10
#define xsum4  %xmm11

#define a1     %xmm12
#define a2     %xmm13
#define a3     %xmm14
#define	xt1    %xmm15

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	negq	IS
	addq	M, IS

	movq	IS,  TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	unpcklpd ALPHA, ALPHA

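	/* Pack alpha * x into the aligned buffer: eight elements per
	   iteration at .L01, remainder one at a time at .L03. */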
	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	addq	INCX, X

	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm4

	movapd	%xmm1, 0 * SIZE(XX)
	movapd	%xmm2, 2 * SIZE(XX)
	movapd	%xmm3, 4 * SIZE(XX)
	movapd	%xmm4, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulsd	ALPHA, %xmm1

	movlpd	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq   Y, NEW_Y

	addq   $512, XX
	andq   $-512, XX

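	/* Use y in place when it is contiguous; otherwise pack it, too,
	   into the buffer (NEW_Y), starting at the next 512-byte boundary
	   computed above. */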
	cmpq   $SIZE, INCY
	je    .L10

	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movsd	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

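	/* Main loop: four columns of the triangle per iteration (A1 and
	   A1+LDA, A2 = A1+2*LDA and A2+LDA).  Each element is used twice:
	   yy += a * atemp (axpy direction) and xsum += xtemp * a (dot
	   direction); the 4x4 diagonal block is finished at .L18. */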
.L10:
	movq	 IS, I
	addq	 $4, I
	cmpq	 M,  I
	jg	 .L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

#ifdef HAVE_SSE3
	movddup		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movddup		1 * SIZE(NEW_X, IS, SIZE), atemp2
	movddup		2 * SIZE(NEW_X, IS, SIZE), atemp3
	movddup		3 * SIZE(NEW_X, IS, SIZE), atemp4
#else
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movhpd		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd		1 * SIZE(NEW_X, IS, SIZE), atemp2
	movhpd		1 * SIZE(NEW_X, IS, SIZE), atemp2
	movsd		2 * SIZE(NEW_X, IS, SIZE), atemp3
	movhpd		2 * SIZE(NEW_X, IS, SIZE), atemp3
	movsd		3 * SIZE(NEW_X, IS, SIZE), atemp4
	movhpd		3 * SIZE(NEW_X, IS, SIZE), atemp4
#endif

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2
	pxor		xsum3, xsum3
	pxor		xsum4, xsum4

	movapd	 0 * SIZE(NEW_X), xtemp1
	movapd	 2 * SIZE(NEW_X), xtemp2

	movsd	 0 * SIZE(A1), a1
	movhpd	 1 * SIZE(A1), a1
	movsd	 2 * SIZE(A1), a2
	movhpd	 3 * SIZE(A1), a2
	movsd	 0 * SIZE(A1, LDA, 1), a3
	movhpd	 1 * SIZE(A1, LDA, 1), a3

	movsd	 0 * SIZE(NEW_Y), yy1
	movhpd	 1 * SIZE(NEW_Y), yy1
	movsd	 2 * SIZE(NEW_Y), yy2
	movhpd	 3 * SIZE(NEW_Y), yy2

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	sarq	$3,  I
	jle	.L15
	ALIGN_3

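	/* Rows eight at a time; the loads for the next step are
	   interleaved with the current multiply-adds to hide latency. */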
.L12:
	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 2 * SIZE(A1, LDA, 1), a1
	movhpd	 3 * SIZE(A1, LDA, 1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp1, a2
	addpd	 xt1,    xsum1
	addpd	 a2,     yy2
	movsd	 0 * SIZE(A2), a2
	movhpd	 1 * SIZE(A2), a2

	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy1
	movsd	 2 * SIZE(A2), a3
	movhpd	 3 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp2, a1
	addpd	 xt1,    xsum2
	addpd	 a1,     yy2
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy1
	movsd	 2 * SIZE(A2, LDA, 1), a2
	movhpd	 3 * SIZE(A2, LDA, 1), a2

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp3, a3
	addpd	 xt1,    xsum3
	addpd	 a3,     yy2
	movsd	 4 * SIZE(A1), a3
	movhpd	 5 * SIZE(A1), a3

	movapd	 xtemp1, xt1
	movapd	 4 * SIZE(XX), xtemp1
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy1
	movsd	 6 * SIZE(A1), a1
	movhpd	 7 * SIZE(A1), a1

	movapd	 xtemp2, xt1
	movapd	 6 * SIZE(XX), xtemp2
	mulpd	 a2,     xt1
	mulpd	 atemp4, a2
	addpd	 xt1,    xsum4
	addpd	 a2,     yy2
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhpd	 5 * SIZE(A1, LDA, 1), a2

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhpd	 5 * SIZE(YY), yy1

	movsd	 yy2, 2 * SIZE(YY)
	movhpd	 yy2, 3 * SIZE(YY)
	movsd	 6 * SIZE(YY), yy2
	movhpd	 7 * SIZE(YY), yy2

	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp1, a3
	addpd	 xt1,    xsum1
	addpd	 a3,     yy1
	movsd	 6 * SIZE(A1, LDA, 1), a3
	movhpd	 7 * SIZE(A1, LDA, 1), a3

	PREFETCH	PREFETCHSIZE(A2)

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy2
	movsd	 4 * SIZE(A2), a1
	movhpd	 5 * SIZE(A2), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp2, a2
	addpd	 xt1,    xsum2
	addpd	 a2,     yy1
	movsd	 6 * SIZE(A2), a2
	movhpd	 7 * SIZE(A2), a2

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy2
	movsd	 4 * SIZE(A2, LDA, 1), a3
	movhpd	 5 * SIZE(A2, LDA, 1), a3

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp3, a1
	addpd	 xt1,    xsum3
	addpd	 a1,     yy1
	movsd	 6 * SIZE(A2, LDA, 1), a1
	movhpd	 7 * SIZE(A2, LDA, 1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy2
	movsd	10 * SIZE(A1), a2
	movhpd	11 * SIZE(A1), a2

	movapd	 xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	 a3,     xt1
	mulpd	 atemp4, a3
	addpd	 xt1,    xsum4
	addpd	 a3,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a3
	movhpd	 9 * SIZE(A1, LDA, 1), a3

	movapd	 xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy2
	movsd	 8 * SIZE(A1), a1
	movhpd	 9 * SIZE(A1), a1

	movsd	 yy1, 4 * SIZE(YY)
	movhpd	 yy1, 5 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhpd	 9 * SIZE(YY), yy1

	movsd	 yy2, 6 * SIZE(YY)
	movhpd	 yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

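	/* Four leftover rows of the panel (IS mod 8 >= 4). */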
.L15:
	testq	$4, IS
	jle	.L18

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 2 * SIZE(A1, LDA, 1), a1
	movhpd	 3 * SIZE(A1, LDA, 1), a1

	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp1, a2
	addpd	 xt1,    xsum1
	addpd	 a2,     yy2
	movsd	 0 * SIZE(A2), a2
	movhpd	 1 * SIZE(A2), a2

	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy1
	movsd	 2 * SIZE(A2), a3
	movhpd	 3 * SIZE(A2), a3

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp2, a1
	addpd	 xt1,    xsum2
	addpd	 a1,     yy2
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy1
	movsd	 2 * SIZE(A2, LDA, 1), a2
	movhpd	 3 * SIZE(A2, LDA, 1), a2

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp3, a3
	addpd	 xt1,    xsum3
	addpd	 a3,     yy2

	movapd	 xtemp1, xt1
	movapd	 4 * SIZE(XX), xtemp1
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy1

	movapd	 xtemp2, xt1
	movapd	 6 * SIZE(XX), xtemp2
	mulpd	 a2,     xt1
	mulpd	 atemp4, a2
	addpd	 xt1,    xsum4
	addpd	 a2,     yy2

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhpd	 5 * SIZE(YY), yy1

	movsd	 yy2, 2 * SIZE(YY)
	movhpd	 yy2, 3 * SIZE(YY)
	movsd	 6 * SIZE(YY), yy2
	movhpd	 7 * SIZE(YY), yy2

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

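	/* 4x4 diagonal block.  unpckhpd re-pairs the broadcast x values
	   into {x0,x1} and {x2,x3}, the block's contributions are folded
	   into xsum1-4, and the four dot products are horizontally summed
	   (haddpd with SSE3, an unpack/add sequence without) into the
	   last four y elements of the panel. */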
.L18:
	unpckhpd atemp2, atemp1
	unpckhpd atemp4, atemp3

	movsd	 0 * SIZE(A1), a1
	movhpd	 0 * SIZE(A1, LDA, 1), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum1

	movsd	 0 * SIZE(A1, LDA, 1), a1
	movhpd	 1 * SIZE(A1, LDA, 1), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum2

	movsd	 0 * SIZE(A2), a1
	movhpd	 1 * SIZE(A2), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum3

	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum4

	movsd	 0 * SIZE(A2), a1
	movhpd	 0 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp3, a1
	addpd	 a1, xsum1

	movsd	 1 * SIZE(A2), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp3, a1
	addpd	 a1, xsum2

	movsd	 2 * SIZE(A2), a1
	movhpd	 2 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp3, a1
	addpd	 a1, xsum3

	movsd	 2 * SIZE(A2, LDA, 1), a1
	movhpd	 3 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp3, a1
	addpd	 a1, xsum4

#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	movapd	xsum3, atemp3

	unpcklpd xsum2, xsum1
	unpcklpd xsum4, xsum3

	unpckhpd xsum2, atemp1
	unpckhpd xsum4, atemp3

	addpd	 atemp1, xsum1
	addpd	 atemp3, xsum3
#else
	haddpd	 xsum2, xsum1
	haddpd	 xsum4, xsum3
#endif

	addpd	 xsum1, yy1
	addpd	 xsum3, yy2

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 yy2, 2 * SIZE(YY)
	movhpd	 yy2, 3 * SIZE(YY)

	addq	 $4, IS

	movq	 IS, I
	addq	 $4, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

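	/* Two remaining columns when M mod 4 >= 2: the same
	   two-direction scheme at half width. */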
.L20:
	testq	$2, M
	je	.L30
	ALIGN_3

.L21:
	movq	A,  A1
	leaq	(A, LDA, 2), A

#ifdef HAVE_SSE3
	movddup		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movddup		1 * SIZE(NEW_X, IS, SIZE), atemp2
#else
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movhpd		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd		1 * SIZE(NEW_X, IS, SIZE), atemp2
	movhpd		1 * SIZE(NEW_X, IS, SIZE), atemp2
#endif

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movapd	 0 * SIZE(NEW_X), xtemp1

	movsd	 0 * SIZE(NEW_Y), yy1
	movhpd	 1 * SIZE(NEW_Y), yy1

	movsd	 0 * SIZE(A1), a1
	movhpd	 1 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhpd	 1 * SIZE(A1, LDA, 1), a2

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	sarq	$1,  I
	jle	.L28
	ALIGN_3

.L22:
	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 2 * SIZE(A1), a1
	movhpd	 3 * SIZE(A1), a1

	movapd	 xtemp1, xt1
	movapd	 2 * SIZE(XX), xtemp1
	mulpd	 a2,     xt1
	mulpd	 atemp2, a2
	addpd	 xt1,    xsum2
	addpd	 a2,     yy1
	movsd	 2 * SIZE(A1, LDA, 1), a2
	movhpd	 3 * SIZE(A1, LDA, 1), a2

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 2 * SIZE(YY), yy1
	movhpd	 3 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1

	decq	 I
	jg	 .L22
	ALIGN_3

.L28:
	unpckhpd atemp2, atemp1

	movsd	 0 * SIZE(A1), a1
	movhpd	 0 * SIZE(A1, LDA, 1), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum1

	movsd	 0 * SIZE(A1, LDA, 1), a1
	movhpd	 1 * SIZE(A1, LDA, 1), a1
	mulpd	 atemp1, a1
	addpd	 a1, xsum2

#ifndef HAVE_SSE3
	movapd	xsum1, atemp1

	unpcklpd xsum2, xsum1
	unpckhpd xsum2, atemp1

	addpd	 atemp1, xsum1
#else
	haddpd	 xsum2, xsum1
#endif

	addpd	 xsum1, yy1

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)

	addq	 $2, IS
	ALIGN_3

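	/* Last single column when M is odd. */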
.L30:
	testq	$1, M
	je	.L990
	ALIGN_3

.L31:
	movq	A,  A1

#ifdef HAVE_SSE3
	movddup		0 * SIZE(NEW_X, IS, SIZE), atemp1
#else
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1
	movhpd		0 * SIZE(NEW_X, IS, SIZE), atemp1
#endif

	pxor		xsum1, xsum1

	movsd	 0 * SIZE(NEW_X), xtemp1
	movsd	 0 * SIZE(NEW_Y), yy1
	movsd	 0 * SIZE(A1), a1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	testq	I,  I
	jle	.L38
	ALIGN_3

.L32:
	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 1 * SIZE(A1), a1

	movsd	 1 * SIZE(XX), xtemp1

	movsd	 yy1, 0 * SIZE(YY)
	movsd	 1 * SIZE(YY), yy1

	addq	 $1 * SIZE, XX
	addq	 $1 * SIZE, YY
	addq	 $1 * SIZE, A1

	decq	 I
	jg	 .L32
	ALIGN_3

.L38:
	movsd	 0 * SIZE(A1), a1
	mulsd	 atemp1, a1
	addsd	 a1, xsum1

	addsd	 xsum1, yy1

	movsd	 yy1, 0 * SIZE(YY)
	ALIGN_3

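	/* If y was strided, scatter the contiguous result in NEW_Y back
	   to the caller's y. */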
.L990:
	cmpq   $SIZE, INCY
	je    .L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	 0 * SIZE(NEW_Y), %xmm0
	movapd	 2 * SIZE(NEW_Y), %xmm1
	movapd	 4 * SIZE(NEW_Y), %xmm2
	movapd	 6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movsd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

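	/* Restore the callee-saved registers (and, on Windows, the
	   non-volatile xmm registers) and return. */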
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE