/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

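/* Single-precision symmetric matrix-vector product: y += alpha*A*x,  */
/* referencing only the stored triangle of A.  The strided x and y    */
/* vectors are first packed into a contiguous, aligned work buffer.   */

/* Prefetch flavor and lookahead distance, tuned per microarchitecture. */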
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

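/* Argument registers and stack offsets differ between the System V   */
/* and Windows x64 ABIs.                                              */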
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		 40 + STACKSIZE(%rsp)
#define OLD_X		 48 + STACKSIZE(%rsp)
#define OLD_INCX	 56 + STACKSIZE(%rsp)
#define OLD_Y		 64 + STACKSIZE(%rsp)
#define OLD_INCY	 72 + STACKSIZE(%rsp)
#define OLD_BUFFER	 80 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG4
#define LDA	  ARG3
#define X	  %rdi
#define INCX	  %rsi
#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA  %xmm0

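/* Register roles in the blocked kernel below: atemp1-4 hold          */
/* broadcast x values for the current columns, xsum1-4 accumulate     */
/* the transposed (mirrored-triangle) products, and yy1 carries the   */
/* in-flight update of y.                                             */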
#define atemp1 %xmm0
#define atemp2 %xmm1
#define atemp3 %xmm2
#define atemp4 %xmm3

#define xsum1  %xmm4
#define xsum2  %xmm5
#define xsum3  %xmm6
#define xsum4  %xmm7

#define xtemp1 %xmm8
#define xtemp2 %xmm9
#define yy1    %xmm10
#define xt1    %xmm11

#define a1     %xmm12
#define a2     %xmm13
#define a3     %xmm14
#define a4     %xmm15

	PROLOGUE
	PROFCODE

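	/* Save callee-saved registers (plus %rdi, %rsi and %xmm6-%xmm15 on Windows). */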
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	movaps	%xmm2, %xmm0	# alpha arrives in %xmm2 on Windows
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX	# convert strides from elements to bytes
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999			# nothing to do if m <= 0

	shufps	$0, ALPHA, ALPHA	# broadcast alpha to all four lanes

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

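	/* Pack x into the buffer with its stride removed, scaled by alpha, */
	/* eight elements per iteration. */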
.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

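	/* Remaining M % 8 elements of x. */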
.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* The original x is no longer needed, so its register can be reused. */
	movq   Y, NEW_Y

	/* Round the buffer cursor up to a 512-byte boundary for the y copy. */
	addq   $512, XX
	andq   $-512, XX

	/* If y is contiguous, update it in place; otherwise pack it as well. */
	cmpq   $SIZE, INCY
	je     .L10

	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

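	/* Gather y (stride INCY) into the aligned buffer, eight per pass. */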
.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

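	/* Main loop: walk the diagonal in blocks of four columns. */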
.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

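	/* A1/A2 address the current four columns.  The 4x4 diagonal block  */
	/* is assembled from scalar loads so that only the stored triangle  */
	/* is read; atemp1-4 get the broadcast values x[is] ... x[is+3].    */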
.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	        (NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movaps		0 * SIZE(XX), atemp4

	movsd	 0 * SIZE(A1), xsum1
	movhps	 2 * SIZE(A1), xsum1
	mulps	 atemp4, xsum1

	movss	 1 * SIZE(A1), xsum2
	movss	 1 * SIZE(A1, LDA, 1), a2
	movss	 2 * SIZE(A1, LDA, 1), a3
	movss	 3 * SIZE(A1, LDA, 1), a4
	unpcklps a3, xsum2
	unpcklps a4, a2
	unpcklps a2, xsum2
	mulps	 atemp4, xsum2

	movss	 2 * SIZE(A1), xsum3
	movss	 2 * SIZE(A1, LDA, 1), a2
	movss	 2 * SIZE(A2), a3
	movss	 3 * SIZE(A2), a4
	unpcklps a3, xsum3
	unpcklps a4, a2
	unpcklps a2, xsum3
	mulps	 atemp4, xsum3

	movss	 3 * SIZE(A1), xsum4
	movss	 3 * SIZE(A1, LDA, 1), a2
	movss	 3 * SIZE(A2), a3
	movss	 3 * SIZE(A2, LDA, 1), a4
	unpcklps a3, xsum4
	unpcklps a4, a2
	unpcklps a2, xsum4
	mulps	 atemp4, xsum4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	movaps	 4 * SIZE(XX), xtemp1
	movaps	 8 * SIZE(XX), xtemp2

	movsd	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2

	movq	M,  I
	subq	IS, I
	subq	$4, I
	sarq	$4, I
	jle	.L14
	ALIGN_3

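	/* Unrolled kernel, 16 rows per iteration.  Each column feeds two   */
	/* updates: xsumN += column .* x (transposed part), and             */
	/* y += broadcast(x) * column (direct part).                        */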
.L12:
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	 xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	 yy1,  8 * SIZE(YY)
	movhps	 yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 12 * SIZE(YY)
	movhps	 yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	 $16 * SIZE, XX
	addq	 $16 * SIZE, YY
	addq	 $16 * SIZE, A1
	addq	 $16 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

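	/* Tail: handle the remaining 8, 4, 2, then 1 rows. */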
.L14:
	movq	M,  I
	subq	IS, I
	subq	$4, I
	testq	$8, I
	jle	.L15

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2
	ALIGN_3

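	/* Four remaining rows. */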
.L15:
	testq	$4, I
	jle	.L17

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movsd	 4 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

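	/* Two remaining rows: the upper halves of the column registers are */
	/* zeroed through xtemp2 so the packed operations contribute only   */
	/* two lanes. */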
.L17:
	testq	$2, M
	jle	.L18

	pxor	 xtemp2, xtemp2

	movlhps  xtemp2, a1
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movss	 2 * SIZE(A1), a1

	movlhps  xtemp2, a2
	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movss	 2 * SIZE(A1, LDA, 1), a2

	movlhps  xtemp2, a3
	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movss	 2 * SIZE(A2), a3

	movlhps  xtemp2, a4
	movaps	 xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movss	 2 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1
	addq	 $2 * SIZE, A2
	ALIGN_3

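	/* Final single row: all four columns update the same y element. */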
.L18:
	testq	$1, M
	jle	.L19

	movss	 0 * SIZE(XX), xtemp1

	movss	 0 * SIZE(YY), yy1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	movaps	 xtemp1, xt1
	mulss	 a1,     xt1
	mulss	 atemp1, a1
	addss	 xt1,    xsum1
	addss	 a1,     yy1

	movaps	 xtemp1, xt1
	mulss	 a2,     xt1
	mulss	 atemp2, a2
	addss	 xt1,    xsum2
	addss	 a2,     yy1

	movaps	 xtemp1, xt1
	mulss	 a3,     xt1
	mulss	 atemp3, a3
	addss	 xt1,    xsum3
	addss	 a3,     yy1

	movaps	 xtemp1, xt1
	mulss	 a4,     xt1
	mulss	 atemp4, a4
	addss	 xt1,    xsum4
	addss	 a4,     yy1

	movss	 yy1, 0 * SIZE(YY)
	ALIGN_3

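	/* Reduce the four column sums horizontally and add the result into */
	/* y[is .. is+3]. */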
.L19:
#ifndef HAVE_SSE3
	movaps	 xsum1,  xtemp1
	unpcklps xsum3,  xsum1
	unpckhps xsum3,  xtemp1

	movaps	 xsum2,  xtemp2
	unpcklps xsum4,  xsum2
	unpckhps xsum4,  xtemp2

	movaps	 xsum1,  xsum3
	unpcklps xsum2,  xsum1
	unpckhps xsum2,  xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	 xsum3,  xsum1
	addps	 xtemp1, xsum4
	addps	 xsum4,  xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
#endif

	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	 xsum1, yy1

	movsd	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhps	 yy1, 2 * SIZE(NEW_Y, IS, SIZE)

	addq	 $4, IS

	movq	 IS, I
	addq	 $4, I
	cmpq	 N, I
	jle	 .L11
	ALIGN_3

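	/* Two remaining columns (N & 2). */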
.L20:
	testq	$2, N
	jle	.L30

	movq	A,  A1
	leaq	2 * SIZE(A, LDA, 2), A

	movaps	 0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	/* movsd is aliased to movlps above, which leaves the upper half   */
	/* of the register unchanged, so clear it explicitly. */
	pxor	xsum1, xsum1
#endif
	movsd	 0 * SIZE(A1), xsum1
	mulps	 atemp4, xsum1

	movss	 1 * SIZE(A1), xsum2
	movss	 1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	 atemp4, xsum2

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	movss	 2 * SIZE(A1), a1
	movss	 2 * SIZE(A1, LDA, 1), a2
	movss	 2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	 xtemp1, xt1
	mulss	 a1,     xt1
	mulss	 atemp1, a1
	addss	 xt1,    xsum1
	addss	 a1,     yy1

	movaps	 xtemp1, xt1
	mulss	 a2,     xt1
	mulss	 atemp2, a2
	addss	 xt1,    xsum2
	addss	 a2,     yy1

	movss	 yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2
	addps	 xsum2, xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum1, xsum1
#endif

	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	 $2, IS
	ALIGN_3

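	/* Last single column (N & 1): scalar diagonal update. */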
.L30:
	testq	$1, N
	jle	.L990

	movss	 0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	 0 * SIZE(A), xsum1
	addss	 0 * SIZE(NEW_Y, IS, SIZE), xsum1
	movss	 xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

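	/* If y is strided, scatter the buffered result back into y. */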
.L990:
	cmpq   $SIZE, INCY
	je     .L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	 0 * SIZE(NEW_Y), %xmm0
	movss	 1 * SIZE(NEW_Y), %xmm1
	movss	 2 * SIZE(NEW_Y), %xmm2
	movss	 3 * SIZE(NEW_Y), %xmm3
	movss	 4 * SIZE(NEW_Y), %xmm4
	movss	 5 * SIZE(NEW_Y), %xmm5
	movss	 6 * SIZE(NEW_Y), %xmm6
	movss	 7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

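	/* Restore callee-saved registers and return. */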
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE