/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
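
/* SSE kernel for single-precision symmetric matrix-vector multiply
   (SYMV): y += alpha * A * x.  x is staged into the work buffer
   pre-scaled by alpha (and y is staged too when it is strided);
   columns of A are then processed four at a time, with each column
   feeding both a running dot product (the symmetric row contribution)
   and an in-place update of y, so each column is read only once. */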

#define ASSEMBLER
#include "common.h"

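/* Per-microarchitecture prefetch instruction and look-ahead distance
   (in bytes).  On Opteron, movsd is also remapped to movlps,
   presumably because movlps is cheaper on that core. */
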
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

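/* Argument layout.  Under the System V ABI the six integer/pointer
   arguments (M, IS, A, LDA, X, INCX) arrive in registers and alpha in
   xmm0; Y, INCY and the work buffer live on the stack.  Under the
   Windows ABI only four arguments arrive in registers and alpha is
   passed in xmm2, so LDA, X, INCX, Y, INCY and the buffer are all
   read from the stack. */
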
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define	IS	  ARG2
#define	A	  ARG3
#define LDA	  ARG4
#define	X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		 40 + STACKSIZE(%rsp)
#define OLD_X		 48 + STACKSIZE(%rsp)
#define OLD_INCX	 56 + STACKSIZE(%rsp)
#define OLD_Y		 64 + STACKSIZE(%rsp)
#define OLD_INCY	 72 + STACKSIZE(%rsp)
#define OLD_BUFFER	 80 + STACKSIZE(%rsp)

#define M	  ARG1
#define IS	  ARG2
#define	A	  ARG4
#define LDA	  ARG3
#define	X	  %rdi
#define INCX	  %rsi

#endif

#define	Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define NEW_X	BUFFER
#define NEW_Y	X
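/* NEW_X lives in the work buffer; NEW_Y reuses X's register, which is
   free once x has been copied into the buffer. */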

#define ALPHA  %xmm0

#define atemp1 %xmm0
#define atemp2 %xmm1
#define atemp3 %xmm2
#define atemp4 %xmm3

#define xsum1  %xmm4
#define xsum2  %xmm5
#define xsum3  %xmm6
#define xsum4  %xmm7

#define xtemp1 %xmm8
#define xtemp2 %xmm9
#define yy1    %xmm10
#define	xt1    %xmm11

#define a1     %xmm12
#define a2     %xmm13
#define a3     %xmm14
#define a4     %xmm15

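/* Save callee-saved registers; the Windows ABI additionally requires
   rdi, rsi and xmm6-xmm15 to be preserved. */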
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	negq	IS
	addq	M, IS

	movq	IS,  TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	shufps	$0, ALPHA, ALPHA

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

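/* Copy x into the buffer, scaled by alpha: eight elements per
   iteration, then the remainder one at a time. */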
.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq   Y, NEW_Y

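/* Round the buffer pointer up to the next 512-byte boundary. */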
	addq   $512, XX
	andq   $-512, XX

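/* If y is strided, gather it into the aligned buffer as well, so the
   main loops can use contiguous loads and stores. */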
	cmpq   $SIZE, INCY
	je    .L10

	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

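/* Main loop: four columns of A per outer iteration.  atemp1..atemp4
   broadcast the block's four x values; xsum1..xsum4 accumulate the
   dot products of the columns with x (the row contribution, by
   symmetry), while the column contributions are streamed into y. */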
.L10:
	movq	 IS, I
	addq	 $4, I
	cmpq	 M,  I
	jg	 .L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	movaps		0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2
	pxor		xsum3, xsum3
	pxor		xsum4, xsum4

	movaps	 0 * SIZE(NEW_X), xtemp1
	movaps	 4 * SIZE(NEW_X), xtemp2

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhps	 2 * SIZE(A1, LDA, 1), a2
	movsd	 0 * SIZE(A2), a3
	movhps	 2 * SIZE(A2), a3
	movsd	 0 * SIZE(A2, LDA, 1), a4
	movhps	 2 * SIZE(A2, LDA, 1), a4

	movsd	 0 * SIZE(NEW_Y), yy1
	movhps	 2 * SIZE(NEW_Y), yy1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	sarq	$4,  I
	jle	.L14
	ALIGN_3

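/* Each four-element step below follows the same pattern: xt1 = x *
   a_col feeds the running dot product xsumN, while a_col scaled by
   the block's broadcast x element (atempN) is added into the y
   stream; the next chunk of the column is loaded in the same group
   to hide latency. */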
.L12:
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	 xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	 yy1,  8 * SIZE(YY)
	movhps	 yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 12 * SIZE(YY)
	movhps	 yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	 $16 * SIZE, XX
	addq	 $16 * SIZE, YY
	addq	 $16 * SIZE, A1
	addq	 $16 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

.L14:
	testq	$8, IS
	jle	.L15

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2
	ALIGN_3

.L15:
	testq	$4, IS
	jle	.L18

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1

	movaps	 xtemp1, xt1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

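/* 4x4 diagonal block: assemble the four columns from the stored
   triangle, fold them into the partial sums, then reduce xsum1..4
   horizontally and add the result to y. */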
.L18:
	movaps		0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	unpcklps a3, a1
	unpcklps a4, a2
	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	movsd	 0 * SIZE(A1, LDA, 1), a1
	movss	 1 * SIZE(A2), a2
	movhps	 1 * SIZE(A2, LDA, 1), a2

	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum2

	movsd	 0 * SIZE(A2), a1
	movss	 2 * SIZE(A2), a2
	movhps	 2 * SIZE(A2, LDA, 1), a2

	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum3

	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhps	 2 * SIZE(A2, LDA, 1), a1

	mulps	 atemp1, a1
	addps	 a1, xsum4

#ifndef HAVE_SSE3
	movaps	 xsum1,  xtemp1
	unpcklps xsum3,  xsum1
	unpckhps xsum3,  xtemp1

	movaps	 xsum2,  xtemp2
	unpcklps xsum4,  xsum2
	unpckhps xsum4,  xtemp2

	movaps	 xsum1,  xsum3
	unpcklps xsum2,  xsum1
	unpckhps xsum2,  xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	 xsum3,  xsum1
	addps	 xtemp1, xsum4
	addps	 xsum4,  xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
#endif

	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $4, IS

	movq	 IS, I
	addq	 $4, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

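/* Tail: two remaining columns (bit 1 of M set), same scheme with a
   2-wide block. */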
.L20:
	testq	$2, M
	jle	.L30

	movq	A,  A1
	leaq	(A, LDA, 2), A

	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movaps	 0 * SIZE(NEW_X), xtemp1

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhps	 2 * SIZE(A1, LDA, 1), a2

	movsd	 0 * SIZE(NEW_Y), yy1
	movhps	 2 * SIZE(NEW_Y), yy1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	sarq	$2,  I
	jle	.L28
	ALIGN_3

.L22:
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	movaps	 4 * SIZE(XX), xtemp1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1

	decq	 I
	jg	 .L22
	ALIGN_3

.L28:
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2

	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	movsd	 0 * SIZE(A1, LDA, 1), a1
	mulps	 atemp1, a1
	addps	 a1, xsum2

#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum1, xsum1
#endif

	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)

	addq	 $2, IS
	ALIGN_3

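/* Tail: final single column when M is odd. */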
.L30:
	testq	$1, M
	jle	.L990

	movq	A,  A1

	movss		0 * SIZE(NEW_X, IS, SIZE), atemp1

	pshufd	$0x00, atemp1, atemp1

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movss	 0 * SIZE(NEW_Y), yy1

	movss	 0 * SIZE(NEW_X), xtemp1
	movss	 1 * SIZE(NEW_X), xtemp2

	movss	 0 * SIZE(A1), a1
	movss	 1 * SIZE(A1), a2

	movq		NEW_X, XX
	movq		NEW_Y, YY

	movq	IS,  I
	sarq	$1,  I
	jle	.L38
	ALIGN_3

.L32:
	movaps	 xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movss	 2 * SIZE(A1), a1

	movss	 yy1, 0 * SIZE(YY)
	movss	 1 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	movss	 3 * SIZE(XX), xtemp2
	mulps	 a2,     xt1
	mulps	 atemp1, a2
	addps	 xt1,    xsum1
	addps	 a2,     yy1
	movss	 3 * SIZE(A1), a2

	movss	 yy1, 1 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1

	decq	 I
	jg	 .L32
	ALIGN_3

.L38:
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	mulss	 atemp1, a1
	addss	 a1, xsum1

#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	addss	 xsum2, xsum1
#endif

	addss	 xsum1, yy1

	movss	 yy1, 0 * SIZE(YY)

	addq	 $2, IS
	ALIGN_3

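/* If y was strided, scatter the buffered result back to the real y:
   eight elements per iteration, then the remainder. */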
.L990:
	cmpq   $SIZE, INCY
	je    .L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	 0 * SIZE(NEW_Y), %xmm0
	movss	 1 * SIZE(NEW_Y), %xmm1
	movss	 2 * SIZE(NEW_Y), %xmm2
	movss	 3 * SIZE(NEW_Y), %xmm3
	movss	 4 * SIZE(NEW_Y), %xmm4
	movss	 5 * SIZE(NEW_Y), %xmm5
	movss	 6 * SIZE(NEW_Y), %xmm6
	movss	 7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

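/* Restore callee-saved registers (plus rdi, rsi and xmm6-xmm15 on
   Windows) and return. */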
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE