/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define	YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"

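/* Double-precision AXPY kernel (SSE2): for each of the M elements,
 * Y := ALPHA * X + Y, stepping X by INCX and Y by INCY elements.
 * Arguments arrive through the ARG* macros from common.h (System V or
 * Windows x64 calling convention); alpha arrives in %xmm0 (%xmm3 on
 * Windows) and is broadcast into both lanes of ALPHA (%xmm15).
 *
 * Reference scalar behaviour (a sketch of what the kernel computes):
 *
 *   for (i = 0; i < m; i++)
 *       y[i * incy] += alpha * x[i * incx];
 */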
	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	 8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	movaps	%xmm0,  ALPHA
#else
	movaps	%xmm3,  ALPHA

	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

	unpcklpd ALPHA, ALPHA

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L47

	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

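/* Unit-stride path: both increments are one element.  If Y is not
 * 16-byte aligned, peel one element with scalar code so that the
 * remaining accesses to Y are aligned. */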
	testq	$SIZE, Y
	je	.L10

	movsd	(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4

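/* Bias X and Y upward by 16 elements; the loops below then address
 * their operands with small positive and negative displacements that
 * fit in a signed byte, keeping the instruction encodings short.
 * If X is not 16-byte aligned while Y now is, take the misaligned-X
 * path at .L20. */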
.L10:
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	testq	$SIZE, X
	jne	.L20

	movq	M,  %rax
	sarq	$4, %rax
	jle	.L13

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	decq	%rax
	jle .L12
	ALIGN_3

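/* Main aligned loop: 16 doubles per iteration.  X and Y are both
 * 16-byte aligned here, so every load and store uses movaps; the X
 * operands for the next group are loaded ahead of the Y updates, and
 * PREFETCH/PREFETCHW hints (tuned per target CPU) pull in the
 * upcoming cache lines. */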
.L11:
	movaps	 -8 * SIZE(X), %xmm4
	movaps	 -6 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	 -4 * SIZE(X), %xmm6
	movaps	 -2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	 0 * SIZE(X), %xmm0
	movaps	 2 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	 4 * SIZE(X), %xmm2
	movaps	 6 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	 -2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

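/* .L12 drains the final unrolled iteration whose X operands were
 * preloaded above (the loop is software pipelined by one stage). */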
.L12:
	movaps	 -8 * SIZE(X), %xmm4
	movaps	 -6 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	 -4 * SIZE(X), %xmm6
	movaps	 -2 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5,  -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6,  -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	 -2 * SIZE(Y), %xmm7
	movaps	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

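/* Tail of the aligned path: .L13, .L14, .L15 and .L16 handle the
 * remaining 8, 4, 2 and 1 elements, then .L19 returns 0. */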
.L13:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L14
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L15
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L16
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

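/* .L20: X is 8-byte but not 16-byte aligned relative to Y.  With
 * ALIGNED_ACCESS the loads from X still use movaps at the nearest
 * aligned address, and SHUFPD_1 shifts each pair of X values into Y's
 * alignment before the multiply-add; otherwise the loop structure
 * mirrors the aligned path above. */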
.L20:
#ifdef ALIGNED_ACCESS

	movhps	-16 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$4, %rax
	jle	.L23

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3

	decq	%rax
	jle .L22
	ALIGN_4

.L21:
	movaps	 -9 * SIZE(X), %xmm4
	movaps	 -7 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	 -5 * SIZE(X), %xmm6
	movaps	 -3 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	-1 * SIZE(X), %xmm0
	movaps	 1 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	 3 * SIZE(X), %xmm2
	movaps	 5 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movaps	 -9 * SIZE(X), %xmm4
	movaps	 -7 * SIZE(X), %xmm5

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	 -5 * SIZE(X), %xmm6
	movaps	 -3 * SIZE(X), %xmm7

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5,  -6 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6,  -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	 -2 * SIZE(Y), %xmm7
	movaps	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L23:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3
	movaps	 -9 * SIZE(X), %xmm8

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm8, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	%xmm8, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA,  %xmm0
	addpd	-16 * SIZE(Y), %xmm0

	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

#else
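/* Without ALIGNED_ACCESS the same loop structure is used, but the X
 * operands are assembled with unaligned movsd/movhps pairs instead of
 * the SHUFPD recombination above. */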
	movq	M,  %rax
	sarq	$4, %rax
	jle	.L23

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	decq	%rax
	jle .L22
	ALIGN_3

.L21:
	movsd	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	movsd	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	 -4 * SIZE(X), %xmm6
	movhps	 -3 * SIZE(X), %xmm6
	movsd	 -2 * SIZE(X), %xmm7
	movhps	 -1 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movsd	 0 * SIZE(X), %xmm0
	movhps	 1 * SIZE(X), %xmm0
	movsd	 2 * SIZE(X), %xmm1
	movhps	 3 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movsd	 4 * SIZE(X), %xmm2
	movhps	 5 * SIZE(X), %xmm2
	movsd	 6 * SIZE(X), %xmm3
	movhps	 7 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	 -2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movsd	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	movsd	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	 -4 * SIZE(X), %xmm6
	movhps	 -3 * SIZE(X), %xmm6
	movsd	 -2 * SIZE(X), %xmm7
	movhps	 -1 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	 -8 * SIZE(Y), %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	 -6 * SIZE(Y), %xmm5
	movaps	%xmm5,  -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	 -4 * SIZE(Y), %xmm6
	movaps	%xmm6,  -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	 -2 * SIZE(Y), %xmm7
	movaps	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L23:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3
#endif

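/* .L40: general strided case (INCX and INCY are already scaled to
 * bytes).  A zero increment makes every element alias the same
 * location, which the unrolled loop below does not handle, so that
 * case goes straight to the scalar loop at .L46. */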
.L40:
	movq	Y, YY
	movq	M,  %rax
// If incx == 0 or incy == 0, skip the unrolled loop and use the scalar loop.
	cmpq	$0, INCX
	je  .L46
	cmpq	$0, INCY
	je  .L46

	sarq	$3, %rax
	jle	.L45
	ALIGN_3

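/* .L41: strided loop unrolled by eight.  Each xmm register packs two
 * gathered elements (movsd/movhpd); YY is the read pointer for Y while
 * Y itself trails behind as the write pointer. */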
.L41:
	movsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm0
	addq	INCX, X
	mulpd	ALPHA, %xmm0

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm0

	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulpd	ALPHA, %xmm1

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm1

	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	mulpd	ALPHA, %xmm2

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm2

	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	mulpd	ALPHA, %xmm3

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

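/* .L46: scalar loop for the remaining M % 8 elements (and for the
 * whole vector when either increment is zero). */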
.L46:
	movsd	(X), %xmm0
	addq	INCX, X
	mulsd	%xmm15, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE
