/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define	YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"
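
/* Single precision AXPY kernel (y := alpha * x + y) for x86-64 SSE.     */
/* M is the element count, X/INCX and Y/INCY are the two vectors and     */
/* their strides; alpha arrives in %xmm0 (%xmm3 on the Windows ABI) and  */
/* is kept broadcast across all four lanes of ALPHA (%xmm15).            */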

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	 8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	movaps	%xmm0,  ALPHA
#else
	movaps	%xmm3,  ALPHA

	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

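/* Broadcast alpha into all four lanes and convert both strides from     */
/* elements to bytes.                                                     */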
	shufps	$0, ALPHA, ALPHA

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L19

	cmpq	$SIZE, INCX
	jne	.L50
	cmpq	$SIZE, INCY
	jne	.L50

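/* Unit-stride path.  Both pointers are advanced by 32 elements up       */
/* front so the unrolled loops below can address everything with         */
/* negative displacements.                                                */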
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	cmpq	$3, M
	jle	.L16

	testq	$SIZE, Y
	je	.L00

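/* Align Y to a 16-byte boundary: peel one element if Y is only 4-byte   */
/* aligned, then two more if it is still not 16-byte aligned.            */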
	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_3

.L00:
	testq	$SIZE * 2, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4
	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_3

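/* Y is now 16-byte aligned.  If X is 16-byte aligned as well, run the   */
/* fully aligned loops; otherwise branch to the realignment code at .L20. */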
.L10:
	testq	$SIZE * 3, X
	jne	.L20

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L13

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	decq	%rax
	jle .L12
	ALIGN_4

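/* Main aligned loop: 32 elements per iteration, with software prefetch  */
/* of both streams.                                                       */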
.L11:
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	 0 * SIZE(X), %xmm0
	movaps	 4 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	 8 * SIZE(X), %xmm2
	movaps	12 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

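/* Aligned remainder: 16, 8, 4, 2 and finally 1 element.                 */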
.L13:
	movq	M,  %rax
	andq	$16, %rax
	jle	.L14
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L15
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L16
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0

	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L17
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

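/* X and Y are mutually misaligned.  With ALIGNED_ACCESS the bulk loads  */
/* from X stay 16-byte aligned and shuffles shift the data into place;   */
/* the three cases below correspond to X being offset by 2, 1 or 3       */
/* elements within its 16-byte block.                                     */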
.L20:

#ifdef ALIGNED_ACCESS

	testq	$SIZE, X
	jne	.L30

	movhps	-32 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	decq	%rax
	jle .L22
	ALIGN_4

.L21:
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -6 * SIZE(X), %xmm7
	movaps	 -2 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	  2 * SIZE(X), %xmm1
	movaps	  6 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	 10 * SIZE(X), %xmm3
	movaps	 14 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -6 * SIZE(X), %xmm7
	movaps	 -2 * SIZE(X), %xmm0

	SHUFPD_1 %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	movq	M,  %rax
	andq	$16, %rax
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm4, %xmm3

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

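/* X is offset by one element: each aligned 16-byte load is rotated into */
/* place with movss and SHUFPS_39 before being combined with Y.          */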
.L30:
	testq	$2 * SIZE, X
	jne	.L40

	movaps	-33 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	decq	%rax
	jle .L32
	ALIGN_4

.L31:
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -5 * SIZE(X), %xmm7
	movaps	 -1 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	  3 * SIZE(X), %xmm1
	movaps	  7 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	 11 * SIZE(X), %xmm3
	movaps	 15 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -5 * SIZE(X), %xmm7
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	movq	M,  %rax
	andq	$16, %rax
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

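/* X is offset by three elements: movss plus shufps $0x93 rebuild four   */
/* consecutive X values from neighbouring aligned loads.                 */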
.L40:
	movaps	-35 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	decq	%rax
	jle .L42
	ALIGN_4

.L41:
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -7 * SIZE(X), %xmm7
	movaps	 -3 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	  1 * SIZE(X), %xmm1
	movaps	  5 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	  9 * SIZE(X), %xmm3
	movaps	 13 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	 -7 * SIZE(X), %xmm7
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	movq	M,  %rax
	andq	$16, %rax
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret

#else

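/* Without ALIGNED_ACCESS: X is read with unaligned movsd/movhps pairs   */
/* while the stores to the (already aligned) Y remain movaps.            */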
	movq	M,  %rax
	sarq	$5, %rax
	jle	.L23

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1
	movsd	-24 * SIZE(X), %xmm2
	movhps	-22 * SIZE(X), %xmm2
	movsd	-20 * SIZE(X), %xmm3
	movhps	-18 * SIZE(X), %xmm3

	decq	%rax
	jle .L22
	ALIGN_4

.L21:
	movsd	-16 * SIZE(X), %xmm4
	movhps	-14 * SIZE(X), %xmm4
	movsd	-12 * SIZE(X), %xmm5
	movhps	-10 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	 -8 * SIZE(X), %xmm6
	movhps	 -6 * SIZE(X), %xmm6
	movsd	 -4 * SIZE(X), %xmm7
	movhps	 -2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movsd	  0 * SIZE(X), %xmm0
	movhps	  2 * SIZE(X), %xmm0
	movsd	  4 * SIZE(X), %xmm1
	movhps	  6 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movsd	  8 * SIZE(X), %xmm2
	movhps	 10 * SIZE(X), %xmm2
	movsd	 12 * SIZE(X), %xmm3
	movhps	 14 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movsd	-16 * SIZE(X), %xmm4
	movhps	-14 * SIZE(X), %xmm4
	movsd	-12 * SIZE(X), %xmm5
	movhps	-10 * SIZE(X), %xmm5

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	 -8 * SIZE(X), %xmm6
	movhps	 -6 * SIZE(X), %xmm6
	movsd	 -4 * SIZE(X), %xmm7
	movhps	 -2 * SIZE(X), %xmm7

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	mulps	ALPHA, %xmm6
	addps	 -8 * SIZE(Y), %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	 -4 * SIZE(Y), %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	movq	M,  %rax
	andq	$16, %rax
	jle	.L24
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	-24 * SIZE(X), %xmm2
	movhps	-22 * SIZE(X), %xmm2
	movsd	-20 * SIZE(X), %xmm3
	movhps	-18 * SIZE(X), %xmm3

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	movq	M,  %rax
	andq	$8, %rax
	jle	.L25
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	movq	M,  %rax
	andq	$4, %rax
	jle	.L26
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0

	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	movq	M,  %rax
	andq	$2, %rax
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	movq	M,  %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
#endif
	ALIGN_3

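/* General strided path, eight elements per iteration.  Y is read        */
/* through YY and written through Y, so each group of four loads is      */
/* issued before the corresponding stores.                               */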
.L50:
	movq	M, %rax
	movq	Y, YY
	sarq	$3,   %rax
	jle	.L55
	ALIGN_3

.L51:
	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm0

	movss	(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm1

	movss	(X), %xmm2
	addq	INCX, X
	mulss	ALPHA, %xmm2
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm2

	movss	(X), %xmm3
	addq	INCX, X
	mulss	ALPHA, %xmm3
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm3

	movss	%xmm0, (Y)
	addq	INCY, Y
	movss	%xmm1, (Y)
	addq	INCY, Y
	movss	%xmm2, (Y)
	addq	INCY, Y
	movss	%xmm3, (Y)
	addq	INCY, Y

	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm0

	movss	(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm1

	movss	(X), %xmm2
	addq	INCX, X
	mulss	ALPHA, %xmm2
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm2

	movss	(X), %xmm3
	addq	INCX, X
	mulss	ALPHA, %xmm3
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm3

	movss	%xmm0, (Y)
	addq	INCY, Y
	movss	%xmm1, (Y)
	addq	INCY, Y
	movss	%xmm2, (Y)
	addq	INCY, Y
	movss	%xmm3, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L51
	ALIGN_3

.L55:
	movq	M, %rax
	andq	$7,   %rax
	jle	.L59
	ALIGN_3

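/* Strided tail: the remaining M mod 8 elements, one at a time.          */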
.L56:
	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(Y), %xmm6
	addss	%xmm6, %xmm0
	movss	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L56
	ALIGN_3

.L59:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

	EPILOGUE