1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24#include "l2param.h"
25
26#if GEMV_UNROLL < 4
27#undef  GEMV_UNROLL
28#define GEMV_UNROLL 4
29#endif
30
31#ifndef WINDOWS_ABI
32
33#define STACKSIZE	64
34
35#define OLD_M	  %rdi
36#define OLD_N	  %rsi
37#define OLD_A	  %rcx
38#define OLD_LDA	  %r8
39#define STACK_INCX	 8 + STACKSIZE(%rsp)
40#define STACK_Y		16 + STACKSIZE(%rsp)
41#define STACK_INCY	24 + STACKSIZE(%rsp)
42#define STACK_BUFFER	32 + STACKSIZE(%rsp)
43
44#else
45
46#define STACKSIZE	256
47
48#define OLD_M	  %rcx
49#define OLD_N	  %rdx
50#define OLD_A		 40 + STACKSIZE(%rsp)
51#define OLD_LDA		 48 + STACKSIZE(%rsp)
52#define OLD_X		 56 + STACKSIZE(%rsp)
53#define STACK_INCX	 64 + STACKSIZE(%rsp)
54#define STACK_Y		 72 + STACKSIZE(%rsp)
55#define STACK_INCY	 80 + STACKSIZE(%rsp)
56#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
57
58#endif
59
60#define LDA	%r8
61#define X	%r9
62
63#define INCX	%rsi
64#define INCY	%rdi
65
66#define M	%r10
67#define N	%r11
68#define A	%r12
69#define Y	%r14
70#define BUFFER	%r13
71
72#define I	%rax
73#define A1	%rbx
74#define A2	%rcx
75#define LDA3	%rdx
76#define X1	%rbp
77
78#define Y1	INCX
79
80#ifdef ALIGNED_ACCESS
81#define MM	%r15
82#else
83#define MM	M
84#endif
85
86#define ALPHA	%xmm7
87
88	PROLOGUE
89	PROFCODE
90
91	subq	$STACKSIZE, %rsp
92	movq	%rbx,  0(%rsp)
93	movq	%rbp,  8(%rsp)
94	movq	%r12, 16(%rsp)
95	movq	%r13, 24(%rsp)
96	movq	%r14, 32(%rsp)
97	movq	%r15, 40(%rsp)
98
99#ifdef WINDOWS_ABI
100	movq	%rdi,    48(%rsp)
101	movq	%rsi,    56(%rsp)
102	movups	%xmm6,   64(%rsp)
103	movups	%xmm7,   80(%rsp)
104	movups	%xmm8,   96(%rsp)
105	movups	%xmm9,  112(%rsp)
106	movups	%xmm10, 128(%rsp)
107	movups	%xmm11, 144(%rsp)
108	movups	%xmm12, 160(%rsp)
109	movups	%xmm13, 176(%rsp)
110	movups	%xmm14, 192(%rsp)
111	movups	%xmm15, 208(%rsp)
112
113	movq	OLD_M,	      M
114	movq	OLD_N,        N
115	movq	OLD_A,        A
116	movq	OLD_LDA,      LDA
117	movq	OLD_X,        X
118#else
119	movq	OLD_M,	      M
120	movq	OLD_N,        N
121	movq	OLD_A,        A
122	movq	OLD_LDA,      LDA
123#endif
124
125	movq	STACK_INCX,   INCX
126	movq	STACK_Y,      Y
127	movq	STACK_INCY,   INCY
128	movq	STACK_BUFFER, BUFFER
129
130#ifndef WINDOWS_ABI
131	pshufd	$0, %xmm0, ALPHA
132#else
133	pshufd	$0, %xmm3, ALPHA
134#endif
135
136	leaq	(,INCX, SIZE), INCX
137	leaq	(,INCY, SIZE), INCY
138	leaq	(,LDA,  SIZE), LDA
139
140	leaq	(LDA, LDA, 2), LDA3
141
142#ifdef ALIGNED_ACCESS
143	movq	M, MM
144	testq	$4 * SIZE - 1, A
145	je	.L0X
146	cmpq	$3, M
147	jle	.L0X
148
149	movq	A, MM
150	sarq	$BASE_SHIFT, MM
151	andq	$3, MM
152	subq	$4, MM
153	addq	M,  MM
154
155.L0X:
156#endif
157
158	testq	M, M
159	jle	.L999
160	testq	N, N
161	jle	.L999
162	ALIGN_4
163
164	subq	$-32 * SIZE, A
165
166#ifdef ALIGNED_ACCESS
167	movq	A,  %rax
168	andq	$4 * SIZE - 1, %rax
169	addq	%rax, BUFFER
170#endif
171
172	movq	BUFFER, X1
173
174	movq	M,  I
175	sarq	$3, I
176	jle	.L05
177	ALIGN_4
178
179.L02:
180	movss	(X), %xmm0
181	addq	INCX, X
182	movss	(X), %xmm1
183	addq	INCX, X
184
185	movss	(X), %xmm2
186	addq	INCX, X
187	movss	(X), %xmm3
188	addq	INCX, X
189
190	movss	(X), %xmm4
191	addq	INCX, X
192	movss	(X), %xmm5
193	addq	INCX, X
194
195	movss	(X), %xmm6
196	addq	INCX, X
197	movss	(X), %xmm8
198	addq	INCX, X
199
200	movss	%xmm0, 0 * SIZE(X1)
201	movss	%xmm1, 1 * SIZE(X1)
202	movss	%xmm2, 2 * SIZE(X1)
203	movss	%xmm3, 3 * SIZE(X1)
204	movss	%xmm4, 4 * SIZE(X1)
205	movss	%xmm5, 5 * SIZE(X1)
206	movss	%xmm6, 6 * SIZE(X1)
207	movss	%xmm8, 7 * SIZE(X1)
208
209	addq	$8 * SIZE, X1
210	decq	I
211	jg	.L02
212	ALIGN_4
213
214.L05:
215	movq	M,  I
216	andq	$7, I
217	jle	.L10
218	ALIGN_2
219
220.L06:
221	movss	(X), %xmm0
222	addq	INCX, X
223	movss	%xmm0, 0 * SIZE(X1)
224	addq	$SIZE, X1
225	decq	I
226	jg	.L06
227	ALIGN_4
228
229.L10:
230	movq	Y, Y1
231
232#ifdef ALIGNED_ACCESS
233	testq	$4 * SIZE - 1, LDA
234	jne	.L100
235#endif
236
237#if GEMV_UNROLL >= 8
238
239	cmpq	$8, N
240	jl	.L20
241	ALIGN_3
242
243.L11:
244	subq	$8, N
245
246	leaq	32 * SIZE(BUFFER), X1
247
248	movq	A, A1
249	leaq	(A1, LDA, 4), A2
250	leaq	(A1, LDA, 8), A
251
252	xorps	%xmm8, %xmm8
253	xorps	%xmm9, %xmm9
254	xorps	%xmm10, %xmm10
255	xorps	%xmm11, %xmm11
256	xorps	%xmm12, %xmm12
257	xorps	%xmm13, %xmm13
258	xorps	%xmm14, %xmm14
259	xorps	%xmm15, %xmm15
260
261#ifdef ALIGNED_ACCESS
262	cmpq	$3, M
263	jle	.L17
264
265	testq	$SIZE, A1
266	je	.L1X
267
268	movss	-32 * SIZE(A1), %xmm0
269	movss	-32 * SIZE(X1), %xmm4
270	mulss	%xmm4, %xmm0
271	addss	%xmm0, %xmm8
272	movss	-32 * SIZE(A1, LDA, 1), %xmm1
273	mulss	%xmm4, %xmm1
274	addss	%xmm1, %xmm9
275	movss	-32 * SIZE(A1, LDA, 2), %xmm2
276	mulss	%xmm4, %xmm2
277	addss	%xmm2, %xmm10
278	movss	-32 * SIZE(A1, LDA3, 1), %xmm3
279	mulss	%xmm4, %xmm3
280	addss	%xmm3, %xmm11
281	movss	-32 * SIZE(A2), %xmm0
282	mulss	%xmm4, %xmm0
283	addss	%xmm0, %xmm12
284	movss	-32 * SIZE(A2, LDA, 1), %xmm1
285	mulss	%xmm4, %xmm1
286	addss	%xmm1, %xmm13
287	movss	-32 * SIZE(A2, LDA, 2), %xmm2
288	mulss	%xmm4, %xmm2
289	addss	%xmm2, %xmm14
290	movss	-32 * SIZE(A2, LDA3, 1), %xmm3
291	mulss	%xmm4, %xmm3
292	addss	%xmm3, %xmm15
293
294	addq	 $1 * SIZE, A1
295	addq	 $1 * SIZE, A2
296	addq	 $1 * SIZE, X1
297	ALIGN_3
298
299.L1X:
300	testq	$2 * SIZE, A1
301	je	.L1XX
302
303#ifdef movsd
304	xorps	%xmm0, %xmm0
305	xorps	%xmm4, %xmm4
306#endif
307	movsd	-32 * SIZE(A1), %xmm0
308	movsd	-32 * SIZE(X1), %xmm4
309	mulps	%xmm4, %xmm0
310	addps	%xmm0, %xmm8
311#ifdef movsd
312	xorps	%xmm1, %xmm1
313#endif
314	movsd	-32 * SIZE(A1, LDA, 1), %xmm1
315	mulps	%xmm4, %xmm1
316	addps	%xmm1, %xmm9
317#ifdef movsd
318	xorps	%xmm2, %xmm2
319#endif
320	movsd	-32 * SIZE(A1, LDA, 2), %xmm2
321	mulps	%xmm4, %xmm2
322	addps	%xmm2, %xmm10
323#ifdef movsd
324	xorps	%xmm3, %xmm3
325#endif
326	movsd	-32 * SIZE(A1, LDA3, 1), %xmm3
327	mulps	%xmm4, %xmm3
328	addps	%xmm3, %xmm11
329	movsd	-32 * SIZE(A2), %xmm0
330	mulps	%xmm4, %xmm0
331	addps	%xmm0, %xmm12
332	movsd	-32 * SIZE(A2, LDA, 1), %xmm1
333	mulps	%xmm4, %xmm1
334	addps	%xmm1, %xmm13
335	movsd	-32 * SIZE(A2, LDA, 2), %xmm2
336	mulps	%xmm4, %xmm2
337	addps	%xmm2, %xmm14
338	movsd	-32 * SIZE(A2, LDA3, 1), %xmm3
339	mulps	%xmm4, %xmm3
340	addps	%xmm3, %xmm15
341
342	addq	 $2 * SIZE, A1
343	addq	 $2 * SIZE, A2
344	addq	 $2 * SIZE, X1
345	ALIGN_3
346
347.L1XX:
348#endif
349
350	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
351	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
352
353#ifdef PREFETCHW
354	PREFETCHW	8 * SIZE(Y1)
355#endif
356
357	movq	MM,  I
358	sarq	$4,  I
359	jle	.L15
360
361	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
362	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
363	MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2)
364	MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3)
365
366	decq	I
367	jle	.L13
368	ALIGN_4
369
370.L12:
371#ifdef PREFETCH
372	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
373#endif
374
375	mulps	%xmm4, %xmm0
376	addps	%xmm0, %xmm8
377	MOVUPS_A1 (-32 * SIZE, A2, %xmm0)
378	mulps	%xmm4, %xmm1
379	addps	%xmm1, %xmm9
380	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1)
381	mulps	%xmm4, %xmm2
382	addps	%xmm2, %xmm10
383	MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2)
384	mulps	%xmm4, %xmm3
385	addps	%xmm3, %xmm11
386	MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3)
387
388#ifdef PREFETCH
389	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1)
390#endif
391
392	mulps	%xmm4, %xmm0
393	addps	%xmm0, %xmm12
394	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
395	mulps	%xmm4, %xmm1
396	addps	%xmm1, %xmm13
397	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1)
398	mulps	%xmm4, %xmm2
399	addps	%xmm2, %xmm14
400	MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2)
401	mulps	%xmm4, %xmm3
402	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
403	addps	%xmm3, %xmm15
404	MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3)
405
406#ifdef PREFETCH
407	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2)
408#endif
409
410	mulps	%xmm5, %xmm0
411	addps	%xmm0, %xmm8
412	MOVUPS_A1 (-28 * SIZE, A2, %xmm0)
413	mulps	%xmm5, %xmm1
414	addps	%xmm1, %xmm9
415	MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1)
416	mulps	%xmm5, %xmm2
417	addps	%xmm2, %xmm10
418	MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2)
419	mulps	%xmm5, %xmm3
420	addps	%xmm3, %xmm11
421	MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3)
422
423#ifdef PREFETCH
424	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3)
425#endif
426
427	mulps	%xmm5, %xmm0
428	addps	%xmm0, %xmm12
429	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
430	mulps	%xmm5, %xmm1
431	addps	%xmm1, %xmm13
432	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
433	mulps	%xmm5, %xmm2
434	addps	%xmm2, %xmm14
435	MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2)
436	mulps	%xmm5, %xmm3
437	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
438	addps	%xmm3, %xmm15
439	MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3)
440
441#ifdef PREFETCH
442	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
443#endif
444
445	mulps	%xmm4, %xmm0
446	addps	%xmm0, %xmm8
447	MOVUPS_A1 (-24 * SIZE, A2, %xmm0)
448	mulps	%xmm4, %xmm1
449	addps	%xmm1, %xmm9
450	MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1)
451	mulps	%xmm4, %xmm2
452	addps	%xmm2, %xmm10
453	MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2)
454	mulps	%xmm4, %xmm3
455	addps	%xmm3, %xmm11
456	MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3)
457	mulps	%xmm4, %xmm0
458	addps	%xmm0, %xmm12
459
460#ifdef PREFETCH
461	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1)
462#endif
463
464	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
465	mulps	%xmm4, %xmm1
466	addps	%xmm1, %xmm13
467	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1)
468	mulps	%xmm4, %xmm2
469	addps	%xmm2, %xmm14
470	MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2)
471	mulps	%xmm4, %xmm3
472	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
473	addps	%xmm3, %xmm15
474	MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3)
475
476#ifdef PREFETCH
477	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2)
478#endif
479
480	mulps	%xmm5, %xmm0
481	addps	%xmm0, %xmm8
482	MOVUPS_A1 (-20 * SIZE, A2, %xmm0)
483	mulps	%xmm5, %xmm1
484	addps	%xmm1, %xmm9
485	MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1)
486	mulps	%xmm5, %xmm2
487	addps	%xmm2, %xmm10
488	MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2)
489	mulps	%xmm5, %xmm3
490	addps	%xmm3, %xmm11
491	MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3)
492	mulps	%xmm5, %xmm0
493	addps	%xmm0, %xmm12
494
495#ifdef PREFETCH
496	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3)
497#endif
498
499	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
500	mulps	%xmm5, %xmm1
501	addps	%xmm1, %xmm13
502	MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1)
503	mulps	%xmm5, %xmm2
504	addps	%xmm2, %xmm14
505
506#ifdef PREFETCHW
507	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
508#endif
509
510	MOVUPS_A2 (-16 * SIZE, A1, LDA, 2, %xmm2)
511	mulps	%xmm5, %xmm3
512	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
513	addps	%xmm3, %xmm15
514	MOVUPS_A2 (-16 * SIZE, A1, LDA3, 1, %xmm3)
515
516	addq	$16 * SIZE, A1
517	addq	$16 * SIZE, A2
518	addq	$16 * SIZE, X1
519
520	decq	I
521	jg	.L12
522	ALIGN_4
523
524.L13:
525	mulps	%xmm4, %xmm0
526	addps	%xmm0, %xmm8
527	MOVUPS_A1 (-32 * SIZE, A2, %xmm0)
528	mulps	%xmm4, %xmm1
529	addps	%xmm1, %xmm9
530	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1)
531	mulps	%xmm4, %xmm2
532	addps	%xmm2, %xmm10
533	MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2)
534	mulps	%xmm4, %xmm3
535	addps	%xmm3, %xmm11
536	MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3)
537	mulps	%xmm4, %xmm0
538	addps	%xmm0, %xmm12
539	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
540	mulps	%xmm4, %xmm1
541	addps	%xmm1, %xmm13
542	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1)
543	mulps	%xmm4, %xmm2
544	addps	%xmm2, %xmm14
545	MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2)
546	mulps	%xmm4, %xmm3
547	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
548	addps	%xmm3, %xmm15
549	MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3)
550
551	mulps	%xmm5, %xmm0
552	addps	%xmm0, %xmm8
553	MOVUPS_A1 (-28 * SIZE, A2, %xmm0)
554	mulps	%xmm5, %xmm1
555	addps	%xmm1, %xmm9
556	MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1)
557	mulps	%xmm5, %xmm2
558	addps	%xmm2, %xmm10
559	MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2)
560	mulps	%xmm5, %xmm3
561	addps	%xmm3, %xmm11
562	MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3)
563	mulps	%xmm5, %xmm0
564	addps	%xmm0, %xmm12
565	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
566	mulps	%xmm5, %xmm1
567	addps	%xmm1, %xmm13
568	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
569	mulps	%xmm5, %xmm2
570	addps	%xmm2, %xmm14
571	MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2)
572	mulps	%xmm5, %xmm3
573	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
574	addps	%xmm3, %xmm15
575	MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3)
576
577	mulps	%xmm4, %xmm0
578	addps	%xmm0, %xmm8
579	MOVUPS_A1 (-24 * SIZE, A2, %xmm0)
580	mulps	%xmm4, %xmm1
581	addps	%xmm1, %xmm9
582	MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1)
583	mulps	%xmm4, %xmm2
584	addps	%xmm2, %xmm10
585	MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2)
586	mulps	%xmm4, %xmm3
587	addps	%xmm3, %xmm11
588	MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3)
589	mulps	%xmm4, %xmm0
590	addps	%xmm0, %xmm12
591	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
592	mulps	%xmm4, %xmm1
593	addps	%xmm1, %xmm13
594	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1)
595	mulps	%xmm4, %xmm2
596	addps	%xmm2, %xmm14
597	MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2)
598	mulps	%xmm4, %xmm3
599	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
600	addps	%xmm3, %xmm15
601	MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3)
602
603	mulps	%xmm5, %xmm0
604	addps	%xmm0, %xmm8
605	MOVUPS_A1 (-20 * SIZE, A2, %xmm0)
606	mulps	%xmm5, %xmm1
607	addps	%xmm1, %xmm9
608	MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1)
609	mulps	%xmm5, %xmm2
610	addps	%xmm2, %xmm10
611	MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2)
612	mulps	%xmm5, %xmm3
613	addps	%xmm3, %xmm11
614	MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3)
615	mulps	%xmm5, %xmm0
616	addps	%xmm0, %xmm12
617	mulps	%xmm5, %xmm1
618	addps	%xmm1, %xmm13
619	mulps	%xmm5, %xmm2
620	addps	%xmm2, %xmm14
621	mulps	%xmm5, %xmm3
622	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
623	addps	%xmm3, %xmm15
624
625	addq	$16 * SIZE, A1
626	addq	$16 * SIZE, A2
627	addq	$16 * SIZE, X1
628	ALIGN_4
629
630.L15:
631	testq	$8, MM
632	jle	.L16
633
634	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
635	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
636	MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2)
637	MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3)
638
639	mulps	%xmm4, %xmm0
640	addps	%xmm0, %xmm8
641	MOVUPS_A1 (-32 * SIZE, A2, %xmm0)
642	mulps	%xmm4, %xmm1
643	addps	%xmm1, %xmm9
644	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1)
645	mulps	%xmm4, %xmm2
646	addps	%xmm2, %xmm10
647	MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2)
648	mulps	%xmm4, %xmm3
649	addps	%xmm3, %xmm11
650	MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3)
651	mulps	%xmm4, %xmm0
652	addps	%xmm0, %xmm12
653	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
654	mulps	%xmm4, %xmm1
655	addps	%xmm1, %xmm13
656	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1)
657	mulps	%xmm4, %xmm2
658	addps	%xmm2, %xmm14
659	MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2)
660	mulps	%xmm4, %xmm3
661	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
662	addps	%xmm3, %xmm15
663	MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3)
664
665	mulps	%xmm5, %xmm0
666	addps	%xmm0, %xmm8
667	MOVUPS_A1 (-28 * SIZE, A2, %xmm0)
668	mulps	%xmm5, %xmm1
669	addps	%xmm1, %xmm9
670	MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1)
671	mulps	%xmm5, %xmm2
672	addps	%xmm2, %xmm10
673	MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2)
674	mulps	%xmm5, %xmm3
675	addps	%xmm3, %xmm11
676	MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3)
677	mulps	%xmm5, %xmm0
678	addps	%xmm0, %xmm12
679	mulps	%xmm5, %xmm1
680	addps	%xmm1, %xmm13
681	mulps	%xmm5, %xmm2
682	addps	%xmm2, %xmm14
683	mulps	%xmm5, %xmm3
684	addps	%xmm3, %xmm15
685
686	addq	$8 * SIZE, A1
687	addq	$8 * SIZE, A2
688	addq	$8 * SIZE, X1
689	ALIGN_4
690
691.L16:
692	testq	$4, MM
693	jle	.L17
694
695	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
696	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
697	MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2)
698	MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3)
699
700	mulps	%xmm4, %xmm0
701	addps	%xmm0, %xmm8
702	MOVUPS_A1 (-32 * SIZE, A2, %xmm0)
703	mulps	%xmm4, %xmm1
704	addps	%xmm1, %xmm9
705	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1)
706	mulps	%xmm4, %xmm2
707	addps	%xmm2, %xmm10
708	MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2)
709	mulps	%xmm4, %xmm3
710	addps	%xmm3, %xmm11
711	MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3)
712	mulps	%xmm4, %xmm0
713	addps	%xmm0, %xmm12
714	mulps	%xmm4, %xmm1
715	addps	%xmm1, %xmm13
716	mulps	%xmm4, %xmm2
717	addps	%xmm2, %xmm14
718	mulps	%xmm4, %xmm3
719	addps	%xmm3, %xmm15
720
721	addq	$4 * SIZE, A1
722	addq	$4 * SIZE, A2
723	addq	$4 * SIZE, X1
724	ALIGN_4
725
726.L17:
727	testq	$2, MM
728	jle	.L18
729
730#ifdef movsd
731	xorps	%xmm0, %xmm0
732#endif
733	movsd	-32 * SIZE(A1), %xmm0
734#ifdef movsd
735	xorps	%xmm4, %xmm4
736#endif
737	movsd	-32 * SIZE(X1), %xmm4
738	mulps	%xmm4, %xmm0
739	addps	%xmm0, %xmm8
740#ifdef movsd
741	xorps	%xmm1, %xmm1
742#endif
743	movsd	-32 * SIZE(A1, LDA, 1), %xmm1
744	mulps	%xmm4, %xmm1
745	addps	%xmm1, %xmm9
746#ifdef movsd
747	xorps	%xmm2, %xmm2
748#endif
749	movsd	-32 * SIZE(A1, LDA, 2), %xmm2
750	mulps	%xmm4, %xmm2
751	addps	%xmm2, %xmm10
752#ifdef movsd
753	xorps	%xmm3, %xmm3
754#endif
755	movsd	-32 * SIZE(A1, LDA3, 1), %xmm3
756	mulps	%xmm4, %xmm3
757	addps	%xmm3, %xmm11
758	movsd	-32 * SIZE(A2), %xmm0
759	mulps	%xmm4, %xmm0
760	addps	%xmm0, %xmm12
761	movsd	-32 * SIZE(A2, LDA, 1), %xmm1
762	mulps	%xmm4, %xmm1
763	addps	%xmm1, %xmm13
764	movsd	-32 * SIZE(A2, LDA, 2), %xmm2
765	mulps	%xmm4, %xmm2
766	addps	%xmm2, %xmm14
767	movsd	-32 * SIZE(A2, LDA3, 1), %xmm3
768	mulps	%xmm4, %xmm3
769	addps	%xmm3, %xmm15
770
771	addq	$2 * SIZE, A1
772	addq	$2 * SIZE, A2
773	addq	$2 * SIZE, X1
774	ALIGN_4
775
776.L18:
777	testq	$1, MM
778	jle	.L19
779
780	movss	-32 * SIZE(A1), %xmm0
781	movss	-32 * SIZE(X1), %xmm4
782	mulss	%xmm4, %xmm0
783	addss	%xmm0, %xmm8
784	movss	-32 * SIZE(A1, LDA, 1), %xmm1
785	mulss	%xmm4, %xmm1
786	addss	%xmm1, %xmm9
787	movss	-32 * SIZE(A1, LDA, 2), %xmm2
788	mulss	%xmm4, %xmm2
789	addss	%xmm2, %xmm10
790	movss	-32 * SIZE(A1, LDA3, 1), %xmm3
791	mulss	%xmm4, %xmm3
792	addss	%xmm3, %xmm11
793	movss	-32 * SIZE(A2), %xmm0
794	mulss	%xmm4, %xmm0
795	addss	%xmm0, %xmm12
796	movss	-32 * SIZE(A2, LDA, 1), %xmm1
797	mulss	%xmm4, %xmm1
798	addss	%xmm1, %xmm13
799	movss	-32 * SIZE(A2, LDA, 2), %xmm2
800	mulss	%xmm4, %xmm2
801	addss	%xmm2, %xmm14
802	movss	-32 * SIZE(A2, LDA3, 1), %xmm3
803	mulss	%xmm4, %xmm3
804	addss	%xmm3, %xmm15
805	ALIGN_4
806
807.L19:
808#ifdef HAVE_SSE3
809	haddps	%xmm9, %xmm8
810	haddps	%xmm11, %xmm10
811	haddps	%xmm10, %xmm8
812
813	pshufd	$0x1, %xmm8, %xmm9
814	pshufd	$0x2, %xmm8, %xmm10
815	pshufd	$0x3, %xmm8, %xmm11
816
817	haddps	%xmm13, %xmm12
818	haddps	%xmm15, %xmm14
819	haddps	%xmm14, %xmm12
820
821	pshufd	$0x1, %xmm12, %xmm13
822	pshufd	$0x2, %xmm12, %xmm14
823	pshufd	$0x3, %xmm12, %xmm15
824#else
825	movaps	%xmm8, %xmm0
826	unpcklps %xmm9, %xmm8
827	unpckhps %xmm9, %xmm0
828
829	movaps	%xmm10, %xmm1
830	unpcklps %xmm11, %xmm10
831	unpckhps %xmm11, %xmm1
832
833	movaps	%xmm8, %xmm9
834	unpcklps %xmm10, %xmm8
835	unpckhps %xmm10, %xmm9
836
837	movaps	%xmm0, %xmm10
838	unpcklps %xmm1, %xmm0
839	unpckhps %xmm1, %xmm10
840
841	addps	%xmm9, %xmm8
842	addps	%xmm0, %xmm10
843	addps	%xmm10, %xmm8
844
845	pshufd	$0x2, %xmm8, %xmm9
846	pshufd	$0x1, %xmm8, %xmm10
847	pshufd	$0x3, %xmm8, %xmm11
848
849	movaps	%xmm12, %xmm0
850	unpcklps %xmm13, %xmm12
851	unpckhps %xmm13, %xmm0
852
853	movaps	%xmm14, %xmm1
854	unpcklps %xmm15, %xmm14
855	unpckhps %xmm15, %xmm1
856
857	movaps	%xmm12, %xmm13
858	unpcklps %xmm14, %xmm12
859	unpckhps %xmm14, %xmm13
860
861	movaps	%xmm0, %xmm14
862	unpcklps %xmm1, %xmm0
863	unpckhps %xmm1, %xmm14
864
865	addps	%xmm13, %xmm12
866	addps	%xmm0, %xmm14
867	addps	%xmm14, %xmm12
868
869	pshufd	$0x2, %xmm12, %xmm13
870	pshufd	$0x1, %xmm12, %xmm14
871	pshufd	$0x3, %xmm12, %xmm15
872#endif
873
874	mulss	ALPHA, %xmm8
875	mulss	ALPHA, %xmm9
876	mulss	ALPHA, %xmm10
877	mulss	ALPHA, %xmm11
878	mulss	ALPHA, %xmm12
879	mulss	ALPHA, %xmm13
880	mulss	ALPHA, %xmm14
881	mulss	ALPHA, %xmm15
882
883	addss	(Y), %xmm8
884	addq	INCY, Y
885	addss	(Y), %xmm9
886	addq	INCY, Y
887	addss	(Y), %xmm10
888	addq	INCY, Y
889	addss	(Y), %xmm11
890	addq	INCY, Y
891	addss	(Y), %xmm12
892	addq	INCY, Y
893	addss	(Y), %xmm13
894	addq	INCY, Y
895	addss	(Y), %xmm14
896	addq	INCY, Y
897	addss	(Y), %xmm15
898	addq	INCY, Y
899
900	movss	%xmm8,  (Y1)
901	addq	INCY, Y1
902	movss	%xmm9,  (Y1)
903	addq	INCY, Y1
904	movss	%xmm10, (Y1)
905	addq	INCY, Y1
906	movss	%xmm11, (Y1)
907	addq	INCY, Y1
908	movss	%xmm12, (Y1)
909	addq	INCY, Y1
910	movss	%xmm13, (Y1)
911	addq	INCY, Y1
912	movss	%xmm14, (Y1)
913	addq	INCY, Y1
914	movss	%xmm15, (Y1)
915	addq	INCY, Y1
916
917	cmpq	$8, N
918	jge	.L11
919	ALIGN_4
920
921.L20:
922#endif
923
924	cmpq	$4, N
925	jl	.L30
926
927#if GEMV_UNROLL == 4
928	ALIGN_3
929
930.L21:
931#endif
932	subq	$4, N
933
934	leaq	32 * SIZE(BUFFER), X1
935
936	movq	A, A1
937	leaq	(A1, LDA, 2), A2
938	leaq	(A1, LDA, 4), A
939
940	xorps	%xmm8, %xmm8
941	xorps	%xmm9, %xmm9
942	xorps	%xmm10, %xmm10
943	xorps	%xmm11, %xmm11
944
945#ifdef ALIGNED_ACCESS
946	cmpq	$3, M
947	jle	.L27
948
949	testq	$SIZE, A1
950	je	.L2X
951
952	movss	-32 * SIZE(A1), %xmm0
953	movss	-32 * SIZE(X1), %xmm4
954	mulss	%xmm4, %xmm0
955	addss	%xmm0, %xmm8
956	movss	-32 * SIZE(A1, LDA), %xmm1
957	mulss	%xmm4, %xmm1
958	addss	%xmm1, %xmm9
959	movss	-32 * SIZE(A2), %xmm2
960	mulss	%xmm4, %xmm2
961	addss	%xmm2, %xmm10
962	movss	-32 * SIZE(A2, LDA), %xmm3
963	mulss	%xmm4, %xmm3
964	addss	%xmm3, %xmm11
965
966	addq	 $1 * SIZE, A1
967	addq	 $1 * SIZE, A2
968	addq	 $1 * SIZE, X1
969	ALIGN_3
970
971.L2X:
972	testq	$2 * SIZE, A1
973	je	.L2XX
974
975#ifdef movsd
976	xorps	%xmm0, %xmm0
977	xorps	%xmm4, %xmm4
978#endif
979	movsd	-32 * SIZE(A1), %xmm0
980	movsd	-32 * SIZE(X1), %xmm4
981	mulps	%xmm4, %xmm0
982	addps	%xmm0, %xmm8
983#ifdef movsd
984	xorps	%xmm1, %xmm1
985#endif
986	movsd	-32 * SIZE(A1, LDA), %xmm1
987	mulps	%xmm4, %xmm1
988	addps	%xmm1, %xmm9
989#ifdef movsd
990	xorps	%xmm2, %xmm2
991#endif
992	movsd	-32 * SIZE(A2), %xmm2
993	mulps	%xmm4, %xmm2
994	addps	%xmm2, %xmm10
995#ifdef movsd
996	xorps	%xmm3, %xmm3
997#endif
998	movsd	-32 * SIZE(A2, LDA), %xmm3
999	mulps	%xmm4, %xmm3
1000	addps	%xmm3, %xmm11
1001
1002	addq	 $2 * SIZE, A1
1003	addq	 $2 * SIZE, A2
1004	addq	 $2 * SIZE, X1
1005	ALIGN_3
1006
1007.L2XX:
1008#endif
1009
1010	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
1011	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
1012
1013#if (GEMV_UNROLL == 4) && defined(PREFETCHW)
1014	PREFETCHW	4 * SIZE(Y1)
1015#endif
1016
1017	movq	MM,  I
1018	sarq	$4,  I
1019	jle	.L25
1020
1021	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1022	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1023	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1024	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3)
1025
1026	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1027	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13)
1028	MOVUPS_A1 (-28 * SIZE, A2, %xmm14)
1029	MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15)
1030
1031	decq	I
1032	jle	.L23
1033	ALIGN_4
1034
1035.L22:
1036#ifdef PREFETCH
1037	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
1038#endif
1039
1040	mulps	%xmm4, %xmm0
1041	addps	%xmm0, %xmm8
1042	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1043	mulps	%xmm4, %xmm1
1044	addps	%xmm1, %xmm9
1045	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
1046	mulps	%xmm4, %xmm2
1047	addps	%xmm2, %xmm10
1048	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
1049	mulps	%xmm4, %xmm3
1050	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1051	addps	%xmm3, %xmm11
1052	MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3)
1053
1054#ifdef PREFETCH
1055	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
1056#endif
1057
1058	mulps	%xmm5, %xmm12
1059	addps	%xmm12, %xmm8
1060	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1061	mulps	%xmm5, %xmm13
1062	addps	%xmm13, %xmm9
1063	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13)
1064	mulps	%xmm5, %xmm14
1065	addps	%xmm14, %xmm10
1066	MOVUPS_A1 (-20 * SIZE, A2, %xmm14)
1067	mulps	%xmm5, %xmm15
1068	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1069	addps	%xmm15, %xmm11
1070	MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15)
1071
1072#ifdef PREFETCH
1073	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
1074#endif
1075
1076	mulps	%xmm4, %xmm0
1077	addps	%xmm0, %xmm8
1078	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
1079	mulps	%xmm4, %xmm1
1080	addps	%xmm1, %xmm9
1081	MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1)
1082	mulps	%xmm4, %xmm2
1083	addps	%xmm2, %xmm10
1084	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)
1085	mulps	%xmm4, %xmm3
1086	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1087	addps	%xmm3, %xmm11
1088	MOVUPS_A2 (-16 * SIZE, A2, LDA, 1, %xmm3)
1089
1090#ifdef PREFETCH
1091	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
1092#endif
1093
1094	mulps	%xmm5, %xmm12
1095	addps	%xmm12, %xmm8
1096	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
1097	mulps	%xmm5, %xmm13
1098	addps	%xmm13, %xmm9
1099	MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13)
1100
1101#ifdef PREFETCHW
1102	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
1103#endif
1104
1105	mulps	%xmm5, %xmm14
1106	addps	%xmm14, %xmm10
1107	MOVUPS_A1 (-12 * SIZE, A2, %xmm14)
1108	mulps	%xmm5, %xmm15
1109	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1110	addps	%xmm15, %xmm11
1111	MOVUPS_A2 (-12 * SIZE, A2, LDA, 1, %xmm15)
1112
1113	addq	$16 * SIZE, A1
1114	addq	$16 * SIZE, A2
1115	addq	$16 * SIZE, X1
1116
1117	decq	I
1118	jg	.L22
1119	ALIGN_4
1120
1121.L23:
1122	mulps	%xmm4, %xmm0
1123	addps	%xmm0, %xmm8
1124	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1125	mulps	%xmm4, %xmm1
1126	addps	%xmm1, %xmm9
1127	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
1128	mulps	%xmm4, %xmm2
1129	addps	%xmm2, %xmm10
1130	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
1131	mulps	%xmm4, %xmm3
1132	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1133	addps	%xmm3, %xmm11
1134	MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3)
1135
1136	mulps	%xmm5, %xmm12
1137	addps	%xmm12, %xmm8
1138	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1139	mulps	%xmm5, %xmm13
1140	addps	%xmm13, %xmm9
1141	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13)
1142	mulps	%xmm5, %xmm14
1143	addps	%xmm14, %xmm10
1144	MOVUPS_A1 (-20 * SIZE, A2, %xmm14)
1145	mulps	%xmm5, %xmm15
1146	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1147	addps	%xmm15, %xmm11
1148	MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15)
1149
1150	mulps	%xmm4, %xmm0
1151	addps	%xmm0, %xmm8
1152	mulps	%xmm4, %xmm1
1153	addps	%xmm1, %xmm9
1154	mulps	%xmm4, %xmm2
1155	addps	%xmm2, %xmm10
1156	mulps	%xmm4, %xmm3
1157	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1158	addps	%xmm3, %xmm11
1159
1160	mulps	%xmm5, %xmm12
1161	addps	%xmm12, %xmm8
1162	mulps	%xmm5, %xmm13
1163	addps	%xmm13, %xmm9
1164	mulps	%xmm5, %xmm14
1165	addps	%xmm14, %xmm10
1166	mulps	%xmm5, %xmm15
1167	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1168	addps	%xmm15, %xmm11
1169
1170	addq	$16 * SIZE, A1
1171	addq	$16 * SIZE, A2
1172	addq	$16 * SIZE, X1
1173	ALIGN_4
1174
1175.L25:
1176	testq	$8, MM
1177	jle	.L26
1178
1179	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1180	mulps	%xmm4, %xmm0
1181	addps	%xmm0, %xmm8
1182	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1183	mulps	%xmm4, %xmm1
1184	addps	%xmm1, %xmm9
1185	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1186	mulps	%xmm4, %xmm2
1187	addps	%xmm2, %xmm10
1188	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3)
1189	mulps	%xmm4, %xmm3
1190	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1191	addps	%xmm3, %xmm11
1192
1193	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1194	mulps	%xmm5, %xmm12
1195	addps	%xmm12, %xmm8
1196	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13)
1197	mulps	%xmm5, %xmm13
1198	addps	%xmm13, %xmm9
1199	MOVUPS_A1 (-28 * SIZE, A2, %xmm14)
1200	mulps	%xmm5, %xmm14
1201	addps	%xmm14, %xmm10
1202	MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15)
1203	mulps	%xmm5, %xmm15
1204	addps	%xmm15, %xmm11
1205
1206	addq	$8 * SIZE, A1
1207	addq	$8 * SIZE, A2
1208	addq	$8 * SIZE, X1
1209	ALIGN_4
1210
1211.L26:
1212	testq	$4, MM
1213	jle	.L27
1214
1215	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1216	mulps	%xmm4, %xmm0
1217	addps	%xmm0, %xmm8
1218	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1219	mulps	%xmm4, %xmm1
1220	addps	%xmm1, %xmm9
1221	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1222	mulps	%xmm4, %xmm2
1223	addps	%xmm2, %xmm10
1224	MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3)
1225	mulps	%xmm4, %xmm3
1226	addps	%xmm3, %xmm11
1227
1228	addq	$4 * SIZE, A1
1229	addq	$4 * SIZE, A2
1230	addq	$4 * SIZE, X1
1231	ALIGN_4
1232
1233.L27:
1234	testq	$2, MM
1235	jle	.L28
1236
1237#ifdef movsd
1238	xorps	%xmm0, %xmm0
1239#endif
1240	movsd	-32 * SIZE(A1), %xmm0
1241#ifdef movsd
1242	xorps	%xmm4, %xmm4
1243#endif
1244	movsd	-32 * SIZE(X1), %xmm4
1245	mulps	%xmm4, %xmm0
1246	addps	%xmm0, %xmm8
1247#ifdef movsd
1248	xorps	%xmm1, %xmm1
1249#endif
1250	movsd	-32 * SIZE(A1, LDA), %xmm1
1251	mulps	%xmm4, %xmm1
1252	addps	%xmm1, %xmm9
1253#ifdef movsd
1254	xorps	%xmm2, %xmm2
1255#endif
1256	movsd	-32 * SIZE(A2), %xmm2
1257	mulps	%xmm4, %xmm2
1258	addps	%xmm2, %xmm10
1259#ifdef movsd
1260	xorps	%xmm3, %xmm3
1261#endif
1262	movsd	-32 * SIZE(A2, LDA), %xmm3
1263	mulps	%xmm4, %xmm3
1264	addps	%xmm3, %xmm11
1265	shufps	$0xe, %xmm4, %xmm4
1266
1267	addq	$2 * SIZE, A1
1268	addq	$2 * SIZE, A2
1269	addq	$2 * SIZE, X1
1270	ALIGN_4
1271
1272.L28:
1273	testq	$1, MM
1274	jle	.L29
1275
1276	movss	-32 * SIZE(A1), %xmm0
1277	movss	-32 * SIZE(X1), %xmm4
1278	mulss	%xmm4, %xmm0
1279	addss	%xmm0, %xmm8
1280	movss	-32 * SIZE(A1, LDA), %xmm1
1281	mulss	%xmm4, %xmm1
1282	addss	%xmm1, %xmm9
1283	movss	-32 * SIZE(A2), %xmm2
1284	mulss	%xmm4, %xmm2
1285	addss	%xmm2, %xmm10
1286	movss	-32 * SIZE(A2, LDA), %xmm3
1287	mulss	%xmm4, %xmm3
1288	addss	%xmm3, %xmm11
1289	ALIGN_4
1290
1291.L29:
1292#ifdef HAVE_SSE3
1293	haddps	%xmm9, %xmm8
1294	haddps	%xmm11, %xmm10
1295	haddps	%xmm10, %xmm8
1296
1297	pshufd	$0x1, %xmm8, %xmm9
1298	pshufd	$0x2, %xmm8, %xmm10
1299	pshufd	$0x3, %xmm8, %xmm11
1300#else
1301	movaps	%xmm8, %xmm0
1302	unpcklps %xmm9, %xmm8
1303	unpckhps %xmm9, %xmm0
1304
1305	movaps	%xmm10, %xmm1
1306	unpcklps %xmm11, %xmm10
1307	unpckhps %xmm11, %xmm1
1308
1309	movaps	%xmm8, %xmm9
1310	unpcklps %xmm10, %xmm8
1311	unpckhps %xmm10, %xmm9
1312
1313	movaps	%xmm0, %xmm10
1314	unpcklps %xmm1, %xmm0
1315	unpckhps %xmm1, %xmm10
1316
1317	addps	%xmm9, %xmm8
1318	addps	%xmm0, %xmm10
1319	addps	%xmm10, %xmm8
1320
1321	pshufd	$0x2, %xmm8, %xmm9
1322	pshufd	$0x1, %xmm8, %xmm10
1323	pshufd	$0x3, %xmm8, %xmm11
1324#endif
1325
1326	mulss	ALPHA, %xmm8
1327	mulss	ALPHA, %xmm9
1328	mulss	ALPHA, %xmm10
1329	mulss	ALPHA, %xmm11
1330
1331	addss	(Y), %xmm8
1332	addq	INCY, Y
1333	addss	(Y), %xmm9
1334	addq	INCY, Y
1335	addss	(Y), %xmm10
1336	addq	INCY, Y
1337	addss	(Y), %xmm11
1338	addq	INCY, Y
1339
1340	movss	%xmm8, (Y1)
1341	addq	INCY, Y1
1342	movss	%xmm9, (Y1)
1343	addq	INCY, Y1
1344	movss	%xmm10, (Y1)
1345	addq	INCY, Y1
1346	movss	%xmm11, (Y1)
1347	addq	INCY, Y1
1348
1349#if GEMV_UNROLL == 4
1350	cmpq	$4, N
1351	jge	.L21
1352#endif
1353	ALIGN_4
1354
1355.L30:
1356	cmpq	$3, N
1357	jne	.L40
1358
1359	leaq	32 * SIZE(BUFFER), X1
1360
1361	movq	A, A1
1362	leaq	(A1, LDA, 2), A2
1363	leaq	(A1, LDA, 4), A
1364
1365	xorps	%xmm8, %xmm8
1366	xorps	%xmm9, %xmm9
1367	xorps	%xmm10, %xmm10
1368
1369#ifdef ALIGNED_ACCESS
1370	cmpq	$3, M
1371	jle	.L37
1372
1373	testq	$SIZE, A1
1374	je	.L3X
1375
1376	movss	-32 * SIZE(A1), %xmm0
1377	movss	-32 * SIZE(X1), %xmm4
1378	mulss	%xmm4, %xmm0
1379	addss	%xmm0, %xmm8
1380	movss	-32 * SIZE(A1, LDA), %xmm1
1381	mulss	%xmm4, %xmm1
1382	addss	%xmm1, %xmm9
1383	movss	-32 * SIZE(A2), %xmm2
1384	mulss	%xmm4, %xmm2
1385	addss	%xmm2, %xmm10
1386	movss	-32 * SIZE(A2, LDA), %xmm3
1387	mulss	%xmm4, %xmm3
1388	addss	%xmm3, %xmm11
1389
1390	addq	 $1 * SIZE, A1
1391	addq	 $1 * SIZE, A2
1392	addq	 $1 * SIZE, X1
1393	ALIGN_3
1394
1395.L3X:
1396	testq	$2 * SIZE, A1
1397	je	.L3XX
1398
1399#ifdef movsd
1400	xorps	%xmm0, %xmm0
1401	xorps	%xmm4, %xmm4
1402#endif
1403	movsd	-32 * SIZE(A1), %xmm0
1404	movsd	-32 * SIZE(X1), %xmm4
1405	mulps	%xmm4, %xmm0
1406	addps	%xmm0, %xmm8
1407#ifdef movsd
1408	xorps	%xmm1, %xmm1
1409#endif
1410	movsd	-32 * SIZE(A1, LDA), %xmm1
1411	mulps	%xmm4, %xmm1
1412	addps	%xmm1, %xmm9
1413#ifdef movsd
1414	xorps	%xmm2, %xmm2
1415#endif
1416	movsd	-32 * SIZE(A2), %xmm2
1417	mulps	%xmm4, %xmm2
1418	addps	%xmm2, %xmm10
1419#ifdef movsd
1420	xorps	%xmm3, %xmm3
1421#endif
1422	movsd	-32 * SIZE(A2, LDA), %xmm3
1423	mulps	%xmm4, %xmm3
1424	addps	%xmm3, %xmm11
1425
1426	addq	 $2 * SIZE, A1
1427	addq	 $2 * SIZE, A2
1428	addq	 $2 * SIZE, X1
1429	ALIGN_3
1430
1431.L3XX:
1432#endif
1433
1434	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
1435	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
1436
1437#if (GEMV_UNROLL == 4) && defined(PREFETCHW)
1438	PREFETCHW	4 * SIZE(Y1)
1439#endif
1440
1441	movq	MM,  I
1442	sarq	$4,  I
1443	jle	.L35
1444
1445	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1446	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1447	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1448
1449	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1450	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13)
1451	MOVUPS_A1 (-28 * SIZE, A2, %xmm14)
1452
1453	decq	I
1454	jle	.L33
1455	ALIGN_4
1456
1457.L32:
1458#ifdef PREFETCH
1459	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1)
1460#endif
1461
1462	mulps	%xmm4, %xmm0
1463	addps	%xmm0, %xmm8
1464	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1465	mulps	%xmm4, %xmm1
1466	addps	%xmm1, %xmm9
1467	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
1468	mulps	%xmm4, %xmm2
1469	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1470	addps	%xmm2, %xmm10
1471	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
1472
1473#ifdef PREFETCH
1474	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA)
1475#endif
1476
1477	mulps	%xmm5, %xmm12
1478	addps	%xmm12, %xmm8
1479	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1480	mulps	%xmm5, %xmm13
1481	addps	%xmm13, %xmm9
1482	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13)
1483	mulps	%xmm5, %xmm14
1484	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1485	addps	%xmm14, %xmm10
1486	MOVUPS_A1 (-20 * SIZE, A2, %xmm14)
1487
1488#ifdef PREFETCH
1489	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2)
1490#endif
1491
1492	mulps	%xmm4, %xmm0
1493	addps	%xmm0, %xmm8
1494	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
1495	mulps	%xmm4, %xmm1
1496	addps	%xmm1, %xmm9
1497	MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1)
1498	mulps	%xmm4, %xmm2
1499	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1500	addps	%xmm2, %xmm10
1501	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)
1502
1503#ifdef PREFETCHW
1504	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1)
1505#endif
1506
1507	mulps	%xmm5, %xmm12
1508	addps	%xmm12, %xmm8
1509	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
1510	mulps	%xmm5, %xmm13
1511	addps	%xmm13, %xmm9
1512	MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13)
1513	mulps	%xmm5, %xmm14
1514	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1515	addps	%xmm14, %xmm10
1516	MOVUPS_A1 (-12 * SIZE, A2, %xmm14)
1517
1518	addq	$16 * SIZE, A1
1519	addq	$16 * SIZE, A2
1520	addq	$16 * SIZE, X1
1521
1522	decq	I
1523	jg	.L32
1524	ALIGN_4
1525
1526.L33:
1527	mulps	%xmm4, %xmm0
1528	addps	%xmm0, %xmm8
1529	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1530	mulps	%xmm4, %xmm1
1531	addps	%xmm1, %xmm9
1532	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
1533	mulps	%xmm4, %xmm2
1534	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1535	addps	%xmm2, %xmm10
1536	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
1537
1538	mulps	%xmm5, %xmm12
1539	addps	%xmm12, %xmm8
1540	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1541	mulps	%xmm5, %xmm13
1542	addps	%xmm13, %xmm9
1543	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13)
1544	mulps	%xmm5, %xmm14
1545	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1546	addps	%xmm14, %xmm10
1547	MOVUPS_A1 (-20 * SIZE, A2, %xmm14)
1548
1549	mulps	%xmm4, %xmm0
1550	addps	%xmm0, %xmm8
1551	mulps	%xmm4, %xmm1
1552	addps	%xmm1, %xmm9
1553	mulps	%xmm4, %xmm2
1554	addps	%xmm2, %xmm10
1555	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1556
1557	mulps	%xmm5, %xmm12
1558	addps	%xmm12, %xmm8
1559	mulps	%xmm5, %xmm13
1560	addps	%xmm13, %xmm9
1561	mulps	%xmm5, %xmm14
1562	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1563	addps	%xmm14, %xmm10
1564
1565	addq	$16 * SIZE, A1
1566	addq	$16 * SIZE, A2
1567	addq	$16 * SIZE, X1
1568	ALIGN_4
1569
1570.L35:
1571	testq	$8, MM
1572	jle	.L36
1573
1574	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1575	mulps	%xmm4, %xmm0
1576	addps	%xmm0, %xmm8
1577	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1578	mulps	%xmm4, %xmm1
1579	addps	%xmm1, %xmm9
1580	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1581	mulps	%xmm4, %xmm2
1582	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1583	addps	%xmm2, %xmm10
1584
1585	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1586	mulps	%xmm5, %xmm12
1587	addps	%xmm12, %xmm8
1588	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13)
1589	mulps	%xmm5, %xmm13
1590	addps	%xmm13, %xmm9
1591	MOVUPS_A1 (-28 * SIZE, A2, %xmm14)
1592	mulps	%xmm5, %xmm14
1593	addps	%xmm14, %xmm10
1594
1595	addq	$8 * SIZE, A1
1596	addq	$8 * SIZE, A2
1597	addq	$8 * SIZE, X1
1598	ALIGN_4
1599
1600.L36:
1601	testq	$4, MM
1602	jle	.L37
1603
1604	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1605	mulps	%xmm4, %xmm0
1606	addps	%xmm0, %xmm8
1607	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
1608	mulps	%xmm4, %xmm1
1609	addps	%xmm1, %xmm9
1610	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
1611	mulps	%xmm4, %xmm2
1612	addps	%xmm2, %xmm10
1613
1614	addq	$4 * SIZE, A1
1615	addq	$4 * SIZE, A2
1616	addq	$4 * SIZE, X1
1617	ALIGN_4
1618
1619.L37:
1620	testq	$2, MM
1621	jle	.L38
1622
1623#ifdef movsd
1624	xorps	%xmm0, %xmm0
1625#endif
1626	movsd	-32 * SIZE(A1), %xmm0
1627#ifdef movsd
1628	xorps	%xmm4, %xmm4
1629#endif
1630	movsd	-32 * SIZE(X1), %xmm4
1631	mulps	%xmm4, %xmm0
1632	addps	%xmm0, %xmm8
1633#ifdef movsd
1634	xorps	%xmm1, %xmm1
1635#endif
1636	movsd	-32 * SIZE(A1, LDA), %xmm1
1637	mulps	%xmm4, %xmm1
1638	addps	%xmm1, %xmm9
1639#ifdef movsd
1640	xorps	%xmm2, %xmm2
1641#endif
1642	movsd	-32 * SIZE(A2), %xmm2
1643	mulps	%xmm4, %xmm2
1644	addps	%xmm2, %xmm10
1645#ifdef movsd
1646	xorps	%xmm3, %xmm3
1647#endif
1648
1649	addq	$2 * SIZE, A1
1650	addq	$2 * SIZE, A2
1651	addq	$2 * SIZE, X1
1652	ALIGN_4
1653
1654.L38:
1655	testq	$1, MM
1656	jle	.L39
1657
1658	movss	-32 * SIZE(A1), %xmm0
1659	movss	-32 * SIZE(X1), %xmm4
1660	mulss	%xmm4, %xmm0
1661	addss	%xmm0, %xmm8
1662	movss	-32 * SIZE(A1, LDA), %xmm1
1663	mulss	%xmm4, %xmm1
1664	addss	%xmm1, %xmm9
1665	movss	-32 * SIZE(A2), %xmm2
1666	mulss	%xmm4, %xmm2
1667	addss	%xmm2, %xmm10
1668	ALIGN_4
1669
1670.L39:
1671#ifdef HAVE_SSE3
1672	haddps	%xmm9, %xmm8
1673	haddps	%xmm11, %xmm10
1674	haddps	%xmm10, %xmm8
1675
1676	pshufd	$0x1, %xmm8, %xmm9
1677	pshufd	$0x2, %xmm8, %xmm10
1678#else
1679	movaps	%xmm8, %xmm0
1680	unpcklps %xmm9, %xmm8
1681	unpckhps %xmm9, %xmm0
1682
1683	movaps	%xmm10, %xmm1
1684	unpcklps %xmm11, %xmm10
1685	unpckhps %xmm11, %xmm1
1686
1687	movaps	%xmm8, %xmm9
1688	unpcklps %xmm10, %xmm8
1689	unpckhps %xmm10, %xmm9
1690
1691	movaps	%xmm0, %xmm10
1692	unpcklps %xmm1, %xmm0
1693	unpckhps %xmm1, %xmm10
1694
1695	addps	%xmm9, %xmm8
1696	addps	%xmm0, %xmm10
1697	addps	%xmm10, %xmm8
1698
1699	pshufd	$0x2, %xmm8, %xmm9
1700	pshufd	$0x1, %xmm8, %xmm10
1701#endif
1702
1703	mulss	ALPHA, %xmm8
1704	mulss	ALPHA, %xmm9
1705	mulss	ALPHA, %xmm10
1706
1707	addss	(Y), %xmm8
1708	addq	INCY, Y
1709	addss	(Y), %xmm9
1710	addq	INCY, Y
1711	addss	(Y), %xmm10
1712	addq	INCY, Y
1713
1714	movss	%xmm8, (Y1)
1715	addq	INCY, Y1
1716	movss	%xmm9, (Y1)
1717	addq	INCY, Y1
1718	movss	%xmm10, (Y1)
1719	addq	INCY, Y1
1720	jmp	.L999
1721	ALIGN_4
1722
1723.L40:
1724	cmpq	$2, N
1725	jne	.L50
1726
1727	leaq	32 * SIZE(BUFFER), X1
1728
1729	movq	A, A1
1730	leaq	(A1, LDA), A2
1731	leaq	(A1, LDA, 2), A
1732
1733	xorps	%xmm8, %xmm8
1734	xorps	%xmm9, %xmm9
1735
1736#ifdef ALIGNED_ACCESS
1737	cmpq	$3, M
1738	jle	.L47
1739
1740	testq	$SIZE, A1
1741	je	.L4X
1742
1743	movss	-32 * SIZE(A1), %xmm0
1744	movss	-32 * SIZE(X1), %xmm4
1745	mulss	%xmm4, %xmm0
1746	addss	%xmm0, %xmm8
1747	movss	-32 * SIZE(A2), %xmm1
1748	mulss	%xmm4, %xmm1
1749	addss	%xmm1, %xmm9
1750
1751	addq	 $1 * SIZE, A1
1752	addq	 $1 * SIZE, A2
1753	addq	 $1 * SIZE, X1
1754	ALIGN_3
1755
1756.L4X:
1757	testq	$2 * SIZE, A1
1758	je	.L4XX
1759
1760#ifdef movsd
1761	xorps	%xmm0, %xmm0
1762	xorps	%xmm4, %xmm4
1763#endif
1764	movsd	-32 * SIZE(A1), %xmm0
1765	movsd	-32 * SIZE(X1), %xmm4
1766	mulps	%xmm4, %xmm0
1767	addps	%xmm0, %xmm8
1768#ifdef movsd
1769	xorps	%xmm1, %xmm1
1770#endif
1771	movsd	-32 * SIZE(A2), %xmm1
1772	mulps	%xmm4, %xmm1
1773	addps	%xmm1, %xmm9
1774
1775	addq	 $2 * SIZE, A1
1776	addq	 $2 * SIZE, A2
1777	addq	 $2 * SIZE, X1
1778	ALIGN_3
1779
1780.L4XX:
1781#endif
1782
1783	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
1784	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
1785
1786	movq	MM,  I
1787	sarq	$4,  I
1788	jle	.L45
1789
1790	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1791	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
1792	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1793	MOVUPS_A1 (-28 * SIZE, A2, %xmm13)
1794
1795	decq	I
1796	jle	.L43
1797	ALIGN_4
1798
1799.L42:
1800#ifdef PREFETCH
1801	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
1802#endif
1803
1804	mulps	%xmm4, %xmm0
1805	addps	%xmm0, %xmm8
1806	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1807	mulps	%xmm4, %xmm1
1808	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1809	addps	%xmm1, %xmm9
1810	MOVUPS_A1 (-24 * SIZE, A2, %xmm1)
1811
1812	mulps	%xmm5, %xmm12
1813	addps	%xmm12, %xmm8
1814	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1815	mulps	%xmm5, %xmm13
1816	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1817	addps	%xmm13, %xmm9
1818	MOVUPS_A1 (-20 * SIZE, A2, %xmm13)
1819
1820#ifdef PREFETCH
1821	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
1822#endif
1823
1824	mulps	%xmm4, %xmm0
1825	addps	%xmm0, %xmm8
1826	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
1827	mulps	%xmm4, %xmm1
1828	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1829	addps	%xmm1, %xmm9
1830	MOVUPS_A1 (-16 * SIZE, A2, %xmm1)
1831
1832#ifdef PREFETCHW
1833	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
1834#endif
1835
1836	mulps	%xmm5, %xmm12
1837	addps	%xmm12, %xmm8
1838	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
1839	mulps	%xmm5, %xmm13
1840	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1841	addps	%xmm13, %xmm9
1842	MOVUPS_A1 (-12 * SIZE, A2, %xmm13)
1843
1844	addq	$16 * SIZE, A1
1845	addq	$16 * SIZE, A2
1846	addq	$16 * SIZE, X1
1847
1848	decq	I
1849	jg	.L42
1850	ALIGN_4
1851
1852.L43:
1853	mulps	%xmm4, %xmm0
1854	addps	%xmm0, %xmm8
1855	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
1856	mulps	%xmm4, %xmm1
1857	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1858	addps	%xmm1, %xmm9
1859	MOVUPS_A1 (-24 * SIZE, A2, %xmm1)
1860
1861	mulps	%xmm5, %xmm12
1862	addps	%xmm12, %xmm8
1863	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
1864	mulps	%xmm5, %xmm13
1865	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
1866	addps	%xmm13, %xmm9
1867	MOVUPS_A1 (-20 * SIZE, A2, %xmm13)
1868
1869	mulps	%xmm4, %xmm0
1870	addps	%xmm0, %xmm8
1871	mulps	%xmm4, %xmm1
1872	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
1873	addps	%xmm1, %xmm9
1874
1875	mulps	%xmm5, %xmm12
1876	addps	%xmm12, %xmm8
1877	mulps	%xmm5, %xmm13
1878	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
1879	addps	%xmm13, %xmm9
1880
1881	addq	$16 * SIZE, A1
1882	addq	$16 * SIZE, A2
1883	addq	$16 * SIZE, X1
1884	ALIGN_4
1885
1886.L45:
1887	testq	$8, MM
1888	jle	.L46
1889
1890	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1891	mulps	%xmm4, %xmm0
1892	addps	%xmm0, %xmm8
1893	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
1894	mulps	%xmm4, %xmm1
1895	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
1896	addps	%xmm1, %xmm9
1897
1898	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
1899	mulps	%xmm5, %xmm12
1900	addps	%xmm12, %xmm8
1901	MOVUPS_A1 (-28 * SIZE, A2, %xmm13)
1902	mulps	%xmm5, %xmm13
1903	addps	%xmm13, %xmm9
1904
1905	addq	$8 * SIZE, A1
1906	addq	$8 * SIZE, A2
1907	addq	$8 * SIZE, X1
1908	ALIGN_4
1909
1910.L46:
1911	testq	$4, MM
1912	jle	.L47
1913
1914	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
1915	mulps	%xmm4, %xmm0
1916	addps	%xmm0, %xmm8
1917	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
1918	mulps	%xmm4, %xmm1
1919	addps	%xmm1, %xmm9
1920
1921	addq	$4 * SIZE, A1
1922	addq	$4 * SIZE, A2
1923	addq	$4 * SIZE, X1
1924	ALIGN_4
1925
1926.L47:
1927	testq	$2, MM
1928	jle	.L48
1929
1930#ifdef movsd
1931	xorps	%xmm0, %xmm0
1932#endif
1933	movsd	-32 * SIZE(A1), %xmm0
1934#ifdef movsd
1935	xorps	%xmm4, %xmm4
1936#endif
1937	movsd	-32 * SIZE(X1), %xmm4
1938	mulps	%xmm4, %xmm0
1939	addps	%xmm0, %xmm8
1940#ifdef movsd
1941	xorps	%xmm1, %xmm1
1942#endif
1943	movsd	-32 * SIZE(A2), %xmm1
1944	mulps	%xmm4, %xmm1
1945	addps	%xmm1, %xmm9
1946	shufps	$0xe, %xmm4, %xmm4
1947
1948	addq	$2 * SIZE, A1
1949	addq	$2 * SIZE, A2
1950	addq	$2 * SIZE, X1
1951	ALIGN_4
1952
1953.L48:
1954	testq	$1, MM
1955	jle	.L49
1956
1957	movss	-32 * SIZE(A1), %xmm0
1958	movss	-32 * SIZE(X1), %xmm4
1959	mulss	%xmm4, %xmm0
1960	addss	%xmm0, %xmm8
1961	movss	-32 * SIZE(A2), %xmm1
1962	mulss	%xmm4, %xmm1
1963	addss	%xmm1, %xmm9
1964	ALIGN_4
1965
1966.L49:
1967#ifdef HAVE_SSE3
1968	haddps	%xmm9, %xmm8
1969	haddps	%xmm8, %xmm8
1970#else
1971	movaps	%xmm8, %xmm10
1972	unpcklps %xmm9, %xmm8
1973	unpckhps %xmm9, %xmm10
1974
1975	addps	%xmm10, %xmm8
1976	movhlps %xmm8, %xmm9
1977	addps	%xmm9, %xmm8
1978#endif
1979
1980	pshufd	$0x1, %xmm8, %xmm9
1981
1982	mulss	ALPHA, %xmm8
1983	mulss	ALPHA, %xmm9
1984
1985	addss	(Y), %xmm8
1986	addq	INCY, Y
1987	addss	(Y), %xmm9
1988	addq	INCY, Y
1989
1990	movss	%xmm8, (Y1)
1991	addq	INCY, Y1
1992	movss	%xmm9, (Y1)
1993	addq	INCY, Y1
1994	jmp	.L999
1995	ALIGN_4
1996
1997.L50:
1998	cmpq	$1, N
1999	jne	.L999
2000
2001	leaq	32 * SIZE(BUFFER), X1
2002
2003	movq	A, A1
2004
2005	xorps	%xmm8, %xmm8
2006	xorps	%xmm9, %xmm9
2007
2008#ifdef ALIGNED_ACCESS
2009	cmpq	$3, M
2010	jle	.L57
2011
2012	testq	$SIZE, A1
2013	je	.L5X
2014
2015	movss	-32 * SIZE(A1), %xmm0
2016	movss	-32 * SIZE(X1), %xmm4
2017	mulss	%xmm4, %xmm0
2018	addss	%xmm0, %xmm8
2019
2020	addq	 $1 * SIZE, A1
2021	addq	 $1 * SIZE, X1
2022	ALIGN_3
2023
2024.L5X:
2025	testq	$2 * SIZE, A1
2026	je	.L5XX
2027
2028
2029#ifdef movsd
2030	xorps	%xmm0, %xmm0
2031	xorps	%xmm4, %xmm4
2032#endif
2033	movsd	-32 * SIZE(A1), %xmm0
2034	movsd	-32 * SIZE(X1), %xmm4
2035	mulps	%xmm4, %xmm0
2036	addps	%xmm0, %xmm8
2037	shufps	$0xe, %xmm4, %xmm4
2038
2039	addq	 $2 * SIZE, A1
2040	addq	 $2 * SIZE, X1
2041	ALIGN_3
2042
2043.L5XX:
2044#endif
2045
2046	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
2047	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
2048
2049	movq	MM,  I
2050	sarq	$4,  I
2051	jle	.L55
2052
2053	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2054	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
2055
2056	decq	I
2057	jle	.L53
2058	ALIGN_4
2059
2060.L52:
2061#ifdef PREFETCH
2062	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
2063#endif
2064
2065	mulps	%xmm4, %xmm0
2066	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2067	addps	%xmm0, %xmm8
2068	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2069
2070	mulps	%xmm5, %xmm12
2071	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2072	addps	%xmm12, %xmm9
2073	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
2074
2075#ifdef PREFETCHW
2076	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
2077#endif
2078
2079	mulps	%xmm4, %xmm0
2080	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2081	addps	%xmm0, %xmm8
2082	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
2083
2084	mulps	%xmm5, %xmm12
2085	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2086	addps	%xmm12, %xmm9
2087	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
2088
2089	addq	$16 * SIZE, A1
2090	addq	$16 * SIZE, X1
2091
2092	decq	I
2093	jg	.L52
2094	ALIGN_4
2095
2096.L53:
2097	mulps	%xmm4, %xmm0
2098	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2099	addps	%xmm0, %xmm8
2100	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2101
2102	mulps	%xmm5, %xmm12
2103	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2104	addps	%xmm12, %xmm9
2105	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
2106
2107	mulps	%xmm4, %xmm0
2108	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2109	addps	%xmm0, %xmm8
2110
2111	mulps	%xmm5, %xmm12
2112	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2113	addps	%xmm12, %xmm9
2114
2115	addq	$16 * SIZE, A1
2116	addq	$16 * SIZE, X1
2117	ALIGN_4
2118
2119.L55:
2120	testq	$8, MM
2121	jle	.L56
2122
2123	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2124	mulps	%xmm4, %xmm0
2125	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2126	addps	%xmm0, %xmm8
2127
2128	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
2129	mulps	%xmm5, %xmm12
2130	addps	%xmm12, %xmm9
2131
2132	addq	$8 * SIZE, A1
2133	addq	$8 * SIZE, X1
2134	ALIGN_4
2135
2136.L56:
2137	testq	$4, MM
2138	jle	.L57
2139
2140	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2141	mulps	%xmm4, %xmm0
2142	addps	%xmm0, %xmm8
2143
2144	addq	$4 * SIZE, A1
2145	addq	$4 * SIZE, X1
2146	ALIGN_4
2147
2148.L57:
2149	testq	$2, MM
2150	jle	.L58
2151
2152#ifdef movsd
2153	xorps	%xmm0, %xmm0
2154#endif
2155	movsd	-32 * SIZE(A1), %xmm0
2156#ifdef movsd
2157	xorps	%xmm4, %xmm4
2158#endif
2159	movsd	-32 * SIZE(X1), %xmm4
2160	mulps	%xmm4, %xmm0
2161	addps	%xmm0, %xmm8
2162	shufps	$0xe, %xmm4, %xmm4
2163
2164	addq	$2 * SIZE, A1
2165	addq	$2 * SIZE, X1
2166	ALIGN_4
2167
2168.L58:
2169	testq	$1, MM
2170	jle	.L59
2171
2172	movss	-32 * SIZE(A1), %xmm0
2173	movss	-32 * SIZE(X1), %xmm4
2174	mulss	%xmm4, %xmm0
2175	addss	%xmm0, %xmm8
2176	ALIGN_4
2177
2178.L59:
2179	addps	%xmm9, %xmm8
2180
2181#ifdef HAVE_SSE3
2182	haddps	%xmm8, %xmm8
2183	haddps	%xmm8, %xmm8
2184#else
2185	pshufd	$1, %xmm8, %xmm9
2186	pshufd	$2, %xmm8, %xmm10
2187	pshufd	$3, %xmm8, %xmm11
2188
2189	addss	%xmm9, %xmm8
2190	addss	%xmm11, %xmm10
2191	addss	%xmm10, %xmm8
2192#endif
2193
2194	mulss	ALPHA, %xmm8
2195
2196	addss	(Y), %xmm8
2197	movss	%xmm8, (Y1)
2198
2199#ifdef ALIGNED_ACCESS
2200	jmp	.L999
2201	ALIGN_4
2202
2203.L100:
2204	testq	$2 * SIZE - 1, LDA
2205	jne	.L200
2206
2207	cmpq	$4, N
2208	jl	.L110
2209	ALIGN_3
2210
2211.L101:
2212	subq	$4, N
2213
2214	leaq	32 * SIZE(BUFFER), X1
2215
2216	movq	A, A1
2217	leaq	(A1, LDA, 2), A2
2218	leaq	(A1, LDA, 4), A
2219
2220	xorps	%xmm8, %xmm8
2221	xorps	%xmm9, %xmm9
2222	xorps	%xmm10, %xmm10
2223	xorps	%xmm11, %xmm11
2224
2225	cmpq	$3, M
2226	jle	.L107
2227
2228	testq	$SIZE, A1
2229	je	.L10X
2230
2231	movss	-32 * SIZE(A1), %xmm0
2232	movss	-32 * SIZE(X1), %xmm4
2233	mulss	%xmm4, %xmm0
2234	addss	%xmm0, %xmm8
2235	movss	-32 * SIZE(A1, LDA), %xmm1
2236	mulss	%xmm4, %xmm1
2237	addss	%xmm1, %xmm9
2238	movss	-32 * SIZE(A2), %xmm2
2239	mulss	%xmm4, %xmm2
2240	addss	%xmm2, %xmm10
2241	movss	-32 * SIZE(A2, LDA), %xmm3
2242	mulss	%xmm4, %xmm3
2243	addss	%xmm3, %xmm11
2244
2245	addq	 $1 * SIZE, A1
2246	addq	 $1 * SIZE, A2
2247	addq	 $1 * SIZE, X1
2248	ALIGN_3
2249
2250.L10X:
2251	testq	$2 * SIZE, A1
2252	je	.L10XX
2253
2254#ifdef movsd
2255	xorps	%xmm0, %xmm0
2256	xorps	%xmm4, %xmm4
2257#endif
2258	movsd	-32 * SIZE(A1), %xmm0
2259	movsd	-32 * SIZE(X1), %xmm4
2260	mulps	%xmm4, %xmm0
2261	addps	%xmm0, %xmm8
2262#ifdef movsd
2263	xorps	%xmm1, %xmm1
2264#endif
2265	movsd	-32 * SIZE(A1, LDA), %xmm1
2266	mulps	%xmm4, %xmm1
2267	addps	%xmm1, %xmm9
2268#ifdef movsd
2269	xorps	%xmm2, %xmm2
2270#endif
2271	movsd	-32 * SIZE(A2), %xmm2
2272	mulps	%xmm4, %xmm2
2273	addps	%xmm2, %xmm10
2274#ifdef movsd
2275	xorps	%xmm3, %xmm3
2276#endif
2277	movsd	-32 * SIZE(A2, LDA), %xmm3
2278	mulps	%xmm4, %xmm3
2279	addps	%xmm3, %xmm11
2280
2281	addq	 $2 * SIZE, A1
2282	addq	 $2 * SIZE, A2
2283	addq	 $2 * SIZE, X1
2284	ALIGN_3
2285
2286.L10XX:
2287	MOVUPS_A2  (-34 * SIZE, A1, LDA, 1, %xmm12)
2288	MOVUPS_A2  (-34 * SIZE, A2, LDA, 1, %xmm13)
2289
2290	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
2291	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
2292
2293#ifdef PREFETCHW
2294	PREFETCHW	4 * SIZE(Y1)
2295#endif
2296
2297	movq	MM,  I
2298	sarq	$4,  I
2299	jle	.L105
2300
2301	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2302	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2303	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2304	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)
2305
2306	decq	I
2307	jle	.L103
2308	ALIGN_4
2309
2310.L102:
2311#ifdef PREFETCH
2312	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
2313#endif
2314
2315	mulps	%xmm4, %xmm0
2316	addps	%xmm0, %xmm8
2317	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2318	shufps	$0x4e, %xmm1, %xmm12
2319	mulps	%xmm4, %xmm12
2320	addps	%xmm12, %xmm9
2321	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2322	mulps	%xmm4, %xmm2
2323	addps	%xmm2, %xmm10
2324	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2325	shufps	$0x4e, %xmm3, %xmm13
2326	mulps	%xmm4, %xmm13
2327	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2328	addps	%xmm13, %xmm11
2329	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)
2330
2331#ifdef PREFETCH
2332	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
2333#endif
2334
2335	mulps	%xmm5, %xmm0
2336	addps	%xmm0, %xmm8
2337	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2338	shufps	$0x4e, %xmm12, %xmm1
2339	mulps	%xmm5, %xmm1
2340	addps	%xmm1, %xmm9
2341	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
2342	mulps	%xmm5, %xmm2
2343	addps	%xmm2, %xmm10
2344	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
2345	shufps	$0x4e, %xmm13, %xmm3
2346	mulps	%xmm5, %xmm3
2347	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2348	addps	%xmm3, %xmm11
2349	MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3)
2350
2351#ifdef PREFETCH
2352	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
2353#endif
2354
2355	mulps	%xmm4, %xmm0
2356	addps	%xmm0, %xmm8
2357	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
2358	shufps	$0x4e, %xmm1, %xmm12
2359	mulps	%xmm4, %xmm12
2360	addps	%xmm12, %xmm9
2361	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
2362	mulps	%xmm4, %xmm2
2363	addps	%xmm2, %xmm10
2364	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
2365	shufps	$0x4e, %xmm3, %xmm13
2366	mulps	%xmm4, %xmm13
2367	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2368	addps	%xmm13, %xmm11
2369	MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13)
2370
2371#ifdef PREFETCH
2372	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
2373#endif
2374
2375	mulps	%xmm5, %xmm0
2376	addps	%xmm0, %xmm8
2377	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
2378	shufps	$0x4e, %xmm12, %xmm1
2379	mulps	%xmm5, %xmm1
2380	addps	%xmm1, %xmm9
2381	MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1)
2382
2383#ifdef PREFETCHW
2384	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
2385#endif
2386
2387	mulps	%xmm5, %xmm2
2388	addps	%xmm2, %xmm10
2389	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)
2390	shufps	$0x4e, %xmm13, %xmm3
2391	mulps	%xmm5, %xmm3
2392	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2393	addps	%xmm3, %xmm11
2394	MOVUPS_A2 (-14 * SIZE, A2, LDA, 1, %xmm3)
2395
2396	addq	$16 * SIZE, A1
2397	addq	$16 * SIZE, A2
2398	addq	$16 * SIZE, X1
2399
2400	decq	I
2401	jg	.L102
2402	ALIGN_4
2403
2404.L103:
2405	mulps	%xmm4, %xmm0
2406	addps	%xmm0, %xmm8
2407	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2408	shufps	$0x4e, %xmm1, %xmm12
2409	mulps	%xmm4, %xmm12
2410	addps	%xmm12, %xmm9
2411	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2412	mulps	%xmm4, %xmm2
2413	addps	%xmm2, %xmm10
2414	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2415	shufps	$0x4e, %xmm3, %xmm13
2416	mulps	%xmm4, %xmm13
2417	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2418	addps	%xmm13, %xmm11
2419	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)
2420
2421	mulps	%xmm5, %xmm0
2422	addps	%xmm0, %xmm8
2423	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2424	shufps	$0x4e, %xmm12, %xmm1
2425	mulps	%xmm5, %xmm1
2426	addps	%xmm1, %xmm9
2427	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
2428	mulps	%xmm5, %xmm2
2429	addps	%xmm2, %xmm10
2430	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
2431	shufps	$0x4e, %xmm13, %xmm3
2432	mulps	%xmm5, %xmm3
2433	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2434	addps	%xmm3, %xmm11
2435	MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3)
2436
2437	mulps	%xmm4, %xmm0
2438	addps	%xmm0, %xmm8
2439	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
2440	shufps	$0x4e, %xmm1, %xmm12
2441	mulps	%xmm4, %xmm12
2442	addps	%xmm12, %xmm9
2443	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
2444	mulps	%xmm4, %xmm2
2445	addps	%xmm2, %xmm10
2446	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
2447	shufps	$0x4e, %xmm3, %xmm13
2448	mulps	%xmm4, %xmm13
2449	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2450	addps	%xmm13, %xmm11
2451	MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13)
2452
2453	mulps	%xmm5, %xmm0
2454	addps	%xmm0, %xmm8
2455	shufps	$0x4e, %xmm12, %xmm1
2456	mulps	%xmm5, %xmm1
2457	addps	%xmm1, %xmm9
2458	mulps	%xmm5, %xmm2
2459	addps	%xmm2, %xmm10
2460	shufps	$0x4e, %xmm13, %xmm3
2461	mulps	%xmm5, %xmm3
2462	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2463	addps	%xmm3, %xmm11
2464
2465	addq	$16 * SIZE, A1
2466	addq	$16 * SIZE, A2
2467	addq	$16 * SIZE, X1
2468	ALIGN_4
2469
2470.L105:
2471	testq	$8, MM
2472	jle	.L106
2473
2474	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2475	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2476	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2477	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)
2478
2479	mulps	%xmm4, %xmm0
2480	addps	%xmm0, %xmm8
2481	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2482	shufps	$0x4e, %xmm1, %xmm12
2483	mulps	%xmm4, %xmm12
2484	addps	%xmm12, %xmm9
2485	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2486	mulps	%xmm4, %xmm2
2487	addps	%xmm2, %xmm10
2488	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2489	shufps	$0x4e, %xmm3, %xmm13
2490	mulps	%xmm4, %xmm13
2491	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2492	addps	%xmm13, %xmm11
2493	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)
2494
2495	mulps	%xmm5, %xmm0
2496	addps	%xmm0, %xmm8
2497	shufps	$0x4e, %xmm12, %xmm1
2498	mulps	%xmm5, %xmm1
2499	addps	%xmm1, %xmm9
2500	mulps	%xmm5, %xmm2
2501	addps	%xmm2, %xmm10
2502	shufps	$0x4e, %xmm13, %xmm3
2503	mulps	%xmm5, %xmm3
2504	addps	%xmm3, %xmm11
2505
2506	addq	$8 * SIZE, A1
2507	addq	$8 * SIZE, A2
2508	addq	$8 * SIZE, X1
2509	ALIGN_4
2510
2511.L106:
2512	testq	$4, MM
2513	jle	.L107
2514
2515	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2516	mulps	%xmm4, %xmm0
2517	addps	%xmm0, %xmm8
2518	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2519	shufps	$0x4e, %xmm1, %xmm12
2520	mulps	%xmm4, %xmm12
2521	addps	%xmm12, %xmm9
2522
2523	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2524	mulps	%xmm4, %xmm2
2525	addps	%xmm2, %xmm10
2526	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)
2527	shufps	$0x4e, %xmm3, %xmm13
2528	mulps	%xmm4, %xmm13
2529	addps	%xmm13, %xmm11
2530
2531	addq	$4 * SIZE, A1
2532	addq	$4 * SIZE, A2
2533	addq	$4 * SIZE, X1
2534	ALIGN_4
2535
2536.L107:
2537	testq	$2, MM
2538	jle	.L108
2539
2540#ifdef movsd
2541	xorps	%xmm0, %xmm0
2542#endif
2543	movsd	-32 * SIZE(A1), %xmm0
2544#ifdef movsd
2545	xorps	%xmm4, %xmm4
2546#endif
2547	movsd	-32 * SIZE(X1), %xmm4
2548	mulps	%xmm4, %xmm0
2549	addps	%xmm0, %xmm8
2550#ifdef movsd
2551	xorps	%xmm1, %xmm1
2552#endif
2553	movsd	-32 * SIZE(A1, LDA), %xmm1
2554	mulps	%xmm4, %xmm1
2555	addps	%xmm1, %xmm9
2556#ifdef movsd
2557	xorps	%xmm2, %xmm2
2558#endif
2559	movsd	-32 * SIZE(A2), %xmm2
2560	mulps	%xmm4, %xmm2
2561	addps	%xmm2, %xmm10
2562#ifdef movsd
2563	xorps	%xmm3, %xmm3
2564#endif
2565	movsd	-32 * SIZE(A2, LDA), %xmm3
2566	mulps	%xmm4, %xmm3
2567	addps	%xmm3, %xmm11
2568	shufps	$0xe, %xmm4, %xmm4
2569
2570	addq	$2 * SIZE, A1
2571	addq	$2 * SIZE, A2
2572	addq	$2 * SIZE, X1
2573	ALIGN_4
2574
2575.L108:
2576	testq	$1, MM
2577	jle	.L109
2578
2579	movss	-32 * SIZE(A1), %xmm0
2580	movss	-32 * SIZE(X1), %xmm4
2581	mulss	%xmm4, %xmm0
2582	addss	%xmm0, %xmm8
2583	movss	-32 * SIZE(A1, LDA), %xmm1
2584	mulss	%xmm4, %xmm1
2585	addss	%xmm1, %xmm9
2586	movss	-32 * SIZE(A2), %xmm2
2587	mulss	%xmm4, %xmm2
2588	addss	%xmm2, %xmm10
2589	movss	-32 * SIZE(A2, LDA), %xmm3
2590	mulss	%xmm4, %xmm3
2591	addss	%xmm3, %xmm11
2592	ALIGN_4
2593
2594.L109:
2595#ifdef HAVE_SSE3
2596	haddps	%xmm9, %xmm8
2597	haddps	%xmm11, %xmm10
2598	haddps	%xmm10, %xmm8
2599
2600	pshufd	$0x1, %xmm8, %xmm9
2601	pshufd	$0x2, %xmm8, %xmm10
2602	pshufd	$0x3, %xmm8, %xmm11
2603#else
2604	movaps	%xmm8, %xmm0
2605	unpcklps %xmm9, %xmm8
2606	unpckhps %xmm9, %xmm0
2607
2608	movaps	%xmm10, %xmm1
2609	unpcklps %xmm11, %xmm10
2610	unpckhps %xmm11, %xmm1
2611
2612	movaps	%xmm8, %xmm9
2613	unpcklps %xmm10, %xmm8
2614	unpckhps %xmm10, %xmm9
2615
2616	movaps	%xmm0, %xmm10
2617	unpcklps %xmm1, %xmm0
2618	unpckhps %xmm1, %xmm10
2619
2620	addps	%xmm9, %xmm8
2621	addps	%xmm0, %xmm10
2622	addps	%xmm10, %xmm8
2623
2624	pshufd	$0x2, %xmm8, %xmm9
2625	pshufd	$0x1, %xmm8, %xmm10
2626	pshufd	$0x3, %xmm8, %xmm11
2627#endif
2628
2629	mulss	ALPHA, %xmm8
2630	mulss	ALPHA, %xmm9
2631	mulss	ALPHA, %xmm10
2632	mulss	ALPHA, %xmm11
2633
2634	addss	(Y), %xmm8
2635	addq	INCY, Y
2636	addss	(Y), %xmm9
2637	addq	INCY, Y
2638	addss	(Y), %xmm10
2639	addq	INCY, Y
2640	addss	(Y), %xmm11
2641	addq	INCY, Y
2642
2643	movss	%xmm8, (Y1)
2644	addq	INCY, Y1
2645	movss	%xmm9, (Y1)
2646	addq	INCY, Y1
2647	movss	%xmm10, (Y1)
2648	addq	INCY, Y1
2649	movss	%xmm11, (Y1)
2650	addq	INCY, Y1
2651
2652	cmpq	$4, N
2653	jge	.L101
2654	ALIGN_4
2655
2656.L110:
2657	cmpq	$3, N
2658	jne	.L120
2659
2660	leaq	32 * SIZE(BUFFER), X1
2661
2662	movq	A, A1
2663	leaq	(A1, LDA, 2), A2
2664	leaq	(A1, LDA, 4), A
2665
2666	xorps	%xmm8, %xmm8
2667	xorps	%xmm9, %xmm9
2668	xorps	%xmm10, %xmm10
2669
2670	cmpq	$3, M
2671	jle	.L117
2672
2673	testq	$SIZE, A1
2674	je	.L11X
2675
2676	movss	-32 * SIZE(A1), %xmm0
2677	movss	-32 * SIZE(X1), %xmm4
2678	mulss	%xmm4, %xmm0
2679	addss	%xmm0, %xmm8
2680	movss	-32 * SIZE(A1, LDA), %xmm1
2681	mulss	%xmm4, %xmm1
2682	addss	%xmm1, %xmm9
2683	movss	-32 * SIZE(A2), %xmm2
2684	mulss	%xmm4, %xmm2
2685	addss	%xmm2, %xmm10
2686
2687	addq	 $1 * SIZE, A1
2688	addq	 $1 * SIZE, A2
2689	addq	 $1 * SIZE, X1
2690	ALIGN_3
2691
2692.L11X:
2693	testq	$2 * SIZE, A1
2694	je	.L11XX
2695
2696#ifdef movsd
2697	xorps	%xmm0, %xmm0
2698	xorps	%xmm4, %xmm4
2699#endif
2700	movsd	-32 * SIZE(A1), %xmm0
2701	movsd	-32 * SIZE(X1), %xmm4
2702	mulps	%xmm4, %xmm0
2703	addps	%xmm0, %xmm8
2704#ifdef movsd
2705	xorps	%xmm1, %xmm1
2706#endif
2707	movsd	-32 * SIZE(A1, LDA), %xmm1
2708	mulps	%xmm4, %xmm1
2709	addps	%xmm1, %xmm9
2710#ifdef movsd
2711	xorps	%xmm2, %xmm2
2712#endif
2713	movsd	-32 * SIZE(A2), %xmm2
2714	mulps	%xmm4, %xmm2
2715	addps	%xmm2, %xmm10
2716
2717	addq	 $2 * SIZE, A1
2718	addq	 $2 * SIZE, A2
2719	addq	 $2 * SIZE, X1
2720	ALIGN_3
2721
2722.L11XX:
2723	MOVUPS_A2  (-34 * SIZE, A1, LDA, 1, %xmm12)
2724
2725	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
2726	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
2727
2728	movq	MM,  I
2729	sarq	$4,  I
2730	jle	.L115
2731
2732	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2733	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2734	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2735
2736	decq	I
2737	jle	.L113
2738	ALIGN_4
2739
2740.L112:
2741#ifdef PREFETCH
2742	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1)
2743#endif
2744
2745	mulps	%xmm4, %xmm0
2746	addps	%xmm0, %xmm8
2747	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2748	shufps	$0x4e, %xmm1, %xmm12
2749	mulps	%xmm4, %xmm12
2750	addps	%xmm12, %xmm9
2751	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2752	mulps	%xmm4, %xmm2
2753	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2754	addps	%xmm2, %xmm10
2755	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2756
2757#ifdef PREFETCH
2758	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA)
2759#endif
2760
2761	mulps	%xmm5, %xmm0
2762	addps	%xmm0, %xmm8
2763	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2764	shufps	$0x4e, %xmm12, %xmm1
2765	mulps	%xmm5, %xmm1
2766	addps	%xmm1, %xmm9
2767	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
2768	mulps	%xmm5, %xmm2
2769	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2770	addps	%xmm2, %xmm10
2771	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
2772
2773#ifdef PREFETCH
2774	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2)
2775#endif
2776
2777	mulps	%xmm4, %xmm0
2778	addps	%xmm0, %xmm8
2779	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
2780	shufps	$0x4e, %xmm1, %xmm12
2781	mulps	%xmm4, %xmm12
2782	addps	%xmm12, %xmm9
2783	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
2784	mulps	%xmm4, %xmm2
2785	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2786	addps	%xmm2, %xmm10
2787	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
2788
2789#ifdef PREFETCHW
2790	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1)
2791#endif
2792
2793	mulps	%xmm5, %xmm0
2794	addps	%xmm0, %xmm8
2795	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
2796	shufps	$0x4e, %xmm12, %xmm1
2797	mulps	%xmm5, %xmm1
2798	addps	%xmm1, %xmm9
2799	MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1)
2800	mulps	%xmm5, %xmm2
2801	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2802	addps	%xmm2, %xmm10
2803	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)
2804
2805	addq	$16 * SIZE, A1
2806	addq	$16 * SIZE, A2
2807	addq	$16 * SIZE, X1
2808
2809	decq	I
2810	jg	.L112
2811	ALIGN_4
2812
2813.L113:
2814	mulps	%xmm4, %xmm0
2815	addps	%xmm0, %xmm8
2816	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2817	shufps	$0x4e, %xmm1, %xmm12
2818	mulps	%xmm4, %xmm12
2819	addps	%xmm12, %xmm9
2820	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2821	mulps	%xmm4, %xmm2
2822	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2823	addps	%xmm2, %xmm10
2824	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2825
2826	mulps	%xmm5, %xmm0
2827	addps	%xmm0, %xmm8
2828	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
2829	shufps	$0x4e, %xmm12, %xmm1
2830	mulps	%xmm5, %xmm1
2831	addps	%xmm1, %xmm9
2832	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
2833	mulps	%xmm5, %xmm2
2834	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
2835	addps	%xmm2, %xmm10
2836	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
2837
2838	mulps	%xmm4, %xmm0
2839	addps	%xmm0, %xmm8
2840	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
2841	shufps	$0x4e, %xmm1, %xmm12
2842	mulps	%xmm4, %xmm12
2843	addps	%xmm12, %xmm9
2844	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
2845	mulps	%xmm4, %xmm2
2846	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
2847	addps	%xmm2, %xmm10
2848	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
2849
2850	mulps	%xmm5, %xmm0
2851	addps	%xmm0, %xmm8
2852	shufps	$0x4e, %xmm12, %xmm1
2853	mulps	%xmm5, %xmm1
2854	addps	%xmm1, %xmm9
2855	mulps	%xmm5, %xmm2
2856	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
2857	addps	%xmm2, %xmm10
2858
2859	addq	$16 * SIZE, A1
2860	addq	$16 * SIZE, A2
2861	addq	$16 * SIZE, X1
2862	ALIGN_4
2863
2864.L115:
2865	testq	$8, MM
2866	jle	.L116
2867
2868	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2869	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2870	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2871
2872	mulps	%xmm4, %xmm0
2873	addps	%xmm0, %xmm8
2874	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
2875	shufps	$0x4e, %xmm1, %xmm12
2876	mulps	%xmm4, %xmm12
2877	addps	%xmm12, %xmm9
2878	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
2879	mulps	%xmm4, %xmm2
2880	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
2881	addps	%xmm2, %xmm10
2882	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
2883
2884	mulps	%xmm5, %xmm0
2885	addps	%xmm0, %xmm8
2886	shufps	$0x4e, %xmm12, %xmm1
2887	mulps	%xmm5, %xmm1
2888	addps	%xmm1, %xmm9
2889	mulps	%xmm5, %xmm2
2890	addps	%xmm2, %xmm10
2891
2892	addq	$8 * SIZE, A1
2893	addq	$8 * SIZE, A2
2894	addq	$8 * SIZE, X1
2895	ALIGN_4
2896
2897.L116:
2898	testq	$4, MM
2899	jle	.L117
2900
2901	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
2902	mulps	%xmm4, %xmm0
2903	addps	%xmm0, %xmm8
2904	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
2905	shufps	$0x4e, %xmm1, %xmm12
2906	mulps	%xmm4, %xmm12
2907	addps	%xmm12, %xmm9
2908
2909	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
2910	mulps	%xmm4, %xmm2
2911	addps	%xmm2, %xmm10
2912
2913	addq	$4 * SIZE, A1
2914	addq	$4 * SIZE, A2
2915	addq	$4 * SIZE, X1
2916	ALIGN_4
2917
2918.L117:
2919	testq	$2, MM
2920	jle	.L118
2921
2922#ifdef movsd
2923	xorps	%xmm0, %xmm0
2924#endif
2925	movsd	-32 * SIZE(A1), %xmm0
2926#ifdef movsd
2927	xorps	%xmm4, %xmm4
2928#endif
2929	movsd	-32 * SIZE(X1), %xmm4
2930	mulps	%xmm4, %xmm0
2931	addps	%xmm0, %xmm8
2932#ifdef movsd
2933	xorps	%xmm1, %xmm1
2934#endif
2935	movsd	-32 * SIZE(A1, LDA), %xmm1
2936	mulps	%xmm4, %xmm1
2937	addps	%xmm1, %xmm9
2938#ifdef movsd
2939	xorps	%xmm2, %xmm2
2940#endif
2941	movsd	-32 * SIZE(A2), %xmm2
2942	mulps	%xmm4, %xmm2
2943	addps	%xmm2, %xmm10
2944
2945	addq	$2 * SIZE, A1
2946	addq	$2 * SIZE, A2
2947	addq	$2 * SIZE, X1
2948	ALIGN_4
2949
2950.L118:
2951	testq	$1, MM
2952	jle	.L119
2953
2954	movss	-32 * SIZE(A1), %xmm0
2955	movss	-32 * SIZE(X1), %xmm4
2956	mulss	%xmm4, %xmm0
2957	addss	%xmm0, %xmm8
2958	movss	-32 * SIZE(A1, LDA), %xmm1
2959	mulss	%xmm4, %xmm1
2960	addss	%xmm1, %xmm9
2961	movss	-32 * SIZE(A2), %xmm2
2962	mulss	%xmm4, %xmm2
2963	addss	%xmm2, %xmm10
2964	ALIGN_4
2965
2966.L119:
2967#ifdef HAVE_SSE3
2968	haddps	%xmm9, %xmm8
2969	haddps	%xmm11, %xmm10
2970	haddps	%xmm10, %xmm8
2971
2972	pshufd	$0x1, %xmm8, %xmm9
2973	pshufd	$0x2, %xmm8, %xmm10
2974#else
2975	movaps	%xmm8, %xmm0
2976	unpcklps %xmm9, %xmm8
2977	unpckhps %xmm9, %xmm0
2978
2979	movaps	%xmm10, %xmm1
2980	unpcklps %xmm11, %xmm10
2981	unpckhps %xmm11, %xmm1
2982
2983	movaps	%xmm8, %xmm9
2984	unpcklps %xmm10, %xmm8
2985	unpckhps %xmm10, %xmm9
2986
2987	movaps	%xmm0, %xmm10
2988	unpcklps %xmm1, %xmm0
2989	unpckhps %xmm1, %xmm10
2990
2991	addps	%xmm9, %xmm8
2992	addps	%xmm0, %xmm10
2993	addps	%xmm10, %xmm8
2994
2995	pshufd	$0x2, %xmm8, %xmm9
2996	pshufd	$0x1, %xmm8, %xmm10
2997#endif
2998
2999	mulss	ALPHA, %xmm8
3000	mulss	ALPHA, %xmm9
3001	mulss	ALPHA, %xmm10
3002
3003	addss	(Y), %xmm8
3004	addq	INCY, Y
3005	addss	(Y), %xmm9
3006	addq	INCY, Y
3007	addss	(Y), %xmm10
3008
3009	movss	%xmm8, (Y1)
3010	addq	INCY, Y1
3011	movss	%xmm9, (Y1)
3012	addq	INCY, Y1
3013	movss	%xmm10, (Y1)
3014	jmp	.L999
3015	ALIGN_4
3016
3017.L120:
3018	cmpq	$2, N
3019	jne	.L130
3020
3021	leaq	32 * SIZE(BUFFER), X1
3022
3023	movq	A, A1
3024	leaq	(A1, LDA), A2
3025	leaq	(A1, LDA, 2), A
3026
3027	xorps	%xmm8, %xmm8
3028	xorps	%xmm9, %xmm9
3029
3030	cmpq	$3, M
3031	jle	.L127
3032
3033	testq	$SIZE, A1
3034	je	.L12X
3035
3036	movss	-32 * SIZE(A1), %xmm0
3037	movss	-32 * SIZE(X1), %xmm4
3038	mulss	%xmm4, %xmm0
3039	addss	%xmm0, %xmm8
3040	movss	-32 * SIZE(A2), %xmm1
3041	mulss	%xmm4, %xmm1
3042	addss	%xmm1, %xmm9
3043
3044	addq	 $1 * SIZE, A1
3045	addq	 $1 * SIZE, A2
3046	addq	 $1 * SIZE, X1
3047	ALIGN_3
3048
3049.L12X:
3050	testq	$2 * SIZE, A1
3051	je	.L12XX
3052
3053#ifdef movsd
3054	xorps	%xmm0, %xmm0
3055	xorps	%xmm4, %xmm4
3056#endif
3057	movsd	-32 * SIZE(A1), %xmm0
3058	movsd	-32 * SIZE(X1), %xmm4
3059	mulps	%xmm4, %xmm0
3060	addps	%xmm0, %xmm8
3061#ifdef movsd
3062	xorps	%xmm1, %xmm1
3063#endif
3064	movsd	-32 * SIZE(A2), %xmm1
3065	mulps	%xmm4, %xmm1
3066	addps	%xmm1, %xmm9
3067
3068	addq	 $2 * SIZE, A1
3069	addq	 $2 * SIZE, A2
3070	addq	 $2 * SIZE, X1
3071	ALIGN_3
3072
3073.L12XX:
3074	MOVUPS_A1  (-34 * SIZE, A2, %xmm12)
3075
3076	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
3077	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
3078
3079	movq	MM,  I
3080	sarq	$4,  I
3081	jle	.L125
3082
3083	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3084	MOVUPS_A1 (-30 * SIZE, A2, %xmm1)
3085
3086	decq	I
3087	jle	.L123
3088	ALIGN_4
3089
3090.L122:
3091#ifdef PREFETCH
3092	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
3093#endif
3094
3095	mulps	%xmm4, %xmm0
3096	addps	%xmm0, %xmm8
3097	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3098	shufps	$0x4e, %xmm1, %xmm12
3099	mulps	%xmm4, %xmm12
3100	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3101	addps	%xmm12, %xmm9
3102	MOVUPS_A1 (-26 * SIZE, A2, %xmm12)
3103
3104	mulps	%xmm5, %xmm0
3105	addps	%xmm0, %xmm8
3106	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3107	shufps	$0x4e, %xmm12, %xmm1
3108	mulps	%xmm5, %xmm1
3109	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3110	addps	%xmm1, %xmm9
3111	MOVUPS_A1 (-22 * SIZE, A2, %xmm1)
3112
3113#ifdef PREFETCH
3114	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
3115#endif
3116
3117	mulps	%xmm4, %xmm0
3118	addps	%xmm0, %xmm8
3119	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
3120	shufps	$0x4e, %xmm1, %xmm12
3121	mulps	%xmm4, %xmm12
3122	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3123	addps	%xmm12, %xmm9
3124	MOVUPS_A1 (-18 * SIZE, A2, %xmm12)
3125
3126#ifdef PREFETCHW
3127	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
3128#endif
3129
3130	mulps	%xmm5, %xmm0
3131	addps	%xmm0, %xmm8
3132	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
3133	shufps	$0x4e, %xmm12, %xmm1
3134	mulps	%xmm5, %xmm1
3135	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3136	addps	%xmm1, %xmm9
3137	MOVUPS_A1 (-14 * SIZE, A2, %xmm1)
3138
3139	addq	$16 * SIZE, A1
3140	addq	$16 * SIZE, A2
3141	addq	$16 * SIZE, X1
3142
3143	decq	I
3144	jg	.L122
3145	ALIGN_4
3146
3147.L123:
3148	mulps	%xmm4, %xmm0
3149	addps	%xmm0, %xmm8
3150	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3151	shufps	$0x4e, %xmm1, %xmm12
3152	mulps	%xmm4, %xmm12
3153	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3154	addps	%xmm12, %xmm9
3155	MOVUPS_A1 (-26 * SIZE, A2, %xmm12)
3156
3157	mulps	%xmm5, %xmm0
3158	addps	%xmm0, %xmm8
3159	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3160	shufps	$0x4e, %xmm12, %xmm1
3161	mulps	%xmm5, %xmm1
3162	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3163	addps	%xmm1, %xmm9
3164	MOVUPS_A1 (-22 * SIZE, A2, %xmm1)
3165
3166	mulps	%xmm4, %xmm0
3167	addps	%xmm0, %xmm8
3168	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
3169	shufps	$0x4e, %xmm1, %xmm12
3170	mulps	%xmm4, %xmm12
3171	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3172	addps	%xmm12, %xmm9
3173	MOVUPS_A1 (-18 * SIZE, A2, %xmm12)
3174
3175	mulps	%xmm5, %xmm0
3176	addps	%xmm0, %xmm8
3177	shufps	$0x4e, %xmm12, %xmm1
3178	mulps	%xmm5, %xmm1
3179	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3180	addps	%xmm1, %xmm9
3181
3182	addq	$16 * SIZE, A1
3183	addq	$16 * SIZE, A2
3184	addq	$16 * SIZE, X1
3185	ALIGN_4
3186
3187.L125:
3188	testq	$8, MM
3189	jle	.L126
3190
3191	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3192	MOVUPS_A1 (-30 * SIZE, A2, %xmm1)
3193
3194	mulps	%xmm4, %xmm0
3195	addps	%xmm0, %xmm8
3196	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3197	shufps	$0x4e, %xmm1, %xmm12
3198	mulps	%xmm4, %xmm12
3199	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3200	addps	%xmm12, %xmm9
3201	MOVUPS_A1 (-26 * SIZE, A2, %xmm12)
3202
3203	mulps	%xmm5, %xmm0
3204	addps	%xmm0, %xmm8
3205	shufps	$0x4e, %xmm12, %xmm1
3206	mulps	%xmm5, %xmm1
3207	addps	%xmm1, %xmm9
3208
3209	addq	$8 * SIZE, A1
3210	addq	$8 * SIZE, A2
3211	addq	$8 * SIZE, X1
3212	ALIGN_4
3213
3214.L126:
3215	testq	$4, MM
3216	jle	.L127
3217
3218	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3219	mulps	%xmm4, %xmm0
3220	addps	%xmm0, %xmm8
3221	MOVUPS_A1 (-30 * SIZE, A2, %xmm1)
3222	shufps	$0x4e, %xmm1, %xmm12
3223	mulps	%xmm4, %xmm12
3224	addps	%xmm12, %xmm9
3225
3226	addq	$4 * SIZE, A1
3227	addq	$4 * SIZE, A2
3228	addq	$4 * SIZE, X1
3229	ALIGN_4
3230
3231.L127:
3232	testq	$2, MM
3233	jle	.L128
3234
3235#ifdef movsd
3236	xorps	%xmm0, %xmm0
3237#endif
3238	movsd	-32 * SIZE(A1), %xmm0
3239#ifdef movsd
3240	xorps	%xmm4, %xmm4
3241#endif
3242	movsd	-32 * SIZE(X1), %xmm4
3243	mulps	%xmm4, %xmm0
3244	addps	%xmm0, %xmm8
3245#ifdef movsd
3246	xorps	%xmm1, %xmm1
3247#endif
3248	movsd	-32 * SIZE(A2), %xmm1
3249	mulps	%xmm4, %xmm1
3250	addps	%xmm1, %xmm9
3251	shufps	$0xe, %xmm4, %xmm4
3252
3253	addq	$2 * SIZE, A1
3254	addq	$2 * SIZE, A2
3255	addq	$2 * SIZE, X1
3256	ALIGN_4
3257
3258.L128:
3259	testq	$1, MM
3260	jle	.L129
3261
3262	movss	-32 * SIZE(A1), %xmm0
3263	movss	-32 * SIZE(X1), %xmm4
3264	mulss	%xmm4, %xmm0
3265	addss	%xmm0, %xmm8
3266	movss	-32 * SIZE(A2), %xmm1
3267	mulss	%xmm4, %xmm1
3268	addss	%xmm1, %xmm9
3269	ALIGN_4
3270
3271.L129:
3272#ifdef HAVE_SSE3
3273	haddps	%xmm9, %xmm8
3274	haddps	%xmm8, %xmm8
3275#else
3276	movaps	%xmm8, %xmm10
3277	unpcklps %xmm9, %xmm8
3278	unpckhps %xmm9, %xmm10
3279
3280	addps	%xmm10, %xmm8
3281	movhlps %xmm8, %xmm9
3282	addps	%xmm9, %xmm8
3283#endif
3284
3285	pshufd	$0x1, %xmm8, %xmm9
3286
3287	mulss	ALPHA, %xmm8
3288	mulss	ALPHA, %xmm9
3289
3290	addss	(Y), %xmm8
3291	addq	INCY, Y
3292	addss	(Y), %xmm9
3293	addq	INCY, Y
3294
3295	movss	%xmm8, (Y1)
3296	addq	INCY, Y1
3297	movss	%xmm9, (Y1)
3298	addq	INCY, Y1
3299	jmp	.L999
3300	ALIGN_4
3301
3302.L130:
3303	cmpq	$1, N
3304	jne	.L999
3305
3306	leaq	32 * SIZE(BUFFER), X1
3307
3308	movq	A, A1
3309
3310	xorps	%xmm8, %xmm8
3311	xorps	%xmm9, %xmm9
3312
3313	cmpq	$3, M
3314	jle	.L137
3315
3316	testq	$SIZE, A1
3317	je	.L13X
3318
3319	movss	-32 * SIZE(A1), %xmm0
3320	movss	-32 * SIZE(X1), %xmm4
3321	mulss	%xmm4, %xmm0
3322	addss	%xmm0, %xmm8
3323
3324	addq	 $1 * SIZE, A1
3325	addq	 $1 * SIZE, X1
3326	ALIGN_3
3327
3328.L13X:
3329	testq	$2 * SIZE, A1
3330	je	.L13XX
3331
3332
3333#ifdef movsd
3334	xorps	%xmm0, %xmm0
3335	xorps	%xmm4, %xmm4
3336#endif
3337	movsd	-32 * SIZE(A1), %xmm0
3338	movsd	-32 * SIZE(X1), %xmm4
3339	mulps	%xmm4, %xmm0
3340	addps	%xmm0, %xmm8
3341	shufps	$0xe, %xmm4, %xmm4
3342
3343	addq	 $2 * SIZE, A1
3344	addq	 $2 * SIZE, X1
3345	ALIGN_3
3346
3347.L13XX:
3348
3349	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
3350	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
3351
3352	movq	MM,  I
3353	sarq	$4,  I
3354	jle	.L135
3355
3356	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3357	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
3358
3359	decq	I
3360	jle	.L133
3361	ALIGN_4
3362
3363.L132:
3364#ifdef PREFETCH
3365	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
3366#endif
3367
3368	mulps	%xmm4, %xmm0
3369	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3370	addps	%xmm0, %xmm8
3371	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3372
3373	mulps	%xmm5, %xmm12
3374	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3375	addps	%xmm12, %xmm9
3376	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
3377
3378#ifdef PREFETCHW
3379	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
3380#endif
3381
3382	mulps	%xmm4, %xmm0
3383	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3384	addps	%xmm0, %xmm8
3385	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
3386
3387	mulps	%xmm5, %xmm12
3388	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3389	addps	%xmm12, %xmm9
3390	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
3391
3392	addq	$16 * SIZE, A1
3393	addq	$16 * SIZE, X1
3394
3395	decq	I
3396	jg	.L132
3397	ALIGN_4
3398
3399.L133:
3400	mulps	%xmm4, %xmm0
3401	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3402	addps	%xmm0, %xmm8
3403	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3404
3405	mulps	%xmm5, %xmm12
3406	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3407	addps	%xmm12, %xmm9
3408	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
3409
3410	mulps	%xmm4, %xmm0
3411	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3412	addps	%xmm0, %xmm8
3413
3414	mulps	%xmm5, %xmm12
3415	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3416	addps	%xmm12, %xmm9
3417
3418	addq	$16 * SIZE, A1
3419	addq	$16 * SIZE, X1
3420	ALIGN_4
3421
3422.L135:
3423	testq	$8, MM
3424	jle	.L136
3425
3426	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3427	mulps	%xmm4, %xmm0
3428	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3429	addps	%xmm0, %xmm8
3430
3431	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
3432	mulps	%xmm5, %xmm12
3433	addps	%xmm12, %xmm9
3434
3435	addq	$8 * SIZE, A1
3436	addq	$8 * SIZE, X1
3437	ALIGN_4
3438
3439.L136:
3440	testq	$4, MM
3441	jle	.L137
3442
3443	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3444	mulps	%xmm4, %xmm0
3445	addps	%xmm0, %xmm8
3446
3447	addq	$4 * SIZE, A1
3448	addq	$4 * SIZE, X1
3449	ALIGN_4
3450
3451.L137:
3452	testq	$2, MM
3453	jle	.L138
3454
3455#ifdef movsd
3456	xorps	%xmm0, %xmm0
3457#endif
3458	movsd	-32 * SIZE(A1), %xmm0
3459#ifdef movsd
3460	xorps	%xmm4, %xmm4
3461#endif
3462	movsd	-32 * SIZE(X1), %xmm4
3463	mulps	%xmm4, %xmm0
3464	addps	%xmm0, %xmm8
3465	shufps	$0xe, %xmm4, %xmm4
3466
3467	addq	$2 * SIZE, A1
3468	addq	$2 * SIZE, X1
3469	ALIGN_4
3470
3471.L138:
3472	testq	$1, MM
3473	jle	.L139
3474
3475	movss	-32 * SIZE(A1), %xmm0
3476	movss	-32 * SIZE(X1), %xmm4
3477	mulss	%xmm4, %xmm0
3478	addss	%xmm0, %xmm8
3479	ALIGN_4
3480
3481.L139:
3482	addps	%xmm9, %xmm8
3483
3484#ifdef HAVE_SSE3
3485	haddps	%xmm8, %xmm8
3486	haddps	%xmm8, %xmm8
3487#else
3488	pshufd	$1, %xmm8, %xmm9
3489	pshufd	$2, %xmm8, %xmm10
3490	pshufd	$3, %xmm8, %xmm11
3491
3492	addss	%xmm9, %xmm8
3493	addss	%xmm11, %xmm10
3494	addss	%xmm10, %xmm8
3495#endif
3496
3497	mulss	ALPHA, %xmm8
3498
3499	addss	(Y), %xmm8
3500	movss	%xmm8, (Y1)
3501	jmp	.L999
3502	ALIGN_4
3503
3504.L200:
3505	testq	$2 * SIZE, LDA
3506	jne	.L300
3507
3508	cmpq	$4, N
3509	jl	.L210
3510	ALIGN_3
3511
3512.L201:
3513	subq	$4, N
3514
3515	leaq	32 * SIZE(BUFFER), X1
3516
3517	movq	A, A1
3518	leaq	(A1, LDA, 2), A2
3519	leaq	(A1, LDA, 4), A
3520
3521	xorps	%xmm8, %xmm8
3522	xorps	%xmm9, %xmm9
3523	xorps	%xmm10, %xmm10
3524	xorps	%xmm11, %xmm11
3525
3526	cmpq	$3, M
3527	jle	.L207
3528
3529	testq	$SIZE, A1
3530	je	.L20X
3531
3532	movss	-32 * SIZE(A1), %xmm0
3533	movss	-32 * SIZE(X1), %xmm4
3534	mulss	%xmm4, %xmm0
3535	addss	%xmm0, %xmm8
3536	movss	-32 * SIZE(A1, LDA), %xmm1
3537	mulss	%xmm4, %xmm1
3538	addss	%xmm1, %xmm9
3539	movss	-32 * SIZE(A2), %xmm2
3540	mulss	%xmm4, %xmm2
3541	addss	%xmm2, %xmm10
3542	movss	-32 * SIZE(A2, LDA), %xmm3
3543	mulss	%xmm4, %xmm3
3544	addss	%xmm3, %xmm11
3545
3546	addq	 $1 * SIZE, A1
3547	addq	 $1 * SIZE, A2
3548	addq	 $1 * SIZE, X1
3549	ALIGN_3
3550
3551.L20X:
3552	testq	$2 * SIZE, A1
3553	je	.L20XX
3554
3555#ifdef movsd
3556	xorps	%xmm0, %xmm0
3557	xorps	%xmm4, %xmm4
3558#endif
3559	movsd	-32 * SIZE(A1), %xmm0
3560	movsd	-32 * SIZE(X1), %xmm4
3561	mulps	%xmm4, %xmm0
3562	addps	%xmm0, %xmm8
3563#ifdef movsd
3564	xorps	%xmm1, %xmm1
3565#endif
3566	movsd	-32 * SIZE(A1, LDA), %xmm1
3567	mulps	%xmm4, %xmm1
3568	addps	%xmm1, %xmm9
3569#ifdef movsd
3570	xorps	%xmm2, %xmm2
3571#endif
3572	movsd	-32 * SIZE(A2), %xmm2
3573	mulps	%xmm4, %xmm2
3574	addps	%xmm2, %xmm10
3575#ifdef movsd
3576	xorps	%xmm3, %xmm3
3577#endif
3578	movsd	-32 * SIZE(A2, LDA), %xmm3
3579	mulps	%xmm4, %xmm3
3580	addps	%xmm3, %xmm11
3581
3582	addq	 $2 * SIZE, A1
3583	addq	 $2 * SIZE, A2
3584	addq	 $2 * SIZE, X1
3585	ALIGN_3
3586
3587.L20XX:
3588	movaps	-33 * SIZE(A1, LDA), %xmm12
3589	movaps	-34 * SIZE(A2), %xmm13
3590	movaps	-35 * SIZE(A2, LDA), %xmm14
3591
3592	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
3593	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
3594
3595#ifdef PREFETCHW
3596	PREFETCHW	4 * SIZE(Y1)
3597#endif
3598
3599	movq	MM,  I
3600	sarq	$4,  I
3601	jle	.L205
3602
3603	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3604	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
3605	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
3606	MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3)
3607
3608	decq	I
3609	jle	.L203
3610	ALIGN_4
3611
3612.L202:
3613#ifdef PREFETCH
3614	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
3615#endif
3616
3617	mulps	%xmm4, %xmm0
3618	addps	%xmm0, %xmm8
3619	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3620	movss	 %xmm1,  %xmm12
3621	shufps	 $0x39,  %xmm12, %xmm12
3622	mulps	 %xmm4,  %xmm12
3623	addps	 %xmm12, %xmm9
3624	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
3625
3626	shufps	 $0x4e, %xmm2, %xmm13
3627	mulps	 %xmm4, %xmm13
3628	addps	 %xmm13, %xmm10
3629	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
3630	movss	 %xmm3,  %xmm14
3631	shufps	 $0x93, %xmm3, %xmm14
3632	mulps	 %xmm4, %xmm14
3633	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3634	addps	 %xmm14, %xmm11
3635	MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14)
3636
3637#ifdef PREFETCH
3638	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
3639#endif
3640
3641	mulps	%xmm5, %xmm0
3642	addps	%xmm0, %xmm8
3643	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3644	movss	 %xmm12,  %xmm1
3645	shufps	 $0x39,  %xmm1, %xmm1
3646	mulps	 %xmm5,  %xmm1
3647	addps	 %xmm1, %xmm9
3648	MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1)
3649
3650	shufps	 $0x4e, %xmm13, %xmm2
3651	mulps	 %xmm5, %xmm2
3652	addps	 %xmm2, %xmm10
3653	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
3654	movss	 %xmm14,  %xmm3
3655	shufps	 $0x93, %xmm14, %xmm3
3656	mulps	 %xmm5, %xmm3
3657	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3658	addps	 %xmm3, %xmm11
3659	MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3)
3660
3661#ifdef PREFETCH
3662	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
3663#endif
3664
3665	mulps	%xmm4, %xmm0
3666	addps	%xmm0, %xmm8
3667	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
3668	movss	 %xmm1,  %xmm12
3669	shufps	 $0x39,  %xmm12, %xmm12
3670	mulps	 %xmm4,  %xmm12
3671	addps	 %xmm12, %xmm9
3672	MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12)
3673
3674	shufps	 $0x4e, %xmm2, %xmm13
3675	mulps	 %xmm4, %xmm13
3676	addps	 %xmm13, %xmm10
3677	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
3678	movss	 %xmm3,  %xmm14
3679	shufps	 $0x93, %xmm3, %xmm14
3680	mulps	 %xmm4, %xmm14
3681	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3682	addps	 %xmm14, %xmm11
3683	MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14)
3684
3685#ifdef PREFETCH
3686	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
3687#endif
3688
3689	mulps	%xmm5, %xmm0
3690	addps	%xmm0, %xmm8
3691	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
3692	movss	 %xmm12,  %xmm1
3693	shufps	 $0x39,  %xmm1, %xmm1
3694	mulps	 %xmm5,  %xmm1
3695	addps	 %xmm1, %xmm9
3696	MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1)
3697
3698#ifdef PREFETCHW
3699	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
3700#endif
3701
3702	shufps	 $0x4e, %xmm13, %xmm2
3703	mulps	 %xmm5, %xmm2
3704	addps	 %xmm2, %xmm10
3705	MOVUPS_A1 (-14 * SIZE, A2, %xmm2)
3706	movss	 %xmm14,  %xmm3
3707	shufps	 $0x93, %xmm14, %xmm3
3708	mulps	 %xmm5, %xmm3
3709	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3710	addps	 %xmm3, %xmm11
3711	MOVUPS_A2 (-15 * SIZE, A2, LDA, 1, %xmm3)
3712
3713	addq	$16 * SIZE, A1
3714	addq	$16 * SIZE, A2
3715	addq	$16 * SIZE, X1
3716
3717	decq	I
3718	jg	.L202
3719	ALIGN_4
3720
3721.L203:
3722	mulps	%xmm4, %xmm0
3723	addps	%xmm0, %xmm8
3724	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3725	movss	 %xmm1,  %xmm12
3726	shufps	 $0x39,  %xmm12, %xmm12
3727	mulps	 %xmm4,  %xmm12
3728	addps	 %xmm12, %xmm9
3729	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
3730
3731	shufps	 $0x4e, %xmm2, %xmm13
3732	mulps	 %xmm4, %xmm13
3733	addps	 %xmm13, %xmm10
3734	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
3735	movss	 %xmm3,  %xmm14
3736	shufps	 $0x93, %xmm3, %xmm14
3737	mulps	 %xmm4, %xmm14
3738	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3739	addps	 %xmm14, %xmm11
3740	MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14)
3741
3742	mulps	%xmm5, %xmm0
3743	addps	%xmm0, %xmm8
3744	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
3745	movss	 %xmm12,  %xmm1
3746	shufps	 $0x39,  %xmm1, %xmm1
3747	mulps	 %xmm5,  %xmm1
3748	addps	 %xmm1, %xmm9
3749	MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1)
3750
3751	shufps	 $0x4e, %xmm13, %xmm2
3752	mulps	 %xmm5, %xmm2
3753	addps	 %xmm2, %xmm10
3754	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
3755	movss	 %xmm14,  %xmm3
3756	shufps	 $0x93, %xmm14, %xmm3
3757	mulps	 %xmm5, %xmm3
3758	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
3759	addps	 %xmm3, %xmm11
3760	MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3)
3761
3762	mulps	%xmm4, %xmm0
3763	addps	%xmm0, %xmm8
3764	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
3765	movss	 %xmm1,  %xmm12
3766	shufps	 $0x39,  %xmm12, %xmm12
3767	mulps	 %xmm4,  %xmm12
3768	addps	 %xmm12, %xmm9
3769	MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12)
3770
3771	shufps	 $0x4e, %xmm2, %xmm13
3772	mulps	 %xmm4, %xmm13
3773	addps	 %xmm13, %xmm10
3774	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
3775	movss	 %xmm3,  %xmm14
3776	shufps	 $0x93, %xmm3, %xmm14
3777	mulps	 %xmm4, %xmm14
3778	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
3779	addps	 %xmm14, %xmm11
3780	MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14)
3781
3782	mulps	%xmm5, %xmm0
3783	addps	%xmm0, %xmm8
3784	movss	 %xmm12,  %xmm1
3785	shufps	 $0x39,  %xmm1, %xmm1
3786	mulps	 %xmm5,  %xmm1
3787	addps	 %xmm1, %xmm9
3788
3789	shufps	 $0x4e, %xmm13, %xmm2
3790	mulps	 %xmm5, %xmm2
3791	addps	 %xmm2, %xmm10
3792	movss	 %xmm14,  %xmm3
3793	shufps	 $0x93, %xmm14, %xmm3
3794	mulps	 %xmm5, %xmm3
3795	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
3796	addps	 %xmm3, %xmm11
3797
3798	addq	$16 * SIZE, A1
3799	addq	$16 * SIZE, A2
3800	addq	$16 * SIZE, X1
3801	ALIGN_4
3802
3803.L205:
3804	testq	$8, MM
3805	jle	.L206
3806
3807	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3808	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
3809	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
3810	MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3)
3811
3812	mulps	%xmm4, %xmm0
3813	addps	%xmm0, %xmm8
3814	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
3815	movss	 %xmm1,  %xmm12
3816	shufps	 $0x39,  %xmm12, %xmm12
3817	mulps	 %xmm4,  %xmm12
3818	addps	 %xmm12, %xmm9
3819	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
3820
3821	shufps	 $0x4e, %xmm2, %xmm13
3822	mulps	 %xmm4, %xmm13
3823	addps	 %xmm13, %xmm10
3824	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
3825	movss	 %xmm3,  %xmm14
3826	shufps	 $0x93, %xmm3, %xmm14
3827	mulps	 %xmm4, %xmm14
3828	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
3829	addps	 %xmm14, %xmm11
3830	MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14)
3831
3832	mulps	%xmm5, %xmm0
3833	addps	%xmm0, %xmm8
3834	movss	 %xmm12,  %xmm1
3835	shufps	 $0x39,  %xmm1, %xmm1
3836	mulps	 %xmm5,  %xmm1
3837	addps	 %xmm1, %xmm9
3838
3839	shufps	 $0x4e, %xmm13, %xmm2
3840	mulps	 %xmm5, %xmm2
3841	addps	 %xmm2, %xmm10
3842	movss	 %xmm14,  %xmm3
3843	shufps	 $0x93, %xmm14, %xmm3
3844	mulps	 %xmm5, %xmm3
3845	addps	 %xmm3, %xmm11
3846
3847	addq	$8 * SIZE, A1
3848	addq	$8 * SIZE, A2
3849	addq	$8 * SIZE, X1
3850	ALIGN_4
3851
3852.L206:
3853	testq	$4, MM
3854	jle	.L207
3855
3856	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
3857	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
3858	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
3859	MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3)
3860
3861	mulps	%xmm4, %xmm0
3862	addps	%xmm0, %xmm8
3863	movss	 %xmm1,  %xmm12
3864	shufps	 $0x39,  %xmm12, %xmm12
3865	mulps	 %xmm4,  %xmm12
3866	addps	 %xmm12, %xmm9
3867
3868	shufps	 $0x4e, %xmm2, %xmm13
3869	mulps	 %xmm4, %xmm13
3870	addps	 %xmm13, %xmm10
3871	movss	 %xmm3,  %xmm14
3872	shufps	 $0x93, %xmm3, %xmm14
3873	mulps	 %xmm4, %xmm14
3874	addps	 %xmm14, %xmm11
3875
3876	addq	$4 * SIZE, A1
3877	addq	$4 * SIZE, A2
3878	addq	$4 * SIZE, X1
3879	ALIGN_4
3880
3881.L207:
3882	testq	$2, MM
3883	jle	.L208
3884
3885#ifdef movsd
3886	xorps	%xmm0, %xmm0
3887#endif
3888	movsd	-32 * SIZE(A1), %xmm0
3889#ifdef movsd
3890	xorps	%xmm4, %xmm4
3891#endif
3892	movsd	-32 * SIZE(X1), %xmm4
3893	mulps	%xmm4, %xmm0
3894	addps	%xmm0, %xmm8
3895#ifdef movsd
3896	xorps	%xmm1, %xmm1
3897#endif
3898	movsd	-32 * SIZE(A1, LDA), %xmm1
3899	mulps	%xmm4, %xmm1
3900	addps	%xmm1, %xmm9
3901#ifdef movsd
3902	xorps	%xmm2, %xmm2
3903#endif
3904	movsd	-32 * SIZE(A2), %xmm2
3905	mulps	%xmm4, %xmm2
3906	addps	%xmm2, %xmm10
3907#ifdef movsd
3908	xorps	%xmm3, %xmm3
3909#endif
3910	movsd	-32 * SIZE(A2, LDA), %xmm3
3911	mulps	%xmm4, %xmm3
3912	addps	%xmm3, %xmm11
3913	shufps	$0xe, %xmm4, %xmm4
3914
3915	addq	$2 * SIZE, A1
3916	addq	$2 * SIZE, A2
3917	addq	$2 * SIZE, X1
3918	ALIGN_4
3919
3920.L208:
3921	testq	$1, MM
3922	jle	.L209
3923
3924	movss	-32 * SIZE(A1), %xmm0
3925	movss	-32 * SIZE(X1), %xmm4
3926	mulss	%xmm4, %xmm0
3927	addss	%xmm0, %xmm8
3928	movss	-32 * SIZE(A1, LDA), %xmm1
3929	mulss	%xmm4, %xmm1
3930	addss	%xmm1, %xmm9
3931	movss	-32 * SIZE(A2), %xmm2
3932	mulss	%xmm4, %xmm2
3933	addss	%xmm2, %xmm10
3934	movss	-32 * SIZE(A2, LDA), %xmm3
3935	mulss	%xmm4, %xmm3
3936	addss	%xmm3, %xmm11
3937	ALIGN_4
3938
3939.L209:
3940#ifdef HAVE_SSE3
3941	haddps	%xmm9, %xmm8
3942	haddps	%xmm11, %xmm10
3943	haddps	%xmm10, %xmm8
3944
3945	pshufd	$0x1, %xmm8, %xmm9
3946	pshufd	$0x2, %xmm8, %xmm10
3947	pshufd	$0x3, %xmm8, %xmm11
3948#else
3949	movaps	%xmm8, %xmm0
3950	unpcklps %xmm9, %xmm8
3951	unpckhps %xmm9, %xmm0
3952
3953	movaps	%xmm10, %xmm1
3954	unpcklps %xmm11, %xmm10
3955	unpckhps %xmm11, %xmm1
3956
3957	movaps	%xmm8, %xmm9
3958	unpcklps %xmm10, %xmm8
3959	unpckhps %xmm10, %xmm9
3960
3961	movaps	%xmm0, %xmm10
3962	unpcklps %xmm1, %xmm0
3963	unpckhps %xmm1, %xmm10
3964
3965	addps	%xmm9, %xmm8
3966	addps	%xmm0, %xmm10
3967	addps	%xmm10, %xmm8
3968
3969	pshufd	$0x2, %xmm8, %xmm9
3970	pshufd	$0x1, %xmm8, %xmm10
3971	pshufd	$0x3, %xmm8, %xmm11
3972#endif
3973
3974	mulss	ALPHA, %xmm8
3975	mulss	ALPHA, %xmm9
3976	mulss	ALPHA, %xmm10
3977	mulss	ALPHA, %xmm11
3978
3979	addss	(Y), %xmm8
3980	addq	INCY, Y
3981	addss	(Y), %xmm9
3982	addq	INCY, Y
3983	addss	(Y), %xmm10
3984	addq	INCY, Y
3985	addss	(Y), %xmm11
3986	addq	INCY, Y
3987
3988	movss	%xmm8, (Y1)
3989	addq	INCY, Y1
3990	movss	%xmm9, (Y1)
3991	addq	INCY, Y1
3992	movss	%xmm10, (Y1)
3993	addq	INCY, Y1
3994	movss	%xmm11, (Y1)
3995	addq	INCY, Y1
3996
3997	cmpq	$4, N
3998	jge	.L201
3999	ALIGN_4
4000
4001.L210:
4002	cmpq	$3, N
4003	jne	.L220
4004
4005	leaq	32 * SIZE(BUFFER), X1
4006
4007	movq	A, A1
4008	leaq	(A1, LDA, 2), A2
4009	leaq	(A1, LDA, 4), A
4010
4011	xorps	%xmm8, %xmm8
4012	xorps	%xmm9, %xmm9
4013	xorps	%xmm10, %xmm10
4014
4015	cmpq	$3, M
4016	jle	.L217
4017
4018	testq	$SIZE, A1
4019	je	.L21X
4020
4021	movss	-32 * SIZE(A1), %xmm0
4022	movss	-32 * SIZE(X1), %xmm4
4023	mulss	%xmm4, %xmm0
4024	addss	%xmm0, %xmm8
4025	movss	-32 * SIZE(A1, LDA), %xmm1
4026	mulss	%xmm4, %xmm1
4027	addss	%xmm1, %xmm9
4028	movss	-32 * SIZE(A2), %xmm2
4029	mulss	%xmm4, %xmm2
4030	addss	%xmm2, %xmm10
4031
4032	addq	 $1 * SIZE, A1
4033	addq	 $1 * SIZE, A2
4034	addq	 $1 * SIZE, X1
4035	ALIGN_3
4036
4037.L21X:
4038	testq	$2 * SIZE, A1
4039	je	.L21XX
4040
4041#ifdef movsd
4042	xorps	%xmm0, %xmm0
4043	xorps	%xmm4, %xmm4
4044#endif
4045	movsd	-32 * SIZE(A1), %xmm0
4046	movsd	-32 * SIZE(X1), %xmm4
4047	mulps	%xmm4, %xmm0
4048	addps	%xmm0, %xmm8
4049#ifdef movsd
4050	xorps	%xmm1, %xmm1
4051#endif
4052	movsd	-32 * SIZE(A1, LDA), %xmm1
4053	mulps	%xmm4, %xmm1
4054	addps	%xmm1, %xmm9
4055#ifdef movsd
4056	xorps	%xmm2, %xmm2
4057#endif
4058	movsd	-32 * SIZE(A2), %xmm2
4059	mulps	%xmm4, %xmm2
4060	addps	%xmm2, %xmm10
4061
4062	addq	 $2 * SIZE, A1
4063	addq	 $2 * SIZE, A2
4064	addq	 $2 * SIZE, X1
4065	ALIGN_3
4066
4067.L21XX:
4068	movaps	-33 * SIZE(A1, LDA), %xmm12
4069	movaps	-34 * SIZE(A2), %xmm13
4070
4071	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
4072	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
4073
4074#ifdef PREFETCHW
4075	PREFETCHW	4 * SIZE(Y1)
4076#endif
4077
4078	movq	MM,  I
4079	sarq	$4,  I
4080	jle	.L215
4081
4082	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4083	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
4084	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
4085
4086	decq	I
4087	jle	.L213
4088	ALIGN_4
4089
4090.L212:
4091#ifdef PREFETCH
4092	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1)
4093#endif
4094
4095	mulps	%xmm4, %xmm0
4096	addps	%xmm0, %xmm8
4097	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
4098	movss	 %xmm1,  %xmm12
4099	shufps	 $0x39,  %xmm12, %xmm12
4100	mulps	 %xmm4,  %xmm12
4101	addps	 %xmm12, %xmm9
4102	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
4103
4104	shufps	 $0x4e, %xmm2, %xmm13
4105	mulps	 %xmm4, %xmm13
4106	addps	 %xmm13, %xmm10
4107	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
4108	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4109
4110#ifdef PREFETCH
4111	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA)
4112#endif
4113
4114	mulps	%xmm5, %xmm0
4115	addps	%xmm0, %xmm8
4116	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4117	movss	 %xmm12,  %xmm1
4118	shufps	 $0x39,  %xmm1, %xmm1
4119	mulps	 %xmm5,  %xmm1
4120	addps	 %xmm1, %xmm9
4121	MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1)
4122
4123	shufps	 $0x4e, %xmm13, %xmm2
4124	mulps	 %xmm5, %xmm2
4125	addps	 %xmm2, %xmm10
4126	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
4127	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4128
4129#ifdef PREFETCH
4130	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2)
4131#endif
4132
4133	mulps	%xmm4, %xmm0
4134	addps	%xmm0, %xmm8
4135	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
4136	movss	 %xmm1,  %xmm12
4137	shufps	 $0x39,  %xmm12, %xmm12
4138	mulps	 %xmm4,  %xmm12
4139	addps	 %xmm12, %xmm9
4140	MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12)
4141	shufps	 $0x4e, %xmm2, %xmm13
4142	mulps	 %xmm4, %xmm13
4143	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4144	addps	 %xmm13, %xmm10
4145	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
4146
4147#ifdef PREFETCHW
4148	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1)
4149#endif
4150
4151	mulps	%xmm5, %xmm0
4152	addps	%xmm0, %xmm8
4153	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
4154	movss	 %xmm12,  %xmm1
4155	shufps	 $0x39,  %xmm1, %xmm1
4156	mulps	 %xmm5,  %xmm1
4157	addps	 %xmm1, %xmm9
4158	MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1)
4159	shufps	 $0x4e, %xmm13, %xmm2
4160	mulps	 %xmm5, %xmm2
4161	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4162	addps	 %xmm2, %xmm10
4163	MOVUPS_A1 (-14 * SIZE, A2, %xmm2)
4164
4165	addq	$16 * SIZE, A1
4166	addq	$16 * SIZE, A2
4167	addq	$16 * SIZE, X1
4168
4169	decq	I
4170	jg	.L212
4171	ALIGN_4
4172
4173.L213:
4174	mulps	%xmm4, %xmm0
4175	addps	%xmm0, %xmm8
4176	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
4177	movss	 %xmm1,  %xmm12
4178	shufps	 $0x39,  %xmm12, %xmm12
4179	mulps	 %xmm4,  %xmm12
4180	addps	 %xmm12, %xmm9
4181	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
4182
4183	shufps	 $0x4e, %xmm2, %xmm13
4184	mulps	 %xmm4, %xmm13
4185	addps	 %xmm13, %xmm10
4186	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
4187	movss	 %xmm3,  %xmm14
4188	shufps	 $0x93, %xmm3, %xmm14
4189	mulps	 %xmm4, %xmm14
4190	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4191	addps	 %xmm14, %xmm11
4192	MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14)
4193
4194	mulps	%xmm5, %xmm0
4195	addps	%xmm0, %xmm8
4196	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4197	movss	 %xmm12,  %xmm1
4198	shufps	 $0x39,  %xmm1, %xmm1
4199	mulps	 %xmm5,  %xmm1
4200	addps	 %xmm1, %xmm9
4201	MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1)
4202
4203	shufps	 $0x4e, %xmm13, %xmm2
4204	mulps	 %xmm5, %xmm2
4205	addps	 %xmm2, %xmm10
4206	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
4207	movss	 %xmm14,  %xmm3
4208	shufps	 $0x93, %xmm14, %xmm3
4209	mulps	 %xmm5, %xmm3
4210	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4211	addps	 %xmm3, %xmm11
4212	MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3)
4213
4214	mulps	%xmm4, %xmm0
4215	addps	%xmm0, %xmm8
4216	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
4217	movss	 %xmm1,  %xmm12
4218	shufps	 $0x39,  %xmm12, %xmm12
4219	mulps	 %xmm4,  %xmm12
4220	addps	 %xmm12, %xmm9
4221	MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12)
4222
4223	shufps	 $0x4e, %xmm2, %xmm13
4224	mulps	 %xmm4, %xmm13
4225	addps	 %xmm13, %xmm10
4226	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
4227	movss	 %xmm3,  %xmm14
4228	shufps	 $0x93, %xmm3, %xmm14
4229	mulps	 %xmm4, %xmm14
4230	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4231	addps	 %xmm14, %xmm11
4232	MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14)
4233
4234	mulps	%xmm5, %xmm0
4235	addps	%xmm0, %xmm8
4236	movss	 %xmm12,  %xmm1
4237	shufps	 $0x39,  %xmm1, %xmm1
4238	mulps	 %xmm5,  %xmm1
4239	addps	 %xmm1, %xmm9
4240
4241	shufps	 $0x4e, %xmm13, %xmm2
4242	mulps	 %xmm5, %xmm2
4243	addps	 %xmm2, %xmm10
4244	movss	 %xmm14,  %xmm3
4245	shufps	 $0x93, %xmm14, %xmm3
4246	mulps	 %xmm5, %xmm3
4247	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4248	addps	 %xmm3, %xmm11
4249
4250	addq	$16 * SIZE, A1
4251	addq	$16 * SIZE, A2
4252	addq	$16 * SIZE, X1
4253	ALIGN_4
4254
4255.L215:
4256	testq	$8, MM
4257	jle	.L216
4258
4259	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4260	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
4261	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
4262
4263	mulps	%xmm4, %xmm0
4264	addps	%xmm0, %xmm8
4265	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
4266	movss	 %xmm1,  %xmm12
4267	shufps	 $0x39,  %xmm12, %xmm12
4268	mulps	 %xmm4,  %xmm12
4269	addps	 %xmm12, %xmm9
4270	MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12)
4271
4272	shufps	 $0x4e, %xmm2, %xmm13
4273	mulps	 %xmm4, %xmm13
4274	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4275	addps	 %xmm13, %xmm10
4276	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
4277
4278	mulps	%xmm5, %xmm0
4279	addps	%xmm0, %xmm8
4280	movss	 %xmm12,  %xmm1
4281	shufps	 $0x39,  %xmm1, %xmm1
4282	mulps	 %xmm5,  %xmm1
4283	addps	 %xmm1, %xmm9
4284
4285	shufps	 $0x4e, %xmm13, %xmm2
4286	mulps	 %xmm5, %xmm2
4287	addps	 %xmm2, %xmm10
4288
4289	addq	$8 * SIZE, A1
4290	addq	$8 * SIZE, A2
4291	addq	$8 * SIZE, X1
4292	ALIGN_4
4293
4294.L216:
4295	testq	$4, MM
4296	jle	.L217
4297
4298	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4299	MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1)
4300	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
4301
4302	mulps	%xmm4, %xmm0
4303	addps	%xmm0, %xmm8
4304	movss	 %xmm1,  %xmm12
4305	shufps	 $0x39,  %xmm12, %xmm12
4306	mulps	 %xmm4,  %xmm12
4307	addps	 %xmm12, %xmm9
4308
4309	shufps	 $0x4e, %xmm2, %xmm13
4310	mulps	 %xmm4, %xmm13
4311	addps	 %xmm13, %xmm10
4312
4313	addq	$4 * SIZE, A1
4314	addq	$4 * SIZE, A2
4315	addq	$4 * SIZE, X1
4316	ALIGN_4
4317
4318.L217:
4319	testq	$2, MM
4320	jle	.L218
4321
4322#ifdef movsd
4323	xorps	%xmm0, %xmm0
4324#endif
4325	movsd	-32 * SIZE(A1), %xmm0
4326#ifdef movsd
4327	xorps	%xmm4, %xmm4
4328#endif
4329	movsd	-32 * SIZE(X1), %xmm4
4330	mulps	%xmm4, %xmm0
4331	addps	%xmm0, %xmm8
4332#ifdef movsd
4333	xorps	%xmm1, %xmm1
4334#endif
4335	movsd	-32 * SIZE(A1, LDA), %xmm1
4336	mulps	%xmm4, %xmm1
4337	addps	%xmm1, %xmm9
4338#ifdef movsd
4339	xorps	%xmm2, %xmm2
4340#endif
4341	movsd	-32 * SIZE(A2), %xmm2
4342	mulps	%xmm4, %xmm2
4343	addps	%xmm2, %xmm10
4344
4345	addq	$2 * SIZE, A1
4346	addq	$2 * SIZE, A2
4347	addq	$2 * SIZE, X1
4348	ALIGN_4
4349
4350.L218:
4351	testq	$1, MM
4352	jle	.L219
4353
4354	movss	-32 * SIZE(A1), %xmm0
4355	movss	-32 * SIZE(X1), %xmm4
4356	mulss	%xmm4, %xmm0
4357	addss	%xmm0, %xmm8
4358	movss	-32 * SIZE(A1, LDA), %xmm1
4359	mulss	%xmm4, %xmm1
4360	addss	%xmm1, %xmm9
4361	movss	-32 * SIZE(A2), %xmm2
4362	mulss	%xmm4, %xmm2
4363	addss	%xmm2, %xmm10
4364	ALIGN_4
4365
4366.L219:
4367#ifdef HAVE_SSE3
4368	haddps	%xmm9, %xmm8
4369	haddps	%xmm11, %xmm10
4370	haddps	%xmm10, %xmm8
4371
4372	pshufd	$0x1, %xmm8, %xmm9
4373	pshufd	$0x2, %xmm8, %xmm10
4374#else
4375	movaps	%xmm8, %xmm0
4376	unpcklps %xmm9, %xmm8
4377	unpckhps %xmm9, %xmm0
4378
4379	movaps	%xmm10, %xmm1
4380	unpcklps %xmm11, %xmm10
4381	unpckhps %xmm11, %xmm1
4382
4383	movaps	%xmm8, %xmm9
4384	unpcklps %xmm10, %xmm8
4385	unpckhps %xmm10, %xmm9
4386
4387	movaps	%xmm0, %xmm10
4388	unpcklps %xmm1, %xmm0
4389	unpckhps %xmm1, %xmm10
4390
4391	addps	%xmm9, %xmm8
4392	addps	%xmm0, %xmm10
4393	addps	%xmm10, %xmm8
4394
4395	pshufd	$0x2, %xmm8, %xmm9
4396	pshufd	$0x1, %xmm8, %xmm10
4397#endif
4398
4399	mulss	ALPHA, %xmm8
4400	mulss	ALPHA, %xmm9
4401	mulss	ALPHA, %xmm10
4402
4403	addss	(Y), %xmm8
4404	addq	INCY, Y
4405	addss	(Y), %xmm9
4406	addq	INCY, Y
4407	addss	(Y), %xmm10
4408
4409	movss	%xmm8, (Y1)
4410	addq	INCY, Y1
4411	movss	%xmm9, (Y1)
4412	addq	INCY, Y1
4413	movss	%xmm10, (Y1)
4414	jmp	.L999
4415	ALIGN_4
4416
4417.L220:
4418	testq	N, N
4419	jle	.L999
4420
4421	cmpq	$2, N
4422	jne	.L230
4423
4424	leaq	32 * SIZE(BUFFER), X1
4425
4426	movq	A, A1
4427	leaq	(A1, LDA), A2
4428	leaq	(A1, LDA, 2), A
4429
4430	xorps	%xmm8, %xmm8
4431	xorps	%xmm9, %xmm9
4432
4433	cmpq	$3, M
4434	jle	.L227
4435
4436	testq	$SIZE, A1
4437	je	.L22X
4438
4439	movss	-32 * SIZE(A1), %xmm0
4440	movss	-32 * SIZE(X1), %xmm4
4441	mulss	%xmm4, %xmm0
4442	addss	%xmm0, %xmm8
4443	movss	-32 * SIZE(A2), %xmm1
4444	mulss	%xmm4, %xmm1
4445	addss	%xmm1, %xmm9
4446
4447	addq	 $1 * SIZE, A1
4448	addq	 $1 * SIZE, A2
4449	addq	 $1 * SIZE, X1
4450	ALIGN_3
4451
4452.L22X:
4453	testq	$2 * SIZE, A1
4454	je	.L22XX
4455
4456#ifdef movsd
4457	xorps	%xmm0, %xmm0
4458	xorps	%xmm4, %xmm4
4459#endif
4460	movsd	-32 * SIZE(A1), %xmm0
4461	movsd	-32 * SIZE(X1), %xmm4
4462	mulps	%xmm4, %xmm0
4463	addps	%xmm0, %xmm8
4464#ifdef movsd
4465	xorps	%xmm1, %xmm1
4466#endif
4467	movsd	-32 * SIZE(A2), %xmm1
4468	mulps	%xmm4, %xmm1
4469	addps	%xmm1, %xmm9
4470
4471	addq	 $2 * SIZE, A1
4472	addq	 $2 * SIZE, A2
4473	addq	 $2 * SIZE, X1
4474	ALIGN_3
4475
4476.L22XX:
4477	movaps	-33 * SIZE(A2), %xmm12
4478
4479	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
4480	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
4481
4482	movq	MM,  I
4483	sarq	$4,  I
4484	jle	.L225
4485
4486	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4487	MOVUPS_A1 (-29 * SIZE, A2, %xmm1)
4488
4489	decq	I
4490	jle	.L223
4491	ALIGN_4
4492
4493.L222:
4494#ifdef PREFETCH
4495	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
4496#endif
4497
4498	mulps	%xmm4,  %xmm0
4499	addps	%xmm0,  %xmm8
4500	MOVUPS_A1 (-28 * SIZE, A1, %xmm2)
4501	movss	%xmm1,  %xmm12
4502	shufps	$0x39,  %xmm12, %xmm12
4503	mulps	%xmm4,  %xmm12
4504	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4505	addps	%xmm12, %xmm9
4506	MOVUPS_A1 (-25 * SIZE, A2, %xmm12)
4507
4508	mulps	%xmm5,  %xmm2
4509	addps	%xmm2,  %xmm8
4510	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4511	movss	%xmm12, %xmm1
4512	shufps	$0x39,  %xmm1, %xmm1
4513	mulps	%xmm5,  %xmm1
4514	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4515	addps	%xmm1,  %xmm9
4516	MOVUPS_A1 (-21 * SIZE, A2, %xmm1)
4517
4518#ifdef PREFETCH
4519	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
4520#endif
4521
4522	mulps	%xmm4,  %xmm0
4523	addps	%xmm0,  %xmm8
4524	MOVUPS_A1 (-20 * SIZE, A1, %xmm2)
4525	movss	%xmm1,  %xmm12
4526	shufps	$0x39,  %xmm12, %xmm12
4527	mulps	%xmm4,  %xmm12
4528	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4529	addps	%xmm12, %xmm9
4530	MOVUPS_A1 (-17 * SIZE, A2, %xmm12)
4531
4532#ifdef PREFETCHW
4533	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
4534#endif
4535
4536	mulps	%xmm5,  %xmm2
4537	addps	%xmm2,  %xmm8
4538	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
4539	movss	%xmm12, %xmm1
4540	shufps	$0x39,  %xmm1, %xmm1
4541	mulps	%xmm5,  %xmm1
4542	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4543	addps	%xmm1,  %xmm9
4544	MOVUPS_A1 (-13 * SIZE, A2, %xmm1)
4545
4546	addq	$16 * SIZE, A1
4547	addq	$16 * SIZE, A2
4548	addq	$16 * SIZE, X1
4549
4550	decq	I
4551	jg	.L222
4552	ALIGN_4
4553
4554.L223:
4555	mulps	%xmm4,  %xmm0
4556	addps	%xmm0,  %xmm8
4557	MOVUPS_A1 (-28 * SIZE, A1, %xmm2)
4558	movss	%xmm1,  %xmm12
4559	shufps	$0x39,  %xmm12, %xmm12
4560	mulps	%xmm4,  %xmm12
4561	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4562	addps	%xmm12, %xmm9
4563	MOVUPS_A1 (-25 * SIZE, A2, %xmm12)
4564
4565	mulps	%xmm5,  %xmm2
4566	addps	%xmm2,  %xmm8
4567	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4568	movss	%xmm12, %xmm1
4569	shufps	$0x39,  %xmm1, %xmm1
4570	mulps	%xmm5,  %xmm1
4571	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4572	addps	%xmm1,  %xmm9
4573	MOVUPS_A1 (-21 * SIZE, A2, %xmm1)
4574
4575	mulps	%xmm4,  %xmm0
4576	addps	%xmm0,  %xmm8
4577	MOVUPS_A1 (-20 * SIZE, A1, %xmm2)
4578	movss	%xmm1,  %xmm12
4579	shufps	$0x39,  %xmm12, %xmm12
4580	mulps	%xmm4,  %xmm12
4581	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4582	addps	%xmm12, %xmm9
4583	MOVUPS_A1 (-17 * SIZE, A2, %xmm12)
4584
4585	mulps	%xmm5,  %xmm2
4586	addps	%xmm2,  %xmm8
4587	movss	%xmm12, %xmm1
4588	shufps	$0x39,  %xmm1, %xmm1
4589	mulps	%xmm5,  %xmm1
4590	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4591	addps	%xmm1,  %xmm9
4592
4593	addq	$16 * SIZE, A1
4594	addq	$16 * SIZE, A2
4595	addq	$16 * SIZE, X1
4596	ALIGN_4
4597
4598.L225:
4599	testq	$8, MM
4600	jle	.L226
4601
4602	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4603	MOVUPS_A1 (-29 * SIZE, A2, %xmm1)
4604
4605	mulps	%xmm4,  %xmm0
4606	addps	%xmm0,  %xmm8
4607	MOVUPS_A1 (-28 * SIZE, A1, %xmm2)
4608	movss	%xmm1,  %xmm12
4609	shufps	$0x39,  %xmm12, %xmm12
4610	mulps	%xmm4,  %xmm12
4611	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4612	addps	%xmm12, %xmm9
4613	MOVUPS_A1 (-25 * SIZE, A2, %xmm12)
4614
4615	mulps	%xmm5,  %xmm2
4616	addps	%xmm2,  %xmm8
4617	movss	%xmm12, %xmm1
4618	shufps	$0x39,  %xmm1, %xmm1
4619	mulps	%xmm5,  %xmm1
4620	addps	%xmm1,  %xmm9
4621
4622	addq	$8 * SIZE, A1
4623	addq	$8 * SIZE, A2
4624	addq	$8 * SIZE, X1
4625	ALIGN_4
4626
4627.L226:
4628	testq	$4, MM
4629	jle	.L227
4630
4631	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4632	MOVUPS_A1 (-29 * SIZE, A2, %xmm1)
4633
4634	mulps	%xmm4, %xmm0
4635	addps	%xmm0, %xmm8
4636
4637	movss	 %xmm1,  %xmm12
4638	shufps	 $0x39,  %xmm12, %xmm12
4639	mulps	 %xmm4,  %xmm12
4640	addps	 %xmm12, %xmm9
4641
4642	addq	$4 * SIZE, A1
4643	addq	$4 * SIZE, A2
4644	addq	$4 * SIZE, X1
4645	ALIGN_4
4646
4647.L227:
4648	testq	$2, MM
4649	jle	.L228
4650
4651#ifdef movsd
4652	xorps	%xmm0, %xmm0
4653#endif
4654	movsd	-32 * SIZE(A1), %xmm0
4655#ifdef movsd
4656	xorps	%xmm4, %xmm4
4657#endif
4658	movsd	-32 * SIZE(X1), %xmm4
4659	mulps	%xmm4, %xmm0
4660	addps	%xmm0, %xmm8
4661#ifdef movsd
4662	xorps	%xmm1, %xmm1
4663#endif
4664	movsd	-32 * SIZE(A2), %xmm1
4665	mulps	%xmm4, %xmm1
4666	addps	%xmm1, %xmm9
4667	shufps	$0xe, %xmm4, %xmm4
4668
4669	addq	$2 * SIZE, A1
4670	addq	$2 * SIZE, A2
4671	addq	$2 * SIZE, X1
4672	ALIGN_4
4673
4674.L228:
4675	testq	$1, MM
4676	jle	.L229
4677
4678	movss	-32 * SIZE(A1), %xmm0
4679	movss	-32 * SIZE(X1), %xmm4
4680	mulss	%xmm4, %xmm0
4681	addss	%xmm0, %xmm8
4682	movss	-32 * SIZE(A2), %xmm1
4683	mulss	%xmm4, %xmm1
4684	addss	%xmm1, %xmm9
4685	ALIGN_4
4686
4687.L229:
4688#ifdef HAVE_SSE3
4689	haddps	%xmm9, %xmm8
4690	haddps	%xmm8, %xmm8
4691#else
4692	movaps	%xmm8, %xmm10
4693	unpcklps %xmm9, %xmm8
4694	unpckhps %xmm9, %xmm10
4695
4696	addps	%xmm10, %xmm8
4697	movhlps %xmm8, %xmm9
4698	addps	%xmm9, %xmm8
4699#endif
4700
4701	pshufd	$0x1, %xmm8, %xmm9
4702
4703	mulss	ALPHA, %xmm8
4704	mulss	ALPHA, %xmm9
4705
4706	addss	(Y), %xmm8
4707	addq	INCY, Y
4708	addss	(Y), %xmm9
4709	addq	INCY, Y
4710
4711	movss	%xmm8, (Y1)
4712	addq	INCY, Y1
4713	movss	%xmm9, (Y1)
4714	addq	INCY, Y1
4715	jmp	.L999
4716	ALIGN_4
4717
4718.L230:
4719	cmpq	$1, N
4720	jne	.L999
4721
4722	leaq	32 * SIZE(BUFFER), X1
4723
4724	movq	A, A1
4725
4726	xorps	%xmm8, %xmm8
4727	xorps	%xmm9, %xmm9
4728
4729	cmpq	$3, M
4730	jle	.L237
4731
4732	testq	$SIZE, A1
4733	je	.L23X
4734
4735	movss	-32 * SIZE(A1), %xmm0
4736	movss	-32 * SIZE(X1), %xmm4
4737	mulss	%xmm4, %xmm0
4738	addss	%xmm0, %xmm8
4739
4740	addq	 $1 * SIZE, A1
4741	addq	 $1 * SIZE, X1
4742	ALIGN_3
4743
4744.L23X:
4745	testq	$2 * SIZE, A1
4746	je	.L23XX
4747
4748#ifdef movsd
4749	xorps	%xmm0, %xmm0
4750	xorps	%xmm4, %xmm4
4751#endif
4752	movsd	-32 * SIZE(A1), %xmm0
4753	movsd	-32 * SIZE(X1), %xmm4
4754	mulps	%xmm4, %xmm0
4755	addps	%xmm0, %xmm8
4756	shufps	$0xe, %xmm4, %xmm4
4757
4758	addq	 $2 * SIZE, A1
4759	addq	 $2 * SIZE, X1
4760	ALIGN_3
4761
4762.L23XX:
4763	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
4764	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
4765
4766
4767	movq	MM,  I
4768	sarq	$4,  I
4769	jle	.L235
4770
4771	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4772	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
4773
4774	decq	I
4775	jle	.L233
4776	ALIGN_4
4777
4778.L232:
4779#ifdef PREFETCH
4780	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
4781#endif
4782
4783	mulps	%xmm4, %xmm0
4784	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4785	addps	%xmm0, %xmm8
4786	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4787
4788	mulps	%xmm5, %xmm12
4789	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4790	addps	%xmm12, %xmm9
4791	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
4792
4793#ifdef PREFETCHW
4794	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
4795#endif
4796
4797	mulps	%xmm4, %xmm0
4798	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4799	addps	%xmm0, %xmm8
4800	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
4801
4802	mulps	%xmm5, %xmm12
4803	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4804	addps	%xmm12, %xmm9
4805	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
4806
4807	addq	$16 * SIZE, A1
4808	addq	$16 * SIZE, X1
4809
4810	decq	I
4811	jg	.L232
4812	ALIGN_4
4813
4814.L233:
4815	mulps	%xmm4, %xmm0
4816	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4817	addps	%xmm0, %xmm8
4818	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
4819
4820	mulps	%xmm5, %xmm12
4821	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
4822	addps	%xmm12, %xmm9
4823	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
4824
4825	mulps	%xmm4, %xmm0
4826	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
4827	addps	%xmm0, %xmm8
4828
4829	mulps	%xmm5, %xmm12
4830	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
4831	addps	%xmm12, %xmm9
4832
4833	addq	$16 * SIZE, A1
4834	addq	$16 * SIZE, X1
4835	ALIGN_4
4836
4837.L235:
4838	testq	$8, MM
4839	jle	.L236
4840
4841	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4842	mulps	%xmm4, %xmm0
4843	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
4844	addps	%xmm0, %xmm8
4845
4846	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
4847	mulps	%xmm5, %xmm12
4848	addps	%xmm12, %xmm9
4849
4850	addq	$8 * SIZE, A1
4851	addq	$8 * SIZE, X1
4852	ALIGN_4
4853
4854.L236:
4855	testq	$4, MM
4856	jle	.L237
4857
4858	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
4859	mulps	%xmm4, %xmm0
4860	addps	%xmm0, %xmm8
4861
4862	addq	$4 * SIZE, A1
4863	addq	$4 * SIZE, X1
4864	ALIGN_4
4865
4866.L237:
4867	testq	$2, MM
4868	jle	.L238
4869
4870#ifdef movsd
4871	xorps	%xmm0, %xmm0
4872#endif
4873	movsd	-32 * SIZE(A1), %xmm0
4874#ifdef movsd
4875	xorps	%xmm4, %xmm4
4876#endif
4877	movsd	-32 * SIZE(X1), %xmm4
4878	mulps	%xmm4, %xmm0
4879	addps	%xmm0, %xmm8
4880	shufps	$0xe, %xmm4, %xmm4
4881
4882	addq	$2 * SIZE, A1
4883	addq	$2 * SIZE, X1
4884	ALIGN_4
4885
4886.L238:
4887	testq	$1, MM
4888	jle	.L239
4889
4890	movss	-32 * SIZE(A1), %xmm0
4891	movss	-32 * SIZE(X1), %xmm4
4892	mulss	%xmm4, %xmm0
4893	addss	%xmm0, %xmm8
4894	ALIGN_4
4895
4896.L239:
4897	addps	%xmm9, %xmm8
4898
4899#ifdef HAVE_SSE3
4900	haddps	%xmm8, %xmm8
4901	haddps	%xmm8, %xmm8
4902#else
4903	pshufd	$1, %xmm8, %xmm9
4904	pshufd	$2, %xmm8, %xmm10
4905	pshufd	$3, %xmm8, %xmm11
4906
4907	addss	%xmm9, %xmm8
4908	addss	%xmm11, %xmm10
4909	addss	%xmm10, %xmm8
4910#endif
4911
4912	mulss	ALPHA, %xmm8
4913
4914	addss	(Y), %xmm8
4915	movss	%xmm8, (Y1)
4916	jmp	.L999
4917	ALIGN_4
4918
4919.L300:
4920	cmpq	$4, N
4921	jl	.L310
4922	ALIGN_3
4923
4924.L301:
4925	subq	$4, N
4926
4927	leaq	32 * SIZE(BUFFER), X1
4928
4929	movq	A, A1
4930	leaq	(A1, LDA, 2), A2
4931	leaq	(A1, LDA, 4), A
4932
4933	xorps	%xmm8, %xmm8
4934	xorps	%xmm9, %xmm9
4935	xorps	%xmm10, %xmm10
4936	xorps	%xmm11, %xmm11
4937
4938	cmpq	$3, M
4939	jle	.L307
4940
4941	testq	$SIZE, A1
4942	je	.L30X
4943
4944	movss	-32 * SIZE(A1), %xmm0
4945	movss	-32 * SIZE(X1), %xmm4
4946	mulss	%xmm4, %xmm0
4947	addss	%xmm0, %xmm8
4948	movss	-32 * SIZE(A1, LDA), %xmm1
4949	mulss	%xmm4, %xmm1
4950	addss	%xmm1, %xmm9
4951	movss	-32 * SIZE(A2), %xmm2
4952	mulss	%xmm4, %xmm2
4953	addss	%xmm2, %xmm10
4954	movss	-32 * SIZE(A2, LDA), %xmm3
4955	mulss	%xmm4, %xmm3
4956	addss	%xmm3, %xmm11
4957
4958	addq	 $1 * SIZE, A1
4959	addq	 $1 * SIZE, A2
4960	addq	 $1 * SIZE, X1
4961	ALIGN_3
4962
4963.L30X:
4964	testq	$2 * SIZE, A1
4965	je	.L30XX
4966
4967#ifdef movsd
4968	xorps	%xmm0, %xmm0
4969	xorps	%xmm4, %xmm4
4970#endif
4971	movsd	-32 * SIZE(A1), %xmm0
4972	movsd	-32 * SIZE(X1), %xmm4
4973	mulps	%xmm4, %xmm0
4974	addps	%xmm0, %xmm8
4975#ifdef movsd
4976	xorps	%xmm1, %xmm1
4977#endif
4978	movsd	-32 * SIZE(A1, LDA), %xmm1
4979	mulps	%xmm4, %xmm1
4980	addps	%xmm1, %xmm9
4981#ifdef movsd
4982	xorps	%xmm2, %xmm2
4983#endif
4984	movsd	-32 * SIZE(A2), %xmm2
4985	mulps	%xmm4, %xmm2
4986	addps	%xmm2, %xmm10
4987#ifdef movsd
4988	xorps	%xmm3, %xmm3
4989#endif
4990	movsd	-32 * SIZE(A2, LDA), %xmm3
4991	mulps	%xmm4, %xmm3
4992	addps	%xmm3, %xmm11
4993
4994	addq	 $2 * SIZE, A1
4995	addq	 $2 * SIZE, A2
4996	addq	 $2 * SIZE, X1
4997	ALIGN_3
4998
4999.L30XX:
5000	movaps	-35 * SIZE(A1, LDA), %xmm12
5001	movaps	-34 * SIZE(A2), %xmm13
5002	movaps	-33 * SIZE(A2, LDA), %xmm14
5003
5004	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
5005	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
5006
5007#ifdef PREFETCHW
5008	PREFETCHW	4 * SIZE(Y1)
5009#endif
5010
5011	movq	MM,  I
5012	sarq	$4,  I
5013	jle	.L305
5014
5015	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5016	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5017	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5018	MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3)
5019
5020	decq	I
5021	jle	.L303
5022	ALIGN_4
5023
5024.L302:
5025#ifdef PREFETCH
5026	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
5027#endif
5028
5029	mulps	 %xmm4, %xmm0
5030	addps	 %xmm0, %xmm8
5031	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5032	movss	 %xmm1,  %xmm12
5033	shufps	 $0x93, %xmm1, %xmm12
5034	mulps	 %xmm4, %xmm12
5035	addps	 %xmm12, %xmm9
5036	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5037
5038	shufps	 $0x4e, %xmm2, %xmm13
5039	mulps	 %xmm4, %xmm13
5040	addps	 %xmm13, %xmm10
5041	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5042	movss	 %xmm3,  %xmm14
5043	shufps	 $0x39,  %xmm14, %xmm14
5044	mulps	 %xmm4,  %xmm14
5045	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5046	addps	 %xmm14, %xmm11
5047	MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14)
5048
5049#ifdef PREFETCH
5050	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
5051#endif
5052
5053	mulps	 %xmm5, %xmm0
5054	addps	 %xmm0, %xmm8
5055	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5056	movss	 %xmm12,  %xmm1
5057	shufps	 $0x93, %xmm12, %xmm1
5058	mulps	 %xmm5, %xmm1
5059	addps	 %xmm1, %xmm9
5060	MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1)
5061
5062	shufps	 $0x4e, %xmm13, %xmm2
5063	mulps	 %xmm5, %xmm2
5064	addps	 %xmm2, %xmm10
5065	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
5066	movss	 %xmm14,  %xmm3
5067	shufps	 $0x39,  %xmm3, %xmm3
5068	mulps	 %xmm5,  %xmm3
5069	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5070	addps	 %xmm3, %xmm11
5071	MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3)
5072
5073#ifdef PREFETCH
5074	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
5075#endif
5076
5077	mulps	 %xmm4, %xmm0
5078	addps	 %xmm0, %xmm8
5079	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5080	movss	 %xmm1,  %xmm12
5081	shufps	 $0x93, %xmm1, %xmm12
5082	mulps	 %xmm4, %xmm12
5083	addps	 %xmm12, %xmm9
5084	MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12)
5085
5086	shufps	 $0x4e, %xmm2, %xmm13
5087	mulps	 %xmm4, %xmm13
5088	addps	 %xmm13, %xmm10
5089	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
5090	movss	 %xmm3,  %xmm14
5091	shufps	 $0x39,  %xmm14, %xmm14
5092	mulps	 %xmm4,  %xmm14
5093	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5094	addps	 %xmm14, %xmm11
5095	MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14)
5096
5097#ifdef PREFETCH
5098	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
5099#endif
5100
5101	mulps	 %xmm5, %xmm0
5102	addps	 %xmm0, %xmm8
5103	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
5104	movss	 %xmm12,  %xmm1
5105	shufps	 $0x93, %xmm12, %xmm1
5106	mulps	 %xmm5, %xmm1
5107	addps	 %xmm1, %xmm9
5108	MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1)
5109
5110#ifdef PREFETCHW
5111	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
5112#endif
5113
5114	shufps	 $0x4e, %xmm13, %xmm2
5115	mulps	 %xmm5, %xmm2
5116	addps	 %xmm2, %xmm10
5117	MOVUPS_A1 (-14 * SIZE, A2, %xmm2)
5118	movss	 %xmm14,  %xmm3
5119	shufps	 $0x39,  %xmm3, %xmm3
5120	mulps	 %xmm5,  %xmm3
5121	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5122	addps	 %xmm3, %xmm11
5123	MOVUPS_A2 (-13 * SIZE, A2, LDA, 1, %xmm3)
5124
5125	addq	$16 * SIZE, A1
5126	addq	$16 * SIZE, A2
5127	addq	$16 * SIZE, X1
5128
5129	decq	I
5130	jg	.L302
5131	ALIGN_4
5132
5133.L303:
5134	mulps	 %xmm4, %xmm0
5135	addps	 %xmm0, %xmm8
5136	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5137	movss	 %xmm1,  %xmm12
5138	shufps	 $0x93, %xmm1, %xmm12
5139	mulps	 %xmm4, %xmm12
5140	addps	 %xmm12, %xmm9
5141	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5142
5143	shufps	 $0x4e, %xmm2, %xmm13
5144	mulps	 %xmm4, %xmm13
5145	addps	 %xmm13, %xmm10
5146	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5147	movss	 %xmm3,  %xmm14
5148	shufps	 $0x39,  %xmm14, %xmm14
5149	mulps	 %xmm4,  %xmm14
5150	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5151	addps	 %xmm14, %xmm11
5152	MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14)
5153
5154	mulps	 %xmm5, %xmm0
5155	addps	 %xmm0, %xmm8
5156	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5157	movss	 %xmm12,  %xmm1
5158	shufps	 $0x93, %xmm12, %xmm1
5159	mulps	 %xmm5, %xmm1
5160	addps	 %xmm1, %xmm9
5161	MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1)
5162
5163	shufps	 $0x4e, %xmm13, %xmm2
5164	mulps	 %xmm5, %xmm2
5165	addps	 %xmm2, %xmm10
5166	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
5167	movss	 %xmm14,  %xmm3
5168	shufps	 $0x39,  %xmm3, %xmm3
5169	mulps	 %xmm5,  %xmm3
5170	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5171	addps	 %xmm3, %xmm11
5172	MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3)
5173
5174	mulps	 %xmm4, %xmm0
5175	addps	 %xmm0, %xmm8
5176	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5177	movss	 %xmm1,  %xmm12
5178	shufps	 $0x93, %xmm1, %xmm12
5179	mulps	 %xmm4, %xmm12
5180	addps	 %xmm12, %xmm9
5181	MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12)
5182
5183	shufps	 $0x4e, %xmm2, %xmm13
5184	mulps	 %xmm4, %xmm13
5185	addps	 %xmm13, %xmm10
5186	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
5187	movss	 %xmm3,  %xmm14
5188	shufps	 $0x39,  %xmm14, %xmm14
5189	mulps	 %xmm4,  %xmm14
5190	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5191	addps	 %xmm14, %xmm11
5192	MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14)
5193
5194	mulps	 %xmm5, %xmm0
5195	addps	 %xmm0, %xmm8
5196	movss	 %xmm12,  %xmm1
5197	shufps	 $0x93, %xmm12, %xmm1
5198	mulps	 %xmm5, %xmm1
5199	addps	 %xmm1, %xmm9
5200
5201	shufps	 $0x4e, %xmm13, %xmm2
5202	mulps	 %xmm5, %xmm2
5203	addps	 %xmm2, %xmm10
5204	movss	 %xmm14,  %xmm3
5205	shufps	 $0x39,  %xmm3, %xmm3
5206	mulps	 %xmm5,  %xmm3
5207	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5208	addps	 %xmm3, %xmm11
5209
5210	addq	$16 * SIZE, A1
5211	addq	$16 * SIZE, A2
5212	addq	$16 * SIZE, X1
5213	ALIGN_4
5214
5215.L305:
5216	testq	$8, MM
5217	jle	.L306
5218
5219	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5220	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5221	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5222	MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3)
5223
5224	mulps	 %xmm4, %xmm0
5225	addps	 %xmm0, %xmm8
5226	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5227	movss	 %xmm1,  %xmm12
5228	shufps	 $0x93, %xmm1, %xmm12
5229	mulps	 %xmm4, %xmm12
5230	addps	 %xmm12, %xmm9
5231	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5232
5233	shufps	 $0x4e, %xmm2, %xmm13
5234	mulps	 %xmm4, %xmm13
5235	addps	 %xmm13, %xmm10
5236	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5237	movss	 %xmm3,  %xmm14
5238	shufps	 $0x39,  %xmm14, %xmm14
5239	mulps	 %xmm4,  %xmm14
5240	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5241	addps	 %xmm14, %xmm11
5242	MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14)
5243
5244	mulps	 %xmm5, %xmm0
5245	addps	 %xmm0, %xmm8
5246	movss	 %xmm12,  %xmm1
5247	shufps	 $0x93, %xmm12, %xmm1
5248	mulps	 %xmm5, %xmm1
5249	addps	 %xmm1, %xmm9
5250
5251	shufps	 $0x4e, %xmm13, %xmm2
5252	mulps	 %xmm5, %xmm2
5253	addps	 %xmm2, %xmm10
5254	movss	 %xmm14,  %xmm3
5255	shufps	 $0x39,  %xmm3, %xmm3
5256	mulps	 %xmm5,  %xmm3
5257	addps	 %xmm3, %xmm11
5258
5259	addq	$8 * SIZE, A1
5260	addq	$8 * SIZE, A2
5261	addq	$8 * SIZE, X1
5262	ALIGN_4
5263
5264.L306:
5265	testq	$4, MM
5266	jle	.L307
5267
5268	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5269	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5270	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5271	MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3)
5272
5273	mulps	 %xmm4, %xmm0
5274	addps	 %xmm0, %xmm8
5275	movss	 %xmm1,  %xmm12
5276	shufps	 $0x93, %xmm1, %xmm12
5277	mulps	 %xmm4, %xmm12
5278	addps	 %xmm12, %xmm9
5279
5280	shufps	 $0x4e, %xmm2, %xmm13
5281	mulps	 %xmm4, %xmm13
5282	addps	 %xmm13, %xmm10
5283	movss	 %xmm3,  %xmm14
5284	shufps	 $0x39,  %xmm14, %xmm14
5285	mulps	 %xmm4,  %xmm14
5286	addps	 %xmm14, %xmm11
5287
5288	addq	$4 * SIZE, A1
5289	addq	$4 * SIZE, A2
5290	addq	$4 * SIZE, X1
5291	ALIGN_4
5292
5293.L307:
5294	testq	$2, MM
5295	jle	.L308
5296
5297#ifdef movsd
5298	xorps	%xmm0, %xmm0
5299#endif
5300	movsd	-32 * SIZE(A1), %xmm0
5301#ifdef movsd
5302	xorps	%xmm4, %xmm4
5303#endif
5304	movsd	-32 * SIZE(X1), %xmm4
5305	mulps	%xmm4, %xmm0
5306	addps	%xmm0, %xmm8
5307#ifdef movsd
5308	xorps	%xmm1, %xmm1
5309#endif
5310	movsd	-32 * SIZE(A1, LDA), %xmm1
5311	mulps	%xmm4, %xmm1
5312	addps	%xmm1, %xmm9
5313#ifdef movsd
5314	xorps	%xmm2, %xmm2
5315#endif
5316	movsd	-32 * SIZE(A2), %xmm2
5317	mulps	%xmm4, %xmm2
5318	addps	%xmm2, %xmm10
5319#ifdef movsd
5320	xorps	%xmm3, %xmm3
5321#endif
5322	movsd	-32 * SIZE(A2, LDA), %xmm3
5323	mulps	%xmm4, %xmm3
5324	addps	%xmm3, %xmm11
5325	shufps	$0xe, %xmm4, %xmm4
5326
5327	addq	$2 * SIZE, A1
5328	addq	$2 * SIZE, A2
5329	addq	$2 * SIZE, X1
5330	ALIGN_4
5331
5332.L308:
5333	testq	$1, MM
5334	jle	.L309
5335
5336	movss	-32 * SIZE(A1), %xmm0
5337	movss	-32 * SIZE(X1), %xmm4
5338	mulss	%xmm4, %xmm0
5339	addss	%xmm0, %xmm8
5340	movss	-32 * SIZE(A1, LDA), %xmm1
5341	mulss	%xmm4, %xmm1
5342	addss	%xmm1, %xmm9
5343	movss	-32 * SIZE(A2), %xmm2
5344	mulss	%xmm4, %xmm2
5345	addss	%xmm2, %xmm10
5346	movss	-32 * SIZE(A2, LDA), %xmm3
5347	mulss	%xmm4, %xmm3
5348	addss	%xmm3, %xmm11
5349	ALIGN_4
5350
5351.L309:
5352#ifdef HAVE_SSE3
5353	haddps	%xmm9, %xmm8
5354	haddps	%xmm11, %xmm10
5355	haddps	%xmm10, %xmm8
5356
5357	pshufd	$0x1, %xmm8, %xmm9
5358	pshufd	$0x2, %xmm8, %xmm10
5359	pshufd	$0x3, %xmm8, %xmm11
5360#else
5361	movaps	%xmm8, %xmm0
5362	unpcklps %xmm9, %xmm8
5363	unpckhps %xmm9, %xmm0
5364
5365	movaps	%xmm10, %xmm1
5366	unpcklps %xmm11, %xmm10
5367	unpckhps %xmm11, %xmm1
5368
5369	movaps	%xmm8, %xmm9
5370	unpcklps %xmm10, %xmm8
5371	unpckhps %xmm10, %xmm9
5372
5373	movaps	%xmm0, %xmm10
5374	unpcklps %xmm1, %xmm0
5375	unpckhps %xmm1, %xmm10
5376
5377	addps	%xmm9, %xmm8
5378	addps	%xmm0, %xmm10
5379	addps	%xmm10, %xmm8
5380
5381	pshufd	$0x2, %xmm8, %xmm9
5382	pshufd	$0x1, %xmm8, %xmm10
5383	pshufd	$0x3, %xmm8, %xmm11
5384#endif
5385
5386	mulss	ALPHA, %xmm8
5387	mulss	ALPHA, %xmm9
5388	mulss	ALPHA, %xmm10
5389	mulss	ALPHA, %xmm11
5390
5391	addss	(Y), %xmm8
5392	addq	INCY, Y
5393	addss	(Y), %xmm9
5394	addq	INCY, Y
5395	addss	(Y), %xmm10
5396	addq	INCY, Y
5397	addss	(Y), %xmm11
5398	addq	INCY, Y
5399
5400	movss	%xmm8, (Y1)
5401	addq	INCY, Y1
5402	movss	%xmm9, (Y1)
5403	addq	INCY, Y1
5404	movss	%xmm10, (Y1)
5405	addq	INCY, Y1
5406	movss	%xmm11, (Y1)
5407	addq	INCY, Y1
5408
5409	cmpq	$4, N
5410	jge	.L301
5411	ALIGN_4
5412
5413.L310:
5414	testq	N, N
5415	jle	.L999
5416
5417	cmpq	$3, N
5418	jne	.L320
5419
5420	leaq	32 * SIZE(BUFFER), X1
5421
5422	movq	A, A1
5423	leaq	(A1, LDA, 2), A2
5424	leaq	(A1, LDA, 4), A
5425
5426	xorps	%xmm8, %xmm8
5427	xorps	%xmm9, %xmm9
5428	xorps	%xmm10, %xmm10
5429
5430	cmpq	$3, M
5431	jle	.L317
5432
5433	testq	$SIZE, A1
5434	je	.L31X
5435
5436	movss	-32 * SIZE(A1), %xmm0
5437	movss	-32 * SIZE(X1), %xmm4
5438	mulss	%xmm4, %xmm0
5439	addss	%xmm0, %xmm8
5440	movss	-32 * SIZE(A1, LDA), %xmm1
5441	mulss	%xmm4, %xmm1
5442	addss	%xmm1, %xmm9
5443	movss	-32 * SIZE(A2), %xmm2
5444	mulss	%xmm4, %xmm2
5445	addss	%xmm2, %xmm10
5446
5447	addq	 $1 * SIZE, A1
5448	addq	 $1 * SIZE, A2
5449	addq	 $1 * SIZE, X1
5450	ALIGN_3
5451
5452.L31X:
5453	testq	$2 * SIZE, A1
5454	je	.L31XX
5455
5456#ifdef movsd
5457	xorps	%xmm0, %xmm0
5458	xorps	%xmm4, %xmm4
5459#endif
5460	movsd	-32 * SIZE(A1), %xmm0
5461	movsd	-32 * SIZE(X1), %xmm4
5462	mulps	%xmm4, %xmm0
5463	addps	%xmm0, %xmm8
5464#ifdef movsd
5465	xorps	%xmm1, %xmm1
5466#endif
5467	movsd	-32 * SIZE(A1, LDA), %xmm1
5468	mulps	%xmm4, %xmm1
5469	addps	%xmm1, %xmm9
5470#ifdef movsd
5471	xorps	%xmm2, %xmm2
5472#endif
5473	movsd	-32 * SIZE(A2), %xmm2
5474	mulps	%xmm4, %xmm2
5475	addps	%xmm2, %xmm10
5476
5477	addq	 $2 * SIZE, A1
5478	addq	 $2 * SIZE, A2
5479	addq	 $2 * SIZE, X1
5480	ALIGN_3
5481
5482.L31XX:
5483	movaps	-35 * SIZE(A1, LDA), %xmm12
5484	movaps	-34 * SIZE(A2), %xmm13
5485
5486	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
5487	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
5488
5489	movq	MM,  I
5490	sarq	$4,  I
5491	jle	.L315
5492
5493	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5494	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5495	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5496
5497	decq	I
5498	jle	.L313
5499	ALIGN_4
5500
5501.L312:
5502#ifdef PREFETCH
5503	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1)
5504#endif
5505
5506	mulps	 %xmm4, %xmm0
5507	addps	 %xmm0, %xmm8
5508	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5509	movss	 %xmm1,  %xmm12
5510	shufps	 $0x93, %xmm1, %xmm12
5511	mulps	 %xmm4, %xmm12
5512	addps	 %xmm12, %xmm9
5513	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5514
5515	shufps	 $0x4e, %xmm2, %xmm13
5516	mulps	 %xmm4, %xmm13
5517	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5518	addps	 %xmm13, %xmm10
5519	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5520
5521#ifdef PREFETCH
5522	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA)
5523#endif
5524
5525	mulps	 %xmm5, %xmm0
5526	addps	 %xmm0, %xmm8
5527	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5528	movss	 %xmm12,  %xmm1
5529	shufps	 $0x93, %xmm12, %xmm1
5530	mulps	 %xmm5, %xmm1
5531	addps	 %xmm1, %xmm9
5532	MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1)
5533	shufps	 $0x4e, %xmm13, %xmm2
5534	mulps	 %xmm5, %xmm2
5535	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5536	addps	 %xmm2, %xmm10
5537	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
5538
5539#ifdef PREFETCH
5540	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2)
5541#endif
5542
5543	mulps	 %xmm4, %xmm0
5544	addps	 %xmm0, %xmm8
5545	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5546	movss	 %xmm1,  %xmm12
5547	shufps	 $0x93, %xmm1, %xmm12
5548	mulps	 %xmm4, %xmm12
5549	addps	 %xmm12, %xmm9
5550	MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12)
5551	shufps	 $0x4e, %xmm2, %xmm13
5552	mulps	 %xmm4, %xmm13
5553	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5554	addps	 %xmm13, %xmm10
5555	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
5556
5557#ifdef PREFETCHW
5558	PREFETCH	(PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1)
5559#endif
5560
5561	mulps	 %xmm5, %xmm0
5562	addps	 %xmm0, %xmm8
5563	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
5564	movss	 %xmm12,  %xmm1
5565	shufps	 $0x93, %xmm12, %xmm1
5566	mulps	 %xmm5, %xmm1
5567	addps	 %xmm1, %xmm9
5568	MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1)
5569	shufps	 $0x4e, %xmm13, %xmm2
5570	mulps	 %xmm5, %xmm2
5571	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5572	addps	 %xmm2, %xmm10
5573	MOVUPS_A1 (-14 * SIZE, A2, %xmm2)
5574
5575	addq	$16 * SIZE, A1
5576	addq	$16 * SIZE, A2
5577	addq	$16 * SIZE, X1
5578
5579	decq	I
5580	jg	.L312
5581	ALIGN_4
5582
5583.L313:
5584	mulps	 %xmm4, %xmm0
5585	addps	 %xmm0, %xmm8
5586	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5587	movss	 %xmm1,  %xmm12
5588	shufps	 $0x93, %xmm1, %xmm12
5589	mulps	 %xmm4, %xmm12
5590	addps	 %xmm12, %xmm9
5591	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5592
5593	shufps	 $0x4e, %xmm2, %xmm13
5594	mulps	 %xmm4, %xmm13
5595	addps	 %xmm13, %xmm10
5596	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5597	movss	 %xmm3,  %xmm14
5598	shufps	 $0x39,  %xmm14, %xmm14
5599	mulps	 %xmm4,  %xmm14
5600	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5601	addps	 %xmm14, %xmm11
5602	MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14)
5603
5604	mulps	 %xmm5, %xmm0
5605	addps	 %xmm0, %xmm8
5606	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5607	movss	 %xmm12,  %xmm1
5608	shufps	 $0x93, %xmm12, %xmm1
5609	mulps	 %xmm5, %xmm1
5610	addps	 %xmm1, %xmm9
5611	MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1)
5612
5613	shufps	 $0x4e, %xmm13, %xmm2
5614	mulps	 %xmm5, %xmm2
5615	addps	 %xmm2, %xmm10
5616	MOVUPS_A1 (-22 * SIZE, A2, %xmm2)
5617	movss	 %xmm14,  %xmm3
5618	shufps	 $0x39,  %xmm3, %xmm3
5619	mulps	 %xmm5,  %xmm3
5620	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5621	addps	 %xmm3, %xmm11
5622	MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3)
5623
5624	mulps	 %xmm4, %xmm0
5625	addps	 %xmm0, %xmm8
5626	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5627	movss	 %xmm1,  %xmm12
5628	shufps	 $0x93, %xmm1, %xmm12
5629	mulps	 %xmm4, %xmm12
5630	addps	 %xmm12, %xmm9
5631	MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12)
5632
5633	shufps	 $0x4e, %xmm2, %xmm13
5634	mulps	 %xmm4, %xmm13
5635	addps	 %xmm13, %xmm10
5636	MOVUPS_A1 (-18 * SIZE, A2, %xmm13)
5637	movss	 %xmm3,  %xmm14
5638	shufps	 $0x39,  %xmm14, %xmm14
5639	mulps	 %xmm4,  %xmm14
5640	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5641	addps	 %xmm14, %xmm11
5642	MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14)
5643
5644	mulps	 %xmm5, %xmm0
5645	addps	 %xmm0, %xmm8
5646	movss	 %xmm12,  %xmm1
5647	shufps	 $0x93, %xmm12, %xmm1
5648	mulps	 %xmm5, %xmm1
5649	addps	 %xmm1, %xmm9
5650
5651	shufps	 $0x4e, %xmm13, %xmm2
5652	mulps	 %xmm5, %xmm2
5653	addps	 %xmm2, %xmm10
5654	movss	 %xmm14,  %xmm3
5655	shufps	 $0x39,  %xmm3, %xmm3
5656	mulps	 %xmm5,  %xmm3
5657	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5658	addps	 %xmm3, %xmm11
5659
5660	addq	$16 * SIZE, A1
5661	addq	$16 * SIZE, A2
5662	addq	$16 * SIZE, X1
5663	ALIGN_4
5664
5665.L315:
5666	testq	$8, MM
5667	jle	.L316
5668
5669	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5670	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5671	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5672
5673	mulps	 %xmm4, %xmm0
5674	addps	 %xmm0, %xmm8
5675	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5676	movss	 %xmm1,  %xmm12
5677	shufps	 $0x93, %xmm1, %xmm12
5678	mulps	 %xmm4, %xmm12
5679	addps	 %xmm12, %xmm9
5680	MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12)
5681
5682	shufps	 $0x4e, %xmm2, %xmm13
5683	mulps	 %xmm4, %xmm13
5684	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5685	addps	 %xmm13, %xmm10
5686	MOVUPS_A1 (-26 * SIZE, A2, %xmm13)
5687
5688	mulps	 %xmm5, %xmm0
5689	addps	 %xmm0, %xmm8
5690	movss	 %xmm12,  %xmm1
5691	shufps	 $0x93, %xmm12, %xmm1
5692	mulps	 %xmm5, %xmm1
5693	addps	 %xmm1, %xmm9
5694
5695	shufps	 $0x4e, %xmm13, %xmm2
5696	mulps	 %xmm5, %xmm2
5697	addps	 %xmm2, %xmm10
5698
5699	addq	$8 * SIZE, A1
5700	addq	$8 * SIZE, A2
5701	addq	$8 * SIZE, X1
5702	ALIGN_4
5703
5704.L316:
5705	testq	$4, MM
5706	jle	.L317
5707
5708	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5709	MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1)
5710	MOVUPS_A1 (-30 * SIZE, A2, %xmm2)
5711
5712	mulps	 %xmm4, %xmm0
5713	addps	 %xmm0, %xmm8
5714	movss	 %xmm1,  %xmm12
5715	shufps	 $0x93, %xmm1, %xmm12
5716	mulps	 %xmm4, %xmm12
5717	addps	 %xmm12, %xmm9
5718
5719	shufps	 $0x4e, %xmm2, %xmm13
5720	mulps	 %xmm4, %xmm13
5721	addps	 %xmm13, %xmm10
5722
5723	addq	$4 * SIZE, A1
5724	addq	$4 * SIZE, A2
5725	addq	$4 * SIZE, X1
5726	ALIGN_4
5727
5728.L317:
5729	testq	$2, MM
5730	jle	.L318
5731
5732#ifdef movsd
5733	xorps	%xmm0, %xmm0
5734#endif
5735	movsd	-32 * SIZE(A1), %xmm0
5736#ifdef movsd
5737	xorps	%xmm4, %xmm4
5738#endif
5739	movsd	-32 * SIZE(X1), %xmm4
5740	mulps	%xmm4, %xmm0
5741	addps	%xmm0, %xmm8
5742#ifdef movsd
5743	xorps	%xmm1, %xmm1
5744#endif
5745	movsd	-32 * SIZE(A1, LDA), %xmm1
5746	mulps	%xmm4, %xmm1
5747	addps	%xmm1, %xmm9
5748#ifdef movsd
5749	xorps	%xmm2, %xmm2
5750#endif
5751	movsd	-32 * SIZE(A2), %xmm2
5752	mulps	%xmm4, %xmm2
5753	addps	%xmm2, %xmm10
5754
5755	addq	$2 * SIZE, A1
5756	addq	$2 * SIZE, A2
5757	addq	$2 * SIZE, X1
5758	ALIGN_4
5759
5760.L318:
5761	testq	$1, MM
5762	jle	.L319
5763
5764	movss	-32 * SIZE(A1), %xmm0
5765	movss	-32 * SIZE(X1), %xmm4
5766	mulss	%xmm4, %xmm0
5767	addss	%xmm0, %xmm8
5768	movss	-32 * SIZE(A1, LDA), %xmm1
5769	mulss	%xmm4, %xmm1
5770	addss	%xmm1, %xmm9
5771	movss	-32 * SIZE(A2), %xmm2
5772	mulss	%xmm4, %xmm2
5773	addss	%xmm2, %xmm10
5774	ALIGN_4
5775
5776.L319:
5777#ifdef HAVE_SSE3
5778	haddps	%xmm9, %xmm8
5779	haddps	%xmm11, %xmm10
5780	haddps	%xmm10, %xmm8
5781
5782	pshufd	$0x1, %xmm8, %xmm9
5783	pshufd	$0x2, %xmm8, %xmm10
5784#else
5785	movaps	%xmm8, %xmm0
5786	unpcklps %xmm9, %xmm8
5787	unpckhps %xmm9, %xmm0
5788
5789	movaps	%xmm10, %xmm1
5790	unpcklps %xmm11, %xmm10
5791	unpckhps %xmm11, %xmm1
5792
5793	movaps	%xmm8, %xmm9
5794	unpcklps %xmm10, %xmm8
5795	unpckhps %xmm10, %xmm9
5796
5797	movaps	%xmm0, %xmm10
5798	unpcklps %xmm1, %xmm0
5799	unpckhps %xmm1, %xmm10
5800
5801	addps	%xmm9, %xmm8
5802	addps	%xmm0, %xmm10
5803	addps	%xmm10, %xmm8
5804
5805	pshufd	$0x2, %xmm8, %xmm9
5806	pshufd	$0x1, %xmm8, %xmm10
5807#endif
5808
5809	mulss	ALPHA, %xmm8
5810	mulss	ALPHA, %xmm9
5811	mulss	ALPHA, %xmm10
5812
5813	addss	(Y), %xmm8
5814	addq	INCY, Y
5815	addss	(Y), %xmm9
5816	addq	INCY, Y
5817	addss	(Y), %xmm10
5818
5819	movss	%xmm8, (Y1)
5820	addq	INCY, Y1
5821	movss	%xmm9, (Y1)
5822	addq	INCY, Y1
5823	movss	%xmm10, (Y1)
5824	jmp	.L999
5825	ALIGN_3
5826
5827.L320:
5828	cmpq	$2, N
5829	jne	.L330
5830
5831	leaq	32 * SIZE(BUFFER), X1
5832
5833	movq	A, A1
5834	leaq	(A1, LDA), A2
5835
5836	xorps	%xmm8, %xmm8
5837	xorps	%xmm9, %xmm9
5838
5839	cmpq	$3, M
5840	jle	.L327
5841
5842	testq	$SIZE, A1
5843	je	.L32X
5844
5845	movss	-32 * SIZE(A1), %xmm0
5846	movss	-32 * SIZE(X1), %xmm4
5847	mulss	%xmm4, %xmm0
5848	addss	%xmm0, %xmm8
5849	movss	-32 * SIZE(A2), %xmm1
5850	mulss	%xmm4, %xmm1
5851	addss	%xmm1, %xmm9
5852
5853	addq	 $1 * SIZE, A1
5854	addq	 $1 * SIZE, A2
5855	addq	 $1 * SIZE, X1
5856	ALIGN_3
5857
5858.L32X:
5859	testq	$2 * SIZE, A1
5860	je	.L32XX
5861
5862#ifdef movsd
5863	xorps	%xmm0, %xmm0
5864	xorps	%xmm4, %xmm4
5865#endif
5866	movsd	-32 * SIZE(A1), %xmm0
5867	movsd	-32 * SIZE(X1), %xmm4
5868	mulps	%xmm4, %xmm0
5869	addps	%xmm0, %xmm8
5870#ifdef movsd
5871	xorps	%xmm1, %xmm1
5872#endif
5873	movsd	-32 * SIZE(A2), %xmm1
5874	mulps	%xmm4, %xmm1
5875	addps	%xmm1, %xmm9
5876
5877	addq	 $2 * SIZE, A1
5878	addq	 $2 * SIZE, A2
5879	addq	 $2 * SIZE, X1
5880	ALIGN_3
5881
5882.L32XX:
5883	movaps	-35 * SIZE(A2), %xmm12
5884
5885	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
5886	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
5887
5888	movq	MM,  I
5889	sarq	$4,  I
5890	jle	.L325
5891
5892	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
5893	MOVUPS_A1 (-31 * SIZE, A2, %xmm1)
5894
5895	decq	I
5896	jle	.L323
5897	ALIGN_4
5898
5899.L322:
5900#ifdef PREFETCH
5901	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
5902#endif
5903
5904	mulps	%xmm4, %xmm0
5905	addps	%xmm0, %xmm8
5906	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5907	movss	 %xmm1,  %xmm12
5908	shufps	 $0x93, %xmm1, %xmm12
5909	mulps	 %xmm4, %xmm12
5910	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5911	addps	 %xmm12, %xmm9
5912	MOVUPS_A1 (-27 * SIZE, A2, %xmm12)
5913
5914	mulps	%xmm5, %xmm0
5915	addps	%xmm0, %xmm8
5916	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5917	movss	 %xmm12,  %xmm1
5918	shufps	 $0x93, %xmm12, %xmm1
5919	mulps	 %xmm5, %xmm1
5920	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5921	addps	 %xmm1, %xmm9
5922	MOVUPS_A1 (-23 * SIZE, A2, %xmm1)
5923
5924#ifdef PREFETCH
5925	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
5926#endif
5927
5928	mulps	%xmm4, %xmm0
5929	addps	%xmm0, %xmm8
5930	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5931	movss	 %xmm1,  %xmm12
5932	shufps	 $0x93, %xmm1, %xmm12
5933	mulps	 %xmm4, %xmm12
5934	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5935	addps	 %xmm12, %xmm9
5936	MOVUPS_A1 (-19 * SIZE, A2, %xmm12)
5937
5938#ifdef PREFETCHW
5939	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
5940#endif
5941
5942	mulps	%xmm5, %xmm0
5943	addps	%xmm0, %xmm8
5944	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
5945	movss	 %xmm12,  %xmm1
5946	shufps	 $0x93, %xmm12, %xmm1
5947	mulps	 %xmm5, %xmm1
5948	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5949	addps	 %xmm1, %xmm9
5950	MOVUPS_A1 (-15 * SIZE, A2, %xmm1)
5951
5952	addq	$16 * SIZE, A1
5953	addq	$16 * SIZE, A2
5954	addq	$16 * SIZE, X1
5955
5956	decq	I
5957	jg	.L322
5958	ALIGN_4
5959
5960.L323:
5961	mulps	%xmm4, %xmm0
5962	addps	%xmm0, %xmm8
5963	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
5964	movss	 %xmm1,  %xmm12
5965	shufps	 $0x93, %xmm1, %xmm12
5966	mulps	 %xmm4, %xmm12
5967	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
5968	addps	 %xmm12, %xmm9
5969	MOVUPS_A1 (-27 * SIZE, A2, %xmm12)
5970
5971	mulps	%xmm5, %xmm0
5972	addps	%xmm0, %xmm8
5973	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
5974	movss	 %xmm12,  %xmm1
5975	shufps	 $0x93, %xmm12, %xmm1
5976	mulps	 %xmm5, %xmm1
5977	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
5978	addps	 %xmm1, %xmm9
5979	MOVUPS_A1 (-23 * SIZE, A2, %xmm1)
5980
5981	mulps	%xmm4, %xmm0
5982	addps	%xmm0, %xmm8
5983	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
5984	movss	 %xmm1,  %xmm12
5985	shufps	 $0x93, %xmm1, %xmm12
5986	mulps	 %xmm4, %xmm12
5987	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
5988	addps	 %xmm12, %xmm9
5989	MOVUPS_A1 (-19 * SIZE, A2, %xmm12)
5990
5991	mulps	%xmm5, %xmm0
5992	addps	%xmm0, %xmm8
5993	movss	 %xmm12,  %xmm1
5994	shufps	 $0x93, %xmm12, %xmm1
5995	mulps	 %xmm5, %xmm1
5996	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
5997	addps	 %xmm1, %xmm9
5998
5999	addq	$16 * SIZE, A1
6000	addq	$16 * SIZE, A2
6001	addq	$16 * SIZE, X1
6002	ALIGN_4
6003
6004.L325:
6005	testq	$8, MM
6006	jle	.L326
6007
6008	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
6009	MOVUPS_A1 (-31 * SIZE, A2, %xmm1)
6010
6011	mulps	%xmm4, %xmm0
6012	addps	%xmm0, %xmm8
6013	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
6014	movss	 %xmm1,  %xmm12
6015	shufps	 $0x93, %xmm1, %xmm12
6016	mulps	 %xmm4, %xmm12
6017	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
6018	addps	 %xmm12, %xmm9
6019	MOVUPS_A1 (-27 * SIZE, A2, %xmm12)
6020
6021	mulps	%xmm5, %xmm0
6022	addps	%xmm0, %xmm8
6023	movss	 %xmm12,  %xmm1
6024	shufps	 $0x93, %xmm12, %xmm1
6025	mulps	 %xmm5, %xmm1
6026	addps	 %xmm1, %xmm9
6027
6028	addq	$8 * SIZE, A1
6029	addq	$8 * SIZE, A2
6030	addq	$8 * SIZE, X1
6031	ALIGN_4
6032
6033.L326:
6034	testq	$4, MM
6035	jle	.L327
6036
6037	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
6038	MOVUPS_A1 (-31 * SIZE, A2, %xmm1)
6039
6040	mulps	%xmm4, %xmm0
6041	addps	%xmm0, %xmm8
6042	movss	 %xmm1,  %xmm12
6043	shufps	 $0x93, %xmm1, %xmm12
6044	mulps	 %xmm4, %xmm12
6045	addps	 %xmm12, %xmm9
6046
6047	addq	$4 * SIZE, A1
6048	addq	$4 * SIZE, A2
6049	addq	$4 * SIZE, X1
6050	ALIGN_4
6051
6052.L327:
6053	testq	$2, MM
6054	jle	.L328
6055
6056#ifdef movsd
6057	xorps	%xmm0, %xmm0
6058#endif
6059	movsd	-32 * SIZE(A1), %xmm0
6060#ifdef movsd
6061	xorps	%xmm4, %xmm4
6062#endif
6063	movsd	-32 * SIZE(X1), %xmm4
6064	mulps	%xmm4, %xmm0
6065	addps	%xmm0, %xmm8
6066#ifdef movsd
6067	xorps	%xmm1, %xmm1
6068#endif
6069	movsd	-32 * SIZE(A2), %xmm1
6070	mulps	%xmm4, %xmm1
6071	addps	%xmm1, %xmm9
6072	shufps	$0xe, %xmm4, %xmm4
6073
6074	addq	$2 * SIZE, A1
6075	addq	$2 * SIZE, A2
6076	addq	$2 * SIZE, X1
6077	ALIGN_4
6078
6079.L328:
6080	testq	$1, MM
6081	jle	.L329
6082
6083	movss	-32 * SIZE(A1), %xmm0
6084	movss	-32 * SIZE(X1), %xmm4
6085	mulss	%xmm4, %xmm0
6086	addss	%xmm0, %xmm8
6087	movss	-32 * SIZE(A2), %xmm1
6088	mulss	%xmm4, %xmm1
6089	addss	%xmm1, %xmm9
6090	ALIGN_4
6091
6092.L329:
6093#ifdef HAVE_SSE3
6094	haddps	%xmm9, %xmm8
6095	haddps	%xmm8, %xmm8
6096#else
6097	movaps	%xmm8, %xmm10
6098	unpcklps %xmm9, %xmm8
6099	unpckhps %xmm9, %xmm10
6100
6101	addps	%xmm10, %xmm8
6102	movhlps %xmm8, %xmm9
6103	addps	%xmm9, %xmm8
6104#endif
6105
6106	pshufd	$0x1, %xmm8, %xmm9
6107
6108	mulss	ALPHA, %xmm8
6109	mulss	ALPHA, %xmm9
6110
6111	addss	(Y), %xmm8
6112	addq	INCY, Y
6113	addss	(Y), %xmm9
6114	addq	INCY, Y
6115
6116	movss	%xmm8, (Y1)
6117	addq	INCY, Y1
6118	movss	%xmm9, (Y1)
6119	addq	INCY, Y1
6120	jmp	.L999
6121	ALIGN_4
6122
6123.L330:
6124	cmpq	$1, N
6125	jne	.L999
6126
6127	leaq	32 * SIZE(BUFFER), X1
6128
6129	movq	A, A1
6130
6131	xorps	%xmm8, %xmm8
6132	xorps	%xmm9, %xmm9
6133
6134	cmpq	$3, M
6135	jle	.L337
6136
6137	testq	$SIZE, A1
6138	je	.L33X
6139
6140	movss	-32 * SIZE(A1), %xmm0
6141	movss	-32 * SIZE(X1), %xmm4
6142	mulss	%xmm4, %xmm0
6143	addss	%xmm0, %xmm8
6144
6145	addq	 $1 * SIZE, A1
6146	addq	 $1 * SIZE, X1
6147	ALIGN_3
6148
6149.L33X:
6150	testq	$2 * SIZE, A1
6151	je	.L33XX
6152
6153#ifdef movsd
6154	xorps	%xmm0, %xmm0
6155	xorps	%xmm4, %xmm4
6156#endif
6157	movsd	-32 * SIZE(A1), %xmm0
6158	movsd	-32 * SIZE(X1), %xmm4
6159	mulps	%xmm4, %xmm0
6160	addps	%xmm0, %xmm8
6161	shufps	$0xe, %xmm4, %xmm4
6162
6163	addq	 $2 * SIZE, A1
6164	addq	 $2 * SIZE, X1
6165	ALIGN_3
6166
6167.L33XX:
6168
6169	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
6170	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)
6171
6172	movq	MM,  I
6173	sarq	$4,  I
6174	jle	.L335
6175
6176	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
6177	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
6178
6179	decq	I
6180	jle	.L333
6181	ALIGN_4
6182
6183.L332:
6184#ifdef PREFETCH
6185	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
6186#endif
6187
6188	mulps	%xmm4, %xmm0
6189	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
6190	addps	%xmm0, %xmm8
6191	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
6192
6193	mulps	%xmm5, %xmm12
6194	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
6195	addps	%xmm12, %xmm9
6196	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
6197
6198#ifdef PREFETCHW
6199	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
6200#endif
6201
6202	mulps	%xmm4, %xmm0
6203	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
6204	addps	%xmm0, %xmm8
6205	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
6206
6207	mulps	%xmm5, %xmm12
6208	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
6209	addps	%xmm12, %xmm9
6210	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
6211
6212	addq	$16 * SIZE, A1
6213	addq	$16 * SIZE, X1
6214
6215	decq	I
6216	jg	.L332
6217	ALIGN_4
6218
6219.L333:
6220	mulps	%xmm4, %xmm0
6221	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
6222	addps	%xmm0, %xmm8
6223	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
6224
6225	mulps	%xmm5, %xmm12
6226	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
6227	addps	%xmm12, %xmm9
6228	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
6229
6230	mulps	%xmm4, %xmm0
6231	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
6232	addps	%xmm0, %xmm8
6233
6234	mulps	%xmm5, %xmm12
6235	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
6236	addps	%xmm12, %xmm9
6237
6238	addq	$16 * SIZE, A1
6239	addq	$16 * SIZE, X1
6240	ALIGN_4
6241
6242.L335:
6243	testq	$8, MM
6244	jle	.L336
6245
6246	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
6247	mulps	%xmm4, %xmm0
6248	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
6249	addps	%xmm0, %xmm8
6250
6251	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
6252	mulps	%xmm5, %xmm12
6253	addps	%xmm12, %xmm9
6254
6255	addq	$8 * SIZE, A1
6256	addq	$8 * SIZE, X1
6257	ALIGN_4
6258
6259.L336:
6260	testq	$4, MM
6261	jle	.L337
6262
6263	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
6264	mulps	%xmm4, %xmm0
6265	addps	%xmm0, %xmm8
6266
6267	addq	$4 * SIZE, A1
6268	addq	$4 * SIZE, X1
6269	ALIGN_4
6270
6271.L337:
6272	testq	$2, MM
6273	jle	.L338
6274
6275#ifdef movsd
6276	xorps	%xmm0, %xmm0
6277#endif
6278	movsd	-32 * SIZE(A1), %xmm0
6279#ifdef movsd
6280	xorps	%xmm4, %xmm4
6281#endif
6282	movsd	-32 * SIZE(X1), %xmm4
6283	mulps	%xmm4, %xmm0
6284	addps	%xmm0, %xmm8
6285	shufps	$0xe, %xmm4, %xmm4
6286
6287	addq	$2 * SIZE, A1
6288	addq	$2 * SIZE, X1
6289	ALIGN_4
6290
6291.L338:
6292	testq	$1, MM
6293	jle	.L339
6294
6295	movss	-32 * SIZE(A1), %xmm0
6296	movss	-32 * SIZE(X1), %xmm4
6297	mulss	%xmm4, %xmm0
6298	addss	%xmm0, %xmm8
6299	ALIGN_4
6300
6301.L339:
6302	addps	%xmm9, %xmm8
6303
6304#ifdef HAVE_SSE3
6305	haddps	%xmm8, %xmm8
6306	haddps	%xmm8, %xmm8
6307#else
6308	pshufd	$1, %xmm8, %xmm9
6309	pshufd	$2, %xmm8, %xmm10
6310	pshufd	$3, %xmm8, %xmm11
6311
6312	addss	%xmm9, %xmm8
6313	addss	%xmm11, %xmm10
6314	addss	%xmm10, %xmm8
6315#endif
6316
6317	mulss	ALPHA, %xmm8
6318
6319	addss	(Y), %xmm8
6320	movss	%xmm8, (Y1)
6321
6322	jmp	.L999
6323#endif
6324	ALIGN_4
6325
6326.L999:
6327	movq	  0(%rsp), %rbx
6328	movq	  8(%rsp), %rbp
6329	movq	 16(%rsp), %r12
6330	movq	 24(%rsp), %r13
6331	movq	 32(%rsp), %r14
6332	movq	 40(%rsp), %r15
6333
6334#ifdef WINDOWS_ABI
6335	movq	 48(%rsp), %rdi
6336	movq	 56(%rsp), %rsi
6337	movups	 64(%rsp), %xmm6
6338	movups	 80(%rsp), %xmm7
6339	movups	 96(%rsp), %xmm8
6340	movups	112(%rsp), %xmm9
6341	movups	128(%rsp), %xmm10
6342	movups	144(%rsp), %xmm11
6343	movups	160(%rsp), %xmm12
6344	movups	176(%rsp), %xmm13
6345	movups	192(%rsp), %xmm14
6346	movups	208(%rsp), %xmm15
6347#endif
6348
6349	addq	$STACKSIZE, %rsp
6350	ret
6351	ALIGN_4
6352
6353	EPILOGUE
6354