/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

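/* Double-precision complex GEMV kernel: alpha * A * x is accumulated   */
/* into a contiguous work buffer (BUFFER), which is then added into y,  */
/* so a strided or unaligned y only has to be touched once at the end.  */
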
#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)
#define ALPHA_R		48	      (%rsp)
#define ALPHA_I		56	      (%rsp)

#define M	  %rdi
#define N	  %rsi
#define A	  %rcx
#define LDA	  %r8
#define X	  %r9
#define INCX	  %rdx
#define Y	  %rbp
#define INCY	  %r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	 40 + STACKSIZE(%rsp)
#define OLD_A		 48 + STACKSIZE(%rsp)
#define OLD_LDA		 56 + STACKSIZE(%rsp)
#define OLD_X		 64 + STACKSIZE(%rsp)
#define OLD_INCX	 72 + STACKSIZE(%rsp)
#define OLD_Y		 80 + STACKSIZE(%rsp)
#define OLD_INCY	 88 + STACKSIZE(%rsp)
#define OLD_BUFFER	 96 + STACKSIZE(%rsp)
#define ALPHA_R		224	       (%rsp)
#define ALPHA_I		232	       (%rsp)

#define M	  %rcx
#define N	  %rdx
#define A	  %r8
#define LDA	  %r9
#define X	  %rdi
#define INCX	  %rsi
#define Y	  %rbp
#define INCY	  %r10

#endif

#define I	%rax
#define A1	%r12
#define A2	%r13

#define Y1	%r14
#define BUFFER	%r15

#define J	%r11

#undef SUBPD

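/* SUBPD folds the conjugation convention into the inner loops: it is */
/* subpd when CONJ and XCONJ agree and addpd when they differ.        */
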
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	   subpd
#else
#define SUBPD	   addpd
#endif

	PROLOGUE
	PROFCODE

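	/* Save callee-saved registers; Windows additionally saves rdi/rsi and xmm6-xmm15. */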
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,     A
	movq	OLD_LDA,   LDA
	movq	OLD_X,     X

	movapd	%xmm3,       %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,  INCX
	movq	OLD_Y,     Y
	movq	OLD_INCY,  INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT,   LDA
	salq	$ZBASE_SHIFT,   INCX
	salq	$ZBASE_SHIFT,   INCY

	movlps	%xmm0, ALPHA_R
	movlps	%xmm1, ALPHA_I

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3

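	/* Zero the work buffer in 16-double (8 complex) blocks; the count is */
	/* rounded up so the unrolled accumulation loops can safely run past M. */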
	movq	BUFFER, Y1

	xorps	%xmm4, %xmm4

	movq	M,   %rax
	addq	$8,  %rax
	sarq	$3,  %rax
	ALIGN_3

.L01:
	movaps	%xmm4,  0 * SIZE(Y1)
	movaps	%xmm4,  2 * SIZE(Y1)
	movaps	%xmm4,  4 * SIZE(Y1)
	movaps	%xmm4,  6 * SIZE(Y1)
	movaps	%xmm4,  8 * SIZE(Y1)
	movaps	%xmm4, 10 * SIZE(Y1)
	movaps	%xmm4, 12 * SIZE(Y1)
	movaps	%xmm4, 14 * SIZE(Y1)

	subq	$-16 * SIZE, Y1
	decq	%rax
	jg	.L01
	ALIGN_3

.L10:
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L20
	ALIGN_3

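/* Four columns of A per pass: load four x values, scale them by alpha */
/* (respecting XCONJ), then accumulate the columns into the buffer.    */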
.L11:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
	leaq	(A,  LDA, 2), A2
	leaq	(A,  LDA, 4), A

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm10
	movddup	1 * SIZE(X), %xmm11
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm12
	movddup	1 * SIZE(X), %xmm13
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm14
	movddup	1 * SIZE(X), %xmm15
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6,  %xmm7

#ifndef XCONJ
	xorps	 %xmm5, %xmm7
#else
	xorps	 %xmm5, %xmm6
#endif

	mulpd	 %xmm6, %xmm8
	mulpd	 %xmm7, %xmm9
	mulpd	 %xmm6, %xmm10
	mulpd	 %xmm7, %xmm11

	mulpd	 %xmm6, %xmm12
	mulpd	 %xmm7, %xmm13
	mulpd	 %xmm6, %xmm14
	mulpd	 %xmm7, %xmm15

#ifndef XCONJ
	subpd	 %xmm9,  %xmm8
	subpd	 %xmm11, %xmm10
	subpd	 %xmm13, %xmm12
	subpd	 %xmm15, %xmm14
#else
	addpd	 %xmm9,  %xmm8
	addpd	 %xmm11, %xmm10
	addpd	 %xmm13, %xmm12
	addpd	 %xmm15, %xmm14
#endif

	pshufd	 $0x4e, %xmm8,  %xmm9
	pshufd	 $0x4e, %xmm10, %xmm11
	pshufd	 $0x4e, %xmm12, %xmm13
	pshufd	 $0x4e, %xmm14, %xmm15

#ifndef XCONJ
	xorps	 %xmm5, %xmm9
	xorps	 %xmm5, %xmm11
	xorps	 %xmm5, %xmm13
	xorps	 %xmm5, %xmm15
#else
	xorps	 %xmm5, %xmm8
	xorps	 %xmm5, %xmm10
	xorps	 %xmm5, %xmm12
	xorps	 %xmm5, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M,   I
	sarq	$2,  I
	jle	.L15

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	 I
	jle	 .L14
	ALIGN_3

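/* Main inner loop: 4 rows by 4 columns per iteration, with software */
/* prefetch of the A column streams and of the buffer (Y1).          */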
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A1, LDA), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A1, LDA), %xmm5
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A1, LDA), %xmm6
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1, LDA), %xmm4
	mulpd	 %xmm10, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1, LDA), %xmm5
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1, LDA), %xmm6
	mulpd	 %xmm10, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1, LDA), %xmm7

	mulpd	 %xmm11, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	 %xmm11, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	 %xmm12, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	 %xmm12, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	 %xmm13, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA), %xmm4
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2, LDA), %xmm5
	mulpd	 %xmm13, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2, LDA), %xmm6
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2, LDA), %xmm4
	mulpd	 %xmm14, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2, LDA), %xmm5
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2, LDA), %xmm6
	mulpd	 %xmm14, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2, LDA), %xmm7

	mulpd	 %xmm15, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	 %xmm15, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A1, LDA), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A1, LDA), %xmm5
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A1, LDA), %xmm6
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A1, LDA), %xmm7

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1, LDA), %xmm4
	mulpd	 %xmm10, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1, LDA), %xmm5
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1, LDA), %xmm6
	mulpd	 %xmm10, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1, LDA), %xmm7

	mulpd	 %xmm11, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	 %xmm11, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7

	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	 %xmm12, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	 %xmm12, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	 %xmm13, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA), %xmm4
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2, LDA), %xmm5
	mulpd	 %xmm13, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2, LDA), %xmm6
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2, LDA), %xmm7

	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2, LDA), %xmm4
	mulpd	 %xmm14, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2, LDA), %xmm5
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2, LDA), %xmm6
	mulpd	 %xmm14, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2, LDA), %xmm7

	mulpd	 %xmm15, %xmm4
	SUBPD	 %xmm4,  %xmm0
	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm1
	mulpd	 %xmm15, %xmm6
	SUBPD	 %xmm6,  %xmm2
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1
	ALIGN_3

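/* Tail of the 4-column block: handle a remaining pair of rows (.L15) */
/* and, if M is odd, one final row (.L17).                             */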
.L15:
	testq	$2, M
	je	.L17

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A1, LDA, 1), %xmm4
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1
	movddup	-14 * SIZE(A1, LDA, 1), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	movddup	-15 * SIZE(A1, LDA, 1), %xmm5
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1
	movddup	-13 * SIZE(A1, LDA, 1), %xmm7

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm1
	movddup	-14 * SIZE(A2), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm0
	movddup	-15 * SIZE(A2), %xmm5
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm1
	movddup	-13 * SIZE(A2), %xmm7

	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA, 1), %xmm4
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movddup	-14 * SIZE(A2, LDA, 1), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	movddup	-15 * SIZE(A2, LDA, 1), %xmm5
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1
	movddup	-13 * SIZE(A2, LDA, 1), %xmm7

	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	addq	 $4 * SIZE, Y1
	ALIGN_3

.L17:
	testq	$1, M
	je	.L19

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-16 * SIZE(A1, LDA, 1), %xmm6
	movddup	-15 * SIZE(A1, LDA, 1), %xmm7

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	movddup	-15 * SIZE(A2), %xmm5

	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm0
	movddup	-16 * SIZE(A2, LDA, 1), %xmm6
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm0
	movddup	-15 * SIZE(A2, LDA, 1), %xmm7

	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0

	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	ALIGN_3

.L19:
	cmpq	$4, N
	jge	.L11
	ALIGN_3

.L20:
#endif

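/* Same scheme for two remaining columns of A at a time. */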
#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
	leaq	(A,  LDA, 1), A2
	leaq	(A,  LDA, 2), A

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm10
	movddup	1 * SIZE(X), %xmm11
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6,  %xmm7

#ifndef XCONJ
	xorps	 %xmm5, %xmm7
#else
	xorps	 %xmm5, %xmm6
#endif

	mulpd	 %xmm6, %xmm8
	mulpd	 %xmm7, %xmm9
	mulpd	 %xmm6, %xmm10
	mulpd	 %xmm7, %xmm11

#ifndef XCONJ
	subpd	 %xmm9,  %xmm8
	subpd	 %xmm11, %xmm10
#else
	addpd	 %xmm9,  %xmm8
	addpd	 %xmm11, %xmm10
#endif

	pshufd	 $0x4e, %xmm8,  %xmm9
	pshufd	 $0x4e, %xmm10, %xmm11

#ifndef XCONJ
	xorps	 %xmm5, %xmm9
	xorps	 %xmm5, %xmm11
#else
	xorps	 %xmm5, %xmm8
	xorps	 %xmm5, %xmm10
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M,   I
	sarq	$2,  I
	jle	.L25

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	 I
	jle	 .L24
	ALIGN_3

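/* Inner loop for the 2-column block: 4 rows per iteration. */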
.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	 %xmm10, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	 %xmm10, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	 %xmm11, %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	 %xmm11, %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	 %xmm10, %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	 %xmm10, %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	 %xmm11, %xmm4
	SUBPD	 %xmm4,  %xmm0
	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm1
	mulpd	 %xmm11, %xmm6
	SUBPD	 %xmm6,  %xmm2
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1
	movddup	-14 * SIZE(A2), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	movddup	-15 * SIZE(A2), %xmm5
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1
	movddup	-13 * SIZE(A2), %xmm7

	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm1

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	addq	 $4 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$1, M
#if GEMV_UNROLL == 2
	je	.L29
#else
	je	.L30
#endif

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-16 * SIZE(A2), %xmm6
	movddup	-15 * SIZE(A2), %xmm7

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0

	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm0
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
	ALIGN_3

.L29:
	cmpq	$2, N
	jge	.L21
#endif
	ALIGN_3

.L30:
#endif

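/* Last remaining column (and the whole loop when GEMV_UNROLL == 1). */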
	cmpq	$1, N
	jl	.L980

#if GEMV_UNROLL == 1
.L31:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6,  %xmm7

#ifndef XCONJ
	xorps	 %xmm5, %xmm7
#else
	xorps	 %xmm5, %xmm6
#endif

	mulpd	 %xmm6, %xmm8
	mulpd	 %xmm7, %xmm9

#ifndef XCONJ
	subpd	 %xmm9,  %xmm8
#else
	addpd	 %xmm9,  %xmm8
#endif

	pshufd	 $0x4e, %xmm8,  %xmm9

#ifndef XCONJ
	xorps	 %xmm5, %xmm9
#else
	xorps	 %xmm5, %xmm8
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M,   I
	sarq	$2,  I
	jle	.L35

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	 I
	jle	 .L34
	ALIGN_3

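/* Inner loop for the single-column block: 4 rows per iteration. */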
.L33:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	 %xmm8,  %xmm5
	addpd	 %xmm5,  %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	 %xmm8,  %xmm7
	addpd	 %xmm7,  %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	 %xmm9,  %xmm4
	SUBPD	 %xmm4,  %xmm0
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm1
	mulpd	 %xmm9,  %xmm6
	SUBPD	 %xmm6,  %xmm2
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1
	ALIGN_3

.L35:
	testq	$2, M
	je	.L37

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, Y1
	ALIGN_3

.L37:
	testq	$1, M
#if GEMV_UNROLL == 1
	je	.L39
#else
	je	.L980
#endif

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5

	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
	ALIGN_3
.L39:
	cmpq	$1, N
	jge	.L31
#endif

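/* Add the accumulated buffer into y, eight elements per pass. The    */
/* aligned (movaps) path is used when y is 16-byte aligned, otherwise */
/* control falls through to the unaligned path at .L990.              */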
.L980:
	testq	$SIZE, Y
	jne	.L990

	movq	Y,  Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L184
	ALIGN_3

.L182:
	movaps	 (Y), %xmm0
	addq	INCY, Y
	movaps	 (Y), %xmm1
	addq	INCY, Y
	movaps	 (Y), %xmm2
	addq	INCY, Y
	movaps	 (Y), %xmm3
	addq	INCY, Y
	movaps	 (Y), %xmm4
	addq	INCY, Y
	movaps	 (Y), %xmm5
	addq	INCY, Y
	movaps	 (Y), %xmm6
	addq	INCY, Y
	movaps	 (Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movaps	%xmm0,  (Y1)
	addq	INCY, Y1
	movaps	%xmm1,  (Y1)
	addq	INCY, Y1
	movaps	%xmm2,  (Y1)
	addq	INCY, Y1
	movaps	%xmm3,  (Y1)
	addq	INCY, Y1
	movaps	%xmm4,  (Y1)
	addq	INCY, Y1
	movaps	%xmm5,  (Y1)
	addq	INCY, Y1
	movaps	%xmm6,  (Y1)
	addq	INCY, Y1
	movaps	%xmm7,  (Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L182
	ALIGN_3

.L184:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L185

	movaps	 (Y), %xmm0
	addq	INCY, Y
	movaps	 (Y), %xmm1
	addq	INCY, Y
	movaps	 (Y), %xmm2
	addq	INCY, Y
	movaps	 (Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movaps	%xmm0,  (Y1)
	addq	INCY, Y1
	movaps	%xmm1,  (Y1)
	addq	INCY, Y1
	movaps	%xmm2,  (Y1)
	addq	INCY, Y1
	movaps	%xmm3,  (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L185:
	testq	$2, M
	jle	.L186

	movaps	 (Y), %xmm0
	addq	INCY, Y
	movaps	 (Y), %xmm1
	addq	INCY, Y
	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movaps	%xmm0,  (Y1)
	addq	INCY, Y1
	movaps	%xmm1,  (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L186:
	testq	$1, M
	jle	.L999

	movaps	 (Y), %xmm0

	addpd	 (BUFFER), %xmm0

	movaps	%xmm0,  (Y1)
	jmp	.L999
	ALIGN_3

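/* Unaligned y: merge the buffer using movsd/movhpd element pairs. */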
.L990:
	movq	Y,  Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L994
	ALIGN_3

.L992:
	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm2
	movhpd	 1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm6
	movhpd	 1 * SIZE(Y), %xmm6
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm7
	movhpd	 1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3,  0 * SIZE(Y1)
	movhpd	%xmm3,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm4,  0 * SIZE(Y1)
	movhpd	%xmm4,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm5,  0 * SIZE(Y1)
	movhpd	%xmm5,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm6,  0 * SIZE(Y1)
	movhpd	%xmm6,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm7,  0 * SIZE(Y1)
	movhpd	%xmm7,  1 * SIZE(Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L992
	ALIGN_3

.L994:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L995

	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm2
	movhpd	 1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3,  0 * SIZE(Y1)
	movhpd	%xmm3,  1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L995:
	testq	$2, M
	jle	.L996

	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L996:
	testq	$1, M
	jle	.L999

	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0

	addpd	 0 * SIZE(BUFFER), %xmm0

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	ALIGN_3

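/* Restore callee-saved state and return. */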
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE