/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "l2param.h"
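/* Transposed complex-double GEMV kernel (SSE2, optional SSE3): each column of
 * A is dotted with x and alpha times the result is accumulated into y.  X is
 * first gathered into the contiguous BUFFER, then columns are handled four,
 * two and one at a time (as GEMV_UNROLL allows).  A minimal scalar sketch of
 * the non-conjugated case, assuming unit strides and indices counted in
 * complex elements:
 *
 *   for (j = 0; j < n; j++) {
 *     double sr = 0.0, si = 0.0;
 *     for (i = 0; i < m; i++) {
 *       double ar = a[2 * (j * lda + i)], ai = a[2 * (j * lda + i) + 1];
 *       double xr = x[2 * i],             xi = x[2 * i + 1];
 *       sr += ar * xr - ai * xi;
 *       si += ar * xi + ai * xr;
 *     }
 *     y[2 * j]     += alpha_r * sr - alpha_i * si;
 *     y[2 * j + 1] += alpha_r * si + alpha_i * sr;
 *   }
 *
 * The CONJ/XCONJ variants only change which signs are flipped (see SUBPD and
 * the masks built at .L19/.L29/.L39 below).
 */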

#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)

#define M	  %rdi
#define N	  %rsi
#define A	  %rcx
#define LDA	  %r8
#define X	  %r9
#define INCX	  %rdx
#define Y	  %rbp
#define INCY	  %r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	 40 + STACKSIZE(%rsp)
#define OLD_A		 48 + STACKSIZE(%rsp)
#define OLD_LDA		 56 + STACKSIZE(%rsp)
#define OLD_X		 64 + STACKSIZE(%rsp)
#define OLD_INCX	 72 + STACKSIZE(%rsp)
#define OLD_Y		 80 + STACKSIZE(%rsp)
#define OLD_INCY	 88 + STACKSIZE(%rsp)
#define OLD_BUFFER	 96 + STACKSIZE(%rsp)

#define M	  %rcx
#define N	  %rdx
#define A	  %r8
#define LDA	  %r9
#define X	  %rdi
#define INCX	  %rsi
#define Y	  %rbp
#define INCY	  %r10

#endif

#define I	%rax
#define J	%rbx
#define A1	%r11
#define A2	%r12

#define X1	%r13
#define Y1	%r14
#define BUFFER	%r15

#define ALPHA_R	 %xmm14
#define ALPHA_I	 %xmm15

#undef SUBPD

#ifndef CONJ
#define SUBPD	   addpd
#else
#define SUBPD	   subpd
#endif
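/* SUBPD picks the sign of the cross terms of the complex multiply: addpd for
 * the plain product, subpd when the matrix element is conjugated (CONJ). */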

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,     A
	movq	OLD_LDA,   LDA
	movq	OLD_X,     X

	movaps	%xmm3,       %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,  INCX
	movq	OLD_Y,     Y
	movq	OLD_INCY,  INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT,   LDA
	salq	$ZBASE_SHIFT,   INCX
	salq	$ZBASE_SHIFT,   INCY

#ifdef HAVE_SSE3
	movddup	%xmm0, ALPHA_R
	movddup	%xmm1, ALPHA_I
#else
	pshufd	$0x44, %xmm0, ALPHA_R
	pshufd	$0x44, %xmm1, ALPHA_I
#endif
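/* ALPHA_R = (alpha_r, alpha_r), ALPHA_I = (alpha_i, alpha_i): each part of
 * alpha broadcast to both lanes, used when y is updated at .L19/.L29/.L39
 * (and their unaligned copies). */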

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3
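/* Gather X (stride INCX) into the contiguous BUFFER: four complex elements
 * per pass in .L02, remainder in .L06.  The compute loops can then read the
 * vector with plain 16-byte loads regardless of INCX. */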

	movq	BUFFER, X1

	movq	Y, Y1

	movq	M,  I
	sarq	$2, I
	jle	.L05
	ALIGN_4

.L02:
	movsd	 0 * SIZE(X), %xmm0
	movhpd	 1 * SIZE(X), %xmm0
	addq	INCX, X

	movsd	 0 * SIZE(X), %xmm1
	movhpd	 1 * SIZE(X), %xmm1
	addq	INCX, X

	movsd	 0 * SIZE(X), %xmm2
	movhpd	 1 * SIZE(X), %xmm2
	addq	INCX, X

	movsd	 0 * SIZE(X), %xmm3
	movhpd	 1 * SIZE(X), %xmm3
	addq	INCX, X

	movapd	%xmm0,  0 * SIZE(X1)
	movapd	%xmm1,  2 * SIZE(X1)
	movapd	%xmm2,  4 * SIZE(X1)
	movapd	%xmm3,  6 * SIZE(X1)

	addq	$8 * SIZE, X1
	decq	I
	jg	.L02
	ALIGN_4

.L05:
	movq	M,  I
	andq	$3, I
	jle	.L10
	ALIGN_2

.L06:
	movsd	 0 * SIZE(X), %xmm0
	movhpd	 1 * SIZE(X), %xmm0
	addq	INCX, X
	movapd	%xmm0, 0 * SIZE(X1)
	addq	$2 * SIZE, X1
	decq	I
	jg	.L06
	ALIGN_4

.L10:
#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	jne	.L100
#endif
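/* If A is not 16-byte aligned, use the .L100 version of the kernel, which
 * loads A with movsd/movhpd pairs instead of 16-byte loads. */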

#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A
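/* Four columns per pass: A1 -> column j, A1 + LDA -> j + 1, A2 -> j + 2,
 * A2 + LDA -> j + 3.  xmm0/xmm1 ... xmm6/xmm7 hold one accumulator pair per
 * column; xmm12/xmm13 carry two elements of x at a time. */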

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorpd	%xmm4, %xmm4
	xorpd	%xmm5, %xmm5
	xorpd	%xmm6, %xmm6
	xorpd	%xmm7, %xmm7

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M,   I
	sarq	$2,  I
	jle	.L15

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	decq	 I
	jle	 .L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif
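/* Complex multiply-accumulate: pshufd $0x4e swaps the two doubles of the A
 * element, so one mulpd with x yields (ar*xr, ai*xi) and the other
 * (ai*xr, ar*xi).  addpd/SUBPD collect these per column; the horizontal
 * reduction at .L19 turns each accumulator pair into the real and imaginary
 * parts of the dot product. */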

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-10 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-10 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L15:
	testq	$2, M
	je	.L17

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm7

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L17:
	testq	$1, M
	je	.L19

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm7
	ALIGN_3

.L19:
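/* Reduce and store: xmm13 becomes a mask with the sign bit set only in its
 * upper double; the conditional xors pick which half of each accumulator pair
 * is negated for the CONJ/XCONJ combination, haddpd (or the unpck/addpd
 * fallback) folds each pair into (Re, Im), the result is scaled by alpha, and
 * y is updated through Y/Y1 with stride INCY. */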
	pcmpeqb	%xmm13, %xmm13
	psllq	$63,    %xmm13
	shufps	$0xc0, %xmm13, %xmm13

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm13, %xmm0
	xorpd	%xmm13, %xmm2
	xorpd	%xmm13, %xmm4
	xorpd	%xmm13, %xmm6
#else
	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2

	haddpd	%xmm5, %xmm4
	haddpd	%xmm7, %xmm6
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	movapd	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm10

	movapd	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm11

	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm2
	addpd	%xmm10, %xmm4
	addpd	%xmm11, %xmm6
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	mulpd	ALPHA_R, %xmm6
	mulpd	ALPHA_I, %xmm7

	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm7
	movhpd	 1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm4,  0 * SIZE(Y1)
	movhpd	%xmm4,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm6,  0 * SIZE(Y1)
	movhpd	%xmm6,  1 * SIZE(Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L11
	ALIGN_3

.L20:
#endif

#if GEMV_UNROLL >= 2
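/* Same scheme for two columns at a time: A1 and A2 = A1 + LDA, with
 * accumulator pairs xmm0/xmm1 and xmm2/xmm3. */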

	cmpq	$2, N
	jl	.L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M,   I
	sarq	$2,  I
	jle	.L25

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	decq	 I
	jle	 .L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm10)
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1( -8 * SIZE, A2, %xmm10)
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1( -6 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1( -6 * SIZE, A2, %xmm6)
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm10)
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)

	MOVUPS_A1(-14 * SIZE, A1, %xmm12)
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L27:
	testq	$1, M
	je	.L29

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3
	ALIGN_3

.L29:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
	xorpd	%xmm11, %xmm2
#else
	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

#if GEMV_UNROLL == 2
	cmpq	$2, N
	jge	.L21
#endif
	ALIGN_3

.L30:
#endif
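/* Last remaining column: a single accumulator pair (xmm0/xmm1), reduced at
 * .L39. */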

	cmpq	$1, N
	jl	.L999

#if GEMV_UNROLL == 1
.L31:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

	movq	M,   I
	sarq	$2,  I
	jle	.L35

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)

	decq	 I
	jle	 .L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1( -6 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L35:
	testq	$2, M
	je	.L37

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	addq	 $4 * SIZE, A1
	ALIGN_3

.L37:
	testq	$1, M
	je	.L39

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1
	ALIGN_3

.L39:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
#else
	xorpd	%xmm11, %xmm1
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	addpd	%xmm8, %xmm0
#endif

	pshufd	$0x4e, %xmm0, %xmm1

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1

	xorpd	%xmm11, %xmm1

	subpd	%xmm1, %xmm0

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)

#if GEMV_UNROLL == 1
	addq	INCY, Y
	addq	INCY, Y1

	cmpq	$1, N
	jge	.L31
#endif

#ifdef ALIGNED_ACCESS
	jmp	.L999
	ALIGN_3

.L100:
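/* Unaligned-A copy of the kernel: identical arithmetic, but every 16-byte
 * load of A is replaced by a movsd/movhpd pair so the columns need not be
 * 16-byte aligned. */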
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L110
	ALIGN_3

.L101:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorpd	%xmm4, %xmm4
	xorpd	%xmm5, %xmm5
	xorpd	%xmm6, %xmm6
	xorpd	%xmm7, %xmm7

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M,   I
	sarq	$2,  I
	jle	.L105

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	decq	 I
	jle	 .L104
	ALIGN_3

.L103:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-12 * SIZE(A1, LDA), %xmm10
	movhpd	-11 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2, LDA), %xmm10
	movhpd	-11 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-10 * SIZE(A1), %xmm8
	movhpd	 -9 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-10 * SIZE(A1, LDA), %xmm10
	movhpd	 -9 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-10 * SIZE(A2), %xmm8
	movhpd	 -9 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-10 * SIZE(A2, LDA), %xmm10
	movhpd	 -9 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	 -8 * SIZE(A1), %xmm8
	movhpd	 -7 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	 -8 * SIZE(A1, LDA), %xmm10
	movhpd	 -7 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L103
	ALIGN_3

.L104:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-12 * SIZE(A1, LDA), %xmm10
	movhpd	-11 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2, LDA), %xmm10
	movhpd	-11 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-10 * SIZE(A1), %xmm8
	movhpd	 -9 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-10 * SIZE(A1, LDA), %xmm10
	movhpd	 -9 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-10 * SIZE(A2), %xmm8
	movhpd	 -9 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-10 * SIZE(A2, LDA), %xmm10
	movhpd	 -9 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L105:
	testq	$2, M
	je	.L107

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm7

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L107:
	testq	$1, M
	je	.L109

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8,  %xmm4
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9,  %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm7
	ALIGN_3

.L109:
	pcmpeqb	%xmm13, %xmm13
	psllq	$63,    %xmm13
	shufps	$0xc0, %xmm13, %xmm13

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm13, %xmm0
	xorpd	%xmm13, %xmm2
	xorpd	%xmm13, %xmm4
	xorpd	%xmm13, %xmm6
#else
	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2

	haddpd	%xmm5, %xmm4
	haddpd	%xmm7, %xmm6
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	movapd	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm10

	movapd	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm11

	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm2
	addpd	%xmm10, %xmm4
	addpd	%xmm11, %xmm6
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	mulpd	ALPHA_R, %xmm6
	mulpd	ALPHA_I, %xmm7

	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm7
	movhpd	 1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm4,  0 * SIZE(Y1)
	movhpd	%xmm4,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm6,  0 * SIZE(Y1)
	movhpd	%xmm6,  1 * SIZE(Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L101
	ALIGN_3

.L110:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L120

#if GEMV_UNROLL == 2
	ALIGN_3

.L111:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M,   I
	sarq	$2,  I
	jle	.L115

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	decq	 I
	jle	 .L114
	ALIGN_3

.L113:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2), %xmm10
	movhpd	-11 * SIZE(A2), %xmm10
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	 -8 * SIZE(A1), %xmm8
	movhpd	 -7 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	movsd	 -8 * SIZE(A2), %xmm10
	movhpd	 -7 * SIZE(A2), %xmm10
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	 -6 * SIZE(A1), %xmm12
	movhpd	 -5 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	movsd	 -6 * SIZE(A2), %xmm6
	movhpd	 -5 * SIZE(A2), %xmm6
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L113
	ALIGN_3

.L114:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2), %xmm10
	movhpd	-11 * SIZE(A2), %xmm10
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L115:
	testq	$2, M
	je	.L117

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5,  %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5,  %xmm7
	SUBPD	%xmm7, %xmm3

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L117:
	testq	$1, M
	je	.L119

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4,  %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4,  %xmm11
	SUBPD	%xmm11, %xmm3
	ALIGN_3

.L119:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
	xorpd	%xmm11, %xmm2
#else
	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addq	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

#if GEMV_UNROLL == 2
	cmpq	$2, N
	jge	.L111
#endif
	ALIGN_3

.L120:
#endif

	cmpq	$1, N
	jl	.L999

#if GEMV_UNROLL == 1
.L121:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

	movq	M,   I
	sarq	$2,  I
	jle	.L125

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12

	decq	 I
	jle	 .L124
	ALIGN_3

.L123:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	 -8 * SIZE(A1), %xmm8
	movhpd	 -7 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	 -6 * SIZE(A1), %xmm12
	movhpd	 -5 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, X1

	subq	 $1, I
	BRANCH
	jg	.L123
	ALIGN_3

.L124:
	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, X1
	ALIGN_3

.L125:
	testq	$2, M
	je	.L127

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12,  %xmm13
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12,  %xmm0
	mulpd	%xmm5,  %xmm13
	SUBPD	%xmm13,  %xmm1

	addq	 $4 * SIZE, A1
	ALIGN_3

.L127:
	testq	$1, M
	je	.L129

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	pshufd	$0x4e, %xmm8,  %xmm9
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm9
	SUBPD	%xmm9,  %xmm1
	ALIGN_3

.L129:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
#else
	xorpd	%xmm11, %xmm1
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	addpd	%xmm8, %xmm0
#endif

	pshufd	$0x4e, %xmm0, %xmm1

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1

	xorpd	%xmm11, %xmm1

	subpd	%xmm1, %xmm0

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)

#if GEMV_UNROLL == 1
	addq	INCY, Y
	addq	INCY, Y1

	cmpq	$1, N
	jge	.L121
#endif

#endif
	ALIGN_3

.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE