1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"
#include "l2param.h"

/*
 * Double-complex GEMV (non-transposed) kernel: y += alpha * A * x.
 * AT&T / GAS syntax, x86-64, SSE2 (SSE3 movddup used when HAVE_SSE3).
 *
 * The macros below map the logical kernel operands (M, N, A, LDA, X,
 * INCX, Y, INCY) onto the argument registers of the active calling
 * convention, and name the stack slots where the remaining arguments
 * arrive and where alpha is spilled.
 */

#ifndef WINDOWS_ABI

/* System V AMD64 ABI (Linux/macOS/BSD).
 * First six integer args arrive in rdi, rsi, rdx, rcx, r8, r9;
 * alpha_r/alpha_i arrive in xmm0/xmm1.  Args 7+ are on the caller's
 * stack, addressed here as offset + STACKSIZE(%rsp) because the
 * prologue does `subq $STACKSIZE, %rsp` before reading them. */

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)
/* Local spill slots for alpha inside the 64-byte frame (slots 0..40
 * hold the saved callee-saved GPRs; 48/56 are free). */
#define ALPHA_R		48	      (%rsp)
#define ALPHA_I		56	      (%rsp)

#define M	  %rdi
#define N	  %rsi
#define A	  %rcx
#define LDA	  %r8
#define X	  %r9
#define INCX	  %rdx
#define Y	  %rbp
#define INCY	  %r10

#else

/* Microsoft x64 ABI (Windows).
 * First four args arrive in rcx, rdx, r8, r9 (alpha_r in xmm3 for this
 * signature); args 5+ live above the 32-byte shadow space on the
 * caller's stack.  The larger frame also holds the callee-saved
 * xmm6-xmm15 required by this ABI. */

#define STACKSIZE	256

#define OLD_ALPHA_I	 40 + STACKSIZE(%rsp)
#define OLD_A		 48 + STACKSIZE(%rsp)
#define OLD_LDA		 56 + STACKSIZE(%rsp)
#define OLD_X		 64 + STACKSIZE(%rsp)
#define OLD_INCX	 72 + STACKSIZE(%rsp)
#define OLD_Y		 80 + STACKSIZE(%rsp)
#define OLD_INCY	 88 + STACKSIZE(%rsp)
#define OLD_BUFFER	 96 + STACKSIZE(%rsp)
/* Alpha spill slots above the saved GPR/XMM area of the 256-byte frame. */
#define ALPHA_R		224	       (%rsp)
#define ALPHA_I		232	       (%rsp)

#define M	  %rcx
#define N	  %rdx
#define A	  %r8
#define LDA	  %r9
#define X	  %rdi
#define INCX	  %rsi
#define Y	  %rbp
#define INCY	  %r10

#endif

/* Scratch registers shared by both ABIs (all callee-saved GPRs are
 * preserved in the prologue before use). */
#define I	%rax	/* inner-loop trip counter */
#define A1	%r12	/* pointer to current column of A */
#define A2	%r13	/* pointer to column A1 + 2*LDA (4-way unroll) */

#define Y1	%r14	/* write pointer into the accumulation buffer */
#define BUFFER	%r15	/* base of the aligned y accumulation buffer */

#define J	%r11

#undef SUBPD

/* Complex multiply sign selection: the kernel computes
 *   re*re -/+ im*im  via  mulpd + SUBPD on the swapped (pshufd 0x4e)
 * operand.  Conjugating A (CONJ) or x (XCONJ) each flips the sign once,
 * so the effective operation is subpd when CONJ XOR XCONJ is false and
 * addpd when exactly one of them is defined. */
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	   subpd
#else
#define SUBPD	   addpd
#endif
105
106	PROLOGUE
107	PROFCODE
108
109	subq	$STACKSIZE, %rsp
110	movq	%rbx,  0(%rsp)
111	movq	%rbp,  8(%rsp)
112	movq	%r12, 16(%rsp)
113	movq	%r13, 24(%rsp)
114	movq	%r14, 32(%rsp)
115	movq	%r15, 40(%rsp)
116
117#ifdef WINDOWS_ABI
118	movq	%rdi,    48(%rsp)
119	movq	%rsi,    56(%rsp)
120	movups	%xmm6,   64(%rsp)
121	movups	%xmm7,   80(%rsp)
122	movups	%xmm8,   96(%rsp)
123	movups	%xmm9,  112(%rsp)
124	movups	%xmm10, 128(%rsp)
125	movups	%xmm11, 144(%rsp)
126	movups	%xmm12, 160(%rsp)
127	movups	%xmm13, 176(%rsp)
128	movups	%xmm14, 192(%rsp)
129	movups	%xmm15, 208(%rsp)
130
131	movq	OLD_A,     A
132	movq	OLD_LDA,   LDA
133	movq	OLD_X,     X
134
135	movapd	%xmm3,       %xmm0
136	movsd	OLD_ALPHA_I, %xmm1
137#endif
138
139	movq	OLD_INCX,  INCX
140	movq	OLD_Y,     Y
141	movq	OLD_INCY,  INCY
142	movq	OLD_BUFFER, BUFFER
143
144	salq	$ZBASE_SHIFT,   LDA
145	salq	$ZBASE_SHIFT,   INCX
146	salq	$ZBASE_SHIFT,   INCY
147
148	movlpd	%xmm0, ALPHA_R
149	movlpd	%xmm1, ALPHA_I
150
151	subq	$-16 * SIZE, A
152
153	testq	M, M
154	jle	.L999
155	testq	N, N
156	jle	.L999
157	ALIGN_3
158
159	movq	BUFFER, Y1
160
161	pxor	%xmm4, %xmm4
162
163	movq	M,   %rax
164	addq	$8,  %rax
165	sarq	$3,  %rax
166	ALIGN_3
167
168.L01:
169	movapd	%xmm4,  0 * SIZE(Y1)
170	movapd	%xmm4,  2 * SIZE(Y1)
171	movapd	%xmm4,  4 * SIZE(Y1)
172	movapd	%xmm4,  6 * SIZE(Y1)
173	movapd	%xmm4,  8 * SIZE(Y1)
174	movapd	%xmm4, 10 * SIZE(Y1)
175	movapd	%xmm4, 12 * SIZE(Y1)
176	movapd	%xmm4, 14 * SIZE(Y1)
177
178	subq	$-16 * SIZE, Y1
179	decq	%rax
180	jg	.L01
181	ALIGN_3
182
183.L10:
184#ifdef ALIGNED_ACCESS
185	testq	$SIZE, A
186	jne	.L100
187#endif
188
189#if GEMV_UNROLL >= 4
190
191	cmpq	$4, N
192	jl	.L20
193	ALIGN_3
194
195.L11:
196	subq	$4, N
197
198	leaq	16 * SIZE(BUFFER), Y1
199	movq	A,  A1
200	leaq	(A,  LDA, 2), A2
201	leaq	(A,  LDA, 4), A
202
203	movsd	0 * SIZE(X), %xmm8
204	movhpd	1 * SIZE(X), %xmm8
205	addq	INCX, X
206	movsd	0 * SIZE(X), %xmm10
207	movhpd	1 * SIZE(X), %xmm10
208	addq	INCX, X
209	movsd	0 * SIZE(X), %xmm12
210	movhpd	1 * SIZE(X), %xmm12
211	addq	INCX, X
212	movsd	0 * SIZE(X), %xmm14
213	movhpd	1 * SIZE(X), %xmm14
214	addq	INCX, X
215
216	pcmpeqb	%xmm5, %xmm5
217	psllq	$63,   %xmm5
218	shufps	$0xc0, %xmm5, %xmm5
219
220	pshufd	$0x4e, %xmm8,  %xmm9
221	pshufd	$0x4e, %xmm10, %xmm11
222	pshufd	$0x4e, %xmm12, %xmm13
223	pshufd	$0x4e, %xmm14, %xmm15
224
225#ifdef HAVE_SSE3
226	movddup	ALPHA_R, %xmm6
227	movddup	ALPHA_I, %xmm7
228#else
229	movsd	ALPHA_R, %xmm6
230	unpcklpd %xmm6, %xmm6
231	movsd	ALPHA_I, %xmm7
232	unpcklpd %xmm7, %xmm7
233#endif
234
235	xorpd	 %xmm5, %xmm9
236	xorpd	 %xmm5, %xmm11
237	xorpd	 %xmm5, %xmm13
238	xorpd	 %xmm5, %xmm15
239
240	mulpd	 %xmm6, %xmm8
241	mulpd	 %xmm7, %xmm9
242	mulpd	 %xmm6, %xmm10
243	mulpd	 %xmm7, %xmm11
244
245	mulpd	 %xmm6, %xmm12
246	mulpd	 %xmm7, %xmm13
247	mulpd	 %xmm6, %xmm14
248	mulpd	 %xmm7, %xmm15
249
250#ifndef XCONJ
251	subpd	 %xmm9,  %xmm8
252	subpd	 %xmm11, %xmm10
253	subpd	 %xmm13, %xmm12
254	subpd	 %xmm15, %xmm14
255#else
256	addpd	 %xmm9,  %xmm8
257	addpd	 %xmm11, %xmm10
258	addpd	 %xmm13, %xmm12
259	addpd	 %xmm15, %xmm14
260#endif
261
262	pshufd	 $0xee, %xmm8,  %xmm9
263	pshufd	 $0x44, %xmm8,  %xmm8
264
265	pshufd	 $0xee, %xmm10, %xmm11
266	pshufd	 $0x44, %xmm10, %xmm10
267
268	pshufd	 $0xee, %xmm12, %xmm13
269	pshufd	 $0x44, %xmm12, %xmm12
270
271	pshufd	 $0xee, %xmm14, %xmm15
272	pshufd	 $0x44, %xmm14, %xmm14
273
274#ifndef CONJ
275	xorpd	 %xmm5, %xmm9
276	xorpd	 %xmm5, %xmm11
277	xorpd	 %xmm5, %xmm13
278	xorpd	 %xmm5, %xmm15
279#else
280	xorpd	 %xmm5, %xmm8
281	xorpd	 %xmm5, %xmm10
282	xorpd	 %xmm5, %xmm12
283	xorpd	 %xmm5, %xmm14
284#endif
285
286	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
287	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
288	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
289	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
290	ALIGN_3
291
292	movq	M,   I
293	sarq	$2,  I
294	jle	.L15
295
296	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
297	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
298
299	decq	 I
300	jle	 .L14
301	ALIGN_3
302
303.L13:
304#ifdef PREFETCH
305	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
306#endif
307
308	pshufd	 $0x4e, %xmm4, %xmm5
309	mulpd	 %xmm8,  %xmm4
310	addpd	 %xmm4,  %xmm0
311	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
312	pshufd	 $0x4e, %xmm6, %xmm7
313	mulpd	 %xmm8,  %xmm6
314	addpd	 %xmm6,  %xmm1
315	MOVUPS_A1(-10 * SIZE, A1, %xmm6)
316
317	mulpd	 %xmm9,  %xmm5
318	SUBPD	 %xmm5,  %xmm0
319	mulpd	 %xmm9,  %xmm7
320	SUBPD	 %xmm7,  %xmm1
321
322	pshufd	 $0x4e, %xmm4, %xmm5
323	mulpd	 %xmm8,  %xmm4
324	addpd	 %xmm4,  %xmm2
325	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
326	pshufd	 $0x4e, %xmm6, %xmm7
327	mulpd	 %xmm8,  %xmm6
328	addpd	 %xmm6,  %xmm3
329	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
330
331	mulpd	 %xmm9,  %xmm5
332	SUBPD	 %xmm5,  %xmm2
333	mulpd	 %xmm9,  %xmm7
334	SUBPD	 %xmm7,  %xmm3
335
336#ifdef PREFETCH
337	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
338#endif
339
340	pshufd	 $0x4e, %xmm4, %xmm5
341	mulpd	 %xmm10, %xmm4
342	addpd	 %xmm4,  %xmm0
343	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
344	pshufd	 $0x4e, %xmm6, %xmm7
345	mulpd	 %xmm10, %xmm6
346	addpd	 %xmm6,  %xmm1
347	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)
348
349	mulpd	 %xmm11, %xmm5
350	SUBPD	 %xmm5,  %xmm0
351	mulpd	 %xmm11, %xmm7
352	SUBPD	 %xmm7,  %xmm1
353
354	pshufd	 $0x4e, %xmm4, %xmm5
355	mulpd	 %xmm10, %xmm4
356	addpd	 %xmm4,  %xmm2
357	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
358	pshufd	 $0x4e, %xmm6, %xmm7
359	mulpd	 %xmm10, %xmm6
360	addpd	 %xmm6,  %xmm3
361	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
362
363	mulpd	 %xmm11, %xmm5
364	SUBPD	 %xmm5,  %xmm2
365	mulpd	 %xmm11, %xmm7
366	SUBPD	 %xmm7,  %xmm3
367
368#ifdef PREFETCH
369	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
370#endif
371
372	pshufd	 $0x4e, %xmm4, %xmm5
373	mulpd	 %xmm12, %xmm4
374	addpd	 %xmm4,  %xmm0
375	MOVUPS_A1(-12 * SIZE, A2, %xmm4)
376	pshufd	 $0x4e, %xmm6, %xmm7
377	mulpd	 %xmm12, %xmm6
378	addpd	 %xmm6,  %xmm1
379	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
380
381	mulpd	 %xmm13, %xmm5
382	SUBPD	 %xmm5,  %xmm0
383	mulpd	 %xmm13, %xmm7
384	SUBPD	 %xmm7,  %xmm1
385
386	pshufd	 $0x4e, %xmm4, %xmm5
387	mulpd	 %xmm12, %xmm4
388	addpd	 %xmm4,  %xmm2
389	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
390	pshufd	 $0x4e, %xmm6, %xmm7
391	mulpd	 %xmm12, %xmm6
392	addpd	 %xmm6,  %xmm3
393	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
394
395	mulpd	 %xmm13, %xmm5
396	SUBPD	 %xmm5,  %xmm2
397	mulpd	 %xmm13, %xmm7
398	SUBPD	 %xmm7,  %xmm3
399
400#ifdef PREFETCH
401	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
402#endif
403
404	pshufd	 $0x4e, %xmm4, %xmm5
405	mulpd	 %xmm14, %xmm4
406	addpd	 %xmm4,  %xmm0
407	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
408	pshufd	 $0x4e, %xmm6, %xmm7
409	mulpd	 %xmm14, %xmm6
410	addpd	 %xmm6,  %xmm1
411	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)
412
413	mulpd	 %xmm15, %xmm5
414	SUBPD	 %xmm5,  %xmm0
415	mulpd	 %xmm15, %xmm7
416	SUBPD	 %xmm7,  %xmm1
417
418	pshufd	 $0x4e, %xmm4, %xmm5
419	mulpd	 %xmm14, %xmm4
420	addpd	 %xmm4,  %xmm2
421	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
422	pshufd	 $0x4e, %xmm6, %xmm7
423	mulpd	 %xmm14, %xmm6
424	addpd	 %xmm6,  %xmm3
425	MOVUPS_A1( -6 * SIZE, A1, %xmm6)
426
427	mulpd	 %xmm15, %xmm5
428	SUBPD	 %xmm5,  %xmm2
429	mulpd	 %xmm15, %xmm7
430	SUBPD	 %xmm7,  %xmm3
431
432#ifdef PREFETCHW
433	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
434#endif
435
436	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
437	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
438	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
439	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
440
441	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
442	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
443	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
444	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
445
446	subq	 $-8 * SIZE, A1
447	subq	 $-8 * SIZE, A2
448	subq	 $-8 * SIZE, Y1
449
450	subq	 $1, I
451	BRANCH
452	jg	.L13
453	ALIGN_3
454
455.L14:
456	pshufd	 $0x4e, %xmm4, %xmm5
457	mulpd	 %xmm8,  %xmm4
458	addpd	 %xmm4,  %xmm0
459	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
460	pshufd	 $0x4e, %xmm6, %xmm7
461	mulpd	 %xmm8,  %xmm6
462	addpd	 %xmm6,  %xmm1
463	MOVUPS_A1(-10 * SIZE, A1, %xmm6)
464
465	mulpd	 %xmm9,  %xmm5
466	SUBPD	 %xmm5,  %xmm0
467	mulpd	 %xmm9,  %xmm7
468	SUBPD	 %xmm7,  %xmm1
469
470	pshufd	 $0x4e, %xmm4, %xmm5
471	mulpd	 %xmm8,  %xmm4
472	addpd	 %xmm4,  %xmm2
473	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
474	pshufd	 $0x4e, %xmm6, %xmm7
475	mulpd	 %xmm8,  %xmm6
476	addpd	 %xmm6,  %xmm3
477	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
478
479	mulpd	 %xmm9,  %xmm5
480	SUBPD	 %xmm5,  %xmm2
481	mulpd	 %xmm9,  %xmm7
482	SUBPD	 %xmm7,  %xmm3
483
484	pshufd	 $0x4e, %xmm4, %xmm5
485	mulpd	 %xmm10, %xmm4
486	addpd	 %xmm4,  %xmm0
487	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
488	pshufd	 $0x4e, %xmm6, %xmm7
489	mulpd	 %xmm10, %xmm6
490	addpd	 %xmm6,  %xmm1
491	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)
492
493	mulpd	 %xmm11, %xmm5
494	SUBPD	 %xmm5,  %xmm0
495	mulpd	 %xmm11, %xmm7
496	SUBPD	 %xmm7,  %xmm1
497
498	pshufd	 $0x4e, %xmm4, %xmm5
499	mulpd	 %xmm10, %xmm4
500	addpd	 %xmm4,  %xmm2
501	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
502	pshufd	 $0x4e, %xmm6, %xmm7
503	mulpd	 %xmm10, %xmm6
504	addpd	 %xmm6,  %xmm3
505	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
506
507	mulpd	 %xmm11, %xmm5
508	SUBPD	 %xmm5,  %xmm2
509	mulpd	 %xmm11, %xmm7
510	SUBPD	 %xmm7,  %xmm3
511
512	pshufd	 $0x4e, %xmm4, %xmm5
513	mulpd	 %xmm12, %xmm4
514	addpd	 %xmm4,  %xmm0
515	MOVUPS_A1(-12 * SIZE, A2, %xmm4)
516	pshufd	 $0x4e, %xmm6, %xmm7
517	mulpd	 %xmm12, %xmm6
518	addpd	 %xmm6,  %xmm1
519	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
520
521	mulpd	 %xmm13, %xmm5
522	SUBPD	 %xmm5,  %xmm0
523	mulpd	 %xmm13, %xmm7
524	SUBPD	 %xmm7,  %xmm1
525
526	pshufd	 $0x4e, %xmm4, %xmm5
527	mulpd	 %xmm12, %xmm4
528	addpd	 %xmm4,  %xmm2
529	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
530	pshufd	 $0x4e, %xmm6, %xmm7
531	mulpd	 %xmm12, %xmm6
532	addpd	 %xmm6,  %xmm3
533	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
534
535	mulpd	 %xmm13, %xmm5
536	SUBPD	 %xmm5,  %xmm2
537	mulpd	 %xmm13, %xmm7
538	SUBPD	 %xmm7,  %xmm3
539
540	pshufd	 $0x4e, %xmm4, %xmm5
541	mulpd	 %xmm14, %xmm4
542	addpd	 %xmm4,  %xmm0
543	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
544	pshufd	 $0x4e, %xmm6, %xmm7
545	mulpd	 %xmm14, %xmm6
546	addpd	 %xmm6,  %xmm1
547	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)
548
549	mulpd	 %xmm15, %xmm5
550	SUBPD	 %xmm5,  %xmm0
551	mulpd	 %xmm15, %xmm7
552	SUBPD	 %xmm7,  %xmm1
553
554	pshufd	 $0x4e, %xmm4, %xmm5
555	mulpd	 %xmm14, %xmm4
556	addpd	 %xmm4,  %xmm2
557	pshufd	 $0x4e, %xmm6, %xmm7
558	mulpd	 %xmm14, %xmm6
559	addpd	 %xmm6,  %xmm3
560
561	mulpd	 %xmm15, %xmm5
562	SUBPD	 %xmm5,  %xmm2
563	mulpd	 %xmm15, %xmm7
564	SUBPD	 %xmm7,  %xmm3
565
566	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
567	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
568	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
569	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
570
571	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
572	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
573	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
574	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
575
576	subq	 $-8 * SIZE, A1
577	subq	 $-8 * SIZE, A2
578	subq	 $-8 * SIZE, Y1
579	ALIGN_3
580
581.L15:
582	testq	$2, M
583	je	.L17
584
585	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
586	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
587
588	pshufd	 $0x4e, %xmm4, %xmm5
589	mulpd	 %xmm8,  %xmm4
590	addpd	 %xmm4,  %xmm0
591	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
592	pshufd	 $0x4e, %xmm6, %xmm7
593	mulpd	 %xmm8,  %xmm6
594	addpd	 %xmm6,  %xmm1
595	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
596
597	mulpd	 %xmm9,  %xmm5
598	SUBPD	 %xmm5,  %xmm0
599	mulpd	 %xmm9,  %xmm7
600	SUBPD	 %xmm7,  %xmm1
601
602	pshufd	 $0x4e, %xmm4, %xmm5
603	mulpd	 %xmm10, %xmm4
604	addpd	 %xmm4,  %xmm0
605	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
606	pshufd	 $0x4e, %xmm6, %xmm7
607	mulpd	 %xmm10, %xmm6
608	addpd	 %xmm6,  %xmm1
609	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
610
611	mulpd	 %xmm11, %xmm5
612	SUBPD	 %xmm5,  %xmm0
613	mulpd	 %xmm11, %xmm7
614	SUBPD	 %xmm7,  %xmm1
615
616	pshufd	 $0x4e, %xmm4, %xmm5
617	mulpd	 %xmm12, %xmm4
618	addpd	 %xmm4,  %xmm0
619	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
620	pshufd	 $0x4e, %xmm6, %xmm7
621	mulpd	 %xmm12, %xmm6
622	addpd	 %xmm6,  %xmm1
623	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
624
625	mulpd	 %xmm13, %xmm5
626	SUBPD	 %xmm5,  %xmm0
627	mulpd	 %xmm13, %xmm7
628	SUBPD	 %xmm7,  %xmm1
629
630	pshufd	 $0x4e, %xmm4, %xmm5
631	mulpd	 %xmm14, %xmm4
632	addpd	 %xmm4,  %xmm0
633	mulpd	 %xmm15, %xmm5
634	SUBPD	 %xmm5,  %xmm0
635
636	pshufd	 $0x4e, %xmm6, %xmm7
637	mulpd	 %xmm14, %xmm6
638	addpd	 %xmm6,  %xmm1
639	mulpd	 %xmm15, %xmm7
640	SUBPD	 %xmm7,  %xmm1
641
642	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
643	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
644	movapd	 %xmm2, %xmm0
645
646	addq	 $4 * SIZE, A1
647	addq	 $4 * SIZE, A2
648	addq	 $4 * SIZE, Y1
649	ALIGN_3
650
651.L17:
652	testq	$1, M
653	je	.L19
654
655	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
656	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)
657
658	pshufd	 $0x4e, %xmm4, %xmm5
659	mulpd	 %xmm8,  %xmm4
660	addpd	 %xmm4,  %xmm0
661	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
662	mulpd	 %xmm9,  %xmm5
663	SUBPD	 %xmm5,  %xmm0
664
665	pshufd	 $0x4e, %xmm6, %xmm7
666	mulpd	 %xmm10, %xmm6
667	addpd	 %xmm6,  %xmm0
668	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
669	mulpd	 %xmm11, %xmm7
670	SUBPD	 %xmm7,  %xmm0
671
672	pshufd	 $0x4e, %xmm4, %xmm5
673	mulpd	 %xmm12, %xmm4
674	addpd	 %xmm4,  %xmm0
675	mulpd	 %xmm13, %xmm5
676	SUBPD	 %xmm5,  %xmm0
677
678	pshufd	 $0x4e, %xmm6, %xmm7
679	mulpd	 %xmm14, %xmm6
680	addpd	 %xmm6,  %xmm0
681	mulpd	 %xmm15, %xmm7
682	SUBPD	 %xmm7,  %xmm0
683
684	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
685	ALIGN_3
686
687.L19:
688	cmpq	$4, N
689	jge	.L11
690	ALIGN_3
691
692.L20:
693#endif
694
695#if GEMV_UNROLL >= 2
696
697	cmpq	$2, N
698	jl	.L30
699
700#if GEMV_UNROLL == 2
701	ALIGN_3
702
703.L21:
704#endif
705
706	subq	$2, N
707
708	leaq	16 * SIZE(BUFFER), Y1
709	movq	A,  A1
710	leaq	(A,  LDA, 1), A2
711	leaq	(A,  LDA, 2), A
712
713	movsd	0 * SIZE(X), %xmm12
714	movhpd	1 * SIZE(X), %xmm12
715	addq	INCX, X
716	movsd	0 * SIZE(X), %xmm14
717	movhpd	1 * SIZE(X), %xmm14
718	addq	INCX, X
719
720	pcmpeqb	%xmm11, %xmm11
721	psllq	$63,    %xmm11
722	shufps	$0xc0, %xmm11, %xmm11
723
724	pshufd	$0x4e, %xmm12, %xmm13
725	pshufd	$0x4e, %xmm14, %xmm15
726
727#ifdef HAVE_SSE3
728	movddup	ALPHA_R, %xmm8
729	movddup	ALPHA_I, %xmm9
730#else
731	movsd	ALPHA_R, %xmm8
732	unpcklpd %xmm8, %xmm8
733	movsd	ALPHA_I, %xmm9
734	unpcklpd %xmm9, %xmm9
735#endif
736
737	xorpd	 %xmm11, %xmm13
738	xorpd	 %xmm11, %xmm15
739
740	mulpd	 %xmm8, %xmm12
741	mulpd	 %xmm9, %xmm13
742	mulpd	 %xmm8, %xmm14
743	mulpd	 %xmm9, %xmm15
744
745#ifndef XCONJ
746	subpd	 %xmm13, %xmm12
747	subpd	 %xmm15, %xmm14
748#else
749	addpd	 %xmm13, %xmm12
750	addpd	 %xmm15, %xmm14
751#endif
752
753	pshufd	 $0xee, %xmm12, %xmm13
754	pshufd	 $0x44, %xmm12, %xmm12
755
756	pshufd	 $0xee, %xmm14, %xmm15
757	pshufd	 $0x44, %xmm14, %xmm14
758
759#ifndef CONJ
760	xorpd	 %xmm11, %xmm13
761	xorpd	 %xmm11, %xmm15
762#else
763	xorpd	 %xmm11, %xmm12
764	xorpd	 %xmm11, %xmm14
765#endif
766
767	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
768	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
769	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
770	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
771	ALIGN_3
772
773	movq	M,   I
774	sarq	$2,  I
775	jle	.L25
776
777	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
778	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
779	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
780	MOVUPS_A1(-10 * SIZE, A1, %xmm10)
781
782	decq	 I
783	jle	 .L24
784	ALIGN_3
785
786.L23:
787#ifdef PREFETCH
788	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
789#endif
790
791	pshufd	 $0x4e, %xmm4,  %xmm5
792	mulpd	 %xmm12, %xmm4
793	addpd	 %xmm4,  %xmm0
794	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
795	pshufd	 $0x4e, %xmm6,  %xmm7
796	mulpd	 %xmm12, %xmm6
797	addpd	 %xmm6,  %xmm1
798	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
799
800	pshufd	 $0x4e, %xmm8,  %xmm9
801	mulpd	 %xmm12, %xmm8
802	addpd	 %xmm8,  %xmm2
803	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
804	pshufd	 $0x4e, %xmm10, %xmm11
805	mulpd	 %xmm12, %xmm10
806	addpd	 %xmm10, %xmm3
807	MOVUPS_A1(-10 * SIZE, A2, %xmm10)
808
809	mulpd	 %xmm13, %xmm5
810	SUBPD	 %xmm5,  %xmm0
811	mulpd	 %xmm13, %xmm7
812	SUBPD	 %xmm7,  %xmm1
813
814	mulpd	 %xmm13, %xmm9
815	SUBPD	 %xmm9,  %xmm2
816	mulpd	 %xmm13, %xmm11
817	SUBPD	 %xmm11, %xmm3
818
819#ifdef PREFETCH
820	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
821#endif
822
823	pshufd	 $0x4e, %xmm4,  %xmm5
824	mulpd	 %xmm14, %xmm4
825	addpd	 %xmm4,  %xmm0
826	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
827	pshufd	 $0x4e, %xmm6,  %xmm7
828	mulpd	 %xmm14, %xmm6
829	addpd	 %xmm6,  %xmm1
830	MOVUPS_A1( -6 * SIZE, A1, %xmm6)
831
832	pshufd	 $0x4e, %xmm8,  %xmm9
833	mulpd	 %xmm14, %xmm8
834	addpd	 %xmm8,  %xmm2
835	MOVUPS_A1( -4 * SIZE, A1, %xmm8)
836	pshufd	 $0x4e, %xmm10, %xmm11
837	mulpd	 %xmm14, %xmm10
838	addpd	 %xmm10, %xmm3
839	MOVUPS_A1( -2 * SIZE, A1, %xmm10)
840
841	mulpd	 %xmm15, %xmm5
842	SUBPD	 %xmm5,  %xmm0
843	mulpd	 %xmm15, %xmm7
844	SUBPD	 %xmm7,  %xmm1
845
846	mulpd	 %xmm15, %xmm9
847	SUBPD	 %xmm9,  %xmm2
848	mulpd	 %xmm15, %xmm11
849	SUBPD	 %xmm11, %xmm3
850
851#ifdef PREFETCHW
852	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
853#endif
854
855	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
856	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
857	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
858	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
859
860	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
861	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
862	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
863	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
864
865	subq	 $-8 * SIZE, A1
866	subq	 $-8 * SIZE, A2
867	subq	 $-8 * SIZE, Y1
868
869	subq	 $1, I
870	BRANCH
871	jg	.L23
872	ALIGN_3
873
874.L24:
875	pshufd	 $0x4e, %xmm4,  %xmm5
876	mulpd	 %xmm12, %xmm4
877	addpd	 %xmm4,  %xmm0
878	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
879	pshufd	 $0x4e, %xmm6,  %xmm7
880	mulpd	 %xmm12, %xmm6
881	addpd	 %xmm6,  %xmm1
882	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
883
884	pshufd	 $0x4e, %xmm8,  %xmm9
885	mulpd	 %xmm12, %xmm8
886	addpd	 %xmm8,  %xmm2
887	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
888	pshufd	 $0x4e, %xmm10, %xmm11
889	mulpd	 %xmm12, %xmm10
890	addpd	 %xmm10, %xmm3
891	MOVUPS_A1(-10 * SIZE, A2, %xmm10)
892
893	mulpd	 %xmm13, %xmm5
894	SUBPD	 %xmm5,  %xmm0
895	mulpd	 %xmm13, %xmm7
896	SUBPD	 %xmm7,  %xmm1
897
898	mulpd	 %xmm13, %xmm9
899	SUBPD	 %xmm9,  %xmm2
900	mulpd	 %xmm13, %xmm11
901	SUBPD	 %xmm11, %xmm3
902
903	pshufd	 $0x4e, %xmm4,  %xmm5
904	mulpd	 %xmm14, %xmm4
905	addpd	 %xmm4,  %xmm0
906	pshufd	 $0x4e, %xmm6,  %xmm7
907	mulpd	 %xmm14, %xmm6
908	addpd	 %xmm6,  %xmm1
909
910	pshufd	 $0x4e, %xmm8,  %xmm9
911	mulpd	 %xmm14, %xmm8
912	addpd	 %xmm8,  %xmm2
913	pshufd	 $0x4e, %xmm10, %xmm11
914	mulpd	 %xmm14, %xmm10
915	addpd	 %xmm10, %xmm3
916
917	mulpd	 %xmm15, %xmm5
918	SUBPD	 %xmm5,  %xmm0
919	mulpd	 %xmm15, %xmm7
920	SUBPD	 %xmm7,  %xmm1
921
922	mulpd	 %xmm15, %xmm9
923	SUBPD	 %xmm9,  %xmm2
924	mulpd	 %xmm15, %xmm11
925	SUBPD	 %xmm11, %xmm3
926
927	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
928	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
929	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
930	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
931
932	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
933	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
934	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
935	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
936
937	subq	 $-8 * SIZE, A1
938	subq	 $-8 * SIZE, A2
939	subq	 $-8 * SIZE, Y1
940	ALIGN_3
941
942.L25:
943	testq	$2, M
944	je	.L27
945
946	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
947	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
948	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
949	MOVUPS_A1(-14 * SIZE, A2, %xmm10)
950
951	pshufd	 $0x4e, %xmm4,  %xmm5
952	mulpd	 %xmm12, %xmm4
953	addpd	 %xmm4,  %xmm0
954	pshufd	 $0x4e, %xmm6,  %xmm7
955	mulpd	 %xmm12, %xmm6
956	addpd	 %xmm6,  %xmm1
957
958	mulpd	 %xmm13, %xmm5
959	SUBPD	 %xmm5,  %xmm0
960	mulpd	 %xmm13, %xmm7
961	SUBPD	 %xmm7,  %xmm1
962
963	pshufd	 $0x4e, %xmm8,  %xmm9
964	mulpd	 %xmm14, %xmm8
965	addpd	 %xmm8,  %xmm0
966	pshufd	 $0x4e, %xmm10, %xmm11
967	mulpd	 %xmm14, %xmm10
968	addpd	 %xmm10, %xmm1
969
970	mulpd	 %xmm15, %xmm9
971	SUBPD	 %xmm9,  %xmm0
972	mulpd	 %xmm15, %xmm11
973	SUBPD	 %xmm11, %xmm1
974
975	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
976	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
977	movapd	 %xmm2, %xmm0
978
979	addq	 $4 * SIZE, A1
980	addq	 $4 * SIZE, A2
981	addq	 $4 * SIZE, Y1
982	ALIGN_3
983
984.L27:
985	testq	$1, M
986#if GEMV_UNROLL == 2
987	je	.L29
988#else
989	je	.L30
990#endif
991
992	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
993	MOVUPS_A1(-16 * SIZE, A2, %xmm6)
994
995	pshufd	 $0x4e, %xmm4, %xmm5
996	mulpd	 %xmm12, %xmm4
997	addpd	 %xmm4,  %xmm0
998	mulpd	 %xmm13, %xmm5
999	SUBPD	 %xmm5,  %xmm0
1000
1001	pshufd	 $0x4e, %xmm6, %xmm7
1002	mulpd	 %xmm14, %xmm6
1003	addpd	 %xmm6,  %xmm0
1004	mulpd	 %xmm15, %xmm7
1005	SUBPD	 %xmm7,  %xmm0
1006
1007	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1008
1009#if GEMV_UNROLL == 2
1010	ALIGN_3
1011
1012.L29:
1013	cmpq	$2, N
1014	jge	.L21
1015#endif
1016	ALIGN_3
1017
1018.L30:
1019#endif
1020
1021	cmpq	$1, N
1022	jl	.L980
1023
1024#if GEMV_UNROLL == 1
1025.L31:
1026	decq	N
1027#endif
1028
1029	leaq	16 * SIZE(BUFFER), Y1
1030	movq	A,  A1
1031#if GEMV_UNROLL == 1
1032	addq	LDA, A
1033#endif
1034
1035	movsd	0 * SIZE(X), %xmm12
1036	movhpd	1 * SIZE(X), %xmm12
1037	addq	INCX, X
1038
1039	pcmpeqb	%xmm11, %xmm11
1040	psllq	$63,    %xmm11
1041	shufps	$0xc0, %xmm11, %xmm11
1042
1043	pshufd	$0x4e, %xmm12, %xmm13
1044
1045#ifdef HAVE_SSE3
1046	movddup	ALPHA_R, %xmm8
1047	movddup	ALPHA_I, %xmm9
1048#else
1049	movsd	ALPHA_R, %xmm8
1050	unpcklpd %xmm8, %xmm8
1051	movsd	ALPHA_I, %xmm9
1052	unpcklpd %xmm9, %xmm9
1053#endif
1054
1055	xorpd	 %xmm11, %xmm13
1056
1057	mulpd	 %xmm8, %xmm12
1058	mulpd	 %xmm9, %xmm13
1059
1060#ifndef XCONJ
1061	subpd	 %xmm13, %xmm12
1062#else
1063	addpd	 %xmm13, %xmm12
1064#endif
1065
1066	pshufd	 $0xee, %xmm12, %xmm13
1067	pshufd	 $0x44, %xmm12, %xmm12
1068
1069#ifndef CONJ
1070	xorpd	 %xmm11, %xmm13
1071#else
1072	xorpd	 %xmm11, %xmm12
1073#endif
1074
1075	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1076	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
1077	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
1078	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
1079
1080	movq	M,   I
1081	sarq	$2,  I
1082	jle	.L35
1083
1084	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1085	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
1086	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
1087	MOVUPS_A1(-10 * SIZE, A1, %xmm10)
1088
1089	decq	 I
1090	jle	 .L34
1091	ALIGN_3
1092
1093.L33:
1094#ifdef PREFETCH
1095	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
1096#endif
1097
1098	pshufd	 $0x4e, %xmm4,  %xmm5
1099	mulpd	 %xmm12, %xmm4
1100	addpd	 %xmm4,  %xmm0
1101	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
1102	pshufd	 $0x4e, %xmm6,  %xmm7
1103	mulpd	 %xmm12, %xmm6
1104	addpd	 %xmm6,  %xmm1
1105	MOVUPS_A1( -6 * SIZE, A1, %xmm6)
1106
1107	pshufd	 $0x4e, %xmm8,  %xmm9
1108	mulpd	 %xmm12, %xmm8
1109	addpd	 %xmm8,  %xmm2
1110	MOVUPS_A1( -4 * SIZE, A1, %xmm8)
1111	pshufd	 $0x4e, %xmm10, %xmm11
1112	mulpd	 %xmm12, %xmm10
1113	addpd	 %xmm10, %xmm3
1114	MOVUPS_A1( -2 * SIZE, A1, %xmm10)
1115
1116	mulpd	 %xmm13, %xmm5
1117	SUBPD	 %xmm5,  %xmm0
1118	mulpd	 %xmm13, %xmm7
1119	SUBPD	 %xmm7,  %xmm1
1120
1121	mulpd	 %xmm13, %xmm9
1122	SUBPD	 %xmm9,  %xmm2
1123	mulpd	 %xmm13, %xmm11
1124	SUBPD	 %xmm11, %xmm3
1125
1126#ifdef PREFETCHW
1127	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
1128#endif
1129
1130	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1131	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1132	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
1133	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
1134
1135	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
1136	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
1137	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
1138	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
1139
1140	subq	 $-8 * SIZE, A1
1141	subq	 $-8 * SIZE, Y1
1142
1143	subq	 $1, I
1144	BRANCH
1145	jg	.L33
1146	ALIGN_3
1147
1148.L34:
1149	pshufd	 $0x4e, %xmm4,  %xmm5
1150	mulpd	 %xmm12, %xmm4
1151	addpd	 %xmm4,  %xmm0
1152	pshufd	 $0x4e, %xmm6,  %xmm7
1153	mulpd	 %xmm12, %xmm6
1154	addpd	 %xmm6,  %xmm1
1155
1156	pshufd	 $0x4e, %xmm8,  %xmm9
1157	mulpd	 %xmm12, %xmm8
1158	addpd	 %xmm8,  %xmm2
1159	pshufd	 $0x4e, %xmm10, %xmm11
1160	mulpd	 %xmm12, %xmm10
1161	addpd	 %xmm10, %xmm3
1162
1163	mulpd	 %xmm13, %xmm5
1164	SUBPD	 %xmm5,  %xmm0
1165	mulpd	 %xmm13, %xmm7
1166	SUBPD	 %xmm7,  %xmm1
1167
1168	mulpd	 %xmm13, %xmm9
1169	SUBPD	 %xmm9,  %xmm2
1170	mulpd	 %xmm13, %xmm11
1171	SUBPD	 %xmm11, %xmm3
1172
1173	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1174	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1175	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
1176	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
1177
1178	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
1179	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
1180	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
1181	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
1182
1183	subq	 $-8 * SIZE, A1
1184	subq	 $-8 * SIZE, Y1
1185	ALIGN_3
1186
1187.L35:
1188	testq	$2, M
1189	je	.L37
1190
1191	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1192	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
1193
1194	pshufd	 $0x4e, %xmm4,  %xmm5
1195	mulpd	 %xmm12, %xmm4
1196	addpd	 %xmm4,  %xmm0
1197	pshufd	 $0x4e, %xmm6,  %xmm7
1198	mulpd	 %xmm12, %xmm6
1199	addpd	 %xmm6,  %xmm1
1200
1201	mulpd	 %xmm13, %xmm5
1202	SUBPD	 %xmm5,  %xmm0
1203	mulpd	 %xmm13, %xmm7
1204	SUBPD	 %xmm7,  %xmm1
1205
1206	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1207	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1208	movapd	 %xmm2, %xmm0
1209
1210	addq	 $4 * SIZE, A1
1211	addq	 $4 * SIZE, Y1
1212	ALIGN_3
1213
1214.L37:
1215	testq	$1, M
1216#if GEMV_UNROLL == 1
1217	je	.L39
1218#else
1219	je	.L980
1220#endif
1221
1222	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1223
1224	pshufd	 $0x4e, %xmm4, %xmm5
1225	mulpd	 %xmm12, %xmm4
1226	addpd	 %xmm4,  %xmm0
1227	mulpd	 %xmm13, %xmm5
1228	SUBPD	 %xmm5,  %xmm0
1229
1230	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1231
1232#if GEMV_UNROLL == 1
1233	ALIGN_3
1234.L39:
1235	cmpq	$1, N
1236	jge	.L31
1237#endif
1238
1239#ifdef ALIGNED_ACCESS
1240
1241	jmp	.L980
1242	ALIGN_3
1243
1244.L100:
1245#if GEMV_UNROLL >= 4
1246
1247	cmpq	$4, N
1248	jl	.L110
1249	ALIGN_3
1250
1251.L101:
1252	subq	$4, N
1253
1254	leaq	16 * SIZE(BUFFER), Y1
1255	movq	A,  A1
1256	leaq	(A,  LDA, 2), A2
1257	leaq	(A,  LDA, 4), A
1258
1259	movsd	0 * SIZE(X), %xmm8
1260	movhpd	1 * SIZE(X), %xmm8
1261	addq	INCX, X
1262	movsd	0 * SIZE(X), %xmm10
1263	movhpd	1 * SIZE(X), %xmm10
1264	addq	INCX, X
1265	movsd	0 * SIZE(X), %xmm12
1266	movhpd	1 * SIZE(X), %xmm12
1267	addq	INCX, X
1268	movsd	0 * SIZE(X), %xmm14
1269	movhpd	1 * SIZE(X), %xmm14
1270	addq	INCX, X
1271
1272	pcmpeqb	%xmm5, %xmm5
1273	psllq	$63,   %xmm5
1274	shufps	$0xc0, %xmm5, %xmm5
1275
1276	pshufd	$0x4e, %xmm8,  %xmm9
1277	pshufd	$0x4e, %xmm10, %xmm11
1278	pshufd	$0x4e, %xmm12, %xmm13
1279	pshufd	$0x4e, %xmm14, %xmm15
1280
1281#ifdef HAVE_SSE3
1282	movddup	ALPHA_R, %xmm6
1283	movddup	ALPHA_I, %xmm7
1284#else
1285	movsd	ALPHA_R, %xmm6
1286	unpcklpd %xmm6, %xmm6
1287	movsd	ALPHA_I, %xmm7
1288	unpcklpd %xmm7, %xmm7
1289#endif
1290
1291	xorpd	 %xmm5, %xmm9
1292	xorpd	 %xmm5, %xmm11
1293	xorpd	 %xmm5, %xmm13
1294	xorpd	 %xmm5, %xmm15
1295
1296	mulpd	 %xmm6, %xmm8
1297	mulpd	 %xmm7, %xmm9
1298	mulpd	 %xmm6, %xmm10
1299	mulpd	 %xmm7, %xmm11
1300
1301	mulpd	 %xmm6, %xmm12
1302	mulpd	 %xmm7, %xmm13
1303	mulpd	 %xmm6, %xmm14
1304	mulpd	 %xmm7, %xmm15
1305
1306#ifndef XCONJ
1307	subpd	 %xmm9,  %xmm8
1308	subpd	 %xmm11, %xmm10
1309	subpd	 %xmm13, %xmm12
1310	subpd	 %xmm15, %xmm14
1311#else
1312	addpd	 %xmm9,  %xmm8
1313	addpd	 %xmm11, %xmm10
1314	addpd	 %xmm13, %xmm12
1315	addpd	 %xmm15, %xmm14
1316#endif
1317
1318	pshufd	 $0xee, %xmm8,  %xmm9
1319	pshufd	 $0x44, %xmm8,  %xmm8
1320
1321	pshufd	 $0xee, %xmm10, %xmm11
1322	pshufd	 $0x44, %xmm10, %xmm10
1323
1324	pshufd	 $0xee, %xmm12, %xmm13
1325	pshufd	 $0x44, %xmm12, %xmm12
1326
1327	pshufd	 $0xee, %xmm14, %xmm15
1328	pshufd	 $0x44, %xmm14, %xmm14
1329
1330#ifndef CONJ
1331	xorpd	 %xmm5, %xmm9
1332	xorpd	 %xmm5, %xmm11
1333	xorpd	 %xmm5, %xmm13
1334	xorpd	 %xmm5, %xmm15
1335#else
1336	xorpd	 %xmm5, %xmm8
1337	xorpd	 %xmm5, %xmm10
1338	xorpd	 %xmm5, %xmm12
1339	xorpd	 %xmm5, %xmm14
1340#endif
1341
1342	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1343	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
1344	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
1345	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
1346	ALIGN_3
1347
1348	movq	M,   I
1349	sarq	$2,  I
1350	jle	.L105
1351
1352	movsd	-16 * SIZE(A1), %xmm4
1353	movhpd	-15 * SIZE(A1), %xmm4
1354	movsd	-14 * SIZE(A1), %xmm6
1355	movhpd	-13 * SIZE(A1), %xmm6
1356
1357	decq	 I
1358	jle	 .L104
1359	ALIGN_3
1360
/* ---- .L103: main loop, 4 rows x 4 columns per iteration ----
   Accumulators xmm0..xmm3 hold 4 double-complex y elements.  For each
   column the pattern per A value is:
     xmm5 = swap(a)          (pshufd $0x4e)
     acc += a * re(alpha*x)  (mulpd + addpd)
     acc (+/-)= swap(a) * im(alpha*x)   (SUBPD macro: addpd or subpd
                                         depending on conjugation mode,
                                         defined earlier in the file)
   A loads for the NEXT column are interleaved with the arithmetic to
   hide load latency. */
.L103:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	/* column 0: A1, rows 0..3 */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A1), %xmm4
	movhpd	-11 * SIZE(A1), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A1), %xmm6
	movhpd	 -9 * SIZE(A1), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	/* column 1: A1 + LDA, scaled by xmm10/xmm11 */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm6
	movhpd	 -9 * SIZE(A1, LDA), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	/* column 2: A2, scaled by xmm12/xmm13 */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A2), %xmm4
	movhpd	-11 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm3

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	/* column 3: A2 + LDA, scaled by xmm14/xmm15; the trailing loads
	   preload the NEXT iteration's A1 values. */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A2, LDA), %xmm4
	movhpd	-11 * SIZE(A2, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A2, LDA), %xmm6
	movhpd	 -9 * SIZE(A2, LDA), %xmm6

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	 -8 * SIZE(A1), %xmm4
	movhpd	 -7 * SIZE(A1), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	 -6 * SIZE(A1), %xmm6
	movhpd	 -5 * SIZE(A1), %xmm6

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	/* Store the 4 updated y elements and reload the next 4. */
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	/* Advance pointers by 8 doubles (4 complex); subq of a negative
	   immediate encodes smaller than the equivalent addq. */
	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L103
	ALIGN_3
1528
/* ---- .L104: final (peeled) iteration of the 4-column loop ----
   Identical arithmetic to .L103 but with no prefetches and no preload
   of a next A1 block at the end, so it never reads past the last
   4 rows of the matrix panel. */
.L104:
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A1), %xmm4
	movhpd	-11 * SIZE(A1), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A1), %xmm6
	movhpd	 -9 * SIZE(A1), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm3

	/* column 1: A1 + LDA */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm6
	movhpd	 -9 * SIZE(A1, LDA), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm3

	/* column 2: A2 */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A2), %xmm4
	movhpd	-11 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm3
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm3

	/* column 3: A2 + LDA (no next-iteration preload here) */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-12 * SIZE(A2, LDA), %xmm4
	movhpd	-11 * SIZE(A2, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-10 * SIZE(A2, LDA), %xmm6
	movhpd	 -9 * SIZE(A2, LDA), %xmm6

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm2
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm3

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm2
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm3

	/* Flush accumulators and preload Y for the remainder code. */
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1
	ALIGN_3
1668
/* ---- .L105: M & 2 remainder (two rows) for the 4-column block ---- */
.L105:
	testq	$2, M
	je	.L107

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	/* Two rows from each of the four columns, accumulated into
	   xmm0/xmm1; same complex-multiply pattern as the main loop. */
	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm8,  %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm9,  %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm10, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	 %xmm11, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0

	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	/* xmm2 was preloaded with the next Y pair above; hand it to the
	   one-row tail as its accumulator. */
	movapd	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	addq	 $4 * SIZE, Y1
	ALIGN_3

/* ---- .L107: M & 1 remainder (single row) for the 4-column block ---- */
.L107:
	testq	$1, M
	je	.L109

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm6
	movhpd	-15 * SIZE(A1, LDA), %xmm6

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm8,  %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	mulpd	 %xmm9,  %xmm5
	SUBPD	 %xmm5,  %xmm0

	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm10, %xmm6
	addpd	 %xmm6,  %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm6
	movhpd	-15 * SIZE(A2, LDA), %xmm6
	mulpd	 %xmm11, %xmm7
	SUBPD	 %xmm7,  %xmm0

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0

	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	ALIGN_3

/* ---- .L109: loop back while at least 4 columns remain
   (.L101 is the top of the 4-column block, above this view). ---- */
.L109:
	cmpq	$4, N
	jge	.L101
	ALIGN_3
1791
/* ---- .L110/.L111: setup for a block of 2 columns ----
   Same structure as the 4-column setup, with only xmm12..xmm15 holding
   the two alpha*x values (xmm11 is the imag-sign mask here). */
.L110:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L120

#if GEMV_UNROLL == 2
	ALIGN_3

.L111:
#endif

	subq	$2, N

	/* A1/A2 walk two adjacent columns; A advances by 2*LDA. */
	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
	leaq	(A,  LDA, 1), A2
	leaq	(A,  LDA, 2), A

	/* Gather two x elements (strided by INCX). */
	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm14
	movhpd	1 * SIZE(X), %xmm14
	addq	INCX, X

	/* Sign mask: sign bit set only in the high double (see the
	   4-column setup for the derivation). */
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

	/* Real/imag-swapped copies. */
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

	/* Broadcast alpha components. */
#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	 %xmm11, %xmm13
	xorpd	 %xmm11, %xmm15

	/* alpha * x (complex multiply, sign depends on XCONJ). */
	mulpd	 %xmm8, %xmm12
	mulpd	 %xmm9, %xmm13
	mulpd	 %xmm8, %xmm14
	mulpd	 %xmm9, %xmm15

#ifndef XCONJ
	subpd	 %xmm13, %xmm12
	subpd	 %xmm15, %xmm14
#else
	addpd	 %xmm13, %xmm12
	addpd	 %xmm15, %xmm14
#endif

	/* Split into real-broadcast ($0x44) / imag-broadcast ($0xee). */
	pshufd	 $0xee, %xmm12, %xmm13
	pshufd	 $0x44, %xmm12, %xmm12

	pshufd	 $0xee, %xmm14, %xmm15
	pshufd	 $0x44, %xmm14, %xmm14

#ifndef CONJ
	xorpd	 %xmm11, %xmm13
	xorpd	 %xmm11, %xmm15
#else
	xorpd	 %xmm11, %xmm12
	xorpd	 %xmm11, %xmm14
#endif

	/* Preload 4 Y accumulators. */
	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M,   I
	sarq	$2,  I
	jle	.L115

	/* Preload 4 complex elements of column A1. */
	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	movsd	-10 * SIZE(A1), %xmm10
	movhpd	 -9 * SIZE(A1), %xmm10

	/* Peel the last iteration into .L114. */
	decq	 I
	jle	 .L114
	ALIGN_3
1889
/* ---- .L113: main loop, 4 rows x 2 columns per iteration ----
   xmm4/6/8/10 hold A values; odd registers hold their swapped copies.
   Column A1 uses xmm12/xmm13, column A2 uses xmm14/xmm15. */
.L113:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	/* column A1; A2 values are loaded behind the arithmetic */
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm12, %xmm8
	addpd	 %xmm8,  %xmm2
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm12, %xmm10
	addpd	 %xmm10, %xmm3
	movsd	-10 * SIZE(A2), %xmm10
	movhpd	 -9 * SIZE(A2), %xmm10

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm13, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm13, %xmm11
	SUBPD	 %xmm11, %xmm3

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	/* column A2; next iteration's A1 values are preloaded here */
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	 -8 * SIZE(A1), %xmm4
	movhpd	 -7 * SIZE(A1), %xmm4
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	 -6 * SIZE(A1), %xmm6
	movhpd	 -5 * SIZE(A1), %xmm6

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm14, %xmm8
	addpd	 %xmm8,  %xmm2
	movsd	 -4 * SIZE(A1), %xmm8
	movhpd	 -3 * SIZE(A1), %xmm8
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm14, %xmm10
	addpd	 %xmm10, %xmm3
	movsd	 -2 * SIZE(A1), %xmm10
	movhpd	 -1 * SIZE(A1), %xmm10

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm15, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm15, %xmm11
	SUBPD	 %xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	/* Store 4 y elements, reload the next 4. */
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L113
	ALIGN_3
1985
/* ---- .L114: final (peeled) iteration of the 2-column loop ----
   Same as .L113 without prefetches and without the A1 preload past
   the end of the panel. */
.L114:
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm12, %xmm8
	addpd	 %xmm8,  %xmm2
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm12, %xmm10
	addpd	 %xmm10, %xmm3
	movsd	-10 * SIZE(A2), %xmm10
	movhpd	 -9 * SIZE(A2), %xmm10

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm13, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm13, %xmm11
	SUBPD	 %xmm11, %xmm3

	/* column A2 */
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm14, %xmm4
	addpd	 %xmm4,  %xmm0
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm1

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm14, %xmm8
	addpd	 %xmm8,  %xmm2
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm14, %xmm10
	addpd	 %xmm10, %xmm3

	mulpd	 %xmm15, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm15, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm15, %xmm11
	SUBPD	 %xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, A2
	subq	 $-8 * SIZE, Y1
	ALIGN_3
2057
/* ---- .L115: M & 2 remainder (two rows) for the 2-column block ---- */
.L115:
	testq	$2, M
	je	.L117

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	movsd	-14 * SIZE(A2), %xmm10
	movhpd	-13 * SIZE(A2), %xmm10

	/* column A1 */
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	/* column A2 */
	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm14, %xmm8
	addpd	 %xmm8,  %xmm0
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm14, %xmm10
	addpd	 %xmm10, %xmm1

	mulpd	 %xmm15, %xmm9
	SUBPD	 %xmm9,  %xmm0
	mulpd	 %xmm15, %xmm11
	SUBPD	 %xmm11, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	/* Pass the preloaded next Y pair on as the accumulator for the
	   one-row tail. */
	movapd	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	addq	 $4 * SIZE, Y1
	ALIGN_3

/* ---- .L117: M & 1 remainder (single row) for the 2-column block ---- */
.L117:
	testq	$1, M
#if GEMV_UNROLL == 2
	je	.L119
#else
	je	.L120
#endif

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A2), %xmm6
	movhpd	-15 * SIZE(A2), %xmm6

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0

	pshufd	 $0x4e, %xmm6, %xmm7
	mulpd	 %xmm14, %xmm6
	addpd	 %xmm6,  %xmm0
	mulpd	 %xmm15, %xmm7
	SUBPD	 %xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
	ALIGN_3

/* When the kernel is built with 2-way unrolling only, keep iterating
   the 2-column block while N >= 2. */
.L119:
	cmpq	$2, N
	jge	.L111
#endif
	ALIGN_3
2140
/* ---- .L120/.L121: setup for the final single column ----
   Same alpha*x preparation as the wider blocks, for one x element
   (xmm12 real-broadcast, xmm13 imag-broadcast after the split). */
.L120:
#endif

	cmpq	$1, N
	jl	.L980

#if GEMV_UNROLL == 1
.L121:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X

	/* Sign mask (sign bit of the high double only). */
	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0, %xmm11, %xmm11

	pshufd	$0x4e, %xmm12, %xmm13

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	 %xmm11, %xmm13

	/* alpha * x, combine per XCONJ. */
	mulpd	 %xmm8, %xmm12
	mulpd	 %xmm9, %xmm13

#ifndef XCONJ
	subpd	 %xmm13, %xmm12
#else
	addpd	 %xmm13, %xmm12
#endif

	pshufd	 $0xee, %xmm12, %xmm13
	pshufd	 $0x44, %xmm12, %xmm12

#ifndef CONJ
	xorpd	 %xmm11, %xmm13
#else
	xorpd	 %xmm11, %xmm12
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M,   I
	sarq	$2,  I
	jle	.L125

	/* Preload 4 complex elements of the column. */
	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	movsd	-10 * SIZE(A1), %xmm10
	movhpd	 -9 * SIZE(A1), %xmm10

	decq	 I
	jle	 .L124
	ALIGN_3
2219
/* ---- .L123: main loop, 4 rows x 1 column per iteration ---- */
.L123:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	/* Complex multiply-accumulate; loads for the NEXT iteration's A
	   values are interleaved behind the arithmetic. */
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	movsd	 -8 * SIZE(A1), %xmm4
	movhpd	 -7 * SIZE(A1), %xmm4
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1
	movsd	 -6 * SIZE(A1), %xmm6
	movhpd	 -5 * SIZE(A1), %xmm6

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm12, %xmm8
	addpd	 %xmm8,  %xmm2
	movsd	 -4 * SIZE(A1), %xmm8
	movhpd	 -3 * SIZE(A1), %xmm8
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm12, %xmm10
	addpd	 %xmm10, %xmm3
	movsd	 -2 * SIZE(A1), %xmm10
	movhpd	 -1 * SIZE(A1), %xmm10

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm13, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm13, %xmm11
	SUBPD	 %xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L123
	ALIGN_3
2278
/* ---- .L124: final (peeled) iteration of the single-column loop ----
   No prefetch, no out-of-range preload. */
.L124:
	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1

	pshufd	 $0x4e, %xmm8,  %xmm9
	mulpd	 %xmm12, %xmm8
	addpd	 %xmm8,  %xmm2
	pshufd	 $0x4e, %xmm10, %xmm11
	mulpd	 %xmm12, %xmm10
	addpd	 %xmm10, %xmm3

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	mulpd	 %xmm13, %xmm9
	SUBPD	 %xmm9,  %xmm2
	mulpd	 %xmm13, %xmm11
	SUBPD	 %xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1
	ALIGN_3
2317
/* ---- .L125: M & 2 remainder (two rows) for the single column ---- */
.L125:
	testq	$2, M
	je	.L127


	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	pshufd	 $0x4e, %xmm4,  %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	pshufd	 $0x4e, %xmm6,  %xmm7
	mulpd	 %xmm12, %xmm6
	addpd	 %xmm6,  %xmm1

	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0
	mulpd	 %xmm13, %xmm7
	SUBPD	 %xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	/* Hand the preloaded next Y pair to the one-row tail. */
	movapd	 %xmm2, %xmm0

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, Y1
	ALIGN_3

/* ---- .L127: M & 1 remainder (single row) for the single column ---- */
.L127:
	testq	$1, M
#if GEMV_UNROLL == 1
	je	.L129
#else
	je	.L980
#endif

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4

	pshufd	 $0x4e, %xmm4, %xmm5
	mulpd	 %xmm12, %xmm4
	addpd	 %xmm4,  %xmm0
	mulpd	 %xmm13, %xmm5
	SUBPD	 %xmm5,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
	ALIGN_3
/* With 1-way unrolling, keep looping over single columns while N >= 1. */
.L129:
	cmpq	$1, N
	jge	.L121
#endif


#endif
	ALIGN_3
2377
/* ---- .L980: copy the accumulated BUFFER back into user y ----
   Aligned path: taken when Y is SIZE-aligned (movapd is safe).
   .L182 handles 8 complex elements per iteration: load y (strided by
   INCY), add the contiguous BUFFER partials, store back through Y1. */
.L980:
	testq	$SIZE, Y
	jne	.L990

	movq	Y,  Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L184
	ALIGN_3

.L182:
 	movapd	 (Y), %xmm0
	addq	INCY, Y
	movapd	 (Y), %xmm1
	addq	INCY, Y
	movapd	 (Y), %xmm2
	addq	INCY, Y
	movapd	 (Y), %xmm3
	addq	INCY, Y
 	movapd	 (Y), %xmm4
	addq	INCY, Y
	movapd	 (Y), %xmm5
	addq	INCY, Y
	movapd	 (Y), %xmm6
	addq	INCY, Y
	movapd	 (Y), %xmm7
	addq	INCY, Y

	/* y += buffer (8 double-complex values). */
	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movapd	%xmm0,  (Y1)
	addq	INCY, Y1
	movapd	%xmm1,  (Y1)
	addq	INCY, Y1
	movapd	%xmm2,  (Y1)
	addq	INCY, Y1
	movapd	%xmm3,  (Y1)
	addq	INCY, Y1
	movapd	%xmm4,  (Y1)
	addq	INCY, Y1
	movapd	%xmm5,  (Y1)
	addq	INCY, Y1
	movapd	%xmm6,  (Y1)
	addq	INCY, Y1
	movapd	%xmm7,  (Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L182
	ALIGN_3
2437
/* ---- .L184/.L185/.L186: aligned copy-back tails ----
   Handle the remaining M & 7 elements: first 4, then 2, then 1. */
.L184:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L185

 	movapd	 (Y), %xmm0
	addq	INCY, Y
	movapd	 (Y), %xmm1
	addq	INCY, Y
	movapd	 (Y), %xmm2
	addq	INCY, Y
	movapd	 (Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movapd	%xmm0,  (Y1)
	addq	INCY, Y1
	movapd	%xmm1,  (Y1)
	addq	INCY, Y1
	movapd	%xmm2,  (Y1)
	addq	INCY, Y1
	movapd	%xmm3,  (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L185:
	testq	$2, M
	jle	.L186

 	movapd	 (Y), %xmm0
	addq	INCY, Y
	movapd	 (Y), %xmm1
	addq	INCY, Y
	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movapd	%xmm0,  (Y1)
	addq	INCY, Y1
	movapd	%xmm1,  (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L186:
	testq	$1, M
	jle	.L999

 	movapd	 (Y), %xmm0

	addpd	 (BUFFER), %xmm0

	movapd	%xmm0,  (Y1)
	jmp	.L999
	ALIGN_3
2501
/* ---- .L990/.L992: copy-back for an UNALIGNED y ----
   Same as .L980/.L182 but every y access is split into movsd/movhpd
   loads and movlpd/movhpd stores so no 16-byte alignment is needed. */
.L990:
	movq	Y,  Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L994
	ALIGN_3

.L992:
 	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm2
	movhpd	 1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y

 	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm6
	movhpd	 1 * SIZE(Y), %xmm6
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm7
	movhpd	 1 * SIZE(Y), %xmm7
	addq	INCY, Y

	/* y += buffer (8 double-complex values). */
	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3,  0 * SIZE(Y1)
	movhpd	%xmm3,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm4,  0 * SIZE(Y1)
	movhpd	%xmm4,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm5,  0 * SIZE(Y1)
	movhpd	%xmm5,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm6,  0 * SIZE(Y1)
	movhpd	%xmm6,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm7,  0 * SIZE(Y1)
	movhpd	%xmm7,  1 * SIZE(Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L992
	ALIGN_3
2588
/* ---- .L994/.L995/.L996: unaligned copy-back tails ----
   Remaining M & 7 elements: first 4, then 2, then 1, all via
   unaligned half-register loads/stores. */
.L994:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L995

 	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm2
	movhpd	 1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm3
	movhpd	 1 * SIZE(Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2,  0 * SIZE(Y1)
	movhpd	%xmm2,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3,  0 * SIZE(Y1)
	movhpd	%xmm3,  1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L995:
	testq	$2, M
	jle	.L996

 	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	 0 * SIZE(Y), %xmm1
	movhpd	 1 * SIZE(Y), %xmm1
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1,  0 * SIZE(Y1)
	movhpd	%xmm1,  1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L996:
	testq	$1, M
	jle	.L999

 	movsd	 0 * SIZE(Y), %xmm0
	movhpd	 1 * SIZE(Y), %xmm0

	addpd	 0 * SIZE(BUFFER), %xmm0

	movlpd	%xmm0,  0 * SIZE(Y1)
	movhpd	%xmm0,  1 * SIZE(Y1)
	ALIGN_3
2674
/* ---- .L999: function epilogue ----
   Restore callee-saved GPRs from the stack; the offsets match the save
   layout of the prologue (above this view -- confirm there).  On
   Windows x64, rdi/rsi and xmm6-xmm15 are also callee-saved and are
   restored here.  Finally pop the frame and return. */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
