1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41#include "l2param.h"
42
43#if GEMV_UNROLL < 2
44#undef  GEMV_UNROLL
45#define GEMV_UNROLL 2
46#endif
47
48#ifndef WINDOWS_ABI
49
50#define STACKSIZE	128
51
52#define OLD_M	  %rdi
53#define OLD_N	  %rsi
54#define OLD_A	  %rcx
55#define OLD_LDA	  %r8
56#define STACK_INCX	 8 + STACKSIZE(%rsp)
57#define STACK_Y		16 + STACKSIZE(%rsp)
58#define STACK_INCY	24 + STACKSIZE(%rsp)
59#define STACK_BUFFER	32 + STACKSIZE(%rsp)
60#define ALPHA	48	      (%rsp)
61
62#define MMM	56(%rsp)
63#define NN	64(%rsp)
64#define AA	72(%rsp)
65#define LDAX	80(%rsp)
66#define XX	88(%rsp)
67#else
68
69#define STACKSIZE	288
70
71#define OLD_M	  %rcx
72#define OLD_N	  %rdx
73#define OLD_A		 40 + STACKSIZE(%rsp)
74#define OLD_LDA		 48 + STACKSIZE(%rsp)
75#define OLD_X		 56 + STACKSIZE(%rsp)
76#define STACK_INCX	 64 + STACKSIZE(%rsp)
77#define STACK_Y		 72 + STACKSIZE(%rsp)
78#define STACK_INCY	 80 + STACKSIZE(%rsp)
79#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
80#define ALPHA	224	       (%rsp)
81
82#define MMM	232(%rsp)
83#define NN	240(%rsp)
84#define AA	248(%rsp)
85#define LDAX 256(%rsp)
86#define XX	264(%rsp)
87
88#endif
89
90#define LDA	%r8
91#define X	%r9
92
93#define INCX	%rsi
94#define INCY	%rdi
95
96#define M	%r10
97#define N	%r11
98#define A	%r12
99#define Y	%r14
100#define BUFFER	%r13
101
102#define I	%rax
103#define A1	%rbx
104#define A2	%rcx
105#define LDA3	%rdx
106#define Y1	%rbp
107
108#ifdef ALIGNED_ACCESS
109#define MM	%r15
110#else
111#define MM	M
112#endif
113
114#define TMP_M   %r15
115#define Y2      %rbx
116
117	PROLOGUE
118	PROFCODE
119
120	subq	$STACKSIZE, %rsp
121	movq	%rbx,  0(%rsp)
122	movq	%rbp,  8(%rsp)
123	movq	%r12, 16(%rsp)
124	movq	%r13, 24(%rsp)
125	movq	%r14, 32(%rsp)
126	movq	%r15, 40(%rsp)
127
128#ifdef WINDOWS_ABI
129	movq	%rdi,    48(%rsp)
130	movq	%rsi,    56(%rsp)
131	movups	%xmm6,   64(%rsp)
132	movups	%xmm7,   80(%rsp)
133	movups	%xmm8,   96(%rsp)
134	movups	%xmm9,  112(%rsp)
135	movups	%xmm10, 128(%rsp)
136	movups	%xmm11, 144(%rsp)
137	movups	%xmm12, 160(%rsp)
138	movups	%xmm13, 176(%rsp)
139	movups	%xmm14, 192(%rsp)
140	movups	%xmm15, 208(%rsp)
141
142	movq	OLD_M,	      M
143	movq	OLD_N,        N
144	movq	OLD_A,        A
145	movq	OLD_LDA,      LDA
146	movq	OLD_X,        X
147#else
148	movq	OLD_M,	      M
149	movq	OLD_N,        N
150	movq	OLD_A,        A
151	movq	OLD_LDA,      LDA
152#endif
153
154#ifndef WINDOWS_ABI
155	movsd	 %xmm0, ALPHA
156#else
157	movsd	 %xmm3, ALPHA
158#endif
159
160	movq	STACK_Y,      Y
161	movq	A,AA
162	movq	N,NN
163	movq	M,MMM
164	movq	LDA,LDAX
165	movq	X,XX
166
167.L0t:
168	xorq	I,I
169	addq	$1,I
170	salq	$21,I
171	subq	I,MMM
172	movq	I,M
173	jge	.L00t
174
175	movq	MMM,M
176	addq	M, I
177	jle	.L999x
178	movq	I, M
179
180.L00t:
181	movq	XX,X
182	movq	AA,A
183	movq	NN,N
184	movq	LDAX,LDA
185
186	movq	STACK_INCX,   INCX
187	movq	STACK_INCY,   INCY
188	movq	STACK_BUFFER, BUFFER
189
190
191	leaq	-1(INCY), %rax
192
193	leaq	(,INCX, SIZE), INCX
194	leaq	(,INCY, SIZE), INCY
195	leaq	(,LDA,  SIZE), LDA
196
197	leaq	(LDA, LDA, 2), LDA3
198
199	subq	$-16 * SIZE, A
200
201#ifdef ALIGNED_ACCESS
202	leaq	-1 (M), MM
203	testq	$SIZE, A
204	cmoveq	M, MM
205#endif
206
207	testq	N, N		# if n <= 0 goto END
208	jle	.L999
209	testq	M, M		# if n <= 0 goto END
210	jle	.L999
211
212#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS)
213#ifndef  NOCOPY_UNALIGNED
214	movq	Y,    Y1
215	andq	$0xf, Y1
216	orq	Y1, %rax
217#endif
218	testq	%rax, %rax
219 	cmoveq	Y, BUFFER
220	je	.L10
221#endif
222
223	movq	BUFFER, Y1
224
225	pxor	%xmm4, %xmm4
226
227	movq	M,   %rax
228	addq	$16, %rax
229	sarq	$4,  %rax
230	ALIGN_3
231
232.L01:
233	movapd	%xmm4,  0 * SIZE(Y1)
234	movapd	%xmm4,  2 * SIZE(Y1)
235	movapd	%xmm4,  4 * SIZE(Y1)
236	movapd	%xmm4,  6 * SIZE(Y1)
237	movapd	%xmm4,  8 * SIZE(Y1)
238	movapd	%xmm4, 10 * SIZE(Y1)
239	movapd	%xmm4, 12 * SIZE(Y1)
240	movapd	%xmm4, 14 * SIZE(Y1)
241	subq	$-16 * SIZE, Y1
242	decq	%rax
243	jg	.L01
244	ALIGN_3
245
246.L10:
247
248#ifdef ALIGNED_ACCESS
249	leaq	SIZE(BUFFER), %rax
250	testq	$SIZE, A
251	cmovne	%rax, BUFFER
252
253	testq	$SIZE, LDA
254	jne	.L50
255#endif
256
257#if GEMV_UNROLL >= 8
258
259	cmpq	$8, N
260	jl	.L20
261	ALIGN_3
262
263.L11:
264	subq	$8, N
265
266	leaq	16 * SIZE(BUFFER), Y1
267	movq	A,  A1
268	leaq	(A,  LDA, 4), A2
269	leaq	(A,  LDA, 8), A
270
271#ifdef HAVE_SSE3
272	movddup	(X), %xmm8
273	addq	INCX, X
274	movddup	(X), %xmm9
275	addq	INCX, X
276	movddup	(X), %xmm10
277	addq	INCX, X
278	movddup	(X), %xmm11
279	addq	INCX, X
280	movddup	(X), %xmm12
281	addq	INCX, X
282	movddup	(X), %xmm13
283	addq	INCX, X
284	movddup	(X), %xmm14
285	addq	INCX, X
286	movddup	(X), %xmm15
287	addq	INCX, X
288
289	movddup	ALPHA, %xmm0
290#else
291	movsd	(X), %xmm8
292	unpcklpd %xmm8, %xmm8
293	addq	INCX, X
294	movsd	(X), %xmm9
295	unpcklpd %xmm9, %xmm9
296	addq	INCX, X
297	movsd	(X), %xmm10
298	unpcklpd %xmm10, %xmm10
299	addq	INCX, X
300	movsd	(X), %xmm11
301	unpcklpd %xmm11, %xmm11
302	addq	INCX, X
303	movsd	(X), %xmm12
304	unpcklpd %xmm12, %xmm12
305	addq	INCX, X
306	movsd	(X), %xmm13
307	unpcklpd %xmm13, %xmm13
308	addq	INCX, X
309	movsd	(X), %xmm14
310	unpcklpd %xmm14, %xmm14
311	addq	INCX, X
312	movsd	(X), %xmm15
313	unpcklpd %xmm15, %xmm15
314	addq	INCX, X
315
316	movsd	ALPHA, %xmm0
317	unpcklpd %xmm0, %xmm0
318#endif
319
320	mulpd	%xmm0, %xmm8
321	mulpd	%xmm0, %xmm9
322	mulpd	%xmm0, %xmm10
323	mulpd	%xmm0, %xmm11
324	mulpd	%xmm0, %xmm12
325	mulpd	%xmm0, %xmm13
326	mulpd	%xmm0, %xmm14
327	mulpd	%xmm0, %xmm15
328
329#ifdef ALIGNED_ACCESS
330	testq	$SIZE, A
331	je	.L1X
332
333	movsd	 -16 * SIZE(A1),         %xmm4
334	movsd	 -16 * SIZE(A1, LDA),    %xmm5
335	movsd	 -16 * SIZE(A1, LDA, 2), %xmm6
336	movsd	 -16 * SIZE(A1, LDA3),   %xmm7
337
338	movsd	 -16 * SIZE(Y1),         %xmm0
339
340	mulsd	 %xmm8,  %xmm4
341	addsd	 %xmm4,  %xmm0
342	movsd	 -16 * SIZE(A2),         %xmm4
343	mulsd	 %xmm9,  %xmm5
344	addsd	 %xmm5,  %xmm0
345	movsd	 -16 * SIZE(A2, LDA),    %xmm5
346	mulsd	 %xmm10, %xmm6
347	addsd	 %xmm6,  %xmm0
348	movsd	 -16 * SIZE(A2, LDA, 2), %xmm6
349	mulsd	 %xmm11, %xmm7
350	addsd	 %xmm7,  %xmm0
351	movsd	 -16 * SIZE(A2, LDA3),   %xmm7
352
353	mulsd	 %xmm12, %xmm4
354	addsd	 %xmm4,  %xmm0
355	mulsd	 %xmm13, %xmm5
356	addsd	 %xmm5,  %xmm0
357	mulsd	 %xmm14, %xmm6
358	addsd	 %xmm6,  %xmm0
359	mulsd	 %xmm15, %xmm7
360	addsd	 %xmm7,  %xmm0
361
362	movsd	 %xmm0, -16 * SIZE(Y1)
363
364	addq	 $SIZE, A1
365	addq	 $SIZE, A2
366	addq	 $SIZE, Y1
367	ALIGN_3
368
369.L1X:
370#endif
371
372	movq	MM,  I
373	sarq	$3,  I
374	jle	.L15
375
376	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
377	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
378	MOVUPS_A1(-12 * SIZE, A1, %xmm6)
379	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
380
381	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
382	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
383	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
384	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
385
386	decq	 I
387	jle	 .L14
388	ALIGN_3
389
390.L13:
391#ifdef PREFETCH
392	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
393#endif
394
395	mulpd	 %xmm8,  %xmm4
396	addpd	 %xmm4,  %xmm0
397	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
398	mulpd	 %xmm8,  %xmm5
399	addpd	 %xmm5,  %xmm1
400	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
401
402	mulpd	 %xmm8,  %xmm6
403	addpd	 %xmm6,  %xmm2
404	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
405	mulpd	 %xmm8,  %xmm7
406	addpd	 %xmm7,  %xmm3
407	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
408
409#ifdef PREFETCH
410	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1)
411#endif
412
413	mulpd	 %xmm9,  %xmm4
414	addpd	 %xmm4,  %xmm0
415	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
416	mulpd	 %xmm9,  %xmm5
417	addpd	 %xmm5,  %xmm1
418	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
419
420	mulpd	 %xmm9,  %xmm6
421	addpd	 %xmm6,  %xmm2
422	MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
423	mulpd	 %xmm9,  %xmm7
424	addpd	 %xmm7,  %xmm3
425	MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)
426
427#ifdef PREFETCH
428	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2)
429#endif
430
431	mulpd	 %xmm10, %xmm4
432	addpd	 %xmm4,  %xmm0
433	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
434	mulpd	 %xmm10, %xmm5
435	addpd	 %xmm5,  %xmm1
436	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)
437
438	mulpd	 %xmm10, %xmm6
439	addpd	 %xmm6,  %xmm2
440	MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
441	mulpd	 %xmm10, %xmm7
442	addpd	 %xmm7,  %xmm3
443	MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)
444
445#ifdef PREFETCH
446	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3)
447#endif
448
449	mulpd	 %xmm11, %xmm4
450	addpd	 %xmm4,  %xmm0
451	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
452	mulpd	 %xmm11, %xmm5
453	addpd	 %xmm5,  %xmm1
454	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
455
456	mulpd	 %xmm11, %xmm6
457	addpd	 %xmm6,  %xmm2
458	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
459	mulpd	 %xmm11, %xmm7
460	addpd	 %xmm7,  %xmm3
461	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
462
463#ifdef PREFETCH
464	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
465#endif
466
467	mulpd	 %xmm12, %xmm4
468	addpd	 %xmm4,  %xmm0
469	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
470	mulpd	 %xmm12, %xmm5
471	addpd	 %xmm5,  %xmm1
472	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
473
474	mulpd	 %xmm12, %xmm6
475	addpd	 %xmm6,  %xmm2
476	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
477	mulpd	 %xmm12, %xmm7
478	addpd	 %xmm7,  %xmm3
479	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
480
481#ifdef PREFETCH
482	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1)
483#endif
484
485	mulpd	 %xmm13, %xmm4
486	addpd	 %xmm4,  %xmm0
487	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
488	mulpd	 %xmm13, %xmm5
489	addpd	 %xmm5,  %xmm1
490	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
491
492	mulpd	 %xmm13, %xmm6
493	addpd	 %xmm6,  %xmm2
494	MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
495	mulpd	 %xmm13, %xmm7
496	addpd	 %xmm7,  %xmm3
497	MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)
498
499#ifdef PREFETCH
500	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2)
501#endif
502
503	mulpd	 %xmm14, %xmm4
504	addpd	 %xmm4,  %xmm0
505	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
506	mulpd	 %xmm14, %xmm5
507	addpd	 %xmm5,  %xmm1
508	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)
509
510	mulpd	 %xmm14, %xmm6
511	addpd	 %xmm6,  %xmm2
512	MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
513	mulpd	 %xmm14, %xmm7
514	addpd	 %xmm7,  %xmm3
515	MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)
516
517#ifdef PREFETCH
518	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3)
519#endif
520
521	mulpd	 %xmm15, %xmm4
522	addpd	 %xmm4,  %xmm0
523	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
524	mulpd	 %xmm15, %xmm5
525	addpd	 %xmm5,  %xmm1
526	MOVUPS_A1( -6 * SIZE, A1, %xmm5)
527
528	mulpd	 %xmm15, %xmm6
529	addpd	 %xmm6,  %xmm2
530	MOVUPS_A1( -4 * SIZE, A1, %xmm6)
531	mulpd	 %xmm15, %xmm7
532	addpd	 %xmm7,  %xmm3
533	MOVUPS_A1( -2 * SIZE, A1, %xmm7)
534
535#ifdef PREFETCHW
536	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
537#endif
538
539	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
540	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
541	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
542	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
543
544	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
545	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
546	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
547	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
548
549	subq	 $-8 * SIZE, A1
550	subq	 $-8 * SIZE, A2
551	subq	 $-8 * SIZE, Y1
552
553	subq	 $1, I
554	BRANCH
555	jg	.L13
556	ALIGN_3
557
558.L14:
559	mulpd	 %xmm8,  %xmm4
560	addpd	 %xmm4,  %xmm0
561	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
562	mulpd	 %xmm8,  %xmm5
563	addpd	 %xmm5,  %xmm1
564	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
565
566	mulpd	 %xmm8,  %xmm6
567	addpd	 %xmm6,  %xmm2
568	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
569	mulpd	 %xmm8,  %xmm7
570	addpd	 %xmm7,  %xmm3
571	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
572
573	mulpd	 %xmm9,  %xmm4
574	addpd	 %xmm4,  %xmm0
575	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
576	mulpd	 %xmm9,  %xmm5
577	addpd	 %xmm5,  %xmm1
578	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
579
580	mulpd	 %xmm9,  %xmm6
581	addpd	 %xmm6,  %xmm2
582	MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
583	mulpd	 %xmm9,  %xmm7
584	addpd	 %xmm7,  %xmm3
585	MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)
586
587	mulpd	 %xmm10, %xmm4
588	addpd	 %xmm4,  %xmm0
589	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
590	mulpd	 %xmm10, %xmm5
591	addpd	 %xmm5,  %xmm1
592	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)
593
594	mulpd	 %xmm10, %xmm6
595	addpd	 %xmm6,  %xmm2
596	MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
597	mulpd	 %xmm10, %xmm7
598	addpd	 %xmm7,  %xmm3
599	MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)
600
601	mulpd	 %xmm11, %xmm4
602	addpd	 %xmm4,  %xmm0
603	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
604	mulpd	 %xmm11, %xmm5
605	addpd	 %xmm5,  %xmm1
606	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
607
608	mulpd	 %xmm11, %xmm6
609	addpd	 %xmm6,  %xmm2
610	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
611	mulpd	 %xmm11, %xmm7
612	addpd	 %xmm7,  %xmm3
613	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
614
615	mulpd	 %xmm12, %xmm4
616	addpd	 %xmm4,  %xmm0
617	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
618	mulpd	 %xmm12, %xmm5
619	addpd	 %xmm5,  %xmm1
620	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
621
622	mulpd	 %xmm12, %xmm6
623	addpd	 %xmm6,  %xmm2
624	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
625	mulpd	 %xmm12, %xmm7
626	addpd	 %xmm7,  %xmm3
627	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
628
629	mulpd	 %xmm13, %xmm4
630	addpd	 %xmm4,  %xmm0
631	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
632	mulpd	 %xmm13, %xmm5
633	addpd	 %xmm5,  %xmm1
634	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
635
636	mulpd	 %xmm13, %xmm6
637	addpd	 %xmm6,  %xmm2
638	MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
639	mulpd	 %xmm13, %xmm7
640	addpd	 %xmm7,  %xmm3
641	MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)
642
643	mulpd	 %xmm14, %xmm4
644	addpd	 %xmm4,  %xmm0
645	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
646	mulpd	 %xmm14, %xmm5
647	addpd	 %xmm5,  %xmm1
648	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)
649
650	mulpd	 %xmm14, %xmm6
651	addpd	 %xmm6,  %xmm2
652	MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
653	mulpd	 %xmm14, %xmm7
654	addpd	 %xmm7,  %xmm3
655	MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)
656
657	mulpd	 %xmm15, %xmm4
658	addpd	 %xmm4,  %xmm0
659	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
660	mulpd	 %xmm15, %xmm5
661	addpd	 %xmm5,  %xmm1
662	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
663
664	mulpd	 %xmm15, %xmm6
665	addpd	 %xmm6,  %xmm2
666	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
667	mulpd	 %xmm15, %xmm7
668	addpd	 %xmm7,  %xmm3
669	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
670
671	subq	 $-8 * SIZE, A1
672	subq	 $-8 * SIZE, A2
673	subq	 $-8 * SIZE, Y1
674	ALIGN_3
675
676.L15:
677	testq	$4, MM
678	je	.L16
679
680	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
681	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
682	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)
683	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7)
684
685	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
686	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
687
688	mulpd	 %xmm8,  %xmm4
689	addpd	 %xmm4,  %xmm0
690	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
691	mulpd	 %xmm8,  %xmm5
692	addpd	 %xmm5,  %xmm1
693	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
694
695	mulpd	 %xmm9,  %xmm6
696	addpd	 %xmm6,  %xmm0
697	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6)
698	mulpd	 %xmm9,  %xmm7
699	addpd	 %xmm7,  %xmm1
700	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7)
701
702	mulpd	 %xmm10, %xmm4
703	addpd	 %xmm4,  %xmm0
704	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
705	mulpd	 %xmm10, %xmm5
706	addpd	 %xmm5,  %xmm1
707	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
708
709	mulpd	 %xmm11, %xmm6
710	addpd	 %xmm6,  %xmm0
711	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
712	mulpd	 %xmm11, %xmm7
713	addpd	 %xmm7,  %xmm1
714	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7)
715
716	mulpd	 %xmm12, %xmm4
717	addpd	 %xmm4,  %xmm0
718	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
719	mulpd	 %xmm12, %xmm5
720	addpd	 %xmm5,  %xmm1
721	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
722
723	mulpd	 %xmm13, %xmm6
724	addpd	 %xmm6,  %xmm0
725	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6)
726	mulpd	 %xmm13, %xmm7
727	addpd	 %xmm7,  %xmm1
728	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7)
729
730	mulpd	 %xmm14, %xmm4
731	addpd	 %xmm4,  %xmm0
732	mulpd	 %xmm14, %xmm5
733	addpd	 %xmm5,  %xmm1
734
735	mulpd	 %xmm15, %xmm6
736	addpd	 %xmm6,  %xmm0
737	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
738	mulpd	 %xmm15, %xmm7
739	addpd	 %xmm7,  %xmm1
740	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
741
742	addq	 $4 * SIZE, A1
743	addq	 $4 * SIZE, A2
744	addq	 $4 * SIZE, Y1
745	ALIGN_3
746
747.L16:
748	testq	$2, MM
749	je	.L17
750
751	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
752	MOVUPS_A2(-16 * SIZE, A1, LDA,  1, %xmm5)
753	MOVUPS_A2(-16 * SIZE, A1, LDA,  2, %xmm6)
754	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7)
755
756	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
757
758	mulpd	 %xmm8,  %xmm4
759	addpd	 %xmm4,  %xmm0
760	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
761	mulpd	 %xmm9,  %xmm5
762	addpd	 %xmm5,  %xmm0
763	MOVUPS_A2(-16 * SIZE, A2, LDA,  1, %xmm5)
764
765	mulpd	 %xmm10, %xmm6
766	addpd	 %xmm6,  %xmm0
767	MOVUPS_A2(-16 * SIZE, A2, LDA,  2, %xmm6)
768	mulpd	 %xmm11, %xmm7
769	addpd	 %xmm7,  %xmm0
770	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7)
771
772	mulpd	 %xmm12, %xmm4
773	addpd	 %xmm4,  %xmm0
774	mulpd	 %xmm13, %xmm5
775	addpd	 %xmm5,  %xmm0
776	mulpd	 %xmm14, %xmm6
777	addpd	 %xmm6,  %xmm0
778	mulpd	 %xmm15, %xmm7
779	addpd	 %xmm7,  %xmm0
780
781	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
782
783	addq	 $2 * SIZE, A1
784	addq	 $2 * SIZE, A2
785	addq	 $2 * SIZE, Y1
786	ALIGN_3
787
788.L17:
789	testq	$1, MM
790	je	.L18
791
792	movsd	 -16 * SIZE(A1),         %xmm4
793	movsd	 -16 * SIZE(A1, LDA),    %xmm5
794	movsd	 -16 * SIZE(A1, LDA, 2), %xmm6
795	movsd	 -16 * SIZE(A1, LDA3),   %xmm7
796
797	movsd	 -16 * SIZE(Y1),         %xmm0
798
799	mulsd	 %xmm8,  %xmm4
800	addsd	 %xmm4,  %xmm0
801	movsd	 -16 * SIZE(A2),         %xmm4
802	mulsd	 %xmm9,  %xmm5
803	addsd	 %xmm5,  %xmm0
804	movsd	 -16 * SIZE(A2, LDA),    %xmm5
805	mulsd	 %xmm10, %xmm6
806	addsd	 %xmm6,  %xmm0
807	movsd	 -16 * SIZE(A2, LDA, 2), %xmm6
808	mulsd	 %xmm11, %xmm7
809	addsd	 %xmm7,  %xmm0
810	movsd	 -16 * SIZE(A2, LDA3),   %xmm7
811
812	mulsd	 %xmm12, %xmm4
813	addsd	 %xmm4,  %xmm0
814	mulsd	 %xmm13, %xmm5
815	addsd	 %xmm5,  %xmm0
816	mulsd	 %xmm14, %xmm6
817	addsd	 %xmm6,  %xmm0
818	mulsd	 %xmm15, %xmm7
819	addsd	 %xmm7,  %xmm0
820
821	movsd	 %xmm0, -16 * SIZE(Y1)
822	ALIGN_3
823
824.L18:
825	cmpq	$8, N
826	jge	.L11
827	ALIGN_3
828
829.L20:
830#endif
831
832#if GEMV_UNROLL >= 4
833
834	cmpq	$4, N
835	jl	.L30
836
837#if GEMV_UNROLL == 4
838	ALIGN_3
839
840.L21:
841#endif
842
843	subq	$4, N
844
845	leaq	16 * SIZE(BUFFER), Y1
846	movq	A,  A1
847	leaq	(A,  LDA, 2), A2
848	leaq	(A,  LDA, 4), A
849
850#ifdef HAVE_SSE3
851	movddup	(X), %xmm12
852	addq	INCX, X
853	movddup	(X), %xmm13
854	addq	INCX, X
855	movddup	(X), %xmm14
856	addq	INCX, X
857	movddup	(X), %xmm15
858	addq	INCX, X
859
860	movddup	ALPHA, %xmm0
861#else
862	movsd	(X), %xmm12
863	unpcklpd %xmm12, %xmm12
864	addq	INCX, X
865	movsd	(X), %xmm13
866	unpcklpd %xmm13, %xmm13
867	addq	INCX, X
868	movsd	(X), %xmm14
869	unpcklpd %xmm14, %xmm14
870	addq	INCX, X
871	movsd	(X), %xmm15
872	unpcklpd %xmm15, %xmm15
873	addq	INCX, X
874
875	movsd	ALPHA, %xmm0
876	unpcklpd %xmm0, %xmm0
877#endif
878
879	mulpd	%xmm0, %xmm12
880	mulpd	%xmm0, %xmm13
881	mulpd	%xmm0, %xmm14
882	mulpd	%xmm0, %xmm15
883
884#ifdef ALIGNED_ACCESS
885	testq	$SIZE, A
886	je	.L2X
887
888	movsd	 -16 * SIZE(A1),         %xmm4
889	movsd	 -16 * SIZE(A1, LDA),    %xmm5
890	movsd	 -16 * SIZE(A2),         %xmm6
891	movsd	 -16 * SIZE(A2, LDA),    %xmm7
892
893	movsd	 -16 * SIZE(Y1),         %xmm0
894
895	mulsd	 %xmm12, %xmm4
896	addsd	 %xmm4,  %xmm0
897	mulsd	 %xmm13, %xmm5
898	addsd	 %xmm5,  %xmm0
899	mulsd	 %xmm14, %xmm6
900	addsd	 %xmm6,  %xmm0
901	mulsd	 %xmm15, %xmm7
902	addsd	 %xmm7,  %xmm0
903
904	movsd	 %xmm0, -16 * SIZE(Y1)
905
906	addq	 $SIZE, A1
907	addq	 $SIZE, A2
908	addq	 $SIZE, Y1
909	ALIGN_3
910
911.L2X:
912#endif
913
914	movq	MM,  I
915	sarq	$3,  I
916	jle	.L25
917
918	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
919	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
920	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
921	MOVUPS_A1(-10 * SIZE, A1, %xmm3)
922
923	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
924	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
925	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
926	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
927
928	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
929	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
930	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
931	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
932
933	decq	 I
934	jle	 .L24
935	ALIGN_3
936
937.L23:
938#ifdef PREFETCH
939	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A1)
940#endif
941
942	mulpd	 %xmm12, %xmm0
943	addpd	 %xmm0,  %xmm8
944	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
945	mulpd	 %xmm12, %xmm1
946	addpd	 %xmm1,  %xmm9
947	MOVUPS_A1(-14 * SIZE, A2, %xmm1)
948
949	mulpd	 %xmm12, %xmm2
950	addpd	 %xmm2,  %xmm10
951	MOVUPS_A1(-12 * SIZE, A2, %xmm2)
952	mulpd	 %xmm12, %xmm3
953	addpd	 %xmm3,  %xmm11
954	MOVUPS_A1(-10 * SIZE, A2, %xmm3)
955
956#ifdef PREFETCH
957	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A1, LDA)
958#endif
959
960	mulpd	 %xmm13, %xmm4
961	addpd	 %xmm4,  %xmm8
962	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
963	mulpd	 %xmm13, %xmm5
964	addpd	 %xmm5,  %xmm9
965	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
966
967	mulpd	 %xmm13, %xmm6
968	addpd	 %xmm6,  %xmm10
969	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
970	mulpd	 %xmm13, %xmm7
971	addpd	 %xmm7,  %xmm11
972	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
973
974#ifdef PREFETCH
975	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A2)
976#endif
977
978	mulpd	 %xmm14, %xmm0
979	addpd	 %xmm0,  %xmm8
980	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
981	mulpd	 %xmm14, %xmm1
982	addpd	 %xmm1,  %xmm9
983	MOVUPS_A1( -6 * SIZE, A1, %xmm1)
984
985	mulpd	 %xmm14, %xmm2
986	addpd	 %xmm2,  %xmm10
987	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
988	mulpd	 %xmm14, %xmm3
989	addpd	 %xmm3,  %xmm11
990	MOVUPS_A1( -2 * SIZE, A1, %xmm3)
991
992#ifdef PREFETCH
993	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A2, LDA)
994#endif
995
996	mulpd	 %xmm15, %xmm4
997	addpd	 %xmm4,  %xmm8
998	MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4)
999	mulpd	 %xmm15, %xmm5
1000	addpd	 %xmm5,  %xmm9
1001	MOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5)
1002
1003	mulpd	 %xmm15, %xmm6
1004	addpd	 %xmm6,  %xmm10
1005	MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6)
1006	mulpd	 %xmm15, %xmm7
1007	addpd	 %xmm7,  %xmm11
1008	MOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7)
1009
1010#ifdef PREFETCHW
1011	PREFETCHW	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(Y1)
1012#endif
1013
1014	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1015	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1016	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1017	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1018
1019	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
1020	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
1021	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
1022	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
1023
1024	subq	 $-8 * SIZE, A1
1025	subq	 $-8 * SIZE, A2
1026	subq	 $-8 * SIZE, Y1
1027
1028	subq	 $1, I
1029	BRANCH
1030	jg	.L23
1031	ALIGN_3
1032
1033.L24:
1034	mulpd	 %xmm12, %xmm0
1035	addpd	 %xmm0,  %xmm8
1036	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
1037	mulpd	 %xmm12, %xmm1
1038	addpd	 %xmm1,  %xmm9
1039	MOVUPS_A1(-14 * SIZE, A2, %xmm1)
1040
1041	mulpd	 %xmm12, %xmm2
1042	addpd	 %xmm2,  %xmm10
1043	MOVUPS_A1(-12 * SIZE, A2, %xmm2)
1044	mulpd	 %xmm12, %xmm3
1045	addpd	 %xmm3,  %xmm11
1046	MOVUPS_A1(-10 * SIZE, A2, %xmm3)
1047
1048	mulpd	 %xmm13, %xmm4
1049	addpd	 %xmm4,  %xmm8
1050	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
1051	mulpd	 %xmm13, %xmm5
1052	addpd	 %xmm5,  %xmm9
1053	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
1054
1055	mulpd	 %xmm13, %xmm6
1056	addpd	 %xmm6,  %xmm10
1057	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
1058	mulpd	 %xmm13, %xmm7
1059	addpd	 %xmm7,  %xmm11
1060	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
1061
1062	mulpd	 %xmm14, %xmm0
1063	addpd	 %xmm0,  %xmm8
1064	mulpd	 %xmm14, %xmm1
1065	addpd	 %xmm1,  %xmm9
1066
1067	mulpd	 %xmm14, %xmm2
1068	addpd	 %xmm2,  %xmm10
1069	mulpd	 %xmm14, %xmm3
1070	addpd	 %xmm3,  %xmm11
1071
1072	mulpd	 %xmm15, %xmm4
1073	addpd	 %xmm4,  %xmm8
1074	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1075	mulpd	 %xmm15, %xmm5
1076	addpd	 %xmm5,  %xmm9
1077	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1078
1079	mulpd	 %xmm15, %xmm6
1080	addpd	 %xmm6,  %xmm10
1081	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1082	mulpd	 %xmm15, %xmm7
1083	addpd	 %xmm7,  %xmm11
1084	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1085
1086	subq	 $-8 * SIZE, A1
1087	subq	 $-8 * SIZE, A2
1088	subq	 $-8 * SIZE, Y1
1089	ALIGN_3
1090
1091.L25:
1092	testq	$4, MM
1093	je	.L26
1094
1095	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
1096	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
1097
1098	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
1099	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
1100
1101	mulpd	 %xmm12, %xmm0
1102	addpd	 %xmm0,  %xmm8
1103	mulpd	 %xmm12, %xmm1
1104	addpd	 %xmm1,  %xmm9
1105
1106	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
1107	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
1108
1109	mulpd	 %xmm13, %xmm4
1110	addpd	 %xmm4,  %xmm8
1111	mulpd	 %xmm13, %xmm5
1112	addpd	 %xmm5,  %xmm9
1113
1114	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
1115	MOVUPS_A1(-14 * SIZE, A2, %xmm1)
1116
1117	mulpd	 %xmm14, %xmm0
1118	addpd	 %xmm0,  %xmm8
1119	mulpd	 %xmm14, %xmm1
1120	addpd	 %xmm1,  %xmm9
1121
1122	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
1123	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
1124
1125	mulpd	 %xmm15, %xmm4
1126	addpd	 %xmm4,  %xmm8
1127	mulpd	 %xmm15, %xmm5
1128	addpd	 %xmm5,  %xmm9
1129
1130	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1131	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1132
1133	addq	 $4 * SIZE, A1
1134	addq	 $4 * SIZE, A2
1135	addq	 $4 * SIZE, Y1
1136	ALIGN_3
1137
1138.L26:
1139	testq	$2, MM
1140	je	.L27
1141
1142	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
1143	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
1144	MOVUPS_A1(-16 * SIZE, A2, %xmm10)
1145	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
1146
1147	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1148
1149	mulpd	 %xmm12, %xmm8
1150	addpd	 %xmm8,  %xmm0
1151	mulpd	 %xmm13, %xmm9
1152	addpd	 %xmm9,  %xmm0
1153	mulpd	 %xmm14, %xmm10
1154	addpd	 %xmm10, %xmm0
1155	mulpd	 %xmm15, %xmm11
1156	addpd	 %xmm11, %xmm0
1157
1158	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1159
1160	addq	 $2 * SIZE, A1
1161	addq	 $2 * SIZE, A2
1162	addq	 $2 * SIZE, Y1
1163	ALIGN_3
1164
1165.L27:
1166	testq	$1, MM
1167#if GEMV_UNROLL == 4
1168	je	.L28
1169#else
1170	je	.L30
1171#endif
1172
1173	movsd	 -16 * SIZE(Y1), %xmm0
1174
1175	movsd	 -16 * SIZE(A1), %xmm8
1176	movsd	 -16 * SIZE(A1, LDA), %xmm9
1177	movsd	 -16 * SIZE(A2), %xmm10
1178	movsd	 -16 * SIZE(A2, LDA), %xmm11
1179
1180	mulsd	 %xmm12, %xmm8
1181	addsd	 %xmm8,  %xmm0
1182	mulsd	 %xmm13, %xmm9
1183	addsd	 %xmm9,  %xmm0
1184	mulsd	 %xmm14, %xmm10
1185	addsd	 %xmm10, %xmm0
1186	mulsd	 %xmm15, %xmm11
1187	addsd	 %xmm11, %xmm0
1188
1189	movsd	 %xmm0, -16 * SIZE(Y1)
1190	ALIGN_3
1191
1192#if GEMV_UNROLL == 4
1193.L28:
1194	cmpq	$4, N
1195	jge	.L21
1196	ALIGN_3
1197
1198#endif
1199
1200.L30:
1201#endif
1202
1203#if GEMV_UNROLL >= 2
1204
1205	cmpq	$2, N
1206	jl	.L40
1207
1208#if GEMV_UNROLL == 2
1209	ALIGN_3
1210
1211.L31:
1212#endif
1213
1214	subq	$2, N
1215
1216	leaq	16 * SIZE(BUFFER), Y1
1217	movq	A,  A1
1218	leaq	(A,  LDA), A2
1219	leaq	(A,  LDA, 2), A
1220
1221#ifdef HAVE_SSE3
1222	movddup	(X), %xmm12
1223	addq	INCX, X
1224	movddup	(X), %xmm13
1225	addq	INCX, X
1226
1227	movddup	ALPHA, %xmm0
1228#else
1229	movsd	(X), %xmm12
1230	unpcklpd %xmm12, %xmm12
1231	addq	INCX, X
1232	movsd	(X), %xmm13
1233	unpcklpd %xmm13, %xmm13
1234	addq	INCX, X
1235
1236	movsd	ALPHA, %xmm0
1237	unpcklpd %xmm0, %xmm0
1238#endif
1239
1240	mulpd	%xmm0, %xmm12
1241	mulpd	%xmm0, %xmm13
1242
1243#ifdef ALIGNED_ACCESS
1244	testq	$SIZE, A
1245	je	.L3X
1246
1247	movsd	 -16 * SIZE(A1), %xmm4
1248	movsd	 -16 * SIZE(A2), %xmm5
1249
1250	movsd	 -16 * SIZE(Y1), %xmm0
1251
1252	mulsd	 %xmm12, %xmm4
1253	addsd	 %xmm4,  %xmm0
1254	mulsd	 %xmm13, %xmm5
1255	addsd	 %xmm5,  %xmm0
1256
1257	movsd	 %xmm0, -16 * SIZE(Y1)
1258
1259	addq	 $SIZE, A1
1260	addq	 $SIZE, A2
1261	addq	 $SIZE, Y1
1262	ALIGN_3
1263
1264.L3X:
1265#endif
1266
1267	movq	MM,  I
1268	sarq	$3,  I
1269	jle	.L35
1270
1271	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
1272	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
1273	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
1274	MOVUPS_A1(-10 * SIZE, A1, %xmm3)
1275
1276	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
1277	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
1278	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
1279	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
1280
1281	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
1282	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
1283	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
1284	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
1285
1286	decq	 I
1287	jle	 .L34
1288	ALIGN_3
1289
1290.L33:
1291#ifdef PREFETCH
1292	PREFETCH	(PREFETCHSIZE) * 4  - 128 + PREOFFSET(A1)
1293#endif
1294
1295	mulpd	 %xmm12, %xmm0
1296	addpd	 %xmm0,  %xmm8
1297	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
1298	mulpd	 %xmm12, %xmm1
1299	addpd	 %xmm1,  %xmm9
1300	MOVUPS_A1( -6 * SIZE, A1, %xmm1)
1301
1302	mulpd	 %xmm12, %xmm2
1303	addpd	 %xmm2,  %xmm10
1304	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
1305	mulpd	 %xmm12, %xmm3
1306	addpd	 %xmm3,  %xmm11
1307	MOVUPS_A1( -2 * SIZE, A1, %xmm3)
1308
1309#ifdef PREFETCH
1310	PREFETCH	(PREFETCHSIZE) * 4  - 128 + PREOFFSET(A2)
1311#endif
1312
1313	mulpd	 %xmm13, %xmm4
1314	addpd	 %xmm4,  %xmm8
1315	MOVUPS_A1( -8 * SIZE, A2, %xmm4)
1316	mulpd	 %xmm13, %xmm5
1317	addpd	 %xmm5,  %xmm9
1318	MOVUPS_A1( -6 * SIZE, A2, %xmm5)
1319
1320	mulpd	 %xmm13, %xmm6
1321	addpd	 %xmm6,  %xmm10
1322	MOVUPS_A1( -4 * SIZE, A2, %xmm6)
1323	mulpd	 %xmm13, %xmm7
1324	addpd	 %xmm7,  %xmm11
1325	MOVUPS_A1( -2 * SIZE, A2, %xmm7)
1326
1327#ifdef PREFETCHW
1328	PREFETCHW	(PREFETCHSIZE) * 4  - 128 + PREOFFSET(Y1)
1329#endif
1330
1331	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1332	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1333	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1334	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1335
1336	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
1337	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
1338	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
1339	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
1340
1341	subq	 $-8 * SIZE, A1
1342	subq	 $-8 * SIZE, A2
1343	subq	 $-8 * SIZE, Y1
1344
1345	subq	 $1, I
1346	BRANCH
1347	jg	.L33
1348	ALIGN_3
1349
1350.L34:
1351	mulpd	 %xmm12, %xmm0
1352	addpd	 %xmm0,  %xmm8
1353	mulpd	 %xmm12, %xmm1
1354	addpd	 %xmm1,  %xmm9
1355	mulpd	 %xmm12, %xmm2
1356	addpd	 %xmm2,  %xmm10
1357	mulpd	 %xmm12, %xmm3
1358	addpd	 %xmm3,  %xmm11
1359
1360	mulpd	 %xmm13, %xmm4
1361	addpd	 %xmm4,  %xmm8
1362	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1363	mulpd	 %xmm13, %xmm5
1364	addpd	 %xmm5,  %xmm9
1365	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1366	mulpd	 %xmm13, %xmm6
1367	addpd	 %xmm6,  %xmm10
1368	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1369	mulpd	 %xmm13, %xmm7
1370	addpd	 %xmm7,  %xmm11
1371	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1372
1373	subq	 $-8 * SIZE, A1
1374	subq	 $-8 * SIZE, A2
1375	subq	 $-8 * SIZE, Y1
1376	ALIGN_3
1377
1378.L35:
1379	testq	$4, MM
1380	je	.L36
1381
1382
1383	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
1384	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
1385
1386	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
1387	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
1388
1389	mulpd	 %xmm12, %xmm0
1390	addpd	 %xmm0,  %xmm8
1391	mulpd	 %xmm12, %xmm1
1392	addpd	 %xmm1,  %xmm9
1393
1394	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
1395	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
1396
1397	mulpd	 %xmm13, %xmm4
1398	addpd	 %xmm4,  %xmm8
1399	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1400	mulpd	 %xmm13, %xmm5
1401	addpd	 %xmm5,  %xmm9
1402	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1403
1404	addq	 $4 * SIZE, A1
1405	addq	 $4 * SIZE, A2
1406	addq	 $4 * SIZE, Y1
1407	ALIGN_3
1408
1409.L36:
1410	testq	$2, MM
1411	je	.L37
1412
1413	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
1414	MOVUPS_A1(-16 * SIZE, A2, %xmm9)
1415
1416	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1417
1418	mulpd	 %xmm12, %xmm8
1419	addpd	 %xmm8,  %xmm0
1420	mulpd	 %xmm13, %xmm9
1421	addpd	 %xmm9,  %xmm0
1422
1423	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1424
1425	addq	 $2 * SIZE, A1
1426	addq	 $2 * SIZE, A2
1427	addq	 $2 * SIZE, Y1
1428	ALIGN_3
1429
1430.L37:
1431	testq	$1, MM
1432#if GEMV_UNROLL == 2
1433	je	.L38
1434#else
1435	je	.L40
1436#endif
1437
1438	movsd	 -16 * SIZE(Y1), %xmm0
1439
1440	movsd	 -16 * SIZE(A1), %xmm8
1441	movsd	 -16 * SIZE(A2), %xmm9
1442
1443	mulsd	 %xmm12, %xmm8
1444	addsd	 %xmm8,  %xmm0
1445	mulsd	 %xmm13, %xmm9
1446	addsd	 %xmm9,  %xmm0
1447
1448	movsd	 %xmm0, -16 * SIZE(Y1)
1449	ALIGN_3
1450
1451#if GEMV_UNROLL == 2
1452.L38:
1453	cmpq	$2, N
1454	jge	.L31
1455	ALIGN_3
1456
1457#endif
1458
1459.L40:
1460	cmpq	$1, N
1461	jl	.L900
1462#endif
1463
1464	leaq	16 * SIZE(BUFFER), Y1
1465	movq	A,  A1
1466
1467#ifdef HAVE_SSE3
1468	movddup	(X), %xmm12
1469	addq	INCX, X
1470
1471	movddup	ALPHA, %xmm0
1472#else
1473	movsd	(X), %xmm12
1474	unpcklpd %xmm12, %xmm12
1475	addq	INCX, X
1476
1477	movsd	ALPHA, %xmm0
1478	unpcklpd %xmm0, %xmm0
1479#endif
1480
1481	mulpd	%xmm0, %xmm12
1482
1483#ifdef ALIGNED_ACCESS
1484	testq	$SIZE, A
1485	je	.L4X
1486
1487	movsd	 -16 * SIZE(A1), %xmm4
1488	movsd	 -16 * SIZE(Y1), %xmm0
1489
1490	mulsd	 %xmm12, %xmm4
1491	addsd	 %xmm4,  %xmm0
1492
1493	movsd	 %xmm0, -16 * SIZE(Y1)
1494
1495	addq	 $SIZE, A1
1496	addq	 $SIZE, Y1
1497	ALIGN_3
1498
1499.L4X:
1500#endif
1501
1502	movq	MM,  I
1503	sarq	$3,  I
1504	jle	.L45
1505
1506	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
1507	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
1508	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
1509	MOVUPS_A1(-10 * SIZE, A1, %xmm3)
1510
1511	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
1512	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
1513	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
1514	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
1515
1516	decq	 I
1517	jle	 .L44
1518	ALIGN_3
1519
1520.L43:
1521#ifdef PREFETCH
1522	PREFETCH	(PREFETCHSIZE) * 8  - 128 + PREOFFSET(A1)
1523#endif
1524
1525	mulpd	 %xmm12, %xmm0
1526	addpd	 %xmm0,  %xmm8
1527	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
1528	mulpd	 %xmm12, %xmm1
1529	addpd	 %xmm1,  %xmm9
1530	MOVUPS_A1( -6 * SIZE, A1, %xmm1)
1531
1532	mulpd	 %xmm12, %xmm2
1533	addpd	 %xmm2,  %xmm10
1534	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
1535	mulpd	 %xmm12, %xmm3
1536	addpd	 %xmm3,  %xmm11
1537	MOVUPS_A1( -2 * SIZE, A1, %xmm3)
1538
1539#ifdef PREFETCHW
1540	PREFETCHW	(PREFETCHSIZE) * 8  - 128 + PREOFFSET(Y1)
1541#endif
1542
1543	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1544	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1545	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1546	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1547
1548	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
1549	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
1550	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
1551	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
1552
1553	subq	 $-8 * SIZE, A1
1554	subq	 $-8 * SIZE, Y1
1555
1556	subq	 $1, I
1557	BRANCH
1558	jg	.L43
1559	ALIGN_3
1560
1561.L44:
1562	mulpd	 %xmm12, %xmm0
1563	addpd	 %xmm0,  %xmm8
1564	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1565	mulpd	 %xmm12, %xmm1
1566	addpd	 %xmm1,  %xmm9
1567	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1568	mulpd	 %xmm12, %xmm2
1569	addpd	 %xmm2,  %xmm10
1570	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
1571	mulpd	 %xmm12, %xmm3
1572	addpd	 %xmm3,  %xmm11
1573	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
1574
1575	subq	 $-8 * SIZE, A1
1576	subq	 $-8 * SIZE, Y1
1577	ALIGN_3
1578
1579.L45:
1580	testq	$4, MM
1581	je	.L46
1582
1583	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
1584	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
1585
1586	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
1587	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
1588
1589	mulpd	 %xmm12, %xmm0
1590	addpd	 %xmm0,  %xmm8
1591	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
1592	mulpd	 %xmm12, %xmm1
1593	addpd	 %xmm1,  %xmm9
1594	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
1595
1596	addq	 $4 * SIZE, A1
1597	addq	 $4 * SIZE, Y1
1598	ALIGN_3
1599
1600.L46:
1601	testq	$2, MM
1602	je	.L47
1603
1604	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
1605
1606	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1607
1608	mulpd	 %xmm12, %xmm8
1609	addpd	 %xmm8,  %xmm0
1610
1611	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1612
1613	addq	 $2 * SIZE, A1
1614	addq	 $2 * SIZE, Y1
1615	ALIGN_3
1616
1617.L47:
1618	testq	$1, MM
1619	je	.L900
1620
1621	movsd	 -16 * SIZE(Y1), %xmm0
1622	movsd	 -16 * SIZE(A1), %xmm8
1623
1624	mulsd	 %xmm12, %xmm8
1625	addsd	 %xmm8,  %xmm0
1626
1627	movsd	 %xmm0, -16 * SIZE(Y1)
1628	ALIGN_3
1629
1630#ifdef ALIGNED_ACCESS
1631	jmp	.L900
1632	ALIGN_3
1633
1634.L50:
1635#if GEMV_UNROLL >= 4
1636
1637	cmpq	$4, N
1638	jl	.L60
1639	ALIGN_3
1640
1641.L51:
1642
1643	subq	$4, N
1644
1645	leaq	16 * SIZE(BUFFER), Y1
1646	movq	A,  A1
1647	leaq	(A,  LDA, 2), A2
1648	leaq	(A,  LDA, 4), A
1649
1650#ifdef HAVE_SSE3
1651	movddup	(X), %xmm12
1652	addq	INCX, X
1653	movddup	(X), %xmm13
1654	addq	INCX, X
1655	movddup	(X), %xmm14
1656	addq	INCX, X
1657	movddup	(X), %xmm15
1658	addq	INCX, X
1659
1660	movddup	ALPHA, %xmm0
1661#else
1662	movsd	(X), %xmm12
1663	unpcklpd %xmm12, %xmm12
1664	addq	INCX, X
1665	movsd	(X), %xmm13
1666	unpcklpd %xmm13, %xmm13
1667	addq	INCX, X
1668	movsd	(X), %xmm14
1669	unpcklpd %xmm14, %xmm14
1670	addq	INCX, X
1671	movsd	(X), %xmm15
1672	unpcklpd %xmm15, %xmm15
1673	addq	INCX, X
1674
1675	movsd	ALPHA, %xmm0
1676	unpcklpd %xmm0, %xmm0
1677#endif
1678
1679	mulpd	%xmm0, %xmm12
1680	mulpd	%xmm0, %xmm13
1681	mulpd	%xmm0, %xmm14
1682	mulpd	%xmm0, %xmm15
1683
1684	testq	$SIZE, A
1685	je	.L5X
1686
1687	movsd	 -16 * SIZE(A1),         %xmm4
1688	movsd	 -16 * SIZE(A1, LDA),    %xmm5
1689	movsd	 -16 * SIZE(A2),         %xmm6
1690	movsd	 -16 * SIZE(A2, LDA),    %xmm7
1691
1692	movsd	 -16 * SIZE(Y1),         %xmm0
1693
1694	mulsd	 %xmm12, %xmm4
1695	addsd	 %xmm4,  %xmm0
1696	mulsd	 %xmm13, %xmm5
1697	addsd	 %xmm5,  %xmm0
1698	mulsd	 %xmm14, %xmm6
1699	addsd	 %xmm6,  %xmm0
1700	mulsd	 %xmm15, %xmm7
1701	addsd	 %xmm7,  %xmm0
1702
1703	movsd	 %xmm0, -16 * SIZE(Y1)
1704
1705	addq	 $SIZE, A1
1706	addq	 $SIZE, A2
1707	addq	 $SIZE, Y1
1708	ALIGN_3
1709
1710.L5X:
1711	movhpd	-16 * SIZE(A1, LDA), %xmm8
1712	movhpd	-16 * SIZE(A2, LDA), %xmm9
1713
1714	movq	MM,  I
1715	sarq	$3,  I
1716	jle	.L55
1717
1718	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1719	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
1720	MOVUPS_A1(-12 * SIZE, A1, %xmm6)
1721
1722	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1723	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
1724	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
1725	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
1726
1727	decq	 I
1728	jle	 .L54
1729	ALIGN_3
1730
1731.L53:
1732#ifdef PREFETCH
1733	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A1)
1734#endif
1735
1736	mulpd	 %xmm12, %xmm4
1737	addpd	 %xmm4,  %xmm0
1738	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
1739	mulpd	 %xmm12, %xmm5
1740	addpd	 %xmm5,  %xmm1
1741	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)
1742
1743	mulpd	 %xmm12, %xmm6
1744	addpd	 %xmm6,  %xmm2
1745	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
1746	mulpd	 %xmm12, %xmm7
1747	addpd	 %xmm7,  %xmm3
1748	MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)
1749
1750#ifdef PREFETCH
1751	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET + 8(A1, LDA)
1752#endif
1753
1754	shufpd	$1, %xmm4, %xmm8
1755	mulpd	 %xmm13, %xmm8
1756	addpd	 %xmm8,  %xmm0
1757	MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
1758	shufpd	$1, %xmm5, %xmm4
1759	mulpd	 %xmm13, %xmm4
1760	addpd	 %xmm4,  %xmm1
1761	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
1762
1763	shufpd	$1, %xmm6, %xmm5
1764	mulpd	 %xmm13, %xmm5
1765	addpd	 %xmm5,  %xmm2
1766	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
1767	shufpd	$1, %xmm8, %xmm6
1768	mulpd	 %xmm13, %xmm6
1769	addpd	 %xmm6,  %xmm3
1770	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
1771
1772#ifdef PREFETCH
1773	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(A2)
1774#endif
1775
1776	mulpd	 %xmm14, %xmm4
1777	addpd	 %xmm4, %xmm0
1778	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
1779	mulpd	 %xmm14, %xmm5
1780	addpd	 %xmm5, %xmm1
1781	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)
1782
1783	mulpd	 %xmm14, %xmm6
1784	addpd	 %xmm6,  %xmm2
1785	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
1786	mulpd	 %xmm14, %xmm7
1787	addpd	 %xmm7,  %xmm3
1788	MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)
1789
1790#ifdef PREFETCH
1791	PREFETCH	(PREFETCHSIZE) * 2  - 128 + PREOFFSET + 8(A2, LDA)
1792#endif
1793
1794	shufpd	$1, %xmm4, %xmm9
1795	mulpd	 %xmm15, %xmm9
1796	addpd	 %xmm9,  %xmm0
1797	MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
1798	shufpd	$1, %xmm5, %xmm4
1799	mulpd	 %xmm15, %xmm4
1800	addpd	 %xmm4,  %xmm1
1801	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
1802
1803	shufpd	$1, %xmm6, %xmm5
1804	mulpd	 %xmm15, %xmm5
1805	addpd	 %xmm5,  %xmm2
1806	MOVUPS_A1( -6 * SIZE, A1, %xmm5)
1807	shufpd	$1, %xmm9, %xmm6
1808	mulpd	 %xmm15, %xmm6
1809	addpd	 %xmm6,  %xmm3
1810	MOVUPS_A1( -4 * SIZE, A1, %xmm6)
1811
1812#ifdef PREFETCHW
1813	PREFETCHW	(PREFETCHSIZE) * 2  - 128 + PREOFFSET(Y1)
1814#endif
1815
1816	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1817	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1818	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
1819	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
1820
1821	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
1822	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
1823	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
1824	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
1825
1826	subq	 $-8 * SIZE, A1
1827	subq	 $-8 * SIZE, A2
1828	subq	 $-8 * SIZE, Y1
1829
1830	subq	 $1, I
1831	BRANCH
1832	jg	.L53
1833	ALIGN_3
1834
1835.L54:
1836	mulpd	 %xmm12, %xmm4
1837	addpd	 %xmm4,  %xmm0
1838	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
1839	mulpd	 %xmm12, %xmm5
1840	addpd	 %xmm5,  %xmm1
1841	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)
1842
1843	mulpd	 %xmm12, %xmm6
1844	addpd	 %xmm6,  %xmm2
1845	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
1846	mulpd	 %xmm12, %xmm7
1847	addpd	 %xmm7,  %xmm3
1848	MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)
1849
1850	shufpd	$1, %xmm4, %xmm8
1851	mulpd	 %xmm13, %xmm8
1852	addpd	 %xmm8,  %xmm0
1853	MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
1854	shufpd	$1, %xmm5, %xmm4
1855	mulpd	 %xmm13, %xmm4
1856	addpd	 %xmm4,  %xmm1
1857	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
1858
1859	shufpd	$1, %xmm6, %xmm5
1860	mulpd	 %xmm13, %xmm5
1861	addpd	 %xmm5,  %xmm2
1862	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
1863	shufpd	$1, %xmm8, %xmm6
1864	mulpd	 %xmm13, %xmm6
1865	addpd	 %xmm6,  %xmm3
1866	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
1867
1868	mulpd	 %xmm14, %xmm4
1869	addpd	 %xmm4, %xmm0
1870	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
1871	mulpd	 %xmm14, %xmm5
1872	addpd	 %xmm5, %xmm1
1873	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)
1874
1875	mulpd	 %xmm14, %xmm6
1876	addpd	 %xmm6,  %xmm2
1877	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
1878	mulpd	 %xmm14, %xmm7
1879	addpd	 %xmm7,  %xmm3
1880	MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)
1881
1882	shufpd	$1, %xmm4, %xmm9
1883	mulpd	 %xmm15, %xmm9
1884	addpd	 %xmm9,  %xmm0
1885	MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
1886
1887	shufpd	$1, %xmm5, %xmm4
1888	mulpd	 %xmm15, %xmm4
1889	addpd	 %xmm4,  %xmm1
1890	shufpd	$1, %xmm6, %xmm5
1891	mulpd	 %xmm15, %xmm5
1892	addpd	 %xmm5,  %xmm2
1893	shufpd	$1, %xmm9, %xmm6
1894	mulpd	 %xmm15, %xmm6
1895	addpd	 %xmm6,  %xmm3
1896
1897	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1898	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1899	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
1900	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
1901
1902	subq	 $-8 * SIZE, A1
1903	subq	 $-8 * SIZE, A2
1904	subq	 $-8 * SIZE, Y1
1905	ALIGN_3
1906
1907.L55:
1908	testq	$4, MM
1909	je	.L56
1910
1911	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1912	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
1913
1914	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1915	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
1916
1917	mulpd	 %xmm12, %xmm4
1918	addpd	 %xmm4,  %xmm0
1919	mulpd	 %xmm12, %xmm5
1920	addpd	 %xmm5,  %xmm1
1921
1922	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6)
1923	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7)
1924
1925	shufpd	$1, %xmm6, %xmm8
1926	mulpd	 %xmm13, %xmm8
1927	addpd	 %xmm8,  %xmm0
1928	movaps	 %xmm7,  %xmm8
1929	shufpd	$1, %xmm7, %xmm6
1930	mulpd	 %xmm13, %xmm6
1931	addpd	 %xmm6,  %xmm1
1932
1933	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
1934	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
1935
1936	mulpd	 %xmm14, %xmm4
1937	addpd	 %xmm4,  %xmm0
1938	mulpd	 %xmm14, %xmm5
1939	addpd	 %xmm5,  %xmm1
1940
1941	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6)
1942	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7)
1943
1944	shufpd	$1, %xmm6, %xmm9
1945	mulpd	 %xmm15, %xmm9
1946	addpd	 %xmm9,  %xmm0
1947	movaps	 %xmm7,  %xmm9
1948	shufpd	$1, %xmm7, %xmm6
1949	mulpd	 %xmm15, %xmm6
1950	addpd	 %xmm6,  %xmm1
1951
1952	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1953	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
1954
1955	addq	 $4 * SIZE, A1
1956	addq	 $4 * SIZE, A2
1957	addq	 $4 * SIZE, Y1
1958	ALIGN_3
1959
1960.L56:
1961	testq	$2, MM
1962	je	.L57
1963
1964	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
1965	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
1966	MOVUPS_A1(-16 * SIZE, A2, %xmm6)
1967	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
1968
1969	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
1970
1971	mulpd	 %xmm12, %xmm4
1972	addpd	 %xmm4,  %xmm0
1973	shufpd	$1, %xmm5, %xmm8
1974	mulpd	 %xmm13, %xmm8
1975	addpd	 %xmm8,  %xmm0
1976	movaps	 %xmm5,  %xmm8
1977	mulpd	 %xmm14, %xmm6
1978	addpd	 %xmm6,  %xmm0
1979	shufpd	$1, %xmm7, %xmm9
1980	mulpd	 %xmm15, %xmm9
1981	addpd	 %xmm9,  %xmm0
1982	movaps	 %xmm7,  %xmm9
1983
1984	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
1985
1986	addq	 $2 * SIZE, A1
1987	addq	 $2 * SIZE, A2
1988	addq	 $2 * SIZE, Y1
1989	ALIGN_3
1990
1991.L57:
1992	testq	$1, MM
1993	je	.L58
1994
1995	movsd	 -16 * SIZE(Y1), %xmm0
1996
1997	movsd	 -16 * SIZE(A1), %xmm4
1998	shufpd	 $1, %xmm8, %xmm8
1999	movsd	 -16 * SIZE(A2), %xmm6
2000	shufpd	 $1, %xmm9, %xmm9
2001
2002	mulsd	 %xmm12, %xmm4
2003	addsd	 %xmm4,  %xmm0
2004	mulsd	 %xmm13, %xmm8
2005	addsd	 %xmm8,  %xmm0
2006	mulsd	 %xmm14, %xmm6
2007	addsd	 %xmm6,  %xmm0
2008	mulsd	 %xmm15, %xmm9
2009	addsd	 %xmm9,  %xmm0
2010
2011	movsd	 %xmm0, -16 * SIZE(Y1)
2012	ALIGN_3
2013
2014.L58:
2015	cmpq	$4, N
2016	jge	.L51
2017	ALIGN_3
2018
2019.L60:
2020#endif
2021
2022#if GEMV_UNROLL >= 2
2023
2024	cmpq	$2, N
2025	jl	.L70
2026
2027#if GEMV_UNROLL == 2
2028	ALIGN_3
2029
2030.L61:
2031#endif
2032
2033	subq	$2, N
2034
2035	leaq	16 * SIZE(BUFFER), Y1
2036	movq	A,  A1
2037	leaq	(A,  LDA), A2
2038	leaq	(A,  LDA, 2), A
2039
2040#ifdef HAVE_SSE3
2041	movddup	(X), %xmm12
2042	addq	INCX, X
2043	movddup	(X), %xmm13
2044	addq	INCX, X
2045
2046	movddup	ALPHA, %xmm0
2047#else
2048	movsd	(X), %xmm12
2049	unpcklpd %xmm12, %xmm12
2050	addq	INCX, X
2051	movsd	(X), %xmm13
2052	unpcklpd %xmm13, %xmm13
2053	addq	INCX, X
2054
2055	movsd	ALPHA, %xmm0
2056	unpcklpd %xmm0, %xmm0
2057#endif
2058
2059	mulpd	%xmm0, %xmm12
2060	mulpd	%xmm0, %xmm13
2061
2062	testq	$SIZE, A
2063	je	.L6X
2064
2065	movsd	 -16 * SIZE(A1), %xmm4
2066	movsd	 -16 * SIZE(A2), %xmm5
2067
2068	movsd	 -16 * SIZE(Y1), %xmm0
2069
2070	mulsd	 %xmm12, %xmm4
2071	addsd	 %xmm4,  %xmm0
2072	mulsd	 %xmm13, %xmm5
2073	addsd	 %xmm5,  %xmm0
2074
2075	movsd	 %xmm0, -16 * SIZE(Y1)
2076
2077	addq	 $SIZE, A1
2078	addq	 $SIZE, A2
2079	addq	 $SIZE, Y1
2080	ALIGN_3
2081
2082.L6X:
2083	movhpd	-16 * SIZE(A2), %xmm8
2084
2085	movq	MM,  I
2086	sarq	$3,  I
2087	jle	.L65
2088
2089	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
2090	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
2091	MOVUPS_A1(-12 * SIZE, A1, %xmm6)
2092
2093	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
2094	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
2095	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
2096	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
2097
2098	decq	 I
2099	jle	 .L64
2100	ALIGN_3
2101
2102.L63:
2103#ifdef PREFETCH
2104	PREFETCH	(PREFETCHSIZE) * 4  - 128 + PREOFFSET(A1)
2105#endif
2106
2107	mulpd	 %xmm12, %xmm4
2108	addpd	 %xmm4,  %xmm0
2109	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
2110	mulpd	 %xmm12, %xmm5
2111	addpd	 %xmm5,  %xmm1
2112	MOVUPS_A1(-15 * SIZE, A2, %xmm4)
2113
2114	mulpd	 %xmm12, %xmm6
2115	addpd	 %xmm6,  %xmm2
2116	MOVUPS_A1(-13 * SIZE, A2, %xmm5)
2117	mulpd	 %xmm12, %xmm7
2118	addpd	 %xmm7,  %xmm3
2119	MOVUPS_A1(-11 * SIZE, A2, %xmm6)
2120
2121#ifdef PREFETCH
2122	PREFETCH	(PREFETCHSIZE) * 4  - 128 + PREOFFSET + 8(A2)
2123#endif
2124
2125	shufpd	$1, %xmm4, %xmm8
2126	mulpd	 %xmm13, %xmm8
2127	addpd	 %xmm8,  %xmm0
2128	MOVUPS_A1( -9 * SIZE, A2, %xmm8)
2129	shufpd	$1, %xmm5, %xmm4
2130	mulpd	 %xmm13, %xmm4
2131	addpd	 %xmm4,  %xmm1
2132	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
2133
2134	shufpd	$1, %xmm6, %xmm5
2135	mulpd	 %xmm13, %xmm5
2136	addpd	 %xmm5,  %xmm2
2137	MOVUPS_A1( -6 * SIZE, A1, %xmm5)
2138	shufpd	$1, %xmm8, %xmm6
2139	mulpd	 %xmm13, %xmm6
2140	addpd	 %xmm6,  %xmm3
2141	MOVUPS_A1( -4 * SIZE, A1, %xmm6)
2142
2143#ifdef PREFETCHW
2144	PREFETCHW	(PREFETCHSIZE) * 4  - 128 + PREOFFSET(Y1)
2145#endif
2146
2147	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
2148	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
2149	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
2150	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
2151
2152	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
2153	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
2154	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
2155	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
2156
2157	subq	 $-8 * SIZE, A1
2158	subq	 $-8 * SIZE, A2
2159	subq	 $-8 * SIZE, Y1
2160
2161	subq	 $1, I
2162	BRANCH
2163	jg	.L63
2164	ALIGN_3
2165
2166.L64:
2167	mulpd	 %xmm12, %xmm4
2168	addpd	 %xmm4,  %xmm0
2169	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
2170	mulpd	 %xmm12, %xmm5
2171	addpd	 %xmm5,  %xmm1
2172	MOVUPS_A1(-15 * SIZE, A2, %xmm4)
2173
2174	mulpd	 %xmm12, %xmm6
2175	addpd	 %xmm6,  %xmm2
2176	MOVUPS_A1(-13 * SIZE, A2, %xmm5)
2177	mulpd	 %xmm12, %xmm7
2178	addpd	 %xmm7,  %xmm3
2179	MOVUPS_A1(-11 * SIZE, A2, %xmm6)
2180
2181	shufpd	$1, %xmm4, %xmm8
2182	mulpd	 %xmm13, %xmm8
2183	addpd	 %xmm8,  %xmm0
2184	MOVUPS_A1( -9 * SIZE, A2, %xmm8)
2185	shufpd	$1, %xmm5, %xmm4
2186	mulpd	 %xmm13, %xmm4
2187	addpd	 %xmm4,  %xmm1
2188
2189	shufpd	$1, %xmm6, %xmm5
2190	mulpd	 %xmm13, %xmm5
2191	addpd	 %xmm5,  %xmm2
2192	shufpd	$1, %xmm8, %xmm6
2193	mulpd	 %xmm13, %xmm6
2194	addpd	 %xmm6,  %xmm3
2195
2196	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
2197	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
2198	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
2199	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
2200
2201	subq	 $-8 * SIZE, A1
2202	subq	 $-8 * SIZE, A2
2203	subq	 $-8 * SIZE, Y1
2204	ALIGN_3
2205
2206.L65:
2207	testq	$4, MM
2208	je	.L66
2209
2210
2211	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
2212	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
2213
2214	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
2215	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
2216
2217	mulpd	 %xmm12, %xmm4
2218	addpd	 %xmm4,  %xmm0
2219	mulpd	 %xmm12, %xmm5
2220	addpd	 %xmm5,  %xmm1
2221
2222	MOVUPS_A1(-15 * SIZE, A2, %xmm6)
2223	MOVUPS_A1(-13 * SIZE, A2, %xmm7)
2224
2225	shufpd	$1, %xmm6, %xmm8
2226	mulpd	 %xmm13, %xmm8
2227	addpd	 %xmm8,  %xmm0
2228	movaps	 %xmm7,  %xmm8
2229	shufpd	$1, %xmm7, %xmm6
2230	mulpd	 %xmm13, %xmm6
2231	addpd	 %xmm6,  %xmm1
2232
2233	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
2234	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
2235
2236	addq	 $4 * SIZE, A1
2237	addq	 $4 * SIZE, A2
2238	addq	 $4 * SIZE, Y1
2239	ALIGN_3
2240
2241.L66:
2242	testq	$2, MM
2243	je	.L67
2244
2245	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
2246	MOVUPS_A1(-15 * SIZE, A2, %xmm5)
2247
2248	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
2249
2250	mulpd	 %xmm12, %xmm4
2251	addpd	 %xmm4,  %xmm0
2252	shufpd	$1, %xmm5, %xmm8
2253	mulpd	 %xmm13, %xmm8
2254	addpd	 %xmm8,  %xmm0
2255	movaps	 %xmm5,  %xmm8
2256
2257	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
2258
2259	addq	 $2 * SIZE, A1
2260	addq	 $2 * SIZE, A2
2261	addq	 $2 * SIZE, Y1
2262	ALIGN_3
2263
2264.L67:
2265	testq	$1, MM
2266#if GEMV_UNROLL == 2
2267	je	.L68
2268#else
2269	je	.L70
2270#endif
2271
2272	movsd	 -16 * SIZE(Y1), %xmm0
2273
2274	movsd	 -16 * SIZE(A1), %xmm4
2275	shufpd	 $1, %xmm8, %xmm8
2276
2277	mulsd	 %xmm12, %xmm4
2278	addsd	 %xmm4,  %xmm0
2279	mulsd	 %xmm13, %xmm8
2280	addsd	 %xmm8,  %xmm0
2281
2282	movsd	 %xmm0, -16 * SIZE(Y1)
2283	ALIGN_3
2284
2285#if GEMV_UNROLL == 2
2286.L68:
2287	cmpq	$2, N
2288	jge	.L61
2289	ALIGN_3
2290
2291#endif
2292
2293.L70:
2294	cmpq	$1, N
2295	jl	.L900
2296
2297#endif
2298
2299	leaq	16 * SIZE(BUFFER), Y1
2300	movq	A,  A1
2301
2302#ifdef HAVE_SSE3
2303	movddup	(X), %xmm12
2304	addq	INCX, X
2305
2306	movddup	ALPHA, %xmm0
2307#else
2308	movsd	(X), %xmm12
2309	unpcklpd %xmm12, %xmm12
2310	addq	INCX, X
2311
2312	movsd	ALPHA, %xmm0
2313	unpcklpd %xmm0, %xmm0
2314#endif
2315
2316	mulpd	%xmm0, %xmm12
2317
2318	testq	$SIZE, A
2319	je	.L7X
2320
2321	movsd	 -16 * SIZE(A1), %xmm4
2322	movsd	 -16 * SIZE(Y1), %xmm0
2323
2324	mulsd	 %xmm12, %xmm4
2325	addsd	 %xmm4,  %xmm0
2326
2327	movsd	 %xmm0, -16 * SIZE(Y1)
2328
2329	addq	 $SIZE, A1
2330	addq	 $SIZE, Y1
2331	ALIGN_3
2332
2333.L7X:
2334
2335	movq	MM,  I
2336	sarq	$3,  I
2337	jle	.L75
2338
2339	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
2340	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
2341	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
2342	MOVUPS_A1(-10 * SIZE, A1, %xmm3)
2343
2344	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
2345	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
2346	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
2347	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
2348
2349	decq	 I
2350	jle	 .L74
2351	ALIGN_3
2352
.L73:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 8  - 128 + PREOFFSET(A1)
#endif

	mulpd	 %xmm12, %xmm0
	addpd	 %xmm0,  %xmm8
	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
	mulpd	 %xmm12, %xmm1
	addpd	 %xmm1,  %xmm9
	MOVUPS_A1( -6 * SIZE, A1, %xmm1)

	mulpd	 %xmm12, %xmm2
	addpd	 %xmm2,  %xmm10
	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
	mulpd	 %xmm12, %xmm3
	addpd	 %xmm3,  %xmm11
	MOVUPS_A1( -2 * SIZE, A1, %xmm3)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 8  - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1

	subq	 $1, I
	BRANCH
	jg	.L73
	ALIGN_3

.L74:
	mulpd	 %xmm12, %xmm0
	addpd	 %xmm0,  %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	 %xmm12, %xmm1
	addpd	 %xmm1,  %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	mulpd	 %xmm12, %xmm2
	addpd	 %xmm2,  %xmm10
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	mulpd	 %xmm12, %xmm3
	addpd	 %xmm3,  %xmm11
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	subq	 $-8 * SIZE, A1
	subq	 $-8 * SIZE, Y1
	ALIGN_3

.L75:
	testq	$4, MM
	je	.L76

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)

	mulpd	 %xmm12, %xmm0
	addpd	 %xmm0,  %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	 %xmm12, %xmm1
	addpd	 %xmm1,  %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, Y1
	ALIGN_3

.L76:
	testq	$2, MM
	je	.L77

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	 %xmm12, %xmm8
	addpd	 %xmm8,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	 $2 * SIZE, A1
	addq	 $2 * SIZE, Y1
	ALIGN_3

.L77:
	testq	$1, MM
	je	.L900

	movsd	 -16 * SIZE(Y1), %xmm0
	movsd	 -16 * SIZE(A1), %xmm8

	mulsd	 %xmm12, %xmm8
	addsd	 %xmm8,  %xmm0

	movsd	 %xmm0, -16 * SIZE(Y1)
#endif
	ALIGN_3

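/* Add the accumulated buffer back into the user's y vector.  If the   */
/* buffer is y itself (and no forced copy), there is nothing to do.    */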
.L900:
#ifndef COPY_FORCE
	cmpq	Y, BUFFER
	je	.L999
#endif
	movq	M,   TMP_M
	movq	Y,   Y1

	cmpq	$SIZE, INCY
	jne	.L950

	testq	$SIZE, Y1
	je	.L910

	movsd	(Y1), %xmm0
	addsd	(BUFFER), %xmm0
	movsd	%xmm0, (Y1)

	addq	$SIZE, Y1
	addq	$SIZE, BUFFER

	decq	TMP_M
	jle	.L999
	ALIGN_4

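/* Unit-stride y: if the buffer is also 16-byte aligned here, add      */
/* eight doubles per iteration with aligned loads and stores.          */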
.L910:
	testq	$SIZE, BUFFER
	jne	.L920

	movq	TMP_M,   %rax
	sarq	$3,  %rax
	jle	.L914
	ALIGN_3

.L912:
#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1
	movapd	4 * SIZE(Y1), %xmm2
	movapd	6 * SIZE(Y1), %xmm3

	movapd	0 * SIZE(BUFFER), %xmm4
	movapd	2 * SIZE(BUFFER), %xmm5
	movapd	4 * SIZE(BUFFER), %xmm6
	movapd	6 * SIZE(BUFFER), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 + PREOFFSET(BUFFER)
#endif

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1
	addpd	%xmm6, %xmm2
	addpd	%xmm7, %xmm3

	movapd	%xmm0,  0 * SIZE(Y1)
	movapd	%xmm1,  2 * SIZE(Y1)
	movapd	%xmm2,  4 * SIZE(Y1)
	movapd	%xmm3,  6 * SIZE(Y1)

	addq	$8 * SIZE, Y1
	addq	$8 * SIZE, BUFFER

	decq	%rax
	jg	.L912
	ALIGN_3

.L914:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L915

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1

	movapd	0 * SIZE(BUFFER), %xmm4
	movapd	2 * SIZE(BUFFER), %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movapd	%xmm0,  0 * SIZE(Y1)
	movapd	%xmm1,  2 * SIZE(Y1)

	addq	$4 * SIZE, Y1
	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L915:
	testq	$2, TMP_M
	jle	.L916

	movapd	(Y1), %xmm0

	movapd	(BUFFER), %xmm4

	addpd	%xmm4, %xmm0

	movapd	%xmm0,  (Y1)

	addq	$2 * SIZE, Y1
	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L916:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y1), %xmm0

	movsd	0 * SIZE(BUFFER), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

	jmp	.L999
	ALIGN_4

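/* Buffer misaligned with respect to y: load aligned pairs one double  */
/* ahead and realign them with shufpd, carrying the straddling element */
/* in %xmm4 across iterations.                                         */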
.L920:
	movapd	-1 * SIZE(BUFFER), %xmm4

	movq	TMP_M,   %rax
	sarq	$3,  %rax
	jle	.L924
	ALIGN_3

.L922:
#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

	movapd	 0 * SIZE(Y1), %xmm0
	movapd	 2 * SIZE(Y1), %xmm1
	movapd	 4 * SIZE(Y1), %xmm2
	movapd	 6 * SIZE(Y1), %xmm3

	movapd	1 * SIZE(BUFFER), %xmm5
	movapd	3 * SIZE(BUFFER), %xmm6
	movapd	5 * SIZE(BUFFER), %xmm7
	movapd	7 * SIZE(BUFFER), %xmm8

	shufpd	 $1, %xmm5, %xmm4
	shufpd	 $1, %xmm6, %xmm5
	shufpd	 $1, %xmm7, %xmm6
	shufpd	 $1, %xmm8, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 + PREOFFSET(BUFFER)
#endif

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1
	addpd	%xmm6, %xmm2
	addpd	%xmm7, %xmm3

	movapd	%xmm0,  0 * SIZE(Y1)
	movapd	%xmm1,  2 * SIZE(Y1)
	movapd	%xmm2,  4 * SIZE(Y1)
	movapd	%xmm3,  6 * SIZE(Y1)

	movapd	%xmm8, %xmm4

	addq	$8 * SIZE, Y1
	addq	$8 * SIZE, BUFFER

	decq	%rax
	jg	.L922
	ALIGN_3

.L924:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L925

	movapd	 0 * SIZE(Y1), %xmm0
	movapd	 2 * SIZE(Y1), %xmm1

	movapd	1 * SIZE(BUFFER), %xmm5
	movapd	3 * SIZE(BUFFER), %xmm6

	shufpd	 $1, %xmm5, %xmm4
	shufpd	 $1, %xmm6, %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movapd	%xmm0,  0 * SIZE(Y1)
	movapd	%xmm1,  2 * SIZE(Y1)

	movapd	%xmm6, %xmm4

	addq	$4 * SIZE, Y1
	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L925:
	testq	$2, TMP_M
	jle	.L926

	movapd	 (Y1), %xmm0

	movapd	 1 * SIZE(BUFFER), %xmm5

	shufpd	 $1, %xmm5, %xmm4

	addpd	%xmm4, %xmm0

	movapd	%xmm0,  (Y1)

	movaps	%xmm5, %xmm4

	addq	$2 * SIZE, Y1
	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L926:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y1), %xmm0

	shufpd	$1, %xmm4, %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

	jmp	.L999
	ALIGN_4

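/* Non-unit INCY: consume one leading buffer element if needed so the  */
/* buffer is aligned, then gather y in pairs with movsd/movhpd, add    */
/* the buffered results and scatter them back the same way.            */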
.L950:
	testq	$SIZE, BUFFER
	je	.L960

	movsd	(Y1), %xmm0
	addsd	(BUFFER), %xmm0
	movsd	%xmm0, (Y1)

	addq	INCY, Y1
	addq	$SIZE, BUFFER

	decq	TMP_M
	jle	.L999
	ALIGN_4

.L960:
	movq	Y1,  Y2

	movq	TMP_M,   %rax
	sarq	$3,  %rax
	jle	.L964
	ALIGN_3

.L962:
	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	movsd	(Y2), %xmm1
	addq	INCY, Y2
	movhpd	(Y2), %xmm1
	addq	INCY, Y2

	movapd	2 * SIZE(BUFFER), %xmm5

	movsd	(Y2), %xmm2
	addq	INCY, Y2
	movhpd	(Y2), %xmm2
	addq	INCY, Y2

	movapd	4 * SIZE(BUFFER), %xmm6

	addpd	%xmm4, %xmm0

	movsd	(Y2), %xmm3
	addq	INCY, Y2
	movhpd	(Y2), %xmm3
	addq	INCY, Y2

	movapd	6 * SIZE(BUFFER), %xmm7

	addpd	%xmm5, %xmm1

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1

	addpd	%xmm6, %xmm2

	movlpd	%xmm1, (Y1)
	addq	INCY, Y1
	movhpd	%xmm1, (Y1)
	addq	INCY, Y1

	addpd	%xmm7, %xmm3

	movlpd	%xmm2, (Y1)
	addq	INCY, Y1
	movhpd	%xmm2, (Y1)
	addq	INCY, Y1
	movlpd	%xmm3, (Y1)
	addq	INCY, Y1
	movhpd	%xmm3, (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	decq	%rax
	jg	.L962
	ALIGN_3

.L964:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L965

	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	movsd	(Y2), %xmm1
	addq	INCY, Y2
	movhpd	(Y2), %xmm1
	addq	INCY, Y2

	movapd	2 * SIZE(BUFFER), %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1
	movlpd	%xmm1, (Y1)
	addq	INCY, Y1
	movhpd	%xmm1, (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L965:
	testq	$2, TMP_M
	jle	.L966

	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1

	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L966:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y2), %xmm0

	movsd	0 * SIZE(BUFFER), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

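/* Advance A and y past the rows handled in this pass and loop back    */
/* to .L0t for the next block of rows.                                 */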
.L999:
	leaq	(, M, SIZE), %rax
	addq	%rax, AA
	movq	STACK_INCY, INCY
	imulq	INCY, %rax
	addq	%rax, Y
	jmp	.L0t
	ALIGN_4

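/* Epilogue: restore the callee-saved general registers (plus %rdi,    */
/* %rsi and %xmm6-%xmm15 under the Windows ABI), release the stack     */
/* frame and return.                                                   */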
.L999x:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp

	ret
	EPILOGUE
