/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define N	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif

#include "l1param.h"

#undef movsd

#ifndef OPTERON
#define MOVLPS	movsd
#else
#define MOVLPS	movlps
#endif

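/* Note (added): this kernel appears to implement the double-precision */
/* complex dot product: ZDOTU by default, ZDOTC when CONJ is defined.  */
/* For each element pair x = (xr, xi), y = (yr, yi) it accumulates     */
/*     xmm0/xmm2 += (xr*yr, xi*yi)                                     */
/*     xmm1/xmm3 += (xr*yi, xi*yr)                                     */
/* with pshufd $0x4e swapping the two halves of one operand to form    */
/* the cross terms.  Illustrative reference loop (sketch only, not     */
/* part of the original source):                                       */
/*     for (i = 0; i < n; i++) {                                       */
/*         rr += xr[i]*yr[i];  ii += xi[i]*yi[i];                      */
/*         ri += xr[i]*yi[i];  ir += xi[i]*yr[i];                      */
/*     }                                                               */
/*     ZDOTU: (rr - ii) + I*(ri + ir);  ZDOTC: (rr + ii) + I*(ri - ir) */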

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

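/* Added note: N <= 0 returns zero.  Non-unit complex strides take the */
/* generic path at .L50.  For unit stride, both pointers are biased by */
/* +16*SIZE (so the loops can use negative offsets) and the alignment  */
/* of X and Y selects one of four paths: both 16-byte aligned (fall    */
/* through), only X misaligned (.L20), only Y misaligned (.L30), or    */
/* both misaligned (.L40).                                             */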
	cmpq	$0, N
	jle	.L999

	cmpq	$2 * SIZE, INCX
	jne	.L50
	cmpq	$2 * SIZE, INCY
	jne	.L50

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	testq	$SIZE, Y
	jne	.L30

	testq	$SIZE, X
	jne	.L20

	movq	N,  %rax
	sarq	$3, %rax
	jle	.L15

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm8
	movaps	-14 * SIZE(Y), %xmm9
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7
	movaps	-12 * SIZE(Y), %xmm10
	movaps	-10 * SIZE(Y), %xmm11

	decq	%rax
	jle	.L12
	ALIGN_3

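/* Added note: main loop for the aligned unit-stride case.  Eight      */
/* complex elements (16 doubles) are processed per iteration, and the  */
/* next batch of vectors is loaded while the current one is being      */
/* multiplied and accumulated.                                         */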
.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	 -8 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	 -6 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	 -4 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	 -2 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	  0 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	  0 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	  2 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	  2 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	  4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	  4 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	  6 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	  6 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L11
	ALIGN_3

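/* Added note: loop tail, draining the eight elements preloaded for    */
/* the final iteration.                                                */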
.L12:
	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	 -8 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	 -6 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	 -4 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	 -2 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

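/* Added note: .L15, .L16 and .L17 handle the remaining 4, 2 and 1     */
/* complex elements of the aligned path before the final reduction.    */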
.L15:
	testq	$4, N
	jle	.L16

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8
	movaps	-14 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm9

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	movaps	-12 * SIZE(X), %xmm6
	movaps	-12 * SIZE(Y), %xmm10
	movaps	-10 * SIZE(X), %xmm7
	movaps	-10 * SIZE(Y), %xmm11

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, N
	jle	.L17

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8
	movaps	-14 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm9

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, N
	jle	.L98

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3

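/* Added note: unit stride with Y 16-byte aligned but X misaligned;    */
/* X is loaded with MOVLPS/movhps pairs instead of movaps.             */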
.L20:
	movq	N,  %rax
	sarq	$3, %rax
	jle	.L25

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm8
	movaps	-14 * SIZE(Y), %xmm9
	MOVLPS	-12 * SIZE(X), %xmm6
	movhps	-11 * SIZE(X), %xmm6
	MOVLPS	-10 * SIZE(X), %xmm7
	movhps	 -9 * SIZE(X), %xmm7
	movaps	-12 * SIZE(Y), %xmm10
	movaps	-10 * SIZE(Y), %xmm11

	decq	%rax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	 -4 * SIZE(X), %xmm6
	movhps	 -3 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	 -2 * SIZE(X), %xmm7
	movhps	 -1 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	  0 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	  0 * SIZE(X), %xmm4
	movhps	  1 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	  2 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	  2 * SIZE(X), %xmm5
	movhps	  3 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	  4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	  4 * SIZE(X), %xmm6
	movhps	  5 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	  6 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	  6 * SIZE(X), %xmm7
	movhps	  7 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L21
	ALIGN_3

.L22:

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	 -4 * SIZE(X), %xmm6
	movhps	 -3 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	 -2 * SIZE(X), %xmm7
	movhps	 -1 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, N
	jle	.L26

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm9

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	MOVLPS	-12 * SIZE(X), %xmm6
	movhps	-11 * SIZE(X), %xmm6
	movaps	-12 * SIZE(Y), %xmm10

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-10 * SIZE(X), %xmm7
	movhps	 -9 * SIZE(X), %xmm7
	movaps	-10 * SIZE(Y), %xmm11

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, N
	jle	.L27

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm9

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, N
	jle	.L98

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3

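/* Added note: unit stride with X aligned but Y misaligned (if X is    */
/* also misaligned, control branches on to .L40).  The roles are       */
/* swapped here: Y is loaded unaligned into xmm4-xmm7 and X aligned    */
/* into xmm8-xmm11, so the cross-term sums come out with their halves  */
/* reversed; .L37 swaps xmm1/xmm3 back with SHUFPD before reducing.    */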
.L30:
	testq	$SIZE, X
	jne	.L40

	movq	N,  %rax
	sarq	$3, %rax
	jle	.L35

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-16 * SIZE(X), %xmm8
	movaps	-14 * SIZE(X), %xmm9
	MOVLPS	-12 * SIZE(Y), %xmm6
	movhps	-11 * SIZE(Y), %xmm6
	MOVLPS	-10 * SIZE(Y), %xmm7
	movhps	 -9 * SIZE(Y), %xmm7
	movaps	-12 * SIZE(X), %xmm10
	movaps	-10 * SIZE(X), %xmm11

	decq	%rax
	jle	.L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(X), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	 -8 * SIZE(Y), %xmm4
	movhps	 -7 * SIZE(Y), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(X), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	 -6 * SIZE(Y), %xmm5
	movhps	 -5 * SIZE(Y), %xmm5
	addpd	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(X), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	 -4 * SIZE(Y), %xmm6
	movhps	 -3 * SIZE(Y), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(X), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	 -2 * SIZE(Y), %xmm7
	movhps	 -1 * SIZE(Y), %xmm7
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	  0 * SIZE(X), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	  0 * SIZE(Y), %xmm4
	movhps	  1 * SIZE(Y), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	  2 * SIZE(X), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	  2 * SIZE(Y), %xmm5
	movhps	  3 * SIZE(Y), %xmm5
	addpd	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	  4 * SIZE(X), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	  4 * SIZE(Y), %xmm6
	movhps	  5 * SIZE(Y), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	  6 * SIZE(X), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	  6 * SIZE(Y), %xmm7
	movhps	  7 * SIZE(Y), %xmm7
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L31
	ALIGN_3

.L32:

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -8 * SIZE(X), %xmm8
	mulpd	%xmm4,  %xmm12
	MOVLPS	 -8 * SIZE(Y), %xmm4
	movhps	 -7 * SIZE(Y), %xmm4
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	movaps	 -6 * SIZE(X), %xmm9
	mulpd	%xmm5,  %xmm12
	MOVLPS	 -6 * SIZE(Y), %xmm5
	movhps	 -5 * SIZE(Y), %xmm5
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -4 * SIZE(X), %xmm10
	mulpd	%xmm6,  %xmm12
	MOVLPS	 -4 * SIZE(Y), %xmm6
	movhps	 -3 * SIZE(Y), %xmm6
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	movaps	 -2 * SIZE(X), %xmm11
	mulpd	%xmm7,  %xmm12
	MOVLPS	 -2 * SIZE(Y), %xmm7
	movhps	 -1 * SIZE(Y), %xmm7
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, N
	jle	.L36

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-14 * SIZE(X), %xmm9

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	MOVLPS	-12 * SIZE(Y), %xmm6
	movhps	-11 * SIZE(Y), %xmm6
	movaps	-12 * SIZE(X), %xmm10

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-10 * SIZE(Y), %xmm7
	movhps	 -9 * SIZE(Y), %xmm7
	movaps	-10 * SIZE(X), %xmm11

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, N
	jle	.L37

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-14 * SIZE(X), %xmm9

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L37:
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm3, %xmm3

	testq	$1, N
	jle	.L98

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	SHUFPD_1 %xmm12, %xmm12
	addpd	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3

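/* Added note: unit stride with both X and Y misaligned by one double. */
/* The leading double of each vector is loaded with movhps, the        */
/* pointers are advanced by one double, and every subsequent 128-bit   */
/* vector is stitched from two aligned loads with register-to-register */
/* movsd, so all memory accesses stay aligned.  This leaves real and   */
/* imaginary parts swapped within the accumulators, which .L48 undoes  */
/* with SHUFPD before the reduction.                                   */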
.L40:
	movhps	-16 * SIZE(X), %xmm4
	addq	$SIZE, X
	movhps	-16 * SIZE(Y), %xmm8
	addq	$SIZE, Y

	movq	N,  %rax
	sarq	$3, %rax
	jle	.L45

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm9
	movaps	-14 * SIZE(X), %xmm6
	movaps	-14 * SIZE(Y), %xmm10
	movaps	-12 * SIZE(X), %xmm7
	movaps	-12 * SIZE(Y), %xmm11
	decq	%rax
	jle	.L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	-10 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	-10 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	 -8 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movsd	%xmm11, %xmm10
	pshufd	$0x4e,  %xmm10, %xmm12
	movsd	%xmm7,  %xmm6
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -6 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	 -6 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	movsd	%xmm8, %xmm11
	pshufd	$0x4e,  %xmm11, %xmm12
	movsd	%xmm4,  %xmm7
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11,  %xmm0
	movaps	 -4 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	 -4 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -2 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	 -2 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	movaps	  0 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	  0 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movsd	%xmm11, %xmm10
	pshufd	$0x4e,  %xmm10, %xmm12
	movsd	%xmm7,  %xmm6
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	  2 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	  2 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	movsd	%xmm8, %xmm11
	pshufd	$0x4e,  %xmm11, %xmm12
	movsd	%xmm4,  %xmm7
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11,  %xmm0
	movaps	  4 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	  4 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm1

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	-10 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	-10 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	movaps	 -8 * SIZE(Y), %xmm9
	mulpd	%xmm5,  %xmm12
	movaps	 -8 * SIZE(X), %xmm5
	addpd	%xmm12, %xmm1

	movsd	%xmm11, %xmm10
	pshufd	$0x4e,  %xmm10, %xmm12
	movsd	%xmm7,  %xmm6
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	movaps	 -6 * SIZE(Y), %xmm10
	mulpd	%xmm6,  %xmm12
	movaps	 -6 * SIZE(X), %xmm6
	addpd	%xmm12, %xmm1

	movsd	%xmm8, %xmm11
	pshufd	$0x4e,  %xmm11, %xmm12
	movsd	%xmm4,  %xmm7
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11,  %xmm0
	movaps	 -4 * SIZE(Y), %xmm11
	mulpd	%xmm7,  %xmm12
	movaps	 -4 * SIZE(X), %xmm7
	addpd	%xmm12, %xmm1

	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	movaps	 -2 * SIZE(Y), %xmm8
	mulpd	%xmm4,  %xmm12
	movaps	 -2 * SIZE(X), %xmm4
	addpd	%xmm12, %xmm1

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm1

	movsd	%xmm11, %xmm10
	pshufd	$0x4e,  %xmm10, %xmm12
	movsd	%xmm7,  %xmm6
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	movsd	%xmm8, %xmm11
	pshufd	$0x4e,  %xmm11, %xmm12
	movsd	%xmm4,  %xmm7
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11,  %xmm0
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm1

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, N
	jle	.L46

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm9
	movaps	-14 * SIZE(X), %xmm6
	movaps	-14 * SIZE(Y), %xmm10

	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	movaps	-12 * SIZE(X), %xmm7
	movaps	-12 * SIZE(Y), %xmm11

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm1

	movaps	-10 * SIZE(X), %xmm4
	movaps	-10 * SIZE(Y), %xmm8

	movsd	%xmm11, %xmm10
	pshufd	$0x4e,  %xmm10, %xmm12
	movsd	%xmm7,  %xmm6
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	movsd	%xmm8, %xmm11
	pshufd	$0x4e,  %xmm11, %xmm12
	movsd	%xmm4,  %xmm7
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11,  %xmm0
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm1

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, N
	jle	.L47

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm9

	movsd	%xmm9,  %xmm8
	pshufd	$0x4e,  %xmm8, %xmm12
	movsd	%xmm5,  %xmm4
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	movaps	-14 * SIZE(X), %xmm6
	movaps	-14 * SIZE(Y), %xmm10

	movsd	%xmm10, %xmm9
	pshufd	$0x4e,  %xmm9, %xmm12
	movsd	%xmm6,  %xmm5
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm0
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm1

	movaps	%xmm6,  %xmm4
	movaps	%xmm10, %xmm8

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, N
	jle	.L48

	movlps	-16 * SIZE(X), %xmm4
	movlps	-16 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1
	ALIGN_3

.L48:
	SHUFPD_1 %xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm2, %xmm2
	SHUFPD_1 %xmm3, %xmm3
	jmp	.L98
	ALIGN_3

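/* Added note: generic path for arbitrary INCX/INCY (the strides are   */
/* already scaled to bytes).  Elements are loaded one at a time with   */
/* MOVLPS/movhps and the pointers advanced by their strides; the loop  */
/* is unrolled by eight, with .L55/.L56/.L57 covering the remaining    */
/* 4, 2 and 1 elements.                                                */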
.L50:
	movq	N,  %rax
	sarq	$3, %rax
	jle	.L55

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y

	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y

	MOVLPS	0 * SIZE(X), %xmm6
	movhps	1 * SIZE(X), %xmm6
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm10
	movhps	1 * SIZE(Y), %xmm10
	addq	INCY, Y

	MOVLPS	0 * SIZE(X), %xmm7
	movhps	1 * SIZE(X), %xmm7
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm11
	movhps	1 * SIZE(Y), %xmm11
	addq	INCY, Y

	decq	%rax
	jle	.L54
	ALIGN_3

.L53:
	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y
	mulpd	%xmm4,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y
	mulpd	%xmm5,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm10
	movhps	1 * SIZE(Y), %xmm10
	addq	INCY, Y
	mulpd	%xmm6,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm6
	movhps	1 * SIZE(X), %xmm6
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	MOVLPS	0 * SIZE(Y), %xmm11
	movhps	1 * SIZE(Y), %xmm11
	addq	INCY, Y
	mulpd	%xmm7,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm7
	movhps	1 * SIZE(X), %xmm7
	addq	INCX, X
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y

	mulpd	%xmm4,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y

	mulpd	%xmm5,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm10
	movhps	1 * SIZE(Y), %xmm10
	addq	INCY, Y
	mulpd	%xmm6,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm6
	movhps	1 * SIZE(X), %xmm6
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	MOVLPS	0 * SIZE(Y), %xmm11
	movhps	1 * SIZE(Y), %xmm11
	addq	INCY, Y
	mulpd	%xmm7,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm7
	movhps	1 * SIZE(X), %xmm7
	addq	INCX, X
	addpd	%xmm12, %xmm3

	decq	%rax
	jg	.L53
	ALIGN_3

.L54:
	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y
	mulpd	%xmm4,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y
	mulpd	%xmm5,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm10
	movhps	1 * SIZE(Y), %xmm10
	addq	INCY, Y
	mulpd	%xmm6,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm6
	movhps	1 * SIZE(X), %xmm6
	addq	INCX, X
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	MOVLPS	0 * SIZE(Y), %xmm11
	movhps	1 * SIZE(Y), %xmm11
	addq	INCY, Y
	mulpd	%xmm7,  %xmm12
	MOVLPS	0 * SIZE(X), %xmm7
	movhps	1 * SIZE(X), %xmm7
	addq	INCX, X
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3
	ALIGN_3

.L55:
	testq	$4, N
	jle	.L56

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3

	MOVLPS	0 * SIZE(X), %xmm6
	movhps	1 * SIZE(X), %xmm6
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm10
	movhps	1 * SIZE(Y), %xmm10
	addq	INCY, Y

	pshufd	$0x4e,  %xmm10, %xmm12
	mulpd	%xmm6,  %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm6,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	0 * SIZE(X), %xmm7
	movhps	1 * SIZE(X), %xmm7
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm11
	movhps	1 * SIZE(Y), %xmm11
	addq	INCY, Y

	pshufd	$0x4e,  %xmm11, %xmm12
	mulpd	%xmm7,  %xmm11
	addpd	%xmm11, %xmm2
	mulpd	%xmm7,  %xmm12
	addpd	%xmm12, %xmm3
	ALIGN_3

.L56:
	testq	$2, N
	jle	.L57

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8
	addq	INCY, Y

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1

	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addq	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm9
	movhps	1 * SIZE(Y), %xmm9
	addq	INCY, Y

	pshufd	$0x4e,  %xmm9, %xmm12
	mulpd	%xmm5,  %xmm9
	addpd	%xmm9,  %xmm2
	mulpd	%xmm5,  %xmm12
	addpd	%xmm12, %xmm3
	ALIGN_3

.L57:
	testq	$1, N
	jle	.L98

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	MOVLPS	0 * SIZE(Y), %xmm8
	movhps	1 * SIZE(Y), %xmm8

	pshufd	$0x4e,  %xmm8, %xmm12
	mulpd	%xmm4,  %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm4,  %xmm12
	addpd	%xmm12, %xmm1
	ALIGN_3

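/* Added note: final reduction.  The two accumulator pairs are         */
/* combined, the halves of each sum are swapped into xmm2/xmm3, and    */
/* the CONJ switch selects the signs: without CONJ the result is       */
/* sum(x*y) (ZDOTU), with CONJ it is sum(conj(x)*y) (ZDOTC).  The      */
/* real part ends up in the low half of xmm0 and the imaginary part    */
/* in the low half of xmm1.                                            */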
.L98:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm0, %xmm2
	pshufd	$0x4e, %xmm1, %xmm3

.L999:
#ifndef CONJ
	subsd	 %xmm2, %xmm0
	addsd	 %xmm3, %xmm1
#else
	addsd	 %xmm2, %xmm0
	subsd	 %xmm3, %xmm1
#endif

	RESTOREREGISTERS
	ret

	EPILOGUE
