1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define N	ARG1	/* rdi */
43#define X	ARG2	/* rsi */
44#define INCX	ARG3	/* rdx */
45#define Y	ARG4	/* rcx */
46#ifndef WINDOWS_ABI
47#define INCY	ARG5	/* r8  */
48#else
49#define INCY	%r10
50#endif
51
52#include "l1param.h"
53
54	PROLOGUE
55	PROFCODE
56
57#ifdef WINDOWS_ABI
58	movq	40(%rsp), INCY
59#endif
60
61	SAVEREGISTERS
62
63	salq	$ZBASE_SHIFT, INCX
64	salq	$ZBASE_SHIFT, INCY
65
66	xorps	%xmm0, %xmm0
67	xorps	%xmm1, %xmm1
68	xorps	%xmm2, %xmm2
69	xorps	%xmm3, %xmm3
70
71	testq	N, N
72	jle	.L999
73
74	cmpq	$2 * SIZE, INCX
75	jne	.L200
76	cmpq	$2 * SIZE, INCY
77	jne	.L200
78
79	subq	$-32 * SIZE, X
80	subq	$-32 * SIZE, Y
81
82	testq	$SIZE, X
83	jne	.L50
84
85.L0x:
86	testq	$2 * SIZE, X
87	je	.L10
88
89#ifdef movsd
90	xorps	%xmm4, %xmm4
91#endif
92	movsd	-32 * SIZE(X), %xmm4
93	movsd	-32 * SIZE(Y), %xmm0
94
95	pshufd	$0xb1,  %xmm0, %xmm1
96	mulps	%xmm4,  %xmm0
97	mulps	%xmm4,  %xmm1
98	addq	$2 * SIZE, X
99	addq	$2 * SIZE, Y
100	decq	N
101	ALIGN_3
102
103.L10:
104	testq	$3 * SIZE, Y
105	jne	.L20
106
107	movq	N,  %rax
108	sarq	$4, %rax
109	jle	.L15
110
111	movaps	-32 * SIZE(X), %xmm4
112	movaps	-28 * SIZE(X), %xmm5
113	movaps	-32 * SIZE(Y), %xmm8
114	movaps	-28 * SIZE(Y), %xmm9
115	movaps	-24 * SIZE(X), %xmm6
116	movaps	-20 * SIZE(X), %xmm7
117	movaps	-24 * SIZE(Y), %xmm10
118	movaps	-20 * SIZE(Y), %xmm11
119
120	decq	%rax
121	jle	.L12
122	ALIGN_3
123
124.L11:
125#ifdef PREFETCH
126	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
127#endif
128
129	pshufd	$0xb1,  %xmm8, %xmm12
130	mulps	%xmm4,  %xmm8
131	addps	%xmm8,  %xmm0
132	movaps	-16 * SIZE(Y), %xmm8
133	mulps	%xmm4,  %xmm12
134	movaps	-16 * SIZE(X), %xmm4
135	addps	%xmm12, %xmm1
136
137	pshufd	$0xb1,  %xmm9, %xmm12
138	mulps	%xmm5,  %xmm9
139	addps	%xmm9,  %xmm2
140	movaps	-12 * SIZE(Y), %xmm9
141	mulps	%xmm5,  %xmm12
142	movaps	-12 * SIZE(X), %xmm5
143	addps	%xmm12, %xmm3
144
145#ifdef PREFETCH
146	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
147#endif
148
149	pshufd	$0xb1,  %xmm10, %xmm12
150	mulps	%xmm6,  %xmm10
151	addps	%xmm10, %xmm0
152	movaps	 -8 * SIZE(Y), %xmm10
153	mulps	%xmm6,  %xmm12
154	movaps	 -8 * SIZE(X), %xmm6
155	addps	%xmm12, %xmm1
156
157	pshufd	$0xb1,  %xmm11, %xmm12
158	mulps	%xmm7,  %xmm11
159	addps	%xmm11, %xmm2
160	movaps	 -4 * SIZE(Y), %xmm11
161	mulps	%xmm7,  %xmm12
162	movaps	 -4 * SIZE(X), %xmm7
163	addps	%xmm12, %xmm3
164
165#if defined(PREFETCH) && !defined(FETCH128)
166	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
167#endif
168
169	pshufd	$0xb1,  %xmm8, %xmm12
170	mulps	%xmm4,  %xmm8
171	addps	%xmm8,  %xmm0
172	movaps	  0 * SIZE(Y), %xmm8
173	mulps	%xmm4,  %xmm12
174	movaps	  0 * SIZE(X), %xmm4
175	addps	%xmm12, %xmm1
176
177	pshufd	$0xb1,  %xmm9, %xmm12
178	mulps	%xmm5,  %xmm9
179	addps	%xmm9,  %xmm2
180	movaps	  4 * SIZE(Y), %xmm9
181	mulps	%xmm5,  %xmm12
182	movaps	  4 * SIZE(X), %xmm5
183	addps	%xmm12, %xmm3
184
185#if defined(PREFETCH) && !defined(FETCH128)
186	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
187#endif
188
189	pshufd	$0xb1,  %xmm10, %xmm12
190	mulps	%xmm6,  %xmm10
191	addps	%xmm10, %xmm0
192	movaps	  8 * SIZE(Y), %xmm10
193	mulps	%xmm6,  %xmm12
194	movaps	  8 * SIZE(X), %xmm6
195	addps	%xmm12, %xmm1
196
197	pshufd	$0xb1,  %xmm11, %xmm12
198	mulps	%xmm7,  %xmm11
199	addps	%xmm11, %xmm2
200	movaps	 12 * SIZE(Y), %xmm11
201	mulps	%xmm7,  %xmm12
202	movaps	 12 * SIZE(X), %xmm7
203	addps	%xmm12, %xmm3
204
205	subq	$-32 * SIZE, X
206	subq	$-32 * SIZE, Y
207
208	decq	%rax
209	jg	.L11
210	ALIGN_3
211
212.L12:
213	pshufd	$0xb1,  %xmm8, %xmm12
214	mulps	%xmm4,  %xmm8
215	addps	%xmm8,  %xmm0
216	movaps	-16 * SIZE(Y), %xmm8
217	mulps	%xmm4,  %xmm12
218	movaps	-16 * SIZE(X), %xmm4
219	addps	%xmm12, %xmm1
220
221	pshufd	$0xb1,  %xmm9, %xmm12
222	mulps	%xmm5,  %xmm9
223	addps	%xmm9,  %xmm2
224	movaps	-12 * SIZE(Y), %xmm9
225	mulps	%xmm5,  %xmm12
226	movaps	-12 * SIZE(X), %xmm5
227	addps	%xmm12, %xmm3
228
229	pshufd	$0xb1,  %xmm10, %xmm12
230	mulps	%xmm6,  %xmm10
231	addps	%xmm10, %xmm0
232	movaps	 -8 * SIZE(Y), %xmm10
233	mulps	%xmm6,  %xmm12
234	movaps	 -8 * SIZE(X), %xmm6
235	addps	%xmm12, %xmm1
236
237	pshufd	$0xb1,  %xmm11, %xmm12
238	mulps	%xmm7,  %xmm11
239	addps	%xmm11, %xmm2
240	movaps	 -4 * SIZE(Y), %xmm11
241	mulps	%xmm7,  %xmm12
242	movaps	 -4 * SIZE(X), %xmm7
243	addps	%xmm12, %xmm3
244
245	pshufd	$0xb1,  %xmm8, %xmm12
246	mulps	%xmm4,  %xmm8
247	addps	%xmm8,  %xmm0
248	mulps	%xmm4,  %xmm12
249	addps	%xmm12, %xmm1
250
251	pshufd	$0xb1,  %xmm9, %xmm12
252	mulps	%xmm5,  %xmm9
253	addps	%xmm9,  %xmm2
254	mulps	%xmm5,  %xmm12
255	addps	%xmm12, %xmm3
256
257	pshufd	$0xb1,  %xmm10, %xmm12
258	mulps	%xmm6,  %xmm10
259	addps	%xmm10, %xmm0
260	mulps	%xmm6,  %xmm12
261	addps	%xmm12, %xmm1
262
263	pshufd	$0xb1,  %xmm11, %xmm12
264	mulps	%xmm7,  %xmm11
265	addps	%xmm11, %xmm2
266	mulps	%xmm7,  %xmm12
267	addps	%xmm12, %xmm3
268
269	subq	$-32 * SIZE, X
270	subq	$-32 * SIZE, Y
271	ALIGN_3
272
273.L15:
274	testq	$8, N
275	jle	.L16
276
277	movaps	-32 * SIZE(X), %xmm4
278	movaps	-32 * SIZE(Y), %xmm8
279
280	pshufd	$0xb1,  %xmm8, %xmm12
281	mulps	%xmm4,  %xmm8
282	addps	%xmm8,  %xmm0
283	mulps	%xmm4,  %xmm12
284	addps	%xmm12, %xmm1
285
286	movaps	-28 * SIZE(X), %xmm5
287	movaps	-28 * SIZE(Y), %xmm9
288
289	pshufd	$0xb1,  %xmm9, %xmm12
290	mulps	%xmm5,  %xmm9
291	addps	%xmm9,  %xmm2
292	mulps	%xmm5,  %xmm12
293	addps	%xmm12, %xmm3
294
295	movaps	-24 * SIZE(X), %xmm6
296	movaps	-24 * SIZE(Y), %xmm10
297
298	pshufd	$0xb1,  %xmm10, %xmm12
299	mulps	%xmm6,  %xmm10
300	addps	%xmm10, %xmm0
301	mulps	%xmm6,  %xmm12
302	addps	%xmm12, %xmm1
303
304	movaps	-20 * SIZE(X), %xmm7
305	movaps	-20 * SIZE(Y), %xmm11
306
307	pshufd	$0xb1,  %xmm11, %xmm12
308	mulps	%xmm7,  %xmm11
309	addps	%xmm11, %xmm2
310	mulps	%xmm7,  %xmm12
311	addps	%xmm12, %xmm3
312
313	addq	$16 * SIZE, X
314	addq	$16 * SIZE, Y
315	ALIGN_3
316
317.L16:
318	testq	$4, N
319	jle	.L17
320
321	movaps	-32 * SIZE(X), %xmm4
322	movaps	-32 * SIZE(Y), %xmm8
323	movaps	-28 * SIZE(X), %xmm5
324	movaps	-28 * SIZE(Y), %xmm9
325
326	pshufd	$0xb1,  %xmm8, %xmm12
327	mulps	%xmm4,  %xmm8
328	addps	%xmm8,  %xmm0
329	mulps	%xmm4,  %xmm12
330	addps	%xmm12, %xmm1
331
332	pshufd	$0xb1,  %xmm9, %xmm12
333	mulps	%xmm5,  %xmm9
334	addps	%xmm9,  %xmm2
335	mulps	%xmm5,  %xmm12
336	addps	%xmm12, %xmm3
337
338	addq	$8 * SIZE, X
339	addq	$8 * SIZE, Y
340	ALIGN_3
341
342.L17:
343	testq	$2, N
344	jle	.L18
345
346	movaps	-32 * SIZE(X), %xmm4
347	movaps	-32 * SIZE(Y), %xmm8
348
349	pshufd	$0xb1,  %xmm8, %xmm12
350	mulps	%xmm4,  %xmm8
351	addps	%xmm8,  %xmm0
352	mulps	%xmm4,  %xmm12
353	addps	%xmm12, %xmm1
354
355	addq	$4 * SIZE, X
356	addq	$4 * SIZE, Y
357	ALIGN_3
358
359.L18:
360	testq	$1, N
361	jle	.L98
362
363#ifdef movsd
364	xorps	%xmm4, %xmm4
365#endif
366	movsd	-32 * SIZE(X), %xmm4
367#ifdef movsd
368	xorps	%xmm8, %xmm8
369#endif
370	movsd	-32 * SIZE(Y), %xmm8
371
372	pshufd	$0xb1,  %xmm8, %xmm12
373	mulps	%xmm4,  %xmm8
374	addps	%xmm8,  %xmm0
375	mulps	%xmm4,  %xmm12
376	addps	%xmm12, %xmm1
377	jmp	.L98
378	ALIGN_3
379
380.L20:
381#ifdef ALIGNED_ACCESS
382
383	testq	$2 * SIZE, Y
384	jne	.L30
385
386	movaps	-33 * SIZE(Y), %xmm8
387	addq	$3 * SIZE, Y
388
389	shufps	$0xb1, %xmm1, %xmm1
390
391	movq	N,  %rax
392	sarq	$4, %rax
393	jle	.L25
394
395	movaps	-32 * SIZE(X), %xmm4
396	movaps	-32 * SIZE(Y), %xmm9
397	movaps	-28 * SIZE(X), %xmm5
398	movaps	-28 * SIZE(Y), %xmm10
399	movaps	-24 * SIZE(X), %xmm6
400	movaps	-24 * SIZE(Y), %xmm11
401	movaps	-20 * SIZE(X), %xmm7
402
403	decq	%rax
404	jle	.L22
405	ALIGN_3
406
407.L21:
408#ifdef PREFETCH
409	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
410#endif
411
412	movss	%xmm9,  %xmm8
413	pshufd	$0xb1,  %xmm4, %xmm12
414	shufps	$0x39,  %xmm8, %xmm8
415	mulps	%xmm8,  %xmm4
416	addps	%xmm4,  %xmm0
417	movaps	-16 * SIZE(X), %xmm4
418	mulps	%xmm8,  %xmm12
419	movaps	-20 * SIZE(Y), %xmm8
420	addps	%xmm12, %xmm1
421
422	movss	%xmm10, %xmm9
423	pshufd	$0xb1,  %xmm5, %xmm12
424	shufps	$0x39,  %xmm9, %xmm9
425	mulps	%xmm9,  %xmm5
426	addps	%xmm5,  %xmm0
427	movaps	-12 * SIZE(X), %xmm5
428	mulps	%xmm9,  %xmm12
429	movaps	-16 * SIZE(Y), %xmm9
430	addps	%xmm12, %xmm1
431
432#ifdef PREFETCH
433	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
434#endif
435
436	movss	%xmm11, %xmm10
437	pshufd	$0xb1,  %xmm6,  %xmm12
438	shufps	$0x39,  %xmm10, %xmm10
439	mulps	%xmm10, %xmm6
440	addps	%xmm6,  %xmm0
441	movaps	 -8 * SIZE(X), %xmm6
442	mulps	%xmm10, %xmm12
443	movaps	-12 * SIZE(Y), %xmm10
444	addps	%xmm12, %xmm1
445
446	movss	%xmm8,  %xmm11
447	pshufd	$0xb1,  %xmm7,  %xmm12
448	shufps	$0x39,  %xmm11, %xmm11
449	mulps	%xmm11, %xmm7
450	addps	%xmm7,  %xmm0
451	movaps	 -4 * SIZE(X), %xmm7
452	mulps	%xmm11, %xmm12
453	movaps	 -8 * SIZE(Y), %xmm11
454	addps	%xmm12, %xmm1
455
456#if defined(PREFETCH) && !defined(FETCH128)
457	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
458#endif
459
460	movss	%xmm9,  %xmm8
461	pshufd	$0xb1,  %xmm4, %xmm12
462	shufps	$0x39,  %xmm8, %xmm8
463	mulps	%xmm8,  %xmm4
464	addps	%xmm4,  %xmm0
465	movaps	  0 * SIZE(X), %xmm4
466	mulps	%xmm8,  %xmm12
467	movaps	 -4 * SIZE(Y), %xmm8
468	addps	%xmm12, %xmm1
469
470	movss	%xmm10, %xmm9
471	pshufd	$0xb1,  %xmm5, %xmm12
472	shufps	$0x39,  %xmm9, %xmm9
473	mulps	%xmm9,  %xmm5
474	addps	%xmm5,  %xmm0
475	movaps	  4 * SIZE(X), %xmm5
476	mulps	%xmm9,  %xmm12
477	movaps	  0 * SIZE(Y), %xmm9
478	addps	%xmm12, %xmm1
479
480#if defined(PREFETCH) && !defined(FETCH128)
481	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
482#endif
483
484	movss	%xmm11, %xmm10
485	pshufd	$0xb1,  %xmm6,  %xmm12
486	shufps	$0x39,  %xmm10, %xmm10
487	mulps	%xmm10, %xmm6
488	addps	%xmm6,  %xmm0
489	movaps	  8 * SIZE(X), %xmm6
490	mulps	%xmm10, %xmm12
491	movaps	  4 * SIZE(Y), %xmm10
492	addps	%xmm12, %xmm1
493
494	movss	%xmm8,  %xmm11
495	pshufd	$0xb1,  %xmm7,  %xmm12
496	shufps	$0x39,  %xmm11, %xmm11
497	mulps	%xmm11, %xmm7
498	addps	%xmm7,  %xmm0
499	movaps	 12 * SIZE(X), %xmm7
500	mulps	%xmm11, %xmm12
501	movaps	  8 * SIZE(Y), %xmm11
502	addps	%xmm12, %xmm1
503
504	subq	$-32 * SIZE, X
505	subq	$-32 * SIZE, Y
506
507	decq	%rax
508	jg	.L21
509	ALIGN_3
510
511.L22:
512	movss	%xmm9,  %xmm8
513	pshufd	$0xb1,  %xmm4, %xmm12
514	shufps	$0x39,  %xmm8, %xmm8
515	mulps	%xmm8,  %xmm4
516	addps	%xmm4,  %xmm0
517	movaps	-16 * SIZE(X), %xmm4
518	mulps	%xmm8,  %xmm12
519	movaps	-20 * SIZE(Y), %xmm8
520	addps	%xmm12, %xmm1
521
522	movss	%xmm10, %xmm9
523	pshufd	$0xb1,  %xmm5, %xmm12
524	shufps	$0x39,  %xmm9, %xmm9
525	mulps	%xmm9,  %xmm5
526	addps	%xmm5,  %xmm0
527	movaps	-12 * SIZE(X), %xmm5
528	mulps	%xmm9,  %xmm12
529	movaps	-16 * SIZE(Y), %xmm9
530	addps	%xmm12, %xmm1
531
532	movss	%xmm11, %xmm10
533	pshufd	$0xb1,  %xmm6,  %xmm12
534	shufps	$0x39,  %xmm10, %xmm10
535	mulps	%xmm10, %xmm6
536	addps	%xmm6,  %xmm0
537	movaps	 -8 * SIZE(X), %xmm6
538	mulps	%xmm10, %xmm12
539	movaps	-12 * SIZE(Y), %xmm10
540	addps	%xmm12, %xmm1
541
542	movss	%xmm8,  %xmm11
543	pshufd	$0xb1,  %xmm7,  %xmm12
544	shufps	$0x39,  %xmm11, %xmm11
545	mulps	%xmm11, %xmm7
546	addps	%xmm7,  %xmm0
547	movaps	 -4 * SIZE(X), %xmm7
548	mulps	%xmm11, %xmm12
549	movaps	 -8 * SIZE(Y), %xmm11
550	addps	%xmm12, %xmm1
551
552	movss	%xmm9,  %xmm8
553	pshufd	$0xb1,  %xmm4, %xmm12
554	shufps	$0x39,  %xmm8, %xmm8
555	mulps	%xmm8,  %xmm4
556	addps	%xmm4,  %xmm0
557	mulps	%xmm8,  %xmm12
558	movaps	 -4 * SIZE(Y), %xmm8
559	addps	%xmm12, %xmm1
560
561	movss	%xmm10, %xmm9
562	pshufd	$0xb1,  %xmm5, %xmm12
563	shufps	$0x39,  %xmm9, %xmm9
564	mulps	%xmm9,  %xmm5
565	addps	%xmm5,  %xmm0
566	mulps	%xmm9,  %xmm12
567	addps	%xmm12, %xmm1
568
569	movss	%xmm11, %xmm10
570	pshufd	$0xb1,  %xmm6,  %xmm12
571	shufps	$0x39,  %xmm10, %xmm10
572	mulps	%xmm10, %xmm6
573	addps	%xmm6,  %xmm0
574	mulps	%xmm10, %xmm12
575	addps	%xmm12, %xmm1
576
577	movss	%xmm8,  %xmm11
578	pshufd	$0xb1,  %xmm7,  %xmm12
579	shufps	$0x39,  %xmm11, %xmm11
580	mulps	%xmm11, %xmm7
581	addps	%xmm7,  %xmm0
582	mulps	%xmm11, %xmm12
583	addps	%xmm12, %xmm1
584
585	subq	$-32 * SIZE, X
586	subq	$-32 * SIZE, Y
587	ALIGN_3
588
589.L25:
590	testq	$8, N
591	jle	.L26
592
593	movaps	-32 * SIZE(X), %xmm4
594	movaps	-32 * SIZE(Y), %xmm9
595	movaps	-28 * SIZE(X), %xmm5
596	movaps	-28 * SIZE(Y), %xmm10
597
598	movss	%xmm9,  %xmm8
599	pshufd	$0xb1,  %xmm4, %xmm12
600	shufps	$0x39,  %xmm8, %xmm8
601	mulps	%xmm8,  %xmm4
602	addps	%xmm4,  %xmm0
603	mulps	%xmm8,  %xmm12
604	addps	%xmm12, %xmm1
605
606	movaps	-24 * SIZE(X), %xmm6
607	movaps	-24 * SIZE(Y), %xmm11
608
609	movss	%xmm10, %xmm9
610	pshufd	$0xb1,  %xmm5, %xmm12
611	shufps	$0x39,  %xmm9, %xmm9
612	mulps	%xmm9,  %xmm5
613	addps	%xmm5,  %xmm0
614	mulps	%xmm9,  %xmm12
615	addps	%xmm12, %xmm1
616
617	movaps	-20 * SIZE(X), %xmm7
618	movaps	-20 * SIZE(Y), %xmm8
619
620	movss	%xmm11, %xmm10
621	pshufd	$0xb1,  %xmm6,  %xmm12
622	shufps	$0x39,  %xmm10, %xmm10
623	mulps	%xmm10, %xmm6
624	addps	%xmm6,  %xmm0
625	mulps	%xmm10, %xmm12
626	addps	%xmm12, %xmm1
627
628	movss	%xmm8,  %xmm11
629	pshufd	$0xb1,  %xmm7,  %xmm12
630	shufps	$0x39,  %xmm11, %xmm11
631	mulps	%xmm11, %xmm7
632	addps	%xmm7,  %xmm0
633	mulps	%xmm11, %xmm12
634	addps	%xmm12, %xmm1
635
636	addq	$16 * SIZE, X
637	addq	$16 * SIZE, Y
638	ALIGN_3
639
640.L26:
641	testq	$4, N
642	jle	.L27
643
644	movaps	-32 * SIZE(X), %xmm4
645	movaps	-32 * SIZE(Y), %xmm9
646
647	movss	%xmm9,  %xmm8
648	pshufd	$0xb1,  %xmm4, %xmm12
649	shufps	$0x39,  %xmm8, %xmm8
650	mulps	%xmm8,  %xmm4
651	addps	%xmm4,  %xmm0
652	mulps	%xmm8,  %xmm12
653	addps	%xmm12, %xmm1
654
655	movaps	-28 * SIZE(X), %xmm5
656	movaps	-28 * SIZE(Y), %xmm10
657
658	movss	%xmm10, %xmm9
659	pshufd	$0xb1,  %xmm5, %xmm12
660	shufps	$0x39,  %xmm9, %xmm9
661	mulps	%xmm9,  %xmm5
662	addps	%xmm5,  %xmm0
663	mulps	%xmm9,  %xmm12
664	addps	%xmm12, %xmm1
665
666	movaps	%xmm10, %xmm8
667
668	addq	$8 * SIZE, X
669	addq	$8 * SIZE, Y
670	ALIGN_3
671
672.L27:
673	testq	$2, N
674	jle	.L28
675
676	movaps	-32 * SIZE(X), %xmm4
677	movaps	-32 * SIZE(Y), %xmm9
678
679	movss	%xmm9,  %xmm8
680	pshufd	$0xb1,  %xmm4, %xmm12
681	shufps	$0x39,  %xmm8, %xmm8
682	mulps	%xmm8,  %xmm4
683	addps	%xmm4,  %xmm0
684	mulps	%xmm8,  %xmm12
685	addps	%xmm12, %xmm1
686
687	movaps	%xmm9, %xmm8
688	addq	$4 * SIZE, X
689	addq	$4 * SIZE, Y
690	ALIGN_3
691
692.L28:
693	testq	$1, N
694	jle	.L29
695
696#ifdef movsd
697	xorps	%xmm4, %xmm4
698#endif
699	movsd	-32 * SIZE(X), %xmm4
700
701	pshufd	$0xb1,  %xmm4, %xmm12
702	shufps	$0x39,  %xmm8, %xmm8
703	mulps	%xmm8,  %xmm4
704	addps	%xmm4,  %xmm0
705	mulps	%xmm8,  %xmm12
706	addps	%xmm12, %xmm1
707	ALIGN_3
708
709.L29:
710	shufps	$0xb1, %xmm1, %xmm1
711	shufps	$0xb1, %xmm3, %xmm3
712	jmp	.L98
713	ALIGN_3
714
715.L30:
716
717	testq	$SIZE, Y
718	jne	.L40
719#endif
720
721	movq	N,  %rax
722	sarq	$4, %rax
723	jle	.L35
724
725	movaps	-32 * SIZE(X), %xmm4
726	movsd	-32 * SIZE(Y), %xmm8
727	movhps	-30 * SIZE(Y), %xmm8
728	movaps	-28 * SIZE(X), %xmm5
729	movsd	-28 * SIZE(Y), %xmm9
730	movhps	-26 * SIZE(Y), %xmm9
731
732	movaps	-24 * SIZE(X), %xmm6
733	movsd	-24 * SIZE(Y), %xmm10
734	movhps	-22 * SIZE(Y), %xmm10
735	movaps	-20 * SIZE(X), %xmm7
736	movsd	-20 * SIZE(Y), %xmm11
737	movhps	-18 * SIZE(Y), %xmm11
738
739	decq	%rax
740	jle	.L32
741	ALIGN_3
742
743.L31:
744#ifdef PREFETCH
745	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
746#endif
747
748	pshufd	$0xb1,  %xmm8, %xmm12
749	mulps	%xmm4,  %xmm8
750	addps	%xmm8,  %xmm0
751	movsd	-16 * SIZE(Y), %xmm8
752	movhps	-14 * SIZE(Y), %xmm8
753	mulps	%xmm4,  %xmm12
754	movaps	-16 * SIZE(X), %xmm4
755	addps	%xmm12, %xmm1
756
757	pshufd	$0xb1,  %xmm9, %xmm12
758	mulps	%xmm5,  %xmm9
759	addps	%xmm9,  %xmm2
760	movsd	-12 * SIZE(Y), %xmm9
761	movhps	-10 * SIZE(Y), %xmm9
762	mulps	%xmm5,  %xmm12
763	movaps	-12 * SIZE(X), %xmm5
764	addps	%xmm12, %xmm3
765
766#ifdef PREFETCH
767	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
768#endif
769
770	pshufd	$0xb1,  %xmm10, %xmm12
771	mulps	%xmm6,  %xmm10
772	addps	%xmm10, %xmm0
773	movsd	 -8 * SIZE(Y), %xmm10
774	movhps	 -6 * SIZE(Y), %xmm10
775	mulps	%xmm6,  %xmm12
776	movaps	 -8 * SIZE(X), %xmm6
777	addps	%xmm12, %xmm1
778
779	pshufd	$0xb1,  %xmm11, %xmm12
780	mulps	%xmm7,  %xmm11
781	addps	%xmm11, %xmm2
782	movsd	 -4 * SIZE(Y), %xmm11
783	movhps	 -2 * SIZE(Y), %xmm11
784	mulps	%xmm7,  %xmm12
785	movaps	 -4 * SIZE(X), %xmm7
786	addps	%xmm12, %xmm3
787
788#if defined(PREFETCH) && !defined(FETCH128)
789	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
790#endif
791
792	pshufd	$0xb1,  %xmm8, %xmm12
793	mulps	%xmm4,  %xmm8
794	addps	%xmm8,  %xmm0
795	movsd	  0 * SIZE(Y), %xmm8
796	movhps	  2 * SIZE(Y), %xmm8
797	mulps	%xmm4,  %xmm12
798	movaps	  0 * SIZE(X), %xmm4
799	addps	%xmm12, %xmm1
800
801	pshufd	$0xb1,  %xmm9, %xmm12
802	mulps	%xmm5,  %xmm9
803	addps	%xmm9,  %xmm2
804	movsd	  4 * SIZE(Y), %xmm9
805	movhps	  6 * SIZE(Y), %xmm9
806	mulps	%xmm5,  %xmm12
807	movaps	  4 * SIZE(X), %xmm5
808	addps	%xmm12, %xmm3
809
810#if defined(PREFETCH) && !defined(FETCH128)
811	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
812#endif
813
814	pshufd	$0xb1,  %xmm10, %xmm12
815	mulps	%xmm6,  %xmm10
816	addps	%xmm10, %xmm0
817	movsd	  8 * SIZE(Y), %xmm10
818	movhps	 10 * SIZE(Y), %xmm10
819	mulps	%xmm6,  %xmm12
820	movaps	  8 * SIZE(X), %xmm6
821	addps	%xmm12, %xmm1
822
823	pshufd	$0xb1,  %xmm11, %xmm12
824	mulps	%xmm7,  %xmm11
825	addps	%xmm11, %xmm2
826	movsd	 12 * SIZE(Y), %xmm11
827	movhps	 14 * SIZE(Y), %xmm11
828	mulps	%xmm7,  %xmm12
829	movaps	 12 * SIZE(X), %xmm7
830	addps	%xmm12, %xmm3
831
832	subq	$-32 * SIZE, X
833	subq	$-32 * SIZE, Y
834
835	decq	%rax
836	jg	.L31
837	ALIGN_3
838
839.L32:
840	pshufd	$0xb1,  %xmm8, %xmm12
841	mulps	%xmm4,  %xmm8
842	addps	%xmm8,  %xmm0
843	movsd	-16 * SIZE(Y), %xmm8
844	movhps	-14 * SIZE(Y), %xmm8
845	mulps	%xmm4,  %xmm12
846	movaps	-16 * SIZE(X), %xmm4
847	addps	%xmm12, %xmm1
848
849	pshufd	$0xb1,  %xmm9, %xmm12
850	mulps	%xmm5,  %xmm9
851	addps	%xmm9,  %xmm2
852	movsd	-12 * SIZE(Y), %xmm9
853	movhps	-10 * SIZE(Y), %xmm9
854	mulps	%xmm5,  %xmm12
855	movaps	-12 * SIZE(X), %xmm5
856	addps	%xmm12, %xmm3
857
858	pshufd	$0xb1,  %xmm10, %xmm12
859	mulps	%xmm6,  %xmm10
860	addps	%xmm10, %xmm0
861	movsd	 -8 * SIZE(Y), %xmm10
862	movhps	 -6 * SIZE(Y), %xmm10
863	mulps	%xmm6,  %xmm12
864	movaps	 -8 * SIZE(X), %xmm6
865	addps	%xmm12, %xmm1
866
867	pshufd	$0xb1,  %xmm11, %xmm12
868	mulps	%xmm7,  %xmm11
869	addps	%xmm11, %xmm2
870	movsd	 -4 * SIZE(Y), %xmm11
871	movhps	 -2 * SIZE(Y), %xmm11
872	mulps	%xmm7,  %xmm12
873	movaps	 -4 * SIZE(X), %xmm7
874	addps	%xmm12, %xmm3
875
876	pshufd	$0xb1,  %xmm8, %xmm12
877	mulps	%xmm4,  %xmm8
878	addps	%xmm8,  %xmm0
879	mulps	%xmm4,  %xmm12
880	addps	%xmm12, %xmm1
881
882	pshufd	$0xb1,  %xmm9, %xmm12
883	mulps	%xmm5,  %xmm9
884	addps	%xmm9,  %xmm2
885	mulps	%xmm5,  %xmm12
886	addps	%xmm12, %xmm3
887
888	pshufd	$0xb1,  %xmm10, %xmm12
889	mulps	%xmm6,  %xmm10
890	addps	%xmm10, %xmm0
891	mulps	%xmm6,  %xmm12
892	addps	%xmm12, %xmm1
893
894	pshufd	$0xb1,  %xmm11, %xmm12
895	mulps	%xmm7,  %xmm11
896	addps	%xmm11, %xmm2
897	mulps	%xmm7,  %xmm12
898	addps	%xmm12, %xmm3
899
900	subq	$-32 * SIZE, X
901	subq	$-32 * SIZE, Y
902	ALIGN_3
903
904.L35:
905	testq	$8, N
906	jle	.L36
907
908	movaps	-32 * SIZE(X), %xmm4
909	movsd	-32 * SIZE(Y), %xmm8
910	movhps	-30 * SIZE(Y), %xmm8
911
912	pshufd	$0xb1,  %xmm8, %xmm12
913	mulps	%xmm4,  %xmm8
914	addps	%xmm8,  %xmm0
915	mulps	%xmm4,  %xmm12
916	addps	%xmm12, %xmm1
917
918	movaps	-28 * SIZE(X), %xmm5
919	movsd	-28 * SIZE(Y), %xmm9
920	movhps	-26 * SIZE(Y), %xmm9
921
922	pshufd	$0xb1,  %xmm9, %xmm12
923	mulps	%xmm5,  %xmm9
924	addps	%xmm9,  %xmm2
925	mulps	%xmm5,  %xmm12
926	addps	%xmm12, %xmm3
927
928	movaps	-24 * SIZE(X), %xmm6
929	movsd	-24 * SIZE(Y), %xmm10
930	movhps	-22 * SIZE(Y), %xmm10
931
932	pshufd	$0xb1,  %xmm10, %xmm12
933	mulps	%xmm6,  %xmm10
934	addps	%xmm10, %xmm0
935	mulps	%xmm6,  %xmm12
936	addps	%xmm12, %xmm1
937
938	movaps	-20 * SIZE(X), %xmm7
939	movsd	-20 * SIZE(Y), %xmm11
940	movhps	-18 * SIZE(Y), %xmm11
941
942	pshufd	$0xb1,  %xmm11, %xmm12
943	mulps	%xmm7,  %xmm11
944	addps	%xmm11, %xmm2
945	mulps	%xmm7,  %xmm12
946	addps	%xmm12, %xmm3
947
948	addq	$16 * SIZE, X
949	addq	$16 * SIZE, Y
950	ALIGN_3
951
952.L36:
953	testq	$4, N
954	jle	.L37
955
956	movaps	-32 * SIZE(X), %xmm4
957	movsd	-32 * SIZE(Y), %xmm8
958	movhps	-30 * SIZE(Y), %xmm8
959
960	pshufd	$0xb1,  %xmm8, %xmm12
961	mulps	%xmm4,  %xmm8
962	addps	%xmm8,  %xmm0
963	mulps	%xmm4,  %xmm12
964	addps	%xmm12, %xmm1
965
966	movaps	-28 * SIZE(X), %xmm5
967	movsd	-28 * SIZE(Y), %xmm9
968	movhps	-26 * SIZE(Y), %xmm9
969
970	pshufd	$0xb1,  %xmm9, %xmm12
971	mulps	%xmm5,  %xmm9
972	addps	%xmm9,  %xmm2
973	mulps	%xmm5,  %xmm12
974	addps	%xmm12, %xmm3
975
976	addq	$8 * SIZE, X
977	addq	$8 * SIZE, Y
978	ALIGN_3
979
980.L37:
981	testq	$2, N
982	jle	.L38
983
984	movaps	-32 * SIZE(X), %xmm4
985	movsd	-32 * SIZE(Y), %xmm8
986	movhps	-30 * SIZE(Y), %xmm8
987
988	pshufd	$0xb1,  %xmm8, %xmm12
989	mulps	%xmm4,  %xmm8
990	addps	%xmm8,  %xmm0
991	mulps	%xmm4,  %xmm12
992	addps	%xmm12, %xmm1
993
994	addq	$4 * SIZE, X
995	addq	$4 * SIZE, Y
996	ALIGN_3
997
998.L38:
999	testq	$1, N
1000	jle	.L98
1001
1002#ifdef movsd
1003	xorps	%xmm4, %xmm4
1004#endif
1005	movsd	-32 * SIZE(X), %xmm4
1006#ifdef movsd
1007	xorps	%xmm8, %xmm8
1008#endif
1009	movsd	-32 * SIZE(Y), %xmm8
1010
1011	pshufd	$0xb1,  %xmm8, %xmm12
1012	mulps	%xmm4,  %xmm8
1013	addps	%xmm8,  %xmm0
1014	mulps	%xmm4,  %xmm12
1015	addps	%xmm12, %xmm1
1016	jmp	.L98
1017	ALIGN_3
1018
1019#ifdef ALIGNED_ACCESS
1020.L40:
1021	movaps	-35 * SIZE(Y), %xmm8
1022	addq	$1 * SIZE, Y
1023
1024	shufps	$0xb1, %xmm1, %xmm1
1025
1026	movq	N,  %rax
1027	sarq	$4, %rax
1028	jle	.L45
1029
1030	movaps	-32 * SIZE(X), %xmm4
1031	movaps	-32 * SIZE(Y), %xmm9
1032	movaps	-28 * SIZE(X), %xmm5
1033	movaps	-28 * SIZE(Y), %xmm10
1034	movaps	-24 * SIZE(X), %xmm6
1035	movaps	-24 * SIZE(Y), %xmm11
1036	movaps	-20 * SIZE(X), %xmm7
1037
1038	decq	%rax
1039	jle	.L42
1040	ALIGN_3
1041
1042.L41:
1043#ifdef PREFETCH
1044	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
1045#endif
1046
1047	movss	%xmm9,  %xmm8
1048	pshufd	$0xb1,  %xmm4, %xmm12
1049	shufps	$0x93,  %xmm9, %xmm8
1050	mulps	%xmm8,  %xmm4
1051	addps	%xmm4,  %xmm0
1052	movaps	-16 * SIZE(X), %xmm4
1053	mulps	%xmm8,  %xmm12
1054	movaps	-20 * SIZE(Y), %xmm8
1055	addps	%xmm12, %xmm1
1056
1057	movss	%xmm10, %xmm9
1058	pshufd	$0xb1,  %xmm5, %xmm12
1059	shufps	$0x93,  %xmm10, %xmm9
1060	mulps	%xmm9,  %xmm5
1061	addps	%xmm5,  %xmm0
1062	movaps	-12 * SIZE(X), %xmm5
1063	mulps	%xmm9,  %xmm12
1064	movaps	-16 * SIZE(Y), %xmm9
1065	addps	%xmm12, %xmm1
1066
1067#ifdef PREFETCH
1068	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
1069#endif
1070
1071	movss	%xmm11, %xmm10
1072	pshufd	$0xb1,  %xmm6,  %xmm12
1073	shufps	$0x93,  %xmm11, %xmm10
1074	mulps	%xmm10, %xmm6
1075	addps	%xmm6,  %xmm0
1076	movaps	 -8 * SIZE(X), %xmm6
1077	mulps	%xmm10, %xmm12
1078	movaps	-12 * SIZE(Y), %xmm10
1079	addps	%xmm12, %xmm1
1080
1081	movss	%xmm8,  %xmm11
1082	pshufd	$0xb1,  %xmm7,  %xmm12
1083	shufps	$0x93,  %xmm8, %xmm11
1084	mulps	%xmm11, %xmm7
1085	addps	%xmm7,  %xmm0
1086	movaps	 -4 * SIZE(X), %xmm7
1087	mulps	%xmm11, %xmm12
1088	movaps	 -8 * SIZE(Y), %xmm11
1089	addps	%xmm12, %xmm1
1090
1091#if defined(PREFETCH) && !defined(FETCH128)
1092	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
1093#endif
1094
1095	movss	%xmm9,  %xmm8
1096	pshufd	$0xb1,  %xmm4, %xmm12
1097	shufps	$0x93,  %xmm9, %xmm8
1098	mulps	%xmm8,  %xmm4
1099	addps	%xmm4,  %xmm0
1100	movaps	  0 * SIZE(X), %xmm4
1101	mulps	%xmm8,  %xmm12
1102	movaps	 -4 * SIZE(Y), %xmm8
1103	addps	%xmm12, %xmm1
1104
1105	movss	%xmm10, %xmm9
1106	pshufd	$0xb1,  %xmm5, %xmm12
1107	shufps	$0x93,  %xmm10, %xmm9
1108	mulps	%xmm9,  %xmm5
1109	addps	%xmm5,  %xmm0
1110	movaps	  4 * SIZE(X), %xmm5
1111	mulps	%xmm9,  %xmm12
1112	movaps	  0 * SIZE(Y), %xmm9
1113	addps	%xmm12, %xmm1
1114
1115#if defined(PREFETCH) && !defined(FETCH128)
1116	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
1117#endif
1118
1119	movss	%xmm11, %xmm10
1120	pshufd	$0xb1,  %xmm6,  %xmm12
1121	shufps	$0x93,  %xmm11, %xmm10
1122	mulps	%xmm10, %xmm6
1123	addps	%xmm6,  %xmm0
1124	movaps	  8 * SIZE(X), %xmm6
1125	mulps	%xmm10, %xmm12
1126	movaps	  4 * SIZE(Y), %xmm10
1127	addps	%xmm12, %xmm1
1128
1129	movss	%xmm8,  %xmm11
1130	pshufd	$0xb1,  %xmm7,  %xmm12
1131	shufps	$0x93,  %xmm8, %xmm11
1132	mulps	%xmm11, %xmm7
1133	addps	%xmm7,  %xmm0
1134	movaps	 12 * SIZE(X), %xmm7
1135	mulps	%xmm11, %xmm12
1136	movaps	  8 * SIZE(Y), %xmm11
1137	addps	%xmm12, %xmm1
1138
1139	subq	$-32 * SIZE, X
1140	subq	$-32 * SIZE, Y
1141
1142	decq	%rax
1143	jg	.L41
1144	ALIGN_3
1145
1146.L42:
1147	movss	%xmm9,  %xmm8
1148	pshufd	$0xb1,  %xmm4, %xmm12
1149	shufps	$0x93,  %xmm9, %xmm8
1150	mulps	%xmm8,  %xmm4
1151	addps	%xmm4,  %xmm0
1152	movaps	-16 * SIZE(X), %xmm4
1153	mulps	%xmm8,  %xmm12
1154	movaps	-20 * SIZE(Y), %xmm8
1155	addps	%xmm12, %xmm1
1156
1157	movss	%xmm10, %xmm9
1158	pshufd	$0xb1,  %xmm5, %xmm12
1159	shufps	$0x93,  %xmm10, %xmm9
1160	mulps	%xmm9,  %xmm5
1161	addps	%xmm5,  %xmm0
1162	movaps	-12 * SIZE(X), %xmm5
1163	mulps	%xmm9,  %xmm12
1164	movaps	-16 * SIZE(Y), %xmm9
1165	addps	%xmm12, %xmm1
1166
1167	movss	%xmm11, %xmm10
1168	pshufd	$0xb1,  %xmm6,  %xmm12
1169	shufps	$0x93,  %xmm11, %xmm10
1170	mulps	%xmm10, %xmm6
1171	addps	%xmm6,  %xmm0
1172	movaps	 -8 * SIZE(X), %xmm6
1173	mulps	%xmm10, %xmm12
1174	movaps	-12 * SIZE(Y), %xmm10
1175	addps	%xmm12, %xmm1
1176
1177	movss	%xmm8,  %xmm11
1178	pshufd	$0xb1,  %xmm7,  %xmm12
1179	shufps	$0x93,  %xmm8, %xmm11
1180	mulps	%xmm11, %xmm7
1181	addps	%xmm7,  %xmm0
1182	movaps	 -4 * SIZE(X), %xmm7
1183	mulps	%xmm11, %xmm12
1184	movaps	 -8 * SIZE(Y), %xmm11
1185	addps	%xmm12, %xmm1
1186
1187	movss	%xmm9,  %xmm8
1188	pshufd	$0xb1,  %xmm4, %xmm12
1189	shufps	$0x93,  %xmm9, %xmm8
1190	mulps	%xmm8,  %xmm4
1191	addps	%xmm4,  %xmm0
1192	mulps	%xmm8,  %xmm12
1193	movaps	 -4 * SIZE(Y), %xmm8
1194	addps	%xmm12, %xmm1
1195
1196	movss	%xmm10, %xmm9
1197	pshufd	$0xb1,  %xmm5, %xmm12
1198	shufps	$0x93,  %xmm10, %xmm9
1199	mulps	%xmm9,  %xmm5
1200	addps	%xmm5,  %xmm0
1201	mulps	%xmm9,  %xmm12
1202	addps	%xmm12, %xmm1
1203
1204	movss	%xmm11, %xmm10
1205	pshufd	$0xb1,  %xmm6,  %xmm12
1206	shufps	$0x93,  %xmm11, %xmm10
1207	mulps	%xmm10, %xmm6
1208	addps	%xmm6,  %xmm0
1209	mulps	%xmm10, %xmm12
1210	addps	%xmm12, %xmm1
1211
1212	movss	%xmm8,  %xmm11
1213	pshufd	$0xb1,  %xmm7,  %xmm12
1214	shufps	$0x93,  %xmm8, %xmm11
1215	mulps	%xmm11, %xmm7
1216	addps	%xmm7,  %xmm0
1217	mulps	%xmm11, %xmm12
1218	addps	%xmm12, %xmm1
1219
1220	subq	$-32 * SIZE, X
1221	subq	$-32 * SIZE, Y
1222	ALIGN_3
1223
1224.L45:
1225	testq	$8, N
1226	jle	.L46
1227
1228	movaps	-32 * SIZE(X), %xmm4
1229	movaps	-32 * SIZE(Y), %xmm9
1230	movaps	-28 * SIZE(X), %xmm5
1231	movaps	-28 * SIZE(Y), %xmm10
1232
1233	movss	%xmm9,  %xmm8
1234	pshufd	$0xb1,  %xmm4, %xmm12
1235	shufps	$0x93,  %xmm9, %xmm8
1236	mulps	%xmm8,  %xmm4
1237	addps	%xmm4,  %xmm0
1238	mulps	%xmm8,  %xmm12
1239	addps	%xmm12, %xmm1
1240
1241	movaps	-24 * SIZE(X), %xmm6
1242	movaps	-24 * SIZE(Y), %xmm11
1243
1244	movss	%xmm10, %xmm9
1245	pshufd	$0xb1,  %xmm5, %xmm12
1246	shufps	$0x93,  %xmm10, %xmm9
1247	mulps	%xmm9,  %xmm5
1248	addps	%xmm5,  %xmm0
1249	mulps	%xmm9,  %xmm12
1250	addps	%xmm12, %xmm1
1251
1252	movaps	-20 * SIZE(X), %xmm7
1253	movaps	-20 * SIZE(Y), %xmm8
1254
1255	movss	%xmm11, %xmm10
1256	pshufd	$0xb1,  %xmm6,  %xmm12
1257	shufps	$0x93,  %xmm11, %xmm10
1258	mulps	%xmm10, %xmm6
1259	addps	%xmm6,  %xmm0
1260	mulps	%xmm10, %xmm12
1261	addps	%xmm12, %xmm1
1262
1263	movss	%xmm8,  %xmm11
1264	pshufd	$0xb1,  %xmm7,  %xmm12
1265	shufps	$0x93,  %xmm8, %xmm11
1266	mulps	%xmm11, %xmm7
1267	addps	%xmm7,  %xmm0
1268	mulps	%xmm11, %xmm12
1269	addps	%xmm12, %xmm1
1270
1271	addq	$16 * SIZE, X
1272	addq	$16 * SIZE, Y
1273	ALIGN_3
1274
1275.L46:
1276	testq	$4, N
1277	jle	.L47
1278
1279	movaps	-32 * SIZE(X), %xmm4
1280	movaps	-32 * SIZE(Y), %xmm9
1281
1282	movss	%xmm9,  %xmm8
1283	pshufd	$0xb1,  %xmm4, %xmm12
1284	shufps	$0x93,  %xmm9, %xmm8
1285	mulps	%xmm8,  %xmm4
1286	addps	%xmm4,  %xmm0
1287	mulps	%xmm8,  %xmm12
1288	addps	%xmm12, %xmm1
1289
1290	movaps	-28 * SIZE(X), %xmm5
1291	movaps	-28 * SIZE(Y), %xmm10
1292
1293	movss	%xmm10, %xmm9
1294	pshufd	$0xb1,  %xmm5, %xmm12
1295	shufps	$0x93,  %xmm10, %xmm9
1296	mulps	%xmm9,  %xmm5
1297	addps	%xmm5,  %xmm0
1298	mulps	%xmm9,  %xmm12
1299	addps	%xmm12, %xmm1
1300
1301	movaps	%xmm10, %xmm8
1302
1303	addq	$8 * SIZE, X
1304	addq	$8 * SIZE, Y
1305	ALIGN_3
1306
1307.L47:
1308	testq	$2, N
1309	jle	.L48
1310
1311	movaps	-32 * SIZE(X), %xmm4
1312	movaps	-32 * SIZE(Y), %xmm9
1313
1314	movss	%xmm9,  %xmm8
1315	pshufd	$0xb1,  %xmm4, %xmm12
1316	shufps	$0x93,  %xmm9, %xmm8
1317	mulps	%xmm8,  %xmm4
1318	addps	%xmm4,  %xmm0
1319	mulps	%xmm8,  %xmm12
1320	addps	%xmm12, %xmm1
1321
1322	movaps	%xmm9, %xmm8
1323	addq	$4 * SIZE, X
1324	addq	$4 * SIZE, Y
1325	ALIGN_3
1326
1327.L48:
1328	testq	$1, N
1329	jle	.L49
1330
1331#ifdef movsd
1332	xorps	%xmm4, %xmm4
1333#endif
1334	movsd	-32 * SIZE(X), %xmm4
1335	movss	-32 * SIZE(Y), %xmm9
1336
1337	movss	%xmm9,  %xmm8
1338	pshufd	$0xb1,  %xmm4, %xmm12
1339	shufps	$0x93,  %xmm8, %xmm8
1340	mulps	%xmm8,  %xmm4
1341	addps	%xmm4,  %xmm0
1342	mulps	%xmm8,  %xmm12
1343	addps	%xmm12, %xmm1
1344	ALIGN_3
1345
1346.L49:
1347	shufps	$0xb1, %xmm1, %xmm1
1348	shufps	$0xb1, %xmm3, %xmm3
1349	jmp	.L98
1350	ALIGN_3
1351#endif
1352
1353.L50:
1354	testq	$SIZE, Y
1355	jne	.L70
1356
1357#ifdef ALIGNED_ACCESS
1358
1359	testq	$2 * SIZE, Y
1360	je	.L50x
1361
1362#ifdef movsd
1363	xorps	%xmm0, %xmm0
1364#endif
1365	movsd	-32 * SIZE(X), %xmm0
1366#ifdef movsd
1367	xorps	%xmm4, %xmm4
1368#endif
1369	movsd	-32 * SIZE(Y), %xmm4
1370
1371	pshufd	$0xb1,  %xmm0, %xmm1
1372	mulps	%xmm4,  %xmm0
1373	mulps	%xmm4,  %xmm1
1374	addq	$2 * SIZE, X
1375	addq	$2 * SIZE, Y
1376
1377	decq	N
1378	ALIGN_3
1379
1380.L50x:
1381	testq	$2 * SIZE, X
1382	jne	.L60
1383
1384	movaps	-33 * SIZE(X), %xmm8
1385	addq	$3 * SIZE, X
1386
1387	shufps	$0xb1, %xmm1, %xmm1
1388
1389	movq	N,  %rax
1390	sarq	$4, %rax
1391	jle	.L55
1392
1393	movaps	-32 * SIZE(Y), %xmm4
1394	movaps	-32 * SIZE(X), %xmm9
1395	movaps	-28 * SIZE(Y), %xmm5
1396	movaps	-28 * SIZE(X), %xmm10
1397	movaps	-24 * SIZE(Y), %xmm6
1398	movaps	-24 * SIZE(X), %xmm11
1399	movaps	-20 * SIZE(Y), %xmm7
1400
1401	decq	%rax
1402	jle	.L52
1403	ALIGN_3
1404
1405.L51:
1406#ifdef PREFETCH
1407	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
1408#endif
1409
1410	movss	%xmm9,  %xmm8
1411	pshufd	$0xb1,  %xmm4, %xmm12
1412	shufps	$0x39,  %xmm8, %xmm8
1413	mulps	%xmm8,  %xmm4
1414	addps	%xmm4,  %xmm0
1415	movaps	-16 * SIZE(Y), %xmm4
1416	mulps	%xmm8,  %xmm12
1417	movaps	-20 * SIZE(X), %xmm8
1418	addps	%xmm12, %xmm1
1419
1420	movss	%xmm10, %xmm9
1421	pshufd	$0xb1,  %xmm5, %xmm12
1422	shufps	$0x39,  %xmm9, %xmm9
1423	mulps	%xmm9,  %xmm5
1424	addps	%xmm5,  %xmm0
1425	movaps	-12 * SIZE(Y), %xmm5
1426	mulps	%xmm9,  %xmm12
1427	movaps	-16 * SIZE(X), %xmm9
1428	addps	%xmm12, %xmm1
1429
1430#ifdef PREFETCH
1431	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
1432#endif
1433
1434	movss	%xmm11, %xmm10
1435	pshufd	$0xb1,  %xmm6,  %xmm12
1436	shufps	$0x39,  %xmm10, %xmm10
1437	mulps	%xmm10, %xmm6
1438	addps	%xmm6,  %xmm0
1439	movaps	 -8 * SIZE(Y), %xmm6
1440	mulps	%xmm10, %xmm12
1441	movaps	-12 * SIZE(X), %xmm10
1442	addps	%xmm12, %xmm1
1443
1444	movss	%xmm8,  %xmm11
1445	pshufd	$0xb1,  %xmm7,  %xmm12
1446	shufps	$0x39,  %xmm11, %xmm11
1447	mulps	%xmm11, %xmm7
1448	addps	%xmm7,  %xmm0
1449	movaps	 -4 * SIZE(Y), %xmm7
1450	mulps	%xmm11, %xmm12
1451	movaps	 -8 * SIZE(X), %xmm11
1452	addps	%xmm12, %xmm1
1453
1454#if defined(PREFETCH) && !defined(FETCH128)
1455	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
1456#endif
1457
1458	movss	%xmm9,  %xmm8
1459	pshufd	$0xb1,  %xmm4, %xmm12
1460	shufps	$0x39,  %xmm8, %xmm8
1461	mulps	%xmm8,  %xmm4
1462	addps	%xmm4,  %xmm0
1463	movaps	  0 * SIZE(Y), %xmm4
1464	mulps	%xmm8,  %xmm12
1465	movaps	 -4 * SIZE(X), %xmm8
1466	addps	%xmm12, %xmm1
1467
1468	movss	%xmm10, %xmm9
1469	pshufd	$0xb1,  %xmm5, %xmm12
1470	shufps	$0x39,  %xmm9, %xmm9
1471	mulps	%xmm9,  %xmm5
1472	addps	%xmm5,  %xmm0
1473	movaps	  4 * SIZE(Y), %xmm5
1474	mulps	%xmm9,  %xmm12
1475	movaps	  0 * SIZE(X), %xmm9
1476	addps	%xmm12, %xmm1
1477
1478#if defined(PREFETCH) && !defined(FETCH128)
1479	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
1480#endif
1481
1482	movss	%xmm11, %xmm10
1483	pshufd	$0xb1,  %xmm6,  %xmm12
1484	shufps	$0x39,  %xmm10, %xmm10
1485	mulps	%xmm10, %xmm6
1486	addps	%xmm6,  %xmm0
1487	movaps	  8 * SIZE(Y), %xmm6
1488	mulps	%xmm10, %xmm12
1489	movaps	  4 * SIZE(X), %xmm10
1490	addps	%xmm12, %xmm1
1491
1492	movss	%xmm8,  %xmm11
1493	pshufd	$0xb1,  %xmm7,  %xmm12
1494	shufps	$0x39,  %xmm11, %xmm11
1495	mulps	%xmm11, %xmm7
1496	addps	%xmm7,  %xmm0
1497	movaps	 12 * SIZE(Y), %xmm7
1498	mulps	%xmm11, %xmm12
1499	movaps	  8 * SIZE(X), %xmm11
1500	addps	%xmm12, %xmm1
1501
1502	subq	$-32 * SIZE, X
1503	subq	$-32 * SIZE, Y
1504
1505	decq	%rax
1506	jg	.L51
1507	ALIGN_3
1508
1509.L52:
1510	movss	%xmm9,  %xmm8
1511	pshufd	$0xb1,  %xmm4, %xmm12
1512	shufps	$0x39,  %xmm8, %xmm8
1513	mulps	%xmm8,  %xmm4
1514	addps	%xmm4,  %xmm0
1515	movaps	-16 * SIZE(Y), %xmm4
1516	mulps	%xmm8,  %xmm12
1517	movaps	-20 * SIZE(X), %xmm8
1518	addps	%xmm12, %xmm1
1519
1520	movss	%xmm10, %xmm9
1521	pshufd	$0xb1,  %xmm5, %xmm12
1522	shufps	$0x39,  %xmm9, %xmm9
1523	mulps	%xmm9,  %xmm5
1524	addps	%xmm5,  %xmm0
1525	movaps	-12 * SIZE(Y), %xmm5
1526	mulps	%xmm9,  %xmm12
1527	movaps	-16 * SIZE(X), %xmm9
1528	addps	%xmm12, %xmm1
1529
1530	movss	%xmm11, %xmm10
1531	pshufd	$0xb1,  %xmm6,  %xmm12
1532	shufps	$0x39,  %xmm10, %xmm10
1533	mulps	%xmm10, %xmm6
1534	addps	%xmm6,  %xmm0
1535	movaps	 -8 * SIZE(Y), %xmm6
1536	mulps	%xmm10, %xmm12
1537	movaps	-12 * SIZE(X), %xmm10
1538	addps	%xmm12, %xmm1
1539
1540	movss	%xmm8,  %xmm11
1541	pshufd	$0xb1,  %xmm7,  %xmm12
1542	shufps	$0x39,  %xmm11, %xmm11
1543	mulps	%xmm11, %xmm7
1544	addps	%xmm7,  %xmm0
1545	movaps	 -4 * SIZE(Y), %xmm7
1546	mulps	%xmm11, %xmm12
1547	movaps	 -8 * SIZE(X), %xmm11
1548	addps	%xmm12, %xmm1
1549
1550	movss	%xmm9,  %xmm8
1551	pshufd	$0xb1,  %xmm4, %xmm12
1552	shufps	$0x39,  %xmm8, %xmm8
1553	mulps	%xmm8,  %xmm4
1554	addps	%xmm4,  %xmm0
1555	mulps	%xmm8,  %xmm12
1556	movaps	 -4 * SIZE(X), %xmm8
1557	addps	%xmm12, %xmm1
1558
1559	movss	%xmm10, %xmm9
1560	pshufd	$0xb1,  %xmm5, %xmm12
1561	shufps	$0x39,  %xmm9, %xmm9
1562	mulps	%xmm9,  %xmm5
1563	addps	%xmm5,  %xmm0
1564	mulps	%xmm9,  %xmm12
1565	addps	%xmm12, %xmm1
1566
1567	movss	%xmm11, %xmm10
1568	pshufd	$0xb1,  %xmm6,  %xmm12
1569	shufps	$0x39,  %xmm10, %xmm10
1570	mulps	%xmm10, %xmm6
1571	addps	%xmm6,  %xmm0
1572	mulps	%xmm10, %xmm12
1573	addps	%xmm12, %xmm1
1574
1575	movss	%xmm8,  %xmm11
1576	pshufd	$0xb1,  %xmm7,  %xmm12
1577	shufps	$0x39,  %xmm11, %xmm11
1578	mulps	%xmm11, %xmm7
1579	addps	%xmm7,  %xmm0
1580	mulps	%xmm11, %xmm12
1581	addps	%xmm12, %xmm1
1582
1583	subq	$-32 * SIZE, X
1584	subq	$-32 * SIZE, Y
1585	ALIGN_3
1586
1587.L55:
1588	testq	$8, N
1589	jle	.L56
1590
1591	movaps	-32 * SIZE(Y), %xmm4
1592	movaps	-32 * SIZE(X), %xmm9
1593	movaps	-28 * SIZE(Y), %xmm5
1594	movaps	-28 * SIZE(X), %xmm10
1595
1596	movss	%xmm9,  %xmm8
1597	pshufd	$0xb1,  %xmm4, %xmm12
1598	shufps	$0x39,  %xmm8, %xmm8
1599	mulps	%xmm8,  %xmm4
1600	addps	%xmm4,  %xmm0
1601	mulps	%xmm8,  %xmm12
1602	addps	%xmm12, %xmm1
1603
1604	movaps	-24 * SIZE(Y), %xmm6
1605	movaps	-24 * SIZE(X), %xmm11
1606
1607	movss	%xmm10, %xmm9
1608	pshufd	$0xb1,  %xmm5, %xmm12
1609	shufps	$0x39,  %xmm9, %xmm9
1610	mulps	%xmm9,  %xmm5
1611	addps	%xmm5,  %xmm0
1612	mulps	%xmm9,  %xmm12
1613	addps	%xmm12, %xmm1
1614
1615	movaps	-20 * SIZE(Y), %xmm7
1616	movaps	-20 * SIZE(X), %xmm8
1617
1618	movss	%xmm11, %xmm10
1619	pshufd	$0xb1,  %xmm6,  %xmm12
1620	shufps	$0x39,  %xmm10, %xmm10
1621	mulps	%xmm10, %xmm6
1622	addps	%xmm6,  %xmm0
1623	mulps	%xmm10, %xmm12
1624	addps	%xmm12, %xmm1
1625
1626	movss	%xmm8,  %xmm11
1627	pshufd	$0xb1,  %xmm7,  %xmm12
1628	shufps	$0x39,  %xmm11, %xmm11
1629	mulps	%xmm11, %xmm7
1630	addps	%xmm7,  %xmm0
1631	mulps	%xmm11, %xmm12
1632	addps	%xmm12, %xmm1
1633
1634	addq	$16 * SIZE, X
1635	addq	$16 * SIZE, Y
1636	ALIGN_3
1637
1638.L56:
1639	testq	$4, N
1640	jle	.L57
1641
1642	movaps	-32 * SIZE(Y), %xmm4
1643	movaps	-32 * SIZE(X), %xmm9
1644
1645	movss	%xmm9,  %xmm8
1646	pshufd	$0xb1,  %xmm4, %xmm12
1647	shufps	$0x39,  %xmm8, %xmm8
1648	mulps	%xmm8,  %xmm4
1649	addps	%xmm4,  %xmm0
1650	mulps	%xmm8,  %xmm12
1651	addps	%xmm12, %xmm1
1652
1653	movaps	-28 * SIZE(Y), %xmm5
1654	movaps	-28 * SIZE(X), %xmm10
1655
1656	movss	%xmm10, %xmm9
1657	pshufd	$0xb1,  %xmm5, %xmm12
1658	shufps	$0x39,  %xmm9, %xmm9
1659	mulps	%xmm9,  %xmm5
1660	addps	%xmm5,  %xmm0
1661	mulps	%xmm9,  %xmm12
1662	addps	%xmm12, %xmm1
1663
1664	movaps	%xmm10, %xmm8
1665
1666	addq	$8 * SIZE, X
1667	addq	$8 * SIZE, Y
1668	ALIGN_3
1669
1670.L57:
1671	testq	$2, N
1672	jle	.L58
1673
1674	movaps	-32 * SIZE(Y), %xmm4
1675	movaps	-32 * SIZE(X), %xmm9
1676
1677	movss	%xmm9,  %xmm8
1678	pshufd	$0xb1,  %xmm4, %xmm12
1679	shufps	$0x39,  %xmm8, %xmm8
1680	mulps	%xmm8,  %xmm4
1681	addps	%xmm4,  %xmm0
1682	mulps	%xmm8,  %xmm12
1683	addps	%xmm12, %xmm1
1684
1685	movaps	%xmm9, %xmm8
1686	addq	$4 * SIZE, X
1687	addq	$4 * SIZE, Y
1688	ALIGN_3
1689
1690.L58:
1691	testq	$1, N
1692	jle	.L98
1693
1694#ifdef movsd
1695	xorps	%xmm4, %xmm4
1696#endif
1697	movsd	-32 * SIZE(Y), %xmm4
1698
1699	pshufd	$0xb1,  %xmm4, %xmm12
1700	shufps	$0x39,  %xmm8, %xmm8
1701	mulps	%xmm8,  %xmm4
1702	addps	%xmm4,  %xmm0
1703	mulps	%xmm8,  %xmm12
1704	addps	%xmm12, %xmm1
1705	jmp	.L98
1706	ALIGN_3
1707
1708.L60:
1709	movaps	-35 * SIZE(X), %xmm8
1710	addq	$1 * SIZE, X
1711
1712	shufps	$0xb1, %xmm1, %xmm1
1713
1714	movq	N,  %rax
1715	sarq	$4, %rax
1716	jle	.L65
1717
1718	movaps	-32 * SIZE(Y), %xmm4
1719	movaps	-32 * SIZE(X), %xmm9
1720	movaps	-28 * SIZE(Y), %xmm5
1721	movaps	-28 * SIZE(X), %xmm10
1722	movaps	-24 * SIZE(Y), %xmm6
1723	movaps	-24 * SIZE(X), %xmm11
1724	movaps	-20 * SIZE(Y), %xmm7
1725
1726	decq	%rax
1727	jle	.L62
1728	ALIGN_3
1729
1730.L61:
1731#ifdef PREFETCH
1732	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
1733#endif
1734
1735	movss	%xmm9,  %xmm8
1736	pshufd	$0xb1,  %xmm4, %xmm12
1737	shufps	$0x93,  %xmm9, %xmm8
1738	mulps	%xmm8,  %xmm4
1739	addps	%xmm4,  %xmm0
1740	movaps	-16 * SIZE(Y), %xmm4
1741	mulps	%xmm8,  %xmm12
1742	movaps	-20 * SIZE(X), %xmm8
1743	addps	%xmm12, %xmm1
1744
1745	movss	%xmm10, %xmm9
1746	pshufd	$0xb1,  %xmm5, %xmm12
1747	shufps	$0x93,  %xmm10, %xmm9
1748	mulps	%xmm9,  %xmm5
1749	addps	%xmm5,  %xmm0
1750	movaps	-12 * SIZE(Y), %xmm5
1751	mulps	%xmm9,  %xmm12
1752	movaps	-16 * SIZE(X), %xmm9
1753	addps	%xmm12, %xmm1
1754
1755#ifdef PREFETCH
1756	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
1757#endif
1758
1759	movss	%xmm11, %xmm10
1760	pshufd	$0xb1,  %xmm6,  %xmm12
1761	shufps	$0x93,  %xmm11, %xmm10
1762	mulps	%xmm10, %xmm6
1763	addps	%xmm6,  %xmm0
1764	movaps	 -8 * SIZE(Y), %xmm6
1765	mulps	%xmm10, %xmm12
1766	movaps	-12 * SIZE(X), %xmm10
1767	addps	%xmm12, %xmm1
1768
1769	movss	%xmm8,  %xmm11
1770	pshufd	$0xb1,  %xmm7,  %xmm12
1771	shufps	$0x93,  %xmm8, %xmm11
1772	mulps	%xmm11, %xmm7
1773	addps	%xmm7,  %xmm0
1774	movaps	 -4 * SIZE(Y), %xmm7
1775	mulps	%xmm11, %xmm12
1776	movaps	 -8 * SIZE(X), %xmm11
1777	addps	%xmm12, %xmm1
1778
1779#if defined(PREFETCH) && !defined(FETCH128)
1780	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
1781#endif
1782
1783	movss	%xmm9,  %xmm8
1784	pshufd	$0xb1,  %xmm4, %xmm12
1785	shufps	$0x93,  %xmm9, %xmm8
1786	mulps	%xmm8,  %xmm4
1787	addps	%xmm4,  %xmm0
1788	movaps	  0 * SIZE(Y), %xmm4
1789	mulps	%xmm8,  %xmm12
1790	movaps	 -4 * SIZE(X), %xmm8
1791	addps	%xmm12, %xmm1
1792
1793	movss	%xmm10, %xmm9
1794	pshufd	$0xb1,  %xmm5, %xmm12
1795	shufps	$0x93,  %xmm10, %xmm9
1796	mulps	%xmm9,  %xmm5
1797	addps	%xmm5,  %xmm0
1798	movaps	  4 * SIZE(Y), %xmm5
1799	mulps	%xmm9,  %xmm12
1800	movaps	  0 * SIZE(X), %xmm9
1801	addps	%xmm12, %xmm1
1802
1803#if defined(PREFETCH) && !defined(FETCH128)
1804	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
1805#endif
1806
1807	movss	%xmm11, %xmm10
1808	pshufd	$0xb1,  %xmm6,  %xmm12
1809	shufps	$0x93,  %xmm11, %xmm10
1810	mulps	%xmm10, %xmm6
1811	addps	%xmm6,  %xmm0
1812	movaps	  8 * SIZE(Y), %xmm6
1813	mulps	%xmm10, %xmm12
1814	movaps	  4 * SIZE(X), %xmm10
1815	addps	%xmm12, %xmm1
1816
1817	movss	%xmm8,  %xmm11
1818	pshufd	$0xb1,  %xmm7,  %xmm12
1819	shufps	$0x93,  %xmm8, %xmm11
1820	mulps	%xmm11, %xmm7
1821	addps	%xmm7,  %xmm0
1822	movaps	 12 * SIZE(Y), %xmm7
1823	mulps	%xmm11, %xmm12
1824	movaps	  8 * SIZE(X), %xmm11
1825	addps	%xmm12, %xmm1
1826
1827	subq	$-32 * SIZE, X
1828	subq	$-32 * SIZE, Y
1829
1830	decq	%rax
1831	jg	.L61
1832	ALIGN_3
1833
1834.L62:
1835	movss	%xmm9,  %xmm8
1836	pshufd	$0xb1,  %xmm4, %xmm12
1837	shufps	$0x93,  %xmm9, %xmm8
1838	mulps	%xmm8,  %xmm4
1839	addps	%xmm4,  %xmm0
1840	movaps	-16 * SIZE(Y), %xmm4
1841	mulps	%xmm8,  %xmm12
1842	movaps	-20 * SIZE(X), %xmm8
1843	addps	%xmm12, %xmm1
1844
1845	movss	%xmm10, %xmm9
1846	pshufd	$0xb1,  %xmm5, %xmm12
1847	shufps	$0x93,  %xmm10, %xmm9
1848	mulps	%xmm9,  %xmm5
1849	addps	%xmm5,  %xmm0
1850	movaps	-12 * SIZE(Y), %xmm5
1851	mulps	%xmm9,  %xmm12
1852	movaps	-16 * SIZE(X), %xmm9
1853	addps	%xmm12, %xmm1
1854
1855	movss	%xmm11, %xmm10
1856	pshufd	$0xb1,  %xmm6,  %xmm12
1857	shufps	$0x93,  %xmm11, %xmm10
1858	mulps	%xmm10, %xmm6
1859	addps	%xmm6,  %xmm0
1860	movaps	 -8 * SIZE(Y), %xmm6
1861	mulps	%xmm10, %xmm12
1862	movaps	-12 * SIZE(X), %xmm10
1863	addps	%xmm12, %xmm1
1864
1865	movss	%xmm8,  %xmm11
1866	pshufd	$0xb1,  %xmm7,  %xmm12
1867	shufps	$0x93,  %xmm8, %xmm11
1868	mulps	%xmm11, %xmm7
1869	addps	%xmm7,  %xmm0
1870	movaps	 -4 * SIZE(Y), %xmm7
1871	mulps	%xmm11, %xmm12
1872	movaps	 -8 * SIZE(X), %xmm11
1873	addps	%xmm12, %xmm1
1874
1875	movss	%xmm9,  %xmm8
1876	pshufd	$0xb1,  %xmm4, %xmm12
1877	shufps	$0x93,  %xmm9, %xmm8
1878	mulps	%xmm8,  %xmm4
1879	addps	%xmm4,  %xmm0
1880	mulps	%xmm8,  %xmm12
1881	movaps	 -4 * SIZE(X), %xmm8
1882	addps	%xmm12, %xmm1
1883
1884	movss	%xmm10, %xmm9
1885	pshufd	$0xb1,  %xmm5, %xmm12
1886	shufps	$0x93,  %xmm10, %xmm9
1887	mulps	%xmm9,  %xmm5
1888	addps	%xmm5,  %xmm0
1889	mulps	%xmm9,  %xmm12
1890	addps	%xmm12, %xmm1
1891
1892	movss	%xmm11, %xmm10
1893	pshufd	$0xb1,  %xmm6,  %xmm12
1894	shufps	$0x93,  %xmm11, %xmm10
1895	mulps	%xmm10, %xmm6
1896	addps	%xmm6,  %xmm0
1897	mulps	%xmm10, %xmm12
1898	addps	%xmm12, %xmm1
1899
1900	movss	%xmm8,  %xmm11
1901	pshufd	$0xb1,  %xmm7,  %xmm12
1902	shufps	$0x93,  %xmm8, %xmm11
1903	mulps	%xmm11, %xmm7
1904	addps	%xmm7,  %xmm0
1905	mulps	%xmm11, %xmm12
1906	addps	%xmm12, %xmm1
1907
1908	subq	$-32 * SIZE, X
1909	subq	$-32 * SIZE, Y
1910	ALIGN_3
1911
1912.L65:
1913	testq	$8, N
1914	jle	.L66
1915
1916	movaps	-32 * SIZE(Y), %xmm4
1917	movaps	-32 * SIZE(X), %xmm9
1918	movaps	-28 * SIZE(Y), %xmm5
1919	movaps	-28 * SIZE(X), %xmm10
1920
1921	movss	%xmm9,  %xmm8
1922	pshufd	$0xb1,  %xmm4, %xmm12
1923	shufps	$0x93,  %xmm9, %xmm8
1924	mulps	%xmm8,  %xmm4
1925	addps	%xmm4,  %xmm0
1926	mulps	%xmm8,  %xmm12
1927	addps	%xmm12, %xmm1
1928
1929	movaps	-24 * SIZE(Y), %xmm6
1930	movaps	-24 * SIZE(X), %xmm11
1931
1932	movss	%xmm10, %xmm9
1933	pshufd	$0xb1,  %xmm5, %xmm12
1934	shufps	$0x93,  %xmm10, %xmm9
1935	mulps	%xmm9,  %xmm5
1936	addps	%xmm5,  %xmm0
1937	mulps	%xmm9,  %xmm12
1938	addps	%xmm12, %xmm1
1939
1940	movaps	-20 * SIZE(Y), %xmm7
1941	movaps	-20 * SIZE(X), %xmm8
1942
1943	movss	%xmm11, %xmm10
1944	pshufd	$0xb1,  %xmm6,  %xmm12
1945	shufps	$0x93,  %xmm11, %xmm10
1946	mulps	%xmm10, %xmm6
1947	addps	%xmm6,  %xmm0
1948	mulps	%xmm10, %xmm12
1949	addps	%xmm12, %xmm1
1950
1951	movss	%xmm8,  %xmm11
1952	pshufd	$0xb1,  %xmm7,  %xmm12
1953	shufps	$0x93,  %xmm8, %xmm11
1954	mulps	%xmm11, %xmm7
1955	addps	%xmm7,  %xmm0
1956	mulps	%xmm11, %xmm12
1957	addps	%xmm12, %xmm1
1958
1959	addq	$16 * SIZE, X
1960	addq	$16 * SIZE, Y
1961	ALIGN_3
1962
1963.L66:
1964	testq	$4, N
1965	jle	.L67
1966
1967	movaps	-32 * SIZE(Y), %xmm4
1968	movaps	-32 * SIZE(X), %xmm9
1969
1970	movss	%xmm9,  %xmm8
1971	pshufd	$0xb1,  %xmm4, %xmm12
1972	shufps	$0x93,  %xmm9, %xmm8
1973	mulps	%xmm8,  %xmm4
1974	addps	%xmm4,  %xmm0
1975	mulps	%xmm8,  %xmm12
1976	addps	%xmm12, %xmm1
1977
1978	movaps	-28 * SIZE(Y), %xmm5
1979	movaps	-28 * SIZE(X), %xmm10
1980
1981	movss	%xmm10, %xmm9
1982	pshufd	$0xb1,  %xmm5, %xmm12
1983	shufps	$0x93,  %xmm10, %xmm9
1984	mulps	%xmm9,  %xmm5
1985	addps	%xmm5,  %xmm0
1986	mulps	%xmm9,  %xmm12
1987	addps	%xmm12, %xmm1
1988
1989	movaps	%xmm10, %xmm8
1990
1991	addq	$8 * SIZE, X
1992	addq	$8 * SIZE, Y
1993	ALIGN_3
1994
1995.L67:
1996	testq	$2, N
1997	jle	.L68
1998
1999	movaps	-32 * SIZE(Y), %xmm4
2000	movaps	-32 * SIZE(X), %xmm9
2001
2002	movss	%xmm9,  %xmm8
2003	pshufd	$0xb1,  %xmm4, %xmm12
2004	shufps	$0x93,  %xmm9, %xmm8
2005	mulps	%xmm8,  %xmm4
2006	addps	%xmm4,  %xmm0
2007	mulps	%xmm8,  %xmm12
2008	addps	%xmm12, %xmm1
2009
2010	movaps	%xmm9, %xmm8
2011	addq	$4 * SIZE, X
2012	addq	$4 * SIZE, Y
2013	ALIGN_3
2014
2015.L68:
2016	testq	$1, N
2017	jle	.L98
2018
2019#ifdef movsd
2020	xorps	%xmm4, %xmm4
2021#endif
2022	movsd	-32 * SIZE(Y), %xmm4
2023	movss	-32 * SIZE(X), %xmm9
2024
2025	movss	%xmm9,  %xmm8
2026	pshufd	$0xb1,  %xmm4, %xmm12
2027	shufps	$0x93,  %xmm8, %xmm8
2028	mulps	%xmm8,  %xmm4
2029	addps	%xmm4,  %xmm0
2030	mulps	%xmm8,  %xmm12
2031	addps	%xmm12, %xmm1
2032	jmp	.L98
2033	ALIGN_3
2034
2035#else
2036
2037	testq	$2 * SIZE, Y
2038	je	.L50x
2039
2040#ifdef movsd
2041	xorps	%xmm0, %xmm0
2042#endif
2043	movsd	-32 * SIZE(Y), %xmm0
2044#ifdef movsd
2045	xorps	%xmm4, %xmm4
2046#endif
2047	movsd	-32 * SIZE(X), %xmm4
2048
2049	pshufd	$0xb1,  %xmm0, %xmm1
2050	mulps	%xmm4,  %xmm0
2051	mulps	%xmm4,  %xmm1
2052	addq	$2 * SIZE, X
2053	addq	$2 * SIZE, Y
2054
2055	decq	N
2056	ALIGN_3
2057
2058.L50x:
2059	movq	N,  %rax
2060	sarq	$4, %rax
2061	jle	.L55
2062
2063	movaps	-32 * SIZE(Y), %xmm4
2064	movlps	-32 * SIZE(X), %xmm8
2065	movhps	-30 * SIZE(X), %xmm8
2066	movaps	-28 * SIZE(Y), %xmm5
2067	movlps	-28 * SIZE(X), %xmm9
2068	movhps	-26 * SIZE(X), %xmm9
2069
2070	movaps	-24 * SIZE(Y), %xmm6
2071	movlps	-24 * SIZE(X), %xmm10
2072	movhps	-22 * SIZE(X), %xmm10
2073	movaps	-20 * SIZE(Y), %xmm7
2074	movlps	-20 * SIZE(X), %xmm11
2075	movhps	-18 * SIZE(X), %xmm11
2076
2077	decq	%rax
2078	jle	.L52
2079	ALIGN_3
2080
2081.L51:
2082#ifdef PREFETCH
2083	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
2084#endif
2085
2086#ifdef PREFETCH
2087	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
2088#endif
2089
2090#if defined(PREFETCH) && !defined(FETCH128)
2091	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
2092#endif
2093
2094#if defined(PREFETCH) && !defined(FETCH128)
2095	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
2096#endif
2097
2098	pshufd	$0xb1,  %xmm4, %xmm12
2099	mulps	%xmm8,  %xmm4
2100	addps	%xmm4,  %xmm0
2101	movaps	-16 * SIZE(Y), %xmm4
2102	mulps	%xmm8,  %xmm12
2103	movlps	-16 * SIZE(X), %xmm8
2104	movhps	-14 * SIZE(X), %xmm8
2105	addps	%xmm12, %xmm1
2106
2107	pshufd	$0xb1,  %xmm5, %xmm12
2108	mulps	%xmm9,  %xmm5
2109	addps	%xmm5,  %xmm0
2110	movaps	-12 * SIZE(Y), %xmm5
2111	mulps	%xmm9,  %xmm12
2112	movlps	-12 * SIZE(X), %xmm9
2113	movhps	-10 * SIZE(X), %xmm9
2114	addps	%xmm12, %xmm1
2115
2116	pshufd	$0xb1,  %xmm6,  %xmm12
2117	mulps	%xmm10, %xmm6
2118	addps	%xmm6,  %xmm0
2119	movaps	 -8 * SIZE(Y), %xmm6
2120	mulps	%xmm10, %xmm12
2121	movlps	 -8 * SIZE(X), %xmm10
2122	movhps	 -6 * SIZE(X), %xmm10
2123	addps	%xmm12, %xmm1
2124
2125	pshufd	$0xb1,  %xmm7,  %xmm12
2126	mulps	%xmm11, %xmm7
2127	addps	%xmm7,  %xmm0
2128	movaps	 -4 * SIZE(Y), %xmm7
2129	mulps	%xmm11, %xmm12
2130	movlps	 -4 * SIZE(X), %xmm11
2131	movhps	 -2 * SIZE(X), %xmm11
2132	addps	%xmm12, %xmm1
2133
2134	pshufd	$0xb1,  %xmm4, %xmm12
2135	mulps	%xmm8,  %xmm4
2136	addps	%xmm4,  %xmm0
2137	movaps	  0 * SIZE(Y), %xmm4
2138	mulps	%xmm8,  %xmm12
2139 	movlps	  0 * SIZE(X), %xmm8
2140	movhps	  2 * SIZE(X), %xmm8
2141	addps	%xmm12, %xmm1
2142
2143	pshufd	$0xb1,  %xmm5, %xmm12
2144	mulps	%xmm9,  %xmm5
2145	addps	%xmm5,  %xmm0
2146	movaps	  4 * SIZE(Y), %xmm5
2147	mulps	%xmm9,  %xmm12
2148	movlps	  4 * SIZE(X), %xmm9
2149	movhps	  6 * SIZE(X), %xmm9
2150	addps	%xmm12, %xmm1
2151
2152	pshufd	$0xb1,  %xmm6,  %xmm12
2153	mulps	%xmm10, %xmm6
2154	addps	%xmm6,  %xmm0
2155	movaps	  8 * SIZE(Y), %xmm6
2156	mulps	%xmm10, %xmm12
2157	movlps	  8 * SIZE(X), %xmm10
2158	movhps	 10 * SIZE(X), %xmm10
2159	addps	%xmm12, %xmm1
2160
2161	pshufd	$0xb1,  %xmm7,  %xmm12
2162	mulps	%xmm11, %xmm7
2163	addps	%xmm7,  %xmm0
2164	movaps	 12 * SIZE(Y), %xmm7
2165	mulps	%xmm11, %xmm12
2166	movlps	 12 * SIZE(X), %xmm11
2167	movhps	 14 * SIZE(X), %xmm11
2168	addps	%xmm12, %xmm1
2169
2170	subq	$-32 * SIZE, X
2171	subq	$-32 * SIZE, Y
2172
2173	decq	%rax
2174	jg	.L51
2175	ALIGN_3
2176
2177.L52:
2178	pshufd	$0xb1,  %xmm4, %xmm12
2179	mulps	%xmm8,  %xmm4
2180	addps	%xmm4,  %xmm0
2181	movaps	-16 * SIZE(Y), %xmm4
2182	mulps	%xmm8,  %xmm12
2183	movlps	-16 * SIZE(X), %xmm8
2184	movhps	-14 * SIZE(X), %xmm8
2185	addps	%xmm12, %xmm1
2186
2187	pshufd	$0xb1,  %xmm5, %xmm12
2188	mulps	%xmm9,  %xmm5
2189	addps	%xmm5,  %xmm0
2190	movaps	-12 * SIZE(Y), %xmm5
2191	mulps	%xmm9,  %xmm12
2192	movlps	-12 * SIZE(X), %xmm9
2193	movhps	-10 * SIZE(X), %xmm9
2194	addps	%xmm12, %xmm1
2195
2196	pshufd	$0xb1,  %xmm6,  %xmm12
2197	mulps	%xmm10, %xmm6
2198	addps	%xmm6,  %xmm0
2199	movaps	 -8 * SIZE(Y), %xmm6
2200	mulps	%xmm10, %xmm12
2201	movlps	 -8 * SIZE(X), %xmm10
2202	movhps	 -6 * SIZE(X), %xmm10
2203	addps	%xmm12, %xmm1
2204
2205	pshufd	$0xb1,  %xmm7,  %xmm12
2206	mulps	%xmm11, %xmm7
2207	addps	%xmm7,  %xmm0
2208	movaps	 -4 * SIZE(Y), %xmm7
2209	mulps	%xmm11, %xmm12
2210	movlps	 -4 * SIZE(X), %xmm11
2211	movhps	 -2 * SIZE(X), %xmm11
2212	addps	%xmm12, %xmm1
2213
2214	pshufd	$0xb1,  %xmm4, %xmm12
2215	mulps	%xmm8,  %xmm4
2216	addps	%xmm4,  %xmm0
2217	mulps	%xmm8,  %xmm12
2218	addps	%xmm12, %xmm1
2219
2220	pshufd	$0xb1,  %xmm5, %xmm12
2221	mulps	%xmm9,  %xmm5
2222	addps	%xmm5,  %xmm0
2223	mulps	%xmm9,  %xmm12
2224	addps	%xmm12, %xmm1
2225
2226	pshufd	$0xb1,  %xmm6,  %xmm12
2227	mulps	%xmm10, %xmm6
2228	addps	%xmm6,  %xmm0
2229	mulps	%xmm10, %xmm12
2230	addps	%xmm12, %xmm1
2231
2232	pshufd	$0xb1,  %xmm7,  %xmm12
2233	mulps	%xmm11, %xmm7
2234	addps	%xmm7,  %xmm0
2235	mulps	%xmm11, %xmm12
2236	addps	%xmm12, %xmm1
2237
2238	subq	$-32 * SIZE, X
2239	subq	$-32 * SIZE, Y
2240	ALIGN_3
2241
2242.L55:
2243	testq	$8, N
2244	jle	.L56
2245
2246	movaps	-32 * SIZE(Y), %xmm4
2247	movlps	-32 * SIZE(X), %xmm8
2248	movhps	-30 * SIZE(X), %xmm8
2249
2250	movaps	-28 * SIZE(Y), %xmm5
2251	movlps	-28 * SIZE(X), %xmm9
2252	movhps	-26 * SIZE(X), %xmm9
2253
2254	pshufd	$0xb1,  %xmm4, %xmm12
2255	mulps	%xmm8,  %xmm4
2256	addps	%xmm4,  %xmm0
2257	mulps	%xmm8,  %xmm12
2258	addps	%xmm12, %xmm1
2259
2260	movaps	-24 * SIZE(Y), %xmm6
2261	movlps	-24 * SIZE(X), %xmm10
2262	movhps	-22 * SIZE(X), %xmm10
2263
2264	pshufd	$0xb1,  %xmm5, %xmm12
2265	mulps	%xmm9,  %xmm5
2266	addps	%xmm5,  %xmm0
2267	mulps	%xmm9,  %xmm12
2268	addps	%xmm12, %xmm1
2269
2270	movaps	-20 * SIZE(Y), %xmm7
2271	movlps	-20 * SIZE(X), %xmm11
2272	movhps	-18 * SIZE(X), %xmm11
2273
2274	pshufd	$0xb1,  %xmm6,  %xmm12
2275	mulps	%xmm10, %xmm6
2276	addps	%xmm6,  %xmm0
2277	mulps	%xmm10, %xmm12
2278	addps	%xmm12, %xmm1
2279
2280	pshufd	$0xb1,  %xmm7,  %xmm12
2281	mulps	%xmm11, %xmm7
2282	addps	%xmm7,  %xmm0
2283	mulps	%xmm11, %xmm12
2284	addps	%xmm12, %xmm1
2285
2286	addq	$16 * SIZE, X
2287	addq	$16 * SIZE, Y
2288	ALIGN_3
2289
2290.L56:
2291	testq	$4, N
2292	jle	.L57
2293
2294	movaps	-32 * SIZE(Y), %xmm4
2295	movlps	-32 * SIZE(X), %xmm8
2296	movhps	-30 * SIZE(X), %xmm8
2297
2298	pshufd	$0xb1,  %xmm4, %xmm12
2299	mulps	%xmm8,  %xmm4
2300	addps	%xmm4,  %xmm0
2301	mulps	%xmm8,  %xmm12
2302	addps	%xmm12, %xmm1
2303
2304	movaps	-28 * SIZE(Y), %xmm5
2305	movlps	-28 * SIZE(X), %xmm9
2306	movhps	-26 * SIZE(X), %xmm9
2307
2308	pshufd	$0xb1,  %xmm5, %xmm12
2309	mulps	%xmm9,  %xmm5
2310	addps	%xmm5,  %xmm0
2311	mulps	%xmm9,  %xmm12
2312	addps	%xmm12, %xmm1
2313
2314	addq	$8 * SIZE, X
2315	addq	$8 * SIZE, Y
2316	ALIGN_3
2317
2318.L57:
2319	testq	$2, N
2320	jle	.L58
2321
2322	movaps	-32 * SIZE(Y), %xmm4
2323	movlps	-32 * SIZE(X), %xmm8
2324	movhps	-30 * SIZE(X), %xmm8
2325
2326	pshufd	$0xb1,  %xmm4, %xmm12
2327	mulps	%xmm8,  %xmm4
2328	addps	%xmm4,  %xmm0
2329	mulps	%xmm8,  %xmm12
2330	addps	%xmm12, %xmm1
2331
2332	movaps	%xmm9, %xmm8
2333	addq	$4 * SIZE, X
2334	addq	$4 * SIZE, Y
2335	ALIGN_3
2336
2337.L58:
2338	testq	$1, N
2339	jle	.L98
2340
2341#ifdef movsd
2342	xorps	%xmm4, %xmm4
2343#endif
2344	movsd	-32 * SIZE(Y), %xmm4
2345#ifdef movsd
2346	xorps	%xmm8, %xmm8
2347#endif
2348	movsd	-32 * SIZE(X), %xmm8
2349
2350	pshufd	$0xb1,  %xmm4, %xmm12
2351	mulps	%xmm8,  %xmm4
2352	addps	%xmm4,  %xmm0
2353	mulps	%xmm8,  %xmm12
2354	addps	%xmm12, %xmm1
2355	jmp	.L98
2356	ALIGN_3
2357#endif
2358
2359.L70:
2360	testq	$2 * SIZE, Y
2361	je	.L70x
2362
2363#ifdef movsd
2364	xorps	%xmm4, %xmm4
2365#endif
2366	movsd	-32 * SIZE(X), %xmm4
2367	addq	$2 * SIZE, X
2368#ifdef movsd
2369	xorps	%xmm1, %xmm1
2370#endif
2371	movsd	-32 * SIZE(Y), %xmm1
2372	addq	$2 * SIZE, Y
2373
2374	pshufd	$0xb1, %xmm1, %xmm0
2375	shufps	$0xb1, %xmm4, %xmm4
2376
2377	mulps	%xmm4,  %xmm0
2378	mulps	%xmm4,  %xmm1
2379	decq	N
2380	ALIGN_3
2381
2382.L70x:
2383	testq	$2 * SIZE, X
2384	jne	.L80
2385
2386	movaps	-33 * SIZE(X), %xmm4
2387	addq	$3 * SIZE, X
2388	movaps	-33 * SIZE(Y), %xmm8
2389	addq	$3 * SIZE, Y
2390
2391	movq	N,  %rax
2392	sarq	$4, %rax
2393	jle	.L75
2394
2395	movaps	-32 * SIZE(X), %xmm5
2396	movaps	-32 * SIZE(Y), %xmm9
2397	movaps	-28 * SIZE(X), %xmm6
2398	movaps	-28 * SIZE(Y), %xmm10
2399	movaps	-24 * SIZE(X), %xmm7
2400	movaps	-24 * SIZE(Y), %xmm11
2401
2402	decq	%rax
2403	jle	.L72
2404	ALIGN_3
2405
2406.L71:
2407#ifdef PREFETCH
2408	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
2409#endif
2410
2411	movss	%xmm9,  %xmm8
2412	pshufd	$0x1b,  %xmm8, %xmm12
2413	movss	%xmm5,  %xmm4
2414	mulps	%xmm4,  %xmm8
2415	addps	%xmm8,  %xmm0
2416	movaps	-20 * SIZE(Y), %xmm8
2417	mulps	%xmm4,  %xmm12
2418	movaps	-20 * SIZE(X), %xmm4
2419	addps	%xmm12, %xmm1
2420
2421	movss	%xmm10, %xmm9
2422	pshufd	$0x1b,  %xmm9, %xmm12
2423	movss	%xmm6,  %xmm5
2424	mulps	%xmm5,  %xmm9
2425	addps	%xmm9,  %xmm2
2426	movaps	-16 * SIZE(Y), %xmm9
2427	mulps	%xmm5,  %xmm12
2428	movaps	-16 * SIZE(X), %xmm5
2429	addps	%xmm12, %xmm3
2430
2431#ifdef PREFETCH
2432	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
2433#endif
2434
2435	movss	%xmm11, %xmm10
2436	pshufd	$0x1b,  %xmm10, %xmm12
2437	movss	%xmm7,  %xmm6
2438	mulps	%xmm6,  %xmm10
2439	addps	%xmm10, %xmm0
2440	movaps	-12 * SIZE(Y), %xmm10
2441	mulps	%xmm6,  %xmm12
2442	movaps	-12 * SIZE(X), %xmm6
2443	addps	%xmm12, %xmm1
2444
2445	movss	%xmm8,  %xmm11
2446	pshufd	$0x1b,  %xmm11, %xmm12
2447	movss	%xmm4,  %xmm7
2448	mulps	%xmm7,  %xmm11
2449	addps	%xmm11, %xmm2
2450	movaps	 -8 * SIZE(Y), %xmm11
2451	mulps	%xmm7,  %xmm12
2452	movaps	 -8 * SIZE(X), %xmm7
2453	addps	%xmm12, %xmm3
2454
2455#if defined(PREFETCH) && !defined(FETCH128)
2456	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
2457#endif
2458
2459	movss	%xmm9,  %xmm8
2460	pshufd	$0x1b,  %xmm8, %xmm12
2461	movss	%xmm5,  %xmm4
2462	mulps	%xmm4,  %xmm8
2463	addps	%xmm8,  %xmm0
2464 	movaps	 -4 * SIZE(Y), %xmm8
2465	mulps	%xmm4,  %xmm12
2466	movaps	 -4 * SIZE(X), %xmm4
2467	addps	%xmm12, %xmm1
2468
2469	movss	%xmm10, %xmm9
2470	pshufd	$0x1b,  %xmm9, %xmm12
2471	movss	%xmm6,  %xmm5
2472	mulps	%xmm5,  %xmm9
2473	addps	%xmm9,  %xmm2
2474	movaps	  0 * SIZE(Y), %xmm9
2475	mulps	%xmm5,  %xmm12
2476	movaps	  0 * SIZE(X), %xmm5
2477	addps	%xmm12, %xmm3
2478
2479#if defined(PREFETCH) && !defined(FETCH128)
2480	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
2481#endif
2482
2483	movss	%xmm11, %xmm10
2484	pshufd	$0x1b,  %xmm10, %xmm12
2485	movss	%xmm7,  %xmm6
2486	mulps	%xmm6,  %xmm10
2487	addps	%xmm10, %xmm0
2488	movaps	  4 * SIZE(Y), %xmm10
2489	mulps	%xmm6,  %xmm12
2490	movaps	  4 * SIZE(X), %xmm6
2491	addps	%xmm12, %xmm1
2492
2493	movss	%xmm8,  %xmm11
2494	pshufd	$0x1b,  %xmm11, %xmm12
2495	movss	%xmm4,  %xmm7
2496	mulps	%xmm7,  %xmm11
2497	addps	%xmm11, %xmm2
2498	movaps	  8 * SIZE(Y), %xmm11
2499	mulps	%xmm7,  %xmm12
2500	movaps	  8 * SIZE(X), %xmm7
2501	addps	%xmm12, %xmm3
2502
2503	subq	$-32 * SIZE, X
2504	subq	$-32 * SIZE, Y
2505
2506	decq	%rax
2507	jg	.L71
2508	ALIGN_3
2509
2510.L72:
2511	movss	%xmm9,  %xmm8
2512	pshufd	$0x1b,  %xmm8, %xmm12
2513	movss	%xmm5,  %xmm4
2514	mulps	%xmm4,  %xmm8
2515	addps	%xmm8,  %xmm0
2516	movaps	-20 * SIZE(Y), %xmm8
2517	mulps	%xmm4,  %xmm12
2518	movaps	-20 * SIZE(X), %xmm4
2519	addps	%xmm12, %xmm1
2520
2521	movss	%xmm10, %xmm9
2522	pshufd	$0x1b,  %xmm9, %xmm12
2523	movss	%xmm6,  %xmm5
2524	mulps	%xmm5,  %xmm9
2525	addps	%xmm9,  %xmm2
2526	movaps	-16 * SIZE(Y), %xmm9
2527	mulps	%xmm5,  %xmm12
2528	movaps	-16 * SIZE(X), %xmm5
2529	addps	%xmm12, %xmm3
2530
2531	movss	%xmm11, %xmm10
2532	pshufd	$0x1b,  %xmm10, %xmm12
2533	movss	%xmm7,  %xmm6
2534	mulps	%xmm6,  %xmm10
2535	addps	%xmm10, %xmm0
2536	movaps	-12 * SIZE(Y), %xmm10
2537	mulps	%xmm6,  %xmm12
2538	movaps	-12 * SIZE(X), %xmm6
2539	addps	%xmm12, %xmm1
2540
2541	movss	%xmm8,  %xmm11
2542	pshufd	$0x1b,  %xmm11, %xmm12
2543	movss	%xmm4,  %xmm7
2544	mulps	%xmm7,  %xmm11
2545	addps	%xmm11, %xmm2
2546	movaps	 -8 * SIZE(Y), %xmm11
2547	mulps	%xmm7,  %xmm12
2548	movaps	 -8 * SIZE(X), %xmm7
2549	addps	%xmm12, %xmm3
2550
2551	movss	%xmm9,  %xmm8
2552	pshufd	$0x1b,  %xmm8, %xmm12
2553	movss	%xmm5,  %xmm4
2554	mulps	%xmm4,  %xmm8
2555	addps	%xmm8,  %xmm0
2556 	movaps	 -4 * SIZE(Y), %xmm8
2557	mulps	%xmm4,  %xmm12
2558	movaps	 -4 * SIZE(X), %xmm4
2559	addps	%xmm12, %xmm1
2560
2561	movss	%xmm10, %xmm9
2562	pshufd	$0x1b,  %xmm9, %xmm12
2563	movss	%xmm6,  %xmm5
2564	mulps	%xmm5,  %xmm9
2565	addps	%xmm9,  %xmm2
2566	mulps	%xmm5,  %xmm12
2567	addps	%xmm12, %xmm3
2568
2569	movss	%xmm11, %xmm10
2570	pshufd	$0x1b,  %xmm10, %xmm12
2571	movss	%xmm7,  %xmm6
2572	mulps	%xmm6,  %xmm10
2573	addps	%xmm10, %xmm0
2574	mulps	%xmm6,  %xmm12
2575	addps	%xmm12, %xmm1
2576
2577	movss	%xmm8,  %xmm11
2578	pshufd	$0x1b,  %xmm11, %xmm12
2579	movss	%xmm4,  %xmm7
2580	mulps	%xmm7,  %xmm11
2581	addps	%xmm11, %xmm2
2582	mulps	%xmm7,  %xmm12
2583	addps	%xmm12, %xmm3
2584
2585	subq	$-32 * SIZE, X
2586	subq	$-32 * SIZE, Y
2587	ALIGN_3
2588
2589.L75:
2590	testq	$8, N
2591	jle	.L76
2592
2593	movaps	-32 * SIZE(X), %xmm5
2594	movaps	-32 * SIZE(Y), %xmm9
2595
2596	movss	%xmm9,  %xmm8
2597	pshufd	$0x1b,  %xmm8, %xmm12
2598	movss	%xmm5,  %xmm4
2599	mulps	%xmm4,  %xmm8
2600	addps	%xmm8,  %xmm0
2601	mulps	%xmm4,  %xmm12
2602	addps	%xmm12, %xmm1
2603
2604	movaps	-28 * SIZE(X), %xmm6
2605	movaps	-28 * SIZE(Y), %xmm10
2606
2607	movss	%xmm10, %xmm9
2608	pshufd	$0x1b,  %xmm9, %xmm12
2609	movss	%xmm6,  %xmm5
2610	mulps	%xmm5,  %xmm9
2611	addps	%xmm9,  %xmm2
2612	mulps	%xmm5,  %xmm12
2613	addps	%xmm12, %xmm3
2614
2615	movaps	-24 * SIZE(X), %xmm7
2616	movaps	-24 * SIZE(Y), %xmm11
2617
2618	movss	%xmm11, %xmm10
2619	pshufd	$0x1b,  %xmm10, %xmm12
2620	movss	%xmm7,  %xmm6
2621	mulps	%xmm6,  %xmm10
2622	addps	%xmm10, %xmm0
2623	mulps	%xmm6,  %xmm12
2624	addps	%xmm12, %xmm1
2625
2626	movaps	-20 * SIZE(X), %xmm4
2627	movaps	-20 * SIZE(Y), %xmm8
2628
2629	movss	%xmm8,  %xmm11
2630	pshufd	$0x1b,  %xmm11, %xmm12
2631	movss	%xmm4,  %xmm7
2632	mulps	%xmm7,  %xmm11
2633	addps	%xmm11, %xmm2
2634	mulps	%xmm7,  %xmm12
2635	addps	%xmm12, %xmm3
2636
2637	addq	$16 * SIZE, X
2638	addq	$16 * SIZE, Y
2639	ALIGN_3
2640
2641.L76:
2642	testq	$4, N
2643	jle	.L77
2644
2645	movaps	-32 * SIZE(X), %xmm5
2646	movaps	-32 * SIZE(Y), %xmm9
2647	movaps	-28 * SIZE(X), %xmm6
2648	movaps	-28 * SIZE(Y), %xmm10
2649
2650	movss	%xmm9,  %xmm8
2651	pshufd	$0x1b,  %xmm8, %xmm12
2652	movss	%xmm5,  %xmm4
2653	mulps	%xmm4,  %xmm8
2654	addps	%xmm8,  %xmm0
2655	mulps	%xmm4,  %xmm12
2656	addps	%xmm12, %xmm1
2657
2658	movss	%xmm10, %xmm9
2659	pshufd	$0x1b,  %xmm9, %xmm12
2660	movss	%xmm6,  %xmm5
2661	mulps	%xmm5,  %xmm9
2662	addps	%xmm9,  %xmm2
2663	mulps	%xmm5,  %xmm12
2664	addps	%xmm12, %xmm3
2665
2666	movaps	%xmm6, %xmm4
2667	movaps	%xmm10, %xmm8
2668
2669	addq	$8 * SIZE, X
2670	addq	$8 * SIZE, Y
2671	ALIGN_3
2672
2673.L77:
2674	testq	$2, N
2675	jle	.L78
2676
2677	movaps	-32 * SIZE(X), %xmm5
2678	movaps	-32 * SIZE(Y), %xmm9
2679
2680	movss	%xmm9,  %xmm8
2681	pshufd	$0x1b,  %xmm8, %xmm12
2682	movss	%xmm5,  %xmm4
2683	mulps	%xmm4,  %xmm8
2684	addps	%xmm8,  %xmm0
2685	mulps	%xmm4,  %xmm12
2686	addps	%xmm12, %xmm1
2687
2688	movaps	%xmm5, %xmm4
2689	movaps	%xmm9, %xmm8
2690	ALIGN_3
2691
2692.L78:
2693	testq	$1, N
2694	jle	.L79
2695
2696	xorps	%xmm5, %xmm5
2697	movss	%xmm5, %xmm4
2698	movss	%xmm5, %xmm8
2699
2700	shufps	$0x24, %xmm4, %xmm4
2701	pshufd	$0x18, %xmm8, %xmm12
2702	shufps	$0x24, %xmm8, %xmm8
2703
2704	mulps	%xmm4,  %xmm8
2705	addps	%xmm8,  %xmm0
2706	mulps	%xmm4,  %xmm12
2707	addps	%xmm12, %xmm1
2708	ALIGN_3
2709
2710.L79:
2711	shufps	$0x39, %xmm0, %xmm0
2712	shufps	$0x39, %xmm1, %xmm1
2713	shufps	$0x39, %xmm2, %xmm2
2714	shufps	$0x39, %xmm3, %xmm3
2715	jmp	.L98
2716	ALIGN_3
2717
2718.L80:
2719	movsd	-33 * SIZE(X), %xmm4
2720	movhps	-31 * SIZE(X), %xmm4
2721	addq	$3 * SIZE, X
2722	movaps	-33 * SIZE(Y), %xmm8
2723	addq	$3 * SIZE, Y
2724
2725	movq	N,  %rax
2726	sarq	$4, %rax
2727	jle	.L85
2728
2729	movsd	-32 * SIZE(X), %xmm5
2730	movhps	-30 * SIZE(X), %xmm5
2731	movaps	-32 * SIZE(Y), %xmm9
2732
2733	movsd	-28 * SIZE(X), %xmm6
2734	movhps	-26 * SIZE(X), %xmm6
2735	movaps	-28 * SIZE(Y), %xmm10
2736
2737	movsd	-24 * SIZE(X), %xmm7
2738	movhps	-22 * SIZE(X), %xmm7
2739	movaps	-24 * SIZE(Y), %xmm11
2740
2741	decq	%rax
2742	jle	.L82
2743	ALIGN_3
2744
2745.L81:
2746#ifdef PREFETCH
2747	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
2748#endif
2749
2750	movss	%xmm9,  %xmm8
2751	pshufd	$0x1b,  %xmm8, %xmm12
2752	movss	%xmm5,  %xmm4
2753	mulps	%xmm4,  %xmm8
2754	addps	%xmm8,  %xmm0
2755	movaps	-20 * SIZE(Y), %xmm8
2756	mulps	%xmm4,  %xmm12
2757	movsd	-20 * SIZE(X), %xmm4
2758	movhps	-18 * SIZE(X), %xmm4
2759	addps	%xmm12, %xmm1
2760
2761	movss	%xmm10, %xmm9
2762	pshufd	$0x1b,  %xmm9, %xmm12
2763	movss	%xmm6,  %xmm5
2764	mulps	%xmm5,  %xmm9
2765	addps	%xmm9,  %xmm2
2766	movaps	-16 * SIZE(Y), %xmm9
2767	mulps	%xmm5,  %xmm12
2768	movsd	-16 * SIZE(X), %xmm5
2769	movhps	-14 * SIZE(X), %xmm5
2770	addps	%xmm12, %xmm3
2771
2772#ifdef PREFETCH
2773	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
2774#endif
2775
2776	movss	%xmm11, %xmm10
2777	pshufd	$0x1b,  %xmm10, %xmm12
2778	movss	%xmm7,  %xmm6
2779	mulps	%xmm6,  %xmm10
2780	addps	%xmm10, %xmm0
2781	movaps	-12 * SIZE(Y), %xmm10
2782	mulps	%xmm6,  %xmm12
2783	movsd	-12 * SIZE(X), %xmm6
2784	movhps	-10 * SIZE(X), %xmm6
2785	addps	%xmm12, %xmm1
2786
2787	movss	%xmm8,  %xmm11
2788	pshufd	$0x1b,  %xmm11, %xmm12
2789	movss	%xmm4,  %xmm7
2790	mulps	%xmm7,  %xmm11
2791	addps	%xmm11, %xmm2
2792	movaps	 -8 * SIZE(Y), %xmm11
2793	mulps	%xmm7,  %xmm12
2794	movsd	 -8 * SIZE(X), %xmm7
2795	movhps	 -6 * SIZE(X), %xmm7
2796	addps	%xmm12, %xmm3
2797
2798#if defined(PREFETCH) && !defined(FETCH128)
2799	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
2800#endif
2801
2802	movss	%xmm9,  %xmm8
2803	pshufd	$0x1b,  %xmm8, %xmm12
2804	movss	%xmm5,  %xmm4
2805	mulps	%xmm4,  %xmm8
2806	addps	%xmm8,  %xmm0
2807 	movaps	 -4 * SIZE(Y), %xmm8
2808	mulps	%xmm4,  %xmm12
2809	movsd	 -4 * SIZE(X), %xmm4
2810	movhps	 -2 * SIZE(X), %xmm4
2811	addps	%xmm12, %xmm1
2812
2813	movss	%xmm10, %xmm9
2814	pshufd	$0x1b,  %xmm9, %xmm12
2815	movss	%xmm6,  %xmm5
2816	mulps	%xmm5,  %xmm9
2817	addps	%xmm9,  %xmm2
2818	movaps	  0 * SIZE(Y), %xmm9
2819	mulps	%xmm5,  %xmm12
2820	movsd	  0 * SIZE(X), %xmm5
2821	movhps	  2 * SIZE(X), %xmm5
2822	addps	%xmm12, %xmm3
2823
2824#if defined(PREFETCH) && !defined(FETCH128)
2825	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(Y)
2826#endif
2827
2828	movss	%xmm11, %xmm10
2829	pshufd	$0x1b,  %xmm10, %xmm12
2830	movss	%xmm7,  %xmm6
2831	mulps	%xmm6,  %xmm10
2832	addps	%xmm10, %xmm0
2833	movaps	  4 * SIZE(Y), %xmm10
2834	mulps	%xmm6,  %xmm12
2835	movsd	  4 * SIZE(X), %xmm6
2836	movhps	  6 * SIZE(X), %xmm6
2837	addps	%xmm12, %xmm1
2838
2839	movss	%xmm8,  %xmm11
2840	pshufd	$0x1b,  %xmm11, %xmm12
2841	movss	%xmm4,  %xmm7
2842	mulps	%xmm7,  %xmm11
2843	addps	%xmm11, %xmm2
2844	movaps	  8 * SIZE(Y), %xmm11
2845	mulps	%xmm7,  %xmm12
2846	movsd	  8 * SIZE(X), %xmm7
2847	movhps	 10 * SIZE(X), %xmm7
2848	addps	%xmm12, %xmm3
2849
2850	subq	$-32 * SIZE, X
2851	subq	$-32 * SIZE, Y
2852
2853	decq	%rax
2854	jg	.L81
2855	ALIGN_3
2856
2857.L82:
2858	movss	%xmm9,  %xmm8
2859	pshufd	$0x1b,  %xmm8, %xmm12
2860	movss	%xmm5,  %xmm4
2861	mulps	%xmm4,  %xmm8
2862	addps	%xmm8,  %xmm0
2863	movaps	-20 * SIZE(Y), %xmm8
2864	mulps	%xmm4,  %xmm12
2865	movsd	-20 * SIZE(X), %xmm4
2866	movhps	-18 * SIZE(X), %xmm4
2867	addps	%xmm12, %xmm1
2868
2869	movss	%xmm10, %xmm9
2870	pshufd	$0x1b,  %xmm9, %xmm12
2871	movss	%xmm6,  %xmm5
2872	mulps	%xmm5,  %xmm9
2873	addps	%xmm9,  %xmm2
2874	movaps	-16 * SIZE(Y), %xmm9
2875	mulps	%xmm5,  %xmm12
2876	movsd	-16 * SIZE(X), %xmm5
2877	movhps	-14 * SIZE(X), %xmm5
2878	addps	%xmm12, %xmm3
2879
2880	movss	%xmm11, %xmm10
2881	pshufd	$0x1b,  %xmm10, %xmm12
2882	movss	%xmm7,  %xmm6
2883	mulps	%xmm6,  %xmm10
2884	addps	%xmm10, %xmm0
2885	movaps	-12 * SIZE(Y), %xmm10
2886	mulps	%xmm6,  %xmm12
2887	movsd	-12 * SIZE(X), %xmm6
2888	movhps	-10 * SIZE(X), %xmm6
2889	addps	%xmm12, %xmm1
2890
2891	movss	%xmm8,  %xmm11
2892	pshufd	$0x1b,  %xmm11, %xmm12
2893	movss	%xmm4,  %xmm7
2894	mulps	%xmm7,  %xmm11
2895	addps	%xmm11, %xmm2
2896	movaps	 -8 * SIZE(Y), %xmm11
2897	mulps	%xmm7,  %xmm12
2898	movsd	 -8 * SIZE(X), %xmm7
2899	movhps	 -6 * SIZE(X), %xmm7
2900	addps	%xmm12, %xmm3
2901
2902	movss	%xmm9,  %xmm8
2903	pshufd	$0x1b,  %xmm8, %xmm12
2904	movss	%xmm5,  %xmm4
2905	mulps	%xmm4,  %xmm8
2906	addps	%xmm8,  %xmm0
2907 	movaps	 -4 * SIZE(Y), %xmm8
2908	mulps	%xmm4,  %xmm12
2909	movsd	 -4 * SIZE(X), %xmm4
2910	movhps	 -2 * SIZE(X), %xmm4
2911	addps	%xmm12, %xmm1
2912
2913	movss	%xmm10, %xmm9
2914	pshufd	$0x1b,  %xmm9, %xmm12
2915	movss	%xmm6,  %xmm5
2916	mulps	%xmm5,  %xmm9
2917	addps	%xmm9,  %xmm2
2918	mulps	%xmm5,  %xmm12
2919	addps	%xmm12, %xmm3
2920
2921	movss	%xmm11, %xmm10
2922	pshufd	$0x1b,  %xmm10, %xmm12
2923	movss	%xmm7,  %xmm6
2924	mulps	%xmm6,  %xmm10
2925	addps	%xmm10, %xmm0
2926	mulps	%xmm6,  %xmm12
2927	addps	%xmm12, %xmm1
2928
2929	movss	%xmm8,  %xmm11
2930	pshufd	$0x1b,  %xmm11, %xmm12
2931	movss	%xmm4,  %xmm7
2932	mulps	%xmm7,  %xmm11
2933	addps	%xmm11, %xmm2
2934	mulps	%xmm7,  %xmm12
2935	addps	%xmm12, %xmm3
2936
2937	subq	$-32 * SIZE, X
2938	subq	$-32 * SIZE, Y
2939	ALIGN_3
2940
2941.L85:
2942	testq	$8, N
2943	jle	.L86
2944
2945	movsd	-32 * SIZE(X), %xmm5
2946	movhps	-30 * SIZE(X), %xmm5
2947	movaps	-32 * SIZE(Y), %xmm9
2948
2949	movss	%xmm9,  %xmm8
2950	pshufd	$0x1b,  %xmm8, %xmm12
2951	movss	%xmm5,  %xmm4
2952	mulps	%xmm4,  %xmm8
2953	addps	%xmm8,  %xmm0
2954	mulps	%xmm4,  %xmm12
2955	addps	%xmm12, %xmm1
2956
2957	movsd	-28 * SIZE(X), %xmm6
2958	movhps	-26 * SIZE(X), %xmm6
2959	movaps	-28 * SIZE(Y), %xmm10
2960
2961	movss	%xmm10, %xmm9
2962	pshufd	$0x1b,  %xmm9, %xmm12
2963	movss	%xmm6,  %xmm5
2964	mulps	%xmm5,  %xmm9
2965	addps	%xmm9,  %xmm2
2966	mulps	%xmm5,  %xmm12
2967	addps	%xmm12, %xmm3
2968
2969	movsd	-24 * SIZE(X), %xmm7
2970	movhps	-22 * SIZE(X), %xmm7
2971	movaps	-24 * SIZE(Y), %xmm11
2972
2973	movss	%xmm11, %xmm10
2974	pshufd	$0x1b,  %xmm10, %xmm12
2975	movss	%xmm7,  %xmm6
2976	mulps	%xmm6,  %xmm10
2977	addps	%xmm10, %xmm0
2978	mulps	%xmm6,  %xmm12
2979	addps	%xmm12, %xmm1
2980
2981	movsd	-20 * SIZE(X), %xmm4
2982	movhps	-18 * SIZE(X), %xmm4
2983	movaps	-20 * SIZE(Y), %xmm8
2984
2985	movss	%xmm8,  %xmm11
2986	pshufd	$0x1b,  %xmm11, %xmm12
2987	movss	%xmm4,  %xmm7
2988	mulps	%xmm7,  %xmm11
2989	addps	%xmm11, %xmm2
2990	mulps	%xmm7,  %xmm12
2991	addps	%xmm12, %xmm3
2992
2993	addq	$16 * SIZE, X
2994	addq	$16 * SIZE, Y
2995	ALIGN_3
2996
2997.L86:
2998	testq	$4, N
2999	jle	.L87
3000
3001	movsd	-32 * SIZE(X), %xmm5
3002	movhps	-30 * SIZE(X), %xmm5
3003	movaps	-32 * SIZE(Y), %xmm9
3004
3005	movss	%xmm9,  %xmm8
3006	pshufd	$0x1b,  %xmm8, %xmm12
3007	movss	%xmm5,  %xmm4
3008	mulps	%xmm4,  %xmm8
3009	addps	%xmm8,  %xmm0
3010	mulps	%xmm4,  %xmm12
3011	addps	%xmm12, %xmm1
3012
3013	movsd	-28 * SIZE(X), %xmm6
3014	movhps	-26 * SIZE(X), %xmm6
3015	movaps	-28 * SIZE(Y), %xmm10
3016
3017	movss	%xmm10, %xmm9
3018	pshufd	$0x1b,  %xmm9, %xmm12
3019	movss	%xmm6,  %xmm5
3020	mulps	%xmm5,  %xmm9
3021	addps	%xmm9,  %xmm2
3022	mulps	%xmm5,  %xmm12
3023	addps	%xmm12, %xmm3
3024
3025	movaps	%xmm6, %xmm4
3026	movaps	%xmm10, %xmm8
3027
3028	addq	$8 * SIZE, X
3029	addq	$8 * SIZE, Y
3030	ALIGN_3
3031
3032.L87:
3033	testq	$2, N
3034	jle	.L88
3035
3036	movsd	-32 * SIZE(X), %xmm5
3037	movhps	-30 * SIZE(X), %xmm5
3038	movaps	-32 * SIZE(Y), %xmm9
3039
3040	movss	%xmm9,  %xmm8
3041	pshufd	$0x1b,  %xmm8, %xmm12
3042	movss	%xmm5,  %xmm4
3043	mulps	%xmm4,  %xmm8
3044	addps	%xmm8,  %xmm0
3045	mulps	%xmm4,  %xmm12
3046	addps	%xmm12, %xmm1
3047
3048	movaps	%xmm5, %xmm4
3049	movaps	%xmm9, %xmm8
3050	ALIGN_3
3051
3052.L88:
3053	testq	$1, N
3054	jle	.L89
3055
3056	xorps	%xmm5, %xmm5
3057	movss	%xmm5, %xmm4
3058	movss	%xmm5, %xmm8
3059
3060	shufps	$0x24, %xmm4, %xmm4
3061	pshufd	$0x18, %xmm8, %xmm12
3062	shufps	$0x24, %xmm8, %xmm8
3063
3064	mulps	%xmm4,  %xmm8
3065	addps	%xmm8,  %xmm0
3066	mulps	%xmm4,  %xmm12
3067	addps	%xmm12, %xmm1
3068	ALIGN_3
3069
3070.L89:
3071	shufps	$0x39, %xmm0, %xmm0
3072	shufps	$0x39, %xmm1, %xmm1
3073	shufps	$0x39, %xmm2, %xmm2
3074	shufps	$0x39, %xmm3, %xmm3
3075	jmp	.L98
3076	ALIGN_3
3077
3078.L200:
3079	movq	N,  %rax
3080	sarq	$4, %rax
3081	jle	.L205
3082
3083	movsd	(X), %xmm4
3084	addq	INCX, X
3085	movhps	(X), %xmm4
3086	addq	INCX, X
3087	movsd	(Y), %xmm8
3088	addq	INCY, Y
3089	movhps	(Y), %xmm8
3090	addq	INCY, Y
3091
3092	movsd	(X), %xmm5
3093	addq	INCX, X
3094	movhps	(X), %xmm5
3095	addq	INCX, X
3096	movsd	(Y), %xmm9
3097	addq	INCY, Y
3098	movhps	(Y), %xmm9
3099	addq	INCY, Y
3100
3101	movsd	(X), %xmm6
3102	addq	INCX, X
3103	movhps	(X), %xmm6
3104	addq	INCX, X
3105	movsd	(Y), %xmm10
3106	addq	INCY, Y
3107	movhps	(Y), %xmm10
3108	addq	INCY, Y
3109
3110	movsd	(X), %xmm7
3111	addq	INCX, X
3112	movhps	(X), %xmm7
3113	addq	INCX, X
3114	movsd	(Y), %xmm11
3115	addq	INCY, Y
3116	movhps	(Y), %xmm11
3117	addq	INCY, Y
3118
3119	decq	%rax
3120	jle	.L204
3121	ALIGN_3
3122
3123.L203:
3124	pshufd	$0xb1,  %xmm8, %xmm12
3125	mulps	%xmm4,  %xmm8
3126	addps	%xmm8,  %xmm0
3127	movsd	(Y), %xmm8
3128	addq	INCY, Y
3129	movhps	(Y), %xmm8
3130	addq	INCY, Y
3131	mulps	%xmm4,  %xmm12
3132	movsd	(X), %xmm4
3133	addq	INCX, X
3134	movhps	(X), %xmm4
3135	addq	INCX, X
3136	addps	%xmm12, %xmm1
3137
3138	pshufd	$0xb1,  %xmm9, %xmm12
3139	mulps	%xmm5,  %xmm9
3140	addps	%xmm9,  %xmm2
3141	movsd	(Y), %xmm9
3142	addq	INCY, Y
3143	movhps	(Y), %xmm9
3144	addq	INCY, Y
3145	mulps	%xmm5,  %xmm12
3146	movsd	(X), %xmm5
3147	addq	INCX, X
3148	movhps	(X), %xmm5
3149	addq	INCX, X
3150	addps	%xmm12, %xmm3
3151
3152	pshufd	$0xb1,  %xmm10, %xmm12
3153	mulps	%xmm6,  %xmm10
3154	addps	%xmm10, %xmm0
3155	movsd	(Y), %xmm10
3156	addq	INCY, Y
3157	movhps	(Y), %xmm10
3158	addq	INCY, Y
3159	mulps	%xmm6,  %xmm12
3160	movsd	(X), %xmm6
3161	addq	INCX, X
3162	movhps	(X), %xmm6
3163	addq	INCX, X
3164	addps	%xmm12, %xmm1
3165
3166	pshufd	$0xb1,  %xmm11, %xmm12
3167	mulps	%xmm7,  %xmm11
3168	addps	%xmm11, %xmm2
3169	movsd	(Y), %xmm11
3170	addq	INCY, Y
3171	movhps	(Y), %xmm11
3172	addq	INCY, Y
3173	mulps	%xmm7,  %xmm12
3174	movsd	(X), %xmm7
3175	addq	INCX, X
3176	movhps	(X), %xmm7
3177	addq	INCX, X
3178	addps	%xmm12, %xmm3
3179
3180	pshufd	$0xb1,  %xmm8, %xmm12
3181	mulps	%xmm4,  %xmm8
3182	addps	%xmm8,  %xmm0
3183	movsd	(Y), %xmm8
3184	addq	INCY, Y
3185	movhps	(Y), %xmm8
3186	addq	INCY, Y
3187	mulps	%xmm4,  %xmm12
3188	movsd	(X), %xmm4
3189	addq	INCX, X
3190	movhps	(X), %xmm4
3191	addq	INCX, X
3192	addps	%xmm12, %xmm1
3193
3194	pshufd	$0xb1,  %xmm9, %xmm12
3195	mulps	%xmm5,  %xmm9
3196	addps	%xmm9,  %xmm2
3197	movsd	(Y), %xmm9
3198	addq	INCY, Y
3199	movhps	(Y), %xmm9
3200	addq	INCY, Y
3201	mulps	%xmm5,  %xmm12
3202	movsd	(X), %xmm5
3203	addq	INCX, X
3204	movhps	(X), %xmm5
3205	addq	INCX, X
3206	addps	%xmm12, %xmm3
3207
3208	pshufd	$0xb1,  %xmm10, %xmm12
3209	mulps	%xmm6,  %xmm10
3210	addps	%xmm10, %xmm0
3211	movsd	(Y), %xmm10
3212	addq	INCY, Y
3213	movhps	(Y), %xmm10
3214	addq	INCY, Y
3215	mulps	%xmm6,  %xmm12
3216	movsd	(X), %xmm6
3217	addq	INCX, X
3218	movhps	(X), %xmm6
3219	addq	INCX, X
3220	addps	%xmm12, %xmm1
3221
3222	pshufd	$0xb1,  %xmm11, %xmm12
3223	mulps	%xmm7,  %xmm11
3224	addps	%xmm11, %xmm2
3225	movsd	(Y), %xmm11
3226	addq	INCY, Y
3227	movhps	(Y), %xmm11
3228	addq	INCY, Y
3229
3230	mulps	%xmm7,  %xmm12
3231	movsd	(X), %xmm7
3232	addq	INCX, X
3233	movhps	(X), %xmm7
3234	addq	INCX, X
3235	addps	%xmm12, %xmm3
3236
3237	decq	%rax
3238	jg	.L203
3239	ALIGN_3
3240
3241.L204:
3242	pshufd	$0xb1,  %xmm8, %xmm12
3243	mulps	%xmm4,  %xmm8
3244	addps	%xmm8,  %xmm0
3245	movsd	(Y), %xmm8
3246	addq	INCY, Y
3247	movhps	(Y), %xmm8
3248	addq	INCY, Y
3249	mulps	%xmm4,  %xmm12
3250	movsd	(X), %xmm4
3251	addq	INCX, X
3252	movhps	(X), %xmm4
3253	addq	INCX, X
3254	addps	%xmm12, %xmm1
3255
3256	pshufd	$0xb1,  %xmm9, %xmm12
3257	mulps	%xmm5,  %xmm9
3258	addps	%xmm9,  %xmm2
3259	movsd	(Y), %xmm9
3260	addq	INCY, Y
3261	movhps	(Y), %xmm9
3262	addq	INCY, Y
3263	mulps	%xmm5,  %xmm12
3264	movsd	(X), %xmm5
3265	addq	INCX, X
3266	movhps	(X), %xmm5
3267	addq	INCX, X
3268	addps	%xmm12, %xmm3
3269
3270	pshufd	$0xb1,  %xmm10, %xmm12
3271	mulps	%xmm6,  %xmm10
3272	addps	%xmm10, %xmm0
3273	movsd	(Y), %xmm10
3274	addq	INCY, Y
3275	movhps	(Y), %xmm10
3276	addq	INCY, Y
3277	mulps	%xmm6,  %xmm12
3278	movsd	(X), %xmm6
3279	addq	INCX, X
3280	movhps	(X), %xmm6
3281	addq	INCX, X
3282	addps	%xmm12, %xmm1
3283
3284	pshufd	$0xb1,  %xmm11, %xmm12
3285	mulps	%xmm7,  %xmm11
3286	addps	%xmm11, %xmm2
3287	movsd	(Y), %xmm11
3288	addq	INCY, Y
3289	movhps	(Y), %xmm11
3290	addq	INCY, Y
3291	mulps	%xmm7,  %xmm12
3292	movsd	(X), %xmm7
3293	addq	INCX, X
3294	movhps	(X), %xmm7
3295	addq	INCX, X
3296	addps	%xmm12, %xmm3
3297
3298	pshufd	$0xb1,  %xmm8, %xmm12
3299	mulps	%xmm4,  %xmm8
3300	addps	%xmm8,  %xmm0
3301	mulps	%xmm4,  %xmm12
3302	addps	%xmm12, %xmm1
3303
3304	pshufd	$0xb1,  %xmm9, %xmm12
3305	mulps	%xmm5,  %xmm9
3306	addps	%xmm9,  %xmm2
3307	mulps	%xmm5,  %xmm12
3308	addps	%xmm12, %xmm3
3309
3310	pshufd	$0xb1,  %xmm10, %xmm12
3311	mulps	%xmm6,  %xmm10
3312	addps	%xmm10, %xmm0
3313	mulps	%xmm6,  %xmm12
3314	addps	%xmm12, %xmm1
3315
3316	pshufd	$0xb1,  %xmm11, %xmm12
3317	mulps	%xmm7,  %xmm11
3318	addps	%xmm11, %xmm2
3319	mulps	%xmm7,  %xmm12
3320	addps	%xmm12, %xmm3
3321	ALIGN_3
3322
3323.L205:
3324	testq	$8, N
3325	jle	.L206
3326
3327	movsd	(X), %xmm4
3328	addq	INCX, X
3329	movhps	(X), %xmm4
3330	addq	INCX, X
3331	movsd	(Y), %xmm8
3332	addq	INCY, Y
3333	movhps	(Y), %xmm8
3334	addq	INCY, Y
3335
3336	pshufd	$0xb1,  %xmm8, %xmm12
3337	mulps	%xmm4,  %xmm8
3338	addps	%xmm8,  %xmm0
3339	mulps	%xmm4,  %xmm12
3340	addps	%xmm12, %xmm1
3341
3342	movsd	(X), %xmm5
3343	addq	INCX, X
3344	movhps	(X), %xmm5
3345	addq	INCX, X
3346	movsd	(Y), %xmm9
3347	addq	INCY, Y
3348	movhps	(Y), %xmm9
3349	addq	INCY, Y
3350
3351	pshufd	$0xb1,  %xmm9, %xmm12
3352	mulps	%xmm5,  %xmm9
3353	addps	%xmm9,  %xmm2
3354	mulps	%xmm5,  %xmm12
3355	addps	%xmm12, %xmm3
3356
3357	movsd	(X), %xmm6
3358	addq	INCX, X
3359	movhps	(X), %xmm6
3360	addq	INCX, X
3361	movsd	(Y), %xmm10
3362	addq	INCY, Y
3363	movhps	(Y), %xmm10
3364	addq	INCY, Y
3365
3366	pshufd	$0xb1,  %xmm10, %xmm12
3367	mulps	%xmm6,  %xmm10
3368	addps	%xmm10, %xmm0
3369	mulps	%xmm6,  %xmm12
3370	addps	%xmm12, %xmm1
3371
3372	movsd	(X), %xmm7
3373	addq	INCX, X
3374	movhps	(X), %xmm7
3375	addq	INCX, X
3376	movsd	(Y), %xmm11
3377	addq	INCY, Y
3378	movhps	(Y), %xmm11
3379	addq	INCY, Y
3380
3381	pshufd	$0xb1,  %xmm11, %xmm12
3382	mulps	%xmm7,  %xmm11
3383	addps	%xmm11, %xmm2
3384	mulps	%xmm7,  %xmm12
3385	addps	%xmm12, %xmm3
3386	ALIGN_3
3387
3388.L206:
3389	testq	$4, N
3390	jle	.L207
3391
3392	movsd	(X), %xmm4
3393	addq	INCX, X
3394	movhps	(X), %xmm4
3395	addq	INCX, X
3396	movsd	(Y), %xmm8
3397	addq	INCY, Y
3398	movhps	(Y), %xmm8
3399	addq	INCY, Y
3400
3401	pshufd	$0xb1,  %xmm8, %xmm12
3402	mulps	%xmm4,  %xmm8
3403	addps	%xmm8,  %xmm0
3404	mulps	%xmm4,  %xmm12
3405	addps	%xmm12, %xmm1
3406
3407	movsd	(X), %xmm5
3408	addq	INCX, X
3409	movhps	(X), %xmm5
3410	addq	INCX, X
3411	movsd	(Y), %xmm9
3412	addq	INCY, Y
3413	movhps	(Y), %xmm9
3414	addq	INCY, Y
3415
3416	pshufd	$0xb1,  %xmm9, %xmm12
3417	mulps	%xmm5,  %xmm9
3418	addps	%xmm9,  %xmm2
3419	mulps	%xmm5,  %xmm12
3420	addps	%xmm12, %xmm3
3421	ALIGN_3
3422
3423.L207:
3424	testq	$2, N
3425	jle	.L208
3426
3427	movsd	(X), %xmm4
3428	addq	INCX, X
3429	movhps	(X), %xmm4
3430	addq	INCX, X
3431	movsd	(Y), %xmm8
3432	addq	INCY, Y
3433	movhps	(Y), %xmm8
3434	addq	INCY, Y
3435
3436	pshufd	$0xb1,  %xmm8, %xmm12
3437	mulps	%xmm4,  %xmm8
3438	addps	%xmm8,  %xmm0
3439	mulps	%xmm4,  %xmm12
3440	addps	%xmm12, %xmm1
3441	ALIGN_3
3442
3443.L208:
3444	testq	$1, N
3445	jle	.L98
3446
3447#ifdef movsd
3448	xorps	%xmm4, %xmm4
3449#endif
3450	movsd	(X), %xmm4
3451#ifdef movsd
3452	xorps	%xmm8, %xmm8
3453#endif
3454	movsd	(Y), %xmm8
3455
3456	pshufd	$0xb1,  %xmm8, %xmm12
3457	mulps	%xmm4,  %xmm8
3458	addps	%xmm8,  %xmm0
3459	mulps	%xmm4,  %xmm12
3460	addps	%xmm12, %xmm1
3461	ALIGN_3
3462
3463.L98:
3464	addps	%xmm2, %xmm0
3465	addps	%xmm3, %xmm1
3466
3467	movhlps	%xmm0, %xmm2
3468	movhlps	%xmm1, %xmm3
3469
3470	addps	%xmm2, %xmm0
3471	addps	%xmm3, %xmm1
3472
3473	pshufd	$1, %xmm0, %xmm2
3474	pshufd	$1, %xmm1, %xmm3
3475	ALIGN_3
3476
3477.L999:
3478#ifndef CONJ
3479	subss	 %xmm2, %xmm0
3480	addss	 %xmm3, %xmm1
3481#else
3482	addss	 %xmm2, %xmm0
3483	subss	 %xmm3, %xmm1
3484#endif
3485	unpcklps %xmm1, %xmm0
3486
3487#ifdef WINDOWS_ABI
3488	movq	%xmm0, %rax
3489#endif
3490
3491	RESTOREREGISTERS
3492
3493	ret
3494	ALIGN_3
3495
3496	EPILOGUE
3497