/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif

#include "l1param.h"

#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)		xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)		movaps	OFFSET(ADDR), REG
#endif
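
/* On Opteron, LOAD fetches with xorps + addps instead of a plain movaps;
   either way REG ends up holding OFFSET(ADDR), and the zero-then-add pair
   is presumably a performance workaround for that core. */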

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	cmpq	$2 * SIZE, INCX
	jne	.L100
	cmpq	$2 * SIZE, INCY
	jne	.L100

	cmpq	$3, M
	jle	.L106

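/* Unit-stride case.  Bias X and Y by 32 * SIZE so the loops can address
   operands with negative displacements (subq $-128 presumably keeps the
   immediate in a signed byte, unlike addq $128), then double M so it
   counts single floats instead of complex elements. */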
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	addq	M, M

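/* Copy up to one float and then one complex element so that Y becomes
   16-byte aligned before the vectorized loops. */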
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

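/* Y is now 16-byte aligned.  Dispatch on X mod 16: 0 takes the aligned
   copy below, 8 the shufps $0x4e path at .L20, 4 the shufps $0x39 path
   at .L30, and 12 the shufps $0x93 path at .L40. */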
.L10:
	testq	$3 * SIZE, X
	jne	.L20

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L13

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_3

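/* Aligned main loop: 32 floats (16 complex elements) per iteration,
   prefetching X for reads and Y for writes ahead of the stores. */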
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7,  -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3

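/* Aligned tail: copy the remaining 16, 8, 4, 2 and 1 float(s), as
   selected by the low bits of M. */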
.L13:
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

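/* X is 8 bytes past a 16-byte boundary.  Load aligned 16-byte blocks of X
   and recombine consecutive registers with shufps $0x4e, which concatenates
   the high 64 bits of one block with the low 64 bits of the next, so that
   every store to Y stays aligned. */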
.L20:
	testq	$SIZE, X
	jne	.L30

	movhps	-32 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

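/* X is 4 bytes past a 16-byte boundary.  movss merges the low float of
   the next aligned block into the current one, and shufps $0x39 rotates
   the register so it holds four consecutive floats for each aligned
   store. */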
.L30:
	testq	$2 * SIZE, X
	jne	.L40

	movaps	-33 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

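/* X is 12 bytes past a 16-byte boundary.  The mirror case: movss plus
   shufps $0x93 combine the top float of one aligned block with the
   bottom three floats of the next. */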
.L40:
	movaps	-35 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	  9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_4

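/* General strided path (INCX or INCY is not 2 * SIZE).  Here M still
   counts complex elements; each iteration moves eight of them, packing
   two per register with movsd/movhps. */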
.L100:
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L105
	ALIGN_3

.L102:
	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X
	movsd	 (X), %xmm1
	addq	 INCX, X
	movhps	 (X), %xmm1
	addq	 INCX, X
	movsd	 (X), %xmm2
	addq	 INCX, X
	movhps	 (X), %xmm2
	addq	 INCX, X
	movsd	 (X), %xmm3
	addq	 INCX, X
	movhps	 (X), %xmm3
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	movsd	%xmm1,  (Y)
	addq	 INCY, Y
	movhps	%xmm1,  (Y)
	addq	 INCY, Y
	movsd	%xmm2,  (Y)
	addq	 INCY, Y
	movhps	%xmm2,  (Y)
	addq	 INCY, Y
	movsd	%xmm3,  (Y)
	addq	 INCY, Y
	movhps	%xmm3,  (Y)
	addq	 INCY, Y

	decq	%rax
	jg	.L102
	ALIGN_3

.L105:
	testq	$4, M
	jle	.L106

	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X
	movsd	 (X), %xmm1
	addq	 INCX, X
	movhps	 (X), %xmm1
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	movsd	%xmm1,  (Y)
	addq	 INCY, Y
	movhps	%xmm1,  (Y)
	addq	 INCY, Y
	ALIGN_3

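/* .L106 is also the entry point for M <= 3 from the top of the routine;
   at most three complex elements remain here. */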
.L106:
	testq	$2, M
	jle	.L107

	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	ALIGN_3

.L107:
	testq	$1, M
	jle	.L999

	movsd	 (X), %xmm0
	movsd	%xmm0,  (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE