/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

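/* Complex vector copy kernel: y(i) = x(i) for i = 0, ..., m - 1.  The      */
/* movss/movaps moves below imply single-precision complex data (SIZE = 4   */
/* bytes per real element, 2 * SIZE per complex element).  Unit-stride      */
/* input and output take the SSE block-copy paths; everything else falls    */
/* through to the strided loop at .L100.                                    */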
#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif

#include "l1param.h"

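/* LOAD fetches one 16-byte vector.  The Opteron variant clears the target  */
/* register and adds the data in, which yields the same value as a plain    */
/* movaps; presumably the split form was faster on that core.               */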
#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)		xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)		movaps	OFFSET(ADDR), REG
#endif

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

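/* Take the fast path only if both strides equal one complex element        */
/* (2 * SIZE bytes, now that the shifts above converted them to bytes).     */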
	cmpq	$2 * SIZE, INCX
	jne	.L100
	cmpq	$2 * SIZE, INCY
	jne	.L100

	cmpq	$3, M
	jle	.L106

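/* Bias both pointers by 32 * SIZE (-32 * SIZE, the largest displacement    */
/* used below, then fits in one signed byte) and double M so it counts      */
/* floats rather than complex elements.                                     */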
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	addq	M, M

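/* Align Y to 16 bytes: copy one float if Y is only 4-byte aligned, then    */
/* one more complex element if it is only 8-byte aligned.                   */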
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

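/* Y is now 16-byte aligned; dispatch on X mod 16.  0: aligned copy below;  */
/* 8: .L20; 4: .L30; 12: .L40.                                              */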
.L10:
	testq	$3 * SIZE, X
	jne	.L20

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L13

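/* Aligned main loop: 32 floats per iteration.  Eight vectors are           */
/* preloaded, and each pass stores the previous batch while loading the     */
/* next, with software prefetch on both streams.                            */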
	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movaps	%xmm6, -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7, -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3

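/* Tail of the aligned path: copy any remaining 16/8/4/2/1 floats.          */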
.L13:
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

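/* X is offset from Y's alignment by 8 bytes (one complex element).  Load   */
/* aligned vectors and stitch the adjacent halves of each pair together     */
/* with shufps $0x4e, which swaps the 64-bit halves.                        */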
.L20:
	testq	$SIZE, X
	jne	.L30

	movhps	-32 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

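/* X is offset by 4 bytes.  Load aligned one lane early, pull in the low    */
/* float of the following vector with movss, and rotate the lanes into      */
/* place with shufps $0x39.                                                 */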
.L30:
	testq	$2 * SIZE, X
	jne	.L40

	movaps	-33 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

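/* X is offset by 12 bytes, so each aligned load contributes only its       */
/* last float.  movss plus shufps $0x93 combines it with the first three    */
/* floats of the following vector.                                          */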
.L40:
	movaps	-35 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	  9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6, -8 * SIZE(Y)
	movaps	 21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7, -4 * SIZE(Y)
	movaps	 25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_4

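/* Strided path (also used for very short vectors, which enter at .L106):   */
/* one complex element per step via movsd/movhps, unrolled to eight         */
/* elements per iteration with a 4/2/1 tail.                                */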
.L100:
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L105
	ALIGN_3

.L102:
	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X
	movsd	 (X), %xmm1
	addq	 INCX, X
	movhps	 (X), %xmm1
	addq	 INCX, X
	movsd	 (X), %xmm2
	addq	 INCX, X
	movhps	 (X), %xmm2
	addq	 INCX, X
	movsd	 (X), %xmm3
	addq	 INCX, X
	movhps	 (X), %xmm3
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	movsd	%xmm1,  (Y)
	addq	 INCY, Y
	movhps	%xmm1,  (Y)
	addq	 INCY, Y
	movsd	%xmm2,  (Y)
	addq	 INCY, Y
	movhps	%xmm2,  (Y)
	addq	 INCY, Y
	movsd	%xmm3,  (Y)
	addq	 INCY, Y
	movhps	%xmm3,  (Y)
	addq	 INCY, Y

	decq	%rax
	jg	.L102
	ALIGN_3

.L105:
	testq	$4, M
	jle	.L106

	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X
	movsd	 (X), %xmm1
	addq	 INCX, X
	movhps	 (X), %xmm1
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	movsd	%xmm1,  (Y)
	addq	 INCY, Y
	movhps	%xmm1,  (Y)
	addq	 INCY, Y
	ALIGN_3

.L106:
	testq	$2, M
	jle	.L107

	movsd	 (X), %xmm0
	addq	 INCX, X
	movhps	 (X), %xmm0
	addq	 INCX, X

	movsd	%xmm0,  (Y)
	addq	 INCY, Y
	movhps	%xmm0,  (Y)
	addq	 INCY, Y
	ALIGN_3

.L107:
	testq	$1, M
	jle	.L999

	movsd	 (X), %xmm0
	movsd	%xmm0,   (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE