1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Single-precision plane rotation kernel for 32-bit x86 with SSE.
 * AT&T syntax, run through the C preprocessor.
 *
 * For every i in [0, n):
 *
 *	x[i * incx] <- c * x[i * incx] + s * y[i * incy]
 *	y[i * incy] <- c * y[i * incy] - s * x[i * incx]   (old x on the right)
 *
 * Arguments are read from the stack; offsets are relative to %esp on entry:
 *
 *	 4: n		 8: x		12: incx (in elements)
 *	16: y		20: incy (in elements)
 *	24: c (float)	28: s (float)
 *
 * %ebx, %esi and %edi are saved and restored.  %eax, %ecx, %edx,
 * %xmm0-%xmm3, %xmm6, %xmm7 and the flags are overwritten.
 *
 * NOTE(review): SIZE, PROLOGUE, PROFCODE, EPILOGUE, ALIGN_2, ALIGN_3 and the
 * PREFETCH* names come from common.h / l1param.h.  The code below assumes
 * SIZE is the element size in bytes (4, matching the movss accesses) and
 * that x and y are at least element-aligned, because movaps is used on x
 * once its 4- and 8-byte address bits have been cleared - confirm against
 * the callers.
 */

/* Bytes pushed by the prologue below: three 4-byte registers. */
#define STACK	12
#define ARGS     0

#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)
#define STACK_C		24 + STACK + ARGS(%esp)
#define STACK_S		28 + STACK + ARGS(%esp)

/* Register roles. */
#define N	%ebx	/* elements still to process		*/
#define X	%esi	/* current x pointer			*/
#define INCX	%ecx	/* x stride, converted to bytes		*/
#define Y	%edi	/* current y pointer			*/
#define INCY	%edx	/* y stride, converted to bytes		*/

#define	I	%eax	/* loop trip counter			*/

#define C	%xmm6	/* c broadcast to all four lanes	*/
#define S	%xmm7	/* s broadcast to all four lanes	*/

#include "l1param.h"

/*
 * Building blocks.  Each macro expands to one physical line, so the
 * statements are separated with ';'.
 *
 * ROT_MATH_PS / ROT_MATH_SS
 *	In:  %xmm0 = x, %xmm1 = y
 *	Out: %xmm0 = c * x + s * y, %xmm2 = c * y - s * x
 *	%xmm1 and %xmm3 are used as scratch.
 */
#define ROT_MATH_PS \
	movaps	%xmm1, %xmm2; \
	movaps	%xmm0, %xmm3; \
	mulps	C, %xmm0; \
	mulps	S, %xmm1; \
	mulps	C, %xmm2; \
	mulps	S, %xmm3; \
	addps	%xmm1, %xmm0; \
	subps	%xmm3, %xmm2

#define ROT_MATH_SS \
	movaps	%xmm1, %xmm2; \
	movaps	%xmm0, %xmm3; \
	mulss	C, %xmm0; \
	mulss	S, %xmm1; \
	mulss	C, %xmm2; \
	mulss	S, %xmm3; \
	addss	%xmm1, %xmm0; \
	subss	%xmm3, %xmm2

/* Rotate 4 elements at element offset `off`; X and Y both 16-byte aligned. */
#define ROT_VEC4_ALIGNED_Y(off) \
	movaps	(off) * SIZE(Y), %xmm1; \
	movaps	(off) * SIZE(X), %xmm0; \
	ROT_MATH_PS; \
	movaps	%xmm0, (off) * SIZE(X); \
	movaps	%xmm2, (off) * SIZE(Y)

/*
 * Rotate 4 elements at element offset `off`; X 16-byte aligned, Y accessed
 * as two 8-byte halves so that it needs no 16-byte alignment.
 */
#define ROT_VEC4_UNALIGNED_Y(off) \
	movsd	(off) * SIZE(Y), %xmm1; \
	movhps	(off + 2) * SIZE(Y), %xmm1; \
	movaps	(off) * SIZE(X), %xmm0; \
	ROT_MATH_PS; \
	movaps	%xmm0, (off) * SIZE(X); \
	movlps	%xmm2, (off) * SIZE(Y); \
	movhps	%xmm2, (off + 2) * SIZE(Y)

/*
 * When movsd is itself a preprocessor macro, the two data registers are
 * cleared before the 8-byte loads - presumably because the substituted
 * load leaves the upper half of the register untouched and the packed
 * arithmetic would otherwise run on stale lanes.
 */
#ifdef movsd
#define ROT_VEC2_CLEAR	xorps	%xmm0, %xmm0; xorps	%xmm1, %xmm1;
#else
#define ROT_VEC2_CLEAR
#endif

/* Rotate 2 elements at (X), (Y); only the low 8 bytes are stored back. */
#define ROT_VEC2 \
	ROT_VEC2_CLEAR \
	movsd	0 * SIZE(Y), %xmm1; \
	movsd	0 * SIZE(X), %xmm0; \
	ROT_MATH_PS; \
	movlps	%xmm0, 0 * SIZE(X); \
	movlps	%xmm2, 0 * SIZE(Y)

/* Rotate the single element at (X), (Y). */
#define ROT_ONE \
	movss	0 * SIZE(Y), %xmm1; \
	movss	0 * SIZE(X), %xmm0; \
	ROT_MATH_SS; \
	movss	%xmm0, 0 * SIZE(X); \
	movss	%xmm2, 0 * SIZE(Y)

/* Rotate one element, then advance both pointers by their byte strides. */
#define ROT_ONE_STRIDED \
	ROT_ONE; \
	addl	INCX, X; \
	addl	INCY, Y

	PROLOGUE
	PROFCODE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_N,     N
	movl	STACK_X,     X
	movl	STACK_INCX,  INCX
	movl	STACK_Y,     Y
	movl	STACK_INCY,  INCY

	/* Element strides -> byte strides. */
	leal	(, INCX, SIZE), INCX
	leal	(, INCY, SIZE), INCY

	movss	STACK_C, C
	movss	STACK_S, S

	/* Broadcast lane 0 of c and s to all four lanes. */
	shufps	$0x0, C, C
	shufps	$0x0, S, S

	/* Nothing to do for n <= 0. */
	testl	N, N
	jle	.L999

	/* Any non-unit stride goes to the scalar strided loop. */
	cmpl	$SIZE, INCX
	jne	.L50
	cmpl	$SIZE, INCY
	jne	.L50

	/*
	 * Unit stride.  Peel one element, then two, until the 4- and 8-byte
	 * address bits of X are clear, so that movaps can be used on X.
	 */
	testl	$SIZE, X
	je	.L05

	ROT_ONE

	addl	$1 * SIZE, X
	addl	$1 * SIZE, Y
	decl	N
	jle	.L999

.L05:
	testl	$2 * SIZE, X
	je	.L10

	/* Only one element left: let the scalar tail handle it. */
	cmpl	$1, N
	je	.L17

	ROT_VEC2

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	subl	$2, N
	jle	.L999
	ALIGN_2

.L10:
	/*
	 * N > 0 here.  Take the movaps path for Y only when Y is really
	 * 16-byte aligned; everything else uses the split 8-byte accesses.
	 */
	testl	$15, Y
	jne	.L20

	/*
	 * I = N / 32.  ZF is tested rather than a signed condition: OF is
	 * not defined after a shift by more than one bit, and N > 0 makes
	 * the result non-negative.
	 */
	movl	N,  I
	sarl	$5, I
	je	.L14
	ALIGN_3

	/* X and Y aligned: 32 elements per iteration. */
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	ROT_VEC4_ALIGNED_Y( 0)
	ROT_VEC4_ALIGNED_Y( 4)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	ROT_VEC4_ALIGNED_Y( 8)
	ROT_VEC4_ALIGNED_Y(12)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ROT_VEC4_ALIGNED_Y(16)
	ROT_VEC4_ALIGNED_Y(20)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ROT_VEC4_ALIGNED_Y(24)
	ROT_VEC4_ALIGNED_Y(28)

	addl	$32 * SIZE, X
	addl	$32 * SIZE, Y

	decl	I
	jg	.L11
	ALIGN_3

	/* Aligned tail: the low five bits of N select 16/8/4/2/1 elements. */
.L14:
	testl	$31, N
	je	.L999

	testl	$16, N
	je	.L15

	ROT_VEC4_ALIGNED_Y( 0)
	ROT_VEC4_ALIGNED_Y( 4)
	ROT_VEC4_ALIGNED_Y( 8)
	ROT_VEC4_ALIGNED_Y(12)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L15:
	testl	$8, N
	je	.L16

	ROT_VEC4_ALIGNED_Y( 0)
	ROT_VEC4_ALIGNED_Y( 4)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L16:
	testl	$4, N
	je	.L17

	ROT_VEC4_ALIGNED_Y( 0)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

	/*
	 * Also entered from .L05 with N == 1 and no alignment guarantee;
	 * from here on only 8- and 4-byte accesses are used.
	 */
.L17:
	testl	$2, N
	je	.L18

	ROT_VEC2

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L18:
	testl	$1, N
	je	.L999

	ROT_ONE
	jmp	.L999
	ALIGN_3

	/* X aligned, Y not 16-byte aligned: 32 elements per iteration. */
.L20:
	movl	N,  I
	sarl	$5, I
	je	.L24
	ALIGN_3

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	ROT_VEC4_UNALIGNED_Y( 0)
	ROT_VEC4_UNALIGNED_Y( 4)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	ROT_VEC4_UNALIGNED_Y( 8)
	ROT_VEC4_UNALIGNED_Y(12)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ROT_VEC4_UNALIGNED_Y(16)
	ROT_VEC4_UNALIGNED_Y(20)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ROT_VEC4_UNALIGNED_Y(24)
	ROT_VEC4_UNALIGNED_Y(28)

	addl	$32 * SIZE, X
	addl	$32 * SIZE, Y
	decl	I
	jg	.L21
	ALIGN_3

	/* Unaligned-Y tail, same 16/8/4/2/1 decomposition as above. */
.L24:
	testl	$31, N
	je	.L999

	testl	$16, N
	je	.L25

	ROT_VEC4_UNALIGNED_Y( 0)
	ROT_VEC4_UNALIGNED_Y( 4)
	ROT_VEC4_UNALIGNED_Y( 8)
	ROT_VEC4_UNALIGNED_Y(12)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L25:
	testl	$8, N
	je	.L26

	ROT_VEC4_UNALIGNED_Y( 0)
	ROT_VEC4_UNALIGNED_Y( 4)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L26:
	testl	$4, N
	je	.L27

	ROT_VEC4_UNALIGNED_Y( 0)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L27:
	testl	$2, N
	je	.L28

	ROT_VEC2

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L28:
	testl	$1, N
	je	.L999

	ROT_ONE
	jmp	.L999
	ALIGN_3

	/* General strides: scalar loop, unrolled four times. */
.L50:
	movl	N,  I
	sarl	$2, I
	je	.L55
	ALIGN_3

.L53:
	ROT_ONE_STRIDED
	ROT_ONE_STRIDED
	ROT_ONE_STRIDED
	ROT_ONE_STRIDED

	decl	I
	jg	.L53
	ALIGN_3

	/* Remaining N mod 4 elements, one per iteration. */
.L55:
	movl	N,  I
	andl	$3, I
	je	.L999
	ALIGN_3

.L56:
	ROT_ONE_STRIDED

	decl	I
	jg	.L56
	ALIGN_3

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi

	ret

	EPILOGUE
1103