/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Prefetch tuning. Only defined for ATOM builds; every PREFETCH use in the
   loops is guarded by #ifdef PREFETCH. PREFETCHSIZE is a count of elements
   and is multiplied by SIZE where it is used. */
#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

/* Bytes occupied by the four registers pushed at entry (4 x pushl). */
#define STACKSIZE	16

/* Stack arguments, addressed relative to %esp once the four pushes are done.
   M is the length of each column of A (and of y); N is the number of
   columns, i.e. the number of x elements consumed. ALPHA is read with movsd
   (an 8-byte scalar). The A slot is rewritten by the routine as it walks
   across the columns. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA		16 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

/* Loop counters: I counts row blocks, J counts column pairs. */
#define I	%eax
#define J	%ebx

/* Byte strides. INCY shares %ebx with J; it is only loaded after the
   column loops have finished. */
#define INCX	%ecx
#define INCY	J

/* Working pointers: A1 = current column of A, X = current x element,
   Y1 = current position in BUFFER (later in y), LDA = column stride. */
#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

/*
 * Double-precision GEMV "N" kernel for 32-bit x86, scalar SSE2, AT&T syntax.
 *
 * Phase 1 (here):   save registers, scale strides to bytes, clear BUFFER.
 * Phase 2 (.L10):   BUFFER += alpha * x[j] * A(:, j), two columns at a time.
 * Phase 3 (.L20):   same for the last column when N is odd.
 * Phase 4 (.L990):  y(i * INCY) += BUFFER(i), then restore and return.
 *
 * PROLOGUE, PROFCODE, ALIGN_x, SIZE and BASE_SHIFT are supplied by common.h.
 */
	PROLOGUE

	/* Save the registers used as LDA, Y1, A1 and J; .L999 restores them. */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert the element strides into byte strides. */
	leal	(,INCX, SIZE), INCX
	leal	(,LDA,  SIZE), LDA

	/* Bias the stored A pointer by +16 elements; all A1 accesses below use
	   displacements that start at -16 * SIZE to compensate. */
	subl	$-16 * SIZE, A

	/* Nothing to do when N <= 0 or M <= 0. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	pxor	%xmm7, %xmm7

	/* Clear (M + 16) / 16 blocks of 16 elements, so at least M elements
	   plus the look-ahead read by the pipelined loops are zero.
	   movapd needs BUFFER to be 16-byte aligned. */
	movl	M,   %eax
	addl	$16, %eax
	sarl	$4,  %eax
	ALIGN_3

.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

/*
 * Column-pair loop: J = N / 2 passes. Each pass performs, for every row i,
 *     BUFFER[i] += xmm6 * A1[i] + xmm7 * A1[i + LDA]
 * where xmm6 = alpha * x[j] and xmm7 = alpha * x[j + 1].
 * Rows are handled two at a time through the accumulators xmm0 / xmm1.
 */
.L10:
	movl	N,  J
	sarl	$1, J
	jle	.L20
	ALIGN_3

.L11:
	/* Y1 = BUFFER biased by +16 elements (accesses use -16 * SIZE etc.). */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	/* A1 = current column pair; write A + 2 * LDA back for the next pass. */
	movl	A,  A1
	leal	(A1,  LDA, 2), %eax
	movl	%eax, A

	/* Fetch two consecutive x elements and advance X past both. */
	movsd	(X), %xmm6
	addl	INCX, X
	movsd	(X), %xmm7
	addl	INCX, X

	movsd	ALPHA, %xmm0

	/* Fold alpha into both x values once per column pair. */
	mulsd	%xmm0, %xmm6
	mulsd	%xmm0, %xmm7

	/* Preload the first two accumulators from BUFFER. */
	movsd	 -16 * SIZE(Y1), %xmm0
	movsd	 -15 * SIZE(Y1), %xmm1

	/* I = M / 8 blocks of eight rows. */
	movl	M,   I
	sarl	$3,  I
	jle	.L15

	/* Pipeline start-up: load rows 0-1 of both columns and begin the
	   first-column products. */
	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3
	movsd	 -16 * SIZE(A1, LDA), %xmm4
	movsd	 -15 * SIZE(A1, LDA), %xmm5

	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3

	/* The last block is handled by the drain code at .L14. */
	decl	 I
	jle	 .L14
	ALIGN_3

/* Steady state: eight rows per iteration in four groups of two. Each group
   finishes the two rows whose products are in flight, stores them, and
   loads the A and BUFFER values for the next two rows. The instruction
   order is hand-scheduled; do not reorder. */
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	/* rows 0-1 */
	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -14 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	 -13 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	/* rows 2-3 */
	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	 -11 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -12 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	 -11 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

	/* rows 4-5 */
	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	  -9 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -10 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	  -9 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	/* rows 6-7; also loads rows 0-1 of the next block */
	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	  -8 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	  -7 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	  -8 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	  -7 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L13
	ALIGN_3

/* Drain: the final block of eight rows. Same as .L13, except that the last
   group does not load A for a following block. */
.L14:
	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -14 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	 -13 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -12 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	 -11 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -12 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	 -11 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	  -9 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -10 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	  -9 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm1

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1

	addsd	 %xmm4, %xmm0
	addsd	 %xmm5, %xmm1

	/* Store rows 6-7 and preload the accumulators for the remainder code. */
	movlpd	 %xmm0, -10 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0
	movlpd	 %xmm1,  -9 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

/* Remainder: four rows when bit 2 of M is set. xmm0 / xmm1 already hold the
   BUFFER values for the first two of them. */
.L15:
	testl	$4, M
	je	.L16

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3
	movsd	 -16 * SIZE(A1, LDA), %xmm4
	movsd	 -15 * SIZE(A1, LDA), %xmm5

	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	addsd	 %xmm4, %xmm0
	movsd	 -14 * SIZE(A1, LDA), %xmm4
	mulsd	 %xmm6, %xmm2
	addsd	 %xmm5, %xmm1
	movsd	 -13 * SIZE(A1, LDA), %xmm5
	mulsd	 %xmm6, %xmm3

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -13 * SIZE(Y1), %xmm1

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1

	addsd	 %xmm4, %xmm0
	addsd	 %xmm5, %xmm1

	movlpd	 %xmm0, -14 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -13 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

/* Remainder: two rows when bit 1 of M is set. */
.L16:
	testl	$2, M
	je	.L17

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3
	movsd	 -16 * SIZE(A1, LDA), %xmm4
	movsd	 -15 * SIZE(A1, LDA), %xmm5

	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3

	mulsd	 %xmm7, %xmm4
	addsd	 %xmm2, %xmm0
	mulsd	 %xmm7, %xmm5
	addsd	 %xmm3, %xmm1

	addsd	 %xmm4, %xmm0
	addsd	 %xmm5, %xmm1

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)

	addl	 $2 * SIZE, A1
	addl	 $2 * SIZE, Y1
	ALIGN_3

/* Remainder: the last row when M is odd. The accumulator is reloaded here,
   so this path does not depend on the preloads above. */
.L17:
	testl	$1, M
	je	.L19

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -16 * SIZE(A1, LDA), %xmm3

	movsd	 -16 * SIZE(Y1), %xmm0

	mulsd	 %xmm6, %xmm2
	addsd	 %xmm2, %xmm0
	mulsd	 %xmm7, %xmm3
	addsd	 %xmm3, %xmm0

	movsd	 %xmm0, -16 * SIZE(Y1)
	ALIGN_3

/* Next column pair. */
.L19:
	decl	J
	jg	.L11
	ALIGN_4

/*
 * Odd N: exactly one column of A and one element of x remain. For every
 * row i:
 *     BUFFER[i] += xmm6 * A1[i],   xmm6 = alpha * x[N - 1]
 * Four accumulators (xmm0, xmm1, xmm4, xmm5) are rotated through the rows.
 */
.L20:
	/* testl leaves SF = OF = 0 for this mask, so je is the exact test. */
	testl	$1, N
	je	.L990

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	/* A is not advanced here: nothing reads it after the last column. */
	movl	A,  A1

	/* X points at the last element of x. Load only that element: stepping
	   by INCX and loading again, as the two-column loop does, would read
	   past the end of x, and that second value is never used on this
	   path. X itself is reloaded at .L990, so it is not advanced. */
	movsd	(X), %xmm6

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6

	/* Preload four accumulators from BUFFER. */
	movsd	 -16 * SIZE(Y1), %xmm0
	movsd	 -15 * SIZE(Y1), %xmm1
	movsd	 -14 * SIZE(Y1), %xmm4
	movsd	 -13 * SIZE(Y1), %xmm5

	/* I = M / 8 blocks of eight rows. */
	movl	M,   I
	sarl	$3,  I
	jle	.L25

	/* Pipeline start-up: products for rows 0-1. */
	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3

	/* The last block is handled by the drain code at .L24. */
	decl	 I
	jle	 .L24
	ALIGN_3

/* Steady state: eight rows per iteration. The instruction order is
   hand-scheduled; do not reorder. */
.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	/* rows 0-1 */
	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	/* rows 2-3 */
	addsd	 %xmm2, %xmm4
	movsd	 -12 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm5
	movsd	 -11 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm4, -14 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm4
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm5, -13 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm5

	/* rows 4-5 */
	addsd	 %xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm1
	movsd	  -9 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm0, -12 * SIZE(Y1)
	movsd	  -8 * SIZE(Y1), %xmm0
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm1, -11 * SIZE(Y1)
	movsd	  -7 * SIZE(Y1), %xmm1

	/* rows 6-7; also starts rows 0-1 of the next block */
	addsd	 %xmm2, %xmm4
	movsd	  -8 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm5
	movsd	  -7 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm4, -10 * SIZE(Y1)
	movsd	  -6 * SIZE(Y1), %xmm4
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm5,  -9 * SIZE(Y1)
	movsd	  -5 * SIZE(Y1), %xmm5

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1

	subl	 $1, I
	BRANCH
	jg	.L23
	ALIGN_3

/* Drain: the final block of eight rows, without loading A for a following
   block. */
.L24:
	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	addsd	 %xmm2, %xmm4
	movsd	 -12 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm5
	movsd	 -11 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm4, -14 * SIZE(Y1)
	movsd	 -10 * SIZE(Y1), %xmm4
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm5, -13 * SIZE(Y1)
	movsd	  -9 * SIZE(Y1), %xmm5

	addsd	 %xmm2, %xmm0
	movsd	 -10 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm1
	movsd	  -9 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm0, -12 * SIZE(Y1)
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm1, -11 * SIZE(Y1)

	addsd	 %xmm2, %xmm4
	movsd	  -8 * SIZE(Y1), %xmm0
	addsd	 %xmm3, %xmm5
	movsd	  -7 * SIZE(Y1), %xmm1

	/* Store rows 6-7 and preload all four accumulators for the remainder
	   code. */
	movlpd	 %xmm4, -10 * SIZE(Y1)
	movsd	  -6 * SIZE(Y1), %xmm4
	movlpd	 %xmm5,  -9 * SIZE(Y1)
	movsd	  -5 * SIZE(Y1), %xmm5

	subl	 $-8 * SIZE, A1
	subl	 $-8 * SIZE, Y1
	ALIGN_3

/* Remainder: four rows when bit 2 of M is set. The four accumulators
   already hold the matching BUFFER values. */
.L25:
	testl	$4, M
	je	.L26

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3
	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3

	addsd	 %xmm2, %xmm0
	movsd	 -14 * SIZE(A1), %xmm2
	addsd	 %xmm3, %xmm1
	movsd	 -13 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -12 * SIZE(Y1), %xmm0
	mulsd	 %xmm6, %xmm3
	movlpd	 %xmm1, -15 * SIZE(Y1)
	movsd	 -11 * SIZE(Y1), %xmm1

	addsd	 %xmm2, %xmm4
	addsd	 %xmm3, %xmm5

	movlpd	 %xmm4, -14 * SIZE(Y1)
	movlpd	 %xmm5, -13 * SIZE(Y1)

	addl	 $4 * SIZE, A1
	addl	 $4 * SIZE, Y1
	ALIGN_3

/* Remainder: two rows when bit 1 of M is set. */
.L26:
	testl	$2, M
	je	.L27

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -15 * SIZE(A1), %xmm3

	mulsd	 %xmm6, %xmm2
	mulsd	 %xmm6, %xmm3
	addsd	 %xmm2, %xmm0
	addsd	 %xmm3, %xmm1

	movlpd	 %xmm0, -16 * SIZE(Y1)
	movsd	 -14 * SIZE(Y1), %xmm0
	movlpd	 %xmm1, -15 * SIZE(Y1)

	addl	 $2 * SIZE, A1
	addl	 $2 * SIZE, Y1
	ALIGN_3

/* Remainder: the last row when M is odd; the accumulator is reloaded. */
.L27:
	testl	$1, M
	je	.L990

	movsd	 -16 * SIZE(A1), %xmm2
	movsd	 -16 * SIZE(Y1), %xmm0

	mulsd	 %xmm6, %xmm2
	addsd	 %xmm2, %xmm0

	movsd	 %xmm0, -16 * SIZE(Y1)
	ALIGN_3

/*
 * Write-back: y[i * INCY] += BUFFER[i] for i = 0 .. M - 1.
 * Y1 walks y for the loads and A1 walks the same addresses for the stores;
 * X is reused as the read pointer into the contiguous BUFFER.
 */
.L990:
	movl	Y,   Y1
	movl	BUFFER, X
	movl	Y1,  A1

	/* INCY lives in %ebx (shared with J, which is no longer needed);
	   shift it from elements to bytes. */
	movl	STACK_INCY, INCY
	sall	$BASE_SHIFT, INCY

	/* M / 8 blocks of eight elements. */
	movl	M,   %eax
	sarl	$3,  %eax
	jle	.L994
	ALIGN_3

.L992:
	/* Gather eight strided y values. */
	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1
	movsd	(Y1), %xmm2
	addl	INCY, Y1
	movsd	(Y1), %xmm3
	addl	INCY, Y1
	movsd	(Y1), %xmm4
	addl	INCY, Y1
	movsd	(Y1), %xmm5
	addl	INCY, Y1
	movsd	(Y1), %xmm6
	addl	INCY, Y1
	movsd	(Y1), %xmm7
	addl	INCY, Y1

	/* Add the accumulated products. */
	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	/* Scatter the sums back to the same y positions. */
	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1
	movlpd	%xmm2, (A1)
	addl	INCY, A1
	movlpd	%xmm3, (A1)
	addl	INCY, A1
	movlpd	%xmm4, (A1)
	addl	INCY, A1
	movlpd	%xmm5, (A1)
	addl	INCY, A1
	movlpd	%xmm6, (A1)
	addl	INCY, A1
	movlpd	%xmm7, (A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

/* Remainder of M modulo 8, handled as 4 + 2 + 1. testl with these small
   positive masks always leaves SF = OF = 0, so je is the exact test. */
.L994:
	testl	$7, M
	je	.L999

	testl	$4, M
	je	.L995

	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1
	movsd	(Y1), %xmm2
	addl	INCY, Y1
	movsd	(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1
	movlpd	%xmm2, (A1)
	addl	INCY, A1
	movlpd	%xmm3, (A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	je	.L996

	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1

	addl	$2 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	je	.L999

	movsd	(Y1), %xmm0

	addsd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (A1)
	ALIGN_3

/* Common exit: restore the registers in reverse order of the entry pushes. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
