1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Dot-product kernel for 32-bit x86 using the x87 FPU (AT&T syntax,
 * run through the C preprocessor).
 *
 * Computes   sum_{i=0}^{N-1} X[i*incx] * Y[i*incy]
 * and leaves the result in %st(0).
 *
 * Stack arguments, in order: N, X, INCX, Y, INCY.  When F_INTERFACE is
 * defined, N, INCX and INCY are received as pointers and dereferenced
 * on entry.  The element type is whatever FLD / SIZE / BASE_SHIFT from
 * common.h select (presumably SIZE == 1 << BASE_SHIFT bytes; confirm
 * against common.h).
 *
 * For N <= 0 the kernel returns zero without reading X or Y.
 *
 * Clobbers %eax, %ecx, %edx, flags and the x87 stack; %ebx, %esi and
 * %edi are saved and restored.  Peak x87 stack depth is six slots
 * (four accumulators plus two operands).
 */

/* Bytes pushed by this function before the arguments are read
   (three 4-byte register saves). */
#define STACK	12
#define ARGS     0

/* Argument slots; the leading 4 skips the return address. */
#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)

/* Register roles.  %eax is used as the loop counter throughout. */
#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx


	PROLOGUE

	/* Save the registers used for Y, X and N. */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	EMMS
#endif

	movl	STACK_N,     N
	movl	STACK_X,     X
	movl	STACK_INCX,  INCX
	movl	STACK_Y,     Y
	movl	STACK_INCY,  INCY

#ifdef F_INTERFACE
	/* N, INCX and INCY arrive as pointers here; fetch the values. */
	movl	(N),N
	movl	(INCX),INCX
	movl	(INCY),INCY
#endif

	/* Scale the element strides by 1 << BASE_SHIFT. */
	sall	$BASE_SHIFT, INCX
	sall	$BASE_SHIFT, INCY

	/* Four independent partial sums live in st(0)..st(3). */
	fldz
	fldz
	fldz
	fldz

	/* Nothing to accumulate for N <= 0: go straight to the reduction,
	   which folds the four zeros into a zero result.  Without this
	   guard a negative N could still yield a non-zero (N & 3) and
	   make the tail loops read X and Y. */
	testl	N, N
	jle	.L27

	/* Take the contiguous path only when both strides equal SIZE. */
	cmpl	$SIZE, INCX
	jne	.L14
	cmpl	$SIZE, INCY
	jne	.L14

	/* %eax = N / 4 : iterations of the 4-way unrolled loop. */
	movl	N, %eax
	sarl	$2,   %eax
	jle	.L15
	ALIGN_3

	/* Unit stride, four elements per iteration.  After each
	   FLD/FLD/fmulp the product sits in st(0) above the four
	   accumulators, so faddp to st(k) adds into accumulator k-1
	   and pops the product. */
.L16:
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st,%st(1)
	FLD	1 * SIZE(X)
	FLD	1 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st,%st(2)
	FLD	2 * SIZE(X)
	FLD	2 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st,%st(3)
	FLD	3 * SIZE(X)
	FLD	3 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st,%st(4)
	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	decl	%eax
	jg	.L16
	ALIGN_3

	/* Unit stride, remaining N & 3 elements. */
.L15:
	movl	N, %eax
	andl	$3,   %eax
	jle	.L27
	ALIGN_3

.L22:
	FLD	(X)
	addl	$SIZE, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	$SIZE, Y
	faddp	%st,%st(1)
	decl	%eax
	jg	.L22

	jmp	.L27
	ALIGN_3

	/* General (non-unit) stride path. */
.L14:
#ifdef F_INTERFACE
	/* For a negative stride, move the pointer by -(N - 1) * stride
	   so that the loops below, which keep adding the (negative)
	   stride, walk the vector from the other end. */
	testl	INCX, INCX
	jge	.L28

	movl	N, %eax
	decl	%eax
	imull	INCX, %eax
	subl	%eax, X
	ALIGN_3

.L28:
	testl	INCY, INCY
	jge	.L29

	movl	N, %eax
	decl	%eax
	imull	INCY, %eax
	subl	%eax, Y
	ALIGN_3
.L29:
#endif
	/* %eax = N / 4 : iterations of the 4-way unrolled loop. */
	movl	N, %eax
	sarl	$2,   %eax
	jle	.L30
	ALIGN_3

	/* Strided, four elements per iteration; same accumulator
	   rotation as the unit-stride loop. */
.L31:
	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st,%st(1)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st,%st(2)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st,%st(3)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st,%st(4)

	decl	%eax
	jg	.L31
	ALIGN_3

	/* Strided, remaining N & 3 elements. */
.L30:
	movl	N, %eax
	andl	$3,   %eax
	jle	.L27
	ALIGN_3

.L37:
	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(1)
	decl	%eax
	jg	.L37
	ALIGN_3

	/* Reduce the four partial sums to one value in st(0):
	   (a0 + a2) + (a1 + a3). */
.L27:
	faddp	%st,%st(2)
	faddp	%st,%st(2)
	faddp	%st,%st(1)

	/* Restore saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE
213