1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Sum of absolute values of a strided vector of double-precision elements
 * (32-bit x86, SSE2, AT&T syntax, run through the C preprocessor).
 *
 * Stack arguments at entry (before the two pushes below):
 *    4(%esp)  M     element count
 *    8(%esp)  X     pointer to the first element
 *   12(%esp)  INCX  stride, in elements
 *
 * Result: returned on the x87 stack (st(0)) as a double.
 *         0.0 is returned when M <= 0 or INCX <= 0.
 *
 * Saves and restores %esi and %ebx. Uses %eax, %ecx, %xmm0-%xmm1 and
 * %xmm3-%xmm7 as scratch without saving them.
 *
 * The final store reuses the incoming M/X argument slots as an 8-byte
 * scratch area, so those two stack words are overwritten on return.
 */

/* STACK = bytes of saved registers pushed in the prologue (2 x pushl).
   ARGS  = size of the local frame (none).                              */
#define STACK	 8
#define ARGS     0

/* Stack arguments, addressed relative to %esp after the pushes. */
#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/* Register roles. */
#define I	%eax		/* loop trip counter                     */
#define M	%ecx		/* elements still to be processed        */
#define X	%esi		/* current element pointer (biased)      */
#define INCX	%ebx		/* stride; converted to bytes below      */

#include "l1param.h"

	PROLOGUE
	PROFCODE

	pushl	%esi
	pushl	%ebx

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

	/* Two independent accumulators; both start at zero so that every
	   early exit to .L999 yields a well-defined sum. */
	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	/* Nothing to sum for a non-positive count or stride. */
	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	/* xmm3 = all ones shifted right by one in each 64-bit lane, i.e. a
	   mask that clears only the sign bit of each double (fabs). */
	pcmpeqb	%xmm3, %xmm3
	psrlq	$1,    %xmm3

	/* Stride: elements -> bytes. */
	sall	$BASE_SHIFT, INCX

	/* Bias X by +16 elements; every load below uses a matching
	   -16 * SIZE based displacement.
	   NOTE(review): presumably done to keep displacements small. */
	subl	$-16 * SIZE, X

	/* Non-unit stride goes to the element-wise loop. */
	cmpl	$SIZE, INCX
	jne	.L40

	/* The unit-stride path uses movaps, which needs 16-byte aligned
	   addresses. The peel step below can only correct an offset of
	   exactly SIZE bytes, so a pointer that is not even a multiple of
	   SIZE is handled by the element-wise loop instead, which uses
	   only movsd/movhps loads. INCX already equals SIZE here, so that
	   loop walks the same contiguous elements.
	   NOTE(review): assumes SIZE is a power of two (8 for doubles). */
	testl	$SIZE - 1, X
	jne	.L40

	/* If X is offset by one element from a 16-byte boundary, consume a
	   single element so the vector loop starts aligned. */
	testl	$SIZE, X
	je	.L05

	movsd	-16 * SIZE(X), %xmm0
	addl	$SIZE, X

	andps	%xmm3, %xmm0
	subl	$1, M
	jle	.L999
	ALIGN_3

.L05:
	/* I = number of full 16-element blocks. */
	movl	M,  I
	sarl	$4, I
	jle	.L20

	/* Software pipeline: preload the first 8 elements of the first
	   block before entering the loop. */
	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	/* The last block is drained at .L11 without preloading past it. */
	decl	I
	jle	.L11
	ALIGN_4

.L10:
	/* Steady state: each iteration accumulates 16 elements and
	   preloads the first 8 elements of the next block. */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	  0 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	  2 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	  6 * SIZE(X), %xmm7

	subl	$-16 * SIZE, X
	decl	I
	jg	.L10
	ALIGN_4

.L11:
	/* Pipeline drain: finish the final 16-element block. Only its
	   second half is loaded here; nothing beyond it is touched. */
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	subl	$-16 * SIZE, X
	ALIGN_3

.L20:
	/* Remainder (0..15 elements), handled by the bits of M: 8, 4, 2, 1. */
	andl	$15,  M
	jle	.L999

	testl	$8, M
	je	.L21

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	addl	$8 * SIZE, X
	ALIGN_3

.L21:
	testl	$4, M
	je	.L22

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	addl	$4 * SIZE, X
	ALIGN_3

.L22:
	testl	$2, M
	je	.L23

	movaps	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	addl	$2 * SIZE, X

.L23:
	testl	$1, M
	je	.L999

	/* NOTE(review): when movsd is a preprocessor macro it presumably
	   maps to a load that leaves the upper lane untouched, hence the
	   explicit clear first - confirm against common.h. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm1
	jmp	.L999
	ALIGN_3

.L40:
	/* Element-wise path: arbitrary positive stride, no alignment
	   requirement on X. I = number of 8-element groups. */
	movl	M,  I
	sarl	$3, I
	jle	.L60
	ALIGN_4

.L50:
	/* Gather two strided elements per register (low half via movsd,
	   high half via movhps), then accumulate their absolute values. */
	movsd	-16 * SIZE(X), %xmm4
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm4
	addl	INCX, X
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0

	movsd	-16 * SIZE(X), %xmm5
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm5
	addl	INCX, X
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movsd	-16 * SIZE(X), %xmm6
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm6
	addl	INCX, X
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0

	movsd	-16 * SIZE(X), %xmm7
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm7
	addl	INCX, X
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	decl	I
	jg	.L50
	ALIGN_4

.L60:
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	/* Remaining 0..7 strided elements, one at a time. */
	andl	$7,  M
	jle	.L999
	ALIGN_4

.L61:
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm0
	addl	INCX, X
	decl	M
	jg	.L61
	ALIGN_4

.L999:
	/* Combine the two accumulators, then add the two lanes together. */
	addpd	%xmm1, %xmm0

#ifndef HAVE_SSE3
	movaps	%xmm0,  %xmm1
	unpckhpd %xmm0, %xmm0
	addsd	 %xmm1, %xmm0
#else
	haddpd	%xmm0, %xmm0
#endif

	/* Move the result from SSE to x87 st(0) through memory. The 8-byte
	   store lands on the incoming M and X argument slots, which are no
	   longer needed at this point. */
	movsd	%xmm0, STACK_M
	fldl	STACK_M
	popl	%ebx
	popl	%esi
	ret

	EPILOGUE
301
302