/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* i386 cdecl: arguments live on the stack.  The prologue pushes two
 * callee-saved registers (%esi, %ebx), so STACK = 8 bytes sit between
 * %esp and the first argument.  ARGS is extra local space (none here). */
#define STACK	 8
#define ARGS     0

/* Argument slots relative to %esp after the prologue.  Note the macro
 * shape: e.g. STACK_M expands to "4 + 8 + 0(%esp)", i.e. 12(%esp). */
#define STACK_M		 4 + STACK + ARGS(%esp)	/* n: element count */
#define STACK_X		 8 + STACK + ARGS(%esp)	/* x: vector base pointer */
#define STACK_INCX	12 + STACK + ARGS(%esp)	/* incx: element stride */

/* Register roles used throughout the kernel. */
#define I	%eax	/* inner-loop trip counter */
#define M	%ecx	/* remaining element count (n) */
#define X	%esi	/* current position in x (biased by +16*SIZE below) */
#define INCX	%ebx	/* stride; scaled from elements to bytes early on */

/* Provides PREFETCH / PREFETCHSIZE / PREOFFSET / FETCH128 tuning macros
 * (target-specific; see l1param.h). */
#include "l1param.h"
55
/*---------------------------------------------------------------------
 * FLOAT asum(BLASLONG n, FLOAT *x, BLASLONG incx)
 *
 * Sum of absolute values of a double-precision vector (SSE2 kernel).
 * ABI: i386 cdecl — arguments read via STACK_* above, result returned
 * on the x87 stack in st(0) (loaded by the final fldl).
 *
 * Register roles: M = n, X = x (biased by +16*SIZE), INCX = byte
 * stride, I = loop counter; xmm0/xmm1 are two independent partial-sum
 * accumulators, xmm3 holds the sign-clearing mask.  %esi/%ebx are
 * saved and restored; %eax/%ecx/%edx and xmm regs are clobbered.
 *-------------------------------------------------------------------*/
	PROLOGUE
	PROFCODE

	pushl	%esi			/* callee-saved under i386 cdecl */
	pushl	%ebx

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

	xorps	%xmm0, %xmm0		/* zero both accumulators; also the */
	xorps	%xmm1, %xmm1		/* 0.0 return value for early exits */

	testl	M, M			/* n <= 0: return 0.0 */
	jle	.L999
	testl	INCX, INCX		/* incx <= 0: return 0.0 */
	jle	.L999

	pcmpeqb	%xmm3, %xmm3		/* all-ones ... */
	psrlq	$1,    %xmm3		/* ... >> 1 per qword gives
					   0x7fffffffffffffff in each lane:
					   ANDing clears the sign bit, i.e.
					   fabs() on both packed doubles */

	sall	$BASE_SHIFT, INCX	/* element stride -> byte stride */

	subl	$-16 * SIZE, X		/* bias X by +16 elements so every
					   loop addresses with small negative
					   displacements (-16*SIZE ...) */

	cmpl	$SIZE, INCX		/* non-unit stride: take the scalar/ */
	jne	.L40			/* pairwise gather path at .L40 */

	testl	$SIZE, X		/* X only 8-byte aligned?  Peel one */
	je	.L05			/* element to reach 16-byte alignment
					   for the movaps loads below */

	movsd	-16 * SIZE(X), %xmm0	/* |x[0]| seeds accumulator 0 */
	addl	$SIZE, X

	andps	%xmm3, %xmm0		/* andps == andpd bitwise (shorter
					   encoding) */
	subl	$1, M
	jle	.L999
	ALIGN_3

.L05:
	movl	M,  I
	sarl	$4, I			/* I = n/16: main-loop trip count */
	jle	.L20

	movaps	-16 * SIZE(X), %xmm4	/* preload the first 8 elements so */
	movaps	-14 * SIZE(X), %xmm5	/* the pipelined loop can overlap */
	movaps	-12 * SIZE(X), %xmm6	/* loads with the abs/add work on */
	movaps	-10 * SIZE(X), %xmm7	/* the previous data */

	decl	I
	jle	.L11			/* only one 16-wide chunk: drain */
	ALIGN_4

/* Main loop: 16 doubles per iteration, software-pipelined — each xmm
 * register is masked and accumulated, then immediately refilled with
 * the data for the next round.  Accumulation alternates xmm0/xmm1 to
 * break the addpd dependency chain. */
.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4		/* |x| of the pair */
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4	/* refill for next half-iteration */

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	  0 * SIZE(X), %xmm4	/* these preload the NEXT iteration */

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	  2 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	  6 * SIZE(X), %xmm7

	subl	$-16 * SIZE, X		/* advance 16 elements */
	decl	I
	jg	.L10
	ALIGN_4

/* Pipeline drain: consume the four preloaded registers, then the last
 * 8 elements of the final 16-wide chunk. */
.L11:
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	subl	$-16 * SIZE, X
	ALIGN_3

/* Remainder: handle n mod 16 elements in chunks of 8 / 4 / 2 / 1. */
.L20:
	andl	$15,  M
	jle	.L999

	testl	$8, M
	je	.L21

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	addl	$8 * SIZE, X
	ALIGN_3

.L21:
	testl	$4, M
	je	.L22

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	addl	$4 * SIZE, X
	ALIGN_3

.L22:
	testl	$2, M
	je	.L23

	movaps	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	addl	$2 * SIZE, X

.L23:
	testl	$1, M
	je	.L999

#ifdef movsd
	/* common.h may redefine movsd on some targets to a form that does
	 * not zero the upper 64 bits (TODO confirm which); clear xmm4 so
	 * no stale data reaches the accumulator. */
	xorps	%xmm4, %xmm4
#endif
	movsd	-16 * SIZE(X), %xmm4	/* final odd element (scalar) */
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm1
	jmp	.L999
	ALIGN_3

/* Non-unit-stride path: gather 8 strided doubles per iteration as four
 * movsd/movhps pairs so the packed abs/add machinery is reused. */
.L40:
	movl	M,  I
	sarl	$3, I			/* I = n/8 */
	jle	.L60
	ALIGN_4

.L50:
	movsd	-16 * SIZE(X), %xmm4	/* low lane  = x[i] */
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm4	/* high lane = x[i+incx] */
	addl	INCX, X
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0

	movsd	-16 * SIZE(X), %xmm5
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm5
	addl	INCX, X
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movsd	-16 * SIZE(X), %xmm6
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm6
	addl	INCX, X
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0

	movsd	-16 * SIZE(X), %xmm7
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm7
	addl	INCX, X
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	decl	I
	jg	.L50
	ALIGN_4

.L60:
#ifdef movsd
	/* Same guard as at .L23: ensure xmm4's high lane is clean if the
	 * movsd macro replacement leaves it untouched. */
	xorps	%xmm4, %xmm4
#endif
	andl	$7,  M			/* n mod 8 scalar elements remain */
	jle	.L999
	ALIGN_4

.L61:
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm0
	addl	INCX, X
	decl	M
	jg	.L61
	ALIGN_4

.L999:
	addpd	%xmm1, %xmm0		/* merge the two accumulators */

#ifndef HAVE_SSE3
	movaps	%xmm0,  %xmm1		/* horizontal add without SSE3: */
	unpckhpd %xmm0, %xmm0		/* high lane -> low lane, */
	addsd	 %xmm1, %xmm0		/* then scalar add */
#else
	haddpd	%xmm0, %xmm0		/* SSE3 horizontal add */
#endif

	movsd	%xmm0, STACK_M		/* reuse the n argument slot as
					   scratch: i386 cdecl returns FP */
	fldl	STACK_M			/* results in st(0) */
	popl	%ebx
	popl	%esi
	ret

	EPILOGUE
318
319