1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/*
 * SASUM kernel: sum of absolute values of a single-precision vector.
 * Rough C equivalent:  float asum(long m, float *x, long incx)
 *
 * ABI: SysV AMD64 (per the register comments below); result returned in xmm0.
 * ARG1..ARG3 and SIZE come from common.h — SIZE is presumably 4
 * (sizeof(float)) for this single-precision kernel; TODO confirm against
 * common.h.  Returns 0.0 when m <= 0 or incx <= 0.
 */
#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax	/* unrolled-loop trip counter (scratch) */

#include "l1param.h"	/* PREFETCH / PREFETCHSIZE / PREOFFSET tuning knobs */

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	/* xmm0 doubles as return value: start at 0 so early exits return 0. */
	xorps	%xmm0, %xmm0
	testq	M, M
	jle	.L999
	testq	INCX, INCX
	jle	.L999

	/* Four partial accumulators (xmm0-xmm3) hide addps latency in the
	   unrolled loop; they are reduced to one scalar at .L998. */
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	/* Build the fabs mask 0x7FFFFFFF in every dword lane:
	   all-ones >> 1 clears just the sign bit of each float. */
	pcmpeqb	%xmm15, %xmm15
	psrld	$1, %xmm15

	/* Convert element stride to byte stride. */
	leaq	(, INCX, SIZE), INCX

	/* Unit stride takes the vectorized path; anything else goes scalar. */
	cmpq	$SIZE, INCX
	jne	.L100

	/* Bias X by +32 elements so loop loads use displacements in
	   [-32*SIZE, -SIZE] — element i is addressed as (i-32)*SIZE(X). */
	subq	$-32 * SIZE, X

	/* Too short to be worth aligning; handle <=3 elements as tails. */
	cmpq	$3, M
	jle	.L18

	/* Alignment peeling toward a 16-byte boundary for movaps:
	   first one scalar if X is only 4-byte aligned... */
	testq	$4, X
	je	.L05
	movss	-32 * SIZE(X), %xmm0
	andps	%xmm15, %xmm0
	addq	$SIZE, X
	decq	M
	jle	.L998
	ALIGN_3

.L05:
	/* ...then one 8-byte pair if X is only 8-byte aligned. */
	testq	$8, X
	je	.L10

	movsd	-32 * SIZE(X), %xmm1
	andps	%xmm15, %xmm1
	addq	$2 * SIZE, X
	subq	$2, M
	jle	.L998
	ALIGN_3

.L10:
	/* Main loop processes 32 floats per iteration (M >> 5 trips). */
	movq	M,  I
	sarq	$5, I
	jle	.L14

	/* Software pipelining: preload the first 32 elements so each loop
	   iteration consumes one batch while fetching the next. */
	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-24 * SIZE(X), %xmm6
	movaps	-20 * SIZE(X), %xmm7

	movaps	-16 * SIZE(X), %xmm8
	movaps	-12 * SIZE(X), %xmm9
	movaps	 -8 * SIZE(X), %xmm10
	movaps	 -4 * SIZE(X), %xmm11
	decq	I
	jle	.L12
	ALIGN_3

.L11:
	/* Steady state: abs+accumulate the in-flight batch, reload the
	   next 32 elements at positive displacements (pre-advance). */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	movaps	  0 * SIZE(X), %xmm4

	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1
	movaps	  4 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	movaps	  8 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3
	movaps	 12 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	movaps	 16 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1
	movaps	 20 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	movaps	 24 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3
	movaps	 28 * SIZE(X), %xmm11

	subq	$-32 * SIZE, X
	decq	I
	jg	.L11
	ALIGN_3

.L12:
	/* Drain: accumulate the final preloaded batch (no reloads). */
	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3

	subq	$-32 * SIZE, X
	ALIGN_3

.L14:
	/* Tail: remaining element count is in the low 5 bits of M.
	   Handle 16, 8, 4, 2, 1 leftover elements in turn. */
	testq	$16, M
	je	.L16

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	movaps	-24 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-20 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$16 * SIZE, X
	ALIGN_3

.L16:
	testq	$8, M
	je	.L17

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$8 * SIZE, X
	ALIGN_3

.L17:
	testq	$4, M
	je	.L18

	movaps	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	addq	$4 * SIZE, X
	ALIGN_3

.L18:
	testq	$2, M
	je	.L19

	/* Some builds remap movsd (e.g. to a form that does not zero the
	   upper lanes) — clear xmm7 first so the addps below only adds the
	   two loaded elements.  NOTE(review): guard condition inherited
	   from common.h; confirm which targets define movsd. */
#ifdef movsd
	xorps	%xmm7, %xmm7
#endif
	movsd	-32 * SIZE(X), %xmm7
	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3
	addq	$2 * SIZE, X
	ALIGN_3

.L19:
	testq	$1, M
	je	.L998

	/* Last odd element; movss load zeroes the upper lanes, so addps is
	   safe here. */
	movss	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	jmp	.L998
	ALIGN_4

.L100:
	/* Strided (incx != 1) path: scalar loads, unrolled 8x, spread over
	   the four accumulators. */
	movq	M,  I
	sarq	$3, I
	jle	.L105
	ALIGN_4

.L101:
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	andps	%xmm15, %xmm4
	addss	%xmm4, %xmm0

	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	andps	%xmm15, %xmm5
	addss	%xmm5, %xmm1

	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	andps	%xmm15, %xmm6
	addss	%xmm6, %xmm2

	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	andps	%xmm15, %xmm7
	addss	%xmm7, %xmm3

	movss	0 * SIZE(X), %xmm8
	addq	INCX, X
	andps	%xmm15, %xmm8
	addss	%xmm8, %xmm0

	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	andps	%xmm15, %xmm4
	addss	%xmm4, %xmm1

	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	andps	%xmm15, %xmm5
	addss	%xmm5, %xmm2

	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	andps	%xmm15, %xmm6
	addss	%xmm6, %xmm3

	decq	I
	jg	.L101
	ALIGN_4

.L105:
	/* Strided tail: up to 7 remaining elements, one at a time. */
	andq	$7,  M
	jle	.L998
	ALIGN_4

.L106:
	movss	0 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0
	addq	INCX, X
	decq	M
	jg	.L106
	ALIGN_4

.L998:
	/* Reduce the four partial vectors into xmm0, then horizontally sum
	   xmm0's four lanes to a single scalar in lane 0 (the return slot). */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

#ifndef HAVE_SSE3
	/* SSE2 horizontal add: high pair onto low pair, then lane1 + lane0. */
	movhlps	%xmm0, %xmm1
	addps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1
	shufps  $1, %xmm0, %xmm0
	addss	 %xmm1, %xmm0
#else
	/* SSE3: two haddps fold all four lanes into lane 0. */
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#endif
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE
346