1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define M	ARG1	/* rdi */
43#define X	ARG2	/* rsi */
44#define INCX	ARG3	/* rdx */
45
46#define I	%rax
47
48#include "l1param.h"
49
50	PROLOGUE
51	PROFCODE
52
53	SAVEREGISTERS
54
55	xorps	%xmm0, %xmm0
56	testq	M, M
57	jle	.L999
58	testq	INCX, INCX
59	jle	.L999
60
61	xorps	%xmm1, %xmm1
62	xorps	%xmm2, %xmm2
63	xorps	%xmm3, %xmm3
64
65	pcmpeqb	%xmm15, %xmm15
66	psrlq	$1, %xmm15
67
68	salq	$BASE_SHIFT, INCX
69
70	subq	$-16 * SIZE, X
71
72	cmpq	$SIZE, INCX
73	jne	.L40
74
75	testq	$SIZE, X
76	je	.L05
77
78	movsd	-16 * SIZE(X), %xmm0
79	addq	$SIZE, X
80
81	andps	%xmm15, %xmm0
82	subq	$1, M
83	jle	.L999
84	ALIGN_3
85
86.L05:
87	movq	M,  I
88	sarq	$4, I
89	jle	.L20
90
91	movaps	-16 * SIZE(X), %xmm4
92	movaps	-14 * SIZE(X), %xmm5
93	movaps	-12 * SIZE(X), %xmm6
94	movaps	-10 * SIZE(X), %xmm7
95
96	movaps	 -8 * SIZE(X), %xmm8
97	movaps	 -6 * SIZE(X), %xmm9
98	movaps	 -4 * SIZE(X), %xmm10
99	movaps	 -2 * SIZE(X), %xmm11
100
101	decq	I
102	jle	.L11
103	ALIGN_4
104
105.L10:
106#ifdef PREFETCH
107	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
108#endif
109
110	andps	%xmm15, %xmm4
111	addpd	%xmm4,  %xmm0
112	movaps	  0 * SIZE(X), %xmm4
113
114	andps	%xmm15, %xmm5
115	addpd	%xmm5,  %xmm1
116	movaps	  2 * SIZE(X), %xmm5
117
118	andps	%xmm15, %xmm6
119	addpd	%xmm6,  %xmm2
120	movaps	  4 * SIZE(X), %xmm6
121
122	andps	%xmm15, %xmm7
123	addpd	%xmm7,  %xmm3
124	movaps	  6 * SIZE(X), %xmm7
125
126#if defined(PREFETCH) && !defined(FETCH128)
127	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
128#endif
129
130	andps	%xmm15, %xmm8
131	addpd	%xmm8,  %xmm0
132	movaps	  8 * SIZE(X), %xmm8
133
134	andps	%xmm15, %xmm9
135	addpd	%xmm9,  %xmm1
136	movaps	 10 * SIZE(X), %xmm9
137
138	andps	%xmm15, %xmm10
139	addpd	%xmm10, %xmm2
140	movaps	 12 * SIZE(X), %xmm10
141
142	andps	%xmm15, %xmm11
143	addpd	%xmm11, %xmm3
144	movaps	 14 * SIZE(X), %xmm11
145
146	subq	$-16 * SIZE, X
147	decq	I
148	jg	.L10
149	ALIGN_4
150
151.L11:
152	andps	%xmm15, %xmm4
153	andps	%xmm15, %xmm5
154	andps	%xmm15, %xmm6
155	andps	%xmm15, %xmm7
156
157	addpd	%xmm4,  %xmm0
158	addpd	%xmm5,  %xmm1
159	addpd	%xmm6,  %xmm2
160	addpd	%xmm7,  %xmm3
161
162	andps	%xmm15, %xmm8
163	andps	%xmm15, %xmm9
164	andps	%xmm15, %xmm10
165	andps	%xmm15, %xmm11
166
167	addpd	%xmm8,  %xmm0
168	addpd	%xmm9,  %xmm1
169	addpd	%xmm10, %xmm2
170	addpd	%xmm11, %xmm3
171
172	subq	$-16 * SIZE, X
173	ALIGN_3
174
175.L20:
176	andq	$15,  M
177	jle	.L998
178
179	testq	$8, M
180	je	.L21
181
182	movaps	-16 * SIZE(X), %xmm4
183	movaps	-14 * SIZE(X), %xmm5
184	movaps	-12 * SIZE(X), %xmm6
185	movaps	-10 * SIZE(X), %xmm7
186
187	andps	%xmm15, %xmm4
188	andps	%xmm15, %xmm5
189	andps	%xmm15, %xmm6
190	andps	%xmm15, %xmm7
191
192	addpd	%xmm4, %xmm0
193	addpd	%xmm5, %xmm1
194	addpd	%xmm6, %xmm2
195	addpd	%xmm7, %xmm3
196	addq	$8 * SIZE, X
197	ALIGN_3
198
199.L21:
200	testq	$4, M
201	je	.L22
202
203	movaps	-16 * SIZE(X), %xmm4
204	movaps	-14 * SIZE(X), %xmm5
205
206	andps	%xmm15, %xmm4
207	andps	%xmm15, %xmm5
208	addpd	%xmm4, %xmm0
209	addpd	%xmm5, %xmm1
210
211	addq	$4 * SIZE, X
212	ALIGN_3
213
214.L22:
215	testq	$2, M
216	je	.L23
217
218	movaps	-16 * SIZE(X), %xmm6
219	andps	%xmm15, %xmm6
220	addpd	%xmm6, %xmm3
221	addq	$2 * SIZE, X
222
223.L23:
224	testq	$1, M
225	je	.L998
226
227#ifdef movsd
228	xorps	%xmm4, %xmm4
229#endif
230	movsd	-16 * SIZE(X), %xmm4
231	andps	%xmm15, %xmm4
232	addsd	%xmm4, %xmm0
233	jmp	.L998
234	ALIGN_3
235
236.L40:
237	movq	M,  I
238	sarq	$3, I
239	jle	.L60
240	ALIGN_4
241
242.L50:
243	movsd	-16 * SIZE(X), %xmm4
244	addq	INCX, X
245	movhpd	-16 * SIZE(X), %xmm4
246	addq	INCX, X
247	andps	%xmm15, %xmm4
248	addpd	%xmm4, %xmm0
249
250	movsd	-16 * SIZE(X), %xmm5
251	addq	INCX, X
252	movhpd	-16 * SIZE(X), %xmm5
253	addq	INCX, X
254	andps	%xmm15, %xmm5
255	addpd	%xmm5, %xmm1
256
257	movsd	-16 * SIZE(X), %xmm6
258	addq	INCX, X
259	movhpd	-16 * SIZE(X), %xmm6
260	addq	INCX, X
261	andps	%xmm15, %xmm6
262	addpd	%xmm6, %xmm2
263
264	movsd	-16 * SIZE(X), %xmm7
265	addq	INCX, X
266	movhpd	-16 * SIZE(X), %xmm7
267	addq	INCX, X
268	andps	%xmm15, %xmm7
269	addpd	%xmm7, %xmm3
270
271	decq	I
272	jg	.L50
273	ALIGN_4
274
275.L60:
276#ifdef movsd
277	xorps	%xmm4, %xmm4
278#endif
279	andq	$7,  M
280	jle	.L998
281	ALIGN_4
282
283.L61:
284	movsd	-16 * SIZE(X), %xmm4
285	andps	%xmm15, %xmm4
286	addpd	%xmm4, %xmm0
287	addq	INCX, X
288	decq	M
289	jg	.L61
290	ALIGN_4
291
292.L998:
293	addpd	%xmm1, %xmm0
294	addpd	%xmm3, %xmm2
295	addpd	%xmm2, %xmm0
296	ALIGN_4
297
298.L999:
299#ifndef HAVE_SSE3
300	movhlps	%xmm0, %xmm1
301	addsd	%xmm1, %xmm0
302#else
303	haddpd	%xmm0, %xmm0
304#endif
305
306	RESTOREREGISTERS
307
308	ret
309
310	EPILOGUE
311
312