1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/*
 * Double-precision ASUM micro-kernel (SSE2), AT&T syntax, SysV AMD64.
 *
 * C equivalent:  FLOAT asum(BLASLONG m, FLOAT *x, BLASLONG incx)
 * In:    M    = element count                  (rdi)
 *        X    = vector base pointer            (rsi)
 *        INCX = element stride                 (rdx)
 * Out:   %xmm0 = sum of |X[i*INCX]| for 0 <= i < M;
 *        returns 0.0 when M <= 0 or INCX <= 0.
 *
 * Strategy: four independent scalar accumulators (xmm0..xmm3) hide addsd
 * latency; they are reduced to one value at .L998.  Absolute value is taken
 * bitwise by masking off the sign bit with xmm15.
 */

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	xorps	%xmm0, %xmm0		/* accumulator 0; doubles as the 0.0 returned on early exit */
	testq	M, M
	jle	.L999			/* M <= 0: nothing to sum */
	testq	INCX, INCX
	jle	.L999			/* non-positive stride: return 0 */

	xorps	%xmm1, %xmm1		/* accumulators 1..3 */
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	pcmpeqb	%xmm15, %xmm15		/* xmm15 = all ones ... */
	psrlq	$1, %xmm15		/* ... >> 1 per qword = 0x7FFF...F: sign-clear
					   mask, so (andps xmm15, x) computes |x| */

	salq	$BASE_SHIFT, INCX	/* convert element stride to byte stride */
	xorps	 %xmm13, %xmm13		/* xmm13 is added into xmm3 at the top of .L10;
					   seed with 0 so the first iteration adds nothing */

	cmpq	$SIZE, INCX
	jne	.L20			/* stride != 1 element: take the scalar path */

	testq	$SIZE, X		/* X on an odd 8-byte boundary? */
	je	.L05

	movsd	(X), %xmm0		/* peel one element to reach 16-byte alignment */
	addq	$SIZE, X
	andps	%xmm15, %xmm0		/* xmm0 = |x[0]| (merges into accumulator 0) */
	decq	M
	jle	.L999			/* M was 1: that element is the whole sum */
	ALIGN_3

.L05:
	subq	$-16 * SIZE, X		/* bias X forward 16 elements so the loop body can
					   address with short negative displacements */

	movq	M,  I
	sarq	$4, I			/* I = M / 16 = main-loop trip count */
	jle	.L12			/* fewer than 16 left: go to tails */

	/* Software pipeline: preload the first 16 elements (2 doubles/reg). */
	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	movaps	 -8 * SIZE(X), %xmm8
	movaps	 -6 * SIZE(X), %xmm9
	movaps	 -4 * SIZE(X), %xmm10
	movaps	 -2 * SIZE(X), %xmm11

	decq	I
	jle	.L11			/* exactly one block loaded: drain without preloading */
	ALIGN_4

/* Main loop: 16 doubles per iteration.  Each xmm pair is reduced with
 * scalar adds: the low half goes directly into an accumulator (addsd),
 * and pshufd $0x4e swaps the two 64-bit halves so the high half can be
 * added as a scalar one step later (via the rotating xmm12/xmm13 temps),
 * overlapping shuffle and add latency.  The dangling swapped half is
 * retired by the first addsd of the next group/iteration.               */
.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4		/* |x| of elements 0..1 of this block */
	addsd	%xmm13, %xmm3		/* retire swapped half from the previous step */
	pshufd	$0x4e, %xmm4, %xmm12	/* xmm12 = xmm4 with 64-bit halves swapped */
	addsd	%xmm4, %xmm0		/* low half -> accumulator 0 */
	movaps	  0 * SIZE(X), %xmm4	/* preload same slot for the next iteration */

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1		/* high half of xmm4 -> accumulator 1 */
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	movaps	  2 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	movaps	  6 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0
	movaps	  8 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2
	movaps	 10 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0
	movaps	 12 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2
	movaps	 14 * SIZE(X), %xmm11

	subq	$-16 * SIZE, X		/* advance 16 elements (sub of -16*SIZE == add) */
	decq	I
	jg	.L10
	ALIGN_4

/* Pipeline drain: same reduction as .L10 for the 16 elements already
 * sitting in xmm4..xmm11, with the preload movaps instructions omitted. */
.L11:
	andps	%xmm15, %xmm4
	addsd	%xmm13, %xmm3		/* retire swapped half left over from .L10 (or the
					   zero-seeded xmm13 when the loop ran zero times) */
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2

	addsd	%xmm13, %xmm3		/* retire the final swapped half */
	subq	$-16 * SIZE, X		/* account for the drained block */
	ALIGN_3

/* Unit-stride tails: remaining M mod 16 elements, peeled in 8/4/2/1
 * chunks.  X is still biased +16 elements, hence the -16*SIZE loads. */
.L12:
	andq	$15,  M
	jle	.L998

	testq	$8, M
	je	.L13

	movaps	-16 * SIZE(X), %xmm4	/* 8-element chunk */
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7
	addq	$8 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3
	andps	%xmm15, %xmm6
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L13:
	testq	$4, M
	je	.L14

	movaps	-16 * SIZE(X), %xmm4	/* 4-element chunk */
	movaps	-14 * SIZE(X), %xmm5
	addq	$4 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L14:
	testq	$2, M
	je	.L15

	movaps	-16 * SIZE(X), %xmm4	/* 2-element chunk */
	addq	$2 * SIZE, X
	andps	%xmm15, %xmm4

	pshufd	$0x4e, %xmm4, %xmm5
	addsd	%xmm4, %xmm2		/* low half -> acc 2, high half -> acc 3 */
	addsd	%xmm5, %xmm3
	ALIGN_3

.L15:
	testq	$1, M
	je	.L998

	movsd	-16 * SIZE(X), %xmm4	/* final odd element */
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	jmp	.L998
	ALIGN_3

/* Non-unit-stride path: scalar movsd gathers, 8 elements per iteration,
 * software-pipelined one load ahead (note the last preload below leaves X
 * un-advanced; the first addq inside .L22 / at .L23 compensates).        */
.L20:
	movq	M,  I
	sarq	$3, I			/* I = M / 8 */
	jle	.L25

	movsd	(X), %xmm4		/* preload first 8 strided elements */
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X
	movsd	(X), %xmm6
	addq	INCX, X
	movsd	(X), %xmm7
	addq	INCX, X

	movsd	(X), %xmm8
	addq	INCX, X
	movsd	(X), %xmm9
	addq	INCX, X
	movsd	(X), %xmm10
	addq	INCX, X
	movsd	(X), %xmm11		/* X intentionally not advanced here */

	decq	I
	jle	.L23
	ALIGN_4

.L22:
	andps	%xmm15, %xmm4		/* |x|, accumulate, then reload the slot */
	addq	INCX, X
	addsd	%xmm4,  %xmm0
	movsd	 (X), %xmm4
	andps	%xmm15, %xmm5
	addq	INCX, X
	addsd	%xmm5,  %xmm1
	movsd	 (X), %xmm5
	andps	%xmm15, %xmm6
	addq	INCX, X
	addsd	%xmm6,  %xmm2
	movsd	 (X), %xmm6
	andps	%xmm15, %xmm7
	addq	INCX, X
	addsd	%xmm7,  %xmm3
	movsd	 (X), %xmm7

	andps	%xmm15, %xmm8
	addq	INCX, X
	addsd	%xmm8,  %xmm0
	movsd	 (X), %xmm8
	andps	%xmm15, %xmm9
	addq	INCX, X
	addsd	%xmm9,  %xmm1
	movsd	 (X), %xmm9
	andps	%xmm15, %xmm10
	addq	INCX, X
	addsd	%xmm10, %xmm2
	movsd	 (X), %xmm10
	andps	%xmm15, %xmm11
	addq	INCX, X
	addsd	%xmm11, %xmm3
	movsd	 (X), %xmm11

	decq	I
	jg	.L22
	ALIGN_4

/* Drain the last preloaded group of 8 (no reloads). */
.L23:
	andps	%xmm15, %xmm4
	addq	INCX, X			/* step past the last preloaded element so X
					   points at the first tail element */
	addsd	%xmm4,  %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm5,  %xmm1
	andps	%xmm15, %xmm6
	addsd	%xmm6,  %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7,  %xmm3

	andps	%xmm15, %xmm8
	addsd	%xmm8,  %xmm0
	andps	%xmm15, %xmm9
	addsd	%xmm9,  %xmm1
	andps	%xmm15, %xmm10
	addsd	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addsd	%xmm11, %xmm3
	ALIGN_3

/* Strided tails: remaining M mod 8 elements in 4/2/1 chunks. */
.L25:
	andq	$7,  M
	jle	.L998

	testq	$4, M
	je	.L26

	movsd	(X), %xmm4		/* 4-element chunk */
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X
	movsd	(X), %xmm6
	andps	%xmm15, %xmm4
	addsd	%xmm4,  %xmm0
	addq	INCX, X
	movsd	(X), %xmm7
	andps	%xmm15, %xmm5
	addsd	%xmm5,  %xmm1
	addq	INCX, X

	andps	%xmm15, %xmm6
	addsd	%xmm6,  %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7,  %xmm3
	ALIGN_3

.L26:
	testq	$2, M
	je	.L27

	movsd	(X), %xmm4		/* 2-element chunk */
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1
	ALIGN_3

.L27:
	testq	$1, M
	je	.L998

	movsd	(X), %xmm4		/* final odd element */
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	ALIGN_3

/* Reduce the four partial sums into the return register. */
.L998:
	addsd	%xmm1, %xmm0
	addsd	%xmm3, %xmm2
	addsd	%xmm2, %xmm0		/* xmm0 = acc0+acc1+acc2+acc3 = result */
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE


434