/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* ASSEMBLER is defined before common.h is pulled in; presumably that   */
/* header uses it to expose its assembly-side macros (ARG1..ARG3, SIZE, */
/* PROLOGUE, ...) - confirm against common.h.                           */
#define ASSEMBLER
#include "common.h"

/* Symbolic names for the three incoming arguments. The register names  */
/* in the trailing comments are the mapping noted by the original       */
/* author; the actual registers are whatever ARG1..ARG3 expand to.      */
#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

/* Trip counter for the 8-elements-per-iteration loops.                 */
#define I	%rax

/* Building with USE_MIN turns the same instruction stream into a       */
/* minimum reduction: every maxpd/maxsd below is textually replaced by  */
/* its min counterpart.                                                 */
#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

#include "l1param.h"

/*
 * Reduction kernel: largest (smallest when built with USE_MIN) value of
 * |e[0]| + |e[1]| over M two-double elements e read from X.
 *
 * Each element is a pair of adjacent doubles at 0 * SIZE and 1 * SIZE
 * from the element base (presumably the real and imaginary parts of a
 * complex number - the code only relies on the pair layout).
 *
 * In:    M    = number of elements
 *        X    = address of the first element
 *        INCX = stride between elements, in elements; it is shifted left
 *               by ZBASE_SHIFT below and used as a byte stride afterwards
 * Out:   low double of %xmm0 = reduced value; 0.0 when M <= 0 or
 *        INCX <= 0 (the early exits leave the pxor'ed zero in place)
 * Uses:  I (%rax), %xmm0-%xmm7, %xmm15, flags; M, X and INCX are
 *        modified in place. SAVEREGISTERS / RESTOREREGISTERS bracket the
 *        body; what they preserve is decided by their definitions
 *        elsewhere.
 *
 * Layout of the code:
 *   .L30 - .L37   elements are contiguous (byte stride == 2 * SIZE)
 *   .L40 - .L47   general stride, X is bumped by INCX per element
 *   .L998         folds the four partial accumulators into one scalar
 *
 * Four independent accumulators (%xmm0-%xmm3, two lanes each) are kept
 * so consecutive max operations do not depend on each other.
 */
	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	/* Result defaults to 0.0 for an empty or non-positively strided input. */
	pxor	%xmm0, %xmm0
	testq	M, M
	jle	.L999
	testq	INCX, INCX
	jle	.L999

	/* Scale the element stride by ZBASE_SHIFT (compared against 2 * SIZE below). */
	salq	$ZBASE_SHIFT, INCX

	/* xmm15 = all ones shifted right by one in each 64-bit lane, i.e.  */
	/* 0x7fff...ffff: AND-ing with it clears the sign bit (fabs).       */
	pcmpeqb	%xmm15, %xmm15
	psrlq	$1, %xmm15

	/* Seed every accumulator lane with the value of the first element, */
	/* so the reduction starts from real data for both max and min      */
	/* builds. That element is consumed here: X advances, M drops by 1. */
	movsd	0 * SIZE(X), %xmm0
	movsd	1 * SIZE(X), %xmm1
	addq	INCX, X
	decq	M
	andpd	 %xmm15, %xmm0
	andpd	 %xmm15, %xmm1
	addpd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0		/* broadcast low lane to both lanes */
	movapd	 %xmm0, %xmm1
	movapd	 %xmm0, %xmm2
	movapd	 %xmm0, %xmm3

	/* Contiguous elements take the fixed-offset path, anything else .L40. */
	cmpq	$2 * SIZE, INCX
	jne	.L40

/* ---- contiguous path ------------------------------------------------ */
.L30:
	/* I = remaining elements / 8; M >= 0 here, so skip when fewer than 8. */
	movq	M,  I
	sarq	$3, I
	jle	.L35
	ALIGN_4

/* Main loop: 8 elements (16 doubles) per iteration.                     */
.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* Gather two elements per register pair: first components go to  */
	/* xmm4/xmm6, second components to xmm5/xmm7 (low lane = first     */
	/* element of the pair, high lane = second).                       */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	movhpd	2 * SIZE(X), %xmm4
	movhpd	3 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	5 * SIZE(X), %xmm7
	movhpd	6 * SIZE(X), %xmm6
	movhpd	7 * SIZE(X), %xmm7

	/* |first| + |second| for elements 0-1, folded into xmm0. */
	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5,  %xmm4
	maxpd	%xmm4,  %xmm0

	/* Same for elements 2-3, folded into xmm1. */
	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7
	addpd	%xmm7,  %xmm6
	maxpd	%xmm6,  %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	/* Elements 4-7, folded into xmm2 and xmm3. */
	movsd	 8 * SIZE(X), %xmm4
	movsd	 9 * SIZE(X), %xmm5
	movhpd	10 * SIZE(X), %xmm4
	movhpd	11 * SIZE(X), %xmm5
	movsd	12 * SIZE(X), %xmm6
	movsd	13 * SIZE(X), %xmm7
	movhpd	14 * SIZE(X), %xmm6
	movhpd	15 * SIZE(X), %xmm7

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5,  %xmm4
	maxpd	%xmm4,  %xmm2

	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7
	addpd	%xmm7,  %xmm6
	maxpd	%xmm6,  %xmm3

	addq	$16 * SIZE, X
	decq	I
	jg	.L31
	ALIGN_4

/* Tail: the remaining M & 7 elements are handled as 4 + 2 + 1.          */
.L35:
	andq	$7,  M
	jle	.L998

	testq	$4, M
	je	.L36

	/* Four elements -> xmm0 / xmm1. */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	movhpd	2 * SIZE(X), %xmm4
	movhpd	3 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	5 * SIZE(X), %xmm7
	movhpd	6 * SIZE(X), %xmm6
	movhpd	7 * SIZE(X), %xmm7

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7

	addpd	%xmm5,  %xmm4
	addpd	%xmm7,  %xmm6
	maxpd	%xmm4, %xmm0
	maxpd	%xmm6, %xmm1

	addq	$8 * SIZE, X
	ALIGN_3

.L36:
	testq	$2, M
	je	.L37

	/* Two elements -> xmm0. */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	movhpd	2 * SIZE(X), %xmm4
	movhpd	3 * SIZE(X), %xmm5
	addq	$4 * SIZE, X

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5,  %xmm4
	maxpd	%xmm4, %xmm0
	ALIGN_3

.L37:
	testq	$1, M
	je	.L998

	/* Last single element: scalar max touches only the low lane of   */
	/* xmm2, its high lane keeps the value accumulated so far.        */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm4
	maxsd	%xmm4, %xmm2
	jmp	.L998
	ALIGN_4


/* ---- general-stride path -------------------------------------------- */
/* Same structure as above, but every element is read at 0/1 * SIZE(X)   */
/* and X is advanced by INCX after each one.                             */
.L40:
	movq	M,  I
	sarq	$3, I
	jle	.L45
	ALIGN_4

/* Main loop: 8 elements per iteration. */
.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	movhpd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm6
	movhpd	1 * SIZE(X), %xmm7
	addq	INCX, X

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5,  %xmm4
	maxpd	%xmm4,  %xmm0

	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7
	addpd	%xmm7,  %xmm6
	maxpd	%xmm6,  %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	movhpd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm6
	movhpd	1 * SIZE(X), %xmm7
	addq	INCX, X

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5,  %xmm4
	maxpd	%xmm4,  %xmm2

	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7
	addpd	%xmm7,  %xmm6
	maxpd	%xmm6,  %xmm3

	decq	I
	jg	.L41
	ALIGN_4

/* Tail for the strided path: 4 + 2 + 1 remaining elements. */
.L45:
	andq	$7,  M
	jle	.L998

	testq	$4, M
	je	.L46

	/* Four elements -> xmm0 / xmm1. */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	movhpd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm6
	movhpd	1 * SIZE(X), %xmm7
	addq	INCX, X

	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	andpd	%xmm15, %xmm6
	andpd	%xmm15, %xmm7
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	maxpd	%xmm4, %xmm0
	maxpd	%xmm6, %xmm1
	ALIGN_3

.L46:
	testq	$2, M
	je	.L47

	/* Two elements -> xmm2. */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	movhpd	1 * SIZE(X), %xmm5
	addq	INCX, X
	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm4
	maxpd	%xmm4, %xmm2
	ALIGN_3

.L47:
	testq	$1, M
	je	.L998

	/* Last single element: scalar max into the low lane of xmm3. */
	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	andpd	%xmm15, %xmm4
	andpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm4
	maxsd	%xmm4, %xmm3
	jmp	.L998
	ALIGN_4

/* ---- final reduction ------------------------------------------------- */
/* Fold the four two-lane accumulators into xmm0, then fold its high     */
/* lane into its low lane so the scalar result sits in the low double.   */
.L998:
	maxpd	%xmm1, %xmm0
	maxpd	%xmm3, %xmm2
	maxpd	%xmm2, %xmm0
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	ALIGN_4

/* Common exit; also reached directly by the early-out checks with       */
/* xmm0 still zero.                                                      */
.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE
