/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* ASSEMBLER is defined before pulling in common.h; presumably it selects
   the assembler-side definitions in that header (confirm in common.h). */
#define ASSEMBLER
#include "common.h"

/* Integer registers used by the kernel.
     N    - element count; decremented once after the first element is read
     X    - address of the current element; advanced as the scan proceeds
     INCX - stride; shifted by ZBASE_SHIFT and then added to X as an
            address increment
     I    - loop trip counter                                              */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/* Floating-point register roles.
     c1..c4 - four running extreme values, one per element of an unrolled
              iteration; merged into c1 before returning
     t1..t8 - absolute values and their pairwise sums
     a1..a8 - values loaded from X (four two-component elements)
   With DOUBLE defined only even-numbered registers are used (%f0..%f38);
   otherwise consecutive registers %f0..%f19 are used.                   */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14
#define t5	%f16
#define t6	%f18
#define t7	%f20
#define t8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7
#define t5	%f8
#define t6	%f9
#define t7	%f10
#define t8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19
#endif

/* FCMOV is the conditional FP move that follows each FCMP(candidate, current):
   move-if-greater keeps a running maximum (default), move-if-less keeps a
   running minimum (USE_MIN).  FMOVG/FMOVL are presumably wrappers from
   common.h around the SPARC V9 fmov{s,d}g / fmov{s,d}l instructions.      */
#ifndef USE_MIN
#define FCMOV	FMOVG
#else
#define FCMOV	FMOVL
#endif

/*
 * Kernel body; the exported symbol is supplied by the PROLOGUE/EPILOGUE
 * macros from common.h.
 *
 * Scans N two-component elements starting at X and tracks the extreme
 * value of |x[0]| + |x[1]| over all elements:
 *   - the maximum by default      (FCMOV = FMOVG)
 *   - the minimum under USE_MIN   (FCMOV = FMOVL)
 *
 * In:    N    (%i0)  element count
 *        X    (%i1)  address of the first element
 *        INCX (%i2)  stride between elements, in elements
 * Out:   c1 (%f0) holds the result when the routine returns.
 *        When N <= 0 or INCX <= 0 the routine returns right after FCLR(0)
 *        (presumably with 0.0 in %f0 -- confirm FCLR in common.h).
 * Uses:  I (%i3), c1-c4, t1-t8, a1-a8, %fcc0-%fcc3, %icc; N, X and INCX
 *        are modified.
 *
 * Layout: .LL11/.LL12/.LL16 handle the contiguous case (byte stride equal
 * to 2 * SIZE); .LL51/.LL52/.LL56 are the same algorithm for any other
 * positive stride.  Both main loops process four elements per trip and are
 * software-pipelined: the loads for the next trip are interleaved with the
 * FABS of the values loaded on the previous trip.
 *
 * Note: SPARC branches have a delay slot, so the instruction written
 * directly after each branch/return belongs to that branch.
 */
	PROLOGUE
	SAVESP

	/* Clear FP register 0 (= c1) so the early exits below return with a
	   defined value; exact semantics of FCLR live in common.h. */
	FCLR(0)

	/* Nothing to scan when N <= 0. */
	cmp	N, 0
	ble	.LL20
	nop

	/* A non-positive stride also exits early.  The sll is in the delay
	   slot of a non-annulled branch, so it executes on both paths; from
	   here on INCX is used as an address increment. */
	cmp	INCX, 0
	ble	.LL20
	sll	INCX, ZBASE_SHIFT, INCX

	/* Seed c1 with |x0[0]| + |x0[1]| of the first element, step X past it
	   and drop it from the count.  If that was the only element, leave
	   with c1 as the result (the FADD in the delay slot still runs). */
	LDF	[X + 0 * SIZE], c1
	LDF	[X + 1 * SIZE], c2
	add	N, -1, N
	FABS	c1, c1
	add	X, INCX, X
	FABS	c2, c2
	cmp	N, 0
	ble	.LL20
	FADD	c1, c2, c1

	/* All four running extremes start from the first element's value. */
	FMOV	c1, c2
	FMOV	c1, c3
	FMOV	c1, c4

	/* Byte stride different from one element (2 * SIZE): strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	/* ---- contiguous path ------------------------------------------- */

	/* I = number of four-element groups among the remaining N. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline prologue: load the first group of four elements. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4

	LDF	[X +  4 * SIZE], a5
	add	I, -1, I
	LDF	[X +  5 * SIZE], a6
	cmp	I, 0
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	/* Only one group: skip the steady-state loop and drain at .LL12.
	   X is advanced past the loaded group on both paths (delay slot). */
	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

/* Prefetch distance, in units of SIZE, ahead of the current X. */
#define PREFETCHSIZE 40

.LL11:
	prefetch [X + PREFETCHSIZE * SIZE], 0

	/* t_k = |a_k| for the group loaded earlier, while a_k is reloaded
	   with the next group. */
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2
	FABS	a3, t3
	LDF	[X +  2 * SIZE], a3
	FABS	a4, t4
	LDF	[X +  3 * SIZE], a4

	FABS	a5, t5
	LDF	[X +  4 * SIZE], a5
	FABS	a6, t6
	LDF	[X +  5 * SIZE], a6
	FABS	a7, t7
	LDF	[X +  6 * SIZE], a7
	FABS	a8, t8
	LDF	[X +  7 * SIZE], a8

	/* Per-element value: sum of the two absolute components. */
	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	/* Compare each candidate with its own running extreme ... */
	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	/* ... and replace it when the FCMOV condition holds.  The loop
	   counter update is interleaved with the conditional moves. */
	FCMOV	%fcc0, t1, c1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	FCMOV	%fcc3, t7, c4

	bg,pt	%icc, .LL11
	add	X, 8 * SIZE, X

.LL12:
	/* Pipeline epilogue: process the last loaded group, no new loads. */
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	FCMOV	%fcc1, t3, c2
	FCMOV	%fcc2, t5, c3
	FCMOV	%fcc3, t7, c4

.LL15:
	/* I = leftover elements (N mod 4); none left -> merge and return. */
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	/* Remainder loop: one element per trip, folded into c1 only. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2

	FABS	a1, t1
	FABS	a2, t2
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X

.LL19:
	/* Merge the four running extremes: c1 <- (c1,c2), c3 <- (c3,c4),
	   then c1 <- (c1,c3). */
	FCMP	%fcc0, c2, c1
	FCMP	%fcc1, c4, c3
	FCMOV	%fcc0, c2, c1
	FCMOV	%fcc1, c4, c3
	FCMP	%fcc0, c3, c1
	FCMOV	%fcc0, c3, c1

.LL20:
	/* Common exit; the result is in c1 (%f0).  The delay slot writes
	   %g0, which always reads as zero on SPARC, so it acts as a filler. */
	return	%i7 + 8
	clr	%g0

	/* ---- strided path ---------------------------------------------- */

.LL50:
	/* I = number of four-element groups among the remaining N. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Pipeline prologue: load four elements, stepping X by INCX after
	   each one.  The last step is in the delay slot of the branch and
	   therefore runs on both paths. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	LDF	[X +  1 * SIZE], a8
	ble,pt	%icc, .LL52
	add	X, INCX, X

.LL51:
	/* Same steady-state body as .LL11, with X advanced by INCX between
	   elements instead of using fixed offsets. */
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	FABS	a3, t3
	LDF	[X +  0 * SIZE], a3
	FABS	a4, t4
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X

	FABS	a5, t5
	LDF	[X +  0 * SIZE], a5
	FABS	a6, t6
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	FABS	a7, t7
	LDF	[X +  0 * SIZE], a7
	FABS	a8, t8
	LDF	[X +  1 * SIZE], a8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	FCMOV	%fcc3, t7, c4


	/* The fourth X step of the trip is done in the delay slot. */
	bg,pt	%icc, .LL51
	add	X, INCX, X

.LL52:
	/* Pipeline epilogue: process the last loaded group, no new loads. */
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	FCMOV	%fcc1, t3, c2
	FCMOV	%fcc2, t5, c3
	FCMOV	%fcc3, t7, c4

.LL55:
	/* I = leftover elements (N mod 4); none left -> merge and return. */
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	/* Remainder loop: one element per trip, folded into c1 only. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2

	FABS	a1, t1
	add	I, -1, I
	FABS	a2, t2
	cmp	I, 0
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1

	bg,pt	%icc, .LL56
	add	X, INCX, X

.LL59:
	/* Merge the four running extremes into c1, as at .LL19. */
	FCMP	%fcc0, c2, c1
	FCMP	%fcc1, c4, c3
	FCMOV	%fcc0, c2, c1
	FCMOV	%fcc1, c4, c3
	FCMP	%fcc0, c3, c1
	FCMOV	%fcc0, c3, c1

	/* Result is in c1 (%f0).  NOTE(review): this delay slot clears %o0
	   whereas the exit at .LL20 uses %g0; the FP result is unaffected
	   either way -- confirm whether the difference is intentional. */
	return	%i7 + 8
	clr	%o0

	EPILOGUE
