/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/* ASSEMBLER is defined before common.h is pulled in; presumably it selects
   the assembler-side definitions in that header (PROLOGUE, SAVESP, LDF,
   FABS, FADD, FCMP, FMOV, SIZE, ZBASE_SHIFT, ...) -- confirm in common.h. */
#define ASSEMBLER
#include "common.h"

/* Incoming (%i) registers. */
#define N	%i0		/* element count; decremented as elements are consumed */
#define X	%i1		/* address of the current element (re at +0, im at +SIZE) */
#define INCX	%i2		/* stride; shifted by ZBASE_SHIFT and then added to X */
#define I	%i3		/* loop trip counter */

/* Integer work registers.
   v1..v4 hold the candidate index of each of the four comparison lanes;
   count is the 1-based index of the element handled by lane 1 in the
   current step. */
#define v1	%o0
#define v2	%o1
#define v3	%o2
#define v4	%o3
#define count	%o4

/* Floating-point work registers.
   c1..c4 : current extreme value of each lane
   t1..t8 : |re| / |im| temporaries (t1, t3, t5, t7 end up holding the sums)
   a1..a8 : raw values loaded from X (four re/im pairs)
   With DOUBLE only even-numbered registers are used; otherwise consecutive
   registers are used. */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14
#define t5	%f16
#define t6	%f18
#define t7	%f20
#define t8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7
#define t5	%f8
#define t6	%f9
#define t7	%f10
#define t8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19
#endif

/* Direction of the search: by default a candidate replaces the current
   value when it compares greater (maximum); with USE_MIN it replaces it
   when it compares less (minimum).  FCMOV moves the value, CMOV the index. */
#ifndef USE_MIN
#define FCMOV	FMOVG
#define CMOV	movg
#else
#define FCMOV	FMOVL
#define CMOV	movl
#endif


/*
 * Index-of-extreme kernel for a vector of (re, im) pairs.
 *
 * In:   N    (%i0)  number of elements
 *       X    (%i1)  address of the first element; re at +0, im at +SIZE
 *       INCX (%i2)  stride between elements (scaled below by ZBASE_SHIFT)
 * Out:  %i0 = 1-based index of the element whose |re| + |im| is largest
 *             (smallest when USE_MIN is defined); 0 if N <= 0 or INCX <= 0.
 *
 * Four independent lanes (c1/v1 .. c4/v4) track the extreme of every fourth
 * element; they are merged at .LL19 / .LL59.  Within a lane only a strictly
 * better value replaces the current one, so the earliest index is kept.
 * When merging lanes, equal values keep the smaller index so that the
 * first occurrence is returned.
 *
 * The entry/exit sequence comes from PROLOGUE / SAVESP / EPILOGUE in
 * common.h.  The result is written to %i0 and the routine leaves through
 * "return", so the code runs in its own register window; vtmp (%l0) is
 * used as scratch on that basis.  NOTE(review): confirm that no macro from
 * common.h relies on %l0.
 */
#define vtmp	%l0

	PROLOGUE
	SAVESP

	FCLR(0)

	/* N <= 0 -> result 0.  The clr sits in the (non-annulled) delay slot,
	   so v1 is 0 on this exit and still 0 on the INCX exit below. */
	cmp	N, 0
	ble	.LL20
	clr	v1

	/* INCX <= 0 -> result 0.  The delay slot scales INCX; from here on it
	   is added directly to the address in X. */
	cmp	INCX, 0
	ble	.LL20
	sll	INCX, ZBASE_SHIFT, INCX

	mov	1, v1

	/* First element seeds the search: c1 = |re| + |im|, index 1. */
	LDF	[X + 0 * SIZE], c1
	LDF	[X + 1 * SIZE], c2
	add	N, -1, N
	FABS	c1, c1
	add	X, INCX, X
	FABS	c2, c2
	cmp	N, 0
	ble	.LL20
	FADD	c1, c2, c1

	/* All four lanes start from the first element's value.  v2..v4 start
	   at 1 and receive their lane offset (+1, +2, +3) only at merge time. */
	FMOV	c1, c2
	mov	1, v2
	FMOV	c1, c3
	mov	1, v3
	FMOV	c1, c4
	mov	1, v4
	mov	2, count

	/* Contiguous pairs (stride == 2 * SIZE) use the fixed-offset loop;
	   anything else goes to the strided loop at .LL50. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	/* I = number of groups of four remaining elements. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group; the loop below is software pipelined
	   (it evaluates the loaded group while loading the next one). */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4

	LDF	[X +  4 * SIZE], a5
	add	I, -1, I
	LDF	[X +  5 * SIZE], a6
	cmp	I, 0
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

#define PREFETCHSIZE 32

.LL11:
	prefetch [X + PREFETCHSIZE * SIZE], 0

	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2
	FABS	a3, t3
	LDF	[X +  2 * SIZE], a3
	FABS	a4, t4
	LDF	[X +  3 * SIZE], a4

	FABS	a5, t5
	LDF	[X +  4 * SIZE], a5
	FABS	a6, t6
	LDF	[X +  5 * SIZE], a6
	FABS	a7, t7
	LDF	[X +  6 * SIZE], a7
	FABS	a8, t8
	LDF	[X +  7 * SIZE], a8

	/* t1, t3, t5, t7 = |re| + |im| of the four elements of this group. */
	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	/* Each lane records count (the group's first index) when it updates. */
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	bg,pt	%icc, .LL11
	add	X, 8 * SIZE, X

	/* Pipeline drain: evaluate the last group that was loaded. */
.LL12:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* Remaining (N & 3) elements, one at a time, all through lane 1. */
.LL15:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2

	FABS	a1, t1
	FABS	a2, t2
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	count, 1, count
	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X

	/* Merge the four lanes into lane 1. */
.LL19:
	/* Turn the recorded group index into the element index of each lane. */
	add	v2, 1, v2
	add	v3, 2, v3
	add	v4, 3, v4

	/* Lane 2 into lane 1.  Equal values: keep the smaller index. */
	FCMP	%fcc0, c2, c1
	cmp	v2, v1
	mov	v1, vtmp
	movl	%icc, v2, vtmp		/* vtmp = min(v1, v2) */
	move	%fcc0, vtmp, v1
	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1

	/* Lane 4 into lane 3, same rule. */
	FCMP	%fcc1, c4, c3
	cmp	v4, v3
	mov	v3, vtmp
	movl	%icc, v4, vtmp		/* vtmp = min(v3, v4) */
	move	%fcc1, vtmp, v3
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3

	/* Lane 3 into lane 1; only the index is needed from here on. */
	FCMP	%fcc0, c3, c1
	cmp	v3, v1
	mov	v1, vtmp
	movl	%icc, v3, vtmp		/* vtmp = min(v1, v3) */
	move	%fcc0, vtmp, v1
	CMOV	%fcc0, v3, v1

	/* Common exit: result index goes out in %i0. */
.LL20:
	mov	v1, %i0
	return	%i7 + 8
	nop

	/* Strided variant: same scheme, X advanced by INCX after every pair. */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	LDF	[X +  1 * SIZE], a8
	ble,pt	%icc, .LL52
	add	X, INCX, X

.LL51:
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	FABS	a3, t3
	LDF	[X +  0 * SIZE], a3
	FABS	a4, t4
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X

	FABS	a5, t5
	LDF	[X +  0 * SIZE], a5
	FABS	a6, t6
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	FABS	a7, t7
	LDF	[X +  0 * SIZE], a7
	FABS	a8, t8
	LDF	[X +  1 * SIZE], a8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	bg,pt	%icc, .LL51
	add	X, INCX, X

	/* Pipeline drain for the strided loop. */
.LL52:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* Remaining (N & 3) strided elements, through lane 1. */
.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2

	FABS	a1, t1
	add	I, -1, I
	FABS	a2, t2
	cmp	I, 0
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	count, 1, count
	bg,pt	%icc, .LL56
	add	X, INCX, X

	/* Merge the four lanes into lane 1 (same sequence as .LL19). */
.LL59:
	add	v2, 1, v2
	add	v3, 2, v3
	add	v4, 3, v4

	/* Lane 2 into lane 1.  Equal values: keep the smaller index. */
	FCMP	%fcc0, c2, c1
	cmp	v2, v1
	mov	v1, vtmp
	movl	%icc, v2, vtmp		/* vtmp = min(v1, v2) */
	move	%fcc0, vtmp, v1
	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1

	/* Lane 4 into lane 3, same rule. */
	FCMP	%fcc1, c4, c3
	cmp	v4, v3
	mov	v3, vtmp
	movl	%icc, v4, vtmp		/* vtmp = min(v3, v4) */
	move	%fcc1, vtmp, v3
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3

	/* Lane 3 into lane 1; only the index is needed from here on. */
	FCMP	%fcc0, c3, c1
	cmp	v3, v1
	mov	v1, vtmp
	movl	%icc, v3, vtmp		/* vtmp = min(v1, v3) */
	move	%fcc0, vtmp, v1
	CMOV	%fcc0, v3, v1

	mov	v1, %i0
	return	%i7 + 8
	nop

	EPILOGUE