1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Register aliases used by the routine below.
 *
 * Integer registers:
 *   N     number of elements to sum
 *   X     address of the next element to load
 *   INCX  stride between elements (the routine scales it by BASE_SHIFT)
 *   I     loop counter
 */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/*
 * Floating-point registers:
 *   c1, c2   two running partial sums (added together before returning)
 *   t1..t4   absolute values waiting to be added into c1/c2
 *   a1..a8   elements loaded from X
 *
 * c1 is %f0 in both precisions.
 */
#define c1	%f0

#ifndef DOUBLE
/* Single precision: consecutive registers. */
#define c2	%f1
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15
#else
/* Double precision: even-numbered registers only. */
#define c2	%f2
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30
#endif
80
/*
 * asum kernel: sum of absolute values of N elements read from X with
 * stride INCX.  The result is left in c1 (%f0).
 *
 * In:   N     element count
 *       X     address of the first element
 *       INCX  stride in elements (converted to bytes on entry)
 * Out:  c1 = |x[0]| + |x[INCX]| + ... ; 0 when N <= 0 or INCX <= 0
 *
 * Layout:
 *   .LL11/.LL12  unit-stride path, 8 elements per iteration + drain
 *   .LL15/.LL16  unit-stride tail, 1 element per iteration (N & 7)
 *   .LL51/.LL52  general-stride path, 8 elements per iteration + drain
 *   .LL55/.LL56  general-stride tail, 1 element per iteration (N & 7)
 *   .LL19/.LL59  final reduction and return
 *
 * The 8-wide loops are software pipelined.  Each step adds the absolute
 * value computed in the previous step (t1..t4) into c1/c2, takes FABS of
 * an element that is already loaded, and reloads that register for the
 * next iteration.  t1..t4 therefore always hold values that have not
 * been accumulated yet; they start at zero and are flushed in the final
 * reduction.  Do not reorder instructions inside the loops.
 *
 * Branch delay slots: the instruction after every branch executes with
 * it (only when taken, for the ",a" annulled forms).
 */
	PROLOGUE
	SAVESP

	/* FCLR comes from common.h; presumably it zeroes %f0 (= c1). */
	FCLR(0)

	/* Convert the stride from elements to bytes. */
	sll	INCX, BASE_SHIFT, INCX

	/* Start every accumulator and pending value from c1. */
	FMOV	c1, c2
	FMOV	c1, t1
	FMOV	c1, t2
	FMOV	c1, t3
	FMOV	c1, t4

	/*
	 * Nothing to sum when N <= 0.  This must be tested explicitly:
	 * for a negative N, "sra N, 3" is <= 0 but "and N, 7" can be
	 * positive, which would send the tail loop over up to 7 elements
	 * the caller never asked for.
	 */
	cmp	N, 0
	ble	.LL19
	nop

	/* A non-positive stride also returns zero. */
	cmp	INCX, 0
	ble	.LL19
	nop

	/* Stride of exactly one element -> contiguous path below. */
	cmp	INCX, SIZE
	bne	.LL50
	nop

	/* ---- unit stride ------------------------------------------- */
	sra	N, 3, I			/* I = number of 8-element blocks */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first block; flags from "cmp I, 0" feed the ble. */
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	LDF	[X +  1 * SIZE], a2
	cmp	I, 0
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	/* Only one block: skip straight to the drain. */
	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

/* Prefetch distance, in elements, ahead of X. */
#define PREFETCHSIZE 128

.LL11:
	/* Accumulate previous |.|, take |a_k|, reload a_k for next block. */
	FADD	c1, t1, c1
	prefetch [X  + PREFETCHSIZE * SIZE], 0
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1

	FADD	c2, t2, c2
	add	I, -1, I
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2

	FADD	c1, t3, c1
	cmp	I, 0			/* flags consumed by the bg below */
	FABS	a3, t3
	LDF	[X +  2 * SIZE], a3

	FADD	c2, t4, c2
	nop
	FABS	a4, t4
	LDF	[X +  3 * SIZE], a4

	FADD	c1, t1, c1
	nop
	FABS	a5, t1
	LDF	[X +  4 * SIZE], a5

	FADD	c2, t2, c2
	nop
	FABS	a6, t2
	LDF	[X +  5 * SIZE], a6

	FADD	c1, t3, c1
	FABS	a7, t3
	LDF	[X +  6 * SIZE], a7
	add	X, 8 * SIZE, X

	FADD	c2, t4, c2
	FABS	a8, t4
	bg,pt	%icc, .LL11
	LDF	[X -  1 * SIZE], a8	/* delay slot; X already advanced */

.LL12:
	/* Drain: consume the last loaded block without loading more. */
	FADD	c1, t1, c1
	FABS	a1, t1
	FADD	c2, t2, c2
	FABS	a2, t2

	FADD	c1, t3, c1
	FABS	a3, t3
	FADD	c2, t4, c2
	FABS	a4, t4

	FADD	c1, t1, c1
	FABS	a5, t1
	FADD	c2, t2, c2
	FABS	a6, t2

	FADD	c1, t3, c1
	FABS	a7, t3
	FADD	c2, t4, c2
	FABS	a8, t4

.LL15:
	and	N, 7, I			/* I = leftover elements */
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	/* One element per iteration; t1 carries the pending value. */
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	cmp	I, 0
	FADD	c1, t1, c1
	FABS	a1, t1
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X

.LL19:
	/* Flush the pending values, then combine the two partial sums. */
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1
	return	%i7 + 8
	clr	%g0			/* delay slot; a write to %g0 has no effect */

	/* ---- general (non-unit, positive) stride -------------------- */
.LL50:
	sra	N, 3, I			/* I = number of 8-element blocks */
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Preload the first block, stepping X by the byte stride. */
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a8

	/* Only one block: skip straight to the drain. */
	ble,pt	%icc, .LL52
	add	X, INCX, X

.LL51:
	/* Same pipeline as .LL11, with a strided pointer step per load. */
	FADD	c1, t1, c1
	add	I, -1, I
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X

	FADD	c2, t2, c2
	cmp	I, 0			/* flags consumed by the bg below */
	FABS	a2, t2
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X

	FADD	c1, t3, c1
	FABS	a3, t3
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X

	FADD	c2, t4, c2
	FABS	a4, t4
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X

	FADD	c1, t1, c1
	FABS	a5, t1
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X

	FADD	c2, t2, c2
	FABS	a6, t2
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X

	FADD	c1, t3, c1
	FABS	a7, t3
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X

	FADD	c2, t4, c2
	FABS	a8, t4
	LDF	[X +  0 * SIZE], a8

	bg,pt	%icc, .LL51
	add	X, INCX, X

.LL52:
	/* Drain: consume the last loaded block without loading more. */
	FADD	c1, t1, c1
	FABS	a1, t1
	FADD	c2, t2, c2
	FABS	a2, t2

	FADD	c1, t3, c1
	FABS	a3, t3
	FADD	c2, t4, c2
	FABS	a4, t4

	FADD	c1, t1, c1
	FABS	a5, t1
	FADD	c2, t2, c2
	FABS	a6, t2

	FADD	c1, t3, c1
	FABS	a7, t3
	FADD	c2, t4, c2
	FABS	a8, t4

.LL55:
	and	N, 7, I			/* I = leftover elements */
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	/* One element per iteration; t1 carries the pending value. */
	LDF	[X +  0 * SIZE], a1
	FADD	c1, t1, c1
	add	I, -1, I
	FABS	a1, t1
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X

.LL59:
	/* Flush the pending values, then combine the two partial sums. */
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1
	return	%i7 + 8
	clr	%o0			/* delay slot */

	EPILOGUE
326