/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Integer register roles (incoming-argument registers of the current
 * register window):
 *   N    - number of complex elements to process
 *   X    - pointer to the current element of the input vector
 *   INCX - stride between consecutive elements (scaled to bytes by the
 *          kernel before use)
 *   I    - loop counter (N / 4 for the unrolled loops, N % 4 for the tails)
 */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/*
 * Floating-point register roles:
 *   c1, c2   - the two running partial sums; added together at the end,
 *              with the final result left in c1
 *   t1 .. t4 - absolute values computed in one step and accumulated into
 *              c1/c2 in the next (software pipelining)
 *   a1 .. a8 - raw input values: four complex elements, real and
 *              imaginary parts interleaved
 *
 * The DOUBLE build uses even-numbered registers only; the single
 * precision build packs the same roles into consecutive registers.
 */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30
#else
#define c1	%f0
#define c2	%f1
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15
#endif
80
/*
 * Complex absolute-sum kernel.
 *
 * Computes the sum over N complex elements of |real| + |imag|, reading
 * the vector at X with a stride of INCX elements.
 *
 * In:   N    (%i0) element count
 *       X    (%i1) pointer to the first element
 *       INCX (%i2) stride in complex elements
 * Out:  c1 (%f0) holds the accumulated sum on return.
 *       The sum is 0 when N <= 0 or INCX <= 0 (no memory is read).
 * Uses: I (%i3), c1, c2, t1-t4, a1-a8, integer condition codes.
 *
 * PROLOGUE, SAVESP, FCLR, FMOV, FADD, FABS, LDF, SIZE, ZBASE_SHIFT and
 * EPILOGUE come from common.h; the notes below describe how they are
 * used here, not their definitions.
 *
 * Both the unit-stride and the general-stride path are software
 * pipelined: values loaded in one step have their absolute value taken
 * in the next and are added to an accumulator one step after that.
 * The trailing FADDs at .LL19 / .LL59 drain that pipeline, so every
 * early exit must reach them with t1-t4 either zero or still pending.
 */

/* Prefetch distance, in elements of SIZE bytes, ahead of X in .LL11. */
#define PREFETCHSIZE 32

	PROLOGUE
	SAVESP

	/* Presumably zeroes FP register 0, i.e. c1 - confirm in common.h. */
	FCLR(0)

	/* Scale the stride so it can be added directly to X and compared
	   against 2 * SIZE below. */
	sll	INCX, ZBASE_SHIFT, INCX

	/* Propagate the cleared c1 into the other accumulator and into the
	   pending-value registers. */
	FMOV	c1, c2
	FMOV	c1, t1
	FMOV	c1, t2
	FMOV	c1, t3
	FMOV	c1, t4

	/* Nothing to sum for N <= 0.  Without this guard a negative N
	   skips the unrolled loop but can still leave (N & 3) non-zero,
	   which would send the tail loop reading from X. */
	cmp	N, 0
	ble	.LL19
	nop

	/* Non-positive stride: return the zero sum. */
	cmp	INCX, 0
	ble	.LL19
	nop

	/* Anything other than a contiguous vector takes the strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	/* ---- Unit stride: four complex elements per iteration ---- */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group of four elements (eight values). */
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	LDF	[X +  1 * SIZE], a2
	cmp	I, 0
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	/* Only one group: skip straight to the drain code.  The pointer
	   bump sits in the delay slot and runs either way. */
	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

.LL11:
	/* Each step: accumulate the previous |value|, take the absolute
	   value of the current one, and reload its register for the next
	   iteration.  The condition codes set by the cmp below stay live
	   until the bg at the bottom of the loop. */
	FADD	c1, t1, c1
	prefetch [X  + PREFETCHSIZE * SIZE], 0
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1

	FADD	c2, t2, c2
	add	I, -1, I
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2

	FADD	c1, t3, c1
	cmp	I, 0
	FABS	a3, t3
	LDF	[X +  2 * SIZE], a3

	FADD	c2, t4, c2
	nop
	FABS	a4, t4
	LDF	[X +  3 * SIZE], a4

	FADD	c1, t1, c1
	nop
	FABS	a5, t1
	LDF	[X +  4 * SIZE], a5

	FADD	c2, t2, c2
	nop
	FABS	a6, t2
	LDF	[X +  5 * SIZE], a6

	FADD	c1, t3, c1
	FABS	a7, t3
	LDF	[X +  6 * SIZE], a7
	add	X, 8 * SIZE, X

	FADD	c2, t4, c2
	FABS	a8, t4
	/* X has already been advanced, so the eighth value is at -1. */
	bg,pt	%icc, .LL11
	LDF	[X -  1 * SIZE], a8

.LL12:
	/* Drain: process the last loaded group without issuing new loads.
	   t1-t4 are left pending for the code that follows. */
	FADD	c1, t1, c1
	FABS	a1, t1
	FADD	c2, t2, c2
	FABS	a2, t2

	FADD	c1, t3, c1
	FABS	a3, t3
	FADD	c2, t4, c2
	FABS	a4, t4

	FADD	c1, t1, c1
	FABS	a5, t1
	FADD	c2, t2, c2
	FABS	a6, t2

	FADD	c1, t3, c1
	FABS	a7, t3
	FADD	c2, t4, c2
	FABS	a8, t4

.LL15:
	/* Remaining N % 4 elements, one complex element per iteration. */
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	I, -1, I
	cmp	I, 0
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FABS	a1, t1
	FABS	a2, t2
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X

.LL19:
	/* Fold in whatever is still pending in t1-t4, then combine the two
	   partial sums into c1. */
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1
	/* NOTE(review): the delay slot here writes %g0 while the strided
	   exit below writes %o0; the sum itself is in c1 in both cases. */
	return	%i7 + 8
	clr	%g0

.LL50:
	/* ---- General stride: four complex elements per iteration ---- */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Preload the first group, stepping X by INCX after each element. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	LDF	[X +  1 * SIZE], a8

	/* Only one group: go to the drain code; the delay slot still
	   advances X past the fourth element. */
	ble,pt	%icc, .LL52
	add	X, INCX, X

.LL51:
	/* Same accumulate / abs / reload pipeline as .LL11, with X advanced
	   by INCX after each real/imaginary pair. */
	FADD	c1, t1, c1
	add	I, -1, I
	FABS	a1, t1
	LDF	[X +  0 * SIZE], a1

	FADD	c2, t2, c2
	cmp	I, 0
	FABS	a2, t2
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X

	FADD	c1, t3, c1
	FABS	a3, t3
	LDF	[X +  0 * SIZE], a3

	FADD	c2, t4, c2
	FABS	a4, t4
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X

	FADD	c1, t1, c1
	FABS	a5, t1
	LDF	[X +  0 * SIZE], a5

	FADD	c2, t2, c2
	FABS	a6, t2
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X

	FADD	c1, t3, c1
	FABS	a7, t3
	LDF	[X +  0 * SIZE], a7

	FADD	c2, t4, c2
	FABS	a8, t4
	LDF	[X +  1 * SIZE], a8

	bg,pt	%icc, .LL51
	add	X, INCX, X

.LL52:
	/* Drain the last loaded group; t1-t4 stay pending. */
	FADD	c1, t1, c1
	FABS	a1, t1
	FADD	c2, t2, c2
	FABS	a2, t2

	FADD	c1, t3, c1
	FABS	a3, t3
	FADD	c2, t4, c2
	FABS	a4, t4

	FADD	c1, t1, c1
	FABS	a5, t1
	FADD	c2, t2, c2
	FABS	a6, t2

	FADD	c1, t3, c1
	FABS	a7, t3
	FADD	c2, t4, c2
	FABS	a8, t4

.LL55:
	/* Remaining N % 4 elements, one complex element per iteration. */
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	add	I, -1, I
	FABS	a1, t1
	FABS	a2, t2
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X

.LL59:
	/* Fold in the pending values and combine the partial sums into c1. */
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1

	return	%i7 + 8
	clr	%o0

	EPILOGUE
328