1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/*
 * Integer register roles used by the routine in this file.  The comments
 * describe how each alias is used by the visible code.
 */
42#define N	%i0	/* element counter: tested against 0, decremented, then split into N >> 3 and N & 7 */
43#define X	%i1	/* base address for every LDF; advanced as elements are consumed */
44#define INCX	%i2	/* stride; shifted left by BASE_SHIFT on entry and then added to X */
45#define I	%i3	/* loop trip counter, counted down to 0 */
46
/*
 * Floating-point register roles.
 *
 *   c1..c4  four independent running results; they are merged into c1
 *           before the routine returns.  c1 is %f0 in both variants.
 *   t1..t4  defined here but not referenced by the code visible in this file.
 *   a1..a8  staging registers for the eight elements of one unrolled group
 *           (a1 is also reused by the one-element tail loops).
 *
 * The DOUBLE variant uses only even-numbered registers, the other variant
 * uses consecutive registers (presumably because a double-precision value
 * occupies an even/odd register pair on this target -- confirm against the
 * architecture manual).
 */
47#ifdef DOUBLE
48#define c1	%f0
49#define c2	%f2
50#define c3	%f4
51#define c4	%f6
52#define t1	%f8
53#define t2	%f10
54#define t3	%f12
55#define t4	%f14
56
57#define a1	%f16
58#define a2	%f18
59#define a3	%f20
60#define a4	%f22
61#define a5	%f24
62#define a6	%f26
63#define a7	%f28
64#define a8	%f30
65#else
66#define c1	%f0
67#define c2	%f1
68#define c3	%f2
69#define c4	%f3
70#define t1	%f4
71#define t2	%f5
72#define t3	%f6
73#define t4	%f7
74
75#define a1	%f8
76#define a2	%f9
77#define a3	%f10
78#define a4	%f11
79#define a5	%f12
80#define a6	%f13
81#define a7	%f14
82#define a8	%f15
83#endif
84
/*
 * FCMOV is the conditional move applied after every FCMP in the routine.
 * It expands to FMOVG unless USE_MIN is defined, in which case it expands
 * to FMOVL.  FMOVG and FMOVL are defined outside this file; judging by their
 * names they move on "greater" / "less", which would make the default build
 * a maximum search and the USE_MIN build a minimum search -- confirm in
 * the header that defines them.
 */
85#ifndef USE_MIN
86#define FCMOV	FMOVG
87#else
88#define FCMOV	FMOVL
89#endif
90
/*
 * Reduction kernel: walks the elements addressed by X with stride INCX and
 * folds each one into a running result with an FCMP / FCMOV pair.
 *
 * Inputs (register aliases defined at the top of the file):
 *   N     element count
 *   X     address of the first element
 *   INCX  stride in elements; converted to a byte stride on entry
 * Result:
 *   left in c1 (%f0).
 *
 * Flow:
 *   - N <= 0 or INCX <= 0: return immediately with whatever FCLR(0) left
 *     in place (FCLR is defined outside this file; presumably it zeroes
 *     %f0 -- confirm).
 *   - The first element seeds c1..c4; N is reduced by one to match.
 *   - Byte stride == SIZE: contiguous path (.LL11 / .LL12 / .LL16 / .LL19).
 *   - Otherwise:            strided path    (.LL51 / .LL52 / .LL56 / .LL59).
 *   Both paths process N >> 3 groups of eight elements with four
 *   accumulators, then N & 7 leftover elements into c1 only, then merge
 *   c2..c4 into c1.
 *
 * Reading notes:
 *   - The instruction following each branch or return sits in its delay
 *     slot and executes as part of the control transfer, so several "add"
 *     and "sll" instructions below belong logically before the branch
 *     target.
 *   - PROLOGUE, SAVESP, FCLR, LDF, FMOV, FCMP, SIZE, BASE_SHIFT and
 *     EPILOGUE are not defined in this file.
 */
91	PROLOGUE
92	SAVESP
93
94	FCLR(0)
95
/* Nothing to do for a non-positive element count. */
96	cmp	N, 0
97	ble	.LL20
98	nop
99
/* Non-positive stride also exits.  The sll sits in the delay slot, so the
   stride is scaled by BASE_SHIFT whether or not the branch is taken. */
100	cmp	INCX, 0
101	ble	.LL20
102	sll	INCX, BASE_SHIFT, INCX
103
/* Consume the first element into c1; if it was the only one, c1 is the answer. */
104	add	N, -1, N
105	LDF	[X], c1
106	add	X, INCX, X
107	cmp	N, 0
108	ble	.LL20
109	nop
110
/* Seed the other three accumulators with the same first element. */
111	FMOV	c1, c2
112	FMOV	c1, c3
113	FMOV	c1, c4
114
/* Select the contiguous path only when the byte stride equals one element. */
115	cmp	INCX, SIZE
116	bne	.LL50
117	nop
118
/* ---- Contiguous path ---- */
/* I = number of full groups of eight among the remaining elements. */
119	sra	N, 3, I
120	cmp	I, 0
121	ble,pn	%icc, .LL15
122	nop
123
/* Preload the first group of eight into a1..a8. */
124	LDF	[X +  0 * SIZE], a1
125	LDF	[X +  1 * SIZE], a2
126	LDF	[X +  2 * SIZE], a3
127	LDF	[X +  3 * SIZE], a4
128
129	LDF	[X +  4 * SIZE], a5
130	LDF	[X +  5 * SIZE], a6
131	LDF	[X +  6 * SIZE], a7
132	LDF	[X +  7 * SIZE], a8
133	add	X, 8 * SIZE, X
134
/* If that was the only group, skip the steady-state loop and drain at .LL12. */
135	add	I, -1, I
136	cmp	I, 0
137	ble,pt	%icc, .LL12
138	nop
139
/* PREFETCHSIZE is defined here but not referenced by the visible code. */
140#define PREFETCHSIZE 40
141
/* Steady state: fold the group already held in a1..a8 into c1..c4 while
   loading the next group into each register as soon as it has been used.
   Four separate condition-code fields (%fcc0..%fcc3) keep the four
   compare / conditional-move chains independent. */
142.LL11:
143	FCMP	%fcc0, a1, c1
144	FCMP	%fcc1, a2, c2
145	FCMP	%fcc2, a3, c3
146	FCMP	%fcc3, a4, c4
147
148	FCMOV	%fcc0, a1, c1
149	LDF	[X +  0 * SIZE], a1
150	FCMOV	%fcc1, a2, c2
151	LDF	[X +  1 * SIZE], a2
152	FCMOV	%fcc2, a3, c3
153	LDF	[X +  2 * SIZE], a3
154	FCMOV	%fcc3, a4, c4
155	LDF	[X +  3 * SIZE], a4
156
157	FCMP	%fcc0, a5, c1
158	FCMP	%fcc1, a6, c2
159	FCMP	%fcc2, a7, c3
160	FCMP	%fcc3, a8, c4
161
162	FCMOV	%fcc0, a5, c1
163	LDF	[X +  4 * SIZE], a5
164	add	I, -1, I
165	FCMOV	%fcc1, a6, c2
166	LDF	[X +  5 * SIZE], a6
167	cmp	I, 0
168	FCMOV	%fcc2, a7, c3
169	LDF	[X +  6 * SIZE], a7
170	FCMOV	%fcc3, a8, c4
171	LDF	[X +  7 * SIZE], a8
172
/* The branch tests the integer condition codes set by the cmp above; the
   pointer advance rides in the delay slot. */
173	bg,pt	%icc, .LL11
174	add	X, 8 * SIZE, X
175
/* Drain: fold the last preloaded group without issuing further loads. */
176.LL12:
177	FCMP	%fcc0, a1, c1
178	FCMP	%fcc1, a2, c2
179	FCMP	%fcc2, a3, c3
180	FCMP	%fcc3, a4, c4
181
182	FCMOV	%fcc0, a1, c1
183	FCMOV	%fcc1, a2, c2
184	FCMOV	%fcc2, a3, c3
185	FCMOV	%fcc3, a4, c4
186
187	FCMP	%fcc0, a5, c1
188	FCMP	%fcc1, a6, c2
189	FCMP	%fcc2, a7, c3
190	FCMP	%fcc3, a8, c4
191
192	FCMOV	%fcc0, a5, c1
193	FCMOV	%fcc1, a6, c2
194	FCMOV	%fcc2, a7, c3
195	FCMOV	%fcc3, a8, c4
196
/* Tail: I = N & 7 leftover elements, handled one at a time. */
197.LL15:
198	and	N, 7, I
199	cmp	I,  0
200	ble,a,pn %icc, .LL19
201	nop
202
/* Leftover elements are folded into c1 only. */
203.LL16:
204	LDF	[X +  0 * SIZE], a1
205	FCMP	%fcc0, a1, c1
206	FCMOV	%fcc0, a1, c1
207	add	I, -1, I
208	cmp	I, 0
209	bg,pt	%icc, .LL16
210	add	X, 1 * SIZE, X
211
/* Merge the four accumulators: c1 <- (c1, c2), c3 <- (c3, c4), c1 <- (c1, c3). */
212.LL19:
213	FCMP	%fcc0, c2, c1
214	FCMP	%fcc1, c4, c3
215	FCMOV	%fcc0, c2, c1
216	FCMOV	%fcc1, c4, c3
217	FCMP	%fcc0, c3, c1
218	FCMOV	%fcc0, c3, c1
219
/* Common exit for the early-out cases and the contiguous path.
   NOTE(review): the delay slot here writes %g0 while the exit at the end of
   the strided path writes %o0; nothing in this file reads either register
   afterwards -- confirm whether the difference is intentional. */
220.LL20:
221	return	%i7 + 8
222	clr	%g0
223
/* ---- Strided path: same structure, but X advances by INCX after every load ---- */
224.LL50:
225	sra	N, 3, I
226	cmp	I, 0
227	ble,pn	%icc, .LL55
228	nop
229
/* Preload the first group of eight; the counter decrement and compare are
   interleaved with the last loads, and the final pointer advance sits in
   the delay slot of the branch that follows. */
230	LDF	[X +  0 * SIZE], a1
231	add	X, INCX, X
232	LDF	[X +  0 * SIZE], a2
233	add	X, INCX, X
234	LDF	[X +  0 * SIZE], a3
235	add	X, INCX, X
236	LDF	[X +  0 * SIZE], a4
237	add	X, INCX, X
238	LDF	[X +  0 * SIZE], a5
239	add	X, INCX, X
240	LDF	[X +  0 * SIZE], a6
241	add	X, INCX, X
242	add	I, -1, I
243	LDF	[X +  0 * SIZE], a7
244	cmp	I, 0
245	add	X, INCX, X
246	LDF	[X +  0 * SIZE], a8
247
248	ble,pt	%icc, .LL52
249	add	X, INCX, X
250
/* Steady state for the strided case: fold the held group, reload each
   register right after it has been consumed. */
251.LL51:
252	FCMP	%fcc0, a1, c1
253	FCMP	%fcc1, a2, c2
254	FCMP	%fcc2, a3, c3
255	FCMP	%fcc3, a4, c4
256
257	FCMOV	%fcc0, a1, c1
258	LDF	[X +  0 * SIZE], a1
259	add	X, INCX, X
260	FCMOV	%fcc1, a2, c2
261	LDF	[X +  0 * SIZE], a2
262	add	X, INCX, X
263	FCMOV	%fcc2, a3, c3
264	LDF	[X +  0 * SIZE], a3
265	add	X, INCX, X
266	FCMOV	%fcc3, a4, c4
267	LDF	[X +  0 * SIZE], a4
268	add	X, INCX, X
269
270	FCMP	%fcc0, a5, c1
271	add	I, -1, I
272	FCMP	%fcc1, a6, c2
273	cmp	I, 0
274	FCMP	%fcc2, a7, c3
275	FCMP	%fcc3, a8, c4
276
277	FCMOV	%fcc0, a5, c1
278	LDF	[X +  0 * SIZE], a5
279	add	X, INCX, X
280	FCMOV	%fcc1, a6, c2
281	LDF	[X +  0 * SIZE], a6
282	add	X, INCX, X
283	FCMOV	%fcc2, a7, c3
284	LDF	[X +  0 * SIZE], a7
285	add	X, INCX, X
286	FCMOV	%fcc3, a8, c4
287	LDF	[X +  0 * SIZE], a8
288
/* Loop while groups remain; the last pointer advance is in the delay slot. */
289	bg,pt	%icc, .LL51
290	add	X, INCX, X
291
/* Drain the last preloaded group. */
292.LL52:
293	FCMP	%fcc0, a1, c1
294	FCMP	%fcc1, a2, c2
295	FCMP	%fcc2, a3, c3
296	FCMP	%fcc3, a4, c4
297
298	FCMOV	%fcc0, a1, c1
299	FCMOV	%fcc1, a2, c2
300	FCMOV	%fcc2, a3, c3
301	FCMOV	%fcc3, a4, c4
302
303	FCMP	%fcc0, a5, c1
304	FCMP	%fcc1, a6, c2
305	FCMP	%fcc2, a7, c3
306	FCMP	%fcc3, a8, c4
307
308	FCMOV	%fcc0, a5, c1
309	FCMOV	%fcc1, a6, c2
310	FCMOV	%fcc2, a7, c3
311	FCMOV	%fcc3, a8, c4
312
/* Tail: I = N & 7 leftover elements, one at a time with stride INCX. */
313.LL55:
314	and	N, 7, I
315	cmp	I,  0
316	ble,a,pn %icc, .LL59
317	nop
318
/* Leftover elements are folded into c1 only. */
319.LL56:
320	LDF	[X +  0 * SIZE], a1
321	FCMP	%fcc0, a1, c1
322	FCMOV	%fcc0, a1, c1
323	add	I, -1, I
324	cmp	I, 0
325	bg,pt	%icc, .LL56
326	add	X, INCX, X
327
/* Merge the four accumulators into c1, same sequence as .LL19. */
328.LL59:
329	FCMP	%fcc0, c2, c1
330	FCMP	%fcc1, c4, c3
331	FCMOV	%fcc0, c2, c1
332	FCMOV	%fcc1, c4, c3
333	FCMP	%fcc0, c3, c1
334	FCMOV	%fcc0, c3, c1
335
/* The strided path has its own return instead of branching to .LL20.
   NOTE(review): this delay slot clears %o0 whereas .LL20 clears %g0 --
   confirm which form is intended. */
336	return	%i7 + 8
337	clr	%o0
338
339	EPILOGUE
340