/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer registers: the three incoming arguments and a loop counter.
   N    - element count, tested against 0 and split into N >> 2 and N & 3
   X    - address the loads are based on; advanced as data is consumed
   INCX - stride; shifted left by ZBASE_SHIFT before it is added to X
   I    - remaining iteration count of the loop currently running      */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/* c1-c4: running sums, only ever written by faddd.
   t1-t4: products written by fsmuld and added into c1-c4 afterwards.  */
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

/* a1-a8: values loaded from memory; used only as fsmuld operands.     */
#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30

/*
 * Square-root-of-sum-of-squares kernel.
 *
 * Reads N elements starting at X.  Every element consists of two
 * adjacent values at offsets 0 and 1 * SIZE; each value is squared
 * with fsmuld (single x single -> double), the squares are summed with
 * faddd, and the square root of the total is left in c1 (%f0).
 *
 *   N    (%i0)  number of elements
 *   X    (%i1)  address of the first element
 *   INCX (%i2)  distance between elements; shifted left by ZBASE_SHIFT
 *               on entry so that it can be added directly to X
 *
 * If INCX <= 0 or N <= 0 the routine returns immediately with c1 still
 * holding whatever FCLR(0) stored (presumably 0.0 - FCLR is defined in
 * common.h, confirm there).
 *
 * No scaling is applied before squaring; the sums are carried in
 * double precision.
 * NOTE(review): the two-values-per-element layout looks like
 * interleaved complex data - confirm against the caller.
 *
 * Scheduling: every fsmuld result tK is added to cK by a later faddd,
 * so t1-t4 always hold products that have not been accumulated yet.
 * They start out as copies of the cleared c1, which makes the first
 * faddd of each chain harmless.  The instruction following each branch
 * sits in its delay slot; do not reorder around branches.
 */
	PROLOGUE
	SAVESP

	/* Clear c1 (via FCLR) and replicate it into the other sums and
	   into the pending-product registers. */
	FCLR(0)

	FMOV	c1, c2
	FMOV	c1, c3
	FMOV	c1, c4
	FMOV	c1, t1
	FMOV	c1, t2
	FMOV	c1, t3
	FMOV	c1, t4

	/* Non-positive stride or count: nothing to accumulate. */
	cmp	INCX, 0
	ble	.LL20
	sll	INCX, ZBASE_SHIFT, INCX	/* delay slot: INCX <<= ZBASE_SHIFT */

	cmp	N, 0
	ble	.LL20
	nop

	/* A scaled stride of exactly 2 * SIZE means the elements are
	   packed back to back; any other stride goes to .LL50. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	/* ---- Packed path: I = N >> 2 blocks of four elements. ---- */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15		/* fewer than four elements */
	nop

	/* Preload the first block (eight values) into a1-a8. */
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	ld	[X +  2 * SIZE], a3
	ld	[X +  3 * SIZE], a4
	ld	[X +  4 * SIZE], a5
	ld	[X +  5 * SIZE], a6
	ld	[X +  6 * SIZE], a7
	ld	[X +  7 * SIZE], a8

	ble,pt	%icc, .LL12		/* single block: go drain it */
	add	X, 8 * SIZE, X		/* delay slot: step past the block */

/* Prefetch distance ahead of X, expressed in units of SIZE. */
#define PREFETCHSIZE 40

	/* Steady state: add the previous product, square the value that
	   was loaded earlier, then reload that register from the next
	   block.  The cmp below feeds the bg at the bottom of the loop. */
.LL11:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	faddd	c2, t2, c2
	add	I, -1, I
	fsmuld	a2, a2, t2
	ld	[X +  0 * SIZE], a1

	faddd	c3, t3, c3
	cmp	I, 0
	fsmuld	a3, a3, t3
	ld	[X +  1 * SIZE], a2

	faddd	c4, t4, c4
	fsmuld	a4, a4, t4
	ld	[X +  2 * SIZE], a3

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	ld	[X +  3 * SIZE], a4

	faddd	c2, t2, c2
	fsmuld	a6, a6, t2
	ld	[X +  4 * SIZE], a5

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	ld	[X +  5 * SIZE], a6

	faddd	c4, t4, c4
	ld	[X +  6 * SIZE], a7
	fsmuld	a8, a8, t4
	add	X, 8 * SIZE, X

	bg,pt	%icc, .LL11
	ld	[X -  1 * SIZE], a8	/* delay slot: eighth value, X already advanced */

	/* Drain: square the block that is already in a1-a8.  The last
	   four products are left pending in t1-t4. */
.LL12:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	faddd	c2, t2, c2
	fsmuld	a2, a2, t2

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	faddd	c4, t4, c4
	fsmuld	a4, a4, t4

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	faddd	c2, t2, c2
	fsmuld	a6, a6, t2

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	faddd	c4, t4, c4
	fsmuld	a8, a8, t4

	/* Remainder: I = N & 3 elements, one element per iteration. */
.LL15:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19		/* no leftover elements */
	nop

.LL16:
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	fsmuld	a1, a1, t1
	fsmuld	a2, a2, t2
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X		/* delay slot: next packed element */

	/* Fold in the pending products, combine the four partial sums
	   into c1 and take the square root. */
.LL19:
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	faddd	c3, t3, c3
	faddd	c4, t4, c4

	faddd	c1, c2, c1
	faddd	c3, c4, c3
	faddd	c1, c3, c1

	fsqrtd	c1, c1

	/* Narrow the result to single precision unless both
	   NEED_F2CCONV and F_INTERFACE_F2C are defined. */
#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C)
	fdtos	c1, c1
#endif
	/* Shared exit; also the target of the early-out branches above.
	   NOTE(review): the strided exit below fills the same delay slot
	   with clr %o0 instead of clr %g0. */
.LL20:
	return	%i7 + 8
	clr	%g0			/* delay slot */

	/* ---- Strided path: same scheme, X advances by INCX after each
	   element instead of using fixed offsets. ---- */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55		/* fewer than four elements */
	nop

	/* Preload four elements (eight values) into a1-a8. */
	ld	[X +  0 * SIZE], a1
	ld	[X +  1 * SIZE], a2
	add	X, INCX, X
	ld	[X +  0 * SIZE], a3
	ld	[X +  1 * SIZE], a4
	add	X, INCX, X
	ld	[X +  0 * SIZE], a5
	ld	[X +  1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	ld	[X +  0 * SIZE], a7
	cmp	I, 0
	ld	[X +  1 * SIZE], a8

	ble,pt	%icc, .LL52		/* single block: go drain it */
	add	X, INCX, X		/* delay slot: step past the fourth element */

	/* Steady state, as in .LL11 but without the prefetch. */
.LL51:
	faddd	c1, t1, c1
	add	I, -1, I
	fsmuld	a1, a1, t1
	ld	[X +  0 * SIZE], a1

	faddd	c2, t2, c2
	cmp	I, 0
	fsmuld	a2, a2, t2
	ld	[X +  1 * SIZE], a2
	add	X, INCX, X

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	ld	[X +  0 * SIZE], a3

	faddd	c4, t4, c4
	fsmuld	a4, a4, t4
	ld	[X +  1 * SIZE], a4
	add	X, INCX, X

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	ld	[X +  0 * SIZE], a5

	faddd	c2, t2, c2
	fsmuld	a6, a6, t2
	ld	[X +  1 * SIZE], a6
	add	X, INCX, X

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	ld	[X +  0 * SIZE], a7

	faddd	c4, t4, c4
	fsmuld	a8, a8, t4
	ld	[X +  1 * SIZE], a8
	bg,pt	%icc, .LL51
	add	X, INCX, X		/* delay slot: step past the fourth element */

	/* Drain the block held in a1-a8; t1-t4 stay pending. */
.LL52:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	faddd	c2, t2, c2
	fsmuld	a2, a2, t2

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	faddd	c4, t4, c4
	fsmuld	a4, a4, t4

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	faddd	c2, t2, c2
	fsmuld	a6, a6, t2

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	faddd	c4, t4, c4
	fsmuld	a8, a8, t4

	/* Remainder: I = N & 3 elements, one element per iteration. */
.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59		/* no leftover elements */
	nop

.LL56:
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	fsmuld	a1, a1, t1
	fsmuld	a2, a2, t2
	bg,pt	%icc, .LL56
	add	X, INCX, X		/* delay slot: next strided element */

	/* Same final reduction as .LL19. */
.LL59:
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	faddd	c3, t3, c3
	faddd	c4, t4, c4

	faddd	c1, c2, c1
	faddd	c3, c4, c3
	faddd	c1, c3, c1

	fsqrtd	c1, c1

	/* Narrow the result to single precision unless both
	   NEED_F2CCONV and F_INTERFACE_F2C are defined. */
#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C)
	fdtos	c1, c1
#endif

	return	%i7 + 8
	clr	%o0			/* delay slot */

	EPILOGUE
