1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 /*
30 * __vsincosf: single precision vector sincos
31 *
32 * Algorithm:
33 *
34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 * z*C2))), where z = x*x, all evaluated in double precision.
37 *
38 * Accuracy:
39 *
40 * The largest error is less than 0.6 ulps.
41 */
42
43 #include <sys/isa_defs.h>
44
45 #ifdef _LITTLE_ENDIAN
46 #define HI(x) *(1+(int *)&x)
47 #define LO(x) *(unsigned *)&x
48 #else
49 #define HI(x) *(int *)&x
50 #define LO(x) *(1+(unsigned *)&x)
51 #endif
52
53 #ifdef __RESTRICT
54 #define restrict _Restrict
55 #else
56 #define restrict
57 #endif
58
59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60
61 static const double C[] = {
62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */
63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */
64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */
65 1.0,
66 -0.5,
67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */
68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */
69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */
70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */
71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */
72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */
73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */
74 };
75
76 #define S0 C[0]
77 #define S1 C[1]
78 #define S2 C[2]
79 #define one C[3]
80 #define mhalf C[4]
81 #define C0 C[5]
82 #define C1 C[6]
83 #define C2 C[7]
84 #define invpio2 C[8]
85 #define c3two51 C[9]
86 #define pio2_1 C[10]
87 #define pio2_t C[11]
88
89 #define PREPROCESS(N, sindex, cindex, label) \
90 hx = *(int *)x; \
91 ix = hx & 0x7fffffff; \
92 t = *x; \
93 x += stridex; \
94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \
95 if (ix == 0) { \
96 s[sindex] = t; \
97 c[cindex] = one; \
98 goto label; \
99 } \
100 y##N = (double)t; \
101 n##N = 0; \
102 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \
103 y##N = (double)t; \
104 medium = 1; \
105 } else { \
106 if (ix >= 0x7f800000) { /* inf or nan */ \
107 s[sindex] = c[cindex] = t / t; \
108 goto label; \
109 } \
110 z##N = y##N = (double)t; \
111 hx = HI(y##N); \
112 n##N = ((hx >> 20) & 0x7ff) - 1046; \
113 HI(z##N) = (hx & 0xfffff) | 0x41600000; \
114 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \
115 if (hx < 0) { \
116 y##N = -y##N; \
117 n##N = -n##N; \
118 } \
119 z##N = y##N * y##N; \
120 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * \
121 (S1 + z##N * S2))); \
122 g##N = (float)(one + z##N * (mhalf + z##N * (C0 + \
123 z##N * (C1 + z##N * C2)))); \
124 if (n##N & 2) { \
125 f##N = -f##N; \
126 g##N = -g##N; \
127 } \
128 if (n##N & 1) { \
129 s[sindex] = g##N; \
130 c[cindex] = -f##N; \
131 } else { \
132 s[sindex] = f##N; \
133 c[cindex] = g##N; \
134 } \
135 goto label; \
136 }
137
138 #define PROCESS(N) \
139 if (medium) { \
140 z##N = y##N * invpio2 + c3two51; \
141 n##N = LO(z##N); \
142 z##N -= c3two51; \
143 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \
144 } \
145 z##N = y##N * y##N; \
146 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + z##N * S2)));\
147 g##N = (float)(one + z##N * (mhalf + z##N * (C0 + z##N * \
148 (C1 + z##N * C2)))); \
149 if (n##N & 2) { \
150 f##N = -f##N; \
151 g##N = -g##N; \
152 } \
153 if (n##N & 1) { \
154 *s = g##N; \
155 *c = -f##N; \
156 } else { \
157 *s = f##N; \
158 *c = g##N; \
159 } \
160 s += strides; \
161 c += stridec
162
163 void
__vsincosf(int n,float * restrict x,int stridex,float * restrict s,int strides,float * restrict c,int stridec)164 __vsincosf(int n, float *restrict x, int stridex,
165 float *restrict s, int strides, float *restrict c, int stridec)
166 {
167 double y0, y1, y2, y3;
168 double z0, z1, z2, z3;
169 float f0, f1, f2, f3, t;
170 float g0, g1, g2, g3;
171 int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
172
173 s -= strides;
174 c -= stridec;
175
176 for (;;) {
177 begin:
178 s += strides;
179 c += stridec;
180
181 if (--n < 0)
182 break;
183
184 medium = 0;
185 PREPROCESS(0, 0, 0, begin);
186
187 if (--n < 0)
188 goto process1;
189
190 PREPROCESS(1, strides, stridec, process1);
191
192 if (--n < 0)
193 goto process2;
194
195 PREPROCESS(2, (strides << 1), (stridec << 1), process2);
196
197 if (--n < 0)
198 goto process3;
199
200 PREPROCESS(3, (strides << 1) + strides,
201 (stridec << 1) + stridec, process3);
202
203 if (medium) {
204 z0 = y0 * invpio2 + c3two51;
205 z1 = y1 * invpio2 + c3two51;
206 z2 = y2 * invpio2 + c3two51;
207 z3 = y3 * invpio2 + c3two51;
208
209 n0 = LO(z0);
210 n1 = LO(z1);
211 n2 = LO(z2);
212 n3 = LO(z3);
213
214 z0 -= c3two51;
215 z1 -= c3two51;
216 z2 -= c3two51;
217 z3 -= c3two51;
218
219 y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
220 y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
221 y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
222 y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
223 }
224
225 z0 = y0 * y0;
226 z1 = y1 * y1;
227 z2 = y2 * y2;
228 z3 = y3 * y3;
229
230 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
231 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
232 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
233 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
234
235 g0 = (float)(one + z0 * (mhalf + z0 * (C0 + z0 *
236 (C1 + z0 * C2))));
237 g1 = (float)(one + z1 * (mhalf + z1 * (C0 + z1 *
238 (C1 + z1 * C2))));
239 g2 = (float)(one + z2 * (mhalf + z2 * (C0 + z2 *
240 (C1 + z2 * C2))));
241 g3 = (float)(one + z3 * (mhalf + z3 * (C0 + z3 *
242 (C1 + z3 * C2))));
243
244 if (n0 & 2) {
245 f0 = -f0;
246 g0 = -g0;
247 }
248 if (n1 & 2) {
249 f1 = -f1;
250 g1 = -g1;
251 }
252 if (n2 & 2) {
253 f2 = -f2;
254 g2 = -g2;
255 }
256 if (n3 & 2) {
257 f3 = -f3;
258 g3 = -g3;
259 }
260
261 if (n0 & 1) {
262 *s = g0;
263 *c = -f0;
264 } else {
265 *s = f0;
266 *c = g0;
267 }
268 s += strides;
269 c += stridec;
270
271 if (n1 & 1) {
272 *s = g1;
273 *c = -f1;
274 } else {
275 *s = f1;
276 *c = g1;
277 }
278 s += strides;
279 c += stridec;
280
281 if (n2 & 1) {
282 *s = g2;
283 *c = -f2;
284 } else {
285 *s = f2;
286 *c = g2;
287 }
288 s += strides;
289 c += stridec;
290
291 if (n3 & 1) {
292 *s = g3;
293 *c = -f3;
294 } else {
295 *s = f3;
296 *c = g3;
297 }
298 continue;
299
300 process1:
301 PROCESS(0);
302 continue;
303
304 process2:
305 PROCESS(0);
306 PROCESS(1);
307 continue;
308
309 process3:
310 PROCESS(0);
311 PROCESS(1);
312 PROCESS(2);
313 }
314 }
315