1 ////////////////////////////////////////////////////////////////
2 //
3 // this code was taken from http://shibatch.sourceforge.net/
4 // Many thanks to the author of original version: Naoki Shibata
5 //
6 // This version contains modifications made by Ingo Weyrich
7 //
8 ////////////////////////////////////////////////////////////////
9
10 #ifndef _SLEEFC_
11 #define _SLEEFC_
12
13 #include <assert.h>
14 #include <stdint.h>
15 #include <math.h>
16 //#include <bits/nan.h>
17 //#include <bits/inf.h>
18
19 #define PI4_A .7853981554508209228515625
20 #define PI4_B .794662735614792836713604629039764404296875e-8
21 #define PI4_C .306161699786838294306516483068750264552437361480769e-16
22 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
23
24 #define L2U .69314718055966295651160180568695068359375
25 #define L2L .28235290563031577122588448175013436025525412068e-12
26 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
27
doubleToRawLongBits(double d)28 __inline int64_t doubleToRawLongBits(double d) {
29 union {
30 double f;
31 int64_t i;
32 } tmp;
33 tmp.f = d;
34 return tmp.i;
35 }
36
longBitsToDouble(int64_t i)37 __inline double longBitsToDouble(int64_t i) {
38 union {
39 double f;
40 int64_t i;
41 } tmp;
42 tmp.i = i;
43 return tmp.f;
44 }
45
xfabs(double x)46 __inline double xfabs(double x) {
47 return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x));
48 }
49
mulsign(double x,double y)50 __inline double mulsign(double x, double y) {
51 return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63)));
52 }
53
sign(double d)54 __inline double sign(double d) { return mulsign(1, d); }
mla(double x,double y,double z)55 __inline double mla(double x, double y, double z) { return x * y + z; }
xrint(double x)56 __inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
57
xisnan(double x)58 __inline int xisnan(double x) { return x != x; }
xisinf(double x)59 __inline int xisinf(double x) { return x == INFINITY || x == -INFINITY; }
xisminf(double x)60 __inline int xisminf(double x) { return x == -INFINITY; }
xispinf(double x)61 __inline int xispinf(double x) { return x == INFINITY; }
62
ldexpk(double x,int q)63 __inline double ldexpk(double x, int q) {
64 double u;
65 int m;
66 m = q >> 31;
67 m = (((m + q) >> 9) - m) << 7;
68 q = q - (m << 2);
69 u = longBitsToDouble(((int64_t)(m + 0x3ff)) << 52);
70 double u2 = u*u;
71 u2 = u2 * u2;
72 x = x * u2;
73 u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
74 return x * u;
75 }
76
xldexp(double x,int q)77 __inline double xldexp(double x, int q) { return ldexpk(x, q); }
78
ilogbp1(double d)79 __inline int ilogbp1(double d) {
80 int m = d < 4.9090934652977266E-91;
81 d = m ? 2.037035976334486E90 * d : d;
82 int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
83 q = m ? q - (300 + 0x03fe) : q - 0x03fe;
84 return q;
85 }
86
xilogb(double d)87 __inline int xilogb(double d) {
88 int e = ilogbp1(xfabs(d)) - 1;
89 e = d == 0 ? (-2147483647 - 1) : e;
90 e = d == INFINITY || d == -INFINITY ? 2147483647 : e;
91 return e;
92 }
93
upper(double d)94 __inline double upper(double d) {
95 return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
96 }
97
98 typedef struct {
99 double x, y;
100 } double2;
101
102 typedef struct {
103 float x, y;
104 } float2;
105
dd(double h,double l)106 __inline double2 dd(double h, double l) {
107 double2 ret;
108 ret.x = h; ret.y = l;
109 return ret;
110 }
111
normalize_d(double2 t)112 __inline double2 normalize_d(double2 t) {
113 double2 s;
114
115 s.x = t.x + t.y;
116 s.y = t.x - s.x + t.y;
117
118 return s;
119 }
120
scale_d(double2 d,double s)121 __inline double2 scale_d(double2 d, double s) {
122 double2 r;
123
124 r.x = d.x * s;
125 r.y = d.y * s;
126
127 return r;
128 }
129
add2_ss(double x,double y)130 __inline double2 add2_ss(double x, double y) {
131 double2 r;
132
133 r.x = x + y;
134 double v = r.x - x;
135 r.y = (x - (r.x - v)) + (y - v);
136
137 return r;
138 }
139
add_ds(double2 x,double y)140 __inline double2 add_ds(double2 x, double y) {
141 // |x| >= |y|
142
143 double2 r;
144
145 assert(xisnan(x.x) || xisnan(y) || xfabs(x.x) >= xfabs(y));
146
147 r.x = x.x + y;
148 r.y = x.x - r.x + y + x.y;
149
150 return r;
151 }
152
add2_ds(double2 x,double y)153 __inline double2 add2_ds(double2 x, double y) {
154 // |x| >= |y|
155
156 double2 r;
157
158 r.x = x.x + y;
159 double v = r.x - x.x;
160 r.y = (x.x - (r.x - v)) + (y - v);
161 r.y += x.y;
162
163 return r;
164 }
165
add_sd(double x,double2 y)166 __inline double2 add_sd(double x, double2 y) {
167 // |x| >= |y|
168
169 double2 r;
170
171 assert(xisnan(x) || xisnan(y.x) || xfabs(x) >= xfabs(y.x));
172
173 r.x = x + y.x;
174 r.y = x - r.x + y.x + y.y;
175
176 return r;
177 }
178
add_dd(double2 x,double2 y)179 __inline double2 add_dd(double2 x, double2 y) {
180 // |x| >= |y|
181
182 double2 r;
183
184 assert(xisnan(x.x) || xisnan(y.x) || xfabs(x.x) >= xfabs(y.x));
185
186 r.x = x.x + y.x;
187 r.y = x.x - r.x + y.x + x.y + y.y;
188
189 return r;
190 }
191
add2_dd(double2 x,double2 y)192 __inline double2 add2_dd(double2 x, double2 y) {
193 double2 r;
194
195 r.x = x.x + y.x;
196 double v = r.x - x.x;
197 r.y = (x.x - (r.x - v)) + (y.x - v);
198 r.y += x.y + y.y;
199
200 return r;
201 }
202
div_dd(double2 n,double2 d)203 __inline double2 div_dd(double2 n, double2 d) {
204 double t = 1.0 / d.x;
205 double dh = upper(d.x), dl = d.x - dh;
206 double th = upper(t ), tl = t - th;
207 double nhh = upper(n.x), nhl = n.x - nhh;
208
209 double2 q;
210
211 q.x = n.x * t;
212
213 double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
214 q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
215
216 q.y = t * (n.y - q.x * d.y) + u;
217
218 return q;
219 }
220
mul_ss(double x,double y)221 __inline double2 mul_ss(double x, double y) {
222 double xh = upper(x), xl = x - xh;
223 double yh = upper(y), yl = y - yh;
224 double2 r;
225
226 r.x = x * y;
227 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
228
229 return r;
230 }
231
mul_ds(double2 x,double y)232 __inline double2 mul_ds(double2 x, double y) {
233 double xh = upper(x.x), xl = x.x - xh;
234 double yh = upper(y ), yl = y - yh;
235 double2 r;
236
237 r.x = x.x * y;
238 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
239
240 return r;
241 }
242
mul_dd(double2 x,double2 y)243 __inline double2 mul_dd(double2 x, double2 y) {
244 double xh = upper(x.x), xl = x.x - xh;
245 double yh = upper(y.x), yl = y.x - yh;
246 double2 r;
247
248 r.x = x.x * y.x;
249 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
250
251 return r;
252 }
253
squ_d(double2 x)254 __inline double2 squ_d(double2 x) {
255 double xh = upper(x.x), xl = x.x - xh;
256 double2 r;
257
258 r.x = x.x * x.x;
259 r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
260
261 return r;
262 }
263
rec_s(double d)264 __inline double2 rec_s(double d) {
265 double t = 1.0 / d;
266 double dh = upper(d), dl = d - dh;
267 double th = upper(t), tl = t - th;
268 double2 q;
269
270 q.x = t;
271 q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
272
273 return q;
274 }
275
sqrt_d(double2 d)276 __inline double2 sqrt_d(double2 d) {
277 double t = sqrt(d.x + d.y);
278 return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), 0.5);
279 }
280
atan2k(double y,double x)281 __inline double atan2k(double y, double x) {
282 double s, t, u;
283 int q = 0;
284
285 if (x < 0) { x = -x; q = -2; }
286 if (y > x) { t = x; x = y; y = -t; q += 1; }
287
288 s = y / x;
289 t = s * s;
290
291 u = -1.88796008463073496563746e-05;
292 u = u * t + (0.000209850076645816976906797);
293 u = u * t + (-0.00110611831486672482563471);
294 u = u * t + (0.00370026744188713119232403);
295 u = u * t + (-0.00889896195887655491740809);
296 u = u * t + (0.016599329773529201970117);
297 u = u * t + (-0.0254517624932312641616861);
298 u = u * t + (0.0337852580001353069993897);
299 u = u * t + (-0.0407629191276836500001934);
300 u = u * t + (0.0466667150077840625632675);
301 u = u * t + (-0.0523674852303482457616113);
302 u = u * t + (0.0587666392926673580854313);
303 u = u * t + (-0.0666573579361080525984562);
304 u = u * t + (0.0769219538311769618355029);
305 u = u * t + (-0.090908995008245008229153);
306 u = u * t + (0.111111105648261418443745);
307 u = u * t + (-0.14285714266771329383765);
308 u = u * t + (0.199999999996591265594148);
309 u = u * t + (-0.333333333333311110369124);
310
311 t = u * t * s + s;
312 t = q * (M_PI/2) + t;
313
314 return t;
315 }
316
xatan2(double y,double x)317 __inline double xatan2(double y, double x) {
318 double r = atan2k(xfabs(y), x);
319
320 r = mulsign(r, x);
321 if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0);
322 if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
323 if ( y == 0) r = (sign(x) == -1 ? M_PI : 0);
324
325 return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y);
326 }
327
xasin(double d)328 __inline double xasin(double d) {
329 return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d);
330 }
331
xacos(double d)332 __inline double xacos(double d) {
333 return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? M_PI : 0);
334 }
335
xatan(double s)336 __inline double xatan(double s) {
337 double t, u;
338 int q = 0;
339
340 if (s < 0) { s = -s; q = 2; }
341 if (s > 1) { s = 1.0 / s; q |= 1; }
342
343 t = s * s;
344
345 u = -1.88796008463073496563746e-05;
346 u = u * t + (0.000209850076645816976906797);
347 u = u * t + (-0.00110611831486672482563471);
348 u = u * t + (0.00370026744188713119232403);
349 u = u * t + (-0.00889896195887655491740809);
350 u = u * t + (0.016599329773529201970117);
351 u = u * t + (-0.0254517624932312641616861);
352 u = u * t + (0.0337852580001353069993897);
353 u = u * t + (-0.0407629191276836500001934);
354 u = u * t + (0.0466667150077840625632675);
355 u = u * t + (-0.0523674852303482457616113);
356 u = u * t + (0.0587666392926673580854313);
357 u = u * t + (-0.0666573579361080525984562);
358 u = u * t + (0.0769219538311769618355029);
359 u = u * t + (-0.090908995008245008229153);
360 u = u * t + (0.111111105648261418443745);
361 u = u * t + (-0.14285714266771329383765);
362 u = u * t + (0.199999999996591265594148);
363 u = u * t + (-0.333333333333311110369124);
364
365 t = s + s * (t * u);
366
367 if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
368 if ((q & 2) != 0) t = -t;
369
370 return t;
371 }
372
xsin(double d)373 __inline double xsin(double d) {
374 int q;
375 double u, s;
376
377 q = (int)xrint(d * M_1_PI);
378
379 d = mla(q, -PI4_A*4, d);
380 d = mla(q, -PI4_B*4, d);
381 d = mla(q, -PI4_C*4, d);
382
383 s = d * d;
384
385 if ((q & 1) != 0) d = -d;
386
387 u = -7.97255955009037868891952e-18;
388 u = mla(u, s, 2.81009972710863200091251e-15);
389 u = mla(u, s, -7.64712219118158833288484e-13);
390 u = mla(u, s, 1.60590430605664501629054e-10);
391 u = mla(u, s, -2.50521083763502045810755e-08);
392 u = mla(u, s, 2.75573192239198747630416e-06);
393 u = mla(u, s, -0.000198412698412696162806809);
394 u = mla(u, s, 0.00833333333333332974823815);
395 u = mla(u, s, -0.166666666666666657414808);
396
397 u = mla(s, u * d, d);
398
399 return u;
400 }
401
xcos(double d)402 __inline double xcos(double d) {
403 int q;
404 double u, s;
405
406 q = 1 + 2*(int)xrint(d * M_1_PI - 0.5);
407
408 d = mla(q, -PI4_A*2, d);
409 d = mla(q, -PI4_B*2, d);
410 d = mla(q, -PI4_C*2, d);
411
412 s = d * d;
413
414 if ((q & 2) == 0) d = -d;
415
416 u = -7.97255955009037868891952e-18;
417 u = mla(u, s, 2.81009972710863200091251e-15);
418 u = mla(u, s, -7.64712219118158833288484e-13);
419 u = mla(u, s, 1.60590430605664501629054e-10);
420 u = mla(u, s, -2.50521083763502045810755e-08);
421 u = mla(u, s, 2.75573192239198747630416e-06);
422 u = mla(u, s, -0.000198412698412696162806809);
423 u = mla(u, s, 0.00833333333333332974823815);
424 u = mla(u, s, -0.166666666666666657414808);
425
426 u = mla(s, u * d, d);
427
428 return u;
429 }
430
xsincos(double d)431 __inline double2 xsincos(double d) {
432 int q;
433 double u, s, t;
434 double2 r;
435
436 q = (int)xrint(d * (2 * M_1_PI));
437
438 s = d;
439
440 s = mla(-q, PI4_A*2, s);
441 s = mla(-q, PI4_B*2, s);
442 s = mla(-q, PI4_C*2, s);
443
444 t = s;
445
446 s = s * s;
447
448 u = 1.58938307283228937328511e-10;
449 u = mla(u, s, -2.50506943502539773349318e-08);
450 u = mla(u, s, 2.75573131776846360512547e-06);
451 u = mla(u, s, -0.000198412698278911770864914);
452 u = mla(u, s, 0.0083333333333191845961746);
453 u = mla(u, s, -0.166666666666666130709393);
454 u = u * s * t;
455
456 r.x = t + u;
457
458 u = -1.13615350239097429531523e-11;
459 u = mla(u, s, 2.08757471207040055479366e-09);
460 u = mla(u, s, -2.75573144028847567498567e-07);
461 u = mla(u, s, 2.48015872890001867311915e-05);
462 u = mla(u, s, -0.00138888888888714019282329);
463 u = mla(u, s, 0.0416666666666665519592062);
464 u = mla(u, s, -0.5);
465
466 r.y = u * s + 1;
467
468 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
469 if ((q & 2) != 0) { r.x = -r.x; }
470 if (((q+1) & 2) != 0) { r.y = -r.y; }
471
472 if (xisinf(d)) { r.x = r.y = NAN; }
473
474 return r;
475 }
476
xtan(double d)477 __inline double xtan(double d) {
478 int q;
479 double u, s, x;
480
481 q = (int)xrint(d * (2 * M_1_PI));
482
483 x = mla(q, -PI4_A*2, d);
484 x = mla(q, -PI4_B*2, x);
485 x = mla(q, -PI4_C*2, x);
486
487 s = x * x;
488
489 if ((q & 1) != 0) x = -x;
490
491 u = 1.01419718511083373224408e-05;
492 u = mla(u, s, -2.59519791585924697698614e-05);
493 u = mla(u, s, 5.23388081915899855325186e-05);
494 u = mla(u, s, -3.05033014433946488225616e-05);
495 u = mla(u, s, 7.14707504084242744267497e-05);
496 u = mla(u, s, 8.09674518280159187045078e-05);
497 u = mla(u, s, 0.000244884931879331847054404);
498 u = mla(u, s, 0.000588505168743587154904506);
499 u = mla(u, s, 0.00145612788922812427978848);
500 u = mla(u, s, 0.00359208743836906619142924);
501 u = mla(u, s, 0.00886323944362401618113356);
502 u = mla(u, s, 0.0218694882853846389592078);
503 u = mla(u, s, 0.0539682539781298417636002);
504 u = mla(u, s, 0.133333333333125941821962);
505 u = mla(u, s, 0.333333333333334980164153);
506
507 u = mla(s, u * x, x);
508
509 if ((q & 1) != 0) u = 1.0 / u;
510
511 if (xisinf(d)) u = NAN;
512
513 return u;
514 }
515
xlog(double d)516 __inline double xlog(double d) {
517 double x, x2, t, m;
518 int e;
519
520 e = ilogbp1(d * 0.7071);
521 m = ldexpk(d, -e);
522
523 x = (m-1) / (m+1);
524 x2 = x * x;
525
526 t = 0.148197055177935105296783;
527 t = mla(t, x2, 0.153108178020442575739679);
528 t = mla(t, x2, 0.181837339521549679055568);
529 t = mla(t, x2, 0.22222194152736701733275);
530 t = mla(t, x2, 0.285714288030134544449368);
531 t = mla(t, x2, 0.399999999989941956712869);
532 t = mla(t, x2, 0.666666666666685503450651);
533 t = mla(t, x2, 2);
534
535 x = x * t + 0.693147180559945286226764 * e;
536
537 if (xisinf(d)) x = INFINITY;
538 if (d < 0) x = NAN;
539 if (d == 0) x = -INFINITY;
540
541 return x;
542 }
543
xexp(double d)544 __inline double xexp(double d) {
545 int q = (int)xrint(d * R_LN2);
546 double s, u;
547
548 s = mla(q, -L2U, d);
549 s = mla(q, -L2L, s);
550
551 u = 2.08860621107283687536341e-09;
552 u = mla(u, s, 2.51112930892876518610661e-08);
553 u = mla(u, s, 2.75573911234900471893338e-07);
554 u = mla(u, s, 2.75572362911928827629423e-06);
555 u = mla(u, s, 2.4801587159235472998791e-05);
556 u = mla(u, s, 0.000198412698960509205564975);
557 u = mla(u, s, 0.00138888888889774492207962);
558 u = mla(u, s, 0.00833333333331652721664984);
559 u = mla(u, s, 0.0416666666666665047591422);
560 u = mla(u, s, 0.166666666666666851703837);
561 u = mla(u, s, 0.5);
562
563 u = s * s * u + s + 1;
564 u = ldexpk(u, q);
565
566 if (xisminf(d)) u = 0;
567
568 return u;
569 }
570
logk(double d)571 __inline double2 logk(double d) {
572 double2 x, x2;
573 double m, t;
574 int e;
575
576 e = ilogbp1(d * 0.7071);
577 m = ldexpk(d, -e);
578
579 x = div_dd(add2_ss(-1, m), add2_ss(1, m));
580 x2 = squ_d(x);
581
582 t = 0.134601987501262130076155;
583 t = mla(t, x2.x, 0.132248509032032670243288);
584 t = mla(t, x2.x, 0.153883458318096079652524);
585 t = mla(t, x2.x, 0.181817427573705403298686);
586 t = mla(t, x2.x, 0.222222231326187414840781);
587 t = mla(t, x2.x, 0.285714285651261412873718);
588 t = mla(t, x2.x, 0.400000000000222439910458);
589 t = mla(t, x2.x, 0.666666666666666371239645);
590
591 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
592 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
593 }
594
expk(double2 d)595 __inline double expk(double2 d) {
596 int q = (int)rint((d.x + d.y) * R_LN2);
597 double2 s, t;
598 double u;
599
600 s = add2_ds(d, q * -L2U);
601 s = add2_ds(s, q * -L2L);
602
603 s = normalize_d(s);
604
605 u = 2.51069683420950419527139e-08;
606 u = mla(u, s.x, 2.76286166770270649116855e-07);
607 u = mla(u, s.x, 2.75572496725023574143864e-06);
608 u = mla(u, s.x, 2.48014973989819794114153e-05);
609 u = mla(u, s.x, 0.000198412698809069797676111);
610 u = mla(u, s.x, 0.0013888888939977128960529);
611 u = mla(u, s.x, 0.00833333333332371417601081);
612 u = mla(u, s.x, 0.0416666666665409524128449);
613 u = mla(u, s.x, 0.166666666666666740681535);
614 u = mla(u, s.x, 0.500000000000000999200722);
615
616 t = add_dd(s, mul_ds(squ_d(s), u));
617
618 t = add_sd(1, t);
619 return ldexpk(t.x + t.y, q);
620 }
621
xpow(double x,double y)622 __inline double xpow(double x, double y) {
623 int yisint = (int)y == y;
624 int yisodd = (1 & (int)y) != 0 && yisint;
625
626 double result = expk(mul_ds(logk(xfabs(x)), y));
627
628 result = xisnan(result) ? INFINITY : result;
629 result *= (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1)));
630
631 double efx = mulsign(xfabs(x) - 1, y);
632 if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY);
633 if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITY);
634 if (xisnan(x) || xisnan(y)) result = NAN;
635 if (y == 0 || x == 1) result = 1;
636
637 return result;
638 }
639
expk2(double2 d)640 __inline double2 expk2(double2 d) {
641 int q = (int)rint((d.x + d.y) * R_LN2);
642 double2 s, t;
643 double u;
644
645 s = add2_ds(d, q * -L2U);
646 s = add2_ds(s, q * -L2L);
647
648 s = normalize_d(s);
649
650 u = 2.51069683420950419527139e-08;
651 u = mla(u, s.x, 2.76286166770270649116855e-07);
652 u = mla(u, s.x, 2.75572496725023574143864e-06);
653 u = mla(u, s.x, 2.48014973989819794114153e-05);
654 u = mla(u, s.x, 0.000198412698809069797676111);
655 u = mla(u, s.x, 0.0013888888939977128960529);
656 u = mla(u, s.x, 0.00833333333332371417601081);
657 u = mla(u, s.x, 0.0416666666665409524128449);
658 u = mla(u, s.x, 0.166666666666666740681535);
659 u = mla(u, s.x, 0.500000000000000999200722);
660
661 t = add_dd(s, mul_ds(squ_d(s), u));
662
663 t = add_sd(1, t);
664 return dd(ldexpk(t.x, q), ldexpk(t.y, q));
665 }
666
xsinh(double x)667 __inline double xsinh(double x) {
668 double y = xfabs(x);
669 double2 d = expk2(dd(y, 0));
670 d = add2_dd(d, div_dd(dd(-1, 0), d));
671 y = (d.x + d.y) * 0.5;
672
673 y = xisinf(x) || xisnan(y) ? INFINITY : y;
674 y = mulsign(y, x);
675 y = xisnan(x) ? NAN : y;
676
677 return y;
678 }
679
xcosh(double x)680 __inline double xcosh(double x) {
681 double2 d = expk2(dd(x, 0));
682 d = add2_dd(d, div_dd(dd(1, 0), d));
683 double y = (d.x + d.y) * 0.5;
684
685 y = xisinf(x) || xisnan(y) ? INFINITY : y;
686 y = xisnan(x) ? NAN : y;
687
688 return y;
689 }
690
xtanh(double x)691 __inline double xtanh(double x) {
692 double y = xfabs(x);
693 double2 d = expk2(dd(y, 0));
694 double2 e = div_dd(dd(1, 0), d);
695 d = div_dd(add2_dd(d, scale_d(e, -1)), add2_dd(d, e));
696 y = d.x + d.y;
697
698 y = xisinf(x) || xisnan(y) ? 1.0 : y;
699 y = mulsign(y, x);
700 y = xisnan(x) ? NAN : y;
701
702 return y;
703 }
704
logk2(double2 d)705 __inline double2 logk2(double2 d) {
706 double2 x, x2, m;
707 double t;
708 int e;
709
710 d = normalize_d(d);
711 e = ilogbp1(d.x * 0.7071);
712 m = scale_d(d, ldexpk(1, -e));
713
714 x = div_dd(add2_ds(m, -1), add2_ds(m, 1));
715 x2 = squ_d(x);
716
717 t = 0.134601987501262130076155;
718 t = mla(t, x2.x, 0.132248509032032670243288);
719 t = mla(t, x2.x, 0.153883458318096079652524);
720 t = mla(t, x2.x, 0.181817427573705403298686);
721 t = mla(t, x2.x, 0.222222231326187414840781);
722 t = mla(t, x2.x, 0.285714285651261412873718);
723 t = mla(t, x2.x, 0.400000000000222439910458);
724 t = mla(t, x2.x, 0.666666666666666371239645);
725
726 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
727 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
728 }
729
xasinh(double x)730 __inline double xasinh(double x) {
731 double y = xfabs(x);
732 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y), 1)), y));
733 y = d.x + d.y;
734
735 y = xisinf(x) || xisnan(y) ? INFINITY : y;
736 y = mulsign(y, x);
737 y = xisnan(x) ? NAN : y;
738
739 return y;
740 }
741
xacosh(double x)742 __inline double xacosh(double x) {
743 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), -1)), x));
744 double y = d.x + d.y;
745
746 y = xisinf(x) || xisnan(y) ? INFINITY : y;
747 y = x == 1.0 ? 0.0 : y;
748 y = x < 1.0 ? NAN : y;
749 y = xisnan(x) ? NAN : y;
750
751 return y;
752 }
753
xatanh(double x)754 __inline double xatanh(double x) {
755 double y = xfabs(x);
756 double2 d = logk2(div_dd(add2_ss(1, y), add2_ss(1, -y)));
757 y = y > 1.0 ? NAN : (y == 1.0 ? INFINITY : (d.x + d.y) * 0.5);
758
759 y = xisinf(x) || xisnan(y) ? NAN : y;
760 y = mulsign(y, x);
761 y = xisnan(x) ? NAN : y;
762
763 return y;
764 }
765
766 //
767
xfma(double x,double y,double z)768 __inline double xfma(double x, double y, double z) {
769 union {
770 double f;
771 long long int i;
772 } tmp;
773
774 tmp.f = x;
775 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
776 double xh = tmp.f, xl = x - xh;
777
778 tmp.f = y;
779 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
780 double yh = tmp.f, yl = y - yh;
781
782 double h = x * y;
783 double l = xh * yh - h + xl * yh + xh * yl + xl * yl;
784
785 double h2, l2, v;
786
787 h2 = h + z;
788 v = h2 - h;
789 l2 = (h - (h2 - v)) + (z - v) + l;
790
791 return h2 + l2;
792 }
793
xsqrt(double d)794 __inline double xsqrt(double d) { // max error : 0.5 ulp
795 double q = 1;
796
797 if (d < 8.636168555094445E-78) {
798 d *= 1.157920892373162E77;
799 q = 2.9387358770557188E-39;
800 }
801
802 // http://en.wikipedia.org/wiki/Fast_inverse_square_root
803 double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));
804
805 x = x * (1.5 - 0.5 * d * x * x);
806 x = x * (1.5 - 0.5 * d * x * x);
807 x = x * (1.5 - 0.5 * d * x * x);
808
809 // You can change xfma to fma if fma is correctly implemented
810 x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x;
811
812 return d == INFINITY ? INFINITY : x * q;
813 }
814
xcbrt(double d)815 __inline double xcbrt(double d) { // max error : 2 ulps
816 double x, y, q = 1.0;
817 int e, r;
818
819 e = ilogbp1(d);
820 d = ldexpk(d, -e);
821 r = (e + 6144) % 3;
822 q = (r == 1) ? 1.2599210498948731647672106 : q;
823 q = (r == 2) ? 1.5874010519681994747517056 : q;
824 q = ldexpk(q, (e + 6144) / 3 - 2048);
825
826 q = mulsign(q, d);
827 d = xfabs(d);
828
829 x = -0.640245898480692909870982;
830 x = x * d + 2.96155103020039511818595;
831 x = x * d + -5.73353060922947843636166;
832 x = x * d + 6.03990368989458747961407;
833 x = x * d + -3.85841935510444988821632;
834 x = x * d + 2.2307275302496609725722;
835
836 y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
837 y = d * x * x;
838 y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
839
840 return y;
841 }
842
xexp2(double a)843 __inline double xexp2(double a) {
844 double u = expk(mul_ds(dd(0.69314718055994528623, 2.3190468138462995584e-17), a));
845 if (xispinf(a)) u = INFINITY;
846 if (xisminf(a)) u = 0;
847 return u;
848 }
849
xexp10(double a)850 __inline double xexp10(double a) {
851 double u = expk(mul_ds(dd(2.3025850929940459011, -2.1707562233822493508e-16), a));
852 if (xispinf(a)) u = INFINITY;
853 if (xisminf(a)) u = 0;
854 return u;
855 }
856
xexpm1(double a)857 __inline double xexpm1(double a) {
858 double2 d = add2_ds(expk2(dd(a, 0)), -1.0);
859 double x = d.x + d.y;
860 if (xispinf(a)) x = INFINITY;
861 if (xisminf(a)) x = -1;
862 return x;
863 }
864
xlog10(double a)865 __inline double xlog10(double a) {
866 double2 d = mul_dd(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17));
867 double x = d.x + d.y;
868
869 if (xisinf(a)) x = INFINITY;
870 if (a < 0) x = NAN;
871 if (a == 0) x = -INFINITY;
872
873 return x;
874 }
875
xlog1p(double a)876 __inline double xlog1p(double a) {
877 double2 d = logk2(add2_ss(a, 1));
878 double x = d.x + d.y;
879
880 if (xisinf(a)) x = INFINITY;
881 if (a < -1) x = NAN;
882 if (a == -1) x = -INFINITY;
883
884 return x;
885 }
886
887 ///////////////////////////////////////////
888
889 #define PI4_Af 0.78515625f
890 #define PI4_Bf 0.00024127960205078125f
891 #define PI4_Cf 6.3329935073852539062e-07f
892 #define PI4_Df 4.9604681473525147339e-10f
893
894 #define L2Uf 0.693145751953125f
895 #define L2Lf 1.428606765330187045e-06f
896
897 #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
898 #define M_PIf ((float)M_PI)
899 #define M_PIf_2 ((float)M_PI_2)
900
901 #define INFINITYf ((float)INFINITY)
902 #define NANf ((float)NAN)
903
floatToRawIntBits(float d)904 __inline int32_t floatToRawIntBits(float d) {
905 union {
906 float f;
907 int32_t i;
908 } tmp;
909 tmp.f = d;
910 return tmp.i;
911 }
912
intBitsToFloat(int32_t i)913 __inline float intBitsToFloat(int32_t i) {
914 union {
915 float f;
916 int32_t i;
917 } tmp;
918 tmp.i = i;
919 return tmp.f;
920 }
921
xfabsf(float x)922 __inline float xfabsf(float x) {
923 return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));
924 }
925
mulsignf(float x,float y)926 __inline float mulsignf(float x, float y) {
927 return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));
928 }
929
signf(float d)930 __inline float signf(float d) { return mulsignf(1, d); }
mlaf(float x,float y,float z)931 __inline float mlaf(float x, float y, float z) { return x * y + z; }
xrintf(float x)932 __inline float xrintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); }
933
xisnanf(float x)934 __inline int xisnanf(float x) { return x != x; }
xisinff(float x)935 __inline int xisinff(float x) { return x == INFINITYf || x == -INFINITYf; }
xisminff(float x)936 __inline int xisminff(float x) { return x == -INFINITYf; }
xispinff(float x)937 __inline int xispinff(float x) { return x == INFINITYf; }
938
ilogbp1f(float d)939 __inline int ilogbp1f(float d) {
940 int m = d < 5.421010862427522E-20f;
941 d = m ? 1.8446744073709552E19f * d : d;
942 int q = (floatToRawIntBits(d) >> 23) & 0xff;
943 q = m ? q - (64 + 0x7e) : q - 0x7e;
944 return q;
945 }
946
ldexpkf(float x,int q)947 __inline float ldexpkf(float x, int q) {
948 float u;
949 int m;
950 m = q >> 31;
951 m = (((m + q) >> 6) - m) << 4;
952 q = q - (m << 2);
953 u = intBitsToFloat(((int32_t)(m + 0x7f)) << 23);
954 u = u * u;
955 x = x * u * u;
956 u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
957 return x * u;
958 }
959
xcbrtf(float d)960 __inline float xcbrtf(float d) { // max error : 2 ulps
961 float x, y, q = 1.0f;
962 int e, r;
963
964 e = ilogbp1f(d);
965 d = ldexpkf(d, -e);
966 r = (e + 6144) % 3;
967 q = (r == 1) ? 1.2599210498948731647672106f : q;
968 q = (r == 2) ? 1.5874010519681994747517056f : q;
969 q = ldexpkf(q, (e + 6144) / 3 - 2048);
970
971 q = mulsignf(q, d);
972 d = xfabsf(d);
973
974 x = -0.601564466953277587890625f;
975 x = mlaf(x, d, 2.8208892345428466796875f);
976 x = mlaf(x, d, -5.532182216644287109375f);
977 x = mlaf(x, d, 5.898262500762939453125f);
978 x = mlaf(x, d, -3.8095417022705078125f);
979 x = mlaf(x, d, 2.2241256237030029296875f);
980
981 y = d * x * x;
982 y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;
983
984 return y;
985 }
986
xsinf(float d)987 __inline float xsinf(float d) {
988 int q;
989 float u, s;
990
991 q = (int)xrintf(d * (float)M_1_PI);
992
993 d = mlaf(q, -PI4_Af*4, d);
994 d = mlaf(q, -PI4_Bf*4, d);
995 d = mlaf(q, -PI4_Cf*4, d);
996 d = mlaf(q, -PI4_Df*4, d);
997
998 s = d * d;
999
1000 if ((q & 1) != 0) d = -d;
1001
1002 u = 2.6083159809786593541503e-06f;
1003 u = mlaf(u, s, -0.0001981069071916863322258f);
1004 u = mlaf(u, s, 0.00833307858556509017944336f);
1005 u = mlaf(u, s, -0.166666597127914428710938f);
1006
1007 u = mlaf(s, u * d, d);
1008
1009 return u;
1010 }
1011
xcosf(float d)1012 __inline float xcosf(float d) {
1013 int q;
1014 float u, s;
1015
1016 q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f);
1017
1018 d = mlaf(q, -PI4_Af*2, d);
1019 d = mlaf(q, -PI4_Bf*2, d);
1020 d = mlaf(q, -PI4_Cf*2, d);
1021 d = mlaf(q, -PI4_Df*2, d);
1022
1023 s = d * d;
1024
1025 if ((q & 2) == 0) d = -d;
1026
1027 u = 2.6083159809786593541503e-06f;
1028 u = mlaf(u, s, -0.0001981069071916863322258f);
1029 u = mlaf(u, s, 0.00833307858556509017944336f);
1030 u = mlaf(u, s, -0.166666597127914428710938f);
1031
1032 u = mlaf(s, u * d, d);
1033
1034 return u;
1035 }
1036
xsincosf(float d)1037 __inline float2 xsincosf(float d) {
1038 int q;
1039 float u, s, t;
1040 float2 r;
1041
1042 q = (int)rint(d * ((float)(2 * M_1_PI)));
1043
1044 s = d;
1045
1046 s = mlaf(q, -PI4_Af*2, s);
1047 s = mlaf(q, -PI4_Bf*2, s);
1048 s = mlaf(q, -PI4_Cf*2, s);
1049 s = mlaf(q, -PI4_Df*2, s);
1050
1051 t = s;
1052
1053 s = s * s;
1054
1055 u = -0.000195169282960705459117889f;
1056 u = mlaf(u, s, 0.00833215750753879547119141f);
1057 u = mlaf(u, s, -0.166666537523269653320312f);
1058 u = u * s * t;
1059
1060 r.x = t + u;
1061
1062 u = -2.71811842367242206819355e-07f;
1063 u = mlaf(u, s, 2.47990446951007470488548e-05f);
1064 u = mlaf(u, s, -0.00138888787478208541870117f);
1065 u = mlaf(u, s, 0.0416666641831398010253906f);
1066 u = mlaf(u, s, -0.5f);
1067
1068 r.y = u * s + 1;
1069
1070 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
1071 if ((q & 2) != 0) { r.x = -r.x; }
1072 if (((q+1) & 2) != 0) { r.y = -r.y; }
1073
1074 if (xisinff(d)) { r.x = r.y = NANf; }
1075
1076 return r;
1077 }
1078
xtanf(float d)1079 __inline float xtanf(float d) {
1080 int q;
1081 float u, s, x;
1082
1083 q = (int)xrintf(d * (float)(2 * M_1_PI));
1084
1085 x = d;
1086
1087 x = mlaf(q, -PI4_Af*2, x);
1088 x = mlaf(q, -PI4_Bf*2, x);
1089 x = mlaf(q, -PI4_Cf*2, x);
1090 x = mlaf(q, -PI4_Df*2, x);
1091
1092 s = x * x;
1093
1094 if ((q & 1) != 0) x = -x;
1095
1096 u = 0.00927245803177356719970703f;
1097 u = mlaf(u, s, 0.00331984995864331722259521f);
1098 u = mlaf(u, s, 0.0242998078465461730957031f);
1099 u = mlaf(u, s, 0.0534495301544666290283203f);
1100 u = mlaf(u, s, 0.133383005857467651367188f);
1101 u = mlaf(u, s, 0.333331853151321411132812f);
1102
1103 u = mlaf(s, u * x, x);
1104
1105 if ((q & 1) != 0) u = 1.0f / u;
1106
1107 if (xisinff(d)) u = NANf;
1108
1109 return u;
1110 }
1111
xatanf(float s)1112 __inline float xatanf(float s) {
1113 float t, u;
1114 int q = 0;
1115
1116 if (s < 0) { s = -s; q = 2; }
1117 if (s > 1) { s = 1.0f / s; q |= 1; }
1118
1119 t = s * s;
1120
1121 u = 0.00282363896258175373077393f;
1122 u = mlaf(u, t, -0.0159569028764963150024414f);
1123 u = mlaf(u, t, 0.0425049886107444763183594f);
1124 u = mlaf(u, t, -0.0748900920152664184570312f);
1125 u = mlaf(u, t, 0.106347933411598205566406f);
1126 u = mlaf(u, t, -0.142027363181114196777344f);
1127 u = mlaf(u, t, 0.199926957488059997558594f);
1128 u = mlaf(u, t, -0.333331018686294555664062f);
1129
1130 t = s + s * (t * u);
1131
1132 if ((q & 1) != 0) t = 1.570796326794896557998982f - t;
1133 if ((q & 2) != 0) t = -t;
1134
1135 return t;
1136 }
1137
atan2kf(float y,float x)1138 __inline float atan2kf(float y, float x) {
1139 float s, t, u;
1140 float q = 0.f;
1141
1142 if (x < 0) { x = -x; q = -2.f; }
1143 if (y > x) { t = x; x = y; y = -t; q += 1.f; }
1144
1145 s = y / x;
1146 t = s * s;
1147
1148 u = 0.00282363896258175373077393f;
1149 u = mlaf(u, t, -0.0159569028764963150024414f);
1150 u = mlaf(u, t, 0.0425049886107444763183594f);
1151 u = mlaf(u, t, -0.0748900920152664184570312f);
1152 u = mlaf(u, t, 0.106347933411598205566406f);
1153 u = mlaf(u, t, -0.142027363181114196777344f);
1154 u = mlaf(u, t, 0.199926957488059997558594f);
1155 u = mlaf(u, t, -0.333331018686294555664062f);
1156
1157 t = u * t;
1158 t = mlaf(t,s,s);
1159 return mlaf(q,(float)(M_PIf_2),t);
1160 }
1161
xatan2f(float y,float x)1162 __inline float xatan2f(float y, float x) {
1163 float r = atan2kf(xfabsf(y), x);
1164
1165 r = mulsignf(r, x);
1166 if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PIf*.5f)) : 0);
1167 if (xisinff(y) ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PIf*.25f)) : 0);
1168 if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0);
1169
1170 return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y);
1171 }
1172
xasinf(float d)1173 __inline float xasinf(float d) {
1174 return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d);
1175 }
1176
xacosf(float d)1177 __inline float xacosf(float d) {
1178 return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? (float)M_PI : 0.0f);
1179 }
1180
xlogf(float d)1181 __inline float xlogf(float d) {
1182 float x, x2, t, m;
1183 int e;
1184
1185 e = ilogbp1f(d * 0.7071f);
1186 m = ldexpkf(d, -e);
1187
1188 x = (m-1.0f) / (m+1.0f);
1189 x2 = x * x;
1190
1191 t = 0.2371599674224853515625f;
1192 t = mlaf(t, x2, 0.285279005765914916992188f);
1193 t = mlaf(t, x2, 0.400005519390106201171875f);
1194 t = mlaf(t, x2, 0.666666567325592041015625f);
1195 t = mlaf(t, x2, 2.0f);
1196
1197 x = x * t + 0.693147180559945286226764f * e;
1198
1199 if (xisinff(d)) x = INFINITYf;
1200 if (d < 0) x = NANf;
1201 if (d == 0) x = -INFINITYf;
1202
1203 return x;
1204 }
1205
xexpf(float d)1206 __inline float xexpf(float d) {
1207 if(d<=-104.0f) return 0.0f;
1208
1209 int q = (int)xrintf(d * R_LN2f);
1210 float s, u;
1211
1212 s = mlaf(q, -L2Uf, d);
1213 s = mlaf(q, -L2Lf, s);
1214
1215 u = 0.00136324646882712841033936f;
1216 u = mlaf(u, s, 0.00836596917361021041870117f);
1217 u = mlaf(u, s, 0.0416710823774337768554688f);
1218 u = mlaf(u, s, 0.166665524244308471679688f);
1219 u = mlaf(u, s, 0.499999850988388061523438f);
1220
1221 u = mlaf( s, mlaf(s,u,1.f),1.f);
1222 return ldexpkf(u, q);
1223
1224 }
1225
xmul2f(float d)1226 __inline float xmul2f(float d) {
1227 union {
1228 float floatval;
1229 int intval;
1230 } uflint;
1231 uflint.floatval = d;
1232 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1233 uflint.intval += 1 << 23; // add 1 to the exponent
1234 }
1235 return uflint.floatval;
1236 }
1237
xdiv2f(float d)1238 __inline float xdiv2f(float d) {
1239 union {
1240 float floatval;
1241 int intval;
1242 } uflint;
1243 uflint.floatval = d;
1244 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1245 uflint.intval -= 1 << 23; // sub 1 from the exponent
1246 }
1247 return uflint.floatval;
1248 }
1249
xdivf(float d,int n)1250 __inline float xdivf( float d, int n){
1251 union {
1252 float floatval;
1253 int intval;
1254 } uflint;
1255 uflint.floatval = d;
1256 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1257 uflint.intval -= n << 23; // add n to the exponent
1258 }
1259 return uflint.floatval;
1260 }
1261
1262
1263
1264 #endif
1265