1 ////////////////////////////////////////////////////////////////
2 //
3 // this code was taken from http://shibatch.sourceforge.net/
4 // Many thanks to the author of original version: Naoki Shibata
5 //
6 // Copyright Naoki Shibata and contributors 2010 - 2021.
7 // Distributed under the Boost Software License, Version 1.0.
8 // (See accompanying file LICENSE.txt or copy at
9 // http://www.boost.org/LICENSE_1_0.txt)
10 //
11 // This version contains modifications made by Ingo Weyrich
12 //
13 ////////////////////////////////////////////////////////////////
14
15 #ifndef _SLEEFC_
16 #define _SLEEFC_
17
18 #include <assert.h>
19 #include <stdint.h>
20 #include "rt_math.h"
21 #include "opthelper.h"
22
23 #define PI4_A .7853981554508209228515625
24 #define PI4_B .794662735614792836713604629039764404296875e-8
25 #define PI4_C .306161699786838294306516483068750264552437361480769e-16
26 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
27
28 #define L2U .69314718055966295651160180568695068359375
29 #define L2L .28235290563031577122588448175013436025525412068e-12
30 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
31
doubleToRawLongBits(double d)32 __inline int64_t doubleToRawLongBits(double d) {
33 union {
34 double f;
35 int64_t i;
36 } tmp;
37 tmp.f = d;
38 return tmp.i;
39 }
40
longBitsToDouble(int64_t i)41 __inline double longBitsToDouble(int64_t i) {
42 union {
43 double f;
44 int64_t i;
45 } tmp;
46 tmp.i = i;
47 return tmp.f;
48 }
49
xfabs(double x)50 __inline double xfabs(double x) {
51 return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x));
52 }
53
mulsign(double x,double y)54 __inline double mulsign(double x, double y) {
55 return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63)));
56 }
57
sign(double d)58 __inline double sign(double d) { return mulsign(1, d); }
mla(double x,double y,double z)59 __inline double mla(double x, double y, double z) { return x * y + z; }
xrint(double x)60 __inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
61
xisnan(double x)62 __inline int xisnan(double x) { return x != x; }
xisinf(double x)63 __inline int xisinf(double x) { return x == librtprocess::RT_INFINITY || x == -librtprocess::RT_INFINITY; }
xisminf(double x)64 __inline int xisminf(double x) { return x == -librtprocess::RT_INFINITY; }
xispinf(double x)65 __inline int xispinf(double x) { return x == librtprocess::RT_INFINITY; }
66
ldexpk(double x,int q)67 __inline double ldexpk(double x, int q) {
68 double u;
69 int m;
70 m = q >> 31;
71 m = (((m + q) >> 9) - m) << 7;
72 q = q - (m << 2);
73 u = longBitsToDouble(((int64_t)(m + 0x3ff)) << 52);
74 double u2 = u*u;
75 u2 = u2 * u2;
76 x = x * u2;
77 u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
78 return x * u;
79 }
80
xldexp(double x,int q)81 __inline double xldexp(double x, int q) { return ldexpk(x, q); }
82
ilogbp1(double d)83 __inline int ilogbp1(double d) {
84 int m = d < 4.9090934652977266E-91;
85 d = m ? 2.037035976334486E90 * d : d;
86 int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
87 q = m ? q - (300 + 0x03fe) : q - 0x03fe;
88 return q;
89 }
90
xilogb(double d)91 __inline int xilogb(double d) {
92 int e = ilogbp1(xfabs(d)) - 1;
93 e = d == 0 ? (-2147483647 - 1) : e;
94 e = d == librtprocess::RT_INFINITY || d == -librtprocess::RT_INFINITY ? 2147483647 : e;
95 return e;
96 }
97
upper(double d)98 __inline double upper(double d) {
99 return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
100 }
101
102 typedef struct {
103 double x, y;
104 } double2;
105
106 typedef struct {
107 float x, y;
108 } float2;
109
dd(double h,double l)110 __inline double2 dd(double h, double l) {
111 double2 ret;
112 ret.x = h; ret.y = l;
113 return ret;
114 }
115
normalize_d(double2 t)116 __inline double2 normalize_d(double2 t) {
117 double2 s;
118
119 s.x = t.x + t.y;
120 s.y = t.x - s.x + t.y;
121
122 return s;
123 }
124
scale_d(double2 d,double s)125 __inline double2 scale_d(double2 d, double s) {
126 double2 r;
127
128 r.x = d.x * s;
129 r.y = d.y * s;
130
131 return r;
132 }
133
add2_ss(double x,double y)134 __inline double2 add2_ss(double x, double y) {
135 double2 r;
136
137 r.x = x + y;
138 double v = r.x - x;
139 r.y = (x - (r.x - v)) + (y - v);
140
141 return r;
142 }
143
add_ds(double2 x,double y)144 __inline double2 add_ds(double2 x, double y) {
145 // |x| >= |y|
146
147 double2 r;
148
149 assert(xisnan(x.x) || xisnan(y) || xfabs(x.x) >= xfabs(y));
150
151 r.x = x.x + y;
152 r.y = x.x - r.x + y + x.y;
153
154 return r;
155 }
156
add2_ds(double2 x,double y)157 __inline double2 add2_ds(double2 x, double y) {
158 // |x| >= |y|
159
160 double2 r;
161
162 r.x = x.x + y;
163 double v = r.x - x.x;
164 r.y = (x.x - (r.x - v)) + (y - v);
165 r.y += x.y;
166
167 return r;
168 }
169
add_sd(double x,double2 y)170 __inline double2 add_sd(double x, double2 y) {
171 // |x| >= |y|
172
173 double2 r;
174
175 assert(xisnan(x) || xisnan(y.x) || xfabs(x) >= xfabs(y.x));
176
177 r.x = x + y.x;
178 r.y = x - r.x + y.x + y.y;
179
180 return r;
181 }
182
add_dd(double2 x,double2 y)183 __inline double2 add_dd(double2 x, double2 y) {
184 // |x| >= |y|
185
186 double2 r;
187
188 assert(xisnan(x.x) || xisnan(y.x) || xfabs(x.x) >= xfabs(y.x));
189
190 r.x = x.x + y.x;
191 r.y = x.x - r.x + y.x + x.y + y.y;
192
193 return r;
194 }
195
add2_dd(double2 x,double2 y)196 __inline double2 add2_dd(double2 x, double2 y) {
197 double2 r;
198
199 r.x = x.x + y.x;
200 double v = r.x - x.x;
201 r.y = (x.x - (r.x - v)) + (y.x - v);
202 r.y += x.y + y.y;
203
204 return r;
205 }
206
div_dd(double2 n,double2 d)207 __inline double2 div_dd(double2 n, double2 d) {
208 double t = 1.0 / d.x;
209 double dh = upper(d.x), dl = d.x - dh;
210 double th = upper(t ), tl = t - th;
211 double nhh = upper(n.x), nhl = n.x - nhh;
212
213 double2 q;
214
215 q.x = n.x * t;
216
217 double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
218 q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
219
220 q.y = t * (n.y - q.x * d.y) + u;
221
222 return q;
223 }
224
mul_ss(double x,double y)225 __inline double2 mul_ss(double x, double y) {
226 double xh = upper(x), xl = x - xh;
227 double yh = upper(y), yl = y - yh;
228 double2 r;
229
230 r.x = x * y;
231 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
232
233 return r;
234 }
235
mul_ds(double2 x,double y)236 __inline double2 mul_ds(double2 x, double y) {
237 double xh = upper(x.x), xl = x.x - xh;
238 double yh = upper(y ), yl = y - yh;
239 double2 r;
240
241 r.x = x.x * y;
242 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
243
244 return r;
245 }
246
mul_dd(double2 x,double2 y)247 __inline double2 mul_dd(double2 x, double2 y) {
248 double xh = upper(x.x), xl = x.x - xh;
249 double yh = upper(y.x), yl = y.x - yh;
250 double2 r;
251
252 r.x = x.x * y.x;
253 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
254
255 return r;
256 }
257
squ_d(double2 x)258 __inline double2 squ_d(double2 x) {
259 double xh = upper(x.x), xl = x.x - xh;
260 double2 r;
261
262 r.x = x.x * x.x;
263 r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
264
265 return r;
266 }
267
rec_s(double d)268 __inline double2 rec_s(double d) {
269 double t = 1.0 / d;
270 double dh = upper(d), dl = d - dh;
271 double th = upper(t), tl = t - th;
272 double2 q;
273
274 q.x = t;
275 q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
276
277 return q;
278 }
279
sqrt_d(double2 d)280 __inline double2 sqrt_d(double2 d) {
281 double t = sqrt(d.x + d.y);
282 return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), 0.5);
283 }
284
atan2k(double y,double x)285 __inline double atan2k(double y, double x) {
286 double s, t, u;
287 int q = 0;
288
289 if (x < 0) { x = -x; q = -2; }
290 if (y > x) { t = x; x = y; y = -t; q += 1; }
291
292 s = y / x;
293 t = s * s;
294
295 u = -1.88796008463073496563746e-05;
296 u = u * t + (0.000209850076645816976906797);
297 u = u * t + (-0.00110611831486672482563471);
298 u = u * t + (0.00370026744188713119232403);
299 u = u * t + (-0.00889896195887655491740809);
300 u = u * t + (0.016599329773529201970117);
301 u = u * t + (-0.0254517624932312641616861);
302 u = u * t + (0.0337852580001353069993897);
303 u = u * t + (-0.0407629191276836500001934);
304 u = u * t + (0.0466667150077840625632675);
305 u = u * t + (-0.0523674852303482457616113);
306 u = u * t + (0.0587666392926673580854313);
307 u = u * t + (-0.0666573579361080525984562);
308 u = u * t + (0.0769219538311769618355029);
309 u = u * t + (-0.090908995008245008229153);
310 u = u * t + (0.111111105648261418443745);
311 u = u * t + (-0.14285714266771329383765);
312 u = u * t + (0.199999999996591265594148);
313 u = u * t + (-0.333333333333311110369124);
314
315 t = u * t * s + s;
316 t = q * (librtprocess::RT_PI_2) + t;
317
318 return t;
319 }
320
xatan2(double y,double x)321 __inline double xatan2(double y, double x) {
322 double r = atan2k(xfabs(y), x);
323
324 r = mulsign(r, x);
325 if (xisinf(x) || x == 0) r = librtprocess::RT_PI_2 - (xisinf(x) ? (sign(x) * (librtprocess::RT_PI_2)) : 0);
326 if (xisinf(y) ) r = librtprocess::RT_PI_2 - (xisinf(x) ? (sign(x) * (librtprocess::RT_PI*1/4)) : 0);
327 if ( y == 0) r = (sign(x) == -1 ? librtprocess::RT_PI : 0);
328
329 return xisnan(x) || xisnan(y) ? librtprocess::RT_NAN : mulsign(r, y);
330 }
331
xasin(double d)332 __inline double xasin(double d) {
333 return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d);
334 }
335
xacos(double d)336 __inline double xacos(double d) {
337 return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? librtprocess::RT_PI : 0);
338 }
339
xatan(double s)340 __inline double xatan(double s) {
341 double t, u;
342 int q = 0;
343
344 if (s < 0) { s = -s; q = 2; }
345 if (s > 1) { s = 1.0 / s; q |= 1; }
346
347 t = s * s;
348
349 u = -1.88796008463073496563746e-05;
350 u = u * t + (0.000209850076645816976906797);
351 u = u * t + (-0.00110611831486672482563471);
352 u = u * t + (0.00370026744188713119232403);
353 u = u * t + (-0.00889896195887655491740809);
354 u = u * t + (0.016599329773529201970117);
355 u = u * t + (-0.0254517624932312641616861);
356 u = u * t + (0.0337852580001353069993897);
357 u = u * t + (-0.0407629191276836500001934);
358 u = u * t + (0.0466667150077840625632675);
359 u = u * t + (-0.0523674852303482457616113);
360 u = u * t + (0.0587666392926673580854313);
361 u = u * t + (-0.0666573579361080525984562);
362 u = u * t + (0.0769219538311769618355029);
363 u = u * t + (-0.090908995008245008229153);
364 u = u * t + (0.111111105648261418443745);
365 u = u * t + (-0.14285714266771329383765);
366 u = u * t + (0.199999999996591265594148);
367 u = u * t + (-0.333333333333311110369124);
368
369 t = s + s * (t * u);
370
371 if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
372 if ((q & 2) != 0) t = -t;
373
374 return t;
375 }
376
xsin(double d)377 __inline double xsin(double d) {
378 int q;
379 double u, s;
380
381 q = (int)xrint(d * librtprocess::RT_1_PI);
382
383 d = mla(q, -PI4_A*4, d);
384 d = mla(q, -PI4_B*4, d);
385 d = mla(q, -PI4_C*4, d);
386
387 s = d * d;
388
389 if ((q & 1) != 0) d = -d;
390
391 u = -7.97255955009037868891952e-18;
392 u = mla(u, s, 2.81009972710863200091251e-15);
393 u = mla(u, s, -7.64712219118158833288484e-13);
394 u = mla(u, s, 1.60590430605664501629054e-10);
395 u = mla(u, s, -2.50521083763502045810755e-08);
396 u = mla(u, s, 2.75573192239198747630416e-06);
397 u = mla(u, s, -0.000198412698412696162806809);
398 u = mla(u, s, 0.00833333333333332974823815);
399 u = mla(u, s, -0.166666666666666657414808);
400
401 u = mla(s, u * d, d);
402
403 return u;
404 }
405
xcos(double d)406 __inline double xcos(double d) {
407 int q;
408 double u, s;
409
410 q = 1 + 2*(int)xrint(d * librtprocess::RT_1_PI - 0.5);
411
412 d = mla(q, -PI4_A*2, d);
413 d = mla(q, -PI4_B*2, d);
414 d = mla(q, -PI4_C*2, d);
415
416 s = d * d;
417
418 if ((q & 2) == 0) d = -d;
419
420 u = -7.97255955009037868891952e-18;
421 u = mla(u, s, 2.81009972710863200091251e-15);
422 u = mla(u, s, -7.64712219118158833288484e-13);
423 u = mla(u, s, 1.60590430605664501629054e-10);
424 u = mla(u, s, -2.50521083763502045810755e-08);
425 u = mla(u, s, 2.75573192239198747630416e-06);
426 u = mla(u, s, -0.000198412698412696162806809);
427 u = mla(u, s, 0.00833333333333332974823815);
428 u = mla(u, s, -0.166666666666666657414808);
429
430 u = mla(s, u * d, d);
431
432 return u;
433 }
434
xsincos(double d)435 __inline double2 xsincos(double d) {
436 int q;
437 double u, s, t;
438 double2 r;
439
440 q = (int)xrint(d * (2 * librtprocess::RT_1_PI));
441
442 s = d;
443
444 s = mla(-q, PI4_A*2, s);
445 s = mla(-q, PI4_B*2, s);
446 s = mla(-q, PI4_C*2, s);
447
448 t = s;
449
450 s = s * s;
451
452 u = 1.58938307283228937328511e-10;
453 u = mla(u, s, -2.50506943502539773349318e-08);
454 u = mla(u, s, 2.75573131776846360512547e-06);
455 u = mla(u, s, -0.000198412698278911770864914);
456 u = mla(u, s, 0.0083333333333191845961746);
457 u = mla(u, s, -0.166666666666666130709393);
458 u = u * s * t;
459
460 r.x = t + u;
461
462 u = -1.13615350239097429531523e-11;
463 u = mla(u, s, 2.08757471207040055479366e-09);
464 u = mla(u, s, -2.75573144028847567498567e-07);
465 u = mla(u, s, 2.48015872890001867311915e-05);
466 u = mla(u, s, -0.00138888888888714019282329);
467 u = mla(u, s, 0.0416666666666665519592062);
468 u = mla(u, s, -0.5);
469
470 r.y = u * s + 1;
471
472 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
473 if ((q & 2) != 0) { r.x = -r.x; }
474 if (((q+1) & 2) != 0) { r.y = -r.y; }
475
476 if (xisinf(d)) { r.x = r.y = librtprocess::RT_NAN; }
477
478 return r;
479 }
480
xtan(double d)481 __inline double xtan(double d) {
482 int q;
483 double u, s, x;
484
485 q = (int)xrint(d * (2 * librtprocess::RT_1_PI));
486
487 x = mla(q, -PI4_A*2, d);
488 x = mla(q, -PI4_B*2, x);
489 x = mla(q, -PI4_C*2, x);
490
491 s = x * x;
492
493 if ((q & 1) != 0) x = -x;
494
495 u = 1.01419718511083373224408e-05;
496 u = mla(u, s, -2.59519791585924697698614e-05);
497 u = mla(u, s, 5.23388081915899855325186e-05);
498 u = mla(u, s, -3.05033014433946488225616e-05);
499 u = mla(u, s, 7.14707504084242744267497e-05);
500 u = mla(u, s, 8.09674518280159187045078e-05);
501 u = mla(u, s, 0.000244884931879331847054404);
502 u = mla(u, s, 0.000588505168743587154904506);
503 u = mla(u, s, 0.00145612788922812427978848);
504 u = mla(u, s, 0.00359208743836906619142924);
505 u = mla(u, s, 0.00886323944362401618113356);
506 u = mla(u, s, 0.0218694882853846389592078);
507 u = mla(u, s, 0.0539682539781298417636002);
508 u = mla(u, s, 0.133333333333125941821962);
509 u = mla(u, s, 0.333333333333334980164153);
510
511 u = mla(s, u * x, x);
512
513 if ((q & 1) != 0) u = 1.0 / u;
514
515 if (xisinf(d)) u = librtprocess::RT_NAN;
516
517 return u;
518 }
519
xlog(double d)520 __inline double xlog(double d) {
521 double x, x2, t, m;
522 int e;
523
524 e = ilogbp1(d * 0.7071);
525 m = ldexpk(d, -e);
526
527 x = (m-1) / (m+1);
528 x2 = x * x;
529
530 t = 0.148197055177935105296783;
531 t = mla(t, x2, 0.153108178020442575739679);
532 t = mla(t, x2, 0.181837339521549679055568);
533 t = mla(t, x2, 0.22222194152736701733275);
534 t = mla(t, x2, 0.285714288030134544449368);
535 t = mla(t, x2, 0.399999999989941956712869);
536 t = mla(t, x2, 0.666666666666685503450651);
537 t = mla(t, x2, 2);
538
539 x = x * t + 0.693147180559945286226764 * e;
540
541 if (xisinf(d)) x = librtprocess::RT_INFINITY;
542 if (d < 0) x = librtprocess::RT_NAN;
543 if (d == 0) x = -librtprocess::RT_INFINITY;
544
545 return x;
546 }
547
xexp(double d)548 __inline double xexp(double d) {
549 int q = (int)xrint(d * R_LN2);
550 double s, u;
551
552 s = mla(q, -L2U, d);
553 s = mla(q, -L2L, s);
554
555 u = 2.08860621107283687536341e-09;
556 u = mla(u, s, 2.51112930892876518610661e-08);
557 u = mla(u, s, 2.75573911234900471893338e-07);
558 u = mla(u, s, 2.75572362911928827629423e-06);
559 u = mla(u, s, 2.4801587159235472998791e-05);
560 u = mla(u, s, 0.000198412698960509205564975);
561 u = mla(u, s, 0.00138888888889774492207962);
562 u = mla(u, s, 0.00833333333331652721664984);
563 u = mla(u, s, 0.0416666666666665047591422);
564 u = mla(u, s, 0.166666666666666851703837);
565 u = mla(u, s, 0.5);
566
567 u = s * s * u + s + 1;
568 u = ldexpk(u, q);
569
570 if (xisminf(d)) u = 0;
571
572 return u;
573 }
574
logk(double d)575 __inline double2 logk(double d) {
576 double2 x, x2;
577 double m, t;
578 int e;
579
580 e = ilogbp1(d * 0.7071);
581 m = ldexpk(d, -e);
582
583 x = div_dd(add2_ss(-1, m), add2_ss(1, m));
584 x2 = squ_d(x);
585
586 t = 0.134601987501262130076155;
587 t = mla(t, x2.x, 0.132248509032032670243288);
588 t = mla(t, x2.x, 0.153883458318096079652524);
589 t = mla(t, x2.x, 0.181817427573705403298686);
590 t = mla(t, x2.x, 0.222222231326187414840781);
591 t = mla(t, x2.x, 0.285714285651261412873718);
592 t = mla(t, x2.x, 0.400000000000222439910458);
593 t = mla(t, x2.x, 0.666666666666666371239645);
594
595 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
596 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
597 }
598
expk(double2 d)599 __inline double expk(double2 d) {
600 int q = (int)rint((d.x + d.y) * R_LN2);
601 double2 s, t;
602 double u;
603
604 s = add2_ds(d, q * -L2U);
605 s = add2_ds(s, q * -L2L);
606
607 s = normalize_d(s);
608
609 u = 2.51069683420950419527139e-08;
610 u = mla(u, s.x, 2.76286166770270649116855e-07);
611 u = mla(u, s.x, 2.75572496725023574143864e-06);
612 u = mla(u, s.x, 2.48014973989819794114153e-05);
613 u = mla(u, s.x, 0.000198412698809069797676111);
614 u = mla(u, s.x, 0.0013888888939977128960529);
615 u = mla(u, s.x, 0.00833333333332371417601081);
616 u = mla(u, s.x, 0.0416666666665409524128449);
617 u = mla(u, s.x, 0.166666666666666740681535);
618 u = mla(u, s.x, 0.500000000000000999200722);
619
620 t = add_dd(s, mul_ds(squ_d(s), u));
621
622 t = add_sd(1, t);
623 return ldexpk(t.x + t.y, q);
624 }
625
xpow(double x,double y)626 __inline double xpow(double x, double y) {
627 int yisint = (int)y == y;
628 int yisodd = (1 & (int)y) != 0 && yisint;
629
630 double result = expk(mul_ds(logk(xfabs(x)), y));
631
632 result = xisnan(result) ? librtprocess::RT_INFINITY : result;
633 result *= (x >= 0 ? 1 : (!yisint ? librtprocess::RT_NAN : (yisodd ? -1 : 1)));
634
635 double efx = mulsign(xfabs(x) - 1, y);
636 if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : librtprocess::RT_INFINITY);
637 if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : librtprocess::RT_INFINITY);
638 if (xisnan(x) || xisnan(y)) result = librtprocess::RT_NAN;
639 if (y == 0 || x == 1) result = 1;
640
641 return result;
642 }
643
expk2(double2 d)644 __inline double2 expk2(double2 d) {
645 int q = (int)rint((d.x + d.y) * R_LN2);
646 double2 s, t;
647 double u;
648
649 s = add2_ds(d, q * -L2U);
650 s = add2_ds(s, q * -L2L);
651
652 s = normalize_d(s);
653
654 u = 2.51069683420950419527139e-08;
655 u = mla(u, s.x, 2.76286166770270649116855e-07);
656 u = mla(u, s.x, 2.75572496725023574143864e-06);
657 u = mla(u, s.x, 2.48014973989819794114153e-05);
658 u = mla(u, s.x, 0.000198412698809069797676111);
659 u = mla(u, s.x, 0.0013888888939977128960529);
660 u = mla(u, s.x, 0.00833333333332371417601081);
661 u = mla(u, s.x, 0.0416666666665409524128449);
662 u = mla(u, s.x, 0.166666666666666740681535);
663 u = mla(u, s.x, 0.500000000000000999200722);
664
665 t = add_dd(s, mul_ds(squ_d(s), u));
666
667 t = add_sd(1, t);
668 return dd(ldexpk(t.x, q), ldexpk(t.y, q));
669 }
670
xsinh(double x)671 __inline double xsinh(double x) {
672 double y = xfabs(x);
673 double2 d = expk2(dd(y, 0));
674 d = add2_dd(d, div_dd(dd(-1, 0), d));
675 y = (d.x + d.y) * 0.5;
676
677 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
678 y = mulsign(y, x);
679 y = xisnan(x) ? librtprocess::RT_NAN : y;
680
681 return y;
682 }
683
xcosh(double x)684 __inline double xcosh(double x) {
685 double2 d = expk2(dd(x, 0));
686 d = add2_dd(d, div_dd(dd(1, 0), d));
687 double y = (d.x + d.y) * 0.5;
688
689 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
690 y = xisnan(x) ? librtprocess::RT_NAN : y;
691
692 return y;
693 }
694
xtanh(double x)695 __inline double xtanh(double x) {
696 double y = xfabs(x);
697 double2 d = expk2(dd(y, 0));
698 double2 e = div_dd(dd(1, 0), d);
699 d = div_dd(add2_dd(d, scale_d(e, -1)), add2_dd(d, e));
700 y = d.x + d.y;
701
702 y = xisinf(x) || xisnan(y) ? 1.0 : y;
703 y = mulsign(y, x);
704 y = xisnan(x) ? librtprocess::RT_NAN : y;
705
706 return y;
707 }
708
logk2(double2 d)709 __inline double2 logk2(double2 d) {
710 double2 x, x2, m;
711 double t;
712 int e;
713
714 d = normalize_d(d);
715 e = ilogbp1(d.x * 0.7071);
716 m = scale_d(d, ldexpk(1, -e));
717
718 x = div_dd(add2_ds(m, -1), add2_ds(m, 1));
719 x2 = squ_d(x);
720
721 t = 0.134601987501262130076155;
722 t = mla(t, x2.x, 0.132248509032032670243288);
723 t = mla(t, x2.x, 0.153883458318096079652524);
724 t = mla(t, x2.x, 0.181817427573705403298686);
725 t = mla(t, x2.x, 0.222222231326187414840781);
726 t = mla(t, x2.x, 0.285714285651261412873718);
727 t = mla(t, x2.x, 0.400000000000222439910458);
728 t = mla(t, x2.x, 0.666666666666666371239645);
729
730 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
731 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
732 }
733
xasinh(double x)734 __inline double xasinh(double x) {
735 double y = xfabs(x);
736 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y), 1)), y));
737 y = d.x + d.y;
738
739 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
740 y = mulsign(y, x);
741 y = xisnan(x) ? librtprocess::RT_NAN : y;
742
743 return y;
744 }
745
xacosh(double x)746 __inline double xacosh(double x) {
747 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), -1)), x));
748 double y = d.x + d.y;
749
750 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
751 y = x == 1.0 ? 0.0 : y;
752 y = x < 1.0 ? librtprocess::RT_NAN : y;
753 y = xisnan(x) ? librtprocess::RT_NAN : y;
754
755 return y;
756 }
757
xatanh(double x)758 __inline double xatanh(double x) {
759 double y = xfabs(x);
760 double2 d = logk2(div_dd(add2_ss(1, y), add2_ss(1, -y)));
761 y = y > 1.0 ? librtprocess::RT_NAN : (y == 1.0 ? librtprocess::RT_INFINITY : (d.x + d.y) * 0.5);
762
763 y = xisinf(x) || xisnan(y) ? librtprocess::RT_NAN : y;
764 y = mulsign(y, x);
765 y = xisnan(x) ? librtprocess::RT_NAN : y;
766
767 return y;
768 }
769
770 //
771
xfma(double x,double y,double z)772 __inline double xfma(double x, double y, double z) {
773 union {
774 double f;
775 long long int i;
776 } tmp;
777
778 tmp.f = x;
779 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
780 double xh = tmp.f, xl = x - xh;
781
782 tmp.f = y;
783 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
784 double yh = tmp.f, yl = y - yh;
785
786 double h = x * y;
787 double l = xh * yh - h + xl * yh + xh * yl + xl * yl;
788
789 double h2, l2, v;
790
791 h2 = h + z;
792 v = h2 - h;
793 l2 = (h - (h2 - v)) + (z - v) + l;
794
795 return h2 + l2;
796 }
797
xsqrt(double d)798 __inline double xsqrt(double d) { // max error : 0.5 ulp
799 double q = 1;
800
801 if (d < 8.636168555094445E-78) {
802 d *= 1.157920892373162E77;
803 q = 2.9387358770557188E-39;
804 }
805
806 // http://en.wikipedia.org/wiki/Fast_inverse_square_root
807 double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));
808
809 x = x * (1.5 - 0.5 * d * x * x);
810 x = x * (1.5 - 0.5 * d * x * x);
811 x = x * (1.5 - 0.5 * d * x * x);
812
813 // You can change xfma to fma if fma is correctly implemented
814 x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x;
815
816 return d == librtprocess::RT_INFINITY ? librtprocess::RT_INFINITY : x * q;
817 }
818
xcbrt(double d)819 __inline double xcbrt(double d) { // max error : 2 ulps
820 double x, y, q = 1.0;
821 int e, r;
822
823 e = ilogbp1(d);
824 d = ldexpk(d, -e);
825 r = (e + 6144) % 3;
826 q = (r == 1) ? 1.2599210498948731647672106 : q;
827 q = (r == 2) ? 1.5874010519681994747517056 : q;
828 q = ldexpk(q, (e + 6144) / 3 - 2048);
829
830 q = mulsign(q, d);
831 d = xfabs(d);
832
833 x = -0.640245898480692909870982;
834 x = x * d + 2.96155103020039511818595;
835 x = x * d + -5.73353060922947843636166;
836 x = x * d + 6.03990368989458747961407;
837 x = x * d + -3.85841935510444988821632;
838 x = x * d + 2.2307275302496609725722;
839
840 y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
841 y = d * x * x;
842 y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
843
844 return y;
845 }
846
xexp2(double a)847 __inline double xexp2(double a) {
848 double u = expk(mul_ds(dd(0.69314718055994528623, 2.3190468138462995584e-17), a));
849 if (xispinf(a)) u = librtprocess::RT_INFINITY;
850 if (xisminf(a)) u = 0;
851 return u;
852 }
853
xexp10(double a)854 __inline double xexp10(double a) {
855 double u = expk(mul_ds(dd(2.3025850929940459011, -2.1707562233822493508e-16), a));
856 if (xispinf(a)) u = librtprocess::RT_INFINITY;
857 if (xisminf(a)) u = 0;
858 return u;
859 }
860
xexpm1(double a)861 __inline double xexpm1(double a) {
862 double2 d = add2_ds(expk2(dd(a, 0)), -1.0);
863 double x = d.x + d.y;
864 if (xispinf(a)) x = librtprocess::RT_INFINITY;
865 if (xisminf(a)) x = -1;
866 return x;
867 }
868
xlog10(double a)869 __inline double xlog10(double a) {
870 double2 d = mul_dd(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17));
871 double x = d.x + d.y;
872
873 if (xisinf(a)) x = librtprocess::RT_INFINITY;
874 if (a < 0) x = librtprocess::RT_NAN;
875 if (a == 0) x = -librtprocess::RT_INFINITY;
876
877 return x;
878 }
879
xlog1p(double a)880 __inline double xlog1p(double a) {
881 double2 d = logk2(add2_ss(a, 1));
882 double x = d.x + d.y;
883
884 if (xisinf(a)) x = librtprocess::RT_INFINITY;
885 if (a < -1) x = librtprocess::RT_NAN;
886 if (a == -1) x = -librtprocess::RT_INFINITY;
887
888 return x;
889 }
890
891 ///////////////////////////////////////////
892
893 #define PI4_Af 0.78515625f
894 #define PI4_Bf 0.00024127960205078125f
895 #define PI4_Cf 6.3329935073852539062e-07f
896 #define PI4_Df 4.9604681473525147339e-10f
897
898 #define L2Uf 0.693145751953125f
899 #define L2Lf 1.428606765330187045e-06f
900
901 #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
902
floatToRawIntBits(float d)903 __inline int32_t floatToRawIntBits(float d) {
904 union {
905 float f;
906 int32_t i;
907 } tmp;
908 tmp.f = d;
909 return tmp.i;
910 }
911
intBitsToFloat(int32_t i)912 __inline float intBitsToFloat(int32_t i) {
913 union {
914 float f;
915 int32_t i;
916 } tmp;
917 tmp.i = i;
918 return tmp.f;
919 }
920
xfabsf(float x)921 __inline float xfabsf(float x) {
922 return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));
923 }
924
mulsignf(float x,float y)925 __inline float mulsignf(float x, float y) {
926 return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));
927 }
928
signf(float d)929 __inline float signf(float d) { return copysign(1, d); }
mlaf(float x,float y,float z)930 __inline float mlaf(float x, float y, float z) { return x * y + z; }
931
xisnanf(float x)932 __inline int xisnanf(float x) { return x != x; }
xisinff(float x)933 __inline int xisinff(float x) { return x == librtprocess::RT_INFINITY_F || x == -librtprocess::RT_INFINITY_F; }
xisminff(float x)934 __inline int xisminff(float x) { return x == -librtprocess::RT_INFINITY_F; }
xispinff(float x)935 __inline int xispinff(float x) { return x == librtprocess::RT_INFINITY_F; }
936
ilogbp1f(float d)937 __inline int ilogbp1f(float d) {
938 int m = d < 5.421010862427522E-20f;
939 d = m ? 1.8446744073709552E19f * d : d;
940 int q = (floatToRawIntBits(d) >> 23) & 0xff;
941 q = m ? q - (64 + 0x7e) : q - 0x7e;
942 return q;
943 }
944
ldexpkf(float x,int q)945 __inline float ldexpkf(float x, int q) {
946 float u;
947 int m;
948 m = q >> 31;
949 m = (((m + q) >> 6) - m) << 4;
950 q = q - (m << 2);
951 u = intBitsToFloat(((int32_t)(m + 0x7f)) << 23);
952 u = u * u;
953 x = x * u * u;
954 u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
955 return x * u;
956 }
957
xcbrtf(float d)958 __inline float xcbrtf(float d) { // max error : 2 ulps
959 float x, y, q = 1.0f;
960 int e, r;
961
962 e = ilogbp1f(d);
963 d = ldexpkf(d, -e);
964 r = (e + 6144) % 3;
965 q = (r == 1) ? 1.2599210498948731647672106f : q;
966 q = (r == 2) ? 1.5874010519681994747517056f : q;
967 q = ldexpkf(q, (e + 6144) / 3 - 2048);
968
969 q = mulsignf(q, d);
970 d = xfabsf(d);
971
972 x = -0.601564466953277587890625f;
973 x = mlaf(x, d, 2.8208892345428466796875f);
974 x = mlaf(x, d, -5.532182216644287109375f);
975 x = mlaf(x, d, 5.898262500762939453125f);
976 x = mlaf(x, d, -3.8095417022705078125f);
977 x = mlaf(x, d, 2.2241256237030029296875f);
978
979 y = d * x * x;
980 y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;
981
982 return y;
983 }
984
xsinf(float d)985 __inline float xsinf(float d) {
986 int q;
987 float u, s;
988
989 q = rint(d * librtprocess::RT_1_PI_F);
990
991 d = mlaf(q, -PI4_Af*4, d);
992 d = mlaf(q, -PI4_Bf*4, d);
993 d = mlaf(q, -PI4_Cf*4, d);
994 d = mlaf(q, -PI4_Df*4, d);
995
996 s = d * d;
997
998 if ((q & 1) != 0) d = -d;
999
1000 u = 2.6083159809786593541503e-06f;
1001 u = mlaf(u, s, -0.0001981069071916863322258f);
1002 u = mlaf(u, s, 0.00833307858556509017944336f);
1003 u = mlaf(u, s, -0.166666597127914428710938f);
1004
1005 u = mlaf(s, u * d, d);
1006
1007 return u;
1008 }
1009
xcosf(float d)1010 __inline float xcosf(float d) {
1011 #ifdef __SSE2__
1012 // faster than scalar version
1013 return xcosf(_mm_set_ss(d))[0];
1014 #else
1015 int q;
1016 float u, s;
1017
1018 q = 1 + 2*rint(d * librtprocess::RT_1_PI_F - 0.5f);
1019
1020 d = mlaf(q, -PI4_Af*2, d);
1021 d = mlaf(q, -PI4_Bf*2, d);
1022 d = mlaf(q, -PI4_Cf*2, d);
1023 d = mlaf(q, -PI4_Df*2, d);
1024
1025 s = d * d;
1026
1027 if ((q & 2) == 0) d = -d;
1028
1029 u = 2.6083159809786593541503e-06f;
1030 u = mlaf(u, s, -0.0001981069071916863322258f);
1031 u = mlaf(u, s, 0.00833307858556509017944336f);
1032 u = mlaf(u, s, -0.166666597127914428710938f);
1033
1034 u = mlaf(s, u * d, d);
1035
1036 return u;
1037 #endif
1038 }
1039
xsincosf(float d)1040 __inline float2 xsincosf(float d) {
1041 #ifdef __SSE2__
1042 // faster than scalar version
1043 vfloat2 res = xsincosf(_mm_set_ss(d));
1044 return {res.x[0], res.y[0]};
1045 #else
1046 int q;
1047 float u, s, t;
1048 float2 r;
1049
1050 q = rint(d * librtprocess::RT_2_PI_F);
1051
1052 s = d;
1053
1054 s = mlaf(q, -PI4_Af*2, s);
1055 s = mlaf(q, -PI4_Bf*2, s);
1056 s = mlaf(q, -PI4_Cf*2, s);
1057 s = mlaf(q, -PI4_Df*2, s);
1058
1059 t = s;
1060
1061 s = s * s;
1062
1063 u = -0.000195169282960705459117889f;
1064 u = mlaf(u, s, 0.00833215750753879547119141f);
1065 u = mlaf(u, s, -0.166666537523269653320312f);
1066 u = u * s * t;
1067
1068 r.x = t + u;
1069
1070 u = -2.71811842367242206819355e-07f;
1071 u = mlaf(u, s, 2.47990446951007470488548e-05f);
1072 u = mlaf(u, s, -0.00138888787478208541870117f);
1073 u = mlaf(u, s, 0.0416666641831398010253906f);
1074 u = mlaf(u, s, -0.5f);
1075
1076 r.y = u * s + 1;
1077
1078 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
1079 if ((q & 2) != 0) { r.x = -r.x; }
1080 if (((q+1) & 2) != 0) { r.y = -r.y; }
1081
1082 if (xisinff(d)) { r.x = r.y = librtprocess::RT_NAN_F; }
1083
1084 return r;
1085 #endif
1086 }
1087
xtanf(float d)1088 __inline float xtanf(float d) {
1089 int q;
1090 float u, s, x;
1091
1092 q = rint(d * (float)(2 * librtprocess::RT_1_PI));
1093
1094 x = d;
1095
1096 x = mlaf(q, -PI4_Af*2, x);
1097 x = mlaf(q, -PI4_Bf*2, x);
1098 x = mlaf(q, -PI4_Cf*2, x);
1099 x = mlaf(q, -PI4_Df*2, x);
1100
1101 s = x * x;
1102
1103 if ((q & 1) != 0) x = -x;
1104
1105 u = 0.00927245803177356719970703f;
1106 u = mlaf(u, s, 0.00331984995864331722259521f);
1107 u = mlaf(u, s, 0.0242998078465461730957031f);
1108 u = mlaf(u, s, 0.0534495301544666290283203f);
1109 u = mlaf(u, s, 0.133383005857467651367188f);
1110 u = mlaf(u, s, 0.333331853151321411132812f);
1111
1112 u = mlaf(s, u * x, x);
1113
1114 if ((q & 1) != 0) u = 1.0f / u;
1115
1116 if (xisinff(d)) u = librtprocess::RT_NAN_F;
1117
1118 return u;
1119 }
1120
xatanf(float s)1121 __inline float xatanf(float s) {
1122 float t, u;
1123 int q = 0;
1124
1125 if (s < 0) { s = -s; q = 2; }
1126 if (s > 1) { s = 1.0f / s; q |= 1; }
1127
1128 t = s * s;
1129
1130 u = 0.00282363896258175373077393f;
1131 u = mlaf(u, t, -0.0159569028764963150024414f);
1132 u = mlaf(u, t, 0.0425049886107444763183594f);
1133 u = mlaf(u, t, -0.0748900920152664184570312f);
1134 u = mlaf(u, t, 0.106347933411598205566406f);
1135 u = mlaf(u, t, -0.142027363181114196777344f);
1136 u = mlaf(u, t, 0.199926957488059997558594f);
1137 u = mlaf(u, t, -0.333331018686294555664062f);
1138
1139 t = s + s * (t * u);
1140
1141 if ((q & 1) != 0) t = 1.570796326794896557998982f - t;
1142 if ((q & 2) != 0) t = -t;
1143
1144 return t;
1145 }
1146
atan2kf(float y,float x)1147 __inline float atan2kf(float y, float x) {
1148 float s, t, u;
1149 float q = 0.f;
1150
1151 if (x < 0) { x = -x; q = -2.f; }
1152 if (y > x) { t = x; x = y; y = -t; q += 1.f; }
1153
1154 s = y / x;
1155 t = s * s;
1156
1157 u = 0.00282363896258175373077393f;
1158 u = mlaf(u, t, -0.0159569028764963150024414f);
1159 u = mlaf(u, t, 0.0425049886107444763183594f);
1160 u = mlaf(u, t, -0.0748900920152664184570312f);
1161 u = mlaf(u, t, 0.106347933411598205566406f);
1162 u = mlaf(u, t, -0.142027363181114196777344f);
1163 u = mlaf(u, t, 0.199926957488059997558594f);
1164 u = mlaf(u, t, -0.333331018686294555664062f);
1165
1166 t = u * t;
1167 t = mlaf(t,s,s);
1168 return mlaf(q,(float)(librtprocess::RT_PI_F_2),t);
1169 }
1170
xatan2f(float y,float x)1171 __inline float xatan2f(float y, float x) {
1172 float r = atan2kf(xfabsf(y), x);
1173
1174 r = mulsignf(r, x);
1175 if (xisinff(x) || x == 0) r = librtprocess::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(librtprocess::RT_PI_F*.5f)) : 0);
1176 if (xisinff(y) ) r = librtprocess::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(librtprocess::RT_PI_F*.25f)) : 0);
1177 if ( y == 0) r = (signf(x) == -1 ? librtprocess::RT_PI_F : 0);
1178
1179 return xisnanf(x) || xisnanf(y) ? librtprocess::RT_NAN_F : mulsignf(r, y);
1180 }
1181
xasinf(float d)1182 __inline float xasinf(float d) {
1183 return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d);
1184 }
1185
xacosf(float d)1186 __inline float xacosf(float d) {
1187 return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? (float)librtprocess::RT_PI : 0.0f);
1188 }
1189
xlogf(float d)1190 __inline float xlogf(float d) {
1191 float x, x2, t, m;
1192 int e;
1193
1194 e = ilogbp1f(d * 0.7071f);
1195 m = ldexpkf(d, -e);
1196
1197 x = (m-1.0f) / (m+1.0f);
1198 x2 = x * x;
1199
1200 t = 0.2371599674224853515625f;
1201 t = mlaf(t, x2, 0.285279005765914916992188f);
1202 t = mlaf(t, x2, 0.400005519390106201171875f);
1203 t = mlaf(t, x2, 0.666666567325592041015625f);
1204 t = mlaf(t, x2, 2.0f);
1205
1206 x = x * t + 0.693147180559945286226764f * e;
1207
1208 if (xisinff(d)) x = librtprocess::RT_INFINITY_F;
1209 if (d < 0) x = librtprocess::RT_NAN_F;
1210 if (d == 0) x = -librtprocess::RT_INFINITY_F;
1211
1212 return x;
1213 }
1214
xexpf(float d)1215 __inline float xexpf(float d) {
1216 if(d<=-104.0f) return 0.0f;
1217
1218 int q = rint(d * R_LN2f);
1219 float s, u;
1220
1221 s = mlaf(q, -L2Uf, d);
1222 s = mlaf(q, -L2Lf, s);
1223
1224 u = 0.00136324646882712841033936f;
1225 u = mlaf(u, s, 0.00836596917361021041870117f);
1226 u = mlaf(u, s, 0.0416710823774337768554688f);
1227 u = mlaf(u, s, 0.166665524244308471679688f);
1228 u = mlaf(u, s, 0.499999850988388061523438f);
1229
1230 u = mlaf( s, mlaf(s,u,1.f),1.f);
1231 return ldexpkf(u, q);
1232
1233 }
1234
xmul2f(float d)1235 __inline float xmul2f(float d) {
1236 union {
1237 float floatval;
1238 int intval;
1239 } uflint;
1240 uflint.floatval = d;
1241 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1242 uflint.intval += 1 << 23; // add 1 to the exponent
1243 }
1244 return uflint.floatval;
1245 }
1246
xdiv2f(float d)1247 __inline float xdiv2f(float d) {
1248 union {
1249 float floatval;
1250 int intval;
1251 } uflint;
1252 uflint.floatval = d;
1253 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1254 uflint.intval -= 1 << 23; // sub 1 from the exponent
1255 }
1256 return uflint.floatval;
1257 }
1258
xdivf(float d,int n)1259 __inline float xdivf( float d, int n){
1260 union {
1261 float floatval;
1262 int intval;
1263 } uflint;
1264 uflint.floatval = d;
1265 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1266 uflint.intval -= n << 23; // add n to the exponent
1267 }
1268 return uflint.floatval;
1269 }
1270
1271
1272
1273 #endif
1274