1 ////////////////////////////////////////////////////////////////
2 //
3 // this code was taken from http://shibatch.sourceforge.net/
4 // Many thanks to the author of original version: Naoki Shibata
5 //
6 // This version contains modifications made by Ingo Weyrich
7 //
8 ////////////////////////////////////////////////////////////////
9
10 #ifndef _SLEEFC_
11 #define _SLEEFC_
12
13 #include <assert.h>
14 #include <stdint.h>
15 #include "rt_math.h"
16 #include "opthelper.h"
17
18 #define PI4_A .7853981554508209228515625
19 #define PI4_B .794662735614792836713604629039764404296875e-8
20 #define PI4_C .306161699786838294306516483068750264552437361480769e-16
21 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
22
23 #define L2U .69314718055966295651160180568695068359375
24 #define L2L .28235290563031577122588448175013436025525412068e-12
25 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
26
doubleToRawLongBits(double d)27 __inline int64_t doubleToRawLongBits(double d) {
28 union {
29 double f;
30 int64_t i;
31 } tmp;
32 tmp.f = d;
33 return tmp.i;
34 }
35
longBitsToDouble(int64_t i)36 __inline double longBitsToDouble(int64_t i) {
37 union {
38 double f;
39 int64_t i;
40 } tmp;
41 tmp.i = i;
42 return tmp.f;
43 }
44
xfabs(double x)45 __inline double xfabs(double x) {
46 return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x));
47 }
48
mulsign(double x,double y)49 __inline double mulsign(double x, double y) {
50 return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63)));
51 }
52
sign(double d)53 __inline double sign(double d) { return mulsign(1, d); }
mla(double x,double y,double z)54 __inline double mla(double x, double y, double z) { return x * y + z; }
xrint(double x)55 __inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
56
xisnan(double x)57 __inline int xisnan(double x) { return x != x; }
xisinf(double x)58 __inline int xisinf(double x) { return x == librtprocess::RT_INFINITY || x == -librtprocess::RT_INFINITY; }
xisminf(double x)59 __inline int xisminf(double x) { return x == -librtprocess::RT_INFINITY; }
xispinf(double x)60 __inline int xispinf(double x) { return x == librtprocess::RT_INFINITY; }
61
ldexpk(double x,int q)62 __inline double ldexpk(double x, int q) {
63 double u;
64 int m;
65 m = q >> 31;
66 m = (((m + q) >> 9) - m) << 7;
67 q = q - (m << 2);
68 u = longBitsToDouble(((int64_t)(m + 0x3ff)) << 52);
69 double u2 = u*u;
70 u2 = u2 * u2;
71 x = x * u2;
72 u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
73 return x * u;
74 }
75
xldexp(double x,int q)76 __inline double xldexp(double x, int q) { return ldexpk(x, q); }
77
ilogbp1(double d)78 __inline int ilogbp1(double d) {
79 int m = d < 4.9090934652977266E-91;
80 d = m ? 2.037035976334486E90 * d : d;
81 int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
82 q = m ? q - (300 + 0x03fe) : q - 0x03fe;
83 return q;
84 }
85
xilogb(double d)86 __inline int xilogb(double d) {
87 int e = ilogbp1(xfabs(d)) - 1;
88 e = d == 0 ? (-2147483647 - 1) : e;
89 e = d == librtprocess::RT_INFINITY || d == -librtprocess::RT_INFINITY ? 2147483647 : e;
90 return e;
91 }
92
upper(double d)93 __inline double upper(double d) {
94 return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
95 }
96
97 typedef struct {
98 double x, y;
99 } double2;
100
101 typedef struct {
102 float x, y;
103 } float2;
104
dd(double h,double l)105 __inline double2 dd(double h, double l) {
106 double2 ret;
107 ret.x = h; ret.y = l;
108 return ret;
109 }
110
normalize_d(double2 t)111 __inline double2 normalize_d(double2 t) {
112 double2 s;
113
114 s.x = t.x + t.y;
115 s.y = t.x - s.x + t.y;
116
117 return s;
118 }
119
scale_d(double2 d,double s)120 __inline double2 scale_d(double2 d, double s) {
121 double2 r;
122
123 r.x = d.x * s;
124 r.y = d.y * s;
125
126 return r;
127 }
128
add2_ss(double x,double y)129 __inline double2 add2_ss(double x, double y) {
130 double2 r;
131
132 r.x = x + y;
133 double v = r.x - x;
134 r.y = (x - (r.x - v)) + (y - v);
135
136 return r;
137 }
138
add_ds(double2 x,double y)139 __inline double2 add_ds(double2 x, double y) {
140 // |x| >= |y|
141
142 double2 r;
143
144 assert(xisnan(x.x) || xisnan(y) || xfabs(x.x) >= xfabs(y));
145
146 r.x = x.x + y;
147 r.y = x.x - r.x + y + x.y;
148
149 return r;
150 }
151
add2_ds(double2 x,double y)152 __inline double2 add2_ds(double2 x, double y) {
153 // |x| >= |y|
154
155 double2 r;
156
157 r.x = x.x + y;
158 double v = r.x - x.x;
159 r.y = (x.x - (r.x - v)) + (y - v);
160 r.y += x.y;
161
162 return r;
163 }
164
add_sd(double x,double2 y)165 __inline double2 add_sd(double x, double2 y) {
166 // |x| >= |y|
167
168 double2 r;
169
170 assert(xisnan(x) || xisnan(y.x) || xfabs(x) >= xfabs(y.x));
171
172 r.x = x + y.x;
173 r.y = x - r.x + y.x + y.y;
174
175 return r;
176 }
177
add_dd(double2 x,double2 y)178 __inline double2 add_dd(double2 x, double2 y) {
179 // |x| >= |y|
180
181 double2 r;
182
183 assert(xisnan(x.x) || xisnan(y.x) || xfabs(x.x) >= xfabs(y.x));
184
185 r.x = x.x + y.x;
186 r.y = x.x - r.x + y.x + x.y + y.y;
187
188 return r;
189 }
190
add2_dd(double2 x,double2 y)191 __inline double2 add2_dd(double2 x, double2 y) {
192 double2 r;
193
194 r.x = x.x + y.x;
195 double v = r.x - x.x;
196 r.y = (x.x - (r.x - v)) + (y.x - v);
197 r.y += x.y + y.y;
198
199 return r;
200 }
201
div_dd(double2 n,double2 d)202 __inline double2 div_dd(double2 n, double2 d) {
203 double t = 1.0 / d.x;
204 double dh = upper(d.x), dl = d.x - dh;
205 double th = upper(t ), tl = t - th;
206 double nhh = upper(n.x), nhl = n.x - nhh;
207
208 double2 q;
209
210 q.x = n.x * t;
211
212 double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
213 q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
214
215 q.y = t * (n.y - q.x * d.y) + u;
216
217 return q;
218 }
219
mul_ss(double x,double y)220 __inline double2 mul_ss(double x, double y) {
221 double xh = upper(x), xl = x - xh;
222 double yh = upper(y), yl = y - yh;
223 double2 r;
224
225 r.x = x * y;
226 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
227
228 return r;
229 }
230
mul_ds(double2 x,double y)231 __inline double2 mul_ds(double2 x, double y) {
232 double xh = upper(x.x), xl = x.x - xh;
233 double yh = upper(y ), yl = y - yh;
234 double2 r;
235
236 r.x = x.x * y;
237 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
238
239 return r;
240 }
241
mul_dd(double2 x,double2 y)242 __inline double2 mul_dd(double2 x, double2 y) {
243 double xh = upper(x.x), xl = x.x - xh;
244 double yh = upper(y.x), yl = y.x - yh;
245 double2 r;
246
247 r.x = x.x * y.x;
248 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
249
250 return r;
251 }
252
squ_d(double2 x)253 __inline double2 squ_d(double2 x) {
254 double xh = upper(x.x), xl = x.x - xh;
255 double2 r;
256
257 r.x = x.x * x.x;
258 r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
259
260 return r;
261 }
262
rec_s(double d)263 __inline double2 rec_s(double d) {
264 double t = 1.0 / d;
265 double dh = upper(d), dl = d - dh;
266 double th = upper(t), tl = t - th;
267 double2 q;
268
269 q.x = t;
270 q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
271
272 return q;
273 }
274
sqrt_d(double2 d)275 __inline double2 sqrt_d(double2 d) {
276 double t = sqrt(d.x + d.y);
277 return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), 0.5);
278 }
279
atan2k(double y,double x)280 __inline double atan2k(double y, double x) {
281 double s, t, u;
282 int q = 0;
283
284 if (x < 0) { x = -x; q = -2; }
285 if (y > x) { t = x; x = y; y = -t; q += 1; }
286
287 s = y / x;
288 t = s * s;
289
290 u = -1.88796008463073496563746e-05;
291 u = u * t + (0.000209850076645816976906797);
292 u = u * t + (-0.00110611831486672482563471);
293 u = u * t + (0.00370026744188713119232403);
294 u = u * t + (-0.00889896195887655491740809);
295 u = u * t + (0.016599329773529201970117);
296 u = u * t + (-0.0254517624932312641616861);
297 u = u * t + (0.0337852580001353069993897);
298 u = u * t + (-0.0407629191276836500001934);
299 u = u * t + (0.0466667150077840625632675);
300 u = u * t + (-0.0523674852303482457616113);
301 u = u * t + (0.0587666392926673580854313);
302 u = u * t + (-0.0666573579361080525984562);
303 u = u * t + (0.0769219538311769618355029);
304 u = u * t + (-0.090908995008245008229153);
305 u = u * t + (0.111111105648261418443745);
306 u = u * t + (-0.14285714266771329383765);
307 u = u * t + (0.199999999996591265594148);
308 u = u * t + (-0.333333333333311110369124);
309
310 t = u * t * s + s;
311 t = q * (librtprocess::RT_PI_2) + t;
312
313 return t;
314 }
315
xatan2(double y,double x)316 __inline double xatan2(double y, double x) {
317 double r = atan2k(xfabs(y), x);
318
319 r = mulsign(r, x);
320 if (xisinf(x) || x == 0) r = librtprocess::RT_PI_2 - (xisinf(x) ? (sign(x) * (librtprocess::RT_PI_2)) : 0);
321 if (xisinf(y) ) r = librtprocess::RT_PI_2 - (xisinf(x) ? (sign(x) * (librtprocess::RT_PI*1/4)) : 0);
322 if ( y == 0) r = (sign(x) == -1 ? librtprocess::RT_PI : 0);
323
324 return xisnan(x) || xisnan(y) ? librtprocess::RT_NAN : mulsign(r, y);
325 }
326
xasin(double d)327 __inline double xasin(double d) {
328 return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d);
329 }
330
xacos(double d)331 __inline double xacos(double d) {
332 return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? librtprocess::RT_PI : 0);
333 }
334
xatan(double s)335 __inline double xatan(double s) {
336 double t, u;
337 int q = 0;
338
339 if (s < 0) { s = -s; q = 2; }
340 if (s > 1) { s = 1.0 / s; q |= 1; }
341
342 t = s * s;
343
344 u = -1.88796008463073496563746e-05;
345 u = u * t + (0.000209850076645816976906797);
346 u = u * t + (-0.00110611831486672482563471);
347 u = u * t + (0.00370026744188713119232403);
348 u = u * t + (-0.00889896195887655491740809);
349 u = u * t + (0.016599329773529201970117);
350 u = u * t + (-0.0254517624932312641616861);
351 u = u * t + (0.0337852580001353069993897);
352 u = u * t + (-0.0407629191276836500001934);
353 u = u * t + (0.0466667150077840625632675);
354 u = u * t + (-0.0523674852303482457616113);
355 u = u * t + (0.0587666392926673580854313);
356 u = u * t + (-0.0666573579361080525984562);
357 u = u * t + (0.0769219538311769618355029);
358 u = u * t + (-0.090908995008245008229153);
359 u = u * t + (0.111111105648261418443745);
360 u = u * t + (-0.14285714266771329383765);
361 u = u * t + (0.199999999996591265594148);
362 u = u * t + (-0.333333333333311110369124);
363
364 t = s + s * (t * u);
365
366 if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
367 if ((q & 2) != 0) t = -t;
368
369 return t;
370 }
371
xsin(double d)372 __inline double xsin(double d) {
373 int q;
374 double u, s;
375
376 q = (int)xrint(d * librtprocess::RT_1_PI);
377
378 d = mla(q, -PI4_A*4, d);
379 d = mla(q, -PI4_B*4, d);
380 d = mla(q, -PI4_C*4, d);
381
382 s = d * d;
383
384 if ((q & 1) != 0) d = -d;
385
386 u = -7.97255955009037868891952e-18;
387 u = mla(u, s, 2.81009972710863200091251e-15);
388 u = mla(u, s, -7.64712219118158833288484e-13);
389 u = mla(u, s, 1.60590430605664501629054e-10);
390 u = mla(u, s, -2.50521083763502045810755e-08);
391 u = mla(u, s, 2.75573192239198747630416e-06);
392 u = mla(u, s, -0.000198412698412696162806809);
393 u = mla(u, s, 0.00833333333333332974823815);
394 u = mla(u, s, -0.166666666666666657414808);
395
396 u = mla(s, u * d, d);
397
398 return u;
399 }
400
xcos(double d)401 __inline double xcos(double d) {
402 int q;
403 double u, s;
404
405 q = 1 + 2*(int)xrint(d * librtprocess::RT_1_PI - 0.5);
406
407 d = mla(q, -PI4_A*2, d);
408 d = mla(q, -PI4_B*2, d);
409 d = mla(q, -PI4_C*2, d);
410
411 s = d * d;
412
413 if ((q & 2) == 0) d = -d;
414
415 u = -7.97255955009037868891952e-18;
416 u = mla(u, s, 2.81009972710863200091251e-15);
417 u = mla(u, s, -7.64712219118158833288484e-13);
418 u = mla(u, s, 1.60590430605664501629054e-10);
419 u = mla(u, s, -2.50521083763502045810755e-08);
420 u = mla(u, s, 2.75573192239198747630416e-06);
421 u = mla(u, s, -0.000198412698412696162806809);
422 u = mla(u, s, 0.00833333333333332974823815);
423 u = mla(u, s, -0.166666666666666657414808);
424
425 u = mla(s, u * d, d);
426
427 return u;
428 }
429
xsincos(double d)430 __inline double2 xsincos(double d) {
431 int q;
432 double u, s, t;
433 double2 r;
434
435 q = (int)xrint(d * (2 * librtprocess::RT_1_PI));
436
437 s = d;
438
439 s = mla(-q, PI4_A*2, s);
440 s = mla(-q, PI4_B*2, s);
441 s = mla(-q, PI4_C*2, s);
442
443 t = s;
444
445 s = s * s;
446
447 u = 1.58938307283228937328511e-10;
448 u = mla(u, s, -2.50506943502539773349318e-08);
449 u = mla(u, s, 2.75573131776846360512547e-06);
450 u = mla(u, s, -0.000198412698278911770864914);
451 u = mla(u, s, 0.0083333333333191845961746);
452 u = mla(u, s, -0.166666666666666130709393);
453 u = u * s * t;
454
455 r.x = t + u;
456
457 u = -1.13615350239097429531523e-11;
458 u = mla(u, s, 2.08757471207040055479366e-09);
459 u = mla(u, s, -2.75573144028847567498567e-07);
460 u = mla(u, s, 2.48015872890001867311915e-05);
461 u = mla(u, s, -0.00138888888888714019282329);
462 u = mla(u, s, 0.0416666666666665519592062);
463 u = mla(u, s, -0.5);
464
465 r.y = u * s + 1;
466
467 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
468 if ((q & 2) != 0) { r.x = -r.x; }
469 if (((q+1) & 2) != 0) { r.y = -r.y; }
470
471 if (xisinf(d)) { r.x = r.y = librtprocess::RT_NAN; }
472
473 return r;
474 }
475
xtan(double d)476 __inline double xtan(double d) {
477 int q;
478 double u, s, x;
479
480 q = (int)xrint(d * (2 * librtprocess::RT_1_PI));
481
482 x = mla(q, -PI4_A*2, d);
483 x = mla(q, -PI4_B*2, x);
484 x = mla(q, -PI4_C*2, x);
485
486 s = x * x;
487
488 if ((q & 1) != 0) x = -x;
489
490 u = 1.01419718511083373224408e-05;
491 u = mla(u, s, -2.59519791585924697698614e-05);
492 u = mla(u, s, 5.23388081915899855325186e-05);
493 u = mla(u, s, -3.05033014433946488225616e-05);
494 u = mla(u, s, 7.14707504084242744267497e-05);
495 u = mla(u, s, 8.09674518280159187045078e-05);
496 u = mla(u, s, 0.000244884931879331847054404);
497 u = mla(u, s, 0.000588505168743587154904506);
498 u = mla(u, s, 0.00145612788922812427978848);
499 u = mla(u, s, 0.00359208743836906619142924);
500 u = mla(u, s, 0.00886323944362401618113356);
501 u = mla(u, s, 0.0218694882853846389592078);
502 u = mla(u, s, 0.0539682539781298417636002);
503 u = mla(u, s, 0.133333333333125941821962);
504 u = mla(u, s, 0.333333333333334980164153);
505
506 u = mla(s, u * x, x);
507
508 if ((q & 1) != 0) u = 1.0 / u;
509
510 if (xisinf(d)) u = librtprocess::RT_NAN;
511
512 return u;
513 }
514
xlog(double d)515 __inline double xlog(double d) {
516 double x, x2, t, m;
517 int e;
518
519 e = ilogbp1(d * 0.7071);
520 m = ldexpk(d, -e);
521
522 x = (m-1) / (m+1);
523 x2 = x * x;
524
525 t = 0.148197055177935105296783;
526 t = mla(t, x2, 0.153108178020442575739679);
527 t = mla(t, x2, 0.181837339521549679055568);
528 t = mla(t, x2, 0.22222194152736701733275);
529 t = mla(t, x2, 0.285714288030134544449368);
530 t = mla(t, x2, 0.399999999989941956712869);
531 t = mla(t, x2, 0.666666666666685503450651);
532 t = mla(t, x2, 2);
533
534 x = x * t + 0.693147180559945286226764 * e;
535
536 if (xisinf(d)) x = librtprocess::RT_INFINITY;
537 if (d < 0) x = librtprocess::RT_NAN;
538 if (d == 0) x = -librtprocess::RT_INFINITY;
539
540 return x;
541 }
542
xexp(double d)543 __inline double xexp(double d) {
544 int q = (int)xrint(d * R_LN2);
545 double s, u;
546
547 s = mla(q, -L2U, d);
548 s = mla(q, -L2L, s);
549
550 u = 2.08860621107283687536341e-09;
551 u = mla(u, s, 2.51112930892876518610661e-08);
552 u = mla(u, s, 2.75573911234900471893338e-07);
553 u = mla(u, s, 2.75572362911928827629423e-06);
554 u = mla(u, s, 2.4801587159235472998791e-05);
555 u = mla(u, s, 0.000198412698960509205564975);
556 u = mla(u, s, 0.00138888888889774492207962);
557 u = mla(u, s, 0.00833333333331652721664984);
558 u = mla(u, s, 0.0416666666666665047591422);
559 u = mla(u, s, 0.166666666666666851703837);
560 u = mla(u, s, 0.5);
561
562 u = s * s * u + s + 1;
563 u = ldexpk(u, q);
564
565 if (xisminf(d)) u = 0;
566
567 return u;
568 }
569
logk(double d)570 __inline double2 logk(double d) {
571 double2 x, x2;
572 double m, t;
573 int e;
574
575 e = ilogbp1(d * 0.7071);
576 m = ldexpk(d, -e);
577
578 x = div_dd(add2_ss(-1, m), add2_ss(1, m));
579 x2 = squ_d(x);
580
581 t = 0.134601987501262130076155;
582 t = mla(t, x2.x, 0.132248509032032670243288);
583 t = mla(t, x2.x, 0.153883458318096079652524);
584 t = mla(t, x2.x, 0.181817427573705403298686);
585 t = mla(t, x2.x, 0.222222231326187414840781);
586 t = mla(t, x2.x, 0.285714285651261412873718);
587 t = mla(t, x2.x, 0.400000000000222439910458);
588 t = mla(t, x2.x, 0.666666666666666371239645);
589
590 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
591 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
592 }
593
expk(double2 d)594 __inline double expk(double2 d) {
595 int q = (int)rint((d.x + d.y) * R_LN2);
596 double2 s, t;
597 double u;
598
599 s = add2_ds(d, q * -L2U);
600 s = add2_ds(s, q * -L2L);
601
602 s = normalize_d(s);
603
604 u = 2.51069683420950419527139e-08;
605 u = mla(u, s.x, 2.76286166770270649116855e-07);
606 u = mla(u, s.x, 2.75572496725023574143864e-06);
607 u = mla(u, s.x, 2.48014973989819794114153e-05);
608 u = mla(u, s.x, 0.000198412698809069797676111);
609 u = mla(u, s.x, 0.0013888888939977128960529);
610 u = mla(u, s.x, 0.00833333333332371417601081);
611 u = mla(u, s.x, 0.0416666666665409524128449);
612 u = mla(u, s.x, 0.166666666666666740681535);
613 u = mla(u, s.x, 0.500000000000000999200722);
614
615 t = add_dd(s, mul_ds(squ_d(s), u));
616
617 t = add_sd(1, t);
618 return ldexpk(t.x + t.y, q);
619 }
620
xpow(double x,double y)621 __inline double xpow(double x, double y) {
622 int yisint = (int)y == y;
623 int yisodd = (1 & (int)y) != 0 && yisint;
624
625 double result = expk(mul_ds(logk(xfabs(x)), y));
626
627 result = xisnan(result) ? librtprocess::RT_INFINITY : result;
628 result *= (x >= 0 ? 1 : (!yisint ? librtprocess::RT_NAN : (yisodd ? -1 : 1)));
629
630 double efx = mulsign(xfabs(x) - 1, y);
631 if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : librtprocess::RT_INFINITY);
632 if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : librtprocess::RT_INFINITY);
633 if (xisnan(x) || xisnan(y)) result = librtprocess::RT_NAN;
634 if (y == 0 || x == 1) result = 1;
635
636 return result;
637 }
638
expk2(double2 d)639 __inline double2 expk2(double2 d) {
640 int q = (int)rint((d.x + d.y) * R_LN2);
641 double2 s, t;
642 double u;
643
644 s = add2_ds(d, q * -L2U);
645 s = add2_ds(s, q * -L2L);
646
647 s = normalize_d(s);
648
649 u = 2.51069683420950419527139e-08;
650 u = mla(u, s.x, 2.76286166770270649116855e-07);
651 u = mla(u, s.x, 2.75572496725023574143864e-06);
652 u = mla(u, s.x, 2.48014973989819794114153e-05);
653 u = mla(u, s.x, 0.000198412698809069797676111);
654 u = mla(u, s.x, 0.0013888888939977128960529);
655 u = mla(u, s.x, 0.00833333333332371417601081);
656 u = mla(u, s.x, 0.0416666666665409524128449);
657 u = mla(u, s.x, 0.166666666666666740681535);
658 u = mla(u, s.x, 0.500000000000000999200722);
659
660 t = add_dd(s, mul_ds(squ_d(s), u));
661
662 t = add_sd(1, t);
663 return dd(ldexpk(t.x, q), ldexpk(t.y, q));
664 }
665
xsinh(double x)666 __inline double xsinh(double x) {
667 double y = xfabs(x);
668 double2 d = expk2(dd(y, 0));
669 d = add2_dd(d, div_dd(dd(-1, 0), d));
670 y = (d.x + d.y) * 0.5;
671
672 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
673 y = mulsign(y, x);
674 y = xisnan(x) ? librtprocess::RT_NAN : y;
675
676 return y;
677 }
678
xcosh(double x)679 __inline double xcosh(double x) {
680 double2 d = expk2(dd(x, 0));
681 d = add2_dd(d, div_dd(dd(1, 0), d));
682 double y = (d.x + d.y) * 0.5;
683
684 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
685 y = xisnan(x) ? librtprocess::RT_NAN : y;
686
687 return y;
688 }
689
xtanh(double x)690 __inline double xtanh(double x) {
691 double y = xfabs(x);
692 double2 d = expk2(dd(y, 0));
693 double2 e = div_dd(dd(1, 0), d);
694 d = div_dd(add2_dd(d, scale_d(e, -1)), add2_dd(d, e));
695 y = d.x + d.y;
696
697 y = xisinf(x) || xisnan(y) ? 1.0 : y;
698 y = mulsign(y, x);
699 y = xisnan(x) ? librtprocess::RT_NAN : y;
700
701 return y;
702 }
703
logk2(double2 d)704 __inline double2 logk2(double2 d) {
705 double2 x, x2, m;
706 double t;
707 int e;
708
709 d = normalize_d(d);
710 e = ilogbp1(d.x * 0.7071);
711 m = scale_d(d, ldexpk(1, -e));
712
713 x = div_dd(add2_ds(m, -1), add2_ds(m, 1));
714 x2 = squ_d(x);
715
716 t = 0.134601987501262130076155;
717 t = mla(t, x2.x, 0.132248509032032670243288);
718 t = mla(t, x2.x, 0.153883458318096079652524);
719 t = mla(t, x2.x, 0.181817427573705403298686);
720 t = mla(t, x2.x, 0.222222231326187414840781);
721 t = mla(t, x2.x, 0.285714285651261412873718);
722 t = mla(t, x2.x, 0.400000000000222439910458);
723 t = mla(t, x2.x, 0.666666666666666371239645);
724
725 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
726 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
727 }
728
xasinh(double x)729 __inline double xasinh(double x) {
730 double y = xfabs(x);
731 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y), 1)), y));
732 y = d.x + d.y;
733
734 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
735 y = mulsign(y, x);
736 y = xisnan(x) ? librtprocess::RT_NAN : y;
737
738 return y;
739 }
740
xacosh(double x)741 __inline double xacosh(double x) {
742 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), -1)), x));
743 double y = d.x + d.y;
744
745 y = xisinf(x) || xisnan(y) ? librtprocess::RT_INFINITY : y;
746 y = x == 1.0 ? 0.0 : y;
747 y = x < 1.0 ? librtprocess::RT_NAN : y;
748 y = xisnan(x) ? librtprocess::RT_NAN : y;
749
750 return y;
751 }
752
xatanh(double x)753 __inline double xatanh(double x) {
754 double y = xfabs(x);
755 double2 d = logk2(div_dd(add2_ss(1, y), add2_ss(1, -y)));
756 y = y > 1.0 ? librtprocess::RT_NAN : (y == 1.0 ? librtprocess::RT_INFINITY : (d.x + d.y) * 0.5);
757
758 y = xisinf(x) || xisnan(y) ? librtprocess::RT_NAN : y;
759 y = mulsign(y, x);
760 y = xisnan(x) ? librtprocess::RT_NAN : y;
761
762 return y;
763 }
764
765 //
766
xfma(double x,double y,double z)767 __inline double xfma(double x, double y, double z) {
768 union {
769 double f;
770 long long int i;
771 } tmp;
772
773 tmp.f = x;
774 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
775 double xh = tmp.f, xl = x - xh;
776
777 tmp.f = y;
778 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
779 double yh = tmp.f, yl = y - yh;
780
781 double h = x * y;
782 double l = xh * yh - h + xl * yh + xh * yl + xl * yl;
783
784 double h2, l2, v;
785
786 h2 = h + z;
787 v = h2 - h;
788 l2 = (h - (h2 - v)) + (z - v) + l;
789
790 return h2 + l2;
791 }
792
xsqrt(double d)793 __inline double xsqrt(double d) { // max error : 0.5 ulp
794 double q = 1;
795
796 if (d < 8.636168555094445E-78) {
797 d *= 1.157920892373162E77;
798 q = 2.9387358770557188E-39;
799 }
800
801 // http://en.wikipedia.org/wiki/Fast_inverse_square_root
802 double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));
803
804 x = x * (1.5 - 0.5 * d * x * x);
805 x = x * (1.5 - 0.5 * d * x * x);
806 x = x * (1.5 - 0.5 * d * x * x);
807
808 // You can change xfma to fma if fma is correctly implemented
809 x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x;
810
811 return d == librtprocess::RT_INFINITY ? librtprocess::RT_INFINITY : x * q;
812 }
813
xcbrt(double d)814 __inline double xcbrt(double d) { // max error : 2 ulps
815 double x, y, q = 1.0;
816 int e, r;
817
818 e = ilogbp1(d);
819 d = ldexpk(d, -e);
820 r = (e + 6144) % 3;
821 q = (r == 1) ? 1.2599210498948731647672106 : q;
822 q = (r == 2) ? 1.5874010519681994747517056 : q;
823 q = ldexpk(q, (e + 6144) / 3 - 2048);
824
825 q = mulsign(q, d);
826 d = xfabs(d);
827
828 x = -0.640245898480692909870982;
829 x = x * d + 2.96155103020039511818595;
830 x = x * d + -5.73353060922947843636166;
831 x = x * d + 6.03990368989458747961407;
832 x = x * d + -3.85841935510444988821632;
833 x = x * d + 2.2307275302496609725722;
834
835 y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
836 y = d * x * x;
837 y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
838
839 return y;
840 }
841
xexp2(double a)842 __inline double xexp2(double a) {
843 double u = expk(mul_ds(dd(0.69314718055994528623, 2.3190468138462995584e-17), a));
844 if (xispinf(a)) u = librtprocess::RT_INFINITY;
845 if (xisminf(a)) u = 0;
846 return u;
847 }
848
xexp10(double a)849 __inline double xexp10(double a) {
850 double u = expk(mul_ds(dd(2.3025850929940459011, -2.1707562233822493508e-16), a));
851 if (xispinf(a)) u = librtprocess::RT_INFINITY;
852 if (xisminf(a)) u = 0;
853 return u;
854 }
855
xexpm1(double a)856 __inline double xexpm1(double a) {
857 double2 d = add2_ds(expk2(dd(a, 0)), -1.0);
858 double x = d.x + d.y;
859 if (xispinf(a)) x = librtprocess::RT_INFINITY;
860 if (xisminf(a)) x = -1;
861 return x;
862 }
863
xlog10(double a)864 __inline double xlog10(double a) {
865 double2 d = mul_dd(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17));
866 double x = d.x + d.y;
867
868 if (xisinf(a)) x = librtprocess::RT_INFINITY;
869 if (a < 0) x = librtprocess::RT_NAN;
870 if (a == 0) x = -librtprocess::RT_INFINITY;
871
872 return x;
873 }
874
xlog1p(double a)875 __inline double xlog1p(double a) {
876 double2 d = logk2(add2_ss(a, 1));
877 double x = d.x + d.y;
878
879 if (xisinf(a)) x = librtprocess::RT_INFINITY;
880 if (a < -1) x = librtprocess::RT_NAN;
881 if (a == -1) x = -librtprocess::RT_INFINITY;
882
883 return x;
884 }
885
886 ///////////////////////////////////////////
887
888 #define PI4_Af 0.78515625f
889 #define PI4_Bf 0.00024127960205078125f
890 #define PI4_Cf 6.3329935073852539062e-07f
891 #define PI4_Df 4.9604681473525147339e-10f
892
893 #define L2Uf 0.693145751953125f
894 #define L2Lf 1.428606765330187045e-06f
895
896 #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
897
floatToRawIntBits(float d)898 __inline int32_t floatToRawIntBits(float d) {
899 union {
900 float f;
901 int32_t i;
902 } tmp;
903 tmp.f = d;
904 return tmp.i;
905 }
906
intBitsToFloat(int32_t i)907 __inline float intBitsToFloat(int32_t i) {
908 union {
909 float f;
910 int32_t i;
911 } tmp;
912 tmp.i = i;
913 return tmp.f;
914 }
915
xfabsf(float x)916 __inline float xfabsf(float x) {
917 return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));
918 }
919
mulsignf(float x,float y)920 __inline float mulsignf(float x, float y) {
921 return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));
922 }
923
signf(float d)924 __inline float signf(float d) { return copysign(1, d); }
mlaf(float x,float y,float z)925 __inline float mlaf(float x, float y, float z) { return x * y + z; }
926
xisnanf(float x)927 __inline int xisnanf(float x) { return x != x; }
xisinff(float x)928 __inline int xisinff(float x) { return x == librtprocess::RT_INFINITY_F || x == -librtprocess::RT_INFINITY_F; }
xisminff(float x)929 __inline int xisminff(float x) { return x == -librtprocess::RT_INFINITY_F; }
xispinff(float x)930 __inline int xispinff(float x) { return x == librtprocess::RT_INFINITY_F; }
931
ilogbp1f(float d)932 __inline int ilogbp1f(float d) {
933 int m = d < 5.421010862427522E-20f;
934 d = m ? 1.8446744073709552E19f * d : d;
935 int q = (floatToRawIntBits(d) >> 23) & 0xff;
936 q = m ? q - (64 + 0x7e) : q - 0x7e;
937 return q;
938 }
939
ldexpkf(float x,int q)940 __inline float ldexpkf(float x, int q) {
941 float u;
942 int m;
943 m = q >> 31;
944 m = (((m + q) >> 6) - m) << 4;
945 q = q - (m << 2);
946 u = intBitsToFloat(((int32_t)(m + 0x7f)) << 23);
947 u = u * u;
948 x = x * u * u;
949 u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
950 return x * u;
951 }
952
xcbrtf(float d)953 __inline float xcbrtf(float d) { // max error : 2 ulps
954 float x, y, q = 1.0f;
955 int e, r;
956
957 e = ilogbp1f(d);
958 d = ldexpkf(d, -e);
959 r = (e + 6144) % 3;
960 q = (r == 1) ? 1.2599210498948731647672106f : q;
961 q = (r == 2) ? 1.5874010519681994747517056f : q;
962 q = ldexpkf(q, (e + 6144) / 3 - 2048);
963
964 q = mulsignf(q, d);
965 d = xfabsf(d);
966
967 x = -0.601564466953277587890625f;
968 x = mlaf(x, d, 2.8208892345428466796875f);
969 x = mlaf(x, d, -5.532182216644287109375f);
970 x = mlaf(x, d, 5.898262500762939453125f);
971 x = mlaf(x, d, -3.8095417022705078125f);
972 x = mlaf(x, d, 2.2241256237030029296875f);
973
974 y = d * x * x;
975 y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;
976
977 return y;
978 }
979
xsinf(float d)980 __inline float xsinf(float d) {
981 int q;
982 float u, s;
983
984 q = rint(d * librtprocess::RT_1_PI_F);
985
986 d = mlaf(q, -PI4_Af*4, d);
987 d = mlaf(q, -PI4_Bf*4, d);
988 d = mlaf(q, -PI4_Cf*4, d);
989 d = mlaf(q, -PI4_Df*4, d);
990
991 s = d * d;
992
993 if ((q & 1) != 0) d = -d;
994
995 u = 2.6083159809786593541503e-06f;
996 u = mlaf(u, s, -0.0001981069071916863322258f);
997 u = mlaf(u, s, 0.00833307858556509017944336f);
998 u = mlaf(u, s, -0.166666597127914428710938f);
999
1000 u = mlaf(s, u * d, d);
1001
1002 return u;
1003 }
1004
xcosf(float d)1005 __inline float xcosf(float d) {
1006 #ifdef __SSE2__
1007 // faster than scalar version
1008 return xcosf(_mm_set_ss(d))[0];
1009 #else
1010 int q;
1011 float u, s;
1012
1013 q = 1 + 2*rint(d * librtprocess::RT_1_PI_F - 0.5f);
1014
1015 d = mlaf(q, -PI4_Af*2, d);
1016 d = mlaf(q, -PI4_Bf*2, d);
1017 d = mlaf(q, -PI4_Cf*2, d);
1018 d = mlaf(q, -PI4_Df*2, d);
1019
1020 s = d * d;
1021
1022 if ((q & 2) == 0) d = -d;
1023
1024 u = 2.6083159809786593541503e-06f;
1025 u = mlaf(u, s, -0.0001981069071916863322258f);
1026 u = mlaf(u, s, 0.00833307858556509017944336f);
1027 u = mlaf(u, s, -0.166666597127914428710938f);
1028
1029 u = mlaf(s, u * d, d);
1030
1031 return u;
1032 #endif
1033 }
1034
xsincosf(float d)1035 __inline float2 xsincosf(float d) {
1036 #ifdef __SSE2__
1037 // faster than scalar version
1038 vfloat2 res = xsincosf(_mm_set_ss(d));
1039 return {res.x[0], res.y[0]};
1040 #else
1041 int q;
1042 float u, s, t;
1043 float2 r;
1044
1045 q = rint(d * librtprocess::RT_2_PI_F);
1046
1047 s = d;
1048
1049 s = mlaf(q, -PI4_Af*2, s);
1050 s = mlaf(q, -PI4_Bf*2, s);
1051 s = mlaf(q, -PI4_Cf*2, s);
1052 s = mlaf(q, -PI4_Df*2, s);
1053
1054 t = s;
1055
1056 s = s * s;
1057
1058 u = -0.000195169282960705459117889f;
1059 u = mlaf(u, s, 0.00833215750753879547119141f);
1060 u = mlaf(u, s, -0.166666537523269653320312f);
1061 u = u * s * t;
1062
1063 r.x = t + u;
1064
1065 u = -2.71811842367242206819355e-07f;
1066 u = mlaf(u, s, 2.47990446951007470488548e-05f);
1067 u = mlaf(u, s, -0.00138888787478208541870117f);
1068 u = mlaf(u, s, 0.0416666641831398010253906f);
1069 u = mlaf(u, s, -0.5f);
1070
1071 r.y = u * s + 1;
1072
1073 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
1074 if ((q & 2) != 0) { r.x = -r.x; }
1075 if (((q+1) & 2) != 0) { r.y = -r.y; }
1076
1077 if (xisinff(d)) { r.x = r.y = librtprocess::RT_NAN_F; }
1078
1079 return r;
1080 #endif
1081 }
1082
xtanf(float d)1083 __inline float xtanf(float d) {
1084 int q;
1085 float u, s, x;
1086
1087 q = rint(d * (float)(2 * librtprocess::RT_1_PI));
1088
1089 x = d;
1090
1091 x = mlaf(q, -PI4_Af*2, x);
1092 x = mlaf(q, -PI4_Bf*2, x);
1093 x = mlaf(q, -PI4_Cf*2, x);
1094 x = mlaf(q, -PI4_Df*2, x);
1095
1096 s = x * x;
1097
1098 if ((q & 1) != 0) x = -x;
1099
1100 u = 0.00927245803177356719970703f;
1101 u = mlaf(u, s, 0.00331984995864331722259521f);
1102 u = mlaf(u, s, 0.0242998078465461730957031f);
1103 u = mlaf(u, s, 0.0534495301544666290283203f);
1104 u = mlaf(u, s, 0.133383005857467651367188f);
1105 u = mlaf(u, s, 0.333331853151321411132812f);
1106
1107 u = mlaf(s, u * x, x);
1108
1109 if ((q & 1) != 0) u = 1.0f / u;
1110
1111 if (xisinff(d)) u = librtprocess::RT_NAN_F;
1112
1113 return u;
1114 }
1115
xatanf(float s)1116 __inline float xatanf(float s) {
1117 float t, u;
1118 int q = 0;
1119
1120 if (s < 0) { s = -s; q = 2; }
1121 if (s > 1) { s = 1.0f / s; q |= 1; }
1122
1123 t = s * s;
1124
1125 u = 0.00282363896258175373077393f;
1126 u = mlaf(u, t, -0.0159569028764963150024414f);
1127 u = mlaf(u, t, 0.0425049886107444763183594f);
1128 u = mlaf(u, t, -0.0748900920152664184570312f);
1129 u = mlaf(u, t, 0.106347933411598205566406f);
1130 u = mlaf(u, t, -0.142027363181114196777344f);
1131 u = mlaf(u, t, 0.199926957488059997558594f);
1132 u = mlaf(u, t, -0.333331018686294555664062f);
1133
1134 t = s + s * (t * u);
1135
1136 if ((q & 1) != 0) t = 1.570796326794896557998982f - t;
1137 if ((q & 2) != 0) t = -t;
1138
1139 return t;
1140 }
1141
atan2kf(float y,float x)1142 __inline float atan2kf(float y, float x) {
1143 float s, t, u;
1144 float q = 0.f;
1145
1146 if (x < 0) { x = -x; q = -2.f; }
1147 if (y > x) { t = x; x = y; y = -t; q += 1.f; }
1148
1149 s = y / x;
1150 t = s * s;
1151
1152 u = 0.00282363896258175373077393f;
1153 u = mlaf(u, t, -0.0159569028764963150024414f);
1154 u = mlaf(u, t, 0.0425049886107444763183594f);
1155 u = mlaf(u, t, -0.0748900920152664184570312f);
1156 u = mlaf(u, t, 0.106347933411598205566406f);
1157 u = mlaf(u, t, -0.142027363181114196777344f);
1158 u = mlaf(u, t, 0.199926957488059997558594f);
1159 u = mlaf(u, t, -0.333331018686294555664062f);
1160
1161 t = u * t;
1162 t = mlaf(t,s,s);
1163 return mlaf(q,(float)(librtprocess::RT_PI_F_2),t);
1164 }
1165
xatan2f(float y,float x)1166 __inline float xatan2f(float y, float x) {
1167 float r = atan2kf(xfabsf(y), x);
1168
1169 r = mulsignf(r, x);
1170 if (xisinff(x) || x == 0) r = librtprocess::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(librtprocess::RT_PI_F*.5f)) : 0);
1171 if (xisinff(y) ) r = librtprocess::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(librtprocess::RT_PI_F*.25f)) : 0);
1172 if ( y == 0) r = (signf(x) == -1 ? librtprocess::RT_PI_F : 0);
1173
1174 return xisnanf(x) || xisnanf(y) ? librtprocess::RT_NAN_F : mulsignf(r, y);
1175 }
1176
xasinf(float d)1177 __inline float xasinf(float d) {
1178 return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d);
1179 }
1180
xacosf(float d)1181 __inline float xacosf(float d) {
1182 return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? (float)librtprocess::RT_PI : 0.0f);
1183 }
1184
xlogf(float d)1185 __inline float xlogf(float d) {
1186 float x, x2, t, m;
1187 int e;
1188
1189 e = ilogbp1f(d * 0.7071f);
1190 m = ldexpkf(d, -e);
1191
1192 x = (m-1.0f) / (m+1.0f);
1193 x2 = x * x;
1194
1195 t = 0.2371599674224853515625f;
1196 t = mlaf(t, x2, 0.285279005765914916992188f);
1197 t = mlaf(t, x2, 0.400005519390106201171875f);
1198 t = mlaf(t, x2, 0.666666567325592041015625f);
1199 t = mlaf(t, x2, 2.0f);
1200
1201 x = x * t + 0.693147180559945286226764f * e;
1202
1203 if (xisinff(d)) x = librtprocess::RT_INFINITY_F;
1204 if (d < 0) x = librtprocess::RT_NAN_F;
1205 if (d == 0) x = -librtprocess::RT_INFINITY_F;
1206
1207 return x;
1208 }
1209
xexpf(float d)1210 __inline float xexpf(float d) {
1211 if(d<=-104.0f) return 0.0f;
1212
1213 int q = rint(d * R_LN2f);
1214 float s, u;
1215
1216 s = mlaf(q, -L2Uf, d);
1217 s = mlaf(q, -L2Lf, s);
1218
1219 u = 0.00136324646882712841033936f;
1220 u = mlaf(u, s, 0.00836596917361021041870117f);
1221 u = mlaf(u, s, 0.0416710823774337768554688f);
1222 u = mlaf(u, s, 0.166665524244308471679688f);
1223 u = mlaf(u, s, 0.499999850988388061523438f);
1224
1225 u = mlaf( s, mlaf(s,u,1.f),1.f);
1226 return ldexpkf(u, q);
1227
1228 }
1229
xmul2f(float d)1230 __inline float xmul2f(float d) {
1231 union {
1232 float floatval;
1233 int intval;
1234 } uflint;
1235 uflint.floatval = d;
1236 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1237 uflint.intval += 1 << 23; // add 1 to the exponent
1238 }
1239 return uflint.floatval;
1240 }
1241
xdiv2f(float d)1242 __inline float xdiv2f(float d) {
1243 union {
1244 float floatval;
1245 int intval;
1246 } uflint;
1247 uflint.floatval = d;
1248 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1249 uflint.intval -= 1 << 23; // sub 1 from the exponent
1250 }
1251 return uflint.floatval;
1252 }
1253
xdivf(float d,int n)1254 __inline float xdivf( float d, int n){
1255 union {
1256 float floatval;
1257 int intval;
1258 } uflint;
1259 uflint.floatval = d;
1260 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1261 uflint.intval -= n << 23; // add n to the exponent
1262 }
1263 return uflint.floatval;
1264 }
1265
1266
1267
1268 #endif
1269