1 ////////////////////////////////////////////////////////////////
2 //
3 // this code was taken from http://shibatch.sourceforge.net/
4 // Many thanks to the author of original version: Naoki Shibata
5 //
6 // This version contains modifications made by Ingo Weyrich
7 //
8 ////////////////////////////////////////////////////////////////
9
10 #ifndef _SLEEFC_
11 #define _SLEEFC_
12
13 #include <assert.h>
14 #include <stdint.h>
15 //#include <math.h>
16 #include "rt_math.h"
17 //#include <bits/nan.h>
18 //#include <bits/inf.h>
19
20 #define PI4_A .7853981554508209228515625
21 #define PI4_B .794662735614792836713604629039764404296875e-8
22 #define PI4_C .306161699786838294306516483068750264552437361480769e-16
23 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
24
25 #define L2U .69314718055966295651160180568695068359375
26 #define L2L .28235290563031577122588448175013436025525412068e-12
27 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
28
doubleToRawLongBits(double d)29 __inline int64_t doubleToRawLongBits(double d) {
30 union {
31 double f;
32 int64_t i;
33 } tmp;
34 tmp.f = d;
35 return tmp.i;
36 }
37
longBitsToDouble(int64_t i)38 __inline double longBitsToDouble(int64_t i) {
39 union {
40 double f;
41 int64_t i;
42 } tmp;
43 tmp.i = i;
44 return tmp.f;
45 }
46
xfabs(double x)47 __inline double xfabs(double x) {
48 return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x));
49 }
50
mulsign(double x,double y)51 __inline double mulsign(double x, double y) {
52 return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63)));
53 }
54
sign(double d)55 __inline double sign(double d) { return mulsign(1, d); }
mla(double x,double y,double z)56 __inline double mla(double x, double y, double z) { return x * y + z; }
xrint(double x)57 __inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
58
xisnan(double x)59 __inline int xisnan(double x) { return x != x; }
xisinf(double x)60 __inline int xisinf(double x) { return x == lhdrengine::RT_INFINITY || x == -lhdrengine::RT_INFINITY; }
xisminf(double x)61 __inline int xisminf(double x) { return x == -lhdrengine::RT_INFINITY; }
xispinf(double x)62 __inline int xispinf(double x) { return x == lhdrengine::RT_INFINITY; }
63
ldexpk(double x,int q)64 __inline double ldexpk(double x, int q) {
65 double u;
66 int m;
67 m = q >> 31;
68 m = (((m + q) >> 9) - m) << 7;
69 q = q - (m << 2);
70 u = longBitsToDouble(((int64_t)(m + 0x3ff)) << 52);
71 double u2 = u*u;
72 u2 = u2 * u2;
73 x = x * u2;
74 u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
75 return x * u;
76 }
77
xldexp(double x,int q)78 __inline double xldexp(double x, int q) { return ldexpk(x, q); }
79
ilogbp1(double d)80 __inline int ilogbp1(double d) {
81 int m = d < 4.9090934652977266E-91;
82 d = m ? 2.037035976334486E90 * d : d;
83 int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
84 q = m ? q - (300 + 0x03fe) : q - 0x03fe;
85 return q;
86 }
87
xilogb(double d)88 __inline int xilogb(double d) {
89 int e = ilogbp1(xfabs(d)) - 1;
90 e = d == 0 ? (-2147483647 - 1) : e;
91 e = d == lhdrengine::RT_INFINITY || d == -lhdrengine::RT_INFINITY ? 2147483647 : e;
92 return e;
93 }
94
upper(double d)95 __inline double upper(double d) {
96 return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
97 }
98
99 typedef struct {
100 double x, y;
101 } double2;
102
103 typedef struct {
104 float x, y;
105 } float2;
106
dd(double h,double l)107 __inline double2 dd(double h, double l) {
108 double2 ret;
109 ret.x = h; ret.y = l;
110 return ret;
111 }
112
normalize_d(double2 t)113 __inline double2 normalize_d(double2 t) {
114 double2 s;
115
116 s.x = t.x + t.y;
117 s.y = t.x - s.x + t.y;
118
119 return s;
120 }
121
scale_d(double2 d,double s)122 __inline double2 scale_d(double2 d, double s) {
123 double2 r;
124
125 r.x = d.x * s;
126 r.y = d.y * s;
127
128 return r;
129 }
130
add2_ss(double x,double y)131 __inline double2 add2_ss(double x, double y) {
132 double2 r;
133
134 r.x = x + y;
135 double v = r.x - x;
136 r.y = (x - (r.x - v)) + (y - v);
137
138 return r;
139 }
140
add_ds(double2 x,double y)141 __inline double2 add_ds(double2 x, double y) {
142 // |x| >= |y|
143
144 double2 r;
145
146 assert(xisnan(x.x) || xisnan(y) || xfabs(x.x) >= xfabs(y));
147
148 r.x = x.x + y;
149 r.y = x.x - r.x + y + x.y;
150
151 return r;
152 }
153
add2_ds(double2 x,double y)154 __inline double2 add2_ds(double2 x, double y) {
155 // |x| >= |y|
156
157 double2 r;
158
159 r.x = x.x + y;
160 double v = r.x - x.x;
161 r.y = (x.x - (r.x - v)) + (y - v);
162 r.y += x.y;
163
164 return r;
165 }
166
add_sd(double x,double2 y)167 __inline double2 add_sd(double x, double2 y) {
168 // |x| >= |y|
169
170 double2 r;
171
172 assert(xisnan(x) || xisnan(y.x) || xfabs(x) >= xfabs(y.x));
173
174 r.x = x + y.x;
175 r.y = x - r.x + y.x + y.y;
176
177 return r;
178 }
179
add_dd(double2 x,double2 y)180 __inline double2 add_dd(double2 x, double2 y) {
181 // |x| >= |y|
182
183 double2 r;
184
185 assert(xisnan(x.x) || xisnan(y.x) || xfabs(x.x) >= xfabs(y.x));
186
187 r.x = x.x + y.x;
188 r.y = x.x - r.x + y.x + x.y + y.y;
189
190 return r;
191 }
192
add2_dd(double2 x,double2 y)193 __inline double2 add2_dd(double2 x, double2 y) {
194 double2 r;
195
196 r.x = x.x + y.x;
197 double v = r.x - x.x;
198 r.y = (x.x - (r.x - v)) + (y.x - v);
199 r.y += x.y + y.y;
200
201 return r;
202 }
203
div_dd(double2 n,double2 d)204 __inline double2 div_dd(double2 n, double2 d) {
205 double t = 1.0 / d.x;
206 double dh = upper(d.x), dl = d.x - dh;
207 double th = upper(t ), tl = t - th;
208 double nhh = upper(n.x), nhl = n.x - nhh;
209
210 double2 q;
211
212 q.x = n.x * t;
213
214 double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
215 q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
216
217 q.y = t * (n.y - q.x * d.y) + u;
218
219 return q;
220 }
221
mul_ss(double x,double y)222 __inline double2 mul_ss(double x, double y) {
223 double xh = upper(x), xl = x - xh;
224 double yh = upper(y), yl = y - yh;
225 double2 r;
226
227 r.x = x * y;
228 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
229
230 return r;
231 }
232
mul_ds(double2 x,double y)233 __inline double2 mul_ds(double2 x, double y) {
234 double xh = upper(x.x), xl = x.x - xh;
235 double yh = upper(y ), yl = y - yh;
236 double2 r;
237
238 r.x = x.x * y;
239 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
240
241 return r;
242 }
243
mul_dd(double2 x,double2 y)244 __inline double2 mul_dd(double2 x, double2 y) {
245 double xh = upper(x.x), xl = x.x - xh;
246 double yh = upper(y.x), yl = y.x - yh;
247 double2 r;
248
249 r.x = x.x * y.x;
250 r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
251
252 return r;
253 }
254
squ_d(double2 x)255 __inline double2 squ_d(double2 x) {
256 double xh = upper(x.x), xl = x.x - xh;
257 double2 r;
258
259 r.x = x.x * x.x;
260 r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
261
262 return r;
263 }
264
rec_s(double d)265 __inline double2 rec_s(double d) {
266 double t = 1.0 / d;
267 double dh = upper(d), dl = d - dh;
268 double th = upper(t), tl = t - th;
269 double2 q;
270
271 q.x = t;
272 q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
273
274 return q;
275 }
276
sqrt_d(double2 d)277 __inline double2 sqrt_d(double2 d) {
278 double t = sqrt(d.x + d.y);
279 return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), 0.5);
280 }
281
atan2k(double y,double x)282 __inline double atan2k(double y, double x) {
283 double s, t, u;
284 int q = 0;
285
286 if (x < 0) { x = -x; q = -2; }
287 if (y > x) { t = x; x = y; y = -t; q += 1; }
288
289 s = y / x;
290 t = s * s;
291
292 u = -1.88796008463073496563746e-05;
293 u = u * t + (0.000209850076645816976906797);
294 u = u * t + (-0.00110611831486672482563471);
295 u = u * t + (0.00370026744188713119232403);
296 u = u * t + (-0.00889896195887655491740809);
297 u = u * t + (0.016599329773529201970117);
298 u = u * t + (-0.0254517624932312641616861);
299 u = u * t + (0.0337852580001353069993897);
300 u = u * t + (-0.0407629191276836500001934);
301 u = u * t + (0.0466667150077840625632675);
302 u = u * t + (-0.0523674852303482457616113);
303 u = u * t + (0.0587666392926673580854313);
304 u = u * t + (-0.0666573579361080525984562);
305 u = u * t + (0.0769219538311769618355029);
306 u = u * t + (-0.090908995008245008229153);
307 u = u * t + (0.111111105648261418443745);
308 u = u * t + (-0.14285714266771329383765);
309 u = u * t + (0.199999999996591265594148);
310 u = u * t + (-0.333333333333311110369124);
311
312 t = u * t * s + s;
313 t = q * (lhdrengine::RT_PI_2) + t;
314
315 return t;
316 }
317
xatan2(double y,double x)318 __inline double xatan2(double y, double x) {
319 double r = atan2k(xfabs(y), x);
320
321 r = mulsign(r, x);
322 if (xisinf(x) || x == 0) r = lhdrengine::RT_PI_2 - (xisinf(x) ? (sign(x) * (lhdrengine::RT_PI_2)) : 0);
323 if (xisinf(y) ) r = lhdrengine::RT_PI_2 - (xisinf(x) ? (sign(x) * (lhdrengine::RT_PI*1/4)) : 0);
324 if ( y == 0) r = (sign(x) == -1 ? lhdrengine::RT_PI : 0);
325
326 return xisnan(x) || xisnan(y) ? lhdrengine::RT_NAN : mulsign(r, y);
327 }
328
xasin(double d)329 __inline double xasin(double d) {
330 return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d);
331 }
332
xacos(double d)333 __inline double xacos(double d) {
334 return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? lhdrengine::RT_PI : 0);
335 }
336
xatan(double s)337 __inline double xatan(double s) {
338 double t, u;
339 int q = 0;
340
341 if (s < 0) { s = -s; q = 2; }
342 if (s > 1) { s = 1.0 / s; q |= 1; }
343
344 t = s * s;
345
346 u = -1.88796008463073496563746e-05;
347 u = u * t + (0.000209850076645816976906797);
348 u = u * t + (-0.00110611831486672482563471);
349 u = u * t + (0.00370026744188713119232403);
350 u = u * t + (-0.00889896195887655491740809);
351 u = u * t + (0.016599329773529201970117);
352 u = u * t + (-0.0254517624932312641616861);
353 u = u * t + (0.0337852580001353069993897);
354 u = u * t + (-0.0407629191276836500001934);
355 u = u * t + (0.0466667150077840625632675);
356 u = u * t + (-0.0523674852303482457616113);
357 u = u * t + (0.0587666392926673580854313);
358 u = u * t + (-0.0666573579361080525984562);
359 u = u * t + (0.0769219538311769618355029);
360 u = u * t + (-0.090908995008245008229153);
361 u = u * t + (0.111111105648261418443745);
362 u = u * t + (-0.14285714266771329383765);
363 u = u * t + (0.199999999996591265594148);
364 u = u * t + (-0.333333333333311110369124);
365
366 t = s + s * (t * u);
367
368 if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
369 if ((q & 2) != 0) t = -t;
370
371 return t;
372 }
373
xsin(double d)374 __inline double xsin(double d) {
375 int q;
376 double u, s;
377
378 q = (int)xrint(d * lhdrengine::RT_1_PI);
379
380 d = mla(q, -PI4_A*4, d);
381 d = mla(q, -PI4_B*4, d);
382 d = mla(q, -PI4_C*4, d);
383
384 s = d * d;
385
386 if ((q & 1) != 0) d = -d;
387
388 u = -7.97255955009037868891952e-18;
389 u = mla(u, s, 2.81009972710863200091251e-15);
390 u = mla(u, s, -7.64712219118158833288484e-13);
391 u = mla(u, s, 1.60590430605664501629054e-10);
392 u = mla(u, s, -2.50521083763502045810755e-08);
393 u = mla(u, s, 2.75573192239198747630416e-06);
394 u = mla(u, s, -0.000198412698412696162806809);
395 u = mla(u, s, 0.00833333333333332974823815);
396 u = mla(u, s, -0.166666666666666657414808);
397
398 u = mla(s, u * d, d);
399
400 return u;
401 }
402
xcos(double d)403 __inline double xcos(double d) {
404 int q;
405 double u, s;
406
407 q = 1 + 2*(int)xrint(d * lhdrengine::RT_1_PI - 0.5);
408
409 d = mla(q, -PI4_A*2, d);
410 d = mla(q, -PI4_B*2, d);
411 d = mla(q, -PI4_C*2, d);
412
413 s = d * d;
414
415 if ((q & 2) == 0) d = -d;
416
417 u = -7.97255955009037868891952e-18;
418 u = mla(u, s, 2.81009972710863200091251e-15);
419 u = mla(u, s, -7.64712219118158833288484e-13);
420 u = mla(u, s, 1.60590430605664501629054e-10);
421 u = mla(u, s, -2.50521083763502045810755e-08);
422 u = mla(u, s, 2.75573192239198747630416e-06);
423 u = mla(u, s, -0.000198412698412696162806809);
424 u = mla(u, s, 0.00833333333333332974823815);
425 u = mla(u, s, -0.166666666666666657414808);
426
427 u = mla(s, u * d, d);
428
429 return u;
430 }
431
xsincos(double d)432 __inline double2 xsincos(double d) {
433 int q;
434 double u, s, t;
435 double2 r;
436
437 q = (int)xrint(d * (2 * lhdrengine::RT_1_PI));
438
439 s = d;
440
441 s = mla(-q, PI4_A*2, s);
442 s = mla(-q, PI4_B*2, s);
443 s = mla(-q, PI4_C*2, s);
444
445 t = s;
446
447 s = s * s;
448
449 u = 1.58938307283228937328511e-10;
450 u = mla(u, s, -2.50506943502539773349318e-08);
451 u = mla(u, s, 2.75573131776846360512547e-06);
452 u = mla(u, s, -0.000198412698278911770864914);
453 u = mla(u, s, 0.0083333333333191845961746);
454 u = mla(u, s, -0.166666666666666130709393);
455 u = u * s * t;
456
457 r.x = t + u;
458
459 u = -1.13615350239097429531523e-11;
460 u = mla(u, s, 2.08757471207040055479366e-09);
461 u = mla(u, s, -2.75573144028847567498567e-07);
462 u = mla(u, s, 2.48015872890001867311915e-05);
463 u = mla(u, s, -0.00138888888888714019282329);
464 u = mla(u, s, 0.0416666666666665519592062);
465 u = mla(u, s, -0.5);
466
467 r.y = u * s + 1;
468
469 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
470 if ((q & 2) != 0) { r.x = -r.x; }
471 if (((q+1) & 2) != 0) { r.y = -r.y; }
472
473 if (xisinf(d)) { r.x = r.y = lhdrengine::RT_NAN; }
474
475 return r;
476 }
477
xtan(double d)478 __inline double xtan(double d) {
479 int q;
480 double u, s, x;
481
482 q = (int)xrint(d * (2 * lhdrengine::RT_1_PI));
483
484 x = mla(q, -PI4_A*2, d);
485 x = mla(q, -PI4_B*2, x);
486 x = mla(q, -PI4_C*2, x);
487
488 s = x * x;
489
490 if ((q & 1) != 0) x = -x;
491
492 u = 1.01419718511083373224408e-05;
493 u = mla(u, s, -2.59519791585924697698614e-05);
494 u = mla(u, s, 5.23388081915899855325186e-05);
495 u = mla(u, s, -3.05033014433946488225616e-05);
496 u = mla(u, s, 7.14707504084242744267497e-05);
497 u = mla(u, s, 8.09674518280159187045078e-05);
498 u = mla(u, s, 0.000244884931879331847054404);
499 u = mla(u, s, 0.000588505168743587154904506);
500 u = mla(u, s, 0.00145612788922812427978848);
501 u = mla(u, s, 0.00359208743836906619142924);
502 u = mla(u, s, 0.00886323944362401618113356);
503 u = mla(u, s, 0.0218694882853846389592078);
504 u = mla(u, s, 0.0539682539781298417636002);
505 u = mla(u, s, 0.133333333333125941821962);
506 u = mla(u, s, 0.333333333333334980164153);
507
508 u = mla(s, u * x, x);
509
510 if ((q & 1) != 0) u = 1.0 / u;
511
512 if (xisinf(d)) u = lhdrengine::RT_NAN;
513
514 return u;
515 }
516
xlog(double d)517 __inline double xlog(double d) {
518 double x, x2, t, m;
519 int e;
520
521 e = ilogbp1(d * 0.7071);
522 m = ldexpk(d, -e);
523
524 x = (m-1) / (m+1);
525 x2 = x * x;
526
527 t = 0.148197055177935105296783;
528 t = mla(t, x2, 0.153108178020442575739679);
529 t = mla(t, x2, 0.181837339521549679055568);
530 t = mla(t, x2, 0.22222194152736701733275);
531 t = mla(t, x2, 0.285714288030134544449368);
532 t = mla(t, x2, 0.399999999989941956712869);
533 t = mla(t, x2, 0.666666666666685503450651);
534 t = mla(t, x2, 2);
535
536 x = x * t + 0.693147180559945286226764 * e;
537
538 if (xisinf(d)) x = lhdrengine::RT_INFINITY;
539 if (d < 0) x = lhdrengine::RT_NAN;
540 if (d == 0) x = -lhdrengine::RT_INFINITY;
541
542 return x;
543 }
544
xexp(double d)545 __inline double xexp(double d) {
546 int q = (int)xrint(d * R_LN2);
547 double s, u;
548
549 s = mla(q, -L2U, d);
550 s = mla(q, -L2L, s);
551
552 u = 2.08860621107283687536341e-09;
553 u = mla(u, s, 2.51112930892876518610661e-08);
554 u = mla(u, s, 2.75573911234900471893338e-07);
555 u = mla(u, s, 2.75572362911928827629423e-06);
556 u = mla(u, s, 2.4801587159235472998791e-05);
557 u = mla(u, s, 0.000198412698960509205564975);
558 u = mla(u, s, 0.00138888888889774492207962);
559 u = mla(u, s, 0.00833333333331652721664984);
560 u = mla(u, s, 0.0416666666666665047591422);
561 u = mla(u, s, 0.166666666666666851703837);
562 u = mla(u, s, 0.5);
563
564 u = s * s * u + s + 1;
565 u = ldexpk(u, q);
566
567 if (xisminf(d)) u = 0;
568
569 return u;
570 }
571
logk(double d)572 __inline double2 logk(double d) {
573 double2 x, x2;
574 double m, t;
575 int e;
576
577 e = ilogbp1(d * 0.7071);
578 m = ldexpk(d, -e);
579
580 x = div_dd(add2_ss(-1, m), add2_ss(1, m));
581 x2 = squ_d(x);
582
583 t = 0.134601987501262130076155;
584 t = mla(t, x2.x, 0.132248509032032670243288);
585 t = mla(t, x2.x, 0.153883458318096079652524);
586 t = mla(t, x2.x, 0.181817427573705403298686);
587 t = mla(t, x2.x, 0.222222231326187414840781);
588 t = mla(t, x2.x, 0.285714285651261412873718);
589 t = mla(t, x2.x, 0.400000000000222439910458);
590 t = mla(t, x2.x, 0.666666666666666371239645);
591
592 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
593 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
594 }
595
expk(double2 d)596 __inline double expk(double2 d) {
597 int q = (int)rint((d.x + d.y) * R_LN2);
598 double2 s, t;
599 double u;
600
601 s = add2_ds(d, q * -L2U);
602 s = add2_ds(s, q * -L2L);
603
604 s = normalize_d(s);
605
606 u = 2.51069683420950419527139e-08;
607 u = mla(u, s.x, 2.76286166770270649116855e-07);
608 u = mla(u, s.x, 2.75572496725023574143864e-06);
609 u = mla(u, s.x, 2.48014973989819794114153e-05);
610 u = mla(u, s.x, 0.000198412698809069797676111);
611 u = mla(u, s.x, 0.0013888888939977128960529);
612 u = mla(u, s.x, 0.00833333333332371417601081);
613 u = mla(u, s.x, 0.0416666666665409524128449);
614 u = mla(u, s.x, 0.166666666666666740681535);
615 u = mla(u, s.x, 0.500000000000000999200722);
616
617 t = add_dd(s, mul_ds(squ_d(s), u));
618
619 t = add_sd(1, t);
620 return ldexpk(t.x + t.y, q);
621 }
622
xpow(double x,double y)623 __inline double xpow(double x, double y) {
624 int yisint = (int)y == y;
625 int yisodd = (1 & (int)y) != 0 && yisint;
626
627 double result = expk(mul_ds(logk(xfabs(x)), y));
628
629 result = xisnan(result) ? lhdrengine::RT_INFINITY : result;
630 result *= (x >= 0 ? 1 : (!yisint ? lhdrengine::RT_NAN : (yisodd ? -1 : 1)));
631
632 double efx = mulsign(xfabs(x) - 1, y);
633 if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : lhdrengine::RT_INFINITY);
634 if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : lhdrengine::RT_INFINITY);
635 if (xisnan(x) || xisnan(y)) result = lhdrengine::RT_NAN;
636 if (y == 0 || x == 1) result = 1;
637
638 return result;
639 }
640
expk2(double2 d)641 __inline double2 expk2(double2 d) {
642 int q = (int)rint((d.x + d.y) * R_LN2);
643 double2 s, t;
644 double u;
645
646 s = add2_ds(d, q * -L2U);
647 s = add2_ds(s, q * -L2L);
648
649 s = normalize_d(s);
650
651 u = 2.51069683420950419527139e-08;
652 u = mla(u, s.x, 2.76286166770270649116855e-07);
653 u = mla(u, s.x, 2.75572496725023574143864e-06);
654 u = mla(u, s.x, 2.48014973989819794114153e-05);
655 u = mla(u, s.x, 0.000198412698809069797676111);
656 u = mla(u, s.x, 0.0013888888939977128960529);
657 u = mla(u, s.x, 0.00833333333332371417601081);
658 u = mla(u, s.x, 0.0416666666665409524128449);
659 u = mla(u, s.x, 0.166666666666666740681535);
660 u = mla(u, s.x, 0.500000000000000999200722);
661
662 t = add_dd(s, mul_ds(squ_d(s), u));
663
664 t = add_sd(1, t);
665 return dd(ldexpk(t.x, q), ldexpk(t.y, q));
666 }
667
xsinh(double x)668 __inline double xsinh(double x) {
669 double y = xfabs(x);
670 double2 d = expk2(dd(y, 0));
671 d = add2_dd(d, div_dd(dd(-1, 0), d));
672 y = (d.x + d.y) * 0.5;
673
674 y = xisinf(x) || xisnan(y) ? lhdrengine::RT_INFINITY : y;
675 y = mulsign(y, x);
676 y = xisnan(x) ? lhdrengine::RT_NAN : y;
677
678 return y;
679 }
680
xcosh(double x)681 __inline double xcosh(double x) {
682 double2 d = expk2(dd(x, 0));
683 d = add2_dd(d, div_dd(dd(1, 0), d));
684 double y = (d.x + d.y) * 0.5;
685
686 y = xisinf(x) || xisnan(y) ? lhdrengine::RT_INFINITY : y;
687 y = xisnan(x) ? lhdrengine::RT_NAN : y;
688
689 return y;
690 }
691
xtanh(double x)692 __inline double xtanh(double x) {
693 double y = xfabs(x);
694 double2 d = expk2(dd(y, 0));
695 double2 e = div_dd(dd(1, 0), d);
696 d = div_dd(add2_dd(d, scale_d(e, -1)), add2_dd(d, e));
697 y = d.x + d.y;
698
699 y = xisinf(x) || xisnan(y) ? 1.0 : y;
700 y = mulsign(y, x);
701 y = xisnan(x) ? lhdrengine::RT_NAN : y;
702
703 return y;
704 }
705
logk2(double2 d)706 __inline double2 logk2(double2 d) {
707 double2 x, x2, m;
708 double t;
709 int e;
710
711 d = normalize_d(d);
712 e = ilogbp1(d.x * 0.7071);
713 m = scale_d(d, ldexpk(1, -e));
714
715 x = div_dd(add2_ds(m, -1), add2_ds(m, 1));
716 x2 = squ_d(x);
717
718 t = 0.134601987501262130076155;
719 t = mla(t, x2.x, 0.132248509032032670243288);
720 t = mla(t, x2.x, 0.153883458318096079652524);
721 t = mla(t, x2.x, 0.181817427573705403298686);
722 t = mla(t, x2.x, 0.222222231326187414840781);
723 t = mla(t, x2.x, 0.285714285651261412873718);
724 t = mla(t, x2.x, 0.400000000000222439910458);
725 t = mla(t, x2.x, 0.666666666666666371239645);
726
727 return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
728 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t)));
729 }
730
xasinh(double x)731 __inline double xasinh(double x) {
732 double y = xfabs(x);
733 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y), 1)), y));
734 y = d.x + d.y;
735
736 y = xisinf(x) || xisnan(y) ? lhdrengine::RT_INFINITY : y;
737 y = mulsign(y, x);
738 y = xisnan(x) ? lhdrengine::RT_NAN : y;
739
740 return y;
741 }
742
xacosh(double x)743 __inline double xacosh(double x) {
744 double2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), -1)), x));
745 double y = d.x + d.y;
746
747 y = xisinf(x) || xisnan(y) ? lhdrengine::RT_INFINITY : y;
748 y = x == 1.0 ? 0.0 : y;
749 y = x < 1.0 ? lhdrengine::RT_NAN : y;
750 y = xisnan(x) ? lhdrengine::RT_NAN : y;
751
752 return y;
753 }
754
xatanh(double x)755 __inline double xatanh(double x) {
756 double y = xfabs(x);
757 double2 d = logk2(div_dd(add2_ss(1, y), add2_ss(1, -y)));
758 y = y > 1.0 ? lhdrengine::RT_NAN : (y == 1.0 ? lhdrengine::RT_INFINITY : (d.x + d.y) * 0.5);
759
760 y = xisinf(x) || xisnan(y) ? lhdrengine::RT_NAN : y;
761 y = mulsign(y, x);
762 y = xisnan(x) ? lhdrengine::RT_NAN : y;
763
764 return y;
765 }
766
767 //
768
xfma(double x,double y,double z)769 __inline double xfma(double x, double y, double z) {
770 union {
771 double f;
772 long long int i;
773 } tmp;
774
775 tmp.f = x;
776 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
777 double xh = tmp.f, xl = x - xh;
778
779 tmp.f = y;
780 tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL;
781 double yh = tmp.f, yl = y - yh;
782
783 double h = x * y;
784 double l = xh * yh - h + xl * yh + xh * yl + xl * yl;
785
786 double h2, l2, v;
787
788 h2 = h + z;
789 v = h2 - h;
790 l2 = (h - (h2 - v)) + (z - v) + l;
791
792 return h2 + l2;
793 }
794
xsqrt(double d)795 __inline double xsqrt(double d) { // max error : 0.5 ulp
796 double q = 1;
797
798 if (d < 8.636168555094445E-78) {
799 d *= 1.157920892373162E77;
800 q = 2.9387358770557188E-39;
801 }
802
803 // http://en.wikipedia.org/wiki/Fast_inverse_square_root
804 double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));
805
806 x = x * (1.5 - 0.5 * d * x * x);
807 x = x * (1.5 - 0.5 * d * x * x);
808 x = x * (1.5 - 0.5 * d * x * x);
809
810 // You can change xfma to fma if fma is correctly implemented
811 x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x;
812
813 return d == lhdrengine::RT_INFINITY ? lhdrengine::RT_INFINITY : x * q;
814 }
815
xcbrt(double d)816 __inline double xcbrt(double d) { // max error : 2 ulps
817 double x, y, q = 1.0;
818 int e, r;
819
820 e = ilogbp1(d);
821 d = ldexpk(d, -e);
822 r = (e + 6144) % 3;
823 q = (r == 1) ? 1.2599210498948731647672106 : q;
824 q = (r == 2) ? 1.5874010519681994747517056 : q;
825 q = ldexpk(q, (e + 6144) / 3 - 2048);
826
827 q = mulsign(q, d);
828 d = xfabs(d);
829
830 x = -0.640245898480692909870982;
831 x = x * d + 2.96155103020039511818595;
832 x = x * d + -5.73353060922947843636166;
833 x = x * d + 6.03990368989458747961407;
834 x = x * d + -3.85841935510444988821632;
835 x = x * d + 2.2307275302496609725722;
836
837 y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
838 y = d * x * x;
839 y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
840
841 return y;
842 }
843
xexp2(double a)844 __inline double xexp2(double a) {
845 double u = expk(mul_ds(dd(0.69314718055994528623, 2.3190468138462995584e-17), a));
846 if (xispinf(a)) u = lhdrengine::RT_INFINITY;
847 if (xisminf(a)) u = 0;
848 return u;
849 }
850
xexp10(double a)851 __inline double xexp10(double a) {
852 double u = expk(mul_ds(dd(2.3025850929940459011, -2.1707562233822493508e-16), a));
853 if (xispinf(a)) u = lhdrengine::RT_INFINITY;
854 if (xisminf(a)) u = 0;
855 return u;
856 }
857
xexpm1(double a)858 __inline double xexpm1(double a) {
859 double2 d = add2_ds(expk2(dd(a, 0)), -1.0);
860 double x = d.x + d.y;
861 if (xispinf(a)) x = lhdrengine::RT_INFINITY;
862 if (xisminf(a)) x = -1;
863 return x;
864 }
865
xlog10(double a)866 __inline double xlog10(double a) {
867 double2 d = mul_dd(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17));
868 double x = d.x + d.y;
869
870 if (xisinf(a)) x = lhdrengine::RT_INFINITY;
871 if (a < 0) x = lhdrengine::RT_NAN;
872 if (a == 0) x = -lhdrengine::RT_INFINITY;
873
874 return x;
875 }
876
xlog1p(double a)877 __inline double xlog1p(double a) {
878 double2 d = logk2(add2_ss(a, 1));
879 double x = d.x + d.y;
880
881 if (xisinf(a)) x = lhdrengine::RT_INFINITY;
882 if (a < -1) x = lhdrengine::RT_NAN;
883 if (a == -1) x = -lhdrengine::RT_INFINITY;
884
885 return x;
886 }
887
888 ///////////////////////////////////////////
889
890 #define PI4_Af 0.78515625f
891 #define PI4_Bf 0.00024127960205078125f
892 #define PI4_Cf 6.3329935073852539062e-07f
893 #define PI4_Df 4.9604681473525147339e-10f
894
895 #define L2Uf 0.693145751953125f
896 #define L2Lf 1.428606765330187045e-06f
897
898 #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
899
floatToRawIntBits(float d)900 __inline int32_t floatToRawIntBits(float d) {
901 union {
902 float f;
903 int32_t i;
904 } tmp;
905 tmp.f = d;
906 return tmp.i;
907 }
908
intBitsToFloat(int32_t i)909 __inline float intBitsToFloat(int32_t i) {
910 union {
911 float f;
912 int32_t i;
913 } tmp;
914 tmp.i = i;
915 return tmp.f;
916 }
917
xfabsf(float x)918 __inline float xfabsf(float x) {
919 return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));
920 }
921
mulsignf(float x,float y)922 __inline float mulsignf(float x, float y) {
923 return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));
924 }
925
signf(float d)926 __inline float signf(float d) { return mulsignf(1, d); }
mlaf(float x,float y,float z)927 __inline float mlaf(float x, float y, float z) { return x * y + z; }
xrintf(float x)928 __inline float xrintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); }
929
xisnanf(float x)930 __inline int xisnanf(float x) { return x != x; }
xisinff(float x)931 __inline int xisinff(float x) { return x == lhdrengine::RT_INFINITY_F || x == -lhdrengine::RT_INFINITY_F; }
xisminff(float x)932 __inline int xisminff(float x) { return x == -lhdrengine::RT_INFINITY_F; }
xispinff(float x)933 __inline int xispinff(float x) { return x == lhdrengine::RT_INFINITY_F; }
934
ilogbp1f(float d)935 __inline int ilogbp1f(float d) {
936 int m = d < 5.421010862427522E-20f;
937 d = m ? 1.8446744073709552E19f * d : d;
938 int q = (floatToRawIntBits(d) >> 23) & 0xff;
939 q = m ? q - (64 + 0x7e) : q - 0x7e;
940 return q;
941 }
942
ldexpkf(float x,int q)943 __inline float ldexpkf(float x, int q) {
944 float u;
945 int m;
946 m = q >> 31;
947 m = (((m + q) >> 6) - m) << 4;
948 q = q - (m << 2);
949 u = intBitsToFloat(((int32_t)(m + 0x7f)) << 23);
950 u = u * u;
951 x = x * u * u;
952 u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
953 return x * u;
954 }
955
xcbrtf(float d)956 __inline float xcbrtf(float d) { // max error : 2 ulps
957 float x, y, q = 1.0f;
958 int e, r;
959
960 e = ilogbp1f(d);
961 d = ldexpkf(d, -e);
962 r = (e + 6144) % 3;
963 q = (r == 1) ? 1.2599210498948731647672106f : q;
964 q = (r == 2) ? 1.5874010519681994747517056f : q;
965 q = ldexpkf(q, (e + 6144) / 3 - 2048);
966
967 q = mulsignf(q, d);
968 d = xfabsf(d);
969
970 x = -0.601564466953277587890625f;
971 x = mlaf(x, d, 2.8208892345428466796875f);
972 x = mlaf(x, d, -5.532182216644287109375f);
973 x = mlaf(x, d, 5.898262500762939453125f);
974 x = mlaf(x, d, -3.8095417022705078125f);
975 x = mlaf(x, d, 2.2241256237030029296875f);
976
977 y = d * x * x;
978 y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;
979
980 return y;
981 }
982
xsinf(float d)983 __inline float xsinf(float d) {
984 int q;
985 float u, s;
986
987 q = (int)xrintf(d * (float)lhdrengine::RT_1_PI);
988
989 d = mlaf(q, -PI4_Af*4, d);
990 d = mlaf(q, -PI4_Bf*4, d);
991 d = mlaf(q, -PI4_Cf*4, d);
992 d = mlaf(q, -PI4_Df*4, d);
993
994 s = d * d;
995
996 if ((q & 1) != 0) d = -d;
997
998 u = 2.6083159809786593541503e-06f;
999 u = mlaf(u, s, -0.0001981069071916863322258f);
1000 u = mlaf(u, s, 0.00833307858556509017944336f);
1001 u = mlaf(u, s, -0.166666597127914428710938f);
1002
1003 u = mlaf(s, u * d, d);
1004
1005 return u;
1006 }
1007
xcosf(float d)1008 __inline float xcosf(float d) {
1009 int q;
1010 float u, s;
1011
1012 q = 1 + 2*(int)xrintf(d * (float)lhdrengine::RT_1_PI - 0.5f);
1013
1014 d = mlaf(q, -PI4_Af*2, d);
1015 d = mlaf(q, -PI4_Bf*2, d);
1016 d = mlaf(q, -PI4_Cf*2, d);
1017 d = mlaf(q, -PI4_Df*2, d);
1018
1019 s = d * d;
1020
1021 if ((q & 2) == 0) d = -d;
1022
1023 u = 2.6083159809786593541503e-06f;
1024 u = mlaf(u, s, -0.0001981069071916863322258f);
1025 u = mlaf(u, s, 0.00833307858556509017944336f);
1026 u = mlaf(u, s, -0.166666597127914428710938f);
1027
1028 u = mlaf(s, u * d, d);
1029
1030 return u;
1031 }
1032
xsincosf(float d)1033 __inline float2 xsincosf(float d) {
1034 int q;
1035 float u, s, t;
1036 float2 r;
1037
1038 q = (int)rint(d * ((float)(2 * lhdrengine::RT_1_PI)));
1039
1040 s = d;
1041
1042 s = mlaf(q, -PI4_Af*2, s);
1043 s = mlaf(q, -PI4_Bf*2, s);
1044 s = mlaf(q, -PI4_Cf*2, s);
1045 s = mlaf(q, -PI4_Df*2, s);
1046
1047 t = s;
1048
1049 s = s * s;
1050
1051 u = -0.000195169282960705459117889f;
1052 u = mlaf(u, s, 0.00833215750753879547119141f);
1053 u = mlaf(u, s, -0.166666537523269653320312f);
1054 u = u * s * t;
1055
1056 r.x = t + u;
1057
1058 u = -2.71811842367242206819355e-07f;
1059 u = mlaf(u, s, 2.47990446951007470488548e-05f);
1060 u = mlaf(u, s, -0.00138888787478208541870117f);
1061 u = mlaf(u, s, 0.0416666641831398010253906f);
1062 u = mlaf(u, s, -0.5f);
1063
1064 r.y = u * s + 1;
1065
1066 if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
1067 if ((q & 2) != 0) { r.x = -r.x; }
1068 if (((q+1) & 2) != 0) { r.y = -r.y; }
1069
1070 if (xisinff(d)) { r.x = r.y = lhdrengine::RT_NAN_F; }
1071
1072 return r;
1073 }
1074
xtanf(float d)1075 __inline float xtanf(float d) {
1076 int q;
1077 float u, s, x;
1078
1079 q = (int)xrintf(d * (float)(2 * lhdrengine::RT_1_PI));
1080
1081 x = d;
1082
1083 x = mlaf(q, -PI4_Af*2, x);
1084 x = mlaf(q, -PI4_Bf*2, x);
1085 x = mlaf(q, -PI4_Cf*2, x);
1086 x = mlaf(q, -PI4_Df*2, x);
1087
1088 s = x * x;
1089
1090 if ((q & 1) != 0) x = -x;
1091
1092 u = 0.00927245803177356719970703f;
1093 u = mlaf(u, s, 0.00331984995864331722259521f);
1094 u = mlaf(u, s, 0.0242998078465461730957031f);
1095 u = mlaf(u, s, 0.0534495301544666290283203f);
1096 u = mlaf(u, s, 0.133383005857467651367188f);
1097 u = mlaf(u, s, 0.333331853151321411132812f);
1098
1099 u = mlaf(s, u * x, x);
1100
1101 if ((q & 1) != 0) u = 1.0f / u;
1102
1103 if (xisinff(d)) u = lhdrengine::RT_NAN_F;
1104
1105 return u;
1106 }
1107
xatanf(float s)1108 __inline float xatanf(float s) {
1109 float t, u;
1110 int q = 0;
1111
1112 if (s < 0) { s = -s; q = 2; }
1113 if (s > 1) { s = 1.0f / s; q |= 1; }
1114
1115 t = s * s;
1116
1117 u = 0.00282363896258175373077393f;
1118 u = mlaf(u, t, -0.0159569028764963150024414f);
1119 u = mlaf(u, t, 0.0425049886107444763183594f);
1120 u = mlaf(u, t, -0.0748900920152664184570312f);
1121 u = mlaf(u, t, 0.106347933411598205566406f);
1122 u = mlaf(u, t, -0.142027363181114196777344f);
1123 u = mlaf(u, t, 0.199926957488059997558594f);
1124 u = mlaf(u, t, -0.333331018686294555664062f);
1125
1126 t = s + s * (t * u);
1127
1128 if ((q & 1) != 0) t = 1.570796326794896557998982f - t;
1129 if ((q & 2) != 0) t = -t;
1130
1131 return t;
1132 }
1133
atan2kf(float y,float x)1134 __inline float atan2kf(float y, float x) {
1135 float s, t, u;
1136 float q = 0.f;
1137
1138 if (x < 0) { x = -x; q = -2.f; }
1139 if (y > x) { t = x; x = y; y = -t; q += 1.f; }
1140
1141 s = y / x;
1142 t = s * s;
1143
1144 u = 0.00282363896258175373077393f;
1145 u = mlaf(u, t, -0.0159569028764963150024414f);
1146 u = mlaf(u, t, 0.0425049886107444763183594f);
1147 u = mlaf(u, t, -0.0748900920152664184570312f);
1148 u = mlaf(u, t, 0.106347933411598205566406f);
1149 u = mlaf(u, t, -0.142027363181114196777344f);
1150 u = mlaf(u, t, 0.199926957488059997558594f);
1151 u = mlaf(u, t, -0.333331018686294555664062f);
1152
1153 t = u * t;
1154 t = mlaf(t,s,s);
1155 return mlaf(q,(float)(lhdrengine::RT_PI_F_2),t);
1156 }
1157
xatan2f(float y,float x)1158 __inline float xatan2f(float y, float x) {
1159 float r = atan2kf(xfabsf(y), x);
1160
1161 r = mulsignf(r, x);
1162 if (xisinff(x) || x == 0) r = lhdrengine::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(lhdrengine::RT_PI_F*.5f)) : 0);
1163 if (xisinff(y) ) r = lhdrengine::RT_PI_F/2 - (xisinff(x) ? (signf(x) * (float)(lhdrengine::RT_PI_F*.25f)) : 0);
1164 if ( y == 0) r = (signf(x) == -1 ? lhdrengine::RT_PI_F : 0);
1165
1166 return xisnanf(x) || xisnanf(y) ? lhdrengine::RT_NAN_F : mulsignf(r, y);
1167 }
1168
xasinf(float d)1169 __inline float xasinf(float d) {
1170 return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d);
1171 }
1172
xacosf(float d)1173 __inline float xacosf(float d) {
1174 return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? (float)lhdrengine::RT_PI : 0.0f);
1175 }
1176
xlogf(float d)1177 __inline float xlogf(float d) {
1178 float x, x2, t, m;
1179 int e;
1180
1181 e = ilogbp1f(d * 0.7071f);
1182 m = ldexpkf(d, -e);
1183
1184 x = (m-1.0f) / (m+1.0f);
1185 x2 = x * x;
1186
1187 t = 0.2371599674224853515625f;
1188 t = mlaf(t, x2, 0.285279005765914916992188f);
1189 t = mlaf(t, x2, 0.400005519390106201171875f);
1190 t = mlaf(t, x2, 0.666666567325592041015625f);
1191 t = mlaf(t, x2, 2.0f);
1192
1193 x = x * t + 0.693147180559945286226764f * e;
1194
1195 if (xisinff(d)) x = lhdrengine::RT_INFINITY_F;
1196 if (d < 0) x = lhdrengine::RT_NAN_F;
1197 if (d == 0) x = -lhdrengine::RT_INFINITY_F;
1198
1199 return x;
1200 }
1201
xexpf(float d)1202 __inline float xexpf(float d) {
1203 if(d<=-104.0f) return 0.0f;
1204
1205 int q = (int)xrintf(d * R_LN2f);
1206 float s, u;
1207
1208 s = mlaf(q, -L2Uf, d);
1209 s = mlaf(q, -L2Lf, s);
1210
1211 u = 0.00136324646882712841033936f;
1212 u = mlaf(u, s, 0.00836596917361021041870117f);
1213 u = mlaf(u, s, 0.0416710823774337768554688f);
1214 u = mlaf(u, s, 0.166665524244308471679688f);
1215 u = mlaf(u, s, 0.499999850988388061523438f);
1216
1217 u = mlaf( s, mlaf(s,u,1.f),1.f);
1218 return ldexpkf(u, q);
1219
1220 }
1221
xmul2f(float d)1222 __inline float xmul2f(float d) {
1223 return d + d;
1224 }
1225
xdiv2f(float d)1226 __inline float xdiv2f(float d) {
1227 union {
1228 float floatval;
1229 int intval;
1230 } uflint;
1231 uflint.floatval = d;
1232 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1233 uflint.intval -= 1 << 23; // sub 1 from the exponent
1234 }
1235 return uflint.floatval;
1236 }
1237
xdivf(float d,int n)1238 __inline float xdivf( float d, int n){
1239 union {
1240 float floatval;
1241 int intval;
1242 } uflint;
1243 uflint.floatval = d;
1244 if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
1245 uflint.intval -= n << 23; // add n to the exponent
1246 }
1247 return uflint.floatval;
1248 }
1249
1250
1251
1252 #endif
1253