1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #pragma once
5 
6 // Transcendental functions from "ispc": https://github.com/ispc/ispc/
7 // Most of the transcendental implementations in ispc code come from
8 // Solomon Boulos's "syrah": https://github.com/boulos/syrah/
9 
10 #include "../simd/simd.h"
11 
12 namespace embree
13 {
14 
15 namespace fastapprox
16 {
17 
18 template <typename T>
sin(const T & v)19 __forceinline T sin(const T &v)
20 {
21   static const float piOverTwoVec = 1.57079637050628662109375;
22   static const float twoOverPiVec = 0.636619746685028076171875;
23   auto scaled = v * twoOverPiVec;
24   auto kReal = floor(scaled);
25   auto k = toInt(kReal);
26 
27   // Reduced range version of x
28   auto x = v - kReal * piOverTwoVec;
29   auto kMod4 = k & 3;
30   auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
31   auto flipSign = (kMod4 > 1);
32 
33   // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
34   // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
35   static const float sinC2  = -0.16666667163372039794921875;
36   static const float sinC4  = +8.333347737789154052734375e-3;
37   static const float sinC6  = -1.9842604524455964565277099609375e-4;
38   static const float sinC8  = +2.760012648650445044040679931640625e-6;
39   static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
40 
41   static const float cosC2  = -0.5;
42   static const float cosC4  = +4.166664183139801025390625e-2;
43   static const float cosC6  = -1.388833043165504932403564453125e-3;
44   static const float cosC8  = +2.47562347794882953166961669921875e-5;
45   static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
46 
47   auto outside = select(sinUseCos, 1., x);
48   auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
49   auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
50   auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
51   auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
52   auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
53 
54   auto x2 = x * x;
55   auto formula = x2 * c10 + c8;
56   formula = x2 * formula + c6;
57   formula = x2 * formula + c4;
58   formula = x2 * formula + c2;
59   formula = x2 * formula + 1.;
60   formula *= outside;
61 
62   formula = select(flipSign, -formula, formula);
63   return formula;
64 }
65 
66 template <typename T>
cos(const T & v)67 __forceinline T cos(const T &v)
68 {
69   static const float piOverTwoVec = 1.57079637050628662109375;
70   static const float twoOverPiVec = 0.636619746685028076171875;
71   auto scaled = v * twoOverPiVec;
72   auto kReal = floor(scaled);
73   auto k = toInt(kReal);
74 
75   // Reduced range version of x
76   auto x = v - kReal * piOverTwoVec;
77 
78   auto kMod4 = k & 3;
79   auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
80   auto flipSign = (kMod4 == 1 | kMod4 == 2);
81 
82   const float sinC2  = -0.16666667163372039794921875;
83   const float sinC4  = +8.333347737789154052734375e-3;
84   const float sinC6  = -1.9842604524455964565277099609375e-4;
85   const float sinC8  = +2.760012648650445044040679931640625e-6;
86   const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
87 
88   const float cosC2  = -0.5;
89   const float cosC4  = +4.166664183139801025390625e-2;
90   const float cosC6  = -1.388833043165504932403564453125e-3;
91   const float cosC8  = +2.47562347794882953166961669921875e-5;
92   const float cosC10 = -2.59630184018533327616751194000244140625e-7;
93 
94   auto outside = select(cosUseCos, 1., x);
95   auto c2  = select(cosUseCos, T(cosC2),  T(sinC2));
96   auto c4  = select(cosUseCos, T(cosC4),  T(sinC4));
97   auto c6  = select(cosUseCos, T(cosC6),  T(sinC6));
98   auto c8  = select(cosUseCos, T(cosC8),  T(sinC8));
99   auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
100 
101   auto x2 = x * x;
102   auto formula = x2 * c10 + c8;
103   formula = x2 * formula + c6;
104   formula = x2 * formula + c4;
105   formula = x2 * formula + c2;
106   formula = x2 * formula + 1.;
107   formula *= outside;
108 
109   formula = select(flipSign, -formula, formula);
110   return formula;
111 }
112 
113 template <typename T>
sincos(const T & v,T & sinResult,T & cosResult)114 __forceinline void sincos(const T &v, T &sinResult, T &cosResult)
115 {
116   const float piOverTwoVec = 1.57079637050628662109375;
117   const float twoOverPiVec = 0.636619746685028076171875;
118   auto scaled = v * twoOverPiVec;
119   auto kReal = floor(scaled);
120   auto k = toInt(kReal);
121 
122   // Reduced range version of x
123   auto x = v - kReal * piOverTwoVec;
124   auto kMod4 = k & 3;
125   auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
126   auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
127   auto sinFlipSign = (kMod4 > 1);
128   auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
129 
130   const float oneVec = +1.;
131   const float sinC2  = -0.16666667163372039794921875;
132   const float sinC4  = +8.333347737789154052734375e-3;
133   const float sinC6  = -1.9842604524455964565277099609375e-4;
134   const float sinC8  = +2.760012648650445044040679931640625e-6;
135   const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
136 
137   const float cosC2  = -0.5;
138   const float cosC4  = +4.166664183139801025390625e-2;
139   const float cosC6  = -1.388833043165504932403564453125e-3;
140   const float cosC8  = +2.47562347794882953166961669921875e-5;
141   const float cosC10 = -2.59630184018533327616751194000244140625e-7;
142 
143   auto x2 = x * x;
144 
145   auto sinFormula = x2 * sinC10 + sinC8;
146   auto cosFormula = x2 * cosC10 + cosC8;
147   sinFormula = x2 * sinFormula + sinC6;
148   cosFormula = x2 * cosFormula + cosC6;
149 
150   sinFormula = x2 * sinFormula + sinC4;
151   cosFormula = x2 * cosFormula + cosC4;
152 
153   sinFormula = x2 * sinFormula + sinC2;
154   cosFormula = x2 * cosFormula + cosC2;
155 
156   sinFormula = x2 * sinFormula + oneVec;
157   cosFormula = x2 * cosFormula + oneVec;
158 
159   sinFormula *= x;
160 
161   sinResult = select(sinUseCos, cosFormula, sinFormula);
162   cosResult = select(cosUseCos, cosFormula, sinFormula);
163 
164   sinResult = select(sinFlipSign, -sinResult, sinResult);
165   cosResult = select(cosFlipSign, -cosResult, cosResult);
166 }
167 
168 template <typename T>
tan(const T & v)169 __forceinline T tan(const T &v)
170 {
171   const float piOverFourVec = 0.785398185253143310546875;
172   const float fourOverPiVec = 1.27323949337005615234375;
173 
174   auto xLt0 = v < 0.;
175   auto y = select(xLt0, -v, v);
176   auto scaled = y * fourOverPiVec;
177 
178   auto kReal = floor(scaled);
179   auto k = toInt(kReal);
180 
181   auto x = y - kReal * piOverFourVec;
182 
183   // If k & 1, x -= Pi/4
184   auto needOffset = (k & 1) != 0;
185   x = select(needOffset, x - piOverFourVec, x);
186 
187   // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
188   auto kMod4 = k & 3;
189   auto useCotan = (kMod4 == 1) | (kMod4 == 2);
190 
191   const float oneVec = 1.0;
192 
193   const float tanC2  = +0.33333075046539306640625;
194   const float tanC4  = +0.13339905440807342529296875;
195   const float tanC6  = +5.3348250687122344970703125e-2;
196   const float tanC8  = +2.46033705770969390869140625e-2;
197   const float tanC10 = +2.892402000725269317626953125e-3;
198   const float tanC12 = +9.500005282461643218994140625e-3;
199 
200   const float cotC2  = -0.3333333432674407958984375;
201   const float cotC4  = -2.222204394638538360595703125e-2;
202   const float cotC6  = -2.11752182804048061370849609375e-3;
203   const float cotC8  = -2.0846328698098659515380859375e-4;
204   const float cotC10 = -2.548247357481159269809722900390625e-5;
205   const float cotC12 = -3.5257363606433500535786151885986328125e-7;
206 
207   auto x2 = x * x;
208   T z;
209   if (any(useCotan))
210   {
211     auto cotVal = x2 * cotC12 + cotC10;
212     cotVal = x2 * cotVal + cotC8;
213     cotVal = x2 * cotVal + cotC6;
214     cotVal = x2 * cotVal + cotC4;
215     cotVal = x2 * cotVal + cotC2;
216     cotVal = x2 * cotVal + oneVec;
217     // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
218     cotVal /= -x;
219     z = cotVal;
220   }
221   auto useTan = !useCotan;
222   if (any(useTan))
223   {
224     auto tanVal = x2 * tanC12 + tanC10;
225     tanVal = x2 * tanVal + tanC8;
226     tanVal = x2 * tanVal + tanC6;
227     tanVal = x2 * tanVal + tanC4;
228     tanVal = x2 * tanVal + tanC2;
229     tanVal = x2 * tanVal + oneVec;
230     // Equation was for tan(x)/x
231     tanVal *= x;
232     z = select(useTan, tanVal, z);
233   }
234   return select(xLt0, -z, z);
235 }
236 
237 template <typename T>
asin(const T & x0)238 __forceinline T asin(const T &x0)
239 {
240   auto isneg = (x0 < 0.f);
241   auto x = abs(x0);
242   auto isnan = (x > 1.f);
243 
244   // sollya
245   // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
246   //           [1e-20;.9999999999999999]);
247   // avg error: 1.1105439e-06, max error 1.3187528e-06
248   auto v = 1.57079517841339111328125f +
249            x * (-0.21450997889041900634765625f +
250                 x * (8.78556668758392333984375e-2f +
251                      x * (-4.489909112453460693359375e-2f +
252                           x * (1.928029954433441162109375e-2f +
253                                x * (-4.3095736764371395111083984375e-3f)))));
254 
255   v *= -sqrt(1.f - x);
256   v = v + 1.57079637050628662109375f;
257 
258   v = select(v < 0.f, T(0.f), v);
259   v = select(isneg, -v, v);
260   v = select(isnan, T(cast_i2f(0x7fc00000)), v);
261 
262   return v;
263 }
264 
265 template <typename T>
acos(const T & v)266 __forceinline T acos(const T &v)
267 {
268   return 1.57079637050628662109375f - asin(v);
269 }
270 
271 template <typename T>
atan(const T & v)272 __forceinline T atan(const T &v)
273 {
274   const float piOverTwoVec = 1.57079637050628662109375;
275   // atan(-x) = -atan(x) (so flip from negative to positive first)
276   // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
277   auto xNeg = v < 0.f;
278   auto xFlipped = select(xNeg, -v, v);
279 
280   auto xGt1 = xFlipped > 1.;
281   auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
282 
283   // These coefficients approximate atan(x)/x
284   const float atanC0  = +0.99999988079071044921875;
285   const float atanC2  = -0.3333191573619842529296875;
286   const float atanC4  = +0.199689209461212158203125;
287   const float atanC6  = -0.14015688002109527587890625;
288   const float atanC8  = +9.905083477497100830078125e-2;
289   const float atanC10 = -5.93664981424808502197265625e-2;
290   const float atanC12 = +2.417283318936824798583984375e-2;
291   const float atanC14 = -4.6721356920897960662841796875e-3;
292 
293   auto x2 = x * x;
294   auto result = x2 * atanC14 + atanC12;
295   result = x2 * result + atanC10;
296   result = x2 * result + atanC8;
297   result = x2 * result + atanC6;
298   result = x2 * result + atanC4;
299   result = x2 * result + atanC2;
300   result = x2 * result + atanC0;
301   result *= x;
302 
303   result = select(xGt1, piOverTwoVec - result, result);
304   result = select(xNeg, -result, result);
305   return result;
306 }
307 
308 template <typename T>
atan2(const T & y,const T & x)309 __forceinline T atan2(const T &y, const T &x)
310 {
311   const float piVec = 3.1415926536;
312   // atan2(y, x) =
313   //
314   // atan2(y > 0, x = +-0) ->  Pi/2
315   // atan2(y < 0, x = +-0) -> -Pi/2
316   // atan2(y = +-0, x < +0) -> +-Pi
317   // atan2(y = +-0, x >= +0) -> +-0
318   //
319   // atan2(y >= 0, x < 0) ->  Pi + atan(y/x)
320   // atan2(y <  0, x < 0) -> -Pi + atan(y/x)
321   // atan2(y, x > 0) -> atan(y/x)
322   //
323   // and then a bunch of code for dealing with infinities.
324   auto yOverX = y * rcpSafe(x);
325   auto atanArg = atan(yOverX);
326   auto xLt0 = x < 0.f;
327   auto yLt0 = y < 0.f;
328   auto offset = select(xLt0,
329                 select(yLt0, T(-piVec), T(piVec)), 0.f);
330   return offset + atanArg;
331 }
332 
333 template <typename T>
exp(const T & v)334 __forceinline T exp(const T &v)
335 {
336   const float ln2Part1 = 0.6931457519;
337   const float ln2Part2 = 1.4286067653e-6;
338   const float oneOverLn2 = 1.44269502162933349609375;
339 
340   auto scaled = v * oneOverLn2;
341   auto kReal = floor(scaled);
342   auto k = toInt(kReal);
343 
344   // Reduced range version of x
345   auto x = v - kReal * ln2Part1;
346   x -= kReal * ln2Part2;
347 
348   // These coefficients are for e^x in [0, ln(2)]
349   const float one = 1.;
350   const float c2 = 0.4999999105930328369140625;
351   const float c3 = 0.166668415069580078125;
352   const float c4 = 4.16539050638675689697265625e-2;
353   const float c5 = 8.378830738365650177001953125e-3;
354   const float c6 = 1.304379315115511417388916015625e-3;
355   const float c7 = 2.7555381529964506626129150390625e-4;
356 
357   auto result = x * c7 + c6;
358   result = x * result + c5;
359   result = x * result + c4;
360   result = x * result + c3;
361   result = x * result + c2;
362   result = x * result + one;
363   result = x * result + one;
364 
365   // Compute 2^k (should differ for float and double, but I'll avoid
366   // it for now and just do floats)
367   const int fpbias = 127;
368   auto biasedN = k + fpbias;
369   auto overflow = kReal > fpbias;
370   // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
371   // we've got underflow. -127 * ln(2) -> -88.02. So the most
372   // negative float input that doesn't result in zero is like -88.
373   auto underflow = kReal <= -fpbias;
374   const int infBits = 0x7f800000;
375   biasedN <<= 23;
376   // Reinterpret this thing as float
377   auto twoToTheN = asFloat(biasedN);
378   // Handle both doubles and floats (hopefully eliding the copy for float)
379   auto elemtype2n = twoToTheN;
380   result *= elemtype2n;
381   result = select(overflow, cast_i2f(infBits), result);
382   result = select(underflow, 0., result);
383   return result;
384 }
385 
386 // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
387 // * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
388 template <typename T, typename R>
__rangeReduceLog(const T & input,T & reduced,R & exponent)389 __forceinline void __rangeReduceLog(const T &input,
390                                     T &reduced,
391                                     R &exponent)
392 {
393   auto intVersion = asInt(input);
394   // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
395   // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
396   //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
397   // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
398   //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
399 
400   //const int exponentMask(0x7F800000)
401   static const int nonexponentMask = 0x807FFFFF;
402 
403   // We want the reduced version to have an exponent of -1 which is
404   // -1 + 127 after biasing or 126
405   static const int exponentNeg1 = (126l << 23);
406   // NOTE(boulos): We don't need to mask anything out since we know
407   // the sign bit has to be 0. If it's 1, we need to return infinity/nan
408   // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
409   auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
410 
411   auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
412   exponent = offsetExponent - 127;          // get the real value
413 
414   // Blend the offset_exponent with the original input (do this in
415   // int for now, until I decide if float can have & and &not)
416   auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
417   reduced = asFloat(blended);
418 }
419 
420 template <typename T> struct ExponentType            { };
421 template <int N>      struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
422 template <>           struct ExponentType<float>     { typedef int     Ty; };
423 
424 template <typename T>
425 __forceinline T log(const T &v)
426 {
427   T reduced;
428   typename ExponentType<T>::Ty exponent;
429 
430   const int nanBits = 0x7fc00000;
431   const int negInfBits = 0xFF800000;
432   const float nan = cast_i2f(nanBits);
433   const float negInf = cast_i2f(negInfBits);
434   auto useNan = v < 0.;
435   auto useInf = v == 0.;
436   auto exceptional = useNan | useInf;
437   const float one = 1.0;
438 
439   auto patched = select(exceptional, one, v);
440   __rangeReduceLog(patched, reduced, exponent);
441 
442   const float ln2 = 0.693147182464599609375;
443 
444   auto x1 = one - reduced;
445   const float c1 = +0.50000095367431640625;
446   const float c2 = +0.33326041698455810546875;
447   const float c3 = +0.2519190013408660888671875;
448   const float c4 = +0.17541764676570892333984375;
449   const float c5 = +0.3424419462680816650390625;
450   const float c6 = -0.599632322788238525390625;
451   const float c7 = +1.98442304134368896484375;
452   const float c8 = -2.4899270534515380859375;
453   const float c9 = +1.7491014003753662109375;
454 
455   auto result = x1 * c9 + c8;
456   result = x1 * result + c7;
457   result = x1 * result + c6;
458   result = x1 * result + c5;
459   result = x1 * result + c4;
460   result = x1 * result + c3;
461   result = x1 * result + c2;
462   result = x1 * result + c1;
463   result = x1 * result + one;
464 
465   // Equation was for -(ln(red)/(1-red))
466   result *= -x1;
467   result += toFloat(exponent) * ln2;
468 
469   return select(exceptional,
470                 select(useNan, T(nan), T(negInf)),
471                 result);
472 }
473 
474 template <typename T>
475 __forceinline T pow(const T &x, const T &y)
476 {
477   auto x1 = abs(x);
478   auto z = exp(y * log(x1));
479 
480   // Handle special cases
481   const float twoOver23 = 8388608.0f;
482   auto yInt = y == round(y);
483   auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
484 
485   // x == 0
486   z = select(x == 0.0f,
487       select(y < 0.0f, T(inf) | signmsk(x),
488       select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
489 
490   // x < 0
491   auto xNegative = x < 0.0f;
492   if (any(xNegative))
493   {
494     auto z1 = z | asFloat(yOddInt);
495     z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
496     z = select(xNegative, z1, z);
497   }
498 
499   auto xFinite = isfinite(x);
500   auto yFinite = isfinite(y);
501   if (all(xFinite & yFinite))
502     return z;
503 
504   // x finite and y infinite
505   z = select(andn(xFinite, yFinite),
506       select(x1 == 1.0f, 1.0f,
507       select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
508 
509   // x infinite
510   z = select(xFinite, z,
511       select(y == 0.0f, 1.0f,
512       select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
513 
514   return z;
515 }
516 
517 template <typename T>
518 __forceinline T pow(const T &x, float y)
519 {
520   return pow(x, T(y));
521 }
522 
523 } // namespace fastapprox
524 
525 } // namespace embree
526