1 /*
2  * Adapted from OpenImageIO library with this license:
3  *
4  * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
5  * All Rights Reserved.
6 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are
9  * met:
10  * * Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * * Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * * Neither the name of the software's owners nor the names of its
16  *   contributors may be used to endorse or promote products derived from
17  *   this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * (This is the Modified BSD License)
31  *
32  * A few bits here are based upon code from NVIDIA that was also released
33  * under the same modified BSD license, and marked as:
34  *    Copyright 2004 NVIDIA Corporation. All Rights Reserved.
35  *
36  * Some parts of this file were first open-sourced in Open Shading Language,
37  * then later moved here. The original copyright notice was:
38  *    Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
39  *
40  * Many of the math functions were copied from or inspired by other
41  * public domain sources or open source packages with compatible licenses.
42  * The individual functions give references were applicable.
43  */
44 
45 #ifndef __UTIL_FAST_MATH__
46 #define __UTIL_FAST_MATH__
47 
48 CCL_NAMESPACE_BEGIN
49 
madd(const float a,const float b,const float c)50 ccl_device_inline float madd(const float a, const float b, const float c)
51 {
52   /* NOTE: In the future we may want to explicitly ask for a fused
53    * multiply-add in a specialized version for float.
54    *
55    * NOTE: GCC/ICC will turn this (for float) into a FMA unless
56    * explicitly asked not to, clang seems to leave the code alone.
57    */
58   return a * b + c;
59 }
60 
madd4(const float4 a,const float4 b,const float4 c)61 ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
62 {
63   return a * b + c;
64 }
65 
66 /*
67  * FAST & APPROXIMATE MATH
68  *
69  * The functions named "fast_*" provide a set of replacements to libm that
70  * are much faster at the expense of some accuracy and robust handling of
71  * extreme values. One design goal for these approximation was to avoid
72  * branches as much as possible and operate on single precision values only
73  * so that SIMD versions should be straightforward ports We also try to
74  * implement "safe" semantics (ie: clamp to valid range where possible)
75  * natively since wrapping these inline calls in another layer would be
76  * wasteful.
77  *
78  * Some functions are fast_safe_*, which is both a faster approximation as
79  * well as clamped input domain to ensure no NaN, Inf, or divide by zero.
80  */
81 
82 /* Round to nearest integer, returning as an int. */
fast_rint(float x)83 ccl_device_inline int fast_rint(float x)
84 {
85   /* used by sin/cos/tan range reduction. */
86 #ifdef __KERNEL_SSE4__
87   /* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
88   return float_to_int(rintf(x));
89 #else
90   /* emulate rounding by adding/subtracting 0.5. */
91   return float_to_int(x + copysignf(0.5f, x));
92 #endif
93 }
94 
fast_sinf(float x)95 ccl_device float fast_sinf(float x)
96 {
97   /* Very accurate argument reduction from SLEEF,
98    * starts failing around x=262000
99    *
100    * Results on: [-2pi,2pi].
101    *
102    * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
103    * 1.19209e-07 max error
104    */
105   int q = fast_rint(x * M_1_PI_F);
106   float qf = q;
107   x = madd(qf, -0.78515625f * 4, x);
108   x = madd(qf, -0.00024187564849853515625f * 4, x);
109   x = madd(qf, -3.7747668102383613586e-08f * 4, x);
110   x = madd(qf, -1.2816720341285448015e-12f * 4, x);
111   x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
112   float s = x * x;
113   if ((q & 1) != 0)
114     x = -x;
115   /* This polynomial approximation has very low error on [-pi/2,+pi/2]
116    * 1.19209e-07 max error in total over [-2pi,+2pi]. */
117   float u = 2.6083159809786593541503e-06f;
118   u = madd(u, s, -0.0001981069071916863322258f);
119   u = madd(u, s, +0.00833307858556509017944336f);
120   u = madd(u, s, -0.166666597127914428710938f);
121   u = madd(s, u * x, x);
122   /* For large x, the argument reduction can fail and the polynomial can be
123    * evaluated with arguments outside the valid internal. Just clamp the bad
124    * values away (setting to 0.0f means no branches need to be generated). */
125   if (fabsf(u) > 1.0f) {
126     u = 0.0f;
127   }
128   return u;
129 }
130 
fast_cosf(float x)131 ccl_device float fast_cosf(float x)
132 {
133   /* Same argument reduction as fast_sinf(). */
134   int q = fast_rint(x * M_1_PI_F);
135   float qf = q;
136   x = madd(qf, -0.78515625f * 4, x);
137   x = madd(qf, -0.00024187564849853515625f * 4, x);
138   x = madd(qf, -3.7747668102383613586e-08f * 4, x);
139   x = madd(qf, -1.2816720341285448015e-12f * 4, x);
140   x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals. */
141   float s = x * x;
142   /* Polynomial from SLEEF's sincosf, max error is
143    * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
144   float u = -2.71811842367242206819355e-07f;
145   u = madd(u, s, +2.47990446951007470488548e-05f);
146   u = madd(u, s, -0.00138888787478208541870117f);
147   u = madd(u, s, +0.0416666641831398010253906f);
148   u = madd(u, s, -0.5f);
149   u = madd(u, s, +1.0f);
150   if ((q & 1) != 0) {
151     u = -u;
152   }
153   if (fabsf(u) > 1.0f) {
154     u = 0.0f;
155   }
156   return u;
157 }
158 
fast_sincosf(float x,float * sine,float * cosine)159 ccl_device void fast_sincosf(float x, float *sine, float *cosine)
160 {
161   /* Same argument reduction as fast_sin. */
162   int q = fast_rint(x * M_1_PI_F);
163   float qf = q;
164   x = madd(qf, -0.78515625f * 4, x);
165   x = madd(qf, -0.00024187564849853515625f * 4, x);
166   x = madd(qf, -3.7747668102383613586e-08f * 4, x);
167   x = madd(qf, -1.2816720341285448015e-12f * 4, x);
168   x = M_PI_2_F - (M_PI_2_F - x);  // crush denormals
169   float s = x * x;
170   /* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
171   if ((q & 1) != 0) {
172     x = -x;
173   }
174   float su = 2.6083159809786593541503e-06f;
175   su = madd(su, s, -0.0001981069071916863322258f);
176   su = madd(su, s, +0.00833307858556509017944336f);
177   su = madd(su, s, -0.166666597127914428710938f);
178   su = madd(s, su * x, x);
179   float cu = -2.71811842367242206819355e-07f;
180   cu = madd(cu, s, +2.47990446951007470488548e-05f);
181   cu = madd(cu, s, -0.00138888787478208541870117f);
182   cu = madd(cu, s, +0.0416666641831398010253906f);
183   cu = madd(cu, s, -0.5f);
184   cu = madd(cu, s, +1.0f);
185   if ((q & 1) != 0) {
186     cu = -cu;
187   }
188   if (fabsf(su) > 1.0f) {
189     su = 0.0f;
190   }
191   if (fabsf(cu) > 1.0f) {
192     cu = 0.0f;
193   }
194   *sine = su;
195   *cosine = cu;
196 }
197 
198 /* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
199  * becoming really poor outside of this range because the reciprocal amplifies
200  * errors.
201  */
fast_tanf(float x)202 ccl_device float fast_tanf(float x)
203 {
204   /* Derived from SLEEF implementation.
205    *
206    * Note that we cannot apply the "denormal crush" trick everywhere because
207    * we sometimes need to take the reciprocal of the polynomial
208    */
209   int q = fast_rint(x * 2.0f * M_1_PI_F);
210   float qf = q;
211   x = madd(qf, -0.78515625f * 2, x);
212   x = madd(qf, -0.00024187564849853515625f * 2, x);
213   x = madd(qf, -3.7747668102383613586e-08f * 2, x);
214   x = madd(qf, -1.2816720341285448015e-12f * 2, x);
215   if ((q & 1) == 0) {
216     /* Crush denormals (only if we aren't inverting the result later). */
217     x = M_PI_4_F - (M_PI_4_F - x);
218   }
219   float s = x * x;
220   float u = 0.00927245803177356719970703f;
221   u = madd(u, s, 0.00331984995864331722259521f);
222   u = madd(u, s, 0.0242998078465461730957031f);
223   u = madd(u, s, 0.0534495301544666290283203f);
224   u = madd(u, s, 0.133383005857467651367188f);
225   u = madd(u, s, 0.333331853151321411132812f);
226   u = madd(s, u * x, x);
227   if ((q & 1) != 0) {
228     u = -1.0f / u;
229   }
230   return u;
231 }
232 
233 /* Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
234  *
235  * Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
236  */
fast_sinpif(float x)237 ccl_device float fast_sinpif(float x)
238 {
239   /* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
240   const float z = x - ((x + 25165824.0f) - 25165824.0f);
241   const float y = z - z * fabsf(z);
242   const float Q = 3.10396624f;
243   const float P = 3.584135056f; /* P = 16-4*Q */
244   return y * (Q + P * fabsf(y));
245 
246   /* The original article used used inferior constants for Q and P and
247    * so had max error 1.091e-3.
248    *
249    * The optimal value for Q was determined by exhaustive search, minimizing
250    * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
251    * over the interval [0,2] (which is where most of the invocations happen).
252    *
253    * The basic idea of this approximation starts with the coarse approximation:
254    *      sin(pi*x) ~= f(x) =  4 * (x - x * abs(x))
255    *
256    * This approximation always _over_ estimates the target. On the other hand,
257    * the curve:
258    *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
259    *
260    * always lies _under_ the target. Thus we can simply numerically search for
261    * the optimal constant to LERP these curves into a more precise
262    * approximation.
263    *
264    * After folding the constants together and simplifying the resulting math,
265    * we end up with the compact implementation above.
266    *
267    * NOTE: this function actually computes sin(x * pi) which avoids one or two
268    * mults in many cases and guarantees exact values at integer periods.
269    */
270 }
271 
272 /* Fast approximate cos(x*M_PI) with ~0.1% absolute error. */
fast_cospif(float x)273 ccl_device_inline float fast_cospif(float x)
274 {
275   return fast_sinpif(x + 0.5f);
276 }
277 
fast_acosf(float x)278 ccl_device float fast_acosf(float x)
279 {
280   const float f = fabsf(x);
281   /* clamp and crush denormals. */
282   const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
283   /* Based on http://www.pouet.net/topic.php?which=9132&page=2
284    * 85% accurate (ulp 0)
285    * Examined 2130706434 values of acos:
286    *   15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
287    * Examined 2130706434 values of acos:
288    *   15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
289    */
290   const float a = sqrtf(1.0f - m) *
291                   (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
292   return x < 0 ? M_PI_F - a : a;
293 }
294 
fast_asinf(float x)295 ccl_device float fast_asinf(float x)
296 {
297   /* Based on acosf approximation above.
298    * Max error is 4.51133e-05 (ulps are higher because we are consistently off
299    * by a little amount).
300    */
301   const float f = fabsf(x);
302   /* Clamp and crush denormals. */
303   const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
304   const float a = M_PI_2_F -
305                   sqrtf(1.0f - m) * (1.5707963267f +
306                                      m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
307   return copysignf(a, x);
308 }
309 
fast_atanf(float x)310 ccl_device float fast_atanf(float x)
311 {
312   const float a = fabsf(x);
313   const float k = a > 1.0f ? 1 / a : a;
314   const float s = 1.0f - (1.0f - k); /* Crush denormals. */
315   const float t = s * s;
316   /* http://mathforum.org/library/drmath/view/62672.html
317    * Examined 4278190080 values of atan:
318    *   2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error      // (with  denormals)
319    * Examined 4278190080 values of atan:
320    *   171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
321    */
322   float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
323   if (a > 1.0f) {
324     r = M_PI_2_F - r;
325   }
326   return copysignf(r, x);
327 }
328 
fast_atan2f(float y,float x)329 ccl_device float fast_atan2f(float y, float x)
330 {
331   /* Based on atan approximation above.
332    *
333    * The special cases around 0 and infinity were tested explicitly.
334    *
335    * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
336    * of nan.
337    */
338   const float a = fabsf(x);
339   const float b = fabsf(y);
340 
341   const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
342   const float s = 1.0f - (1.0f - k); /* Crush denormals */
343   const float t = s * s;
344 
345   float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
346 
347   if (b > a) {
348     /* Account for arg reduction. */
349     r = M_PI_2_F - r;
350   }
351   /* Test sign bit of x. */
352   if (__float_as_uint(x) & 0x80000000u) {
353     r = M_PI_F - r;
354   }
355   return copysignf(r, y);
356 }
357 
358 /* Based on:
359  *
360  *   https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
361  */
fast_log2f(float x)362 ccl_device float fast_log2f(float x)
363 {
364   /* NOTE: clamp to avoid special cases and make result "safe" from large
365    * negative values/nans. */
366   x = clamp(x, FLT_MIN, FLT_MAX);
367   unsigned bits = __float_as_uint(x);
368   int exponent = (int)(bits >> 23) - 127;
369   float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
370   /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
371    * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
372    * ulp histogram:
373    *  0  = 97.46%
374    *  1  =  2.29%
375    *  2  =  0.11%
376    */
377   float f2 = f * f;
378   float f4 = f2 * f2;
379   float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
380   float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
381   hi = madd(f, hi, -0.13753123777116f);
382   hi = madd(f, hi, 0.24187369696082f);
383   hi = madd(f, hi, -0.34730547155299f);
384   lo = madd(f, lo, 1.442689881667200f);
385   return ((f4 * hi) + (f * lo)) + exponent;
386 }
387 
fast_logf(float x)388 ccl_device_inline float fast_logf(float x)
389 {
390   /* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
391    * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
392    */
393   return fast_log2f(x) * M_LN2_F;
394 }
395 
fast_log10(float x)396 ccl_device_inline float fast_log10(float x)
397 {
398   /* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
399    * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
400    */
401   return fast_log2f(x) * M_LN2_F / M_LN10_F;
402 }
403 
fast_logb(float x)404 ccl_device float fast_logb(float x)
405 {
406   /* Don't bother with denormals. */
407   x = fabsf(x);
408   x = clamp(x, FLT_MIN, FLT_MAX);
409   unsigned bits = __float_as_uint(x);
410   return (int)(bits >> 23) - 127;
411 }
412 
fast_exp2f(float x)413 ccl_device float fast_exp2f(float x)
414 {
415   /* Clamp to safe range for final addition. */
416   x = clamp(x, -126.0f, 126.0f);
417   /* Range reduction. */
418   int m = (int)x;
419   x -= m;
420   x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
421   /* 5th degree polynomial generated with sollya
422    * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
423    * 232 max ulp.
424    *
425    * ulp histogram:
426    *  0  = 87.81%
427    *  1  =  4.18%
428    */
429   float r = 1.33336498402e-3f;
430   r = madd(x, r, 9.810352697968e-3f);
431   r = madd(x, r, 5.551834031939e-2f);
432   r = madd(x, r, 0.2401793301105f);
433   r = madd(x, r, 0.693144857883f);
434   r = madd(x, r, 1.0f);
435   /* Multiply by 2 ^ m by adding in the exponent. */
436   /* NOTE: left-shift of negative number is undefined behavior. */
437   return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
438 }
439 
fast_expf(float x)440 ccl_device_inline float fast_expf(float x)
441 {
442   /* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
443    * 2.6666452 avg ulp diff, 230 max ulp.
444    */
445   return fast_exp2f(x / M_LN2_F);
446 }
447 
448 #if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
449 /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
450  * T78047 and T78869 for details. Just disable for now, it only makes
451  * a small difference in denoising performance. */
fast_exp2f4(float4 x)452 ccl_device float4 fast_exp2f4(float4 x)
453 {
454   const float4 one = make_float4(1.0f);
455   const float4 limit = make_float4(126.0f);
456   x = clamp(x, -limit, limit);
457   int4 m = make_int4(x);
458   x = one - (one - (x - make_float4(m)));
459   float4 r = make_float4(1.33336498402e-3f);
460   r = madd4(x, r, make_float4(9.810352697968e-3f));
461   r = madd4(x, r, make_float4(5.551834031939e-2f));
462   r = madd4(x, r, make_float4(0.2401793301105f));
463   r = madd4(x, r, make_float4(0.693144857883f));
464   r = madd4(x, r, make_float4(1.0f));
465   return __int4_as_float4(__float4_as_int4(r) + (m << 23));
466 }
467 
fast_expf4(float4 x)468 ccl_device_inline float4 fast_expf4(float4 x)
469 {
470   return fast_exp2f4(x / M_LN2_F);
471 }
472 #else
fast_expf4(float4 x)473 ccl_device_inline float4 fast_expf4(float4 x)
474 {
475   return make_float4(fast_expf(x.x), fast_expf(x.y), fast_expf(x.z), fast_expf(x.w));
476 }
477 #endif
478 
fast_exp10(float x)479 ccl_device_inline float fast_exp10(float x)
480 {
481   /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
482    * 2.71732409 avg ulp diff, 232 max ulp.
483    */
484   return fast_exp2f(x * M_LN10_F / M_LN2_F);
485 }
486 
fast_expm1f(float x)487 ccl_device_inline float fast_expm1f(float x)
488 {
489   if (fabsf(x) < 1e-5f) {
490     x = 1.0f - (1.0f - x); /* Crush denormals. */
491     return madd(0.5f, x * x, x);
492   }
493   else {
494     return fast_expf(x) - 1.0f;
495   }
496 }
497 
fast_sinhf(float x)498 ccl_device float fast_sinhf(float x)
499 {
500   float a = fabsf(x);
501   if (a > 1.0f) {
502     /* Examined 53389559 values of sinh on [1,87.3300018]:
503      * 33.6886442 avg ulp diff, 178 max ulp. */
504     float e = fast_expf(a);
505     return copysignf(0.5f * e - 0.5f / e, x);
506   }
507   else {
508     a = 1.0f - (1.0f - a); /* Crush denorms. */
509     float a2 = a * a;
510     /* Degree 7 polynomial generated with sollya. */
511     /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
512     float r = 2.03945513931e-4f;
513     r = madd(r, a2, 8.32990277558e-3f);
514     r = madd(r, a2, 0.1666673421859f);
515     r = madd(r * a, a2, a);
516     return copysignf(r, x);
517   }
518 }
519 
fast_coshf(float x)520 ccl_device_inline float fast_coshf(float x)
521 {
522   /* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
523    * 1.78256726 avg ulp diff, 178 max ulp.
524    */
525   float e = fast_expf(fabsf(x));
526   return 0.5f * e + 0.5f / e;
527 }
528 
fast_tanhf(float x)529 ccl_device_inline float fast_tanhf(float x)
530 {
531   /* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
532    * 3.12924e-06 max error.
533    */
534   /* NOTE: ulp error is high because of sub-optimal handling around the origin. */
535   float e = fast_expf(2.0f * fabsf(x));
536   return copysignf(1.0f - 2.0f / (1.0f + e), x);
537 }
538 
fast_safe_powf(float x,float y)539 ccl_device float fast_safe_powf(float x, float y)
540 {
541   if (y == 0)
542     return 1.0f; /* x^1=1 */
543   if (x == 0)
544     return 0.0f; /* 0^y=0 */
545   float sign = 1.0f;
546   if (x < 0.0f) {
547     /* if x is negative, only deal with integer powers
548      * powf returns NaN for non-integers, we will return 0 instead.
549      */
550     int ybits = __float_as_int(y) & 0x7fffffff;
551     if (ybits >= 0x4b800000) {
552       // always even int, keep positive
553     }
554     else if (ybits >= 0x3f800000) {
555       /* Bigger than 1, check. */
556       int k = (ybits >> 23) - 127;    /* Get exponent. */
557       int j = ybits >> (23 - k);      /* Shift out possible fractional bits. */
558       if ((j << (23 - k)) == ybits) { /* rebuild number and check for a match. */
559         /* +1 for even, -1 for odd. */
560         sign = __int_as_float(0x3f800000 | (j << 31));
561       }
562       else {
563         /* Not an integer. */
564         return 0.0f;
565       }
566     }
567     else {
568       /* Not an integer. */
569       return 0.0f;
570     }
571   }
572   return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
573 }
574 
575 /* TODO(sergey): Check speed  with our erf functions implementation from
576  * bsdf_microfacet.h.
577  */
578 
fast_erff(float x)579 ccl_device_inline float fast_erff(float x)
580 {
581   /* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
582   /* Abramowitz and Stegun, 7.1.28. */
583   const float a1 = 0.0705230784f;
584   const float a2 = 0.0422820123f;
585   const float a3 = 0.0092705272f;
586   const float a4 = 0.0001520143f;
587   const float a5 = 0.0002765672f;
588   const float a6 = 0.0000430638f;
589   const float a = fabsf(x);
590   if (a >= 12.3f) {
591     return copysignf(1.0f, x);
592   }
593   const float b = 1.0f - (1.0f - a); /* Crush denormals. */
594   const float r = madd(
595       madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
596   const float s = r * r; /* ^2 */
597   const float t = s * s; /* ^4 */
598   const float u = t * t; /* ^8 */
599   const float v = u * u; /* ^16 */
600   return copysignf(1.0f - 1.0f / v, x);
601 }
602 
fast_erfcf(float x)603 ccl_device_inline float fast_erfcf(float x)
604 {
605   /* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
606    *
607    * ulp histogram:
608    *
609    *  0  = 80.30%
610    */
611   return 1.0f - fast_erff(x);
612 }
613 
fast_ierff(float x)614 ccl_device_inline float fast_ierff(float x)
615 {
616   /* From: Approximating the erfinv function by Mike Giles. */
617   /* To avoid trouble at the limit, clamp input to 1-eps. */
618   float a = fabsf(x);
619   if (a > 0.99999994f) {
620     a = 0.99999994f;
621   }
622   float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
623   if (w < 5.0f) {
624     w = w - 2.5f;
625     p = 2.81022636e-08f;
626     p = madd(p, w, 3.43273939e-07f);
627     p = madd(p, w, -3.5233877e-06f);
628     p = madd(p, w, -4.39150654e-06f);
629     p = madd(p, w, 0.00021858087f);
630     p = madd(p, w, -0.00125372503f);
631     p = madd(p, w, -0.00417768164f);
632     p = madd(p, w, 0.246640727f);
633     p = madd(p, w, 1.50140941f);
634   }
635   else {
636     w = sqrtf(w) - 3.0f;
637     p = -0.000200214257f;
638     p = madd(p, w, 0.000100950558f);
639     p = madd(p, w, 0.00134934322f);
640     p = madd(p, w, -0.00367342844f);
641     p = madd(p, w, 0.00573950773f);
642     p = madd(p, w, -0.0076224613f);
643     p = madd(p, w, 0.00943887047f);
644     p = madd(p, w, 1.00167406f);
645     p = madd(p, w, 2.83297682f);
646   }
647   return p * x;
648 }
649 
650 CCL_NAMESPACE_END
651 
652 #endif /* __UTIL_FAST_MATH__ */
653