// Copyright 2008-present Contributors to the OpenImageIO project.
// SPDX-License-Identifier: BSD-3-Clause
// https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md

/*
  A few bits here are based upon code from NVIDIA that was also released
  under the same 3-Clause BSD license, and marked as:
     Copyright 2004 NVIDIA Corporation. All Rights Reserved.

  Some parts of this file were first open-sourced in Open Shading Language,
  also 3-Clause BSD license, then later moved here. The original copyright
  notice was:
     Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.

  Many of the math functions were copied from or inspired by other
  public domain sources or open source packages with compatible licenses.
  The individual functions give references where applicable.
*/


// clang-format off

/// \file
///
/// A variety of floating-point math helper routines (and, slight
/// misnomer, some int stuff as well).
///


#pragma once
#define OIIO_FMATH_H 1

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <typeinfo>
#include <type_traits>

#include <OpenImageIO/Imath.h>
#include <OpenImageIO/span.h>
#include <OpenImageIO/dassert.h>
#include <OpenImageIO/oiioversion.h>
#include <OpenImageIO/platform.h>
#include <OpenImageIO/simd.h>


OIIO_NAMESPACE_BEGIN

/// Occasionally there may be a tradeoff where the best/fastest
/// implementation of a math function in an ordinary scalar context does
/// things that prevent autovectorization (or OMP pragma vectorization) of a
/// loop containing that operation, and the only way to make it SIMD
/// friendly slows it down as a scalar op.
///
/// By default, we always prefer doing it as correctly and quickly as
/// possible in scalar mode, but if OIIO_FMATH_SIMD_FRIENDLY is defined to
/// be nonzero, we prefer ensuring that the SIMD loops are as efficient as
/// possible, even at the expense of scalar performance.
///
/// Because everything here consists of inline functions, and it doesn't
/// really matter for OIIO internals themselves, this is primarily intended
/// for the sake of 3rd party apps that happen to have OIIO as a dependency
/// and use the fmath.h functions. Such a downstream dependency just needs
/// to ensure that OIIO_FMATH_SIMD_FRIENDLY is defined to 1 prior to
/// including fmath.h.
///
#ifndef OIIO_FMATH_SIMD_FRIENDLY
#    define OIIO_FMATH_SIMD_FRIENDLY 0
#endif


// Helper template to let us tell if two types are the same.
// C++11 defines this; keep in OIIO namespace for back compat.
// DEPRECATED(2.0) -- clients should switch OIIO::is_same -> std::is_same.
using std::is_same;


// For back compatibility: expose these in the OIIO namespace.
// DEPRECATED(2.0) -- clients should switch OIIO:: -> std:: for these.
using std::isfinite;
using std::isinf;
using std::isnan;


// Define math constants just in case they aren't included (Windows is a
// little finicky about this, only defining these if _USE_MATH_DEFINES is
// defined before <cmath> is included, which is hard to control).
#ifndef M_PI
#    define M_PI 3.14159265358979323846264338327950288
#endif
#ifndef M_PI_2
#    define M_PI_2 1.57079632679489661923132169163975144
#endif
#ifndef M_PI_4
#    define M_PI_4 0.785398163397448309615660845819875721
#endif
#ifndef M_TWO_PI
#    define M_TWO_PI (M_PI * 2.0)
#endif
#ifndef M_1_PI
#    define M_1_PI 0.318309886183790671537767526745028724
#endif
#ifndef M_2_PI
#    define M_2_PI 0.636619772367581343075535053490057448
#endif
#ifndef M_SQRT2
#    define M_SQRT2 1.41421356237309504880168872420969808
#endif
#ifndef M_SQRT1_2
#    define M_SQRT1_2 0.707106781186547524400844362104849039
#endif
#ifndef M_LN2
#    define M_LN2 0.69314718055994530941723212145817656
#endif
#ifndef M_LN10
#    define M_LN10 2.30258509299404568401799145468436421
#endif
#ifndef M_E
#    define M_E 2.71828182845904523536028747135266250
#endif
#ifndef M_LOG2E
#    define M_LOG2E 1.44269504088896340735992468100189214
#endif



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// INTEGER HELPER FUNCTIONS
//
// A variety of handy functions that operate on integers.
//

/// Quick test for whether an integer is a power of 2.
///
template<typename T>
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool
ispow2(T x) noexcept
{
    // Numerous references for this bit trick are on the web.  The
    // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
    // all 1 bits for bits < b.
    return (x & (x - 1)) == 0 && (x >= 0);
}
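
// For example, by the test above:
//     ispow2(64)  -> true
//     ispow2(65)  -> false
//     ispow2(0)   -> true   (0 passes the bit trick, though it is not a
//                            mathematical power of 2)
//     ispow2(-4)  -> false  (negative values never pass)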



/// Round up to next higher power of 2 (return x if it's already a power
/// of 2).
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
ceil2(int x) noexcept
{
    // Here's a version with no loops.
    if (x < 0)
        return 0;
    // Subtract 1, then round up to a power of 2, that way if we are
    // already a power of 2, we end up with the same number.
    --x;
    // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    // Now we have 2^n-1, by adding 1, we make it a power of 2 again
    return x + 1;
}



/// Round down to next lower power of 2 (return x if it's already a power
/// of 2).
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
floor2(int x) noexcept
{
    // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    // Strip off all but the high bit, i.e. 00011111 -> 00010000
    // That's the power of two <= the original x
    return x & ~(x >> 1);
}
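
// For example, following the bit-smearing logic above:
//     ceil2(16) -> 16      floor2(16) -> 16
//     ceil2(17) -> 32      floor2(17) -> 16
//     ceil2(0)  -> 0       floor2(0)  -> 0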


// Old names -- DEPRECATED(2.1)
inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }



/// Round value up to the next whole multiple.
/// For example, round_to_multiple(7,10) returns 10.
template <typename V, typename M>
inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
{
    return V (((value + V(multiple) - 1) / V(multiple)) * V(multiple));
}



/// Round up to the next whole multiple of m, for the special case where
/// m is definitely a power of 2 (somewhat simpler than the more general
/// round_to_multiple). This is a template that should work for any
/// integer type.
template<typename T>
inline OIIO_HOSTDEVICE T
round_to_multiple_of_pow2(T x, T m)
{
    OIIO_DASSERT(ispow2(m));
    return (x + m - 1) & (~(m - 1));
}
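
// For example:
//     round_to_multiple(7, 10)         -> 10
//     round_to_multiple(10, 10)        -> 10
//     round_to_multiple_of_pow2(13, 4) -> 16   // (13+3) & ~3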



/// Multiply two unsigned 32-bit ints safely, carefully checking for
/// overflow, and clamping to uint32_t's maximum value.
inline OIIO_HOSTDEVICE uint32_t
clamped_mult32(uint32_t a, uint32_t b)
{
    const uint32_t Err = std::numeric_limits<uint32_t>::max();
    uint64_t r = (uint64_t)a * (uint64_t)b;  // Multiply into a bigger int
    return r < Err ? (uint32_t)r : Err;
}



/// Multiply two unsigned 64-bit ints safely, carefully checking for
/// overflow, and clamping to uint64_t's maximum value.
inline OIIO_HOSTDEVICE uint64_t
clamped_mult64(uint64_t a, uint64_t b)
{
    uint64_t ab = a * b;
    if (b && ab / b != a)
        return std::numeric_limits<uint64_t>::max();
    else
        return ab;
}
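
// For example, clamped_mult32(65536, 65536) would mathematically be 2^32,
// which overflows uint32_t, so it clamps and returns 0xffffffff, whereas
// clamped_mult32(1000, 1000) fits and returns 1000000 exactly.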



/// Bitwise circular rotation left by `s` bits (for any unsigned integer
/// type).  For info on the C++20 std::rotl(), see
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0553r4.html
// FIXME: this should be constexpr, but we're leaving that out for now
// because the Cuda specialization uses an intrinsic that isn't constexpr.
// Come back to this later when more of the Cuda library is properly
// constexpr.
template<class T>
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE
// constexpr
T rotl(T x, int s) noexcept
{
    static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
                  "rotl only works for unsigned integer types");
    return (x << s) | (x >> ((sizeof(T) * 8) - s));
}
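
// For example, rotl(uint32_t(0x80000001), 4) -> 0x00000018. Note that this
// formulation assumes 0 < s < the bit width of T; s == 0 would shift right
// by the full width, which C++ leaves undefined (std::rotl handles it).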


#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
// Cuda has an intrinsic for 32 bit unsigned int rotation
// FIXME: This should be constexpr, but __funnelshift_lc seems not to be
// marked as such.
template<>
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE
// constexpr
uint32_t rotl(uint32_t x, int s) noexcept
{
    return __funnelshift_lc(x, x, s);
}
#endif



// Old names -- DEPRECATED(2.1)
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t
rotl32(uint32_t x, int k)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
    return __funnelshift_lc(x, x, k);
#else
    return (x << k) | (x >> (32 - k));
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t
rotl64(uint64_t x, int k)
{
    return (x << k) | (x >> (64 - k));
}



// safe_mod is like integer a%b, but safely returns 0 when b==0.
template <class T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
safe_mod(T a, T b)
{
    return b ? (a % b) : T(0);
}

// (end of integer helper functions)
////////////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// FLOAT UTILITY FUNCTIONS
//
// Helper/utility functions: clamps, blends, interpolations...
//


/// clamp a to bounds [low,high].
template <class T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
clamp (const T& a, const T& low, const T& high)
{
#if 1
    // This looks clunky, but it generates minimal code. For float, it
    // should result in just a max and min instruction, that's it.
    // This implementation is courtesy of Alex Wells, Intel, via OSL.
    T val = a;
    if (!(low <= val))  // Forces clamp(NaN,low,high) to return low
        val = low;
    if (val > high)
        val = high;
    return val;
#else
    // The naive implementation we originally had, below, only does the 2nd
    // comparison in the else block, which will generate extra code in a
    // SIMD masking scenario. I (LG) can confirm that benchmarks show that
    // the above implementation is indeed faster than the one below, even
    // for non-SIMD use.
    return (a >= low) ? ((a <= high) ? a : high) : low;
#endif
}


#ifndef __CUDA_ARCH__
// Specialization of clamp for vfloat4
template<> OIIO_FORCEINLINE simd::vfloat4
clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vfloat8
clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vfloat16
clamp (const simd::vfloat16& a, const simd::vfloat16& low, const simd::vfloat16& high)
{
    return simd::min (high, simd::max (low, a));
}

// Specialization of clamp for vint4
template<> OIIO_FORCEINLINE simd::vint4
clamp (const simd::vint4& a, const simd::vint4& low, const simd::vint4& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vint8
clamp (const simd::vint8& a, const simd::vint8& low, const simd::vint8& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vint16
clamp (const simd::vint16& a, const simd::vint16& low, const simd::vint16& high)
{
    return simd::min (high, simd::max (low, a));
}
#endif



// For the multiply+add (or sub) operations below, note that the results may
// differ slightly on different hardware, depending on whether true fused
// multiply and add is available or if the code generated just does an old
// fashioned multiply followed by a separate add. So please interpret these
// as "do a multiply and add as fast as possible for this hardware, with at
// least as much precision as a multiply followed by a separate add."

/// Fused multiply and add: (a*b + c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd (float a, float b, float c) {
    // NOTE: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang will do so if -ffp-contract=fast.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b + c;
}


/// Fused multiply and subtract: (a*b - c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b - c;
}



/// Fused negative multiply and add: -(a*b) + c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return c - (a * b);
}



/// Negative fused multiply and subtract: -(a*b) - c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return -(a * b) - c;
}
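
// In all cases the scalar reference behavior is:
//     madd(a, b, c)  == a * b + c
//     msub(a, b, c)  == a * b - c
//     nmadd(a, b, c) == c - a * b
//     nmsub(a, b, c) == -(a * b) - c
// For example, madd(2.0f, 3.0f, 1.0f) -> 7.0f, possibly computed as a
// single fused multiply-add instruction.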



/// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
/// This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
lerp (const T& v0, const T& v1, const Q& x)
{
    // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
    return v0*(Q(1)-x) + v1*x;
}



/// Bilinearly interpolate values v0-v3 (v0 upper left, v1 upper right,
/// v2 lower left, v3 lower right) at coordinates (s,t) and return the
/// result.  This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
{
    // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
    Q s1 = Q(1) - s;
    return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
}
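
// For example, lerp(0.0f, 10.0f, 0.25f) -> 2.5f, and with float corners
// bilerp(0.0f, 1.0f, 2.0f, 3.0f, 0.5f, 0.5f) -> 1.5f, the average of the
// four corner values.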



/// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
/// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
/// storing the results in 'result'.  These are all vectors, so do it
/// for each of 'n' contiguous values (using the same s,t interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
bilerp (const T *v0, const T *v1,
        const T *v2, const T *v3,
        Q s, Q t, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    for (int i = 0;  i < n;  ++i)
        result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
}



/// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
/// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
/// SCALING the interpolated value by 'scale' and then ADDING to
/// 'result'.  These are all vectors, so do it for each of 'n'
/// contiguous values (using the same s,t interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
bilerp_mad (const T *v0, const T *v1,
            const T *v2, const T *v3,
            Q s, Q t, Q scale, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    for (int i = 0;  i < n;  ++i)
        result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
                                  t*(v2[i]*s1 + v3[i]*s)));
}



/// Trilinearly interpolate values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r), and return the
/// result.  This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
{
    // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    Q r1 = Q(1) - r;
    return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
               r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
}



/// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r),
/// storing the results in 'result'.  These are all vectors, so do it
/// for each of 'n' contiguous values (using the same s,t,r interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
         const T *v4, const T *v5, const T *v6, const T *v7,
         Q s, Q t, Q r, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    Q r1 = Q(1) - r;
    for (int i = 0;  i < n;  ++i)
        result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
                        r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
}



/// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r),
/// SCALING the interpolated value by 'scale' and then ADDING to
/// 'result'.  These are all vectors, so do it for each of 'n'
/// contiguous values (using the same s,t,r interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
             const T *v4, const T *v5, const T *v6, const T *v7,
             Q s, Q t, Q r, Q scale, int n, T *result)
{
    Q r1 = Q(1) - r;
    bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
    bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
}



/// Evaluate B-spline weights in w[0..3] for the given fraction.  This
/// is an important component of performing a cubic interpolation.
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
{
    T one_frac = 1 - fraction;
    w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
    w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
    w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
    w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
}


/// Evaluate B-spline derivative weights in w[0..3] for the given
/// fraction.  This is an important component of performing a cubic
/// interpolation with derivatives.
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
{
    T one_frac = 1 - fraction;
    dw[0] = -T(0.5) * one_frac * one_frac;
    dw[1] =  T(0.5) * fraction * (3 * fraction - 4);
    dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
    dw[3] =  T(0.5) * fraction * fraction;
}



/// Bicubically interpolate arrays of pointers arranged in a 4x4 pattern
/// (with val[0] pointing to the data in the upper left corner, val[15]
/// pointing to the lower right) at coordinates (s,t), storing the
/// results in 'result'.  These are all vectors, so do it for each of
/// 'n' contiguous values (using the same s,t interpolants).
template <class T>
inline OIIO_HOSTDEVICE void
bicubic_interp (const T **val, T s, T t, int n, T *result)
{
    for (int c = 0;  c < n;  ++c)
        result[c] = T(0);
    T wx[4]; evalBSplineWeights (wx, s);
    T wy[4]; evalBSplineWeights (wy, t);
    for (int j = 0;  j < 4;  ++j) {
        for (int i = 0;  i < 4;  ++i) {
            T w = wx[i] * wy[j];
            for (int c = 0;  c < n;  ++c)
                result[c] += w * val[j*4+i][c];
        }
    }
}



/// Return floor(x) cast to an int.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int
ifloor (float x)
{
    return (int)floorf(x);
}



/// Return (x-floor(x)) and put (int)floor(x) in *xint.  This is similar
/// to the built-in modf, but returns a true int, always rounds down
/// (compared to modf which rounds toward 0), and always returns
/// frac >= 0 (compared to modf which can return <0 if x<0).
inline OIIO_HOSTDEVICE float
floorfrac (float x, int *xint)
{
#if 1
    float f = std::floor(x);
    *xint = int(f);
    return x - f;
#else /* vvv This idiom is slower */
    int i = ifloor(x);
    *xint = i;
    return x - static_cast<float>(i);   // Return the fraction left over
#endif
}
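
// For example, floorfrac(-1.25f, &i) returns 0.75f and sets i = -2, whereas
// std::modf(-1.25f, ...) would yield a fraction of -0.25 with integral
// part -1.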


#ifndef __CUDA_ARCH__
inline simd::vfloat4 floorfrac (const simd::vfloat4& x, simd::vint4 *xint) {
    simd::vfloat4 f = simd::floor(x);
    *xint = simd::vint4(f);
    return x - f;
}

inline simd::vfloat8 floorfrac (const simd::vfloat8& x, simd::vint8 *xint) {
    simd::vfloat8 f = simd::floor(x);
    *xint = simd::vint8(f);
    return x - f;
}

inline simd::vfloat16 floorfrac (const simd::vfloat16& x, simd::vint16 *xint) {
    simd::vfloat16 f = simd::floor(x);
    *xint = simd::vint16(f);
    return x - f;
}
#endif




/// Convert degrees to radians.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }

/// Convert radians to degrees
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }


/// Faster floating point negation, in cases where you aren't too picky
/// about the difference between +0 and -0. (Courtesy of Alex Wells, Intel,
/// via code in OSL.)
///
/// Beware: fast_neg(0.0f) returns 0.0f, NOT -0.0f. All other values,
/// including -0.0 and +/- Inf, work identically to `-x`. For many use
/// cases, that's fine. (When was the last time you wanted a -0.0f anyway?)
///
/// The reason it's faster is that `-x` (for float x) is implemented by
/// compilers by xor'ing x against a bitmask that flips the sign bit, and
/// that bitmask had to come from somewhere -- memory! -- which can be
/// expensive, depending on whether/where it is in cache. But `0 - x`
/// generates a zero in a register (with xor, no memory access needed) and
/// then subtracts, so that's sometimes faster because there is no memory
/// read. This works for SIMD types as well!
///
/// It's also safe (though pointless) to use fast_neg for integer types,
/// where both `-x` and `0-x` are implemented as a `neg` instruction.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
fast_neg(const T &x) {
    return T(0) - x;
}



inline OIIO_HOSTDEVICE void
sincos (float x, float* sine, float* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    __builtin_sincosf(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Explicitly select the single-precision CUDA library function
    sincosf(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}

inline OIIO_HOSTDEVICE void
sincos (double x, double* sine, double* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    __builtin_sincos(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Explicit use of the global namespace resolves to the CUDA library
    // function and avoids infinite recursion
    ::sincos(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}


inline OIIO_HOSTDEVICE float sign (float x)
{
    return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
}




// (end of float helper functions)
////////////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// CONVERSION
//
// Type and range conversion helper functions and classes.


template <typename IN_TYPE, typename OUT_TYPE>
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) {
    // NOTE: this is the only standards compliant way of doing this type of casting,
    // luckily the compilers we care about know how to optimize away this idiom.
    static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE),
                  "bit_cast must be between objects of the same size");
    OUT_TYPE out;
    memcpy ((void *)&out, &in, sizeof(IN_TYPE));
    return out;
}
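
// For example, on a platform with IEEE-754 floats:
//     bit_cast<float, uint32_t>(1.0f)       -> 0x3f800000
//     bit_cast<uint32_t, float>(0x40490fdb) -> 3.14159274f (pi, approximately)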

#if defined(__INTEL_COMPILER)
    // On x86/x86_64 for certain compilers we can use Intel CPU intrinsics
    // for some common bit_cast cases that might be even more understandable
    // to the compiler and generate better code without its getting confused
    // about the memcpy in the general case.
    // FIXME: The intrinsics are not in clang <= 9 nor gcc <= 9.1. Check
    // future releases.
    template<> OIIO_FORCEINLINE uint32_t bit_cast<float, uint32_t> (const float val) {
          return static_cast<uint32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE int32_t bit_cast<float, int32_t> (const float val) {
          return static_cast<int32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE float bit_cast<uint32_t, float> (const uint32_t val) {
          return _castu32_f32(val);
    }
    template<> OIIO_FORCEINLINE float bit_cast<int32_t, float> (const int32_t val) {
          return _castu32_f32(val);
    }
    template<> OIIO_FORCEINLINE uint64_t bit_cast<double, uint64_t> (const double val) {
          return static_cast<uint64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE int64_t bit_cast<double, int64_t> (const double val) {
          return static_cast<int64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE double bit_cast<uint64_t, double> (const uint64_t val) {
          return _castu64_f64(val);
    }
    template<> OIIO_FORCEINLINE double bit_cast<int64_t, double> (const int64_t val) {
          return _castu64_f64(val);
    }
#endif


OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int (float x) {
    return bit_cast<float,int>(x);
}
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float (int x) {
    return bit_cast<int,float>(x);
}



/// Change endian-ness of one or more data items that are each 2, 4,
/// or 8 bytes.  This should work for any of short, unsigned short, int,
/// unsigned int, float, long long, pointers.
template<class T>
inline OIIO_HOSTDEVICE void
swap_endian (T *f, int len=1)
{
    for (char *c = (char *) f;  len--;  c += sizeof(T)) {
        if (sizeof(T) == 2) {
            std::swap (c[0], c[1]);
        } else if (sizeof(T) == 4) {
            std::swap (c[0], c[3]);
            std::swap (c[1], c[2]);
        } else if (sizeof(T) == 8) {
            std::swap (c[0], c[7]);
            std::swap (c[1], c[6]);
            std::swap (c[2], c[5]);
            std::swap (c[3], c[4]);
        }
    }
}
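
// For example:
//     uint32_t x = 0x11223344;
//     swap_endian(&x);        // x is now 0x44332211
//     uint16_t v[2] = { 0x0102, 0x0304 };
//     swap_endian(v, 2);      // v is now { 0x0201, 0x0403 }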



// big_enough_float<T>::float_t is a floating-point type big enough to
// handle the range and precision of a <T>. It's a float, unless T is big.
template <typename T> struct big_enough_float    { typedef float float_t; };
template<> struct big_enough_float<int>          { typedef double float_t; };
template<> struct big_enough_float<unsigned int> { typedef double float_t; };
template<> struct big_enough_float<int64_t>      { typedef double float_t; };
template<> struct big_enough_float<uint64_t>     { typedef double float_t; };
template<> struct big_enough_float<double>       { typedef double float_t; };


/// Multiply src by scale, clamp to [min,max], and round to the nearest D
/// (presumed to be integer).  This is just a helper for the convert_type
/// templates, it probably has no other use.
template<typename S, typename D, typename F>
inline OIIO_HOSTDEVICE D
scaled_conversion(const S& src, F scale, F min, F max)
{
    if (std::numeric_limits<S>::is_signed) {
        F s = src * scale;
        s += (s < 0 ? (F)-0.5 : (F)0.5);
        return (D)clamp(s, min, max);
    } else {
        return (D)clamp((F)src * scale + (F)0.5, min, max);
    }
}



/// Convert n consecutive values from the type of S to the type of D.
/// The conversion is not a simple cast, but correctly remaps the
/// 0.0->1.0 range from and to the full positive range of integral
/// types.  Take a memcpy shortcut if both types are the same and no
/// conversion is necessary.  Optional arguments can give nonstandard
/// quantizations.
//
// FIXME: make table-based specializations for common types with only a
// few possible src values (like unsigned char -> float).
template<typename S, typename D>
void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
{
    if (std::is_same<S,D>::value) {
        // They must be the same type.  Just memcpy.
        memcpy (dst, src, n*sizeof(D));
        return;
    }
    typedef typename big_enough_float<D>::float_t F;
    F scale = std::numeric_limits<S>::is_integer ?
        (F(1)) / F(std::numeric_limits<S>::max()) : F(1);
    if (std::numeric_limits<D>::is_integer) {
        // Converting to an integer-like type.
        F min = (F)_min;  // std::numeric_limits<D>::min();
        F max = (F)_max;  // std::numeric_limits<D>::max();
        scale *= _max;
        // Unroll loop for speed
        for ( ; n >= 16; n -= 16) {
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
        }
        while (n--)
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
    } else {
        // Converting to a float-like type, so we don't need to remap
        // the range
        // Unroll loop for speed
        for ( ; n >= 16; n -= 16) {
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
        }
        while (n--)
            *dst++ = (D)((*src++) * scale);
    }
}


#ifndef __CUDA_ARCH__
template<>
inline void convert_type<uint8_t,float> (const uint8_t *src,
                                         float *dst, size_t n,
                                         float /*_min*/, float /*_max*/)
{
    float scale (1.0f/std::numeric_limits<uint8_t>::max());
    simd::vfloat4 scale_simd (scale);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        simd::vfloat4 d_simd = s_simd * scale_simd;
        d_simd.store (dst);
    }
    while (n--)
        *dst++ = (*src++) * scale;
}



template<>
inline void convert_type<uint16_t,float> (const uint16_t *src,
                                          float *dst, size_t n,
                                          float /*_min*/, float /*_max*/)
{
    float scale (1.0f/std::numeric_limits<uint16_t>::max());
    simd::vfloat4 scale_simd (scale);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        simd::vfloat4 d_simd = s_simd * scale_simd;
        d_simd.store (dst);
    }
    while (n--)
        *dst++ = (*src++) * scale;
}


#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
template<>
inline void convert_type<half,float> (const half *src,
                                      float *dst, size_t n,
                                      float /*_min*/, float /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s_simd (src);
        s_simd.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        s_simd.store (dst);
    }
#endif
    while (n--)
        *dst++ = (*src++);
}
#endif



template<>
inline void
convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
                              uint16_t /*_min*/, uint16_t /*_max*/)
{
    float min = std::numeric_limits<uint16_t>::min();
    float max = std::numeric_limits<uint16_t>::max();
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
}


template<>
inline void
convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
                             uint8_t /*_min*/, uint8_t /*_max*/)
{
    float min = std::numeric_limits<uint8_t>::min();
    float max = std::numeric_limits<uint8_t>::max();
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
}


#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
template<>
inline void
convert_type<float,half> (const float *src, half *dst, size_t n,
                          half /*_min*/, half /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s (src);
        s.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s (src);
        s.store (dst);
    }
#endif
    while (n--)
        *dst++ = *src++;
}
#endif
#endif



template<typename S, typename D>
inline void convert_type (const S *src, D *dst, size_t n)
{
    convert_type<S,D> (src, dst, n,
                       std::numeric_limits<D>::min(),
                       std::numeric_limits<D>::max());
}




/// Convert a single value from the type of S to the type of D.
/// The conversion is not a simple cast, but correctly remaps the
/// 0.0->1.0 range from and to the full positive range of integral
/// types.  Take a copy shortcut if both types are the same and no
/// conversion is necessary.
template<typename S, typename D>
inline D
convert_type (const S &src)
{
    if (std::is_same<S,D>::value) {
        // They must be the same type.  Just return it.
        return (D)src;
    }
    typedef typename big_enough_float<D>::float_t F;
    F scale = std::numeric_limits<S>::is_integer ?
        F(1) / F(std::numeric_limits<S>::max()) : F(1);
    if (std::numeric_limits<D>::is_integer) {
        // Converting to an integer-like type.
        F min = (F) std::numeric_limits<D>::min();
        F max = (F) std::numeric_limits<D>::max();
        scale *= max;
        return scaled_conversion<S,D,F> (src, scale, min, max);
    } else {
        // Converting to a float-like type, so we don't need to remap
        // the range
        return (D)((F)src * scale);
    }
}
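
// For example, with the standard numeric_limits quantization:
//     convert_type<uint8_t, float>((uint8_t)255)    -> 1.0f
//     convert_type<float, uint8_t>(0.5f)            -> 128   (0.5*255, rounded)
//     convert_type<uint8_t, uint16_t>((uint8_t)255) -> 65535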



/// Helper function to convert channel values between different bit depths.
/// Roughly equivalent to:
///
/// out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
///
/// but utilizes an integer math trick for speed. It can be proven that the
/// absolute error of this method is less than or equal to 1, with an
/// average error (with respect to the entire domain) below 0.2.
///
/// It is assumed that the original value is a valid FROM_BITS integer, i.e.
/// shifted fully to the right.
template<unsigned int FROM_BITS, unsigned int TO_BITS>
inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
    unsigned int out = 0;
    int shift = TO_BITS - FROM_BITS;
    for (; shift > 0; shift -= FROM_BITS)
        out |= in << shift;
    out |= in >> -shift;
    return out;
}
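
// The trick replicates the source bits downward, for example:
//     bit_range_convert<8,16>(0xff) -> 0xffff
//     bit_range_convert<2,8>(0x3)   -> 0xff   (0b11 -> 0b11111111)
//     bit_range_convert<8,4>(0xff)  -> 0xf    (negative shift: just in >> 4)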



// non-templated version.  Slow but general
inline OIIO_HOSTDEVICE unsigned int
bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
{
    unsigned int out = 0;
    int shift = TO_BITS - FROM_BITS;
    for (; shift > 0; shift -= FROM_BITS)
        out |= in << shift;
    out |= in >> -shift;
    return out;
}



/// Append the `n` LSB bits of `val` into a bit string `T out[]`, where the
/// `filled` MSB bits of `*out` are already filled in. Increment `out` and
/// adjust `filled` as required. Type `T` should be uint8_t, uint16_t, or
/// uint32_t.
template<typename T>
inline void
bitstring_add_n_bits (T* &out, int& filled, uint32_t val, int n)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bitstring_add_n_bits must be unsigned int 8/16/32");
    const int Tbits = sizeof(T) * 8;
    // val:         | don't care     | copy me    |
    //                                <- n bits ->
    //
    // *out:        | don't touch |   fill in here       |
    //               <- filled  -> <- (Tbits - filled) ->
    while (n > 0) {
        // Make sure val doesn't have any cruft in bits > n
        val &= ~(0xffffffff << n);
        // Initialize any new byte we're filling in
        if (filled == 0)
            *out = 0;
        // How many bits can we fill in `*out` without having to increment
        // to the next byte?
        int bits_left_in_out = Tbits - filled;
        int b = 0;   // bit pattern to 'or' with *out
        int nb = 0;  // number of bits to add
        if (n <= bits_left_in_out) { // can fit completely in this byte
            b = val << (bits_left_in_out - n);
            nb = n;
        } else { // n > bits_left_in_out, will spill to next byte
            b = val >> (n - bits_left_in_out);
            nb = bits_left_in_out;
        }
        *out |= b;
        filled += nb;
        OIIO_DASSERT (filled <= Tbits);
        n -= nb;
        if (filled == Tbits) {
            ++out;
            filled = 0;
        }
    }
}



/// Pack values from `T in[0..n-1]` (where `T` is expected to be a uint8,
/// uint16, or uint32) into successive raw outbits-bit pieces of `out[]`,
/// where outbits is expected to be less than the number of bits in a `T`.
template<typename T>
inline void
bit_pack(cspan<T> data, void* out, int outbits)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bit_pack must be unsigned int 8/16/32");
    unsigned char* outbuffer = (unsigned char*)out;
    int filled = 0;
    for (size_t i = 0, e = data.size(); i < e; ++i)
        bitstring_add_n_bits (outbuffer, filled, data[i], outbits);
}
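
// For example, packing four 10-bit values into 5 bytes (a sketch; the
// locals `data` and `packed` here are hypothetical):
//     uint16_t data[4] = { 0x3ff, 0, 0x2aa, 1 };
//     unsigned char packed[5];
//     bit_pack (cspan<uint16_t>(data, 4), packed, 10);
// Each value contributes its 10 LSBs, MSB-first, spanning byte boundaries.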



/// Decode n packed inbits-bits values from in[...] into normal uint8,
/// uint16, or uint32 representation of `T out[0..n-1]`. In other words,
/// each successive `inbits` of `in` (allowing spanning of byte boundaries)
/// will be stored in a successive out[i].
template<typename T>
inline void
bit_unpack(int n, const unsigned char* in, int inbits, T* out)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bit_unpack must be unsigned int 8/16/32");
    OIIO_DASSERT(inbits >= 1 && inbits < 32);  // surely bugs if not
    // int highest = (1 << inbits) - 1;
    int B = 0, b = 0;
    // Invariant:
    // So far, we have used in[0..B-1] and the high b bits of in[B].
    for (int i = 0; i < n; ++i) {
        long long val = 0;
        int valbits   = 0;  // bits so far we've accumulated in val
        while (valbits < inbits) {
            // Invariant: we have already accumulated valbits of the next
            // needed value (of a total of inbits), living in the valbits
            // low bits of val.
            int out_left = inbits - valbits;  // How much more we still need
            int in_left  = 8 - b;             // Bits still available in in[B].
            if (in_left <= out_left) {
                // Eat the rest of this byte:
                //   |---------|--------|
                //        b      in_left
                val <<= in_left;
                val |= in[B] & ~(0xffffffff << in_left);
                ++B;
                b = 0;
                valbits += in_left;
            } else {
                // Eat just the bits we need:
                //   |--|---------|-----|
                //    b  out_left  extra
                val <<= out_left;
                int extra = 8 - b - out_left;
                val |= (in[B] >> extra) & ~(0xffffffff << out_left);
                b += out_left;
                valbits = inbits;
            }
        }
        out[i] = val; //T((val * 0xff) / highest);
    }
}



/// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
/// and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct DataProxy {
    DataProxy (I &data) : m_data(data) { }
    E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
    operator E () const { return convert_type<I,E>(m_data); }
private:
    DataProxy& operator = (const DataProxy&); // Do not implement
    I &m_data;
};



/// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
/// a (const I &) and does conversions (via convert_type) as it reads.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct ConstDataProxy {
    ConstDataProxy (const I &data) : m_data(data) { }
    operator E () const { return convert_type<I,E>(*m_data); }
private:
    const I &m_data;
};



/// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
/// and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct DataArrayProxy {
    DataArrayProxy (I *data=NULL) : m_data(data) { }
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
    void set (I *data) { m_data = data; }
    I * get () const { return m_data; }
    const DataArrayProxy<I,E> & operator+= (int i) {
        m_data += i;  return *this;
    }
private:
    I *m_data;
};



/// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
/// (I *) and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct ConstDataArrayProxy {
    ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    void set (const I *data) { m_data = data; }
    const I * get () const { return m_data; }
    const ConstDataArrayProxy<I,E> & operator+= (int i) {
        m_data += i;  return *this;
    }
private:
    const I *m_data;
};



/// Fast table-based conversion of 8-bit to other types.  Declare this
/// as static to avoid the expensive ctor being called all the time.
template <class T=float>
class EightBitConverter {
public:
    EightBitConverter () noexcept { init(); }
    T operator() (unsigned char c) const noexcept { return val[c]; }
private:
    T val[256];
    void init () noexcept {
        float scale = 1.0f / 255.0f;
        if (std::numeric_limits<T>::is_integer)
            scale *= (float)std::numeric_limits<T>::max();
        for (int i = 0;  i < 256;  ++i)
            val[i] = (T)(i * scale);
    }
};



/// Simple conversion of a (presumably non-negative) float into a
/// rational.  This does not attempt to find the simplest fraction
/// that approximates the float, for example 52.83 will simply
/// return 5283/100.  This does not attempt to gracefully handle
/// floats that are out of range that could be easily int/int.
inline OIIO_HOSTDEVICE void
float_to_rational (float f, unsigned int &num, unsigned int &den)
{
    if (f <= 0) {   // Trivial case of zero, and handle all negative values
        num = 0;
        den = 1;
    } else if ((int)(1.0/f) == (1.0/f)) { // Exact results for perfect inverses
        num = 1;
        den = (int)f;
    } else {
        num = (int)f;
        den = 1;
        while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
            den *= 10;
            f *= 10;
            num = (int)f;
        }
    }
}



/// Simple conversion of a float into a rational.  This does not attempt
/// to find the simplest fraction that approximates the float, for
/// example 52.83 will simply return 5283/100.  This does not attempt to
/// gracefully handle floats that are out of range that could be easily
/// int/int.
inline OIIO_HOSTDEVICE void
float_to_rational (float f, int &num, int &den)
{
    unsigned int n, d;
    float_to_rational (fabsf(f), n, d);
    num = (f >= 0) ? (int)n : -(int)n;
    den = (int) d;
}


// (end of conversion helpers)
////////////////////////////////////////////////////////////////////////////




////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// SAFE MATH
//
// The functions named "safe_*" are versions with various internal clamps
// or other deviations from IEEE standards with the specific intent of
// never producing NaN or Inf values or throwing exceptions. But within the
// valid range, they should be full precision and match IEEE standards.
//


/// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt (T x) {
    return x >= T(0) ? std::sqrt(x) : T(0);
}

/// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt (T x) {
    return x > T(0) ? T(1) / std::sqrt(x) : T(0);
}


/// Safe (clamping) arcsine: clamp to the valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin (T x) {
    if (x <= T(-1)) return T(-M_PI_2);
    if (x >= T(+1)) return T(+M_PI_2);
    return std::asin(x);
}

/// Safe (clamping) arccosine: clamp to the valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos (T x) {
    if (x <= T(-1)) return T(M_PI);
    if (x >= T(+1)) return T(0);
    return std::acos(x);
}


/// Safe log2: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2 (T x) {
    // match clamping from fast version
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    return std::log2(x);
}

/// Safe log: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log (T x) {
    // slightly different than fast version since clamping happens before scaling
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    return std::log(x);
}

/// Safe log10: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10 (T x) {
    // slightly different than fast version since clamping happens before scaling
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    // Use std::log10 (not log10f) so that double instantiations keep full
    // precision.
    return std::log10(x);
}

/// Safe logb: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb (T x) {
    return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
}

/// Safe pow: clamp the domain so it never returns Inf or NaN or has divide
/// by zero error.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow (T x, T y) {
    if (y == T(0)) return T(1);
    if (x == T(0)) return T(0);
    // if x is negative, only deal with integer powers
    if ((x < T(0)) && (y != floor(y))) return T(0);
    // FIXME: this does not match "fast" variant because clamping limits are different
    T r = std::pow(x, y);
    // Clamp to avoid returning Inf.
    const T big = std::numeric_limits<T>::max();
    return clamp (r, -big, big);
}
1470 
1471 
1472 /// Safe fmod: guard against b==0.0 (in which case, return 0.0). Also, it
1473 /// seems that this is much faster than std::fmod!
1474 OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod (float a, float b)
1475 {
1476     if (OIIO_LIKELY(b != 0.0f)) {
1477 #if 0
        // std::fmod was getting called serially instead of vectorizing, so
        // we will just do the calculation ourselves (see below).
        return std::fmod (a,b);
1481 #else
1482         // This formulation is equivalent but much faster in our benchmarks,
1483         // also vectorizes better.
1484         // The floating-point remainder of the division operation
1485         // a/b is a - N*b, where N = a/b with its fractional part truncated.
1486         int N = static_cast<int>(a/b);
1487         return a - N*b;
1488 #endif
1489     }
1490     return 0.0f;
1491 }
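
// For example: safe_fmod(7.5f, 2.0f) computes N = (int)(7.5f/2.0f) = 3 and
// returns 7.5f - 3*2.0f = 1.5f (matching std::fmod), while safe_fmod(x, 0.0f)
// simply returns 0.0f rather than NaN.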
1492 
1493 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1494 
1495 // (end of safe_* functions)
1496 ////////////////////////////////////////////////////////////////////////////
1497 
1498 
1499 
1500 ////////////////////////////////////////////////////////////////////////////
1501 ////////////////////////////////////////////////////////////////////////////
1502 //
1503 // FAST & APPROXIMATE MATH
1504 //
// The functions named "fast_*" provide a set of replacements for libm
// functions that are much faster, at the expense of some accuracy and
// robust handling of extreme values. One design goal for these
// approximations was to avoid branches as much as possible and to operate
// on single-precision values only, so that SIMD versions should be
// straightforward ports. We also try to implement "safe" semantics (i.e.,
// clamp to the valid input range where possible) natively, since wrapping
// these inline calls in another layer would be wasteful.
1513 //
// Not only may the "fast_*" functions differ slightly in value from the
// std versions of these functions, but we also do not guarantee that their
// results will exactly match from platform to platform. This is because if
// a particular platform (HW, library, or compiler) provides an intrinsic
// that is even faster than our usual "fast" implementation, we may
// substitute it.
1520 //
// Some functions are named fast_safe_*; these are both faster
// approximations and have clamped input domains to ensure no NaN, Inf, or
// divide by zero.
1523 //
1524 // NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
1525 //     is influenced by compiler options (-ffast-math for clang/gcc,
1526 //     --use-fast-math for NVCC).
1527 //
1528 // TODO: Quantify the performance and accuracy of the CUDA Math functions
1529 //       relative to the definitions in this file. It may be better in some
1530 //       cases to use the approximate versions defined below.
1531 
1532 
1533 /// Round to nearest integer, returning as an int.
1534 /// Note that this differs from std::rint, which returns a float; it's more
1535 /// akin to std::lrint, which returns a long (int). Sorry for the naming
1536 /// confusion.
1537 OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint (float x) {
1538     // used by sin/cos/tan range reduction
1539 #if OIIO_SIMD_SSE >= 4
1540     // single roundps instruction on SSE4.1+ (for gcc/clang at least)
1541     return static_cast<int>(std::rint(x));
1542 #else
    // emulate rounding by adding/subtracting 0.5
1544     return static_cast<int>(x + copysignf(0.5f, x));
1545 #endif
1546 }
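
// For example: fast_rint(2.4f) == 2 and fast_rint(-2.6f) == -3. Note that
// exact ties (e.g. 2.5f) may round differently between the SSE4.1 path
// (round-to-nearest-even) and the add-0.5 fallback (rounds away from zero).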
1547 
1548 #ifndef __CUDA_ARCH__
1549 OIIO_FORCEINLINE simd::vint4 fast_rint (const simd::vfloat4& x) {
1550     return simd::rint (x);
1551 }
1552 #endif
1553 
1554 
1555 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin (float x) {
1556 #ifndef __CUDA_ARCH__
1557     // very accurate argument reduction from SLEEF
1558     // starts failing around x=262000
1559     // Results on: [-2pi,2pi]
1560     // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
1561     int q = fast_rint (x * float(M_1_PI));
1562     float qf = float(q);
1563     x = madd(qf, -0.78515625f*4, x);
1564     x = madd(qf, -0.00024187564849853515625f*4, x);
1565     x = madd(qf, -3.7747668102383613586e-08f*4, x);
1566     x = madd(qf, -1.2816720341285448015e-12f*4, x);
1567     x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1568     float s = x * x;
1569     if ((q & 1) != 0) x = -x;
1570     // this polynomial approximation has very low error on [-pi/2,+pi/2]
1571     // 1.19209e-07 max error in total over [-2pi,+2pi]
1572     float u = 2.6083159809786593541503e-06f;
1573     u = madd(u, s, -0.0001981069071916863322258f);
1574     u = madd(u, s, +0.00833307858556509017944336f);
1575     u = madd(u, s, -0.166666597127914428710938f);
1576     u = madd(s, u * x, x);
1577     // For large x, the argument reduction can fail and the polynomial can
    // be evaluated with arguments outside the valid interval. Just clamp
1579     // the bad values away.
1580     return clamp(u, -1.0f, 1.0f);
1581 #else
1582     return __sinf(x);
1583 #endif
1584 }
1585 
1586 
1587 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos (float x) {
1588 #ifndef __CUDA_ARCH__
1589     // same argument reduction as fast_sin
1590     int q = fast_rint (x * float(M_1_PI));
1591     float qf = float(q);
1592     x = madd(qf, -0.78515625f*4, x);
1593     x = madd(qf, -0.00024187564849853515625f*4, x);
1594     x = madd(qf, -3.7747668102383613586e-08f*4, x);
1595     x = madd(qf, -1.2816720341285448015e-12f*4, x);
1596     x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1597     float s = x * x;
1598     // polynomial from SLEEF's sincosf, max error is
1599     // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
1600     float u = -2.71811842367242206819355e-07f;
1601     u = madd(u, s, +2.47990446951007470488548e-05f);
1602     u = madd(u, s, -0.00138888787478208541870117f);
1603     u = madd(u, s, +0.0416666641831398010253906f);
1604     u = madd(u, s, -0.5f);
1605     u = madd(u, s, +1.0f);
1606     if ((q & 1) != 0) u = -u;
1607     return clamp(u, -1.0f, 1.0f);
1608 #else
1609     return __cosf(x);
1610 #endif
1611 }
1612 
1613 
1614 OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
1615 #ifndef __CUDA_ARCH__
1616     // same argument reduction as fast_sin
1617     int q = fast_rint (x * float(M_1_PI));
1618     float qf = float(q);
1619     x = madd(qf, -0.78515625f*4, x);
1620     x = madd(qf, -0.00024187564849853515625f*4, x);
1621     x = madd(qf, -3.7747668102383613586e-08f*4, x);
1622     x = madd(qf, -1.2816720341285448015e-12f*4, x);
1623     x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1624     float s = x * x;
1625     // NOTE: same exact polynomials as fast_sin and fast_cos above
1626     if ((q & 1) != 0) x = -x;
1627     float su = 2.6083159809786593541503e-06f;
1628     su = madd(su, s, -0.0001981069071916863322258f);
1629     su = madd(su, s, +0.00833307858556509017944336f);
1630     su = madd(su, s, -0.166666597127914428710938f);
1631     su = madd(s, su * x, x);
1632     float cu = -2.71811842367242206819355e-07f;
1633     cu = madd(cu, s, +2.47990446951007470488548e-05f);
1634     cu = madd(cu, s, -0.00138888787478208541870117f);
1635     cu = madd(cu, s, +0.0416666641831398010253906f);
1636     cu = madd(cu, s, -0.5f);
1637     cu = madd(cu, s, +1.0f);
1638     if ((q & 1) != 0) cu = -cu;
    *sine   = clamp(su, -1.0f, 1.0f);
    *cosine = clamp(cu, -1.0f, 1.0f);
1641 #else
1642     __sincosf(x, sine, cosine);
1643 #endif
1644 }
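
// Usage sketch: when both the sine and cosine of an angle are needed,
// fast_sincos shares one argument reduction and is cheaper than separate
// fast_sin + fast_cos calls:
//
//     float s, c;
//     fast_sincos (0.3f, &s, &c);  // s == fast_sin(0.3f), c == fast_cos(0.3f)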
1645 
// NOTE: this approximation is only valid on [-8192.0,+8192.0]; it starts
// becoming really poor outside of this range because the reciprocal
// amplifies errors.
1648 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan (float x) {
1649 #ifndef __CUDA_ARCH__
1650     // derived from SLEEF implementation
1651     // note that we cannot apply the "denormal crush" trick everywhere because
1652     // we sometimes need to take the reciprocal of the polynomial
1653     int q = fast_rint (x * float(2 * M_1_PI));
1654     float qf = float(q);
1655     x = madd(qf, -0.78515625f*2, x);
1656     x = madd(qf, -0.00024187564849853515625f*2, x);
1657     x = madd(qf, -3.7747668102383613586e-08f*2, x);
1658     x = madd(qf, -1.2816720341285448015e-12f*2, x);
1659     if ((q & 1) == 0)
1660         x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
1661     float s = x * x;
1662     float u = 0.00927245803177356719970703f;
1663     u = madd(u, s, 0.00331984995864331722259521f);
1664     u = madd(u, s, 0.0242998078465461730957031f);
1665     u = madd(u, s, 0.0534495301544666290283203f);
1666     u = madd(u, s, 0.133383005857467651367188f);
1667     u = madd(u, s, 0.333331853151321411132812f);
1668     u = madd(s, u * x, x);
1669     if ((q & 1) != 0)
1670         u = -1.0f / u;
1671     return u;
1672 #else
1673     return __tanf(x);
1674 #endif
1675 }
1676 
1677 /// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
1678 /// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
1679 /// Note that this is MUCH faster, but much less accurate than fast_sin.
1680 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi (float x)
1681 {
1682 #ifndef __CUDA_ARCH__
    // Fast trick to strip the integral part off, so our domain is [-1,1]
    const float z = x - ((x + 25165824.0f) - 25165824.0f);
1685     const float y = z - z * fabsf(z);
1686     const float Q = 3.10396624f;
1687     const float P = 3.584135056f; // P = 16-4*Q
1688     return y * (Q + P * fabsf(y));
    /* The original article used inferior constants for Q and P, and
1690      * so had max error 1.091e-3.
1691      *
1692      * The optimal value for Q was determined by exhaustive search, minimizing
1693      * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
1694      * over the interval [0,2] (which is where most of the invocations happen).
1695      *
1696      * The basic idea of this approximation starts with the coarse approximation:
1697      *      sin(pi*x) ~= f(x) =  4 * (x - x * abs(x))
1698      *
     * This approximation always _over_estimates the target. On the other hand, the
1700      * curve:
1701      *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
1702      *
1703      * always lies _under_ the target. Thus we can simply numerically search for the
1704      * optimal constant to LERP these curves into a more precise approximation.
1705      * After folding the constants together and simplifying the resulting math, we
1706      * end up with the compact implementation below.
1707      *
1708      * NOTE: this function actually computes sin(x * pi) which avoids one or two
1709      * mults in many cases and guarantees exact values at integer periods.
1710      */
1711 #else
1712     return sinpif(x);
1713 #endif
1714 }
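
// Spot checks of the formulation above: fast_sinpi is exact at integer
// periods (e.g. fast_sinpi(1.0f) == 0.0f, since z becomes exactly 0) and
// very close at quarter points (fast_sinpi(0.5f) ~= 1.0f).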
1715 
1716 /// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
1717 /// Note that this is MUCH faster, but much less accurate than fast_cos.
1718 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi (float x)
1719 {
1720 #ifndef __CUDA_ARCH__
1721     return fast_sinpi (x+0.5f);
1722 #else
1723     return cospif(x);
1724 #endif
1725 }
1726 
1727 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos (float x) {
1728 #ifndef __CUDA_ARCH__
1729     const float f = fabsf(x);
1730     const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1731     // based on http://www.pouet.net/topic.php?which=9132&page=2
1732     // 85% accurate (ulp 0)
1733     // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
1734     // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
1735     const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1736     return x < 0 ? float(M_PI) - a : a;
1737 #else
1738     return acosf(x);
1739 #endif
1740 }
1741 
1742 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin (float x) {
1743 #ifndef __CUDA_ARCH__
1744     // based on acosf approximation above
1745     // max error is 4.51133e-05 (ulps are higher because we are consistently off by a little amount)
1746     const float f = fabsf(x);
1747     const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1748     const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1749     return copysignf(a, x);
1750 #else
1751     return asinf(x);
1752 #endif
1753 }
1754 
1755 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan (float x) {
1756 #ifndef __CUDA_ARCH__
1757     const float a = fabsf(x);
1758     const float k = a > 1.0f ? 1 / a : a;
1759     const float s = 1.0f - (1.0f - k); // crush denormals
1760     const float t = s * s;
1761     // http://mathforum.org/library/drmath/view/62672.html
1762     // the coefficients were tuned in mathematica with the assumption that we want atan(1)=pi/4
1763     // (slightly higher error but no discontinuities)
1764     // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error      // (with  denormals)
1765     // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
1766     float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1767     if (a > 1.0f) r = 1.570796326794896557998982f - r;
1768     return copysignf(r, x);
1769 #else
1770     return atanf(x);
1771 #endif
1772 }
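
// Spot check: by construction of the coefficients above, fast_atan(1.0f)
// returns approximately pi/4 (0.7853981f), so there is no discontinuity at
// |x| == 1 where the reciprocal branch takes over.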
1773 
1774 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2 (float y, float x) {
1775 #ifndef __CUDA_ARCH__
1776     // based on atan approximation above
1777     // the special cases around 0 and infinity were tested explicitly
1778     // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
1779     const float a = fabsf(x);
1780     const float b = fabsf(y);
1781     bool b_is_greater_than_a = b > a;
1782 
1783 #if OIIO_FMATH_SIMD_FRIENDLY
1784     // When applying to all lanes in SIMD, we end up doing extra masking and
    // 2 divides. So let's just do 1 divide and swap the parameters instead.
    // And since we are going to do a divide anyway, when a == b the result
    // should be 1.0f, so let's not bother special-casing it.
1788     float sa = b_is_greater_than_a ? b : a;
1789     float sb = b_is_greater_than_a ? a : b;
1790     const float k = (b == 0) ? 0.0f : sb/sa;
1791 #else
1792     const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
1793 #endif
1794 
1795     const float s = 1.0f - (1.0f - k); // crush denormals
1796     const float t = s * s;
1797 
1798     float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1799 
1800     if (b_is_greater_than_a)
1801         r = 1.570796326794896557998982f - r; // account for arg reduction
1802     // TODO:  investigate if testing x < 0.0f is more efficient
1803     if (bit_cast<float, unsigned>(x) & 0x80000000u) // test sign bit of x
1804         r = float(M_PI) - r;
1805     return copysignf(r, y);
1806 #else
1807     return atan2f(y, x);
1808 #endif
1809 }
1810 
1811 
1812 template<typename T>
1813 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2 (const T& xval) {
1814     using namespace simd;
1815     typedef typename T::int_t intN;
1816     // See float fast_log2 for explanations
1817     T x = clamp (xval, T(std::numeric_limits<float>::min()), T(std::numeric_limits<float>::max()));
1818     intN bits = bitcast_to_int(x);
1819     intN exponent = srl (bits, 23) - intN(127);
1820     T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
1821     T f2 = f * f;
1822     T f4 = f2 * f2;
1823     T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
1824     T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
1825     hi = madd(f, hi, T(-0.13753123777116f));
1826     hi = madd(f, hi, T( 0.24187369696082f));
1827     hi = madd(f, hi, T(-0.34730547155299f));
1828     lo = madd(f, lo, T( 1.442689881667200f));
1829     return ((f4 * hi) + (f * lo)) + T(exponent);
1830 }
1831 
1832 
1833 template<>
1834 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
1835 #ifndef __CUDA_ARCH__
1836     // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
1837     float x = clamp (xval, std::numeric_limits<float>::min(), std::numeric_limits<float>::max());
1838     // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
1839     unsigned bits = bit_cast<float, unsigned>(x);
1840     int exponent = int(bits >> 23) - 127;
1841     float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1842     // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
1843     // ulp histogram:
1844     //  0  = 97.46%
1845     //  1  =  2.29%
1846     //  2  =  0.11%
1847     float f2 = f * f;
1848     float f4 = f2 * f2;
1849     float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
1850     float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
1851     hi = madd(f, hi, -0.13753123777116f);
1852     hi = madd(f, hi,  0.24187369696082f);
1853     hi = madd(f, hi, -0.34730547155299f);
1854     lo = madd(f, lo,  1.442689881667200f);
1855     return ((f4 * hi) + (f * lo)) + exponent;
1856 #else
1857     return __log2f(xval);
1858 #endif
1859 }
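
// Worked example of the bit manipulation above: for xval = 8.0f the raw bits
// are 0x41000000, so exponent = (0x82 - 127) = 3 and the mantissa fraction f
// is exactly 0; the polynomial terms vanish and the result is exactly 3.0f.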
1860 
1861 
1862 
1863 template<typename T>
1864 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log (const T& x) {
1865     // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
1866     return fast_log2(x) * T(M_LN2);
1867 }
1868 
1869 #ifdef __CUDA_ARCH__
1870 template<>
1871 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log(const float& x)
1872 {
1873      return __logf(x);
1874 }
1875 #endif
1876 
1877 
1878 template<typename T>
1879 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10 (const T& x) {
1880     // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
1881     return fast_log2(x) * T(M_LN2 / M_LN10);
1882 }
1883 
1884 #ifdef __CUDA_ARCH__
1885 template<>
1886 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log10(const float& x)
1887 {
1888      return __log10f(x);
1889 }
1890 #endif
1891 
1892 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb (float x) {
1893 #ifndef __CUDA_ARCH__
1894     // don't bother with denormals
1895     x = fabsf(x);
1896     if (x < std::numeric_limits<float>::min()) x = std::numeric_limits<float>::min();
1897     if (x > std::numeric_limits<float>::max()) x = std::numeric_limits<float>::max();
1898     unsigned bits = bit_cast<float, unsigned>(x);
1899     return float (int(bits >> 23) - 127);
1900 #else
1901     return logbf(x);
1902 #endif
1903 }
1904 
1905 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p (float x) {
1906 #ifndef __CUDA_ARCH__
1907     if (fabsf(x) < 0.01f) {
1908         float y = 1.0f - (1.0f - x); // crush denormals
1909         return copysignf(madd(-0.5f, y * y, y), x);
1910     } else {
1911         return fast_log(x + 1);
1912     }
1913 #else
1914     return log1pf(x);
1915 #endif
1916 }
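
// The small-|x| branch above is just the two-term Taylor series
// log(1+x) ~= x - x^2/2, which avoids the precision loss of explicitly
// computing 1+x when x is tiny.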
1917 
1918 
1919 
1920 template<typename T>
1921 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2 (const T& xval) {
1922     using namespace simd;
1923     typedef typename T::int_t intN;
1924 #if OIIO_SIMD_SSE
1925     // See float specialization for explanations
1926     T x = clamp (xval, T(-126.0f), T(126.0f));
1927     intN m (x); x -= T(m);
1928     T one (1.0f);
1929     x = one - (one - x); // crush denormals (does not affect max ulps!)
1930     const T kA (1.33336498402e-3f);
1931     const T kB (9.810352697968e-3f);
1932     const T kC (5.551834031939e-2f);
1933     const T kD (0.2401793301105f);
1934     const T kE (0.693144857883f);
1935     T r (kA);
1936     r = madd(x, r, kB);
1937     r = madd(x, r, kC);
1938     r = madd(x, r, kD);
1939     r = madd(x, r, kE);
1940     r = madd(x, r, one);
1941     return bitcast_to_float (bitcast_to_int(r) + (m << 23));
1942 #else
1943     T r;
1944     for (int i = 0; i < r.elements; ++i)
1945         r[i] = fast_exp2(xval[i]);
1946     for (int i = r.elements; i < r.paddedelements; ++i)
1947         r[i] = 0.0f;
1948     return r;
1949 #endif
1950 }
1951 
1952 
1953 template<>
1954 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
1955 #if OIIO_NON_INTEL_CLANG && OIIO_FMATH_SIMD_FRIENDLY
    // Clang was unhappy using the bitcast/memcpy/reinterpret_cast/union inside
1957     // an explicit SIMD loop, so revert to calling the standard version.
1958     return std::exp2(xval);
1959 #elif !defined(__CUDA_ARCH__)
1960     // clamp to safe range for final addition
1961     float x = clamp (xval, -126.0f, 126.0f);
1962     // range reduction
1963     int m = int(x); x -= m;
1964     x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
1965     // 5th degree polynomial generated with sollya
1966     // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
1967     // ulp histogram:
1968     //  0  = 87.81%
1969     //  1  =  4.18%
1970     float r = 1.33336498402e-3f;
1971     r = madd(x, r, 9.810352697968e-3f);
1972     r = madd(x, r, 5.551834031939e-2f);
1973     r = madd(x, r, 0.2401793301105f);
1974     r = madd(x, r, 0.693144857883f);
1975     r = madd(x, r, 1.0f);
1976     // multiply by 2 ^ m by adding in the exponent
1977     // NOTE: left-shift of negative number is undefined behavior
1978     return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
1979     // Clang: loop not vectorized: unsafe dependent memory operations in loop.
1980     // This is why we special case the OIIO_FMATH_SIMD_FRIENDLY above.
1981     // FIXME: as clang releases continue to improve, periodically check if
1982     // this is still the case.
1983 #else
1984     return exp2f(xval);
1985 #endif
1986 }
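
// Worked example: fast_exp2(3.0f) reduces to m = 3 and x = 0; the polynomial
// evaluates to exactly 1.0f at x == 0, and adding (3 << 23) to the bits of
// 1.0f multiplies it by 2^3, yielding exactly 8.0f.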
1987 
1988 
1989 
1990 
1991 template <typename T>
1992 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp (const T& x) {
1993     // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
1994     return fast_exp2(x * T(1 / M_LN2));
1995 }
1996 
1997 #ifdef __CUDA_ARCH__
1998 template<>
1999 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp (const float& x) {
2000     return __expf(x);
2001 }
2002 #endif
2003 
2004 
2005 
/// Faster float exp than libm's, but still 100% accurate.
2007 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp (float x)
2008 {
2009 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2010     // On x86_64, versions of glibc < 2.16 have an issue where expf is
2011     // much slower than the double version.  This was fixed in glibc 2.16.
2012     return static_cast<float>(std::exp(static_cast<double>(x)));
2013 #else
2014     return std::exp(x);
2015 #endif
2016 }
2017 
2018 
2019 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10 (float x) {
2020 #ifndef __CUDA_ARCH__
2021     // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
2022     return fast_exp2(x * float(M_LN10 / M_LN2));
2023 #else
2024     return __exp10f(x);
2025 #endif
2026 }
2027 
2028 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1 (float x) {
2029 #ifndef __CUDA_ARCH__
2030     if (fabsf(x) < 0.03f) {
2031         float y = 1.0f - (1.0f - x); // crush denormals
2032         return copysignf(madd(0.5f, y * y, y), x);
2033     } else
2034         return fast_exp(x) - 1.0f;
2035 #else
2036     return expm1f(x);
2037 #endif
2038 }
2039 
2040 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh (float x) {
2041 #ifndef __CUDA_ARCH__
2042     float a = fabsf(x);
2043     if (a > 1.0f) {
2044         // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
2045         float e = fast_exp(a);
2046         return copysignf(0.5f * e - 0.5f / e, x);
2047     } else {
2048         a = 1.0f - (1.0f - a); // crush denorms
2049         float a2 = a * a;
2050         // degree 7 polynomial generated with sollya
2051         // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
2052         float r = 2.03945513931e-4f;
2053         r = madd(r, a2, 8.32990277558e-3f);
2054         r = madd(r, a2, 0.1666673421859f);
2055         r = madd(r * a, a2, a);
2056         return copysignf(r, x);
2057     }
2058 #else
2059     return sinhf(x);
2060 #endif
2061 }
2062 
2063 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh (float x) {
2064 #ifndef __CUDA_ARCH__
2065     // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
2066     float e = fast_exp(fabsf(x));
2067     return 0.5f * e + 0.5f / e;
2068 #else
2069     return coshf(x);
2070 #endif
2071 }
2072 
2073 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh (float x) {
2074 #ifndef __CUDA_ARCH__
2075     // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
2076     // NOTE: ulp error is high because of sub-optimal handling around the origin
2077     float e = fast_exp(2.0f * fabsf(x));
2078     return copysignf(1 - 2 / (1 + e), x);
2079 #else
2080     return tanhf(x);
2081 #endif
2082 }
2083 
2084 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow (float x, float y) {
2085     if (y == 0) return 1.0f; // x^0=1
2086     if (x == 0) return 0.0f; // 0^y=0
2087     // be cheap & exact for special case of squaring and identity
2088     if (y == 1.0f)
2089         return x;
2090     if (y == 2.0f) {
2091 #ifndef __CUDA_ARCH__
2092         return std::min (x*x, std::numeric_limits<float>::max());
2093 #else
2094         return fminf (x*x, std::numeric_limits<float>::max());
2095 #endif
2096     }
2097     float sign = 1.0f;
2098     if (OIIO_UNLIKELY(x < 0.0f)) {
2099         // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers; we will return 0 instead
2101         int ybits = bit_cast<float, int>(y) & 0x7fffffff;
2102         if (ybits >= 0x4b800000) {
2103             // always even int, keep positive
2104         } else if (ybits >= 0x3f800000) {
2105             // bigger than 1, check
2106             int k = (ybits >> 23) - 127;  // get exponent
2107             int j =  ybits >> (23 - k);   // shift out possible fractional bits
2108             if ((j << (23 - k)) == ybits) // rebuild number and check for a match
2109                 sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
2110             else
2111                 return 0.0f; // not integer
2112         } else {
2113             return 0.0f; // not integer
2114         }
2115     }
2116     return sign * fast_exp2(y * fast_log2(fabsf(x)));
2117 }
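
// For example (approximate, since it goes through fast_log2/fast_exp2):
//
//     fast_safe_pow ( 2.0f, 10.0f);  // ~1024.0f
//     fast_safe_pow (-2.0f,  3.0f);  // ~-8.0f (odd integer power keeps the sign)
//     fast_safe_pow (-2.0f,  0.5f);  // 0.0f (negative base, non-integer power)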
2118 
2119 
2120 // Fast simd pow that only needs to work for positive x
2121 template<typename T, typename U>
2122 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos (const T& x, const U& y) {
2123     return fast_exp2(y * fast_log2(x));
2124 }
2125 
2126 
// Fast cube root (performs better than using the fast_pow functions above with y=1/3)
2128 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt (float x) {
2129 #ifndef __CUDA_ARCH__
2130     float x0 = fabsf(x);
2131     // from hacker's delight
2132     float a = bit_cast<int, float>(0x2a5137a0 + bit_cast<float, int>(x0) / 3); // Initial guess.
2133     // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
2134     // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
2135     a = 0.333333333f * (2.0f * a + x0 / (a * a));  // Newton step.
2136     a = 0.333333333f * (2.0f * a + x0 / (a * a));  // Newton step again.
2137     a = (x0 == 0) ? 0 : a; // Fix 0 case
2138     return copysignf(a, x);
2139 #else
2140     return cbrtf(x);
2141 #endif
2142 }
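
// For example: fast_cbrt(8.0f) returns approximately 2.0f (within the ulp
// bounds quoted above) after the two Newton refinements of the bit-trick
// initial guess.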
2143 
2144 
2145 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf (float x)
2146 {
2147 #ifndef __CUDA_ARCH__
2148     // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
2149     // Abramowitz and Stegun, 7.1.28
2150     const float a1 = 0.0705230784f;
2151     const float a2 = 0.0422820123f;
2152     const float a3 = 0.0092705272f;
2153     const float a4 = 0.0001520143f;
2154     const float a5 = 0.0002765672f;
2155     const float a6 = 0.0000430638f;
2156     const float a = fabsf(x);
2157     const float b = 1.0f - (1.0f - a); // crush denormals
2158     const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2159     const float s = r * r; // ^2
2160     const float t = s * s; // ^4
2161     const float u = t * t; // ^8
2162     const float v = u * u; // ^16
2163     return copysignf(1.0f - 1.0f / v, x);
2164 #else
2165     return erff(x);
2166 #endif
2167 }
2168 
2169 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc (float x)
2170 {
2171 #ifndef __CUDA_ARCH__
2172     // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
2173     // ulp histogram:
2174     //   0  = 80.30%
2175     return 1.0f - fast_erf(x);
2176 #else
2177     return erfcf(x);
2178 #endif
2179 }
2180 
2181 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf (float x)
2182 {
2183     // from: Approximating the erfinv function by Mike Giles
2184     // to avoid trouble at the limit, clamp input to 1-eps
2185     float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
2186     float w = -fast_log((1.0f - a) * (1.0f + a)), p;
2187     if (w < 5.0f) {
2188         w = w - 2.5f;
2189         p =  2.81022636e-08f;
2190         p = madd(p, w,  3.43273939e-07f);
2191         p = madd(p, w, -3.5233877e-06f );
2192         p = madd(p, w, -4.39150654e-06f);
2193         p = madd(p, w,  0.00021858087f );
2194         p = madd(p, w, -0.00125372503f );
2195         p = madd(p, w, -0.00417768164f );
2196         p = madd(p, w,  0.246640727f   );
2197         p = madd(p, w,  1.50140941f    );
2198     } else {
2199         w = sqrtf(w) - 3.0f;
2200         p = -0.000200214257f;
2201         p = madd(p, w,  0.000100950558f);
2202         p = madd(p, w,  0.00134934322f );
2203         p = madd(p, w, -0.00367342844f );
2204         p = madd(p, w,  0.00573950773f );
2205         p = madd(p, w, -0.0076224613f  );
2206         p = madd(p, w,  0.00943887047f );
2207         p = madd(p, w,  1.00167406f    );
2208         p = madd(p, w,  2.83297682f    );
2209     }
2210     return p * x;
2211 }
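
// Sanity check: fast_ierf approximately inverts the error function, e.g.
// fast_ierf(0.5f) ~= 0.47694f, and fast_erf(fast_ierf(x)) ~= x for x in (-1,1).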
2212 
2213 // (end of fast* functions)
2214 ////////////////////////////////////////////////////////////////////////////
2215 
2216 
2217 
2218 
2219 ////////////////////////////////////////////////////////////////////////////
2220 ////////////////////////////////////////////////////////////////////////////
2221 //
2222 // MISCELLANEOUS NUMERICAL METHODS
2223 //
2224 
2225 
/// Solve for the x for which func(x) == y on the interval [xmin,xmax].
/// Use a maximum of maxiters iterations, and stop any time the remaining
/// search interval or the error in the function evaluation is <= eps.  If
/// brack is non-NULL, set it to true if y is in [f(xmin), f(xmax)],
/// otherwise false (in which case the caller should know that the results
/// may be unreliable).  Results are undefined if the function is not
/// monotonic on that interval or if there are multiple roots in the
/// interval (it may not converge, or may converge to any of the roots
/// without telling you that there is more than one).
2235 template<class T, class Func> OIIO_HOSTDEVICE
2236 T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
2237           int maxiters=32, T eps=1.0e-6, bool *brack=0)
2238 {
2239     // Use the Regula Falsi method, falling back to bisection if it
2240     // hasn't converged after 3/4 of the maximum number of iterations.
2241     // See, e.g., Numerical Recipes for the basic ideas behind both
2242     // methods.
2243     T v0 = func(xmin), v1 = func(xmax);
2244     T x = xmin, v = v0;
2245     bool increasing = (v0 < v1);
2246     T vmin = increasing ? v0 : v1;
2247     T vmax = increasing ? v1 : v0;
2248     bool bracketed = (y >= vmin && y <= vmax);
2249     if (brack)
2250         *brack = bracketed;
2251     if (! bracketed) {
2252         // If our bounds don't bracket the zero, just give up, and
2253         // return the appropriate "edge" of the interval
2254         return ((y < vmin) == increasing) ? xmin : xmax;
2255     }
2256     if (fabs(v0-v1) < eps)   // already close enough
2257         return x;
2258     int rfiters = (3*maxiters)/4;   // how many times to try regula falsi
2259     for (int iters = 0;  iters < maxiters;  ++iters) {
2260         T t;  // interpolation factor
2261         if (iters < rfiters) {
2262             // Regula falsi
2263             t = (y-v0)/(v1-v0);
2264             if (t <= T(0) || t >= T(1))
2265                 t = T(0.5);  // RF convergence failure -- bisect instead
2266         } else {
2267             t = T(0.5);            // bisection
2268         }
2269         x = lerp (xmin, xmax, t);
2270         v = func(x);
2271         if ((v < y) == increasing) {
2272             xmin = x; v0 = v;
2273         } else {
2274             xmax = x; v1 = v;
2275         }
2276         if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2277             return x;   // converged
2278     }
2279     return x;
2280 }
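
// Usage sketch (hypothetical function and tolerances):
//
//     auto sq = [](float x) { return x * x; };  // monotonic on [0,1]
//     bool bracketed;
//     float r = invert (sq, 0.25f, 0.0f, 1.0f, 32, 1.0e-6f, &bracketed);
//     // r ~= 0.5f, bracketed == true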
2281 
2282 
2283 
2284 /// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
2285 /// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
2286 /// x==1.0.
2287 inline OIIO_HOSTDEVICE float
2288 interpolate_linear (float x, span_strided<const float> y)
2289 {
2290 #ifndef __CUDA_ARCH__
2291     DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (%zd)", y.size());
2292 #endif
2293     x = clamp (x, float(0.0), float(1.0));
2294     int nsegs = int(y.size()) - 1;
2295     int segnum;
2296     x = floorfrac (x*nsegs, &segnum);
2297 #ifndef __CUDA_ARCH__
2298     int nextseg = std::min (segnum+1, nsegs);
2299 #else
2300     int nextseg = min (segnum+1, nsegs);
2301 #endif
2302     return lerp (y[segnum], y[nextseg], x);
2303 }
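
// For example (a sketch, assuming span_strided's pointer+length constructor):
//
//     const float knots[3] = { 0.0f, 1.0f, 4.0f };
//     float v = interpolate_linear (0.25f, span_strided<const float>(knots, 3));
//     // x = 0.25 is halfway through the first segment, so v == 0.5f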
2304 
2305 // (end miscellaneous numerical methods)
2306 ////////////////////////////////////////////////////////////////////////////
2307 
2308 
2309 
2310 OIIO_NAMESPACE_END
2311