// Copyright 2008-present Contributors to the OpenImageIO project.
// SPDX-License-Identifier: BSD-3-Clause
// https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md

/*
  A few bits here are based upon code from NVIDIA that was also released
  under the same 3-Clause BSD license, and marked as:
      Copyright 2004 NVIDIA Corporation. All Rights Reserved.

  Some parts of this file were first open-sourced in Open Shading Language,
  also 3-Clause BSD license, then later moved here. The original copyright
  notice was:
      Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.

  Many of the math functions were copied from or inspired by other
  public domain sources or open source packages with compatible licenses.
  The individual functions give references where applicable.
*/


// clang-format off

/// \file
///
/// A variety of floating-point math helper routines (and, slight
/// misnomer, some int stuff as well).
///


#pragma once
#define OIIO_FMATH_H 1

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <typeinfo>
#include <type_traits>

#include <OpenImageIO/Imath.h>
#include <OpenImageIO/span.h>
#include <OpenImageIO/dassert.h>
#include <OpenImageIO/oiioversion.h>
#include <OpenImageIO/platform.h>
#include <OpenImageIO/simd.h>


OIIO_NAMESPACE_BEGIN
49
50 /// Occasionally there may be a tradeoff where the best/fastest
51 /// implementation of a math function in an ordinary scalar context does
52 /// things that prevent autovectorization (or OMP pragma vectorization) of a
53 /// loop containing that operation, and the only way to make it SIMD
54 /// friendly slows it down as a scalar op.
55 ///
56 /// By default, we always prefer doing it as correctly and quickly as
57 /// possible in scalar mode, but if OIIO_FMATH_SIMD_FRIENDLY is defined to
58 /// be nonzero, we prefer ensuring that the SIMD loops are as efficient as
59 /// possible, even at the expense of scalar performance.
60 ///
61 /// Because everything here consists of inline functions, and it doesn't
62 /// really matter for OIIO internals themselves, this is primarily intended
63 /// for the sake of 3rd party apps that happen to have OIIO as a dependency
64 /// and use the fmath.h functions. Such a downstream dependency just needs
65 /// to ensure that OIIO_FMATH_SIMD_FRIENDLY is defined to 1 prior to
66 /// including fmath.h.
67 ///
68 #ifndef OIIO_FMATH_SIMD_FRIENDLY
69 # define OIIO_FMATH_SIMD_FRIENDLY 0
70 #endif
71
72
73 // Helper template to let us tell if two types are the same.
74 // C++11 defines this, keep in OIIO namespace for back compat.
75 // DEPRECATED(2.0) -- clients should switch OIIO::is_same -> std::is_same.
76 using std::is_same;
77
78
79 // For back compatibility: expose these in the OIIO namespace.
80 // DEPRECATED(2.0) -- clients should switch OIIO:: -> std:: for these.
81 using std::isfinite;
82 using std::isinf;
83 using std::isnan;
84
85
86 // Define math constants just in case they aren't included (Windows is a
87 // little finicky about this, only defining these if _USE_MATH_DEFINES is
88 // defined before <cmath> is included, which is hard to control).
89 #ifndef M_PI
90 # define M_PI 3.14159265358979323846264338327950288
91 #endif
92 #ifndef M_PI_2
93 # define M_PI_2 1.57079632679489661923132169163975144
94 #endif
95 #ifndef M_PI_4
96 # define M_PI_4 0.785398163397448309615660845819875721
97 #endif
98 #ifndef M_TWO_PI
99 # define M_TWO_PI (M_PI * 2.0)
100 #endif
101 #ifndef M_1_PI
102 # define M_1_PI 0.318309886183790671537767526745028724
103 #endif
104 #ifndef M_2_PI
105 # define M_2_PI 0.636619772367581343075535053490057448
106 #endif
107 #ifndef M_SQRT2
108 # define M_SQRT2 1.41421356237309504880168872420969808
109 #endif
110 #ifndef M_SQRT1_2
111 # define M_SQRT1_2 0.707106781186547524400844362104849039
112 #endif
113 #ifndef M_LN2
114 # define M_LN2 0.69314718055994530941723212145817656
115 #endif
116 #ifndef M_LN10
117 # define M_LN10 2.30258509299404568401799145468436421
118 #endif
119 #ifndef M_E
120 # define M_E 2.71828182845904523536028747135266250
121 #endif
122 #ifndef M_LOG2E
123 # define M_LOG2E 1.44269504088896340735992468100189214
124 #endif
125
126



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// INTEGER HELPER FUNCTIONS
//
// A variety of handy functions that operate on integers.
//

/// Quick test for whether an integer is a power of 2.
///
template<typename T>
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool
ispow2(T x) noexcept
{
    // Numerous references for this bit trick are on the web. The
    // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
    // all 1 bits for bits < b.
    return (x & (x - 1)) == 0 && (x >= 0);
}
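
// A quick illustration of the bit trick above (an editorial sketch, not
// part of the OIIO API):
//     ispow2(64)   // true:  64 = 0b1000000 and 63 = 0b0111111, AND == 0
//     ispow2(48)   // false: 48 = 0b0110000 has two bits set
//     ispow2(0)    // true:  0 passes this test as well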



/// Round up to next higher power of 2 (return x if it's already a power
/// of 2).
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
ceil2(int x) noexcept
{
    // Here's a version with no loops.
    if (x < 0)
        return 0;
    // Subtract 1, then round up to a power of 2, so that if we were
    // already a power of 2, we end up with the same number.
    --x;
    // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    // Now we have 2^n-1; by adding 1, we make it a power of 2 again
    return x + 1;
}



/// Round down to next lower power of 2 (return x if it's already a power
/// of 2).
inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
floor2(int x) noexcept
{
    // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    // Strip off all but the high bit, i.e. 00011111 -> 00010000
    // That's the power of two <= the original x
    return x & ~(x >> 1);
}
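
// Illustrative values (editorial sketch): ceil2(48) == 64, ceil2(64) == 64,
// floor2(48) == 32, floor2(64) == 64. The cascade of shift-and-OR steps
// smears the high bit downward (0b0110000 -> 0b0111111), after which +1
// (ceil2) or masking (floor2) lands on the enclosing power of 2.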


// Old names -- DEPRECATED(2.1)
inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }



/// Round value up to the next whole multiple.
/// For example, round_to_multiple(7,10) returns 10.
template <typename V, typename M>
inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
{
    return V (((value + V(multiple) - 1) / V(multiple)) * V(multiple));
}



/// Round up to the next whole multiple of m, for the special case where
/// m is definitely a power of 2 (somewhat simpler than the more general
/// round_to_multiple). This is a template that should work for any
/// integer type.
template<typename T>
inline OIIO_HOSTDEVICE T
round_to_multiple_of_pow2(T x, T m)
{
    OIIO_DASSERT(ispow2(m));
    return (x + m - 1) & (~(m - 1));
}
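
// Worked values (editorial sketch): round_to_multiple(7, 10) == 10 and
// round_to_multiple(20, 10) == 20; the power-of-2 variant computes, e.g.,
// round_to_multiple_of_pow2(13, 8) == 16 branch-free as (13 + 7) & ~7.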



/// Multiply two unsigned 32-bit ints safely, carefully checking for
/// overflow, and clamping to uint32_t's maximum value.
inline OIIO_HOSTDEVICE uint32_t
clamped_mult32(uint32_t a, uint32_t b)
{
    const uint32_t Err = std::numeric_limits<uint32_t>::max();
    uint64_t r = (uint64_t)a * (uint64_t)b;  // Multiply into a bigger int
    return r < Err ? (uint32_t)r : Err;
}



/// Multiply two unsigned 64-bit ints safely, carefully checking for
/// overflow, and clamping to uint64_t's maximum value.
inline OIIO_HOSTDEVICE uint64_t
clamped_mult64(uint64_t a, uint64_t b)
{
    uint64_t ab = a * b;
    if (b && ab / b != a)
        return std::numeric_limits<uint64_t>::max();
    else
        return ab;
}
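
// Overflow illustration (editorial sketch): clamped_mult32(0x10000, 0x10000)
// has true product 2^32, one past uint32_t's range, so it returns
// 0xffffffff rather than wrapping around to 0. clamped_mult64 detects the
// same condition via the division check, since a 128-bit intermediate
// product isn't portably available.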



/// Bitwise circular rotation left by `s` bits (for any unsigned integer
/// type). For info on the C++20 std::rotl(), see
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0553r4.html
// FIXME: this should be constexpr, but we're leaving that out for now
// because the Cuda specialization uses an intrinsic that isn't constexpr.
// Come back to this later when more of the Cuda library is properly
// constexpr.
template<class T>
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE
// constexpr
T rotl(T x, int s) noexcept
{
    static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
                  "rotl only works for unsigned integer types");
    return (x << s) | (x >> ((sizeof(T) * 8) - s));
}
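
// Rotation sketch (editorial, not part of the API): for an 8-bit value,
// rotl(uint8_t(0xB1), 3) == uint8_t(0x8D), i.e. 10110001 -> 10001101: the
// three bits shifted off the top re-enter at the bottom. Note that s == 0
// makes the complementary shift equal the full bit width, which is
// undefined for 32- and 64-bit T, so callers should keep 0 < s < 8*sizeof(T).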


#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
// Cuda has an intrinsic for 32 bit unsigned int rotation
// FIXME: This should be constexpr, but __funnelshift_lc seems not to be
// marked as such.
template<>
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE
// constexpr
uint32_t rotl(uint32_t x, int s) noexcept
{
    return __funnelshift_lc(x, x, s);
}
#endif



// Old names -- DEPRECATED(2.1)
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t
rotl32(uint32_t x, int k)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
    return __funnelshift_lc(x, x, k);
#else
    return (x << k) | (x >> (32 - k));
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t
rotl64(uint64_t x, int k)
{
    return (x << k) | (x >> (64 - k));
}



// safe_mod is like integer a%b, but safely returns 0 when b==0.
template <class T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
safe_mod(T a, T b)
{
    return b ? (a % b) : T(0);
}

// (end of integer helper functions)
////////////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// FLOAT UTILITY FUNCTIONS
//
// Helper/utility functions: clamps, blends, interpolations...
//


/// clamp a to bounds [low,high].
template <class T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
clamp (const T& a, const T& low, const T& high)
{
#if 1
    // This looks clunky, but it generates minimal code. For float, it
    // should result in just a max and min instruction, that's it.
    // This implementation is courtesy of Alex Wells, Intel, via OSL.
    T val = a;
    if (!(low <= val))  // Forces clamp(NaN,low,high) to return low
        val = low;
    if (val > high)
        val = high;
    return val;
#else
    // The naive implementation we originally had, below, only does the 2nd
    // comparison in the else block, which will generate extra code in a
    // SIMD masking scenario. I (LG) can confirm that benchmarks show that
    // the above implementation is indeed faster than the one below, even
    // for non-SIMD use.
    return (a >= low) ? ((a <= high) ? a : high) : low;
#endif
}
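
// NaN-handling sketch (editorial): clamp(NaN, 0.0f, 1.0f) yields 0.0f.
// The test is deliberately written !(low <= val) rather than (val < low):
// both are equivalent for ordinary numbers, but the former is true for
// NaN, so a NaN input gets replaced by low instead of passing through.
// Ordinary values behave as expected, e.g. clamp(2.0f, 0.0f, 1.0f) == 1.0f.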


#ifndef __CUDA_ARCH__
// Specialization of clamp for vfloat4
template<> OIIO_FORCEINLINE simd::vfloat4
clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vfloat8
clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vfloat16
clamp (const simd::vfloat16& a, const simd::vfloat16& low, const simd::vfloat16& high)
{
    return simd::min (high, simd::max (low, a));
}

// Specialization of clamp for vint4
template<> OIIO_FORCEINLINE simd::vint4
clamp (const simd::vint4& a, const simd::vint4& low, const simd::vint4& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vint8
clamp (const simd::vint8& a, const simd::vint8& low, const simd::vint8& high)
{
    return simd::min (high, simd::max (low, a));
}

template<> OIIO_FORCEINLINE simd::vint16
clamp (const simd::vint16& a, const simd::vint16& low, const simd::vint16& high)
{
    return simd::min (high, simd::max (low, a));
}
#endif



// For the multiply+add (or sub) operations below, note that the results
// may differ slightly on different hardware, depending on whether true
// fused multiply and add is available or if the code generated just does
// an old fashioned multiply followed by a separate add. So please
// interpret these as "do a multiply and add as fast as possible for this
// hardware, with at least as much precision as a multiply followed by a
// separate add."

/// Fused multiply and add: (a*b + c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd (float a, float b, float c) {
    // NOTE: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang will do so if -ffp-contract=fast.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b + c;
}


/// Fused multiply and subtract: (a*b - c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b - c;
}



/// Fused negative multiply and add: -(a*b) + c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return c - (a * b);
}



/// Negative fused multiply and subtract: -(a*b) - c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return -(a * b) - c;
}
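
// Sign conventions at a glance (editorial sketch):
//     madd(a,b,c)  == a*b + c      msub(a,b,c)  == a*b - c
//     nmadd(a,b,c) == c - a*b      nmsub(a,b,c) == -(a*b) - c
// e.g. madd(2.0f, 3.0f, 1.0f) == 7.0f and nmadd(2.0f, 3.0f, 1.0f) == -5.0f.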



/// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
/// This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
lerp (const T& v0, const T& v1, const Q& x)
{
    // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
    return v0*(Q(1)-x) + v1*x;
}



/// Bilinearly interpolate values v0-v3 (v0 upper left, v1 upper right,
/// v2 lower left, v3 lower right) at coordinates (s,t) and return the
/// result. This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
{
    // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
    Q s1 = Q(1) - s;
    return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
}
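
// Corner-ordering sketch (editorial): with corner values v0..v3 = 0,1,2,3
// laid out as
//     v0 v1
//     v2 v3
// bilerp(0,1,2,3, s,t) returns 0 at (s,t)=(0,0), 1 at (1,0), 2 at (0,1),
// 3 at (1,1), and the average 1.5 at the center (0.5,0.5).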



/// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
/// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
/// storing the results in 'result'. These are all vectors, so do it
/// for each of 'n' contiguous values (using the same s,t interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
bilerp (const T *v0, const T *v1,
        const T *v2, const T *v3,
        Q s, Q t, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    for (int i = 0; i < n; ++i)
        result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
}



/// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
/// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
/// SCALING the interpolated value by 'scale' and then ADDING to
/// 'result'. These are all vectors, so do it for each of 'n'
/// contiguous values (using the same s,t interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
bilerp_mad (const T *v0, const T *v1,
            const T *v2, const T *v3,
            Q s, Q t, Q scale, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    for (int i = 0; i < n; ++i)
        result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
                                  t*(v2[i]*s1 + v3[i]*s)));
}



/// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r), and return the
/// result. This is a template, and so should work for any types.
template <class T, class Q>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
{
    // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    Q r1 = Q(1) - r;
    return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
               r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
}



/// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r),
/// storing the results in 'result'. These are all vectors, so do it
/// for each of 'n' contiguous values (using the same s,t,r interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
         const T *v4, const T *v5, const T *v6, const T *v7,
         Q s, Q t, Q r, int n, T *result)
{
    Q s1 = Q(1) - s;
    Q t1 = Q(1) - t;
    Q r1 = Q(1) - r;
    for (int i = 0; i < n; ++i)
        result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
                        r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
}



/// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
/// upper right top, ...) at coordinates (s,t,r),
/// SCALING the interpolated value by 'scale' and then ADDING to
/// 'result'. These are all vectors, so do it for each of 'n'
/// contiguous values (using the same s,t,r interpolants).
template <class T, class Q>
inline OIIO_HOSTDEVICE void
trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
             const T *v4, const T *v5, const T *v6, const T *v7,
             Q s, Q t, Q r, Q scale, int n, T *result)
{
    Q r1 = Q(1) - r;
    bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
    bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
}



/// Evaluate B-spline weights in w[0..3] for the given fraction. This
/// is an important component of performing a cubic interpolation.
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
{
    T one_frac = 1 - fraction;
    w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
    w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
    w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
    w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
}


/// Evaluate B-spline derivative weights in w[0..3] for the given
/// fraction. This is an important component of performing a cubic
/// interpolation with derivatives.
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
{
    T one_frac = 1 - fraction;
    dw[0] = -T(0.5) * one_frac * one_frac;
    dw[1] = T(0.5) * fraction * (3 * fraction - 4);
    dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
    dw[3] = T(0.5) * fraction * fraction;
}



/// Bicubically interpolate arrays of pointers arranged in a 4x4 pattern
/// (val[0] pointing to the data in the upper left corner, val[15]
/// pointing to the lower right) at coordinates (s,t), storing the
/// results in 'result'. These are all vectors, so do it for each of
/// 'n' contiguous values (using the same s,t interpolants).
template <class T>
inline OIIO_HOSTDEVICE void
bicubic_interp (const T **val, T s, T t, int n, T *result)
{
    for (int c = 0; c < n; ++c)
        result[c] = T(0);
    T wx[4]; evalBSplineWeights (wx, s);
    T wy[4]; evalBSplineWeights (wy, t);
    for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
            T w = wx[i] * wy[j];
            for (int c = 0; c < n; ++c)
                result[c] += w * val[j*4+i][c];
        }
    }
}



/// Return floor(x) cast to an int.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int
ifloor (float x)
{
    return (int)floorf(x);
}



/// Return (x-floor(x)) and put (int)floor(x) in *xint. This is similar
/// to the built-in modf, but returns a true int, always rounds down
/// (compared to modf which rounds toward 0), and always returns
/// frac >= 0 (compared to modf which can return <0 if x<0).
inline OIIO_HOSTDEVICE float
floorfrac (float x, int *xint)
{
#if 1
    float f = std::floor(x);
    *xint = int(f);
    return x - f;
#else /* vvv This idiom is slower */
    int i = ifloor(x);
    *xint = i;
    return x - static_cast<float>(i);  // Return the fraction left over
#endif
}
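
// Contrast with modf (editorial sketch): for x = -1.25f, floorfrac sets
// *xint = -2 and returns +0.75f (floor semantics, fraction always >= 0),
// whereas std::modf(-1.25f, ...) yields integer part -1 and fraction
// -0.25f (truncation toward zero).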


#ifndef __CUDA_ARCH__
inline simd::vfloat4 floorfrac (const simd::vfloat4& x, simd::vint4 *xint) {
    simd::vfloat4 f = simd::floor(x);
    *xint = simd::vint4(f);
    return x - f;
}

inline simd::vfloat8 floorfrac (const simd::vfloat8& x, simd::vint8 *xint) {
    simd::vfloat8 f = simd::floor(x);
    *xint = simd::vint8(f);
    return x - f;
}

inline simd::vfloat16 floorfrac (const simd::vfloat16& x, simd::vint16 *xint) {
    simd::vfloat16 f = simd::floor(x);
    *xint = simd::vint16(f);
    return x - f;
}
#endif




/// Convert degrees to radians.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }

/// Convert radians to degrees.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }


/// Faster floating point negation, in cases where you aren't too picky
/// about the difference between +0 and -0. (Courtesy of Alex Wells, Intel,
/// via code in OSL.)
///
/// Beware: fast_neg(0.0f) returns 0.0f, NOT -0.0f. All other values,
/// including -0.0 and +/- Inf, work identically to `-x`. For many use
/// cases, that's fine. (When was the last time you wanted a -0.0f anyway?)
///
/// The reason it's faster is that `-x` (for float x) is implemented by
/// compilers by xor'ing x against a bitmask that flips the sign bit, and
/// that bitmask had to come from somewhere -- memory! -- which can be
/// expensive, depending on whether/where it is in cache. But `0 - x`
/// generates a zero in a register (with xor, no memory access needed) and
/// then subtracts, so that's sometimes faster because there is no memory
/// read. This works for SIMD types as well!
///
/// It's also safe (though pointless) to use fast_neg for integer types,
/// where both `-x` and `0-x` are implemented as a `neg` instruction.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T
fast_neg(const T &x) {
    return T(0) - x;
}
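
// Edge-case sketch (editorial): fast_neg(3.5f) == -3.5f and
// fast_neg(-0.0f) == +0.0f, exactly like `-x`; the only divergence is
// fast_neg(0.0f) == +0.0f where `-x` would give -0.0f, because 0 - 0
// rounds to +0 under the default rounding mode.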



inline OIIO_HOSTDEVICE void
sincos (float x, float* sine, float* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    __builtin_sincosf(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Explicitly select the single-precision CUDA library function
    sincosf(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}

inline OIIO_HOSTDEVICE void
sincos (double x, double* sine, double* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    __builtin_sincos(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Qualifying with the global namespace resolves to the CUDA library
    // function and avoids infinite recursion
    ::sincos(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}


inline OIIO_HOSTDEVICE float sign (float x)
{
    return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
}




// (end of float helper functions)
////////////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// CONVERSION
//
// Type and range conversion helper functions and classes.


template <typename IN_TYPE, typename OUT_TYPE>
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) {
    // NOTE: this is the only standards-compliant way of doing this type of
    // casting; luckily the compilers we care about know how to optimize
    // away this idiom.
    static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE),
                  "bit_cast must be between objects of the same size");
    OUT_TYPE out;
    memcpy ((void *)&out, &in, sizeof(IN_TYPE));
    return out;
}
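
// Bit-pattern sketch (editorial): bit_cast<float, uint32_t>(1.0f) yields
// 0x3f800000, and bit_cast<uint32_t, float>(0x3f800000) yields 1.0f; the
// memcpy round-trips the raw bytes without the undefined behavior of a
// pointer-punning reinterpret_cast.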

#if defined(__INTEL_COMPILER)
// On x86/x86_64 for certain compilers we can use Intel CPU intrinsics
// for some common bit_cast cases that might be even more understandable
// to the compiler and generate better code without its getting confused
// about the memcpy in the general case.
// FIXME: The intrinsics are not in clang <= 9 nor gcc <= 9.1. Check
// future releases.
template<> OIIO_FORCEINLINE uint32_t bit_cast<float, uint32_t> (const float val) {
    return static_cast<uint32_t>(_castf32_u32(val));
}
template<> OIIO_FORCEINLINE int32_t bit_cast<float, int32_t> (const float val) {
    return static_cast<int32_t>(_castf32_u32(val));
}
template<> OIIO_FORCEINLINE float bit_cast<uint32_t, float> (const uint32_t val) {
    return _castu32_f32(val);
}
template<> OIIO_FORCEINLINE float bit_cast<int32_t, float> (const int32_t val) {
    return _castu32_f32(val);
}
template<> OIIO_FORCEINLINE uint64_t bit_cast<double, uint64_t> (const double val) {
    return static_cast<uint64_t>(_castf64_u64(val));
}
template<> OIIO_FORCEINLINE int64_t bit_cast<double, int64_t> (const double val) {
    return static_cast<int64_t>(_castf64_u64(val));
}
template<> OIIO_FORCEINLINE double bit_cast<uint64_t, double> (const uint64_t val) {
    return _castu64_f64(val);
}
template<> OIIO_FORCEINLINE double bit_cast<int64_t, double> (const int64_t val) {
    return _castu64_f64(val);
}
#endif


OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int (float x) {
    return bit_cast<float,int>(x);
}
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float (int x) {
    return bit_cast<int,float>(x);
}



/// Change endian-ness of one or more data items that are each 2, 4,
/// or 8 bytes. This should work for any of short, unsigned short, int,
/// unsigned int, float, long long, pointers.
template<class T>
inline OIIO_HOSTDEVICE void
swap_endian (T *f, int len=1)
{
    for (char *c = (char *) f; len--; c += sizeof(T)) {
        if (sizeof(T) == 2) {
            std::swap (c[0], c[1]);
        } else if (sizeof(T) == 4) {
            std::swap (c[0], c[3]);
            std::swap (c[1], c[2]);
        } else if (sizeof(T) == 8) {
            std::swap (c[0], c[7]);
            std::swap (c[1], c[6]);
            std::swap (c[2], c[5]);
            std::swap (c[3], c[4]);
        }
    }
}
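
// Byte-order sketch (editorial): for uint32_t v = 0x11223344,
// swap_endian(&v) leaves v == 0x44332211; the two-argument form
// swap_endian(arr, 4) byte-swaps each of the four array elements in place.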



// big_enough_float<T>::float_t is a floating-point type big enough to
// handle the range and precision of a <T>. It's a float, unless T is big.
template <typename T> struct big_enough_float    { typedef float float_t; };
template<> struct big_enough_float<int>          { typedef double float_t; };
template<> struct big_enough_float<unsigned int> { typedef double float_t; };
template<> struct big_enough_float<int64_t>      { typedef double float_t; };
template<> struct big_enough_float<uint64_t>     { typedef double float_t; };
template<> struct big_enough_float<double>       { typedef double float_t; };


/// Multiply src by scale, clamp to [min,max], and round to the nearest D
/// (presumed to be integer). This is just a helper for the convert_type
/// templates; it probably has no other use.
template<typename S, typename D, typename F>
inline OIIO_HOSTDEVICE D
scaled_conversion(const S& src, F scale, F min, F max)
{
    if (std::numeric_limits<S>::is_signed) {
        F s = src * scale;
        s += (s < 0 ? (F)-0.5 : (F)0.5);
        return (D)clamp(s, min, max);
    } else {
        return (D)clamp((F)src * scale + (F)0.5, min, max);
    }
}



/// Convert n consecutive values from the type of S to the type of D.
/// The conversion is not a simple cast, but correctly remaps the
/// 0.0->1.0 range from and to the full positive range of integral
/// types. Take a memcpy shortcut if both types are the same and no
/// conversion is necessary. Optional arguments can give nonstandard
/// quantizations.
//
// FIXME: make table-based specializations for common types with only a
// few possible src values (like unsigned char -> float).
template<typename S, typename D>
void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
{
    if (std::is_same<S,D>::value) {
        // They must be the same type. Just memcpy.
        memcpy (dst, src, n*sizeof(D));
        return;
    }
    typedef typename big_enough_float<D>::float_t F;
    F scale = std::numeric_limits<S>::is_integer ?
                  (F(1)) / F(std::numeric_limits<S>::max()) : F(1);
    if (std::numeric_limits<D>::is_integer) {
        // Converting to an integer-like type.
        F min = (F)_min;  // std::numeric_limits<D>::min();
        F max = (F)_max;  // std::numeric_limits<D>::max();
        scale *= _max;
        // Unroll loop for speed
        for ( ; n >= 16; n -= 16) {
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
        }
        while (n--)
            *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
    } else {
        // Converting to a float-like type, so we don't need to remap
        // the range
        // Unroll loop for speed
        for ( ; n >= 16; n -= 16) {
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
            *dst++ = (D)((*src++) * scale);
        }
        while (n--)
            *dst++ = (D)((*src++) * scale);
    }
}


#ifndef __CUDA_ARCH__
template<>
inline void convert_type<uint8_t,float> (const uint8_t *src,
                                         float *dst, size_t n,
                                         float /*_min*/, float /*_max*/)
{
    float scale (1.0f/std::numeric_limits<uint8_t>::max());
    simd::vfloat4 scale_simd (scale);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        simd::vfloat4 d_simd = s_simd * scale_simd;
        d_simd.store (dst);
    }
    while (n--)
        *dst++ = (*src++) * scale;
}



template<>
inline void convert_type<uint16_t,float> (const uint16_t *src,
                                          float *dst, size_t n,
                                          float /*_min*/, float /*_max*/)
{
    float scale (1.0f/std::numeric_limits<uint16_t>::max());
    simd::vfloat4 scale_simd (scale);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        simd::vfloat4 d_simd = s_simd * scale_simd;
        d_simd.store (dst);
    }
    while (n--)
        *dst++ = (*src++) * scale;
}


#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
template<>
inline void convert_type<half,float> (const half *src,
                                      float *dst, size_t n,
                                      float /*_min*/, float /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s_simd (src);
        s_simd.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        s_simd.store (dst);
    }
#endif
    while (n--)
        *dst++ = (*src++);
}
#endif



template<>
inline void
convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
                              uint16_t /*_min*/, uint16_t /*_max*/)
{
    float min = std::numeric_limits<uint16_t>::min();
    float max = std::numeric_limits<uint16_t>::max();
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
}


template<>
inline void
convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
                             uint8_t /*_min*/, uint8_t /*_max*/)
{
    float min = std::numeric_limits<uint8_t>::min();
    float max = std::numeric_limits<uint8_t>::max();
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
}


#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
template<>
inline void
convert_type<float,half> (const float *src, half *dst, size_t n,
                          half /*_min*/, half /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s (src);
        s.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s (src);
        s.store (dst);
    }
#endif
    while (n--)
        *dst++ = *src++;
}
#endif
#endif



template<typename S, typename D>
inline void convert_type (const S *src, D *dst, size_t n)
{
    convert_type<S,D> (src, dst, n,
                       std::numeric_limits<D>::min(),
                       std::numeric_limits<D>::max());
}




/// Convert a single value from the type of S to the type of D.
/// The conversion is not a simple cast, but correctly remaps the
/// 0.0->1.0 range from and to the full positive range of integral
/// types. Take a copy shortcut if both types are the same and no
/// conversion is necessary.
template<typename S, typename D>
inline D
convert_type (const S &src)
{
    if (std::is_same<S,D>::value) {
        // They must be the same type. Just return it.
        return (D)src;
    }
    typedef typename big_enough_float<D>::float_t F;
    F scale = std::numeric_limits<S>::is_integer ?
                  F(1) / F(std::numeric_limits<S>::max()) : F(1);
    if (std::numeric_limits<D>::is_integer) {
        // Converting to an integer-like type.
        F min = (F) std::numeric_limits<D>::min();
        F max = (F) std::numeric_limits<D>::max();
        scale *= max;
        return scaled_conversion<S,D,F> (src, scale, min, max);
    } else {
        // Converting to a float-like type, so we don't need to remap
        // the range
        return (D)((F)src * scale);
    }
}
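
// Range-remapping sketch (editorial): convert_type<uint8_t,float>(255)
// yields 1.0f, convert_type<float,uint8_t>(1.0f) yields 255, and
// convert_type<float,uint16_t>(0.5f) rounds to 32768 -- where a plain
// static_cast would simply truncate 0.5f to 0.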



/// Helper function to convert channel values between different bit depths.
/// Roughly equivalent to:
///
///     out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
///
/// but utilizes an integer math trick for speed. It can be proven that the
/// absolute error of this method is less than or equal to 1, with an
/// average error (with respect to the entire domain) below 0.2.
///
/// It is assumed that the original value is a valid FROM_BITS integer, i.e.
/// shifted fully to the right.
template<unsigned int FROM_BITS, unsigned int TO_BITS>
inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
    unsigned int out = 0;
    int shift = TO_BITS - FROM_BITS;
    for (; shift > 0; shift -= FROM_BITS)
        out |= in << shift;
    out |= in >> -shift;
    return out;
}
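
// Bit-replication sketch (editorial): widening repeats the source bits so
// that 0 maps to 0 and the max input maps to the max output, e.g.
// bit_range_convert<4,8>(0xF) == 0xFF and bit_range_convert<4,8>(0x9) == 0x99
// (1001 -> 10011001); narrowing just shifts the top bits down.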



// Non-templated version. Slow but general.
inline OIIO_HOSTDEVICE unsigned int
bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
{
    unsigned int out = 0;
    int shift = TO_BITS - FROM_BITS;
    for (; shift > 0; shift -= FROM_BITS)
        out |= in << shift;
    out |= in >> -shift;
    return out;
}



/// Append the `n` LSB bits of `val` into a bit string `T out[]`, where the
/// `filled` MSB bits of `*out` are already filled in. Increment `out` and
/// adjust `filled` as required. Type `T` should be uint8_t, uint16_t, or
/// uint32_t.
template<typename T>
inline void
bitstring_add_n_bits (T* &out, int& filled, uint32_t val, int n)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bitstring_add_n_bits must be unsigned int 8/16/32");
    const int Tbits = sizeof(T) * 8;
    //   val:  | don't care | copy me |
    //                      <- n bits ->
    //
    //   *out: | don't touch | fill in here          |
    //         <-  filled  -> <- (Tbits - filled)   ->
    while (n > 0) {
        // Make sure val doesn't have any cruft in bits > n
        val &= ~(0xffffffff << n);
        // Initialize any new byte we're filling in
        if (filled == 0)
            *out = 0;
        // How many bits can we fill in `*out` without having to increment
        // to the next byte?
        int bits_left_in_out = Tbits - filled;
        int b = 0;   // bit pattern to 'or' with *out
        int nb = 0;  // number of bits to add
        if (n <= bits_left_in_out) {  // can fit completely in this byte
            b = val << (bits_left_in_out - n);
            nb = n;
        } else {  // n > bits_left_in_out, will spill to next byte
            b = val >> (n - bits_left_in_out);
            nb = bits_left_in_out;
        }
        *out |= b;
        filled += nb;
        OIIO_DASSERT (filled <= Tbits);
        n -= nb;
        if (filled == Tbits) {
            ++out;
            filled = 0;
        }
    }
}



/// Pack values from `T in[0..n-1]` (where `T` is expected to be a uint8,
/// uint16, or uint32) into successive raw outbits-bit pieces of `out[]`,
/// where outbits is expected to be less than the number of bits in a `T`.
template<typename T>
inline void
bit_pack(cspan<T> data, void* out, int outbits)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bit_pack must be unsigned int 8/16/32");
    unsigned char* outbuffer = (unsigned char*)out;
    int filled = 0;
    for (size_t i = 0, e = data.size(); i < e; ++i)
        bitstring_add_n_bits (outbuffer, filled, data[i], outbits);
}



/// Decode n packed inbits-bits values from in[...] into normal uint8,
/// uint16, or uint32 representation of `T out[0..n-1]`. In other words,
/// each successive `inbits` of `in` (allowing spanning of byte boundaries)
/// will be stored in a successive out[i].
template<typename T>
inline void
bit_unpack(int n, const unsigned char* in, int inbits, T* out)
{
    static_assert(std::is_same<T,uint8_t>::value ||
                  std::is_same<T,uint16_t>::value ||
                  std::is_same<T,uint32_t>::value,
                  "bit_unpack must be unsigned int 8/16/32");
    OIIO_DASSERT(inbits >= 1 && inbits < 32);  // surely bugs if not
    // int highest = (1 << inbits) - 1;
    int B = 0, b = 0;
    // Invariant:
    // So far, we have used in[0..B-1] and the high b bits of in[B].
    for (int i = 0; i < n; ++i) {
        long long val = 0;
        int valbits = 0;  // bits so far we've accumulated in val
        while (valbits < inbits) {
            // Invariant: we have already accumulated valbits of the next
            // needed value (of a total of inbits), living in the valbits
            // low bits of val.
            int out_left = inbits - valbits;  // How much more we still need
            int in_left = 8 - b;              // Bits still available in in[B].
            if (in_left <= out_left) {
                // Eat the rest of this byte:
                //   |---------|--------|
                //        b      in_left
                val <<= in_left;
                val |= in[B] & ~(0xffffffff << in_left);
                ++B;
                b = 0;
                valbits += in_left;
            } else {
                // Eat just the bits we need:
                //   |--|---------|-----|
                //    b   out_left extra
                val <<= out_left;
                int extra = 8 - b - out_left;
                val |= (in[B] >> extra) & ~(0xffffffff << out_left);
                b += out_left;
                valbits = inbits;
            }
        }
        out[i] = val;  // T((val * 0xff) / highest);
    }
}
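
// Round-trip sketch (editorial, with a hypothetical buffer `buf`): packing
// the 10-bit values {1023, 0, 512} via bit_pack<uint16_t> consumes 30 bits
// (4 bytes, the final 2 bits unused), and bit_unpack<uint16_t>(3, buf, 10,
// out) recovers {1023, 0, 512}.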



/// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
/// and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct DataProxy {
    DataProxy (I &data) : m_data(data) { }
    E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
    operator E () const { return convert_type<I,E>(m_data); }
private:
    DataProxy& operator = (const DataProxy&);  // Do not implement
    I &m_data;
};



/// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
/// a (const I &) and does conversions (via convert_type) as it reads.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct ConstDataProxy {
    ConstDataProxy (const I &data) : m_data(data) { }
    operator E () const { return convert_type<I,E>(m_data); }
private:
    const I &m_data;
};



/// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
/// and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct DataArrayProxy {
    DataArrayProxy (I *data=NULL) : m_data(data) { }
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
    void set (I *data) { m_data = data; }
    I * get () const { return m_data; }
    const DataArrayProxy<I,E> & operator+= (int i) {
        m_data += i;  return *this;
    }
private:
    I *m_data;
};



/// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
/// (I *) and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
struct ConstDataArrayProxy {
    ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    void set (const I *data) { m_data = data; }
    const I * get () const { return m_data; }
    const ConstDataArrayProxy<I,E> & operator+= (int i) {
        m_data += i;  return *this;
    }
private:
    const I *m_data;
};



/// Fast table-based conversion of 8-bit to other types. Declare this
/// as static to avoid the expensive ctor being called all the time.
template <class T=float>
class EightBitConverter {
public:
    EightBitConverter () noexcept { init(); }
    T operator() (unsigned char c) const noexcept { return val[c]; }
private:
    T val[256];
    void init () noexcept {
        float scale = 1.0f / 255.0f;
        if (std::numeric_limits<T>::is_integer)
            scale *= (float)std::numeric_limits<T>::max();
        for (int i = 0; i < 256; ++i)
            val[i] = (T)(i * scale);
    }
};



/// Simple conversion of a (presumably non-negative) float into a
/// rational. This does not attempt to find the simplest fraction
/// that approximates the float, for example 52.83 will simply
/// return 5283/100. This does not attempt to gracefully handle
/// floats that are out of range that could be easily int/int.
inline OIIO_HOSTDEVICE void
float_to_rational (float f, unsigned int &num, unsigned int &den)
{
    if (f <= 0) {  // Trivial case of zero, and handle all negative values
        num = 0;
        den = 1;
    } else if ((int)(1.0/f) == (1.0/f)) {  // Exact results for perfect inverses
        num = 1;
        den = (int)(1.0/f);
    } else {
        num = (int)f;
        den = 1;
        while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
            den *= 10;
            f *= 10;
            num = (int)f;
        }
    }
}



/// Simple conversion of a float into a rational. This does not attempt
/// to find the simplest fraction that approximates the float, for
/// example 52.83 will simply return 5283/100. This does not attempt to
/// gracefully handle floats that are out of range that could be easily
/// int/int.
inline OIIO_HOSTDEVICE void
float_to_rational (float f, int &num, int &den)
{
    unsigned int n, d;
    float_to_rational (fabsf(f), n, d);
    num = (f >= 0) ? (int)n : -(int)n;
    den = (int) d;
}
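
// Worked values (editorial sketch): per the comment above, 52.83 maps to
// 5283/100 by repeated decimal scaling; 0.25f takes the perfect-inverse
// path and maps to 1/4; and the signed overload maps -0.25f to -1/4.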


// (end of conversion helpers)
////////////////////////////////////////////////////////////////////////////




////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// SAFE MATH
//
// The functions named "safe_*" are versions with various internal clamps
// or other deviations from IEEE standards with the specific intent of
// never producing NaN or Inf values or throwing exceptions. But within
// the valid range, they should be full precision and match IEEE standards.
//


/// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt (T x) {
    return x >= T(0) ? std::sqrt(x) : T(0);
}

/// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt (T x) {
    return x > T(0) ? T(1) / std::sqrt(x) : T(0);
}


/// Safe (clamping) arcsine: clamp to the valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin (T x) {
    if (x <= T(-1)) return T(-M_PI_2);
    if (x >= T(+1)) return T(+M_PI_2);
    return std::asin(x);
}

/// Safe (clamping) arccosine: clamp to the valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos (T x) {
    if (x <= T(-1)) return T(M_PI);
    if (x >= T(+1)) return T(0);
    return std::acos(x);
}


/// Safe log2: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2 (T x) {
    // match clamping from fast version
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    return std::log2(x);
}

/// Safe log: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log (T x) {
    // slightly different than fast version since clamping happens before scaling
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    return std::log(x);
}

/// Safe log10: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10 (T x) {
    // slightly different than fast version since clamping happens before scaling
    if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
    if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
    return std::log10(x);
}

/// Safe logb: clamp to valid domain.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb (T x) {
    return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
}

/// Safe pow: clamp the domain so it never returns Inf or NaN or has a
/// divide by zero error.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow (T x, T y) {
    if (y == T(0)) return T(1);
    if (x == T(0)) return T(0);
    // if x is negative, only deal with integer powers
    if ((x < T(0)) && (y != floor(y))) return T(0);
    // FIXME: this does not match "fast" variant because clamping limits are different
    T r = std::pow(x, y);
    // Clamp to avoid returning Inf.
    const T big = std::numeric_limits<T>::max();
    return clamp (r, -big, big);
}
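
// Domain-guard sketch (editorial): safe_pow(-2.0f, 3.0f) == -8.0f (an
// integer power of a negative base is allowed), safe_pow(-2.0f, 0.5f)
// returns 0.0f where std::pow would return NaN, and safe_pow(0.0f, -1.0f)
// returns 0.0f rather than Inf.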


/// Safe fmod: guard against b==0.0 (in which case, return 0.0). Also, it
/// seems that this is much faster than std::fmod!
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod (float a, float b)
{
    if (OIIO_LIKELY(b != 0.0f)) {
#if 0
        return std::fmod (a,b);
        // std::fmod was getting called serially instead of vectorizing, so
        // we will just do the calculation ourselves.
#else
        // This formulation is equivalent but much faster in our benchmarks,
        // and it also vectorizes better.
        // The floating-point remainder of the division operation
        // a/b is a - N*b, where N = a/b with its fractional part truncated.
        int N = static_cast<int>(a/b);
        return a - N*b;
#endif
    }
    return 0.0f;
}
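
// Truncation sketch (editorial): safe_fmod(5.5f, 2.0f) computes
// N = (int)(5.5/2.0) = 2 and returns 5.5 - 2*2.0 = 1.5, matching
// std::fmod; safe_fmod(1.0f, 0.0f) returns 0.0f instead of NaN. (The
// int truncation of N does mean extremely large a/b ratios differ from
// std::fmod.)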

#define OIIO_FMATH_HAS_SAFE_FMOD 1

// (end of safe_* functions)
////////////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// FAST & APPROXIMATE MATH
//
// The functions named "fast_*" provide a set of replacements to libm that
// are much faster at the expense of some accuracy and robust handling of
// extreme values. One design goal for these approximations was to avoid
// branches as much as possible and operate on single precision values only
// so that SIMD versions should be straightforward ports. We also try to
// implement "safe" semantics (i.e.: clamp to valid range where possible)
// natively since wrapping these inline calls in another layer would be
// wasteful.
//
// The "fast_*" functions are not only possibly differing in value
// (slightly) from the std versions of these functions, but also we do not
// guarantee that the results of "fast" will exactly match from platform to
// platform. This is because if a particular platform (HW, library, or
// compiler) provides an intrinsic that is even faster than our usual "fast"
// implementation, we may substitute it.
//
// Some functions are fast_safe_*, which is both a faster approximation as
// well as clamped input domain to ensure no NaN, Inf, or divide by zero.
//
// NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
// is influenced by compiler options (-ffast-math for clang/gcc,
// --use-fast-math for NVCC).
//
// TODO: Quantify the performance and accuracy of the CUDA Math functions
// relative to the definitions in this file. It may be better in some
// cases to use the approximate versions defined below.


/// Round to nearest integer, returning as an int.
/// Note that this differs from std::rint, which returns a float; it's more
/// akin to std::lrint, which returns a long (int). Sorry for the naming
/// confusion.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint (float x) {
    // used by sin/cos/tan range reduction
#if OIIO_SIMD_SSE >= 4
    // single roundps instruction on SSE4.1+ (for gcc/clang at least)
    return static_cast<int>(std::rint(x));
#else
    // emulate rounding by adding/subtracting 0.5
    return static_cast<int>(x + copysignf(0.5f, x));
#endif
}
1547
1548 #ifndef __CUDA_ARCH__
1549 OIIO_FORCEINLINE simd::vint4 fast_rint (const simd::vfloat4& x) {
1550 return simd::rint (x);
1551 }
1552 #endif


OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin (float x) {
#ifndef __CUDA_ARCH__
    // very accurate argument reduction from SLEEF
    // starts failing around x=262000
    // Results on: [-2pi,2pi]
    // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
    int q = fast_rint (x * float(M_1_PI));
    float qf = float(q);
    x = madd(qf, -0.78515625f*4, x);
    x = madd(qf, -0.00024187564849853515625f*4, x);
    x = madd(qf, -3.7747668102383613586e-08f*4, x);
    x = madd(qf, -1.2816720341285448015e-12f*4, x);
    x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
    float s = x * x;
    if ((q & 1) != 0) x = -x;
    // this polynomial approximation has very low error on [-pi/2,+pi/2]
    // 1.19209e-07 max error in total over [-2pi,+2pi]
    float u = 2.6083159809786593541503e-06f;
    u = madd(u, s, -0.0001981069071916863322258f);
    u = madd(u, s, +0.00833307858556509017944336f);
    u = madd(u, s, -0.166666597127914428710938f);
    u = madd(s, u * x, x);
    // For large x, the argument reduction can fail and the polynomial can
    // be evaluated with arguments outside the valid interval. Just clamp
    // the bad values away.
    return clamp(u, -1.0f, 1.0f);
#else
    return __sinf(x);
#endif
}


OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos (float x) {
#ifndef __CUDA_ARCH__
    // same argument reduction as fast_sin
    int q = fast_rint (x * float(M_1_PI));
    float qf = float(q);
    x = madd(qf, -0.78515625f*4, x);
    x = madd(qf, -0.00024187564849853515625f*4, x);
    x = madd(qf, -3.7747668102383613586e-08f*4, x);
    x = madd(qf, -1.2816720341285448015e-12f*4, x);
    x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
    float s = x * x;
    // polynomial from SLEEF's sincosf, max error is
    // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
    float u = -2.71811842367242206819355e-07f;
    u = madd(u, s, +2.47990446951007470488548e-05f);
    u = madd(u, s, -0.00138888787478208541870117f);
    u = madd(u, s, +0.0416666641831398010253906f);
    u = madd(u, s, -0.5f);
    u = madd(u, s, +1.0f);
    if ((q & 1) != 0) u = -u;
    return clamp(u, -1.0f, 1.0f);
#else
    return __cosf(x);
#endif
}


OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
#ifndef __CUDA_ARCH__
    // same argument reduction as fast_sin
    int q = fast_rint (x * float(M_1_PI));
    float qf = float(q);
    x = madd(qf, -0.78515625f*4, x);
    x = madd(qf, -0.00024187564849853515625f*4, x);
    x = madd(qf, -3.7747668102383613586e-08f*4, x);
    x = madd(qf, -1.2816720341285448015e-12f*4, x);
    x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
    float s = x * x;
    // NOTE: same exact polynomials as fast_sin and fast_cos above
    if ((q & 1) != 0) x = -x;
    float su = 2.6083159809786593541503e-06f;
    su = madd(su, s, -0.0001981069071916863322258f);
    su = madd(su, s, +0.00833307858556509017944336f);
    su = madd(su, s, -0.166666597127914428710938f);
    su = madd(s, su * x, x);
    float cu = -2.71811842367242206819355e-07f;
    cu = madd(cu, s, +2.47990446951007470488548e-05f);
    cu = madd(cu, s, -0.00138888787478208541870117f);
    cu = madd(cu, s, +0.0416666641831398010253906f);
    cu = madd(cu, s, -0.5f);
    cu = madd(cu, s, +1.0f);
    if ((q & 1) != 0) cu = -cu;
    *sine   = clamp(su, -1.0f, 1.0f);
    *cosine = clamp(cu, -1.0f, 1.0f);
#else
    __sincosf(x, sine, cosine);
#endif
}
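
// Example (illustrative only): fast_sincos shares one argument reduction
// between both results, so it is the cheapest way to build a 2D rotation:
//     float s, c;
//     fast_sincos (theta, &s, &c);
//     float xr = x * c - y * s;
//     float yr = x * s + y * c;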

// NOTE: this approximation is only valid on [-8192.0,+8192.0]; it becomes
// quite poor outside of this range because the reciprocal amplifies errors.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan (float x) {
#ifndef __CUDA_ARCH__
    // derived from SLEEF implementation
    // note that we cannot apply the "denormal crush" trick everywhere because
    // we sometimes need to take the reciprocal of the polynomial
    int q = fast_rint (x * float(2 * M_1_PI));
    float qf = float(q);
    x = madd(qf, -0.78515625f*2, x);
    x = madd(qf, -0.00024187564849853515625f*2, x);
    x = madd(qf, -3.7747668102383613586e-08f*2, x);
    x = madd(qf, -1.2816720341285448015e-12f*2, x);
    if ((q & 1) == 0)
        x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
    float s = x * x;
    float u = 0.00927245803177356719970703f;
    u = madd(u, s, 0.00331984995864331722259521f);
    u = madd(u, s, 0.0242998078465461730957031f);
    u = madd(u, s, 0.0534495301544666290283203f);
    u = madd(u, s, 0.133383005857467651367188f);
    u = madd(u, s, 0.333331853151321411132812f);
    u = madd(s, u * x, x);
    if ((q & 1) != 0)
        u = -1.0f / u;
    return u;
#else
    return __tanf(x);
#endif
}

/// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
/// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
/// Note that this is MUCH faster, but much less accurate than fast_sin.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi (float x)
{
#ifndef __CUDA_ARCH__
    // Fast trick to strip the integral part off, so our domain is [-1,1]
    const float z = x - ((x + 25165824.0f) - 25165824.0f);
    const float y = z - z * fabsf(z);
    const float Q = 3.10396624f;
    const float P = 3.584135056f; // P = 16-4*Q
    return y * (Q + P * fabsf(y));
    /* The original article used inferior constants for Q and P and
     * so had max error 1.091e-3.
     *
     * The optimal value for Q was determined by exhaustive search, minimizing
     * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
     * over the interval [0,2] (which is where most of the invocations happen).
     *
     * The basic idea of this approximation starts with the coarse approximation:
     *      sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
     *
     * This approximation always _over_ estimates the target. On the other hand,
     * the curve:
     *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
     *
     * always lies _under_ the target. Thus we can simply numerically search for
     * the optimal constant to LERP these curves into a more precise
     * approximation. After folding the constants together and simplifying the
     * resulting math, we end up with the compact implementation above.
     *
     * NOTE: this function actually computes sin(x * pi) which avoids one or two
     * mults in many cases and guarantees exact values at integer periods.
     */
#else
    return sinpif(x);
#endif
}
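
// Example (illustrative only): because the function computes sin(x*pi)
// directly, no multiply by M_PI is needed at the call site, and integer
// arguments come out exactly zero:
//     float a = fast_sinpi (0.5f);   // ~1.0f (sin of pi/2)
//     float b = fast_sinpi (3.0f);   // exactly 0.0f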

/// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
/// Note that this is MUCH faster, but much less accurate than fast_cos.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi (float x)
{
#ifndef __CUDA_ARCH__
    return fast_sinpi (x+0.5f);
#else
    return cospif(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos (float x) {
#ifndef __CUDA_ARCH__
    const float f = fabsf(x);
    const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;  // clamp and crush denormals
    // based on http://www.pouet.net/topic.php?which=9132&page=2
    // 85% accurate (ulp 0)
    // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
    // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
    const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
    return x < 0 ? float(M_PI) - a : a;
#else
    return acosf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin (float x) {
#ifndef __CUDA_ARCH__
    // based on acosf approximation above
    // max error is 4.51133e-05 (ulps are higher because we are consistently off by a small amount)
    const float f = fabsf(x);
    const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;  // clamp and crush denormals
    const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
    return copysignf(a, x);
#else
    return asinf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan (float x) {
#ifndef __CUDA_ARCH__
    const float a = fabsf(x);
    const float k = a > 1.0f ? 1 / a : a;
    const float s = 1.0f - (1.0f - k); // crush denormals
    const float t = s * s;
    // http://mathforum.org/library/drmath/view/62672.html
    // the coefficients were tuned in mathematica with the assumption that we want atan(1)=pi/4
    // (slightly higher error but no discontinuities)
    // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error      // (with denormals)
    // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
    float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
    if (a > 1.0f) r = 1.570796326794896557998982f - r;
    return copysignf(r, x);
#else
    return atanf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2 (float y, float x) {
#ifndef __CUDA_ARCH__
    // based on atan approximation above
    // the special cases around 0 and infinity were tested explicitly
    // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
    const float a = fabsf(x);
    const float b = fabsf(y);
    bool b_is_greater_than_a = b > a;

#if OIIO_FMATH_SIMD_FRIENDLY
    // When applying to all lanes in SIMD, we end up doing extra masking and
    // 2 divides. So let's just do 1 divide and swap the parameters instead.
    // And if we are going to do a divide anyway, when a == b the ratio
    // should be 1.0f anyway, so let's not bother special-casing it.
    float sa = b_is_greater_than_a ? b : a;
    float sb = b_is_greater_than_a ? a : b;
    const float k = (b == 0) ? 0.0f : sb/sa;
#else
    const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
#endif

    const float s = 1.0f - (1.0f - k); // crush denormals
    const float t = s * s;

    float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);

    if (b_is_greater_than_a)
        r = 1.570796326794896557998982f - r; // account for arg reduction
    // TODO: investigate if testing x < 0.0f is more efficient
    if (bit_cast<float, unsigned>(x) & 0x80000000u) // test sign bit of x
        r = float(M_PI) - r;
    return copysignf(r, y);
#else
    return atan2f(y, x);
#endif
}
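
// Example (illustrative only): convert a 2D direction into an angle in
// radians, just as with std::atan2:
//     float angle = fast_atan2 (dir_y, dir_x);   // in [-pi, pi]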


template<typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2 (const T& xval) {
    using namespace simd;
    typedef typename T::int_t intN;
    // See float fast_log2 for explanations
    T x = clamp (xval, T(std::numeric_limits<float>::min()), T(std::numeric_limits<float>::max()));
    intN bits = bitcast_to_int(x);
    intN exponent = srl (bits, 23) - intN(127);
    T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
    T f2 = f * f;
    T f4 = f2 * f2;
    T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
    T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
    hi = madd(f, hi, T(-0.13753123777116f));
    hi = madd(f, hi, T( 0.24187369696082f));
    hi = madd(f, hi, T(-0.34730547155299f));
    lo = madd(f, lo, T( 1.442689881667200f));
    return ((f4 * hi) + (f * lo)) + T(exponent);
}


template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
#ifndef __CUDA_ARCH__
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    float x = clamp (xval, std::numeric_limits<float>::min(), std::numeric_limits<float>::max());
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127;
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
#else
    return __log2f(xval);
#endif
}
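
// Example (illustrative only): the template above lets the same polynomial
// evaluate per lane on the SIMD types, e.g.:
//     simd::vfloat4 v (1.0f, 2.0f, 4.0f, 8.0f);
//     simd::vfloat4 l = fast_log2 (v);   // ~ {0, 1, 2, 3}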



template<typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log (const T& x) {
    // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
    return fast_log2(x) * T(M_LN2);
}

#ifdef __CUDA_ARCH__
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log(const float& x)
{
    return __logf(x);
}
#endif


template<typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10 (const T& x) {
    // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
    return fast_log2(x) * T(M_LN2 / M_LN10);
}

#ifdef __CUDA_ARCH__
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log10(const float& x)
{
    return __log10f(x);
}
#endif

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb (float x) {
#ifndef __CUDA_ARCH__
    // don't bother with denormals
    x = fabsf(x);
    if (x < std::numeric_limits<float>::min()) x = std::numeric_limits<float>::min();
    if (x > std::numeric_limits<float>::max()) x = std::numeric_limits<float>::max();
    unsigned bits = bit_cast<float, unsigned>(x);
    return float (int(bits >> 23) - 127);
#else
    return logbf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p (float x) {
#ifndef __CUDA_ARCH__
    if (fabsf(x) < 0.01f) {
        float y = 1.0f - (1.0f - x); // crush denormals
        return copysignf(madd(-0.5f, y * y, y), x);
    } else {
        return fast_log(x + 1);
    }
#else
    return log1pf(x);
#endif
}
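
// Example (illustrative only): for small x, computing log(1+x) naively
// loses precision because 1+x rounds first; fast_log1p switches to a
// short series in that regime:
//     float y = fast_log1p (1e-6f);   // ~9.9999950e-07f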



template<typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2 (const T& xval) {
    using namespace simd;
    typedef typename T::int_t intN;
#if OIIO_SIMD_SSE
    // See float specialization for explanations
    T x = clamp (xval, T(-126.0f), T(126.0f));
    intN m (x); x -= T(m);
    T one (1.0f);
    x = one - (one - x); // crush denormals (does not affect max ulps!)
    const T kA (1.33336498402e-3f);
    const T kB (9.810352697968e-3f);
    const T kC (5.551834031939e-2f);
    const T kD (0.2401793301105f);
    const T kE (0.693144857883f);
    T r (kA);
    r = madd(x, r, kB);
    r = madd(x, r, kC);
    r = madd(x, r, kD);
    r = madd(x, r, kE);
    r = madd(x, r, one);
    return bitcast_to_float (bitcast_to_int(r) + (m << 23));
#else
    T r;
    for (int i = 0; i < r.elements; ++i)
        r[i] = fast_exp2(xval[i]);
    for (int i = r.elements; i < r.paddedelements; ++i)
        r[i] = 0.0f;
    return r;
#endif
}


template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
#if OIIO_NON_INTEL_CLANG && OIIO_FMATH_SIMD_FRIENDLY
    // Clang was unhappy using the bitcast/memcpy/reinterpret_cast/union
    // inside an explicit SIMD loop, so revert to calling the standard
    // version.
    return std::exp2(xval);
#elif !defined(__CUDA_ARCH__)
    // clamp to safe range for final addition
    float x = clamp (xval, -126.0f, 126.0f);
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
    // Clang: loop not vectorized: unsafe dependent memory operations in loop.
    // This is why we special case the OIIO_FMATH_SIMD_FRIENDLY above.
    // FIXME: as clang releases continue to improve, periodically check if
    // this is still the case.
#else
    return exp2f(xval);
#endif
}
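
// Example (illustrative only): the closing bit trick multiplies by 2^m by
// adding m directly into the exponent field of the polynomial result:
//     float g = fast_exp2 (3.0f);    // 8.0f (integer args reduce to x==0)
//     float h = fast_exp2 (-1.5f);   // ~0.3536f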




template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp (const T& x) {
    // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
    return fast_exp2(x * T(1 / M_LN2));
}

#ifdef __CUDA_ARCH__
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp (const float& x) {
    return __expf(x);
}
#endif



/// Faster float exp than the one in libm, but still 100% accurate
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp (float x)
{
#if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
    // On x86_64, versions of glibc < 2.16 have an issue where expf is
    // much slower than the double version. This was fixed in glibc 2.16.
    return static_cast<float>(std::exp(static_cast<double>(x)));
#else
    return std::exp(x);
#endif
}


OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10 (float x) {
#ifndef __CUDA_ARCH__
    // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
    return fast_exp2(x * float(M_LN10 / M_LN2));
#else
    return __exp10f(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1 (float x) {
#ifndef __CUDA_ARCH__
    if (fabsf(x) < 0.03f) {
        float y = 1.0f - (1.0f - x); // crush denormals
        return copysignf(madd(0.5f, y * y, y), x);
    } else
        return fast_exp(x) - 1.0f;
#else
    return expm1f(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh (float x) {
#ifndef __CUDA_ARCH__
    float a = fabsf(x);
    if (a > 1.0f) {
        // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
        float e = fast_exp(a);
        return copysignf(0.5f * e - 0.5f / e, x);
    } else {
        a = 1.0f - (1.0f - a); // crush denorms
        float a2 = a * a;
        // degree 7 polynomial generated with sollya
        // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
        float r = 2.03945513931e-4f;
        r = madd(r, a2, 8.32990277558e-3f);
        r = madd(r, a2, 0.1666673421859f);
        r = madd(r * a, a2, a);
        return copysignf(r, x);
    }
#else
    return sinhf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh (float x) {
#ifndef __CUDA_ARCH__
    // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
    float e = fast_exp(fabsf(x));
    return 0.5f * e + 0.5f / e;
#else
    return coshf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh (float x) {
#ifndef __CUDA_ARCH__
    // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
    // NOTE: ulp error is high because of sub-optimal handling around the origin
    float e = fast_exp(2.0f * fabsf(x));
    return copysignf(1 - 2 / (1 + e), x);
#else
    return tanhf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f) {
#ifndef __CUDA_ARCH__
        return std::min (x*x, std::numeric_limits<float>::max());
#else
        return fminf (x*x, std::numeric_limits<float>::max());
#endif
    }
    float sign = 1.0f;
    if (OIIO_UNLIKELY(x < 0.0f)) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fabsf(x)));
}
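
// Example (illustrative only): unlike powf, negative bases never yield
// NaN; non-integer exponents of negative x return 0 instead:
//     fast_safe_pow (-2.0f, 3.0f);   // ~-8.0f (odd integer exponent)
//     fast_safe_pow (-2.0f, 0.5f);   // 0.0f rather than NaN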


// Fast simd pow that only needs to work for positive x
template<typename T, typename U>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos (const T& x, const U& y) {
    return fast_exp2(y * fast_log2(x));
}


// Fast cube root (performs better than using the fast_pow functions above
// with y=1/3)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt (float x) {
#ifndef __CUDA_ARCH__
    float x0 = fabsf(x);
    // from Hacker's Delight
    float a = bit_cast<int, float>(0x2a5137a0 + bit_cast<float, int>(x0) / 3); // Initial guess.
    // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
    // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
    a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step.
    a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step again.
    a = (x0 == 0) ? 0 : a; // Fix 0 case
    return copysignf(a, x);
#else
    return cbrtf(x);
#endif
}
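
// Example (illustrative only): the bit-level initial guess plus two Newton
// steps is accurate to roughly 12 ulps at worst:
//     float c = fast_cbrt (27.0f);   // ~3.0f
//     float d = fast_cbrt (-8.0f);   // ~-2.0f (sign carried separately)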


OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf (float x)
{
#ifndef __CUDA_ARCH__
    // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
    // Abramowitz and Stegun, 7.1.28
    const float a1 = 0.0705230784f;
    const float a2 = 0.0422820123f;
    const float a3 = 0.0092705272f;
    const float a4 = 0.0001520143f;
    const float a5 = 0.0002765672f;
    const float a6 = 0.0000430638f;
    const float a = fabsf(x);
    const float b = 1.0f - (1.0f - a); // crush denormals
    const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
    const float s = r * r; // ^2
    const float t = s * s; // ^4
    const float u = t * t; // ^8
    const float v = u * u; // ^16
    return copysignf(1.0f - 1.0f / v, x);
#else
    return erff(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc (float x)
{
#ifndef __CUDA_ARCH__
    // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
    // ulp histogram:
    //  0  = 80.30%
    return 1.0f - fast_erf(x);
#else
    return erfcf(x);
#endif
}

OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf (float x)
{
    // from: Approximating the erfinv function by Mike Giles
    // to avoid trouble at the limit, clamp input to 1-eps
    float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
    float w = -fast_log((1.0f - a) * (1.0f + a)), p;
    if (w < 5.0f) {
        w = w - 2.5f;
        p =  2.81022636e-08f;
        p = madd(p, w,  3.43273939e-07f);
        p = madd(p, w, -3.5233877e-06f );
        p = madd(p, w, -4.39150654e-06f);
        p = madd(p, w,  0.00021858087f );
        p = madd(p, w, -0.00125372503f );
        p = madd(p, w, -0.00417768164f );
        p = madd(p, w,  0.246640727f   );
        p = madd(p, w,  1.50140941f    );
    } else {
        w = sqrtf(w) - 3.0f;
        p = -0.000200214257f;
        p = madd(p, w,  0.000100950558f);
        p = madd(p, w,  0.00134934322f );
        p = madd(p, w, -0.00367342844f );
        p = madd(p, w,  0.00573950773f );
        p = madd(p, w, -0.0076224613f  );
        p = madd(p, w,  0.00943887047f );
        p = madd(p, w,  1.00167406f    );
        p = madd(p, w,  2.83297682f    );
    }
    return p * x;
}
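
// Example (illustrative only, assuming u is a uniform random number in
// (0,1)): the inverse erf maps uniform samples onto a standard normal
// distribution:
//     float z = 1.41421356f * fast_ierf (2.0f * u - 1.0f);  // sqrt(2)*erfinv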

// (end of fast* functions)
////////////////////////////////////////////////////////////////////////////




////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// MISCELLANEOUS NUMERICAL METHODS
//


/// Solve for the x for which func(x) == y on the interval [xmin,xmax].
/// Use a maximum of maxiters iterations, and stop any time the remaining
/// search interval or the error in the function value is <= eps. If brack
/// is non-NULL, set it to true if y is in [f(xmin), f(xmax)], otherwise
/// false (in which case the caller should know that the results may be
/// unreliable). Results are undefined if the function is not monotonic
/// on that interval or if there are multiple roots in the interval (it
/// may not converge, or may converge to any of the roots without
/// telling you that there are more than one).
template<class T, class Func> OIIO_HOSTDEVICE
T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
          int maxiters=32, T eps=1.0e-6, bool *brack=0)
2238 {
2239 // Use the Regula Falsi method, falling back to bisection if it
2240 // hasn't converged after 3/4 of the maximum number of iterations.
2241 // See, e.g., Numerical Recipes for the basic ideas behind both
2242 // methods.
2243 T v0 = func(xmin), v1 = func(xmax);
2244 T x = xmin, v = v0;
2245 bool increasing = (v0 < v1);
2246 T vmin = increasing ? v0 : v1;
2247 T vmax = increasing ? v1 : v0;
2248 bool bracketed = (y >= vmin && y <= vmax);
2249 if (brack)
2250 *brack = bracketed;
2251 if (! bracketed) {
2252 // If our bounds don't bracket the zero, just give up, and
2253 // return the appropriate "edge" of the interval
2254 return ((y < vmin) == increasing) ? xmin : xmax;
2255 }
2256 if (fabs(v0-v1) < eps) // already close enough
2257 return x;
2258 int rfiters = (3*maxiters)/4; // how many times to try regula falsi
2259 for (int iters = 0; iters < maxiters; ++iters) {
2260 T t; // interpolation factor
2261 if (iters < rfiters) {
2262 // Regula falsi
2263 t = (y-v0)/(v1-v0);
2264 if (t <= T(0) || t >= T(1))
2265 t = T(0.5); // RF convergence failure -- bisect instead
2266 } else {
2267 t = T(0.5); // bisection
2268 }
2269 x = lerp (xmin, xmax, t);
2270 v = func(x);
2271 if ((v < y) == increasing) {
2272 xmin = x; v0 = v;
2273 } else {
2274 xmax = x; v1 = v;
2275 }
2276 if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2277 return x; // converged
2278 }
2279 return x;
2280 }
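
// Example (illustrative only): invert a monotonic function given as a
// lambda, here solving x^3 == 0.5 on [0,1]:
//     auto cube = [](float v) { return v * v * v; };
//     bool bracketed;
//     float r = invert (cube, 0.5f, 0.0f, 1.0f, 32, 1.0e-6f, &bracketed);
//     // r ~ 0.7937f, bracketed == true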



/// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
/// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
/// x==1.0.
inline OIIO_HOSTDEVICE float
interpolate_linear (float x, span_strided<const float> y)
{
#ifndef __CUDA_ARCH__
    DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (%zd)", y.size());
#endif
    x = clamp (x, float(0.0), float(1.0));
    int nsegs = int(y.size()) - 1;
    int segnum;
    x = floorfrac (x*nsegs, &segnum);
#ifndef __CUDA_ARCH__
    int nextseg = std::min (segnum+1, nsegs);
#else
    int nextseg = min (segnum+1, nsegs);
#endif
    return lerp (y[segnum], y[nextseg], x);
}
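
// Example (illustrative only, assuming span_strided<const float> can be
// constructed from a pointer and length): three evenly spaced knots,
// sampled halfway into the second segment:
//     const float knots[3] = { 0.0f, 1.0f, 4.0f };
//     float v = interpolate_linear (0.75f, span_strided<const float>(knots, 3));
//     // v == 2.5f : x=0.75 lies midway between knots[1] and knots[2]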

// (end miscellaneous numerical methods)
////////////////////////////////////////////////////////////////////////////



OIIO_NAMESPACE_END