1 // Copyright Contributors to the Open Shading Language project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
4 
5 // clang-format off
6 
7 #pragma once
8 
9 #include <initializer_list>
10 #include <utility>
11 #include <OSL/oslconfig.h>
12 #include <OSL/oslversion.h>
13 #include <OpenImageIO/fmath.h>
14 
15 
16 OSL_NAMESPACE_ENTER
17 
18 
19 // Shortcut notation for enable_if trickery
20 # define DUAL_REQUIRES(...) \
21   typename std::enable_if<(__VA_ARGS__), bool>::type = true
22 
23 
24 
25 // To generically access elements or partials of a Dual, use a ConstIndex<#>.
26 // NOTE:  Each ConstIndex<#> (ConstIndex<0>, ConstIndex<1>, ConstIndex<2>, ...)
27 // is a different type and known at compile time.  A regular loop index will
28 // not work to access elements or partials.  For non-generic code, the methods
29 // .val(), .dx(), .dy(), .dz() should be used.
30 // For generic code, the macro "OSL_INDEX_LOOP(__INDEX_NAME, __COUNT, ...)"
31 // is provided to execute a code block (...) with __INDEX_NAME correctly typed
32 // for each ConstIndex<#> within the # range [0-__COUNT).
33 // In practice, this means changing
34 //     for(int i=0; i < elements;++i) {
35 //          elem(i) += value;
36 //     }
37 // to
38 //     OSL_INDEX_LOOP(i, elements, {
39 //          elem(i) += value;
40 //     });
41 
42 
43 // If possible (C++14+) use generic lambda based to loop to repeat code
44 // sequences with compile time known ConstIndex.  Otherwise for c++11,
45 // a more limited manually unrolled loop to a fixed maximum __COUNT
46 // works because dead code elimination prunes extraneous iterations.
47 #if (OSL_CPLUSPLUS_VERSION >= 14) && !defined(__GNUC__)
48     // explanation of passing code block as macro argument to handle
49     // nested comma operators that might break up the code block into
50     // multiple macro arguments
51     // https://mort.coffee/home/obscure-c-features/
52     //
53     // The macro's variable argument list populates the body of the functor lambda.
54     // NOTE: generic lambda parameter __INDEX_NAME means that the functor itself
55     // is a template that accepts any type.  The static_foreach will call the
56     // templated functor with different types of ConstIndex<#> where # is [0-__COUNT)
57     #define OSL_INDEX_LOOP(__INDEX_NAME, __COUNT, ...) \
58         { \
59             auto functor = [&](auto __INDEX_NAME) { \
60                 __VA_ARGS__; \
61             }; \
62             static_foreach<ConstIndex, __COUNT>(functor); \
63         }
64 
65 #else
66     namespace pvt {
67         template<typename T>
68         static OSL_HOSTDEVICE constexpr T static_min(T a, T b) {
69             return (b < a) ? b : a;
70         }
71     }
72     // explanation of passing code block as macro argument to handle
73     // nested comma operators that might break up the code block into
74     // multiple macro arguments
75     // https://mort.coffee/home/obscure-c-features/
76     //
77     // We rely on dead code elimination to quickly get rid of the code emitted
78     // for out of range indices, but we do take care to not generate any ConstIndex<#>
79     // that would be out of range using the static_min helper
80     #define OSL_INDEX_LOOP(__INDEX_NAME, __COUNT, ...) \
81         static_assert((__COUNT) < 4, "macro based expansion must be repeated to support higher __COUNT values"); \
82         { ConstIndex<0> __INDEX_NAME; __VA_ARGS__ ; } \
83         if ((__COUNT) > 1) { ConstIndex<pvt::static_min(1, (__COUNT)-1)> __INDEX_NAME; __VA_ARGS__ ; } \
84         if ((__COUNT) > 2) { ConstIndex<pvt::static_min(2, (__COUNT)-1)> __INDEX_NAME; __VA_ARGS__ ; } \
85         if ((__COUNT) > 3) { ConstIndex<pvt::static_min(3, (__COUNT)-1)> __INDEX_NAME; __VA_ARGS__ ; }
86 #endif
87 
88 /// Dual numbers are used to represent values and their derivatives.
89 ///
90 /// The method is summarized in:
91 ///      Piponi, Dan, "Automatic Differentiation, C++ Templates,
92 ///      and Photogrammetry," Journal of Graphics, GPU, and Game
93 ///      Tools (JGT), Vol 9, No 4, pp. 41-55 (2004).
94 ///
95 ///
96 
97 // Default DualStorage can handle any number of partials by storing elements
98 // in an array.  In practice the array subscript operation can require
99 // standard data layout so that the base address + index works.  This
100 // can inhibit other transformations useful for vectorization such as
101 // storing data members in a Structure Of Arrays (SOA) layout
102 template<class T, int PARTIALS>
103 class DualStorage
104 {
105 public:
106     T m_elem[PARTIALS+1];
107 
108     template<int N>
elem(ConstIndex<N>)109     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<N>) const { return m_elem[N]; }
110     template<int N>
elem(ConstIndex<N>)111     OSL_HOSTDEVICE T& elem (ConstIndex<N>)  { return m_elem[N]; }
112 };
113 
114 
115 // Specialize DualStorage for specific number of partials and store elements
116 // as individual data members.  Because each ConstIndex<#> is its own type and
117 // identifies a specific element, we can overload the elem function for each
118 // specific ConstIndex<#> and return the corresponding data member.
119 // The main benefit is better enabling of SROA (Scalar Replacement of Aggregates)
120 // and other transformations that allow each data member to be
121 // optimized independently, kept in vector registers vs. scatter back to the
122 // stack, and more.
123 template<class T>
124 class DualStorage<T, 1>
125 {
126 public:
127     T m_val;
128     T m_dx;
129 
130     // To better enable Scalar Replacement of Aggregates and other
131     // transformations, CLANG has easier time if the per element
132     // constructors declared.
DualStorage()133     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage() {}
DualStorage(const T & val,const T & dx)134     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const T & val, const T & dx)
135     : m_val(val)
136     , m_dx(dx)
137     {}
DualStorage(const DualStorage & other)138     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const DualStorage &other)
139     : m_val(other.m_val)
140     , m_dx(other.m_dx)
141     {}
142 
elem(ConstIndex<0>)143     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<0>) const { return m_val; }
elem(ConstIndex<1>)144     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<1>) const { return m_dx; }
elem(ConstIndex<0>)145     OSL_HOSTDEVICE T& elem (ConstIndex<0>)  { return m_val; }
elem(ConstIndex<1>)146     OSL_HOSTDEVICE T& elem (ConstIndex<1>)  { return m_dx; }
147 };
148 
149 
150 // Specialize layout to be explicit data members vs. array
151 template<class T>
152 class DualStorage<T, 2>
153 {
154 public:
155     T m_val;
156     T m_dx;
157     T m_dy;
158 
159     // To better enable Scalar Replacement of Aggregates and other
160     // transformations, CLANG has easier time if the per element
161     // constructors declared.
DualStorage()162     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage() {}
DualStorage(const T & val,const T & dx,const T & dy)163     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const T & val, const T & dx, const T & dy)
164     : m_val(val)
165     , m_dx(dx)
166     , m_dy(dy)
167     {}
DualStorage(const DualStorage & other)168     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const DualStorage &other)
169     : m_val(other.m_val)
170     , m_dx(other.m_dx)
171     , m_dy(other.m_dy)
172     {}
173 
elem(ConstIndex<0>)174     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<0>) const { return m_val; }
elem(ConstIndex<1>)175     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<1>) const { return m_dx; }
elem(ConstIndex<2>)176     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<2>) const { return m_dy; }
elem(ConstIndex<0>)177     OSL_HOSTDEVICE T& elem (ConstIndex<0>)  { return m_val; }
elem(ConstIndex<1>)178     OSL_HOSTDEVICE T& elem (ConstIndex<1>)  { return m_dx; }
elem(ConstIndex<2>)179     OSL_HOSTDEVICE T& elem (ConstIndex<2>)  { return m_dy; }
180 };
181 
182 
183 template<class T>
184 class DualStorage<T, 3>
185 {
186 public:
187     T m_val;
188     T m_dx;
189     T m_dy;
190     T m_dz;
191 
192     // To better enable Scalar Replacement of Aggregates and other
193     // transformations, CLANG has easier time if the per element
194     // constructors declared.
DualStorage()195     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage() {}
DualStorage(const T & val,const T & dx,const T & dy,const T & dz)196     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const T & val, const T & dx, const T & dy, const T & dz)
197     : m_val(val)
198     , m_dx(dx)
199     , m_dy(dy)
200     , m_dz(dz)
201     {}
202 
DualStorage(const DualStorage & other)203     OSL_HOSTDEVICE OSL_CONSTEXPR14 DualStorage(const DualStorage &other)
204     : m_val(other.m_val)
205     , m_dx(other.m_dx)
206     , m_dy(other.m_dy)
207     , m_dz(other.dz)
208     {}
209 
elem(ConstIndex<0>)210     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<0>) const { return m_val; }
elem(ConstIndex<1>)211     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<1>) const { return m_dx; }
elem(ConstIndex<2>)212     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<2>) const { return m_dy; }
elem(ConstIndex<3>)213     OSL_HOSTDEVICE constexpr const T& elem (ConstIndex<3>) const { return m_dz; }
elem(ConstIndex<0>)214     OSL_HOSTDEVICE T& elem (ConstIndex<0>)  { return m_val; }
elem(ConstIndex<1>)215     OSL_HOSTDEVICE T& elem (ConstIndex<1>)  { return m_dx; }
elem(ConstIndex<2>)216     OSL_HOSTDEVICE T& elem (ConstIndex<2>)  { return m_dy; }
elem(ConstIndex<3>)217     OSL_HOSTDEVICE T& elem (ConstIndex<3>)  { return m_dz; }
218 };
219 
220 
221 
222 // Single generic implementation of Dual can handle any number of partials
223 // delegating access to element storage to the DualStorage base class
224 template<
225          class T,           // Base data type
226          int PARTIALS=1     // Number of dimentions of partial derivs
227         >
228 class Dual : public DualStorage<T, PARTIALS> {
229     static const int elements = PARTIALS+1;   // main value + partials
230     static_assert (PARTIALS>=1, "Can't have a Dual with 0 partials");
231 public:
232     using value_type = T;
zero()233     static OSL_FORCEINLINE OSL_HOSTDEVICE OSL_CONSTEXPR14 T zero() { return T(0.0); }
234 
235     using DualStorage<T, PARTIALS>::elem;
236     template<int N>
partial(ConstIndex<N>)237     OSL_HOSTDEVICE constexpr const T partial (ConstIndex<N>) const { return elem(ConstIndex<N+1>()); }
238     template<int N>
partial(ConstIndex<N>)239     OSL_HOSTDEVICE T& partial (ConstIndex<N>) { return elem(ConstIndex<N+1>()); }
240 
241     /// Default ctr leaves everything uninitialized
242     ///
Dual()243     OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual ()
244     : DualStorage<T, PARTIALS>()
245     {}
246 
247     /// Construct a Dual from just a real value (derivs set to 0)
248     ///
Dual(const T & x)249     OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual (const T &x)
250     : DualStorage<T, PARTIALS>()
251     {
252         val() = x;
253         OSL_INDEX_LOOP(i, PARTIALS, {
254            partial(i) = zero();
255         });
256     }
257 
258     /// Copy constructor from another Dual of same dimension and different,
259     /// but castable, data type.
260     template<class F>
Dual(const Dual<F,PARTIALS> & x)261     OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual (const Dual<F,PARTIALS> &x)
262     : DualStorage<T, PARTIALS>()
263     {
264         OSL_INDEX_LOOP(i, elements, {
265             elem(i) = T(x.elem(i));
266         });
267     }
268 
269     //OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual (const Dual &x) = default;
Dual(const Dual & x)270     OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual (const Dual &x)
271     : DualStorage<T, PARTIALS>(x)
272     {}
273 
274     /// Construct a Dual from a real and infinitesimals.
275     ///
276     template<typename... DerivListT>
Dual(const T & x,const DerivListT &...derivs)277     OSL_HOSTDEVICE constexpr Dual (const T &x, const DerivListT & ...derivs)
278     : DualStorage<T, PARTIALS>{ x, static_cast<T>(derivs)...}
279     {
280         static_assert(sizeof...(DerivListT)==PARTIALS, "Wrong number of initializers");
281     }
282 
283 protected:
284     template<int... IntListT, typename... ValueListT>
285     OSL_HOSTDEVICE OSL_FORCEINLINE void
set_expander(pvt::int_sequence<IntListT...>,const ValueListT &...values)286     set_expander (pvt::int_sequence<IntListT...> /*indices*/, const ValueListT & ...values)
287     {
288         __OSL_EXPAND_PARAMETER_PACKS( elem(ConstIndex<IntListT>()) = values );
289     }
290 public:
291 
292     template<typename... ValueListT>
set(const ValueListT &...values)293     OSL_HOSTDEVICE void set (const ValueListT & ...values)
294     {
295         static_assert(sizeof...(ValueListT)==elements, "Wrong number of initializers");
296 
297         set_expander(pvt::make_int_sequence<elements>(), values...);
298     }
299 
Dual(std::initializer_list<T> vals)300     OSL_HOSTDEVICE OSL_CONSTEXPR14 Dual (std::initializer_list<T> vals) {
301 #if OIIO_CPLUSPLUS_VERSION >= 14
302         static_assert (vals.size() == elements, "Wrong number of initializers");
303 #endif
304         OSL_INDEX_LOOP(i, elements, {
305             elem(i) = vals.begin()[i];
306         });
307     }
308 
309     /// Return the real value.
val()310     OSL_HOSTDEVICE constexpr const T& val () const { return elem(ConstIndex<0>()); }
val()311     OSL_HOSTDEVICE T& val () { return elem(ConstIndex<0>()); }
312 
313     /// Return the partial derivative with respect to x.
dx()314     OSL_HOSTDEVICE constexpr const T& dx () const { return elem(ConstIndex<1>()); }
dx()315     OSL_HOSTDEVICE T& dx () { return elem(ConstIndex<1>()); }
316 
317     /// Return the partial derivative with respect to y.
318     template<typename ThisType = Dual, typename std::enable_if<ThisType::elements==3, int>::type = 0>
319     OSL_HOSTDEVICE constexpr const T &
dy()320     dy () const {
321         return elem(ConstIndex<2>());
322     }
323     template<typename ThisType = Dual, typename std::enable_if<ThisType::elements==3, int>::type = 0>
dy()324     OSL_HOSTDEVICE T& dy () {
325         return elem(ConstIndex<2>());
326     }
327 
328     /// Return the partial derivative with respect to z.
329     /// Only valid if there are at least 3 partial dimensions.
330     template<typename ThisType = Dual, typename std::enable_if<ThisType::elements==4, int>::type = 0>
dz()331     OSL_HOSTDEVICE constexpr const T& dz () const {
332         return elem(ConstIndex<3>());
333     }
334     template<typename ThisType = Dual, typename std::enable_if<ThisType::elements==4, int>::type = 0>
dz()335     OSL_HOSTDEVICE T& dz () {
336         return elem(ConstIndex<3>());
337     }
338 
339     /// Clear the derivatives; leave the value alone.
clear_d()340     OSL_HOSTDEVICE void clear_d () {
341         OSL_INDEX_LOOP(i, PARTIALS, {
342             partial(i) = zero();
343         });
344     }
345 
346     /// Assignment of a real (the derivs are implicitly 0).
347     OSL_HOSTDEVICE const Dual & operator= (const T &x) {
348         val() = x;
349         OSL_INDEX_LOOP(i, PARTIALS, {
350             partial(i) = zero();
351         });
352         return *this;
353     }
354 
355     OSL_HOSTDEVICE const Dual & operator= (const Dual &other) {
356         OSL_INDEX_LOOP(i, elements, {
357             elem(i) = other.elem(i);
358         });
359         return *this;
360     }
361 
362 
363     /// Stream output.  Format as: "val[dx,dy,...]"
364     ///
365     friend std::ostream& operator<< (std::ostream &out, const Dual &x) {
366         out << x.val() << "[";
367         OSL_INDEX_LOOP(i, PARTIALS, {
368             out << (x.partial(i)) << ((i < PARTIALS-1) ? ',' : ']');
369         });
370         return out;
371     }
372 
373 };
374 
375 
376 
377 /// is_Dual<TYPE>::value is true for Dual, false for everything else.
378 /// You can also evaluate an is_Dual<T> as a bool.
379 template<class T> struct is_Dual : std::false_type {};
380 template<class T, int P> struct is_Dual<Dual<T,P>> : std::true_type {};
381 
382 /// Dualify<T>::type returns Dual<T> if T is not a dual, or just T if it's
383 /// already a dual.
384 template<class T> struct UnDual { typedef T type; };
385 template<class T, int P> struct UnDual<Dual<T,P>> { typedef T type; };
386 
387 /// Dualify<T,P>::type returns Dual<T,P> if T is not a dual, or just T if
388 /// it's already a dual.
389 template<class T, int P=1> struct Dualify {
390     typedef Dual<typename UnDual<T>::type, P> type;
391 };
392 
393 
394 
395 /// Define Dual2<T> as a Dual<T,2> -- a T-based quantity with x and y
396 /// partial derivatives.
397 template<class T> using Dual2 = Dual<T,2>;
398 
399 
400 
401 /// Addition of duals.
402 ///
403 template<class T, int P>
404 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator+ (const Dual<T,P> &a, const Dual<T,P> &b)
405 {
406     Dual<T,P> result = a;
407     OSL_INDEX_LOOP(i, P+1, {
408         result.elem(i) += b.elem(i);
409     });
410     return result;
411 }
412 
413 
414 template<class T, int P>
415 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator+ (const Dual<T,P> &a, const T &b)
416 {
417     Dual<T,P> result = a;
418     result.val() += b;
419     return result;
420 }
421 
422 
423 template<class T, int P>
424 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator+ (const T &a, const Dual<T,P> &b)
425 {
426     Dual<T,P> result = b;
427     result.val() += a;
428     return result;
429 }
430 
431 
432 template<class T, int P>
433 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P>& operator+= (Dual<T,P> &a, const Dual<T,P> &b)
434 {
435     OSL_INDEX_LOOP(i, P+1, {
436         a.elem(i) += b.elem(i);
437     });
438     return a;
439 }
440 
441 
442 template<class T, int P>
443 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P>& operator+= (Dual<T,P> &a, const T &b)
444 {
445     a.val() += b;
446     return a;
447 }
448 
449 
450 /// Subtraction of duals.
451 ///
452 template<class T, int P>
453 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator- (const Dual<T,P> &a, const Dual<T,P> &b)
454 {
455     Dual<T,P> result;
456     OSL_INDEX_LOOP(i, P+1, {
457         result.elem(i) = a.elem(i) - b.elem(i);
458     });
459     return result;
460 }
461 
462 
463 template<class T, int P>
464 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator- (const Dual<T,P> &a, const T &b)
465 {
466     Dual<T,P> result = a;
467     result.val() -= b;
468     return result;
469 }
470 
471 
472 template<class T, int P>
473 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator- (const T &a, const Dual<T,P> &b)
474 {
475     Dual<T,P> result;
476     result.val() = a - b.val();
477     OSL_INDEX_LOOP(i, P, {
478         result.partial(i) = -b.partial(i);
479     });
480     return result;
481 }
482 
483 
484 template<class T, int P>
485 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P>& operator-= (Dual<T,P> &a, const Dual<T,P> &b)
486 {
487     OSL_INDEX_LOOP(i, P+1, {
488         a.elem(i) -= b.elem(i);
489     });
490     return a;
491 }
492 
493 
494 template<class T, int P>
495 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P>& operator-= (Dual<T,P> &a, const T &b)
496 {
497     a.val() -= b;
498     return a;
499 }
500 
501 
502 
503 /// Negation of duals.
504 ///
505 template<class T, int P>
506 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator- (const Dual<T,P> &a)
507 {
508     Dual<T,P> result;
509     OSL_INDEX_LOOP(i, P+1, {
510         result.elem(i) = -a.elem(i);
511     });
512     return result;
513 }
514 
515 
516 /// Multiplication of duals. This will work for any Dual<T>*Dual<S>
517 /// where T*S is meaningful.
518 template<class T, int P, class S>
519 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 auto
520 operator* (const Dual<T,P> &a, const Dual<S,P> &b) -> Dual<decltype(a.val()*b.val()),P>
521 {
522     // Use the chain rule
523     Dual<decltype(a.val()*b.val()),P> result;
524     result.val() = a.val() * b.val();
525     OSL_INDEX_LOOP(i, P, {
526         result.partial(i) = a.val()*b.partial(i) + a.partial(i)*b.val();
527     });
528     return result;
529 }
530 
531 
532 /// Multiplication of dual by a non-dual scalar. This will work for any
533 /// Dual<T> * S where T*S is meaningful.
534 template<class T, int P, class S,
535          DUAL_REQUIRES(is_Dual<S>::value == false)>
536 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 auto
537 operator* (const Dual<T,P> &a, const S &b) -> Dual<decltype(a.val()*b),P>
538 {
539     Dual<decltype(a.val()*b),P> result;
540     OSL_INDEX_LOOP(i, P+1, {
541         result.elem(i) = a.elem(i) * b;
542     });
543     return result;
544 }
545 
546 
547 /// Multiplication of dual by scalar in place.
548 ///
549 template<class T, int P, class S,
550          DUAL_REQUIRES(is_Dual<S>::value == false)>
551 OSL_HOSTDEVICE OSL_FORCEINLINE const Dual<T,P>& operator*= (Dual<T,P> &a, const S &b)
552 {
553     OSL_INDEX_LOOP(i, P+1, {
554         a.elem(i) *= b;
555     });
556     return a;
557 }
558 
559 
560 
561 /// Multiplication of dual by a non-dual scalar. This will work for any
562 /// Dual<T> * S where T*S is meaningful.
563 template<class T, int P, class S,
564          DUAL_REQUIRES(is_Dual<S>::value == false)>
565 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 auto
566 operator* (const S &b, const Dual<T,P> &a) -> Dual<decltype(a.val()*b),P>
567 {
568     return a*b;
569 }
570 
571 
572 
573 /// Division of duals.
574 ///
575 template<class T, int P>
576 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator/ (const Dual<T,P> &a, const Dual<T,P> &b)
577 {
578     T bvalinv = T(1) / b.val();
579     T aval_bval = a.val() / b.val();
580     Dual<T,P> result;
581     result.val() = aval_bval;
582     OSL_INDEX_LOOP(i, P, {
583         result.partial(i) = bvalinv * (a.partial(i) - aval_bval * b.partial(i));
584     });
585     return result;
586 }
587 
588 
589 /// Division of dual by scalar.
590 ///
591 template<class T, int P>
592 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator/ (const Dual<T,P> &a, const T &b)
593 {
594     T bvalinv = T(1) / b;
595     T aval_bval = a.val() / b;
596     Dual<T,P> result;
597     result.val() = aval_bval;
598     OSL_INDEX_LOOP(i, P, {
599         result.partial(i) = bvalinv * a.partial(i);
600     });
601     return result;
602 }
603 
604 
605 /// Division of scalar by dual.
606 ///
607 template<class T, int P>
608 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> operator/ (const T &aval, const Dual<T,P> &b)
609 {
610     T bvalinv = T(1) / b.val();
611     T aval_bval = aval / b.val();
612     Dual<T,P> result;
613     result.val() = aval_bval;
614     OSL_INDEX_LOOP(i, P, {
615         result.partial(i) = bvalinv * ( - aval_bval * b.partial(i));
616     });
617     return result;
618 }
619 
620 
621 
622 
623 template<class T, int P>
624 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator< (const Dual<T,P> &a, const Dual<T,P> &b) {
625     return a.val() < b.val();
626 }
627 
628 template<class T, int P>
629 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator< (const Dual<T,P> &a, const T &b) {
630     return a.val() < b;
631 }
632 
633 template<class T, int P>
634 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator< (const T &a, const Dual<T,P> &b) {
635     return a < b.val();
636 }
637 
638 template<class T, int P>
639 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator> (const Dual<T,P> &a, const Dual<T,P> &b) {
640     return a.val() > b.val();
641 }
642 
643 template<class T, int P>
644 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator> (const Dual<T,P> &a, const T &b) {
645     return a.val() > b;
646 }
647 
648 template<class T, int P>
649 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator> (const T &a, const Dual<T,P> &b) {
650     return a > b.val();
651 }
652 
653 template<class T, int P>
654 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator<= (const Dual<T,P> &a, const Dual<T,P> &b) {
655     return a.val() <= b.val();
656 }
657 
658 template<class T, int P>
659 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator<= (const Dual<T,P> &a, const T &b) {
660     return a.val() <= b;
661 }
662 
663 template<class T, int P>
664 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator<= (const T &a, const Dual<T,P> &b) {
665     return a <= b.val();
666 }
667 
668 template<class T, int P>
669 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator>= (const Dual<T,P> &a, const Dual<T,P> &b) {
670     return a.val() >= b.val();
671 }
672 
673 template<class T, int P>
674 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator>= (const Dual<T,P> &a, const T &b) {
675     return a.val() >= b;
676 }
677 
678 template<class T, int P>
679 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool operator>= (const T &a, const Dual<T,P> &b) {
680     return a >= b.val();
681 }
682 
683 
684 
685 // Eliminate the derivatives of a number, works for scalar as well as Dual.
686 template<class T> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
687 removeDerivatives (const T &x) { return x; }
688 
689 template<class T, int P> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
690 removeDerivatives (const Dual<T,P> &x) { return x.val(); }
691 
692 
693 // Get the x derivative (or 0 for a non-Dual)
694 template<class T> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
695 getXDerivative (const T & /*x*/) { return T(0); }
696 
697 template<class T, int P> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
698 getXDerivative (const Dual<T,P> &x) { return x.dx(); }
699 
700 
701 // Get the y derivative (or 0 for a non-Dual)
702 template<class T> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
703 getYDerivative (const T & /*x*/) { return T(0); }
704 
705 template<class T, int P> OSL_HOSTDEVICE OSL_FORCEINLINE constexpr const T&
706 getYDerivative (const Dual<T,P> &x) { return x.dy(); }
707 
708 
709 // Simple templated "copy" function
710 template<class T> OSL_HOSTDEVICE OSL_FORCEINLINE void
711 assignment (T &a, const T &b) { a = b; }
712 template<class T, int P> OSL_HOSTDEVICE OSL_FORCEINLINE void
713 assignment (T &a, const Dual<T,P> &b) { a = b.val(); }
714 
715 // Templated value equality. For scalars, it's the same as regular ==.
716 // For Dual's, this only tests the value, not the derivatives. This solves
717 // a pesky source of confusion about whether operator== of Duals ought to
718 // return if just their value is equal or if the whole struct (including
719 // derivs) are equal.
720 template<class T>
721 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool equalVal (const T &x, const T &y) {
722     return x == y;
723 }
724 
725 template<class T, int P>
726 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool equalVal (const Dual<T,P> &x, const Dual<T,P> &y) {
727     return x.val() == y.val();
728 }
729 
730 template<class T, int P>
731 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool equalVal (const Dual<T,P> &x, const T &y) {
732     return x.val() == y;
733 }
734 
735 template<class T, int P>
736 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr bool equalVal (const T &x, const Dual<T,P> &y) {
737     return x == y.val();
738 }
739 
740 
741 
742 /// equalAll is the same as equalVal generally, but for two Duals of the same
743 /// type, they also compare derivs.
744 template<class T>
745 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 bool equalAll (const T &x, const T &y) {
746     return equalVal(x, y);
747 }
748 
749 template<class T, int P>
750 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 bool equalAll (const Dual<T,P> &x, const Dual<T,P> &y) {
751     bool r = true;
752     OSL_INDEX_LOOP(i, P+1, {
753         r &= (x.elem(i) == y.elem(i));
754     });
755     return r;
756 }
757 
758 
759 
760 
761 // Helper for constructing the result of a Dual function of one variable,
762 // given the scalar version and its derivative. Suppose you have scalar
763 // function f(scalar), then the dual function F(<u,u'>) is defined as:
764 //    F(<u,u'>) = < f(u), f'(u)*u' >
765 template<class T, int P>
766 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P>
767 dualfunc (const Dual<T,P>& u, const T& f_val, const T& df_val)
768 {
769     Dual<T,P> result;
770     result.val() = f_val;
771     OSL_INDEX_LOOP(i, P, {
772         result.partial(i) = df_val * u.partial(i);
773     });
774     return result;
775 }
776 
777 
778 // Helper for constructing the result of a Dual function of two variables,
779 // given the scalar version and its derivative. In general, the dual-form of
780 // the primitive function 'f(u,v)' is:
781 //   F(<u,u'>, <v,v'>) = < f(u,v), dfdu(u,v)u' + dfdv(u,v)v' >
782 // (from http://en.wikipedia.org/wiki/Automatic_differentiation)
783 template<class T, int P>
784 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P>
785 dualfunc (const Dual<T,P>& u, const Dual<T,P>& v,
786           const T& f_val, const T& dfdu_val, const T& dfdv_val)
787 {
788     Dual<T,P> result;
789     result.val() = f_val;
790     OSL_INDEX_LOOP(i, P, {
791         result.partial(i) = dfdu_val * u.partial(i) + dfdv_val * v.partial(i);
792     });
793     return result;
794 }
795 
796 
797 // Helper for construction the result of a Dual function of three variables.
798 template<class T, int P>
799 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P>
800 dualfunc (const Dual<T,P>& u, const Dual<T,P>& v, const Dual<T,P>& w,
801           const T& f_val, const T& dfdu_val, const T& dfdv_val, const T& dfdw_val)
802 {
803     Dual<T,P> result;
804     result.val() = f_val;
805     OSL_INDEX_LOOP(i, P, {
806         result.partial(i) = dfdu_val * u.partial(i) + dfdv_val * v.partial(i) + dfdw_val * w.partial(i);
807     });
808     return result;
809 }
810 
811 
812 
813 /// Fast negation of duals, in cases where you aren't too picky about the
814 /// difference between +0 and -0. (Courtesy of Alex Wells, Intel) Please see
815 /// OIIO's fast_math.h fast_neg for a more full explanation of why this is
816 /// faster.
817 template<class T, int P>
818 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> fast_neg (const Dual<T,P> &a)
819 {
820     Dual<T,P> result;
821     OSL_INDEX_LOOP(i, P+1, {
822         result.elem(i) = T(0) - a.elem(i);
823     });
824     return result;
825 }
826 
827 
828 // f(x) = cos(x), f'(x) = -sin(x)
829 template<class T, int P>
830 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> cos (const Dual<T,P> &a)
831 {
832     T sina, cosa;
833     OIIO::sincos(a.val(), &sina, &cosa);
834     return dualfunc (a, cosa, -sina);
835 }
836 
837 template<class T, int P>
838 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_cos(const Dual<T,P> &a)
839 {
840     T sina, cosa;
841     OIIO::fast_sincos (a.val(), &sina, &cosa);
842     return dualfunc (a, cosa, -sina);
843 }
844 
845 // f(x) = sin(x),  f'(x) = cos(x)
846 template<class T, int P>
847 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> sin (const Dual<T,P> &a)
848 {
849     T sina, cosa;
850     OIIO::sincos(a.val(), &sina, &cosa);
851     return dualfunc (a, sina, cosa);
852 }
853 
854 template<class T, int P>
855 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_sin(const Dual<T,P> &a)
856 {
857     T sina, cosa;
858     OIIO::fast_sincos (a.val(), &sina, &cosa);
859     return dualfunc (a, sina, cosa);
860 }
861 
862 template<class T, int P>
863 OSL_HOSTDEVICE OSL_FORCEINLINE void sincos(const Dual<T,P> &a, Dual<T,P> *sine, Dual<T,P> *cosine)
864 {
865 	T sina, cosa;
866 	OIIO::sincos(a.val(), &sina, &cosa);
867     *cosine = dualfunc (a, cosa, -sina);
868     *sine   = dualfunc (a, sina, cosa);
869 }
870 
871 template<class T, int P>
872 OSL_HOSTDEVICE OSL_FORCEINLINE void fast_sincos(const Dual<T,P> &a, Dual<T,P> *sine, Dual<T,P> *cosine)
873 {
874 	T sina, cosa;
875 	OIIO::fast_sincos(a.val(), &sina, &cosa);
876     *cosine = dualfunc (a, cosa, -sina);
877     *sine   = dualfunc (a, sina, cosa);
878 }
879 
880 // f(x) = tan(x), f'(x) = sec^2(x)
881 template<class T, int P>
882 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> tan (const Dual<T,P> &a)
883 {
884     T tana  = std::tan (a.val());
885     T cosa  = std::cos (a.val());
886     T sec2a = T(1)/(cosa*cosa);
887     return dualfunc (a, tana, sec2a);
888 }
889 
890 template<class T, int P>
891 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_tan(const Dual<T,P> &a)
892 {
893     T tana  = OIIO::fast_tan (a.val());
894     T cosa  = OIIO::fast_cos (a.val());
895     T sec2a = 1 / (cosa * cosa);
896     return dualfunc (a, tana, sec2a);
897 }
898 
899 // f(x) = cosh(x), f'(x) = sinh(x)
900 template<class T, int P>
901 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> cosh (const Dual<T,P> &a)
902 {
903     T f = std::cosh(a.val());
904     T df = std::sinh(a.val());
905     return dualfunc (a, f, df);
906 }
907 
908 template<class T, int P>
909 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_cosh(const Dual<T,P> &a)
910 {
911     T f = OIIO::fast_cosh(a.val());
912     T df = OIIO::fast_sinh(a.val());
913     return dualfunc (a, f, df);
914 }
915 
916 
917 // f(x) = sinh(x), f'(x) = cosh(x)
918 template<class T, int P>
919 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> sinh (const Dual<T,P> &a)
920 {
921     T f = std::sinh(a.val());
922     T df = std::cosh(a.val());
923     return dualfunc (a, f, df);
924 }
925 
926 template<class T, int P>
927 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_sinh(const Dual<T,P> &a)
928 {
929     T f = OIIO::fast_sinh(a.val());
930     T df = OIIO::fast_cosh(a.val());
931     return dualfunc (a, f, df);
932 }
933 
934 // f(x) = tanh(x), f'(x) = sech^2(x)
935 template<class T, int P>
936 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> tanh (const Dual<T,P> &a)
937 {
938     T tanha = std::tanh(a.val());
939     T cosha = std::cosh(a.val());
940     T sech2a = T(1)/(cosha*cosha);
941     return dualfunc (a, tanha, sech2a);
942 }
943 
944 template<class T, int P>
945 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_tanh(const Dual<T,P> &a)
946 {
947     T tanha = OIIO::fast_tanh(a.val());
948     T cosha = OIIO::fast_cosh(a.val());
949     T sech2a = T(1) / (cosha * cosha);
950     return dualfunc (a, tanha, sech2a);
951 }
952 
953 // f(x) = acos(x), f'(x) = -1/(sqrt(1 - x^2))
954 template<class T, int P>
955 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_acos (const Dual<T,P> &a)
956 {
957     if (a.val() >= T(1))
958         return Dual<T,P> (T(0));
959     if (a.val() <= T(-1))
960         return Dual<T,P> (T(M_PI));
961     T f = std::acos (a.val());
962     T df = -T(1) / std::sqrt (T(1) - a.val()*a.val());
963     return dualfunc (a, f, df);
964 }
965 
966 template<class T, int P>
967 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_acos(const Dual<T,P> &a)
968 {
969     T f = OIIO::fast_acos(a.val());
970     T df = fabsf(a.val()) < 1.0f ? -1.0f / sqrtf(1.0f - a.val() * a.val()) : 0.0f;
971     return dualfunc (a, f, df);
972 }
973 
974 // f(x) = asin(x), f'(x) = 1/(sqrt(1 - x^2))
975 template<class T, int P>
976 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_asin (const Dual<T,P> &a)
977 {
978     if (a.val() >= T(1))
979         return Dual<T,P> (T(M_PI/2));
980     if (a.val() <= T(-1))
981         return Dual<T,P> (T(-M_PI/2));
982     T f = std::asin (a.val());
983     T df = T(1) / std::sqrt (T(1) - a.val()*a.val());
984     return dualfunc (a, f, df);
985 }
986 
987 template<class T, int P>
988 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_asin(const Dual<T,P> &a)
989 {
990     T f = OIIO::fast_asin(a.val());
991     T df = fabsf(a.val()) < 1.0f ? 1.0f / sqrtf(1.0f - a.val() * a.val()) : 0.0f;
992     return dualfunc (a, f, df);
993 }
994 
995 
996 // f(x) = atan(x), f'(x) = 1/(1 + x^2)
997 template<class T, int P>
998 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> atan (const Dual<T,P> &a)
999 {
1000     T f = std::atan (a.val());
1001     T df = T(1) / (T(1) + a.val()*a.val());
1002     return dualfunc (a, f, df);
1003 }
1004 
1005 template<class T, int P>
1006 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_atan(const Dual<T,P> &a)
1007 {
1008     T f = OIIO::fast_atan(a.val());
1009     T df = 1.0f / (1.0f + a.val() * a.val());
1010     return dualfunc (a, f, df);
1011 }
1012 
1013 // f(x,y) = atan2(y,x); f'(x) =  y x' / (x^2 + y^2),
1014 //                      f'(y) = -x y' / (x^2 + y^2)
1015 // reference:  http://en.wikipedia.org/wiki/Atan2
1016 // (above link has other formulations)
1017 template<class T, int P>
1018 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> atan2 (const Dual<T,P> &y, const Dual<T,P> &x)
1019 {
1020     T atan2xy = std::atan2 (y.val(), x.val());
1021     T denom = (x.val() == T(0) && y.val() == T(0)) ? T(0) : T(1) / (x.val()*x.val() + y.val()*y.val());
1022     return dualfunc (y, x, atan2xy, -x.val()*denom, y.val()*denom);
1023 }
1024 
1025 template<class T, int P>
1026 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_atan2(const Dual<T,P> &y, const Dual<T,P> &x)
1027 {
1028     T atan2xy = OIIO::fast_atan2(y.val(), x.val());
1029     T denom = (x.val() == 0 && y.val() == 0) ? 0.0f : 1.0f / (x.val() * x.val() + y.val() * y.val());
1030     return dualfunc (y, x, atan2xy, -x.val()*denom, y.val()*denom);
1031 }
1032 
1033 // f(x) = log(a), f'(x) = 1/x
1034 // (log base e)
1035 template<class T, int P>
1036 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_log (const Dual<T,P> &a)
1037 {
1038     T f = OIIO::safe_log(a.val());
1039     T df = a.val() < std::numeric_limits<T>::min() ? T(0) : T(1) / a.val();
1040     return dualfunc (a, f, df);
1041 }
1042 
1043 template<class T, int P>
1044 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_log(const Dual<T,P> &a)
1045 {
1046     T f = OIIO::fast_log(a.val());
1047     T df = a.val() < std::numeric_limits<float>::min() ? 0.0f : 1.0f / a.val();
1048     return dualfunc (a, f, df);
1049 }
1050 
1051 
1052 // to compute pow(u,v), we need the dual-form representation of
1053 // the pow() operator.  In general, the dual-form of the primitive
1054 // function 'g' is:
1055 //   g(<u,u'>, <v,v'>) = < g(u,v), dgdu(u,v)u' + dgdv(u,v)v' >
1056 //   (from http://en.wikipedia.org/wiki/Automatic_differentiation)
1057 // so, pow(u,v) = < u^v, vu^(v-1) u' + log(u)u^v v' >
1058 template<class T, int P>
1059 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_pow (const Dual<T,P> &u, const Dual<T,P> &v)
1060 {
1061     // NOTE: this function won't return exactly the same as pow(x,y) because we
1062     // use the identity u^v=u * u^(v-1) which does not hold in all cases for our
1063     // "safe" variant (nor does it hold in general in floating point arithmetic).
1064     T powuvm1 = OIIO::safe_pow(u.val(), v.val() - T(1));
1065     T powuv   = powuvm1 * u.val();
1066     T logu    = u.val() > 0 ? OIIO::safe_log(u.val()) : T(0);
1067     return dualfunc (u, v, powuv, v.val()*powuvm1, logu*powuv);
1068 }
1069 // Fallthrough to OIIO::safe_pow for floats and vectors
1070 using OIIO::safe_pow;
1071 
1072 template<class T, int P>
1073 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_safe_pow(const Dual<T,P> &u, const Dual<T,P> &v)
1074 {
1075     // NOTE: same issue as above (fast_safe_pow does even more clamping)
1076     T powuvm1 = OIIO::fast_safe_pow (u.val(), v.val() - 1.0f);
1077     T powuv   = powuvm1 * u.val();
1078     T logu    = u.val() > 0 ? OIIO::fast_log(u.val()) : 0.0f;
1079     return dualfunc (u, v, powuv, v.val()*powuvm1, logu*powuv);
1080 }
1081 
1082 // f(x) = log2(x), f'(x) = 1/(x*log2)
1083 // (log base 2)
1084 template<class T, int P>
1085 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_log2 (const Dual<T,P> &a)
1086 {
1087     T f = safe_log2(a.val());
1088     T df = a.val() < std::numeric_limits<T>::min() ? T(0) : T(1) / (a.val() * T(M_LN2));
1089     return dualfunc (a, f, df);
1090 }
1091 
1092 template<class T, int P>
1093 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_log2(const Dual<T,P> &a)
1094 {
1095     T f = OIIO::fast_log2(a.val());
1096     T aln2 = a.val() * float(M_LN2);
1097     T df = aln2 < std::numeric_limits<float>::min() ? 0.0f : 1.0f / aln2;
1098     return dualfunc (a, f, df);
1099 }
1100 
1101 // f(x) = log10(x), f'(x) = 1/(x*log10)
1102 // (log base 10)
1103 template<class T, int P>
1104 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> safe_log10 (const Dual<T,P> &a)
1105 {
1106     T f = safe_log10(a.val());
1107     T df = a.val() < std::numeric_limits<T>::min() ? T(0) : T(1) / (a.val() * T(M_LN10));
1108     return dualfunc (a, f, df);
1109 }
1110 
1111 template<class T, int P>
1112 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_log10(const Dual<T,P> &a)
1113 {
1114     T f  = OIIO::fast_log10(a.val());
1115     T aln10 = a.val() * float(M_LN10);
1116     T df  = aln10 < std::numeric_limits<float>::min() ? 0.0f : 1.0f / aln10;
1117     return dualfunc (a, f, df);
1118 }
1119 
1120 // f(x) = e^x, f'(x) = e^x
1121 template<class T, int P>
1122 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> exp (const Dual<T,P> &a)
1123 {
1124     T f = std::exp(a.val());
1125     return dualfunc (a, f, f);
1126 }
1127 
1128 template<class T, int P>
1129 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_exp(const Dual<T,P> &a)
1130 {
1131     T f = OIIO::fast_exp(a.val());
1132     return dualfunc (a, f, f);
1133 }
1134 
1135 // f(x) = 2^x, f'(x) = (2^x)*log(2)
1136 template<class T, int P>
1137 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> exp2 (const Dual<T,P> &a)
1138 {
1139     using std::exp2;
1140     T f = exp2(a.val());
1141     return dualfunc (a, f, f*T(M_LN2));
1142 }
1143 
1144 template<class T, int P>
1145 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_exp2(const Dual<T,P> &a)
1146 {
1147     T f = OIIO::fast_exp2(float(a.val()));
1148     return dualfunc (a, f, f*T(M_LN2));
1149 }
1150 
1151 
1152 // f(x) = e^x - 1, f'(x) = e^x
1153 template<class T, int P>
1154 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> expm1 (const Dual<T,P> &a)
1155 {
1156     using std::expm1;
1157     T f  = expm1(a.val());
1158     T df = std::exp  (a.val());
1159     return dualfunc (a, f, df);
1160 }
1161 
1162 template<class T, int P>
1163 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_expm1(const Dual<T,P> &a)
1164 {
1165     T f  = OIIO::fast_expm1(a.val());
1166     T df = OIIO::fast_exp  (a.val());
1167     return dualfunc (a, f, df);
1168 }
1169 
1170 // f(x) = erf(x), f'(x) = (2e^(-x^2))/sqrt(pi)
1171 template<class T, int P>
1172 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> erf (const Dual<T,P> &a)
1173 {
1174     using std::erf;
1175     T f = erf (a.val());
1176     const T two_over_sqrt_pi = T(1.128379167095512573896158903);
1177     T df = std::exp (-a.val() * a.val()) * two_over_sqrt_pi;
1178     return dualfunc (a, f, df);
1179 }
1180 
1181 template<class T, int P>
1182 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_erf(const Dual<T,P> &a)
1183 {
1184     T f = OIIO::fast_erf (float(a.val())); // float version!
1185     const T two_over_sqrt_pi = 1.128379167095512573896158903f;
1186     T df = OIIO::fast_exp(-a.val() * a.val()) * two_over_sqrt_pi;
1187     return dualfunc (a, f, df);
1188 }
1189 
1190 // f(x) = erfc(x), f'(x) = -(2e^(-x^2))/sqrt(pi)
1191 template<class T, int P>
1192 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> erfc (const Dual<T,P> &a)
1193 {
1194     using std::erfc;
1195     T f = erfc (a.val()); // float version!
1196     const T two_over_sqrt_pi = -T(1.128379167095512573896158903);
1197     T df = std::exp (-a.val() * a.val()) * two_over_sqrt_pi;
1198     return dualfunc (a, f, df);
1199 }
1200 
1201 template<class T, int P>
1202 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_erfc(const Dual<T,P> &a)
1203 {
1204     T f = OIIO::fast_erfc (float(a.val())); // float version!
1205     const T two_over_sqrt_pi = -1.128379167095512573896158903f;
1206     T df = OIIO::fast_exp(-a.val() * a.val()) * two_over_sqrt_pi;
1207     return dualfunc (a, f, df);
1208 }
1209 
1210 
1211 // f(x) = sqrt(x), f'(x) = 1/(2*sqrt(x))
1212 template<class T, int P>
1213 OSL_HOSTDEVICE OSL_FORCEINLINE OSL_CONSTEXPR14 Dual<T,P> sqrt (const Dual<T,P> &a)
1214 {
1215     Dual<T,P> result;
1216     if (OSL_LIKELY(a.val() > T(0))) {
1217         T f  = std::sqrt(a.val());
1218         T df = T(0.5) / f;
1219         result = dualfunc (a, f, df);
1220     } else {
1221         result = Dual<T,P> (T(0));
1222     }
1223     return result;
1224 }
1225 
1226 // f(x) = 1/sqrt(x), f'(x) = -1/(2*x^(3/2))
1227 template<class T, int P>
1228 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> inversesqrt (const Dual<T,P> &a)
1229 {
1230     // do we want to print an error message?
1231     Dual<T,P> result;
1232     if (OSL_LIKELY(a.val() > T(0))) {
1233         T f  = T(1)/std::sqrt(a.val());
1234         T df = T(-0.5)*f/a.val();
1235         result = dualfunc (a, f, df);
1236     } else {
1237         result = Dual<T,P> (T(0));
1238     }
1239     return result;
1240 }
1241 
1242 // f(x) = cbrt(x), f'(x) = 1/(3*x^(2/3))
1243 template<class T, int P>
1244 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> cbrt (const Dual<T,P> &a)
1245 {
1246     if (OSL_LIKELY(a.val() != T(0))) {
1247         T f = std::cbrt(a.val());
1248         T df = T(1) / (T(3) * f * f);
1249         return dualfunc(a, f, df);
1250     }
1251     return Dual<T,P> (T(0));
1252 }
1253 
1254 template<class T, int P>
1255 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fast_cbrt (const Dual<T,P> &a)
1256 {
1257     if (OSL_LIKELY(a.val() != T(0))) {
1258         T f = OIIO::fast_cbrt(float(a.val())); // float version!
1259         T df = T(1) / (T(3) * f * f);
1260         return dualfunc(a, f, df);
1261     }
1262     return Dual<T,P> (T(0));
1263 }
1264 
1265 // (fx) = x*(1-a) + y*a, f'(x) = (1-a)x' + (y - x)*a' + a*y'
1266 template<class T, int P>
1267 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> mix (const Dual<T,P> &x, const Dual<T,P> &y, const Dual<T,P> &a)
1268 {
1269     T mixval = x.val()*(T(1)-a.val()) + y.val()*a.val();
1270     return dualfunc (x, y, a, mixval, T(1)-a.val(), a.val(), y.val() - x.val());
1271 }
1272 
1273 template<class T, int P>
1274 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> fabs (const Dual<T,P> &x)
1275 {
1276     return x.val() >= T(0) ? x : -x;
1277 }
1278 
1279 
1280 #ifdef OIIO_FMATH_HAS_SAFE_FMOD
1281 using OIIO::safe_fmod;
1282 #else
1283 OSL_FORCEINLINE OSL_HOSTDEVICE float safe_fmod (float a, float b)
1284 {
1285     if (OSL_LIKELY(b != 0.0f)) {
1286 #if 0
1287         return std::fmod (a,b);
1288         // std::fmod was getting called serially instead of vectorizing, so
1289         // we will just do the calculation ourselves
1290 #else
1291         // This formulation is equivalent but much faster in our benchmarks,
1292         // also vectorizes better.
1293         // The floating-point remainder of the division operation
1294         // a/b is a - N*b, where N = a/b with its fractional part truncated.
1295         int N = static_cast<int>(a/b);
1296         return a - N*b;
1297 #endif
1298     }
1299     return 0.0f;
1300 }
1301 #endif
1302 
1303 template<class T, int P>
1304 OSL_FORCEINLINE OSL_HOSTDEVICE Dual<T,P>
1305 safe_fmod (const Dual<T,P>& a, const Dual<T,P>& b)
1306 {
1307     Dual<T,P> result = a;
1308     result.val() = safe_fmod(a.val(), b.val());
1309     return result;
1310 }
1311 
1312 
1313 
1314 OSL_HOSTDEVICE OSL_FORCEINLINE float smoothstep(float e0, float e1, float x) {
1315     if (x < e0) return 0.0f;
1316     else if (x >= e1) return 1.0f;
1317     else {
1318         float t = (x - e0)/(e1 - e0);
1319         return (3.0f-2.0f*t)*(t*t);
1320     }
1321 }
1322 
1323 // f(t) = (3-2t)t^2,   t = (x-e0)/(e1-e0)
1324 template<class T, int P>
1325 OSL_HOSTDEVICE OSL_FORCEINLINE Dual<T,P> smoothstep (const Dual<T,P> &e0, const Dual<T,P> &e1, const Dual<T,P> &x)
1326 {
1327    if (x.val() < e0.val()) {
1328       return Dual<T,P> (T(0));
1329    }
1330    else if (x.val() >= e1.val()) {
1331       return Dual<T,P> (T(1));
1332    }
1333    Dual<T,P> t = (x - e0)/(e1-e0);
1334    return  (T(3) - T(2)*t)*t*t;
1335 }
1336 
1337 
1338 #ifdef __CUDA_ARCH__
1339 template<> inline OSL_HOSTDEVICE OSL_CONSTEXPR14 float3 Dual<float3, 2>::zero() {
1340     return float3{0.f, 0.f, 0.f};
1341 }
1342 #endif
1343 
1344 
1345 // ceil(Dual) loses derivatives
1346 template<class T, int P>
1347 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr float ceil (const Dual<T,P> &x)
1348 {
1349     return std::ceil(x.val());
1350 }
1351 
1352 
1353 // floor(Dual) loses derivatives
1354 template<class T, int P>
1355 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr float floor (const Dual<T,P> &x)
1356 {
1357     return std::floor(x.val());
1358 }
1359 
1360 
1361 // floor, cast to an int.
1362 template<class T, int P>
1363 OSL_HOSTDEVICE OSL_FORCEINLINE constexpr int
1364 ifloor (const Dual<T,P> &x)
1365 {
1366     return (int)floor(x);
1367 }
1368 
1369 
1370 OSL_NAMESPACE_EXIT
1371