1 // Copyright Contributors to the Open Shading Language project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
4 // Contributions Copyright (c) 2017 Intel Inc., et al.
5 
6 // clang-format off
7 
8 #pragma once
9 
10 #include <cmath>
11 #include <limits>
12 
13 #include "dual.h"
14 #include "dual_vec.h"
15 #include <OSL/Imathx/Imathx.h>
16 
17 #include <OpenImageIO/fmath.h>
18 
19 
20 OSL_NAMESPACE_ENTER
21 
22 #ifdef __OSL_WIDE_PVT
23     namespace __OSL_WIDE_PVT {
24 #else
25     namespace pvt {
26 #endif
27 
28 
29 
30 // SIMD FRIENDLY MATH
31 // Scalar code meant to be used from inside
32 // compiler vectorized SIMD loops.
33 // No intrinsics or assembly, just vanilla C++
34 namespace sfm
35 {
36 
37     // Math code derived from OpenEXR/ImathMatrix.h
38     // including it's copyrights in the namespace
39     /*
40        Copyright (c) 2002-2012, Industrial Light & Magic, a division of Lucas
41        Digital Ltd. LLC
42 
43        All rights reserved.
44 
45        Redistribution and use in source and binary forms, with or without
46        modification, are permitted provided that the following conditions are
47        met:
48        *       Redistributions of source code must retain the above copyright
49        notice, this list of conditions and the following disclaimer.
50        *       Redistributions in binary form must reproduce the above
51        copyright notice, this list of conditions and the following disclaimer
52        in the documentation and/or other materials provided with the
53        distribution.
54        *       Neither the name of Industrial Light & Magic nor the names of
55        its contributors may be used to endorse or promote products derived
56        from this software without specific prior written permission.
57 
58        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
59        "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
60        LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
61        A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
62        OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
63        SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
64        LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
65        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
66        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
67        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
68        OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
69     */
70 
71 #if OSL_INTEL_COMPILER
72     // std::isinf wasn't vectorizing and was branchy. This slightly
73     // perturbed version fairs better and is branch free when vectorized
74     // with the Intel compiler.
isinf(float x)75     OSL_FORCEINLINE OSL_HOSTDEVICE int isinf (float x) {
76         int r = 0;
77         // NOTE: using bitwise | to avoid branches
78         if (!(std::isfinite(x)|std::isnan(x))) {
79             r = static_cast<int>(copysignf(1.0f,x));
80         }
81         return r;
82     }
83 #else
84     // Other compilers don't seem to vectorize well no matter what, so just
85     // use the standard version.
86     using std::isinf;
87 #endif
88 
89     OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float>
absf(const Dual2<float> & x)90     absf (const Dual2<float> &x)
91     {
92         // Avoid ternary ops whose operands have side effects
93         // in favor of code that executes both sides masked
94         // return x.val() >= 0.0f ? x : -x;
95 
96         // NOTE: negation happens outside of conditional, then is blended based on the condition
97 #if OPENIMAGEIO_VERSION >= 20112
98         Dual2<float> neg_x = OIIO::fast_neg(x);
99 #else
100         Dual2<float> neg_x = -x;
101 #endif
102 
103         bool cond = x.val() < 0.0f;
104         // Blend per builtin component to allow
105         // the compiler to track builtins and privatize the data layout
106         // versus requiring a stack location.
107         float val = x.val();
108         if (cond) {
109             val = neg_x.val();
110         }
111 
112         float dx = x.dx();
113         if (cond) {
114             dx = neg_x.dx();
115         }
116 
117         float dy = x.dy();
118         if (cond) {
119             dy = neg_x.dy();
120         }
121 
122         return Dual2<float>(val, dx, dy);
123     }
124 
125 
126     /// Round to nearest integer, returning as an int.
fast_rint(float x)127     OSL_FORCEINLINE OSL_HOSTDEVICE int fast_rint (float x) {
128         // used by sin/cos/tan range reduction
129     #if 0
130         // single roundps instruction on SSE4.1+ (for gcc/clang at least)
131         //return static_cast<int>(rintf(x));
132         return rintf(x);
133     #else
134         // emulate rounding by adding/substracting 0.5
135         return static_cast<int>(x + copysignf(0.5f, x));
136 
137         // Other possible factorings
138         //return (x >= 0.0f) ? static_cast<int>(x + 0.5f) : static_cast<int>(x - 0.5f);
139         //return static_cast<int>(x +  (x >= 0.0f) ? 0.5f : - 0.5f);
140         //float pad = (x >= 0.0f) ? 0.5f : - 0.5f;
141         //return static_cast<int>(x + pad);
142         //return nearbyint(x);
143 #endif
144     }
145 
146 #if OSL_USING_IMATH >= 3
147     // Imath 3.0 has adequate optimized versions of these.
148 
149     OSL_FORCEINLINE OSL_HOSTDEVICE
length(const Vec3 & N)150     float length(const Vec3 &N)
151     {
152         return N.length();
153     }
154 
155     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
normalize(const Vec3 & N)156     normalize(const Vec3 &N)
157     {
158         return N.normalized();
159     }
160 
161 #else
162     // because lengthTiny does a lot of work including another
163     // sqrt, we really want to skip that if possible because
164     // with SIMD execution, we end up doing the sqrt twice
165     // and blending the results.  Although code could be
166     // refactored to do a single sqrt, think its better
167     // to skip the code block as we don't expect near 0 lengths
168     // TODO: get OpenEXR ImathVec to update to similar, don't think
169     // it can cause harm
170 
171     // Imath::Vec3::lengthTiny is private
172     // local copy here no changes
accessibleTinyLength(const Vec3 & N)173     OSL_FORCEINLINE OSL_HOSTDEVICE float accessibleTinyLength(const Vec3 &N)
174     {
175 //        float absX = (N.x >= float (0))? N.x: -N.x;
176 //        float absY = (N.y >= float (0))? N.y: -N.y;
177 //        float absZ = (N.z >= float (0))? N.z: -N.z;
178         // gcc builtin for abs is 2 instructions using bit twiddling vs. compares
179         float absX = std::abs(N.x);
180         float absY = std::abs(N.y);
181         float absZ = std::abs(N.z);
182 
183         float max = absX;
184 
185         if (max < absY)
186             max = absY;
187 
188         if (max < absZ)
189             max = absZ;
190 
191         if (OSL_UNLIKELY(max == 0.0f))
192             return 0.0f;
193 
194         //
195         // Do not replace the divisions by max with multiplications by 1/max.
196         // Computing 1/max can overflow but the divisions below will always
197         // produce results less than or equal to 1.
198         //
199 
200         absX /= max;
201         absY /= max;
202         absZ /= max;
203 
204         return max * std::sqrt(absX * absX + absY * absY + absZ * absZ);
205     }
206 
207     OSL_FORCEINLINE OSL_HOSTDEVICE
length(const Vec3 & N)208     float length(const Vec3 &N)
209     {
210         float length2 = N.dot (N);
211 
212         if (OSL_UNLIKELY(length2 < 2.0f * std::numeric_limits<float>::min()))
213             return accessibleTinyLength(N);
214 
215         return std::sqrt(length2);
216     }
217 
218     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
normalize(const Vec3 & N)219     normalize(const Vec3 &N)
220     {
221         float l = length(N);
222 
223         if (OSL_UNLIKELY(l == float (0)))
224             return Vec3 (float (0));
225 
226         return Vec3 (N.x / l, N.y / l, N.z / l);
227     }
228 #endif
229 
230     OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3>
normalize(const Dual2<Vec3> & a)231     normalize (const Dual2<Vec3> &a)
232     {
233         // NOTE: using bitwise & to avoid branches
234         if (OSL_UNLIKELY((a.val().x == 0.0f) & (a.val().y == 0.0f) & (a.val().z == 0.0f))) {
235             return Dual2<Vec3> (Vec3(0.0f, 0.0f, 0.0f),
236                                 Vec3(0.0f, 0.0f, 0.0f),
237                                 Vec3(0.0f, 0.0f, 0.0f));
238         } else {
239             Dual2<float> ax (a.val().x, a.dx().x, a.dy().x);
240             Dual2<float> ay (a.val().y, a.dx().y, a.dy().y);
241             Dual2<float> az (a.val().z, a.dx().z, a.dy().z);
242             Dual2<float> inv_length = 1.0f / sqrt(ax*ax + ay*ay + az*az);
243             ax = ax*inv_length;
244             ay = ay*inv_length;
245             az = az*inv_length;
246             return Dual2<Vec3> (Vec3(ax.val(), ay.val(), az.val()),
247                                 Vec3(ax.dx(),  ay.dx(),  az.dx() ),
248                                 Vec3(ax.dy(),  ay.dy(),  az.dy() ));
249         }
250     }
251 
252 
253     template<typename T>
254     OSL_FORCEINLINE OSL_HOSTDEVICE
max_val(T left,T right)255     T max_val(T left, T right)
256     {
257         return (right > left)? right: left;
258     }
259 
260 #if OSL_USING_IMATH >= 3
261     using Matrix33 = OSL::Matrix33;
262 #else
263     class Matrix33 : public OSL::Matrix33
264     {
265     public:
266         typedef OSL::Matrix33 parent;
267 
Matrix33(Imath::Uninitialized uninit)268         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (Imath::Uninitialized uninit)
269         : parent(uninit)
270         {}
271 
272         // Avoid the memset that is part of the Imath::Matrix33
273         // default constructor
Matrix33()274         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 ()
275         : parent(1.0f, 0.0f, 0.0f,
276                                  0.0f, 1.0f, 0.0f,
277                                  0.0f, 0.0f, 1.0f)
278         {}
279 
Matrix33(float a,float b,float c,float d,float e,float f,float g,float h,float i)280         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (float a, float b, float c, float d, float e, float f, float g, float h, float i)
281         : parent(a,b,c,d,e,f,g,h,i)
282         {}
283 
Matrix33(const OSL::Matrix33 & a)284         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (const OSL::Matrix33 &a)
285         : parent(a)
286         {}
287 
288         // Avoid the memcpy that is part of the Imath::Matrix33
289         OSL_FORCEINLINE OSL_HOSTDEVICE
Matrix33(const float a[3][3])290         Matrix33 (const float a[3][3])
291         : OSL::Matrix33(
292             a[0][0], a[0][1], a[0][2],
293             a[1][0], a[1][1], a[1][2],
294             a[2][0], a[2][1], a[2][2])
295         {}
296 
297 
298         // Avoid the memcpy that is part of Imath::Matrix33::operator=
299         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 &
300         operator = (const Matrix33 &v)
301         {
302             parent::x[0][0] = v.x[0][0];
303             parent::x[0][1] = v.x[0][1];
304             parent::x[0][2] = v.x[0][2];
305 
306             parent::x[1][0] = v.x[1][0];
307             parent::x[1][1] = v.x[1][1];
308             parent::x[1][2] = v.x[1][2];
309 
310             parent::x[2][0] = v.x[2][0];
311             parent::x[2][1] = v.x[2][1];
312             parent::x[2][2] = v.x[2][2];
313 
314             return *this;
315         }
316 
317 
318         // Avoid Imath::Matrix33::operator * that
319         // initializing values to 0 before overwriting them
320         // Also manually unroll its nested loops
321         OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33
322         operator * (const Matrix33 &v) const
323         {
324             Matrix33 tmp(Imath::UNINITIALIZED);
325 
326             tmp.x[0][0] = parent::x[0][0] * v.x[0][0] +
327                           parent::x[0][1] * v.x[1][0] +
328                           parent::x[0][2] * v.x[2][0];
329             tmp.x[0][1] = parent::x[0][0] * v.x[0][1] +
330                     parent::x[0][1] * v.x[1][1] +
331                     parent::x[0][2] * v.x[2][1];
332             tmp.x[0][2] = parent::x[0][0] * v.x[0][2] +
333                     parent::x[0][1] * v.x[1][2] +
334                     parent::x[0][2] * v.x[2][2];
335 
336             tmp.x[1][0] = parent::x[1][0] * v.x[0][0] +
337                     parent::x[1][1] * v.x[1][0] +
338                     parent::x[1][2] * v.x[2][0];
339             tmp.x[1][1] = parent::x[1][0] * v.x[0][1] +
340                     parent::x[1][1] * v.x[1][1] +
341                     parent::x[1][2] * v.x[2][1];
342             tmp.x[1][2] = parent::x[1][0] * v.x[0][2] +
343                     parent::x[1][1] * v.x[1][2] +
344                     parent::x[1][2] * v.x[2][2];
345 
346             tmp.x[2][0] = parent::x[2][0] * v.x[0][0] +
347                     parent::x[2][1] * v.x[1][0] +
348                     parent::x[2][2] * v.x[2][0];
349             tmp.x[2][1] = parent::x[2][0] * v.x[0][1] +
350                     parent::x[2][1] * v.x[1][1] +
351                     parent::x[2][2] * v.x[2][1];
352             tmp.x[2][2] = parent::x[2][0] * v.x[0][2] +
353                     parent::x[2][1] * v.x[1][2] +
354                     parent::x[2][2] * v.x[2][2];
355 
356             return tmp;
357         }
358     };
359 #endif
360 
361     OSL_FORCEINLINE OSL_HOSTDEVICE sfm::Matrix33
make_matrix33_cols(const Vec3 & a,const Vec3 & b,const Vec3 & c)362     make_matrix33_cols (const Vec3 &a, const Vec3 &b, const Vec3 &c)
363     {
364         return sfm::Matrix33 (a.x, b.x, c.x,
365                          a.y, b.y, c.y,
366                          a.z, b.z, c.z);
367     }
368 
369 
370 
371     // Considering having functionally equivalent versions of Vec3, Color3, Matrix44
372     // with slight modifications to inlining and implementation to avoid aliasing and
373     // improve likelyhood of proper privation of local variables within a SIMD loop
374 
375 }  // namespace sfm
376 
377 }  // namespace __OSL_WIDE_PVT or pvt
378 
379 
380 
381 OSL_NAMESPACE_EXIT
382