1 // Copyright Contributors to the Open Shading Language project. 2 // SPDX-License-Identifier: BSD-3-Clause 3 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage 4 // Contributions Copyright (c) 2017 Intel Inc., et al. 5 6 // clang-format off 7 8 #pragma once 9 10 #include <cmath> 11 #include <limits> 12 13 #include "dual.h" 14 #include "dual_vec.h" 15 #include <OSL/Imathx/Imathx.h> 16 17 #include <OpenImageIO/fmath.h> 18 19 20 OSL_NAMESPACE_ENTER 21 22 #ifdef __OSL_WIDE_PVT 23 namespace __OSL_WIDE_PVT { 24 #else 25 namespace pvt { 26 #endif 27 28 29 30 // SIMD FRIENDLY MATH 31 // Scalar code meant to be used from inside 32 // compiler vectorized SIMD loops. 33 // No intrinsics or assembly, just vanilla C++ 34 namespace sfm 35 { 36 37 // Math code derived from OpenEXR/ImathMatrix.h 38 // including it's copyrights in the namespace 39 /* 40 Copyright (c) 2002-2012, Industrial Light & Magic, a division of Lucas 41 Digital Ltd. LLC 42 43 All rights reserved. 44 45 Redistribution and use in source and binary forms, with or without 46 modification, are permitted provided that the following conditions are 47 met: 48 * Redistributions of source code must retain the above copyright 49 notice, this list of conditions and the following disclaimer. 50 * Redistributions in binary form must reproduce the above 51 copyright notice, this list of conditions and the following disclaimer 52 in the documentation and/or other materials provided with the 53 distribution. 54 * Neither the name of Industrial Light & Magic nor the names of 55 its contributors may be used to endorse or promote products derived 56 from this software without specific prior written permission. 57 58 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 59 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 60 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 61 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 62 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 63 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 64 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 65 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 66 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 67 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 68 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 69 */ 70 71 #if OSL_INTEL_COMPILER 72 // std::isinf wasn't vectorizing and was branchy. This slightly 73 // perturbed version fairs better and is branch free when vectorized 74 // with the Intel compiler. isinf(float x)75 OSL_FORCEINLINE OSL_HOSTDEVICE int isinf (float x) { 76 int r = 0; 77 // NOTE: using bitwise | to avoid branches 78 if (!(std::isfinite(x)|std::isnan(x))) { 79 r = static_cast<int>(copysignf(1.0f,x)); 80 } 81 return r; 82 } 83 #else 84 // Other compilers don't seem to vectorize well no matter what, so just 85 // use the standard version. 86 using std::isinf; 87 #endif 88 89 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float> absf(const Dual2<float> & x)90 absf (const Dual2<float> &x) 91 { 92 // Avoid ternary ops whose operands have side effects 93 // in favor of code that executes both sides masked 94 // return x.val() >= 0.0f ? x : -x; 95 96 // NOTE: negation happens outside of conditional, then is blended based on the condition 97 #if OPENIMAGEIO_VERSION >= 20112 98 Dual2<float> neg_x = OIIO::fast_neg(x); 99 #else 100 Dual2<float> neg_x = -x; 101 #endif 102 103 bool cond = x.val() < 0.0f; 104 // Blend per builtin component to allow 105 // the compiler to track builtins and privatize the data layout 106 // versus requiring a stack location. 107 float val = x.val(); 108 if (cond) { 109 val = neg_x.val(); 110 } 111 112 float dx = x.dx(); 113 if (cond) { 114 dx = neg_x.dx(); 115 } 116 117 float dy = x.dy(); 118 if (cond) { 119 dy = neg_x.dy(); 120 } 121 122 return Dual2<float>(val, dx, dy); 123 } 124 125 126 /// Round to nearest integer, returning as an int. fast_rint(float x)127 OSL_FORCEINLINE OSL_HOSTDEVICE int fast_rint (float x) { 128 // used by sin/cos/tan range reduction 129 #if 0 130 // single roundps instruction on SSE4.1+ (for gcc/clang at least) 131 //return static_cast<int>(rintf(x)); 132 return rintf(x); 133 #else 134 // emulate rounding by adding/substracting 0.5 135 return static_cast<int>(x + copysignf(0.5f, x)); 136 137 // Other possible factorings 138 //return (x >= 0.0f) ? static_cast<int>(x + 0.5f) : static_cast<int>(x - 0.5f); 139 //return static_cast<int>(x + (x >= 0.0f) ? 0.5f : - 0.5f); 140 //float pad = (x >= 0.0f) ? 0.5f : - 0.5f; 141 //return static_cast<int>(x + pad); 142 //return nearbyint(x); 143 #endif 144 } 145 146 #if OSL_USING_IMATH >= 3 147 // Imath 3.0 has adequate optimized versions of these. 148 149 OSL_FORCEINLINE OSL_HOSTDEVICE length(const Vec3 & N)150 float length(const Vec3 &N) 151 { 152 return N.length(); 153 } 154 155 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 normalize(const Vec3 & N)156 normalize(const Vec3 &N) 157 { 158 return N.normalized(); 159 } 160 161 #else 162 // because lengthTiny does a lot of work including another 163 // sqrt, we really want to skip that if possible because 164 // with SIMD execution, we end up doing the sqrt twice 165 // and blending the results. Although code could be 166 // refactored to do a single sqrt, think its better 167 // to skip the code block as we don't expect near 0 lengths 168 // TODO: get OpenEXR ImathVec to update to similar, don't think 169 // it can cause harm 170 171 // Imath::Vec3::lengthTiny is private 172 // local copy here no changes accessibleTinyLength(const Vec3 & N)173 OSL_FORCEINLINE OSL_HOSTDEVICE float accessibleTinyLength(const Vec3 &N) 174 { 175 // float absX = (N.x >= float (0))? N.x: -N.x; 176 // float absY = (N.y >= float (0))? N.y: -N.y; 177 // float absZ = (N.z >= float (0))? N.z: -N.z; 178 // gcc builtin for abs is 2 instructions using bit twiddling vs. compares 179 float absX = std::abs(N.x); 180 float absY = std::abs(N.y); 181 float absZ = std::abs(N.z); 182 183 float max = absX; 184 185 if (max < absY) 186 max = absY; 187 188 if (max < absZ) 189 max = absZ; 190 191 if (OSL_UNLIKELY(max == 0.0f)) 192 return 0.0f; 193 194 // 195 // Do not replace the divisions by max with multiplications by 1/max. 196 // Computing 1/max can overflow but the divisions below will always 197 // produce results less than or equal to 1. 198 // 199 200 absX /= max; 201 absY /= max; 202 absZ /= max; 203 204 return max * std::sqrt(absX * absX + absY * absY + absZ * absZ); 205 } 206 207 OSL_FORCEINLINE OSL_HOSTDEVICE length(const Vec3 & N)208 float length(const Vec3 &N) 209 { 210 float length2 = N.dot (N); 211 212 if (OSL_UNLIKELY(length2 < 2.0f * std::numeric_limits<float>::min())) 213 return accessibleTinyLength(N); 214 215 return std::sqrt(length2); 216 } 217 218 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 normalize(const Vec3 & N)219 normalize(const Vec3 &N) 220 { 221 float l = length(N); 222 223 if (OSL_UNLIKELY(l == float (0))) 224 return Vec3 (float (0)); 225 226 return Vec3 (N.x / l, N.y / l, N.z / l); 227 } 228 #endif 229 230 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3> normalize(const Dual2<Vec3> & a)231 normalize (const Dual2<Vec3> &a) 232 { 233 // NOTE: using bitwise & to avoid branches 234 if (OSL_UNLIKELY((a.val().x == 0.0f) & (a.val().y == 0.0f) & (a.val().z == 0.0f))) { 235 return Dual2<Vec3> (Vec3(0.0f, 0.0f, 0.0f), 236 Vec3(0.0f, 0.0f, 0.0f), 237 Vec3(0.0f, 0.0f, 0.0f)); 238 } else { 239 Dual2<float> ax (a.val().x, a.dx().x, a.dy().x); 240 Dual2<float> ay (a.val().y, a.dx().y, a.dy().y); 241 Dual2<float> az (a.val().z, a.dx().z, a.dy().z); 242 Dual2<float> inv_length = 1.0f / sqrt(ax*ax + ay*ay + az*az); 243 ax = ax*inv_length; 244 ay = ay*inv_length; 245 az = az*inv_length; 246 return Dual2<Vec3> (Vec3(ax.val(), ay.val(), az.val()), 247 Vec3(ax.dx(), ay.dx(), az.dx() ), 248 Vec3(ax.dy(), ay.dy(), az.dy() )); 249 } 250 } 251 252 253 template<typename T> 254 OSL_FORCEINLINE OSL_HOSTDEVICE max_val(T left,T right)255 T max_val(T left, T right) 256 { 257 return (right > left)? right: left; 258 } 259 260 #if OSL_USING_IMATH >= 3 261 using Matrix33 = OSL::Matrix33; 262 #else 263 class Matrix33 : public OSL::Matrix33 264 { 265 public: 266 typedef OSL::Matrix33 parent; 267 Matrix33(Imath::Uninitialized uninit)268 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (Imath::Uninitialized uninit) 269 : parent(uninit) 270 {} 271 272 // Avoid the memset that is part of the Imath::Matrix33 273 // default constructor Matrix33()274 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 () 275 : parent(1.0f, 0.0f, 0.0f, 276 0.0f, 1.0f, 0.0f, 277 0.0f, 0.0f, 1.0f) 278 {} 279 Matrix33(float a,float b,float c,float d,float e,float f,float g,float h,float i)280 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (float a, float b, float c, float d, float e, float f, float g, float h, float i) 281 : parent(a,b,c,d,e,f,g,h,i) 282 {} 283 Matrix33(const OSL::Matrix33 & a)284 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 (const OSL::Matrix33 &a) 285 : parent(a) 286 {} 287 288 // Avoid the memcpy that is part of the Imath::Matrix33 289 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33(const float a[3][3])290 Matrix33 (const float a[3][3]) 291 : OSL::Matrix33( 292 a[0][0], a[0][1], a[0][2], 293 a[1][0], a[1][1], a[1][2], 294 a[2][0], a[2][1], a[2][2]) 295 {} 296 297 298 // Avoid the memcpy that is part of Imath::Matrix33::operator= 299 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 & 300 operator = (const Matrix33 &v) 301 { 302 parent::x[0][0] = v.x[0][0]; 303 parent::x[0][1] = v.x[0][1]; 304 parent::x[0][2] = v.x[0][2]; 305 306 parent::x[1][0] = v.x[1][0]; 307 parent::x[1][1] = v.x[1][1]; 308 parent::x[1][2] = v.x[1][2]; 309 310 parent::x[2][0] = v.x[2][0]; 311 parent::x[2][1] = v.x[2][1]; 312 parent::x[2][2] = v.x[2][2]; 313 314 return *this; 315 } 316 317 318 // Avoid Imath::Matrix33::operator * that 319 // initializing values to 0 before overwriting them 320 // Also manually unroll its nested loops 321 OSL_FORCEINLINE OSL_HOSTDEVICE Matrix33 322 operator * (const Matrix33 &v) const 323 { 324 Matrix33 tmp(Imath::UNINITIALIZED); 325 326 tmp.x[0][0] = parent::x[0][0] * v.x[0][0] + 327 parent::x[0][1] * v.x[1][0] + 328 parent::x[0][2] * v.x[2][0]; 329 tmp.x[0][1] = parent::x[0][0] * v.x[0][1] + 330 parent::x[0][1] * v.x[1][1] + 331 parent::x[0][2] * v.x[2][1]; 332 tmp.x[0][2] = parent::x[0][0] * v.x[0][2] + 333 parent::x[0][1] * v.x[1][2] + 334 parent::x[0][2] * v.x[2][2]; 335 336 tmp.x[1][0] = parent::x[1][0] * v.x[0][0] + 337 parent::x[1][1] * v.x[1][0] + 338 parent::x[1][2] * v.x[2][0]; 339 tmp.x[1][1] = parent::x[1][0] * v.x[0][1] + 340 parent::x[1][1] * v.x[1][1] + 341 parent::x[1][2] * v.x[2][1]; 342 tmp.x[1][2] = parent::x[1][0] * v.x[0][2] + 343 parent::x[1][1] * v.x[1][2] + 344 parent::x[1][2] * v.x[2][2]; 345 346 tmp.x[2][0] = parent::x[2][0] * v.x[0][0] + 347 parent::x[2][1] * v.x[1][0] + 348 parent::x[2][2] * v.x[2][0]; 349 tmp.x[2][1] = parent::x[2][0] * v.x[0][1] + 350 parent::x[2][1] * v.x[1][1] + 351 parent::x[2][2] * v.x[2][1]; 352 tmp.x[2][2] = parent::x[2][0] * v.x[0][2] + 353 parent::x[2][1] * v.x[1][2] + 354 parent::x[2][2] * v.x[2][2]; 355 356 return tmp; 357 } 358 }; 359 #endif 360 361 OSL_FORCEINLINE OSL_HOSTDEVICE sfm::Matrix33 make_matrix33_cols(const Vec3 & a,const Vec3 & b,const Vec3 & c)362 make_matrix33_cols (const Vec3 &a, const Vec3 &b, const Vec3 &c) 363 { 364 return sfm::Matrix33 (a.x, b.x, c.x, 365 a.y, b.y, c.y, 366 a.z, b.z, c.z); 367 } 368 369 370 371 // Considering having functionally equivalent versions of Vec3, Color3, Matrix44 372 // with slight modifications to inlining and implementation to avoid aliasing and 373 // improve likelyhood of proper privation of local variables within a SIMD loop 374 375 } // namespace sfm 376 377 } // namespace __OSL_WIDE_PVT or pvt 378 379 380 381 OSL_NAMESPACE_EXIT 382