1 // Copyright Contributors to the Open Shading Language project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
4 
5 // clang-format off
6 
7 #pragma once
8 
9 #include <limits>
10 #include <type_traits>
11 
12 #include <OpenImageIO/fmath.h>
13 #include <OpenImageIO/hash.h>
14 #include <OpenImageIO/simd.h>
15 
16 #include <OSL/dual.h>
17 #include <OSL/dual_vec.h>
18 #include <OSL/sfm_simplex.h>
19 #include <OSL/sfmath.h>
20 
21 
22 OSL_NAMESPACE_ENTER
23 
24 
25 namespace oslnoise {
26 
27 /////////////////////////////////////////////////////////////////////////
28 //
29 // Simple public API for computing the same noise functions that you get
30 // from OSL shaders.
31 //
32 // These match the properties of OSL noises that have the same names,
33 // please see the OSL specification for more detailed descriptions, we
34 // won't recapitulate it here.
35 //
36 // For the sake of compactness, we express the noise functions as
37 // templates for a either one or two domain parameters, which may be:
38 //     (float)                 // 1-D domain noise, 1-argument variety
39 //     (float, float)          // 2-D domain noise, 2-argument variety
40 //     (const Vec3 &)          // 3-D domain noise, 1-argument variety
41 //     (const Vec3 &, float)   // 4-D domain noise, 2-argument variety
42 // And the range type may be
43 //     float noisename ()      // float-valued noise
44 //     Vec3  vnoisename()      // vector-valued noise
45 // Note that in OSL we can overload function calls by return type, but we
46 // can't in C++, so we prepend a "v" in front of the names of functions
47 // that return vector-valued noise.
48 //
49 /////////////////////////////////////////////////////////////////////////
50 
51 // Signed Perlin-like noise on 1-4 dimensional domain, range [-1,1].
52 template <typename S >             OSL_HOSTDEVICE float snoise (S x);
53 template <typename S, typename T>  OSL_HOSTDEVICE float snoise (S x, T y);
54 template <typename S >             OSL_HOSTDEVICE Vec3  vsnoise (S x);
55 template <typename S, typename T>  OSL_HOSTDEVICE Vec3  vsnoise (S x, T y);
56 
57 // Unsigned Perlin-like noise on 1-4 dimensional domain, range [0,1].
58 template <typename S >             OSL_HOSTDEVICE float noise (S x);
59 template <typename S, typename T>  OSL_HOSTDEVICE float noise (S x, T y);
60 template <typename S >             OSL_HOSTDEVICE Vec3  vnoise (S x);
61 template <typename S, typename T>  OSL_HOSTDEVICE Vec3  vnoise (S x, T y);
62 
63 // Cell noise on 1-4 dimensional domain, range [0,1].
64 // cellnoise is constant within each unit cube (cell) on the domain, but
65 // discontinuous at integer boundaries (and uncorrelated from cell to
66 // cell).
67 template <typename S >             OSL_HOSTDEVICE float cellnoise (S x);
68 template <typename S, typename T>  OSL_HOSTDEVICE float cellnoise (S x, T y);
69 template <typename S >             OSL_HOSTDEVICE Vec3  vcellnoise (S x);
70 template <typename S, typename T>  OSL_HOSTDEVICE Vec3  vcellnoise (S x, T y);
71 
72 // Hash noise on 1-4 dimensional domain, range [0,1].
73 // hashnoise is like cellnoise, but without the 'floor' -- in other words,
74 // it's an uncorrelated hash that is different for every floating point
75 // value.
76 template <typename S >             OSL_HOSTDEVICE float hashnoise (S x);
77 template <typename S, typename T>  OSL_HOSTDEVICE float hashnoise (S x, T y);
78 template <typename S >             OSL_HOSTDEVICE Vec3  vhashnoise (S x);
79 template <typename S, typename T>  OSL_HOSTDEVICE Vec3  vhashnoise (S x, T y);
80 
81 // FIXME -- eventually consider adding to the public API:
82 //  * periodic varieties
83 //  * varieties with derivatives
84 //  * varieties that take/return simd::float3 rather than Imath::Vec3f.
85 //  * exposing the simplex & gabor varieties
86 
87 
88 }   // namespace oslnoise
89 
90 
91 
92 ///////////////////////////////////////////////////////////////////////
93 // Implementation follows...
94 //
95 // Users don't need to worry about this part
96 ///////////////////////////////////////////////////////////////////////
97 
98 struct NoiseParams;
99 
100 namespace pvt {
101 using namespace OIIO::simd;
102 using namespace OIIO::bjhash;
103 
104 typedef void (*NoiseGenericFunc)(int outdim, float *out, bool derivs,
105                                  int indim, const float *in,
106                                  const float *period, NoiseParams *params);
107 typedef void (*NoiseImplFunc)(float *out, const float *in,
108                               const float *period, NoiseParams *params);
109 
110 OSLNOISEPUBLIC OSL_HOSTDEVICE
111 float simplexnoise1 (float x, int seed=0, float *dnoise_dx=NULL);
112 
113 OSLNOISEPUBLIC OSL_HOSTDEVICE
114 float simplexnoise2 (float x, float y, int seed=0,
115                      float *dnoise_dx=NULL, float *dnoise_dy=NULL);
116 
117 OSLNOISEPUBLIC OSL_HOSTDEVICE
118 float simplexnoise3 (float x, float y, float z, int seed=0,
119                      float *dnoise_dx=NULL, float *dnoise_dy=NULL,
120                      float *dnoise_dz=NULL);
121 
122 OSLNOISEPUBLIC OSL_HOSTDEVICE
123 float simplexnoise4 (float x, float y, float z, float w, int seed=0,
124                      float *dnoise_dx=NULL, float *dnoise_dy=NULL,
125                      float *dnoise_dz=NULL, float *dnoise_dw=NULL);
126 
127 
128 namespace {
129 
130 // convert a 32 bit integer into a floating point number in [0,1]
bits_to_01(unsigned int bits)131 inline OSL_HOSTDEVICE float bits_to_01 (unsigned int bits) {
132     // divide by 2^32-1
133 	// Calculate inverse constant with double precision to avoid
134 	//     warning: implicit conversion from 'unsigned int' to 'float' changes value from 4294967295 to 4294967296
135     constexpr float convertFactor = static_cast<float>(static_cast<double>(1.0) / static_cast<double>(std::numeric_limits<unsigned int>::max()));
136     return bits * convertFactor;
137 }
138 
139 
140 #ifndef __CUDA_ARCH__
141 // Perform a bjmix (see OpenImageIO/hash.h) on 4 sets of values at once.
142 OSL_FORCEINLINE void
bjmix(int4 & a,int4 & b,int4 & c)143 bjmix (int4 &a, int4 &b, int4 &c)
144 {
145     using OIIO::simd::rotl32;
146     a -= c;  a ^= rotl32(c, 4);  c += b;
147     b -= a;  b ^= rotl32(a, 6);  a += c;
148     c -= b;  c ^= rotl32(b, 8);  b += a;
149     a -= c;  a ^= rotl32(c,16);  c += b;
150     b -= a;  b ^= rotl32(a,19);  a += c;
151     c -= b;  c ^= rotl32(b, 4);  b += a;
152 }
153 
154 // Perform a bjfinal (see OpenImageIO/hash.h) on 4 sets of values at once.
155 OSL_FORCEINLINE int4
bjfinal(const int4 & a_,const int4 & b_,const int4 & c_)156 bjfinal (const int4& a_, const int4& b_, const int4& c_)
157 {
158     using OIIO::simd::rotl32;
159     int4 a(a_), b(b_), c(c_);
160     c ^= b; c -= rotl32(b,14);
161     a ^= c; a -= rotl32(c,11);
162     b ^= a; b -= rotl32(a,25);
163     c ^= b; c -= rotl32(b,16);
164     a ^= c; a -= rotl32(c,4);
165     b ^= a; b -= rotl32(a,14);
166     c ^= b; c -= rotl32(b,24);
167     return c;
168 }
169 #endif
170 
171 #ifndef __OSL_USE_REFERENCE_INT_HASH
172 	// Warning the reference hash may cause incorrect results when
173 	// used inside a SIMD loop due to its complexity
174 	#define __OSL_USE_REFERENCE_INT_HASH 0
175 #endif
176 
177 /// hash an array of N 32 bit values into a pseudo-random value
178 /// based on my favorite hash: http://burtleburtle.net/bob/c/lookup3.c
179 /// templated so that the compiler can unroll the loops for us
180 template <int N>
181 OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
reference_inthash(const unsigned int k[N])182 reference_inthash (const unsigned int k[N]) {
183     // now hash the data!
184     unsigned int a, b, c, len = N;
185     a = b = c = 0xdeadbeef + (len << 2) + 13;
186     while (len > 3) {
187         a += k[0];
188         b += k[1];
189         c += k[2];
190         OIIO::bjhash::bjmix(a, b, c);
191         len -= 3;
192         k += 3;
193     }
194     switch (len) {
195         case 3 : c += k[2];
196         case 2 : b += k[1];
197         case 1 : a += k[0];
198         c = OIIO::bjhash::bjfinal(a, b, c);
199         case 0:
200             break;
201     }
202     return c;
203 }
204 
205 #if (__OSL_USE_REFERENCE_INT_HASH == 0)
206 	// Do not rely on compilers to fully optimizing the
207 	// reference_inthash<int N> template above.
208 	// The fact it takes an array parameter
209 	// could cause confusion and extra work for a compiler to convert
210 	// from x,y,z,w -> k[4] then index them.  So to simplify things
211 	// for compilers and avoid requiring actual stack space for the k[N]
212 	// array versus tracking builtin integer types in registers,
213 	// here are hand unrolled versions of inthash for 1-5 parameters.
214 	// The upshot is no need to create a k[N] on the stack and hopefully
215 	// simpler time for optimizer as it has no need to deal with while
216 	// loop and switch statement.
217 	OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
inthash(const unsigned int k0)218 	inthash (const unsigned int k0) {
219 		// now hash the data!
220 		unsigned int start_val = 0xdeadbeef + (1 << 2) + 13;
221 
222 		unsigned int a = start_val + k0;
223 		unsigned int c = OIIO::bjhash::bjfinal(a, start_val, start_val);
224 		return c;
225 	}
226 
227 	OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
inthash(const unsigned int k0,const unsigned int k1)228 	inthash (const unsigned int k0, const unsigned int k1) {
229 		// now hash the data!
230 		unsigned int start_val = 0xdeadbeef + (2 << 2) + 13;
231 
232 		unsigned int a = start_val + k0;
233 		unsigned int b = start_val + k1;
234 		unsigned int c = OIIO::bjhash::bjfinal(a, b, start_val);
235 		return c;
236 	}
237 
238 	OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
inthash(const unsigned int k0,const unsigned int k1,const unsigned int k2)239 	inthash (const unsigned int k0, const unsigned int k1, const unsigned int k2) {
240 		// now hash the data!
241 		unsigned int start_val = 0xdeadbeef + (3 << 2) + 13;
242 
243 		unsigned int a = start_val + k0;
244 		unsigned int b = start_val + k1;
245 		unsigned int c = start_val + k2;
246 		c = OIIO::bjhash::bjfinal(a, b, c);
247 		return c;
248 	}
249 
250 	OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
inthash(const unsigned int k0,const unsigned int k1,const unsigned int k2,const unsigned int k3)251 	inthash (const unsigned int k0, const unsigned int k1, const unsigned int k2, const unsigned int k3) {
252 		// now hash the data!
253 		unsigned int start_val = 0xdeadbeef + (4 << 2) + 13;
254 
255 		unsigned int a = start_val + k0;
256 		unsigned int b = start_val + k1;
257 		unsigned int c = start_val + k2;
258 		OIIO::bjhash::bjmix(a, b, c);
259 		a += k3;
260 		c = OIIO::bjhash::bjfinal(a, b, c);
261 		return c;
262 	}
263 
264 	OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
inthash(const unsigned int k0,const unsigned int k1,const unsigned int k2,const unsigned int k3,const unsigned int k4)265 	inthash (const unsigned int k0, const unsigned int k1, const unsigned int k2, const unsigned int k3, const unsigned int k4) {
266 		// now hash the data!
267 		unsigned int start_val = 0xdeadbeef + (5 << 2) + 13;
268 
269 		unsigned int a = start_val + k0;
270 		unsigned int b = start_val + k1;
271 		unsigned int c = start_val + k2;
272 		OIIO::bjhash::bjmix(a, b, c);
273 		b += k4;
274 		a += k3;
275 		c = OIIO::bjhash::bjfinal(a, b, c);
276 		return c;
277 	}
278 #endif
279 
280 #ifndef __CUDA_ARCH__
281 // Do four 2D hashes simultaneously.
282 inline int4
inthash_simd(const int4 & key_x,const int4 & key_y)283 inthash_simd (const int4& key_x, const int4& key_y)
284 {
285     const int len = 2;
286     const int seed_ = (0xdeadbeef + (len << 2) + 13);
287     static const OIIO_SIMD4_ALIGN int seed[4] = { seed_,seed_,seed_,seed_};
288     int4 a = (*(int4*)&seed)+key_x, b = (*(int4*)&seed)+key_y, c = (*(int4*)&seed);
289     return bjfinal (a, b, c);
290 }
291 
292 
293 // Do four 3D hashes simultaneously.
294 inline int4
inthash_simd(const int4 & key_x,const int4 & key_y,const int4 & key_z)295 inthash_simd (const int4& key_x, const int4& key_y, const int4& key_z)
296 {
297     const int len = 3;
298     const int seed_ = (0xdeadbeef + (len << 2) + 13);
299     static const OIIO_SIMD4_ALIGN int seed[4] = { seed_,seed_,seed_,seed_};
300     int4 a = (*(int4*)&seed)+key_x, b = (*(int4*)&seed)+key_y, c = (*(int4*)&seed)+key_z;
301     return bjfinal (a, b, c);
302 }
303 
304 
305 
306 // Do four 3D hashes simultaneously.
307 inline int4
inthash_simd(const int4 & key_x,const int4 & key_y,const int4 & key_z,const int4 & key_w)308 inthash_simd (const int4& key_x, const int4& key_y, const int4& key_z, const int4& key_w)
309 {
310     const int len = 4;
311     const int seed_ = (0xdeadbeef + (len << 2) + 13);
312     static const OIIO_SIMD4_ALIGN int seed[4] = { seed_,seed_,seed_,seed_};
313     int4 a = (*(int4*)&seed)+key_x, b = (*(int4*)&seed)+key_y, c = (*(int4*)&seed)+key_z;
314     bjmix (a, b, c);
315     a += key_w;
316     return bjfinal(a, b, c);
317 }
318 #endif
319 
320 
321 // Cell and Hash noise only differ in how they transform their inputs from
322 // float to unsigned int for use in the inthash function.
323 // IntHashNoiseBase serves as base class with DerivedT::transformToUint
324 // controlling how the float inputs are transformed
325 template<typename DerivedT>
326 struct IntHashNoiseBase  {
IntHashNoiseBaseIntHashNoiseBase327 	OSL_FORCEINLINE OSL_HOSTDEVICE IntHashNoiseBase () { }
328 
operatorIntHashNoiseBase329     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x) const {
330     	result = hashFloat(x);
331     }
332 
operatorIntHashNoiseBase333     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y) const {
334     	result = hashFloat(x, y);
335     }
336 
operatorIntHashNoiseBase337     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p) const {
338     	result = hashFloat(p.x, p.y, p.z);
339     }
340 
operatorIntHashNoiseBase341     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t) const {
342     	result = hashFloat(p.x, p.y, p.z, t);
343     }
344 
345 
operatorIntHashNoiseBase346     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x) const {
347     	result = hashVec(x);
348     }
349 
operatorIntHashNoiseBase350     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y) const {
351     	result = hashVec(x, y);
352     }
353 
operatorIntHashNoiseBase354     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p) const {
355     	result = hashVec(p.x, p.y, p.z);
356     }
357 
operatorIntHashNoiseBase358     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t) const {
359     	result = hashVec(p.x, p.y, p.z, t);
360     }
361 
362 
363 private:
364     template<typename ...ListT>
365     OSL_FORCEINLINE OSL_HOSTDEVICE float
hashFloatIntHashNoiseBase366     hashFloat (ListT... floatList) const {
367         return bits_to_01(inthash(DerivedT::transformToUint(floatList)...));
368     }
369 
370     template<typename ...ListT>
371     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
hashVecIntHashNoiseBase372     hashVec (ListT... floatList) const {
373     	return inthashVec(DerivedT::transformToUint(floatList)...);
374     }
375 
376 #if __OSL_USE_REFERENCE_INT_HASH
377     // Allow any number of arguments to be adapted to the array based reference_inthash
378     // and leave door open for overloads of hash3 with specific parameters
379 
380     // Produce Vec3 result from any number of arguments with array based reference_inthash
381     // and extra seed values, but leave door open for overloads of inthashVec
382     // for specific parameter combinations
383     template<typename ...ListT>
384     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
inthashVecIntHashNoiseBase385     inthashVec (ListT... uintList) const {
386         constexpr int N = sizeof...(uintList) + 1;
387     	unsigned int k[N] = {uintList...};
388 
389     	Vec3 result;
390         k[N-1] = 0u; result.x = bits_to_01 (reference_inthash<N> (k));
391         k[N-1] = 1u; result.y = bits_to_01 (reference_inthash<N> (k));
392         k[N-1] = 2u; result.z = bits_to_01 (reference_inthash<N> (k));
393         return result;
394     }
395 #else
396     // Produce Vec3 result from any number of arguments with inthash and
397     // extra seed values, but leave door open for overloads of inthashVec
398     // for specific parameter combinations
399     template<typename ...ListT>
400     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
inthashVecIntHashNoiseBase401 	inthashVec (ListT... uintList) const  {
402         return Vec3(bits_to_01(inthash(uintList..., 0u)),
403         			bits_to_01(inthash(uintList..., 1u)),
404 					bits_to_01(inthash(uintList..., 2u)));
405     }
406 
407     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
inthashVecIntHashNoiseBase408 	inthashVec (const unsigned int k0, const unsigned int k1, const unsigned int k2) const {
409         // combine implementations of reference_inthash<3> and reference_inthash<4>
410     	// so that we can only perform the bjmix once because k0,k1,k2 are the
411     	// same values passed to reference_inthash<4>, only k3 differs
412         // and if we unroll the work it can be separated
413 
414         // now hash the data!
415         unsigned int start_val = 0xdeadbeef + (4 << 2) + 13;
416 
417         unsigned int a = start_val + k0;
418         unsigned int b = start_val + k1;
419         unsigned int c = start_val + k2;
420         OIIO::bjhash::bjmix(a, b, c);
421 
422         return Vec3(bits_to_01( OIIO::bjhash::bjfinal(a+0, b, c) ),
423                     bits_to_01( OIIO::bjhash::bjfinal(a+1, b, c) ),
424                     bits_to_01( OIIO::bjhash::bjfinal(a+2, b, c) ));
425     }
426 
427     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3
inthashVecIntHashNoiseBase428 	inthashVec (const unsigned int k0, const unsigned int k1, const unsigned int k2, const unsigned int k3) const {
429         // combine implementations of reference_inthash<3> and reference_inthash<5>
430     	// so that we can only perform the bjmix once because k0,k1,k2,k3 are the
431     	// same values passed to reference_inthash<5>, only k4 differs
432         // and if we unroll the work it can be separated
433 
434         // now hash the data!
435         unsigned int start_val = 0xdeadbeef + (5 << 2) + 13;
436 
437         unsigned int a = start_val + k0;
438         unsigned int b = start_val + k1;
439         unsigned int c = start_val + k2;
440         OIIO::bjhash::bjmix(a, b, c);
441         unsigned int a2 = a + k3;
442 
443         return Vec3(bits_to_01( OIIO::bjhash::bjfinal(a2, b+0, c) ),
444                     bits_to_01( OIIO::bjhash::bjfinal(a2, b+1, c) ),
445                     bits_to_01( OIIO::bjhash::bjfinal(a2, b+2, c) ));
446     }
447 #endif
448 };
449 
450 struct CellNoise: public IntHashNoiseBase<CellNoise>  {
CellNoiseCellNoise451     OSL_FORCEINLINE OSL_HOSTDEVICE CellNoise () { }
452 
453     static OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
transformToUintCellNoise454     transformToUint(float val)
455     {
456         return OIIO::ifloor(val);
457     }
458 };
459 
460 struct HashNoise: public IntHashNoiseBase<HashNoise>  {
HashNoiseHashNoise461     OSL_FORCEINLINE OSL_HOSTDEVICE HashNoise () { }
462 
463     static OSL_FORCEINLINE OSL_HOSTDEVICE unsigned int
transformToUintHashNoise464     transformToUint(float val)
465     {
466         return OIIO::bit_cast<float,unsigned int>(val);
467     }
468 };
469 
470 // Periodic Cell and Hash Noise simply wraps its inputs before
471 // performing the same conversions and hashing as the non-periodic version.
472 // We define a wrapper on top of Cell or Hash Noise to reuse
473 // its underlying implementation.
474 template <typename BaseNoiseT>
475 struct PeriodicAdaptionOf {
PeriodicAdaptionOfPeriodicAdaptionOf476 	OSL_FORCEINLINE OSL_HOSTDEVICE PeriodicAdaptionOf () { }
477 
operatorPeriodicAdaptionOf478     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float px) const {
479     	m_impl(result, wrap (x, px));
480     }
481 
operatorPeriodicAdaptionOf482     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y,
483                             float px, float py) const {
484     	m_impl(result, wrap (x, px),
485                        wrap (y, py));
486     }
487 
operatorPeriodicAdaptionOf488     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p,
489                             const Vec3 &pp) const {
490     	m_impl(result, wrap (p, pp));
491     }
492 
operatorPeriodicAdaptionOf493     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t,
494                             const Vec3 &pp, float tt) const {
495     	m_impl(result, wrap (p, pp),
496 					   wrap (t, tt));
497     }
498 
operatorPeriodicAdaptionOf499     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float px) const {
500     	m_impl(result, wrap (x, px));
501     }
502 
operatorPeriodicAdaptionOf503     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y,
504                             float px, float py) const {
505     	m_impl(result, wrap (x, px),
506     				   wrap (y, py));
507     }
508 
operatorPeriodicAdaptionOf509     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, const Vec3 &pp) const {
510     	m_impl(result, wrap (p, pp));
511     }
512 
operatorPeriodicAdaptionOf513     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t,
514                             const Vec3 &pp, float tt) const {
515     	m_impl(result, wrap (p, pp),
516 					   wrap (t, tt));
517     }
518 
519 private:
wrapPeriodicAdaptionOf520     OSL_FORCEINLINE OSL_HOSTDEVICE float wrap (float s, float period) const {
521         // TODO: investigate if quick_floor is legal here
522         // and or beneficial
523         period = floorf (period);
524         if (period < 1.0f)
525             period = 1.0f;
526         return s - period * floorf (s / period);
527     }
528 
wrapPeriodicAdaptionOf529     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 wrap (const Vec3 &s, const Vec3 &period) const {
530     	return Vec3(wrap(s.x, period.x),
531     			    wrap(s.y, period.y),
532 					wrap(s.z, period.z));
533     }
534 
535     BaseNoiseT m_impl;
536 };
537 
538 using PeriodicCellNoise = PeriodicAdaptionOf<CellNoise>;
539 using PeriodicHashNoise = PeriodicAdaptionOf<HashNoise>;
540 
541 
542 
543 
544 inline OSL_HOSTDEVICE int
inthashi(int x)545 inthashi (int x)
546 {
547     return static_cast<int>(inthash(
548 		static_cast<unsigned int>(x)
549 	));
550 }
551 
552 inline OSL_HOSTDEVICE int
inthashf(float x)553 inthashf (float x)
554 {
555     return static_cast<int>(
556 		inthash(OIIO::bit_cast<float,unsigned int>(x))
557 	);
558 }
559 
560 inline OSL_HOSTDEVICE int
inthashf(float x,float y)561 inthashf (float x, float y)
562 {
563     return static_cast<int>(
564 		inthash(
565 			OIIO::bit_cast<float,unsigned int>(x),
566 			OIIO::bit_cast<float,unsigned int>(y)
567 		)
568 	);
569 }
570 
571 
572 inline OSL_HOSTDEVICE int
inthashf(const float * x)573 inthashf (const float *x)
574 {
575     return static_cast<int>(
576 		inthash(
577 			OIIO::bit_cast<float,unsigned int>(x[0]),
578 			OIIO::bit_cast<float,unsigned int>(x[1]),
579 			OIIO::bit_cast<float,unsigned int>(x[2])
580 		)
581 	);
582 }
583 
584 
585 inline OSL_HOSTDEVICE int
inthashf(const float * x,float y)586 inthashf (const float *x, float y)
587 {
588     return static_cast<int>(
589 		inthash(
590 			OIIO::bit_cast<float,unsigned int>(x[0]),
591 			OIIO::bit_cast<float,unsigned int>(x[1]),
592 			OIIO::bit_cast<float,unsigned int>(x[2]),
593 			OIIO::bit_cast<float,unsigned int>(y)
594 		)
595 	);
596 }
597 
598 
599 // Define select(bool,truevalue,falsevalue) template that works for a
600 // variety of types that we can use for both scalars and vectors. Because ?:
601 // won't work properly in template code with vector ops.
602 // NOTE: Removing template and require explicit functions for different
603 // combinations be specified using polymorphism.  Main reason is so that
604 // different versions can pass parameters by value vs. reference.
605 //template <typename B, typename F>
606 //OSL_FORCEINLINE OSL_HOSTDEVICE F select (B b, const F& t, const F& f) { return b ? t : f; }
607 
select(const bool b,float t,float f)608 OSL_FORCEINLINE OSL_HOSTDEVICE float select(const bool b, float t, float f) {
609     // NOTE:  parameters must NOT be references, to avoid inlining caller's creation of
610     // these values down inside the conditional assignments which can create to complex
611     // of a code flow for Clang's vectorizer to handle
612     static_assert(!std::is_reference<decltype(b)>::value, "parameters to select cannot be references");
613     static_assert(!std::is_reference<decltype(t)>::value, "parameters to select cannot be references");
614     static_assert(!std::is_reference<decltype(f)>::value, "parameters to select cannot be references");
615     return b ? t : f;
616 }
617 
select(const bool b,const Dual2<float> & t,const Dual2<float> & f)618 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float> select(const bool b, const Dual2<float> &t, const Dual2<float> &f) {
619     // Because t & f are a references, we don't want inlining to expand t.val(), t.dx(), or t.dy()
620     // inside the conditional branch creating a more complex flow by forcing the inlined
621     // abstract syntax tree to only be evaluated when the condition is true.
622     // To avoid this we assign t.val(), t.dx(), t.dy() to local values outside the
623     // conditional to insulate it.
624     // NOTE:  This technique was also necessary to enable vectorization of nested select calls
625     float val(f.val());
626     float tval(t.val());
627 
628     float dx(f.dx());
629     float tdx(t.dx());
630 
631     float dy(f.dy());
632     float tdy(t.dy());
633 
634     // Blend per builtin component to allow
635     // the compiler to track builtins and privatize the data layout
636     // versus requiring a stack location.
637     // Without this work per component, gathers & scatters were being emitted
638     // when used inside SIMD loops.
639 #if OSL_NON_INTEL_CLANG
640     // Clang's vectorizor was really insistent that a select operation could not be replaced
641     // with control flow, so had to re-introduce the ? operator to make it happy
642     return Dual2<float> (
643         b ? tval : val,
644         b ? tdx : dx,
645         b ? tdy : dy);
646 #else
647     if (b)
648         val = tval;
649 
650     if (b)
651         dx = tdx;
652 
653     if (b)
654         dy = tdy;
655 
656     return Dual2<float> (
657             val,
658             dx,
659             dy);
660 #endif
661 }
662 
663 #ifndef __CUDA_ARCH__
664 // Already provided by oiio/simd.h
665 //OSL_FORCEINLINE int4 select (const bool4& b, const int4& t, const int4& f) {
666 //    return blend (f, t, b);
667 //}
668 
669 // Already provided by oiio/simd.h
670 //OSL_FORCEINLINE float4 select (const bool4& b, const float4& t, const float4& f) {
671 //    return blend (f, t, b);
672 //}
673 
select(const int4 & b,const float4 & t,const float4 & f)674 OSL_FORCEINLINE float4 select (const int4& b, const float4& t, const float4& f) {
675     return blend (f, t, bool4(b));
676 }
677 
678 OSL_FORCEINLINE Dual2<float4>
select(const bool4 & b,const Dual2<float4> & t,const Dual2<float4> & f)679 select (const bool4& b, const Dual2<float4>& t, const Dual2<float4>& f) {
680     return Dual2<float4> (blend (f.val(), t.val(), b),
681                           blend (f.dx(),  t.dx(),  b),
682                           blend (f.dy(),  t.dy(),  b));
683 }
684 
select(const int4 & b,const Dual2<float4> & t,const Dual2<float4> & f)685 OSL_FORCEINLINE Dual2<float4> select (const int4& b, const Dual2<float4>& t, const Dual2<float4>& f) {
686     return select (bool4(b), t, f);
687 }
688 #endif
689 
690 
691 
692 // Define negate_if(value,bool) that will work for both scalars and vectors,
693 // as well as Dual2's of both.
694 // NOTE: Removing template and require explicit functions for different
695 // combinations be specified using polymorphism.  Main reason is so that
696 // different versions can pass parameters by value vs. reference.
697 //template <typename B, typename F>
698 //template<typename FLOAT, typename BOOL>
699 //OSL_FORCEINLINE OSL_HOSTDEVICE FLOAT negate_if (const FLOAT& val, const BOOL& b) {
700 //    return b ? -val : val;
701 //}
702 
negate_if(const float val,const bool cond)703 OSL_FORCEINLINE OSL_HOSTDEVICE float negate_if (const float val, const bool cond) {
704     // NOTE:  parameters must NOT be references, to avoid inlining caller's creation of
705     // these values down inside the conditional assignments which can create to complex
706     // of a code flow for vectorizer to handle
707     static_assert(!std::is_reference<decltype(val)>::value, "parameters to negate_if cannot be references");
708     return cond ? -val : val;
709 }
710 
711 
negate_if(const Dual2<float> & val,const bool cond)712 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float> negate_if(const Dual2<float> &val, const bool cond) {
713     // NOTE: negation happens outside of conditional, then is blended based on the condition
714     Dual2<float> neg_val(-val);
715 
716     // Blend per builtin component to allow
717     // the compiler to track builtins and privatize the data layout
718     // versus requiring a stack location.
719     float v = val.val();
720     if (cond) {
721         v = neg_val.val();
722     }
723 
724     float dx = val.dx();
725     if (cond) {
726         dx = neg_val.dx();
727     }
728 
729     float dy = val.dy();
730     if (cond) {
731         dy = neg_val.dy();
732     }
733 
734     return Dual2<float>(v, dx, dy);
735 }
736 
737 #if 0 // Unneeded currently
738 template <> OSL_FORCEINLINE Dual2<Vec3> negate_if(const Dual2<Vec3> &val, const bool &b) {
739     Dual2<Vec3> r;
740     // Use a ternary operation per builtin component to allow
741     // the compiler to track builtins and privatize the data layout
742     // versus requiring a stack location.
743     Dual2<Vec3> neg_val(-val);
744     r.val() = b ? neg_val.val() : val.val();
745     r.dx() = b ? neg_val.dx() : val.dx();
746     r.dy() = b ? neg_val.dy() : val.dy();
747     return r;
748 }
749 #endif
750 
751 #ifndef __CUDA_ARCH__
negate_if(const float4 & val,const int4 & b)752 OSL_FORCEINLINE float4 negate_if (const float4& val, const int4& b) {
753     // Special case negate_if for SIMD -- can do it with bit tricks, no branches
754     int4 highbit (0x80000000);
755     return bitcast_to_float4 (bitcast_to_int4(val) ^ (blend0 (highbit, bool4(b))));
756 }
757 
758 // Special case negate_if for SIMD -- can do it with bit tricks, no branches
negate_if(const Dual2<float4> & val,const int4 & b)759 OSL_FORCEINLINE Dual2<float4> negate_if (const Dual2<float4>& val, const int4& b)
760 {
761     return Dual2<float4> (negate_if (val.val(), b),
762                           negate_if (val.dx(),  b),
763                           negate_if (val.dy(),  b));
764 }
765 #endif
766 
767 
768 #ifndef __CUDA_ARCH__
769 // Define shuffle<> template that works with Dual2<float4> analogously to
770 // how it works for float4.
771 template<int i0, int i1, int i2, int i3>
shuffle(const Dual2<float4> & a)772 OSL_FORCEINLINE Dual2<float4> shuffle (const Dual2<float4>& a)
773 {
774     return Dual2<float4> (OIIO::simd::shuffle<i0,i1,i2,i3>(a.val()),
775                           OIIO::simd::shuffle<i0,i1,i2,i3>(a.dx()),
776                           OIIO::simd::shuffle<i0,i1,i2,i3>(a.dy()));
777 }
778 
779 template<int i>
shuffle(const Dual2<float4> & a)780 OSL_FORCEINLINE Dual2<float4> shuffle (const Dual2<float4>& a)
781 {
782     return Dual2<float4> (OIIO::simd::shuffle<i>(a.val()),
783                           OIIO::simd::shuffle<i>(a.dx()),
784                           OIIO::simd::shuffle<i>(a.dy()));
785 }
786 
787 // Define extract<> that works with Dual2<float4> analogously to how it
788 // works for float4.
789 template<int i>
extract(const Dual2<float4> & a)790 OSL_FORCEINLINE Dual2<float> extract (const Dual2<float4>& a)
791 {
792     return Dual2<float> (OIIO::simd::extract<i>(a.val()),
793                          OIIO::simd::extract<i>(a.dx()),
794                          OIIO::simd::extract<i>(a.dy()));
795 }
796 #endif
797 
798 
799 
800 // Equivalent to OIIO::bilerp (a, b, c, d, u, v), but if abcd are already
801 // packed into a float4. We assume T is float and VECTYPE is float4,
802 // but it also works if T is Dual2<float> and VECTYPE is Dual2<float4>.
803 template<typename T, typename VECTYPE>
bilerp(VECTYPE abcd,T u,T v)804 OSL_FORCEINLINE OSL_HOSTDEVICE T bilerp (VECTYPE abcd, T u, T v) {
805     VECTYPE xx = OIIO::lerp (abcd, OIIO::simd::shuffle<1,1,3,3>(abcd), u);
806     return OIIO::simd::extract<0>(OIIO::lerp (xx,OIIO::simd::shuffle<2>(xx), v));
807 }
808 
809 #ifndef __CUDA_ARCH__
810 // Equivalent to OIIO::bilerp (a, b, c, d, u, v), but if abcd are already
811 // packed into a float4 and uv are already packed into the first two
812 // elements of a float4. We assume VECTYPE is float4, but it also works if
813 // VECTYPE is Dual2<float4>.
bilerp(const Dual2<float4> & abcd,const Dual2<float4> & uv)814 OSL_FORCEINLINE Dual2<float> bilerp (const Dual2<float4>& abcd, const Dual2<float4>& uv) {
815     Dual2<float4> xx = OIIO::lerp (abcd, shuffle<1,1,3,3>(abcd), shuffle<0>(uv));
816     return extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uv)));
817 }
818 
819 
820 // Equivalent to OIIO::trilerp (a, b, c, d, e, f, g, h, u, v, w), but if
821 // abcd and efgh are already packed into float4's and uvw are packed into
822 // the first 3 elements of a float4.
trilerp(const float4 & abcd,const float4 & efgh,const float4 & uvw)823 OSL_FORCEINLINE float trilerp (const float4& abcd, const float4& efgh, const float4& uvw) {
824     // Interpolate along z axis by w
825     float4 xy = OIIO::lerp (abcd, efgh, OIIO::simd::shuffle<2>(uvw));
826     // Interpolate along x axis by u
827     float4 xx = OIIO::lerp (xy, OIIO::simd::shuffle<1,1,3,3>(xy), OIIO::simd::shuffle<0>(uvw));
828     // interpolate along y axis by v
829     return OIIO::simd::extract<0>(OIIO::lerp (xx, OIIO::simd::shuffle<2>(xx), OIIO::simd::shuffle<1>(uvw)));
830 }
831 #endif
832 
833 
834 
835 // always return a value inside [0,b) - even for negative numbers
imod(int a,int b)836 OSL_FORCEINLINE OSL_HOSTDEVICE int imod(int a, int b) {
837 #if 0
838     a %= b;
839     return a < 0 ? a + b : a;
840 #else
841     // Avoid confusing vectorizor by returning a single value
842     int remainder = a % b;
843     if (remainder < 0) {
844         remainder += b;
845     }
846     return remainder;
847 #endif
848 }
849 
850 #ifndef __CUDA_ARCH__
851 // imod four values at once
imod(const int4 & a,int b)852 inline int4 imod(const int4& a, int b) {
853     int4 c = a % b;
854     return c + select(c < 0, int4(b), int4::Zero());
855 }
856 #endif
857 
858 // floorfrac return ifloor as well as the fractional remainder
859 // FIXME: already implemented inside OIIO but can't easily override it for duals
860 //        inside a different namespace
floorfrac(float x,int * i)861 OSL_FORCEINLINE OSL_HOSTDEVICE float floorfrac(float x, int* i) {
862     *i = OIIO::ifloor(x);
863     return x - *i;
864 }
865 
866 // floorfrac with derivs
floorfrac(const Dual2<float> & x,int * i)867 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float> floorfrac(const Dual2<float> &x, int* i) {
868     float frac = floorfrac(x.val(), i);
869     // slope of x is not affected by this operation
870     return Dual2<float>(frac, x.dx(), x.dy());
871 }
872 
873 #ifndef __CUDA_ARCH__
874 // floatfrac for four sets of values at once.
floorfrac(const float4 & x,int4 * i)875 inline float4 floorfrac(const float4& x, int4 * i) {
876 #if 0
877     float4 thefloor = floor(x);
878     *i = int4(thefloor);
879     return x-thefloor;
880 #else
881     int4 thefloor = OIIO::simd::ifloor (x);
882     *i = thefloor;
883     return x - float4(thefloor);
884 #endif
885 }
886 
887 // floorfrac with derivs, computed on 4 values at once.
floorfrac(const Dual2<float4> & x,int4 * i)888 inline Dual2<float4> floorfrac(const Dual2<float4> &x, int4* i) {
889     float4 frac = floorfrac(x.val(), i);
890     // slope of x is not affected by this operation
891     return Dual2<float4>(frac, x.dx(), x.dy());
892 }
893 #endif
894 
895 
896 // Perlin 'fade' function. Can be overloaded for float, Dual2, as well
897 // as float4 / Dual2<float4>.
898 template <typename T>
fade(const T & t)899 OSL_HOSTDEVICE OSL_FORCEINLINE T fade (const T &t) {
900    return t * t * t * (t * (t * T(6.0f) - T(15.0f)) + T(10.0f));
901 }
902 
903 
904 
905 // 1,2,3 and 4 dimensional gradient functions - perform a dot product against a
906 // randomly chosen vector. Note that the gradient vector is not normalized, but
907 // this only affects the overall "scale" of the result, so we simply account for
908 // the scale by multiplying in the corresponding "perlin" function.
909 // These factors were experimentally calculated to be:
910 //    1D:   0.188
911 //    2D:   0.507
912 //    3D:   0.936
913 //    4D:   0.870
914 
915 template <typename T>
grad(int hash,const T & x)916 OSL_FORCEINLINE OSL_HOSTDEVICE T grad (int hash, const T &x) {
917     int h = hash & 15;
918     float g = 1 + (h & 7);  // 1, 2, .., 8
919     if (h&8) g = -g;        // random sign
920     return g * x;           // dot-product
921 }
922 
923 template <typename I, typename T>
grad(const I & hash,const T & x,const T & y)924 OSL_FORCEINLINE OSL_HOSTDEVICE T grad (const I &hash, const T &x, const T &y) {
925     // 8 possible directions (+-1,+-2) and (+-2,+-1)
926     I h = hash & 7;
927     T u = select (h<4, x, y);
928     T v = 2.0f * select (h<4, y, x);
929     // compute the dot product with (x,y).
930     return negate_if(u, h&1) + negate_if(v, h&2);
931 }
932 
933 template <typename I, typename T>
grad(const I & hash,const T & x,const T & y,const T & z)934 OSL_FORCEINLINE OSL_HOSTDEVICE T grad (const I &hash, const T &x, const T &y, const T &z) {
935     // use vectors pointing to the edges of the cube
936     I h = hash & 15;
937     T u = select (h<8, x, y);
938     T v = select (h<4, y, select ((h==I(12))|(h==I(14)), x, z));
939     return negate_if(u,h&1) + negate_if(v,h&2);
940 }
941 
942 template <typename T>
grad(const int hash,const T & x,const T & y,const T & z)943 OSL_FORCEINLINE OSL_HOSTDEVICE T grad (const int hash, const T &x, const T &y, const T &z) {
944     // use vectors pointing to the edges of the cube
945     int h = hash & 15;
946     T u = select (h<8, x, y);
947     // Changed from bitwise | to logical || to avoid conversions
948     // to integer from native boolean that was causing gather + scatters
949     // to be generated when used with the select vs.
950     // simple masking or blending.
951     // TODO: couldn't change the grad version above because OpenImageIO::v1_7::simd::bool4
952     // has no || operator defined.  Would be preferable to implement bool4::operator||
953     // and not have this version of grad separate
954     T v = select (h<4, y, select ((h==int(12))||(h==int(14)), x, z));
955     return negate_if(u,h&1) + negate_if(v,h&2);
956 }
957 
958 template <typename I, typename T>
grad(const I & hash,const T & x,const T & y,const T & z,const T & w)959 OSL_FORCEINLINE OSL_HOSTDEVICE T grad (const I &hash, const T &x, const T &y, const T &z, const T &w) {
960     // use vectors pointing to the edges of the hypercube
961     I h = hash & 31;
962     T u = select (h<24, x, y);
963     T v = select (h<16, y, z);
964     T s = select (h<8 , z, w);
965     return negate_if(u,h&1) + negate_if(v,h&2) + negate_if(s,h&4);
966 }
967 
968 typedef Imath::Vec3<int> Vec3i;
969 
grad(const Vec3i & hash,float x)970 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 grad (const Vec3i &hash, float x) {
971     return Vec3 (grad (hash.x, x),
972                  grad (hash.y, x),
973                  grad (hash.z, x));
974 }
975 
grad(const Vec3i & hash,Dual2<float> x)976 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3> grad (const Vec3i &hash, Dual2<float> x) {
977     Dual2<float> rx = grad (hash.x, x);
978     Dual2<float> ry = grad (hash.y, x);
979     Dual2<float> rz = grad (hash.z, x);
980     return make_Vec3 (rx, ry, rz);
981 }
982 
983 
grad(const Vec3i & hash,float x,float y)984 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 grad (const Vec3i &hash, float x, float y) {
985     return Vec3 (grad (hash.x, x, y),
986                  grad (hash.y, x, y),
987                  grad (hash.z, x, y));
988 }
989 
grad(const Vec3i & hash,Dual2<float> x,Dual2<float> y)990 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3> grad (const Vec3i &hash, Dual2<float> x, Dual2<float> y) {
991     Dual2<float> rx = grad (hash.x, x, y);
992     Dual2<float> ry = grad (hash.y, x, y);
993     Dual2<float> rz = grad (hash.z, x, y);
994     return make_Vec3 (rx, ry, rz);
995 }
996 
grad(const Vec3i & hash,float x,float y,float z)997 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 grad (const Vec3i &hash, float x, float y, float z) {
998     return Vec3 (grad (hash.x, x, y, z),
999                  grad (hash.y, x, y, z),
1000                  grad (hash.z, x, y, z));
1001 }
1002 
grad(const Vec3i & hash,Dual2<float> x,Dual2<float> y,Dual2<float> z)1003 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3> grad (const Vec3i &hash, Dual2<float> x, Dual2<float> y, Dual2<float> z) {
1004     Dual2<float> rx = grad (hash.x, x, y, z);
1005     Dual2<float> ry = grad (hash.y, x, y, z);
1006     Dual2<float> rz = grad (hash.z, x, y, z);
1007     return make_Vec3 (rx, ry, rz);
1008 }
1009 
grad(const Vec3i & hash,float x,float y,float z,float w)1010 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3 grad (const Vec3i &hash, float x, float y, float z, float w) {
1011     return Vec3 (grad (hash.x, x, y, z, w),
1012                  grad (hash.y, x, y, z, w),
1013                  grad (hash.z, x, y, z, w));
1014 }
1015 
grad(const Vec3i & hash,Dual2<float> x,Dual2<float> y,Dual2<float> z,Dual2<float> w)1016 OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<Vec3> grad (const Vec3i &hash, Dual2<float> x, Dual2<float> y, Dual2<float> z, Dual2<float> w) {
1017     Dual2<float> rx = grad (hash.x, x, y, z, w);
1018     Dual2<float> ry = grad (hash.y, x, y, z, w);
1019     Dual2<float> rz = grad (hash.z, x, y, z, w);
1020     return make_Vec3 (rx, ry, rz);
1021 }
1022 
1023 template <typename T>
scale1(const T & result)1024 OSL_FORCEINLINE OSL_HOSTDEVICE T scale1 (const T &result) { return 0.2500f * result; }
1025 template <typename T>
scale2(const T & result)1026 OSL_FORCEINLINE OSL_HOSTDEVICE T scale2 (const T &result) { return 0.6616f * result; }
1027 template <typename T>
scale3(const T & result)1028 OSL_FORCEINLINE OSL_HOSTDEVICE T scale3 (const T &result) { return 0.9820f * result; }
1029 template <typename T>
scale4(const T & result)1030 OSL_FORCEINLINE OSL_HOSTDEVICE T scale4 (const T &result) { return 0.8344f * result; }
1031 
1032 
1033 struct HashScalar {
1034 
operatorHashScalar1035     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x) const {
1036         return static_cast<int>(
1037 			inthash(static_cast<unsigned int>(x))
1038 		);
1039     }
1040 
operatorHashScalar1041     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y) const {
1042     	return static_cast<int>(
1043 			inthash(static_cast<unsigned int>(x),
1044         	        static_cast<unsigned int>(y))
1045 		);
1046     }
1047 
operatorHashScalar1048     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y, int z) const {
1049     	return static_cast<int>(
1050 			inthash(static_cast<unsigned int>(x),
1051         	        static_cast<unsigned int>(y),
1052 			 	    static_cast<unsigned int>(z))
1053 		);
1054     }
1055 
operatorHashScalar1056     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y, int z, int w) const {
1057     	return static_cast<int>(
1058 			inthash(static_cast<unsigned int>(x),
1059         	        static_cast<unsigned int>(y),
1060 			  	    static_cast<unsigned int>(z),
1061 					static_cast<unsigned int>(w))
1062 		);
1063     }
1064 
1065 #ifndef __CUDA_ARCH__
1066     // 4 2D hashes at once!
operatorHashScalar1067     OSL_FORCEINLINE int4 operator() (const int4& x, const int4& y) const {
1068         return inthash_simd (x, y);
1069     }
1070 
1071     // 4 3D hashes at once!
operatorHashScalar1072     OSL_FORCEINLINE int4 operator() (const int4& x, const int4& y, const int4& z) const {
1073         return inthash_simd (x, y, z);
1074     }
1075 
1076     // 4 3D hashes at once!
operatorHashScalar1077     OSL_FORCEINLINE int4 operator() (const int4& x, const int4& y, const int4& z, const int4& w) const {
1078         return inthash_simd (x, y, z, w);
1079     }
1080 #endif
1081 
1082 };
1083 
1084 
sliceup(unsigned int h)1085 OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i sliceup (unsigned int h) {
1086     // we only need the low-order bits to be random, so split out
1087     // the 32 bit result into 3 parts for each channel
1088     return Vec3i(
1089 		(h      ) & 0xFF,
1090 		(h >> 8 ) & 0xFF,
1091 		(h >> 16) & 0xFF);
1092 }
1093 
1094 struct HashVector {
convertToScalarHashVector1095     static OSL_FORCEINLINE OSL_HOSTDEVICE HashScalar convertToScalar() { return HashScalar(); }
1096 
operatorHashVector1097     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x) const {
1098         return sliceup(inthash(static_cast<unsigned int>(x)));
1099     }
1100 
operatorHashVector1101     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y) const {
1102         return sliceup(inthash(static_cast<unsigned int>(x),
1103         		               static_cast<unsigned int>(y)));
1104     }
1105 
operatorHashVector1106     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y, int z) const {
1107         return sliceup(inthash(static_cast<unsigned int>(x),
1108         		               static_cast<unsigned int>(y),
1109 							   static_cast<unsigned int>(z)));
1110     }
operatorHashVector1111     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y, int z, int w) const {
1112         return sliceup(inthash(static_cast<unsigned int>(x),
1113         		               static_cast<unsigned int>(y),
1114 							   static_cast<unsigned int>(z),
1115 							   static_cast<unsigned int>(w)));
1116     }
1117 
1118 #ifndef __CUDA_ARCH__
1119     // Vector hash of 4 3D points at once
operatorHashVector1120     OSL_FORCEINLINE void operator() (int4 *result, const int4& x, const int4& y) const {
1121         int4 h = inthash_simd (x, y);
1122         result[0] = (h        ) & 0xFF;
1123         result[1] = (srl(h,8 )) & 0xFF;
1124         result[2] = (srl(h,16)) & 0xFF;
1125     }
1126 
1127     // Vector hash of 4 3D points at once
operatorHashVector1128     OSL_FORCEINLINE void operator() (int4 *result, const int4& x, const int4& y, const int4& z) const {
1129         int4 h = inthash_simd (x, y, z);
1130         result[0] = (h        ) & 0xFF;
1131         result[1] = (srl(h,8 )) & 0xFF;
1132         result[2] = (srl(h,16)) & 0xFF;
1133     }
1134 
1135     // Vector hash of 4 3D points at once
operatorHashVector1136     OSL_FORCEINLINE void operator() (int4 *result, const int4& x, const int4& y, const int4& z, const int4& w) const {
1137         int4 h = inthash_simd (x, y, z, w);
1138         result[0] = (h        ) & 0xFF;
1139         result[1] = (srl(h,8 )) & 0xFF;
1140         result[2] = (srl(h,16)) & 0xFF;
1141     }
1142 #endif
1143 
1144 };
1145 
1146 
1147 struct HashScalarPeriodic {
1148 private:
1149     friend struct HashVectorPeriodic;
HashScalarPeriodicHashScalarPeriodic1150     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic () {}
1151 public:
HashScalarPeriodicHashScalarPeriodic1152     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic (float px) {
1153         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1154     }
HashScalarPeriodicHashScalarPeriodic1155     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic (float px, float py) {
1156         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1157         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1158     }
HashScalarPeriodicHashScalarPeriodic1159     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic (float px, float py, float pz) {
1160         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1161         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1162         m_pz = OIIO::ifloor(pz); if (m_pz < 1) m_pz = 1;
1163     }
HashScalarPeriodicHashScalarPeriodic1164     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic (float px, float py, float pz, float pw) {
1165         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1166         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1167         m_pz = OIIO::ifloor(pz); if (m_pz < 1) m_pz = 1;
1168         m_pw = OIIO::ifloor(pw); if (m_pw < 1) m_pw = 1;
1169     }
1170 
1171     int m_px, m_py, m_pz, m_pw;
1172 
operatorHashScalarPeriodic1173     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x) const {
1174         return static_cast<int>(
1175 			inthash (static_cast<unsigned int>(imod (x, m_px)))
1176 		);
1177     }
1178 
operatorHashScalarPeriodic1179     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y) const {
1180         return static_cast<int>(
1181 			inthash(static_cast<unsigned int>(imod (x, m_px)),
1182         	        static_cast<unsigned int>(imod (y, m_py)))
1183 		);
1184     }
1185 
operatorHashScalarPeriodic1186     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y, int z) const {
1187         return static_cast<int>(
1188 			inthash(static_cast<unsigned int>(imod (x, m_px)),
1189         	        static_cast<unsigned int>(imod (y, m_py)),
1190 			 	    static_cast<unsigned int>(imod (z, m_pz)))
1191 		);
1192     }
1193 
operatorHashScalarPeriodic1194     OSL_FORCEINLINE OSL_HOSTDEVICE int operator() (int x, int y, int z, int w) const {
1195         return static_cast<int>(
1196 			inthash(static_cast<unsigned int>(imod (x, m_px)),
1197         	        static_cast<unsigned int>(imod (y, m_py)),
1198 			  	    static_cast<unsigned int>(imod (z, m_pz)),
1199 					static_cast<unsigned int>(imod (w, m_pw)))
1200 		);
1201     }
1202 
1203 #ifndef __CUDA_ARCH__
1204     // 4 2D hashes at once!
operatorHashScalarPeriodic1205     int4 operator() (const int4& x, const int4& y) const {
1206         return inthash_simd (imod(x,m_px), imod(y,m_py));
1207     }
1208 
1209     // 4 3D hashes at once!
operatorHashScalarPeriodic1210     int4 operator() (const int4& x, const int4& y, const int4& z) const {
1211         return inthash_simd (imod(x,m_px), imod(y,m_py), imod(z,m_pz));
1212     }
1213 
1214     // 4 4D hashes at once
operatorHashScalarPeriodic1215     int4 operator() (const int4& x, const int4& y, const int4& z, const int4& w) const {
1216         return inthash_simd (imod(x,m_px), imod(y,m_py), imod(z,m_pz), imod(w,m_pw));
1217     }
1218 #endif
1219 
1220 };
1221 
1222 struct HashVectorPeriodic {
HashVectorPeriodicHashVectorPeriodic1223 	OSL_FORCEINLINE OSL_HOSTDEVICE HashVectorPeriodic (float px) {
1224         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1225     }
HashVectorPeriodicHashVectorPeriodic1226 	OSL_FORCEINLINE OSL_HOSTDEVICE HashVectorPeriodic (float px, float py) {
1227         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1228         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1229     }
HashVectorPeriodicHashVectorPeriodic1230 	OSL_FORCEINLINE OSL_HOSTDEVICE HashVectorPeriodic (float px, float py, float pz) {
1231         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1232         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1233         m_pz = OIIO::ifloor(pz); if (m_pz < 1) m_pz = 1;
1234     }
HashVectorPeriodicHashVectorPeriodic1235 	OSL_FORCEINLINE OSL_HOSTDEVICE HashVectorPeriodic (float px, float py, float pz, float pw) {
1236         m_px = OIIO::ifloor(px); if (m_px < 1) m_px = 1;
1237         m_py = OIIO::ifloor(py); if (m_py < 1) m_py = 1;
1238         m_pz = OIIO::ifloor(pz); if (m_pz < 1) m_pz = 1;
1239         m_pw = OIIO::ifloor(pw); if (m_pw < 1) m_pw = 1;
1240     }
1241 
1242     int m_px, m_py, m_pz, m_pw;
1243 
convertToScalarHashVectorPeriodic1244     OSL_FORCEINLINE OSL_HOSTDEVICE HashScalarPeriodic convertToScalar() const {
1245         HashScalarPeriodic r;
1246         r.m_px = m_px;
1247         r.m_py = m_py;
1248         r.m_pz = m_pz;
1249         r.m_pw = m_pw;
1250         return r;
1251     }
1252 
operatorHashVectorPeriodic1253     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x) const {
1254         return sliceup(inthash(static_cast<unsigned int>(imod (x, m_px))));
1255     }
1256 
operatorHashVectorPeriodic1257     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y) const {
1258         return sliceup(inthash(static_cast<unsigned int>(imod (x, m_px)),
1259         		               static_cast<unsigned int>(imod (y, m_py))));
1260     }
1261 
operatorHashVectorPeriodic1262     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y, int z) const {
1263         return sliceup(inthash(static_cast<unsigned int>(imod (x, m_px)),
1264         		               static_cast<unsigned int>(imod (y, m_py)),
1265 							   static_cast<unsigned int>(imod (z, m_pz))));
1266     }
1267 
operatorHashVectorPeriodic1268     OSL_FORCEINLINE OSL_HOSTDEVICE Vec3i operator() (int x, int y, int z, int w) const {
1269         return sliceup(inthash(static_cast<unsigned int>(imod (x, m_px)),
1270         		               static_cast<unsigned int>(imod (y, m_py)),
1271 							   static_cast<unsigned int>(imod (z, m_pz)),
1272 							   static_cast<unsigned int>(imod (w, m_pw))));
1273     }
1274 
1275 #ifndef __CUDA_ARCH__
1276     // Vector hash of 4 3D points at once
operatorHashVectorPeriodic1277     void operator() (int4 *result, const int4& x, const int4& y) const {
1278         int4 h = inthash_simd (imod(x,m_px), imod(y,m_py));
1279         result[0] = (h        ) & 0xFF;
1280         result[1] = (srl(h,8 )) & 0xFF;
1281         result[2] = (srl(h,16)) & 0xFF;
1282     }
1283 
1284     // Vector hash of 4 3D points at once
operatorHashVectorPeriodic1285     void operator() (int4 *result, const int4& x, const int4& y, const int4& z) const {
1286         int4 h = inthash_simd (imod(x,m_px), imod(y,m_py), imod(z,m_pz));
1287         result[0] = (h        ) & 0xFF;
1288         result[1] = (srl(h,8 )) & 0xFF;
1289         result[2] = (srl(h,16)) & 0xFF;
1290     }
1291 
1292     // Vector hash of 4 4D points at once
operatorHashVectorPeriodic1293     void operator() (int4 *result, const int4& x, const int4& y, const int4& z, const int4& w) const {
1294         int4 h = inthash_simd (imod(x,m_px), imod(y,m_py), imod(z,m_pz), imod(w,m_pw));
1295         result[0] = (h        ) & 0xFF;
1296         result[1] = (srl(h,8 )) & 0xFF;
1297         result[2] = (srl(h,16)) & 0xFF;
1298     }
1299 #endif
1300 };
1301 
1302 // Code Generation Policies
1303 // main intent is to allow top level classes to share implementation and carry
1304 // the policy down into helper functions who might diverge in implementation
1305 // or conditionally emit different code paths
1306 struct CGDefault
1307 {
1308     static constexpr bool allowSIMD = true;
1309 };
1310 
1311 struct CGScalar
1312 {
1313     static constexpr bool allowSIMD = false;
1314 };
1315 
1316 template <typename CGPolicyT = CGDefault, typename V, typename H, typename T>
perlin(V & result,H & hash,const T & x)1317 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (V& result, H& hash, const T &x) {
1318     int X; T fx = floorfrac(x, &X);
1319     T u = fade(fx);
1320 
1321     auto lerp_result = OIIO::lerp(grad (hash (X  ), fx     ),
1322                         grad (hash (X+1), fx-1.0f), u);
1323     result = scale1 (lerp_result);
1324 }
1325 
1326 template <typename CGPolicyT = CGDefault, typename H>
perlin(float & result,const H & hash,const float & x,const float & y)1327 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (float &result, const H &hash, const float &x, const float &y)
1328 {
1329 #if OIIO_SIMD
1330     if (CGPolicyT::allowSIMD)
1331     {
1332     int4 XY;
1333     float4 fxy = floorfrac (float4(x,y,0.0f), &XY);
1334     float4 uv = fade(fxy);  // Note: will be (fade(fx), fade(fy), 0, 0)
1335 
1336     // We parallelize primarily by computing the hashes and gradients at the
1337     // integer lattice corners simultaneously.
1338     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1339     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1340     int4 cornerx = OIIO::simd::shuffle<0>(XY) + (*(int4*)i0101);
1341     int4 cornery = OIIO::simd::shuffle<1>(XY) + (*(int4*)i0011);
1342     int4 corner_hash = hash (cornerx, cornery);
1343     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1344     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1345     float4 remainderx = OIIO::simd::shuffle<0>(fxy) - (*(float4*)f0101);
1346     float4 remaindery = OIIO::simd::shuffle<1>(fxy) - (*(float4*)f0011);
1347     float4 corner_grad = grad (corner_hash, remainderx, remaindery);
1348     result = scale2 (bilerp (corner_grad, uv[0], uv[1]));
1349 
1350     } else
1351 #endif
1352     {
1353     // ORIGINAL, non-SIMD
1354     int X; float fx = floorfrac(x, &X);
1355     int Y; float fy = floorfrac(y, &Y);
1356 
1357     float u = fade(fx);
1358     float v = fade(fy);
1359 
1360     float bilerp_result = OIIO::bilerp (grad (hash (X  , Y  ), fx     , fy     ),
1361                            grad (hash (X+1, Y  ), fx-1.0f, fy     ),
1362                            grad (hash (X  , Y+1), fx     , fy-1.0f),
1363                            grad (hash (X+1, Y+1), fx-1.0f, fy-1.0f), u, v);
1364     result = scale2 (bilerp_result);
1365     }
1366 }
1367 
1368 template <typename CGPolicyT = CGDefault, typename H>
perlin(float & result,const H & hash,const float & x,const float & y,const float & z)1369 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (float &result, const H &hash,
1370                     const float &x, const float &y, const float &z)
1371 {
1372 #if OIIO_SIMD
1373     if (CGPolicyT::allowSIMD)
1374     {
1375 #if 0
1376     // You'd think it would be faster to do the floorfrac in parallel, but
1377     // according to my timings, it is not. I don't understand exactly why.
1378     int4 XYZ;
1379     float4 fxyz = floorfrac (float4(x,y,z), &XYZ);
1380     float4 uvw = fade (fxyz);
1381 
1382     // We parallelize primarily by computing the hashes and gradients at the
1383     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1384     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1385     // with AVX.)
1386     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1387     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1388     int4 cornerx = shuffle<0>(XYZ) + (*(int4*)i0101);
1389     int4 cornery = shuffle<1>(XYZ) + (*(int4*)i0011);
1390     int4 cornerz = shuffle<2>(XYZ);
1391 #else
1392     int X; float fx = floorfrac(x, &X);
1393     int Y; float fy = floorfrac(y, &Y);
1394     int Z; float fz = floorfrac(z, &Z);
1395     float4 fxyz (fx, fy, fz); // = floorfrac (xyz, &XYZ);
1396     float4 uvw = fade (fxyz);
1397 
1398     // We parallelize primarily by computing the hashes and gradients at the
1399     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1400     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1401     // with AVX.)
1402     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1403     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1404     int4 cornerx = X + (*(int4*)i0101);
1405     int4 cornery = Y + (*(int4*)i0011);
1406     int4 cornerz = Z;
1407 #endif
1408     int4 corner_hash_z0 = hash (cornerx, cornery, cornerz);
1409     int4 corner_hash_z1 = hash (cornerx, cornery, cornerz+int4::One());
1410 
1411     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1412     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1413     float4 remainderx = OIIO::simd::shuffle<0>(fxyz) - (*(float4*)f0101);
1414     float4 remaindery = OIIO::simd::shuffle<1>(fxyz) - (*(float4*)f0011);
1415     float4 remainderz = OIIO::simd::shuffle<2>(fxyz);
1416     float4 corner_grad_z0 = grad (corner_hash_z0, remainderx, remaindery, remainderz);
1417     float4 corner_grad_z1 = grad (corner_hash_z1, remainderx, remaindery, remainderz-float4::One());
1418 
1419     result = scale3 (trilerp (corner_grad_z0, corner_grad_z1, uvw));
1420     } else
1421 #endif
1422     {
1423     // ORIGINAL -- non-SIMD
1424     int X; float fx = floorfrac(x, &X);
1425     int Y; float fy = floorfrac(y, &Y);
1426     int Z; float fz = floorfrac(z, &Z);
1427     float u = fade(fx);
1428     float v = fade(fy);
1429     float w = fade(fz);
1430     float trilerp_result = OIIO::trilerp (grad (hash (X  , Y  , Z  ), fx     , fy     , fz     ),
1431                             grad (hash (X+1, Y  , Z  ), fx-1.0f, fy     , fz     ),
1432                             grad (hash (X  , Y+1, Z  ), fx     , fy-1.0f, fz     ),
1433                             grad (hash (X+1, Y+1, Z  ), fx-1.0f, fy-1.0f, fz     ),
1434                             grad (hash (X  , Y  , Z+1), fx     , fy     , fz-1.0f),
1435                             grad (hash (X+1, Y  , Z+1), fx-1.0f, fy     , fz-1.0f),
1436                             grad (hash (X  , Y+1, Z+1), fx     , fy-1.0f, fz-1.0f),
1437                             grad (hash (X+1, Y+1, Z+1), fx-1.0f, fy-1.0f, fz-1.0f),
1438                             u, v, w);
1439     result = scale3 (trilerp_result);
1440     }
1441 }
1442 
1443 
1444 
1445 template <typename CGPolicyT = CGDefault, typename H>
perlin(float & result,const H & hash,const float & x,const float & y,const float & z,const float & w)1446 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (float &result, const H &hash,
1447                     const float &x, const float &y, const float &z, const float &w)
1448 {
1449 #if OIIO_SIMD
1450     if (CGPolicyT::allowSIMD)
1451     {
1452 
1453     int4 XYZW;
1454     float4 fxyzw = floorfrac (float4(x,y,z,w), &XYZW);
1455     float4 uvts = fade (fxyzw);
1456 
1457     // We parallelize primarily by computing the hashes and gradients at the
1458     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1459     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1460     // with AVX.)
1461     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1462     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1463     int4 cornerx = OIIO::simd::shuffle<0>(XYZW) + (*(int4*)i0101);
1464     int4 cornery = OIIO::simd::shuffle<1>(XYZW) + (*(int4*)i0011);
1465     int4 cornerz = OIIO::simd::shuffle<2>(XYZW);
1466     int4 cornerz1 = cornerz + int4::One();
1467     int4 cornerw = OIIO::simd::shuffle<3>(XYZW);
1468 
1469     int4 corner_hash_z0 = hash (cornerx, cornery, cornerz,  cornerw);
1470     int4 corner_hash_z1 = hash (cornerx, cornery, cornerz1, cornerw);
1471     int4 cornerw1 = cornerw + int4::One();
1472     int4 corner_hash_z2 = hash (cornerx, cornery, cornerz,  cornerw1);
1473     int4 corner_hash_z3 = hash (cornerx, cornery, cornerz1, cornerw1);
1474 
1475     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1476     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1477     float4 remainderx = OIIO::simd::shuffle<0>(fxyzw) - (*(float4*)f0101);
1478     float4 remaindery = OIIO::simd::shuffle<1>(fxyzw) - (*(float4*)f0011);
1479     float4 remainderz = OIIO::simd::shuffle<2>(fxyzw);
1480     float4 remainderz1 = remainderz - float4::One();
1481     float4 remainderw = OIIO::simd::shuffle<3>(fxyzw);
1482     float4 corner_grad_z0 = grad (corner_hash_z0, remainderx, remaindery, remainderz,  remainderw);
1483     float4 corner_grad_z1 = grad (corner_hash_z1, remainderx, remaindery, remainderz1, remainderw);
1484     float4 remainderw1 = remainderw - float4::One();
1485     float4 corner_grad_z2 = grad (corner_hash_z2, remainderx, remaindery, remainderz,  remainderw1);
1486     float4 corner_grad_z3 = grad (corner_hash_z3, remainderx, remaindery, remainderz1, remainderw1);
1487 
1488     result = scale4 (OIIO::lerp (trilerp (corner_grad_z0, corner_grad_z1, uvts),
1489                                  trilerp (corner_grad_z2, corner_grad_z3, uvts),
1490                                  OIIO::simd::extract<3>(uvts)));
1491     } else
1492 #endif
1493     {
1494     // ORIGINAL -- non-SIMD
1495     int X; float fx = floorfrac(x, &X);
1496     int Y; float fy = floorfrac(y, &Y);
1497     int Z; float fz = floorfrac(z, &Z);
1498     int W; float fw = floorfrac(w, &W);
1499 
1500     float u = fade(fx);
1501     float v = fade(fy);
1502     float t = fade(fz);
1503     float s = fade(fw);
1504 
1505     float lerp_result = OIIO::lerp (
1506                OIIO::trilerp (grad (hash (X  , Y  , Z  , W  ), fx     , fy     , fz     , fw     ),
1507                               grad (hash (X+1, Y  , Z  , W  ), fx-1.0f, fy     , fz     , fw     ),
1508                               grad (hash (X  , Y+1, Z  , W  ), fx     , fy-1.0f, fz     , fw     ),
1509                               grad (hash (X+1, Y+1, Z  , W  ), fx-1.0f, fy-1.0f, fz     , fw     ),
1510                               grad (hash (X  , Y  , Z+1, W  ), fx     , fy     , fz-1.0f, fw     ),
1511                               grad (hash (X+1, Y  , Z+1, W  ), fx-1.0f, fy     , fz-1.0f, fw     ),
1512                               grad (hash (X  , Y+1, Z+1, W  ), fx     , fy-1.0f, fz-1.0f, fw     ),
1513                               grad (hash (X+1, Y+1, Z+1, W  ), fx-1.0f, fy-1.0f, fz-1.0f, fw     ),
1514                               u, v, t),
1515                OIIO::trilerp (grad (hash (X  , Y  , Z  , W+1), fx     , fy     , fz     , fw-1.0f),
1516                               grad (hash (X+1, Y  , Z  , W+1), fx-1.0f, fy     , fz     , fw-1.0f),
1517                               grad (hash (X  , Y+1, Z  , W+1), fx     , fy-1.0f, fz     , fw-1.0f),
1518                               grad (hash (X+1, Y+1, Z  , W+1), fx-1.0f, fy-1.0f, fz     , fw-1.0f),
1519                               grad (hash (X  , Y  , Z+1, W+1), fx     , fy     , fz-1.0f, fw-1.0f),
1520                               grad (hash (X+1, Y  , Z+1, W+1), fx-1.0f, fy     , fz-1.0f, fw-1.0f),
1521                               grad (hash (X  , Y+1, Z+1, W+1), fx     , fy-1.0f, fz-1.0f, fw-1.0f),
1522                               grad (hash (X+1, Y+1, Z+1, W+1), fx-1.0f, fy-1.0f, fz-1.0f, fw-1.0f),
1523                               u, v, t),
1524                s);
1525     result = scale4 (lerp_result);
1526     }
1527 }
1528 
1529 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<float> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y)1530 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<float> &result, const H &hash,
1531                     const Dual2<float> &x, const Dual2<float> &y)
1532 {
1533 #if OIIO_SIMD
1534     if (CGPolicyT::allowSIMD)
1535     {
1536     int X; Dual2<float> fx = floorfrac(x, &X);
1537     int Y; Dual2<float> fy = floorfrac(y, &Y);
1538     Dual2<float4> fxy (float4(fx.val(), fy.val(), 0.0f),
1539                        float4(fx.dx(), fy.dx(), 0.0f),
1540                        float4(fx.dy(), fy.dy(), 0.0f));
1541     Dual2<float4> uv = fade (fxy);
1542 
1543     // We parallelize primarily by computing the hashes and gradients at the
1544     // integer lattice corners simultaneously.
1545     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1546     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1547     int4 cornerx = X + (*(int4*)i0101);
1548     int4 cornery = Y + (*(int4*)i0011);
1549     int4 corner_hash = hash (cornerx, cornery);
1550 
1551     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1552     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1553     Dual2<float4> remainderx = shuffle<0>(fxy) - (*(float4*)f0101);
1554     Dual2<float4> remaindery = shuffle<1>(fxy) - (*(float4*)f0011);
1555     Dual2<float4> corner_grad = grad (corner_hash, remainderx, remaindery);
1556 
1557     result = scale2 (bilerp (corner_grad, uv));
1558     } else
1559 #endif
1560     {
1561     // Non-SIMD case
1562     int X; Dual2<float> fx = floorfrac(x, &X);
1563     int Y; Dual2<float> fy = floorfrac(y, &Y);
1564     Dual2<float> u = fade(fx);
1565     Dual2<float> v = fade(fy);
1566     auto bilerp_result = OIIO::bilerp (grad (hash (X  , Y  ), fx     , fy     ),
1567                            grad (hash (X+1, Y  ), fx-1.0f, fy     ),
1568                            grad (hash (X  , Y+1), fx     , fy-1.0f),
1569                            grad (hash (X+1, Y+1), fx-1.0f, fy-1.0f), u, v);
1570     result = scale2 (bilerp_result);
1571     }
1572 }
1573 
1574 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<float> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y,const Dual2<float> & z)1575 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<float> &result, const H &hash,
1576                     const Dual2<float> &x, const Dual2<float> &y, const Dual2<float> &z)
1577 {
1578 #if OIIO_SIMD
1579     if (CGPolicyT::allowSIMD)
1580     {
1581     int X; Dual2<float> fx = floorfrac(x, &X);
1582     int Y; Dual2<float> fy = floorfrac(y, &Y);
1583     int Z; Dual2<float> fz = floorfrac(z, &Z);
1584     Dual2<float4> fxyz (float4(fx.val(), fy.val(), fz.val()),
1585                         float4(fx.dx(), fy.dx(), fz.dx()),
1586                         float4(fx.dy(), fy.dy(), fz.dy()));
1587     Dual2<float4> uvw = fade (fxyz);
1588 
1589     // We parallelize primarily by computing the hashes and gradients at the
1590     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1591     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1592     // with AVX.)
1593     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1594     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1595     int4 cornerx = X + (*(int4*)i0101);
1596     int4 cornery = Y + (*(int4*)i0011);
1597     int4 cornerz = Z;
1598     int4 corner_hash_z0 = hash (cornerx, cornery, cornerz);
1599     int4 corner_hash_z1 = hash (cornerx, cornery, cornerz+int4::One());
1600 
1601     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1602     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1603     Dual2<float4> remainderx = shuffle<0>(fxyz) - (*(float4*)f0101);
1604     Dual2<float4> remaindery = shuffle<1>(fxyz) - (*(float4*)f0011);
1605     Dual2<float4> remainderz = shuffle<2>(fxyz);
1606 
1607     Dual2<float4> corner_grad_z0 = grad (corner_hash_z0, remainderx, remaindery, remainderz);
1608     Dual2<float4> corner_grad_z1 = grad (corner_hash_z1, remainderx, remaindery, remainderz-float4::One());
1609 
1610     // Interpolate along the z axis first
1611     Dual2<float4> xy = OIIO::lerp (corner_grad_z0, corner_grad_z1, shuffle<2>(uvw));
1612     // Interpolate along x axis
1613     Dual2<float4> xx = OIIO::lerp (xy, shuffle<1,1,3,3>(xy), shuffle<0>(uvw));
1614     // interpolate along y axis
1615     result = scale3 (extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uvw))));
1616     } else
1617 #endif
1618     {
1619     // Non-SIMD case
1620     int X; Dual2<float> fx = floorfrac(x, &X);
1621     int Y; Dual2<float> fy = floorfrac(y, &Y);
1622     int Z; Dual2<float> fz = floorfrac(z, &Z);
1623     Dual2<float> u = fade(fx);
1624     Dual2<float> v = fade(fy);
1625     Dual2<float> w = fade(fz);
1626     auto tril_result = OIIO::trilerp (grad (hash (X  , Y  , Z  ), fx     , fy     , fz     ),
1627                             grad (hash (X+1, Y  , Z  ), fx-1.0f, fy     , fz     ),
1628                             grad (hash (X  , Y+1, Z  ), fx     , fy-1.0f, fz     ),
1629                             grad (hash (X+1, Y+1, Z  ), fx-1.0f, fy-1.0f, fz     ),
1630                             grad (hash (X  , Y  , Z+1), fx     , fy     , fz-1.0f),
1631                             grad (hash (X+1, Y  , Z+1), fx-1.0f, fy     , fz-1.0f),
1632                             grad (hash (X  , Y+1, Z+1), fx     , fy-1.0f, fz-1.0f),
1633                             grad (hash (X+1, Y+1, Z+1), fx-1.0f, fy-1.0f, fz-1.0f),
1634                             u, v, w);
1635     result = scale3 (tril_result);
1636     }
1637 }
1638 
1639 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<float> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y,const Dual2<float> & z,const Dual2<float> & w)1640 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<float> &result, const H &hash,
1641                     const Dual2<float> &x, const Dual2<float> &y,
1642                     const Dual2<float> &z, const Dual2<float> &w)
1643 {
1644 #if OIIO_SIMD
1645     if (CGPolicyT::allowSIMD)
1646     {
1647     int X; Dual2<float> fx = floorfrac(x, &X);
1648     int Y; Dual2<float> fy = floorfrac(y, &Y);
1649     int Z; Dual2<float> fz = floorfrac(z, &Z);
1650     int W; Dual2<float> fw = floorfrac(w, &W);
1651     Dual2<float4> fxyzw (float4(fx.val(), fy.val(), fz.val(), fw.val()),
1652                          float4(fx.dx (), fy.dx (), fz.dx (), fw.dx ()),
1653                          float4(fx.dy (), fy.dy (), fz.dy (), fw.dy ()));
1654     Dual2<float4> uvts = fade (fxyzw);
1655 
1656     // We parallelize primarily by computing the hashes and gradients at the
1657     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1658     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1659     // with AVX.)
1660     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1661     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1662     int4 cornerx = X + (*(int4*)i0101);
1663     int4 cornery = Y + (*(int4*)i0011);
1664     int4 cornerz = Z;
1665     int4 cornerz1 = Z + int4::One();
1666     int4 cornerw = W;
1667     int4 cornerw1 = W + int4::One();
1668     int4 corner_hash_z0 = hash (cornerx, cornery, cornerz,  cornerw);
1669     int4 corner_hash_z1 = hash (cornerx, cornery, cornerz1, cornerw);
1670     int4 corner_hash_z2 = hash (cornerx, cornery, cornerz,  cornerw1);
1671     int4 corner_hash_z3 = hash (cornerx, cornery, cornerz1, cornerw1);
1672 
1673     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1674     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1675     Dual2<float4> remainderx  = shuffle<0>(fxyzw) - (*(float4*)f0101);
1676     Dual2<float4> remaindery  = shuffle<1>(fxyzw) - (*(float4*)f0011);
1677     Dual2<float4> remainderz  = shuffle<2>(fxyzw);
1678     Dual2<float4> remainderz1 = remainderz - float4::One();
1679     Dual2<float4> remainderw  = shuffle<3>(fxyzw);
1680     Dual2<float4> remainderw1 = remainderw - float4::One();
1681 
1682     Dual2<float4> corner_grad_z0 = grad (corner_hash_z0, remainderx, remaindery, remainderz,  remainderw);
1683     Dual2<float4> corner_grad_z1 = grad (corner_hash_z1, remainderx, remaindery, remainderz1, remainderw);
1684     Dual2<float4> corner_grad_z2 = grad (corner_hash_z2, remainderx, remaindery, remainderz,  remainderw1);
1685     Dual2<float4> corner_grad_z3 = grad (corner_hash_z3, remainderx, remaindery, remainderz1, remainderw1);
1686 
1687     // Interpolate along the w axis first
1688     Dual2<float4> xyz0 = OIIO::lerp (corner_grad_z0, corner_grad_z2, shuffle<3>(uvts));
1689     Dual2<float4> xyz1 = OIIO::lerp (corner_grad_z1, corner_grad_z3, shuffle<3>(uvts));
1690     Dual2<float4> xy = OIIO::lerp (xyz0, xyz1, shuffle<2>(uvts));
1691     // Interpolate along x axis
1692     Dual2<float4> xx = OIIO::lerp (xy, shuffle<1,1,3,3>(xy), shuffle<0>(uvts));
1693     // interpolate along y axis
1694     result = scale4 (extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uvts))));
1695     } else
1696 #endif
1697     {
1698     // ORIGINAL -- non-SIMD
1699     int X; Dual2<float> fx = floorfrac(x, &X);
1700     int Y; Dual2<float> fy = floorfrac(y, &Y);
1701     int Z; Dual2<float> fz = floorfrac(z, &Z);
1702     int W; Dual2<float> fw = floorfrac(w, &W);
1703 
1704     Dual2<float> u = fade(fx);
1705     Dual2<float> v = fade(fy);
1706     Dual2<float> t = fade(fz);
1707     Dual2<float> s = fade(fw);
1708 
1709     auto lerp_result = OIIO::lerp (
1710                OIIO::trilerp (grad (hash (X  , Y  , Z  , W  ), fx     , fy     , fz     , fw     ),
1711                               grad (hash (X+1, Y  , Z  , W  ), fx-1.0f, fy     , fz     , fw     ),
1712                               grad (hash (X  , Y+1, Z  , W  ), fx     , fy-1.0f, fz     , fw     ),
1713                               grad (hash (X+1, Y+1, Z  , W  ), fx-1.0f, fy-1.0f, fz     , fw     ),
1714                               grad (hash (X  , Y  , Z+1, W  ), fx     , fy     , fz-1.0f, fw     ),
1715                               grad (hash (X+1, Y  , Z+1, W  ), fx-1.0f, fy     , fz-1.0f, fw     ),
1716                               grad (hash (X  , Y+1, Z+1, W  ), fx     , fy-1.0f, fz-1.0f, fw     ),
1717                               grad (hash (X+1, Y+1, Z+1, W  ), fx-1.0f, fy-1.0f, fz-1.0f, fw     ),
1718                               u, v, t),
1719                OIIO::trilerp (grad (hash (X  , Y  , Z  , W+1), fx     , fy     , fz     , fw-1.0f),
1720                               grad (hash (X+1, Y  , Z  , W+1), fx-1.0f, fy     , fz     , fw-1.0f),
1721                               grad (hash (X  , Y+1, Z  , W+1), fx     , fy-1.0f, fz     , fw-1.0f),
1722                               grad (hash (X+1, Y+1, Z  , W+1), fx-1.0f, fy-1.0f, fz     , fw-1.0f),
1723                               grad (hash (X  , Y  , Z+1, W+1), fx     , fy     , fz-1.0f, fw-1.0f),
1724                               grad (hash (X+1, Y  , Z+1, W+1), fx-1.0f, fy     , fz-1.0f, fw-1.0f),
1725                               grad (hash (X  , Y+1, Z+1, W+1), fx     , fy-1.0f, fz-1.0f, fw-1.0f),
1726                               grad (hash (X+1, Y+1, Z+1, W+1), fx-1.0f, fy-1.0f, fz-1.0f, fw-1.0f),
1727                               u, v, t),
1728                s);
1729     result = scale4 (lerp_result);
1730     }
1731 }
1732 
1733 
1734 template <typename CGPolicyT = CGDefault, typename H>
perlin(Vec3 & result,const H & hash,const float & x,const float & y)1735 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Vec3 &result, const H &hash,
1736                     const float &x, const float &y)
1737 {
1738 #if OIIO_SIMD
1739     if (CGPolicyT::allowSIMD)
1740     {
1741     int4 XYZ;
1742     float4 fxyz = floorfrac (float4(x,y,0), &XYZ);
1743     float4 uv = fade (fxyz);
1744 
1745     // We parallelize primarily by computing the hashes and gradients at the
1746     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1747     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1748     // with AVX.)
1749     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1750     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1751     int4 cornerx = OIIO::simd::shuffle<0>(XYZ) + (*(int4*)i0101);
1752     int4 cornery = OIIO::simd::shuffle<1>(XYZ) + (*(int4*)i0011);
1753 
1754     // We actually derive 3 hashes (one for each output dimension) for each
1755     // corner.
1756     int4 corner_hash[3];
1757     hash (corner_hash, cornerx, cornery);
1758 
1759     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1760     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1761     float4 remainderx = OIIO::simd::shuffle<0>(fxyz) - (*(float4*)f0101);
1762     float4 remaindery = OIIO::simd::shuffle<1>(fxyz) - (*(float4*)f0011);
1763     float result_comp[3];
1764     for (int i = 0; i < 3; ++i) {
1765         float4 corner_grad = grad (corner_hash[i], remainderx, remaindery);
1766         // Do the bilinear interpolation with SIMD. Here's the fastest way
1767         // I've found to do it.
1768         float4 xx = OIIO::lerp (corner_grad, OIIO::simd::shuffle<1,1,3,3>(corner_grad), uv[0]);
1769         result_comp[i] = scale2 (OIIO::simd::extract<0>(OIIO::lerp (xx,OIIO::simd::shuffle<2>(xx), uv[1])));
1770     }
1771     result = Vec3(result_comp[0], result_comp[1], result_comp[2]);
1772     } else
1773 #endif
1774     {
1775     // Non-SIMD case
1776     typedef float T;
1777     int X; T fx = floorfrac(x, &X);
1778     int Y; T fy = floorfrac(y, &Y);
1779     T u = fade(fx);
1780     T v = fade(fy);
1781     auto bil_result = OIIO::bilerp (grad (hash (X  , Y  ), fx     , fy     ),
1782                            grad (hash (X+1, Y  ), fx-1.0f, fy     ),
1783                            grad (hash (X  , Y+1), fx     , fy-1.0f),
1784                            grad (hash (X+1, Y+1), fx-1.0f, fy-1.0f), u, v);
1785     result = scale2 (bil_result);
1786     }
1787 }
1788 
1789 
1790 
1791 template <typename CGPolicyT = CGDefault, typename H>
perlin(Vec3 & result,const H & hash,const float & x,const float & y,const float & z)1792 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Vec3 &result, const H &hash,
1793                     const float &x, const float &y, const float &z)
1794 {
1795 #if OIIO_SIMD
1796     if (CGPolicyT::allowSIMD)
1797     {
1798 #if 0
1799     // You'd think it would be faster to do the floorfrac in parallel, but
1800     // according to my timings, it is not. Come back and understand why.
1801     int4 XYZ;
1802     float4 fxyz = floorfrac (float4(x,y,z), &XYZ);
1803     float4 uvw = fade (fxyz);
1804 
1805     // We parallelize primarily by computing the hashes and gradients at the
1806     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1807     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1808     // with AVX.)
1809     int4 cornerx = shuffle<0>(XYZ) + int4(0,1,0,1);
1810     int4 cornery = shuffle<1>(XYZ) + int4(0,0,1,1);
1811     int4 cornerz = shuffle<2>(XYZ);
1812 #else
1813     int X; float fx = floorfrac(x, &X);
1814     int Y; float fy = floorfrac(y, &Y);
1815     int Z; float fz = floorfrac(z, &Z);
1816     float4 fxyz (fx, fy, fz); // = floorfrac (xyz, &XYZ);
1817     float4 uvw = fade (fxyz);
1818 
1819     // We parallelize primarily by computing the hashes and gradients at the
1820     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1821     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1822     // with AVX.)
1823     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1824     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1825     int4 cornerx = X + (*(int4*)i0101);
1826     int4 cornery = Y + (*(int4*)i0011);
1827     int4 cornerz = Z;
1828 #endif
1829 
1830     // We actually derive 3 hashes (one for each output dimension) for each
1831     // corner.
1832     int4 corner_hash_z0[3], corner_hash_z1[3];
1833     hash (corner_hash_z0, cornerx, cornery, cornerz);
1834     hash (corner_hash_z1, cornerx, cornery, cornerz+int4::One());
1835 
1836     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1837     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1838     float4 remainderx = OIIO::simd::shuffle<0>(fxyz) - (*(float4*)f0101);
1839     float4 remaindery = OIIO::simd::shuffle<1>(fxyz) - (*(float4*)f0011);
1840     float4 remainderz0 = OIIO::simd::shuffle<2>(fxyz);
1841     float4 remainderz1 = OIIO::simd::shuffle<2>(fxyz) - float4::One();
1842     float result_comp[3];
1843     for (int i = 0; i < 3; ++i) {
1844         float4 corner_grad_z0 = grad (corner_hash_z0[i], remainderx, remaindery, remainderz0);
1845         float4 corner_grad_z1 = grad (corner_hash_z1[i], remainderx, remaindery, remainderz1);
1846 
1847         // Interpolate along the z axis first
1848         float4 xy = OIIO::lerp (corner_grad_z0, corner_grad_z1, OIIO::simd::shuffle<2>(uvw));
1849         // Interpolate along x axis
1850         float4 xx = OIIO::lerp (xy, OIIO::simd::shuffle<1,1,3,3>(xy), OIIO::simd::shuffle<0>(uvw));
1851         // interpolate along y axis
1852         result_comp[i] = scale3 (OIIO::simd::extract<0>(OIIO::lerp (xx,OIIO::simd::shuffle<2>(xx), OIIO::simd::shuffle<1>(uvw))));
1853     }
1854     result = Vec3(result_comp[0],result_comp[1],result_comp[2]);
1855     } else
1856 #endif
1857     {
1858     // ORIGINAL -- non-SIMD
1859     int X; float fx = floorfrac(x, &X);
1860     int Y; float fy = floorfrac(y, &Y);
1861     int Z; float fz = floorfrac(z, &Z);
1862     float u = fade(fx);
1863     float v = fade(fy);
1864     float w = fade(fz);
1865 
1866     // A.W. the OIIO_SIMD above differs from the original results
1867     // so we are re-implementing the non-SIMD version to match below
1868 #if 0
1869     result = OIIO::trilerp (grad (hash (X  , Y  , Z  ), fx     , fy     , fz      ),
1870                             grad (hash (X+1, Y  , Z  ), fx-1.0f, fy     , fz      ),
1871                             grad (hash (X  , Y+1, Z  ), fx     , fy-1.0f, fz      ),
1872                             grad (hash (X+1, Y+1, Z  ), fx-1.0f, fy-1.0f, fz      ),
1873                             grad (hash (X  , Y  , Z+1), fx     , fy     , fz-1.0f ),
1874                             grad (hash (X+1, Y  , Z+1), fx-1.0f, fy     , fz-1.0f ),
1875                             grad (hash (X  , Y+1, Z+1), fx     , fy-1.0f, fz-1.0f ),
1876                             grad (hash (X+1, Y+1, Z+1), fx-1.0f, fy-1.0f, fz-1.0f ),
1877                             u, v, w);
1878     result = scale3 (result);
1879 #else
1880     static_assert(std::is_same<Vec3i, decltype(hash(X, Y, Z))>::value, "This re-implementation was developed for Hashs returning a vector type only");
1881     // Want to avoid repeating the same hash work 3 times in a row
1882     // so rather than executing the vector version, we will use HashScalar
1883     // and directly mask its results on a per component basis
1884     auto hash_scalar = hash.convertToScalar();
1885     int h000 = hash_scalar (X  , Y  , Z  );
1886     int h100 = hash_scalar (X+1, Y  , Z  );
1887     int h010 = hash_scalar (X  , Y+1, Z  );
1888     int h110 = hash_scalar (X+1, Y+1, Z  );
1889     int h001 = hash_scalar (X  , Y  , Z+1);
1890     int h101 = hash_scalar (X+1, Y  , Z+1);
1891     int h011 = hash_scalar (X  , Y+1, Z+1);
1892     int h111 = hash_scalar (X+1, Y+1, Z+1);
1893 
1894     // We are mimicking the OIIO_SSE behavior
1895     //      result[0] = (h        ) & 0xFF;
1896     //      result[1] = (srl(h,8 )) & 0xFF;
1897     //      result[2] = (srl(h,16)) & 0xFF;
1898     // skipping masking the 0th version, perhaps that is a mistake
1899 
1900     float result0 = OIIO::trilerp (grad (h000, fx     , fy     , fz      ),
1901                             grad (h100, fx-1.0f, fy     , fz      ),
1902                             grad (h010, fx     , fy-1.0f, fz      ),
1903                             grad (h110, fx-1.0f, fy-1.0f, fz      ),
1904                             grad (h001, fx     , fy     , fz-1.0f ),
1905                             grad (h101, fx-1.0f, fy     , fz-1.0f ),
1906                             grad (h011, fx     , fy-1.0f, fz-1.0f ),
1907                             grad (h111, fx-1.0f, fy-1.0f, fz-1.0f ),
1908                             u, v, w);
1909 
1910     float result1 = OIIO::trilerp (
1911         grad ((h000>>8) & 0xFF, fx     , fy     , fz      ),
1912         grad ((h100>>8) & 0xFF, fx-1.0f, fy     , fz      ),
1913         grad ((h010>>8) & 0xFF, fx     , fy-1.0f, fz      ),
1914         grad ((h110>>8) & 0xFF, fx-1.0f, fy-1.0f, fz      ),
1915         grad ((h001>>8) & 0xFF, fx     , fy     , fz-1.0f ),
1916         grad ((h101>>8) & 0xFF, fx-1.0f, fy     , fz-1.0f ),
1917         grad ((h011>>8) & 0xFF, fx     , fy-1.0f, fz-1.0f ),
1918         grad ((h111>>8) & 0xFF, fx-1.0f, fy-1.0f, fz-1.0f ),
1919         u, v, w);
1920 
1921     float result2 = OIIO::trilerp (
1922         grad ((h000>>16) & 0xFF, fx     , fy     , fz      ),
1923         grad ((h100>>16) & 0xFF, fx-1.0f, fy     , fz      ),
1924         grad ((h010>>16) & 0xFF, fx     , fy-1.0f, fz      ),
1925         grad ((h110>>16) & 0xFF, fx-1.0f, fy-1.0f, fz      ),
1926         grad ((h001>>16) & 0xFF, fx     , fy     , fz-1.0f ),
1927         grad ((h101>>16) & 0xFF, fx-1.0f, fy     , fz-1.0f ),
1928         grad ((h011>>16) & 0xFF, fx     , fy-1.0f, fz-1.0f ),
1929         grad ((h111>>16) & 0xFF, fx-1.0f, fy-1.0f, fz-1.0f ),
1930         u, v, w);
1931 
1932     result = Vec3(scale3 (result0), scale3 (result1), scale3 (result2));
1933 #endif
1934     }
1935 }
1936 
1937 template <typename CGPolicyT = CGDefault, typename H>
perlin(Vec3 & result,const H & hash,const float & x,const float & y,const float & z,const float & w)1938 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Vec3 &result, const H &hash,
1939                     const float &x, const float &y, const float &z, const float &w)
1940 {
1941 #if OIIO_SIMD
1942     if (CGPolicyT::allowSIMD)
1943     {
1944     int4 XYZW;
1945     float4 fxyzw = floorfrac (float4(x,y,z,w), &XYZW);
1946     float4 uvts = fade (fxyzw);
1947 
1948     // We parallelize primarily by computing the hashes and gradients at the
1949     // integer lattice corners simultaneously. We need 8 total (for 3D), so
1950     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
1951     // with AVX.)
1952     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
1953     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
1954     int4 cornerx = OIIO::simd::shuffle<0>(XYZW) + (*(int4*)i0101);
1955     int4 cornery = OIIO::simd::shuffle<1>(XYZW) + (*(int4*)i0011);
1956     int4 cornerz = OIIO::simd::shuffle<2>(XYZW);
1957     int4 cornerw = OIIO::simd::shuffle<3>(XYZW);
1958 
1959     // We actually derive 3 hashes (one for each output dimension) for each
1960     // corner.
1961     int4 corner_hash_z0[3], corner_hash_z1[3];
1962     int4 corner_hash_z2[3], corner_hash_z3[3];
1963     hash (corner_hash_z0, cornerx, cornery, cornerz, cornerw);
1964     hash (corner_hash_z1, cornerx, cornery, cornerz+int4::One(), cornerw);
1965     hash (corner_hash_z2, cornerx, cornery, cornerz, cornerw+int4::One());
1966     hash (corner_hash_z3, cornerx, cornery, cornerz+int4::One(), cornerw+int4::One());
1967 
1968     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
1969     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
1970     float4 remainderx = OIIO::simd::shuffle<0>(fxyzw) - (*(float4*)f0101);
1971     float4 remaindery = OIIO::simd::shuffle<1>(fxyzw) - (*(float4*)f0011);
1972     float4 remainderz = OIIO::simd::shuffle<2>(fxyzw);
1973     float4 remainderw = OIIO::simd::shuffle<3>(fxyzw);
1974 //    float4 remainderz0 = shuffle<2>(fxyz);
1975 //    float4 remainderz1 = shuffle<2>(fxyz) - 1.0f;
1976     float result_comp[3];
1977     for (int i = 0; i < 3; ++i) {
1978         float4 corner_grad_z0 = grad (corner_hash_z0[i], remainderx, remaindery, remainderz, remainderw);
1979         float4 corner_grad_z1 = grad (corner_hash_z1[i], remainderx, remaindery, remainderz-float4::One(), remainderw);
1980         float4 corner_grad_z2 = grad (corner_hash_z2[i], remainderx, remaindery, remainderz, remainderw-float4::One());
1981         float4 corner_grad_z3 = grad (corner_hash_z3[i], remainderx, remaindery, remainderz-float4::One(), remainderw-float4::One());
1982         result_comp[i] = scale4 (OIIO::lerp (trilerp (corner_grad_z0, corner_grad_z1, uvts),
1983                                         trilerp (corner_grad_z2, corner_grad_z3, uvts),
1984                                         OIIO::simd::extract<3>(uvts)));
1985     }
1986     result = Vec3(result_comp[0],result_comp[1],result_comp[2]);
1987     } else
1988 #endif
1989     {
1990     // ORIGINAL -- non-SIMD
1991     int X; float fx = floorfrac(x, &X);
1992     int Y; float fy = floorfrac(y, &Y);
1993     int Z; float fz = floorfrac(z, &Z);
1994     int W; float fw = floorfrac(w, &W);
1995 
1996     float u = fade(fx);
1997     float v = fade(fy);
1998     float t = fade(fz);
1999     float s = fade(fw);
2000 
2001     auto l_result = OIIO::lerp (
2002                OIIO::trilerp (grad (hash (X  , Y  , Z  , W  ), fx     , fy     , fz     , fw     ),
2003                               grad (hash (X+1, Y  , Z  , W  ), fx-1.0f, fy     , fz     , fw     ),
2004                               grad (hash (X  , Y+1, Z  , W  ), fx     , fy-1.0f, fz     , fw     ),
2005                               grad (hash (X+1, Y+1, Z  , W  ), fx-1.0f, fy-1.0f, fz     , fw     ),
2006                               grad (hash (X  , Y  , Z+1, W  ), fx     , fy     , fz-1.0f, fw     ),
2007                               grad (hash (X+1, Y  , Z+1, W  ), fx-1.0f, fy     , fz-1.0f, fw     ),
2008                               grad (hash (X  , Y+1, Z+1, W  ), fx     , fy-1.0f, fz-1.0f, fw     ),
2009                               grad (hash (X+1, Y+1, Z+1, W  ), fx-1.0f, fy-1.0f, fz-1.0f, fw     ),
2010                               u, v, t),
2011                OIIO::trilerp (grad (hash (X  , Y  , Z  , W+1), fx     , fy     , fz     , fw-1.0f),
2012                               grad (hash (X+1, Y  , Z  , W+1), fx-1.0f, fy     , fz     , fw-1.0f),
2013                               grad (hash (X  , Y+1, Z  , W+1), fx     , fy-1.0f, fz     , fw-1.0f),
2014                               grad (hash (X+1, Y+1, Z  , W+1), fx-1.0f, fy-1.0f, fz     , fw-1.0f),
2015                               grad (hash (X  , Y  , Z+1, W+1), fx     , fy     , fz-1.0f, fw-1.0f),
2016                               grad (hash (X+1, Y  , Z+1, W+1), fx-1.0f, fy     , fz-1.0f, fw-1.0f),
2017                               grad (hash (X  , Y+1, Z+1, W+1), fx     , fy-1.0f, fz-1.0f, fw-1.0f),
2018                               grad (hash (X+1, Y+1, Z+1, W+1), fx-1.0f, fy-1.0f, fz-1.0f, fw-1.0f),
2019                               u, v, t),
2020                s);
2021     result = scale4 (l_result);
2022     }
2023 }
2024 
2025 
2026 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<Vec3> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y)2027 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<Vec3> &result, const H &hash,
2028                     const Dual2<float> &x, const Dual2<float> &y)
2029 {
2030 #if OIIO_SIMD
2031     if (CGPolicyT::allowSIMD)
2032     {
2033     int X; Dual2<float> fx = floorfrac(x, &X);
2034     int Y; Dual2<float> fy = floorfrac(y, &Y);
2035     Dual2<float4> fxyz (float4(fx.val(), fy.val(), 0.0f),
2036                         float4(fx.dx(),  fy.dx(),  0.0f),
2037                         float4(fx.dy(),  fy.dy(),  0.0f));
2038     Dual2<float4> uvw = fade (fxyz);
2039 
2040     // We parallelize primarily by computing the hashes and gradients at the
2041     // integer lattice corners simultaneously. We need 8 total (for 3D), so
2042     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
2043     // with AVX.)
2044     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
2045     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
2046     int4 cornerx = X + (*(int4*)i0101);
2047     int4 cornery = Y + (*(int4*)i0011);
2048 
2049     // We actually derive 3 hashes (one for each output dimension) for each
2050     // corner.
2051     int4 corner_hash[3];
2052     hash (corner_hash, cornerx, cornery);
2053 
2054     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
2055     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
2056     Dual2<float4> remainderx = shuffle<0>(fxyz) - (*(float4*)f0101);
2057     Dual2<float4> remaindery = shuffle<1>(fxyz) - (*(float4*)f0011);
2058     Dual2<float> r[3];
2059     for (int i = 0; i < 3; ++i) {
2060         Dual2<float4> corner_grad = grad (corner_hash[i], remainderx, remaindery);
2061         // Interpolate along x axis
2062         Dual2<float4> xx = OIIO::lerp (corner_grad, shuffle<1,1,3,3>(corner_grad), shuffle<0>(uvw));
2063         // interpolate along y axis
2064         r[i] = scale2 (extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uvw))));
2065     }
2066     result.set (Vec3 (r[0].val(), r[1].val(), r[2].val()),
2067                 Vec3 (r[0].dx(),  r[1].dx(),  r[2].dx()),
2068                 Vec3 (r[0].dy(),  r[1].dy(),  r[2].dy()));
2069     } else
2070 #endif
2071     {
2072     // ORIGINAL -- non-SIMD
2073     int X; Dual2<float> fx = floorfrac(x, &X);
2074     int Y; Dual2<float> fy = floorfrac(y, &Y);
2075     Dual2<float> u = fade(fx);
2076     Dual2<float> v = fade(fy);
2077     auto bil_result = OIIO::bilerp (grad (hash (X  , Y  ), fx     , fy     ),
2078                            grad (hash (X+1, Y  ), fx-1.0f, fy     ),
2079                            grad (hash (X  , Y+1), fx     , fy-1.0f),
2080                            grad (hash (X+1, Y+1), fx-1.0f, fy-1.0f),
2081                            u, v);
2082     result = scale2 (bil_result);
2083     }
2084 }
2085 
2086 
2087 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<Vec3> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y,const Dual2<float> & z)2088 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<Vec3> &result, const H &hash,
2089                     const Dual2<float> &x, const Dual2<float> &y, const Dual2<float> &z)
2090 {
2091 #if OIIO_SIMD
2092     if (CGPolicyT::allowSIMD)
2093     {
2094     int X; Dual2<float> fx = floorfrac(x, &X);
2095     int Y; Dual2<float> fy = floorfrac(y, &Y);
2096     int Z; Dual2<float> fz = floorfrac(z, &Z);
2097     Dual2<float4> fxyz (float4(fx.val(), fy.val(), fz.val()),
2098                         float4(fx.dx(),  fy.dx(),  fz.dx()),
2099                         float4(fx.dy(),  fy.dy(),  fz.dy()));
2100     Dual2<float4> uvw = fade (fxyz);
2101 
2102     // We parallelize primarily by computing the hashes and gradients at the
2103     // integer lattice corners simultaneously. We need 8 total (for 3D), so
2104     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
2105     // with AVX.)
2106     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
2107     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
2108     int4 cornerx = X + (*(int4*)i0101);
2109     int4 cornery = Y + (*(int4*)i0011);
2110     int4 cornerz = Z;
2111 
2112     // We actually derive 3 hashes (one for each output dimension) for each
2113     // corner.
2114     int4 corner_hash_z0[3], corner_hash_z1[3];
2115     hash (corner_hash_z0, cornerx, cornery, cornerz);
2116     hash (corner_hash_z1, cornerx, cornery, cornerz+int4::One());
2117 
2118     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
2119     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
2120     Dual2<float4> remainderx = shuffle<0>(fxyz) - (*(float4*)f0101);
2121     Dual2<float4> remaindery = shuffle<1>(fxyz) - (*(float4*)f0011);
2122     Dual2<float4> remainderz0 = shuffle<2>(fxyz);
2123     Dual2<float4> remainderz1 = shuffle<2>(fxyz) - float4::One();
2124     Dual2<float> r[3];
2125     for (int i = 0; i < 3; ++i) {
2126         Dual2<float4> corner_grad_z0 = grad (corner_hash_z0[i], remainderx, remaindery, remainderz0);
2127         Dual2<float4> corner_grad_z1 = grad (corner_hash_z1[i], remainderx, remaindery, remainderz1);
2128 
2129         // Interpolate along the z axis first
2130         Dual2<float4> xy = OIIO::lerp (corner_grad_z0, corner_grad_z1, shuffle<2>(uvw));
2131         // Interpolate along x axis
2132         Dual2<float4> xx = OIIO::lerp (xy, shuffle<1,1,3,3>(xy), shuffle<0>(uvw));
2133         // interpolate along y axis
2134         r[i] = scale3 (extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uvw))));
2135     }
2136     result.set (Vec3 (r[0].val(), r[1].val(), r[2].val()),
2137                 Vec3 (r[0].dx(),  r[1].dx(),  r[2].dx()),
2138                 Vec3 (r[0].dy(),  r[1].dy(),  r[2].dy()));
2139     } else
2140 #endif
2141     {
2142     // ORIGINAL -- non-SIMD
2143     int X; Dual2<float> fx = floorfrac(x, &X);
2144     int Y; Dual2<float> fy = floorfrac(y, &Y);
2145     int Z; Dual2<float> fz = floorfrac(z, &Z);
2146     Dual2<float> u = fade(fx);
2147     Dual2<float> v = fade(fy);
2148     Dual2<float> w = fade(fz);
2149     auto til_result = OIIO::trilerp (grad (hash (X  , Y  , Z  ), fx     , fy     , fz      ),
2150                             grad (hash (X+1, Y  , Z  ), fx-1.0f, fy     , fz      ),
2151                             grad (hash (X  , Y+1, Z  ), fx     , fy-1.0f, fz      ),
2152                             grad (hash (X+1, Y+1, Z  ), fx-1.0f, fy-1.0f, fz      ),
2153                             grad (hash (X  , Y  , Z+1), fx     , fy     , fz-1.0f ),
2154                             grad (hash (X+1, Y  , Z+1), fx-1.0f, fy     , fz-1.0f ),
2155                             grad (hash (X  , Y+1, Z+1), fx     , fy-1.0f, fz-1.0f ),
2156                             grad (hash (X+1, Y+1, Z+1), fx-1.0f, fy-1.0f, fz-1.0f ),
2157                             u, v, w);
2158     result = scale3 (til_result);
2159     }
2160 }
2161 
2162 
2163 template <typename CGPolicyT = CGDefault, typename H>
perlin(Dual2<Vec3> & result,const H & hash,const Dual2<float> & x,const Dual2<float> & y,const Dual2<float> & z,const Dual2<float> & w)2164 OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<Vec3> &result, const H &hash,
2165                     const Dual2<float> &x, const Dual2<float> &y,
2166                     const Dual2<float> &z, const Dual2<float> &w)
2167 {
2168 #if OIIO_SIMD
2169     if (CGPolicyT::allowSIMD)
2170     {
2171     int X; Dual2<float> fx = floorfrac(x, &X);
2172     int Y; Dual2<float> fy = floorfrac(y, &Y);
2173     int Z; Dual2<float> fz = floorfrac(z, &Z);
2174     int W; Dual2<float> fw = floorfrac(w, &W);
2175     Dual2<float4> fxyzw (float4(fx.val(), fy.val(), fz.val(), fw.val()),
2176                          float4(fx.dx (), fy.dx (), fz.dx (), fw.dx ()),
2177                          float4(fx.dy (), fy.dy (), fz.dy (), fw.dy ()));
2178     Dual2<float4> uvts = fade (fxyzw);
2179 
2180     // We parallelize primarily by computing the hashes and gradients at the
2181     // integer lattice corners simultaneously. We need 8 total (for 3D), so
2182     // we do two sets of 4. (Future opportunity to do all 8 simultaneously
2183     // with AVX.)
2184     static const OIIO_SIMD4_ALIGN int i0101[4] = {0,1,0,1};
2185     static const OIIO_SIMD4_ALIGN int i0011[4] = {0,0,1,1};
2186     int4 cornerx = X + (*(int4*)i0101);
2187     int4 cornery = Y + (*(int4*)i0011);
2188     int4 cornerz = Z;
2189     int4 cornerz1 = Z + int4::One();
2190     int4 cornerw = W;
2191     int4 cornerw1 = W + int4::One();
2192 
2193     // We actually derive 3 hashes (one for each output dimension) for each
2194     // corner.
2195     int4 corner_hash_z0[3], corner_hash_z1[3], corner_hash_z2[3], corner_hash_z3[3];
2196     hash (corner_hash_z0, cornerx, cornery, cornerz,  cornerw);
2197     hash (corner_hash_z1, cornerx, cornery, cornerz1, cornerw);
2198     hash (corner_hash_z2, cornerx, cornery, cornerz,  cornerw1);
2199     hash (corner_hash_z3, cornerx, cornery, cornerz1, cornerw1);
2200 
2201     static const OIIO_SIMD4_ALIGN float f0101[4] = {0,1,0,1};
2202     static const OIIO_SIMD4_ALIGN float f0011[4] = {0,0,1,1};
2203     Dual2<float4> remainderx  = shuffle<0>(fxyzw) - (*(float4*)f0101);
2204     Dual2<float4> remaindery  = shuffle<1>(fxyzw) - (*(float4*)f0011);
2205     Dual2<float4> remainderz  = shuffle<2>(fxyzw);
2206     Dual2<float4> remainderz1 = remainderz - float4::One();
2207     Dual2<float4> remainderw  = shuffle<3>(fxyzw);
2208     Dual2<float4> remainderw1 = remainderw - float4::One();
2209 
2210     Dual2<float> r[3];
2211     for (int i = 0; i < 3; ++i) {
2212         Dual2<float4> corner_grad_z0 = grad (corner_hash_z0[i], remainderx, remaindery, remainderz,  remainderw);
2213         Dual2<float4> corner_grad_z1 = grad (corner_hash_z1[i], remainderx, remaindery, remainderz1, remainderw);
2214         Dual2<float4> corner_grad_z2 = grad (corner_hash_z2[i], remainderx, remaindery, remainderz,  remainderw1);
2215         Dual2<float4> corner_grad_z3 = grad (corner_hash_z3[i], remainderx, remaindery, remainderz1, remainderw1);
2216 
2217         // Interpolate along the w axis first
2218         Dual2<float4> xyz0 = OIIO::lerp (corner_grad_z0, corner_grad_z2, shuffle<3>(uvts));
2219         Dual2<float4> xyz1 = OIIO::lerp (corner_grad_z1, corner_grad_z3, shuffle<3>(uvts));
2220         Dual2<float4> xy = OIIO::lerp (xyz0, xyz1, shuffle<2>(uvts));
2221         // Interpolate along x axis
2222         Dual2<float4> xx = OIIO::lerp (xy, shuffle<1,1,3,3>(xy), shuffle<0>(uvts));
2223         // interpolate along y axis
2224         r[i] = scale4 (extract<0>(OIIO::lerp (xx,shuffle<2>(xx), shuffle<1>(uvts))));
2225 
2226     }
2227     result.set (Vec3 (r[0].val(), r[1].val(), r[2].val()),
2228                 Vec3 (r[0].dx(),  r[1].dx(),  r[2].dx()),
2229                 Vec3 (r[0].dy(),  r[1].dy(),  r[2].dy()));
2230     } else
2231 #endif
2232     {
2233     // ORIGINAL -- non-SIMD
2234     int X; Dual2<float> fx = floorfrac(x, &X);
2235     int Y; Dual2<float> fy = floorfrac(y, &Y);
2236     int Z; Dual2<float> fz = floorfrac(z, &Z);
2237     int W; Dual2<float> fw = floorfrac(w, &W);
2238 
2239     Dual2<float> u = fade(fx);
2240     Dual2<float> v = fade(fy);
2241     Dual2<float> t = fade(fz);
2242     Dual2<float> s = fade(fw);
2243 
2244     auto l_result = OIIO::lerp (
2245                OIIO::trilerp (grad (hash (X  , Y  , Z  , W  ), fx     , fy     , fz     , fw     ),
2246                               grad (hash (X+1, Y  , Z  , W  ), fx-1.0f, fy     , fz     , fw     ),
2247                               grad (hash (X  , Y+1, Z  , W  ), fx     , fy-1.0f, fz     , fw     ),
2248                               grad (hash (X+1, Y+1, Z  , W  ), fx-1.0f, fy-1.0f, fz     , fw     ),
2249                               grad (hash (X  , Y  , Z+1, W  ), fx     , fy     , fz-1.0f, fw     ),
2250                               grad (hash (X+1, Y  , Z+1, W  ), fx-1.0f, fy     , fz-1.0f, fw     ),
2251                               grad (hash (X  , Y+1, Z+1, W  ), fx     , fy-1.0f, fz-1.0f, fw     ),
2252                               grad (hash (X+1, Y+1, Z+1, W  ), fx-1.0f, fy-1.0f, fz-1.0f, fw     ),
2253                               u, v, t),
2254                OIIO::trilerp (grad (hash (X  , Y  , Z  , W+1), fx     , fy     , fz     , fw-1.0f),
2255                               grad (hash (X+1, Y  , Z  , W+1), fx-1.0f, fy     , fz     , fw-1.0f),
2256                               grad (hash (X  , Y+1, Z  , W+1), fx     , fy-1.0f, fz     , fw-1.0f),
2257                               grad (hash (X+1, Y+1, Z  , W+1), fx-1.0f, fy-1.0f, fz     , fw-1.0f),
2258                               grad (hash (X  , Y  , Z+1, W+1), fx     , fy     , fz-1.0f, fw-1.0f),
2259                               grad (hash (X+1, Y  , Z+1, W+1), fx-1.0f, fy     , fz-1.0f, fw-1.0f),
2260                               grad (hash (X  , Y+1, Z+1, W+1), fx     , fy-1.0f, fz-1.0f, fw-1.0f),
2261                               grad (hash (X+1, Y+1, Z+1, W+1), fx-1.0f, fy-1.0f, fz-1.0f, fw-1.0f),
2262                               u, v, t),
2263                s);
2264 
2265     result = scale4 (l_result);
2266     }
2267 }
2268 
2269 
2270 
2271 template<typename CGPolicyT = CGDefault>
2272 struct NoiseImpl {
NoiseImplNoiseImpl2273 	OSL_FORCEINLINE OSL_HOSTDEVICE NoiseImpl () { }
2274 
operatorNoiseImpl2275     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x) const {
2276         HashScalar h;
2277         float perlin_result;
2278         perlin<CGPolicyT>(perlin_result, h, x);
2279         result = 0.5f * (perlin_result + 1.0f);
2280     }
2281 
operatorNoiseImpl2282     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y) const {
2283         HashScalar h;
2284         float perlin_result;
2285         perlin<CGPolicyT>(perlin_result, h, x, y);
2286         result = 0.5f * (perlin_result + 1.0f);
2287     }
2288 
operatorNoiseImpl2289     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p) const {
2290         HashScalar h;
2291         float perlin_result;
2292         perlin<CGPolicyT>(perlin_result, h, p.x, p.y, p.z);
2293         result = 0.5f * (perlin_result + 1.0f);
2294     }
2295 
operatorNoiseImpl2296     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t) const {
2297         HashScalar h;
2298         float perlin_result;
2299         perlin<CGPolicyT>(perlin_result, h, p.x, p.y, p.z, t);
2300         result = 0.5f * (perlin_result + 1.0f);
2301     }
2302 
operatorNoiseImpl2303     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x) const {
2304         HashVector h;
2305         Vec3 perlin_result;
2306         perlin<CGPolicyT>(perlin_result, h, x);
2307         result = 0.5f * (perlin_result + Vec3(1.0f, 1.0f, 1.0f));
2308     }
2309 
operatorNoiseImpl2310     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y) const {
2311         HashVector h;
2312         Vec3 perlin_result;
2313         perlin<CGPolicyT>(perlin_result, h, x, y);
2314         result = 0.5f * (perlin_result + Vec3(1.0f, 1.0f, 1.0f));
2315     }
2316 
operatorNoiseImpl2317     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p) const {
2318         HashVector h;
2319         Vec3 perlin_result;
2320         perlin<CGPolicyT>(perlin_result, h, p.x, p.y, p.z);
2321         result = 0.5f * (perlin_result + Vec3(1.0f, 1.0f, 1.0f));
2322     }
2323 
operatorNoiseImpl2324     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t) const {
2325         HashVector h;
2326         Vec3 perlin_result;
2327         perlin<CGPolicyT>(perlin_result, h, p.x, p.y, p.z, t);
2328         result = 0.5f * (perlin_result + Vec3(1.0f, 1.0f, 1.0f));
2329     }
2330 
2331 
2332     // dual versions
2333 
operatorNoiseImpl2334     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x) const {
2335         HashScalar h;
2336         Dual2<float> perlin_result;
2337         perlin<CGPolicyT>(perlin_result, h, x);
2338         result = 0.5f * (perlin_result + 1.0f);
2339     }
2340 
operatorNoiseImpl2341     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2342         HashScalar h;
2343         Dual2<float> perlin_result;
2344         perlin<CGPolicyT>(perlin_result, h, x, y);
2345         result = 0.5f * (perlin_result + 1.0f);
2346     }
2347 
operatorNoiseImpl2348     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p) const {
2349         HashScalar h;
2350         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2351         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2352         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2353         Dual2<float> perlin_result;
2354         perlin<CGPolicyT>(perlin_result, h, px, py, pz);
2355         result = 0.5f * (perlin_result + 1.0f);
2356     }
2357 
operatorNoiseImpl2358     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2359         HashScalar h;
2360         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2361         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2362         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2363         Dual2<float> perlin_result;
2364         perlin<CGPolicyT>(perlin_result, h, px, py, pz, t);
2365         result = 0.5f * (perlin_result + 1.0f);
2366     }
2367 
operatorNoiseImpl2368     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
2369         HashVector h;
2370         Dual2<Vec3> perlin_result;
2371         perlin<CGPolicyT>(perlin_result, h, x);
2372         result = Vec3(0.5f, 0.5f, 0.5f) * (perlin_result + Vec3(1, 1, 1));
2373     }
2374 
operatorNoiseImpl2375     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2376         HashVector h;
2377         Dual2<Vec3> perlin_result;
2378         perlin<CGPolicyT>(perlin_result, h, x, y);
2379         result = Vec3(0.5f, 0.5f, 0.5f) * (perlin_result + Vec3(1, 1, 1));
2380     }
2381 
operatorNoiseImpl2382     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
2383         HashVector h;
2384         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2385         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2386         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2387         Dual2<Vec3> perlin_result;
2388         perlin<CGPolicyT>(perlin_result, h, px, py, pz);
2389         result = Vec3(0.5f, 0.5f, 0.5f) * (perlin_result + Vec3(1, 1, 1));
2390     }
2391 
operatorNoiseImpl2392     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2393         HashVector h;
2394         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2395         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2396         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2397         Dual2<Vec3> perlin_result;
2398         perlin<CGPolicyT>(perlin_result, h, px, py, pz, t);
2399         result = Vec3(0.5f, 0.5f, 0.5f) * (perlin_result + Vec3(1, 1, 1));
2400     }
2401 };
2402 
2403 struct Noise : NoiseImpl<CGDefault> {};
2404 // Scalar version of Noise that is SIMD friendly suitable to be
2405 // inlined inside of a SIMD loops
2406 struct NoiseScalar : NoiseImpl<CGScalar> {};
2407 
2408 
2409 template<typename CGPolicyT = CGDefault>
2410 struct SNoiseImpl {
SNoiseImplSNoiseImpl2411 	OSL_FORCEINLINE OSL_HOSTDEVICE SNoiseImpl () { }
2412 
operatorSNoiseImpl2413     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x) const {
2414         HashScalar h;
2415         perlin<CGPolicyT>(result, h, x);
2416     }
2417 
operatorSNoiseImpl2418     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y) const {
2419         HashScalar h;
2420         perlin<CGPolicyT>(result, h, x, y);
2421     }
2422 
operatorSNoiseImpl2423     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p) const {
2424         HashScalar h;
2425         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2426     }
2427 
operatorSNoiseImpl2428     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t) const {
2429         HashScalar h;
2430         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2431     }
2432 
operatorSNoiseImpl2433     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x) const {
2434         HashVector h;
2435         perlin<CGPolicyT>(result, h, x);
2436     }
2437 
operatorSNoiseImpl2438     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y) const {
2439         HashVector h;
2440         perlin<CGPolicyT>(result, h, x, y);
2441     }
2442 
operatorSNoiseImpl2443     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p) const {
2444         HashVector h;
2445         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2446     }
2447 
operatorSNoiseImpl2448     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t) const {
2449         HashVector h;
2450         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2451     }
2452 
2453 
2454     // dual versions
2455 
operatorSNoiseImpl2456     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x) const {
2457         HashScalar h;
2458         perlin<CGPolicyT>(result, h, x);
2459     }
2460 
operatorSNoiseImpl2461     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2462         HashScalar h;
2463         perlin<CGPolicyT>(result, h, x, y);
2464     }
2465 
operatorSNoiseImpl2466     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p) const {
2467         HashScalar h;
2468         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2469         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2470         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2471         perlin<CGPolicyT>(result, h, px, py, pz);
2472     }
2473 
operatorSNoiseImpl2474     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2475         HashScalar h;
2476         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2477         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2478         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2479         perlin<CGPolicyT>(result, h, px, py, pz, t);
2480     }
2481 
operatorSNoiseImpl2482     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
2483         HashVector h;
2484         perlin<CGPolicyT>(result, h, x);
2485     }
2486 
2487 
operatorSNoiseImpl2488     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2489         HashVector h;
2490         perlin<CGPolicyT>(result, h, x, y);
2491     }
2492 
operatorSNoiseImpl2493     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
2494         HashVector h;
2495         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2496         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2497         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2498         perlin<CGPolicyT>(result, h, px, py, pz);
2499     }
2500 
operatorSNoiseImpl2501     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2502         HashVector h;
2503         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2504         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2505         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2506         perlin<CGPolicyT>(result, h, px, py, pz, t);
2507     }
2508 };
2509 
2510 struct SNoise : SNoiseImpl<CGDefault> {};
2511 // Scalar version of SNoise that is SIMD friendly suitable to be
2512 // inlined inside of a SIMD loops
2513 struct SNoiseScalar : SNoiseImpl<CGScalar> {};
2514 
2515 
2516 
2517 template<typename CGPolicyT = CGDefault>
2518 struct PeriodicNoiseImpl {
PeriodicNoiseImplPeriodicNoiseImpl2519 	OSL_FORCEINLINE OSL_HOSTDEVICE PeriodicNoiseImpl () { }
2520 
operatorPeriodicNoiseImpl2521     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float px) const {
2522         HashScalarPeriodic h(px);
2523         perlin<CGPolicyT>(result, h, x);
2524         result = 0.5f * (result + 1.0f);
2525     }
2526 
operatorPeriodicNoiseImpl2527     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y, float px, float py) const {
2528         HashScalarPeriodic h(px, py);
2529         perlin<CGPolicyT>(result, h, x, y);
2530         result = 0.5f * (result + 1.0f);
2531     }
2532 
operatorPeriodicNoiseImpl2533     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, const Vec3 &pp) const {
2534         HashScalarPeriodic h(pp.x, pp.y, pp.z);
2535         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2536         result = 0.5f * (result + 1.0f);
2537     }
2538 
operatorPeriodicNoiseImpl2539     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t, const Vec3 &pp, float pt) const {
2540         HashScalarPeriodic h(pp.x, pp.y, pp.z, pt);
2541         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2542         result = 0.5f * (result + 1.0f);
2543     }
2544 
operatorPeriodicNoiseImpl2545     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float px) const {
2546         HashVectorPeriodic h(px);
2547         perlin<CGPolicyT>(result, h, x);
2548         result = 0.5f * (result + Vec3(1.0f, 1.0f, 1.0f));
2549     }
2550 
operatorPeriodicNoiseImpl2551     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y, float px, float py) const {
2552         HashVectorPeriodic h(px, py);
2553         perlin<CGPolicyT>(result, h, x, y);
2554         result = 0.5f * (result + Vec3(1.0f, 1.0f, 1.0f));
2555     }
2556 
operatorPeriodicNoiseImpl2557     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, const Vec3 &pp) const {
2558         HashVectorPeriodic h(pp.x, pp.y, pp.z);
2559         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2560         result = 0.5f * (result + Vec3(1.0f, 1.0f, 1.0f));
2561     }
2562 
operatorPeriodicNoiseImpl2563     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t, const Vec3 &pp, float pt) const {
2564         HashVectorPeriodic h(pp.x, pp.y, pp.z, pt);
2565         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2566         result = 0.5f * (result + Vec3(1.0f, 1.0f, 1.0f));
2567     }
2568 
2569     // dual versions
2570 
operatorPeriodicNoiseImpl2571     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, float px) const {
2572         HashScalarPeriodic h(px);
2573         perlin<CGPolicyT>(result, h, x);
2574         result = 0.5f * (result + 1.0f);
2575     }
2576 
operatorPeriodicNoiseImpl2577     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, const Dual2<float> &y,
2578             float px, float py) const {
2579         HashScalarPeriodic h(px, py);
2580         perlin<CGPolicyT>(result, h, x, y);
2581         result = 0.5f * (result + 1.0f);
2582     }
2583 
operatorPeriodicNoiseImpl2584     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Vec3 &pp) const {
2585         HashScalarPeriodic h(pp.x, pp.y, pp.z);
2586         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2587         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2588         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2589         perlin<CGPolicyT>(result, h, px, py, pz);
2590         result = 0.5f * (result + 1.0f);
2591     }
2592 
operatorPeriodicNoiseImpl2593     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Dual2<float> &t,
2594             const Vec3 &pp, float pt) const {
2595         HashScalarPeriodic h(pp.x, pp.y, pp.z, pt);
2596         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2597         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2598         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2599         perlin<CGPolicyT>(result, h, px, py, pz, t);
2600         result = 0.5f * (result + 1.0f);
2601     }
2602 
operatorPeriodicNoiseImpl2603     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, float px) const {
2604         HashVectorPeriodic h(px);
2605         perlin<CGPolicyT>(result, h, x);
2606         result = Vec3(0.5f, 0.5f, 0.5f) * (result + Vec3(1.0f, 1.0f, 1.0f));
2607     }
2608 
operatorPeriodicNoiseImpl2609     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y,
2610             float px, float py) const {
2611         HashVectorPeriodic h(px, py);
2612         perlin<CGPolicyT>(result, h, x, y);
2613         result = Vec3(0.5f, 0.5f, 0.5f) * (result + Vec3(1.0f, 1.0f, 1.0f));
2614     }
2615 
operatorPeriodicNoiseImpl2616     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Vec3 &pp) const {
2617         HashVectorPeriodic h(pp.x, pp.y, pp.z);
2618         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2619         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2620         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2621         perlin<CGPolicyT>(result, h, px, py, pz);
2622         result = Vec3(0.5f, 0.5f, 0.5f) * (result + Vec3(1.0f, 1.0f, 1.0f));
2623     }
2624 
operatorPeriodicNoiseImpl2625     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t,
2626             const Vec3 &pp, float pt) const {
2627         HashVectorPeriodic h(pp.x, pp.y, pp.z, pt);
2628         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2629         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2630         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2631         perlin<CGPolicyT>(result, h, px, py, pz, t);
2632         result = Vec3(0.5f, 0.5f, 0.5f) * (result + Vec3(1.0f, 1.0f, 1.0f));
2633     }
2634 
2635 };
2636 
2637 struct PeriodicNoise : PeriodicNoiseImpl<CGDefault> {};
2638 // Scalar version of PeriodicNoise that is SIMD friendly suitable to be
2639 // inlined inside of a SIMD loops
2640 struct PeriodicNoiseScalar : PeriodicNoiseImpl<CGScalar> {};
2641 
2642 template<typename CGPolicyT = CGDefault>
2643 struct PeriodicSNoiseImpl {
PeriodicSNoiseImplPeriodicSNoiseImpl2644 	OSL_FORCEINLINE OSL_HOSTDEVICE PeriodicSNoiseImpl () { }
2645 
operatorPeriodicSNoiseImpl2646     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float px) const {
2647         HashScalarPeriodic h(px);
2648         perlin<CGPolicyT>(result, h, x);
2649     }
2650 
operatorPeriodicSNoiseImpl2651     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, float x, float y, float px, float py) const {
2652         HashScalarPeriodic h(px, py);
2653         perlin<CGPolicyT>(result, h, x, y);
2654     }
2655 
operatorPeriodicSNoiseImpl2656     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, const Vec3 &pp) const {
2657         HashScalarPeriodic h(pp.x, pp.y, pp.z);
2658         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2659     }
2660 
operatorPeriodicSNoiseImpl2661     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t, const Vec3 &pp, float pt) const {
2662         HashScalarPeriodic h(pp.x, pp.y, pp.z, pt);
2663         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2664     }
2665 
operatorPeriodicSNoiseImpl2666     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float px) const {
2667         HashVectorPeriodic h(px);
2668         perlin<CGPolicyT>(result, h, x);
2669     }
2670 
operatorPeriodicSNoiseImpl2671     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y, float px, float py) const {
2672         HashVectorPeriodic h(px, py);
2673         perlin<CGPolicyT>(result, h, x, y);
2674     }
2675 
operatorPeriodicSNoiseImpl2676     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, const Vec3 &pp) const {
2677         HashVectorPeriodic h(pp.x, pp.y, pp.z);
2678         perlin<CGPolicyT>(result, h, p.x, p.y, p.z);
2679     }
2680 
operatorPeriodicSNoiseImpl2681     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t, const Vec3 &pp, float pt) const {
2682         HashVectorPeriodic h(pp.x, pp.y, pp.z, pt);
2683         perlin<CGPolicyT>(result, h, p.x, p.y, p.z, t);
2684     }
2685 
2686     // dual versions
2687 
operatorPeriodicSNoiseImpl2688     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, float px) const {
2689         HashScalarPeriodic h(px);
2690         perlin<CGPolicyT>(result, h, x);
2691     }
2692 
operatorPeriodicSNoiseImpl2693     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x, const Dual2<float> &y,
2694             float px, float py) const {
2695         HashScalarPeriodic h(px, py);
2696         perlin<CGPolicyT>(result, h, x, y);
2697     }
2698 
operatorPeriodicSNoiseImpl2699     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Vec3 &pp) const {
2700         HashScalarPeriodic h(pp.x, pp.y, pp.z);
2701         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2702         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2703         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2704         perlin<CGPolicyT>(result, h, px, py, pz);
2705     }
2706 
operatorPeriodicSNoiseImpl2707     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p, const Dual2<float> &t,
2708             const Vec3 &pp, float pt) const {
2709         HashScalarPeriodic h(pp.x, pp.y, pp.z, pt);
2710         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2711         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2712         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2713         perlin<CGPolicyT>(result, h, px, py, pz, t);
2714     }
2715 
operatorPeriodicSNoiseImpl2716     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, float px) const {
2717         HashVectorPeriodic h(px);
2718         perlin<CGPolicyT>(result, h, x);
2719     }
2720 
operatorPeriodicSNoiseImpl2721     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y,
2722             float px, float py) const {
2723         HashVectorPeriodic h(px, py);
2724         perlin<CGPolicyT>(result, h, x, y);
2725     }
2726 
operatorPeriodicSNoiseImpl2727     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Vec3 &pp) const {
2728         HashVectorPeriodic h(pp.x, pp.y, pp.z);
2729         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2730         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2731         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2732         perlin<CGPolicyT>(result, h, px, py, pz);
2733     }
2734 
operatorPeriodicSNoiseImpl2735     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t,
2736             const Vec3 &pp, float pt) const {
2737         HashVectorPeriodic h(pp.x, pp.y, pp.z, pt);
2738         Dual2<float> px(p.val().x, p.dx().x, p.dy().x);
2739         Dual2<float> py(p.val().y, p.dx().y, p.dy().y);
2740         Dual2<float> pz(p.val().z, p.dx().z, p.dy().z);
2741         perlin<CGPolicyT>(result, h, px, py, pz, t);
2742     }
2743 
2744 };
2745 
2746 struct PeriodicSNoise : PeriodicSNoiseImpl<CGDefault> {};
2747 // Scalar version of PeriodicSNoise that is SIMD friendly suitable to be
2748 // inlined inside of a SIMD loops
2749 struct PeriodicSNoiseScalar : PeriodicSNoiseImpl<CGScalar> {};
2750 
2751 
2752 struct SimplexNoise {
SimplexNoiseSimplexNoise2753     OSL_HOSTDEVICE SimplexNoise () { }
2754 
operatorSimplexNoise2755     inline OSL_HOSTDEVICE void operator() (float &result, float x) const {
2756         result = simplexnoise1 (x);
2757     }
2758 
operatorSimplexNoise2759     inline OSL_HOSTDEVICE void operator() (float &result, float x, float y) const {
2760         result = simplexnoise2 (x, y);
2761     }
2762 
operatorSimplexNoise2763     inline OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p) const {
2764         result = simplexnoise3 (p.x, p.y, p.z);
2765     }
2766 
operatorSimplexNoise2767     inline OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t) const {
2768         result = simplexnoise4 (p.x, p.y, p.z, t);
2769     }
2770 
operatorSimplexNoise2771     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x) const {
2772         result.x = simplexnoise1 (x, 0);
2773         result.y = simplexnoise1 (x, 1);
2774         result.z = simplexnoise1 (x, 2);
2775     }
2776 
operatorSimplexNoise2777     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y) const {
2778         result.x = simplexnoise2 (x, y, 0);
2779         result.y = simplexnoise2 (x, y, 1);
2780         result.z = simplexnoise2 (x, y, 2);
2781     }
2782 
operatorSimplexNoise2783     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p) const {
2784         result.x = simplexnoise3 (p.x, p.y, p.z, 0);
2785         result.y = simplexnoise3 (p.x, p.y, p.z, 1);
2786         result.z = simplexnoise3 (p.x, p.y, p.z, 2);
2787     }
2788 
operatorSimplexNoise2789     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t) const {
2790         result.x = simplexnoise4 (p.x, p.y, p.z, t, 0);
2791         result.y = simplexnoise4 (p.x, p.y, p.z, t, 1);
2792         result.z = simplexnoise4 (p.x, p.y, p.z, t, 2);
2793     }
2794 
2795 
2796     // dual versions
2797 
operatorSimplexNoise2798     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x,
2799                                            int seed=0) const {
2800         float r, dndx;
2801         r = simplexnoise1 (x.val(), seed, &dndx);
2802         result.set (r, dndx * x.dx(), dndx * x.dy());
2803     }
2804 
operatorSimplexNoise2805     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x,
2806                                            const Dual2<float> &y, int seed=0) const {
2807         float r, dndx, dndy;
2808         r = simplexnoise2 (x.val(), y.val(), seed, &dndx, &dndy);
2809         result.set (r, dndx * x.dx() + dndy * y.dx(),
2810                        dndx * x.dy() + dndy * y.dy());
2811     }
2812 
operatorSimplexNoise2813     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p,
2814                                            int seed=0) const {
2815         float r, dndx, dndy, dndz;
2816         r = simplexnoise3 (p.val().x, p.val().y, p.val().z,
2817                            seed, &dndx, &dndy, &dndz);
2818         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z,
2819                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z);
2820     }
2821 
operatorSimplexNoise2822     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p,
2823                                            const Dual2<float> &t, int seed=0) const {
2824         float r, dndx, dndy, dndz, dndt;
2825         r = simplexnoise4 (p.val().x, p.val().y, p.val().z, t.val(),
2826                            seed, &dndx, &dndy, &dndz, &dndt);
2827         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z + dndt * t.dx(),
2828                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z + dndt * t.dy());
2829     }
2830 
operatorSimplexNoise2831     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
2832         Dual2<float> r0, r1, r2;
2833         (*this)(r0, x, 0);
2834         (*this)(r1, x, 1);
2835         (*this)(r2, x, 2);
2836         result = make_Vec3 (r0, r1, r2);
2837     }
2838 
operatorSimplexNoise2839     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2840         Dual2<float> r0, r1, r2;
2841         (*this)(r0, x, y, 0);
2842         (*this)(r1, x, y, 1);
2843         (*this)(r2, x, y, 2);
2844         result = make_Vec3 (r0, r1, r2);
2845     }
2846 
operatorSimplexNoise2847     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
2848         Dual2<float> r0, r1, r2;
2849         (*this)(r0, p, 0);
2850         (*this)(r1, p, 1);
2851         (*this)(r2, p, 2);
2852         result = make_Vec3 (r0, r1, r2);
2853     }
2854 
operatorSimplexNoise2855     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2856         Dual2<float> r0, r1, r2;
2857         (*this)(r0, p, t, 0);
2858         (*this)(r1, p, t, 1);
2859         (*this)(r2, p, t, 2);
2860         result = make_Vec3 (r0, r1, r2);
2861     }
2862 };
2863 
2864 
2865 // Scalar version of SimplexNoise that is SIMD friendly suitable to be
2866 // inlined inside of a SIMD loops
2867 struct SimplexNoiseScalar {
SimplexNoiseScalarSimplexNoiseScalar2868     SimplexNoiseScalar () { }
2869 
operatorSimplexNoiseScalar2870     OSL_FORCEINLINE void operator() (float &result, float x) const {
2871         result = sfm::simplexnoise1<0/* seed */>(x);
2872     }
2873 
operatorSimplexNoiseScalar2874     OSL_FORCEINLINE void operator() (float &result, float x, float y) const {
2875         result = sfm::simplexnoise2<0/* seed */>(x, y);
2876     }
2877 
operatorSimplexNoiseScalar2878     OSL_FORCEINLINE void operator()(float &result, const Vec3 &p) const {
2879         result = sfm::simplexnoise3<0/* seed */>(p.x, p.y, p.z);
2880     }
2881 
operatorSimplexNoiseScalar2882     OSL_FORCEINLINE void operator()(float &result, const Vec3 &p, float t) const {
2883         result = sfm::simplexnoise4<0/* seed */>(p.x, p.y, p.z, t);
2884     }
2885 
operatorSimplexNoiseScalar2886     OSL_FORCEINLINE void operator() (Vec3 &result, float x) const {
2887         result.x = sfm::simplexnoise1<0/* seed */>(x);
2888         result.y = sfm::simplexnoise1<1/* seed */>(x);
2889         result.z = sfm::simplexnoise1<2/* seed */>(x);
2890     }
2891 
operatorSimplexNoiseScalar2892     OSL_FORCEINLINE void operator() (Vec3 &result, float x, float y) const {
2893         result.x = sfm::simplexnoise2<0/* seed */>(x, y);
2894         result.y = sfm::simplexnoise2<1/* seed */>(x, y);
2895         result.z = sfm::simplexnoise2<2/* seed */>(x, y);
2896     }
2897 
operatorSimplexNoiseScalar2898     OSL_FORCEINLINE void operator() (Vec3 &result, const Vec3 &p) const {
2899         result.x = sfm::simplexnoise3<0/* seed */>(p.x, p.y, p.z);
2900         result.y = sfm::simplexnoise3<1/* seed */>(p.x, p.y, p.z);
2901         result.z = sfm::simplexnoise3<2/* seed */>(p.x, p.y, p.z);
2902     }
2903 
operatorSimplexNoiseScalar2904     OSL_FORCEINLINE void operator() (Vec3 &result, const Vec3 &p, float t) const {
2905         result.x = sfm::simplexnoise4<0/* seed */>(p.x, p.y, p.z, t);
2906         result.y = sfm::simplexnoise4<1/* seed */>(p.x, p.y, p.z, t);
2907         result.z = sfm::simplexnoise4<2/* seed */>(p.x, p.y, p.z, t);
2908     }
2909 
2910 
2911     // dual versions
2912     template<int seed=0>
operatorSimplexNoiseScalar2913     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<float> &x) const {
2914         float r, dndx;
2915         r = sfm::simplexnoise1<seed>(x.val(), sfm::DxRef(dndx));
2916         result.set (r, dndx * x.dx(), dndx * x.dy());
2917     }
2918 
2919     template<int seedT=0>
operatorSimplexNoiseScalar2920     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2921         float r, dndx, dndy;
2922         r = sfm::simplexnoise2<seedT> (x.val(), y.val(), sfm::DxDyRef(dndx, dndy));
2923         result.set (r, dndx * x.dx() + dndy * y.dx(),
2924                        dndx * x.dy() + dndy * y.dy());
2925     }
2926 
2927     template<int seed=0>
operatorSimplexNoiseScalar2928     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<Vec3> &p) const {
2929         float dndx, dndy, dndz;
2930         const Vec3 &p_val = p.val();
2931         float r = sfm::simplexnoise3<seed>(p_val.x, p_val.y, p_val.z, sfm::DxDyDzRef(dndx, dndy, dndz));
2932         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z,
2933                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z);
2934     }
2935 
2936     template<int seed=0>
operatorSimplexNoiseScalar2937     OSL_FORCEINLINE void operator()(Dual2<float> &result, const Dual2<Vec3> &p,
2938                             const Dual2<float> &t) const {
2939         float dndx, dndy, dndz, dndt;
2940         float r = sfm::simplexnoise4<seed> (p.val().x, p.val().y, p.val().z, t.val(),
2941                            sfm::DxDyDzDwRef(dndx, dndy, dndz, dndt));
2942         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z + dndt * t.dx(),
2943                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z + dndt * t.dy());
2944     }
2945 
operatorSimplexNoiseScalar2946     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
2947         Dual2<float> r0, r1, r2;
2948         operator()<0>(r0, x);
2949         operator()<1>(r1, x);
2950         operator()<2>(r2, x);
2951         result = make_Vec3 (r0, r1, r2);
2952     }
2953 
operatorSimplexNoiseScalar2954     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
2955         Dual2<float> r0, r1, r2;
2956         operator()<0>(r0, x, y);
2957         operator()<1>(r1, x, y);
2958         operator()<2>(r2, x, y);
2959         result = make_Vec3 (r0, r1, r2);
2960     }
2961 
operatorSimplexNoiseScalar2962     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
2963         Dual2<float> r0, r1, r2;
2964         operator()<0>(r0, p);
2965         operator()<1>(r1, p);
2966         operator()<2>(r2, p);
2967         result = make_Vec3 (r0, r1, r2);
2968     }
2969 
operatorSimplexNoiseScalar2970     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
2971         Dual2<float> r0, r1, r2;
2972         operator()<0>(r0, p, t);
2973         operator()<1>(r1, p, t);
2974         operator()<2>(r2, p, t);
2975         result = make_Vec3 (r0, r1, r2);
2976     }
2977 };
2978 
2979 // Unsigned simplex noise
2980 struct USimplexNoise {
USimplexNoiseUSimplexNoise2981     OSL_HOSTDEVICE USimplexNoise () { }
2982 
operatorUSimplexNoise2983     inline OSL_HOSTDEVICE void operator() (float &result, float x) const {
2984         result = 0.5f * (simplexnoise1 (x) + 1.0f);
2985     }
2986 
operatorUSimplexNoise2987     inline OSL_HOSTDEVICE void operator() (float &result, float x, float y) const {
2988         result = 0.5f * (simplexnoise2 (x, y) + 1.0f);
2989     }
2990 
operatorUSimplexNoise2991     inline OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p) const {
2992         result = 0.5f * (simplexnoise3 (p.x, p.y, p.z) + 1.0f);
2993     }
2994 
operatorUSimplexNoise2995     inline OSL_HOSTDEVICE void operator() (float &result, const Vec3 &p, float t) const {
2996         result = 0.5f * (simplexnoise4 (p.x, p.y, p.z, t) + 1.0f);
2997     }
2998 
operatorUSimplexNoise2999     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x) const {
3000         result.x = 0.5f * (simplexnoise1 (x, 0) + 1.0f);
3001         result.y = 0.5f * (simplexnoise1 (x, 1) + 1.0f);
3002         result.z = 0.5f * (simplexnoise1 (x, 2) + 1.0f);
3003     }
3004 
operatorUSimplexNoise3005     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, float x, float y) const {
3006         result.x = 0.5f * (simplexnoise2 (x, y, 0) + 1.0f);
3007         result.y = 0.5f * (simplexnoise2 (x, y, 1) + 1.0f);
3008         result.z = 0.5f * (simplexnoise2 (x, y, 2) + 1.0f);
3009     }
3010 
operatorUSimplexNoise3011     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p) const {
3012         result.x = 0.5f * (simplexnoise3 (p.x, p.y, p.z, 0) + 1.0f);
3013         result.y = 0.5f * (simplexnoise3 (p.x, p.y, p.z, 1) + 1.0f);
3014         result.z = 0.5f * (simplexnoise3 (p.x, p.y, p.z, 2) + 1.0f);
3015     }
3016 
operatorUSimplexNoise3017     OSL_FORCEINLINE OSL_HOSTDEVICE void operator() (Vec3 &result, const Vec3 &p, float t) const {
3018         result.x = 0.5f * (simplexnoise4 (p.x, p.y, p.z, t, 0) + 1.0f);
3019         result.y = 0.5f * (simplexnoise4 (p.x, p.y, p.z, t, 1) + 1.0f);
3020         result.z = 0.5f * (simplexnoise4 (p.x, p.y, p.z, t, 2) + 1.0f);
3021     }
3022 
3023 
3024     // dual versions
3025 
operatorUSimplexNoise3026     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x,
3027                                            int seed=0) const {
3028         float r, dndx;
3029         r = simplexnoise1 (x.val(), seed, &dndx);
3030         r = 0.5f * (r + 1.0f);
3031         dndx *= 0.5f;
3032         result.set (r, dndx * x.dx(), dndx * x.dy());
3033     }
3034 
operatorUSimplexNoise3035     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<float> &x,
3036                                            const Dual2<float> &y, int seed=0) const {
3037         float r, dndx, dndy;
3038         r = simplexnoise2 (x.val(), y.val(), seed, &dndx, &dndy);
3039         r = 0.5f * (r + 1.0f);
3040         dndx *= 0.5f;
3041         dndy *= 0.5f;
3042         result.set (r, dndx * x.dx() + dndy * y.dx(),
3043                        dndx * x.dy() + dndy * y.dy());
3044     }
3045 
operatorUSimplexNoise3046     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p,
3047                                            int seed=0) const {
3048         float r, dndx, dndy, dndz;
3049         r = simplexnoise3 (p.val().x, p.val().y, p.val().z,
3050                            seed, &dndx, &dndy, &dndz);
3051         r = 0.5f * (r + 1.0f);
3052         dndx *= 0.5f;
3053         dndy *= 0.5f;
3054         dndz *= 0.5f;
3055         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z,
3056                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z);
3057     }
3058 
operatorUSimplexNoise3059     inline OSL_HOSTDEVICE void operator() (Dual2<float> &result, const Dual2<Vec3> &p,
3060                                            const Dual2<float> &t, int seed=0) const {
3061         float r, dndx, dndy, dndz, dndt;
3062         r = simplexnoise4 (p.val().x, p.val().y, p.val().z, t.val(),
3063                            seed, &dndx, &dndy, &dndz, &dndt);
3064         r = 0.5f * (r + 1.0f);
3065         dndx *= 0.5f;
3066         dndy *= 0.5f;
3067         dndz *= 0.5f;
3068         dndt *= 0.5f;
3069         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z + dndt * t.dx(),
3070                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z + dndt * t.dy());
3071     }
3072 
operatorUSimplexNoise3073     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
3074         Dual2<float> r0, r1, r2;
3075         (*this)(r0, x, 0);
3076         (*this)(r1, x, 1);
3077         (*this)(r2, x, 2);
3078         result = make_Vec3 (r0, r1, r2);
3079     }
3080 
operatorUSimplexNoise3081     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
3082         Dual2<float> r0, r1, r2;
3083         (*this)(r0, x, y, 0);
3084         (*this)(r1, x, y, 1);
3085         (*this)(r2, x, y, 2);
3086         result = make_Vec3 (r0, r1, r2);
3087     }
3088 
operatorUSimplexNoise3089     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
3090         Dual2<float> r0, r1, r2;
3091         (*this)(r0, p, 0);
3092         (*this)(r1, p, 1);
3093         (*this)(r2, p, 2);
3094         result = make_Vec3 (r0, r1, r2);
3095     }
3096 
operatorUSimplexNoise3097     inline OSL_HOSTDEVICE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
3098         Dual2<float> r0, r1, r2;
3099         (*this)(r0, p, t, 0);
3100         (*this)(r1, p, t, 1);
3101         (*this)(r2, p, t, 2);
3102         result = make_Vec3 (r0, r1, r2);
3103     }
3104 
3105 };
3106 
3107 // Scalar version of USimplexNoise that is SIMD friendly suitable to be
3108 // inlined inside of a SIMD loops
3109 struct USimplexNoiseScalar {
USimplexNoiseScalarUSimplexNoiseScalar3110 	OSL_FORCEINLINE USimplexNoiseScalar () { }
3111 
operatorUSimplexNoiseScalar3112     OSL_FORCEINLINE void operator() (float &result, float x) const {
3113         result = 0.5f * (sfm::simplexnoise1<0/* seed */>(x) + 1.0f);
3114     }
3115 
operatorUSimplexNoiseScalar3116     OSL_FORCEINLINE void operator() (float &result, float x, float y) const {
3117         result = 0.5f * (sfm::simplexnoise2<0/* seed */>(x, y) + 1.0f);
3118     }
3119 
operatorUSimplexNoiseScalar3120     OSL_FORCEINLINE void operator() (float &result, const Vec3 &p) const {
3121         result = 0.5f * (sfm::simplexnoise3<0/* seed */>(p.x, p.y, p.z) + 1.0f);
3122     }
3123 
operatorUSimplexNoiseScalar3124     OSL_FORCEINLINE void operator() (float &result, const Vec3 &p, float t) const {
3125         result = 0.5f * (sfm::simplexnoise4<0/* seed */> (p.x, p.y, p.z, t) + 1.0f);
3126     }
3127 
operatorUSimplexNoiseScalar3128     OSL_FORCEINLINE void operator() (Vec3 &result, float x) const {
3129         result.x = 0.5f * (sfm::simplexnoise1<0/* seed */>(x) + 1.0f);
3130         result.y = 0.5f * (sfm::simplexnoise1<1/* seed */>(x) + 1.0f);
3131         result.z = 0.5f * (sfm::simplexnoise1<2/* seed */>(x) + 1.0f);
3132     }
3133 
operatorUSimplexNoiseScalar3134     OSL_FORCEINLINE void operator() (Vec3 &result, float x, float y) const {
3135         result.x = 0.5f * (sfm::simplexnoise2<0>(x, y) + 1.0f);
3136         result.y = 0.5f * (sfm::simplexnoise2<1>(x, y) + 1.0f);
3137         result.z = 0.5f * (sfm::simplexnoise2<2>(x, y) + 1.0f);
3138     }
3139 
operatorUSimplexNoiseScalar3140     OSL_FORCEINLINE void operator() (Vec3 &result, const Vec3 &p) const {
3141         result.x = 0.5f * (sfm::simplexnoise3<0/* seed */>(p.x, p.y, p.z) + 1.0f);
3142         result.y = 0.5f * (sfm::simplexnoise3<1/* seed */>(p.x, p.y, p.z) + 1.0f);
3143         result.z = 0.5f * (sfm::simplexnoise3<2/* seed */>(p.x, p.y, p.z) + 1.0f);
3144     }
3145 
operatorUSimplexNoiseScalar3146     OSL_FORCEINLINE void operator() (Vec3 &result, const Vec3 &p, float t) const {
3147         result.x = 0.5f * (sfm::simplexnoise4<0/* seed */>(p.x, p.y, p.z, t) + 1.0f);
3148         result.y = 0.5f * (sfm::simplexnoise4<1/* seed */>(p.x, p.y, p.z, t) + 1.0f);
3149         result.z = 0.5f * (sfm::simplexnoise4<2/* seed */>(p.x, p.y, p.z, t) + 1.0f);
3150     }
3151 
3152     // dual versions
3153     template<int seed=0>
operatorUSimplexNoiseScalar3154     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<float> &x) const {
3155         float r, dndx;
3156         r = sfm::simplexnoise1<seed>(x.val(), sfm::DxRef(dndx));
3157         r = 0.5f * (r + 1.0f);
3158         dndx *= 0.5f;
3159         result.set (r, dndx * x.dx(), dndx * x.dy());
3160     }
3161 
3162     template<int seedT=0>
operatorUSimplexNoiseScalar3163     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<float> &x,
3164                              const Dual2<float> &y) const {
3165         float r, dndx, dndy;
3166         r = sfm::simplexnoise2<seedT> (x.val(), y.val(), sfm::DxDyRef(dndx, dndy));
3167         r = 0.5f * (r + 1.0f);
3168         dndx *= 0.5f;
3169         dndy *= 0.5f;
3170         result.set (r, dndx * x.dx() + dndy * y.dx(),
3171                        dndx * x.dy() + dndy * y.dy());
3172     }
3173 
3174     template<int seed=0>
operatorUSimplexNoiseScalar3175     OSL_FORCEINLINE void operator() (Dual2<float> &result, const Dual2<Vec3> &p) const {
3176         float r, dndx, dndy, dndz;
3177         const Vec3 &p_val = p.val();
3178         r = sfm::simplexnoise3<seed>(p_val.x, p_val.y, p_val.z, sfm::DxDyDzRef(dndx, dndy, dndz));
3179         r = 0.5f * (r + 1.0f);
3180         dndx *= 0.5f;
3181         dndy *= 0.5f;
3182         dndz *= 0.5f;
3183         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z,
3184                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z);
3185     }
3186 
3187     template<int seed=0>
operatorUSimplexNoiseScalar3188     OSL_FORCEINLINE void operator()(Dual2<float> &result, const Dual2<Vec3> &p,
3189                             const Dual2<float> &t) const {
3190         float dndx, dndy, dndz, dndt;
3191         float r = sfm::simplexnoise4<seed> (p.val().x, p.val().y, p.val().z, t.val(),
3192                            sfm::DxDyDzDwRef(dndx, dndy, dndz, dndt));
3193         r = 0.5f * (r + 1.0f);
3194         dndx *= 0.5f;
3195         dndy *= 0.5f;
3196         dndz *= 0.5f;
3197         dndt *= 0.5f;
3198         result.set (r, dndx * p.dx().x + dndy * p.dx().y + dndz * p.dx().z + dndt * t.dx(),
3199                        dndx * p.dy().x + dndy * p.dy().y + dndz * p.dy().z + dndt * t.dy());
3200     }
3201 
operatorUSimplexNoiseScalar3202     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<float> &x) const {
3203         Dual2<float> r0, r1, r2;
3204         operator()<0>(r0, x);
3205         operator()<1>(r1, x);
3206         operator()<2>(r2, x);
3207         result = make_Vec3 (r0, r1, r2);
3208     }
3209 
operatorUSimplexNoiseScalar3210     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<float> &x, const Dual2<float> &y) const {
3211         Dual2<float> r0, r1, r2;
3212         operator()<0>(r0, x, y);
3213         operator()<1>(r1, x, y);
3214         operator()<2>(r2, x, y);
3215         result = make_Vec3 (r0, r1, r2);
3216     }
3217 
operatorUSimplexNoiseScalar3218     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p) const {
3219         Dual2<float> r0, r1, r2;
3220         operator()<0>(r0, p);
3221         operator()<1>(r1, p);
3222         operator()<2>(r2, p);
3223         result = make_Vec3 (r0, r1, r2);
3224     }
3225 
operatorUSimplexNoiseScalar3226     OSL_FORCEINLINE void operator() (Dual2<Vec3> &result, const Dual2<Vec3> &p, const Dual2<float> &t) const {
3227         Dual2<float> r0, r1, r2;
3228         operator()<0>(r0, p, t);
3229         operator()<1>(r1, p, t);
3230         operator()<2>(r2, p, t);
3231         result = make_Vec3 (r0, r1, r2);
3232     }
3233 
3234 };
3235 } // anonymous namespace
3236 
3237 
3238 
3239 OSLNOISEPUBLIC OSL_HOSTDEVICE
3240 Dual2<float> gabor (const Dual2<Vec3> &P, const NoiseParams *opt);
3241 
3242 
3243 
3244 OSLNOISEPUBLIC OSL_HOSTDEVICE
3245 Dual2<float> gabor (const Dual2<float> &x, const Dual2<float> &y,
3246                     const NoiseParams *opt);
3247 
3248 OSLNOISEPUBLIC OSL_HOSTDEVICE
3249 Dual2<float> gabor (const Dual2<float> &x, const NoiseParams *opt);
3250 
3251 OSLNOISEPUBLIC OSL_HOSTDEVICE
3252 Dual2<Vec3> gabor3 (const Dual2<Vec3> &P, const NoiseParams *opt);
3253 
3254 OSLNOISEPUBLIC OSL_HOSTDEVICE
3255 Dual2<Vec3> gabor3 (const Dual2<float> &x, const Dual2<float> &y,
3256                     const NoiseParams *opt);
3257 
3258 OSLNOISEPUBLIC OSL_HOSTDEVICE
3259 Dual2<Vec3> gabor3 (const Dual2<float> &x, const NoiseParams *opt);
3260 
3261 OSLNOISEPUBLIC OSL_HOSTDEVICE
3262 Dual2<float> pgabor (const Dual2<Vec3> &P, const Vec3 &Pperiod,
3263                      const NoiseParams *opt);
3264 
3265 OSLNOISEPUBLIC OSL_HOSTDEVICE
3266 Dual2<float> pgabor (const Dual2<float> &x, const Dual2<float> &y,
3267                      float xperiod, float yperiod, const NoiseParams *opt);
3268 
3269 OSLNOISEPUBLIC OSL_HOSTDEVICE
3270 Dual2<float> pgabor (const Dual2<float> &x, float xperiod,
3271                      const NoiseParams *opt);
3272 
3273 OSLNOISEPUBLIC OSL_HOSTDEVICE
3274 Dual2<Vec3> pgabor3 (const Dual2<Vec3> &P, const Vec3 &Pperiod,
3275                      const NoiseParams *opt);
3276 
3277 OSLNOISEPUBLIC OSL_HOSTDEVICE
3278 Dual2<Vec3> pgabor3 (const Dual2<float> &x, const Dual2<float> &y,
3279                      float xperiod, float yperiod, const NoiseParams *opt);
3280 
3281 OSLNOISEPUBLIC OSL_HOSTDEVICE
3282 Dual2<Vec3> pgabor3 (const Dual2<float> &x, float xperiod,
3283                      const NoiseParams *opt);
3284 
3285 
3286 
3287 }; // namespace pvt
3288 
3289 namespace oslnoise {
3290 
3291 #define DECLNOISE(name,impl)                    \
3292     template <class S> OSL_HOSTDEVICE           \
3293     inline float name (S x) {                   \
3294         pvt::impl noise;                        \
3295         float r;                                \
3296         noise (r, x);                           \
3297         return r;                               \
3298     }                                           \
3299                                                 \
3300     template <class S, class T> OSL_HOSTDEVICE  \
3301     inline float name (S x, T y) {              \
3302         pvt::impl noise;                        \
3303         float r;                                \
3304         noise (r, x, y);                        \
3305         return r;                               \
3306     }                                           \
3307                                                 \
3308     template <class S> OSL_HOSTDEVICE           \
3309     inline Vec3 v ## name (S x) {               \
3310         pvt::impl noise;                        \
3311         Vec3 r;                                 \
3312         noise (r, x);                           \
3313         return r;                               \
3314     }                                           \
3315                                                 \
3316     template <class S, class T> OSL_HOSTDEVICE  \
3317     inline Vec3 v ## name (S x, T y) {          \
3318         pvt::impl noise;                        \
3319         Vec3 r;                                 \
3320         noise (r, x, y);                        \
3321         return r;                               \
3322     }
3323 
3324 
3325 DECLNOISE (snoise, SNoise)
3326 DECLNOISE (noise, Noise)
3327 DECLNOISE (cellnoise, CellNoise)
3328 DECLNOISE (hashnoise, HashNoise)
3329 
3330 #undef DECLNOISE
3331 }   // namespace oslnoise
3332 
3333 
3334 OSL_NAMESPACE_EXIT
3335