1 // Copyright 2019-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #include "CPUDevice.h"
5 #include "../common/Data.h"
6 #include "../common/export_util.h"
7 #include "../iterator/Iterator.h"
8 #include "../observer/Observer.h"
9 #include "../sampler/Sampler.h"
10 #include "../volume/Volume.h"
11 #include "CPUDevice_ispc.h"
12 
13 namespace openvkl {
14   namespace cpu_device {
15 
16     template <int W>
supportsWidth(int width)17     bool CPUDevice<W>::supportsWidth(int width)
18     {
19       return width == W || width == 4 || width == 8 || width == 16;
20     }
21 
22     template <int W>
getNativeSIMDWidth()23     int CPUDevice<W>::getNativeSIMDWidth()
24     {
25       return CALL_ISPC(ISPC_getProgramCount);
26     }
27 
28     template <int W>
commit()29     void CPUDevice<W>::commit()
30     {
31       Device::commit();
32     }
33 
34     template <int W>
commit(VKLObject object)35     void CPUDevice<W>::commit(VKLObject object)
36     {
37       ManagedObject *managedObject = (ManagedObject *)object;
38       managedObject->commit();
39     }
40 
41     template <int W>
release(VKLObject object)42     void CPUDevice<W>::release(VKLObject object)
43     {
44       ManagedObject *managedObject = (ManagedObject *)object;
45       managedObject->refDec();
46     }
47 
48     ///////////////////////////////////////////////////////////////////////////
49     // Data ///////////////////////////////////////////////////////////////////
50     ///////////////////////////////////////////////////////////////////////////
51 
52     template <int W>
newData(size_t numItems,VKLDataType dataType,const void * source,VKLDataCreationFlags dataCreationFlags,size_t byteStride)53     VKLData CPUDevice<W>::newData(size_t numItems,
54                                   VKLDataType dataType,
55                                   const void *source,
56                                   VKLDataCreationFlags dataCreationFlags,
57                                   size_t byteStride)
58     {
59       Data *data =
60           new Data(numItems, dataType, source, dataCreationFlags, byteStride);
61       return (VKLData)data;
62     }
63 
64     ///////////////////////////////////////////////////////////////////////////
65     // Observer ///////////////////////////////////////////////////////////////
66     ///////////////////////////////////////////////////////////////////////////
67 
68     template <int W>
newObserver(VKLVolume volume,const char * type)69     VKLObserver CPUDevice<W>::newObserver(VKLVolume volume, const char *type)
70     {
71       auto &object          = referenceFromHandle<Volume<W>>(volume);
72       Observer<W> *observer = object.newObserver(type);
73       return (VKLObserver)observer;
74     }
75 
76     template <int W>
newObserver(VKLSampler sampler,const char * type)77     VKLObserver CPUDevice<W>::newObserver(VKLSampler sampler, const char *type)
78     {
79       auto &object          = referenceFromHandle<Sampler<W>>(sampler);
80       Observer<W> *observer = object.newObserver(type);
81       return (VKLObserver)observer;
82     }
83 
84     template <int W>
mapObserver(VKLObserver observer)85     const void *CPUDevice<W>::mapObserver(VKLObserver observer)
86     {
87       auto &observerObject = referenceFromHandle<Observer<W>>(observer);
88       return observerObject.map();
89     }
90 
91     template <int W>
unmapObserver(VKLObserver observer)92     void CPUDevice<W>::unmapObserver(VKLObserver observer)
93     {
94       auto &observerObject = referenceFromHandle<Observer<W>>(observer);
95       observerObject.unmap();
96     }
97 
98     template <int W>
getObserverElementType(VKLObserver observer) const99     VKLDataType CPUDevice<W>::getObserverElementType(VKLObserver observer) const
100     {
101       auto &observerObject = referenceFromHandle<Observer<W>>(observer);
102       return observerObject.getElementType();
103     }
104 
105     template <int W>
getObserverElementSize(VKLObserver observer) const106     size_t CPUDevice<W>::getObserverElementSize(VKLObserver observer) const
107     {
108       auto &observerObject = referenceFromHandle<Observer<W>>(observer);
109       return observerObject.getElementSize();
110     }
111 
112     template <int W>
getObserverNumElements(VKLObserver observer) const113     size_t CPUDevice<W>::getObserverNumElements(VKLObserver observer) const
114     {
115       auto &observerObject = referenceFromHandle<Observer<W>>(observer);
116       return observerObject.getNumElements();
117     }
118 
119     ///////////////////////////////////////////////////////////////////////////
120     // Interval iterator //////////////////////////////////////////////////////
121     ///////////////////////////////////////////////////////////////////////////
122 
123     template <int W>
newIntervalIteratorContext(VKLSampler sampler)124     VKLIntervalIteratorContext CPUDevice<W>::newIntervalIteratorContext(
125         VKLSampler sampler)
126     {
127       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
128       return (VKLIntervalIteratorContext)samplerObject
129           .getIntervalIteratorFactory()
130           .newContext(samplerObject);
131     }
132 
133     ///////////////////////////////////////////////////////////////////////////
134     // Hit iterator ///////////////////////////////////////////////////////////
135     ///////////////////////////////////////////////////////////////////////////
136 
137     template <int W>
newHitIteratorContext(VKLSampler sampler)138     VKLHitIteratorContext CPUDevice<W>::newHitIteratorContext(
139         VKLSampler sampler)
140     {
141       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
142       return (VKLHitIteratorContext)samplerObject.getHitIteratorFactory()
143           .newContext(samplerObject);
144     }
145 
146     ///////////////////////////////////////////////////////////////////////////
147     // Parameters /////////////////////////////////////////////////////////////
148     ///////////////////////////////////////////////////////////////////////////
149 
150     template <int W>
setBool(VKLObject object,const char * name,const bool b)151     void CPUDevice<W>::setBool(VKLObject object, const char *name, const bool b)
152     {
153       ManagedObject *managedObject = (ManagedObject *)object;
154       managedObject->setParam(name, b);
155     }
156 
157     template <int W>
set1f(VKLObject object,const char * name,const float x)158     void CPUDevice<W>::set1f(VKLObject object, const char *name, const float x)
159     {
160       ManagedObject *managedObject = (ManagedObject *)object;
161       managedObject->setParam(name, x);
162     }
163 
164     template <int W>
set1i(VKLObject object,const char * name,const int x)165     void CPUDevice<W>::set1i(VKLObject object, const char *name, const int x)
166     {
167       ManagedObject *managedObject = (ManagedObject *)object;
168       managedObject->setParam(name, x);
169     }
170 
171     template <int W>
setVec3f(VKLObject object,const char * name,const vec3f & v)172     void CPUDevice<W>::setVec3f(VKLObject object,
173                                 const char *name,
174                                 const vec3f &v)
175     {
176       ManagedObject *managedObject = (ManagedObject *)object;
177       managedObject->setParam(name, v);
178     }
179 
180     template <int W>
setVec3i(VKLObject object,const char * name,const vec3i & v)181     void CPUDevice<W>::setVec3i(VKLObject object,
182                                 const char *name,
183                                 const vec3i &v)
184     {
185       ManagedObject *managedObject = (ManagedObject *)object;
186       managedObject->setParam(name, v);
187     }
188 
189     template <int W>
setObject(VKLObject object,const char * name,VKLObject setObject)190     void CPUDevice<W>::setObject(VKLObject object,
191                                  const char *name,
192                                  VKLObject setObject)
193     {
194       ManagedObject *target = (ManagedObject *)object;
195       ManagedObject *value  = (ManagedObject *)setObject;
196       target->setParam(name, value);
197     }
198 
199     template <int W>
setString(VKLObject object,const char * name,const std::string & s)200     void CPUDevice<W>::setString(VKLObject object,
201                                  const char *name,
202                                  const std::string &s)
203     {
204       ManagedObject *managedObject = (ManagedObject *)object;
205       managedObject->setParam(name, s);
206     }
207 
208     template <int W>
setVoidPtr(VKLObject object,const char * name,void * v)209     void CPUDevice<W>::setVoidPtr(VKLObject object, const char *name, void *v)
210     {
211       ManagedObject *managedObject = (ManagedObject *)object;
212       managedObject->setParam(name, v);
213     }
214 
215     ///////////////////////////////////////////////////////////////////////////
216     // Sampler ////////////////////////////////////////////////////////////////
217     ///////////////////////////////////////////////////////////////////////////
218 
219     template <int W>
newSampler(VKLVolume volume)220     VKLSampler CPUDevice<W>::newSampler(VKLVolume volume)
221     {
222       auto &volumeObject = referenceFromHandle<Volume<W>>(volume);
223       return (VKLSampler)volumeObject.newSampler();
224     }
225 
226 #define __define_computeSampleN(WIDTH)                                      \
227   template <int W>                                                          \
228   void CPUDevice<W>::computeSample##WIDTH(                                  \
229       const int *valid,                                                     \
230       VKLSampler sampler,                                                   \
231       const vvec3fn<WIDTH> &objectCoordinates,                              \
232       float *samples,                                                       \
233       unsigned int attributeIndex,                                          \
234       const float *times)                                                   \
235   {                                                                         \
236     computeSampleAnyWidth<WIDTH>(                                           \
237         valid, sampler, objectCoordinates, samples, attributeIndex, times); \
238   }
239 
240     __define_computeSampleN(4);
241     __define_computeSampleN(8);
242     __define_computeSampleN(16);
243 
244 #undef __define_computeSampleN
245 
246     // support a fast path for scalar sampling
247     template <int W>
computeSample1(const int * valid,VKLSampler sampler,const vvec3fn<1> & objectCoordinates,float * sample,unsigned int attributeIndex,const float * time)248     void CPUDevice<W>::computeSample1(const int *valid,
249                                       VKLSampler sampler,
250                                       const vvec3fn<1> &objectCoordinates,
251                                       float *sample,
252                                       unsigned int attributeIndex,
253                                       const float *time)
254     {
255       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
256       vfloatn<1> timeW(time, 1);
257       vfloatn<1> sampleW;
258       samplerObject.computeSample(
259           objectCoordinates, sampleW, attributeIndex, timeW);
260       *sample = sampleW[0];
261     }
262 
263     template <int W>
computeSampleN(VKLSampler sampler,unsigned int N,const vvec3fn<1> * objectCoordinates,float * samples,unsigned int attributeIndex,const float * times)264     void CPUDevice<W>::computeSampleN(VKLSampler sampler,
265                                       unsigned int N,
266                                       const vvec3fn<1> *objectCoordinates,
267                                       float *samples,
268                                       unsigned int attributeIndex,
269                                       const float *times)
270     {
271       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
272       samplerObject.computeSampleN(
273           N, objectCoordinates, samples, attributeIndex, times);
274     }
275 
276 #define __define_computeSampleMN(WIDTH)              \
277   template <int W>                                   \
278   void CPUDevice<W>::computeSampleM##WIDTH(          \
279       const int *valid,                              \
280       VKLSampler sampler,                            \
281       const vvec3fn<WIDTH> &objectCoordinates,       \
282       float *samples,                                \
283       unsigned int M,                                \
284       const unsigned int *attributeIndices,          \
285       const float *times)                            \
286   {                                                  \
287     computeSampleMAnyWidth<WIDTH>(valid,             \
288                                   sampler,           \
289                                   objectCoordinates, \
290                                   samples,           \
291                                   M,                 \
292                                   attributeIndices,  \
293                                   times);            \
294   }
295 
296     __define_computeSampleMN(4);
297     __define_computeSampleMN(8);
298     __define_computeSampleMN(16);
299 
300 #undef __define_computeSampleMN
301 
302     // support a fast path for scalar sampling
303     template <int W>
computeSampleM1(const int * valid,VKLSampler sampler,const vvec3fn<1> & objectCoordinates,float * samples,unsigned int M,const unsigned int * attributeIndices,const float * time)304     void CPUDevice<W>::computeSampleM1(const int *valid,
305                                        VKLSampler sampler,
306                                        const vvec3fn<1> &objectCoordinates,
307                                        float *samples,
308                                        unsigned int M,
309                                        const unsigned int *attributeIndices,
310                                        const float *time)
311     {
312       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
313       vfloatn<1> timeW(time, 1);
314       samplerObject.computeSampleM(
315           objectCoordinates, samples, M, attributeIndices, timeW);
316     }
317 
318     template <int W>
computeSampleMN(VKLSampler sampler,unsigned int N,const vvec3fn<1> * objectCoordinates,float * samples,unsigned int M,const unsigned int * attributeIndices,const float * times)319     void CPUDevice<W>::computeSampleMN(VKLSampler sampler,
320                                        unsigned int N,
321                                        const vvec3fn<1> *objectCoordinates,
322                                        float *samples,
323                                        unsigned int M,
324                                        const unsigned int *attributeIndices,
325                                        const float *times)
326     {
327       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
328       samplerObject.computeSampleMN(
329           N, objectCoordinates, samples, M, attributeIndices, times);
330     }
331 
332 #define __define_computeGradientN(WIDTH)                                      \
333   template <int W>                                                            \
334   void CPUDevice<W>::computeGradient##WIDTH(                                  \
335       const int *valid,                                                       \
336       VKLSampler sampler,                                                     \
337       const vvec3fn<WIDTH> &objectCoordinates,                                \
338       vvec3fn<WIDTH> &gradients,                                              \
339       unsigned int attributeIndex,                                            \
340       const float *times)                                                     \
341   {                                                                           \
342     computeGradientAnyWidth<WIDTH>(                                           \
343         valid, sampler, objectCoordinates, gradients, attributeIndex, times); \
344   }
345 
346     __define_computeGradientN(1);
347     __define_computeGradientN(4);
348     __define_computeGradientN(8);
349     __define_computeGradientN(16);
350 
351 #undef __define_computeGradientN
352 
353     template <int W>
computeGradientN(VKLSampler sampler,unsigned int N,const vvec3fn<1> * objectCoordinates,vvec3fn<1> * gradients,unsigned int attributeIndex,const float * times)354     void CPUDevice<W>::computeGradientN(VKLSampler sampler,
355                                         unsigned int N,
356                                         const vvec3fn<1> *objectCoordinates,
357                                         vvec3fn<1> *gradients,
358                                         unsigned int attributeIndex,
359                                         const float *times)
360     {
361       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
362       samplerObject.computeGradientN(
363           N, objectCoordinates, gradients, attributeIndex, times);
364     }
365 
366     ///////////////////////////////////////////////////////////////////////////
367     // Volume /////////////////////////////////////////////////////////////////
368     ///////////////////////////////////////////////////////////////////////////
369 
370     template <int W>
newVolume(const char * type)371     VKLVolume CPUDevice<W>::newVolume(const char *type)
372     {
373       // warn for deprecated snake case volume types
374       std::string typeStr(type);
375 
376       if (typeStr.find("_") != std::string::npos) {
377         postLogMessage(this, VKL_LOG_WARNING)
378             << "volume type name '" << typeStr
379             << "' may be deprecated; volume type names are now camelCase (no "
380                "underscores)";
381       }
382 
383       std::stringstream ss;
384       ss << type << "_" << W;
385 
386       return (VKLVolume)Volume<W>::createInstance(this, ss.str());
387     }
388 
389     template <int W>
getBoundingBox(VKLVolume volume)390     box3f CPUDevice<W>::getBoundingBox(VKLVolume volume)
391     {
392       auto &volumeObject = referenceFromHandle<Volume<W>>(volume);
393       return volumeObject.getBoundingBox();
394     }
395 
396     template <int W>
getNumAttributes(VKLVolume volume)397     unsigned int CPUDevice<W>::getNumAttributes(VKLVolume volume)
398     {
399       auto &volumeObject = referenceFromHandle<Volume<W>>(volume);
400       return volumeObject.getNumAttributes();
401     }
402 
403     template <int W>
getValueRange(VKLVolume volume,unsigned int attributeIndex)404     range1f CPUDevice<W>::getValueRange(VKLVolume volume,
405                                         unsigned int attributeIndex)
406     {
407       auto &volumeObject = referenceFromHandle<Volume<W>>(volume);
408       return volumeObject.getValueRange(attributeIndex);
409     }
410 
411     ///////////////////////////////////////////////////////////////////////////
412     // Private methods ////////////////////////////////////////////////////////
413     ///////////////////////////////////////////////////////////////////////////
414 
415     template <int W>
416     template <int OW>
417     typename std::enable_if<(OW < W), void>::type
computeSampleAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int attributeIndex,const float * times)418     CPUDevice<W>::computeSampleAnyWidth(const int *valid,
419                                         VKLSampler sampler,
420                                         const vvec3fn<OW> &objectCoordinates,
421                                         float *samples,
422                                         unsigned int attributeIndex,
423                                         const float *times)
424     {
425       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
426 
427       vvec3fn<W> ocW = static_cast<vvec3fn<W>>(objectCoordinates);
428       vfloatn<W> tW(times, OW);
429 
430       vintn<W> validW;
431       for (int i = 0; i < W; i++)
432         validW[i] = i < OW ? valid[i] : 0;
433 
434       ocW.fill_inactive_lanes(validW);
435       tW.fill_inactive_lanes(validW);
436 
437       vfloatn<W> samplesW;
438 
439       samplerObject.computeSampleV(validW, ocW, samplesW, attributeIndex, tW);
440 
441       for (int i = 0; i < OW; i++)
442         samples[i] = samplesW[i];
443     }
444 
445     template <int W>
446     template <int OW>
447     typename std::enable_if<(OW == W), void>::type
computeSampleAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int attributeIndex,const float * times)448     CPUDevice<W>::computeSampleAnyWidth(const int *valid,
449                                         VKLSampler sampler,
450                                         const vvec3fn<OW> &objectCoordinates,
451                                         float *samples,
452                                         unsigned int attributeIndex,
453                                         const float *times)
454     {
455       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
456 
457       vfloatn<W> tW(times, W);
458 
459       vintn<W> validW;
460       for (int i = 0; i < W; i++)
461         validW[i] = valid[i];
462 
463       vfloatn<W> samplesW;
464 
465       samplerObject.computeSampleV(
466           validW, objectCoordinates, samplesW, attributeIndex, tW);
467 
468       for (int i = 0; i < W; i++)
469         samples[i] = samplesW[i];
470     }
471 
472     template <int W>
473     template <int OW>
474     typename std::enable_if<(OW > W), void>::type
computeSampleAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int attributeIndex,const float * times)475     CPUDevice<W>::computeSampleAnyWidth(const int *valid,
476                                         VKLSampler sampler,
477                                         const vvec3fn<OW> &objectCoordinates,
478                                         float *samples,
479                                         unsigned int attributeIndex,
480                                         const float *times)
481     {
482       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
483 
484       vfloatn<OW> tOW(times, OW);
485 
486       const int numPacks = OW / W + (OW % W != 0);
487 
488       for (int packIndex = 0; packIndex < numPacks; packIndex++) {
489         vvec3fn<W> ocW = objectCoordinates.template extract_pack<W>(packIndex);
490         vfloatn<W> tW  = tOW.template extract_pack<W>(packIndex);
491 
492         vintn<W> validW;
493         for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++)
494           validW[i - packIndex * W] = i < OW ? valid[i] : 0;
495 
496         ocW.fill_inactive_lanes(validW);
497         tW.fill_inactive_lanes(validW);
498 
499         vfloatn<W> samplesW;
500 
501         samplerObject.computeSampleV(validW, ocW, samplesW, attributeIndex, tW);
502 
503         for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++)
504           samples[i] = samplesW[i - packIndex * W];
505       }
506     }
507 
508     template <int W>
509     template <int OW>
510     typename std::enable_if<(OW < W), void>::type
computeSampleMAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int M,const unsigned int * attributeIndices,const float * times)511     CPUDevice<W>::computeSampleMAnyWidth(const int *valid,
512                                          VKLSampler sampler,
513                                          const vvec3fn<OW> &objectCoordinates,
514                                          float *samples,
515                                          unsigned int M,
516                                          const unsigned int *attributeIndices,
517                                          const float *times)
518     {
519       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
520 
521       vvec3fn<W> ocW = static_cast<vvec3fn<W>>(objectCoordinates);
522       vfloatn<W> tW(times, OW);
523 
524       vintn<W> validW;
525       for (int i = 0; i < W; i++)
526         validW[i] = i < OW ? valid[i] : 0;
527 
528       ocW.fill_inactive_lanes(validW);
529       tW.fill_inactive_lanes(validW);
530 
531       float *samplesW = (float *)alloca(M * W * sizeof(float));
532 
533       samplerObject.computeSampleMV(
534           validW, ocW, samplesW, M, attributeIndices, tW);
535 
536       for (unsigned int a = 0; a < M; a++) {
537         for (int i = 0; i < OW; i++) {
538           samples[a * OW + i] = samplesW[a * W + i];
539         }
540       }
541     }
542 
543     template <int W>
544     template <int OW>
545     typename std::enable_if<(OW == W), void>::type
computeSampleMAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int M,const unsigned int * attributeIndices,const float * times)546     CPUDevice<W>::computeSampleMAnyWidth(const int *valid,
547                                          VKLSampler sampler,
548                                          const vvec3fn<OW> &objectCoordinates,
549                                          float *samples,
550                                          unsigned int M,
551                                          const unsigned int *attributeIndices,
552                                          const float *times)
553     {
554       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
555 
556       vfloatn<W> timesW(times, W);
557 
558       vintn<W> validW;
559       for (int i = 0; i < W; i++)
560         validW[i] = valid[i];
561 
562       samplerObject.computeSampleMV(
563           validW, objectCoordinates, samples, M, attributeIndices, timesW);
564     }
565 
566     template <int W>
567     template <int OW>
568     typename std::enable_if<(OW > W), void>::type
computeSampleMAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,float * samples,unsigned int M,const unsigned int * attributeIndices,const float * times)569     CPUDevice<W>::computeSampleMAnyWidth(const int *valid,
570                                          VKLSampler sampler,
571                                          const vvec3fn<OW> &objectCoordinates,
572                                          float *samples,
573                                          unsigned int M,
574                                          const unsigned int *attributeIndices,
575                                          const float *times)
576     {
577       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
578 
579       vfloatn<OW> tOW(times, OW);
580 
581       const int numPacks = OW / W + (OW % W != 0);
582 
583       for (int packIndex = 0; packIndex < numPacks; packIndex++) {
584         vvec3fn<W> ocW = objectCoordinates.template extract_pack<W>(packIndex);
585         vfloatn<W> tW  = tOW.template extract_pack<W>(packIndex);
586 
587         vintn<W> validW;
588         for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++)
589           validW[i - packIndex * W] = i < OW ? valid[i] : 0;
590 
591         ocW.fill_inactive_lanes(validW);
592         tW.fill_inactive_lanes(validW);
593 
594         float *samplesW = (float *)alloca(M * W * sizeof(float));
595 
596         samplerObject.computeSampleMV(
597             validW, ocW, samplesW, M, attributeIndices, tW);
598 
599         for (unsigned int a = 0; a < M; a++) {
600           for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++)
601             samples[a * OW + i] = samplesW[a * W + (i - packIndex * W)];
602         }
603       }
604     }
605 
606     template <int W>
607     template <int OW>
608     typename std::enable_if<(OW < W), void>::type
computeGradientAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,vvec3fn<OW> & gradients,unsigned int attributeIndex,const float * times)609     CPUDevice<W>::computeGradientAnyWidth(const int *valid,
610                                           VKLSampler sampler,
611                                           const vvec3fn<OW> &objectCoordinates,
612                                           vvec3fn<OW> &gradients,
613                                           unsigned int attributeIndex,
614                                           const float *times)
615     {
616       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
617 
618       vvec3fn<W> ocW = static_cast<vvec3fn<W>>(objectCoordinates);
619       vfloatn<W> tW(times, OW);
620 
621       vintn<W> validW;
622       for (int i = 0; i < W; i++)
623         validW[i] = i < OW ? valid[i] : 0;
624 
625       ocW.fill_inactive_lanes(validW);
626       tW.fill_inactive_lanes(validW);
627 
628       vvec3fn<W> gradientsW;
629 
630       samplerObject.computeGradientV(
631           validW, ocW, gradientsW, attributeIndex, tW);
632 
633       for (int i = 0; i < OW; i++) {
634         gradients.x[i] = gradientsW.x[i];
635         gradients.y[i] = gradientsW.y[i];
636         gradients.z[i] = gradientsW.z[i];
637       }
638     }
639 
640     template <int W>
641     template <int OW>
642     typename std::enable_if<(OW == W), void>::type
computeGradientAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,vvec3fn<OW> & gradients,unsigned int attributeIndex,const float * times)643     CPUDevice<W>::computeGradientAnyWidth(const int *valid,
644                                           VKLSampler sampler,
645                                           const vvec3fn<OW> &objectCoordinates,
646                                           vvec3fn<OW> &gradients,
647                                           unsigned int attributeIndex,
648                                           const float *times)
649     {
650       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
651 
652       vfloatn<W> tW(times, W);
653 
654       vintn<W> validW;
655       for (int i = 0; i < W; i++)
656         validW[i] = valid[i];
657 
658       samplerObject.computeGradientV(
659           validW, objectCoordinates, gradients, attributeIndex, tW);
660     }
661 
662     template <int W>
663     template <int OW>
664     typename std::enable_if<(OW > W), void>::type
computeGradientAnyWidth(const int * valid,VKLSampler sampler,const vvec3fn<OW> & objectCoordinates,vvec3fn<OW> & gradients,unsigned int attributeIndex,const float * times)665     CPUDevice<W>::computeGradientAnyWidth(const int *valid,
666                                           VKLSampler sampler,
667                                           const vvec3fn<OW> &objectCoordinates,
668                                           vvec3fn<OW> &gradients,
669                                           unsigned int attributeIndex,
670                                           const float *times)
671     {
672       auto &samplerObject = referenceFromHandle<Sampler<W>>(sampler);
673 
674       vfloatn<OW> tOW(times, OW);
675 
676       const int numPacks = OW / W + (OW % W != 0);
677 
678       for (int packIndex = 0; packIndex < numPacks; packIndex++) {
679         vvec3fn<W> ocW = objectCoordinates.template extract_pack<W>(packIndex);
680         vfloatn<W> tW  = tOW.template extract_pack<W>(packIndex);
681 
682         vintn<W> validW;
683         for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++)
684           validW[i - packIndex * W] = i < OW ? valid[i] : 0;
685 
686         ocW.fill_inactive_lanes(validW);
687         tW.fill_inactive_lanes(validW);
688 
689         vvec3fn<W> gradientsW;
690 
691         samplerObject.computeGradientV(
692             validW, ocW, gradientsW, attributeIndex, tW);
693 
694         for (int i = packIndex * W; i < (packIndex + 1) * W && i < OW; i++) {
695           gradients.x[i] = gradientsW.x[i - packIndex * W];
696           gradients.y[i] = gradientsW.y[i - packIndex * W];
697           gradients.z[i] = gradientsW.z[i - packIndex * W];
698         }
699       }
700     }
701 
702     VKL_REGISTER_DEVICE(CPUDevice<VKL_TARGET_WIDTH>,
703                         CONCAT1(internal_cpu_, VKL_TARGET_WIDTH))
704 
705   }  // namespace cpu_device
706 }  // namespace openvkl
707 
CONCAT1(openvkl_init_module_cpu_device_,VKL_TARGET_WIDTH)708 extern "C" OPENVKL_DLLEXPORT void CONCAT1(openvkl_init_module_cpu_device_,
709                                           VKL_TARGET_WIDTH)()
710 {
711 }
712