// Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC and
// other Axom Project Developers. See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)

#ifndef AXOM_SPIN_BUILD_RADIX_TREE_H_
#define AXOM_SPIN_BUILD_RADIX_TREE_H_

#include "axom/config.hpp"  // for axom compile-time definitions

#include "axom/core/execution/execution_space.hpp"
#include "axom/core/execution/for_all.hpp"

#include "axom/core/utilities/AnnotationMacros.hpp"  // for annotations

#include "axom/primal/geometry/BoundingBox.hpp"
#include "axom/primal/geometry/Point.hpp"

#include "axom/spin/internal/linear_bvh/RadixTree.hpp"

#include "axom/spin/MortonIndex.hpp"

#include "axom/core/utilities/Utilities.hpp"  // for isNearlyEqual()
#include "axom/slic/interface/slic.hpp"       // for slic

#if defined(AXOM_USE_RAJA)
  // RAJA includes
  #include "RAJA/RAJA.hpp"
#endif

#include <algorithm>    // For std::stable_sort
#include <atomic>       // For std::atomic_thread_fence
#include <limits>       // For std::numeric_limits
#include <type_traits>  // For std::is_same

#if defined(AXOM_USE_CUDA) && defined(AXOM_USE_RAJA)
  // NOTE: uses the cub installation that is bundled with RAJA
  #include "cub/device/device_radix_sort.cuh"
#endif

namespace axom
{
namespace spin
{
namespace internal
{
namespace linear_bvh
{
//------------------------------------------------------------------------------
// Returns a 30-bit morton code for the given point's coordinates.
// The coordinates are expected to be in [0,1].
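// For example, in 3D (10 bits per dimension) the point (0.5, 0.25, 0.75)
// quantizes to the integer coordinates (512, 256, 768), which are then
// bit-interleaved by convertPointToMorton() into a single 30-bit key.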
template <typename FloatType, int Dims>
static inline AXOM_HOST_DEVICE axom::int32 morton32_encode(
  const primal::Vector<FloatType, Dims>& point)
{
  // quantize each coordinate to 32/Dims bits
  // (10 bits per dimension in 3D; note that 2^10 = 1024)
  constexpr int NUM_BITS_PER_DIM = 32 / Dims;
  constexpr FloatType FLOAT_TO_INT = 1 << NUM_BITS_PER_DIM;
  constexpr FloatType FLOAT_CEILING = FLOAT_TO_INT - 1;

  int32 int_coords[Dims];
  for(int i = 0; i < Dims; i++)
  {
    int_coords[i] =
      fmin(fmax(point[i] * FLOAT_TO_INT, (FloatType)0), FLOAT_CEILING);
  }

  primal::Point<int32, Dims> integer_pt(int_coords);

  return convertPointToMorton<int32>(integer_pt);
}

//------------------------------------------------------------------------------
// Returns a 63-bit morton code (21 bits per coordinate) for
// coordinates in the unit cube.
static inline AXOM_HOST_DEVICE axom::int64 morton64_encode(axom::float32 x,
                                                           axom::float32 y,
                                                           axom::float32 z = 0.0)
{
  //take the first 21 bits. Note, 2^21 = 2097152.0f
  x = fmin(fmax(x * 2097152.0f, 0.0f), 2097151.0f);
  y = fmin(fmax(y * 2097152.0f, 0.0f), 2097151.0f);
  z = fmin(fmax(z * 2097152.0f, 0.0f), 2097151.0f);

  primal::Point<int64, 3> integer_pt =
    primal::Point<int64, 3>::make_point((int64)x, (int64)y, (int64)z);

  return convertPointToMorton<int64>(integer_pt);
}

template <typename ExecSpace, typename BoxIndexable, typename FloatType, int NDIMS>
void transform_boxes(const BoxIndexable boxes,
                     primal::BoundingBox<FloatType, NDIMS>* aabbs,
                     int32 size,
                     FloatType scale_factor)
{
  AXOM_PERF_MARK_FUNCTION("transform_boxes");

  for_all<ExecSpace>(
    size,
    AXOM_LAMBDA(int32 i) {
      primal::BoundingBox<FloatType, NDIMS> aabb = boxes[i];

      aabb.scale(scale_factor);

      aabbs[i] = aabb;
    });
}

//------------------------------------------------------------------------------
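// Computes the global bounding box that encloses all of the input AABBs.
// With RAJA enabled this performs a min/max reduction per dimension;
// otherwise it falls back to a serial accumulation.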
template <typename ExecSpace, typename FloatType, int NDIMS>
primal::BoundingBox<FloatType, NDIMS> reduce(
  primal::BoundingBox<FloatType, NDIMS>* aabbs,
  int32 size)
{
  AXOM_PERF_MARK_FUNCTION("reduce_abbs");

#ifdef AXOM_USE_RAJA
  using reduce_policy = typename axom::execution_space<ExecSpace>::reduce_policy;

  primal::Point<FloatType, NDIMS> min_pt, max_pt;

  FloatType infinity = std::numeric_limits<FloatType>::max();
  FloatType neg_infinity = std::numeric_limits<FloatType>::lowest();

  for(int dim = 0; dim < NDIMS; dim++)
  {
    RAJA::ReduceMin<reduce_policy, FloatType> min_coord(infinity);
    RAJA::ReduceMax<reduce_policy, FloatType> max_coord(neg_infinity);

    for_all<ExecSpace>(
      size,
      AXOM_LAMBDA(int32 i) {
        const primal::BoundingBox<FloatType, NDIMS>& aabb = aabbs[i];
        min_coord.min(aabb.getMin()[dim]);
        max_coord.max(aabb.getMax()[dim]);
      });

    min_pt[dim] = min_coord.get();
    max_pt[dim] = max_coord.get();
  }

  return primal::BoundingBox<FloatType, NDIMS>(min_pt, max_pt);
#else
  static_assert(std::is_same<ExecSpace, SEQ_EXEC>::value,
                "Only SEQ_EXEC supported without RAJA");

  primal::BoundingBox<FloatType, NDIMS> global_bounds;

  for_all<ExecSpace>(size, [&](int32 i) { global_bounds.addBox(aabbs[i]); });

  return global_bounds;
#endif
}

//------------------------------------------------------------------------------
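// Computes a morton code for each AABB: the box centroid is normalized to
// [0,1] using the global bounds and then quantized and interleaved by
// morton32_encode().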
template <typename ExecSpace, typename FloatType, int NDIMS>
void get_mcodes(primal::BoundingBox<FloatType, NDIMS>* aabbs,
                int32 size,
                const primal::BoundingBox<FloatType, NDIMS>& bounds,
                uint32* mcodes)
{
  AXOM_PERF_MARK_FUNCTION("get_mcodes");

  primal::Vector<FloatType, NDIMS> extent, inv_extent, min_coord;

  extent = primal::Vector<FloatType, NDIMS>(bounds.getMax());
  extent -= primal::Vector<FloatType, NDIMS>(bounds.getMin());
  min_coord = primal::Vector<FloatType, NDIMS>(bounds.getMin());

  for(int i = 0; i < NDIMS; ++i)
  {
    inv_extent[i] = utilities::isNearlyEqual<FloatType>(extent[i], .0f)
      ? 0.f
      : 1.f / extent[i];
  }

  for_all<ExecSpace>(
    size,
    AXOM_LAMBDA(int32 i) {
      const primal::BoundingBox<FloatType, NDIMS>& aabb = aabbs[i];

      // get the center and normalize it
      primal::Vector<FloatType, NDIMS> centroid(aabb.getCentroid());
      centroid = primal::Vector<FloatType, NDIMS>(
        (centroid - min_coord).array() * inv_extent.array());
      mcodes[i] = morton32_encode(centroid);
    });
}

//------------------------------------------------------------------------------
template <typename ExecSpace, typename IntType>
void array_counting(IntType* iterator,
                    const IntType& size,
                    const IntType& start,
                    const IntType& step)
{
  AXOM_PERF_MARK_FUNCTION("array_counting");

  for_all<ExecSpace>(
    size,
    AXOM_LAMBDA(int32 i) { iterator[i] = start + i * step; });
}

//------------------------------------------------------------------------------
//
// reorder an array based on a new set of indices.
// array   [a,b,c]
// indices [1,0,2]
// result  [b,a,c]
//
template <typename ExecSpace, typename T>
void reorder(int32* indices, T*& array, int32 size, int allocatorID)
{
  AXOM_PERF_MARK_FUNCTION("reorder");

  T* temp = axom::allocate<T>(size, allocatorID);

  for_all<ExecSpace>(
    size,
    AXOM_LAMBDA(int32 i) {
      int32 in_idx = indices[i];
      temp[i] = array[in_idx];
    });

  axom::deallocate(array);
  array = temp;
}

//------------------------------------------------------------------------------
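// Sorts the morton codes in ascending order. On return, `iter` holds the
// original index of each sorted code (i.e., the sort permutation), which is
// later used to reorder the leaf AABBs to match.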
#if defined(AXOM_USE_RAJA) &&  \
  ((RAJA_VERSION_MAJOR > 0) || \
   ((RAJA_VERSION_MAJOR == 0) && (RAJA_VERSION_MINOR >= 12)))

template <typename ExecSpace>
void sort_mcodes(uint32*& mcodes, int32 size, int32* iter)
{
  AXOM_PERF_MARK_FUNCTION("sort_mcodes");

  array_counting<ExecSpace>(iter, size, 0, 1);

  AXOM_PERF_MARK_SECTION(
    "raja_stable_sort",
    using EXEC_POL = typename axom::execution_space<ExecSpace>::loop_policy;
    RAJA::stable_sort_pairs<EXEC_POL>(RAJA::make_span(mcodes, size),
                                      RAJA::make_span(iter, size)););
}

#else

// fall back to std::stable_sort
template <typename ExecSpace>
void sort_mcodes(uint32*& mcodes, int32 size, int32* iter)
{
  AXOM_PERF_MARK_FUNCTION("sort_mcodes");

  array_counting<ExecSpace>(iter, size, 0, 1);

  AXOM_PERF_MARK_SECTION(
    "cpu_sort",

    std::stable_sort(iter,
                     iter + size,
                     [=](int32 i1, int32 i2) { return mcodes[i1] < mcodes[i2]; });

  );

  const int allocID = axom::execution_space<ExecSpace>::allocatorID();
  reorder<ExecSpace>(iter, mcodes, size, allocID);
}

#endif /* RAJA version 0.12.0 and above */

//------------------------------------------------------------------------------
//
// count leading zeros
//
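// Implemented as a branching binary search over progressively smaller
// half-words; e.g., clz(0) == 32, clz(1) == 31, clz(0x40000000) == 1.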
inline AXOM_HOST_DEVICE axom::int32 clz(axom::int32 x)
{
  axom::int32 y;
  axom::int32 n = 32;
  y = x >> 16;
  if(y != 0)
  {
    n = n - 16;
    x = y;
  }
  y = x >> 8;
  if(y != 0)
  {
    n = n - 8;
    x = y;
  }
  y = x >> 4;
  if(y != 0)
  {
    n = n - 4;
    x = y;
  }
  y = x >> 2;
  if(y != 0)
  {
    n = n - 2;
    x = y;
  }
  y = x >> 1;
  if(y != 0) return axom::int32(n - 2);
  return axom::int32(n - x);
}

//------------------------------------------------------------------------------
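// Computes the "delta" metric from Karras 2012: the length of the longest
// common prefix (in bits) of the morton codes at indices a and b. Ties
// between equal codes are broken by comparing the indices themselves, and
// an out-of-range b yields -1 so that range searches terminate at the ends.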
template <typename IntType, typename MCType>
AXOM_HOST_DEVICE IntType delta(const IntType& a,
                               const IntType& b,
                               const IntType& inner_size,
                               const MCType* mcodes)
{
  bool tie = false;
  bool out_of_range = (b < 0 || b > inner_size);
  //still make the call but with a valid address
  const int32 bb = (out_of_range) ? 0 : b;
  const uint32 acode = mcodes[a];
  const uint32 bcode = mcodes[bb];
  //use xor to find where they differ
  uint32 exor = acode ^ bcode;
  tie = (exor == 0);
  //break the tie, a and b must always differ
  exor = tie ? uint32(a) ^ uint32(bb) : exor;
  int32 count = clz(exor);
  if(tie) count += 32;
  count = (out_of_range) ? -1 : count;
  return count;
}

//------------------------------------------------------------------------------
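// Builds the internal topology of the radix tree in a single parallel pass
// over the inner nodes, following Karras 2012. Inner nodes occupy indices
// [0, inner_size) and leaves occupy [inner_size, 2 * inner_size]; each inner
// node i covers a contiguous range of sorted morton codes, and its split
// point determines its left and right children.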
template <typename ExecSpace, typename FloatType, int NDIMS>
void build_tree(RadixTree<FloatType, NDIMS>& data)
{
  AXOM_PERF_MARK_FUNCTION("build_tree");

  // http://research.nvidia.com/sites/default/files/publications/karras2012hpg_paper.pdf

  // Pointers and vars are redeclared because I have a faint memory
  // of a huge amount of pain and suffering due to cuda
  // lambda captures of pointers inside a struct. Bad memories
  // of random segfaults ........ be warned
  const int32 inner_size = data.m_inner_size;
  int32* lchildren_ptr = data.m_left_children;
  int32* rchildren_ptr = data.m_right_children;
  int32* parent_ptr = data.m_parents;
  const uint32* mcodes_ptr = data.m_mcodes;

  for_all<ExecSpace>(
    inner_size,
    AXOM_LAMBDA(int32 i) {
      //determine range direction
      int32 d = 0 > (delta(i, i + 1, inner_size, mcodes_ptr) -
                     delta(i, i - 1, inner_size, mcodes_ptr))
        ? -1
        : 1;

      //find upper bound for the length of the range
      int32 min_delta = delta(i, i - d, inner_size, mcodes_ptr);
      int32 lmax = 2;
      while(delta(i, i + lmax * d, inner_size, mcodes_ptr) > min_delta)
        lmax *= 2;

      //binary search to find the lower bound
      int32 l = 0;
      for(int32 t = lmax / 2; t >= 1; t /= 2)
      {
        if(delta(i, i + (l + t) * d, inner_size, mcodes_ptr) > min_delta)
        {
          l += t;
        }
      }

      int32 j = i + l * d;
      int32 delta_node = delta(i, j, inner_size, mcodes_ptr);
      int32 s = 0;
      FloatType div_factor = 2.f;
      //find the split position using a binary search
      for(int32 t = (int32)ceil(float32(l) / div_factor);;
          div_factor *= 2, t = (int32)ceil(float32(l) / div_factor))
      {
        if(delta(i, i + (s + t) * d, inner_size, mcodes_ptr) > delta_node)
        {
          s += t;
        }
        if(t == 1) break;
      }

      int32 split = i + s * d + utilities::min(d, 0);
      // assign parent/child pointers
      if(utilities::min(i, j) == split)
      {
        //leaf
        parent_ptr[split + inner_size] = i;
        lchildren_ptr[i] = split + inner_size;
      }
      else
      {
        //inner node
        parent_ptr[split] = i;
        lchildren_ptr[i] = split;
      }

      if(utilities::max(i, j) == split + 1)
      {
        //leaf
        parent_ptr[split + inner_size + 1] = i;
        rchildren_ptr[i] = split + inner_size + 1;
      }
      else
      {
        parent_ptr[split + 1] = i;
        rchildren_ptr[i] = split + 1;
      }

      if(i == 0)
      {
        // flag the root
        parent_ptr[0] = -1;
      }
    });
}

//------------------------------------------------------------------------------
template <typename ExecSpace, typename T>
static void array_memset(T* array, const int32 size, const T val)
{
  AXOM_PERF_MARK_FUNCTION("array_memset");

  for_all<ExecSpace>(
    size,
    AXOM_LAMBDA(int32 i) { array[i] = val; });
}

//------------------------------------------------------------------------------
// Fetches a bounding box value synchronized with another thread's store.
// On the CPU, this is achieved with an acquire fence. This is only really
//   needed for non-x86 architectures with a weaker memory model (Power, ARM)
// On the GPU, we poll the bounding box values for a non-sentinel value.
template <typename ExecSpace, typename BBoxType>
AXOM_HOST_DEVICE static inline BBoxType sync_load(const BBoxType& box)
{
#ifdef __CUDA_ARCH__
  using atomic_policy = typename axom::execution_space<ExecSpace>::atomic_policy;

  using FloatType = typename BBoxType::CoordType;
  using PointType = typename BBoxType::PointType;

  constexpr int NDIMS = PointType::DIMENSION;

  PointType min_pt {BBoxType::InvalidMin};
  PointType max_pt {BBoxType::InvalidMax};

  #ifdef SPIN_BVH_DEBUG_MEMORY_HAZARD
  int nreads = 0;  // number of extra reads needed for a non-sentinel value
  #endif
  for(int dim = 0; dim < NDIMS; dim++)
  {
    // Cast to volatile so reads always hit L2$ or memory.
    volatile const FloatType& min_dim =
      reinterpret_cast<volatile const FloatType&>(box.getMin()[dim]);
    volatile const FloatType& max_dim =
      reinterpret_cast<volatile const FloatType&>(box.getMax()[dim]);

      // NOTE: There is a possibility for a read-after-write hazard, where the
      // uncached store of an AABB on one thread isn't visible when another
      // thread calls this method to read the value. However, this doesn't seem to
      // be an issue on Volta; the atomicAdd used to terminate the first thread
      // seems to correctly synchronize the prior atomic store operations for the
      // bounding box data.
      //
      // Just in case this changes, we poll for a non-sentinel value to be read
      // out. Naturally, this assumes that reads of sizeof(FloatType) don't tear.
  #ifdef SPIN_BVH_DEBUG_MEMORY_HAZARD
    while((min_pt[dim] = min_dim) == BBoxType::InvalidMin)
    {
      nreads++;
    }
    while((max_pt[dim] = max_dim) == BBoxType::InvalidMax)
    {
      nreads++;
    }
  #else
    while((min_pt[dim] = min_dim) == BBoxType::InvalidMin)
      ;
    while((max_pt[dim] = max_dim) == BBoxType::InvalidMax)
      ;
  #endif
  }

  #ifdef SPIN_BVH_DEBUG_MEMORY_HAZARD
  if(nreads > 0)
  {
    printf("Warning: needed %d extra reads for address %p\n", nreads, &box);
  }
  #endif

  return BBoxType {min_pt, max_pt};

#else  // __CUDA_ARCH__
  std::atomic_thread_fence(std::memory_order_acquire);
  return box;
#endif
}

//------------------------------------------------------------------------------
// Writes a bounding box to memory, synchronized with another thread's read.
// On the CPU, this is achieved with a release fence.
// On the GPU, this function uses atomicExch to write a value directly to the
// L2 cache, thus avoiding potential cache coherency issues between threads.
template <typename ExecSpace, typename BBoxType>
AXOM_HOST_DEVICE static inline void sync_store(BBoxType& box,
                                               const BBoxType& value)
{
#if defined(__CUDA_ARCH__) && defined(AXOM_USE_RAJA)
  using atomic_policy = typename axom::execution_space<ExecSpace>::atomic_policy;

  using FloatType = typename BBoxType::CoordType;
  using PointType = typename BBoxType::PointType;

  constexpr int NDIMS = PointType::DIMENSION;

  // Cast away the underlying const so we can directly modify the box data.
  PointType& min_pt = const_cast<PointType&>(box.getMin());
  PointType& max_pt = const_cast<PointType&>(box.getMax());

  for(int dim = 0; dim < NDIMS; dim++)
  {
    RAJA::atomicExchange<atomic_policy>(&(min_pt[dim]), value.getMin()[dim]);
    RAJA::atomicExchange<atomic_policy>(&(max_pt[dim]), value.getMax()[dim]);
  }
#else  // __CUDA_ARCH__
  box = value;
  std::atomic_thread_fence(std::memory_order_release);
#endif
}

//------------------------------------------------------------------------------
template <typename ExecSpace>
AXOM_HOST_DEVICE static inline int atomic_increment(int* addr)
{
#ifdef AXOM_USE_RAJA
  using atomic_policy = typename axom::execution_space<ExecSpace>::atomic_policy;

  return RAJA::atomicAdd<atomic_policy>(addr, 1);
#else
  static_assert(std::is_same<ExecSpace, SEQ_EXEC>::value,
                "Only SEQ_EXEC supported without RAJA");

  // TODO: use atomic_ref?
  int old = *addr;
  (*addr)++;
  return old;
#endif  // AXOM_USE_RAJA
}

//------------------------------------------------------------------------------
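// Propagates the leaf AABBs up the tree so that every inner node stores the
// bounding box of its subtree. Each leaf launches a thread that climbs toward
// the root; at every inner node an atomic counter is incremented, the first
// arriving thread terminates (its sibling subtree is not finished yet), and
// the second arriving thread merges both children and continues upward.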
template <typename ExecSpace, typename FloatType, int NDIMS>
void propagate_aabbs(RadixTree<FloatType, NDIMS>& data, int allocatorID)
{
  AXOM_PERF_MARK_FUNCTION("propagate_abbs");

  const int inner_size = data.m_inner_size;
  const int leaf_size = data.m_inner_size + 1;
  SLIC_ASSERT(leaf_size == data.m_size);

  // Pointers and vars are redeclared because I have a faint memory
  // of a huge amount of pain and suffering due to cuda
  // lambda captures of pointers inside a struct. Bad memories
  // of random segfaults ........ be warned
  const int32* lchildren_ptr = data.m_left_children;
  const int32* rchildren_ptr = data.m_right_children;
  const int32* parent_ptr = data.m_parents;
  const primal::BoundingBox<FloatType, NDIMS>* leaf_aabb_ptr = data.m_leaf_aabbs;

  primal::BoundingBox<FloatType, NDIMS>* inner_aabb_ptr = data.m_inner_aabbs;

  int32* counters_ptr = axom::allocate<int32>(inner_size, allocatorID);

  using BoxType = primal::BoundingBox<FloatType, NDIMS>;

  array_memset<ExecSpace>(counters_ptr, inner_size, 0);
  array_memset<ExecSpace>(inner_aabb_ptr, inner_size, BoxType {});

  for_all<ExecSpace>(
    leaf_size,
    AXOM_LAMBDA(int32 i) {
      primal::BoundingBox<FloatType, NDIMS> aabb = leaf_aabb_ptr[i];
      int32 last_node = inner_size + i;
      int32 current_node = parent_ptr[inner_size + i];

      while(current_node != -1)
      {
        // TODO: If RAJA atomics get memory ordering policies in the future,
        // we should look at replacing the sync_load/sync_stores by changing
        // the below atomic to an acquire/release atomic.
        int32 old = atomic_increment<ExecSpace>(&(counters_ptr[current_node]));

        if(old == 0)
        {
          // first thread to get here kills itself
          return;
        }

        int32 lchild = lchildren_ptr[current_node];
        int32 rchild = rchildren_ptr[current_node];

        int32 other_child = (lchild == last_node) ? rchild : lchild;

        primal::BoundingBox<FloatType, NDIMS> other_aabb;
        if(other_child >= inner_size)
        {
          other_aabb = leaf_aabb_ptr[other_child - inner_size];
        }
        else
        {
          other_aabb = sync_load<ExecSpace>(inner_aabb_ptr[other_child]);
        }
        aabb.addBox(other_aabb);

        // Store the final AABB for this internal node coherently.
        sync_store<ExecSpace>(inner_aabb_ptr[current_node], aabb);

        last_node = current_node;
        current_node = parent_ptr[current_node];
      }
    });

  axom::deallocate(counters_ptr);
}

//------------------------------------------------------------------------------
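// Builds a linear BVH radix tree over `size` bounding boxes:
//   1. copy and scale the input boxes into the tree's leaf AABBs
//   2. compute the global bounds and a morton code per leaf centroid
//   3. sort the leaves by morton code
//   4. construct the tree topology (build_tree)
//   5. propagate the leaf AABBs to the inner nodes (propagate_aabbs)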
template <typename ExecSpace, typename BoxIndexable, typename FloatType, int NDIMS>
void build_radix_tree(const BoxIndexable boxes,
                      int size,
                      primal::BoundingBox<FloatType, NDIMS>& bounds,
                      RadixTree<FloatType, NDIMS>& radix_tree,
                      FloatType scale_factor,
                      int allocatorID)
{
  AXOM_PERF_MARK_FUNCTION("build_radix_tree");

  // sanity checks
  SLIC_ASSERT(size > 0);

  radix_tree.allocate(size, allocatorID);

  // copy so we don't reorder the input
  transform_boxes<ExecSpace>(boxes, radix_tree.m_leaf_aabbs, size, scale_factor);

  // evaluate global bounds
  bounds = reduce<ExecSpace>(radix_tree.m_leaf_aabbs, size);

  // sort aabbs based on morton code
  // original positions of the sorted morton codes.
  // allows us to gather / sort other arrays.
  get_mcodes<ExecSpace>(radix_tree.m_leaf_aabbs, size, bounds, radix_tree.m_mcodes);
  sort_mcodes<ExecSpace>(radix_tree.m_mcodes, size, radix_tree.m_leafs);

  reorder<ExecSpace>(radix_tree.m_leafs, radix_tree.m_leaf_aabbs, size, allocatorID);

  build_tree<ExecSpace>(radix_tree);

  propagate_aabbs<ExecSpace>(radix_tree, allocatorID);
}

} /* namespace linear_bvh */
} /* namespace internal */
} /* namespace spin */
} /* namespace axom */

#endif /* AXOM_SPIN_BUILD_RADIX_TREE_H_ */