1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #pragma once
5 
6 #include "../common/builder.h"
7 #include "../../common/algorithms/parallel_reduce.h"
8 
9 namespace embree
10 {
11   namespace isa
12   {
13     struct BVHBuilderMorton
14     {
15       static const size_t MAX_BRANCHING_FACTOR = 8;          //!< maximum supported BVH branching factor
16       static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree of we are that many levels before the maximum tree depth
17 
18       /*! settings for morton builder */
19       struct Settings
20       {
21         /*! default settings */
SettingsBVHBuilderMorton::Settings22         Settings ()
23         : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
24 
25         /*! initialize settings from API settings */
SettingsBVHBuilderMorton::Settings26         Settings (const RTCBuildArguments& settings)
27         : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
28         {
29           if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
30           if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
31           if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
32           if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
33 
34           minLeafSize = min(minLeafSize,maxLeafSize);
35         }
36 
SettingsBVHBuilderMorton::Settings37         Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
38         : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
39         {
40           minLeafSize = min(minLeafSize,maxLeafSize);
41         }
42 
43       public:
44         size_t branchingFactor;  //!< branching factor of BVH to build
45         size_t maxDepth;         //!< maximum depth of BVH to build
46         size_t minLeafSize;      //!< minimum size of a leaf
47         size_t maxLeafSize;      //!< maximum size of a leaf
48         size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
49       };
50 
51       /*! Build primitive consisting of morton code and primitive ID. */
52       struct __aligned(8) BuildPrim
53       {
54         union {
55           struct {
56             unsigned int code;     //!< morton code
57             unsigned int index;    //!< i'th primitive
58           };
59           uint64_t t;
60         };
61 
62         /*! interface for radix sort */
63         __forceinline operator unsigned() const { return code; }
64 
65         /*! interface for standard sort */
66         __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
67       };
68 
69       /*! maps bounding box to morton code */
70       struct MortonCodeMapping
71       {
72         static const size_t LATTICE_BITS_PER_DIM = 10;
73         static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
74 
75         vfloat4 base;
76         vfloat4 scale;
77 
MortonCodeMappingBVHBuilderMorton::MortonCodeMapping78         __forceinline MortonCodeMapping(const BBox3fa& bounds)
79         {
80           base  = (vfloat4)bounds.lower;
81           const vfloat4 diag  = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
82           scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
83         }
84 
binBVHBuilderMorton::MortonCodeMapping85         __forceinline const vint4 bin (const BBox3fa& box) const
86         {
87           const vfloat4 lower = (vfloat4)box.lower;
88           const vfloat4 upper = (vfloat4)box.upper;
89           const vfloat4 centroid = lower+upper;
90           return vint4((centroid-base)*scale);
91         }
92 
codeBVHBuilderMorton::MortonCodeMapping93         __forceinline unsigned int code (const BBox3fa& box) const
94         {
95           const vint4 binID = bin(box);
96           const unsigned int x = extract<0>(binID);
97           const unsigned int y = extract<1>(binID);
98           const unsigned int z = extract<2>(binID);
99           const unsigned int xyz = bitInterleave(x,y,z);
100           return xyz;
101         }
102       };
103 
104 #if defined (__AVX2__)
105 
106       /*! for AVX2 there is a fast scalar bitInterleave */
107       struct MortonCodeGenerator
108       {
MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator109         __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
110           : mapping(mapping), dest(dest) {}
111 
operatorBVHBuilderMorton::MortonCodeGenerator112         __forceinline void operator() (const BBox3fa& b, const unsigned index)
113         {
114           dest->index = index;
115           dest->code = mapping.code(b);
116           dest++;
117         }
118 
119       public:
120         const MortonCodeMapping mapping;
121         BuildPrim* dest;
122         size_t currentID;
123       };
124 
125 #else
126 
127       /*! before AVX2 is it better to use the SSE version of bitInterleave */
128       struct MortonCodeGenerator
129       {
MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator130         __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
131           : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
132 
~MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator133         __forceinline ~MortonCodeGenerator()
134         {
135           if (slots != 0)
136           {
137             const vint4 code = bitInterleave(ax,ay,az);
138             for (size_t i=0; i<slots; i++) {
139               dest[currentID-slots+i].index = ai[i];
140               dest[currentID-slots+i].code = code[i];
141             }
142           }
143         }
144 
operatorBVHBuilderMorton::MortonCodeGenerator145         __forceinline void operator() (const BBox3fa& b, const unsigned index)
146         {
147           const vint4 binID = mapping.bin(b);
148           ax[slots] = extract<0>(binID);
149           ay[slots] = extract<1>(binID);
150           az[slots] = extract<2>(binID);
151           ai[slots] = index;
152           slots++;
153           currentID++;
154 
155           if (slots == 4)
156           {
157             const vint4 code = bitInterleave(ax,ay,az);
158             vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
159             vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
160             slots = 0;
161           }
162         }
163 
164       public:
165         const MortonCodeMapping mapping;
166         BuildPrim* dest;
167         size_t currentID;
168         size_t slots;
169         vint4 ax, ay, az, ai;
170       };
171 
172 #endif
173 
174       template<
175         typename ReductionTy,
176         typename Allocator,
177         typename CreateAllocator,
178         typename CreateNodeFunc,
179         typename SetNodeBoundsFunc,
180         typename CreateLeafFunc,
181         typename CalculateBounds,
182         typename ProgressMonitor>
183 
184         class BuilderT : private Settings
185       {
186         ALIGNED_CLASS_(16);
187 
188       public:
189 
BuilderTBVHBuilderMorton190         BuilderT (CreateAllocator& createAllocator,
191                   CreateNodeFunc& createNode,
192                   SetNodeBoundsFunc& setBounds,
193                   CreateLeafFunc& createLeaf,
194                   CalculateBounds& calculateBounds,
195                   ProgressMonitor& progressMonitor,
196                   const Settings& settings)
197 
198           : Settings(settings),
199           createAllocator(createAllocator),
200           createNode(createNode),
201           setBounds(setBounds),
202           createLeaf(createLeaf),
203           calculateBounds(calculateBounds),
204           progressMonitor(progressMonitor),
205           morton(nullptr) {}
206 
createLargeLeafBVHBuilderMorton207         ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
208         {
209           /* this should never occur but is a fatal error */
210           if (depth > maxDepth)
211             throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
212 
213           /* create leaf for few primitives */
214           if (current.size() <= maxLeafSize)
215             return createLeaf(current,alloc);
216 
217           /* fill all children by always splitting the largest one */
218           range<unsigned> children[MAX_BRANCHING_FACTOR];
219           size_t numChildren = 1;
220           children[0] = current;
221 
222           do {
223 
224             /* find best child with largest number of primitives */
225             size_t bestChild = -1;
226             size_t bestSize = 0;
227             for (size_t i=0; i<numChildren; i++)
228             {
229               /* ignore leaves as they cannot get split */
230               if (children[i].size() <= maxLeafSize)
231                 continue;
232 
233               /* remember child with largest size */
234               if (children[i].size() > bestSize) {
235                 bestSize = children[i].size();
236                 bestChild = i;
237               }
238             }
239             if (bestChild == size_t(-1)) break;
240 
241             /*! split best child into left and right child */
242             auto split = children[bestChild].split();
243 
244             /* add new children left and right */
245             children[bestChild] = children[numChildren-1];
246             children[numChildren-1] = split.first;
247             children[numChildren+0] = split.second;
248             numChildren++;
249 
250           } while (numChildren < branchingFactor);
251 
252           /* create node */
253           auto node = createNode(alloc,numChildren);
254 
255           /* recurse into each child */
256           ReductionTy bounds[MAX_BRANCHING_FACTOR];
257           for (size_t i=0; i<numChildren; i++)
258             bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
259 
260           return setBounds(node,bounds,numChildren);
261         }
262 
263         /*! recreates morton codes when reaching a region where all codes are identical */
recreateMortonCodesBVHBuilderMorton264         __noinline void recreateMortonCodes(const range<unsigned>& current) const
265         {
266           /* fast path for small ranges */
267           if (likely(current.size() < 1024))
268           {
269             /*! recalculate centroid bounds */
270             BBox3fa centBounds(empty);
271             for (size_t i=current.begin(); i<current.end(); i++)
272               centBounds.extend(center2(calculateBounds(morton[i])));
273 
274             /* recalculate morton codes */
275             MortonCodeMapping mapping(centBounds);
276             for (size_t i=current.begin(); i<current.end(); i++)
277               morton[i].code = mapping.code(calculateBounds(morton[i]));
278 
279             /* sort morton codes */
280             std::sort(morton+current.begin(),morton+current.end());
281           }
282           else
283           {
284             /*! recalculate centroid bounds */
285             auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
286               BBox3fa centBounds = empty;
287               for (size_t i=r.begin(); i<r.end(); i++)
288                 centBounds.extend(center2(calculateBounds(morton[i])));
289               return centBounds;
290             };
291             const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
292                                                        BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
293 
294             /* recalculate morton codes */
295             MortonCodeMapping mapping(centBounds);
296             parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
297                 for (size_t i=r.begin(); i<r.end(); i++) {
298                   morton[i].code = mapping.code(calculateBounds(morton[i]));
299                 }
300               });
301 
302             /*! sort morton codes */
303 #if defined(TASKING_TBB)
304             tbb::parallel_sort(morton+current.begin(),morton+current.end());
305 #else
306             radixsort32(morton+current.begin(),current.size());
307 #endif
308           }
309         }
310 
splitBVHBuilderMorton311         __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
312         {
313           const unsigned int code_start = morton[current.begin()].code;
314           const unsigned int code_end   = morton[current.end()-1].code;
315           unsigned int bitpos = lzcnt(code_start^code_end);
316 
317           /* if all items mapped to same morton code, then re-create new morton codes for the items */
318           if (unlikely(bitpos == 32))
319           {
320             recreateMortonCodes(current);
321             const unsigned int code_start = morton[current.begin()].code;
322             const unsigned int code_end   = morton[current.end()-1].code;
323             bitpos = lzcnt(code_start^code_end);
324 
325             /* if the morton code is still the same, goto fall back split */
326             if (unlikely(bitpos == 32)) {
327               current.split(left,right);
328               return;
329             }
330           }
331 
332           /* split the items at the topmost different morton code bit */
333           const unsigned int bitpos_diff = 31-bitpos;
334           const unsigned int bitmask = 1 << bitpos_diff;
335 
336           /* find location where bit differs using binary search */
337           unsigned begin = current.begin();
338           unsigned end   = current.end();
339           while (begin + 1 != end) {
340             const unsigned mid = (begin+end)/2;
341             const unsigned bit = morton[mid].code & bitmask;
342             if (bit == 0) begin = mid; else end = mid;
343           }
344           unsigned center = end;
345 #if defined(DEBUG)
346           for (unsigned int i=begin;  i<center; i++) assert((morton[i].code & bitmask) == 0);
347           for (unsigned int i=center; i<end;    i++) assert((morton[i].code & bitmask) == bitmask);
348 #endif
349 
350           left = make_range(current.begin(),center);
351           right = make_range(center,current.end());
352         }
353 
recurseBVHBuilderMorton354         ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
355         {
356           /* get thread local allocator */
357           if (!alloc)
358             alloc = createAllocator();
359 
360           /* call memory monitor function to signal progress */
361           if (toplevel && current.size() <= singleThreadThreshold)
362             progressMonitor(current.size());
363 
364           /* create leaf node */
365           if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
366             return createLargeLeaf(depth,current,alloc);
367 
368           /* fill all children by always splitting the one with the largest surface area */
369           range<unsigned> children[MAX_BRANCHING_FACTOR];
370           split(current,children[0],children[1]);
371           size_t numChildren = 2;
372 
373           while (numChildren < branchingFactor)
374           {
375             /* find best child with largest number of primitives */
376             int bestChild = -1;
377             unsigned bestItems = 0;
378             for (unsigned int i=0; i<numChildren; i++)
379             {
380               /* ignore leaves as they cannot get split */
381               if (children[i].size() <= minLeafSize)
382                 continue;
383 
384               /* remember child with largest area */
385               if (children[i].size() > bestItems) {
386                 bestItems = children[i].size();
387                 bestChild = i;
388               }
389             }
390             if (bestChild == -1) break;
391 
392             /*! split best child into left and right child */
393             range<unsigned> left, right;
394             split(children[bestChild],left,right);
395 
396             /* add new children left and right */
397             children[bestChild] = children[numChildren-1];
398             children[numChildren-1] = left;
399             children[numChildren+0] = right;
400             numChildren++;
401           }
402 
403           /* create leaf node if no split is possible */
404           if (unlikely(numChildren == 1))
405             return createLeaf(current,alloc);
406 
407           /* allocate node */
408           auto node = createNode(alloc,numChildren);
409 
410           /* process top parts of tree parallel */
411           ReductionTy bounds[MAX_BRANCHING_FACTOR];
412           if (current.size() > singleThreadThreshold)
413           {
414             /*! parallel_for is faster than spawing sub-tasks */
415             parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
416                 for (size_t i=r.begin(); i<r.end(); i++) {
417                   bounds[i] = recurse(depth+1,children[i],nullptr,true);
418                   _mm_mfence(); // to allow non-temporal stores during build
419                 }
420               });
421           }
422 
423           /* finish tree sequentially */
424           else
425           {
426             for (size_t i=0; i<numChildren; i++)
427               bounds[i] = recurse(depth+1,children[i],alloc,false);
428           }
429 
430           return setBounds(node,bounds,numChildren);
431         }
432 
433         /* build function */
buildBVHBuilderMorton434         ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
435         {
436           /* sort morton codes */
437           morton = src;
438           radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
439 
440           /* build BVH */
441           const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
442           _mm_mfence(); // to allow non-temporal stores during build
443           return root;
444         }
445 
446       public:
447         CreateAllocator& createAllocator;
448         CreateNodeFunc& createNode;
449         SetNodeBoundsFunc& setBounds;
450         CreateLeafFunc& createLeaf;
451         CalculateBounds& calculateBounds;
452         ProgressMonitor& progressMonitor;
453 
454       public:
455         BuildPrim* morton;
456       };
457 
458 
459       template<
460       typename ReductionTy,
461         typename CreateAllocFunc,
462         typename CreateNodeFunc,
463         typename SetBoundsFunc,
464         typename CreateLeafFunc,
465         typename CalculateBoundsFunc,
466         typename ProgressMonitor>
467 
buildBVHBuilderMorton468         static ReductionTy build(CreateAllocFunc createAllocator,
469                                  CreateNodeFunc createNode,
470                                  SetBoundsFunc setBounds,
471                                  CreateLeafFunc createLeaf,
472                                  CalculateBoundsFunc calculateBounds,
473                                  ProgressMonitor progressMonitor,
474                                  BuildPrim* src,
475                                  BuildPrim* tmp,
476                                  size_t numPrimitives,
477                                  const Settings& settings)
478         {
479           typedef BuilderT<
480             ReductionTy,
481             decltype(createAllocator()),
482             CreateAllocFunc,
483             CreateNodeFunc,
484             SetBoundsFunc,
485             CreateLeafFunc,
486             CalculateBoundsFunc,
487             ProgressMonitor> Builder;
488 
489           Builder builder(createAllocator,
490                           createNode,
491                           setBounds,
492                           createLeaf,
493                           calculateBounds,
494                           progressMonitor,
495                           settings);
496 
497           return builder.build(src,tmp,numPrimitives);
498         }
499     };
500   }
501 }
502