1 // Copyright 2009-2021 Intel Corporation 2 // SPDX-License-Identifier: Apache-2.0 3 4 #pragma once 5 6 #include "../common/builder.h" 7 #include "../../common/algorithms/parallel_reduce.h" 8 9 namespace embree 10 { 11 namespace isa 12 { 13 struct BVHBuilderMorton 14 { 15 static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor 16 static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth 17 18 /*! settings for morton builder */ 19 struct Settings 20 { 21 /*! default settings */ SettingsBVHBuilderMorton::Settings22 Settings () 23 : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} 24 25 /*! initialize settings from API settings */ SettingsBVHBuilderMorton::Settings26 Settings (const RTCBuildArguments& settings) 27 : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) 28 { 29 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; 30 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; 31 if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; 32 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; 33 34 minLeafSize = min(minLeafSize,maxLeafSize); 35 } 36 SettingsBVHBuilderMorton::Settings37 Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) 38 : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) 39 { 40 minLeafSize = min(minLeafSize,maxLeafSize); 41 } 42 43 public: 44 size_t branchingFactor; //!< branching factor of BVH to build 45 size_t maxDepth; //!< maximum depth of BVH to build 46 size_t minLeafSize; //!< minimum size of a leaf 47 size_t maxLeafSize; //!< maximum size of a leaf 48 size_t singleThreadThreshold; //!< threshold when we switch to single threaded build 49 }; 50 51 /*! Build primitive consisting of morton code and primitive ID. */ 52 struct __aligned(8) BuildPrim 53 { 54 union { 55 struct { 56 unsigned int code; //!< morton code 57 unsigned int index; //!< i'th primitive 58 }; 59 uint64_t t; 60 }; 61 62 /*! interface for radix sort */ 63 __forceinline operator unsigned() const { return code; } 64 65 /*! interface for standard sort */ 66 __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } 67 }; 68 69 /*! maps bounding box to morton code */ 70 struct MortonCodeMapping 71 { 72 static const size_t LATTICE_BITS_PER_DIM = 10; 73 static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; 74 75 vfloat4 base; 76 vfloat4 scale; 77 MortonCodeMappingBVHBuilderMorton::MortonCodeMapping78 __forceinline MortonCodeMapping(const BBox3fa& bounds) 79 { 80 base = (vfloat4)bounds.lower; 81 const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; 82 scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); 83 } 84 binBVHBuilderMorton::MortonCodeMapping85 __forceinline const vint4 bin (const BBox3fa& box) const 86 { 87 const vfloat4 lower = (vfloat4)box.lower; 88 const vfloat4 upper = (vfloat4)box.upper; 89 const vfloat4 centroid = lower+upper; 90 return vint4((centroid-base)*scale); 91 } 92 codeBVHBuilderMorton::MortonCodeMapping93 __forceinline unsigned int code (const BBox3fa& box) const 94 { 95 const vint4 binID = bin(box); 96 const unsigned int x = extract<0>(binID); 97 const unsigned int y = extract<1>(binID); 98 const unsigned int z = extract<2>(binID); 99 const unsigned int xyz = bitInterleave(x,y,z); 100 return xyz; 101 } 102 }; 103 104 #if defined (__AVX2__) 105 106 /*! for AVX2 there is a fast scalar bitInterleave */ 107 struct MortonCodeGenerator 108 { MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator109 __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) 110 : mapping(mapping), dest(dest) {} 111 operatorBVHBuilderMorton::MortonCodeGenerator112 __forceinline void operator() (const BBox3fa& b, const unsigned index) 113 { 114 dest->index = index; 115 dest->code = mapping.code(b); 116 dest++; 117 } 118 119 public: 120 const MortonCodeMapping mapping; 121 BuildPrim* dest; 122 size_t currentID; 123 }; 124 125 #else 126 127 /*! before AVX2 is it better to use the SSE version of bitInterleave */ 128 struct MortonCodeGenerator 129 { MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator130 __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) 131 : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} 132 ~MortonCodeGeneratorBVHBuilderMorton::MortonCodeGenerator133 __forceinline ~MortonCodeGenerator() 134 { 135 if (slots != 0) 136 { 137 const vint4 code = bitInterleave(ax,ay,az); 138 for (size_t i=0; i<slots; i++) { 139 dest[currentID-slots+i].index = ai[i]; 140 dest[currentID-slots+i].code = code[i]; 141 } 142 } 143 } 144 operatorBVHBuilderMorton::MortonCodeGenerator145 __forceinline void operator() (const BBox3fa& b, const unsigned index) 146 { 147 const vint4 binID = mapping.bin(b); 148 ax[slots] = extract<0>(binID); 149 ay[slots] = extract<1>(binID); 150 az[slots] = extract<2>(binID); 151 ai[slots] = index; 152 slots++; 153 currentID++; 154 155 if (slots == 4) 156 { 157 const vint4 code = bitInterleave(ax,ay,az); 158 vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); 159 vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); 160 slots = 0; 161 } 162 } 163 164 public: 165 const MortonCodeMapping mapping; 166 BuildPrim* dest; 167 size_t currentID; 168 size_t slots; 169 vint4 ax, ay, az, ai; 170 }; 171 172 #endif 173 174 template< 175 typename ReductionTy, 176 typename Allocator, 177 typename CreateAllocator, 178 typename CreateNodeFunc, 179 typename SetNodeBoundsFunc, 180 typename CreateLeafFunc, 181 typename CalculateBounds, 182 typename ProgressMonitor> 183 184 class BuilderT : private Settings 185 { 186 ALIGNED_CLASS_(16); 187 188 public: 189 BuilderTBVHBuilderMorton190 BuilderT (CreateAllocator& createAllocator, 191 CreateNodeFunc& createNode, 192 SetNodeBoundsFunc& setBounds, 193 CreateLeafFunc& createLeaf, 194 CalculateBounds& calculateBounds, 195 ProgressMonitor& progressMonitor, 196 const Settings& settings) 197 198 : Settings(settings), 199 createAllocator(createAllocator), 200 createNode(createNode), 201 setBounds(setBounds), 202 createLeaf(createLeaf), 203 calculateBounds(calculateBounds), 204 progressMonitor(progressMonitor), 205 morton(nullptr) {} 206 createLargeLeafBVHBuilderMorton207 ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) 208 { 209 /* this should never occur but is a fatal error */ 210 if (depth > maxDepth) 211 throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); 212 213 /* create leaf for few primitives */ 214 if (current.size() <= maxLeafSize) 215 return createLeaf(current,alloc); 216 217 /* fill all children by always splitting the largest one */ 218 range<unsigned> children[MAX_BRANCHING_FACTOR]; 219 size_t numChildren = 1; 220 children[0] = current; 221 222 do { 223 224 /* find best child with largest number of primitives */ 225 size_t bestChild = -1; 226 size_t bestSize = 0; 227 for (size_t i=0; i<numChildren; i++) 228 { 229 /* ignore leaves as they cannot get split */ 230 if (children[i].size() <= maxLeafSize) 231 continue; 232 233 /* remember child with largest size */ 234 if (children[i].size() > bestSize) { 235 bestSize = children[i].size(); 236 bestChild = i; 237 } 238 } 239 if (bestChild == size_t(-1)) break; 240 241 /*! split best child into left and right child */ 242 auto split = children[bestChild].split(); 243 244 /* add new children left and right */ 245 children[bestChild] = children[numChildren-1]; 246 children[numChildren-1] = split.first; 247 children[numChildren+0] = split.second; 248 numChildren++; 249 250 } while (numChildren < branchingFactor); 251 252 /* create node */ 253 auto node = createNode(alloc,numChildren); 254 255 /* recurse into each child */ 256 ReductionTy bounds[MAX_BRANCHING_FACTOR]; 257 for (size_t i=0; i<numChildren; i++) 258 bounds[i] = createLargeLeaf(depth+1,children[i],alloc); 259 260 return setBounds(node,bounds,numChildren); 261 } 262 263 /*! recreates morton codes when reaching a region where all codes are identical */ recreateMortonCodesBVHBuilderMorton264 __noinline void recreateMortonCodes(const range<unsigned>& current) const 265 { 266 /* fast path for small ranges */ 267 if (likely(current.size() < 1024)) 268 { 269 /*! recalculate centroid bounds */ 270 BBox3fa centBounds(empty); 271 for (size_t i=current.begin(); i<current.end(); i++) 272 centBounds.extend(center2(calculateBounds(morton[i]))); 273 274 /* recalculate morton codes */ 275 MortonCodeMapping mapping(centBounds); 276 for (size_t i=current.begin(); i<current.end(); i++) 277 morton[i].code = mapping.code(calculateBounds(morton[i])); 278 279 /* sort morton codes */ 280 std::sort(morton+current.begin(),morton+current.end()); 281 } 282 else 283 { 284 /*! recalculate centroid bounds */ 285 auto calculateCentBounds = [&] ( const range<unsigned>& r ) { 286 BBox3fa centBounds = empty; 287 for (size_t i=r.begin(); i<r.end(); i++) 288 centBounds.extend(center2(calculateBounds(morton[i]))); 289 return centBounds; 290 }; 291 const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), 292 BBox3fa(empty), calculateCentBounds, BBox3fa::merge); 293 294 /* recalculate morton codes */ 295 MortonCodeMapping mapping(centBounds); 296 parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { 297 for (size_t i=r.begin(); i<r.end(); i++) { 298 morton[i].code = mapping.code(calculateBounds(morton[i])); 299 } 300 }); 301 302 /*! sort morton codes */ 303 #if defined(TASKING_TBB) 304 tbb::parallel_sort(morton+current.begin(),morton+current.end()); 305 #else 306 radixsort32(morton+current.begin(),current.size()); 307 #endif 308 } 309 } 310 splitBVHBuilderMorton311 __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const 312 { 313 const unsigned int code_start = morton[current.begin()].code; 314 const unsigned int code_end = morton[current.end()-1].code; 315 unsigned int bitpos = lzcnt(code_start^code_end); 316 317 /* if all items mapped to same morton code, then re-create new morton codes for the items */ 318 if (unlikely(bitpos == 32)) 319 { 320 recreateMortonCodes(current); 321 const unsigned int code_start = morton[current.begin()].code; 322 const unsigned int code_end = morton[current.end()-1].code; 323 bitpos = lzcnt(code_start^code_end); 324 325 /* if the morton code is still the same, goto fall back split */ 326 if (unlikely(bitpos == 32)) { 327 current.split(left,right); 328 return; 329 } 330 } 331 332 /* split the items at the topmost different morton code bit */ 333 const unsigned int bitpos_diff = 31-bitpos; 334 const unsigned int bitmask = 1 << bitpos_diff; 335 336 /* find location where bit differs using binary search */ 337 unsigned begin = current.begin(); 338 unsigned end = current.end(); 339 while (begin + 1 != end) { 340 const unsigned mid = (begin+end)/2; 341 const unsigned bit = morton[mid].code & bitmask; 342 if (bit == 0) begin = mid; else end = mid; 343 } 344 unsigned center = end; 345 #if defined(DEBUG) 346 for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); 347 for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); 348 #endif 349 350 left = make_range(current.begin(),center); 351 right = make_range(center,current.end()); 352 } 353 recurseBVHBuilderMorton354 ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) 355 { 356 /* get thread local allocator */ 357 if (!alloc) 358 alloc = createAllocator(); 359 360 /* call memory monitor function to signal progress */ 361 if (toplevel && current.size() <= singleThreadThreshold) 362 progressMonitor(current.size()); 363 364 /* create leaf node */ 365 if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) 366 return createLargeLeaf(depth,current,alloc); 367 368 /* fill all children by always splitting the one with the largest surface area */ 369 range<unsigned> children[MAX_BRANCHING_FACTOR]; 370 split(current,children[0],children[1]); 371 size_t numChildren = 2; 372 373 while (numChildren < branchingFactor) 374 { 375 /* find best child with largest number of primitives */ 376 int bestChild = -1; 377 unsigned bestItems = 0; 378 for (unsigned int i=0; i<numChildren; i++) 379 { 380 /* ignore leaves as they cannot get split */ 381 if (children[i].size() <= minLeafSize) 382 continue; 383 384 /* remember child with largest area */ 385 if (children[i].size() > bestItems) { 386 bestItems = children[i].size(); 387 bestChild = i; 388 } 389 } 390 if (bestChild == -1) break; 391 392 /*! split best child into left and right child */ 393 range<unsigned> left, right; 394 split(children[bestChild],left,right); 395 396 /* add new children left and right */ 397 children[bestChild] = children[numChildren-1]; 398 children[numChildren-1] = left; 399 children[numChildren+0] = right; 400 numChildren++; 401 } 402 403 /* create leaf node if no split is possible */ 404 if (unlikely(numChildren == 1)) 405 return createLeaf(current,alloc); 406 407 /* allocate node */ 408 auto node = createNode(alloc,numChildren); 409 410 /* process top parts of tree parallel */ 411 ReductionTy bounds[MAX_BRANCHING_FACTOR]; 412 if (current.size() > singleThreadThreshold) 413 { 414 /*! parallel_for is faster than spawing sub-tasks */ 415 parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { 416 for (size_t i=r.begin(); i<r.end(); i++) { 417 bounds[i] = recurse(depth+1,children[i],nullptr,true); 418 _mm_mfence(); // to allow non-temporal stores during build 419 } 420 }); 421 } 422 423 /* finish tree sequentially */ 424 else 425 { 426 for (size_t i=0; i<numChildren; i++) 427 bounds[i] = recurse(depth+1,children[i],alloc,false); 428 } 429 430 return setBounds(node,bounds,numChildren); 431 } 432 433 /* build function */ buildBVHBuilderMorton434 ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) 435 { 436 /* sort morton codes */ 437 morton = src; 438 radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); 439 440 /* build BVH */ 441 const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true); 442 _mm_mfence(); // to allow non-temporal stores during build 443 return root; 444 } 445 446 public: 447 CreateAllocator& createAllocator; 448 CreateNodeFunc& createNode; 449 SetNodeBoundsFunc& setBounds; 450 CreateLeafFunc& createLeaf; 451 CalculateBounds& calculateBounds; 452 ProgressMonitor& progressMonitor; 453 454 public: 455 BuildPrim* morton; 456 }; 457 458 459 template< 460 typename ReductionTy, 461 typename CreateAllocFunc, 462 typename CreateNodeFunc, 463 typename SetBoundsFunc, 464 typename CreateLeafFunc, 465 typename CalculateBoundsFunc, 466 typename ProgressMonitor> 467 buildBVHBuilderMorton468 static ReductionTy build(CreateAllocFunc createAllocator, 469 CreateNodeFunc createNode, 470 SetBoundsFunc setBounds, 471 CreateLeafFunc createLeaf, 472 CalculateBoundsFunc calculateBounds, 473 ProgressMonitor progressMonitor, 474 BuildPrim* src, 475 BuildPrim* tmp, 476 size_t numPrimitives, 477 const Settings& settings) 478 { 479 typedef BuilderT< 480 ReductionTy, 481 decltype(createAllocator()), 482 CreateAllocFunc, 483 CreateNodeFunc, 484 SetBoundsFunc, 485 CreateLeafFunc, 486 CalculateBoundsFunc, 487 ProgressMonitor> Builder; 488 489 Builder builder(createAllocator, 490 createNode, 491 setBounds, 492 createLeaf, 493 calculateBounds, 494 progressMonitor, 495 settings); 496 497 return builder.build(src,tmp,numPrimitives); 498 } 499 }; 500 } 501 } 502