1 // Copyright 2009-2021 Intel Corporation 2 // SPDX-License-Identifier: Apache-2.0 3 4 #pragma once 5 6 #include "../simd/simd.h" 7 #include "parallel_for.h" 8 #include <algorithm> 9 10 namespace embree 11 { 12 template<class T> insertionsort_ascending(T * __restrict__ array,const size_t length)13 __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) 14 { 15 for(size_t i = 1;i<length;++i) 16 { 17 T v = array[i]; 18 size_t j = i; 19 while(j > 0 && v < array[j-1]) 20 { 21 array[j] = array[j-1]; 22 --j; 23 } 24 array[j] = v; 25 } 26 } 27 28 template<class T> insertionsort_decending(T * __restrict__ array,const size_t length)29 __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) 30 { 31 for(size_t i = 1;i<length;++i) 32 { 33 T v = array[i]; 34 size_t j = i; 35 while(j > 0 && v > array[j-1]) 36 { 37 array[j] = array[j-1]; 38 --j; 39 } 40 array[j] = v; 41 } 42 } 43 44 template<class T> quicksort_ascending(T * __restrict__ t,const ssize_t begin,const ssize_t end)45 void quicksort_ascending(T *__restrict__ t, 46 const ssize_t begin, 47 const ssize_t end) 48 { 49 if (likely(begin < end)) 50 { 51 const T pivotvalue = t[begin]; 52 ssize_t left = begin - 1; 53 ssize_t right = end + 1; 54 55 while(1) 56 { 57 while (t[--right] > pivotvalue); 58 while (t[++left] < pivotvalue); 59 60 if (left >= right) break; 61 62 const T temp = t[right]; 63 t[right] = t[left]; 64 t[left] = temp; 65 } 66 67 const int pivot = right; 68 quicksort_ascending(t, begin, pivot); 69 quicksort_ascending(t, pivot + 1, end); 70 } 71 } 72 73 template<class T> quicksort_decending(T * __restrict__ t,const ssize_t begin,const ssize_t end)74 void quicksort_decending(T *__restrict__ t, 75 const ssize_t begin, 76 const ssize_t end) 77 { 78 if (likely(begin < end)) 79 { 80 const T pivotvalue = t[begin]; 81 ssize_t left = begin - 1; 82 ssize_t right = end + 1; 83 84 while(1) 85 { 86 while (t[--right] < pivotvalue); 87 while (t[++left] > pivotvalue); 88 89 if (left >= right) break; 90 91 const T temp = t[right]; 92 t[right] = t[left]; 93 t[left] = temp; 94 } 95 96 const int pivot = right; 97 quicksort_decending(t, begin, pivot); 98 quicksort_decending(t, pivot + 1, end); 99 } 100 } 101 102 103 template<class T, ssize_t THRESHOLD> quicksort_insertionsort_ascending(T * __restrict__ t,const ssize_t begin,const ssize_t end)104 void quicksort_insertionsort_ascending(T *__restrict__ t, 105 const ssize_t begin, 106 const ssize_t end) 107 { 108 if (likely(begin < end)) 109 { 110 const ssize_t size = end-begin+1; 111 if (likely(size <= THRESHOLD)) 112 { 113 insertionsort_ascending<T>(&t[begin],size); 114 } 115 else 116 { 117 const T pivotvalue = t[begin]; 118 ssize_t left = begin - 1; 119 ssize_t right = end + 1; 120 121 while(1) 122 { 123 while (t[--right] > pivotvalue); 124 while (t[++left] < pivotvalue); 125 126 if (left >= right) break; 127 128 const T temp = t[right]; 129 t[right] = t[left]; 130 t[left] = temp; 131 } 132 133 const ssize_t pivot = right; 134 quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); 135 quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); 136 } 137 } 138 } 139 140 141 template<class T, ssize_t THRESHOLD> quicksort_insertionsort_decending(T * __restrict__ t,const ssize_t begin,const ssize_t end)142 void quicksort_insertionsort_decending(T *__restrict__ t, 143 const ssize_t begin, 144 const ssize_t end) 145 { 146 if (likely(begin < end)) 147 { 148 const ssize_t size = end-begin+1; 149 if (likely(size <= THRESHOLD)) 150 { 151 insertionsort_decending<T>(&t[begin],size); 152 } 153 else 154 { 155 156 const T pivotvalue = t[begin]; 157 ssize_t left = begin - 1; 158 ssize_t right = end + 1; 159 160 while(1) 161 { 162 while (t[--right] < pivotvalue); 163 while (t[++left] > pivotvalue); 164 165 if (left >= right) break; 166 167 const T temp = t[right]; 168 t[right] = t[left]; 169 t[left] = temp; 170 } 171 172 const ssize_t pivot = right; 173 quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); 174 quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); 175 } 176 } 177 } 178 179 template<typename T> 180 static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) 181 { 182 static const unsigned int BITS = 8; 183 static const unsigned int BUCKETS = (1 << BITS); 184 static const unsigned int CMP_SORT_THRESHOLD = 16; 185 186 __aligned(64) unsigned int count[BUCKETS]; 187 188 /* clear buckets */ 189 for (size_t i=0;i<BUCKETS;i++) count[i] = 0; 190 191 /* count buckets */ 192 #if defined(__INTEL_COMPILER) 193 #pragma nounroll 194 #endif 195 for (size_t i=0;i<num;i++) 196 count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; 197 198 /* prefix sums */ 199 __aligned(64) unsigned int head[BUCKETS]; 200 __aligned(64) unsigned int tail[BUCKETS]; 201 202 head[0] = 0; 203 for (size_t i=1; i<BUCKETS; i++) 204 head[i] = head[i-1] + count[i-1]; 205 206 for (size_t i=0; i<BUCKETS-1; i++) 207 tail[i] = head[i+1]; 208 209 tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; 210 211 assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]); 212 assert(tail[BUCKETS-1] == num); 213 214 /* in-place swap */ 215 for (size_t i=0;i<BUCKETS;i++) 216 { 217 /* process bucket */ 218 while(head[i] < tail[i]) 219 { 220 T v = morton[head[i]]; 221 while(1) 222 { 223 const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); 224 if (b == i) break; 225 std::swap(v,morton[head[b]++]); 226 } 227 assert((unsigned(v) >> shift & (BUCKETS-1)) == i); 228 morton[head[i]++] = v; 229 } 230 } 231 if (shift == 0) return; 232 233 size_t offset = 0; 234 for (size_t i=0;i<BUCKETS;i++) 235 if (count[i]) 236 { 237 238 for (size_t j=offset;j<offset+count[i]-1;j++) 239 assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); 240 241 if (unlikely(count[i] < CMP_SORT_THRESHOLD)) 242 insertionsort_ascending(morton + offset, count[i]); 243 else 244 radixsort32(morton + offset, count[i], shift-BITS); 245 246 for (size_t j=offset;j<offset+count[i]-1;j++) 247 assert(morton[j] <= morton[j+1]); 248 249 offset += count[i]; 250 } 251 } 252 253 template<typename Ty, typename Key> 254 class ParallelRadixSort 255 { 256 static const size_t MAX_TASKS = 64; 257 static const size_t BITS = 8; 258 static const size_t BUCKETS = (1 << BITS); 259 typedef unsigned int TyRadixCount[BUCKETS]; 260 261 template<typename T> compare(const T & v0,const T & v1)262 static bool compare(const T& v0, const T& v1) { 263 return (Key)v0 < (Key)v1; 264 } 265 266 private: 267 ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement 268 ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement 269 270 271 public: ParallelRadixSort(Ty * const src,Ty * const tmp,const size_t N)272 ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) 273 : radixCount(nullptr), src(src), tmp(tmp), N(N) {} 274 sort(const size_t blockSize)275 void sort(const size_t blockSize) 276 { 277 assert(blockSize > 0); 278 279 /* perform single threaded sort for small N */ 280 if (N<=blockSize) // handles also special case of 0! 281 { 282 /* do inplace sort inside destination array */ 283 std::sort(src,src+N,compare<Ty>); 284 } 285 286 /* perform parallel sort for large N */ 287 else 288 { 289 const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); 290 tbbRadixSort(numThreads); 291 } 292 } 293 ~ParallelRadixSort()294 ~ParallelRadixSort() 295 { 296 alignedFree(radixCount); 297 radixCount = nullptr; 298 } 299 300 private: 301 tbbRadixIteration0(const Key shift,const Ty * __restrict const src,Ty * __restrict const dst,const size_t threadIndex,const size_t threadCount)302 void tbbRadixIteration0(const Key shift, 303 const Ty* __restrict const src, 304 Ty* __restrict const dst, 305 const size_t threadIndex, const size_t threadCount) 306 { 307 const size_t startID = (threadIndex+0)*N/threadCount; 308 const size_t endID = (threadIndex+1)*N/threadCount; 309 310 /* mask to extract some number of bits */ 311 const Key mask = BUCKETS-1; 312 313 /* count how many items go into the buckets */ 314 for (size_t i=0; i<BUCKETS; i++) 315 radixCount[threadIndex][i] = 0; 316 317 /* iterate over src array and count buckets */ 318 unsigned int * __restrict const count = radixCount[threadIndex]; 319 #if defined(__INTEL_COMPILER) 320 #pragma nounroll 321 #endif 322 for (size_t i=startID; i<endID; i++) { 323 #if defined(__64BIT__) 324 const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; 325 #else 326 const Key index = ((Key)src[i] >> shift) & mask; 327 #endif 328 count[index]++; 329 } 330 } 331 tbbRadixIteration1(const Key shift,const Ty * __restrict const src,Ty * __restrict const dst,const size_t threadIndex,const size_t threadCount)332 void tbbRadixIteration1(const Key shift, 333 const Ty* __restrict const src, 334 Ty* __restrict const dst, 335 const size_t threadIndex, const size_t threadCount) 336 { 337 const size_t startID = (threadIndex+0)*N/threadCount; 338 const size_t endID = (threadIndex+1)*N/threadCount; 339 340 /* mask to extract some number of bits */ 341 const Key mask = BUCKETS-1; 342 343 /* calculate total number of items for each bucket */ 344 __aligned(64) unsigned int total[BUCKETS]; 345 /* 346 for (size_t i=0; i<BUCKETS; i++) 347 total[i] = 0; 348 */ 349 for (size_t i=0; i<BUCKETS; i+=VSIZEX) 350 vintx::store(&total[i], zero); 351 352 for (size_t i=0; i<threadCount; i++) 353 { 354 /* 355 for (size_t j=0; j<BUCKETS; j++) 356 total[j] += radixCount[i][j]; 357 */ 358 for (size_t j=0; j<BUCKETS; j+=VSIZEX) 359 vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j])); 360 } 361 362 /* calculate start offset of each bucket */ 363 __aligned(64) unsigned int offset[BUCKETS]; 364 offset[0] = 0; 365 for (size_t i=1; i<BUCKETS; i++) 366 offset[i] = offset[i-1] + total[i-1]; 367 368 /* calculate start offset of each bucket for this thread */ 369 for (size_t i=0; i<threadIndex; i++) 370 { 371 /* 372 for (size_t j=0; j<BUCKETS; j++) 373 offset[j] += radixCount[i][j]; 374 */ 375 for (size_t j=0; j<BUCKETS; j+=VSIZEX) 376 vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j])); 377 } 378 379 /* copy items into their buckets */ 380 #if defined(__INTEL_COMPILER) 381 #pragma nounroll 382 #endif 383 for (size_t i=startID; i<endID; i++) { 384 const Ty elt = src[i]; 385 #if defined(__64BIT__) 386 const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; 387 #else 388 const size_t index = ((Key)src[i] >> shift) & mask; 389 #endif 390 dst[offset[index]++] = elt; 391 } 392 } 393 tbbRadixIteration(const Key shift,const bool last,const Ty * __restrict src,Ty * __restrict dst,const size_t numTasks)394 void tbbRadixIteration(const Key shift, const bool last, 395 const Ty* __restrict src, Ty* __restrict dst, 396 const size_t numTasks) 397 { 398 affinity_partitioner ap; 399 parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); 400 parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); 401 } 402 tbbRadixSort(const size_t numTasks)403 void tbbRadixSort(const size_t numTasks) 404 { 405 radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); 406 407 if (sizeof(Key) == sizeof(uint32_t)) { 408 tbbRadixIteration(0*BITS,0,src,tmp,numTasks); 409 tbbRadixIteration(1*BITS,0,tmp,src,numTasks); 410 tbbRadixIteration(2*BITS,0,src,tmp,numTasks); 411 tbbRadixIteration(3*BITS,1,tmp,src,numTasks); 412 } 413 else if (sizeof(Key) == sizeof(uint64_t)) 414 { 415 tbbRadixIteration(0*BITS,0,src,tmp,numTasks); 416 tbbRadixIteration(1*BITS,0,tmp,src,numTasks); 417 tbbRadixIteration(2*BITS,0,src,tmp,numTasks); 418 tbbRadixIteration(3*BITS,0,tmp,src,numTasks); 419 tbbRadixIteration(4*BITS,0,src,tmp,numTasks); 420 tbbRadixIteration(5*BITS,0,tmp,src,numTasks); 421 tbbRadixIteration(6*BITS,0,src,tmp,numTasks); 422 tbbRadixIteration(7*BITS,1,tmp,src,numTasks); 423 } 424 } 425 426 private: 427 TyRadixCount* radixCount; 428 Ty* const src; 429 Ty* const tmp; 430 const size_t N; 431 }; 432 433 template<typename Ty> 434 void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) 435 { 436 ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); 437 } 438 439 template<typename Ty, typename Key> 440 void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) 441 { 442 ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); 443 } 444 445 template<typename Ty> 446 void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { 447 radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); 448 } 449 450 template<typename Ty> 451 void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { 452 radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); 453 } 454 } 455