// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>

namespace embree
{
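  /* in-place insertion sort into ascending order; O(n^2), intended for short arrays */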
  template<class T>
    __forceinline void insertionsort_ascending(T* __restrict__ array, const size_t length)
  {
    for (size_t i=1; i<length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

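  /* in-place insertion sort into descending order; O(n^2), intended for short arrays */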
  template<class T>
    __forceinline void insertionsort_decending(T* __restrict__ array, const size_t length)
  {
    for (size_t i=1; i<length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

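  /* recursive quicksort into ascending order, using Hoare partitioning with t[begin] as pivot */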
  template<class T>
    void quicksort_ascending(T* __restrict__ t,
                             const ssize_t begin,
                             const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end   + 1;

      while (1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      /* keep ssize_t here to avoid truncating the partition point for very large arrays */
      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }

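  /* recursive quicksort into descending order, using Hoare partitioning with t[begin] as pivot */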
  template<class T>
    void quicksort_decending(T* __restrict__ t,
                             const ssize_t begin,
                             const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end   + 1;

      while (1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      /* keep ssize_t here to avoid truncating the partition point for very large arrays */
      const ssize_t pivot = right;
      quicksort_decending(t, begin, pivot);
      quicksort_decending(t, pivot + 1, end);
    }
  }

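  /* hybrid quicksort into ascending order: ranges of at most THRESHOLD elements
     are finished with insertion sort to avoid recursion overhead on small inputs */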
  template<class T, ssize_t THRESHOLD>
    void quicksort_insertionsort_ascending(T* __restrict__ t,
                                           const ssize_t begin,
                                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin], size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end   + 1;

        while (1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

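  /* hybrid quicksort into descending order: ranges of at most THRESHOLD elements
     are finished with insertion sort to avoid recursion overhead on small inputs */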
  template<class T, ssize_t THRESHOLD>
    void quicksort_insertionsort_decending(T* __restrict__ t,
                                           const ssize_t begin,
                                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_decending<T>(&t[begin], size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end   + 1;

        while (1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

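  /* single threaded, in-place MSB radix sort: processes 8 key bits per pass,
     most significant byte first, then recurses into each bucket, switching to
     insertion sort once a bucket gets small; T must be convertible to unsigned int */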
  template<typename T>
    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0; i<BUCKETS; i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0; i<num; i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0; i<BUCKETS; i++)
    {
      /* process bucket */
      while (head[i] < tail[i])
      {
        T v = morton[head[i]];
        while (1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v, morton[head[b]++]);
        }
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }
    if (shift == 0) return;

    size_t offset = 0;
    for (size_t i=0; i<BUCKETS; i++)
      if (count[i])
      {
        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }

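  /* parallel LSB radix sort: Ty is the element type and Key the unsigned integer
     type its sort key converts to; sorts N elements of src into ascending order,
     using tmp as scratch space of equal size and per-thread digit histograms */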
  template<typename Ty, typename Key>
    class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
      static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single threaded sort for small N */
      if (N<=blockSize) // handles also special case of 0!
      {
        /* sort in place inside src, which also holds the final result */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }

  private:

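    /* counting pass: each thread computes a histogram of the current 8-bit
       digit over its slice [startID,endID) of the source array */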
    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* count how many items go into the buckets */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over src array and count buckets */
      unsigned int* __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }

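    /* scatter pass: prefix sums over all thread histograms yield each thread's
       write offset per bucket; items are then copied to their bucket in dst */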
    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket; vectorized
         equivalent of: for (size_t i=0; i<BUCKETS; i++) total[i] = 0; */
      __aligned(64) unsigned int total[BUCKETS];
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      /* vectorized equivalent of:
         for (size_t j=0; j<BUCKETS; j++) total[j] += radixCount[i][j]; */
      for (size_t i=0; i<threadCount; i++)
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread; vectorized
         equivalent of: for (size_t j=0; j<BUCKETS; j++) offset[j] += radixCount[i][j]; */
      for (size_t i=0; i<threadIndex; i++)
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }

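    /* one radix iteration: a counting pass followed by a scatter pass over all
       tasks; the shared affinity partitioner replays the same task-to-thread
       mapping in both passes so the per-thread histograms stay cache-local */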
    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }

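    /* full sort: 4 iterations for 32-bit keys, 8 for 64-bit keys; the even
       number of ping-pong passes between src and tmp leaves the result in src */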
    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };

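  /* convenience entry points: sort N elements of src ascending by Key, using tmp
     as scratch space of at least N elements; inputs of up to blockSize elements
     are sorted with std::sort on the calling thread.

     usage sketch (the buffers `keys` and `scratch` are hypothetical, not part
     of this header):

       std::vector<uint32_t> keys = ...;            // values to sort
       std::vector<uint32_t> scratch(keys.size());  // scratch buffer, same size
       radix_sort_u32(keys.data(), scratch.data(), keys.size());
       // keys is now sorted in ascending order
  */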
  template<typename Ty>
    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
}