1 // -------------------------------------------------------------
2 // CUDPP -- CUDA Data Parallel Primitives library
3 // -------------------------------------------------------------
4 // $Revision$
5 // $Date$
6 // -------------------------------------------------------------
7 // This source code is distributed under the terms of license.txt
8 // in the root directory of this source distribution.
9 // -------------------------------------------------------------
10 
11 /**
12  * @file
13  * radixsort_app.cu
14  *
15  * @brief CUDPP application-level radix sorting routines
16  */
17 
18 /** @addtogroup cudpp_app
19  * @{
20  */
21 
22 /** @name RadixSort Functions
23  * @{
24  */
25 
26 
27 #include "cudpp.h"
28 #include "cudpp_util.h"
29 #include "cudpp_radixsort.h"
30 #include "cudpp_scan.h"
31 #include "kernel/radixsort_kernel.cu"
32 
33 #include <cutil.h>
34 #include <cstdlib>
35 #include <cstdio>
36 #include <assert.h>
37 
38 typedef unsigned int uint;
39 
40 /** @brief Perform one step of the radix sort.  Sorts by nbits key bits per step,
41 * starting at startbit.
42 *
43 * Uses cudppScanDispatch() for the prefix sum of radix counters.
44 *
45 * @param[in,out] keys Keys to be sorted.
46 * @param[in,out] values Associated values to be sorted (through keys).
47 * @param[in] plan Configuration information for RadixSort.
48 * @param[in] numElements Number of elements in the sort.
49 **/
50 template<uint nbits, uint startbit, bool flip, bool unflip>
radixSortStep(uint * keys,uint * values,const CUDPPRadixSortPlan * plan,uint numElements)51 void radixSortStep(uint *keys,
52                    uint *values,
53                    const CUDPPRadixSortPlan *plan,
54                    uint numElements)
55 {
56     const uint eltsPerBlock = SORT_CTA_SIZE * 4;
57     const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
58 
59     bool fullBlocks = ((numElements % eltsPerBlock) == 0);
60     uint numBlocks = (fullBlocks) ?
61         (numElements / eltsPerBlock) :
62     (numElements / eltsPerBlock + 1);
63     uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
64         (numElements / eltsPerBlock2) :
65     (numElements / eltsPerBlock2 + 1);
66 
67     bool loop = numBlocks > 65535;
68     uint blocks = loop ? 65535 : numBlocks;
69     uint blocksFind = loop ? 65535 : numBlocks2;
70     uint blocksReorder = loop ? 65535 : numBlocks2;
71 
72     uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];
73 
74     bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
75 
76     if (persist)
77     {
78         loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
79 
80         blocks = numBlocks;
81         blocksFind = numBlocks2;
82         blocksReorder = numBlocks2;
83 
84         // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
85         // on GT200, resulting in better scheduling and lower run times
86         if (startbit > 0)
87         {
88             emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
89         }
90     }
91 
92     if (fullBlocks)
93     {
94         if (loop)
95         {
96             if (persist)
97             {
98                 blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
99                                numCTAs(radixSortBlocks<4, 0, true, false, true>);
100             }
101 
102             radixSortBlocks<nbits, startbit, true, flip, true>
103                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
104                 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
105         }
106         else
107         {
108             radixSortBlocks<nbits, startbit, true, flip, false>
109                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
110                 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
111         }
112     }
113     else
114     {
115         if (loop)
116         {
117             if (persist)
118             {
119                 blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
120                                 numCTAs(radixSortBlocks<4, 0, false, false, true>);
121             }
122 
123             radixSortBlocks<nbits, startbit, false, flip, true>
124                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
125                 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
126         }
127         else
128         {
129             radixSortBlocks<nbits, startbit, false, flip, false>
130                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
131                 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
132         }
133     }
134 
135     CUT_CHECK_ERROR("radixSortBlocks");
136 
137     if (fullBlocks)
138     {
139         if (loop)
140         {
141             if (persist)
142             {
143                 blocksFind = numCTAs(findRadixOffsets<0, true, true>);
144             }
145             findRadixOffsets<startbit, true, true>
146                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
147                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
148         }
149         else
150         {
151             findRadixOffsets<startbit, true, false>
152                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
153                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
154         }
155     }
156     else
157     {
158         if (loop)
159         {
160             if (persist)
161             {
162                 blocksFind = numCTAs(findRadixOffsets<0, false, true>);
163             }
164             findRadixOffsets<startbit, false, true>
165                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
166                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
167         }
168         else
169         {
170             findRadixOffsets<startbit, false, false>
171                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
172                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
173         }
174     }
175 
176     CUT_CHECK_ERROR("findRadixOffsets");
177 
178     cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
179 
180     if (fullBlocks)
181     {
182         if (plan->m_bManualCoalesce)
183         {
184             if (loop)
185             {
186                 if (persist)
187                 {
188                     blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
189                                              numCTAs(reorderData<0, true, true, false, true>);
190                 }
191                 reorderData<startbit, true, true, unflip, true>
192                     <<<blocksReorder, SORT_CTA_SIZE>>>
193                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
194                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
195             }
196             else
197             {
198                 reorderData<startbit, true, true, unflip, false>
199                     <<<blocksReorder, SORT_CTA_SIZE>>>
200                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
201                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
202             }
203         }
204         else
205         {
206             if (loop)
207             {
208                 if (persist)
209                 {
210                     blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
211                                              numCTAs(reorderData<0, true, false, false, true>);
212                 }
213                 reorderData<startbit, true, false, unflip, true>
214                     <<<blocksReorder, SORT_CTA_SIZE>>>
215                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
216                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
217             }
218             else
219             {
220                 reorderData<startbit, true, false, unflip, false>
221                     <<<blocksReorder, SORT_CTA_SIZE>>>
222                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
223                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
224             }
225         }
226     }
227     else
228     {
229         if (plan->m_bManualCoalesce)
230         {
231             if (loop)
232             {
233                 if (persist)
234                 {
235                     blocksReorder = unflip ?
236                         numCTAs(reorderData<0, false, true, true, true>) :
237                         numCTAs(reorderData<0, false, true, false, true>);
238                 }
239                 reorderData<startbit, false, true, unflip, true>
240                     <<<blocksReorder, SORT_CTA_SIZE>>>
241                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
242                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
243             }
244             else
245             {
246                 reorderData<startbit, false, true, unflip, false>
247                     <<<blocksReorder, SORT_CTA_SIZE>>>
248                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
249                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
250             }
251         }
252         else
253         {
254             if (loop)
255             {
256                 if (persist)
257                 {
258                     blocksReorder = unflip ?
259                         numCTAs(reorderData<0, false, false, true, true>) :
260                         numCTAs(reorderData<0, false, false, false, true>);
261                 }
262                 reorderData<startbit, false, false, unflip, true>
263                     <<<blocksReorder, SORT_CTA_SIZE>>>
264                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
265                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
266             }
267             else
268             {
269                 reorderData<startbit, false, false, unflip, false>
270                     <<<blocksReorder, SORT_CTA_SIZE>>>
271                     (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
272                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
273             }
274         }
275     }
276 
277     CUT_CHECK_ERROR("radixSortStep");
278 }
279 
280 /**
281  * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements
282  *
283  * @param[in,out] keys  Keys to be sorted.
284  * @param[in,out] values Associated values to be sorted (through keys).
285  * @param numElements Number of elements in the sort.
286 **/
287 template <bool flip>
radixSortSingleBlock(uint * keys,uint * values,uint numElements)288 void radixSortSingleBlock(uint *keys,
289                           uint *values,
290                           uint numElements)
291 {
292     bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
293     if (fullBlocks)
294     {
295         radixSortBlocks<32, 0, true, flip, false>
296             <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
297                 ((uint4*)keys, (uint4*)values,
298                  (uint4*)keys, (uint4*)values,
299                  numElements, 0);
300     }
301     else
302     {
303         radixSortBlocks<32, 0, false, flip, false>
304             <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
305                 ((uint4*)keys, (uint4*)values,
306                  (uint4*)keys, (uint4*)values,
307                  numElements, 0);
308     }
309 
310     if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
311 
312     CUT_CHECK_ERROR("radixSortSingleBlock");
313 }
314 
315 /**
316  * @brief Main radix sort function
317  *
318  * Main radix sort function.  Sorts in place in the keys and values arrays,
319  * but uses the other device arrays as temporary storage.  All pointer
320  * parameters are device pointers.  Uses cudppScan() for the prefix sum of
321  * radix counters.
322  *
323  * @param[in,out] keys Keys to be sorted.
324  * @param[in,out] values Associated values to be sorted (through keys).
325  * @param[in] plan Configuration information for RadixSort.
326  * @param[in] numElements Number of elements in the sort.
327  * @param[in] flipBits Is set true if key datatype is a float
328  *                 (neg. numbers) for special float sorting operations.
329  * @param[in] keyBits Number of interesting bits in the key
330  **/
radixSort(uint * keys,uint * values,const CUDPPRadixSortPlan * plan,size_t numElements,bool flipBits,int keyBits)331 void radixSort(uint *keys,
332                uint* values,
333                const CUDPPRadixSortPlan *plan,
334                size_t numElements,
335                bool flipBits,
336                int keyBits)
337 {
338     if(numElements <= WARP_SIZE)
339     {
340         if (flipBits)
341             radixSortSingleWarp<true><<<1, numElements>>>
342                 (keys, values, numElements);
343         else
344             radixSortSingleWarp<false><<<1, numElements>>>
345                 (keys, values, numElements);
346 
347         CUT_CHECK_ERROR("radixSortSingleWarp");
348         return;
349     }
350 #ifdef __DEVICE_EMULATION__
351     printf("bits: %d\n", keyBits);
352 #endif
353 
354     if(numElements <= SORT_CTA_SIZE * 4)
355     {
356         if (flipBits)
357             radixSortSingleBlock<true>(keys, values, numElements);
358         else
359             radixSortSingleBlock<false>(keys, values, numElements);
360         return;
361     }
362 
363     // flip float bits on the first pass, unflip on the last pass
364     if (flipBits)
365     {
366         radixSortStep<4,  0, true, false>
367             (keys, values, plan, numElements);
368     }
369     else
370     {
371         radixSortStep<4,  0, false, false>
372             (keys, values, plan, numElements);
373     }
374 
375     if (keyBits > 4)
376     {
377         radixSortStep<4,  4, false, false>
378             (keys, values, plan, numElements);
379     }
380     if (keyBits > 8)
381     {
382         radixSortStep<4,  8, false, false>
383             (keys, values, plan, numElements);
384     }
385     if (keyBits > 12)
386     {
387         radixSortStep<4, 12, false, false>
388             (keys, values, plan, numElements);
389     }
390     if (keyBits > 16)
391     {
392         radixSortStep<4, 16, false, false>
393             (keys, values, plan, numElements);
394     }
395     if (keyBits > 20)
396     {
397         radixSortStep<4, 20, false, false>
398             (keys, values, plan, numElements);
399     }
400     if (keyBits > 24)
401     {
402         radixSortStep<4, 24, false, false>
403             (keys, values, plan, numElements);
404     }
405     if (keyBits > 28)
406     {
407         if (flipBits) // last pass
408         {
409             radixSortStep<4, 28, false, true>
410                 (keys, values, plan, numElements);
411         }
412         else
413         {
414             radixSortStep<4, 28, false, false>
415                 (keys, values, plan, numElements);
416         }
417     }
418 }
419 
420 /**
421  * @brief Wrapper to call main radix sort function. For float configuration.
422  *
423  * Calls the main radix sort function. For float configuration.
424  *
425  * @param[in,out] keys Keys to be sorted.
426  * @param[in,out] values Associated values to be sorted (through keys).
427  * @param[in] plan Configuration information for RadixSort.
428  * @param[in] numElements Number of elements in the sort.
429  * @param[in] negativeKeys Is set true if key datatype has neg. numbers.
430  * @param[in] keyBits Number of interesting bits in the key
431  **/
432 extern "C"
radixSortFloatKeys(float * keys,uint * values,const CUDPPRadixSortPlan * plan,size_t numElements,bool negativeKeys,int keyBits)433 void radixSortFloatKeys(float* keys,
434                         uint* values,
435                         const CUDPPRadixSortPlan *plan,
436                         size_t numElements,
437                         bool  negativeKeys,
438                         int keyBits)
439 {
440 
441     radixSort((uint*)keys, (uint*)values, plan,
442               numElements, negativeKeys, keyBits);
443 }
444 
445 /** @brief Perform one step of the radix sort.  Sorts by nbits key bits per step,
446  * starting at startbit.
447  *
448  * @param[in,out] keys  Keys to be sorted.
449  * @param[in] plan Configuration information for RadixSort.
450  * @param[in] numElements Number of elements in the sort.
451 **/
452 template<uint nbits, uint startbit, bool flip, bool unflip>
radixSortStepKeysOnly(uint * keys,const CUDPPRadixSortPlan * plan,uint numElements)453 void radixSortStepKeysOnly(uint *keys,
454                            const CUDPPRadixSortPlan *plan,
455                            uint numElements)
456 {
457     const uint eltsPerBlock = SORT_CTA_SIZE * 4;
458     const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
459 
460     bool fullBlocks = ((numElements % eltsPerBlock) == 0);
461     uint numBlocks = (fullBlocks) ?
462         (numElements / eltsPerBlock) :
463     (numElements / eltsPerBlock + 1);
464     uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
465         (numElements / eltsPerBlock2) :
466     (numElements / eltsPerBlock2 + 1);
467 
468     bool loop = numBlocks > 65535;
469 
470     uint blocks = loop ? 65535 : numBlocks;
471     uint blocksFind = loop ? 65535 : numBlocks2;
472     uint blocksReorder = loop ? 65535 : numBlocks2;
473 
474     uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];
475 
476     bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
477 
478     if (persist)
479     {
480         loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
481 
482         blocks = numBlocks;
483         blocksFind = numBlocks2;
484         blocksReorder = numBlocks2;
485     }
486 
487     if (fullBlocks)
488     {
489         if (loop)
490         {
491             if (persist)
492             {
493                 blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
494                                 numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
495             }
496 
497             radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
498                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
499                 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
500         }
501         else
502             radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
503                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
504                 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
505     }
506     else
507     {
508         if (loop)
509         {
510             if (persist)
511             {
512                 blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
513                                 numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
514             }
515 
516             radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
517                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
518                 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
519         }
520         else
521             radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
522                 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
523                 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
524 
525     }
526 
527     if (fullBlocks)
528     {
529         if (loop)
530         {
531             if (persist)
532             {
533                 blocksFind = numCTAs(findRadixOffsets<0, true, true>);
534             }
535             findRadixOffsets<startbit, true, true>
536                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
537                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
538         }
539         else
540             findRadixOffsets<startbit, true, false>
541                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
542                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
543     }
544     else
545     {
546         if (loop)
547         {
548             if (persist)
549             {
550                 blocksFind = numCTAs(findRadixOffsets<0, false, true>);
551             }
552             findRadixOffsets<startbit, false, true>
553                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
554                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
555         }
556         else
557             findRadixOffsets<startbit, false, false>
558                 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
559                 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
560 
561     }
562 
563     cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
564 
565     if (fullBlocks)
566     {
567         if (plan->m_bManualCoalesce)
568         {
569             if (loop)
570             {
571                 if (persist)
572                 {
573                     blocksReorder = unflip ?
574                         numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
575                         numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
576                 }
577                 reorderDataKeysOnly<startbit, true, true, unflip, true>
578                     <<<blocksReorder, SORT_CTA_SIZE>>>
579                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
580                     numElements, numBlocks2);
581             }
582             else
583                 reorderDataKeysOnly<startbit, true, true, unflip, false>
584                     <<<blocksReorder, SORT_CTA_SIZE>>>
585                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
586                      numElements, numBlocks2);
587         }
588         else
589         {
590             if (loop)
591             {
592                 if (persist)
593                 {
594                     blocksReorder = unflip ?
595                         numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
596                         numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
597                 }
598                 reorderDataKeysOnly<startbit, true, false, unflip, true>
599                     <<<blocksReorder, SORT_CTA_SIZE>>>
600                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
601                     numElements, numBlocks2);
602             }
603             else
604                 reorderDataKeysOnly<startbit, true, false, unflip, false>
605                     <<<blocksReorder, SORT_CTA_SIZE>>>
606                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
607                      numElements, numBlocks2);
608         }
609     }
610     else
611     {
612         if (plan->m_bManualCoalesce)
613         {
614             if (loop)
615             {
616                 if (persist)
617                 {
618                     blocksReorder = unflip ?
619                         numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
620                         numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
621                 }
622                 reorderDataKeysOnly<startbit, false, true, unflip, true>
623                     <<<blocksReorder, SORT_CTA_SIZE>>>
624                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
625                     numElements, numBlocks2);
626             }
627             else
628                 reorderDataKeysOnly<startbit, false, true, unflip, false>
629                 <<<blocksReorder, SORT_CTA_SIZE>>>
630                 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
631                 numElements, numBlocks2);
632         }
633         else
634         {
635             if (loop)
636             {
637                 if (persist)
638                 {
639                     blocksReorder = unflip ?
640                         numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
641                         numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
642                 }
643                 reorderDataKeysOnly<startbit, false, false, unflip, true>
644                     <<<blocksReorder, SORT_CTA_SIZE>>>
645                     (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
646                     numElements, numBlocks2);
647             }
648             else
649                 reorderDataKeysOnly<startbit, false, false, unflip, false>
650                 <<<blocksReorder, SORT_CTA_SIZE>>>
651                 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
652                 numElements, numBlocks2);
653         }
654     }
655 
656     CUT_CHECK_ERROR("radixSortStepKeysOnly");
657 }
658 
659 /**
660  * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only).
661  *
662  * @param[in,out] keys Keys to be sorted.
663  * @param numElements Number of elements in the sort.
664 **/
665 template <bool flip>
radixSortSingleBlockKeysOnly(uint * keys,uint numElements)666 void radixSortSingleBlockKeysOnly(uint *keys,
667                                   uint numElements)
668 {
669     bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
670     if (fullBlocks)
671     {
672         radixSortBlocksKeysOnly<32, 0, true, flip, false>
673             <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
674             ((uint4*)keys, (uint4*)keys, numElements, 1 );
675     }
676     else
677     {
678         radixSortBlocksKeysOnly<32, 0, false, flip, false>
679             <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
680             ((uint4*)keys, (uint4*)keys, numElements, 1 );
681     }
682 
683     if (flip)
684         unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
685 
686 
687     CUT_CHECK_ERROR("radixSortSingleBlock");
688 }
689 
690 /**
691  * @brief Main radix sort function. For keys only configuration.
692  *
693  * Main radix sort function.  Sorts in place in the keys array,
694  * but uses the other device arrays as temporary storage.  All pointer
695  * parameters are device pointers.  Uses scan for the prefix sum of
696  * radix counters.
697  *
698  * @param[in,out] keys Keys to be sorted.
699  * @param[in] plan Configuration information for RadixSort.
700  * @param[in] flipBits Is set true if key datatype is a float (neg. numbers)
701  *        for special float sorting operations.
702  * @param[in] numElements Number of elements in the sort.
703  * @param[in] keyBits Number of interesting bits in the key
704 **/
705 extern "C"
radixSortKeysOnly(uint * keys,const CUDPPRadixSortPlan * plan,bool flipBits,size_t numElements,int keyBits)706 void radixSortKeysOnly(uint *keys,
707                        const CUDPPRadixSortPlan *plan,
708                        bool flipBits,
709                        size_t numElements,
710                        int keyBits)
711 {
712 
713     if(numElements <= WARP_SIZE)
714     {
715         if (flipBits)
716             radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
717         else
718             radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
719         return;
720     }
721     if(numElements <= SORT_CTA_SIZE * 4)
722     {
723         if (flipBits)
724             radixSortSingleBlockKeysOnly<true>(keys, numElements);
725         else
726             radixSortSingleBlockKeysOnly<false>(keys, numElements);
727         return;
728     }
729 
730     // flip float bits on the first pass, unflip on the last pass
731     if (flipBits)
732     {
733         radixSortStepKeysOnly<4,  0, true, false>(keys, plan, numElements);
734     }
735     else
736     {
737         radixSortStepKeysOnly<4,  0, false, false>(keys, plan, numElements);
738     }
739 
740     if (keyBits > 4)
741     {
742         radixSortStepKeysOnly<4,  4, false, false>(keys, plan, numElements);
743     }
744     if (keyBits > 8)
745     {
746         radixSortStepKeysOnly<4,  8, false, false>(keys, plan, numElements);
747     }
748     if (keyBits > 12)
749     {
750         radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
751     }
752     if (keyBits > 16)
753     {
754         radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
755     }
756     if (keyBits > 20)
757     {
758         radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
759     }
760     if (keyBits > 24)
761     {
762        radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
763     }
764     if (keyBits > 28)
765     {
766         if (flipBits) // last pass
767         {
768             radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
769         }
770         else
771         {
772             radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
773         }
774     }
775 }
776 
777 /**
778  * @brief Wrapper to call main radix sort function. For floats and keys only.
779  *
780  * Calls the radixSortKeysOnly function setting parameters for floats.
781  *
782  * @param[in,out] keys Keys to be sorted.
783  * @param[in] plan Configuration information for RadixSort.
784  * @param[in] negativeKeys Is set true if key flipBits is to be true in
785  *                     radixSortKeysOnly().
786  * @param[in] numElements Number of elements in the sort.
787  * @param[in] keyBits Number of interesting bits in the key
788 **/
789 extern "C"
radixSortFloatKeysOnly(float * keys,const CUDPPRadixSortPlan * plan,bool negativeKeys,size_t numElements,int keyBits)790 void radixSortFloatKeysOnly(float *keys,
791                             const CUDPPRadixSortPlan *plan,
792                             bool  negativeKeys,
793                             size_t numElements,
794                             int keyBits)
795 {
796     radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
797 }
798 
799 extern "C"
initDeviceParameters(CUDPPRadixSortPlan * plan)800 void initDeviceParameters(CUDPPRadixSortPlan *plan)
801 {
802     int deviceID = -1;
803     if (cudaSuccess == cudaGetDevice(&deviceID))
804     {
805         cudaDeviceProp devprop;
806         cudaGetDeviceProperties(&devprop, deviceID);
807 
808         int smVersion = devprop.major * 10 + devprop.minor;
809 
810         // sm_12 and later devices don't need help with coalesce in reorderData kernel
811         plan->m_bManualCoalesce = (smVersion < 12);
812 
813         // sm_20 and later devices are better off not using persistent CTAs
814         plan->m_bUsePersistentCTAs = (smVersion < 20);
815 
816         if (plan->m_bUsePersistentCTAs)
817         {
818             // The following is only true on pre-sm_20 devices (pre-Fermi):
819             // Empirically we have found that for some (usually larger) sort
820             // sizes it is better to use exactly as many "persistent" CTAs
821             // as can fill the GPU, which loop over the "blocks" of work. For smaller
822             // arrays it is better to use the typical CUDA approach of launching one CTA
823             // per block of work.
824             // 0-element of these two-element arrays is for key-value sorts
825             // 1-element is for key-only sorts
826             plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
827             plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288;
828             plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
829             plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0;
830 
831             // create a map of function pointers to register counts for more accurate occupancy calculation
832             // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it
833             // Note we only insert the "loop" version of the kernels (the one with the last template param = true)
834             // Because those are the only ones that require persistent CTAs that maximally fill the device.
835             computeNumCTAs(radixSortBlocks<4, 0, false, false, true>,         4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
836             computeNumCTAs(radixSortBlocks<4, 0, false, true,  true>,         4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
837             computeNumCTAs(radixSortBlocks<4, 0, true, false,  true>,         4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
838             computeNumCTAs(radixSortBlocks<4, 0, true, true,  true>,          4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
839 
840             computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
841             computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
842             computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
843             computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>,   4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
844 
845             computeNumCTAs(findRadixOffsets<0, false, true>,                  3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
846             computeNumCTAs(findRadixOffsets<0, true, true>,                   3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
847 
848             computeNumCTAs(reorderData<0, false, false, false, true>,         0,                                SORT_CTA_SIZE);
849             computeNumCTAs(reorderData<0, false, false, true, true>,          0,                                SORT_CTA_SIZE);
850             computeNumCTAs(reorderData<0, false, true, false, true>,          0,                                SORT_CTA_SIZE);
851             computeNumCTAs(reorderData<0, false, true, true, true>,           0,                                SORT_CTA_SIZE);
852             computeNumCTAs(reorderData<0, true, false, false, true>,          0,                                SORT_CTA_SIZE);
853             computeNumCTAs(reorderData<0, true, false, true, true>,           0,                                SORT_CTA_SIZE);
854             computeNumCTAs(reorderData<0, true, true, false, true>,           0,                                SORT_CTA_SIZE);
855             computeNumCTAs(reorderData<0, true, true, true, true>,            0,                                SORT_CTA_SIZE);
856 
857             computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0,                                SORT_CTA_SIZE);
858             computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>,  0,                                SORT_CTA_SIZE);
859             computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>,  0,                                SORT_CTA_SIZE);
860             computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>,   0,                                SORT_CTA_SIZE);
861             computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>,  0,                                SORT_CTA_SIZE);
862             computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>,   0,                                SORT_CTA_SIZE);
863             computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>,   0,                                SORT_CTA_SIZE);
864             computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>,    0,                                SORT_CTA_SIZE);
865 
866             computeNumCTAs(emptyKernel,                                       0,                                SORT_CTA_SIZE);
867         }
868     }
869 }
870 
871 /**
872  * @brief From the programmer-specified sort configuration,
873  *        creates internal memory for performing the sort.
874  *
875  * @param[in] plan Pointer to CUDPPRadixSortPlan object
876 **/
877 extern "C"
allocRadixSortStorage(CUDPPRadixSortPlan * plan)878 void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
879 {
880 
881     unsigned int numElements = plan->m_numElements;
882 
883     unsigned int numBlocks =
884         ((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
885             (numElements / (SORT_CTA_SIZE * 4)) :
886             (numElements / (SORT_CTA_SIZE * 4) + 1);
887 
888     switch(plan->m_config.datatype)
889     {
890     case CUDPP_UINT:
891         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
892                                   numElements * sizeof(unsigned int)));
893 
894         if (!plan->m_bKeysOnly)
895             CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
896                            numElements * sizeof(unsigned int)));
897 
898         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
899                        WARP_SIZE * numBlocks * sizeof(unsigned int)));
900 
901         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
902                        WARP_SIZE * numBlocks * sizeof(unsigned int)));
903 
904         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
905                        WARP_SIZE * numBlocks * sizeof(unsigned int)));
906     break;
907 
908     case CUDPP_FLOAT:
909         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
910                                    numElements * sizeof(float)));
911 
912         if (!plan->m_bKeysOnly)
913             CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
914                            numElements * sizeof(float)));
915 
916         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
917                        WARP_SIZE * numBlocks * sizeof(float)));
918 
919         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
920                        WARP_SIZE * numBlocks * sizeof(float)));
921 
922         CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
923                        WARP_SIZE * numBlocks * sizeof(float)));
924     break;
925     }
926 
927     initDeviceParameters(plan);
928 }
929 
930 /** @brief Deallocates intermediate memory from allocRadixSortStorage.
931  *
932  *
933  * @param[in] plan Pointer to CUDPPRadixSortPlan object
934 **/
935 extern "C"
freeRadixSortStorage(CUDPPRadixSortPlan * plan)936 void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
937 {
938     CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
939     CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
940     CUDA_SAFE_CALL( cudaFree(plan->m_counters));
941     CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
942     CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
943 }
944 
945 /** @brief Dispatch function to perform a sort on an array with
946  * a specified configuration.
947  *
948  * This is the dispatch routine which calls radixSort...() with
949  * appropriate template parameters and arguments as specified by
950  * the plan.
951  * @param[in,out] keys Keys to be sorted.
952  * @param[in,out] values Associated values to be sorted (through keys).
953  * @param[in] numElements Number of elements in the sort.
954  * @param[in] keyBits Number of interesting bits in the key*
955  * @param[in] plan Configuration information for RadixSort.
956 **/
957 extern "C"
cudppRadixSortDispatch(void * keys,void * values,size_t numElements,int keyBits,const CUDPPRadixSortPlan * plan)958 void cudppRadixSortDispatch(void  *keys,
959                             void  *values,
960                             size_t numElements,
961                             int   keyBits,
962                             const CUDPPRadixSortPlan *plan)
963 {
964     if(plan->m_bKeysOnly)
965     {
966         switch(plan->m_config.datatype)
967         {
968         case CUDPP_UINT:
969             radixSortKeysOnly((uint*)keys, plan, false,
970                               numElements, keyBits);
971             break;
972         case CUDPP_FLOAT:
973             radixSortFloatKeysOnly((float*)keys, plan, true,
974                                     numElements, keyBits);
975         }
976     }
977     else
978     {
979         switch(plan->m_config.datatype)
980         {
981         case CUDPP_UINT:
982             radixSort((uint*)keys, (uint*) values, plan,
983                       numElements, false, keyBits);
984             break;
985         case CUDPP_FLOAT:
986             radixSortFloatKeys((float*)keys, (uint*) values, plan,
987                                numElements, true, keyBits);
988         }
989     }
990 }
991 
992 /** @} */ // end radixsort functions
993 /** @} */ // end cudpp_app
994