1 // -------------------------------------------------------------
2 // CUDPP -- CUDA Data Parallel Primitives library
3 // -------------------------------------------------------------
4 // $Revision$
5 // $Date$
6 // -------------------------------------------------------------
7 // This source code is distributed under the terms of license.txt
8 // in the root directory of this source distribution.
9 // -------------------------------------------------------------
10
11 /**
12 * @file
13 * radixsort_app.cu
14 *
15 * @brief CUDPP application-level radix sorting routines
16 */
17
18 /** @addtogroup cudpp_app
19 * @{
20 */
21
22 /** @name RadixSort Functions
23 * @{
24 */
25
26
27 #include "cudpp.h"
28 #include "cudpp_util.h"
29 #include "cudpp_radixsort.h"
30 #include "cudpp_scan.h"
31 #include "kernel/radixsort_kernel.cu"
32
33 #include <cutil.h>
34 #include <cstdlib>
35 #include <cstdio>
36 #include <assert.h>
37
38 typedef unsigned int uint;
39
40 /** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
41 * starting at startbit.
42 *
43 * Uses cudppScanDispatch() for the prefix sum of radix counters.
44 *
45 * @param[in,out] keys Keys to be sorted.
46 * @param[in,out] values Associated values to be sorted (through keys).
47 * @param[in] plan Configuration information for RadixSort.
48 * @param[in] numElements Number of elements in the sort.
49 **/
50 template<uint nbits, uint startbit, bool flip, bool unflip>
radixSortStep(uint * keys,uint * values,const CUDPPRadixSortPlan * plan,uint numElements)51 void radixSortStep(uint *keys,
52 uint *values,
53 const CUDPPRadixSortPlan *plan,
54 uint numElements)
55 {
56 const uint eltsPerBlock = SORT_CTA_SIZE * 4;
57 const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
58
59 bool fullBlocks = ((numElements % eltsPerBlock) == 0);
60 uint numBlocks = (fullBlocks) ?
61 (numElements / eltsPerBlock) :
62 (numElements / eltsPerBlock + 1);
63 uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
64 (numElements / eltsPerBlock2) :
65 (numElements / eltsPerBlock2 + 1);
66
67 bool loop = numBlocks > 65535;
68 uint blocks = loop ? 65535 : numBlocks;
69 uint blocksFind = loop ? 65535 : numBlocks2;
70 uint blocksReorder = loop ? 65535 : numBlocks2;
71
72 uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];
73
74 bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
75
76 if (persist)
77 {
78 loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
79
80 blocks = numBlocks;
81 blocksFind = numBlocks2;
82 blocksReorder = numBlocks2;
83
84 // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
85 // on GT200, resulting in better scheduling and lower run times
86 if (startbit > 0)
87 {
88 emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
89 }
90 }
91
92 if (fullBlocks)
93 {
94 if (loop)
95 {
96 if (persist)
97 {
98 blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
99 numCTAs(radixSortBlocks<4, 0, true, false, true>);
100 }
101
102 radixSortBlocks<nbits, startbit, true, flip, true>
103 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
104 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
105 }
106 else
107 {
108 radixSortBlocks<nbits, startbit, true, flip, false>
109 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
110 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
111 }
112 }
113 else
114 {
115 if (loop)
116 {
117 if (persist)
118 {
119 blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
120 numCTAs(radixSortBlocks<4, 0, false, false, true>);
121 }
122
123 radixSortBlocks<nbits, startbit, false, flip, true>
124 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
125 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
126 }
127 else
128 {
129 radixSortBlocks<nbits, startbit, false, flip, false>
130 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
131 ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
132 }
133 }
134
135 CUT_CHECK_ERROR("radixSortBlocks");
136
137 if (fullBlocks)
138 {
139 if (loop)
140 {
141 if (persist)
142 {
143 blocksFind = numCTAs(findRadixOffsets<0, true, true>);
144 }
145 findRadixOffsets<startbit, true, true>
146 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
147 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
148 }
149 else
150 {
151 findRadixOffsets<startbit, true, false>
152 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
153 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
154 }
155 }
156 else
157 {
158 if (loop)
159 {
160 if (persist)
161 {
162 blocksFind = numCTAs(findRadixOffsets<0, false, true>);
163 }
164 findRadixOffsets<startbit, false, true>
165 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
166 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
167 }
168 else
169 {
170 findRadixOffsets<startbit, false, false>
171 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
172 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
173 }
174 }
175
176 CUT_CHECK_ERROR("findRadixOffsets");
177
178 cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
179
180 if (fullBlocks)
181 {
182 if (plan->m_bManualCoalesce)
183 {
184 if (loop)
185 {
186 if (persist)
187 {
188 blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
189 numCTAs(reorderData<0, true, true, false, true>);
190 }
191 reorderData<startbit, true, true, unflip, true>
192 <<<blocksReorder, SORT_CTA_SIZE>>>
193 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
194 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
195 }
196 else
197 {
198 reorderData<startbit, true, true, unflip, false>
199 <<<blocksReorder, SORT_CTA_SIZE>>>
200 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
201 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
202 }
203 }
204 else
205 {
206 if (loop)
207 {
208 if (persist)
209 {
210 blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
211 numCTAs(reorderData<0, true, false, false, true>);
212 }
213 reorderData<startbit, true, false, unflip, true>
214 <<<blocksReorder, SORT_CTA_SIZE>>>
215 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
216 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
217 }
218 else
219 {
220 reorderData<startbit, true, false, unflip, false>
221 <<<blocksReorder, SORT_CTA_SIZE>>>
222 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
223 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
224 }
225 }
226 }
227 else
228 {
229 if (plan->m_bManualCoalesce)
230 {
231 if (loop)
232 {
233 if (persist)
234 {
235 blocksReorder = unflip ?
236 numCTAs(reorderData<0, false, true, true, true>) :
237 numCTAs(reorderData<0, false, true, false, true>);
238 }
239 reorderData<startbit, false, true, unflip, true>
240 <<<blocksReorder, SORT_CTA_SIZE>>>
241 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
242 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
243 }
244 else
245 {
246 reorderData<startbit, false, true, unflip, false>
247 <<<blocksReorder, SORT_CTA_SIZE>>>
248 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
249 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
250 }
251 }
252 else
253 {
254 if (loop)
255 {
256 if (persist)
257 {
258 blocksReorder = unflip ?
259 numCTAs(reorderData<0, false, false, true, true>) :
260 numCTAs(reorderData<0, false, false, false, true>);
261 }
262 reorderData<startbit, false, false, unflip, true>
263 <<<blocksReorder, SORT_CTA_SIZE>>>
264 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
265 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
266 }
267 else
268 {
269 reorderData<startbit, false, false, unflip, false>
270 <<<blocksReorder, SORT_CTA_SIZE>>>
271 (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
272 plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
273 }
274 }
275 }
276
277 CUT_CHECK_ERROR("radixSortStep");
278 }
279
280 /**
281 * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements
282 *
283 * @param[in,out] keys Keys to be sorted.
284 * @param[in,out] values Associated values to be sorted (through keys).
285 * @param numElements Number of elements in the sort.
286 **/
287 template <bool flip>
radixSortSingleBlock(uint * keys,uint * values,uint numElements)288 void radixSortSingleBlock(uint *keys,
289 uint *values,
290 uint numElements)
291 {
292 bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
293 if (fullBlocks)
294 {
295 radixSortBlocks<32, 0, true, flip, false>
296 <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
297 ((uint4*)keys, (uint4*)values,
298 (uint4*)keys, (uint4*)values,
299 numElements, 0);
300 }
301 else
302 {
303 radixSortBlocks<32, 0, false, flip, false>
304 <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
305 ((uint4*)keys, (uint4*)values,
306 (uint4*)keys, (uint4*)values,
307 numElements, 0);
308 }
309
310 if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
311
312 CUT_CHECK_ERROR("radixSortSingleBlock");
313 }
314
315 /**
316 * @brief Main radix sort function
317 *
318 * Main radix sort function. Sorts in place in the keys and values arrays,
319 * but uses the other device arrays as temporary storage. All pointer
320 * parameters are device pointers. Uses cudppScan() for the prefix sum of
321 * radix counters.
322 *
323 * @param[in,out] keys Keys to be sorted.
324 * @param[in,out] values Associated values to be sorted (through keys).
325 * @param[in] plan Configuration information for RadixSort.
326 * @param[in] numElements Number of elements in the sort.
327 * @param[in] flipBits Is set true if key datatype is a float
328 * (neg. numbers) for special float sorting operations.
329 * @param[in] keyBits Number of interesting bits in the key
330 **/
radixSort(uint * keys,uint * values,const CUDPPRadixSortPlan * plan,size_t numElements,bool flipBits,int keyBits)331 void radixSort(uint *keys,
332 uint* values,
333 const CUDPPRadixSortPlan *plan,
334 size_t numElements,
335 bool flipBits,
336 int keyBits)
337 {
338 if(numElements <= WARP_SIZE)
339 {
340 if (flipBits)
341 radixSortSingleWarp<true><<<1, numElements>>>
342 (keys, values, numElements);
343 else
344 radixSortSingleWarp<false><<<1, numElements>>>
345 (keys, values, numElements);
346
347 CUT_CHECK_ERROR("radixSortSingleWarp");
348 return;
349 }
350 #ifdef __DEVICE_EMULATION__
351 printf("bits: %d\n", keyBits);
352 #endif
353
354 if(numElements <= SORT_CTA_SIZE * 4)
355 {
356 if (flipBits)
357 radixSortSingleBlock<true>(keys, values, numElements);
358 else
359 radixSortSingleBlock<false>(keys, values, numElements);
360 return;
361 }
362
363 // flip float bits on the first pass, unflip on the last pass
364 if (flipBits)
365 {
366 radixSortStep<4, 0, true, false>
367 (keys, values, plan, numElements);
368 }
369 else
370 {
371 radixSortStep<4, 0, false, false>
372 (keys, values, plan, numElements);
373 }
374
375 if (keyBits > 4)
376 {
377 radixSortStep<4, 4, false, false>
378 (keys, values, plan, numElements);
379 }
380 if (keyBits > 8)
381 {
382 radixSortStep<4, 8, false, false>
383 (keys, values, plan, numElements);
384 }
385 if (keyBits > 12)
386 {
387 radixSortStep<4, 12, false, false>
388 (keys, values, plan, numElements);
389 }
390 if (keyBits > 16)
391 {
392 radixSortStep<4, 16, false, false>
393 (keys, values, plan, numElements);
394 }
395 if (keyBits > 20)
396 {
397 radixSortStep<4, 20, false, false>
398 (keys, values, plan, numElements);
399 }
400 if (keyBits > 24)
401 {
402 radixSortStep<4, 24, false, false>
403 (keys, values, plan, numElements);
404 }
405 if (keyBits > 28)
406 {
407 if (flipBits) // last pass
408 {
409 radixSortStep<4, 28, false, true>
410 (keys, values, plan, numElements);
411 }
412 else
413 {
414 radixSortStep<4, 28, false, false>
415 (keys, values, plan, numElements);
416 }
417 }
418 }
419
420 /**
421 * @brief Wrapper to call main radix sort function. For float configuration.
422 *
423 * Calls the main radix sort function. For float configuration.
424 *
425 * @param[in,out] keys Keys to be sorted.
426 * @param[in,out] values Associated values to be sorted (through keys).
427 * @param[in] plan Configuration information for RadixSort.
428 * @param[in] numElements Number of elements in the sort.
429 * @param[in] negativeKeys Is set true if key datatype has neg. numbers.
430 * @param[in] keyBits Number of interesting bits in the key
431 **/
432 extern "C"
radixSortFloatKeys(float * keys,uint * values,const CUDPPRadixSortPlan * plan,size_t numElements,bool negativeKeys,int keyBits)433 void radixSortFloatKeys(float* keys,
434 uint* values,
435 const CUDPPRadixSortPlan *plan,
436 size_t numElements,
437 bool negativeKeys,
438 int keyBits)
439 {
440
441 radixSort((uint*)keys, (uint*)values, plan,
442 numElements, negativeKeys, keyBits);
443 }
444
445 /** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
446 * starting at startbit.
447 *
448 * @param[in,out] keys Keys to be sorted.
449 * @param[in] plan Configuration information for RadixSort.
450 * @param[in] numElements Number of elements in the sort.
451 **/
452 template<uint nbits, uint startbit, bool flip, bool unflip>
radixSortStepKeysOnly(uint * keys,const CUDPPRadixSortPlan * plan,uint numElements)453 void radixSortStepKeysOnly(uint *keys,
454 const CUDPPRadixSortPlan *plan,
455 uint numElements)
456 {
457 const uint eltsPerBlock = SORT_CTA_SIZE * 4;
458 const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
459
460 bool fullBlocks = ((numElements % eltsPerBlock) == 0);
461 uint numBlocks = (fullBlocks) ?
462 (numElements / eltsPerBlock) :
463 (numElements / eltsPerBlock + 1);
464 uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
465 (numElements / eltsPerBlock2) :
466 (numElements / eltsPerBlock2 + 1);
467
468 bool loop = numBlocks > 65535;
469
470 uint blocks = loop ? 65535 : numBlocks;
471 uint blocksFind = loop ? 65535 : numBlocks2;
472 uint blocksReorder = loop ? 65535 : numBlocks2;
473
474 uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];
475
476 bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
477
478 if (persist)
479 {
480 loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
481
482 blocks = numBlocks;
483 blocksFind = numBlocks2;
484 blocksReorder = numBlocks2;
485 }
486
487 if (fullBlocks)
488 {
489 if (loop)
490 {
491 if (persist)
492 {
493 blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
494 numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
495 }
496
497 radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
498 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
499 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
500 }
501 else
502 radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
503 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
504 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
505 }
506 else
507 {
508 if (loop)
509 {
510 if (persist)
511 {
512 blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
513 numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
514 }
515
516 radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
517 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
518 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
519 }
520 else
521 radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
522 <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
523 ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
524
525 }
526
527 if (fullBlocks)
528 {
529 if (loop)
530 {
531 if (persist)
532 {
533 blocksFind = numCTAs(findRadixOffsets<0, true, true>);
534 }
535 findRadixOffsets<startbit, true, true>
536 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
537 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
538 }
539 else
540 findRadixOffsets<startbit, true, false>
541 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
542 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
543 }
544 else
545 {
546 if (loop)
547 {
548 if (persist)
549 {
550 blocksFind = numCTAs(findRadixOffsets<0, false, true>);
551 }
552 findRadixOffsets<startbit, false, true>
553 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
554 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
555 }
556 else
557 findRadixOffsets<startbit, false, false>
558 <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
559 ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
560
561 }
562
563 cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
564
565 if (fullBlocks)
566 {
567 if (plan->m_bManualCoalesce)
568 {
569 if (loop)
570 {
571 if (persist)
572 {
573 blocksReorder = unflip ?
574 numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
575 numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
576 }
577 reorderDataKeysOnly<startbit, true, true, unflip, true>
578 <<<blocksReorder, SORT_CTA_SIZE>>>
579 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
580 numElements, numBlocks2);
581 }
582 else
583 reorderDataKeysOnly<startbit, true, true, unflip, false>
584 <<<blocksReorder, SORT_CTA_SIZE>>>
585 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
586 numElements, numBlocks2);
587 }
588 else
589 {
590 if (loop)
591 {
592 if (persist)
593 {
594 blocksReorder = unflip ?
595 numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
596 numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
597 }
598 reorderDataKeysOnly<startbit, true, false, unflip, true>
599 <<<blocksReorder, SORT_CTA_SIZE>>>
600 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
601 numElements, numBlocks2);
602 }
603 else
604 reorderDataKeysOnly<startbit, true, false, unflip, false>
605 <<<blocksReorder, SORT_CTA_SIZE>>>
606 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
607 numElements, numBlocks2);
608 }
609 }
610 else
611 {
612 if (plan->m_bManualCoalesce)
613 {
614 if (loop)
615 {
616 if (persist)
617 {
618 blocksReorder = unflip ?
619 numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
620 numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
621 }
622 reorderDataKeysOnly<startbit, false, true, unflip, true>
623 <<<blocksReorder, SORT_CTA_SIZE>>>
624 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
625 numElements, numBlocks2);
626 }
627 else
628 reorderDataKeysOnly<startbit, false, true, unflip, false>
629 <<<blocksReorder, SORT_CTA_SIZE>>>
630 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
631 numElements, numBlocks2);
632 }
633 else
634 {
635 if (loop)
636 {
637 if (persist)
638 {
639 blocksReorder = unflip ?
640 numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
641 numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
642 }
643 reorderDataKeysOnly<startbit, false, false, unflip, true>
644 <<<blocksReorder, SORT_CTA_SIZE>>>
645 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
646 numElements, numBlocks2);
647 }
648 else
649 reorderDataKeysOnly<startbit, false, false, unflip, false>
650 <<<blocksReorder, SORT_CTA_SIZE>>>
651 (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
652 numElements, numBlocks2);
653 }
654 }
655
656 CUT_CHECK_ERROR("radixSortStepKeysOnly");
657 }
658
659 /**
660 * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only).
661 *
662 * @param[in,out] keys Keys to be sorted.
663 * @param numElements Number of elements in the sort.
664 **/
665 template <bool flip>
radixSortSingleBlockKeysOnly(uint * keys,uint numElements)666 void radixSortSingleBlockKeysOnly(uint *keys,
667 uint numElements)
668 {
669 bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
670 if (fullBlocks)
671 {
672 radixSortBlocksKeysOnly<32, 0, true, flip, false>
673 <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
674 ((uint4*)keys, (uint4*)keys, numElements, 1 );
675 }
676 else
677 {
678 radixSortBlocksKeysOnly<32, 0, false, flip, false>
679 <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
680 ((uint4*)keys, (uint4*)keys, numElements, 1 );
681 }
682
683 if (flip)
684 unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
685
686
687 CUT_CHECK_ERROR("radixSortSingleBlock");
688 }
689
690 /**
691 * @brief Main radix sort function. For keys only configuration.
692 *
693 * Main radix sort function. Sorts in place in the keys array,
694 * but uses the other device arrays as temporary storage. All pointer
695 * parameters are device pointers. Uses scan for the prefix sum of
696 * radix counters.
697 *
698 * @param[in,out] keys Keys to be sorted.
699 * @param[in] plan Configuration information for RadixSort.
700 * @param[in] flipBits Is set true if key datatype is a float (neg. numbers)
701 * for special float sorting operations.
702 * @param[in] numElements Number of elements in the sort.
703 * @param[in] keyBits Number of interesting bits in the key
704 **/
705 extern "C"
radixSortKeysOnly(uint * keys,const CUDPPRadixSortPlan * plan,bool flipBits,size_t numElements,int keyBits)706 void radixSortKeysOnly(uint *keys,
707 const CUDPPRadixSortPlan *plan,
708 bool flipBits,
709 size_t numElements,
710 int keyBits)
711 {
712
713 if(numElements <= WARP_SIZE)
714 {
715 if (flipBits)
716 radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
717 else
718 radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
719 return;
720 }
721 if(numElements <= SORT_CTA_SIZE * 4)
722 {
723 if (flipBits)
724 radixSortSingleBlockKeysOnly<true>(keys, numElements);
725 else
726 radixSortSingleBlockKeysOnly<false>(keys, numElements);
727 return;
728 }
729
730 // flip float bits on the first pass, unflip on the last pass
731 if (flipBits)
732 {
733 radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements);
734 }
735 else
736 {
737 radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements);
738 }
739
740 if (keyBits > 4)
741 {
742 radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements);
743 }
744 if (keyBits > 8)
745 {
746 radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements);
747 }
748 if (keyBits > 12)
749 {
750 radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
751 }
752 if (keyBits > 16)
753 {
754 radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
755 }
756 if (keyBits > 20)
757 {
758 radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
759 }
760 if (keyBits > 24)
761 {
762 radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
763 }
764 if (keyBits > 28)
765 {
766 if (flipBits) // last pass
767 {
768 radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
769 }
770 else
771 {
772 radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
773 }
774 }
775 }
776
777 /**
778 * @brief Wrapper to call main radix sort function. For floats and keys only.
779 *
780 * Calls the radixSortKeysOnly function setting parameters for floats.
781 *
782 * @param[in,out] keys Keys to be sorted.
783 * @param[in] plan Configuration information for RadixSort.
784 * @param[in] negativeKeys Is set true if key flipBits is to be true in
785 * radixSortKeysOnly().
786 * @param[in] numElements Number of elements in the sort.
787 * @param[in] keyBits Number of interesting bits in the key
788 **/
789 extern "C"
radixSortFloatKeysOnly(float * keys,const CUDPPRadixSortPlan * plan,bool negativeKeys,size_t numElements,int keyBits)790 void radixSortFloatKeysOnly(float *keys,
791 const CUDPPRadixSortPlan *plan,
792 bool negativeKeys,
793 size_t numElements,
794 int keyBits)
795 {
796 radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
797 }
798
799 extern "C"
initDeviceParameters(CUDPPRadixSortPlan * plan)800 void initDeviceParameters(CUDPPRadixSortPlan *plan)
801 {
802 int deviceID = -1;
803 if (cudaSuccess == cudaGetDevice(&deviceID))
804 {
805 cudaDeviceProp devprop;
806 cudaGetDeviceProperties(&devprop, deviceID);
807
808 int smVersion = devprop.major * 10 + devprop.minor;
809
810 // sm_12 and later devices don't need help with coalesce in reorderData kernel
811 plan->m_bManualCoalesce = (smVersion < 12);
812
813 // sm_20 and later devices are better off not using persistent CTAs
814 plan->m_bUsePersistentCTAs = (smVersion < 20);
815
816 if (plan->m_bUsePersistentCTAs)
817 {
818 // The following is only true on pre-sm_20 devices (pre-Fermi):
819 // Empirically we have found that for some (usually larger) sort
820 // sizes it is better to use exactly as many "persistent" CTAs
821 // as can fill the GPU, which loop over the "blocks" of work. For smaller
822 // arrays it is better to use the typical CUDA approach of launching one CTA
823 // per block of work.
824 // 0-element of these two-element arrays is for key-value sorts
825 // 1-element is for key-only sorts
826 plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
827 plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288;
828 plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
829 plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0;
830
831 // create a map of function pointers to register counts for more accurate occupancy calculation
832 // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it
833 // Note we only insert the "loop" version of the kernels (the one with the last template param = true)
834 // Because those are the only ones that require persistent CTAs that maximally fill the device.
835 computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
836 computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
837 computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
838 computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
839
840 computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
841 computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
842 computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
843 computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
844
845 computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
846 computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
847
848 computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE);
849 computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE);
850 computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE);
851 computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE);
852 computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE);
853 computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE);
854 computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE);
855 computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE);
856
857 computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE);
858 computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE);
859 computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE);
860 computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE);
861 computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE);
862 computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE);
863 computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE);
864 computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE);
865
866 computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE);
867 }
868 }
869 }
870
871 /**
872 * @brief From the programmer-specified sort configuration,
873 * creates internal memory for performing the sort.
874 *
875 * @param[in] plan Pointer to CUDPPRadixSortPlan object
876 **/
877 extern "C"
allocRadixSortStorage(CUDPPRadixSortPlan * plan)878 void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
879 {
880
881 unsigned int numElements = plan->m_numElements;
882
883 unsigned int numBlocks =
884 ((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
885 (numElements / (SORT_CTA_SIZE * 4)) :
886 (numElements / (SORT_CTA_SIZE * 4) + 1);
887
888 switch(plan->m_config.datatype)
889 {
890 case CUDPP_UINT:
891 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
892 numElements * sizeof(unsigned int)));
893
894 if (!plan->m_bKeysOnly)
895 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
896 numElements * sizeof(unsigned int)));
897
898 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
899 WARP_SIZE * numBlocks * sizeof(unsigned int)));
900
901 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
902 WARP_SIZE * numBlocks * sizeof(unsigned int)));
903
904 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
905 WARP_SIZE * numBlocks * sizeof(unsigned int)));
906 break;
907
908 case CUDPP_FLOAT:
909 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
910 numElements * sizeof(float)));
911
912 if (!plan->m_bKeysOnly)
913 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
914 numElements * sizeof(float)));
915
916 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
917 WARP_SIZE * numBlocks * sizeof(float)));
918
919 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
920 WARP_SIZE * numBlocks * sizeof(float)));
921
922 CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
923 WARP_SIZE * numBlocks * sizeof(float)));
924 break;
925 }
926
927 initDeviceParameters(plan);
928 }
929
930 /** @brief Deallocates intermediate memory from allocRadixSortStorage.
931 *
932 *
933 * @param[in] plan Pointer to CUDPPRadixSortPlan object
934 **/
935 extern "C"
freeRadixSortStorage(CUDPPRadixSortPlan * plan)936 void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
937 {
938 CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
939 CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
940 CUDA_SAFE_CALL( cudaFree(plan->m_counters));
941 CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
942 CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
943 }
944
945 /** @brief Dispatch function to perform a sort on an array with
946 * a specified configuration.
947 *
948 * This is the dispatch routine which calls radixSort...() with
949 * appropriate template parameters and arguments as specified by
950 * the plan.
951 * @param[in,out] keys Keys to be sorted.
952 * @param[in,out] values Associated values to be sorted (through keys).
953 * @param[in] numElements Number of elements in the sort.
954 * @param[in] keyBits Number of interesting bits in the key*
955 * @param[in] plan Configuration information for RadixSort.
956 **/
957 extern "C"
cudppRadixSortDispatch(void * keys,void * values,size_t numElements,int keyBits,const CUDPPRadixSortPlan * plan)958 void cudppRadixSortDispatch(void *keys,
959 void *values,
960 size_t numElements,
961 int keyBits,
962 const CUDPPRadixSortPlan *plan)
963 {
964 if(plan->m_bKeysOnly)
965 {
966 switch(plan->m_config.datatype)
967 {
968 case CUDPP_UINT:
969 radixSortKeysOnly((uint*)keys, plan, false,
970 numElements, keyBits);
971 break;
972 case CUDPP_FLOAT:
973 radixSortFloatKeysOnly((float*)keys, plan, true,
974 numElements, keyBits);
975 }
976 }
977 else
978 {
979 switch(plan->m_config.datatype)
980 {
981 case CUDPP_UINT:
982 radixSort((uint*)keys, (uint*) values, plan,
983 numElements, false, keyBits);
984 break;
985 case CUDPP_FLOAT:
986 radixSortFloatKeys((float*)keys, (uint*) values, plan,
987 numElements, true, keyBits);
988 }
989 }
990 }
991
992 /** @} */ // end radixsort functions
993 /** @} */ // end cudpp_app
994