1
2 #include "b3RadixSort32CL.h"
3 #include "b3LauncherCL.h"
4 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
5 #include "b3PrefixScanCL.h"
6 #include "b3FillCL.h"
7
8 #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
9
10 #include "kernels/RadixSort32KernelsCL.h"
11
b3RadixSort32CL(cl_context ctx,cl_device_id device,cl_command_queue queue,int initialCapacity)12 b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
13 : m_commandQueue(queue)
14 {
15 b3OpenCLDeviceInfo info;
16 b3OpenCLUtils::getDeviceInfo(device, &info);
17 m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
18
19 m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
20 m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
21 m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
22 m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
23 m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
24 m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
25
26 if (initialCapacity > 0)
27 {
28 m_workBuffer1->resize(initialCapacity);
29 m_workBuffer3->resize(initialCapacity);
30 m_workBuffer3a->resize(initialCapacity);
31 m_workBuffer4->resize(initialCapacity);
32 m_workBuffer4a->resize(initialCapacity);
33 }
34
35 m_scan = new b3PrefixScanCL(ctx, device, queue);
36 m_fill = new b3FillCL(ctx, device, queue);
37
38 const char* additionalMacros = "";
39
40 cl_int pErrNum;
41 const char* kernelSource = radixSort32KernelsCL;
42
43 cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
44 b3Assert(sortProg);
45
46 m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
47 b3Assert(m_streamCountSortDataKernel);
48
49 m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
50 b3Assert(m_streamCountKernel);
51
52 if (m_deviceCPU)
53 {
54 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
55 b3Assert(m_sortAndScatterSortDataKernel);
56 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
57 b3Assert(m_sortAndScatterKernel);
58 }
59 else
60 {
61 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
62 b3Assert(m_sortAndScatterSortDataKernel);
63 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
64 b3Assert(m_sortAndScatterKernel);
65 }
66
67 m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
68 b3Assert(m_prefixScanKernel);
69 }
70
~b3RadixSort32CL()71 b3RadixSort32CL::~b3RadixSort32CL()
72 {
73 delete m_scan;
74 delete m_fill;
75 delete m_workBuffer1;
76 delete m_workBuffer2;
77 delete m_workBuffer3;
78 delete m_workBuffer3a;
79 delete m_workBuffer4;
80 delete m_workBuffer4a;
81
82 clReleaseKernel(m_streamCountSortDataKernel);
83 clReleaseKernel(m_streamCountKernel);
84 clReleaseKernel(m_sortAndScatterSortDataKernel);
85 clReleaseKernel(m_sortAndScatterKernel);
86 clReleaseKernel(m_prefixScanKernel);
87 }
88
executeHost(b3AlignedObjectArray<b3SortData> & inout,int sortBits)89 void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
90 {
91 int n = inout.size();
92 const int BITS_PER_PASS = 8;
93 const int NUM_TABLES = (1 << BITS_PER_PASS);
94
95 int tables[NUM_TABLES];
96 int counter[NUM_TABLES];
97
98 b3SortData* src = &inout[0];
99 b3AlignedObjectArray<b3SortData> workbuffer;
100 workbuffer.resize(inout.size());
101 b3SortData* dst = &workbuffer[0];
102
103 int count = 0;
104 for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
105 {
106 for (int i = 0; i < NUM_TABLES; i++)
107 {
108 tables[i] = 0;
109 }
110
111 for (int i = 0; i < n; i++)
112 {
113 int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
114 tables[tableIdx]++;
115 }
116 //#define TEST
117 #ifdef TEST
118 printf("histogram size=%d\n", NUM_TABLES);
119 for (int i = 0; i < NUM_TABLES; i++)
120 {
121 if (tables[i] != 0)
122 {
123 printf("tables[%d]=%d]\n", i, tables[i]);
124 }
125 }
126 #endif //TEST \
127 // prefix scan
128 int sum = 0;
129 for (int i = 0; i < NUM_TABLES; i++)
130 {
131 int iData = tables[i];
132 tables[i] = sum;
133 sum += iData;
134 counter[i] = 0;
135 }
136
137 // distribute
138 for (int i = 0; i < n; i++)
139 {
140 int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
141
142 dst[tables[tableIdx] + counter[tableIdx]] = src[i];
143 counter[tableIdx]++;
144 }
145
146 b3Swap(src, dst);
147 count++;
148 }
149
150 if (count & 1)
151 {
152 b3Assert(0); //need to copy
153 }
154 }
155
executeHost(b3OpenCLArray<b3SortData> & keyValuesInOut,int sortBits)156 void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
157 {
158 b3AlignedObjectArray<b3SortData> inout;
159 keyValuesInOut.copyToHost(inout);
160
161 executeHost(inout, sortBits);
162
163 keyValuesInOut.copyFromHost(inout);
164 }
165
execute(b3OpenCLArray<unsigned int> & keysIn,b3OpenCLArray<unsigned int> & keysOut,b3OpenCLArray<unsigned int> & valuesIn,b3OpenCLArray<unsigned int> & valuesOut,int n,int sortBits)166 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
167 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
168 {
169 }
170
171 //#define DEBUG_RADIXSORT
172 //#define DEBUG_RADIXSORT2
173
execute(b3OpenCLArray<b3SortData> & keyValuesInOut,int sortBits)174 void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
175 {
176 int originalSize = keyValuesInOut.size();
177 int workingSize = originalSize;
178
179 int dataAlignment = DATA_ALIGNMENT;
180
181 #ifdef DEBUG_RADIXSORT2
182 b3AlignedObjectArray<b3SortData> test2;
183 keyValuesInOut.copyToHost(test2);
184 printf("numElem = %d\n", test2.size());
185 for (int i = 0; i < test2.size(); i++)
186 {
187 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
188 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
189 }
190 #endif //DEBUG_RADIXSORT2
191
192 b3OpenCLArray<b3SortData>* src = 0;
193
194 if (workingSize % dataAlignment)
195 {
196 workingSize += dataAlignment - (workingSize % dataAlignment);
197 m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
198 m_workBuffer4->resize(workingSize);
199 b3SortData fillValue;
200 fillValue.m_key = 0xffffffff;
201 fillValue.m_value = 0xffffffff;
202
203 #define USE_BTFILL
204 #ifdef USE_BTFILL
205 m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
206 #else
207 //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
208
209 for (int i = originalSize; i < workingSize; i++)
210 {
211 m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
212 }
213 #endif //USE_BTFILL
214
215 src = m_workBuffer4;
216 }
217 else
218 {
219 src = &keyValuesInOut;
220 m_workBuffer4->resize(0);
221 }
222
223 b3Assert(workingSize % DATA_ALIGNMENT == 0);
224 int minCap = NUM_BUCKET * NUM_WGS;
225
226 int n = workingSize;
227
228 m_workBuffer1->resize(minCap);
229 m_workBuffer3->resize(workingSize);
230
231 // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
232 b3Assert(BITS_PER_PASS == 4);
233 b3Assert(WG_SIZE == 64);
234 b3Assert((sortBits & 0x3) == 0);
235
236 b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
237
238 b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
239 b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
240
241 int nWGs = NUM_WGS;
242 b3ConstData cdata;
243
244 {
245 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
246 int nBlocks = (n + blockSize - 1) / (blockSize);
247 cdata.m_n = n;
248 cdata.m_nWGs = NUM_WGS;
249 cdata.m_startBit = 0;
250 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
251 if (nBlocks < NUM_WGS)
252 {
253 cdata.m_nBlocksPerWG = 1;
254 nWGs = nBlocks;
255 }
256 }
257
258 int count = 0;
259 for (int ib = 0; ib < sortBits; ib += 4)
260 {
261 #ifdef DEBUG_RADIXSORT2
262 keyValuesInOut.copyToHost(test2);
263 printf("numElem = %d\n", test2.size());
264 for (int i = 0; i < test2.size(); i++)
265 {
266 if (test2[i].m_key != test2[i].m_value)
267 {
268 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
269 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
270 }
271 }
272 #endif //DEBUG_RADIXSORT2
273
274 cdata.m_startBit = ib;
275
276 if (src->size())
277 {
278 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
279 b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
280
281 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
282 launcher.setConst(cdata);
283
284 int num = NUM_WGS * WG_SIZE;
285 launcher.launch1D(num, WG_SIZE);
286 }
287
288 #ifdef DEBUG_RADIXSORT
289 b3AlignedObjectArray<unsigned int> testHist;
290 srcHisto->copyToHost(testHist);
291 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
292 for (int i = 0; i < testHist.size(); i++)
293 {
294 if (testHist[i] != 0)
295 printf("testHist[%d]=%d\n", i, testHist[i]);
296 }
297 #endif //DEBUG_RADIXSORT
298
299 //fast prefix scan is not working properly on Mac OSX yet
300 #ifdef __APPLE__
301 bool fastScan = false;
302 #else
303 bool fastScan = !m_deviceCPU; //only use fast scan on GPU
304 #endif
305
306 if (fastScan)
307 { // prefix scan group histogram
308 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
309 b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
310 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
311 launcher.setConst(cdata);
312 launcher.launch1D(128, 128);
313 destHisto = srcHisto;
314 }
315 else
316 {
317 //unsigned int sum; //for debugging
318 m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
319 }
320
321 #ifdef DEBUG_RADIXSORT
322 destHisto->copyToHost(testHist);
323 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
324 for (int i = 0; i < testHist.size(); i++)
325 {
326 if (testHist[i] != 0)
327 printf("testHist[%d]=%d\n", i, testHist[i]);
328 }
329
330 for (int i = 0; i < testHist.size(); i += NUM_WGS)
331 {
332 printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
333 }
334
335 #endif //DEBUG_RADIXSORT
336
337 #define USE_GPU
338 #ifdef USE_GPU
339
340 if (src->size())
341 { // local sort and distribute
342 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
343 b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
344 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
345 launcher.setConst(cdata);
346 launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
347 }
348 #else
349 {
350 #define NUM_TABLES 16
351 //#define SEQUENTIAL
352 #ifdef SEQUENTIAL
353 int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
354 int tables[NUM_TABLES];
355 int startBit = ib;
356
357 destHisto->copyToHost(testHist);
358 b3AlignedObjectArray<b3SortData> srcHost;
359 b3AlignedObjectArray<b3SortData> dstHost;
360 dstHost.resize(src->size());
361
362 src->copyToHost(srcHost);
363
364 for (int i = 0; i < NUM_TABLES; i++)
365 {
366 tables[i] = testHist[i * NUM_WGS];
367 }
368
369 // distribute
370 for (int i = 0; i < n; i++)
371 {
372 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
373
374 dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
375 counter2[tableIdx]++;
376 }
377
378 #else
379
380 int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
381
382 int tables[NUM_TABLES];
383 b3AlignedObjectArray<b3SortData> dstHostOK;
384 dstHostOK.resize(src->size());
385
386 destHisto->copyToHost(testHist);
387 b3AlignedObjectArray<b3SortData> srcHost;
388 src->copyToHost(srcHost);
389
390 int blockSize = 256;
391 int nBlocksPerWG = cdata.m_nBlocksPerWG;
392 int startBit = ib;
393
394 {
395 for (int i = 0; i < NUM_TABLES; i++)
396 {
397 tables[i] = testHist[i * NUM_WGS];
398 }
399
400 // distribute
401 for (int i = 0; i < n; i++)
402 {
403 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
404
405 dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
406 counter2[tableIdx]++;
407 }
408 }
409
410 b3AlignedObjectArray<b3SortData> dstHost;
411 dstHost.resize(src->size());
412
413 int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
414
415 for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
416 {
417 int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
418
419 int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
420
421 for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
422 {
423 for (int lIdx = 0; lIdx < 64; lIdx++)
424 {
425 int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
426
427 // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
428 // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
429 // AMD: AtomInc performs better while NV prefers ++
430 for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
431 {
432 if (addr + j < n)
433 {
434 // printf ("addr+j=%d\n", addr+j);
435
436 int i = addr + j;
437
438 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
439
440 int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
441
442 b3SortData ok = dstHostOK[destIndex];
443
444 if (ok.m_key != srcHost[i].m_key)
445 {
446 printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
447 printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
448 }
449 if (ok.m_value != srcHost[i].m_value)
450 {
451 printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
452 printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
453 }
454
455 dstHost[destIndex] = srcHost[i];
456 counter[tableIdx]++;
457 }
458 }
459 }
460 }
461 }
462
463 #endif //SEQUENTIAL
464
465 dst->copyFromHost(dstHost);
466 }
467 #endif //USE_GPU
468
469 #ifdef DEBUG_RADIXSORT
470 destHisto->copyToHost(testHist);
471 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
472 for (int i = 0; i < testHist.size(); i++)
473 {
474 if (testHist[i] != 0)
475 printf("testHist[%d]=%d\n", i, testHist[i]);
476 }
477 #endif //DEBUG_RADIXSORT
478 b3Swap(src, dst);
479 b3Swap(srcHisto, destHisto);
480
481 #ifdef DEBUG_RADIXSORT2
482 keyValuesInOut.copyToHost(test2);
483 printf("numElem = %d\n", test2.size());
484 for (int i = 0; i < test2.size(); i++)
485 {
486 if (test2[i].m_key != test2[i].m_value)
487 {
488 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
489 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
490 }
491 }
492 #endif //DEBUG_RADIXSORT2
493
494 count++;
495 }
496
497 if (count & 1)
498 {
499 b3Assert(0); //need to copy from workbuffer to keyValuesInOut
500 }
501
502 if (m_workBuffer4->size())
503 {
504 m_workBuffer4->resize(originalSize);
505 keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
506 }
507
508 #ifdef DEBUG_RADIXSORT
509 keyValuesInOut.copyToHost(test2);
510
511 printf("numElem = %d\n", test2.size());
512 for (int i = 0; i < test2.size(); i++)
513 {
514 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
515 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
516 }
517 #endif
518 }
519
execute(b3OpenCLArray<unsigned int> & keysInOut,int sortBits)520 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
521 {
522 int originalSize = keysInOut.size();
523 int workingSize = originalSize;
524
525 int dataAlignment = DATA_ALIGNMENT;
526
527 b3OpenCLArray<unsigned int>* src = 0;
528
529 if (workingSize % dataAlignment)
530 {
531 workingSize += dataAlignment - (workingSize % dataAlignment);
532 m_workBuffer4a->copyFromOpenCLArray(keysInOut);
533 m_workBuffer4a->resize(workingSize);
534 unsigned int fillValue = 0xffffffff;
535
536 m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
537
538 src = m_workBuffer4a;
539 }
540 else
541 {
542 src = &keysInOut;
543 m_workBuffer4a->resize(0);
544 }
545
546 b3Assert(workingSize % DATA_ALIGNMENT == 0);
547 int minCap = NUM_BUCKET * NUM_WGS;
548
549 int n = workingSize;
550
551 m_workBuffer1->resize(minCap);
552 m_workBuffer3->resize(workingSize);
553 m_workBuffer3a->resize(workingSize);
554
555 // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
556 b3Assert(BITS_PER_PASS == 4);
557 b3Assert(WG_SIZE == 64);
558 b3Assert((sortBits & 0x3) == 0);
559
560 b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
561
562 b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
563 b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
564
565 int nWGs = NUM_WGS;
566 b3ConstData cdata;
567
568 {
569 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
570 int nBlocks = (n + blockSize - 1) / (blockSize);
571 cdata.m_n = n;
572 cdata.m_nWGs = NUM_WGS;
573 cdata.m_startBit = 0;
574 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
575 if (nBlocks < NUM_WGS)
576 {
577 cdata.m_nBlocksPerWG = 1;
578 nWGs = nBlocks;
579 }
580 }
581
582 int count = 0;
583 for (int ib = 0; ib < sortBits; ib += 4)
584 {
585 cdata.m_startBit = ib;
586
587 if (src->size())
588 {
589 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
590 b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
591
592 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
593 launcher.setConst(cdata);
594
595 int num = NUM_WGS * WG_SIZE;
596 launcher.launch1D(num, WG_SIZE);
597 }
598
599 //fast prefix scan is not working properly on Mac OSX yet
600 #ifdef __APPLE__
601 bool fastScan = false;
602 #else
603 bool fastScan = !m_deviceCPU;
604 #endif
605
606 if (fastScan)
607 { // prefix scan group histogram
608 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
609 b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
610 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
611 launcher.setConst(cdata);
612 launcher.launch1D(128, 128);
613 destHisto = srcHisto;
614 }
615 else
616 {
617 //unsigned int sum; //for debugging
618 m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
619 }
620
621 if (src->size())
622 { // local sort and distribute
623 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
624 b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
625 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
626 launcher.setConst(cdata);
627 launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
628 }
629
630 b3Swap(src, dst);
631 b3Swap(srcHisto, destHisto);
632
633 count++;
634 }
635
636 if (count & 1)
637 {
638 b3Assert(0); //need to copy from workbuffer to keyValuesInOut
639 }
640
641 if (m_workBuffer4a->size())
642 {
643 m_workBuffer4a->resize(originalSize);
644 keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
645 }
646 }
647