1 
2 #include "b3RadixSort32CL.h"
3 #include "b3LauncherCL.h"
4 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
5 #include "b3PrefixScanCL.h"
6 #include "b3FillCL.h"
7 
8 #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
9 
10 #include "kernels/RadixSort32KernelsCL.h"
11 
b3RadixSort32CL(cl_context ctx,cl_device_id device,cl_command_queue queue,int initialCapacity)12 b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
13 	: m_commandQueue(queue)
14 {
15 	b3OpenCLDeviceInfo info;
16 	b3OpenCLUtils::getDeviceInfo(device, &info);
17 	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
18 
19 	m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
20 	m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
21 	m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
22 	m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
23 	m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
24 	m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
25 
26 	if (initialCapacity > 0)
27 	{
28 		m_workBuffer1->resize(initialCapacity);
29 		m_workBuffer3->resize(initialCapacity);
30 		m_workBuffer3a->resize(initialCapacity);
31 		m_workBuffer4->resize(initialCapacity);
32 		m_workBuffer4a->resize(initialCapacity);
33 	}
34 
35 	m_scan = new b3PrefixScanCL(ctx, device, queue);
36 	m_fill = new b3FillCL(ctx, device, queue);
37 
38 	const char* additionalMacros = "";
39 
40 	cl_int pErrNum;
41 	const char* kernelSource = radixSort32KernelsCL;
42 
43 	cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
44 	b3Assert(sortProg);
45 
46 	m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
47 	b3Assert(m_streamCountSortDataKernel);
48 
49 	m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
50 	b3Assert(m_streamCountKernel);
51 
52 	if (m_deviceCPU)
53 	{
54 		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
55 		b3Assert(m_sortAndScatterSortDataKernel);
56 		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
57 		b3Assert(m_sortAndScatterKernel);
58 	}
59 	else
60 	{
61 		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
62 		b3Assert(m_sortAndScatterSortDataKernel);
63 		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
64 		b3Assert(m_sortAndScatterKernel);
65 	}
66 
67 	m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
68 	b3Assert(m_prefixScanKernel);
69 }
70 
~b3RadixSort32CL()71 b3RadixSort32CL::~b3RadixSort32CL()
72 {
73 	delete m_scan;
74 	delete m_fill;
75 	delete m_workBuffer1;
76 	delete m_workBuffer2;
77 	delete m_workBuffer3;
78 	delete m_workBuffer3a;
79 	delete m_workBuffer4;
80 	delete m_workBuffer4a;
81 
82 	clReleaseKernel(m_streamCountSortDataKernel);
83 	clReleaseKernel(m_streamCountKernel);
84 	clReleaseKernel(m_sortAndScatterSortDataKernel);
85 	clReleaseKernel(m_sortAndScatterKernel);
86 	clReleaseKernel(m_prefixScanKernel);
87 }
88 
executeHost(b3AlignedObjectArray<b3SortData> & inout,int sortBits)89 void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
90 {
91 	int n = inout.size();
92 	const int BITS_PER_PASS = 8;
93 	const int NUM_TABLES = (1 << BITS_PER_PASS);
94 
95 	int tables[NUM_TABLES];
96 	int counter[NUM_TABLES];
97 
98 	b3SortData* src = &inout[0];
99 	b3AlignedObjectArray<b3SortData> workbuffer;
100 	workbuffer.resize(inout.size());
101 	b3SortData* dst = &workbuffer[0];
102 
103 	int count = 0;
104 	for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
105 	{
106 		for (int i = 0; i < NUM_TABLES; i++)
107 		{
108 			tables[i] = 0;
109 		}
110 
111 		for (int i = 0; i < n; i++)
112 		{
113 			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
114 			tables[tableIdx]++;
115 		}
116 //#define TEST
117 #ifdef TEST
118 		printf("histogram size=%d\n", NUM_TABLES);
119 		for (int i = 0; i < NUM_TABLES; i++)
120 		{
121 			if (tables[i] != 0)
122 			{
123 				printf("tables[%d]=%d]\n", i, tables[i]);
124 			}
125 		}
126 #endif  //TEST \
127 	//	prefix scan
128 		int sum = 0;
129 		for (int i = 0; i < NUM_TABLES; i++)
130 		{
131 			int iData = tables[i];
132 			tables[i] = sum;
133 			sum += iData;
134 			counter[i] = 0;
135 		}
136 
137 		//	distribute
138 		for (int i = 0; i < n; i++)
139 		{
140 			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
141 
142 			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
143 			counter[tableIdx]++;
144 		}
145 
146 		b3Swap(src, dst);
147 		count++;
148 	}
149 
150 	if (count & 1)
151 	{
152 		b3Assert(0);  //need to copy
153 	}
154 }
155 
executeHost(b3OpenCLArray<b3SortData> & keyValuesInOut,int sortBits)156 void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
157 {
158 	b3AlignedObjectArray<b3SortData> inout;
159 	keyValuesInOut.copyToHost(inout);
160 
161 	executeHost(inout, sortBits);
162 
163 	keyValuesInOut.copyFromHost(inout);
164 }
165 
execute(b3OpenCLArray<unsigned int> & keysIn,b3OpenCLArray<unsigned int> & keysOut,b3OpenCLArray<unsigned int> & valuesIn,b3OpenCLArray<unsigned int> & valuesOut,int n,int sortBits)166 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
167 							  b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
168 {
169 }
170 
171 //#define DEBUG_RADIXSORT
172 //#define DEBUG_RADIXSORT2
173 
execute(b3OpenCLArray<b3SortData> & keyValuesInOut,int sortBits)174 void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
175 {
176 	int originalSize = keyValuesInOut.size();
177 	int workingSize = originalSize;
178 
179 	int dataAlignment = DATA_ALIGNMENT;
180 
181 #ifdef DEBUG_RADIXSORT2
182 	b3AlignedObjectArray<b3SortData> test2;
183 	keyValuesInOut.copyToHost(test2);
184 	printf("numElem = %d\n", test2.size());
185 	for (int i = 0; i < test2.size(); i++)
186 	{
187 		printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
188 		printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
189 	}
190 #endif  //DEBUG_RADIXSORT2
191 
192 	b3OpenCLArray<b3SortData>* src = 0;
193 
194 	if (workingSize % dataAlignment)
195 	{
196 		workingSize += dataAlignment - (workingSize % dataAlignment);
197 		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
198 		m_workBuffer4->resize(workingSize);
199 		b3SortData fillValue;
200 		fillValue.m_key = 0xffffffff;
201 		fillValue.m_value = 0xffffffff;
202 
203 #define USE_BTFILL
204 #ifdef USE_BTFILL
205 		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
206 #else
207 		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
208 
209 		for (int i = originalSize; i < workingSize; i++)
210 		{
211 			m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
212 		}
213 #endif  //USE_BTFILL
214 
215 		src = m_workBuffer4;
216 	}
217 	else
218 	{
219 		src = &keyValuesInOut;
220 		m_workBuffer4->resize(0);
221 	}
222 
223 	b3Assert(workingSize % DATA_ALIGNMENT == 0);
224 	int minCap = NUM_BUCKET * NUM_WGS;
225 
226 	int n = workingSize;
227 
228 	m_workBuffer1->resize(minCap);
229 	m_workBuffer3->resize(workingSize);
230 
231 	//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
232 	b3Assert(BITS_PER_PASS == 4);
233 	b3Assert(WG_SIZE == 64);
234 	b3Assert((sortBits & 0x3) == 0);
235 
236 	b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
237 
238 	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
239 	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
240 
241 	int nWGs = NUM_WGS;
242 	b3ConstData cdata;
243 
244 	{
245 		int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
246 		int nBlocks = (n + blockSize - 1) / (blockSize);
247 		cdata.m_n = n;
248 		cdata.m_nWGs = NUM_WGS;
249 		cdata.m_startBit = 0;
250 		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
251 		if (nBlocks < NUM_WGS)
252 		{
253 			cdata.m_nBlocksPerWG = 1;
254 			nWGs = nBlocks;
255 		}
256 	}
257 
258 	int count = 0;
259 	for (int ib = 0; ib < sortBits; ib += 4)
260 	{
261 #ifdef DEBUG_RADIXSORT2
262 		keyValuesInOut.copyToHost(test2);
263 		printf("numElem = %d\n", test2.size());
264 		for (int i = 0; i < test2.size(); i++)
265 		{
266 			if (test2[i].m_key != test2[i].m_value)
267 			{
268 				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
269 				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
270 			}
271 		}
272 #endif  //DEBUG_RADIXSORT2
273 
274 		cdata.m_startBit = ib;
275 
276 		if (src->size())
277 		{
278 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
279 			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
280 
281 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
282 			launcher.setConst(cdata);
283 
284 			int num = NUM_WGS * WG_SIZE;
285 			launcher.launch1D(num, WG_SIZE);
286 		}
287 
288 #ifdef DEBUG_RADIXSORT
289 		b3AlignedObjectArray<unsigned int> testHist;
290 		srcHisto->copyToHost(testHist);
291 		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
292 		for (int i = 0; i < testHist.size(); i++)
293 		{
294 			if (testHist[i] != 0)
295 				printf("testHist[%d]=%d\n", i, testHist[i]);
296 		}
297 #endif  //DEBUG_RADIXSORT
298 
299 //fast prefix scan is not working properly on Mac OSX yet
300 #ifdef __APPLE__
301 		bool fastScan = false;
302 #else
303 		bool fastScan = !m_deviceCPU;  //only use fast scan on GPU
304 #endif
305 
306 		if (fastScan)
307 		{  //	prefix scan group histogram
308 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
309 			b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
310 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
311 			launcher.setConst(cdata);
312 			launcher.launch1D(128, 128);
313 			destHisto = srcHisto;
314 		}
315 		else
316 		{
317 			//unsigned int sum; //for debugging
318 			m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
319 		}
320 
321 #ifdef DEBUG_RADIXSORT
322 		destHisto->copyToHost(testHist);
323 		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
324 		for (int i = 0; i < testHist.size(); i++)
325 		{
326 			if (testHist[i] != 0)
327 				printf("testHist[%d]=%d\n", i, testHist[i]);
328 		}
329 
330 		for (int i = 0; i < testHist.size(); i += NUM_WGS)
331 		{
332 			printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
333 		}
334 
335 #endif  //DEBUG_RADIXSORT
336 
337 #define USE_GPU
338 #ifdef USE_GPU
339 
340 		if (src->size())
341 		{  //	local sort and distribute
342 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
343 			b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
344 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
345 			launcher.setConst(cdata);
346 			launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
347 		}
348 #else
349 		{
350 #define NUM_TABLES 16
351 //#define SEQUENTIAL
352 #ifdef SEQUENTIAL
353 			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
354 			int tables[NUM_TABLES];
355 			int startBit = ib;
356 
357 			destHisto->copyToHost(testHist);
358 			b3AlignedObjectArray<b3SortData> srcHost;
359 			b3AlignedObjectArray<b3SortData> dstHost;
360 			dstHost.resize(src->size());
361 
362 			src->copyToHost(srcHost);
363 
364 			for (int i = 0; i < NUM_TABLES; i++)
365 			{
366 				tables[i] = testHist[i * NUM_WGS];
367 			}
368 
369 			//	distribute
370 			for (int i = 0; i < n; i++)
371 			{
372 				int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
373 
374 				dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
375 				counter2[tableIdx]++;
376 			}
377 
378 #else
379 
380 			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
381 
382 			int tables[NUM_TABLES];
383 			b3AlignedObjectArray<b3SortData> dstHostOK;
384 			dstHostOK.resize(src->size());
385 
386 			destHisto->copyToHost(testHist);
387 			b3AlignedObjectArray<b3SortData> srcHost;
388 			src->copyToHost(srcHost);
389 
390 			int blockSize = 256;
391 			int nBlocksPerWG = cdata.m_nBlocksPerWG;
392 			int startBit = ib;
393 
394 			{
395 				for (int i = 0; i < NUM_TABLES; i++)
396 				{
397 					tables[i] = testHist[i * NUM_WGS];
398 				}
399 
400 				//	distribute
401 				for (int i = 0; i < n; i++)
402 				{
403 					int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
404 
405 					dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
406 					counter2[tableIdx]++;
407 				}
408 			}
409 
410 			b3AlignedObjectArray<b3SortData> dstHost;
411 			dstHost.resize(src->size());
412 
413 			int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
414 
415 			for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
416 			{
417 				int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
418 
419 				int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
420 
421 				for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
422 				{
423 					for (int lIdx = 0; lIdx < 64; lIdx++)
424 					{
425 						int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
426 
427 						//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
428 						//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
429 						//	AMD: AtomInc performs better while NV prefers ++
430 						for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
431 						{
432 							if (addr + j < n)
433 							{
434 								//  printf ("addr+j=%d\n", addr+j);
435 
436 								int i = addr + j;
437 
438 								int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
439 
440 								int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
441 
442 								b3SortData ok = dstHostOK[destIndex];
443 
444 								if (ok.m_key != srcHost[i].m_key)
445 								{
446 									printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
447 									printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
448 								}
449 								if (ok.m_value != srcHost[i].m_value)
450 								{
451 									printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
452 									printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
453 								}
454 
455 								dstHost[destIndex] = srcHost[i];
456 								counter[tableIdx]++;
457 							}
458 						}
459 					}
460 				}
461 			}
462 
463 #endif  //SEQUENTIAL
464 
465 			dst->copyFromHost(dstHost);
466 		}
467 #endif  //USE_GPU
468 
469 #ifdef DEBUG_RADIXSORT
470 		destHisto->copyToHost(testHist);
471 		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
472 		for (int i = 0; i < testHist.size(); i++)
473 		{
474 			if (testHist[i] != 0)
475 				printf("testHist[%d]=%d\n", i, testHist[i]);
476 		}
477 #endif  //DEBUG_RADIXSORT
478 		b3Swap(src, dst);
479 		b3Swap(srcHisto, destHisto);
480 
481 #ifdef DEBUG_RADIXSORT2
482 		keyValuesInOut.copyToHost(test2);
483 		printf("numElem = %d\n", test2.size());
484 		for (int i = 0; i < test2.size(); i++)
485 		{
486 			if (test2[i].m_key != test2[i].m_value)
487 			{
488 				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
489 				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
490 			}
491 		}
492 #endif  //DEBUG_RADIXSORT2
493 
494 		count++;
495 	}
496 
497 	if (count & 1)
498 	{
499 		b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
500 	}
501 
502 	if (m_workBuffer4->size())
503 	{
504 		m_workBuffer4->resize(originalSize);
505 		keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
506 	}
507 
508 #ifdef DEBUG_RADIXSORT
509 	keyValuesInOut.copyToHost(test2);
510 
511 	printf("numElem = %d\n", test2.size());
512 	for (int i = 0; i < test2.size(); i++)
513 	{
514 		printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
515 		printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
516 	}
517 #endif
518 }
519 
execute(b3OpenCLArray<unsigned int> & keysInOut,int sortBits)520 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
521 {
522 	int originalSize = keysInOut.size();
523 	int workingSize = originalSize;
524 
525 	int dataAlignment = DATA_ALIGNMENT;
526 
527 	b3OpenCLArray<unsigned int>* src = 0;
528 
529 	if (workingSize % dataAlignment)
530 	{
531 		workingSize += dataAlignment - (workingSize % dataAlignment);
532 		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
533 		m_workBuffer4a->resize(workingSize);
534 		unsigned int fillValue = 0xffffffff;
535 
536 		m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
537 
538 		src = m_workBuffer4a;
539 	}
540 	else
541 	{
542 		src = &keysInOut;
543 		m_workBuffer4a->resize(0);
544 	}
545 
546 	b3Assert(workingSize % DATA_ALIGNMENT == 0);
547 	int minCap = NUM_BUCKET * NUM_WGS;
548 
549 	int n = workingSize;
550 
551 	m_workBuffer1->resize(minCap);
552 	m_workBuffer3->resize(workingSize);
553 	m_workBuffer3a->resize(workingSize);
554 
555 	//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
556 	b3Assert(BITS_PER_PASS == 4);
557 	b3Assert(WG_SIZE == 64);
558 	b3Assert((sortBits & 0x3) == 0);
559 
560 	b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
561 
562 	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
563 	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
564 
565 	int nWGs = NUM_WGS;
566 	b3ConstData cdata;
567 
568 	{
569 		int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
570 		int nBlocks = (n + blockSize - 1) / (blockSize);
571 		cdata.m_n = n;
572 		cdata.m_nWGs = NUM_WGS;
573 		cdata.m_startBit = 0;
574 		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
575 		if (nBlocks < NUM_WGS)
576 		{
577 			cdata.m_nBlocksPerWG = 1;
578 			nWGs = nBlocks;
579 		}
580 	}
581 
582 	int count = 0;
583 	for (int ib = 0; ib < sortBits; ib += 4)
584 	{
585 		cdata.m_startBit = ib;
586 
587 		if (src->size())
588 		{
589 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
590 			b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
591 
592 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
593 			launcher.setConst(cdata);
594 
595 			int num = NUM_WGS * WG_SIZE;
596 			launcher.launch1D(num, WG_SIZE);
597 		}
598 
599 //fast prefix scan is not working properly on Mac OSX yet
600 #ifdef __APPLE__
601 		bool fastScan = false;
602 #else
603 		bool fastScan = !m_deviceCPU;
604 #endif
605 
606 		if (fastScan)
607 		{  //	prefix scan group histogram
608 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
609 			b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
610 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
611 			launcher.setConst(cdata);
612 			launcher.launch1D(128, 128);
613 			destHisto = srcHisto;
614 		}
615 		else
616 		{
617 			//unsigned int sum; //for debugging
618 			m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
619 		}
620 
621 		if (src->size())
622 		{  //	local sort and distribute
623 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
624 			b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
625 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
626 			launcher.setConst(cdata);
627 			launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
628 		}
629 
630 		b3Swap(src, dst);
631 		b3Swap(srcHisto, destHisto);
632 
633 		count++;
634 	}
635 
636 	if (count & 1)
637 	{
638 		b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
639 	}
640 
641 	if (m_workBuffer4a->size())
642 	{
643 		m_workBuffer4a->resize(originalSize);
644 		keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
645 	}
646 }
647