1 
2 #ifndef B3_RADIXSORT32_H
3 #define B3_RADIXSORT32_H
4 
5 #include "b3OpenCLArray.h"
6 
/// Record made of two unsigned int fields.
/// Each field lives in its own anonymous union, so the same storage can be
/// accessed under two names: m_key / x for the first field and m_value / y
/// for the second.
/// NOTE(review): presumably the key/value element type handled by the sorter
/// declared below (it takes b3OpenCLArray<b3SortData>) - confirm in the .cpp.
7 struct b3SortData
8 {
	// First field: m_key and x alias the same unsigned int.
9 	union {
10 		unsigned int m_key;
11 		unsigned int x;
12 	};
13 
	// Second field: m_value and y alias the same unsigned int.
14 	union {
15 		unsigned int m_value;
16 		unsigned int y;
17 	};
18 };
19 #include "b3BufferInfoCL.h"
20 
/// Sorter interface operating on b3OpenCLArray buffers of unsigned int keys,
/// separate key/value arrays, or b3SortData records (see the execute()
/// overloads below). Only declarations are visible in this header; the
/// definitions live elsewhere.
/// NOTE(review): the name suggests a 32-bit radix sort run through OpenCL
/// kernels - confirm against the implementation file.
21 class b3RadixSort32CL
22 {
	// Everything up to the first "public:" is private (class default access).

	// Scratch arrays of unsigned int and of b3SortData, held by pointer.
	// NOTE(review): ownership and the role of each buffer are not visible in
	// this header - presumably allocated in the constructor and released in
	// the destructor; verify in the .cpp.
23 	b3OpenCLArray<unsigned int>* m_workBuffer1;
24 	b3OpenCLArray<unsigned int>* m_workBuffer2;
25 
26 	b3OpenCLArray<b3SortData>* m_workBuffer3;
27 	b3OpenCLArray<b3SortData>* m_workBuffer4;
28 
29 	b3OpenCLArray<unsigned int>* m_workBuffer3a;
30 	b3OpenCLArray<unsigned int>* m_workBuffer4a;
31 
	// OpenCL command queue handle; the constructor takes a cl_command_queue
	// argument (presumably stored here - TODO confirm).
32 	cl_command_queue m_commandQueue;
33 
	// OpenCL kernel handles. There are two variants of the count and
	// sort-and-scatter kernels: one whose name contains "SortData" and one
	// without. NOTE(review): presumably the b3SortData and keys-only code
	// paths respectively - confirm where the kernels are created.
34 	cl_kernel m_streamCountSortDataKernel;
35 	cl_kernel m_streamCountKernel;
36 
37 	cl_kernel m_prefixScanKernel;
38 	cl_kernel m_sortAndScatterSortDataKernel;
39 	cl_kernel m_sortAndScatterKernel;
40 
	// NOTE(review): looks like a flag for "the OpenCL device is a CPU";
	// how it changes behaviour is not visible here.
41 	bool m_deviceCPU;
42 
	// Helper objects held by pointer. The elaborated "class X*" form also
	// forward-declares the type, so no extra include is needed for these.
43 	class b3PrefixScanCL* m_scan;
44 	class b3FillCL* m_fill;
45 
46 public:
	/// Four-int parameter block.
	/// NOTE(review): presumably passed to the kernels as constant data
	/// (element count, work-group count, first bit of the current pass,
	/// blocks per work-group) - field meanings are inferred from the names.
47 	struct b3ConstData
48 	{
49 		int m_n;
50 		int m_nWGs;
51 		int m_startBit;
52 		int m_nBlocksPerWG;
53 	};
	/// Compile-time tuning constants.
	/// Derived values: ELEMENTS_PER_WORK_ITEM = 256 / 64 = 4,
	/// NUM_BUCKET = 1 << 4 = 16, NUM_WGS = 20 * 6 = 120.
54 	enum
55 	{
56 		DATA_ALIGNMENT = 256,
57 		WG_SIZE = 64,
58 		BLOCK_SIZE = 256,
59 		ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
60 		BITS_PER_PASS = 4,
61 		NUM_BUCKET = (1 << BITS_PER_PASS),
62 		//	if you change this, change nPerWI in kernel as well
63 		NUM_WGS = 20 * 6,  //	cypress
64 						   //			NUM_WGS = 24*6,	//	cayman
65 						   //			NUM_WGS = 32*4,	//	nv
66 	};
67 
	// The private section below is empty: access switches straight back to
	// public, so every declaration that follows is public.
68 private:
69 public:
	/// Constructs the sorter from an OpenCL context, device and command queue.
	/// initialCapacity defaults to 0.
	/// NOTE(review): presumably an initial size for the work buffers - confirm.
70 	b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
71 
	/// Virtual destructor, so deleting through a base-typed pointer is safe
	/// if this class is ever used polymorphically.
72 	virtual ~b3RadixSort32CL();
73 
	/// Overload taking separate input and output arrays for keys and values,
	/// an element count n, and sortBits (default 32).
	/// NOTE(review): presumably sortBits is the number of low key bits to
	/// sort on - confirm in the implementation.
74 	void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
75 				 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
76 
77 	///keys only
78 	void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
79 
	/// Overloads taking an array of b3SortData records by non-const
	/// reference (parameter named keyValuesInOut).
	/// NOTE(review): the executeHost variants presumably perform the sort on
	/// the host instead of the device; one takes an OpenCL array, the other
	/// a host-side b3AlignedObjectArray - confirm in the .cpp.
80 	void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
81 	void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
82 	void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
83 };
84 #endif  //B3_RADIXSORT32_H
85