1 /******************************************************************************
2  * Copyright 2010 Duane Merrill
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may ob3ain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  *
17  *
18  *
19  * AUTHORS' REQUEST:
20  *
21  * 		If you use|reference|benchmark this code, please cite our Technical
22  * 		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
23  *
24  *		@TechReport{ Merrill:Sorting:2010,
25  *        	author = "Duane Merrill and Andrew Grimshaw",
26  *        	title = "Revisiting Sorting for GPGPU Stream Architectures",
27  *        	year = "2010",
28  *        	institution = "University of Virginia, Department of Computer Science",
29  *        	address = "Charlottesville, VA, USA",
30  *        	number = "CS2010-03"
31  *		}
32  *
33  * For more information, see our Google Code project site:
34  * http://code.google.com/p/back40computing/
35  *
36  * Thanks!
37  ******************************************************************************/
38 
39 /******************************************************************************
40  * Simple test driver program for *large-problem* radix sorting.
41  *
42  * Useful for demonstrating how to integrate radix sorting into
43  * your application
44  ******************************************************************************/
45 
46 /******************************************************************************
47  * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
48  ******************************************************************************/
49 #ifdef _WIN32
50 #pragma warning(disable : 4996)
51 #endif
52 #include <stdlib.h>
53 #include <stdio.h>
54 #include <string.h>
55 #include <math.h>
56 #include <float.h>
57 #include <algorithm>
58 #include <string>
59 
60 //#include <iostream>
61 #include <sstream>
62 /**********************
63 *
64 */
65 
66 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
67 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
68 #include "../btgui/Bullet3AppSupport/b3Clock.h"
69 
70 cl_context g_cxMainContext;
71 cl_device_id g_device;
72 cl_command_queue g_cqCommandQueue;
73 
74 /***********************
75 *
76 */
77 
78 bool g_verbose;
79 ///Preferred OpenCL device/platform. When < 0 then no preference is used.
80 ///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
81 ///Preferred device/platform take priority over this platform-vendor match
82 int gPreferredDeviceId = -1;
83 int gPreferredPlatformId = -1;
84 
85 /******************************************************************************
86  * Routines
87  ******************************************************************************/
88 
89 /**
90  * Keys-only sorting.  Uses the GPU to sort the specified vector of elements for the given
91  * number of iterations, displaying runtime information.
92  *
93  * @param[in] 		num_elements
94  * 		Size in elements of the vector to sort
95  * @param[in] 		h_keys
96  * 		Vector of keys to sort
97  * @param[in] 		iterations
98  * 		Number of times to invoke the GPU sorting primitive
99   * @param[in] 		cfg
100  * 		Config
101  */
102 template <typename K>
TimedSort(unsigned int num_elements,K * h_keys,unsigned int iterations)103 void TimedSort(
104 	unsigned int num_elements,
105 	K *h_keys,
106 	unsigned int iterations)
107 {
108 	printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
109 
110 	int max_elements = num_elements;
111 	b3AlignedObjectArray<unsigned int> hostData;
112 	hostData.resize(num_elements);
113 	for (int i = 0; i < num_elements; i++)
114 	{
115 		hostData[i] = h_keys[i];
116 	}
117 
118 	b3RadixSort32CL sorter(g_cxMainContext, g_device, g_cqCommandQueue);
119 
120 	b3OpenCLArray<unsigned int> gpuData(g_cxMainContext, g_cqCommandQueue);
121 	gpuData.copyFromHost(hostData);
122 	//sorter.executeHost(gpuData);
123 	sorter.execute(gpuData);
124 
125 	b3AlignedObjectArray<unsigned int> hostDataSorted;
126 	gpuData.copyToHost(hostDataSorted);
127 
128 	clFinish(g_cqCommandQueue);
129 
130 	{
131 		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
132 
133 		// Create sorting enactor
134 
135 		// Perform the timed number of sorting iterations
136 		double elapsed = 0;
137 		float duration = 0;
138 		b3Clock watch;
139 
140 		//warm-start
141 		gpuData.copyFromHost(hostData);
142 		clFinish(g_cqCommandQueue);
143 		sorter.execute(gpuData);
144 
145 		watch.reset();
146 
147 		for (int i = 0; i < iterations; i++)
148 		{
149 			// Move a fresh copy of the problem into device storage
150 			gpuData.copyFromHost(hostData);
151 			clFinish(g_cqCommandQueue);
152 
153 			// Start GPU timing record
154 			double startMs = watch.getTimeMicroseconds() / 1e3;
155 
156 			// Call the sorting API routine
157 			sorter.execute(gpuData);
158 
159 			clFinish(g_cqCommandQueue);
160 
161 			double stopMs = watch.getTimeMicroseconds() / 1e3;
162 
163 			duration = stopMs - startMs;
164 
165 			// End GPU timing record
166 			elapsed += (double)duration;
167 			printf("duration = %f\n", duration);
168 		}
169 
170 		// Display timing information
171 		double avg_runtime = elapsed / iterations;
172 		//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
173 		//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
174 		double throughput = ((double)num_elements) / avg_runtime / 1000.0;
175 		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
176 
177 		gpuData.copyToHost(hostData);
178 		for (int i = 0; i < num_elements; i++)
179 		{
180 			h_keys[i] = hostData[i];
181 		}
182 	}
183 }
184 
185 /**
186  * Key-value sorting.  Uses the GPU to sort the specified vector of elements for the given
187  * number of iterations, displaying runtime information.
188  *
189  * @param[in] 		num_elements
190  * 		Size in elements of the vector to sort
191  * @param[in] 		h_keys
192  * 		Vector of keys to sort
193  * @param[in,out] 	h_values
194  * 		Vector of values to sort
195  * @param[in] 		iterations
196  * 		Number of times to invoke the GPU sorting primitive
197   * @param[in] 		cfg
198  * 		Config
199  */
200 template <typename K, typename V>
TimedSort(unsigned int num_elements,K * h_keys,V * h_values,unsigned int iterations)201 void TimedSort(
202 	unsigned int num_elements,
203 	K *h_keys,
204 	V *h_values,
205 	unsigned int iterations)
206 {
207 	printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
208 
209 	int max_elements = num_elements;
210 	b3AlignedObjectArray<b3SortData> hostData;
211 	hostData.resize(num_elements);
212 	for (int i = 0; i < num_elements; i++)
213 	{
214 		hostData[i].m_key = h_keys[i];
215 		hostData[i].m_value = h_values[i];
216 	}
217 
218 	b3RadixSort32CL sorter(g_cxMainContext, g_device, g_cqCommandQueue);
219 
220 	b3OpenCLArray<b3SortData> gpuData(g_cxMainContext, g_cqCommandQueue);
221 	gpuData.copyFromHost(hostData);
222 	//sorter.executeHost(gpuData);
223 	sorter.execute(gpuData);
224 
225 	b3AlignedObjectArray<b3SortData> hostDataSorted;
226 	gpuData.copyToHost(hostDataSorted);
227 #if 0
228     for (int i=0;i<num_elements;i++)
229 	{
230 		printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
231         printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
232 	}
233 #endif
234 
235 	clFinish(g_cqCommandQueue);
236 
237 	{
238 		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
239 
240 		// Create sorting enactor
241 
242 		// Perform the timed number of sorting iterations
243 		double elapsed = 0;
244 		float duration = 0;
245 		b3Clock watch;
246 
247 		//warm-start
248 		gpuData.copyFromHost(hostData);
249 		sorter.execute(gpuData);
250 		clFinish(g_cqCommandQueue);
251 
252 		watch.reset();
253 
254 		for (int i = 0; i < iterations; i++)
255 		{
256 			// Move a fresh copy of the problem into device storage
257 			gpuData.copyFromHost(hostData);
258 			clFinish(g_cqCommandQueue);
259 
260 			// Start GPU timing record
261 			double startMs = watch.getTimeMicroseconds() / 1e3;
262 
263 			// Call the sorting API routine
264 			sorter.execute(gpuData);
265 			clFinish(g_cqCommandQueue);
266 
267 			double stopMs = watch.getTimeMicroseconds() / 1e3;
268 
269 			duration = stopMs - startMs;
270 
271 			// End GPU timing record
272 			elapsed += (double)duration;
273 			printf("duration = %f\n", duration);
274 		}
275 
276 		// Display timing information
277 		double avg_runtime = elapsed / iterations;
278 		//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
279 		//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
280 		double throughput = ((double)num_elements) / avg_runtime / 1000.0;
281 		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
282 
283 		gpuData.copyToHost(hostData);
284 		for (int i = 0; i < num_elements; i++)
285 		{
286 			h_keys[i] = hostData[i].m_key;
287 			h_values[i] = hostData[i].m_value;
288 		}
289 	}
290 }
291 
292 /**
293  * Generates random 32-bit keys.
294  *
295  * We always take the second-order byte from rand() because the higher-order
296  * bits returned by rand() are commonly considered more uniformly distributed
297  * than the lower-order bits.
298  *
299  * We can decrease the entropy level of keys by adopting the technique
300  * of Thearling and Smith in which keys are computed from the bitwise AND of
301  * multiple random samples:
302  *
303  * entropy_reduction	| Effectively-unique bits per key
304  * -----------------------------------------------------
305  * -1					| 0
306  * 0					| 32
307  * 1					| 25.95
308  * 2					| 17.41
309  * 3					| 10.78
310  * 4					| 6.42
311  * ...					| ...
312  *
313  */
314 template <typename K>
RandomBits(K & key,int entropy_reduction=0,int lower_key_bits=sizeof (K)* 8)315 void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
316 {
317 	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
318 	unsigned char key_bits[NUM_UCHARS];
319 
320 	do
321 	{
322 		for (int j = 0; j < NUM_UCHARS; j++)
323 		{
324 			unsigned char quarterword = 0xff;
325 			for (int i = 0; i <= entropy_reduction; i++)
326 			{
327 				quarterword &= (rand() >> 7);
328 			}
329 			key_bits[j] = quarterword;
330 		}
331 
332 		if (lower_key_bits < sizeof(K) * 8)
333 		{
334 			unsigned long long base = 0;
335 			memcpy(&base, key_bits, sizeof(K));
336 			base &= (1 << lower_key_bits) - 1;
337 			memcpy(key_bits, &base, sizeof(K));
338 		}
339 
340 		memcpy(&key, key_bits, sizeof(K));
341 
342 	} while (key != key);  // avoids NaNs when generating random floating point numbers
343 }
344 
345 /******************************************************************************
346  * Templated routines for printing keys/values to the console
347  ******************************************************************************/
348 
349 template <typename T>
PrintValue(T val)350 void PrintValue(T val)
351 {
352 	printf("%d", val);
353 }
354 
355 template <>
PrintValue(float val)356 void PrintValue<float>(float val)
357 {
358 	printf("%f", val);
359 }
360 
361 template <>
PrintValue(double val)362 void PrintValue<double>(double val)
363 {
364 	printf("%f", val);
365 }
366 
367 template <>
PrintValue(unsigned char val)368 void PrintValue<unsigned char>(unsigned char val)
369 {
370 	printf("%u", val);
371 }
372 
373 template <>
PrintValue(unsigned short val)374 void PrintValue<unsigned short>(unsigned short val)
375 {
376 	printf("%u", val);
377 }
378 
379 template <>
PrintValue(unsigned int val)380 void PrintValue<unsigned int>(unsigned int val)
381 {
382 	printf("%u", val);
383 }
384 
385 template <>
PrintValue(long val)386 void PrintValue<long>(long val)
387 {
388 	printf("%ld", val);
389 }
390 
391 template <>
PrintValue(unsigned long val)392 void PrintValue<unsigned long>(unsigned long val)
393 {
394 	printf("%lu", val);
395 }
396 
397 template <>
PrintValue(long long val)398 void PrintValue<long long>(long long val)
399 {
400 	printf("%lld", val);
401 }
402 
403 template <>
PrintValue(unsigned long long val)404 void PrintValue<unsigned long long>(unsigned long long val)
405 {
406 	printf("%llu", val);
407 }
408 
409 /**
410  * Compares the equivalence of two arrays
411  */
412 template <typename T, typename SizeT>
CompareResults(T * computed,T * reference,SizeT len,bool verbose=true)413 int CompareResults(T *computed, T *reference, SizeT len, bool verbose = true)
414 {
415 	printf("\n");
416 	for (SizeT i = 0; i < len; i++)
417 	{
418 		if (computed[i] != reference[i])
419 		{
420 			printf("INCORRECT: [%lu]: ", (unsigned long)i);
421 			PrintValue<T>(computed[i]);
422 			printf(" != ");
423 			PrintValue<T>(reference[i]);
424 
425 			if (verbose)
426 			{
427 				printf("\nresult[...");
428 				for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++)
429 				{
430 					PrintValue<T>(computed[j]);
431 					printf(", ");
432 				}
433 				printf("...]");
434 				printf("\nreference[...");
435 				for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++)
436 				{
437 					PrintValue<T>(reference[j]);
438 					printf(", ");
439 				}
440 				printf("...]");
441 			}
442 
443 			return 1;
444 		}
445 	}
446 
447 	printf("CORRECT\n");
448 	return 0;
449 }
450 
451 /**
452  * Creates an example sorting problem whose keys is a vector of the specified
453  * number of K elements, values of V elements, and then dispatches the problem
454  * to the GPU for the given number of iterations, displaying runtime information.
455  *
456  * @param[in] 		iterations
457  * 		Number of times to invoke the GPU sorting primitive
458  * @param[in] 		num_elements
459  * 		Size in elements of the vector to sort
460  * @param[in] 		cfg
461  * 		Config
462  */
463 template <typename K, typename V>
TestSort(unsigned int iterations,int num_elements,bool keys_only)464 void TestSort(
465 	unsigned int iterations,
466 	int num_elements,
467 	bool keys_only)
468 {
469 	// Allocate the sorting problem on the host and fill the keys with random bytes
470 
471 	K *h_keys = NULL;
472 	K *h_reference_keys = NULL;
473 	V *h_values = NULL;
474 	h_keys = (K *)malloc(num_elements * sizeof(K));
475 	h_reference_keys = (K *)malloc(num_elements * sizeof(K));
476 	if (!keys_only) h_values = (V *)malloc(num_elements * sizeof(V));
477 
478 	// Use random bits
479 	for (unsigned int i = 0; i < num_elements; ++i)
480 	{
481 		RandomBits<K>(h_keys[i], 0);
482 		//h_keys[i] = num_elements-i;
483 		//h_keys[i] = 0xffffffffu-i;
484 		if (!keys_only)
485 			h_values[i] = h_keys[i];  //0xffffffffu-i;
486 
487 		h_reference_keys[i] = h_keys[i];
488 	}
489 
490 	// Run the timing test
491 	if (keys_only)
492 	{
493 		TimedSort<K>(num_elements, h_keys, iterations);
494 	}
495 	else
496 	{
497 		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
498 	}
499 
500 	//	cudaThreadSynchronize();
501 
502 	// Display sorted key data
503 	if (g_verbose)
504 	{
505 		printf("\n\nKeys:\n");
506 		for (int i = 0; i < num_elements; i++)
507 		{
508 			PrintValue<K>(h_keys[i]);
509 			printf(", ");
510 		}
511 		printf("\n\n");
512 	}
513 
514 	// Verify solution
515 	std::sort(h_reference_keys, h_reference_keys + num_elements);
516 	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
517 	printf("\n");
518 	fflush(stdout);
519 
520 	// Free our allocated host memory
521 	if (h_keys != NULL) free(h_keys);
522 	if (h_values != NULL) free(h_values);
523 }
524 
525 /**
526  * Displays the commandline usage for this tool
527  */
Usage()528 void Usage()
529 {
530 	printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
531 	printf("\n");
532 	printf("\t--v\tDisplays sorted results to the console.\n");
533 	printf("\n");
534 	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
535 	printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
536 	printf("\n");
537 	printf("\t--n\tThe number of elements to comprise the sample problem\n");
538 	printf("\t\t\tDefault = 512\n");
539 	printf("\n");
540 	printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
541 	printf("\n");
542 }
543 
544 /******************************************************************************
545  * Command-line parsing
546  ******************************************************************************/
547 #include <map>
548 #include <algorithm>
549 #include <string>
550 
551 class b3CommandLineArgs
552 {
553 protected:
554 	std::map<std::string, std::string> pairs;
555 
556 public:
557 	// Constructor
b3CommandLineArgs(int argc,char ** argv)558 	b3CommandLineArgs(int argc, char **argv)
559 	{
560 		using namespace std;
561 
562 		for (int i = 1; i < argc; i++)
563 		{
564 			string arg = argv[i];
565 
566 			if ((arg[0] != '-') || (arg[1] != '-'))
567 			{
568 				continue;
569 			}
570 
571 			string::size_type pos;
572 			string key, val;
573 			if ((pos = arg.find('=')) == string::npos)
574 			{
575 				key = string(arg, 2, arg.length() - 2);
576 				val = "";
577 			}
578 			else
579 			{
580 				key = string(arg, 2, pos - 2);
581 				val = string(arg, pos + 1, arg.length() - 1);
582 			}
583 			pairs[key] = val;
584 		}
585 	}
586 
CheckCmdLineFlag(const char * arg_name)587 	bool CheckCmdLineFlag(const char *arg_name)
588 	{
589 		using namespace std;
590 		map<string, string>::iterator itr;
591 		if ((itr = pairs.find(arg_name)) != pairs.end())
592 		{
593 			return true;
594 		}
595 		return false;
596 	}
597 
598 	template <typename T>
599 	void GetCmdLineArgument(const char *arg_name, T &val);
600 
ParsedArgc()601 	int ParsedArgc()
602 	{
603 		return pairs.size();
604 	}
605 };
606 
607 template <typename T>
GetCmdLineArgument(const char * arg_name,T & val)608 void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
609 {
610 	using namespace std;
611 	map<string, string>::iterator itr;
612 	if ((itr = pairs.find(arg_name)) != pairs.end())
613 	{
614 		istringstream strstream(itr->second);
615 		strstream >> val;
616 	}
617 }
618 
619 template <>
GetCmdLineArgument(const char * arg_name,char * & val)620 void b3CommandLineArgs::GetCmdLineArgument<char *>(const char *arg_name, char *&val)
621 {
622 	using namespace std;
623 	map<string, string>::iterator itr;
624 	if ((itr = pairs.find(arg_name)) != pairs.end())
625 	{
626 		string s = itr->second;
627 		val = (char *)malloc(sizeof(char) * (s.length() + 1));
628 		strcpy(val, s.c_str());
629 	}
630 	else
631 	{
632 		val = NULL;
633 	}
634 }
635 
636 /******************************************************************************
637  * Main
638  ******************************************************************************/
639 
640 extern bool gDebugSkipLoadingBinary;
641 
myprintf(const char * msg)642 void myprintf(const char *msg)
643 {
644 	(void *)msg;
645 }
main(int argc,char ** argv)646 int main(int argc, char **argv)
647 {
648 	//gDebugSkipLoadingBinary = true;
649 
650 	//	b3SetCustomPrintfFunc(myprintf);
651 
652 	cl_int ciErrNum;
653 	b3CommandLineArgs args(argc, argv);
654 
655 	args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
656 	args.GetCmdLineArgument("platformId", gPreferredPlatformId);
657 
658 	b3Printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
659 	cl_platform_id platformId;
660 	//	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
661 	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0, gPreferredDeviceId, gPreferredPlatformId, &platformId);
662 	//g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
663 
664 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
665 
666 	int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
667 
668 	if (!numDev)
669 	{
670 		b3Error("error: no OpenCL devices\n");
671 		exit(0);
672 	}
673 	int devId = 0;
674 	g_device = b3OpenCLUtils_getDevice(g_cxMainContext, devId);
675 	b3OpenCLUtils_printDeviceInfo(g_device);
676 	// create a command-queue
677 	g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
678 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
679 
680 	//srand(time(NULL));
681 	srand(0);  // presently deterministic
682 
683 	unsigned int num_elements = 8 * 1024 * 1024;  //4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
684 	unsigned int iterations = 10;
685 	bool keys_only = true;
686 
687 	//
688 	// Check command line arguments
689 	//
690 
691 	if (args.CheckCmdLineFlag("help"))
692 	{
693 		Usage();
694 		return 0;
695 	}
696 
697 	args.GetCmdLineArgument("i", iterations);
698 	args.GetCmdLineArgument("n", num_elements);
699 
700 	keys_only = !args.CheckCmdLineFlag("key-values");
701 	g_verbose = args.CheckCmdLineFlag("v");
702 
703 	TestSort<unsigned int, unsigned int>(
704 		iterations,
705 		num_elements,
706 		keys_only);
707 }
708