1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #include <algorithm>
12 #include <iostream>
13 #include <vector>
14 
15 #include <boost/program_options.hpp>
16 
17 #include <boost/compute/system.hpp>
18 #include <boost/compute/algorithm/sort.hpp>
19 #include <boost/compute/algorithm/is_sorted.hpp>
20 #include <boost/compute/container/vector.hpp>
21 
22 #include "perf.hpp"
23 
24 namespace po = boost::program_options;
25 namespace compute = boost::compute;
26 
27 template<class T>
perf_sort(const std::vector<T> & data,const size_t trials,compute::command_queue & queue)28 double perf_sort(const std::vector<T>& data,
29                  const size_t trials,
30                  compute::command_queue& queue)
31 {
32     compute::vector<T> vec(data.size(), queue.get_context());
33 
34     perf_timer t;
35     for(size_t trial = 0; trial < trials; trial++){
36         compute::copy(data.begin(), data.end(), vec.begin(), queue);
37         t.start();
38         compute::sort(vec.begin(), vec.end(), queue);
39         queue.finish();
40         t.stop();
41 
42         if(!compute::is_sorted(vec.begin(), vec.end(), queue)){
43             std::cerr << "ERROR: is_sorted() returned false" << std::endl;
44         }
45     }
46     return t.min_time();
47 }
48 
49 template<class T>
tune_sort(const std::vector<T> & data,const size_t trials,compute::command_queue & queue)50 void tune_sort(const std::vector<T>& data,
51                const size_t trials,
52                compute::command_queue& queue)
53 {
54     boost::shared_ptr<compute::detail::parameter_cache>
55         params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
56 
57     const std::string cache_key =
58         std::string("__boost_radix_sort_") + compute::type_name<T>();
59 
60     const compute::uint_ tpbs[] = { 32, 64, 128, 256, 512, 1024 };
61 
62     double min_time = (std::numeric_limits<double>::max)();
63     compute::uint_ best_tpb = 0;
64 
65     for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
66         params->set(cache_key, "tpb", tpbs[i]);
67 
68         try {
69             const double t = perf_sort(data, trials, queue);
70             if(t < min_time){
71                 best_tpb = tpbs[i];
72                 min_time = t;
73             }
74         }
75         catch(compute::opencl_error&){
76             // invalid work group size for this device, skip
77         }
78     }
79 
80     // store optimal parameters
81     params->set(cache_key, "tpb", best_tpb);
82 }
83 
main(int argc,char * argv[])84 int main(int argc, char *argv[])
85 {
86     // setup command line arguments
87     po::options_description options("options");
88     options.add_options()
89         ("help", "show usage instructions")
90         ("size", po::value<size_t>()->default_value(8192), "input size")
91         ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
92         ("tune", "run tuning procedure")
93     ;
94     po::positional_options_description positional_options;
95     positional_options.add("size", 1);
96 
97     // parse command line
98     po::variables_map vm;
99     po::store(
100         po::command_line_parser(argc, argv)
101             .options(options).positional(positional_options).run(),
102         vm
103     );
104     po::notify(vm);
105 
106     const size_t size = vm["size"].as<size_t>();
107     const size_t trials = vm["trials"].as<size_t>();
108     std::cout << "size: " << size << std::endl;
109 
110     // setup context and queue for the default device
111     compute::device device = boost::compute::system::default_device();
112     compute::context context(device);
113     compute::command_queue queue(context, device);
114     std::cout << "device: " << device.name() << std::endl;
115 
116     // create vector of random numbers on the host
117     std::vector<unsigned int> data(size);
118     std::generate(data.begin(), data.end(), rand);
119 
120     // run tuning proceure (if requested)
121     if(vm.count("tune")){
122         tune_sort(data, trials, queue);
123     }
124 
125     // run sort benchmark
126     double t = perf_sort(data, trials, queue);
127     std::cout << "time: " << t / 1e6 << " ms" << std::endl;
128 
129     return 0;
130 }
131