1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #include <algorithm>
12 #include <iostream>
13 #include <vector>
14 
15 #include <boost/program_options.hpp>
16 
17 #include <boost/compute/lambda.hpp>
18 #include <boost/compute/system.hpp>
19 #include <boost/compute/algorithm/copy.hpp>
20 #include <boost/compute/algorithm/transform.hpp>
21 #include <boost/compute/container/vector.hpp>
22 
23 #include "perf.hpp"
24 
25 namespace po = boost::program_options;
26 namespace compute = boost::compute;
27 
// Returns a pseudo-random float in the range [0, 1000], derived from the
// next value of the C library generator rand().
float rand_float()
{
    // scale the raw integer sample into [0, 1] first ...
    const float unit = float(rand()) / float(RAND_MAX);

    // ... then stretch it to [0, 1000]
    return unit * 1000.f;
}
32 
33 template<class T>
perf_saxpy(const compute::vector<T> & x,const compute::vector<T> & y,const T alpha,const size_t trials,compute::command_queue & queue)34 double perf_saxpy(const compute::vector<T>& x,
35                   const compute::vector<T>& y,
36                   const T alpha,
37                   const size_t trials,
38                   compute::command_queue& queue)
39 {
40     // create vector on the device to store the result
41     compute::vector<T> result(x.size(), queue.get_context());
42 
43     perf_timer t;
44     for(size_t trial = 0; trial < trials; trial++){
45         compute::fill(result.begin(), result.end(), T(0), queue);
46         queue.finish();
47 
48         t.start();
49 
50         using compute::lambda::_1;
51         using compute::lambda::_2;
52 
53         compute::transform(
54             x.begin(), x.end(), y.begin(), result.begin(), alpha * _1 + _2, queue
55         );
56 
57         queue.finish();
58         t.stop();
59     }
60 
61     return t.min_time();
62 }
63 
64 template<class T>
tune_saxpy(const compute::vector<T> & x,const compute::vector<T> & y,const T alpha,const size_t trials,compute::command_queue & queue)65 void tune_saxpy(const compute::vector<T>& x,
66                 const compute::vector<T>& y,
67                 const T alpha,
68                 const size_t trials,
69                 compute::command_queue& queue)
70 {
71     boost::shared_ptr<compute::detail::parameter_cache>
72         params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
73 
74     const std::string cache_key =
75         std::string("__boost_copy_kernel_") + boost::lexical_cast<std::string>(sizeof(T));
76 
77     const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
78     const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
79 
80     double min_time = (std::numeric_limits<double>::max)();
81     compute::uint_ best_tpb = 0;
82     compute::uint_ best_vpt = 0;
83 
84     for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
85         params->set(cache_key, "tpb", tpbs[i]);
86         for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
87             params->set(cache_key, "vpt", vpts[j]);
88 
89             try {
90                 const double t = perf_saxpy(x, y, alpha, trials, queue);
91                 if(t < min_time){
92                     best_tpb = tpbs[i];
93                     best_vpt = vpts[j];
94                     min_time = t;
95                 }
96             }
97             catch(compute::opencl_error&){
98                 // invalid parameters for this device, skip
99             }
100         }
101     }
102 
103     // store optimal parameters
104     params->set(cache_key, "tpb", best_tpb);
105     params->set(cache_key, "vpt", best_vpt);
106 }
107 
main(int argc,char * argv[])108 int main(int argc, char *argv[])
109 {
110     // setup command line arguments
111     po::options_description options("options");
112     options.add_options()
113         ("help", "show usage instructions")
114         ("size", po::value<size_t>()->default_value(8192), "input size")
115         ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
116         ("tune", "run tuning procedure")
117         ("alpha", po::value<double>()->default_value(2.5), "saxpy alpha value")
118     ;
119     po::positional_options_description positional_options;
120     positional_options.add("size", 1);
121 
122     // parse command line
123     po::variables_map vm;
124     po::store(
125         po::command_line_parser(argc, argv)
126             .options(options).positional(positional_options).run(),
127         vm
128     );
129     po::notify(vm);
130 
131     const size_t size = vm["size"].as<size_t>();
132     const size_t trials = vm["trials"].as<size_t>();
133     const float alpha = vm["alpha"].as<double>();
134     std::cout << "size: " << size << std::endl;
135 
136     // setup context and queue for the default device
137     compute::device device = boost::compute::system::default_device();
138     compute::context context(device);
139     compute::command_queue queue(context, device);
140     std::cout << "device: " << device.name() << std::endl;
141 
142     // create vector of random numbers on the host
143     std::vector<float> host_x(size);
144     std::vector<float> host_y(size);
145     std::generate(host_x.begin(), host_x.end(), rand_float);
146     std::generate(host_y.begin(), host_y.end(), rand_float);
147 
148     // create vector on the device and copy the data
149     compute::vector<float> x(host_x.begin(), host_x.end(), queue);
150     compute::vector<float> y(host_y.begin(), host_y.end(), queue);
151 
152     // run tuning proceure (if requested)
153     if(vm.count("tune")){
154         tune_saxpy(x, y, alpha, trials, queue);
155     }
156 
157     // run benchmark
158     double t = perf_saxpy(x, y, alpha, trials, queue);
159     std::cout << "time: " << t / 1e6 << " ms" << std::endl;
160 
161     return 0;
162 }
163