1 
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
8 // This file implements the Tuner class (see the header for information about the class).
9 //
10 // -------------------------------------------------------------------------------------------------
11 //
12 // Copyright 2014 SURFsara
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 //  http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25 //
26 // =================================================================================================
27 
28 // The corresponding header file
29 #include "cltune.h"
30 
// And the implementation (Pimpl idiom)
32 #include "internal/tuner_impl.h"
33 
34 #include <iostream> // FILE
35 #include <limits> // std::numeric_limits
36 
37 namespace cltune {
38 // =================================================================================================
39 
40 // The implemenation of the constructors and destructors are hidden in the TunerImpl class
Tuner()41 Tuner::Tuner():
42     pimpl(new TunerImpl()) {
43 }
Tuner(size_t platform_id,size_t device_id)44 Tuner::Tuner(size_t platform_id, size_t device_id):
45     pimpl(new TunerImpl(platform_id, device_id)) {
46 }
~Tuner()47 Tuner::~Tuner() {
48 }
49 
50 // =================================================================================================
51 
52 // Loads the kernel source-code from a file and calls the function-overload below.
AddKernel(const std::vector<std::string> & filenames,const std::string & kernel_name,const IntRange & global,const IntRange & local)53 size_t Tuner::AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
54                         const IntRange &global, const IntRange &local) {
55   auto source = std::string{};
56   for (auto &filename: filenames) {
57     source += pimpl->LoadFile(filename);
58   }
59   return AddKernelFromString(source, kernel_name, global, local);
60 }
61 
62 // Loads the kernel source-code from a string and creates a new variable of type KernelInfo to store
63 // all the kernel-information.
AddKernelFromString(const std::string & source,const std::string & kernel_name,const IntRange & global,const IntRange & local)64 size_t Tuner::AddKernelFromString(const std::string &source, const std::string &kernel_name,
65                                   const IntRange &global, const IntRange &local) {
66   pimpl->kernels_.push_back(KernelInfo(kernel_name, source, pimpl->device()));
67   auto id = pimpl->kernels_.size() - 1;
68   pimpl->kernels_[id].set_global_base(global);
69   pimpl->kernels_[id].set_local_base(local);
70   return id;
71 }
72 
73 // =================================================================================================
74 
75 // Sets the reference kernel (source-code location, kernel name, global/local thread-sizes) and
76 // sets a flag to indicate that there is now a reference. Calling this function again will simply
77 // overwrite the old reference.
SetReference(const std::vector<std::string> & filenames,const std::string & kernel_name,const IntRange & global,const IntRange & local)78 void Tuner::SetReference(const std::vector<std::string> &filenames, const std::string &kernel_name,
79                          const IntRange &global, const IntRange &local) {
80   auto source = std::string{};
81   for (auto &filename: filenames) {
82     source += pimpl->LoadFile(filename);
83   }
84   SetReferenceFromString(source, kernel_name, global, local);
85 }
SetReferenceFromString(const std::string & source,const std::string & kernel_name,const IntRange & global,const IntRange & local)86 void Tuner::SetReferenceFromString(const std::string &source, const std::string &kernel_name,
87                                    const IntRange &global, const IntRange &local) {
88   pimpl->has_reference_ = true;
89   pimpl->reference_kernel_.reset(new KernelInfo(kernel_name, source, pimpl->device()));
90   pimpl->reference_kernel_->set_global_base(global);
91   pimpl->reference_kernel_->set_local_base(local);
92 }
93 
94 // =================================================================================================
95 
96 // Adds parameters for a kernel to tune. Also checks whether this parameter already exists.
AddParameter(const size_t id,const std::string & parameter_name,const std::vector<size_t> & values)97 void Tuner::AddParameter(const size_t id, const std::string &parameter_name,
98                          const std::vector<size_t> &values) {
99   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
100   if (pimpl->kernels_[id].ParameterExists(parameter_name)) {
101     throw std::runtime_error("Parameter already exists");
102   }
103   pimpl->kernels_[id].AddParameter(parameter_name, values);
104 }
105 
106 // As above, but now adds a single valued parameter to the reference
AddParameterReference(const std::string & parameter_name,const size_t value)107 void Tuner::AddParameterReference(const std::string &parameter_name, const size_t value) {
108   auto value_string = std::string{std::to_string(static_cast<long long>(value))};
109   pimpl->reference_kernel_->PrependSource("#define "+parameter_name+" "+value_string);
110 }
111 
112 // =================================================================================================
113 
114 // These functions forward their work (adding a modifier to global/local thread-sizes) to an object
115 // of KernelInfo class
MulGlobalSize(const size_t id,const StringRange range)116 void Tuner::MulGlobalSize(const size_t id, const StringRange range) {
117   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
118   pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kGlobalMul);
119 }
DivGlobalSize(const size_t id,const StringRange range)120 void Tuner::DivGlobalSize(const size_t id, const StringRange range) {
121   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
122   pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kGlobalDiv);
123 }
MulLocalSize(const size_t id,const StringRange range)124 void Tuner::MulLocalSize(const size_t id, const StringRange range) {
125   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
126   pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kLocalMul);
127 }
DivLocalSize(const size_t id,const StringRange range)128 void Tuner::DivLocalSize(const size_t id, const StringRange range) {
129   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
130   pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kLocalDiv);
131 }
132 
133 // Adds a contraint to the list of constraints for a particular kernel. First checks whether the
134 // kernel exists and whether the parameters exist.
AddConstraint(const size_t id,ConstraintFunction valid_if,const std::vector<std::string> & parameters)135 void Tuner::AddConstraint(const size_t id, ConstraintFunction valid_if,
136                           const std::vector<std::string> &parameters) {
137   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
138   for (auto &parameter: parameters) {
139     if (!pimpl->kernels_[id].ParameterExists(parameter)) {
140       throw std::runtime_error("Invalid parameter");
141     }
142   }
143   pimpl->kernels_[id].AddConstraint(valid_if, parameters);
144 }
145 
146 // As above, but for the local memory usage
SetLocalMemoryUsage(const size_t id,LocalMemoryFunction amount,const std::vector<std::string> & parameters)147 void Tuner::SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
148                                 const std::vector<std::string> &parameters) {
149   if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
150   for (auto &parameter: parameters) {
151     if (!pimpl->kernels_[id].ParameterExists(parameter)) {
152       throw std::runtime_error("Invalid parameter");
153     }
154   }
155   pimpl->kernels_[id].SetLocalMemoryUsage(amount, parameters);
156 }
157 
158 
159 // =================================================================================================
160 
161 // Creates a new buffer of type Memory (containing both host and device data) based on a source
162 // vector of data. Then, upload it to the device and store the argument in a list.
163 template <typename T>
AddArgumentInput(const std::vector<T> & source)164 void Tuner::AddArgumentInput(const std::vector<T> &source) {
165   auto device_buffer = Buffer<T>(pimpl->context(), BufferAccess::kNotOwned, source.size());
166   device_buffer.Write(pimpl->queue(), source.size(), source);
167   auto argument = TunerImpl::MemArgument{pimpl->argument_counter_++, source.size(),
168                                          pimpl->GetType<T>(), device_buffer()};
169   pimpl->arguments_input_.push_back(argument);
170 }
171 
172 // Compiles the function for various data-types
173 template void PUBLIC_API Tuner::AddArgumentInput<short>(const std::vector<short>&);
174 template void PUBLIC_API Tuner::AddArgumentInput<int>(const std::vector<int>&);
175 template void PUBLIC_API Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
176 template void PUBLIC_API Tuner::AddArgumentInput<half>(const std::vector<half>&);
177 template void PUBLIC_API Tuner::AddArgumentInput<float>(const std::vector<float>&);
178 template void PUBLIC_API Tuner::AddArgumentInput<double>(const std::vector<double>&);
179 template void PUBLIC_API Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
180 template void PUBLIC_API Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
181 
182 // Similar to the above function, but now marked as output buffer. Output buffers are special in the
183 // sense that they will be checked in the verification process.
184 template <typename T>
AddArgumentOutput(const std::vector<T> & source)185 void Tuner::AddArgumentOutput(const std::vector<T> &source) {
186   auto device_buffer = Buffer<T>(pimpl->context(), BufferAccess::kNotOwned, source.size());
187   device_buffer.Write(pimpl->queue(), source.size(), source);
188   auto argument = TunerImpl::MemArgument{pimpl->argument_counter_++, source.size(),
189                                          pimpl->GetType<T>(), device_buffer()};
190   pimpl->arguments_output_.push_back(argument);
191 }
192 
193 // Compiles the function for various data-types
194 template void PUBLIC_API Tuner::AddArgumentOutput<short>(const std::vector<short>&);
195 template void PUBLIC_API Tuner::AddArgumentOutput<int>(const std::vector<int>&);
196 template void PUBLIC_API Tuner::AddArgumentOutput<size_t>(const std::vector<size_t>&);
197 template void PUBLIC_API Tuner::AddArgumentOutput<half>(const std::vector<half>&);
198 template void PUBLIC_API Tuner::AddArgumentOutput<float>(const std::vector<float>&);
199 template void PUBLIC_API Tuner::AddArgumentOutput<double>(const std::vector<double>&);
200 template void PUBLIC_API Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
201 template void PUBLIC_API Tuner::AddArgumentOutput<double2>(const std::vector<double2>&);
202 
203 // Sets a scalar value as an argument to the kernel. Since a vector of scalars of any type doesn't
204 // exist, there is no general implemenation. Instead, each data-type has its specialised version in
205 // which it stores to a specific vector.
AddArgumentScalar(const short argument)206 template <> void PUBLIC_API Tuner::AddArgumentScalar<short>(const short argument) {
207   pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
208 }
AddArgumentScalar(const int argument)209 template <> void PUBLIC_API Tuner::AddArgumentScalar<int>(const int argument) {
210   pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
211 }
AddArgumentScalar(const size_t argument)212 template <> void PUBLIC_API Tuner::AddArgumentScalar<size_t>(const size_t argument) {
213   pimpl->arguments_size_t_.push_back({pimpl->argument_counter_++, argument});
214 }
AddArgumentScalar(const half argument)215 template <> void PUBLIC_API Tuner::AddArgumentScalar<half>(const half argument) {
216   pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
217 }
AddArgumentScalar(const float argument)218 template <> void PUBLIC_API Tuner::AddArgumentScalar<float>(const float argument) {
219   pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
220 }
AddArgumentScalar(const double argument)221 template <> void PUBLIC_API Tuner::AddArgumentScalar<double>(const double argument) {
222   pimpl->arguments_double_.push_back({pimpl->argument_counter_++, argument});
223 }
AddArgumentScalar(const float2 argument)224 template <> void PUBLIC_API Tuner::AddArgumentScalar<float2>(const float2 argument) {
225   pimpl->arguments_float2_.push_back({pimpl->argument_counter_++, argument});
226 }
AddArgumentScalar(const double2 argument)227 template <> void PUBLIC_API Tuner::AddArgumentScalar<double2>(const double2 argument) {
228   pimpl->arguments_double2_.push_back({pimpl->argument_counter_++, argument});
229 }
230 
231 // =================================================================================================
232 
233 // Use full search as a search strategy. This is the default method.
UseFullSearch()234 void Tuner::UseFullSearch() {
235   pimpl->search_method_ = SearchMethod::FullSearch;
236 }
237 
238 // Use random search as a search strategy.
UseRandomSearch(const double fraction)239 void Tuner::UseRandomSearch(const double fraction) {
240   pimpl->search_method_ = SearchMethod::RandomSearch;
241   pimpl->search_args_.push_back(fraction);
242 }
243 
244 // Use simulated annealing as a search strategy.
UseAnnealing(const double fraction,const double max_temperature)245 void Tuner::UseAnnealing(const double fraction, const double max_temperature) {
246   pimpl->search_method_ = SearchMethod::Annealing;
247   pimpl->search_args_.push_back(fraction);
248   pimpl->search_args_.push_back(max_temperature);
249 }
250 
251 // Use PSO as a search strategy.
UsePSO(const double fraction,const size_t swarm_size,const double influence_global,const double influence_local,const double influence_random)252 void Tuner::UsePSO(const double fraction, const size_t swarm_size, const double influence_global,
253                    const double influence_local, const double influence_random) {
254   pimpl->search_method_ = SearchMethod::PSO;
255   pimpl->search_args_.push_back(fraction);
256   pimpl->search_args_.push_back(static_cast<double>(swarm_size));
257   pimpl->search_args_.push_back(influence_global);
258   pimpl->search_args_.push_back(influence_local);
259   pimpl->search_args_.push_back(influence_random);
260 }
261 
262 
263 // Output the search process to a file. This is disabled per default.
OutputSearchLog(const std::string & filename)264 void Tuner::OutputSearchLog(const std::string &filename) {
265   pimpl->output_search_process_ = true;
266   pimpl->search_log_filename_ = filename;
267 }
268 
269 // =================================================================================================
270 
271 // Starts the tuning process. See the TunerImpl's implemenation for details
Tune()272 void Tuner::Tune() {
273   pimpl->Tune();
274 }
275 
276 // =================================================================================================
277 
278 // Fits a machine learning model. See the TunerImpl's implemenation for details
ModelPrediction(const Model model_type,const float validation_fraction,const size_t test_top_x_configurations)279 void Tuner::ModelPrediction(const Model model_type, const float validation_fraction,
280                             const size_t test_top_x_configurations) {
281   pimpl->ModelPrediction(model_type, validation_fraction, test_top_x_configurations);
282 }
283 
284 // =================================================================================================
285 
286 
287 // Retrieves the parameters of the best tuning result
GetBestResult() const288 std::unordered_map<std::string, size_t> Tuner::GetBestResult() const {
289   const auto best_result = pimpl->GetBestResult();
290   const auto best_configuration = best_result.configuration;
291 
292   // Converts the std::vector<KernelInfo::Setting> into an unordere map of strings and integers
293   auto parameters = std::unordered_map<std::string, size_t>{};
294   for (const auto &parameter_setting : best_configuration) {
295     parameters[parameter_setting.name] = parameter_setting.value;
296   }
297   return parameters;
298 }
299 
300 // Iterates over all tuning results and prints each parameter configuration and the corresponding
301 // timing-results. Printing is to stdout.
PrintToScreen() const302 double Tuner::PrintToScreen() const {
303 
304   // Finds the best result
305   const auto best_result = pimpl->GetBestResult();
306   const auto best_time = best_result.time;
307 
308   // Aborts if there was no best time found
309   if (best_time == std::numeric_limits<double>::max()) {
310     pimpl->PrintHeader("No tuner results found");
311     return 0.0;
312   }
313 
314   // Prints all valid results and the one with the lowest execution time
315   pimpl->PrintHeader("Printing results to stdout");
316   for (auto &tuning_result: pimpl->tuning_results_) {
317     if (tuning_result.status && tuning_result.time != std::numeric_limits<double>::max()) {
318       pimpl->PrintResult(stdout, tuning_result, pimpl->kMessageResult);
319     }
320   }
321   pimpl->PrintHeader("Printing best result to stdout");
322   pimpl->PrintResult(stdout, best_result, pimpl->kMessageBest);
323 
324   // Return the best time
325   return best_time;
326 }
327 
328 // Prints the best result in a neatly formatted C++ database format to screen
PrintFormatted() const329 void Tuner::PrintFormatted() const {
330 
331   // Finds the best result
332   const auto best_result = pimpl->GetBestResult();
333   const auto best_time = best_result.time;
334 
335   // Prints the best result in C++ database format
336   auto count = size_t{0};
337   pimpl->PrintHeader("Printing best result in database format to stdout");
338   fprintf(stdout, "{ \"%s\", { ", pimpl->device().Name().c_str());
339   for (auto &setting: best_result.configuration) {
340     fprintf(stdout, "%s", setting.GetDatabase().c_str());
341     if (count < best_result.configuration.size()-1) {
342       fprintf(stdout, ", ");
343     }
344     count++;
345   }
346   fprintf(stdout, " } }\n");
347 }
348 
349 // Outputs all results in a JSON database format
PrintJSON(const std::string & filename,const std::vector<std::pair<std::string,std::string>> & descriptions) const350 void Tuner::PrintJSON(const std::string &filename,
351                       const std::vector<std::pair<std::string,std::string>> &descriptions) const {
352 
353   // Prints the best result in JSON database format
354   pimpl->PrintHeader("Printing results to file in JSON format");
355   auto file = fopen(filename.c_str(), "w");
356   auto device_type = pimpl->device().Type();
357   fprintf(file, "{\n");
358   for (auto &description: descriptions) {
359     fprintf(file, "  \"%s\": \"%s\",\n", description.first.c_str(), description.second.c_str());
360   }
361   fprintf(file, "  \"device\": \"%s\",\n", pimpl->device().Name().c_str());
362   fprintf(file, "  \"device_vendor\": \"%s\",\n", pimpl->device().Vendor().c_str());
363   fprintf(file, "  \"device_type\": \"%s\",\n", device_type.c_str());
364   fprintf(file, "  \"device_core_clock\": \"%zu\",\n", pimpl->device().CoreClock());
365   fprintf(file, "  \"device_compute_units\": \"%zu\",\n", pimpl->device().ComputeUnits());
366   fprintf(file, "  \"results\": [\n");
367 
368   // Filters failed configurations
369   auto results = std::vector<TunerImpl::TunerResult>();
370   for (const auto &tuning_result: pimpl->tuning_results_) {
371     if (tuning_result.status && tuning_result.time != std::numeric_limits<double>::max()) {
372       results.push_back(tuning_result);
373     }
374   }
375 
376   // Loops over all the results
377   auto num_results = results.size();
378   for (auto r=size_t{0}; r<num_results; ++r) {
379     auto result = results[r];
380     fprintf(file, "    {\n");
381     fprintf(file, "      \"kernel\": \"%s\",\n", result.kernel_name.c_str());
382     fprintf(file, "      \"time\": %.3lf,\n", result.time);
383 
384     // Loops over all the parameters for this result
385     fprintf(file, "      \"parameters\": {");
386     auto num_configs = result.configuration.size();
387     for (auto p=size_t{0}; p<num_configs; ++p) {
388       auto config = result.configuration[p];
389       fprintf(file, "\"%s\": %zu", config.name.c_str(), config.value);
390       if (p < num_configs-1) { fprintf(file, ","); }
391     }
392     fprintf(file, "}\n");
393 
394     // The footer
395     fprintf(file, "    }");
396     if (r < num_results-1) { fprintf(file, ","); }
397     fprintf(file, "\n");
398   }
399   fprintf(file, "  ]\n");
400   fprintf(file, "}\n");
401   fclose(file);
402 }
403 
404 // Same as PrintToScreen, but now outputs into a file and does not mark the best-case
PrintToFile(const std::string & filename) const405 void Tuner::PrintToFile(const std::string &filename) const {
406   pimpl->PrintHeader("Printing results to file: "+filename);
407   auto file = fopen(filename.c_str(), "w");
408   std::vector<std::string> processed_kernels;
409   for (auto &tuning_result: pimpl->tuning_results_) {
410     if (tuning_result.status) {
411 
412       // Checks whether this is a kernel which hasn't been encountered yet
413       auto new_kernel = true;
414       for (auto &kernel_name: processed_kernels) {
415         if (kernel_name == tuning_result.kernel_name) { new_kernel = false; break; }
416       }
417       processed_kernels.push_back(tuning_result.kernel_name);
418 
419       // Prints the header in case of a new kernel name
420       if (new_kernel) {
421         fprintf(file, "name;time;threads;");
422         for (auto &setting: tuning_result.configuration) {
423           fprintf(file, "%s;", setting.name.c_str());
424         }
425         fprintf(file, "\n");
426       }
427 
428       // Prints an entry to file
429       fprintf(file, "%s;", tuning_result.kernel_name.c_str());
430       fprintf(file, "%.2lf;", tuning_result.time);
431       fprintf(file, "%zu;", tuning_result.threads);
432       for (auto &setting: tuning_result.configuration) {
433         fprintf(file, "%zu;", setting.value);
434       }
435       fprintf(file, "\n");
436     }
437   }
438   fclose(file);
439 }
440 
441 // Set the flag to suppress output to true. Note that this cannot be undone.
SuppressOutput()442 void Tuner::SuppressOutput() {
443   pimpl->suppress_output_ = true;
444 }
445 
446 // Sets the number of runs to average time measurements.
SetNumRuns(const size_t num_runs)447 void Tuner::SetNumRuns(const size_t num_runs) {
448   pimpl->num_runs_ = num_runs;
449 }
450 
451 // =================================================================================================
452 } // namespace cltune
453