1
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
8 // This file implements the Tuner class (see the header for information about the class).
9 //
10 // -------------------------------------------------------------------------------------------------
11 //
12 // Copyright 2014 SURFsara
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25 //
26 // =================================================================================================
27
28 // The corresponding header file
29 #include "cltune.h"
30
// And the implementation (Pimpl idiom)
32 #include "internal/tuner_impl.h"
33
#include <algorithm> // std::find
#include <cstdio> // fopen, fprintf, snprintf, fclose
#include <iostream> // FILE
#include <limits> // std::numeric_limits
#include <memory> // std::unique_ptr
#include <stdexcept> // std::runtime_error
#include <string> // std::string, std::to_string
36
37 namespace cltune {
38 // =================================================================================================
39
// The implementations of the constructors and the destructor are hidden in the TunerImpl class
Tuner()41 Tuner::Tuner():
42 pimpl(new TunerImpl()) {
43 }
Tuner(size_t platform_id,size_t device_id)44 Tuner::Tuner(size_t platform_id, size_t device_id):
45 pimpl(new TunerImpl(platform_id, device_id)) {
46 }
~Tuner()47 Tuner::~Tuner() {
48 }
49
50 // =================================================================================================
51
52 // Loads the kernel source-code from a file and calls the function-overload below.
AddKernel(const std::vector<std::string> & filenames,const std::string & kernel_name,const IntRange & global,const IntRange & local)53 size_t Tuner::AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
54 const IntRange &global, const IntRange &local) {
55 auto source = std::string{};
56 for (auto &filename: filenames) {
57 source += pimpl->LoadFile(filename);
58 }
59 return AddKernelFromString(source, kernel_name, global, local);
60 }
61
62 // Loads the kernel source-code from a string and creates a new variable of type KernelInfo to store
63 // all the kernel-information.
AddKernelFromString(const std::string & source,const std::string & kernel_name,const IntRange & global,const IntRange & local)64 size_t Tuner::AddKernelFromString(const std::string &source, const std::string &kernel_name,
65 const IntRange &global, const IntRange &local) {
66 pimpl->kernels_.push_back(KernelInfo(kernel_name, source, pimpl->device()));
67 auto id = pimpl->kernels_.size() - 1;
68 pimpl->kernels_[id].set_global_base(global);
69 pimpl->kernels_[id].set_local_base(local);
70 return id;
71 }
72
73 // =================================================================================================
74
75 // Sets the reference kernel (source-code location, kernel name, global/local thread-sizes) and
76 // sets a flag to indicate that there is now a reference. Calling this function again will simply
77 // overwrite the old reference.
SetReference(const std::vector<std::string> & filenames,const std::string & kernel_name,const IntRange & global,const IntRange & local)78 void Tuner::SetReference(const std::vector<std::string> &filenames, const std::string &kernel_name,
79 const IntRange &global, const IntRange &local) {
80 auto source = std::string{};
81 for (auto &filename: filenames) {
82 source += pimpl->LoadFile(filename);
83 }
84 SetReferenceFromString(source, kernel_name, global, local);
85 }
SetReferenceFromString(const std::string & source,const std::string & kernel_name,const IntRange & global,const IntRange & local)86 void Tuner::SetReferenceFromString(const std::string &source, const std::string &kernel_name,
87 const IntRange &global, const IntRange &local) {
88 pimpl->has_reference_ = true;
89 pimpl->reference_kernel_.reset(new KernelInfo(kernel_name, source, pimpl->device()));
90 pimpl->reference_kernel_->set_global_base(global);
91 pimpl->reference_kernel_->set_local_base(local);
92 }
93
94 // =================================================================================================
95
96 // Adds parameters for a kernel to tune. Also checks whether this parameter already exists.
AddParameter(const size_t id,const std::string & parameter_name,const std::vector<size_t> & values)97 void Tuner::AddParameter(const size_t id, const std::string ¶meter_name,
98 const std::vector<size_t> &values) {
99 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
100 if (pimpl->kernels_[id].ParameterExists(parameter_name)) {
101 throw std::runtime_error("Parameter already exists");
102 }
103 pimpl->kernels_[id].AddParameter(parameter_name, values);
104 }
105
106 // As above, but now adds a single valued parameter to the reference
AddParameterReference(const std::string & parameter_name,const size_t value)107 void Tuner::AddParameterReference(const std::string ¶meter_name, const size_t value) {
108 auto value_string = std::string{std::to_string(static_cast<long long>(value))};
109 pimpl->reference_kernel_->PrependSource("#define "+parameter_name+" "+value_string);
110 }
111
112 // =================================================================================================
113
114 // These functions forward their work (adding a modifier to global/local thread-sizes) to an object
115 // of KernelInfo class
MulGlobalSize(const size_t id,const StringRange range)116 void Tuner::MulGlobalSize(const size_t id, const StringRange range) {
117 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
118 pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kGlobalMul);
119 }
DivGlobalSize(const size_t id,const StringRange range)120 void Tuner::DivGlobalSize(const size_t id, const StringRange range) {
121 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
122 pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kGlobalDiv);
123 }
MulLocalSize(const size_t id,const StringRange range)124 void Tuner::MulLocalSize(const size_t id, const StringRange range) {
125 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
126 pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kLocalMul);
127 }
DivLocalSize(const size_t id,const StringRange range)128 void Tuner::DivLocalSize(const size_t id, const StringRange range) {
129 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
130 pimpl->kernels_[id].AddModifier(range, KernelInfo::ThreadSizeModifierType::kLocalDiv);
131 }
132
133 // Adds a contraint to the list of constraints for a particular kernel. First checks whether the
134 // kernel exists and whether the parameters exist.
AddConstraint(const size_t id,ConstraintFunction valid_if,const std::vector<std::string> & parameters)135 void Tuner::AddConstraint(const size_t id, ConstraintFunction valid_if,
136 const std::vector<std::string> ¶meters) {
137 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
138 for (auto ¶meter: parameters) {
139 if (!pimpl->kernels_[id].ParameterExists(parameter)) {
140 throw std::runtime_error("Invalid parameter");
141 }
142 }
143 pimpl->kernels_[id].AddConstraint(valid_if, parameters);
144 }
145
146 // As above, but for the local memory usage
SetLocalMemoryUsage(const size_t id,LocalMemoryFunction amount,const std::vector<std::string> & parameters)147 void Tuner::SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
148 const std::vector<std::string> ¶meters) {
149 if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); }
150 for (auto ¶meter: parameters) {
151 if (!pimpl->kernels_[id].ParameterExists(parameter)) {
152 throw std::runtime_error("Invalid parameter");
153 }
154 }
155 pimpl->kernels_[id].SetLocalMemoryUsage(amount, parameters);
156 }
157
158
159 // =================================================================================================
160
161 // Creates a new buffer of type Memory (containing both host and device data) based on a source
162 // vector of data. Then, upload it to the device and store the argument in a list.
163 template <typename T>
AddArgumentInput(const std::vector<T> & source)164 void Tuner::AddArgumentInput(const std::vector<T> &source) {
165 auto device_buffer = Buffer<T>(pimpl->context(), BufferAccess::kNotOwned, source.size());
166 device_buffer.Write(pimpl->queue(), source.size(), source);
167 auto argument = TunerImpl::MemArgument{pimpl->argument_counter_++, source.size(),
168 pimpl->GetType<T>(), device_buffer()};
169 pimpl->arguments_input_.push_back(argument);
170 }
171
172 // Compiles the function for various data-types
173 template void PUBLIC_API Tuner::AddArgumentInput<short>(const std::vector<short>&);
174 template void PUBLIC_API Tuner::AddArgumentInput<int>(const std::vector<int>&);
175 template void PUBLIC_API Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
176 template void PUBLIC_API Tuner::AddArgumentInput<half>(const std::vector<half>&);
177 template void PUBLIC_API Tuner::AddArgumentInput<float>(const std::vector<float>&);
178 template void PUBLIC_API Tuner::AddArgumentInput<double>(const std::vector<double>&);
179 template void PUBLIC_API Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
180 template void PUBLIC_API Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
181
182 // Similar to the above function, but now marked as output buffer. Output buffers are special in the
183 // sense that they will be checked in the verification process.
184 template <typename T>
AddArgumentOutput(const std::vector<T> & source)185 void Tuner::AddArgumentOutput(const std::vector<T> &source) {
186 auto device_buffer = Buffer<T>(pimpl->context(), BufferAccess::kNotOwned, source.size());
187 device_buffer.Write(pimpl->queue(), source.size(), source);
188 auto argument = TunerImpl::MemArgument{pimpl->argument_counter_++, source.size(),
189 pimpl->GetType<T>(), device_buffer()};
190 pimpl->arguments_output_.push_back(argument);
191 }
192
193 // Compiles the function for various data-types
194 template void PUBLIC_API Tuner::AddArgumentOutput<short>(const std::vector<short>&);
195 template void PUBLIC_API Tuner::AddArgumentOutput<int>(const std::vector<int>&);
196 template void PUBLIC_API Tuner::AddArgumentOutput<size_t>(const std::vector<size_t>&);
197 template void PUBLIC_API Tuner::AddArgumentOutput<half>(const std::vector<half>&);
198 template void PUBLIC_API Tuner::AddArgumentOutput<float>(const std::vector<float>&);
199 template void PUBLIC_API Tuner::AddArgumentOutput<double>(const std::vector<double>&);
200 template void PUBLIC_API Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
201 template void PUBLIC_API Tuner::AddArgumentOutput<double2>(const std::vector<double2>&);
202
203 // Sets a scalar value as an argument to the kernel. Since a vector of scalars of any type doesn't
204 // exist, there is no general implemenation. Instead, each data-type has its specialised version in
205 // which it stores to a specific vector.
AddArgumentScalar(const short argument)206 template <> void PUBLIC_API Tuner::AddArgumentScalar<short>(const short argument) {
207 pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
208 }
AddArgumentScalar(const int argument)209 template <> void PUBLIC_API Tuner::AddArgumentScalar<int>(const int argument) {
210 pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
211 }
AddArgumentScalar(const size_t argument)212 template <> void PUBLIC_API Tuner::AddArgumentScalar<size_t>(const size_t argument) {
213 pimpl->arguments_size_t_.push_back({pimpl->argument_counter_++, argument});
214 }
AddArgumentScalar(const half argument)215 template <> void PUBLIC_API Tuner::AddArgumentScalar<half>(const half argument) {
216 pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
217 }
AddArgumentScalar(const float argument)218 template <> void PUBLIC_API Tuner::AddArgumentScalar<float>(const float argument) {
219 pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
220 }
AddArgumentScalar(const double argument)221 template <> void PUBLIC_API Tuner::AddArgumentScalar<double>(const double argument) {
222 pimpl->arguments_double_.push_back({pimpl->argument_counter_++, argument});
223 }
AddArgumentScalar(const float2 argument)224 template <> void PUBLIC_API Tuner::AddArgumentScalar<float2>(const float2 argument) {
225 pimpl->arguments_float2_.push_back({pimpl->argument_counter_++, argument});
226 }
AddArgumentScalar(const double2 argument)227 template <> void PUBLIC_API Tuner::AddArgumentScalar<double2>(const double2 argument) {
228 pimpl->arguments_double2_.push_back({pimpl->argument_counter_++, argument});
229 }
230
231 // =================================================================================================
232
233 // Use full search as a search strategy. This is the default method.
UseFullSearch()234 void Tuner::UseFullSearch() {
235 pimpl->search_method_ = SearchMethod::FullSearch;
236 }
237
238 // Use random search as a search strategy.
UseRandomSearch(const double fraction)239 void Tuner::UseRandomSearch(const double fraction) {
240 pimpl->search_method_ = SearchMethod::RandomSearch;
241 pimpl->search_args_.push_back(fraction);
242 }
243
244 // Use simulated annealing as a search strategy.
UseAnnealing(const double fraction,const double max_temperature)245 void Tuner::UseAnnealing(const double fraction, const double max_temperature) {
246 pimpl->search_method_ = SearchMethod::Annealing;
247 pimpl->search_args_.push_back(fraction);
248 pimpl->search_args_.push_back(max_temperature);
249 }
250
251 // Use PSO as a search strategy.
UsePSO(const double fraction,const size_t swarm_size,const double influence_global,const double influence_local,const double influence_random)252 void Tuner::UsePSO(const double fraction, const size_t swarm_size, const double influence_global,
253 const double influence_local, const double influence_random) {
254 pimpl->search_method_ = SearchMethod::PSO;
255 pimpl->search_args_.push_back(fraction);
256 pimpl->search_args_.push_back(static_cast<double>(swarm_size));
257 pimpl->search_args_.push_back(influence_global);
258 pimpl->search_args_.push_back(influence_local);
259 pimpl->search_args_.push_back(influence_random);
260 }
261
262
263 // Output the search process to a file. This is disabled per default.
OutputSearchLog(const std::string & filename)264 void Tuner::OutputSearchLog(const std::string &filename) {
265 pimpl->output_search_process_ = true;
266 pimpl->search_log_filename_ = filename;
267 }
268
269 // =================================================================================================
270
271 // Starts the tuning process. See the TunerImpl's implemenation for details
Tune()272 void Tuner::Tune() {
273 pimpl->Tune();
274 }
275
276 // =================================================================================================
277
278 // Fits a machine learning model. See the TunerImpl's implemenation for details
ModelPrediction(const Model model_type,const float validation_fraction,const size_t test_top_x_configurations)279 void Tuner::ModelPrediction(const Model model_type, const float validation_fraction,
280 const size_t test_top_x_configurations) {
281 pimpl->ModelPrediction(model_type, validation_fraction, test_top_x_configurations);
282 }
283
284 // =================================================================================================
285
286
287 // Retrieves the parameters of the best tuning result
GetBestResult() const288 std::unordered_map<std::string, size_t> Tuner::GetBestResult() const {
289 const auto best_result = pimpl->GetBestResult();
290 const auto best_configuration = best_result.configuration;
291
292 // Converts the std::vector<KernelInfo::Setting> into an unordere map of strings and integers
293 auto parameters = std::unordered_map<std::string, size_t>{};
294 for (const auto ¶meter_setting : best_configuration) {
295 parameters[parameter_setting.name] = parameter_setting.value;
296 }
297 return parameters;
298 }
299
300 // Iterates over all tuning results and prints each parameter configuration and the corresponding
301 // timing-results. Printing is to stdout.
PrintToScreen() const302 double Tuner::PrintToScreen() const {
303
304 // Finds the best result
305 const auto best_result = pimpl->GetBestResult();
306 const auto best_time = best_result.time;
307
308 // Aborts if there was no best time found
309 if (best_time == std::numeric_limits<double>::max()) {
310 pimpl->PrintHeader("No tuner results found");
311 return 0.0;
312 }
313
314 // Prints all valid results and the one with the lowest execution time
315 pimpl->PrintHeader("Printing results to stdout");
316 for (auto &tuning_result: pimpl->tuning_results_) {
317 if (tuning_result.status && tuning_result.time != std::numeric_limits<double>::max()) {
318 pimpl->PrintResult(stdout, tuning_result, pimpl->kMessageResult);
319 }
320 }
321 pimpl->PrintHeader("Printing best result to stdout");
322 pimpl->PrintResult(stdout, best_result, pimpl->kMessageBest);
323
324 // Return the best time
325 return best_time;
326 }
327
328 // Prints the best result in a neatly formatted C++ database format to screen
PrintFormatted() const329 void Tuner::PrintFormatted() const {
330
331 // Finds the best result
332 const auto best_result = pimpl->GetBestResult();
333 const auto best_time = best_result.time;
334
335 // Prints the best result in C++ database format
336 auto count = size_t{0};
337 pimpl->PrintHeader("Printing best result in database format to stdout");
338 fprintf(stdout, "{ \"%s\", { ", pimpl->device().Name().c_str());
339 for (auto &setting: best_result.configuration) {
340 fprintf(stdout, "%s", setting.GetDatabase().c_str());
341 if (count < best_result.configuration.size()-1) {
342 fprintf(stdout, ", ");
343 }
344 count++;
345 }
346 fprintf(stdout, " } }\n");
347 }
348
349 // Outputs all results in a JSON database format
PrintJSON(const std::string & filename,const std::vector<std::pair<std::string,std::string>> & descriptions) const350 void Tuner::PrintJSON(const std::string &filename,
351 const std::vector<std::pair<std::string,std::string>> &descriptions) const {
352
353 // Prints the best result in JSON database format
354 pimpl->PrintHeader("Printing results to file in JSON format");
355 auto file = fopen(filename.c_str(), "w");
356 auto device_type = pimpl->device().Type();
357 fprintf(file, "{\n");
358 for (auto &description: descriptions) {
359 fprintf(file, " \"%s\": \"%s\",\n", description.first.c_str(), description.second.c_str());
360 }
361 fprintf(file, " \"device\": \"%s\",\n", pimpl->device().Name().c_str());
362 fprintf(file, " \"device_vendor\": \"%s\",\n", pimpl->device().Vendor().c_str());
363 fprintf(file, " \"device_type\": \"%s\",\n", device_type.c_str());
364 fprintf(file, " \"device_core_clock\": \"%zu\",\n", pimpl->device().CoreClock());
365 fprintf(file, " \"device_compute_units\": \"%zu\",\n", pimpl->device().ComputeUnits());
366 fprintf(file, " \"results\": [\n");
367
368 // Filters failed configurations
369 auto results = std::vector<TunerImpl::TunerResult>();
370 for (const auto &tuning_result: pimpl->tuning_results_) {
371 if (tuning_result.status && tuning_result.time != std::numeric_limits<double>::max()) {
372 results.push_back(tuning_result);
373 }
374 }
375
376 // Loops over all the results
377 auto num_results = results.size();
378 for (auto r=size_t{0}; r<num_results; ++r) {
379 auto result = results[r];
380 fprintf(file, " {\n");
381 fprintf(file, " \"kernel\": \"%s\",\n", result.kernel_name.c_str());
382 fprintf(file, " \"time\": %.3lf,\n", result.time);
383
384 // Loops over all the parameters for this result
385 fprintf(file, " \"parameters\": {");
386 auto num_configs = result.configuration.size();
387 for (auto p=size_t{0}; p<num_configs; ++p) {
388 auto config = result.configuration[p];
389 fprintf(file, "\"%s\": %zu", config.name.c_str(), config.value);
390 if (p < num_configs-1) { fprintf(file, ","); }
391 }
392 fprintf(file, "}\n");
393
394 // The footer
395 fprintf(file, " }");
396 if (r < num_results-1) { fprintf(file, ","); }
397 fprintf(file, "\n");
398 }
399 fprintf(file, " ]\n");
400 fprintf(file, "}\n");
401 fclose(file);
402 }
403
404 // Same as PrintToScreen, but now outputs into a file and does not mark the best-case
PrintToFile(const std::string & filename) const405 void Tuner::PrintToFile(const std::string &filename) const {
406 pimpl->PrintHeader("Printing results to file: "+filename);
407 auto file = fopen(filename.c_str(), "w");
408 std::vector<std::string> processed_kernels;
409 for (auto &tuning_result: pimpl->tuning_results_) {
410 if (tuning_result.status) {
411
412 // Checks whether this is a kernel which hasn't been encountered yet
413 auto new_kernel = true;
414 for (auto &kernel_name: processed_kernels) {
415 if (kernel_name == tuning_result.kernel_name) { new_kernel = false; break; }
416 }
417 processed_kernels.push_back(tuning_result.kernel_name);
418
419 // Prints the header in case of a new kernel name
420 if (new_kernel) {
421 fprintf(file, "name;time;threads;");
422 for (auto &setting: tuning_result.configuration) {
423 fprintf(file, "%s;", setting.name.c_str());
424 }
425 fprintf(file, "\n");
426 }
427
428 // Prints an entry to file
429 fprintf(file, "%s;", tuning_result.kernel_name.c_str());
430 fprintf(file, "%.2lf;", tuning_result.time);
431 fprintf(file, "%zu;", tuning_result.threads);
432 for (auto &setting: tuning_result.configuration) {
433 fprintf(file, "%zu;", setting.value);
434 }
435 fprintf(file, "\n");
436 }
437 }
438 fclose(file);
439 }
440
441 // Set the flag to suppress output to true. Note that this cannot be undone.
SuppressOutput()442 void Tuner::SuppressOutput() {
443 pimpl->suppress_output_ = true;
444 }
445
446 // Sets the number of runs to average time measurements.
SetNumRuns(const size_t num_runs)447 void Tuner::SetNumRuns(const size_t num_runs) {
448 pimpl->num_runs_ = num_runs;
449 }
450
451 // =================================================================================================
452 } // namespace cltune
453