1 
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
8 // This file implements the TunerImpl class (see the header for information about the class).
9 //
10 // -------------------------------------------------------------------------------------------------
11 //
12 // Copyright 2014 SURFsara
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 //  http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25 //
26 // =================================================================================================
27 
28 // The corresponding header file
29 #include "internal/tuner_impl.h"
30 
31 // The search strategies
32 #include "internal/searchers/full_search.h"
33 #include "internal/searchers/random_search.h"
34 #include "internal/searchers/annealing.h"
35 #include "internal/searchers/pso.h"
36 
37 // The machine learning models
38 #include "internal/ml_models/linear_regression.h"
39 #include "internal/ml_models/neural_network.h"
40 
#include <algorithm> // std::min, std::sort
#include <chrono> // std::chrono::steady_clock (kernel timing)
#include <cmath> // fabs, std::isnan
#include <cstdio> // FILE, fprintf, fopen
#include <cstdlib> // std::getenv
#include <fstream> // std::ifstream
#include <iostream> // streams (kept for compatibility)
#include <limits> // std::numeric_limits
#include <memory> // std::unique_ptr
#include <sstream> // std::stringstream
#include <tuple> // std::tuple
48 
49 namespace cltune {
50 // =================================================================================================
51 
// This is the threshold for 'correctness': the maximum accumulated difference between a kernel's
// output and the reference output before the run is flagged as failed (see DownloadAndCompare)
const double TunerImpl::kMaxL2Norm = 1e-4;

// Messages printed to stdout (in colours). The escape sequences are ANSI colour codes:
// 32 = green, 33 = yellow, 31 = red, 35 = magenta, 39 = default; "\x1b[0m" resets the colour.
const std::string TunerImpl::kMessageFull    = "\x1b[32m[==========]\x1b[0m";
const std::string TunerImpl::kMessageHead    = "\x1b[32m[----------]\x1b[0m";
const std::string TunerImpl::kMessageRun     = "\x1b[32m[ RUN      ]\x1b[0m";
const std::string TunerImpl::kMessageInfo    = "\x1b[32m[   INFO   ]\x1b[0m";
const std::string TunerImpl::kMessageVerbose = "\x1b[39m[ VERBOSE  ]\x1b[0m";
const std::string TunerImpl::kMessageOK      = "\x1b[32m[       OK ]\x1b[0m";
const std::string TunerImpl::kMessageWarning = "\x1b[33m[  WARNING ]\x1b[0m";
const std::string TunerImpl::kMessageFailure = "\x1b[31m[   FAILED ]\x1b[0m";
const std::string TunerImpl::kMessageResult  = "\x1b[32m[ RESULT   ]\x1b[0m";
const std::string TunerImpl::kMessageBest    = "\x1b[35m[     BEST ]\x1b[0m";
66 
67 // =================================================================================================
68 
69 // Initializes the platform and device to the default 0
TunerImpl()70 TunerImpl::TunerImpl():
71     platform_(Platform(size_t{0})),
72     device_(Device(platform_, size_t{0})),
73     context_(Context(device_)),
74     queue_(Queue(context_, device_)),
75     num_runs_(size_t{1}),
76     has_reference_(false),
77     suppress_output_(false),
78     output_search_process_(false),
79     search_log_filename_(std::string{}),
80     search_method_(SearchMethod::FullSearch),
81     search_args_(0),
82     argument_counter_(0) {
83   if (!suppress_output_) {
84     fprintf(stdout, "\n%s Initializing on platform 0 device 0\n", kMessageFull.c_str());
85     auto opencl_version = device_.Version();
86     auto device_name = device_.Name();
87     fprintf(stdout, "%s Device name: '%s' (%s)\n", kMessageFull.c_str(),
88             device_name.c_str(), opencl_version.c_str());
89   }
90 }
91 
92 // Initializes with a custom platform and device
TunerImpl(size_t platform_id,size_t device_id)93 TunerImpl::TunerImpl(size_t platform_id, size_t device_id):
94     platform_(Platform(platform_id)),
95     device_(Device(platform_, device_id)),
96     context_(Context(device_)),
97     queue_(Queue(context_, device_)),
98     num_runs_(size_t{1}),
99     has_reference_(false),
100     suppress_output_(false),
101     output_search_process_(false),
102     search_log_filename_(std::string{}),
103     search_method_(SearchMethod::FullSearch),
104     search_args_(0),
105     argument_counter_(0) {
106   if (!suppress_output_) {
107     fprintf(stdout, "\n%s Initializing on platform %zu device %zu\n",
108             kMessageFull.c_str(), platform_id, device_id);
109     auto opencl_version = device_.Version();
110     auto device_name = device_.Name();
111     fprintf(stdout, "%s Device name: '%s' (%s)\n", kMessageFull.c_str(),
112             device_name.c_str(), opencl_version.c_str());
113   }
114 }
115 
// End of the tuner: releases host-side reference outputs and all device buffers.
TunerImpl::~TunerImpl() {
  // Frees the host arrays that hold the reference kernel's outputs (see DownloadReference).
  // NOTE(review): the arrays were allocated as 'new T[]' for various element types T but are
  // deleted here through an int* — deleting through a different type than the one allocated is
  // technically undefined behaviour for non-int T; confirm this is intentional/safe here.
  for (auto &reference_output: reference_outputs_) {
    delete[] static_cast<int*>(reference_output);
  }

  // Frees the device buffers: OpenCL and CUDA use different release calls, selected at
  // compile time via USE_OPENCL
  auto free_buffers = [](MemArgument &mem_info) {
    #ifdef USE_OPENCL
      CheckError(clReleaseMemObject(mem_info.buffer));
    #else
      CheckError(cuMemFree(mem_info.buffer));
    #endif
  };
  for (auto &mem_argument: arguments_input_) { free_buffers(mem_argument); }
  for (auto &mem_argument: arguments_output_) { free_buffers(mem_argument); }
  for (auto &mem_argument: arguments_output_copy_) { free_buffers(mem_argument); }

  if (!suppress_output_) {
    fprintf(stdout, "\n%s End of the tuning process\n\n", kMessageFull.c_str());
  }
}
138 
139 // =================================================================================================
140 
// Starts the tuning process. First, the reference kernel is run if it exists (output results are
// automatically verified with respect to this reference run). Next, all permutations of all tuning-
// parameters are computed for each kernel and those kernels are run. Their timing-results are
// collected and stored into the tuning_results_ vector.
void TunerImpl::Tune() {

  // Runs the reference kernel if it is defined, and snapshots its outputs so that every
  // subsequent configuration can be compared against them in VerifyOutput
  if (has_reference_) {
    PrintHeader("Testing reference "+reference_kernel_->name());
    RunKernel(reference_kernel_->source(), *reference_kernel_, 0, 1);
    StoreReferenceOutput();
  }

  // Iterates over all tunable kernels
  for (auto &kernel: kernels_) {
    PrintHeader("Testing kernel "+kernel.name());

    // If there are no tuning parameters, simply run the kernel and store the results
    if (kernel.parameters().size() == 0) {

      // Compiles and runs the kernel
      auto tuning_result = RunKernel(kernel.source(), kernel, 0, 1);
      tuning_result.status = VerifyOutput();

      // Stores the result of the tuning
      tuning_results_.push_back(tuning_result);

    // Else: there are tuning parameters to iterate over
    } else {

      // Computes the permutations of all parameters and pass them to a (smart) search algorithm
      #ifdef VERBOSE
        fprintf(stdout, "%s Computing the permutations of all parameters\n", kMessageVerbose.c_str());
      #endif
      kernel.SetConfigurations();

      // Creates the selected search algorithm; search_args_ holds strategy-specific parameters
      // (e.g. fraction for random search, fraction+temperature for annealing, PSO coefficients)
      std::unique_ptr<Searcher> search;
      switch (search_method_) {
        case SearchMethod::FullSearch:
          search.reset(new FullSearch{kernel.configurations()});
          break;
        case SearchMethod::RandomSearch:
          search.reset(new RandomSearch{kernel.configurations(), search_args_[0]});
          break;
        case SearchMethod::Annealing:
          search.reset(new Annealing{kernel.configurations(), search_args_[0], search_args_[1]});
          break;
        case SearchMethod::PSO:
          search.reset(new PSO{kernel.configurations(), kernel.parameters(), search_args_[0],
                               static_cast<size_t>(search_args_[1]), search_args_[2],
                               search_args_[3], search_args_[4]});
          break;
      }

      // Iterates over all possible configurations (the permutations of the tuning parameters)
      for (auto p=size_t{0}; p<search->NumConfigurations(); ++p) {
        #ifdef VERBOSE
          fprintf(stdout, "%s Exploring configuration (%zu out of %zu)\n", kMessageVerbose.c_str(),
                  p + 1, search->NumConfigurations());
        #endif
        auto permutation = search->GetConfiguration();

        // Adds the parameters to the source-code string as defines
        auto source = std::string{};
        for (auto &config: permutation) {
          source += config.GetDefine();
        }
        source += kernel.source();

        // Updates the local range with the parameter values
        kernel.ComputeRanges(permutation);

        // Compiles and runs the kernel
        auto tuning_result = RunKernel(source, kernel, p, search->NumConfigurations());
        tuning_result.status = VerifyOutput();

        // Gives timing feedback to the search algorithm and calculates the next index
        search->PushExecutionTime(tuning_result.time);
        search->CalculateNextIndex();

        // Stores the parameters and the timing-result. A time equal to float-max is RunKernel's
        // failure sentinel: the time is temporarily zeroed so PrintResult shows a readable value,
        // then restored so the stored result still carries the failure marker.
        tuning_result.configuration = permutation;
        if (tuning_result.time == std::numeric_limits<float>::max()) {
          tuning_result.time = 0.0;
          PrintResult(stdout, tuning_result, kMessageFailure);
          tuning_result.time = std::numeric_limits<float>::max();
          tuning_result.status = false;
        }
        else if (!tuning_result.status) {
          PrintResult(stdout, tuning_result, kMessageWarning);
        }
        tuning_results_.push_back(tuning_result);
      }

      // Prints a log of the searching process. This is disabled per default, but can be enabled
      // using the "OutputSearchLog" function.
      // NOTE(review): the fopen result is not checked; an unwritable path would pass a null FILE*
      // to PrintLog/fclose — confirm whether this needs a guard.
      if (output_search_process_) {
        auto file = fopen(search_log_filename_.c_str(), "w");
        search->PrintLog(file);
        fclose(file);
      }
    }
  }
}
246 
247 // =================================================================================================
248 
// Compiles the kernel and checks for error messages, sets all output buffers to zero,
// launches the kernel, and collects the timing information. On any exception the run is
// abandoned and a sentinel result with time == float-max is returned instead.
TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const KernelInfo &kernel,
                                            const size_t configuration_id,
                                            const size_t num_configurations) {

  // In case of an exception, skip this run
  try {
    #ifdef VERBOSE
      fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str());
    #endif

    // Sets the build options from an environmental variable (if set)
    auto options = std::vector<std::string>();
    const auto environment_variable = std::getenv("CLTUNE_BUILD_OPTIONS");
    if (environment_variable != nullptr) {
      options.push_back(std::string(environment_variable));
    }

    // Compiles the kernel and prints the compiler errors/warnings; a failed build is turned
    // into an exception so it is handled by the common catch-block below
    auto program = Program(context_, source);
    auto build_status = program.Build(device_, options);
    if (build_status == BuildStatus::kError) {
      auto message = program.GetBuildInfo(device_);
      fprintf(stdout, "device compiler error/warning: %s\n", message.c_str());
      throw std::runtime_error("device compiler error/warning occurred ^^\n");
    }
    if (build_status == BuildStatus::kInvalid) {
      throw std::runtime_error("Invalid program binary");
    }
    #ifdef VERBOSE
      fprintf(stdout, "%s Finished compilation\n", kMessageVerbose.c_str());
    #endif

    // Clears all previous copies of output buffer(s): each run needs a fresh device-side copy
    // because the kernel may overwrite its outputs (which can double as inputs)
    for (auto &mem_info: arguments_output_copy_) {
      #ifdef USE_OPENCL
        CheckError(clReleaseMemObject(mem_info.buffer));
      #else
        CheckError(cuMemFree(mem_info.buffer));
      #endif
    }
    arguments_output_copy_.clear();

    // Creates a copy of the output buffer(s), dispatching on the element type
    #ifdef VERBOSE
      fprintf(stdout, "%s Creating a copy of the output buffer\n", kMessageVerbose.c_str());
    #endif
    for (auto &output: arguments_output_) {
      switch (output.type) {
        case MemType::kShort: arguments_output_copy_.push_back(CopyOutputBuffer<short>(output)); break;
        case MemType::kInt: arguments_output_copy_.push_back(CopyOutputBuffer<int>(output)); break;
        case MemType::kSizeT: arguments_output_copy_.push_back(CopyOutputBuffer<size_t>(output)); break;
        case MemType::kHalf: arguments_output_copy_.push_back(CopyOutputBuffer<half>(output)); break;
        case MemType::kFloat: arguments_output_copy_.push_back(CopyOutputBuffer<float>(output)); break;
        case MemType::kDouble: arguments_output_copy_.push_back(CopyOutputBuffer<double>(output)); break;
        case MemType::kFloat2: arguments_output_copy_.push_back(CopyOutputBuffer<float2>(output)); break;
        case MemType::kDouble2: arguments_output_copy_.push_back(CopyOutputBuffer<double2>(output)); break;
        default: throw std::runtime_error("Unsupported reference output data-type");
      }
    }

    // Sets the kernel and its arguments: buffers by their stored index, scalars by pair index
    #ifdef VERBOSE
      fprintf(stdout, "%s Setting kernel arguments\n", kMessageVerbose.c_str());
    #endif
    auto tune_kernel = Kernel(program, kernel.name());
    for (auto &i: arguments_input_) { tune_kernel.SetArgument(i.index, i.buffer); }
    for (auto &i: arguments_output_copy_) { tune_kernel.SetArgument(i.index, i.buffer); }
    for (auto &i: arguments_int_) { tune_kernel.SetArgument(i.first, i.second); }
    for (auto &i: arguments_size_t_) { tune_kernel.SetArgument(i.first, i.second); }
    for (auto &i: arguments_float_) { tune_kernel.SetArgument(i.first, i.second); }
    for (auto &i: arguments_double_) { tune_kernel.SetArgument(i.first, i.second); }
    for (auto &i: arguments_float2_) { tune_kernel.SetArgument(i.first, i.second); }
    for (auto &i: arguments_double2_) { tune_kernel.SetArgument(i.first, i.second); }

    // Sets the global and local thread-sizes
    auto global = kernel.global();
    auto local = kernel.local();

    // Makes sure that the global size is a multiple of the local
    for (auto i=size_t{0}; i<global.size(); ++i) {
      global[i] = Ceil(global[i], local[i]);
    }

    // Verifies the local memory usage of the kernel before launching, since exceeding the
    // device limit would fail at launch time
    auto local_mem_usage = tune_kernel.LocalMemUsage(device_);
    if (!device_.IsLocalMemoryValid(local_mem_usage)) {
      throw std::runtime_error("Using too much local memory");
    }

    // Prepares the kernel: drain the queue so earlier work doesn't pollute the timing
    queue_.Finish();

    // Multiple runs of the kernel to find the minimum execution time. Timing is done on the
    // host with std::chrono::steady_clock around a launch followed by a queue finish.
    fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
    auto events = std::vector<Event>(num_runs_);
    auto elapsed_time = std::numeric_limits<float>::max();
    for (auto t=size_t{0}; t<num_runs_; ++t) {
      #ifdef VERBOSE
        fprintf(stdout, "%s Launching kernel (%zu out of %zu for averaging)\n", kMessageVerbose.c_str(),
                t + 1, num_runs_);
      #endif
      const auto start_time = std::chrono::steady_clock::now();

      // Runs the kernel (this is the timed part)
      tune_kernel.Launch(queue_, global, local, events[t].pointer());
      queue_.Finish(events[t]);

      // Collects the timing information (in milliseconds); the best (minimum) of all runs is kept
      const auto cpu_timer = std::chrono::steady_clock::now() - start_time;
      const auto cpu_timing = std::chrono::duration<float,std::milli>(cpu_timer).count();
      #ifdef VERBOSE
        fprintf(stdout, "%s Completed kernel in %.2lf ms\n", kMessageVerbose.c_str(), cpu_timing);
      #endif
      elapsed_time = std::min(elapsed_time, cpu_timing);
    }
    queue_.Finish();

    // Prints diagnostic information
    fprintf(stdout, "%s Completed %s (%.1lf ms) - %zu out of %zu\n",
            kMessageOK.c_str(), kernel.name().c_str(), elapsed_time,
            configuration_id+1, num_configurations);

    // Computes the result of the tuning: total local threads is the product of the local range.
    // The 'status' field is left false here; the caller fills it in via VerifyOutput.
    auto local_threads = size_t{1};
    for (auto &item: local) { local_threads *= item; }
    TunerResult result = {kernel.name(), elapsed_time, local_threads, false, {}};
    return result;
  }

  // There was an exception, now return an invalid tuner results (time is the float-max sentinel)
  catch(std::exception& e) {
    fprintf(stdout, "%s Kernel %s failed\n", kMessageFailure.c_str(), kernel.name().c_str());
    fprintf(stdout, "%s   catched exception: %s\n", kMessageFailure.c_str(), e.what());
    TunerResult result = {kernel.name(), std::numeric_limits<float>::max(), 0, false, {}};
    return result;
  }
}
388 
389 // =================================================================================================
390 
391 // Uploads a copy of the output vector to the device. This is done because the output might as well
392 // be an input buffer at the same time. Every kernel might override it, so it needs to be updated
393 // before each run.
394 template <typename T>
CopyOutputBuffer(MemArgument & argument)395 TunerImpl::MemArgument TunerImpl::CopyOutputBuffer(MemArgument &argument) {
396   auto buffer_copy = Buffer<T>(context_, BufferAccess::kNotOwned, argument.size);
397   auto buffer_source = Buffer<T>(argument.buffer);
398   buffer_source.CopyTo(queue_, argument.size, buffer_copy);
399   auto result = MemArgument{argument.index, argument.size, argument.type, buffer_copy()};
400   return result;
401 }
402 
403 // =================================================================================================
404 
// Loops over all reference outputs, creates per output a new host buffer and copies the device
// buffer from the device onto the host. This function is specialised for different data-types.
// Any previously stored reference outputs are discarded first.
void TunerImpl::StoreReferenceOutput() {
  // NOTE(review): clear() drops the raw pointers without delete[] — any earlier reference
  // outputs leak if this is called more than once; confirm single-call usage.
  reference_outputs_.clear();
  for (auto &output_buffer: arguments_output_copy_) {
    // Dispatches on the stored element type so the download uses the correct Buffer<T>
    switch (output_buffer.type) {
      case MemType::kShort: DownloadReference<short>(output_buffer); break;
      case MemType::kInt: DownloadReference<int>(output_buffer); break;
      case MemType::kSizeT: DownloadReference<size_t>(output_buffer); break;
      case MemType::kHalf: DownloadReference<half>(output_buffer); break;
      case MemType::kFloat: DownloadReference<float>(output_buffer); break;
      case MemType::kDouble: DownloadReference<double>(output_buffer); break;
      case MemType::kFloat2: DownloadReference<float2>(output_buffer); break;
      case MemType::kDouble2: DownloadReference<double2>(output_buffer); break;
      default: throw std::runtime_error("Unsupported reference output data-type");
    }
  }
}
// Downloads one device buffer into a freshly allocated host array and appends the raw pointer to
// reference_outputs_ (stored type-erased). Ownership: the array is released in ~TunerImpl via
// 'delete[] static_cast<int*>(...)' — NOTE(review): deleting through int* for non-int element
// types is questionable; confirm this is safe for all supported MemTypes.
template <typename T> void TunerImpl::DownloadReference(MemArgument &device_buffer) {
  auto host_buffer = new T[device_buffer.size];
  Buffer<T>(device_buffer.buffer).Read(queue_, device_buffer.size, host_buffer);
  reference_outputs_.push_back(host_buffer);
}
428 
429 // =================================================================================================
430 
// In case there is a reference kernel, this function loops over all outputs, creates per output a
// new host buffer and copies the device buffer from the device onto the host. Following, it
// compares the results to the reference output. This function is specialised for different
// data-types. These functions return "true" if everything is OK, and "false" if there is a warning.
// Without a reference kernel there is nothing to compare against and "true" is returned.
bool TunerImpl::VerifyOutput() {
  auto status = true;
  if (has_reference_) {
    // 'i' indexes reference_outputs_, which is parallel to arguments_output_copy_
    auto i = size_t{0};
    for (auto &output_buffer: arguments_output_copy_) {
      // Dispatches on the element type; a single failing buffer makes the whole check fail
      switch (output_buffer.type) {
        case MemType::kShort: status &= DownloadAndCompare<short>(output_buffer, i); break;
        case MemType::kInt: status &= DownloadAndCompare<int>(output_buffer, i); break;
        case MemType::kSizeT: status &= DownloadAndCompare<size_t>(output_buffer, i); break;
        case MemType::kHalf: status &= DownloadAndCompare<half>(output_buffer, i); break;
        case MemType::kFloat: status &= DownloadAndCompare<float>(output_buffer, i); break;
        case MemType::kDouble: status &= DownloadAndCompare<double>(output_buffer, i); break;
        case MemType::kFloat2: status &= DownloadAndCompare<float2>(output_buffer, i); break;
        case MemType::kDouble2: status &= DownloadAndCompare<double2>(output_buffer, i); break;
        default: throw std::runtime_error("Unsupported output data-type");
      }
      ++i;
    }
  }
  return status;
}
456 
// See above comment. Downloads one output buffer to the host and compares it element-wise
// against the stored reference output at index 'i'. Returns true when the accumulated
// difference is within kMaxL2Norm, false (with a warning on stderr) otherwise.
template <typename T>
bool TunerImpl::DownloadAndCompare(MemArgument &device_buffer, const size_t i) {
  auto l2_norm = 0.0;

  // Downloads the results to the host
  std::vector<T> host_buffer(device_buffer.size);
  Buffer<T>(device_buffer.buffer).Read(queue_, device_buffer.size, host_buffer);

  // Compares the results (L2 norm)
  // NOTE(review): despite the name, this accumulates a sum of absolute differences (see
  // AbsoluteDifference — no squaring), i.e. an L1-style metric; confirm whether intended.
  T* reference_output = static_cast<T*>(reference_outputs_[i]);
  for (auto j=size_t{0}; j<device_buffer.size; ++j) {
    l2_norm += AbsoluteDifference(reference_output[j], host_buffer[j]);
  }

  // Verifies if everything was OK, if not: print the L2 norm. The NaN check catches kernels
  // that produced invalid values, since NaN would otherwise fail the '>' comparison silently.
  // TODO: Implement a choice of comparisons for the client to choose from
  if (std::isnan(l2_norm) || l2_norm > kMaxL2Norm) {
    fprintf(stderr, "%s Results differ: L2 norm is %6.2e\n", kMessageWarning.c_str(), l2_norm);
    return false;
  }
  return true;
}
480 
481 // Computes the absolute difference
482 template <typename T>
AbsoluteDifference(const T reference,const T result)483 double TunerImpl::AbsoluteDifference(const T reference, const T result) {
484   return fabs(static_cast<double>(reference) - static_cast<double>(result));
485 }
AbsoluteDifference(const float2 reference,const float2 result)486 template <> double TunerImpl::AbsoluteDifference(const float2 reference, const float2 result) {
487   auto real = fabs(static_cast<double>(reference.real()) - static_cast<double>(result.real()));
488   auto imag = fabs(static_cast<double>(reference.imag()) - static_cast<double>(result.imag()));
489   return real + imag;
490 }
AbsoluteDifference(const double2 reference,const double2 result)491 template <> double TunerImpl::AbsoluteDifference(const double2 reference, const double2 result) {
492   auto real = fabs(reference.real() - result.real());
493   auto imag = fabs(reference.imag() - result.imag());
494   return real + imag;
495 }
AbsoluteDifference(const half reference,const half result)496 template <> double TunerImpl::AbsoluteDifference(const half reference, const half result) {
497   const auto reference_float = HalfToFloat(reference);
498   const auto result_float = HalfToFloat(result);
499   return fabs(static_cast<double>(reference_float) - static_cast<double>(result_float));
500 }
501 
502 // =================================================================================================
503 
// Trains a model and predicts all remaining configurations: the already-collected tuning results
// serve as training/validation data, the trained model ranks all configurations by predicted
// time, and the top 'test_top_x_configurations' are actually run on the device for verification.
// NOTE(review): tuning_results_ is shared across all kernels, yet this loop trains per kernel
// from the same global results — confirm this is only used with a single tunable kernel.
void TunerImpl::ModelPrediction(const Model model_type, const float validation_fraction,
                                const size_t test_top_x_configurations) {

  // Iterates over all tunable kernels
  for (auto &kernel: kernels_) {

    // Retrieves the number of training samples and features; the last 'validation_fraction'
    // of the results is held out for validation
    auto validation_samples = static_cast<size_t>(tuning_results_.size()*validation_fraction);
    auto training_samples = tuning_results_.size() - validation_samples;
    auto features = tuning_results_[0].configuration.size();

    // Sets the raw training and validation data: x holds the parameter values per
    // configuration, y the measured execution time
    auto x_train = std::vector<std::vector<float>>(training_samples, std::vector<float>(features));
    auto y_train = std::vector<float>(training_samples);
    for (auto s=size_t{0}; s<training_samples; ++s) {
      y_train[s] = tuning_results_[s].time;
      for (auto f=size_t{0}; f<features; ++f) {
        x_train[s][f] = static_cast<float>(tuning_results_[s].configuration[f].value);
      }
    }
    auto x_validation = std::vector<std::vector<float>>(validation_samples, std::vector<float>(features));
    auto y_validation = std::vector<float>(validation_samples);
    for (auto s=size_t{0}; s<validation_samples; ++s) {
      y_validation[s] = tuning_results_[s+training_samples].time;
      for (auto f=size_t{0}; f<features; ++f) {
        x_validation[s][f] = static_cast<float>(tuning_results_[s + training_samples].configuration[f].value);
      }
    }

    // Pointer to one of the machine learning models
    std::unique_ptr<MLModel<float>> model;

    // Trains a linear regression model
    if (model_type == Model::kLinearRegression) {
      PrintHeader("Training a linear regression model");

      // Sets the learning parameters
      auto learning_iterations = size_t{800}; // For gradient descent
      auto learning_rate = 0.05f; // For gradient descent
      auto lambda = 0.2f; // Regularization parameter
      auto debug_display = true; // Output learned data to stdout

      // Trains and validates the model
      model = std::unique_ptr<MLModel<float>>(
        new LinearRegression<float>(learning_iterations, learning_rate, lambda, debug_display)
      );
      model->Train(x_train, y_train);
      model->Validate(x_validation, y_validation);
    }

    // Trains a neural network model
    else if (model_type == Model::kNeuralNetwork) {
      PrintHeader("Training a neural network model");

      // Sets the learning parameters
      auto learning_iterations = size_t{800}; // For gradient descent
      auto learning_rate = 0.1f; // For gradient descent
      auto lambda = 0.005f; // Regularization parameter
      auto debug_display = true; // Output learned data to stdout
      auto layers = std::vector<size_t>{features, 20, 1}; // Input, hidden, and output layer sizes

      // Trains and validates the model
      model = std::unique_ptr<MLModel<float>>(
        new NeuralNetwork<float>(learning_iterations, learning_rate, lambda, layers, debug_display)
      );
      model->Train(x_train, y_train);
      model->Validate(x_validation, y_validation);
    }

    // Unknown model
    else {
      throw std::runtime_error("Unknown machine learning model");
    }

    // Iterates over all configurations (the permutations of the tuning parameters) and predicts
    // the execution time for each one; model_results pairs (configuration index, predicted time)
    PrintHeader("Predicting the remaining configurations using the model");
    auto model_results = std::vector<std::tuple<size_t,float>>();
    auto p = size_t{0};
    for (auto &permutation: kernel.configurations()) {

      // Runs the trained model to predicts the result
      auto x_test = std::vector<float>();
      for (auto &setting: permutation) {
        x_test.push_back(static_cast<float>(setting.value));
      }
      auto predicted_time = model->Predict(x_test);
      model_results.push_back(std::make_tuple(p, predicted_time));
      ++p;
    }

    // Sorts the modelled results by performance (ascending predicted time)
    std::sort(begin(model_results), end(model_results),
      [](const std::tuple<size_t,float> &t1, const std::tuple<size_t,float> &t2) {
        return std::get<1>(t1) < std::get<1>(t2);
      }
    );

    // Tests the best configurations on the device to verify the results
    PrintHeader("Testing the best-found configurations");
    for (auto i=size_t{0}; i<test_top_x_configurations && i<model_results.size(); ++i) {
      auto result = model_results[i];
      printf("[ -------> ] The model predicted: %.3lf ms\n", std::get<1>(result));
      auto pid = std::get<0>(result);
      // NOTE(review): this copies the full configurations vector on every iteration; hoisting
      // it out of the loop (or taking a const reference) would avoid the repeated copy.
      auto permutations = kernel.configurations();
      auto permutation = permutations[pid];

      // Adds the parameters to the source-code string as defines
      auto source = std::string{};
      for (auto &config: permutation) {
        source += config.GetDefine();
      }
      source += kernel.source();

      // Updates the local range with the parameter values
      kernel.ComputeRanges(permutation);

      // Compiles and runs the kernel
      auto tuning_result = RunKernel(source, kernel, pid, test_top_x_configurations);
      tuning_result.status = VerifyOutput();

      // Stores the parameters and the timing-result. As in Tune(), a float-max time marks a
      // failed run; it is zeroed only while printing so the output stays readable.
      tuning_result.configuration = permutation;
      tuning_results_.push_back(tuning_result);
      if (tuning_result.time == std::numeric_limits<float>::max()) {
        tuning_result.time = 0.0;
        PrintResult(stdout, tuning_result, kMessageFailure);
        tuning_result.time = std::numeric_limits<float>::max();
      }
      else if (!tuning_result.status) {
        PrintResult(stdout, tuning_result, kMessageWarning);
      }
    }
  }
}
639 
640 // =================================================================================================
641 
642 // Prints a result by looping over all its configuration parameters
PrintResult(FILE * fp,const TunerResult & result,const std::string & message) const643 void TunerImpl::PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const {
644   fprintf(fp, "%s %s; ", message.c_str(), result.kernel_name.c_str());
645   fprintf(fp, "%8.1lf ms;", result.time);
646   for (auto &setting: result.configuration) {
647     fprintf(fp, "%9s;", setting.GetConfig().c_str());
648   }
649   fprintf(fp, "\n");
650 }
651 
652 // =================================================================================================
653 
654 // Finds the best result
GetBestResult() const655 TunerImpl::TunerResult TunerImpl::GetBestResult() const {
656   auto best_result = tuning_results_[0];
657   auto best_time = std::numeric_limits<double>::max();
658   for (auto &tuning_result: tuning_results_) {
659     if (tuning_result.status && best_time >= tuning_result.time) {
660       best_result = tuning_result;
661       best_time = tuning_result.time;
662     }
663   }
664   return best_result;
665 }
666 
667 // =================================================================================================
668 
669 // Loads a file into a stringstream and returns the result as a string
LoadFile(const std::string & filename)670 std::string TunerImpl::LoadFile(const std::string &filename) {
671   std::ifstream file(filename);
672   if (file.fail()) { throw std::runtime_error("Could not open kernel file: "+filename); }
673   std::stringstream file_contents;
674   file_contents << file.rdbuf();
675   return file_contents.str();
676 }
677 
678 // =================================================================================================
679 
680 // Converts a C++ string to a C string and print it out with nice formatting
PrintHeader(const std::string & header_name) const681 void TunerImpl::PrintHeader(const std::string &header_name) const {
682   if (!suppress_output_) {
683     fprintf(stdout, "\n%s %s\n", kMessageHead.c_str(), header_name.c_str());
684   }
685 }
686 
687 // =================================================================================================
688 
// Get the MemType based on a template argument: maps each supported buffer element type onto the
// corresponding MemType enumeration value used for run-time dispatch elsewhere in this file.
template <> MemType TunerImpl::GetType<short>() { return MemType::kShort; }
template <> MemType TunerImpl::GetType<int>() { return MemType::kInt; }
template <> MemType TunerImpl::GetType<size_t>() { return MemType::kSizeT; }
template <> MemType TunerImpl::GetType<half>() { return MemType::kHalf; }
template <> MemType TunerImpl::GetType<float>() { return MemType::kFloat; }
template <> MemType TunerImpl::GetType<double>() { return MemType::kDouble; }
template <> MemType TunerImpl::GetType<float2>() { return MemType::kFloat2; }
template <> MemType TunerImpl::GetType<double2>() { return MemType::kDouble2; }
698 
699 // =================================================================================================
700 } // namespace cltune
701