1
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
8 // This file implements the TunerImpl class (see the header for information about the class).
9 //
10 // -------------------------------------------------------------------------------------------------
11 //
12 // Copyright 2014 SURFsara
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25 //
26 // =================================================================================================
27
28 // The corresponding header file
29 #include "internal/tuner_impl.h"
30
31 // The search strategies
32 #include "internal/searchers/full_search.h"
33 #include "internal/searchers/random_search.h"
34 #include "internal/searchers/annealing.h"
35 #include "internal/searchers/pso.h"
36
37 // The machine learning models
38 #include "internal/ml_models/linear_regression.h"
39 #include "internal/ml_models/neural_network.h"
40
#include <algorithm>  // std::min, std::sort
#include <chrono>     // std::chrono::steady_clock
#include <cmath>      // fabs, std::isnan
#include <cstdio>     // FILE, fprintf, fopen, fclose
#include <cstdlib>    // std::getenv
#include <fstream>    // std::ifstream
#include <iostream>   // kept for compatibility (FILE itself comes from <cstdio>)
#include <limits>     // std::numeric_limits
#include <memory>     // std::unique_ptr
#include <sstream>    // std::stringstream
#include <stdexcept>  // std::runtime_error
#include <string>     // std::string
#include <tuple>      // std::tuple
#include <vector>     // std::vector
48
49 namespace cltune {
50 // =================================================================================================
51
// This is the threshold for 'correctness': an output buffer is accepted when the accumulated
// absolute difference (L2-style norm) against the reference output stays below this value.
const double TunerImpl::kMaxL2Norm = 1e-4;

// Messages printed to stdout (in colours). The "\x1b[..m" prefixes are ANSI colour escape
// codes (green, yellow, red, magenta, default) and "\x1b[0m" resets the colour.
const std::string TunerImpl::kMessageFull = "\x1b[32m[==========]\x1b[0m";
const std::string TunerImpl::kMessageHead = "\x1b[32m[----------]\x1b[0m";
const std::string TunerImpl::kMessageRun = "\x1b[32m[ RUN      ]\x1b[0m";
const std::string TunerImpl::kMessageInfo = "\x1b[32m[   INFO   ]\x1b[0m";
const std::string TunerImpl::kMessageVerbose = "\x1b[39m[ VERBOSE  ]\x1b[0m";
const std::string TunerImpl::kMessageOK = "\x1b[32m[       OK ]\x1b[0m";
const std::string TunerImpl::kMessageWarning = "\x1b[33m[  WARNING ]\x1b[0m";
const std::string TunerImpl::kMessageFailure = "\x1b[31m[   FAILED ]\x1b[0m";
const std::string TunerImpl::kMessageResult = "\x1b[32m[ RESULT   ]\x1b[0m";
const std::string TunerImpl::kMessageBest = "\x1b[35m[     BEST ]\x1b[0m";
66
67 // =================================================================================================
68
69 // Initializes the platform and device to the default 0
TunerImpl()70 TunerImpl::TunerImpl():
71 platform_(Platform(size_t{0})),
72 device_(Device(platform_, size_t{0})),
73 context_(Context(device_)),
74 queue_(Queue(context_, device_)),
75 num_runs_(size_t{1}),
76 has_reference_(false),
77 suppress_output_(false),
78 output_search_process_(false),
79 search_log_filename_(std::string{}),
80 search_method_(SearchMethod::FullSearch),
81 search_args_(0),
82 argument_counter_(0) {
83 if (!suppress_output_) {
84 fprintf(stdout, "\n%s Initializing on platform 0 device 0\n", kMessageFull.c_str());
85 auto opencl_version = device_.Version();
86 auto device_name = device_.Name();
87 fprintf(stdout, "%s Device name: '%s' (%s)\n", kMessageFull.c_str(),
88 device_name.c_str(), opencl_version.c_str());
89 }
90 }
91
92 // Initializes with a custom platform and device
TunerImpl(size_t platform_id,size_t device_id)93 TunerImpl::TunerImpl(size_t platform_id, size_t device_id):
94 platform_(Platform(platform_id)),
95 device_(Device(platform_, device_id)),
96 context_(Context(device_)),
97 queue_(Queue(context_, device_)),
98 num_runs_(size_t{1}),
99 has_reference_(false),
100 suppress_output_(false),
101 output_search_process_(false),
102 search_log_filename_(std::string{}),
103 search_method_(SearchMethod::FullSearch),
104 search_args_(0),
105 argument_counter_(0) {
106 if (!suppress_output_) {
107 fprintf(stdout, "\n%s Initializing on platform %zu device %zu\n",
108 kMessageFull.c_str(), platform_id, device_id);
109 auto opencl_version = device_.Version();
110 auto device_name = device_.Name();
111 fprintf(stdout, "%s Device name: '%s' (%s)\n", kMessageFull.c_str(),
112 device_name.c_str(), opencl_version.c_str());
113 }
114 }
115
// End of the tuner: releases the host-side reference outputs and all device buffers.
TunerImpl::~TunerImpl() {
  // Reference outputs were allocated with new T[] (see DownloadReference) and stored
  // type-erased as void*. NOTE(review): deleting through int* is formally UB for non-int
  // element types, though it behaves for the trivially-destructible types used here —
  // consider storing a typed deleter per buffer.
  for (auto &reference_output: reference_outputs_) {
    delete[] static_cast<int*>(reference_output);
  }

  // Frees the device buffers
  auto free_buffers = [](MemArgument &mem_info) {
    #ifdef USE_OPENCL
      CheckError(clReleaseMemObject(mem_info.buffer));
    #else
      CheckError(cuMemFree(mem_info.buffer));
    #endif
  };
  for (auto &mem_argument: arguments_input_) { free_buffers(mem_argument); }
  for (auto &mem_argument: arguments_output_) { free_buffers(mem_argument); }
  for (auto &mem_argument: arguments_output_copy_) { free_buffers(mem_argument); }

  if (!suppress_output_) {
    fprintf(stdout, "\n%s End of the tuning process\n\n", kMessageFull.c_str());
  }
}
138
139 // =================================================================================================
140
141 // Starts the tuning process. First, the reference kernel is run if it exists (output results are
142 // automatically verified with respect to this reference run). Next, all permutations of all tuning-
143 // parameters are computed for each kernel and those kernels are run. Their timing-results are
144 // collected and stored into the tuning_results_ vector.
Tune()145 void TunerImpl::Tune() {
146
147 // Runs the reference kernel if it is defined
148 if (has_reference_) {
149 PrintHeader("Testing reference "+reference_kernel_->name());
150 RunKernel(reference_kernel_->source(), *reference_kernel_, 0, 1);
151 StoreReferenceOutput();
152 }
153
154 // Iterates over all tunable kernels
155 for (auto &kernel: kernels_) {
156 PrintHeader("Testing kernel "+kernel.name());
157
158 // If there are no tuning parameters, simply run the kernel and store the results
159 if (kernel.parameters().size() == 0) {
160
161 // Compiles and runs the kernel
162 auto tuning_result = RunKernel(kernel.source(), kernel, 0, 1);
163 tuning_result.status = VerifyOutput();
164
165 // Stores the result of the tuning
166 tuning_results_.push_back(tuning_result);
167
168 // Else: there are tuning parameters to iterate over
169 } else {
170
171 // Computes the permutations of all parameters and pass them to a (smart) search algorithm
172 #ifdef VERBOSE
173 fprintf(stdout, "%s Computing the permutations of all parameters\n", kMessageVerbose.c_str());
174 #endif
175 kernel.SetConfigurations();
176
177 // Creates the selected search algorithm
178 std::unique_ptr<Searcher> search;
179 switch (search_method_) {
180 case SearchMethod::FullSearch:
181 search.reset(new FullSearch{kernel.configurations()});
182 break;
183 case SearchMethod::RandomSearch:
184 search.reset(new RandomSearch{kernel.configurations(), search_args_[0]});
185 break;
186 case SearchMethod::Annealing:
187 search.reset(new Annealing{kernel.configurations(), search_args_[0], search_args_[1]});
188 break;
189 case SearchMethod::PSO:
190 search.reset(new PSO{kernel.configurations(), kernel.parameters(), search_args_[0],
191 static_cast<size_t>(search_args_[1]), search_args_[2],
192 search_args_[3], search_args_[4]});
193 break;
194 }
195
196 // Iterates over all possible configurations (the permutations of the tuning parameters)
197 for (auto p=size_t{0}; p<search->NumConfigurations(); ++p) {
198 #ifdef VERBOSE
199 fprintf(stdout, "%s Exploring configuration (%zu out of %zu)\n", kMessageVerbose.c_str(),
200 p + 1, search->NumConfigurations());
201 #endif
202 auto permutation = search->GetConfiguration();
203
204 // Adds the parameters to the source-code string as defines
205 auto source = std::string{};
206 for (auto &config: permutation) {
207 source += config.GetDefine();
208 }
209 source += kernel.source();
210
211 // Updates the local range with the parameter values
212 kernel.ComputeRanges(permutation);
213
214 // Compiles and runs the kernel
215 auto tuning_result = RunKernel(source, kernel, p, search->NumConfigurations());
216 tuning_result.status = VerifyOutput();
217
218 // Gives timing feedback to the search algorithm and calculates the next index
219 search->PushExecutionTime(tuning_result.time);
220 search->CalculateNextIndex();
221
222 // Stores the parameters and the timing-result
223 tuning_result.configuration = permutation;
224 if (tuning_result.time == std::numeric_limits<float>::max()) {
225 tuning_result.time = 0.0;
226 PrintResult(stdout, tuning_result, kMessageFailure);
227 tuning_result.time = std::numeric_limits<float>::max();
228 tuning_result.status = false;
229 }
230 else if (!tuning_result.status) {
231 PrintResult(stdout, tuning_result, kMessageWarning);
232 }
233 tuning_results_.push_back(tuning_result);
234 }
235
236 // Prints a log of the searching process. This is disabled per default, but can be enabled
237 // using the "OutputSearchLog" function.
238 if (output_search_process_) {
239 auto file = fopen(search_log_filename_.c_str(), "w");
240 search->PrintLog(file);
241 fclose(file);
242 }
243 }
244 }
245 }
246
247 // =================================================================================================
248
249 // Compiles the kernel and checks for error messages, sets all output buffers to zero,
250 // launches the kernel, and collects the timing information.
RunKernel(const std::string & source,const KernelInfo & kernel,const size_t configuration_id,const size_t num_configurations)251 TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const KernelInfo &kernel,
252 const size_t configuration_id,
253 const size_t num_configurations) {
254
255 // In case of an exception, skip this run
256 try {
257 #ifdef VERBOSE
258 fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str());
259 #endif
260
261 // Sets the build options from an environmental variable (if set)
262 auto options = std::vector<std::string>();
263 const auto environment_variable = std::getenv("CLTUNE_BUILD_OPTIONS");
264 if (environment_variable != nullptr) {
265 options.push_back(std::string(environment_variable));
266 }
267
268 // Compiles the kernel and prints the compiler errors/warnings
269 auto program = Program(context_, source);
270 auto build_status = program.Build(device_, options);
271 if (build_status == BuildStatus::kError) {
272 auto message = program.GetBuildInfo(device_);
273 fprintf(stdout, "device compiler error/warning: %s\n", message.c_str());
274 throw std::runtime_error("device compiler error/warning occurred ^^\n");
275 }
276 if (build_status == BuildStatus::kInvalid) {
277 throw std::runtime_error("Invalid program binary");
278 }
279 #ifdef VERBOSE
280 fprintf(stdout, "%s Finished compilation\n", kMessageVerbose.c_str());
281 #endif
282
283 // Clears all previous copies of output buffer(s)
284 for (auto &mem_info: arguments_output_copy_) {
285 #ifdef USE_OPENCL
286 CheckError(clReleaseMemObject(mem_info.buffer));
287 #else
288 CheckError(cuMemFree(mem_info.buffer));
289 #endif
290 }
291 arguments_output_copy_.clear();
292
293 // Creates a copy of the output buffer(s)
294 #ifdef VERBOSE
295 fprintf(stdout, "%s Creating a copy of the output buffer\n", kMessageVerbose.c_str());
296 #endif
297 for (auto &output: arguments_output_) {
298 switch (output.type) {
299 case MemType::kShort: arguments_output_copy_.push_back(CopyOutputBuffer<short>(output)); break;
300 case MemType::kInt: arguments_output_copy_.push_back(CopyOutputBuffer<int>(output)); break;
301 case MemType::kSizeT: arguments_output_copy_.push_back(CopyOutputBuffer<size_t>(output)); break;
302 case MemType::kHalf: arguments_output_copy_.push_back(CopyOutputBuffer<half>(output)); break;
303 case MemType::kFloat: arguments_output_copy_.push_back(CopyOutputBuffer<float>(output)); break;
304 case MemType::kDouble: arguments_output_copy_.push_back(CopyOutputBuffer<double>(output)); break;
305 case MemType::kFloat2: arguments_output_copy_.push_back(CopyOutputBuffer<float2>(output)); break;
306 case MemType::kDouble2: arguments_output_copy_.push_back(CopyOutputBuffer<double2>(output)); break;
307 default: throw std::runtime_error("Unsupported reference output data-type");
308 }
309 }
310
311 // Sets the kernel and its arguments
312 #ifdef VERBOSE
313 fprintf(stdout, "%s Setting kernel arguments\n", kMessageVerbose.c_str());
314 #endif
315 auto tune_kernel = Kernel(program, kernel.name());
316 for (auto &i: arguments_input_) { tune_kernel.SetArgument(i.index, i.buffer); }
317 for (auto &i: arguments_output_copy_) { tune_kernel.SetArgument(i.index, i.buffer); }
318 for (auto &i: arguments_int_) { tune_kernel.SetArgument(i.first, i.second); }
319 for (auto &i: arguments_size_t_) { tune_kernel.SetArgument(i.first, i.second); }
320 for (auto &i: arguments_float_) { tune_kernel.SetArgument(i.first, i.second); }
321 for (auto &i: arguments_double_) { tune_kernel.SetArgument(i.first, i.second); }
322 for (auto &i: arguments_float2_) { tune_kernel.SetArgument(i.first, i.second); }
323 for (auto &i: arguments_double2_) { tune_kernel.SetArgument(i.first, i.second); }
324
325 // Sets the global and local thread-sizes
326 auto global = kernel.global();
327 auto local = kernel.local();
328
329 // Makes sure that the global size is a multiple of the local
330 for (auto i=size_t{0}; i<global.size(); ++i) {
331 global[i] = Ceil(global[i], local[i]);
332 }
333
334 // Verifies the local memory usage of the kernel
335 auto local_mem_usage = tune_kernel.LocalMemUsage(device_);
336 if (!device_.IsLocalMemoryValid(local_mem_usage)) {
337 throw std::runtime_error("Using too much local memory");
338 }
339
340 // Prepares the kernel
341 queue_.Finish();
342
343 // Multiple runs of the kernel to find the minimum execution time
344 fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
345 auto events = std::vector<Event>(num_runs_);
346 auto elapsed_time = std::numeric_limits<float>::max();
347 for (auto t=size_t{0}; t<num_runs_; ++t) {
348 #ifdef VERBOSE
349 fprintf(stdout, "%s Launching kernel (%zu out of %zu for averaging)\n", kMessageVerbose.c_str(),
350 t + 1, num_runs_);
351 #endif
352 const auto start_time = std::chrono::steady_clock::now();
353
354 // Runs the kernel (this is the timed part)
355 tune_kernel.Launch(queue_, global, local, events[t].pointer());
356 queue_.Finish(events[t]);
357
358 // Collects the timing information
359 const auto cpu_timer = std::chrono::steady_clock::now() - start_time;
360 const auto cpu_timing = std::chrono::duration<float,std::milli>(cpu_timer).count();
361 #ifdef VERBOSE
362 fprintf(stdout, "%s Completed kernel in %.2lf ms\n", kMessageVerbose.c_str(), cpu_timing);
363 #endif
364 elapsed_time = std::min(elapsed_time, cpu_timing);
365 }
366 queue_.Finish();
367
368 // Prints diagnostic information
369 fprintf(stdout, "%s Completed %s (%.1lf ms) - %zu out of %zu\n",
370 kMessageOK.c_str(), kernel.name().c_str(), elapsed_time,
371 configuration_id+1, num_configurations);
372
373 // Computes the result of the tuning
374 auto local_threads = size_t{1};
375 for (auto &item: local) { local_threads *= item; }
376 TunerResult result = {kernel.name(), elapsed_time, local_threads, false, {}};
377 return result;
378 }
379
380 // There was an exception, now return an invalid tuner results
381 catch(std::exception& e) {
382 fprintf(stdout, "%s Kernel %s failed\n", kMessageFailure.c_str(), kernel.name().c_str());
383 fprintf(stdout, "%s catched exception: %s\n", kMessageFailure.c_str(), e.what());
384 TunerResult result = {kernel.name(), std::numeric_limits<float>::max(), 0, false, {}};
385 return result;
386 }
387 }
388
389 // =================================================================================================
390
// Uploads a copy of the output vector to the device. This is done because the output might as well
// be an input buffer at the same time. Every kernel might override it, so it needs to be updated
// before each run.
// The returned MemArgument wraps a freshly-allocated device buffer (kNotOwned: the raw
// handle is extracted with buffer_copy() and freed manually in the destructor / RunKernel).
template <typename T>
TunerImpl::MemArgument TunerImpl::CopyOutputBuffer(MemArgument &argument) {
  auto buffer_copy = Buffer<T>(context_, BufferAccess::kNotOwned, argument.size);
  auto buffer_source = Buffer<T>(argument.buffer);  // non-owning view of the original
  buffer_source.CopyTo(queue_, argument.size, buffer_copy);
  auto result = MemArgument{argument.index, argument.size, argument.type, buffer_copy()};
  return result;
}
402
403 // =================================================================================================
404
// Loops over all reference outputs, creates per output a new host buffer and copies the device
// buffer from the device onto the host. This function is specialised for different data-types.
// Called once after the reference kernel has run (see Tune); the host copies are later
// compared against in VerifyOutput.
void TunerImpl::StoreReferenceOutput() {
  reference_outputs_.clear();
  for (auto &output_buffer: arguments_output_copy_) {
    // Dispatch on the stored element type so the download reads the right number of bytes
    switch (output_buffer.type) {
      case MemType::kShort: DownloadReference<short>(output_buffer); break;
      case MemType::kInt: DownloadReference<int>(output_buffer); break;
      case MemType::kSizeT: DownloadReference<size_t>(output_buffer); break;
      case MemType::kHalf: DownloadReference<half>(output_buffer); break;
      case MemType::kFloat: DownloadReference<float>(output_buffer); break;
      case MemType::kDouble: DownloadReference<double>(output_buffer); break;
      case MemType::kFloat2: DownloadReference<float2>(output_buffer); break;
      case MemType::kDouble2: DownloadReference<double2>(output_buffer); break;
      default: throw std::runtime_error("Unsupported reference output data-type");
    }
  }
}
// Downloads a single device buffer into a newly-allocated host array and stores it
// type-erased (void*) in reference_outputs_. Ownership note: the array is released in the
// destructor via delete[] through an int* cast — see the NOTE(review) there.
template <typename T> void TunerImpl::DownloadReference(MemArgument &device_buffer) {
  auto host_buffer = new T[device_buffer.size];
  Buffer<T>(device_buffer.buffer).Read(queue_, device_buffer.size, host_buffer);
  reference_outputs_.push_back(host_buffer);
}
428
429 // =================================================================================================
430
// In case there is a reference kernel, this function loops over all outputs, creates per output a
// new host buffer and copies the device buffer from the device onto the host. Following, it
// compares the results to the reference output. This function is specialised for different
// data-types. These functions return "true" if everything is OK, and "false" if there is a warning.
// When no reference kernel is set, verification is vacuously successful.
bool TunerImpl::VerifyOutput() {
  auto status = true;
  if (has_reference_) {
    auto i = size_t{0};  // index into reference_outputs_, parallel to arguments_output_copy_
    for (auto &output_buffer: arguments_output_copy_) {
      switch (output_buffer.type) {
        case MemType::kShort: status &= DownloadAndCompare<short>(output_buffer, i); break;
        case MemType::kInt: status &= DownloadAndCompare<int>(output_buffer, i); break;
        case MemType::kSizeT: status &= DownloadAndCompare<size_t>(output_buffer, i); break;
        case MemType::kHalf: status &= DownloadAndCompare<half>(output_buffer, i); break;
        case MemType::kFloat: status &= DownloadAndCompare<float>(output_buffer, i); break;
        case MemType::kDouble: status &= DownloadAndCompare<double>(output_buffer, i); break;
        case MemType::kFloat2: status &= DownloadAndCompare<float2>(output_buffer, i); break;
        case MemType::kDouble2: status &= DownloadAndCompare<double2>(output_buffer, i); break;
        default: throw std::runtime_error("Unsupported output data-type");
      }
      ++i;
    }
  }
  return status;
}
456
// See above comment. Downloads one output buffer to the host and accumulates the absolute
// element-wise differences against the stored reference ("L2 norm" in the project's
// terminology — it is actually an L1-style sum of absolute differences). Returns false and
// prints a warning when the accumulated difference exceeds kMaxL2Norm or is NaN.
template <typename T>
bool TunerImpl::DownloadAndCompare(MemArgument &device_buffer, const size_t i) {
  auto l2_norm = 0.0;

  // Downloads the results to the host
  std::vector<T> host_buffer(device_buffer.size);
  Buffer<T>(device_buffer.buffer).Read(queue_, device_buffer.size, host_buffer);

  // Compares the results (L2 norm)
  T* reference_output = static_cast<T*>(reference_outputs_[i]);
  for (auto j=size_t{0}; j<device_buffer.size; ++j) {
    l2_norm += AbsoluteDifference(reference_output[j], host_buffer[j]);
  }

  // Verifies if everything was OK, if not: print the L2 norm
  // TODO: Implement a choice of comparisons for the client to choose from
  if (std::isnan(l2_norm) || l2_norm > kMaxL2Norm) {
    fprintf(stderr, "%s Results differ: L2 norm is %6.2e\n", kMessageWarning.c_str(), l2_norm);
    return false;
  }
  return true;
}
480
481 // Computes the absolute difference
482 template <typename T>
AbsoluteDifference(const T reference,const T result)483 double TunerImpl::AbsoluteDifference(const T reference, const T result) {
484 return fabs(static_cast<double>(reference) - static_cast<double>(result));
485 }
// Specialization for complex single-precision values: sums the absolute differences of the
// real and imaginary components (i.e. the L1 distance in the complex plane).
template <> double TunerImpl::AbsoluteDifference(const float2 reference, const float2 result) {
  auto real = fabs(static_cast<double>(reference.real()) - static_cast<double>(result.real()));
  auto imag = fabs(static_cast<double>(reference.imag()) - static_cast<double>(result.imag()));
  return real + imag;
}
// Specialization for complex double-precision values (no widening needed).
template <> double TunerImpl::AbsoluteDifference(const double2 reference, const double2 result) {
  auto real = fabs(reference.real() - result.real());
  auto imag = fabs(reference.imag() - result.imag());
  return real + imag;
}
// Specialization for half-precision values: converts both operands to float first, since
// half has no native arithmetic on the host.
template <> double TunerImpl::AbsoluteDifference(const half reference, const half result) {
  const auto reference_float = HalfToFloat(reference);
  const auto result_float = HalfToFloat(result);
  return fabs(static_cast<double>(reference_float) - static_cast<double>(result_float));
}
501
502 // =================================================================================================
503
504 // Trains a model and predicts all remaining configurations
ModelPrediction(const Model model_type,const float validation_fraction,const size_t test_top_x_configurations)505 void TunerImpl::ModelPrediction(const Model model_type, const float validation_fraction,
506 const size_t test_top_x_configurations) {
507
508 // Iterates over all tunable kernels
509 for (auto &kernel: kernels_) {
510
511 // Retrieves the number of training samples and features
512 auto validation_samples = static_cast<size_t>(tuning_results_.size()*validation_fraction);
513 auto training_samples = tuning_results_.size() - validation_samples;
514 auto features = tuning_results_[0].configuration.size();
515
516 // Sets the raw training and validation data
517 auto x_train = std::vector<std::vector<float>>(training_samples, std::vector<float>(features));
518 auto y_train = std::vector<float>(training_samples);
519 for (auto s=size_t{0}; s<training_samples; ++s) {
520 y_train[s] = tuning_results_[s].time;
521 for (auto f=size_t{0}; f<features; ++f) {
522 x_train[s][f] = static_cast<float>(tuning_results_[s].configuration[f].value);
523 }
524 }
525 auto x_validation = std::vector<std::vector<float>>(validation_samples, std::vector<float>(features));
526 auto y_validation = std::vector<float>(validation_samples);
527 for (auto s=size_t{0}; s<validation_samples; ++s) {
528 y_validation[s] = tuning_results_[s+training_samples].time;
529 for (auto f=size_t{0}; f<features; ++f) {
530 x_validation[s][f] = static_cast<float>(tuning_results_[s + training_samples].configuration[f].value);
531 }
532 }
533
534 // Pointer to one of the machine learning models
535 std::unique_ptr<MLModel<float>> model;
536
537 // Trains a linear regression model
538 if (model_type == Model::kLinearRegression) {
539 PrintHeader("Training a linear regression model");
540
541 // Sets the learning parameters
542 auto learning_iterations = size_t{800}; // For gradient descent
543 auto learning_rate = 0.05f; // For gradient descent
544 auto lambda = 0.2f; // Regularization parameter
545 auto debug_display = true; // Output learned data to stdout
546
547 // Trains and validates the model
548 model = std::unique_ptr<MLModel<float>>(
549 new LinearRegression<float>(learning_iterations, learning_rate, lambda, debug_display)
550 );
551 model->Train(x_train, y_train);
552 model->Validate(x_validation, y_validation);
553 }
554
555 // Trains a neural network model
556 else if (model_type == Model::kNeuralNetwork) {
557 PrintHeader("Training a neural network model");
558
559 // Sets the learning parameters
560 auto learning_iterations = size_t{800}; // For gradient descent
561 auto learning_rate = 0.1f; // For gradient descent
562 auto lambda = 0.005f; // Regularization parameter
563 auto debug_display = true; // Output learned data to stdout
564 auto layers = std::vector<size_t>{features, 20, 1};
565
566 // Trains and validates the model
567 model = std::unique_ptr<MLModel<float>>(
568 new NeuralNetwork<float>(learning_iterations, learning_rate, lambda, layers, debug_display)
569 );
570 model->Train(x_train, y_train);
571 model->Validate(x_validation, y_validation);
572 }
573
574 // Unknown model
575 else {
576 throw std::runtime_error("Unknown machine learning model");
577 }
578
579 // Iterates over all configurations (the permutations of the tuning parameters)
580 PrintHeader("Predicting the remaining configurations using the model");
581 auto model_results = std::vector<std::tuple<size_t,float>>();
582 auto p = size_t{0};
583 for (auto &permutation: kernel.configurations()) {
584
585 // Runs the trained model to predicts the result
586 auto x_test = std::vector<float>();
587 for (auto &setting: permutation) {
588 x_test.push_back(static_cast<float>(setting.value));
589 }
590 auto predicted_time = model->Predict(x_test);
591 model_results.push_back(std::make_tuple(p, predicted_time));
592 ++p;
593 }
594
595 // Sorts the modelled results by performance
596 std::sort(begin(model_results), end(model_results),
597 [](const std::tuple<size_t,float> &t1, const std::tuple<size_t,float> &t2) {
598 return std::get<1>(t1) < std::get<1>(t2);
599 }
600 );
601
602 // Tests the best configurations on the device to verify the results
603 PrintHeader("Testing the best-found configurations");
604 for (auto i=size_t{0}; i<test_top_x_configurations && i<model_results.size(); ++i) {
605 auto result = model_results[i];
606 printf("[ -------> ] The model predicted: %.3lf ms\n", std::get<1>(result));
607 auto pid = std::get<0>(result);
608 auto permutations = kernel.configurations();
609 auto permutation = permutations[pid];
610
611 // Adds the parameters to the source-code string as defines
612 auto source = std::string{};
613 for (auto &config: permutation) {
614 source += config.GetDefine();
615 }
616 source += kernel.source();
617
618 // Updates the local range with the parameter values
619 kernel.ComputeRanges(permutation);
620
621 // Compiles and runs the kernel
622 auto tuning_result = RunKernel(source, kernel, pid, test_top_x_configurations);
623 tuning_result.status = VerifyOutput();
624
625 // Stores the parameters and the timing-result
626 tuning_result.configuration = permutation;
627 tuning_results_.push_back(tuning_result);
628 if (tuning_result.time == std::numeric_limits<float>::max()) {
629 tuning_result.time = 0.0;
630 PrintResult(stdout, tuning_result, kMessageFailure);
631 tuning_result.time = std::numeric_limits<float>::max();
632 }
633 else if (!tuning_result.status) {
634 PrintResult(stdout, tuning_result, kMessageWarning);
635 }
636 }
637 }
638 }
639
640 // =================================================================================================
641
642 // Prints a result by looping over all its configuration parameters
PrintResult(FILE * fp,const TunerResult & result,const std::string & message) const643 void TunerImpl::PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const {
644 fprintf(fp, "%s %s; ", message.c_str(), result.kernel_name.c_str());
645 fprintf(fp, "%8.1lf ms;", result.time);
646 for (auto &setting: result.configuration) {
647 fprintf(fp, "%9s;", setting.GetConfig().c_str());
648 }
649 fprintf(fp, "\n");
650 }
651
652 // =================================================================================================
653
654 // Finds the best result
GetBestResult() const655 TunerImpl::TunerResult TunerImpl::GetBestResult() const {
656 auto best_result = tuning_results_[0];
657 auto best_time = std::numeric_limits<double>::max();
658 for (auto &tuning_result: tuning_results_) {
659 if (tuning_result.status && best_time >= tuning_result.time) {
660 best_result = tuning_result;
661 best_time = tuning_result.time;
662 }
663 }
664 return best_result;
665 }
666
667 // =================================================================================================
668
669 // Loads a file into a stringstream and returns the result as a string
LoadFile(const std::string & filename)670 std::string TunerImpl::LoadFile(const std::string &filename) {
671 std::ifstream file(filename);
672 if (file.fail()) { throw std::runtime_error("Could not open kernel file: "+filename); }
673 std::stringstream file_contents;
674 file_contents << file.rdbuf();
675 return file_contents.str();
676 }
677
678 // =================================================================================================
679
680 // Converts a C++ string to a C string and print it out with nice formatting
PrintHeader(const std::string & header_name) const681 void TunerImpl::PrintHeader(const std::string &header_name) const {
682 if (!suppress_output_) {
683 fprintf(stdout, "\n%s %s\n", kMessageHead.c_str(), header_name.c_str());
684 }
685 }
686
687 // =================================================================================================
688
// Get the MemType based on a template argument: maps each supported element type to its
// runtime MemType tag, so buffer code can dispatch without RTTI.
template <> MemType TunerImpl::GetType<short>() { return MemType::kShort; }
template <> MemType TunerImpl::GetType<int>() { return MemType::kInt; }
template <> MemType TunerImpl::GetType<size_t>() { return MemType::kSizeT; }
template <> MemType TunerImpl::GetType<half>() { return MemType::kHalf; }
template <> MemType TunerImpl::GetType<float>() { return MemType::kFloat; }
template <> MemType TunerImpl::GetType<double>() { return MemType::kDouble; }
template <> MemType TunerImpl::GetType<float2>() { return MemType::kFloat2; }
template <> MemType TunerImpl::GetType<double2>() { return MemType::kDouble2; }
698
699 // =================================================================================================
700 } // namespace cltune
701