
// =================================================================================================
// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
// a tab-size of two spaces and a max-width of 100 characters per line.
//
// Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
//
// This file contains the non-publicly visible part of the tuner. It contains the header file for
// the TunerImpl class, the implementation in the Pimpl idiom. This class contains a vector of
// KernelInfo objects, holding the actual kernels and parameters. This class interfaces between
// them. This class is also responsible for the actual tuning and the collection and dissemination
// of the results.
//
// -------------------------------------------------------------------------------------------------
//
// Copyright 2014 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
29 // 30 // ================================================================================================= 31 32 #ifndef CLTUNE_TUNER_IMPL_H_ 33 #define CLTUNE_TUNER_IMPL_H_ 34 35 // Uses either the OpenCL or CUDA back-end (CLCudaAPI C++11 headers) 36 #if USE_OPENCL 37 #include "internal/clpp11.h" 38 #else 39 #include "internal/cupp11.h" 40 #endif 41 42 #include "internal/kernel_info.h" 43 #include "internal/msvc.h" 44 45 // Host data-type for half-precision floating-point (16-bit) 46 #include "internal/half.h" 47 48 #include <string> // std::string 49 #include <vector> // std::vector 50 #include <memory> // std::shared_ptr 51 #include <complex> // std::complex 52 #include <stdexcept> // std::runtime_error 53 54 namespace cltune { 55 // ================================================================================================= 56 57 // Shorthands for complex data-types 58 using float2 = std::complex<float>; // cl_float2; 59 using double2 = std::complex<double>; // cl_double2; 60 61 // Raw device buffer 62 #if USE_OPENCL 63 using BufferRaw = cl_mem; 64 #else 65 using BufferRaw = CUdeviceptr; 66 #endif 67 68 // Enumeration of currently supported data-types by this class 69 enum class MemType { kShort, kInt, kSizeT, kHalf, kFloat, kDouble, kFloat2, kDouble2 }; 70 71 // See comment at top of file for a description of the class 72 class TunerImpl { 73 // Note that everything here is public because of the Pimpl-idiom 74 public: 75 76 // Parameters 77 static const double kMaxL2Norm; // This is the threshold for 'correctness' 78 79 // Messages printed to stdout (in colours) 80 static const std::string kMessageFull; 81 static const std::string kMessageHead; 82 static const std::string kMessageRun; 83 static const std::string kMessageInfo; 84 static const std::string kMessageVerbose; 85 static const std::string kMessageOK; 86 static const std::string kMessageWarning; 87 static const std::string kMessageFailure; 88 static const std::string kMessageResult; 89 static 
const std::string kMessageBest; 90 91 // Helper structure to store a device memory argument for a kernel 92 struct MemArgument { 93 size_t index; // The kernel-argument index 94 size_t size; // The number of elements (not bytes) 95 MemType type; // The data-type (e.g. float) 96 BufferRaw buffer; // The buffer on the device 97 }; 98 99 // Helper structure to hold the results of a tuning run 100 struct TunerResult { 101 std::string kernel_name; 102 float time; 103 size_t threads; 104 bool status; 105 KernelInfo::Configuration configuration; 106 }; 107 108 // Initialize either with platform 0 and device 0 or with a custom platform/device 109 explicit TunerImpl(); 110 explicit TunerImpl(size_t platform_id, size_t device_id); 111 ~TunerImpl(); 112 113 // Starts the tuning process. This function is called directly from the Tuner API. 114 void Tune(); 115 116 // Compiles and runs a kernel and returns the elapsed time 117 TunerResult RunKernel(const std::string &source, const KernelInfo &kernel, 118 const size_t configuration_id, const size_t num_configurations); 119 120 // Copies an output buffer 121 template <typename T> MemArgument CopyOutputBuffer(MemArgument &argument); 122 123 // Stores the output of the reference run into the host memory 124 void StoreReferenceOutput(); 125 template <typename T> void DownloadReference(MemArgument &device_buffer); 126 127 // Downloads the output of a tuning run and compares it against the reference run 128 bool VerifyOutput(); 129 template <typename T> bool DownloadAndCompare(MemArgument &device_buffer, const size_t i); 130 template <typename T> double AbsoluteDifference(const T reference, const T result); 131 132 // Trains and uses a machine learning model based on the search space explored so far 133 void ModelPrediction(const Model model_type, const float validation_fraction, 134 const size_t test_top_x_configurations); 135 136 // Prints results of a particular kernel run 137 void PrintResult(FILE* fp, const TunerResult &result, 
const std::string &message) const; 138 139 // Retrieves the best tuning result 140 TunerResult GetBestResult() const; 141 142 // Loads a file from disk into a string 143 std::string LoadFile(const std::string &filename); 144 145 // Prints a header of a new section in the tuning process 146 void PrintHeader(const std::string &header_name) const; 147 148 // Specific implementations of the helper structure to get the memory-type based on a template 149 // argument. Supports all enumerations of MemType. 150 template <typename T> MemType GetType(); 151 152 // Rounding functions performing ceiling and division operations CeilDiv(const size_t x,const size_t y)153 size_t CeilDiv(const size_t x, const size_t y) { 154 return 1 + ((x - 1) / y); 155 } Ceil(const size_t x,const size_t y)156 size_t Ceil(const size_t x, const size_t y) { 157 return CeilDiv(x,y)*y; 158 } 159 160 // Accessors to device data-types device()161 const Device device() const { return device_; } context()162 const Context context() const { return context_; } queue()163 Queue queue() const { return queue_; } 164 165 // Device variables 166 Platform platform_; 167 Device device_; 168 Context context_; 169 Queue queue_; 170 171 // Settings 172 size_t num_runs_; // This is used for more-accurate execution time measurement 173 bool has_reference_; 174 bool suppress_output_; 175 bool output_search_process_; 176 std::string search_log_filename_; 177 178 // The search method and its arguments 179 SearchMethod search_method_; 180 std::vector<double> search_args_; 181 182 // Storage of kernel sources, arguments, and parameters 183 size_t argument_counter_; 184 std::vector<KernelInfo> kernels_; 185 std::vector<MemArgument> arguments_input_; 186 std::vector<MemArgument> arguments_output_; // these remain constant 187 std::vector<MemArgument> arguments_output_copy_; // these may be modified by the kernel 188 std::vector<std::pair<size_t,int>> arguments_int_; 189 std::vector<std::pair<size_t,size_t>> arguments_size_t_; 
190 std::vector<std::pair<size_t,float>> arguments_float_; 191 std::vector<std::pair<size_t,double>> arguments_double_; 192 std::vector<std::pair<size_t,float2>> arguments_float2_; 193 std::vector<std::pair<size_t,double2>> arguments_double2_; 194 195 // Storage for the reference kernel and output 196 std::unique_ptr<KernelInfo> reference_kernel_; 197 std::vector<void*> reference_outputs_; 198 199 // List of tuning results 200 std::vector<TunerResult> tuning_results_; 201 }; 202 203 // ================================================================================================= 204 } // namespace cltune 205 206 // CLTUNE_TUNER_IMPL_H_ 207 #endif 208