1 
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
// This file contains the non-publicly visible part of the tuner. It contains the header file for
// the TunerImpl class, the implementation in the Pimpl idiom. This class holds a vector of
// KernelInfo objects (the actual kernels and their parameters) and interfaces between them. It is
// also responsible for the actual tuning and for the collection and dissemination of the results.
13 //
14 // -------------------------------------------------------------------------------------------------
15 //
16 // Copyright 2014 SURFsara
17 //
18 // Licensed under the Apache License, Version 2.0 (the "License");
19 // you may not use this file except in compliance with the License.
20 // You may obtain a copy of the License at
21 //
22 //  http://www.apache.org/licenses/LICENSE-2.0
23 //
24 // Unless required by applicable law or agreed to in writing, software
25 // distributed under the License is distributed on an "AS IS" BASIS,
26 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27 // See the License for the specific language governing permissions and
28 // limitations under the License.
29 //
30 // =================================================================================================
31 
32 #ifndef CLTUNE_TUNER_IMPL_H_
33 #define CLTUNE_TUNER_IMPL_H_
34 
35 // Uses either the OpenCL or CUDA back-end (CLCudaAPI C++11 headers)
36 #if USE_OPENCL
37   #include "internal/clpp11.h"
38 #else
39   #include "internal/cupp11.h"
40 #endif
41 
42 #include "internal/kernel_info.h"
43 #include "internal/msvc.h"
44 
45 // Host data-type for half-precision floating-point (16-bit)
46 #include "internal/half.h"
47 
48 #include <string> // std::string
49 #include <vector> // std::vector
50 #include <memory> // std::shared_ptr
51 #include <complex> // std::complex
52 #include <stdexcept> // std::runtime_error
53 
54 namespace cltune {
55 // =================================================================================================
56 
// Shorthands for complex data-types, built on std::complex. The trailing comment on each alias
// names the OpenCL type it presumably corresponds to (TODO confirm against the kernel sources).
using float2 = std::complex<float>; // cl_float2
using double2 = std::complex<double>; // cl_double2
60 
61 // Raw device buffer
62 #if USE_OPENCL
63   using BufferRaw = cl_mem;
64 #else
65   using BufferRaw = CUdeviceptr;
66 #endif
67 
// Enumeration of currently supported data-types by this class. No enumerator has an explicit
// value, so they are numbered from 0 upwards in declaration order.
enum class MemType { kShort, kInt, kSizeT, kHalf, kFloat, kDouble, kFloat2, kDouble2 };
70 
71 // See comment at top of file for a description of the class
72 class TunerImpl {
73  // Note that everything here is public because of the Pimpl-idiom
74  public:
75 
76   // Parameters
77   static const double kMaxL2Norm; // This is the threshold for 'correctness'
78 
79   // Messages printed to stdout (in colours)
80   static const std::string kMessageFull;
81   static const std::string kMessageHead;
82   static const std::string kMessageRun;
83   static const std::string kMessageInfo;
84   static const std::string kMessageVerbose;
85   static const std::string kMessageOK;
86   static const std::string kMessageWarning;
87   static const std::string kMessageFailure;
88   static const std::string kMessageResult;
89   static const std::string kMessageBest;
90 
91   // Helper structure to store a device memory argument for a kernel
92   struct MemArgument {
93     size_t index;       // The kernel-argument index
94     size_t size;        // The number of elements (not bytes)
95     MemType type;       // The data-type (e.g. float)
96     BufferRaw buffer;   // The buffer on the device
97   };
98 
99   // Helper structure to hold the results of a tuning run
100   struct TunerResult {
101     std::string kernel_name;
102     float time;
103     size_t threads;
104     bool status;
105     KernelInfo::Configuration configuration;
106   };
107 
108   // Initialize either with platform 0 and device 0 or with a custom platform/device
109   explicit TunerImpl();
110   explicit TunerImpl(size_t platform_id, size_t device_id);
111   ~TunerImpl();
112 
113   // Starts the tuning process. This function is called directly from the Tuner API.
114   void Tune();
115 
116   // Compiles and runs a kernel and returns the elapsed time
117   TunerResult RunKernel(const std::string &source, const KernelInfo &kernel,
118                         const size_t configuration_id, const size_t num_configurations);
119 
120   // Copies an output buffer
121   template <typename T> MemArgument CopyOutputBuffer(MemArgument &argument);
122 
123   // Stores the output of the reference run into the host memory
124   void StoreReferenceOutput();
125   template <typename T> void DownloadReference(MemArgument &device_buffer);
126 
127   // Downloads the output of a tuning run and compares it against the reference run
128   bool VerifyOutput();
129   template <typename T> bool DownloadAndCompare(MemArgument &device_buffer, const size_t i);
130   template <typename T> double AbsoluteDifference(const T reference, const T result);
131 
132   // Trains and uses a machine learning model based on the search space explored so far
133   void ModelPrediction(const Model model_type, const float validation_fraction,
134                        const size_t test_top_x_configurations);
135 
136   // Prints results of a particular kernel run
137   void PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const;
138 
139   // Retrieves the best tuning result
140   TunerResult GetBestResult() const;
141 
142   // Loads a file from disk into a string
143   std::string LoadFile(const std::string &filename);
144 
145   // Prints a header of a new section in the tuning process
146   void PrintHeader(const std::string &header_name) const;
147 
148   // Specific implementations of the helper structure to get the memory-type based on a template
149   // argument. Supports all enumerations of MemType.
150   template <typename T> MemType GetType();
151 
152   // Rounding functions performing ceiling and division operations
CeilDiv(const size_t x,const size_t y)153   size_t CeilDiv(const size_t x, const size_t y) {
154     return 1 + ((x - 1) / y);
155   }
Ceil(const size_t x,const size_t y)156   size_t Ceil(const size_t x, const size_t y) {
157     return CeilDiv(x,y)*y;
158   }
159 
160   // Accessors to device data-types
device()161   const Device device() const { return device_; }
context()162   const Context context() const { return context_; }
queue()163   Queue queue() const { return queue_; }
164 
165   // Device variables
166   Platform platform_;
167   Device device_;
168   Context context_;
169   Queue queue_;
170 
171   // Settings
172   size_t num_runs_; // This is used for more-accurate execution time measurement
173   bool has_reference_;
174   bool suppress_output_;
175   bool output_search_process_;
176   std::string search_log_filename_;
177 
178   // The search method and its arguments
179   SearchMethod search_method_;
180   std::vector<double> search_args_;
181 
182   // Storage of kernel sources, arguments, and parameters
183   size_t argument_counter_;
184   std::vector<KernelInfo> kernels_;
185   std::vector<MemArgument> arguments_input_;
186   std::vector<MemArgument> arguments_output_; // these remain constant
187   std::vector<MemArgument> arguments_output_copy_; // these may be modified by the kernel
188   std::vector<std::pair<size_t,int>> arguments_int_;
189   std::vector<std::pair<size_t,size_t>> arguments_size_t_;
190   std::vector<std::pair<size_t,float>> arguments_float_;
191   std::vector<std::pair<size_t,double>> arguments_double_;
192   std::vector<std::pair<size_t,float2>> arguments_float2_;
193   std::vector<std::pair<size_t,double2>> arguments_double2_;
194 
195   // Storage for the reference kernel and output
196   std::unique_ptr<KernelInfo> reference_kernel_;
197   std::vector<void*> reference_outputs_;
198 
199   // List of tuning results
200   std::vector<TunerResult> tuning_results_;
201 };
202 
203 // =================================================================================================
204 } // namespace cltune
205 
206 // CLTUNE_TUNER_IMPL_H_
207 #endif
208