// =================================================================================================
// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
// a tab-size of two spaces and a max-width of 100 characters per line.
//
// Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
//
// This file demonstrates the usage of CLTune with 2D convolution and advanced search techniques
//
// -------------------------------------------------------------------------------------------------
//
// Copyright 2014 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================

#include <iostream>
#include <sstream>
#include <vector>
#include <chrono>
#include <random>
#include <cmath>
#include <numeric>

// Includes the OpenCL tuner library
#include "cltune.h"

// Helper function to perform an integer division + ceiling (round-up)
size_t CeilDiv(size_t a, size_t b) { return (a + b - 1)/b; }

// Helper function to determine whether or not 'a' is a multiple of 'b'
bool IsMultiple(size_t a, size_t b) {
  return ((a/b)*b == a);
}
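
// For example, CeilDiv(10, 4) == 3 (10/4 rounded up) and IsMultiple(12, 4) == true, whereas
// IsMultiple(10, 4) == false; both helpers are used in the tuning constraints below.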

// Constants
const auto kDefaultDevice = size_t{0};
const auto kDefaultPlatform = size_t{0};
const auto kDefaultSearchMethod = size_t{1};
const auto kDefaultSearchParameter1 = size_t{4};

// Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl")
#define HFS (3)        // Half filter size
#define FS (HFS+HFS+1) // Filter size
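// With HFS set to 3, FS evaluates to 7, i.e. the sample convolves with a 7-by-7 filter of 49
// coefficients.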

// Settings (sizes)
const auto kSizeX = size_t{8192}; // Matrix dimension X
const auto kSizeY = size_t{4096}; // Matrix dimension Y

// =================================================================================================

// Example showing how to tune an OpenCL 2D convolution kernel
int main(int argc, char* argv[]) {

  // Sets the filenames of the OpenCL kernels (optionally automatically translated to CUDA)
  auto conv = std::vector<std::string>{"../samples/conv/conv.opencl"};
  auto conv_reference = std::vector<std::string>{"../samples/conv/conv_reference.opencl"};
  #ifndef USE_OPENCL
    conv.insert(conv.begin(), "../samples/cl_to_cuda.h");
    conv_reference.insert(conv_reference.begin(), "../samples/cl_to_cuda.h");
  #endif

  // Selects the platform, the device, the search method and its first parameter. These parameters
  // are all optional and are thus also given default values.
  auto device_id = kDefaultDevice;
  auto platform_id = kDefaultPlatform;
  auto method = kDefaultSearchMethod;
  auto search_param_1 = kDefaultSearchParameter1;
  if (argc >= 2) {
    platform_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
    if (argc >= 3) {
      device_id = static_cast<size_t>(std::stoi(std::string{argv[2]}));
      if (argc >= 4) {
        method = static_cast<size_t>(std::stoi(std::string{argv[3]}));
        if (argc >= 5) {
          search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[4]}));
        }
      }
    }
  }
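
  // Illustrative command line (the binary name "./sample_conv" is just a placeholder): running
  // "./sample_conv 0 1 1 4" selects platform 0, device 1, simulated annealing as the search
  // method, and 4 as its first search parameter.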

  // Creates data structures
  const auto kExtraSize = size_t{FS*8};
  auto mat_a = std::vector<float>((kExtraSize+kSizeX)*(kExtraSize+kSizeY));
  auto mat_b = std::vector<float>(kSizeX*kSizeY);
  auto coeff = std::vector<float>(FS*FS);

  // Creates a random number generator
  const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count();
  std::default_random_engine generator(static_cast<unsigned int>(random_seed));
  std::uniform_real_distribution<float> distribution(-2.0f, 2.0f);

  // Populates input data structures
  for (auto &item: mat_a) { item = distribution(generator); }
  for (auto &item: mat_b) { item = 0.0f; }

  // Creates the filter coefficients (Gaussian blur)
  auto sigma = 1.0f;
  auto mean = FS/2.0f;
  auto sum = 0.0f;
  for (auto x=size_t{0}; x<FS; ++x) {
    for (auto y=size_t{0}; y<FS; ++y) {
      auto exponent = -0.5f * (pow((x-mean)/sigma, 2.0f) + pow((y-mean)/sigma, 2.0f));
      coeff[y*FS + x] = static_cast<float>(exp(exponent) / (2.0f * 3.14159265f * sigma * sigma));
      sum += coeff[y*FS + x];
    }
  }
  for (auto &item: coeff) { item = item / sum; }
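
  // Optional sanity check (illustrative only): after normalisation the coefficients should sum to
  // approximately one. Uses std::accumulate from the already-included <numeric> header.
  const auto coeff_sum = std::accumulate(coeff.begin(), coeff.end(), 0.0f);
  if (std::fabs(coeff_sum - 1.0f) > 1e-3f) {
    printf("Warning: filter coefficients sum to %.4f instead of 1.0\n", coeff_sum);
  }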

  // ===============================================================================================

  // Initializes the tuner (platform 'platform_id', device 'device_id')
  cltune::Tuner tuner(static_cast<size_t>(platform_id), static_cast<size_t>(device_id));

  // Sets one of the following search methods:
  // 0) Random search
  // 1) Simulated annealing
  // 2) Particle swarm optimisation (PSO)
  // 3) Full search
  auto fraction = 1/64.0f;
  if      (method == 0) { tuner.UseRandomSearch(fraction); }
  else if (method == 1) { tuner.UseAnnealing(fraction, static_cast<double>(search_param_1)); }
  else if (method == 2) { tuner.UsePSO(fraction, static_cast<size_t>(search_param_1), 0.4, 0.0, 0.4); }
  else                  { tuner.UseFullSearch(); }
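  // With a fraction of 1/64, the random-search, annealing and PSO methods each consider only
  // about 1.6% of the valid configurations; full search always evaluates all of them.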

  // Outputs the search process to a file
  tuner.OutputSearchLog("search_log.txt");

  // ===============================================================================================

  // Adds a heavily tuneable kernel and some example parameter values
  auto id = tuner.AddKernel(conv, "conv", {kSizeX, kSizeY}, {1, 1});
  tuner.AddParameter(id, "TBX", {8, 16, 32, 64});
  tuner.AddParameter(id, "TBY", {8, 16, 32, 64});
  tuner.AddParameter(id, "LOCAL", {0, 1, 2});
  tuner.AddParameter(id, "WPTX", {1, 2, 4, 8});
  tuner.AddParameter(id, "WPTY", {1, 2, 4, 8});
  tuner.AddParameter(id, "VECTOR", {1, 2, 4});
  tuner.AddParameter(id, "UNROLL_FACTOR", {1, FS});
  tuner.AddParameter(id, "PADDING", {0, 1});

  // Introduces a helper parameter to compute the proper number of threads for the LOCAL == 2 case.
  // In this case, the workgroup size (TBX by TBY) is extra large (TBX_XL by TBY_XL) because extra
  // threads are used to handle the halo region. How many extra threads are needed depends on the
  // filter size. Here we support the TBX and TBY sizes plus up to 10 extra threads.
  auto integers = std::initializer_list<size_t>{
    8,9,10,11,12,13,14,15,
    16,17,18,19,20,21,22,23,24,25,26,
    32,33,34,35,36,37,38,39,40,41,42,
    64,65,66,67,68,69,70,71,72,73,74
  };
  tuner.AddParameter(id, "TBX_XL", integers);
  tuner.AddParameter(id, "TBY_XL", integers);
  auto HaloThreads = [] (std::vector<size_t> v) {
    if (v[0] == 2) { return (v[1] == v[2] + CeilDiv(2*HFS,v[3])); } // With halo threads
    else           { return (v[1] == v[2]); }                       // Without halo threads
  };
  tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBX_XL", "TBX", "WPTX"});
  tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBY_XL", "TBY", "WPTY"});
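  // Worked example: with HFS == 3, LOCAL == 2, TBX == 16 and WPTX == 2 the constraint requires
  // TBX_XL == 16 + CeilDiv(6, 2) == 19; for LOCAL == 0 or 1 it simply requires TBX_XL == TBX.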

  // Sets the constraints on the vector size
  auto VectorConstraint = [] (std::vector<size_t> v) {
    if (v[0] == 2) { return IsMultiple(v[2],v[1]) && IsMultiple(2*HFS,v[1]); }
    else           { return IsMultiple(v[2],v[1]); }
  };
  tuner.AddConstraint(id, VectorConstraint, {"LOCAL", "VECTOR", "WPTX"});
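  // For instance, with LOCAL == 2, VECTOR == 2 and WPTX == 4 the constraint holds (both 4 and
  // 2*HFS == 6 are multiples of 2), whereas VECTOR == 4 is rejected since 6 is not a multiple of 4.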

  // Makes sure the work per thread is not too high, otherwise too many registers would be used.
  //auto WorkPerThreadConstraint = [] (std::vector<size_t> v) { return (v[0]*v[1] < 32); };
  //tuner.AddConstraint(id, WorkPerThreadConstraint, {"WPTX", "WPTY"});

  // Sets padding to zero in case local memory is not used
  auto PaddingConstraint = [] (std::vector<size_t> v) { return (v[1] == 0 || v[0] != 0); };
  tuner.AddConstraint(id, PaddingConstraint, {"LOCAL", "PADDING"});

  // Sets the constraints for local memory size limitations
  auto LocalMemorySize = [] (std::vector<size_t> v) {
    if (v[0] != 0) { return ((v[3]*v[4] + 2*HFS) * (v[1]*v[2] + 2*HFS + v[5]))*sizeof(float); }
    else           { return size_t{0}; }
  };
  tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"LOCAL", "TBX", "WPTX", "TBY", "WPTY", "PADDING"});
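  // Example: TBX == TBY == 16, WPTX == WPTY == 2 and PADDING == 1 requests a tile of
  // (16*2 + 6) * (16*2 + 6 + 1) floats, i.e. 38 * 39 * 4 = 5928 bytes of local memory.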

  // Modifies the thread-sizes based on the parameters
  tuner.MulLocalSize(id, {"TBX_XL", "TBY_XL"});
  tuner.MulGlobalSize(id, {"TBX_XL", "TBY_XL"});
  tuner.DivGlobalSize(id, {"TBX", "TBY"});
  tuner.DivGlobalSize(id, {"WPTX", "WPTY"});
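  // The resulting launch configuration is therefore a local size of TBX_XL by TBY_XL threads and a
  // global size of (kSizeX * TBX_XL) / (TBX * WPTX) by (kSizeY * TBY_XL) / (TBY * WPTY) threads.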

  // ===============================================================================================

  // Sets the tuner's golden reference function. This kernel contains the reference code to which
  // the output is compared. Supplying such a function is not required, but it is necessary for
  // correctness checks to be enabled.
  tuner.SetReference(conv_reference, "conv_reference", {kSizeX, kSizeY}, {8,8});

  // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use)
  // all input arguments.
  tuner.AddArgumentScalar(static_cast<int>(kSizeX));
  tuner.AddArgumentScalar(static_cast<int>(kSizeY));
  tuner.AddArgumentInput(mat_a);
  tuner.AddArgumentInput(coeff);
  tuner.AddArgumentOutput(mat_b);

  // Starts the tuner
  tuner.Tune();

  // If random search was used, only a subset of the whole search space was explored. The collected
  // data is then used to train a model, which in turn estimates the remaining (unexplored) points
  // in the search space.
  if (method == 0) {
    auto validation_fraction = 0.20f; // 20%
    auto top_x = size_t{10}; // Tests the top-10 best found results from the model on actual hardware
    tuner.ModelPrediction(cltune::Model::kNeuralNetwork, validation_fraction, top_x);
  }

  // Prints the results to screen and to file
  auto time_ms = tuner.PrintToScreen();
  tuner.PrintToFile("output.csv");
  tuner.PrintJSON("output.json", {{"sample","convolution"}});

  // Also prints the performance of the best-case in terms of GB/s and GFLOPS
  const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6;
  const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6;
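  // For the default 8192 x 4096 matrices this amounts to roughly 268 MB of traffic and 3322 MFLOP
  // of work, so a best-case time of e.g. 10 ms corresponds to ~27 GB/s and ~332 GFLOPS.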
  if (time_ms != 0.0) {
    printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
           time_ms, kMB/time_ms, kMFLOPS/time_ms);
  }

  // End of the tuner example
  return 0;
}

// =================================================================================================