// =================================================================================================
// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
// a tab-size of two spaces and a max-width of 100 characters per line.
//
// Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
//
// This file demonstrates the usage of CLTune with 2D convolution and advanced search techniques
//
// -------------------------------------------------------------------------------------------------
//
// Copyright 2014 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================

#include <iostream>
#include <sstream>
#include <vector>
#include <chrono>
#include <random>
#include <cmath>
#include <numeric>

// Includes the OpenCL tuner library
#include "cltune.h"

// Helper function to perform an integer division + ceiling (round-up)
size_t CeilDiv(size_t a, size_t b) { return (a + b - 1)/b; }

// Helper function to determine whether or not 'a' is a multiple of 'b'
bool IsMultiple(size_t a, size_t b) {
  return (a/b)*b == a;
}

// Constants
const auto kDefaultDevice = size_t{0};
const auto kDefaultPlatform = size_t{0};
const auto kDefaultSearchMethod = size_t{1};
const auto kDefaultSearchParameter1 = size_t{4};

// Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl")
#define HFS (3)        // Half filter size
#define FS (HFS+HFS+1) // Filter size

// Settings (sizes)
const auto kSizeX = size_t{8192}; // Matrix dimension X
const auto kSizeY = size_t{4096}; // Matrix dimension Y

// =================================================================================================

// Example showing how to tune an OpenCL 2D convolution kernel
int main(int argc, char* argv[]) {

  // Sets the filenames of the OpenCL kernels (optionally automatically translated to CUDA)
  auto conv = std::vector<std::string>{"../samples/conv/conv.opencl"};
  auto conv_reference = std::vector<std::string>{"../samples/conv/conv_reference.opencl"};
  #ifndef USE_OPENCL
    conv.insert(conv.begin(), "../samples/cl_to_cuda.h");
    conv_reference.insert(conv_reference.begin(), "../samples/cl_to_cuda.h");
  #endif

  // Selects the device, the search method and its first parameter. These parameters are all
  // optional and are thus also given default values.
  auto device_id = kDefaultDevice;
  auto platform_id = kDefaultPlatform;
  auto method = kDefaultSearchMethod;
  auto search_param_1 = kDefaultSearchParameter1;
  if (argc >= 2) {
    platform_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
    if (argc >= 3) {
      device_id = static_cast<size_t>(std::stoi(std::string{argv[2]}));
      if (argc >= 4) {
        method = static_cast<size_t>(std::stoi(std::string{argv[3]}));
        if (argc >= 5) {
          search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[4]}));
        }
      }
    }
  }
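
  // Example invocation (the binary name is illustrative and depends on the build setup):
  //   ./sample_conv 0 0 1 4
  // selects platform 0, device 0, search method 1 (simulated annealing) with first parameter 4.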

  // Creates data structures
  const auto kExtraSize = size_t{FS*8};
  auto mat_a = std::vector<float>((kExtraSize+kSizeX)*(kExtraSize+kSizeY));
  auto mat_b = std::vector<float>(kSizeX*kSizeY);
  auto coeff = std::vector<float>(FS*FS);

  // Create a random number generator
  const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count();
  std::default_random_engine generator(static_cast<unsigned int>(random_seed));
  std::uniform_real_distribution<float> distribution(-2.0f, 2.0f);

  // Populates input data structures
  for (auto &item: mat_a) { item = distribution(generator); }
  for (auto &item: mat_b) { item = 0.0f; }

  // Creates the filter coefficients (gaussian blur)
  auto sigma = 1.0f;
  auto mean = FS/2.0f;
  auto sum = 0.0f;
  for (auto x=size_t{0}; x<FS; ++x) {
    for (auto y=size_t{0}; y<FS; ++y) {
      auto exponent = -0.5f * (pow((x-mean)/sigma, 2.0f) + pow((y-mean)/sigma, 2.0f));
      coeff[y*FS + x] = static_cast<float>(exp(exponent) / (2.0f * 3.14159265f * sigma * sigma));
      sum += coeff[y*FS + x];
    }
  }
  for (auto &item: coeff) { item = item / sum; }
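
  // In other words, each (unnormalised) coefficient follows the 2D Gaussian
  //   coeff(x,y) = exp(-((x-mean)^2 + (y-mean)^2) / (2*sigma^2)) / (2*pi*sigma^2)
  // and the final division by 'sum' normalises the FS*FS coefficients so that they add up to one.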

  // ===============================================================================================

  // Initializes the tuner (platform 'platform_id', device 'device_id')
  cltune::Tuner tuner(static_cast<size_t>(platform_id), static_cast<size_t>(device_id));

  // Sets one of the following search methods:
  // 0) Random search
  // 1) Simulated annealing
  // 2) Particle swarm optimisation (PSO)
  // 3) Full search
  auto fraction = 1/64.0f;
  if (method == 0) { tuner.UseRandomSearch(fraction); }
  else if (method == 1) { tuner.UseAnnealing(fraction, static_cast<double>(search_param_1)); }
  else if (method == 2) { tuner.UsePSO(fraction, static_cast<size_t>(search_param_1), 0.4, 0.0, 0.4); }
  else { tuner.UseFullSearch(); }
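
  // Note: with fraction = 1/64, the non-exhaustive methods (0 through 2) evaluate only about 1.5%
  // of all valid configurations. As the casts above suggest, 'search_param_1' is passed on as the
  // annealing's maximum temperature (method 1) or as the PSO swarm size (method 2).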

  // Outputs the search process to a file
  tuner.OutputSearchLog("search_log.txt");

  // ===============================================================================================

  // Adds a heavily tuneable kernel and some example parameter values
  auto id = tuner.AddKernel(conv, "conv", {kSizeX, kSizeY}, {1, 1});
  tuner.AddParameter(id, "TBX", {8, 16, 32, 64});
  tuner.AddParameter(id, "TBY", {8, 16, 32, 64});
  tuner.AddParameter(id, "LOCAL", {0, 1, 2});
  tuner.AddParameter(id, "WPTX", {1, 2, 4, 8});
  tuner.AddParameter(id, "WPTY", {1, 2, 4, 8});
  tuner.AddParameter(id, "VECTOR", {1, 2, 4});
  tuner.AddParameter(id, "UNROLL_FACTOR", {1, FS});
  tuner.AddParameter(id, "PADDING", {0, 1});

  // Introduces a helper parameter to compute the proper number of threads for the LOCAL == 2 case.
  // In this case, the workgroup size (TBX by TBY) is extra large (TBX_XL by TBY_XL) because it uses
  // extra threads to compute the halo region. How many extra threads are needed depends on the
  // filter size. Here we support the TBX and TBY sizes plus up to 10 extra threads.
  auto integers = std::initializer_list<size_t>{
    8,9,10,11,12,13,14,15,
    16,17,18,19,20,21,22,23,24,25,26,
    32,33,34,35,36,37,38,39,40,41,42,
    64,65,66,67,68,69,70,71,72,73,74
  };
  tuner.AddParameter(id, "TBX_XL", integers);
  tuner.AddParameter(id, "TBY_XL", integers);
  auto HaloThreads = [] (std::vector<size_t> v) {
    if (v[0] == 2) { return (v[1] == v[2] + CeilDiv(2*HFS,v[3])); } // With halo threads
    else { return (v[1] == v[2]); }                                 // Without halo threads
  };
  tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBX_XL", "TBX", "WPTX"});
  tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBY_XL", "TBY", "WPTY"});
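
  // For example (illustrative values): with LOCAL == 2, TBX = 16, WPTX = 2 and HFS = 3, the
  // constraint requires TBX_XL == 16 + CeilDiv(2*3, 2) == 19, i.e. 3 extra threads per work-group
  // in the X-dimension to load the halo. With LOCAL != 2, TBX_XL must simply equal TBX.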

  // Sets the constraints on the vector size
  auto VectorConstraint = [] (std::vector<size_t> v) {
    if (v[0] == 2) { return IsMultiple(v[2],v[1]) && IsMultiple(2*HFS,v[1]); }
    else { return IsMultiple(v[2],v[1]); }
  };
  tuner.AddConstraint(id, VectorConstraint, {"LOCAL", "VECTOR", "WPTX"});
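
  // In other words, VECTOR has to divide WPTX, and for LOCAL == 2 it additionally has to divide
  // 2*HFS (= 6 here). That second condition rules out VECTOR == 4 whenever LOCAL == 2.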

  // Makes sure the work per thread is not too high, otherwise too many registers would be used.
  //auto WorkPerThreadConstraint = [] (std::vector<size_t> v) { return (v[0]*v[1] < 32); };
  //tuner.AddConstraint(id, WorkPerThreadConstraint, {"WPTX", "WPTY"});

  // Sets padding to zero in case local memory is not used
  auto PaddingConstraint = [] (std::vector<size_t> v) { return (v[1] == 0 || v[0] != 0); };
  tuner.AddConstraint(id, PaddingConstraint, {"LOCAL", "PADDING"});

  // Sets the constraints for local memory size limitations
  auto LocalMemorySize = [] (std::vector<size_t> v) {
    if (v[0] != 0) { return ((v[3]*v[4] + 2*HFS) * (v[1]*v[2] + 2*HFS + v[5]))*sizeof(float); }
    else { return size_t{0}; }
  };
  tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"LOCAL", "TBX", "WPTX", "TBY", "WPTY", "PADDING"});
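
  // For example (illustrative values): with LOCAL != 0, TBX = TBY = 16, WPTX = WPTY = 2 and
  // PADDING = 1, the kernel is reported to use (16*2 + 6) * (16*2 + 6 + 1) * 4 = 5928 bytes of
  // local memory per work-group, so configurations exceeding the device's limit can be skipped.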

  // Modifies the thread-sizes based on the parameters
  tuner.MulLocalSize(id, {"TBX_XL", "TBY_XL"});
  tuner.MulGlobalSize(id, {"TBX_XL", "TBY_XL"});
  tuner.DivGlobalSize(id, {"TBX", "TBY"});
  tuner.DivGlobalSize(id, {"WPTX", "WPTY"});
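
  // Starting from the base sizes above (global {kSizeX, kSizeY}, local {1, 1}), this results in a
  // local size of {TBX_XL, TBY_XL} and a global size of
  //   {kSizeX*TBX_XL/(TBX*WPTX), kSizeY*TBY_XL/(TBY*WPTY)},
  // i.e. each work-group of TBX_XL by TBY_XL threads produces a tile of TBX*WPTX by TBY*WPTY
  // output elements.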

  // ===============================================================================================

  // Sets the tuner's golden reference function. This kernel contains the reference code to which
  // the output is compared. Supplying such a function is not required, but it is necessary for
  // correctness checks to be enabled.
  tuner.SetReference(conv_reference, "conv_reference", {kSizeX, kSizeY}, {8,8});

  // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use)
  // all input arguments.
  tuner.AddArgumentScalar(static_cast<int>(kSizeX));
  tuner.AddArgumentScalar(static_cast<int>(kSizeY));
  tuner.AddArgumentInput(mat_a);
  tuner.AddArgumentInput(coeff);
  tuner.AddArgumentOutput(mat_b);

  // Starts the tuner
  tuner.Tune();

  // In the case of random search, only a random subset of the whole search space was explored.
  // The collected data is used to train a model, which is then used to estimate the remaining
  // (not-explored) points in the search space.
  if (method == 0) {
    auto validation_fraction = 0.20f; // 20%
    auto top_x = size_t{10}; // Tests the top-10 best results found by the model on actual hardware
    tuner.ModelPrediction(cltune::Model::kNeuralNetwork, validation_fraction, top_x);
  }

  // Prints the results to screen and to file
  auto time_ms = tuner.PrintToScreen();
  tuner.PrintToFile("output.csv");
  tuner.PrintJSON("output.json", {{"sample","convolution"}});

  // Also prints the performance of the best-case in terms of GB/s and GFLOPS
  const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6;
  const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6;
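
  // Note on the units: kMB counts one read and one write of the kSizeX*kSizeY matrix in megabytes,
  // and kMFLOPS counts roughly two operations (multiply + add) per filter tap per output element,
  // in millions. Dividing either by the runtime in milliseconds yields GB/s and GFLOPS respectively.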
  if (time_ms != 0.0) {
    printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
           time_ms, kMB/time_ms, kMFLOPS/time_ms);
  }

  // End of the tuner example
  return 0;
}

// =================================================================================================