// ================================================================================================= // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses // a tab-size of two spaces and a max-width of 100 characters per line. // // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren) // // This file demonstrates the usage of CLTune with 2D convolution and advanced search techniques // // ------------------------------------------------------------------------------------------------- // // Copyright 2014 SURFsara // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // ================================================================================================= #include #include #include #include #include #include #include // Includes the OpenCL tuner library #include "cltune.h" // Helper function to perform an integer division + ceiling (round-up) size_t CeilDiv(size_t a, size_t b) { return (a + b - 1)/b; } // Helper function to determine whether or not 'a' is a multiple of 'b' bool IsMultiple(size_t a, size_t b) { return ((a/b)*b == a) ? true : false; }; // Constants const auto kDefaultDevice = size_t{0}; const auto kDefaultPlatform = size_t{0}; const auto kDefaultSearchMethod = size_t{1}; const auto kDefaultSearchParameter1 = size_t{4}; // Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl") #define HFS (3) // Half filter size #define FS (HFS+HFS+1) // Filter size // Settings (sizes) const auto kSizeX = size_t{8192}; // Matrix dimension X const auto kSizeY = size_t{4096}; // Matrix dimension Y // ================================================================================================= // Example showing how to tune an OpenCL 2D convolution kernel int main(int argc, char* argv[]) { // Sets the filenames of the OpenCL kernels (optionally automatically translated to CUDA) auto conv = std::vector{"../samples/conv/conv.opencl"}; auto conv_reference = std::vector{"../samples/conv/conv_reference.opencl"}; #ifndef USE_OPENCL conv.insert(conv.begin(), "../samples/cl_to_cuda.h"); conv_reference.insert(conv_reference.begin(), "../samples/cl_to_cuda.h"); #endif // Selects the device, the search method and its first parameter. These parameters are all // optional and are thus also given default values. auto device_id = kDefaultDevice; auto platform_id = kDefaultPlatform; auto method = kDefaultSearchMethod; auto search_param_1 = kDefaultSearchParameter1; if (argc >= 2) { platform_id = static_cast(std::stoi(std::string{argv[1]})); if (argc >= 3) { device_id = static_cast(std::stoi(std::string{argv[2]})); if (argc >= 4) { method = static_cast(std::stoi(std::string{argv[3]})); if (argc >= 5) { search_param_1 = static_cast(std::stoi(std::string{argv[4]})); } } } } // Creates data structures const auto kExtraSize = size_t{FS*8}; auto mat_a = std::vector((kExtraSize+kSizeX)*(kExtraSize+kSizeY)); auto mat_b = std::vector(kSizeX*kSizeY); auto coeff = std::vector(FS*FS); // Create a random number generator const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count(); std::default_random_engine generator(static_cast(random_seed)); std::uniform_real_distribution distribution(-2.0f, 2.0f); // Populates input data structures for (auto &item: mat_a) { item = distribution(generator); } for (auto &item: mat_b) { item = 0.0f; } // Creates the filter coefficients (gaussian blur) auto sigma = 1.0f; auto mean = FS/2.0f; auto sum = 0.0f; for (auto x=size_t{0}; x(exp(exponent) / (2.0f * 3.14159265f * sigma * sigma)); sum += coeff[y*FS + x]; } } for (auto &item: coeff) { item = item / sum; } // =============================================================================================== // Initializes the tuner (platform 'platform_id', device 'device_id') cltune::Tuner tuner(static_cast(platform_id), static_cast(device_id)); // Sets one of the following search methods: // 0) Random search // 1) Simulated annealing // 2) Particle swarm optimisation (PSO) // 3) Full search auto fraction = 1/64.0f; if (method == 0) { tuner.UseRandomSearch(fraction); } else if (method == 1) { tuner.UseAnnealing(fraction, static_cast(search_param_1)); } else if (method == 2) { tuner.UsePSO(fraction, static_cast(search_param_1), 0.4, 0.0, 0.4); } else { tuner.UseFullSearch(); } // Outputs the search process to a file tuner.OutputSearchLog("search_log.txt"); // =============================================================================================== // Adds a heavily tuneable kernel and some example parameter values auto id = tuner.AddKernel(conv, "conv", {kSizeX, kSizeY}, {1, 1}); tuner.AddParameter(id, "TBX", {8, 16, 32, 64}); tuner.AddParameter(id, "TBY", {8, 16, 32, 64}); tuner.AddParameter(id, "LOCAL", {0, 1, 2}); tuner.AddParameter(id, "WPTX", {1, 2, 4, 8}); tuner.AddParameter(id, "WPTY", {1, 2, 4, 8}); tuner.AddParameter(id, "VECTOR", {1, 2, 4}); tuner.AddParameter(id, "UNROLL_FACTOR", {1, FS}); tuner.AddParameter(id, "PADDING", {0, 1}); // Introduces a helper parameter to compute the proper number of threads for the LOCAL == 2 case. // In this case, the workgroup size (TBX by TBY) is extra large (TBX_XL by TBY_XL) because it uses // extra threads to compute the halo threads. How many extra threads are needed is dependend on // the filter size. Here we support a the TBX and TBY size plus up to 10 extra threads. auto integers = std::initializer_list{ 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26, 32,33,34,35,36,37,38,39,40,41,42, 64,65,66,67,68,69,70,71,72,73,74 }; tuner.AddParameter(id, "TBX_XL", integers); tuner.AddParameter(id, "TBY_XL", integers); auto HaloThreads = [] (std::vector v) { if (v[0] == 2) { return (v[1] == v[2] + CeilDiv(2*HFS,v[3])); } // With halo threads else { return (v[1] == v[2]); } // Without halo threads }; tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBX_XL", "TBX", "WPTX"}); tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBY_XL", "TBY", "WPTY"}); // Sets the constrains on the vector size auto VectorConstraint = [] (std::vector v) { if (v[0] == 2) { return IsMultiple(v[2],v[1]) && IsMultiple(2*HFS,v[1]); } else { return IsMultiple(v[2],v[1]); } }; tuner.AddConstraint(id, VectorConstraint, {"LOCAL", "VECTOR", "WPTX"}); // Makes sure the work per thread is not too high, otherwise too many registers would be used. //auto WorkPerThreadConstraint = [] (std::vector v) { return (v[0]*v[1] < 32); }; //tuner.AddConstraint(id, WorkPerThreadConstraint, {"WPTX", "WPTY"}); // Sets padding to zero in case local memory is not used auto PaddingConstraint = [] (std::vector v) { return (v[1] == 0 || v[0] != 0); }; tuner.AddConstraint(id, PaddingConstraint, {"LOCAL", "PADDING"}); // Sets the constraints for local memory size limitations auto LocalMemorySize = [] (std::vector v) { if (v[0] != 0) { return ((v[3]*v[4] + 2*HFS) * (v[1]*v[2] + 2*HFS + v[5]))*sizeof(float); } else { return size_t{0}; } }; tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"LOCAL", "TBX", "WPTX", "TBY", "WPTY", "PADDING"}); // Modifies the thread-sizes based on the parameters tuner.MulLocalSize(id, {"TBX_XL", "TBY_XL"}); tuner.MulGlobalSize(id, {"TBX_XL", "TBY_XL"}); tuner.DivGlobalSize(id, {"TBX", "TBY"}); tuner.DivGlobalSize(id, {"WPTX", "WPTY"}); // =============================================================================================== // Sets the tuner's golden reference function. This kernel contains the reference code to which // the output is compared. Supplying such a function is not required, but it is necessary for // correctness checks to be enabled. tuner.SetReference(conv_reference, "conv_reference", {kSizeX, kSizeY}, {8,8}); // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use) // all input arguments. tuner.AddArgumentScalar(static_cast(kSizeX)); tuner.AddArgumentScalar(static_cast(kSizeY)); tuner.AddArgumentInput(mat_a); tuner.AddArgumentInput(coeff); tuner.AddArgumentOutput(mat_b); // Starts the tuner tuner.Tune(); // The search method only explored a random subset of the whole search space. The collected data // is used to train a model which is then used to estimate all the other (not-explored) points in // the search space. if (method == 0) { auto validation_fraction = 0.20f; // 20% auto top_x = size_t{10}; // Tests the top-10 best found results from the model on actual hardware tuner.ModelPrediction(cltune::Model::kNeuralNetwork, validation_fraction, top_x); } // Prints the results to screen and to file auto time_ms = tuner.PrintToScreen(); tuner.PrintToFile("output.csv"); tuner.PrintJSON("output.json", {{"sample","convolution"}}); // Also prints the performance of the best-case in terms of GB/s and GFLOPS const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6; const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6; if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms or %.1lf GB/s or %1.lf GFLOPS\n", time_ms, kMB/time_ms, kMFLOPS/time_ms); } // End of the tuner example return 0; } // =================================================================================================