1 /* 2 This file is part of Leela Zero. 3 Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors 4 5 Leela Zero is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 Leela Zero is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with Leela Zero. If not, see <http://www.gnu.org/licenses/>. 17 18 Additional permission under GNU GPL version 3 section 7 19 20 If you modify this Program, or any covered work, by linking or 21 combining it with NVIDIA Corporation's libraries from the 22 NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural 23 Network library and/or the NVIDIA TensorRT inference library 24 (or a modified version of those libraries), containing parts covered 25 by the terms of the respective license agreement, the licensors of 26 this Program grant you additional permission to convey the resulting 27 work. 28 */ 29 30 #ifndef OPENCL_H_INCLUDED 31 #define OPENCL_H_INCLUDED 32 33 #include "config.h" 34 35 #define CL_HPP_MINIMUM_OPENCL_VERSION 110 36 #define CL_HPP_TARGET_OPENCL_VERSION 120 37 #define CL_HPP_ENABLE_EXCEPTIONS 38 #include <CL/cl2.hpp> 39 #include <cstddef> 40 #include <memory> 41 #include <string> 42 #include <vector> 43 #include <mutex> 44 #include <cassert> 45 46 #include "Tuner.h" 47 48 template <typename net_t> class OpenCL; 49 template <typename net_t> class OpenCL_Network; 50 51 class Layer { 52 template <typename> friend class OpenCL_Network; 53 private: 54 unsigned int channels{0}; 55 unsigned int outputs{0}; 56 unsigned int filter_size{0}; 57 bool is_input_convolution{false}; 58 bool is_residual_block{false}; 59 bool is_convolve1{false}; 60 std::vector<cl::Buffer> weights; 61 }; 62 63 class OpenCLContext { 64 template <typename> friend class OpenCL; 65 template <typename> friend class OpenCL_Network; 66 private: 67 bool m_is_initialized{false}; 68 cl::CommandQueue m_commandqueue; 69 cl::Kernel m_convolve1_kernel; 70 cl::Kernel m_merge_kernel; 71 cl::Kernel m_in_transform_kernel; 72 cl::Kernel m_sgemm_kernel; 73 cl::Kernel m_out_transform_bn_kernel; 74 cl::Kernel m_out_transform_bn_in_kernel; 75 cl::Buffer m_inBuffer; 76 cl::Buffer m_inBuffer2; 77 cl::Buffer m_VBuffer; 78 cl::Buffer m_MBuffer; 79 cl::Buffer m_pinnedOutBuffer_pol; 80 cl::Buffer m_pinnedOutBuffer_val; 81 bool m_buffers_allocated{false}; 82 }; 83 84 template <typename net_t> 85 class OpenCL_Network { 86 public: OpenCL_Network(OpenCL<net_t> & opencl)87 OpenCL_Network(OpenCL<net_t> & opencl) : m_opencl(opencl) {} getOpenCL()88 OpenCL<net_t> & getOpenCL() { 89 return m_opencl; 90 } 91 push_input_convolution(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights,const std::vector<net_t> & means,const std::vector<net_t> & variances)92 void push_input_convolution(unsigned int filter_size, 93 unsigned int channels, 94 unsigned int outputs, 95 const std::vector<net_t>& weights, 96 const std::vector<net_t>& means, 97 const std::vector<net_t>& variances) { 98 size_t layer = get_layer_count(); 99 push_weights(layer, weights); 100 push_weights(layer, means); 101 push_weights(layer, variances); 102 m_layers[layer].is_input_convolution = true; 103 m_layers[layer].outputs = outputs; 104 m_layers[layer].filter_size = filter_size; 105 m_layers[layer].channels = channels; 106 } 107 push_residual(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights_1,const std::vector<net_t> & means_1,const std::vector<net_t> & variances_1,const std::vector<net_t> & weights_2,const std::vector<net_t> & means_2,const std::vector<net_t> & variances_2)108 void push_residual(unsigned int filter_size, 109 unsigned int channels, 110 unsigned int outputs, 111 const std::vector<net_t>& weights_1, 112 const std::vector<net_t>& means_1, 113 const std::vector<net_t>& variances_1, 114 const std::vector<net_t>& weights_2, 115 const std::vector<net_t>& means_2, 116 const std::vector<net_t>& variances_2) { 117 size_t layer = get_layer_count(); 118 push_weights(layer, weights_1); 119 push_weights(layer, means_1); 120 push_weights(layer, variances_1); 121 push_weights(layer, weights_2); 122 push_weights(layer, means_2); 123 push_weights(layer, variances_2); 124 m_layers[layer].is_residual_block = true; 125 m_layers[layer].outputs = outputs; 126 m_layers[layer].filter_size = filter_size; 127 m_layers[layer].channels = channels; 128 } 129 push_convolve(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights)130 void push_convolve(unsigned int filter_size, 131 unsigned int channels, 132 unsigned int outputs, 133 const std::vector<net_t>& weights) { 134 (void)filter_size; 135 assert(filter_size == 1); 136 137 size_t layer = get_layer_count(); 138 push_weights(layer, weights); 139 m_layers[layer].is_convolve1 = true; 140 m_layers[layer].outputs = outputs; 141 m_layers[layer].channels = channels; 142 } 143 get_layer_count()144 size_t get_layer_count() const { 145 return m_layers.size(); 146 } 147 148 void forward(const std::vector<float>& input, 149 std::vector<float>& output_pol, 150 std::vector<float>& output_val, 151 OpenCLContext & opencl_context, 152 const int batch_size = 1); 153 154 private: 155 using weight_slice_t = std::vector<cl::Buffer>::const_iterator; 156 push_weights(size_t layer,const std::vector<net_t> & weights)157 void push_weights(size_t layer, const std::vector<net_t>& weights) { 158 add_weights(layer, weights.size(), weights.data()); 159 } 160 void add_weights(size_t layer, size_t size, const net_t* weights); 161 162 void convolve3(OpenCLContext & opencl_context, 163 int channels, int outputs, 164 cl::Buffer& bufferIn, 165 cl::Buffer& bufferOut, 166 cl::Buffer& bufferV, 167 cl::Buffer& bufferM, weight_slice_t weights, 168 cl::Buffer* bufferResidual, 169 weight_slice_t bn_weights, 170 bool skip_in_transform, 171 bool fuse_in_transform, bool store_inout, 172 int batch_size); 173 174 void convolve1(OpenCLContext & opencl_context, 175 int channels, int outputs, 176 cl::Buffer& bufferInput, 177 cl::Buffer& bufferOutput, 178 cl::Buffer& bufferMerge, 179 weight_slice_t weights, 180 int batch_size); 181 182 OpenCL<net_t> & m_opencl; 183 184 // this mutex is not required for correctness, but this exists simply 185 // because queue.finish() is a busy wait and having a lot of threads 186 // waiting here is counterproductive CPU-wise. At least std::mutex 187 // isn't busy wait so it should be better. 188 std::mutex m_queue_finish_mutex; 189 std::vector<Layer> m_layers; 190 }; 191 192 template <typename net_t> 193 class OpenCL { 194 friend class OpenCL_Network<net_t>; 195 friend class Tuner<net_t>; 196 public: 197 OpenCL(int gpu, bool silent = false); 198 199 void initialize(const int channels, size_t batch_size = 1); 200 void ensure_context_initialized(OpenCLContext & opencl_context); 201 std::string get_device_name(); 202 bool has_fp16_compute(); 203 bool has_tensor_cores(); 204 205 std::vector<size_t> get_sgemm_tuners(); 206 207 cl::Device m_device; 208 cl::Context m_context; 209 private: 210 void process_tuners(std::string tuners); 211 212 size_t m_batch_size = 1; 213 cl::Program m_program; 214 std::string m_cl_args; 215 216 struct sgemm_tuners { 217 size_t mwg, nwg, kwg; 218 size_t vwm, vwn; 219 size_t mdima, ndimb; 220 size_t mdimc, ndimc; 221 size_t tce; 222 }; 223 sgemm_tuners m_sgemm_tuners; 224 size_t m_wavefront_size{0}; 225 size_t m_max_workgroup_size{0}; 226 std::vector<size_t> m_max_workgroup_dims; 227 bool m_fp16_compute{false}; 228 bool m_tensorcore{false}; 229 bool m_init_ok{false}; 230 }; 231 232 extern const std::string sourceCode_sgemm; 233 extern const std::string sourceCode_common; 234 235 #endif 236