1 /*
2     This file is part of Leela Zero.
3     Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors
4 
5     Leela Zero is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     Leela Zero is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with Leela Zero.  If not, see <http://www.gnu.org/licenses/>.
17 
18     Additional permission under GNU GPL version 3 section 7
19 
20     If you modify this Program, or any covered work, by linking or
21     combining it with NVIDIA Corporation's libraries from the
22     NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
23     Network library and/or the NVIDIA TensorRT inference library
24     (or a modified version of those libraries), containing parts covered
25     by the terms of the respective license agreement, the licensors of
26     this Program grant you additional permission to convey the resulting
27     work.
28 */
29 
30 #ifndef OPENCL_H_INCLUDED
31 #define OPENCL_H_INCLUDED
32 
33 #include "config.h"
34 
35 #define CL_HPP_MINIMUM_OPENCL_VERSION   110
36 #define CL_HPP_TARGET_OPENCL_VERSION    120
37 #define CL_HPP_ENABLE_EXCEPTIONS
38 #include <CL/cl2.hpp>
39 #include <cstddef>
40 #include <memory>
41 #include <string>
42 #include <vector>
43 #include <mutex>
44 #include <cassert>
45 
46 #include "Tuner.h"
47 
// Forward declarations.  Both classes are templated on the type used to
// store network weights on the device (net_t); OpenCL owns the device /
// program state, OpenCL_Network owns the per-network layer data.
template <typename net_t> class OpenCL;
template <typename net_t> class OpenCL_Network;
50 
// Describes one network layer as staged for the GPU: its geometry, which
// kind of layer it is, and the device buffers holding its weights.  All
// fields are private and manipulated only by OpenCL_Network (a friend)
// through its push_* helpers, which set exactly one of the is_* flags.
class Layer {
    template <typename> friend class OpenCL_Network;
private:
    unsigned int channels{0};      // number of input channels
    unsigned int outputs{0};       // number of output channels
    unsigned int filter_size{0};   // convolution filter size (not set for 1x1 convolve layers)
    bool is_input_convolution{false};  // set by push_input_convolution()
    bool is_residual_block{false};     // set by push_residual()
    bool is_convolve1{false};          // set by push_convolve() (1x1 convolution)
    // Device buffers in push order: weights, then batchnorm means/variances
    // where the layer type has them (see the push_* helpers).
    std::vector<cl::Buffer> weights;
};
62 
// Per-context OpenCL execution state: a command queue, kernel handles and
// scratch buffers used during a forward pass.  Members are accessed only
// by the friend classes; initialization appears to be lazy — m_is_initialized
// is presumably set by OpenCL::ensure_context_initialized() and
// m_buffers_allocated when the scratch buffers are first created (both
// defined outside this header — confirm in the .cpp).
class OpenCLContext {
    template <typename> friend class OpenCL;
    template <typename> friend class OpenCL_Network;
private:
    bool m_is_initialized{false};
    cl::CommandQueue m_commandqueue;
    // Kernels for the 1x1 convolution path, result merging, and the
    // Winograd-style input/output transforms around the SGEMM.
    cl::Kernel m_convolve1_kernel;
    cl::Kernel m_merge_kernel;
    cl::Kernel m_in_transform_kernel;
    cl::Kernel m_sgemm_kernel;
    cl::Kernel m_out_transform_bn_kernel;
    cl::Kernel m_out_transform_bn_in_kernel;
    // Scratch device buffers reused across forward passes.
    cl::Buffer m_inBuffer;
    cl::Buffer m_inBuffer2;
    cl::Buffer m_VBuffer;
    cl::Buffer m_MBuffer;
    // Output buffers for the policy and value heads; the "pinned" naming
    // suggests host-visible memory — confirm against the allocation site.
    cl::Buffer m_pinnedOutBuffer_pol;
    cl::Buffer m_pinnedOutBuffer_val;
    bool m_buffers_allocated{false};
};
83 
84 template <typename net_t>
85 class OpenCL_Network {
86 public:
OpenCL_Network(OpenCL<net_t> & opencl)87     OpenCL_Network(OpenCL<net_t> & opencl) : m_opencl(opencl) {}
getOpenCL()88     OpenCL<net_t> & getOpenCL() {
89         return m_opencl;
90     }
91 
push_input_convolution(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights,const std::vector<net_t> & means,const std::vector<net_t> & variances)92     void push_input_convolution(unsigned int filter_size,
93                        unsigned int channels,
94                        unsigned int outputs,
95                        const std::vector<net_t>& weights,
96                        const std::vector<net_t>& means,
97                        const std::vector<net_t>& variances) {
98         size_t layer = get_layer_count();
99         push_weights(layer, weights);
100         push_weights(layer, means);
101         push_weights(layer, variances);
102         m_layers[layer].is_input_convolution = true;
103         m_layers[layer].outputs = outputs;
104         m_layers[layer].filter_size = filter_size;
105         m_layers[layer].channels = channels;
106     }
107 
push_residual(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights_1,const std::vector<net_t> & means_1,const std::vector<net_t> & variances_1,const std::vector<net_t> & weights_2,const std::vector<net_t> & means_2,const std::vector<net_t> & variances_2)108     void push_residual(unsigned int filter_size,
109                        unsigned int channels,
110                        unsigned int outputs,
111                        const std::vector<net_t>& weights_1,
112                        const std::vector<net_t>& means_1,
113                        const std::vector<net_t>& variances_1,
114                        const std::vector<net_t>& weights_2,
115                        const std::vector<net_t>& means_2,
116                        const std::vector<net_t>& variances_2) {
117         size_t layer = get_layer_count();
118         push_weights(layer, weights_1);
119         push_weights(layer, means_1);
120         push_weights(layer, variances_1);
121         push_weights(layer, weights_2);
122         push_weights(layer, means_2);
123         push_weights(layer, variances_2);
124         m_layers[layer].is_residual_block = true;
125         m_layers[layer].outputs = outputs;
126         m_layers[layer].filter_size = filter_size;
127         m_layers[layer].channels = channels;
128     }
129 
push_convolve(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<net_t> & weights)130     void push_convolve(unsigned int filter_size,
131                        unsigned int channels,
132                        unsigned int outputs,
133                        const std::vector<net_t>& weights) {
134         (void)filter_size;
135         assert(filter_size == 1);
136 
137         size_t layer = get_layer_count();
138         push_weights(layer, weights);
139         m_layers[layer].is_convolve1 = true;
140         m_layers[layer].outputs = outputs;
141         m_layers[layer].channels = channels;
142     }
143 
get_layer_count()144     size_t get_layer_count() const {
145         return m_layers.size();
146     }
147 
148     void forward(const std::vector<float>& input,
149             std::vector<float>& output_pol,
150             std::vector<float>& output_val,
151             OpenCLContext & opencl_context,
152             const int batch_size = 1);
153 
154 private:
155     using weight_slice_t = std::vector<cl::Buffer>::const_iterator;
156 
push_weights(size_t layer,const std::vector<net_t> & weights)157     void push_weights(size_t layer, const std::vector<net_t>& weights) {
158         add_weights(layer, weights.size(), weights.data());
159     }
160     void add_weights(size_t layer, size_t size, const net_t* weights);
161 
162     void convolve3(OpenCLContext & opencl_context,
163                     int channels, int outputs,
164                     cl::Buffer& bufferIn,
165                     cl::Buffer& bufferOut,
166                     cl::Buffer& bufferV,
167                     cl::Buffer& bufferM, weight_slice_t weights,
168                     cl::Buffer* bufferResidual,
169                     weight_slice_t bn_weights,
170                     bool skip_in_transform,
171                     bool fuse_in_transform, bool store_inout,
172                     int batch_size);
173 
174     void convolve1(OpenCLContext & opencl_context,
175                   int channels, int outputs,
176                   cl::Buffer& bufferInput,
177                   cl::Buffer& bufferOutput,
178                   cl::Buffer& bufferMerge,
179                   weight_slice_t weights,
180                   int batch_size);
181 
182     OpenCL<net_t> & m_opencl;
183 
184     // this mutex is not required for correctness, but this exists simply
185     // because queue.finish() is a busy wait and having a lot of threads
186     // waiting here is counterproductive CPU-wise.  At least std::mutex
187     // isn't busy wait so it should be better.
188     std::mutex m_queue_finish_mutex;
189     std::vector<Layer> m_layers;
190 };
191 
// Wraps one OpenCL device/context/program: device selection, kernel
// compilation and the SGEMM tuning parameters shared by every
// OpenCL_Network that uses this instance.  Method bodies live in the .cpp.
template <typename net_t>
class OpenCL {
    friend class OpenCL_Network<net_t>;
    friend class Tuner<net_t>;
public:
    // Selects GPU number `gpu`; `silent` presumably suppresses device
    // info output during setup — confirm in the constructor definition.
    OpenCL(int gpu, bool silent = false);

    // Builds the program/kernels for a network with `channels` channels
    // and the given maximum batch size.
    void initialize(const int channels, size_t batch_size = 1);
    // Prepares a context (queue, kernels, buffers) if not yet initialized.
    void ensure_context_initialized(OpenCLContext & opencl_context);
    std::string get_device_name();
    bool has_fp16_compute();
    bool has_tensor_cores();

    // Returns the selected SGEMM tuner values (see sgemm_tuners below).
    std::vector<size_t> get_sgemm_tuners();

    cl::Device m_device;
    cl::Context m_context;
private:
    // Parses a tuner string into m_sgemm_tuners; defined in the .cpp.
    void process_tuners(std::string tuners);

    size_t m_batch_size = 1;
    cl::Program m_program;
    // Extra compiler arguments passed when building m_program.
    std::string m_cl_args;

    // SGEMM tile/vector-width tuning parameters (CLBlast-style names:
    // work-group sizes, vector widths, thread-tile dimensions, tensor-core
    // enable) — semantics are defined by the kernel source.
    struct sgemm_tuners {
        size_t mwg, nwg, kwg;
        size_t vwm, vwn;
        size_t mdima, ndimb;
        size_t mdimc, ndimc;
        size_t tce;
    };
    sgemm_tuners m_sgemm_tuners;
    size_t m_wavefront_size{0};
    size_t m_max_workgroup_size{0};
    std::vector<size_t> m_max_workgroup_dims;
    bool m_fp16_compute{false};   // device supports fp16 arithmetic
    bool m_tensorcore{false};     // device supports tensor cores
    bool m_init_ok{false};        // set once initialize() succeeds
};
231 
// OpenCL kernel source strings, compiled at runtime; definitions live in
// other translation units of the project.
extern const std::string sourceCode_sgemm;
extern const std::string sourceCode_common;
234 
235 #endif
236