1 /*
2     This file is part of Leela Zero.
3     Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors
4 
5     Leela Zero is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     Leela Zero is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with Leela Zero.  If not, see <http://www.gnu.org/licenses/>.
17 
18     Additional permission under GNU GPL version 3 section 7
19 
20     If you modify this Program, or any covered work, by linking or
21     combining it with NVIDIA Corporation's libraries from the
22     NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
23     Network library and/or the NVIDIA TensorRT inference library
24     (or a modified version of those libraries), containing parts covered
25     by the terms of the respective license agreement, the licensors of
26     this Program grant you additional permission to convey the resulting
27     work.
28 */
29 
30 #include "config.h"
31 
32 #ifdef USE_OPENCL
33 #include <cassert>
34 #include <algorithm>
35 #include <boost/algorithm/string.hpp>
36 #include <boost/format.hpp>
37 #include <iterator>
38 #include <limits>
39 #include <stdexcept>
40 
41 #include <cstdio>
42 #include <iostream>
43 #include <memory>
44 #include <sstream>
45 #include <string>
46 
47 #include "OpenCL.h"
48 #include "Network.h"
49 #include "GTP.h"
50 #include "Utils.h"
51 #include "Tuner.h"
52 
53 using namespace Utils;
54 
// Compiler options handed to the OpenCL kernel build for each storage type.
template <typename net_t> static std::string getClArgs();

template <> std::string getClArgs<float>() {
    // Fast-math flags used for the single precision kernels.
    return
        "-cl-mad-enable -cl-fast-relaxed-math "
        "-cl-no-signed-zeros -cl-denorms-are-zero";
}
#ifdef USE_HALF
template <> std::string getClArgs<half_float::half>() {
    // Half precision uses the same fast-math flags plus the USE_HALF define.
    return "-DUSE_HALF " + getClArgs<float>();
}
#endif
68 
// OpenCL kernel sources.  Each #include below pulls in a .opencl file that
// the preprocessor expands to a string literal, so these constants hold the
// kernel source text compiled at runtime.
const std::string sourceCode_common =
    #include "kernels/common.opencl"
;

// Small probe kernel used to check whether tensor cores can be compiled.
static const std::string sourceCode_tensorcore_test =
    #include "kernels/tensorcore_test.opencl"
;

// Board-size dependent defines injected ahead of every kernel build.
static const std::string sourceCode_config = R"(
#define BOARD_SIZE )" + std::to_string(BOARD_SIZE) +
"\n#define NUM_INTERSECTIONS " + std::to_string(NUM_INTERSECTIONS) +
"\n#define WINOGRAD_M " + std::to_string(WINOGRAD_M) +
"\n#define WINOGRAD_ALPHA " + std::to_string(WINOGRAD_ALPHA) +
"\n#define WTILES " + std::to_string(WINOGRAD_WTILES);

static const std::string sourceCode_convolve1 =
    #include "kernels/convolve1.opencl"
;

static const std::string sourceCode_convolve3 =
    #include "kernels/convolve3.opencl"
;

// SGEMM source: the tensor core HGEMM or the CLBlast Xgemm variants,
// selected at kernel compile time by the TCE define.
const std::string sourceCode_sgemm =
    "#if TCE == 1\n" // Enable tensorcore
    #include "kernels/clblast/hgemm_tensorcore.opencl"
    "\n#else\n" // Use clblast
    #include "kernels/clblast/xgemm_part1.opencl"
    #include "kernels/clblast/xgemm_part2.opencl"
    #include "kernels/clblast/xgemm_part3.opencl"
    #include "kernels/clblast/xgemm_batched.opencl"
    "\n#endif\n"
;
102 
103 template <typename net_t>
ensure_context_initialized(OpenCLContext & opencl_context)104 void OpenCL<net_t>::ensure_context_initialized(OpenCLContext &opencl_context) {
105     if (!opencl_context.m_is_initialized) {
106         // Make kernels
107         opencl_context.m_convolve1_kernel =
108             cl::Kernel(m_program, "convolve1");
109         opencl_context.m_merge_kernel =
110             cl::Kernel(m_program, "merge");
111         opencl_context.m_in_transform_kernel =
112             cl::Kernel(m_program, "in_transform");
113         opencl_context.m_sgemm_kernel =
114             cl::Kernel(m_program, "XgemmBatched");
115         opencl_context.m_out_transform_bn_kernel =
116             cl::Kernel(m_program, "out_transform_fused_bn");
117         opencl_context.m_out_transform_bn_in_kernel =
118             cl::Kernel(m_program, "out_transform_fused_bn_in");
119         opencl_context.m_commandqueue =
120             cl::CommandQueue(m_context, m_device);
121         opencl_context.m_is_initialized = true;
122     }
123 }
124 
125 template <typename net_t>
add_weights(size_t layer,size_t size,const net_t * weights)126 void OpenCL_Network<net_t>::add_weights(size_t layer,
127                                  size_t size,
128                                  const net_t * weights) {
129     if (layer >= m_layers.size()) {
130         m_layers.push_back(Layer());
131     }
132 
133     auto weightSize = size * sizeof(net_t);
134 
135     auto queue = cl::CommandQueue(getOpenCL().m_context, getOpenCL().m_device);
136     auto buffer = cl::Buffer(
137         m_opencl.m_context,
138         CL_MEM_READ_ONLY,
139         weightSize,
140         nullptr
141     );
142     queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, weightSize, const_cast<net_t*>(weights));
143     m_layers.back().weights.push_back(std::move(buffer));
144 }
145 
// Runs one forward pass of the network: input convolution, residual tower,
// then the two 1x1 head convolutions.  Results are written as host floats
// into output_pol / output_val.  The per-thread OpenCLContext supplies the
// kernels, command queue and scratch buffers (allocated lazily below).
template <typename net_t>
void OpenCL_Network<net_t>::forward(const std::vector<float>& input,
                             std::vector<float>& output_pol,
                             std::vector<float>& output_val,
                             OpenCLContext & opencl_context,
                             const int batch_size) {
    constexpr auto tiles = WINOGRAD_P;
    // Bytes of one feature plane of the board.
    constexpr auto one_plane = NUM_INTERSECTIONS * sizeof(net_t);
    // The last two layers are the policy and value heads, in that order.
    const auto finalSize_pol = m_layers[m_layers.size()-2].outputs * one_plane;
    const auto finalSize_val = m_layers.back().outputs * one_plane;

    m_opencl.ensure_context_initialized(opencl_context);

    if (!opencl_context.m_buffers_allocated) {
        // Size the shared scratch buffers for the widest layer so every
        // convolution in the network can reuse them.
        auto max_channels = unsigned{0};
        for (const auto& layer : m_layers) {
            max_channels = std::max(max_channels,
                                    std::max(layer.channels, layer.outputs));
        }

        const auto mwg = m_opencl.m_sgemm_tuners.mwg;
        const auto nwg = m_opencl.m_sgemm_tuners.nwg;
        const auto vwm = m_opencl.m_sgemm_tuners.vwm;
        const auto vwn = m_opencl.m_sgemm_tuners.vwn;

        // Round the matrix dimensions up to what the tuned SGEMM requires.
        const auto m_ceil = ceilMultiple(ceilMultiple(max_channels, mwg), vwm);
        const auto n_ceil = ceilMultiple(ceilMultiple(tiles, nwg), vwn);

        const auto alloc_inSize =
            getOpenCL().m_batch_size * NUM_INTERSECTIONS * max_channels * sizeof(net_t);
        const auto alloc_vm_size =
            getOpenCL().m_batch_size * WINOGRAD_TILE * m_ceil * n_ceil * sizeof(net_t);

        // Zero-initialize the V buffer so its padding regions read as zero.
        // NOTE(review): alloc_vm_size is a byte count, so this vector holds
        // sizeof(net_t) times more elements than needed -- harmless for the
        // copy below, but worth confirming upstream.
        auto v_zeros = std::vector<net_t>(alloc_vm_size);

        opencl_context.m_inBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE, alloc_inSize);
        opencl_context.m_inBuffer2 = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE, alloc_inSize);
        opencl_context.m_VBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR,
            alloc_vm_size, v_zeros.data(), nullptr);
        opencl_context.m_MBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_vm_size);

        // Host-mappable output buffers so results can be mapped rather than
        // copied back through a separate read.
        opencl_context.m_pinnedOutBuffer_pol = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, getOpenCL().m_batch_size * finalSize_pol);
        opencl_context.m_pinnedOutBuffer_val = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, getOpenCL().m_batch_size * finalSize_val);

        opencl_context.m_buffers_allocated = true;
    }

    cl::Buffer & inBuffer = opencl_context.m_inBuffer;
    cl::Buffer & inBuffer2 = opencl_context.m_inBuffer2;
    cl::Buffer & VBuffer = opencl_context.m_VBuffer;
    cl::Buffer & MBuffer = opencl_context.m_MBuffer;
    cl::CommandQueue & queue = opencl_context.m_commandqueue;

    // Convert the host floats to the network storage type (float or half).
    std::vector<net_t> net_t_input(input.size());
    std::copy(begin(input), end(input), begin(net_t_input));

    const auto inSize = sizeof(net_t) * input.size();
    queue.enqueueWriteBuffer(inBuffer, CL_FALSE, 0, inSize, net_t_input.data());

    // Fused in_out transformation kernel is slower with big batch_sizes than
    // calling out and in transformations separately.
    // This condition could be tunable in future.
    auto use_inout = (batch_size == 1);

    auto skip_in_trans = false;
    for (auto iter = cbegin(m_layers); iter != cend(m_layers); iter++) {
        const auto& layer = *iter;
        const auto niter = std::next(iter);

        if (layer.is_input_convolution) {
            assert(niter != cend(m_layers));
            auto conv_weights = begin(layer.weights);
            auto bn_weights = begin(layer.weights) + 1;
            // If the next layer is a residual block, its input transform can
            // be fused into this layer's output transform.
            auto skip_next_in_trans = false;
            if (niter->is_residual_block) {
                skip_next_in_trans = use_inout;
            }

            convolve3(opencl_context,
                     layer.channels,
                     layer.outputs,
                     inBuffer,
                     inBuffer,
                     VBuffer,
                     MBuffer,
                     conv_weights,
                     nullptr,
                     bn_weights,
                     skip_in_trans, skip_next_in_trans, true,
                     batch_size);

            skip_in_trans = skip_next_in_trans;
        } else if (layer.is_residual_block) {
            assert(layer.channels == layer.outputs);
            assert(niter != cend(m_layers));
            // Weight layout within a residual block:
            // [0] conv1, [1..2] bn1, [3] conv2, [4..5] bn2.
            auto conv1_weights = begin(layer.weights);
            auto bn1_weights   = begin(layer.weights) + 1;
            auto conv2_weights = begin(layer.weights) + 3;
            auto bn2_weights   = begin(layer.weights) + 4;
            convolve3(opencl_context,
                      layer.channels,
                      layer.outputs,
                      inBuffer,
                      inBuffer2,
                      VBuffer,
                      MBuffer,
                      conv1_weights,
                      nullptr,
                      bn1_weights,
                      skip_in_trans, use_inout, false,
                      batch_size);

            auto skip_next_in_trans = false;
            if (niter->is_residual_block) {
                skip_next_in_trans = use_inout;
            }
            // Second convolution adds the block input (inBuffer) back in as
            // the residual connection.
            convolve3(opencl_context,
                      layer.channels,
                      layer.outputs,
                      inBuffer2,
                      inBuffer,
                      VBuffer,
                      MBuffer,
                      conv2_weights,
                      &inBuffer,
                      bn2_weights,
                      use_inout, skip_next_in_trans, true,
                      batch_size);
            skip_in_trans = skip_next_in_trans;
        } else {
            assert(layer.is_convolve1);

            // The value head is the very last layer; the policy head is the
            // convolve1 layer just before it.
            cl::Buffer out_buffer;
            if (niter == cend(m_layers)) {
                out_buffer = opencl_context.m_pinnedOutBuffer_val;
            } else {
                out_buffer = opencl_context.m_pinnedOutBuffer_pol;
            }

            convolve1(opencl_context, layer.channels,
                    layer.outputs,
                    inBuffer,
                    out_buffer,
                    VBuffer,
                    begin(layer.weights),
                    batch_size);
        }
    }

    // Map the output buffers asynchronously, then wait once for everything
    // queued above to complete.
    auto pinnedOutBufferHost_pol = queue.enqueueMapBuffer(
        opencl_context.m_pinnedOutBuffer_pol, CL_FALSE,
        CL_MAP_READ, 0, batch_size * finalSize_pol);
    auto pinnedOutBufferHost_val = queue.enqueueMapBuffer(
        opencl_context.m_pinnedOutBuffer_val, CL_FALSE,
        CL_MAP_READ, 0, batch_size * finalSize_val);

    {
        // Finish call is usually a busy wait. When using multiple threads
        // use the lock to avoid busy waiting with all threads.
        std::lock_guard<std::mutex> lock(m_queue_finish_mutex);
        queue.finish();
    }

    // Widen the results back to float into the caller's output vectors.
    auto polptr = static_cast<net_t*>(pinnedOutBufferHost_pol);
    auto valptr = static_cast<net_t*>(pinnedOutBufferHost_val);
    std::copy(polptr, polptr + output_pol.size(), begin(output_pol));
    std::copy(valptr, valptr + output_val.size(), begin(output_val));

    queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_pol,
            pinnedOutBufferHost_pol);
    queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_val,
            pinnedOutBufferHost_val);

}
332 
// 3x3 convolution via the Winograd transform: input transform into V,
// batched SGEMM over the WINOGRAD_TILE tile positions into M, then an
// output transform with fused batchnorm (and optional residual add).
// skip_in_transform / fuse_in_transform let adjacent layers share a single
// fused transform kernel when the caller enables it.
template <typename net_t>
void OpenCL_Network<net_t>::convolve3(OpenCLContext & opencl_context,
                              int channels, int outputs,
                              cl::Buffer& bufferIn,
                              cl::Buffer& bufferOut,
                              cl::Buffer& bufferV,
                              cl::Buffer& bufferM,
                              weight_slice_t weights,
                              cl::Buffer* bufferResidual,
                              weight_slice_t bn_weights,
                              bool skip_in_transform,
                              bool fuse_in_transform,
                              bool store_inout,
                              int batch_size) {

    cl::Kernel & in_transform_kernel = opencl_context.m_in_transform_kernel;
    cl::Kernel & sgemm_kernel = opencl_context.m_sgemm_kernel;
    cl::Kernel & out_transform_bn_kernel =
        opencl_context.m_out_transform_bn_kernel;
    cl::Kernel & out_transform_bn_in_kernel =
        opencl_context.m_out_transform_bn_in_kernel;

    // Tuned SGEMM parameters; all validated below before use.
    auto mwg = m_opencl.m_sgemm_tuners.mwg;
    auto nwg = m_opencl.m_sgemm_tuners.nwg;
    auto kwg = m_opencl.m_sgemm_tuners.kwg;
    auto vwm = m_opencl.m_sgemm_tuners.vwm;
    auto vwn = m_opencl.m_sgemm_tuners.vwn;
    auto mdimc = m_opencl.m_sgemm_tuners.mdimc;
    auto ndimc = m_opencl.m_sgemm_tuners.ndimc;
    auto tce = m_opencl.m_sgemm_tuners.tce;
    auto mdima = m_opencl.m_sgemm_tuners.mdima;
    auto ndimb = m_opencl.m_sgemm_tuners.ndimb;

    auto wavefront_size = m_opencl.m_wavefront_size;

    assert(mwg != 0);
    assert(nwg != 0);
    assert(kwg != 0);
    assert(mdimc != 0);
    assert(ndimc != 0);
    assert(vwm != 0);
    assert(vwn != 0);
    assert(wavefront_size != 0);

    constexpr auto tiles = WINOGRAD_P;

    // Work-group sizes rounded up to whole wavefronts.
    auto wgs = ceilMultiple(batch_size * tiles, wavefront_size);
    auto wgs_single = ceilMultiple(tiles, wavefront_size);

    // GEMM dimensions padded to the tuner's tile/vector granularity.
    auto m_ceil = int(ceilMultiple(ceilMultiple(outputs, mwg), vwm));
    auto n_ceil = int(ceilMultiple(ceilMultiple(batch_size * tiles, nwg), vwn));
    auto k_ceil = int(ceilMultiple(ceilMultiple(channels, kwg), vwm));

    cl::CommandQueue & queue = opencl_context.m_commandqueue;

    // Step 1: input transform (skipped if the previous layer already
    // produced transformed data via its fused output kernel).
    if (!skip_in_transform) {
        try {
            in_transform_kernel.setArg(0, bufferIn);
            in_transform_kernel.setArg(1, bufferV);
            in_transform_kernel.setArg(2, channels);
            in_transform_kernel.setArg(3, k_ceil);
            in_transform_kernel.setArg(4, n_ceil);
            in_transform_kernel.setArg(5, batch_size);

            queue.enqueueNDRangeKernel(in_transform_kernel, cl::NullRange,
                                       cl::NDRange(wgs, channels));
        } catch (const cl::Error &e) {
            std::cerr << "Error in convolve3/in: " << e.what() << ": "
                << e.err() << std::endl;
            throw;
        }
    }

    // Step 2: batched SGEMM, M = weights * V for every tile position.
    try {
        sgemm_kernel.setArg(0, m_ceil);
        sgemm_kernel.setArg(1, n_ceil);
        sgemm_kernel.setArg(2, k_ceil);
        sgemm_kernel.setArg(3, weights[0]);
        sgemm_kernel.setArg(4, bufferV);
        sgemm_kernel.setArg(5, bufferM);

        cl::NDRange local_sgemm = {mdimc, ndimc, 1};

        cl::NDRange size_sgemm = {(m_ceil * mdimc) / mwg,
                                  (n_ceil * ndimc) / nwg,
                                  cl::size_type(WINOGRAD_TILE)};

        // tensorcore implementation uses a different dimension
        if (tce) {
            local_sgemm = {32 * mdimc/mdima, ndimc/ndimb, 1};
            size_sgemm = {32 * m_ceil / mdima * mdimc / mwg,
                          n_ceil / ndimb * ndimc / nwg,
                          cl::size_type(WINOGRAD_TILE)};
        }
        queue.enqueueNDRangeKernel(sgemm_kernel, cl::NullRange,
                                   size_sgemm, local_sgemm);
    } catch (const cl::Error &e) {
        std::cerr << "Error in convolve3/sgemm: " << e.what() << ": "
            << e.err() << std::endl;
        throw;
    }

    // Step 3: output transform + batchnorm, optionally fused with the next
    // layer's input transform.
    try {
        if (fuse_in_transform) {
            // TODO : Eventually this might also be something tuneable?
            // Needs to match OUTIN_KWG in kernel
            constexpr auto dim_size = 2;
            out_transform_bn_in_kernel.setArg(0, bufferM);
            if (store_inout) {
                out_transform_bn_in_kernel.setArg(1, bufferOut);
            } else {
                out_transform_bn_in_kernel.setArg(1, nullptr);
            }
            out_transform_bn_in_kernel.setArg(2, bufferV);
            out_transform_bn_in_kernel.setArg(3, outputs);
            out_transform_bn_in_kernel.setArg(4, m_ceil);
            out_transform_bn_in_kernel.setArg(5, n_ceil);
            // k_ceil of the next convolution
            auto k_ceil2 = int(ceilMultiple(ceilMultiple(outputs, kwg), vwm));
            out_transform_bn_in_kernel.setArg(6, k_ceil2);
            if (bufferResidual) {
                out_transform_bn_in_kernel.setArg(7, *bufferResidual);
            } else {
                out_transform_bn_in_kernel.setArg(7, nullptr);
            }
            out_transform_bn_in_kernel.setArg(8, bn_weights[0]);
            out_transform_bn_in_kernel.setArg(9, bn_weights[1]);

            queue.enqueueNDRangeKernel(out_transform_bn_in_kernel,
                                       cl::NullRange,
                                       cl::NDRange(outputs, wgs_single, batch_size),
                                       cl::NDRange(dim_size, wgs_single, 1));
        } else {
            out_transform_bn_kernel.setArg(0, bufferM);
            out_transform_bn_kernel.setArg(1, bufferOut);
            out_transform_bn_kernel.setArg(2, outputs);
            out_transform_bn_kernel.setArg(3, m_ceil);
            out_transform_bn_kernel.setArg(4, n_ceil);
            out_transform_bn_kernel.setArg(5, batch_size);
            if (bufferResidual) {
                out_transform_bn_kernel.setArg(6, *bufferResidual);
            } else {
                out_transform_bn_kernel.setArg(6, nullptr);
            }
            out_transform_bn_kernel.setArg(7, bn_weights[0]);
            out_transform_bn_kernel.setArg(8, bn_weights[1]);

            // Needs to match OUT_KWG, OUT_BWG in the kernel.
            // This could be tuned.
            cl::NDRange local_out = {32, 2};

            cl::NDRange global_out = {ceilMultiple(outputs, local_out[0]),
                                      ceilMultiple(tiles * batch_size, local_out[1])};

            queue.enqueueNDRangeKernel(out_transform_bn_kernel, cl::NullRange,
                                       global_out,
                                       local_out);
        }
    } catch (const cl::Error &e) {
        std::cerr << "Error in convolve3/out: " << e.what() << ": "
            << e.err() << std::endl;
        throw;
    }
}
497 
// 1x1 convolution used by the policy/value heads.  A first kernel computes
// partial products per group of input channels into bufferMerge; a second
// kernel ("merge") reduces those partial planes into bufferOutput.
template <typename net_t>
void OpenCL_Network<net_t>::convolve1(OpenCLContext & opencl_context,
                              int channels, int outputs,
                              cl::Buffer& bufferInput,
                              cl::Buffer& bufferOutput,
                              cl::Buffer& bufferMerge,
                              weight_slice_t weights,
                              int batch_size) {
    // The size of the board is defined at compile time
    constexpr int width = BOARD_SIZE;
    constexpr int boardsize = NUM_INTERSECTIONS;
    constexpr int rowTiles = BOARD_SIZE;

    // Input channel grouping in multiples of 8
    constexpr int channelGroup = 8;
    constexpr int channelShift = 3;
    constexpr int rowGroup = 1;
    // Work-group width in the output dimension, capped at 32.
    size_t outputGroup = std::min(outputs, 32);

    auto m_convolve_kernel = &opencl_context.m_convolve1_kernel;

#ifndef NDEBUG
    // Total output size after reducing
    size_t outSize = boardsize * outputs * sizeof(net_t);

    // Produce channel * output planes and merge them at the end
    size_t mergeSize = (channels >> channelShift) * outSize;
    assert(mergeSize <= bufferMerge.getInfo<CL_MEM_SIZE>());
#endif

    // Copy the rows locally
    size_t stripSize = width * sizeof(float);

    int rowBuffer = std::min<int>(channelGroup, 7);
    size_t rowSize = channelGroup * outputGroup * rowBuffer * sizeof(float);

    cl::CommandQueue & queue = opencl_context.m_commandqueue;

    // Pass 1: per-channel-group partial convolution into bufferMerge.
    try {
        m_convolve_kernel->setArg(0, bufferInput);
        m_convolve_kernel->setArg(1, bufferMerge);
        m_convolve_kernel->setArg(2, weights[0]);
        // Local memory scratch for the row strips and partial sums.
        m_convolve_kernel->setArg(3, cl::Local(stripSize * channelGroup * rowGroup));
        m_convolve_kernel->setArg(4, cl::Local(rowSize));

        queue.enqueueNDRangeKernel(
            *m_convolve_kernel, cl::NullRange,
            cl::NDRange(channels, outputs, batch_size * rowTiles),
            cl::NDRange(channelGroup, outputGroup, rowGroup));
    } catch (const cl::Error &e) {
        std::cerr << "Error in convolve1: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }

    // Pass 2: reduce the per-group partial planes into the final output.
    cl::Kernel & merge_kernel = opencl_context.m_merge_kernel;
    assert(channels % (1 << channelShift) == 0);

    try {
        merge_kernel.setArg(0, bufferMerge);
        merge_kernel.setArg(1, bufferOutput);
        merge_kernel.setArg(2, channels >> channelShift);

        queue.enqueueNDRangeKernel(
            merge_kernel, cl::NullRange,
            cl::NDRange(outputs, boardsize, batch_size),
            cl::NDRange(std::min(8, outputs), BOARD_SIZE, 1));
    } catch (const cl::Error &e) {
        std::cerr << "Error in merge: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }
}
571 
572 template<class T>
opencl_dev_type_to_string(T type)573 static std::string opencl_dev_type_to_string(T type) {
574     if (type == CL_DEVICE_TYPE_CPU) {
575         return "CPU";
576     } else if (type == CL_DEVICE_TYPE_GPU) {
577         return "GPU";
578     } else if (type == CL_DEVICE_TYPE_ACCELERATOR) {
579         return "Accelerator";
580     } else {
581         return "Unknown";
582     }
583 }
584 
// Removes leading and trailing whitespace from a string (taken and returned
// by value).  Implemented with std::string members instead of
// boost::algorithm::trim so the helper is self-contained; the whitespace
// set matches std::isspace in the "C" locale.
static std::string trim(std::string trim_me) {
    const auto whitespace = " \t\n\r\f\v";
    const auto first = trim_me.find_first_not_of(whitespace);
    if (first == std::string::npos) {
        // All-whitespace (or empty) input trims to the empty string.
        return std::string();
    }
    const auto last = trim_me.find_last_not_of(whitespace);
    return trim_me.substr(first, last - first + 1);
}
589 
590 template <typename net_t>
process_tuners(std::string tuners)591 void OpenCL<net_t>::process_tuners(std::string tuners) {
592     std::string buf;
593     std::stringstream ss(tuners);
594     std::size_t found;
595 
596     auto mwg = false;
597     auto nwg = false;
598     auto kwg = false;
599     auto ndimc = false;
600     auto mdimc = false;
601     auto mdima = false;
602     auto ndimb = false;
603     auto vwm = false;
604     auto vwn = false;
605     auto tce = false;
606 
607     while (ss >> buf) {
608         found = buf.find("=");
609         if (found == std::string::npos) {
610             std::cerr << "Invalid tuner string: " << tuners << std::endl;
611             std::exit(-1);
612         }
613         std::string name = buf.substr(0, found);
614         auto value = std::stoi(buf.substr(found + 1, std::string::npos));
615         if (name == "-DMWG") {
616             m_sgemm_tuners.mwg = value;
617             mwg = true;
618         }
619         if (name == "-DNWG") {
620             m_sgemm_tuners.nwg = value;
621             nwg = true;
622         }
623         if (name == "-DKWG") {
624             m_sgemm_tuners.kwg = value;
625             kwg = true;
626         }
627         if (name == "-DMDIMA") {
628             m_sgemm_tuners.mdima = value;
629             mdima = true;
630         }
631         if (name == "-DNDIMB") {
632             m_sgemm_tuners.ndimb = value;
633             ndimb = true;
634         }
635         if (name == "-DMDIMC") {
636             m_sgemm_tuners.mdimc = value;
637             mdimc = true;
638         }
639         if (name == "-DNDIMC") {
640             m_sgemm_tuners.ndimc = value;
641             ndimc = true;
642         }
643         if (name == "-DVWM") {
644             m_sgemm_tuners.vwm = value;
645             vwm = true;
646         }
647         if (name == "-DVWN") {
648             m_sgemm_tuners.vwn = value;
649             vwn = true;
650         }
651         if (name == "-DTCE") {
652             m_sgemm_tuners.tce = value;
653             tce = true;
654         }
655     }
656     if (!mwg || !nwg || !kwg || !mdimc || !ndimc || !vwm || !vwn || !mdima || !ndimb) {
657         std::cerr << "Missing tuner parameters";
658         if (!mwg) {
659             std::cerr << " MWG";
660         }
661         if (!nwg) {
662             std::cerr << " NWG";
663         }
664         if (!kwg) {
665             std::cerr << " KWG";
666         }
667         if (!mdima) {
668             std::cerr << " MDIMA";
669         }
670         if (!ndimb) {
671             std::cerr << " NDIMB";
672         }
673         if (!mdimc) {
674             std::cerr << " MDIMC";
675         }
676         if (!ndimc) {
677             std::cerr << " NDIMC";
678         }
679         if (!vwm) {
680             std::cerr << " VWM";
681         }
682         if (!vwn) {
683             std::cerr << " VWN";
684         }
685         if (!tce) {
686             std::cerr << " VWN";
687         }
688         std::cerr << std::endl;
689         std::exit(-1);
690     }
691 }
692 
693 template <typename net_t>
get_sgemm_tuners()694 std::vector<size_t> OpenCL<net_t>::get_sgemm_tuners() {
695     std::vector<size_t> tuners;
696 
697     tuners.emplace_back(m_sgemm_tuners.mwg);
698     tuners.emplace_back(m_sgemm_tuners.nwg);
699     tuners.emplace_back(m_sgemm_tuners.kwg);
700     tuners.emplace_back(m_sgemm_tuners.vwm);
701     tuners.emplace_back(m_sgemm_tuners.vwn);
702     tuners.emplace_back(m_sgemm_tuners.mdimc);
703     tuners.emplace_back(m_sgemm_tuners.ndimc);
704 
705     return tuners;
706 }
707 
708 template <typename net_t>
OpenCL(int gpu,bool silent)709 OpenCL<net_t>::OpenCL(int gpu, bool silent) {
710     std::vector<cl::Platform> platforms;
711     try {
712         cl::Platform::get(&platforms);
713     } catch (const cl::Error &e) {
714         myprintf("OpenCL: %s\n", e.what());
715         throw;
716     }
717 
718     auto best_version = 0.0f;
719     cl::Platform best_platform;
720     cl::Device best_device;
721     std::string best_vendor;
722     auto best_score = 0;
723     auto found_device = false;
724     auto id = 0;
725 
726     if (!silent) {
727         myprintf("Detected %d OpenCL platforms.\n", platforms.size());
728     }
729 
730     for (const auto &p : platforms) {
731         std::string platvers = p.getInfo<CL_PLATFORM_VERSION>();
732         if (!silent) {
733             std::string platprof = p.getInfo<CL_PLATFORM_PROFILE>();
734             std::string platname = p.getInfo<CL_PLATFORM_NAME>();
735             std::string platvend = p.getInfo<CL_PLATFORM_VENDOR>();
736             myprintf("Platform version: %s\n", platvers.c_str());;
737             myprintf("Platform profile: %s\n", platprof.c_str());
738             myprintf("Platform name:    %s\n", platname.c_str());
739             myprintf("Platform vendor:  %s\n", platvend.c_str());
740         }
741 
742         std::istringstream versstream(platvers);
743         std::string tmp;
744         float opencl_version;
745         versstream >> tmp >> opencl_version;
746 
747         std::vector<cl::Device> devices;
748         try {
749             p.getDevices(CL_DEVICE_TYPE_ALL, &devices);
750         } catch (const cl::Error &e) {
751             myprintf("Error getting device(s): %s: %d\n", e.what(), e.err());
752             devices.clear();
753         }
754         for (auto& d : devices) {
755             if (!silent) {
756                 myprintf("Device ID:     %d\n", id);
757                 myprintf("Device name:   %s\n",
758                          trim(d.getInfo<CL_DEVICE_NAME>()).c_str());
759                 myprintf("Device type:   %s\n",
760                          opencl_dev_type_to_string(
761                              d.getInfo<CL_DEVICE_TYPE>()).c_str());
762                 myprintf("Device vendor: %s\n",
763                           d.getInfo<CL_DEVICE_VENDOR>().c_str());
764                 myprintf("Device driver: %s\n",
765                           d.getInfo<CL_DRIVER_VERSION>().c_str());
766                 myprintf("Device speed:  %u MHz\n",
767                           d.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>());
768                 myprintf("Device cores:  %u CU\n",
769                           d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
770             }
771 
772             // assign score, try to find best device
773             int this_score = 0;
774             std::string this_vendor = d.getInfo<CL_DEVICE_VENDOR>();
775             this_score += 1000 * boost::icontains(this_vendor, "advanced micro devices");
776             this_score += 1000 * boost::icontains(this_vendor, "amd");
777             this_score += 1000 * boost::icontains(this_vendor, "nvidia");
778             this_score +=  500 * boost::icontains(this_vendor, "intel");
779             this_score +=  100 * (d.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU);
780             this_score +=  opencl_version * 10;
781             if (!silent) {
782                 myprintf("Device score:  %d\n", this_score);
783             }
784 
785             bool preferred = (gpu == id);
786 
787             if (((this_score > best_score)
788                  /*&& (d.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_CPU)*/) // Do not reject CPU-only OpenCL providers because some users only have that (ex. pocl)
789                 || preferred) {
790                 best_version = opencl_version;
791                 best_platform = p;
792                 best_device = d;
793                 best_vendor = this_vendor;
794                 if (preferred) {
795                     best_score =
796                         std::numeric_limits<decltype(best_score)>::max();
797                 } else {
798                     best_score = this_score;
799                 }
800                 found_device = true;
801             }
802             id++;
803         }
804     }
805 
806     if (!found_device) {
807         throw std::runtime_error("No suitable OpenCL device found.");
808     }
809 
810     myprintf("Selected platform: %s\n",
811         best_platform.getInfo<CL_PLATFORM_NAME>().c_str());
812     myprintf("Selected device: %s\n",
813         trim(best_device.getInfo<CL_DEVICE_NAME>()).c_str());
814     myprintf("with OpenCL %2.1f capability.\n", best_version);
815 
816     cl::Context context;
817     try {
818         context = cl::Context(best_device);
819     } catch (const cl::Error &e) {
820         myprintf("Error creating OpenCL context: %s: %d", e.what(), e.err());
821         throw std::runtime_error("Error creating OpenCL context.");
822     }
823     m_context = context;
824     m_device = best_device;
825 
826     m_cl_args = getClArgs<net_t>();
827 
828     myprintf("Half precision compute support: ");
829     if (m_device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp16")
830         != std::string::npos) {
831         myprintf("Yes.\n");
832         m_fp16_compute = true;
833         m_cl_args += " -DFP16_SUPPORT";
834     } else {
835         myprintf("No.\n");
836     }
837 
838     myprintf("Tensor Core support: ");
839     try {
840         cl::Program(m_context, sourceCode_tensorcore_test).build(m_cl_args.c_str());
841         m_tensorcore = true;
842         myprintf("Yes.\n");
843     } catch (...) {
844         myprintf("No.\n");
845     }
846 }
847 
848 template <typename net_t>
initialize(const int channels,size_t batch_size)849 void OpenCL<net_t>::initialize(const int channels, size_t batch_size) {
850     m_batch_size = batch_size;
851     // Make program of the source code in the context
852     try {
853         m_program = cl::Program(m_context,
854                                 sourceCode_common
855                                 + sourceCode_config
856                                 + sourceCode_convolve1
857                                 + sourceCode_convolve3
858                                 + sourceCode_sgemm);
859     } catch (const cl::Error &e) {
860         myprintf("Error getting kernels: %s: %d", e.what(), e.err());
861         throw std::runtime_error("Error getting OpenCL kernels.");
862     }
863 
864     auto t = Tuner<net_t>(*this, m_context, m_device);
865     if (m_tensorcore) {
866         t.enable_tensorcore();
867     }
868 
869     auto sgemm_tuners =
870         t.load_sgemm_tuners(channels, batch_size * WINOGRAD_P, channels, WINOGRAD_TILE);
871 
872     // Some NVIDIA drivers are buggy and will fail to compile the rest of the
873     // kernels after a tuning run.
874     if (cfg_tune_only) {
875         // Originally this was an exit() but this will make the tuner
876         // only tune the first GPU.  Return instead.  Exit will be called
877         // after all GPUs are created.
878         return;
879     }
880 
881     // Build program for these specific devices
882     try {
883         std::string args = m_cl_args;
884         // Intel iGPUs need vector types for math for best performance
885         if (m_device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>() > 1) {
886             args += " -DWINOGRAD_SIMD";
887         }
888 
889         args += sgemm_tuners;
890         m_program.build(args.c_str());
891     } catch (const cl::Error&) {
892         myprintf("Error building kernels: %s\n",
893                  m_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(m_device).c_str());
894         throw std::runtime_error("Error building OpenCL kernels.");
895     }
896 
897     OpenCLContext tdata;
898     ensure_context_initialized(tdata);
899 
900     process_tuners(sgemm_tuners);
901 
902     m_wavefront_size =
903         tdata.m_sgemm_kernel.getWorkGroupInfo<
904             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(m_device);
905     myprintf("Wavefront/Warp size: %d\n", m_wavefront_size);
906 
907     m_max_workgroup_size = m_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
908     m_max_workgroup_dims = m_device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
909 
910     myprintf("Max workgroup size: %d\n", m_max_workgroup_size);
911     myprintf("Max workgroup dimensions: ");
912     for (auto d : m_max_workgroup_dims) {
913         myprintf("%d ", d);
914     }
915     myprintf("\n");
916 
917     m_init_ok = true;
918 }
919 
template <typename net_t>
// Returns true if the selected device advertised the cl_khr_fp16
// extension (detected once during device selection, where m_fp16_compute
// is set and " -DFP16_SUPPORT" is appended to the build arguments).
bool OpenCL<net_t>::has_fp16_compute() {
    return m_fp16_compute;
}
924 
template <typename net_t>
// Returns true if the tensor-core probe kernel
// (sourceCode_tensorcore_test) built successfully during device
// selection, i.e. tensor-core sgemm variants can be used.
bool OpenCL<net_t>::has_tensor_cores() {
    return m_tensorcore;
}
929 
930 template <typename net_t>
get_device_name()931 std::string OpenCL<net_t>::get_device_name() {
932     std::stringstream ss;
933 
934     ss << "OpenCL: ";
935     ss << m_device.getInfo<CL_DEVICE_VENDOR>() << " ";
936     ss << m_device.getInfo<CL_DEVICE_NAME>() << " @ ";
937     ss << m_device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << "MHz";
938 
939     return ss.str();
940 }
941 
// Explicit instantiations: the single-precision backend is always built;
// the half-precision variant only when compiled with USE_HALF.
template class OpenCL<float>;
template class OpenCL_Network<float>;
#ifdef USE_HALF
template class OpenCL<half_float::half>;
template class OpenCL_Network<half_float::half>;
#endif
948 
949 #endif
950