/*
    This file is part of Leela Zero.
    Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors

    Leela Zero is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Leela Zero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Leela Zero. If not, see <http://www.gnu.org/licenses/>.

    Additional permission under GNU GPL version 3 section 7

    If you modify this Program, or any covered work, by linking or
    combining it with NVIDIA Corporation's libraries from the
    NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
    Network library and/or the NVIDIA TensorRT inference library
    (or a modified version of those libraries), containing parts covered
    by the terms of the respective license agreement, the licensors of
    this Program grant you additional permission to convey the resulting
    work.
*/

#include "config.h"

#ifdef USE_OPENCL
#include <cassert>
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <boost/format.hpp>
#include <iterator>
#include <limits>
#include <stdexcept>

#include <cstdio>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

#include "OpenCL.h"
#include "Network.h"
#include "GTP.h"
#include "Utils.h"
#include "Tuner.h"

using namespace Utils;

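// Extra OpenCL compiler options, chosen per precision. The relaxed-math
// flags (-cl-mad-enable, -cl-fast-relaxed-math, ...) trade strict IEEE
// semantics for speed, which the network tolerates; the half-precision
// variant additionally defines USE_HALF so the kernels switch their
// storage type.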
template <typename net_t> static std::string getClArgs();

template <> std::string getClArgs<float>() {
    return
        "-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-denorms-are-zero";
}
#ifdef USE_HALF
template <> std::string getClArgs<half_float::half>() {
    return
        "-DUSE_HALF "
        "-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-denorms-are-zero";
}
#endif

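// Each kernels/*.opencl file expands to a C++ string literal, so the
// kernel sources ship inside the binary and are compiled at runtime for
// the selected device. sourceCode_config prepends the compile-time
// constants the kernels need; for a default 19x19 build it expands to
// something like the following (the WINOGRAD_* values come from config.h
// and may differ):
//
//   #define BOARD_SIZE 19
//   #define NUM_INTERSECTIONS 361
//   #define WINOGRAD_M 4
//   #define WINOGRAD_ALPHA 6
//   #define WTILES 5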
const std::string sourceCode_common =
    #include "kernels/common.opencl"
;

static const std::string sourceCode_tensorcore_test =
    #include "kernels/tensorcore_test.opencl"
;

static const std::string sourceCode_config = R"(
#define BOARD_SIZE )" + std::to_string(BOARD_SIZE) +
    "\n#define NUM_INTERSECTIONS " + std::to_string(NUM_INTERSECTIONS) +
    "\n#define WINOGRAD_M " + std::to_string(WINOGRAD_M) +
    "\n#define WINOGRAD_ALPHA " + std::to_string(WINOGRAD_ALPHA) +
    "\n#define WTILES " + std::to_string(WINOGRAD_WTILES);

static const std::string sourceCode_convolve1 =
    #include "kernels/convolve1.opencl"
;

static const std::string sourceCode_convolve3 =
    #include "kernels/convolve3.opencl"
;

const std::string sourceCode_sgemm =
    "#if TCE == 1\n" // Enable tensorcore
    #include "kernels/clblast/hgemm_tensorcore.opencl"
    "\n#else\n" // Use clblast
    #include "kernels/clblast/xgemm_part1.opencl"
    #include "kernels/clblast/xgemm_part2.opencl"
    #include "kernels/clblast/xgemm_part3.opencl"
    #include "kernels/clblast/xgemm_batched.opencl"
    "\n#endif\n"
;

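// Kernels and the command queue are created lazily, once per
// OpenCLContext. Each context is meant to be used by a single thread
// (the forward pass keeps per-call buffers in it), so no locking is
// needed here.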
template <typename net_t>
void OpenCL<net_t>::ensure_context_initialized(OpenCLContext& opencl_context) {
    if (!opencl_context.m_is_initialized) {
        // Make kernels
        opencl_context.m_convolve1_kernel =
            cl::Kernel(m_program, "convolve1");
        opencl_context.m_merge_kernel =
            cl::Kernel(m_program, "merge");
        opencl_context.m_in_transform_kernel =
            cl::Kernel(m_program, "in_transform");
        opencl_context.m_sgemm_kernel =
            cl::Kernel(m_program, "XgemmBatched");
        opencl_context.m_out_transform_bn_kernel =
            cl::Kernel(m_program, "out_transform_fused_bn");
        opencl_context.m_out_transform_bn_in_kernel =
            cl::Kernel(m_program, "out_transform_fused_bn_in");
        opencl_context.m_commandqueue =
            cl::CommandQueue(m_context, m_device);
        opencl_context.m_is_initialized = true;
    }
}

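// Appends a weight buffer to the current layer, starting a new Layer when
// the caller indexes past the end of m_layers. The write is blocking
// (CL_TRUE), so the host-side weight array may be freed as soon as this
// returns.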
template <typename net_t>
void OpenCL_Network<net_t>::add_weights(size_t layer,
                                        size_t size,
                                        const net_t* weights) {
    if (layer >= m_layers.size()) {
        m_layers.push_back(Layer());
    }

    auto weightSize = size * sizeof(net_t);

    auto queue = cl::CommandQueue(getOpenCL().m_context, getOpenCL().m_device);
    auto buffer = cl::Buffer(
        m_opencl.m_context,
        CL_MEM_READ_ONLY,
        weightSize,
        nullptr);
    // enqueueWriteBuffer takes a const void*, so no const_cast is needed.
    queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, weightSize, weights);
    m_layers.back().weights.push_back(std::move(buffer));
}

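// Runs one forward pass for a batch of positions. The layer list is laid
// out as: the input convolution, the residual tower, then two 1x1
// convolutions, with the value head last (hence finalSize_pol reads
// m_layers[size() - 2]). Results land in pinned host-visible buffers so
// mapping them back to the host is cheap.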
template <typename net_t>
void OpenCL_Network<net_t>::forward(const std::vector<float>& input,
                                    std::vector<float>& output_pol,
                                    std::vector<float>& output_val,
                                    OpenCLContext& opencl_context,
                                    const int batch_size) {
    constexpr auto tiles = WINOGRAD_P;
    constexpr auto one_plane = NUM_INTERSECTIONS * sizeof(net_t);
    const auto finalSize_pol = m_layers[m_layers.size() - 2].outputs * one_plane;
    const auto finalSize_val = m_layers.back().outputs * one_plane;

    m_opencl.ensure_context_initialized(opencl_context);

    if (!opencl_context.m_buffers_allocated) {
        auto max_channels = unsigned{0};
        for (const auto& layer : m_layers) {
            max_channels = std::max(max_channels,
                                    std::max(layer.channels, layer.outputs));
        }

        const auto mwg = m_opencl.m_sgemm_tuners.mwg;
        const auto nwg = m_opencl.m_sgemm_tuners.nwg;
        const auto vwm = m_opencl.m_sgemm_tuners.vwm;
        const auto vwn = m_opencl.m_sgemm_tuners.vwn;

        const auto m_ceil = ceilMultiple(ceilMultiple(max_channels, mwg), vwm);
        const auto n_ceil = ceilMultiple(ceilMultiple(tiles, nwg), vwn);

        const auto alloc_inSize =
            getOpenCL().m_batch_size * NUM_INTERSECTIONS * max_channels * sizeof(net_t);
        const auto alloc_vm_size =
            getOpenCL().m_batch_size * WINOGRAD_TILE * m_ceil * n_ceil * sizeof(net_t);

        auto v_zeros = std::vector<net_t>(alloc_vm_size);

        opencl_context.m_inBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE, alloc_inSize);
        opencl_context.m_inBuffer2 = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE, alloc_inSize);
        opencl_context.m_VBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR,
            alloc_vm_size, v_zeros.data(), nullptr);
        opencl_context.m_MBuffer = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_vm_size);

        opencl_context.m_pinnedOutBuffer_pol = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
            getOpenCL().m_batch_size * finalSize_pol);
        opencl_context.m_pinnedOutBuffer_val = cl::Buffer(
            m_opencl.m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
            getOpenCL().m_batch_size * finalSize_val);

        opencl_context.m_buffers_allocated = true;
    }

    cl::Buffer& inBuffer = opencl_context.m_inBuffer;
    cl::Buffer& inBuffer2 = opencl_context.m_inBuffer2;
    cl::Buffer& VBuffer = opencl_context.m_VBuffer;
    cl::Buffer& MBuffer = opencl_context.m_MBuffer;
    cl::CommandQueue& queue = opencl_context.m_commandqueue;

    std::vector<net_t> net_t_input(input.size());
    std::copy(begin(input), end(input), begin(net_t_input));

    const auto inSize = sizeof(net_t) * input.size();
    queue.enqueueWriteBuffer(inBuffer, CL_FALSE, 0, inSize, net_t_input.data());

    // The fused in/out transformation kernel is slower with large batch
    // sizes than calling the out and in transformations separately, so
    // only fuse for single-position batches. This threshold could be made
    // tunable in the future.
    auto use_inout = (batch_size == 1);

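    // Walk the layer list. The two convolutions of a residual block
    // ping-pong between inBuffer and inBuffer2, and the second one adds
    // the block's input back in (via bufferResidual) before the batchnorm
    // in the output transform.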
    auto skip_in_trans = false;
    for (auto iter = cbegin(m_layers); iter != cend(m_layers); iter++) {
        const auto& layer = *iter;
        const auto niter = std::next(iter);

        if (layer.is_input_convolution) {
            assert(niter != cend(m_layers));
            auto conv_weights = begin(layer.weights);
            auto bn_weights = begin(layer.weights) + 1;
            auto skip_next_in_trans = false;
            if (niter->is_residual_block) {
                skip_next_in_trans = use_inout;
            }

            convolve3(opencl_context,
                      layer.channels,
                      layer.outputs,
                      inBuffer,
                      inBuffer,
                      VBuffer,
                      MBuffer,
                      conv_weights,
                      nullptr,
                      bn_weights,
                      skip_in_trans, skip_next_in_trans, true,
                      batch_size);

            skip_in_trans = skip_next_in_trans;
        } else if (layer.is_residual_block) {
            assert(layer.channels == layer.outputs);
            assert(niter != cend(m_layers));
            auto conv1_weights = begin(layer.weights);
            auto bn1_weights = begin(layer.weights) + 1;
            auto conv2_weights = begin(layer.weights) + 3;
            auto bn2_weights = begin(layer.weights) + 4;
            convolve3(opencl_context,
                      layer.channels,
                      layer.outputs,
                      inBuffer,
                      inBuffer2,
                      VBuffer,
                      MBuffer,
                      conv1_weights,
                      nullptr,
                      bn1_weights,
                      skip_in_trans, use_inout, false,
                      batch_size);

            auto skip_next_in_trans = false;
            if (niter->is_residual_block) {
                skip_next_in_trans = use_inout;
            }
            convolve3(opencl_context,
                      layer.channels,
                      layer.outputs,
                      inBuffer2,
                      inBuffer,
                      VBuffer,
                      MBuffer,
                      conv2_weights,
                      &inBuffer,
                      bn2_weights,
                      use_inout, skip_next_in_trans, true,
                      batch_size);
            skip_in_trans = skip_next_in_trans;
        } else {
            assert(layer.is_convolve1);

            // The value head is the last layer; any earlier 1x1
            // convolution that reaches this branch is the policy head.
            cl::Buffer out_buffer;
            if (niter == cend(m_layers)) {
                out_buffer = opencl_context.m_pinnedOutBuffer_val;
            } else {
                out_buffer = opencl_context.m_pinnedOutBuffer_pol;
            }

            convolve1(opencl_context, layer.channels,
                      layer.outputs,
                      inBuffer,
                      out_buffer,
                      VBuffer,
                      begin(layer.weights),
                      batch_size);
        }
    }

    auto pinnedOutBufferHost_pol = queue.enqueueMapBuffer(
        opencl_context.m_pinnedOutBuffer_pol, CL_FALSE,
        CL_MAP_READ, 0, batch_size * finalSize_pol);
    auto pinnedOutBufferHost_val = queue.enqueueMapBuffer(
        opencl_context.m_pinnedOutBuffer_val, CL_FALSE,
        CL_MAP_READ, 0, batch_size * finalSize_val);

    {
        // The finish call is usually a busy wait. When using multiple
        // threads, take the lock so only one thread busy-waits at a time.
        std::lock_guard<std::mutex> lock(m_queue_finish_mutex);
        queue.finish();
    }

    auto polptr = static_cast<net_t*>(pinnedOutBufferHost_pol);
    auto valptr = static_cast<net_t*>(pinnedOutBufferHost_val);
    std::copy(polptr, polptr + output_pol.size(), begin(output_pol));
    std::copy(valptr, valptr + output_val.size(), begin(output_val));

    queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_pol,
                                pinnedOutBufferHost_pol);
    queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_val,
                                pinnedOutBufferHost_val);
}

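// 3x3 convolution via Winograd transforms: the input is scattered into
// WINOGRAD_TILE tile matrices (in_transform), all tiles are multiplied
// against the pre-transformed weights in one batched GEMM, and the output
// transform folds in the batchnorm and the optional residual add. When
// fuse_in_transform is set, the output transform also produces the next
// layer's transformed input (bufferV), so consecutive residual layers
// avoid a round trip; skip_in_transform is the matching flag on the
// consuming side.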
template <typename net_t>
void OpenCL_Network<net_t>::convolve3(OpenCLContext& opencl_context,
                                      int channels, int outputs,
                                      cl::Buffer& bufferIn,
                                      cl::Buffer& bufferOut,
                                      cl::Buffer& bufferV,
                                      cl::Buffer& bufferM,
                                      weight_slice_t weights,
                                      cl::Buffer* bufferResidual,
                                      weight_slice_t bn_weights,
                                      bool skip_in_transform,
                                      bool fuse_in_transform,
                                      bool store_inout,
                                      int batch_size) {
    cl::Kernel& in_transform_kernel = opencl_context.m_in_transform_kernel;
    cl::Kernel& sgemm_kernel = opencl_context.m_sgemm_kernel;
    cl::Kernel& out_transform_bn_kernel =
        opencl_context.m_out_transform_bn_kernel;
    cl::Kernel& out_transform_bn_in_kernel =
        opencl_context.m_out_transform_bn_in_kernel;

    auto mwg = m_opencl.m_sgemm_tuners.mwg;
    auto nwg = m_opencl.m_sgemm_tuners.nwg;
    auto kwg = m_opencl.m_sgemm_tuners.kwg;
    auto vwm = m_opencl.m_sgemm_tuners.vwm;
    auto vwn = m_opencl.m_sgemm_tuners.vwn;
    auto mdimc = m_opencl.m_sgemm_tuners.mdimc;
    auto ndimc = m_opencl.m_sgemm_tuners.ndimc;
    auto tce = m_opencl.m_sgemm_tuners.tce;
    auto mdima = m_opencl.m_sgemm_tuners.mdima;
    auto ndimb = m_opencl.m_sgemm_tuners.ndimb;

    auto wavefront_size = m_opencl.m_wavefront_size;

    assert(mwg != 0);
    assert(nwg != 0);
    assert(kwg != 0);
    assert(mdimc != 0);
    assert(ndimc != 0);
    assert(vwm != 0);
    assert(vwn != 0);
    assert(wavefront_size != 0);

    constexpr auto tiles = WINOGRAD_P;

    auto wgs = ceilMultiple(batch_size * tiles, wavefront_size);
    auto wgs_single = ceilMultiple(tiles, wavefront_size);

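    // GEMM dimensions are padded up to the tuner's work-group multiples.
    // As a worked example with plausible tuned values mwg = 64, vwm = 4:
    // outputs = 256 gives m_ceil = ceilMultiple(ceilMultiple(256, 64), 4)
    // = 256, while outputs = 80 would be padded up to 128.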
    auto m_ceil = int(ceilMultiple(ceilMultiple(outputs, mwg), vwm));
    auto n_ceil = int(ceilMultiple(ceilMultiple(batch_size * tiles, nwg), vwn));
    auto k_ceil = int(ceilMultiple(ceilMultiple(channels, kwg), vwm));

    cl::CommandQueue& queue = opencl_context.m_commandqueue;

    if (!skip_in_transform) {
        try {
            in_transform_kernel.setArg(0, bufferIn);
            in_transform_kernel.setArg(1, bufferV);
            in_transform_kernel.setArg(2, channels);
            in_transform_kernel.setArg(3, k_ceil);
            in_transform_kernel.setArg(4, n_ceil);
            in_transform_kernel.setArg(5, batch_size);

            queue.enqueueNDRangeKernel(in_transform_kernel, cl::NullRange,
                                       cl::NDRange(wgs, channels));
        } catch (const cl::Error& e) {
            std::cerr << "Error in convolve3/in: " << e.what() << ": "
                      << e.err() << std::endl;
            throw;
        }
    }

    try {
        sgemm_kernel.setArg(0, m_ceil);
        sgemm_kernel.setArg(1, n_ceil);
        sgemm_kernel.setArg(2, k_ceil);
        sgemm_kernel.setArg(3, weights[0]);
        sgemm_kernel.setArg(4, bufferV);
        sgemm_kernel.setArg(5, bufferM);

        cl::NDRange local_sgemm = {mdimc, ndimc, 1};

        cl::NDRange size_sgemm = {(m_ceil * mdimc) / mwg,
                                  (n_ceil * ndimc) / nwg,
                                  cl::size_type(WINOGRAD_TILE)};

        // The tensorcore implementation uses different work dimensions.
        if (tce) {
            local_sgemm = {32 * mdimc / mdima, ndimc / ndimb, 1};
            size_sgemm = {32 * m_ceil / mdima * mdimc / mwg,
                          n_ceil / ndimb * ndimc / nwg,
                          cl::size_type(WINOGRAD_TILE)};
        }
        queue.enqueueNDRangeKernel(sgemm_kernel, cl::NullRange,
                                   size_sgemm, local_sgemm);
    } catch (const cl::Error& e) {
        std::cerr << "Error in convolve3/sgemm: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }

    try {
        if (fuse_in_transform) {
            // TODO: Eventually this might also be made tunable.
            // Needs to match OUTIN_KWG in the kernel.
            constexpr auto dim_size = 2;
            out_transform_bn_in_kernel.setArg(0, bufferM);
            if (store_inout) {
                out_transform_bn_in_kernel.setArg(1, bufferOut);
            } else {
                out_transform_bn_in_kernel.setArg(1, nullptr);
            }
            out_transform_bn_in_kernel.setArg(2, bufferV);
            out_transform_bn_in_kernel.setArg(3, outputs);
            out_transform_bn_in_kernel.setArg(4, m_ceil);
            out_transform_bn_in_kernel.setArg(5, n_ceil);
            // k_ceil of the next convolution
            auto k_ceil2 = int(ceilMultiple(ceilMultiple(outputs, kwg), vwm));
            out_transform_bn_in_kernel.setArg(6, k_ceil2);
            if (bufferResidual) {
                out_transform_bn_in_kernel.setArg(7, *bufferResidual);
            } else {
                out_transform_bn_in_kernel.setArg(7, nullptr);
            }
            out_transform_bn_in_kernel.setArg(8, bn_weights[0]);
            out_transform_bn_in_kernel.setArg(9, bn_weights[1]);

            queue.enqueueNDRangeKernel(out_transform_bn_in_kernel,
                                       cl::NullRange,
                                       cl::NDRange(outputs, wgs_single, batch_size),
                                       cl::NDRange(dim_size, wgs_single, 1));
        } else {
            out_transform_bn_kernel.setArg(0, bufferM);
            out_transform_bn_kernel.setArg(1, bufferOut);
            out_transform_bn_kernel.setArg(2, outputs);
            out_transform_bn_kernel.setArg(3, m_ceil);
            out_transform_bn_kernel.setArg(4, n_ceil);
            out_transform_bn_kernel.setArg(5, batch_size);
            if (bufferResidual) {
                out_transform_bn_kernel.setArg(6, *bufferResidual);
            } else {
                out_transform_bn_kernel.setArg(6, nullptr);
            }
            out_transform_bn_kernel.setArg(7, bn_weights[0]);
            out_transform_bn_kernel.setArg(8, bn_weights[1]);

            // Needs to match OUT_KWG and OUT_BWG in the kernel.
            // This could be tuned.
            cl::NDRange local_out = {32, 2};

            cl::NDRange global_out = {ceilMultiple(outputs, local_out[0]),
                                      ceilMultiple(tiles * batch_size, local_out[1])};

            queue.enqueueNDRangeKernel(out_transform_bn_kernel, cl::NullRange,
                                       global_out, local_out);
        }
    } catch (const cl::Error& e) {
        std::cerr << "Error in convolve3/out: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }
}

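// 1x1 convolution used for the policy and value heads. The first kernel
// computes one partial output plane per group of channelGroup input
// channels into bufferMerge; the merge kernel then sums those
// channels >> channelShift partial planes into the final output.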
template <typename net_t>
void OpenCL_Network<net_t>::convolve1(OpenCLContext& opencl_context,
                                      int channels, int outputs,
                                      cl::Buffer& bufferInput,
                                      cl::Buffer& bufferOutput,
                                      cl::Buffer& bufferMerge,
                                      weight_slice_t weights,
                                      int batch_size) {
    // The size of the board is defined at compile time
    constexpr int width = BOARD_SIZE;
    constexpr int boardsize = NUM_INTERSECTIONS;
    constexpr int rowTiles = BOARD_SIZE;

    // Input channel grouping in multiples of 8
    constexpr int channelGroup = 8;
    constexpr int channelShift = 3;
    constexpr int rowGroup = 1;
    size_t outputGroup = std::min(outputs, 32);

    auto m_convolve_kernel = &opencl_context.m_convolve1_kernel;

#ifndef NDEBUG
    // Total output size after reducing
    size_t outSize = boardsize * outputs * sizeof(net_t);

    // Produce channel * output planes and merge them at the end
    size_t mergeSize = (channels >> channelShift) * outSize;
    assert(mergeSize <= bufferMerge.getInfo<CL_MEM_SIZE>());
#endif

    // Copy the rows locally
    size_t stripSize = width * sizeof(float);

    int rowBuffer = std::min<int>(channelGroup, 7);
    size_t rowSize = channelGroup * outputGroup * rowBuffer * sizeof(float);

    cl::CommandQueue& queue = opencl_context.m_commandqueue;

    try {
        m_convolve_kernel->setArg(0, bufferInput);
        m_convolve_kernel->setArg(1, bufferMerge);
        m_convolve_kernel->setArg(2, weights[0]);
        m_convolve_kernel->setArg(3,
            cl::Local(stripSize * channelGroup * rowGroup));
        m_convolve_kernel->setArg(4, cl::Local(rowSize));

        queue.enqueueNDRangeKernel(
            *m_convolve_kernel, cl::NullRange,
            cl::NDRange(channels, outputs, batch_size * rowTiles),
            cl::NDRange(channelGroup, outputGroup, rowGroup));
    } catch (const cl::Error& e) {
        std::cerr << "Error in convolve1: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }

    cl::Kernel& merge_kernel = opencl_context.m_merge_kernel;
    assert(channels % (1 << channelShift) == 0);

    try {
        merge_kernel.setArg(0, bufferMerge);
        merge_kernel.setArg(1, bufferOutput);
        merge_kernel.setArg(2, channels >> channelShift);

        queue.enqueueNDRangeKernel(
            merge_kernel, cl::NullRange,
            cl::NDRange(outputs, boardsize, batch_size),
            cl::NDRange(std::min(8, outputs), BOARD_SIZE, 1));
    } catch (const cl::Error& e) {
        std::cerr << "Error in merge: " << e.what() << ": "
                  << e.err() << std::endl;
        throw;
    }
}

template <class T>
static std::string opencl_dev_type_to_string(T type) {
    if (type == CL_DEVICE_TYPE_CPU) {
        return "CPU";
    } else if (type == CL_DEVICE_TYPE_GPU) {
        return "GPU";
    } else if (type == CL_DEVICE_TYPE_ACCELERATOR) {
        return "Accelerator";
    } else {
        return "Unknown";
    }
}

static std::string trim(std::string trim_me) {
    boost::algorithm::trim(trim_me);
    return trim_me;
}

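// Parses the string returned by Tuner<net_t>::load_sgemm_tuners(): a
// space-separated list of -DNAME=value defines, for example (values are
// device-specific):
//
//   -DMWG=64 -DNWG=64 -DKWG=32 -DMDIMA=16 -DNDIMB=16 -DMDIMC=16
//   -DNDIMC=16 -DVWM=4 -DVWN=4 -DTCE=0
//
// The same string is appended verbatim to the kernel build options in
// initialize(), so the values recorded here always match the compiled
// kernels.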
template <typename net_t>
void OpenCL<net_t>::process_tuners(std::string tuners) {
    std::string buf;
    std::stringstream ss(tuners);
    std::size_t found;

    auto mwg = false;
    auto nwg = false;
    auto kwg = false;
    auto ndimc = false;
    auto mdimc = false;
    auto mdima = false;
    auto ndimb = false;
    auto vwm = false;
    auto vwn = false;
    auto tce = false;

    while (ss >> buf) {
        found = buf.find("=");
        if (found == std::string::npos) {
            std::cerr << "Invalid tuner string: " << tuners << std::endl;
            std::exit(-1);
        }
        std::string name = buf.substr(0, found);
        auto value = std::stoi(buf.substr(found + 1, std::string::npos));
        if (name == "-DMWG") {
            m_sgemm_tuners.mwg = value;
            mwg = true;
        }
        if (name == "-DNWG") {
            m_sgemm_tuners.nwg = value;
            nwg = true;
        }
        if (name == "-DKWG") {
            m_sgemm_tuners.kwg = value;
            kwg = true;
        }
        if (name == "-DMDIMA") {
            m_sgemm_tuners.mdima = value;
            mdima = true;
        }
        if (name == "-DNDIMB") {
            m_sgemm_tuners.ndimb = value;
            ndimb = true;
        }
        if (name == "-DMDIMC") {
            m_sgemm_tuners.mdimc = value;
            mdimc = true;
        }
        if (name == "-DNDIMC") {
            m_sgemm_tuners.ndimc = value;
            ndimc = true;
        }
        if (name == "-DVWM") {
            m_sgemm_tuners.vwm = value;
            vwm = true;
        }
        if (name == "-DVWN") {
            m_sgemm_tuners.vwn = value;
            vwn = true;
        }
        if (name == "-DTCE") {
            m_sgemm_tuners.tce = value;
            tce = true;
        }
    }
    if (!mwg || !nwg || !kwg || !mdimc || !ndimc
        || !vwm || !vwn || !mdima || !ndimb) {
        std::cerr << "Missing tuner parameters";
        if (!mwg) {
            std::cerr << " MWG";
        }
        if (!nwg) {
            std::cerr << " NWG";
        }
        if (!kwg) {
            std::cerr << " KWG";
        }
        if (!mdima) {
            std::cerr << " MDIMA";
        }
        if (!ndimb) {
            std::cerr << " NDIMB";
        }
        if (!mdimc) {
            std::cerr << " MDIMC";
        }
        if (!ndimc) {
            std::cerr << " NDIMC";
        }
        if (!vwm) {
            std::cerr << " VWM";
        }
        if (!vwn) {
            std::cerr << " VWN";
        }
        if (!tce) {
            std::cerr << " TCE";
        }
        std::cerr << std::endl;
        std::exit(-1);
    }
}

template <typename net_t>
std::vector<size_t> OpenCL<net_t>::get_sgemm_tuners() {
    std::vector<size_t> tuners;

    tuners.emplace_back(m_sgemm_tuners.mwg);
    tuners.emplace_back(m_sgemm_tuners.nwg);
    tuners.emplace_back(m_sgemm_tuners.kwg);
    tuners.emplace_back(m_sgemm_tuners.vwm);
    tuners.emplace_back(m_sgemm_tuners.vwn);
    tuners.emplace_back(m_sgemm_tuners.mdimc);
    tuners.emplace_back(m_sgemm_tuners.ndimc);

    return tuners;
}

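// Enumerates all OpenCL platforms and devices and keeps the one with the
// highest score: +1000 for AMD/NVIDIA vendor strings, +500 for Intel,
// +100 for a GPU-type device, plus 10x the platform's OpenCL version.
// For example, an NVIDIA GPU on an OpenCL 1.2 platform scores
// 1000 + 100 + 12 = 1112. An explicitly requested gpu id (the
// constructor's gpu argument) bypasses the scoring by forcing best_score
// to the maximum.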
template <typename net_t>
OpenCL<net_t>::OpenCL(int gpu, bool silent) {
    std::vector<cl::Platform> platforms;
    try {
        cl::Platform::get(&platforms);
    } catch (const cl::Error& e) {
        myprintf("OpenCL: %s\n", e.what());
        throw;
    }

    auto best_version = 0.0f;
    cl::Platform best_platform;
    cl::Device best_device;
    std::string best_vendor;
    auto best_score = 0;
    auto found_device = false;
    auto id = 0;

    if (!silent) {
        myprintf("Detected %d OpenCL platforms.\n", int(platforms.size()));
    }

    for (const auto& p : platforms) {
        std::string platvers = p.getInfo<CL_PLATFORM_VERSION>();
        if (!silent) {
            std::string platprof = p.getInfo<CL_PLATFORM_PROFILE>();
            std::string platname = p.getInfo<CL_PLATFORM_NAME>();
            std::string platvend = p.getInfo<CL_PLATFORM_VENDOR>();
            myprintf("Platform version: %s\n", platvers.c_str());
            myprintf("Platform profile: %s\n", platprof.c_str());
            myprintf("Platform name: %s\n", platname.c_str());
            myprintf("Platform vendor: %s\n", platvend.c_str());
        }

        std::istringstream versstream(platvers);
        std::string tmp;
        float opencl_version;
        versstream >> tmp >> opencl_version;

        std::vector<cl::Device> devices;
        try {
            p.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        } catch (const cl::Error& e) {
            myprintf("Error getting device(s): %s: %d\n", e.what(), e.err());
            devices.clear();
        }
        for (auto& d : devices) {
            if (!silent) {
                myprintf("Device ID: %d\n", id);
                myprintf("Device name: %s\n",
                         trim(d.getInfo<CL_DEVICE_NAME>()).c_str());
                myprintf("Device type: %s\n",
                         opencl_dev_type_to_string(
                             d.getInfo<CL_DEVICE_TYPE>()).c_str());
                myprintf("Device vendor: %s\n",
                         d.getInfo<CL_DEVICE_VENDOR>().c_str());
                myprintf("Device driver: %s\n",
                         d.getInfo<CL_DRIVER_VERSION>().c_str());
                myprintf("Device speed: %u MHz\n",
                         d.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>());
                myprintf("Device cores: %u CU\n",
                         d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
            }

            // Assign a score and try to find the best device.
            int this_score = 0;
            std::string this_vendor = d.getInfo<CL_DEVICE_VENDOR>();
            this_score += 1000 * boost::icontains(this_vendor, "advanced micro devices");
            this_score += 1000 * boost::icontains(this_vendor, "amd");
            this_score += 1000 * boost::icontains(this_vendor, "nvidia");
            this_score += 500 * boost::icontains(this_vendor, "intel");
            this_score += 100 * (d.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU);
            this_score += opencl_version * 10;
            if (!silent) {
                myprintf("Device score: %d\n", this_score);
            }

            bool preferred = (gpu == id);

            // Do not reject CPU-only OpenCL providers (the commented-out
            // check below), because some users only have those (e.g. pocl).
            if (((this_score > best_score)
                 /*&& (d.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_CPU)*/)
                || preferred) {
                best_version = opencl_version;
                best_platform = p;
                best_device = d;
                best_vendor = this_vendor;
                if (preferred) {
                    best_score =
                        std::numeric_limits<decltype(best_score)>::max();
                } else {
                    best_score = this_score;
                }
                found_device = true;
            }
            id++;
        }
    }

    if (!found_device) {
        throw std::runtime_error("No suitable OpenCL device found.");
    }

    myprintf("Selected platform: %s\n",
             best_platform.getInfo<CL_PLATFORM_NAME>().c_str());
    myprintf("Selected device: %s\n",
             trim(best_device.getInfo<CL_DEVICE_NAME>()).c_str());
    myprintf("with OpenCL %2.1f capability.\n", best_version);

    cl::Context context;
    try {
        context = cl::Context(best_device);
    } catch (const cl::Error& e) {
        myprintf("Error creating OpenCL context: %s: %d", e.what(), e.err());
        throw std::runtime_error("Error creating OpenCL context.");
    }
    m_context = context;
    m_device = best_device;

    m_cl_args = getClArgs<net_t>();

    myprintf("Half precision compute support: ");
    if (m_device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp16")
        != std::string::npos) {
        myprintf("Yes.\n");
        m_fp16_compute = true;
        m_cl_args += " -DFP16_SUPPORT";
    } else {
        myprintf("No.\n");
    }

    myprintf("Tensor Core support: ");
    try {
        cl::Program(m_context, sourceCode_tensorcore_test)
            .build(m_cl_args.c_str());
        m_tensorcore = true;
        myprintf("Yes.\n");
    } catch (...) {
        myprintf("No.\n");
    }
}

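// Builds the OpenCL program for the selected device: concatenate the
// kernel sources, run the SGEMM tuner to pick XgemmBatched parameters,
// then compile with those parameters appended as -D defines. Afterwards,
// the kernel's preferred work-group size multiple is taken as the
// wavefront (AMD) / warp (NVIDIA) size.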
template <typename net_t>
void OpenCL<net_t>::initialize(const int channels, size_t batch_size) {
    m_batch_size = batch_size;
    // Make a program from the source code in the context.
    try {
        m_program = cl::Program(m_context,
                                sourceCode_common
                                + sourceCode_config
                                + sourceCode_convolve1
                                + sourceCode_convolve3
                                + sourceCode_sgemm);
    } catch (const cl::Error& e) {
        myprintf("Error getting kernels: %s: %d", e.what(), e.err());
        throw std::runtime_error("Error getting OpenCL kernels.");
    }

    auto t = Tuner<net_t>(*this, m_context, m_device);
    if (m_tensorcore) {
        t.enable_tensorcore();
    }

    auto sgemm_tuners =
        t.load_sgemm_tuners(channels, batch_size * WINOGRAD_P,
                            channels, WINOGRAD_TILE);

    // Some NVIDIA drivers are buggy and will fail to compile the rest of
    // the kernels after a tuning run.
    if (cfg_tune_only) {
        // Originally this was an exit(), but that would make the tuner
        // only tune the first GPU. Return instead; exit() is called after
        // all GPUs have been created.
        return;
    }

    // Build the program for these specific devices.
    try {
        std::string args = m_cl_args;
        // Intel iGPUs need vector types in the math for best performance.
        if (m_device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>() > 1) {
            args += " -DWINOGRAD_SIMD";
        }

        args += sgemm_tuners;
        m_program.build(args.c_str());
    } catch (const cl::Error&) {
        myprintf("Error building kernels: %s\n",
                 m_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(m_device).c_str());
        throw std::runtime_error("Error building OpenCL kernels.");
    }

    OpenCLContext tdata;
    ensure_context_initialized(tdata);

    process_tuners(sgemm_tuners);

    m_wavefront_size =
        tdata.m_sgemm_kernel.getWorkGroupInfo<
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(m_device);
    myprintf("Wavefront/Warp size: %d\n", int(m_wavefront_size));

    m_max_workgroup_size = m_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    m_max_workgroup_dims = m_device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    myprintf("Max workgroup size: %d\n", int(m_max_workgroup_size));
    myprintf("Max workgroup dimensions: ");
    for (auto d : m_max_workgroup_dims) {
        myprintf("%d ", int(d));
    }
    myprintf("\n");

    m_init_ok = true;
}

template <typename net_t>
bool OpenCL<net_t>::has_fp16_compute() {
    return m_fp16_compute;
}

template <typename net_t>
bool OpenCL<net_t>::has_tensor_cores() {
    return m_tensorcore;
}

template <typename net_t>
std::string OpenCL<net_t>::get_device_name() {
    std::stringstream ss;

    ss << "OpenCL: ";
    ss << m_device.getInfo<CL_DEVICE_VENDOR>() << " ";
    ss << m_device.getInfo<CL_DEVICE_NAME>() << " @ ";
    ss << m_device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << "MHz";

    return ss.str();
}

template class OpenCL<float>;
template class OpenCL_Network<float>;
#ifdef USE_HALF
template class OpenCL<half_float::half>;
template class OpenCL_Network<half_float::half>;
#endif

#endif