/*
    This file is part of Leela Zero.
    Copyright (C) 2018-2019 Junhee Yoo and contributors

    Leela Zero is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Leela Zero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Leela Zero.  If not, see <http://www.gnu.org/licenses/>.

    Additional permission under GNU GPL version 3 section 7

    If you modify this Program, or any covered work, by linking or
    combining it with NVIDIA Corporation's libraries from the
    NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
    Network library and/or the NVIDIA TensorRT inference library
    (or a modified version of those libraries), containing parts covered
    by the terms of the respective license agreement, the licensors of
    this Program grant you additional permission to convey the resulting
    work.
*/
#include "config.h"

#ifdef USE_OPENCL

#include "GTP.h"
#include "Random.h"
#include "Network.h"
#include "Utils.h"
#include "OpenCLScheduler.h"

using Utils::ceilMultiple;
using Utils::myprintf;

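// Adapter that wraps a reference to a vector of float weights and converts
// it implicitly to either a float vector or a half_float::half vector, so
// the same push_* call sites work for both precision variants of the network.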
class from_float {
public:
    from_float(const std::vector<float>& f) : m_f(f) {}

    operator const std::vector<float>&() {
        return m_f;
    }

    operator std::vector<half_float::half>() {
        auto ret = std::vector<half_float::half>(m_f.size());
        std::copy(cbegin(m_f), cend(m_f), begin(ret));
        return ret;
    }
private:
    const std::vector<float>& m_f;
};

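// Zero-pads the Winograd-transformed weights U from (outputs x channels) per
// tile to (outputs_pad x channels_pad) per tile, so that the SGEMM dimensions
// are multiples of the tuned tile sizes.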
template <typename T>
static std::vector<T> zeropad_U(const std::vector<float>& U,
                                const int outputs, const int channels,
                                const int outputs_pad,
                                const int channels_pad) {
    // Fill with zeroes
    auto Upad =
        std::vector<T>(WINOGRAD_TILE * outputs_pad * channels_pad);

    for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
        for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
            for (auto c = 0; c < channels; c++) {
                for (auto o = 0; o < outputs; o++) {
                    Upad[xi * (WINOGRAD_ALPHA * outputs_pad * channels_pad)
                         + nu * (outputs_pad * channels_pad)
                         + c * outputs_pad
                         + o] =
                        U[xi * (WINOGRAD_ALPHA * outputs * channels)
                          + nu * (outputs * channels)
                          + c * outputs
                          + o];
                }
            }
        }
    }

    return Upad;
}

template <typename net_t>
OpenCLScheduler<net_t>::OpenCLScheduler() {
    // multi-gpu?
    auto gpus = cfg_gpus;

    // An empty GPU list from the command line means autodetect.
    // Signal this with a single -1 GPU index.
    if (gpus.empty()) {
        gpus = {-1};
    }

    auto silent{false};

    for (auto gpu : gpus) {
        auto opencl = std::make_unique<OpenCL<net_t>>(gpu, silent);
        auto net = std::make_unique<OpenCL_Network<net_t>>(*opencl);
        m_opencl.push_back(std::move(opencl));
        m_networks.push_back(std::move(net));

        // When starting the next GPU, don't dump the full list of GPUs again.
        silent = true;
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::initialize(const int channels) {
    // Launch the worker threads.  Minimum 1 worker per GPU, but use enough
    // threads so that we can at least concurrently schedule something to
    // the GPU.
    auto num_worker_threads =
        cfg_num_threads / cfg_batch_size / (m_opencl.size() + 1) + 1;
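    // Example with illustrative numbers: cfg_num_threads = 12,
    // cfg_batch_size = 2 and one GPU give 12 / 2 / (1 + 1) + 1 = 4
    // worker threads for that GPU.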
    auto gnum = 0;
    for (auto& opencl : m_opencl) {
        opencl->initialize(channels, cfg_batch_size);

        for (auto i = unsigned{0}; i < num_worker_threads; i++) {
            auto t = std::thread(&OpenCLScheduler<net_t>::batch_worker,
                                 this, gnum);
            m_worker_threads.push_back(std::move(t));
        }
        gnum++;
    }

    // Exit immediately after tuning.  We have to exit here because we
    // skipped initializing the rest of the kernels due to some NVIDIA
    // drivers crashing.
    if (cfg_tune_only) {
        exit(EXIT_SUCCESS);
    }
}

template <typename net_t>
OpenCLScheduler<net_t>::~OpenCLScheduler() {
    {
        std::unique_lock<std::mutex> lk(m_mutex);
        m_running = false;
    }
    m_cv.notify_all();
    for (auto& x : m_worker_threads) {
        x.join();
    }
}

template <typename net_t>
bool OpenCLScheduler<net_t>::needs_autodetect() {
    for (auto& opencl : m_opencl) {
        // If any card has no native fp16 compute, we'll have to benchmark.
        if (!opencl->has_fp16_compute() && !opencl->has_tensor_cores()) {
            return true;
        }
    }
    return false;
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_input_convolution(
    unsigned int filter_size,
    unsigned int channels,
    unsigned int outputs,
    const std::vector<float>& weights,
    const std::vector<float>& means,
    const std::vector<float>& variances) {

    for (const auto& opencl_net : m_networks) {
        const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners();

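        // Tuned SGEMM tile sizes; the Winograd weights are padded below so
        // that the output and channel dimensions divide evenly by them.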
        const auto mwg = tuners[0];
        const auto kwg = tuners[2];
        const auto vwm = tuners[3];

        const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm);
        const auto k_ceil = ceilMultiple(ceilMultiple(channels, kwg), vwm);

        const auto Upad = zeropad_U<net_t>(weights,
                                           outputs, channels,
                                           m_ceil, k_ceil);
        opencl_net->push_input_convolution(
            filter_size, channels, outputs,
            Upad, from_float(means), from_float(variances)
        );
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_residual(unsigned int filter_size,
                                           unsigned int channels,
                                           unsigned int outputs,
                                           const std::vector<float>& weights_1,
                                           const std::vector<float>& means_1,
                                           const std::vector<float>& variances_1,
                                           const std::vector<float>& weights_2,
                                           const std::vector<float>& means_2,
                                           const std::vector<float>& variances_2) {
    for (const auto& opencl_net : m_networks) {
        const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners();

        const auto mwg = tuners[0];
        const auto vwm = tuners[3];

        const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm);
        const auto Upad1 = zeropad_U<net_t>(weights_1,
                                            outputs, outputs,
                                            m_ceil, m_ceil);
        const auto Upad2 = zeropad_U<net_t>(weights_2,
                                            outputs, outputs,
                                            m_ceil, m_ceil);
        opencl_net->push_residual(filter_size, channels, outputs,
                                  Upad1,
                                  from_float(means_1),
                                  from_float(variances_1),
                                  Upad2,
                                  from_float(means_2),
                                  from_float(variances_2));
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_convolve(unsigned int filter_size,
                                           unsigned int channels,
                                           unsigned int outputs,
                                           const std::vector<float>& weights) {
    for (const auto& opencl_net : m_networks) {
        opencl_net->push_convolve(filter_size, channels, outputs,
                                  from_float(weights));
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_weights(
    unsigned int filter_size,
    unsigned int channels,
    unsigned int outputs,
    std::shared_ptr<const ForwardPipeWeights> weights) {

    auto weight_index = size_t{0};

    // Winograd filter transformation changes filter size to 4x4
    push_input_convolution(filter_size, channels, outputs,
                           weights->m_conv_weights[weight_index],
                           weights->m_batchnorm_means[weight_index],
                           weights->m_batchnorm_stddevs[weight_index]);
    weight_index++;

    // Residual blocks: except for the first entry, all remaining
    // convolutions belong to the residual tower.
    for (auto i = size_t{0}; i < weights->m_conv_weights.size() / 2; i++) {
        push_residual(filter_size, outputs, outputs,
                      weights->m_conv_weights[weight_index],
                      weights->m_batchnorm_means[weight_index],
                      weights->m_batchnorm_stddevs[weight_index],
                      weights->m_conv_weights[weight_index + 1],
                      weights->m_batchnorm_means[weight_index + 1],
                      weights->m_batchnorm_stddevs[weight_index + 1]);
        weight_index += 2;
    }

    // Output head convolutions
    push_convolve(1, outputs, Network::OUTPUTS_POLICY, weights->m_conv_pol_w);
    push_convolve(1, outputs, Network::OUTPUTS_VALUE, weights->m_conv_val_w);
}

template <typename net_t>
void OpenCLScheduler<net_t>::forward(const std::vector<float>& input,
                                     std::vector<float>& output_pol,
                                     std::vector<float>& output_val) {
    auto entry =
        std::make_shared<ForwardQueueEntry>(input, output_pol, output_val);
    std::unique_lock<std::mutex> lk(entry->mutex);
    {
        std::unique_lock<std::mutex> lk(m_mutex);
        m_forward_queue.push_back(entry);

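        // A single eval is already in flight while new work keeps arriving,
        // so the batching heuristic guessed wrong; have the workers wait
        // longer for a full batch next time (see batch_worker()).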
        if (m_single_eval_in_progress.load()) {
            m_waittime += 2;
        }
    }
    m_cv.notify_one();
    entry->cv.wait(lk);
}

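// Debug-only counters for how many evaluations ran as single evals versus
// full batches.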
#ifndef NDEBUG
struct batch_stats_t batch_stats;
#endif

template <typename net_t>
void OpenCLScheduler<net_t>::batch_worker(const size_t gnum) {
    constexpr auto in_size = Network::INPUT_CHANNELS * BOARD_SIZE * BOARD_SIZE;
    constexpr auto out_pol_size =
        Network::OUTPUTS_POLICY * BOARD_SIZE * BOARD_SIZE;
    constexpr auto out_val_size =
        Network::OUTPUTS_VALUE * BOARD_SIZE * BOARD_SIZE;

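    // OpenCL resources private to this worker thread; handed to every
    // forward() call below.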
    OpenCLContext context;

    // Batch scheduling heuristic.
    // Returns the batch picked up from the queue (m_forward_queue):
    // 1) Wait up to m_waittime milliseconds for a full batch.
    // 2) If we still don't have a full batch, just do a single eval.
    //
    // The purpose of m_waittime is to prevent the system from deadlocking
    // because we waited too long for a job that is never going to come due
    // to a control dependency (e.g., evals stuck on a critical path).
    // To do so:
    //
    // 1) If we couldn't form a batch after waiting m_waittime ms, it means
    //    we hit the critical path and should do scalar evals.
    //    Wait 1ms shorter next time.
    //
    // 2) If we picked up a single eval but additional evals kept arriving
    //    while it was being processed, we made the wrong decision.
    //    Wait 2ms longer next time.

    auto pickup_task = [this] () {
        std::list<std::shared_ptr<ForwardQueueEntry>> inputs;
        size_t count = 0;

        std::unique_lock<std::mutex> lk(m_mutex);
        while (true) {
            if (!m_running) return inputs;

            count = m_forward_queue.size();
            if (count >= cfg_batch_size) {
                count = cfg_batch_size;
                break;
            }

            bool timeout = !m_cv.wait_for(
                lk,
                std::chrono::milliseconds(m_waittime),
                [this] () {
                    return !m_running
                           || m_forward_queue.size() >= cfg_batch_size;
                }
            );

            if (!m_forward_queue.empty()) {
                if (timeout
                    && m_single_eval_in_progress.exchange(true) == false) {
                    // Waited long enough but couldn't form a batch.
                    // Check if there is any other single eval in progress,
                    // and if not, do one from this thread.
                    if (m_waittime > 1) {
                        m_waittime--;
                    }
                    count = 1;
                    break;
                }
            }
        }
        // Move 'count' evals from the shared queue to the local list.
        auto end = begin(m_forward_queue);
        std::advance(end, count);
        std::move(begin(m_forward_queue), end, std::back_inserter(inputs));
        m_forward_queue.erase(begin(m_forward_queue), end);

        return inputs;
    };

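    // Buffers reused across loop iterations for the batched forward pass;
    // they are resized to the current batch size each time around.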
    auto batch_input = std::vector<float>();
    auto batch_output_pol = std::vector<float>();
    auto batch_output_val = std::vector<float>();

    while (true) {
        auto inputs = pickup_task();
        auto count = inputs.size();

        if (!m_running) {
            return;
        }

#ifndef NDEBUG
        if (count == 1) {
            batch_stats.single_evals++;
        } else {
            batch_stats.batch_evals++;
        }
#endif

        // Prepare input for the forward() call.
        batch_input.resize(in_size * count);
        batch_output_pol.resize(out_pol_size * count);
        batch_output_val.resize(out_val_size * count);

        auto index = size_t{0};
        for (auto& x : inputs) {
            std::unique_lock<std::mutex> lk(x->mutex);
            std::copy(begin(x->in), end(x->in),
                      begin(batch_input) + in_size * index);
            index++;
        }

        // Run the NN evaluation.
        m_networks[gnum]->forward(
            batch_input, batch_output_pol, batch_output_val, context, count);

        // Get the output and copy it back.
        index = 0;
        for (auto& x : inputs) {
            std::copy(begin(batch_output_pol) + out_pol_size * index,
                      begin(batch_output_pol) + out_pol_size * (index + 1),
                      begin(x->out_p));
            std::copy(begin(batch_output_val) + out_val_size * index,
                      begin(batch_output_val) + out_val_size * (index + 1),
                      begin(x->out_v));
            x->cv.notify_all();
            index++;
        }

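        // If this was a fallback scalar eval, release the flag so another
        // thread may claim the single-eval slot.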
        if (count == 1) {
            m_single_eval_in_progress = false;
        }
    }
}

template class OpenCLScheduler<float>;
#ifdef USE_HALF
template class OpenCLScheduler<half_float::half>;
#endif

#endif