/*
    This file is part of Leela Zero.
    Copyright (C) 2018-2019 Junhee Yoo and contributors

    Leela Zero is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Leela Zero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Leela Zero.  If not, see <http://www.gnu.org/licenses/>.

    Additional permission under GNU GPL version 3 section 7

    If you modify this Program, or any covered work, by linking or
    combining it with NVIDIA Corporation's libraries from the
    NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
    Network library and/or the NVIDIA TensorRT inference library
    (or a modified version of those libraries), containing parts covered
    by the terms of the respective license agreement, the licensors of
    this Program grant you additional permission to convey the resulting
    work.
*/
#include "config.h"

#ifdef USE_OPENCL

#include "GTP.h"
#include "Random.h"
#include "Network.h"
#include "Utils.h"
#include "OpenCLScheduler.h"

using Utils::ceilMultiple;
using Utils::myprintf;

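// Helper that wraps a std::vector<float> and converts it implicitly to
// either a const float vector reference (no copy) or a half_float::half
// vector (copy with per-element conversion), so the same call site can
// feed both the fp32 and fp16 network types.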
class from_float {
public:
    from_float(const std::vector<float>& f) : m_f(f) {}

    operator const std::vector<float>&() {
        return m_f;
    }

    operator std::vector<half_float::half>() {
        auto ret = std::vector<half_float::half>(m_f.size());
        std::copy(cbegin(m_f), cend(m_f), begin(ret));
        return ret;
    }
private:
    const std::vector<float>& m_f;
};

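// Pads the Winograd-transformed weight tensor U from (outputs x channels)
// to (outputs_pad x channels_pad) per tile element, filling the extra
// entries with zeros so the SGEMM kernels can assume tuner-aligned sizes.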
template <typename T>
static std::vector<T> zeropad_U(const std::vector<float>& U,
                                const int outputs, const int channels,
                                const int outputs_pad,
                                const int channels_pad) {
    // Fill with zeroes
    auto Upad =
        std::vector<T>(WINOGRAD_TILE * outputs_pad * channels_pad);

    for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
        for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
            for (auto c = 0; c < channels; c++) {
                for (auto o = 0; o < outputs; o++) {
                    Upad[xi * (WINOGRAD_ALPHA * outputs_pad * channels_pad)
                         + nu * (outputs_pad * channels_pad)
                         + c * outputs_pad
                         + o] =
                        U[xi * (WINOGRAD_ALPHA * outputs * channels)
                          + nu * (outputs * channels)
                          + c * outputs
                          + o];
                }
            }
        }
    }

    return Upad;
}

template <typename net_t>
OpenCLScheduler<net_t>::OpenCLScheduler() {
    // Create one OpenCL context and network per configured GPU.
    auto gpus = cfg_gpus;

    // An empty GPU list from the command line represents autodetect.
    // Put a minus one GPU index here.
    if (gpus.empty()) {
        gpus = {-1};
    }

    auto silent{false};

    for (auto gpu : gpus) {
        auto opencl = std::make_unique<OpenCL<net_t>>(gpu, silent);
        auto net = std::make_unique<OpenCL_Network<net_t>>(*opencl);
        m_opencl.push_back(std::move(opencl));
        m_networks.push_back(std::move(net));

        // When starting the next GPU, don't dump the full list of GPUs again.
        silent = true;
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::initialize(const int channels) {
    // Launch the worker threads.  Minimum 1 worker per GPU, but use enough
    // threads so that we can at least concurrently schedule something to
    // the GPU.
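    // For example, with cfg_num_threads = 4, cfg_batch_size = 1 and a
    // single GPU, this yields 4 / 1 / (1 + 1) + 1 = 3 workers for that GPU
    // (illustrative numbers, not defaults).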
    auto num_worker_threads = cfg_num_threads / cfg_batch_size / (m_opencl.size() + 1) + 1;
    auto gnum = 0;
    for (auto& opencl : m_opencl) {
        opencl->initialize(channels, cfg_batch_size);

        for (auto i = unsigned{0}; i < num_worker_threads; i++) {
            auto t = std::thread(&OpenCLScheduler<net_t>::batch_worker, this, gnum);
            m_worker_threads.push_back(std::move(t));
        }
        gnum++;
    }

    // Exit immediately after tuning.  We have to exit here because we
    // skipped initializing the rest of the kernels, as some NVIDIA drivers
    // crash during that step.
    if (cfg_tune_only) {
        exit(EXIT_SUCCESS);
    }
}

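// Shut down the worker threads: clear m_running under the lock, wake every
// worker blocked on the condition variable, then join them all.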
template <typename net_t>
OpenCLScheduler<net_t>::~OpenCLScheduler() {
    {
        std::unique_lock<std::mutex> lk(m_mutex);
        m_running = false;
    }
    m_cv.notify_all();
    for (auto& x : m_worker_threads) {
        x.join();
    }
}

template <typename net_t>
bool OpenCLScheduler<net_t>::needs_autodetect() {
    for (auto& opencl : m_opencl) {
        // If any card has no native fp16 compute, we'll have to benchmark.
        if (!opencl->has_fp16_compute() && !opencl->has_tensor_cores()) {
            return true;
        }
    }
    return false;
}

158 template <typename net_t>
push_input_convolution(unsigned int filter_size,unsigned int channels,unsigned int outputs,const std::vector<float> & weights,const std::vector<float> & means,const std::vector<float> & variances)159 void OpenCLScheduler<net_t>::push_input_convolution(
160     unsigned int filter_size,
161     unsigned int channels,
162     unsigned int outputs,
163     const std::vector<float>& weights,
164     const std::vector<float>& means,
165     const std::vector<float>& variances) {
166 
167     for (const auto& opencl_net : m_networks) {
168         const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners();
169 
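        // SGEMM tuner parameters (assumed layout of the tuner vector):
        // MWG tiles the output dimension, KWG the inner dimension, and VWM
        // is the vector width; padded sizes must be multiples of these.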
        const auto mwg = tuners[0];
        const auto kwg = tuners[2];
        const auto vwm = tuners[3];

        const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm);
        const auto k_ceil = ceilMultiple(ceilMultiple(channels, kwg), vwm);

        const auto Upad = zeropad_U<net_t>(weights,
                                           outputs, channels,
                                           m_ceil, k_ceil);
        opencl_net->push_input_convolution(
            filter_size, channels, outputs,
            Upad, from_float(means), from_float(variances)
        );
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_residual(unsigned int filter_size,
                                           unsigned int channels,
                                           unsigned int outputs,
                                           const std::vector<float>& weights_1,
                                           const std::vector<float>& means_1,
                                           const std::vector<float>& variances_1,
                                           const std::vector<float>& weights_2,
                                           const std::vector<float>& means_2,
                                           const std::vector<float>& variances_2) {
    for (const auto& opencl_net : m_networks) {
        const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners();

        const auto mwg = tuners[0];
        const auto vwm = tuners[3];

        const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm);
        const auto Upad1 = zeropad_U<net_t>(weights_1,
                                            outputs, outputs,
                                            m_ceil, m_ceil);
        const auto Upad2 = zeropad_U<net_t>(weights_2,
                                            outputs, outputs,
                                            m_ceil, m_ceil);
        opencl_net->push_residual(filter_size, channels, outputs,
                                  Upad1,
                                  from_float(means_1),
                                  from_float(variances_1),
                                  Upad2,
                                  from_float(means_2),
                                  from_float(variances_2));
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_convolve(unsigned int filter_size,
                                           unsigned int channels,
                                           unsigned int outputs,
                                           const std::vector<float>& weights) {
    for (const auto& opencl_net : m_networks) {
        opencl_net->push_convolve(filter_size, channels, outputs,
                                  from_float(weights));
    }
}

template <typename net_t>
void OpenCLScheduler<net_t>::push_weights(
    unsigned int filter_size,
    unsigned int channels,
    unsigned int outputs,
    std::shared_ptr<const ForwardPipeWeights> weights) {

    auto weight_index = size_t{0};

    // Winograd filter transformation changes filter size to 4x4
    push_input_convolution(filter_size, channels, outputs,
                           weights->m_conv_weights[weight_index],
                           weights->m_batchnorm_means[weight_index],
                           weights->m_batchnorm_stddevs[weight_index]);
    weight_index++;

    // Residual blocks: every entry after the first belongs to the residual
    // tower, and each block consumes two convolutions.
    for (auto i = size_t{0}; i < weights->m_conv_weights.size() / 2; i++) {
        push_residual(filter_size, outputs, outputs,
                      weights->m_conv_weights[weight_index],
                      weights->m_batchnorm_means[weight_index],
                      weights->m_batchnorm_stddevs[weight_index],
                      weights->m_conv_weights[weight_index + 1],
                      weights->m_batchnorm_means[weight_index + 1],
                      weights->m_batchnorm_stddevs[weight_index + 1]);
        weight_index += 2;
    }

    // Output head convolutions
    push_convolve(1, outputs, Network::OUTPUTS_POLICY, weights->m_conv_pol_w);
    push_convolve(1, outputs, Network::OUTPUTS_VALUE, weights->m_conv_val_w);
}

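// Blocking forward(): enqueue the request and sleep on its condition
// variable until a batch worker has filled output_pol/output_val and
// called notify_all() on the entry.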
template <typename net_t>
void OpenCLScheduler<net_t>::forward(const std::vector<float>& input,
                                     std::vector<float>& output_pol,
                                     std::vector<float>& output_val) {
    auto entry = std::make_shared<ForwardQueueEntry>(input, output_pol, output_val);
    std::unique_lock<std::mutex> lk(entry->mutex);
    {
        std::unique_lock<std::mutex> lk(m_mutex);
        m_forward_queue.push_back(entry);

        if (m_single_eval_in_progress.load()) {
            m_waittime += 2;
        }
    }
    m_cv.notify_one();
    entry->cv.wait(lk);
}

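// Debug-only counters recording how many evaluations ran as single evals
// versus full batches.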
#ifndef NDEBUG
struct batch_stats_t batch_stats;
#endif

template <typename net_t>
void OpenCLScheduler<net_t>::batch_worker(const size_t gnum) {
    constexpr auto in_size = Network::INPUT_CHANNELS * BOARD_SIZE * BOARD_SIZE;
    constexpr auto out_pol_size = Network::OUTPUTS_POLICY * BOARD_SIZE * BOARD_SIZE;
    constexpr auto out_val_size = Network::OUTPUTS_VALUE * BOARD_SIZE * BOARD_SIZE;

    OpenCLContext context;

    // Batch scheduling heuristic.
    // Returns the batch picked up from the queue (m_forward_queue):
    // 1) wait up to m_waittime milliseconds for a full batch;
    // 2) if we still don't have a full batch, just do a single eval.
    //
    // The purpose of m_waittime is to prevent the system from deadlocking
    // because we were waiting for a job too long, while the job is never
    // going to come due to a control dependency (e.g., evals stuck on a
    // critical path).  To do so:
    //
    // 1) if we couldn't form a batch after waiting m_waittime ms, it means
    // that we hit the critical path and should do scalar evals.
    // Wait 1ms shorter next time.
    //
    // 2) if we picked up a single eval, but were getting additional evals
    // while that single eval was being processed, it means that we made
    // the wrong decision.  Wait 2ms longer next time.

    auto pickup_task = [this] () {
        std::list<std::shared_ptr<ForwardQueueEntry>> inputs;
        size_t count = 0;

        std::unique_lock<std::mutex> lk(m_mutex);
        while (true) {
            if (!m_running) return inputs;

            count = m_forward_queue.size();
            if (count >= cfg_batch_size) {
                count = cfg_batch_size;
                break;
            }

            bool timeout = !m_cv.wait_for(
                lk,
                std::chrono::milliseconds(m_waittime),
                [this] () {
                    return !m_running || m_forward_queue.size() >= cfg_batch_size;
                }
            );

            if (!m_forward_queue.empty()) {
                if (timeout && m_single_eval_in_progress.exchange(true) == false) {
                    // Waited long enough but couldn't form a batch.
                    // Check if there is any other single eval in progress,
                    // and if not, do one from this thread.
                    if (m_waittime > 1) {
                        m_waittime--;
                    }
                    count = 1;
                    break;
                }
            }
        }
        // Move 'count' evals from the shared queue to the local list.
        auto end = begin(m_forward_queue);
        std::advance(end, count);
        std::move(begin(m_forward_queue), end, std::back_inserter(inputs));
        m_forward_queue.erase(begin(m_forward_queue), end);

        return inputs;
    };

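    // Reuse these buffers across iterations so each batch doesn't have to
    // reallocate its input and output storage.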
    auto batch_input = std::vector<float>();
    auto batch_output_pol = std::vector<float>();
    auto batch_output_val = std::vector<float>();

    while (true) {
        auto inputs = pickup_task();
        auto count = inputs.size();

        if (!m_running) {
            return;
        }

#ifndef NDEBUG
        if (count == 1) {
            batch_stats.single_evals++;
        } else {
            batch_stats.batch_evals++;
        }
#endif

        // Prepare the input for the forward() call.
        batch_input.resize(in_size * count);
        batch_output_pol.resize(out_pol_size * count);
        batch_output_val.resize(out_val_size * count);

        auto index = size_t{0};
        for (auto& x : inputs) {
            std::unique_lock<std::mutex> lk(x->mutex);
            std::copy(begin(x->in), end(x->in), begin(batch_input) + in_size * index);
            index++;
        }

        // Run the NN evaluation.
        m_networks[gnum]->forward(
            batch_input, batch_output_pol, batch_output_val, context, count);

        // Get the output, copy it back to the waiting entries and wake them.
        index = 0;
        for (auto& x : inputs) {
            std::copy(begin(batch_output_pol) + out_pol_size * index,
                      begin(batch_output_pol) + out_pol_size * (index + 1),
                      begin(x->out_p));
            std::copy(begin(batch_output_val) + out_val_size * index,
                      begin(batch_output_val) + out_val_size * (index + 1),
                      begin(x->out_v));
            x->cv.notify_all();
            index++;
        }

        if (count == 1) {
            m_single_eval_in_progress = false;
        }
    }
}

template class OpenCLScheduler<float>;
#ifdef USE_HALF
template class OpenCLScheduler<half_float::half>;
#endif

#endif