1 /*
2 This file is part of Leela Zero.
3 Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors
4
5 Leela Zero is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 Leela Zero is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with Leela Zero. If not, see <http://www.gnu.org/licenses/>.
17
18 Additional permission under GNU GPL version 3 section 7
19
20 If you modify this Program, or any covered work, by linking or
21 combining it with NVIDIA Corporation's libraries from the
22 NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
23 Network library and/or the NVIDIA TensorRT inference library
24 (or a modified version of those libraries), containing parts covered
25 by the terms of the respective license agreement, the licensors of
26 this Program grant you additional permission to convey the resulting
27 work.
28 */
29
30 #include "config.h"
31
32 #include <algorithm>
33 #include <array>
34 #include <cassert>
35 #include <cmath>
36 #include <iterator>
37 #include <memory>
38 #include <sstream>
39 #include <string>
40 #include <boost/utility.hpp>
41 #include <boost/format.hpp>
42 #include <boost/spirit/home/x3.hpp>
43 #ifndef USE_BLAS
44 #include <Eigen/Dense>
45 #endif
46
47 #ifdef __APPLE__
48 #include <Accelerate/Accelerate.h>
49 #endif
50 #ifdef USE_MKL
51 #include <mkl.h>
52 #endif
53 #ifdef USE_OPENBLAS
54 #include <cblas.h>
55 #endif
56 #include "zlib.h"
57
58 #include "Network.h"
59 #include "CPUPipe.h"
60 #ifdef USE_OPENCL
61 #include "OpenCLScheduler.h"
62 #include "UCTNode.h"
63 #endif
64 #include "FastBoard.h"
65 #include "FastState.h"
66 #include "FullBoard.h"
67 #include "GameState.h"
68 #include "GTP.h"
69 #include "NNCache.h"
70 #include "Random.h"
71 #include "ThreadPool.h"
72 #include "Timing.h"
73 #include "Utils.h"
74
75 namespace x3 = boost::spirit::x3;
76 using namespace Utils;
77
78 #ifndef USE_BLAS
79 // Eigen helpers
80 template <typename T>
81 using EigenVectorMap =
82 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
83 template <typename T>
84 using ConstEigenVectorMap =
85 Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
86 template <typename T>
87 using ConstEigenMatrixMap =
88 Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
89 #endif
90
91 // Symmetry helper
92 static std::array<std::array<int, NUM_INTERSECTIONS>,
93 Network::NUM_SYMMETRIES> symmetry_nn_idx_table;
94
benchmark_time(int centiseconds)95 float Network::benchmark_time(int centiseconds) {
96 const auto cpus = cfg_num_threads;
97
98 ThreadGroup tg(thread_pool);
99 std::atomic<int> runcount{0};
100
101 GameState state;
102 state.init_game(BOARD_SIZE, KOMI);
103
104 // As a sanity run, try one run with self check.
105 // Isn't enough to guarantee correctness but better than nothing,
106 // plus for large nets self-check takes a while (1~3 eval per second)
107 get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false, true, true);
108
109 const Time start;
110 for (auto i = size_t{0}; i < cpus; i++) {
111 tg.add_task([this, &runcount, start, centiseconds, state]() {
112 while (true) {
113 runcount++;
114 get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false);
115 const Time end;
116 const auto elapsed = Time::timediff_centis(start, end);
117 if (elapsed >= centiseconds) {
118 break;
119 }
120 }
121 });
122 }
123 tg.wait_all();
124
125 const Time end;
126 const auto elapsed = Time::timediff_centis(start, end);
127 return 100.0f * runcount.load() / elapsed;
128 }
129
benchmark(const GameState * const state,const int iterations)130 void Network::benchmark(const GameState* const state, const int iterations) {
131 const auto cpus = cfg_num_threads;
132 const Time start;
133
134 ThreadGroup tg(thread_pool);
135 std::atomic<int> runcount{0};
136
137 for (auto i = size_t{0}; i < cpus; i++) {
138 tg.add_task([this, &runcount, iterations, state]() {
139 while (runcount < iterations) {
140 runcount++;
141 get_output(state, Ensemble::RANDOM_SYMMETRY, -1, false);
142 }
143 });
144 }
145 tg.wait_all();
146
147 const Time end;
148 const auto elapsed = Time::timediff_seconds(start, end);
149 myprintf("%5d evaluations in %5.2f seconds -> %d n/s\n",
150 runcount.load(), elapsed, int(runcount.load() / elapsed));
151 }
152
// Replaces every batchnorm variance in `weights` with the reciprocal of its
// standard deviation, 1 / sqrt(variance + epsilon), in place.
template<class container>
void process_bn_var(container& weights) {
    std::transform(std::begin(weights), std::end(weights),
                   std::begin(weights),
                   [](const float variance) {
                       // Small constant that keeps the square root away
                       // from zero.
                       constexpr float epsilon = 1e-5f;
                       return 1.0f / std::sqrt(variance + epsilon);
                   });
}
160
winograd_transform_f(const std::vector<float> & f,const int outputs,const int channels)161 std::vector<float> Network::winograd_transform_f(const std::vector<float>& f,
162 const int outputs,
163 const int channels) {
164 // F(4x4, 3x3) Winograd filter transformation
165 // transpose(G.dot(f).dot(G.transpose()))
166 // U matrix is transposed for better memory layout in SGEMM
167 auto U = std::vector<float>(WINOGRAD_TILE * outputs * channels);
168 const auto G = std::array<float, 3 * WINOGRAD_ALPHA>
169 { 1.0f, 0.0f, 0.0f,
170 -2.0f/3.0f, -SQ2/3.0f, -1.0f/3.0f,
171 -2.0f/3.0f, SQ2/3.0f, -1.0f/3.0f,
172 1.0f/6.0f, SQ2/6.0f, 1.0f/3.0f,
173 1.0f/6.0f, -SQ2/6.0f, 1.0f/3.0f,
174 0.0f, 0.0f, 1.0f};
175
176 auto temp = std::array<float, 3 * WINOGRAD_ALPHA>{};
177
178 constexpr auto max_buffersize = 8;
179 auto buffersize = max_buffersize;
180
181 if (outputs % buffersize != 0) {
182 buffersize = 1;
183 }
184
185 std::array<float, max_buffersize * WINOGRAD_ALPHA * WINOGRAD_ALPHA> buffer;
186
187 for (auto c = 0; c < channels; c++) {
188 for (auto o_b = 0; o_b < outputs/buffersize; o_b++) {
189 for (auto bufferline = 0; bufferline < buffersize; bufferline++) {
190 const auto o = o_b * buffersize + bufferline;
191
192 for (auto i = 0; i < WINOGRAD_ALPHA; i++) {
193 for (auto j = 0; j < 3; j++) {
194 auto acc = 0.0f;
195 for (auto k = 0; k < 3; k++) {
196 acc += G[i*3 + k] * f[o*channels*9 + c*9 + k*3 + j];
197 }
198 temp[i*3 + j] = acc;
199 }
200 }
201
202 for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
203 for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
204 auto acc = 0.0f;
205 for (auto k = 0; k < 3; k++) {
206 acc += temp[xi*3 + k] * G[nu*3 + k];
207 }
208 buffer[(xi * WINOGRAD_ALPHA + nu) * buffersize + bufferline] = acc;
209 }
210 }
211 }
212 for (auto i = 0; i < WINOGRAD_ALPHA * WINOGRAD_ALPHA; i++) {
213 for (auto entry = 0; entry < buffersize; entry++) {
214 const auto o = o_b * buffersize + entry;
215 U[i * outputs * channels
216 + c * outputs
217 + o] =
218 buffer[buffersize * i + entry];
219 }
220 }
221 }
222 }
223
224 return U;
225 }
226
load_v1_network(std::istream & wtfile)227 std::pair<int, int> Network::load_v1_network(std::istream& wtfile) {
228 // Count size of the network
229 myprintf("Detecting residual layers...");
230 // We are version 1 or 2
231 if (m_value_head_not_stm) {
232 myprintf("v%d...", 2);
233 } else {
234 myprintf("v%d...", 1);
235 }
236 // First line was the version number
237 auto linecount = size_t{1};
238 auto channels = 0;
239 auto line = std::string{};
240 while (std::getline(wtfile, line)) {
241 auto iss = std::stringstream{line};
242 // Third line of parameters are the convolution layer biases,
243 // so this tells us the amount of channels in the residual layers.
244 // We are assuming all layers have the same amount of filters.
245 if (linecount == 2) {
246 auto count = std::distance(std::istream_iterator<std::string>(iss),
247 std::istream_iterator<std::string>());
248 myprintf("%d channels...", count);
249 channels = count;
250 }
251 linecount++;
252 }
253 // 1 format id, 1 input layer (4 x weights), 14 ending weights,
254 // the rest are residuals, every residual has 8 x weight lines
255 auto residual_blocks = linecount - (1 + 4 + 14);
256 if (residual_blocks % 8 != 0) {
257 myprintf("\nInconsistent number of weights in the file.\n");
258 return {0, 0};
259 }
260 residual_blocks /= 8;
261 myprintf("%d blocks.\n", residual_blocks);
262
263 // Re-read file and process
264 wtfile.clear();
265 wtfile.seekg(0, std::ios::beg);
266
267 // Get the file format id out of the way
268 std::getline(wtfile, line);
269
270 const auto plain_conv_layers = 1 + (residual_blocks * 2);
271 const auto plain_conv_wts = plain_conv_layers * 4;
272 linecount = 0;
273 while (std::getline(wtfile, line)) {
274 std::vector<float> weights;
275 auto it_line = line.cbegin();
276 const auto ok = phrase_parse(it_line, line.cend(),
277 *x3::float_, x3::space, weights);
278 if (!ok || it_line != line.cend()) {
279 myprintf("\nFailed to parse weight file. Error on line %d.\n",
280 linecount + 2); //+1 from version line, +1 from 0-indexing
281 return {0, 0};
282 }
283 if (linecount < plain_conv_wts) {
284 if (linecount % 4 == 0) {
285 m_fwd_weights->m_conv_weights.emplace_back(weights);
286 } else if (linecount % 4 == 1) {
287 // Redundant in our model, but they encode the
288 // number of outputs so we have to read them in.
289 m_fwd_weights->m_conv_biases.emplace_back(weights);
290 } else if (linecount % 4 == 2) {
291 m_fwd_weights->m_batchnorm_means.emplace_back(weights);
292 } else if (linecount % 4 == 3) {
293 process_bn_var(weights);
294 m_fwd_weights->m_batchnorm_stddevs.emplace_back(weights);
295 }
296 } else {
297 switch (linecount - plain_conv_wts) {
298 case 0: m_fwd_weights->m_conv_pol_w = std::move(weights); break;
299 case 1: m_fwd_weights->m_conv_pol_b = std::move(weights); break;
300 case 2: std::copy(cbegin(weights), cend(weights),
301 begin(m_bn_pol_w1)); break;
302 case 3: std::copy(cbegin(weights), cend(weights),
303 begin(m_bn_pol_w2)); break;
304 case 4: if (weights.size() != OUTPUTS_POLICY
305 * NUM_INTERSECTIONS
306 * POTENTIAL_MOVES) {
307 myprintf("The weights file is not for %dx%d boards.\n",
308 BOARD_SIZE, BOARD_SIZE);
309 return {0, 0};
310 }
311 std::copy(cbegin(weights), cend(weights),
312 begin(m_ip_pol_w)); break;
313 case 5: std::copy(cbegin(weights), cend(weights),
314 begin(m_ip_pol_b)); break;
315 case 6: m_fwd_weights->m_conv_val_w = std::move(weights); break;
316 case 7: m_fwd_weights->m_conv_val_b = std::move(weights); break;
317 case 8: std::copy(cbegin(weights), cend(weights),
318 begin(m_bn_val_w1)); break;
319 case 9: std::copy(cbegin(weights), cend(weights),
320 begin(m_bn_val_w2)); break;
321 case 10: std::copy(cbegin(weights), cend(weights),
322 begin(m_ip1_val_w)); break;
323 case 11: std::copy(cbegin(weights), cend(weights),
324 begin(m_ip1_val_b)); break;
325 case 12: std::copy(cbegin(weights), cend(weights),
326 begin(m_ip2_val_w)); break;
327 case 13: std::copy(cbegin(weights), cend(weights),
328 begin(m_ip2_val_b)); break;
329 }
330 }
331 linecount++;
332 }
333 process_bn_var(m_bn_pol_w2);
334 process_bn_var(m_bn_val_w2);
335
336 return {channels, static_cast<int>(residual_blocks)};
337 }
338
load_network_file(const std::string & filename)339 std::pair<int, int> Network::load_network_file(const std::string& filename) {
340 // gzopen supports both gz and non-gz files, will decompress
341 // or just read directly as needed.
342 auto gzhandle = gzopen(filename.c_str(), "rb");
343 if (gzhandle == nullptr) {
344 myprintf("Could not open weights file: %s\n", filename.c_str());
345 return {0, 0};
346 }
347 // Stream the gz file in to a memory buffer stream.
348 auto buffer = std::stringstream{};
349 constexpr auto chunkBufferSize = 64 * 1024;
350 std::vector<char> chunkBuffer(chunkBufferSize);
351 while (true) {
352 auto bytesRead = gzread(gzhandle, chunkBuffer.data(), chunkBufferSize);
353 if (bytesRead == 0) break;
354 if (bytesRead < 0) {
355 myprintf("Failed to decompress or read: %s\n", filename.c_str());
356 gzclose(gzhandle);
357 return {0, 0};
358 }
359 assert(bytesRead <= chunkBufferSize);
360 buffer.write(chunkBuffer.data(), bytesRead);
361 }
362 gzclose(gzhandle);
363
364 // Read format version
365 auto line = std::string{};
366 auto format_version = -1;
367 if (std::getline(buffer, line)) {
368 auto iss = std::stringstream{line};
369 // First line is the file format version id
370 iss >> format_version;
371 if (iss.fail() || (format_version != 1 && format_version != 2)) {
372 myprintf("Weights file is the wrong version.\n");
373 return {0, 0};
374 } else {
375 // Version 2 networks are identical to v1, except
376 // that they return the value for black instead of
377 // the player to move. This is used by ELF Open Go.
378 if (format_version == 2) {
379 m_value_head_not_stm = true;
380 } else {
381 m_value_head_not_stm = false;
382 }
383 return load_v1_network(buffer);
384 }
385 }
386 return {0, 0};
387 }
388
init_net(int channels,std::unique_ptr<ForwardPipe> && pipe)389 std::unique_ptr<ForwardPipe>&& Network::init_net(int channels,
390 std::unique_ptr<ForwardPipe>&& pipe) {
391
392 pipe->initialize(channels);
393 pipe->push_weights(WINOGRAD_ALPHA, INPUT_CHANNELS, channels, m_fwd_weights);
394
395 return std::move(pipe);
396 }
397
398 #ifdef USE_HALF
select_precision(int channels)399 void Network::select_precision(int channels) {
400 if (cfg_precision == precision_t::AUTO) {
401 auto score_fp16 = float{-1.0};
402 auto score_fp32 = float{-1.0};
403
404 myprintf("Initializing OpenCL (autodetecting precision).\n");
405
406 // Setup fp16 here so that we can see if we can skip autodetect.
407 // However, if fp16 sanity check fails we will return a fp32 and pray it works.
408 auto fp16_net = std::make_unique<OpenCLScheduler<half_float::half>>();
409 if (!fp16_net->needs_autodetect()) {
410 try {
411 myprintf("OpenCL: using fp16/half or tensor core compute support.\n");
412 m_forward = init_net(channels, std::move(fp16_net));
413 benchmark_time(1); // a sanity check run
414 } catch (...) {
415 myprintf("OpenCL: fp16/half or tensor core failed despite driver claiming support.\n");
416 myprintf("Falling back to single precision\n");
417 m_forward.reset();
418 m_forward = init_net(channels,
419 std::make_unique<OpenCLScheduler<float>>());
420 }
421 return;
422 }
423
424 // Start by setting up fp32.
425 try {
426 m_forward.reset();
427 m_forward = init_net(channels,
428 std::make_unique<OpenCLScheduler<float>>());
429 score_fp32 = benchmark_time(100);
430 } catch (...) {
431 // empty - if exception thrown just throw away fp32 net
432 }
433
434 // Now benchmark fp16.
435 try {
436 m_forward.reset();
437 m_forward = init_net(channels, std::move(fp16_net));
438 score_fp16 = benchmark_time(100);
439 } catch (...) {
440 // empty - if exception thrown just throw away fp16 net
441 }
442
443 if (score_fp16 < 0.0f && score_fp32 < 0.0f) {
444 myprintf("Both single precision and half precision failed to run.\n");
445 throw std::runtime_error("Failed to initialize net.");
446 } else if (score_fp16 < 0.0f) {
447 myprintf("Using OpenCL single precision (half precision failed to run).\n");
448 m_forward.reset();
449 m_forward = init_net(channels,
450 std::make_unique<OpenCLScheduler<float>>());
451 } else if (score_fp32 < 0.0f) {
452 myprintf("Using OpenCL half precision (single precision failed to run).\n");
453 } else if (score_fp32 * 1.05f > score_fp16) {
454 myprintf("Using OpenCL single precision (less than 5%% slower than half).\n");
455 m_forward.reset();
456 m_forward = init_net(channels,
457 std::make_unique<OpenCLScheduler<float>>());
458 } else {
459 myprintf("Using OpenCL half precision (at least 5%% faster than single).\n");
460 }
461 return;
462 } else if (cfg_precision == precision_t::SINGLE) {
463 myprintf("Initializing OpenCL (single precision).\n");
464 m_forward = init_net(channels,
465 std::make_unique<OpenCLScheduler<float>>());
466 return;
467 } else if (cfg_precision == precision_t::HALF) {
468 myprintf("Initializing OpenCL (half precision).\n");
469 m_forward = init_net(channels,
470 std::make_unique<OpenCLScheduler<half_float::half>>());
471 return;
472 }
473 }
474 #endif
475
initialize(int playouts,const std::string & weightsfile)476 void Network::initialize(int playouts, const std::string & weightsfile) {
477 #ifdef USE_BLAS
478 #ifndef __APPLE__
479 #ifdef USE_OPENBLAS
480 openblas_set_num_threads(1);
481 myprintf("BLAS Core: %s\n", openblas_get_corename());
482 #endif
483 #ifdef USE_MKL
484 //mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
485 mkl_set_num_threads(1);
486 MKLVersion Version;
487 mkl_get_version(&Version);
488 myprintf("BLAS core: MKL %s\n", Version.Processor);
489 #endif
490 #endif
491 #else
492 myprintf("BLAS Core: built-in Eigen %d.%d.%d library.\n",
493 EIGEN_WORLD_VERSION, EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION);
494 #endif
495
496 m_fwd_weights = std::make_shared<ForwardPipeWeights>();
497
498 // Make a guess at a good size as long as the user doesn't
499 // explicitly set a maximum memory usage.
500 m_nncache.set_size_from_playouts(playouts);
501
502 // Prepare symmetry table
503 for (auto s = 0; s < NUM_SYMMETRIES; ++s) {
504 for (auto v = 0; v < NUM_INTERSECTIONS; ++v) {
505 const auto newvtx =
506 get_symmetry({v % BOARD_SIZE, v / BOARD_SIZE}, s);
507 symmetry_nn_idx_table[s][v] =
508 (newvtx.second * BOARD_SIZE) + newvtx.first;
509 assert(symmetry_nn_idx_table[s][v] >= 0
510 && symmetry_nn_idx_table[s][v] < NUM_INTERSECTIONS);
511 }
512 }
513
514 // Load network from file
515 size_t channels, residual_blocks;
516 std::tie(channels, residual_blocks) = load_network_file(weightsfile);
517 if (channels == 0) {
518 exit(EXIT_FAILURE);
519 }
520
521 auto weight_index = size_t{0};
522 // Input convolution
523 // Winograd transform convolution weights
524 m_fwd_weights->m_conv_weights[weight_index] =
525 winograd_transform_f(m_fwd_weights->m_conv_weights[weight_index],
526 channels, INPUT_CHANNELS);
527 weight_index++;
528
529 // Residual block convolutions
530 for (auto i = size_t{0}; i < residual_blocks * 2; i++) {
531 m_fwd_weights->m_conv_weights[weight_index] =
532 winograd_transform_f(m_fwd_weights->m_conv_weights[weight_index],
533 channels, channels);
534 weight_index++;
535 }
536
537 // Biases are not calculated and are typically zero but some networks might
538 // still have non-zero biases.
539 // Move biases to batchnorm means to make the output match without having
540 // to separately add the biases.
541 auto bias_size = m_fwd_weights->m_conv_biases.size();
542 for (auto i = size_t{0}; i < bias_size; i++) {
543 auto means_size = m_fwd_weights->m_batchnorm_means[i].size();
544 for (auto j = size_t{0}; j < means_size; j++) {
545 m_fwd_weights->m_batchnorm_means[i][j] -= m_fwd_weights->m_conv_biases[i][j];
546 m_fwd_weights->m_conv_biases[i][j] = 0.0f;
547 }
548 }
549
550 for (auto i = size_t{0}; i < m_bn_val_w1.size(); i++) {
551 m_bn_val_w1[i] -= m_fwd_weights->m_conv_val_b[i];
552 m_fwd_weights->m_conv_val_b[i] = 0.0f;
553 }
554
555 for (auto i = size_t{0}; i < m_bn_pol_w1.size(); i++) {
556 m_bn_pol_w1[i] -= m_fwd_weights->m_conv_pol_b[i];
557 m_fwd_weights->m_conv_pol_b[i] = 0.0f;
558 }
559
560 #ifdef USE_OPENCL
561 if (cfg_cpu_only) {
562 myprintf("Initializing CPU-only evaluation.\n");
563 m_forward = init_net(channels, std::make_unique<CPUPipe>());
564 } else {
565 #ifdef USE_OPENCL_SELFCHECK
566 // initialize CPU reference first, so that we can self-check
567 // when doing fp16 vs. fp32 detections
568 m_forward_cpu = init_net(channels, std::make_unique<CPUPipe>());
569 #endif
570 #ifdef USE_HALF
571 // HALF support is enabled, and we are using the GPU.
572 // Select the precision to use at runtime.
573 select_precision(channels);
574 #else
575 myprintf("Initializing OpenCL (single precision).\n");
576 m_forward = init_net(channels,
577 std::make_unique<OpenCLScheduler<float>>());
578 #endif
579 }
580
581 #else //!USE_OPENCL
582 myprintf("Initializing CPU-only evaluation.\n");
583 m_forward = init_net(channels, std::make_unique<CPUPipe>());
584 #endif
585
586 // Need to estimate size before clearing up the pipe.
587 get_estimated_size();
588 m_fwd_weights.reset();
589 }
590
591 template<unsigned int inputs,
592 unsigned int outputs,
593 bool ReLU,
594 size_t W>
innerproduct(const std::vector<float> & input,const std::array<float,W> & weights,const std::array<float,outputs> & biases)595 std::vector<float> innerproduct(const std::vector<float>& input,
596 const std::array<float, W>& weights,
597 const std::array<float, outputs>& biases) {
598 std::vector<float> output(outputs);
599
600 #ifdef USE_BLAS
601 cblas_sgemv(CblasRowMajor, CblasNoTrans,
602 // M K
603 outputs, inputs,
604 1.0f, &weights[0], inputs,
605 &input[0], 1,
606 0.0f, &output[0], 1);
607 #else
608 EigenVectorMap<float> y(output.data(), outputs);
609 y.noalias() =
610 ConstEigenMatrixMap<float>(weights.data(),
611 inputs,
612 outputs).transpose()
613 * ConstEigenVectorMap<float>(input.data(), inputs);
614 #endif
615 const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ?
616 val : 0.0f; };
617 for (unsigned int o = 0; o < outputs; o++) {
618 auto val = biases[o] + output[o];
619 if (ReLU) {
620 val = lambda_ReLU(val);
621 }
622 output[o] = val;
623 }
624
625 return output;
626 }
627
// In-place batch normalisation followed by ReLU.
//
// `data` holds `channels` consecutive planes of `spatial_size` values. Every
// value of plane c becomes relu(stddivs[c] * (value - means[c])). When
// `eltwise` is given it holds a same-shaped residual that is added before
// the ReLU.
template <size_t spatial_size>
void batchnorm(const size_t channels,
               std::vector<float>& data,
               const float* const means,
               const float* const stddivs,
               const float* const eltwise = nullptr) {
    const auto relu = [](const float x) { return (x > 0.0f) ? x : 0.0f; };
    const auto has_residual = (eltwise != nullptr);

    for (size_t c = 0; c < channels; ++c) {
        const auto offset = c * spatial_size;
        const auto mean = means[c];
        const auto scale = stddivs[c];
        float* const plane = &data[offset];
        // Only formed when a residual was supplied.
        const float* const residual = has_residual ? &eltwise[offset] : nullptr;

        for (size_t b = 0; b < spatial_size; ++b) {
            const auto normalized = scale * (plane[b] - mean);
            plane[b] = has_residual ? relu(normalized + residual[b])
                                    : relu(normalized);
        }
    }
}
655
656 #ifdef USE_OPENCL_SELFCHECK
compare_net_outputs(const Netresult & data,const Netresult & ref)657 void Network::compare_net_outputs(const Netresult& data,
658 const Netresult& ref) {
659 // Calculates L2-norm between data and ref.
660 constexpr auto max_error = 0.2f;
661
662 auto error = 0.0f;
663
664 for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) {
665 const auto diff = data.policy[idx] - ref.policy[idx];
666 error += diff * diff;
667 }
668 const auto diff_pass = data.policy_pass - ref.policy_pass;
669 const auto diff_winrate = data.winrate - ref.winrate;
670 error += diff_pass * diff_pass;
671 error += diff_winrate * diff_winrate;
672
673 error = std::sqrt(error);
674
675 if (error > max_error || std::isnan(error)) {
676 printf("Error in OpenCL calculation: Update your device's OpenCL drivers "
677 "or reduce the amount of games played simultaneously.\n");
678 throw std::runtime_error("OpenCL self-check mismatch.");
679 }
680 }
681 #endif
682
// Numerically stable softmax.
//
// Returns exp((x - max) / temperature) for every x in `input`, normalised so
// the results sum to one. Subtracting the maximum before exponentiating keeps
// the largest exponent at zero. An empty `input` yields an empty result.
std::vector<float> softmax(const std::vector<float>& input,
                           const float temperature = 1.0f) {
    auto output = std::vector<float>{};
    if (input.empty()) {
        // std::max_element returns the end iterator for an empty range,
        // which must not be dereferenced.
        return output;
    }
    output.reserve(input.size());

    const auto alpha = *std::max_element(input.cbegin(), input.cend());
    auto denom = 0.0f;

    for (const auto in_val : input) {
        const auto val = std::exp((in_val - alpha) / temperature);
        denom += val;
        output.push_back(val);
    }

    for (auto& out : output) {
        out /= denom;
    }

    return output;
}
703
probe_cache(const GameState * const state,Network::Netresult & result)704 bool Network::probe_cache(const GameState* const state,
705 Network::Netresult& result) {
706 if (m_nncache.lookup(state->board.get_hash(), result)) {
707 return true;
708 }
709 // If we are not generating a self-play game, try to find
710 // symmetries if we are in the early opening.
711 if (!cfg_noise && !cfg_random_cnt
712 && state->get_movenum()
713 < (state->get_timecontrol().opening_moves(BOARD_SIZE) / 2)) {
714 for (auto sym = 0; sym < Network::NUM_SYMMETRIES; ++sym) {
715 if (sym == Network::IDENTITY_SYMMETRY) {
716 continue;
717 }
718 const auto hash = state->get_symmetry_hash(sym);
719 if (m_nncache.lookup(hash, result)) {
720 decltype(result.policy) corrected_policy;
721 for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; ++idx) {
722 const auto sym_idx = symmetry_nn_idx_table[sym][idx];
723 corrected_policy[idx] = result.policy[sym_idx];
724 }
725 result.policy = std::move(corrected_policy);
726 return true;
727 }
728 }
729 }
730
731 return false;
732 }
733
get_output(const GameState * const state,const Ensemble ensemble,const int symmetry,const bool read_cache,const bool write_cache,const bool force_selfcheck)734 Network::Netresult Network::get_output(
735 const GameState* const state, const Ensemble ensemble, const int symmetry,
736 const bool read_cache, const bool write_cache, const bool force_selfcheck) {
737 Netresult result;
738 if (state->board.get_boardsize() != BOARD_SIZE) {
739 return result;
740 }
741
742 if (read_cache) {
743 // See if we already have this in the cache.
744 if (probe_cache(state, result)) {
745 return result;
746 }
747 }
748
749 if (ensemble == DIRECT) {
750 assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
751 result = get_output_internal(state, symmetry);
752 } else if (ensemble == AVERAGE) {
753 assert(symmetry == -1);
754 for (auto sym = 0; sym < NUM_SYMMETRIES; ++sym) {
755 auto tmpresult = get_output_internal(state, sym);
756 result.winrate +=
757 tmpresult.winrate / static_cast<float>(NUM_SYMMETRIES);
758 result.policy_pass +=
759 tmpresult.policy_pass / static_cast<float>(NUM_SYMMETRIES);
760
761 for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) {
762 result.policy[idx] +=
763 tmpresult.policy[idx] / static_cast<float>(NUM_SYMMETRIES);
764 }
765 }
766 } else {
767 assert(ensemble == RANDOM_SYMMETRY);
768 assert(symmetry == -1);
769 const auto rand_sym = Random::get_Rng().randfix<NUM_SYMMETRIES>();
770 result = get_output_internal(state, rand_sym);
771 #ifdef USE_OPENCL_SELFCHECK
772 // Both implementations are available, self-check the OpenCL driver by
773 // running both with a probability of 1/2000.
774 // selfcheck is done here because this is the only place NN
775 // evaluation is done on actual gameplay.
776 if (m_forward_cpu != nullptr
777 && (force_selfcheck || Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0)
778 ) {
779 auto result_ref = get_output_internal(state, rand_sym, true);
780 compare_net_outputs(result, result_ref);
781 }
782 #else
783 (void)force_selfcheck;
784 #endif
785 }
786
787 // v2 format (ELF Open Go) returns black value, not stm
788 if (m_value_head_not_stm) {
789 if (state->board.get_to_move() == FastBoard::WHITE) {
790 result.winrate = 1.0f - result.winrate;
791 }
792 }
793
794 if (write_cache) {
795 // Insert result into cache.
796 m_nncache.insert(state->board.get_hash(), result);
797 }
798
799 return result;
800 }
801
get_output_internal(const GameState * const state,const int symmetry,bool selfcheck)802 Network::Netresult Network::get_output_internal(
803 const GameState* const state, const int symmetry, bool selfcheck) {
804 assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
805 constexpr auto width = BOARD_SIZE;
806 constexpr auto height = BOARD_SIZE;
807
808 const auto input_data = gather_features(state, symmetry);
809 std::vector<float> policy_data(OUTPUTS_POLICY * width * height);
810 std::vector<float> value_data(OUTPUTS_VALUE * width * height);
811 #ifdef USE_OPENCL_SELFCHECK
812 if (selfcheck) {
813 m_forward_cpu->forward(input_data, policy_data, value_data);
814 } else {
815 m_forward->forward(input_data, policy_data, value_data);
816 }
817 #else
818 m_forward->forward(input_data, policy_data, value_data);
819 (void) selfcheck;
820 #endif
821
822 // Get the moves
823 batchnorm<NUM_INTERSECTIONS>(OUTPUTS_POLICY, policy_data,
824 m_bn_pol_w1.data(), m_bn_pol_w2.data());
825 const auto policy_out =
826 innerproduct<OUTPUTS_POLICY * NUM_INTERSECTIONS, POTENTIAL_MOVES, false>(
827 policy_data, m_ip_pol_w, m_ip_pol_b);
828 const auto outputs = softmax(policy_out, cfg_softmax_temp);
829
830 // Now get the value
831 batchnorm<NUM_INTERSECTIONS>(OUTPUTS_VALUE, value_data,
832 m_bn_val_w1.data(), m_bn_val_w2.data());
833 const auto winrate_data =
834 innerproduct<OUTPUTS_VALUE * NUM_INTERSECTIONS, VALUE_LAYER, true>(
835 value_data, m_ip1_val_w, m_ip1_val_b);
836 const auto winrate_out =
837 innerproduct<VALUE_LAYER, 1, false>(winrate_data, m_ip2_val_w, m_ip2_val_b);
838
839 // Map TanH output range [-1..1] to [0..1] range
840 const auto winrate = (1.0f + std::tanh(winrate_out[0])) / 2.0f;
841
842 Netresult result;
843
844 for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) {
845 const auto sym_idx = symmetry_nn_idx_table[symmetry][idx];
846 result.policy[sym_idx] = outputs[idx];
847 }
848
849 result.policy_pass = outputs[NUM_INTERSECTIONS];
850 result.winrate = winrate;
851
852 return result;
853 }
854
show_heatmap(const FastState * const state,const Netresult & result,const bool topmoves)855 void Network::show_heatmap(const FastState* const state,
856 const Netresult& result,
857 const bool topmoves) {
858 std::vector<std::string> display_map;
859 std::string line;
860
861 for (unsigned int y = 0; y < BOARD_SIZE; y++) {
862 for (unsigned int x = 0; x < BOARD_SIZE; x++) {
863 auto policy = 0;
864 const auto vertex = state->board.get_vertex(x, y);
865 if (state->board.get_state(vertex) == FastBoard::EMPTY) {
866 policy = result.policy[y * BOARD_SIZE + x] * 1000;
867 }
868
869 line += boost::str(boost::format("%3d ") % policy);
870 }
871
872 display_map.push_back(line);
873 line.clear();
874 }
875
876 for (int i = display_map.size() - 1; i >= 0; --i) {
877 myprintf("%s\n", display_map[i].c_str());
878 }
879 const auto pass_policy = int(result.policy_pass * 1000);
880 myprintf("pass: %d\n", pass_policy);
881 myprintf("winrate: %f\n", result.winrate);
882
883 if (topmoves) {
884 std::vector<Network::PolicyVertexPair> moves;
885 for (auto i=0; i < NUM_INTERSECTIONS; i++) {
886 const auto x = i % BOARD_SIZE;
887 const auto y = i / BOARD_SIZE;
888 const auto vertex = state->board.get_vertex(x, y);
889 if (state->board.get_state(vertex) == FastBoard::EMPTY) {
890 moves.emplace_back(result.policy[i], vertex);
891 }
892 }
893 moves.emplace_back(result.policy_pass, FastBoard::PASS);
894
895 std::stable_sort(rbegin(moves), rend(moves));
896
897 auto cum = 0.0f;
898 for (const auto& move : moves) {
899 if (cum > 0.85f || move.first < 0.01f) break;
900 myprintf("%1.3f (%s)\n",
901 move.first,
902 state->board.move_to_text(move.second).c_str());
903 cum += move.first;
904 }
905 }
906 }
907
fill_input_plane_pair(const FullBoard & board,std::vector<float>::iterator black,std::vector<float>::iterator white,const int symmetry)908 void Network::fill_input_plane_pair(const FullBoard& board,
909 std::vector<float>::iterator black,
910 std::vector<float>::iterator white,
911 const int symmetry) {
912 for (auto idx = 0; idx < NUM_INTERSECTIONS; idx++) {
913 const auto sym_idx = symmetry_nn_idx_table[symmetry][idx];
914 const auto x = sym_idx % BOARD_SIZE;
915 const auto y = sym_idx / BOARD_SIZE;
916 const auto color = board.get_state(x, y);
917 if (color == FastBoard::BLACK) {
918 black[idx] = float(true);
919 } else if (color == FastBoard::WHITE) {
920 white[idx] = float(true);
921 }
922 }
923 }
924
gather_features(const GameState * const state,const int symmetry)925 std::vector<float> Network::gather_features(const GameState* const state,
926 const int symmetry) {
927 assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
928 auto input_data = std::vector<float>(INPUT_CHANNELS * NUM_INTERSECTIONS);
929
930 const auto to_move = state->get_to_move();
931 const auto blacks_move = to_move == FastBoard::BLACK;
932
933 const auto black_it = blacks_move ?
934 begin(input_data) :
935 begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS;
936 const auto white_it = blacks_move ?
937 begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS :
938 begin(input_data);
939 const auto to_move_it = blacks_move ?
940 begin(input_data) + 2 * INPUT_MOVES * NUM_INTERSECTIONS :
941 begin(input_data) + (2 * INPUT_MOVES + 1) * NUM_INTERSECTIONS;
942
943 const auto moves = std::min<size_t>(state->get_movenum() + 1, INPUT_MOVES);
944 // Go back in time, fill history boards
945 for (auto h = size_t{0}; h < moves; h++) {
946 // collect white, black occupation planes
947 fill_input_plane_pair(state->get_past_board(h),
948 black_it + h * NUM_INTERSECTIONS,
949 white_it + h * NUM_INTERSECTIONS,
950 symmetry);
951 }
952
953 std::fill(to_move_it, to_move_it + NUM_INTERSECTIONS, float(true));
954
955 return input_data;
956 }
957
get_symmetry(const std::pair<int,int> & vertex,const int symmetry,const int board_size)958 std::pair<int, int> Network::get_symmetry(const std::pair<int, int>& vertex,
959 const int symmetry,
960 const int board_size) {
961 auto x = vertex.first;
962 auto y = vertex.second;
963 assert(x >= 0 && x < board_size);
964 assert(y >= 0 && y < board_size);
965 assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
966
967 if ((symmetry & 4) != 0) {
968 std::swap(x, y);
969 }
970
971 if ((symmetry & 2) != 0) {
972 x = board_size - x - 1;
973 }
974
975 if ((symmetry & 1) != 0) {
976 y = board_size - y - 1;
977 }
978
979 assert(x >= 0 && x < board_size);
980 assert(y >= 0 && y < board_size);
981 assert(symmetry != IDENTITY_SYMMETRY || vertex == std::make_pair(x, y));
982 return {x, y};
983 }
984
get_estimated_size()985 size_t Network::get_estimated_size() {
986 if (estimated_size != 0) {
987 return estimated_size;
988 }
989 auto result = size_t{0};
990
991 const auto lambda_vector_size = [](const std::vector<std::vector<float>> &v) {
992 auto result = size_t{0};
993 for (auto it = begin(v); it != end(v); ++it) {
994 result += it->size() * sizeof(float);
995 }
996 return result;
997 };
998
999 result += lambda_vector_size(m_fwd_weights->m_conv_weights);
1000 result += lambda_vector_size(m_fwd_weights->m_conv_biases);
1001 result += lambda_vector_size(m_fwd_weights->m_batchnorm_means);
1002 result += lambda_vector_size(m_fwd_weights->m_batchnorm_stddevs);
1003
1004 result += m_fwd_weights->m_conv_pol_w.size() * sizeof(float);
1005 result += m_fwd_weights->m_conv_pol_b.size() * sizeof(float);
1006
1007 // Policy head
1008 result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w1
1009 result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w2
1010 result += OUTPUTS_POLICY * NUM_INTERSECTIONS
1011 * POTENTIAL_MOVES * sizeof(float); //m_ip_pol_w
1012 result += POTENTIAL_MOVES * sizeof(float); // m_ip_pol_b
1013
1014 // Value head
1015 result += m_fwd_weights->m_conv_val_w.size() * sizeof(float);
1016 result += m_fwd_weights->m_conv_val_b.size() * sizeof(float);
1017 result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w1
1018 result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w2
1019
1020 result += OUTPUTS_VALUE * NUM_INTERSECTIONS
1021 * VALUE_LAYER * sizeof(float); // m_ip1_val_w
1022 result += VALUE_LAYER * sizeof(float); // m_ip1_val_b
1023
1024 result += VALUE_LAYER * sizeof(float); // m_ip2_val_w
1025 result += sizeof(float); // m_ip2_val_b
1026 return estimated_size = result;
1027 }
1028
get_estimated_cache_size()1029 size_t Network::get_estimated_cache_size() {
1030 return m_nncache.get_estimated_size();
1031 }
1032
nncache_resize(int max_count)1033 void Network::nncache_resize(int max_count) {
1034 return m_nncache.resize(max_count);
1035 }
1036