/*
    This file is part of Leela Zero.
    Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors

    Leela Zero is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Leela Zero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Leela Zero.  If not, see <http://www.gnu.org/licenses/>.

    Additional permission under GNU GPL version 3 section 7

    If you modify this Program, or any covered work, by linking or
    combining it with NVIDIA Corporation's libraries from the
    NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural
    Network library and/or the NVIDIA TensorRT inference library
    (or a modified version of those libraries), containing parts covered
    by the terms of the respective license agreement, the licensors of
    this Program grant you additional permission to convey the resulting
    work.
*/

#include "config.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <boost/utility.hpp>
#include <boost/format.hpp>
#include <boost/spirit/home/x3.hpp>
#ifndef USE_BLAS
#include <Eigen/Dense>
#endif

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#endif
#ifdef USE_MKL
#include <mkl.h>
#endif
#ifdef USE_OPENBLAS
#include <cblas.h>
#endif
#include "zlib.h"

#include "Network.h"
#include "CPUPipe.h"
#ifdef USE_OPENCL
#include "OpenCLScheduler.h"
#include "UCTNode.h"
#endif
#include "FastBoard.h"
#include "FastState.h"
#include "FullBoard.h"
#include "GameState.h"
#include "GTP.h"
#include "NNCache.h"
#include "Random.h"
#include "ThreadPool.h"
#include "Timing.h"
#include "Utils.h"

namespace x3 = boost::spirit::x3;
using namespace Utils;

#ifndef USE_BLAS
// Eigen helpers
template <typename T>
using EigenVectorMap =
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorMap =
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenMatrixMap =
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Symmetry helper
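// For each of the 8 board symmetries, maps an intersection index to the
// index of the same intersection on the transformed board, so inputs and
// policy outputs can be remapped per symmetry.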
static std::array<std::array<int, NUM_INTERSECTIONS>,
                  Network::NUM_SYMMETRIES> symmetry_nn_idx_table;

float Network::benchmark_time(int centiseconds) {
    const auto cpus = cfg_num_threads;

    ThreadGroup tg(thread_pool);
    std::atomic<int> runcount{0};

    GameState state;
    state.init_game(BOARD_SIZE, KOMI);

    // As a sanity check, try one run with self-check enabled.
    // It isn't enough to guarantee correctness, but it's better than nothing;
    // besides, for large nets the self-check itself takes a while
    // (1-3 evals per second).
    get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false, true, true);

    const Time start;
    for (auto i = size_t{0}; i < cpus; i++) {
        tg.add_task([this, &runcount, start, centiseconds, state]() {
            while (true) {
                runcount++;
                get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false);
                const Time end;
                const auto elapsed = Time::timediff_centis(start, end);
                if (elapsed >= centiseconds) {
                    break;
                }
            }
        });
    }
    tg.wait_all();

    const Time end;
    const auto elapsed = Time::timediff_centis(start, end);
    return 100.0f * runcount.load() / elapsed;
}

void Network::benchmark(const GameState* const state, const int iterations) {
    const auto cpus = cfg_num_threads;
    const Time start;

    ThreadGroup tg(thread_pool);
    std::atomic<int> runcount{0};

    for (auto i = size_t{0}; i < cpus; i++) {
        tg.add_task([this, &runcount, iterations, state]() {
            while (runcount < iterations) {
                runcount++;
                get_output(state, Ensemble::RANDOM_SYMMETRY, -1, false);
            }
        });
    }
    tg.wait_all();

    const Time end;
    const auto elapsed = Time::timediff_seconds(start, end);
    myprintf("%5d evaluations in %5.2f seconds -> %d n/s\n",
             runcount.load(), elapsed, int(runcount.load() / elapsed));
}

template<class container>
void process_bn_var(container& weights) {
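    // Convert each batchnorm variance into 1/sqrt(variance + epsilon) so
    // that the normalization in batchnorm() reduces to a single multiply.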
    constexpr float epsilon = 1e-5f;
    for (auto&& w : weights) {
        w = 1.0f / std::sqrt(w + epsilon);
    }
}

std::vector<float> Network::winograd_transform_f(const std::vector<float>& f,
                                                 const int outputs,
                                                 const int channels) {
    // F(4x4, 3x3) Winograd filter transformation
    // transpose(G.dot(f).dot(G.transpose()))
    // U matrix is transposed for better memory layout in SGEMM
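    // The result is laid out as U[tile][channel][output], with
    // WINOGRAD_ALPHA * WINOGRAD_ALPHA = WINOGRAD_TILE tile entries
    // per (channel, output) filter pair.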
    auto U = std::vector<float>(WINOGRAD_TILE * outputs * channels);
    const auto G = std::array<float, 3 * WINOGRAD_ALPHA>
                    { 1.0f,        0.0f,      0.0f,
                      -2.0f/3.0f, -SQ2/3.0f, -1.0f/3.0f,
                      -2.0f/3.0f,  SQ2/3.0f, -1.0f/3.0f,
                      1.0f/6.0f,   SQ2/6.0f,  1.0f/3.0f,
                      1.0f/6.0f,  -SQ2/6.0f,  1.0f/3.0f,
                      0.0f,        0.0f,      1.0f};

    auto temp = std::array<float, 3 * WINOGRAD_ALPHA>{};

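    // Transform output filters in batches of up to 8 so that, for each
    // tile index, their entries are written to U contiguously; fall back
    // to a batch size of 1 when outputs is not a multiple of 8.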
    constexpr auto max_buffersize = 8;
    auto buffersize = max_buffersize;

    if (outputs % buffersize != 0) {
        buffersize = 1;
    }

    std::array<float, max_buffersize * WINOGRAD_ALPHA * WINOGRAD_ALPHA> buffer;

    for (auto c = 0; c < channels; c++) {
        for (auto o_b = 0; o_b < outputs/buffersize; o_b++) {
            for (auto bufferline = 0; bufferline < buffersize; bufferline++) {
                const auto o = o_b * buffersize + bufferline;

                for (auto i = 0; i < WINOGRAD_ALPHA; i++) {
                    for (auto j = 0; j < 3; j++) {
                        auto acc = 0.0f;
                        for (auto k = 0; k < 3; k++) {
                            acc += G[i*3 + k] * f[o*channels*9 + c*9 + k*3 + j];
                        }
                        temp[i*3 + j] = acc;
                    }
                }

                for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) {
                    for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) {
                        auto acc = 0.0f;
                        for (auto k = 0; k < 3; k++) {
                            acc += temp[xi*3 + k] * G[nu*3 + k];
                        }
                        buffer[(xi * WINOGRAD_ALPHA + nu) * buffersize + bufferline] = acc;
                    }
                }
            }
            for (auto i = 0; i < WINOGRAD_ALPHA * WINOGRAD_ALPHA; i++) {
                for (auto entry = 0; entry < buffersize; entry++) {
                    const auto o = o_b * buffersize + entry;
                    U[i * outputs * channels
                      + c * outputs
                      + o] =
                    buffer[buffersize * i + entry];
                }
            }
        }
    }

    return U;
}

std::pair<int, int> Network::load_v1_network(std::istream& wtfile) {
    // Count the size of the network
    myprintf("Detecting residual layers...");
    // We are version 1 or 2
    if (m_value_head_not_stm) {
        myprintf("v%d...", 2);
    } else {
        myprintf("v%d...", 1);
    }
    // First line was the version number
    auto linecount = size_t{1};
    auto channels = 0;
    auto line = std::string{};
    while (std::getline(wtfile, line)) {
        auto iss = std::stringstream{line};
        // The third line of parameters holds the convolution layer biases,
        // which tells us the number of channels in the residual layers.
        // We assume all layers have the same number of filters.
        if (linecount == 2) {
            auto count = std::distance(std::istream_iterator<std::string>(iss),
                                       std::istream_iterator<std::string>());
            myprintf("%d channels...", static_cast<int>(count));
            channels = static_cast<int>(count);
        }
        linecount++;
    }
    // 1 format id, 1 input layer (4 x weights), 14 ending weights,
    // the rest are residuals, every residual has 8 x weight lines
    auto residual_blocks = linecount - (1 + 4 + 14);
    if (residual_blocks % 8 != 0) {
        myprintf("\nInconsistent number of weights in the file.\n");
        return {0, 0};
    }
    residual_blocks /= 8;
    myprintf("%d blocks.\n", static_cast<int>(residual_blocks));

    // Re-read the file and process it
    wtfile.clear();
    wtfile.seekg(0, std::ios::beg);

    // Get the file format id out of the way
    std::getline(wtfile, line);

    const auto plain_conv_layers = 1 + (residual_blocks * 2);
    const auto plain_conv_wts = plain_conv_layers * 4;
    linecount = 0;
    while (std::getline(wtfile, line)) {
        std::vector<float> weights;
        auto it_line = line.cbegin();
        const auto ok = phrase_parse(it_line, line.cend(),
                                     *x3::float_, x3::space, weights);
        if (!ok || it_line != line.cend()) {
            myprintf("\nFailed to parse weight file. Error on line %d.\n",
                     static_cast<int>(linecount + 2)); // +1 for the version line, +1 for 0-indexing
            return {0, 0};
        }
        if (linecount < plain_conv_wts) {
            if (linecount % 4 == 0) {
                m_fwd_weights->m_conv_weights.emplace_back(weights);
            } else if (linecount % 4 == 1) {
                // Redundant in our model, but they encode the
                // number of outputs so we have to read them in.
                m_fwd_weights->m_conv_biases.emplace_back(weights);
            } else if (linecount % 4 == 2) {
                m_fwd_weights->m_batchnorm_means.emplace_back(weights);
            } else if (linecount % 4 == 3) {
                process_bn_var(weights);
                m_fwd_weights->m_batchnorm_stddevs.emplace_back(weights);
            }
        } else {
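            // The 14 trailing lines hold the policy head (conv weights and
            // biases, batchnorm params, FC weights and biases), followed by
            // the value head (conv, batchnorm, and two FC layers).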
            switch (linecount - plain_conv_wts) {
                case  0: m_fwd_weights->m_conv_pol_w = std::move(weights); break;
                case  1: m_fwd_weights->m_conv_pol_b = std::move(weights); break;
                case  2: std::copy(cbegin(weights), cend(weights),
                                   begin(m_bn_pol_w1)); break;
                case  3: std::copy(cbegin(weights), cend(weights),
                                   begin(m_bn_pol_w2)); break;
                case  4: if (weights.size() != OUTPUTS_POLICY
                                               * NUM_INTERSECTIONS
                                               * POTENTIAL_MOVES) {
                             myprintf("The weights file is not for %dx%d boards.\n",
                                      BOARD_SIZE, BOARD_SIZE);
                             return {0, 0};
                         }
                         std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip_pol_w)); break;
                case  5: std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip_pol_b)); break;
                case  6: m_fwd_weights->m_conv_val_w = std::move(weights); break;
                case  7: m_fwd_weights->m_conv_val_b = std::move(weights); break;
                case  8: std::copy(cbegin(weights), cend(weights),
                                   begin(m_bn_val_w1)); break;
                case  9: std::copy(cbegin(weights), cend(weights),
                                   begin(m_bn_val_w2)); break;
                case 10: std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip1_val_w)); break;
                case 11: std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip1_val_b)); break;
                case 12: std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip2_val_w)); break;
                case 13: std::copy(cbegin(weights), cend(weights),
                                   begin(m_ip2_val_b)); break;
            }
        }
        linecount++;
    }
    process_bn_var(m_bn_pol_w2);
    process_bn_var(m_bn_val_w2);

    return {channels, static_cast<int>(residual_blocks)};
}

std::pair<int, int> Network::load_network_file(const std::string& filename) {
    // gzopen supports both gzipped and uncompressed files; it will
    // decompress or read directly as needed.
    auto gzhandle = gzopen(filename.c_str(), "rb");
    if (gzhandle == nullptr) {
        myprintf("Could not open weights file: %s\n", filename.c_str());
        return {0, 0};
    }
    // Stream the gz file into a memory buffer stream.
    auto buffer = std::stringstream{};
    constexpr auto chunkBufferSize = 64 * 1024;
    std::vector<char> chunkBuffer(chunkBufferSize);
    while (true) {
        auto bytesRead = gzread(gzhandle, chunkBuffer.data(), chunkBufferSize);
        if (bytesRead == 0) break;
        if (bytesRead < 0) {
            myprintf("Failed to decompress or read: %s\n", filename.c_str());
            gzclose(gzhandle);
            return {0, 0};
        }
        assert(bytesRead <= chunkBufferSize);
        buffer.write(chunkBuffer.data(), bytesRead);
    }
    gzclose(gzhandle);

    // Read the format version
    auto line = std::string{};
    auto format_version = -1;
    if (std::getline(buffer, line)) {
        auto iss = std::stringstream{line};
        // First line is the file format version id
        iss >> format_version;
        if (iss.fail() || (format_version != 1 && format_version != 2)) {
            myprintf("Weights file is the wrong version.\n");
            return {0, 0};
        } else {
            // Version 2 networks are identical to v1, except that they
            // return the value for black instead of the player to move.
            // This format is used by ELF OpenGo.
            if (format_version == 2) {
                m_value_head_not_stm = true;
            } else {
                m_value_head_not_stm = false;
            }
            return load_v1_network(buffer);
        }
    }
    return {0, 0};
}

std::unique_ptr<ForwardPipe>&& Network::init_net(int channels,
    std::unique_ptr<ForwardPipe>&& pipe) {

    pipe->initialize(channels);
    pipe->push_weights(WINOGRAD_ALPHA, INPUT_CHANNELS, channels, m_fwd_weights);

    return std::move(pipe);
}

#ifdef USE_HALF
void Network::select_precision(int channels) {
    if (cfg_precision == precision_t::AUTO) {
        auto score_fp16 = float{-1.0};
        auto score_fp32 = float{-1.0};

        myprintf("Initializing OpenCL (autodetecting precision).\n");

        // Set up fp16 here so that we can see if we can skip autodetection.
        // However, if the fp16 sanity check fails, we fall back to fp32
        // and hope for the best.
        auto fp16_net = std::make_unique<OpenCLScheduler<half_float::half>>();
        if (!fp16_net->needs_autodetect()) {
            try {
                myprintf("OpenCL: using fp16/half or tensor core compute support.\n");
                m_forward = init_net(channels, std::move(fp16_net));
                benchmark_time(1); // a sanity check run
            } catch (...) {
                myprintf("OpenCL: fp16/half or tensor core failed despite driver claiming support.\n");
                myprintf("Falling back to single precision.\n");
                m_forward.reset();
                m_forward = init_net(channels,
                    std::make_unique<OpenCLScheduler<float>>());
            }
            return;
        }

        // Start by setting up fp32.
        try {
            m_forward.reset();
            m_forward = init_net(channels,
                std::make_unique<OpenCLScheduler<float>>());
            score_fp32 = benchmark_time(100);
        } catch (...) {
            // empty - if an exception is thrown, just throw away the fp32 net
        }

        // Now benchmark fp16.
        try {
            m_forward.reset();
            m_forward = init_net(channels, std::move(fp16_net));
            score_fp16 = benchmark_time(100);
        } catch (...) {
            // empty - if an exception is thrown, just throw away the fp16 net
        }

        if (score_fp16 < 0.0f && score_fp32 < 0.0f) {
            myprintf("Both single precision and half precision failed to run.\n");
            throw std::runtime_error("Failed to initialize net.");
        } else if (score_fp16 < 0.0f) {
            myprintf("Using OpenCL single precision (half precision failed to run).\n");
            m_forward.reset();
            m_forward = init_net(channels,
                std::make_unique<OpenCLScheduler<float>>());
        } else if (score_fp32 < 0.0f) {
            myprintf("Using OpenCL half precision (single precision failed to run).\n");
        } else if (score_fp32 * 1.05f > score_fp16) {
            myprintf("Using OpenCL single precision (less than 5%% slower than half).\n");
            m_forward.reset();
            m_forward = init_net(channels,
                std::make_unique<OpenCLScheduler<float>>());
        } else {
            myprintf("Using OpenCL half precision (at least 5%% faster than single).\n");
        }
        return;
    } else if (cfg_precision == precision_t::SINGLE) {
        myprintf("Initializing OpenCL (single precision).\n");
        m_forward = init_net(channels,
            std::make_unique<OpenCLScheduler<float>>());
        return;
    } else if (cfg_precision == precision_t::HALF) {
        myprintf("Initializing OpenCL (half precision).\n");
        m_forward = init_net(channels,
            std::make_unique<OpenCLScheduler<half_float::half>>());
        return;
    }
}
#endif

void Network::initialize(int playouts, const std::string& weightsfile) {
#ifdef USE_BLAS
#ifndef __APPLE__
#ifdef USE_OPENBLAS
    openblas_set_num_threads(1);
    myprintf("BLAS Core: %s\n", openblas_get_corename());
#endif
#ifdef USE_MKL
    //mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);
    mkl_set_num_threads(1);
    MKLVersion Version;
    mkl_get_version(&Version);
    myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#else
    myprintf("BLAS Core: built-in Eigen %d.%d.%d library.\n",
             EIGEN_WORLD_VERSION, EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION);
#endif

    m_fwd_weights = std::make_shared<ForwardPipeWeights>();

    // Make a guess at a good size as long as the user doesn't
    // explicitly set a maximum memory usage.
    m_nncache.set_size_from_playouts(playouts);

    // Prepare symmetry table
    for (auto s = 0; s < NUM_SYMMETRIES; ++s) {
        for (auto v = 0; v < NUM_INTERSECTIONS; ++v) {
            const auto newvtx =
                get_symmetry({v % BOARD_SIZE, v / BOARD_SIZE}, s);
            symmetry_nn_idx_table[s][v] =
                (newvtx.second * BOARD_SIZE) + newvtx.first;
            assert(symmetry_nn_idx_table[s][v] >= 0
                   && symmetry_nn_idx_table[s][v] < NUM_INTERSECTIONS);
        }
    }

    // Load network from file
    size_t channels, residual_blocks;
    std::tie(channels, residual_blocks) = load_network_file(weightsfile);
    if (channels == 0) {
        exit(EXIT_FAILURE);
    }

    auto weight_index = size_t{0};
    // Input convolution
    // Winograd transform convolution weights
    m_fwd_weights->m_conv_weights[weight_index] =
        winograd_transform_f(m_fwd_weights->m_conv_weights[weight_index],
                             channels, INPUT_CHANNELS);
    weight_index++;

    // Residual block convolutions
    for (auto i = size_t{0}; i < residual_blocks * 2; i++) {
        m_fwd_weights->m_conv_weights[weight_index] =
            winograd_transform_f(m_fwd_weights->m_conv_weights[weight_index],
                                 channels, channels);
        weight_index++;
    }

    // Biases are not calculated and are typically zero, but some networks
    // might still have non-zero biases.
    // Fold the biases into the batchnorm means so the output matches
    // without having to add the biases separately.
    auto bias_size = m_fwd_weights->m_conv_biases.size();
    for (auto i = size_t{0}; i < bias_size; i++) {
        auto means_size = m_fwd_weights->m_batchnorm_means[i].size();
        for (auto j = size_t{0}; j < means_size; j++) {
            m_fwd_weights->m_batchnorm_means[i][j] -= m_fwd_weights->m_conv_biases[i][j];
            m_fwd_weights->m_conv_biases[i][j] = 0.0f;
        }
    }

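    // Fold the value and policy head convolution biases into their
    // batchnorm means in the same way.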
    for (auto i = size_t{0}; i < m_bn_val_w1.size(); i++) {
        m_bn_val_w1[i] -= m_fwd_weights->m_conv_val_b[i];
        m_fwd_weights->m_conv_val_b[i] = 0.0f;
    }

    for (auto i = size_t{0}; i < m_bn_pol_w1.size(); i++) {
        m_bn_pol_w1[i] -= m_fwd_weights->m_conv_pol_b[i];
        m_fwd_weights->m_conv_pol_b[i] = 0.0f;
    }

#ifdef USE_OPENCL
    if (cfg_cpu_only) {
        myprintf("Initializing CPU-only evaluation.\n");
        m_forward = init_net(channels, std::make_unique<CPUPipe>());
    } else {
#ifdef USE_OPENCL_SELFCHECK
        // Initialize the CPU reference first, so that we can self-check
        // when doing the fp16 vs. fp32 detection.
        m_forward_cpu = init_net(channels, std::make_unique<CPUPipe>());
#endif
#ifdef USE_HALF
        // HALF support is enabled, and we are using the GPU.
        // Select the precision to use at runtime.
        select_precision(channels);
#else
        myprintf("Initializing OpenCL (single precision).\n");
        m_forward = init_net(channels,
                             std::make_unique<OpenCLScheduler<float>>());
#endif
    }

#else // !USE_OPENCL
    myprintf("Initializing CPU-only evaluation.\n");
    m_forward = init_net(channels, std::make_unique<CPUPipe>());
#endif

    // Need to estimate the size before releasing the weights.
    get_estimated_size();
    m_fwd_weights.reset();
}

template<unsigned int inputs,
         unsigned int outputs,
         bool ReLU,
         size_t W>
std::vector<float> innerproduct(const std::vector<float>& input,
                                const std::array<float, W>& weights,
                                const std::array<float, outputs>& biases) {
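    // Computes output = weights * input + biases, treating `weights` as a
    // row-major (outputs x inputs) matrix, with an optional ReLU applied.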
    std::vector<float> output(outputs);

#ifdef USE_BLAS
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                // M     K
                outputs, inputs,
                1.0f, &weights[0], inputs,
                &input[0], 1,
                0.0f, &output[0], 1);
#else
    EigenVectorMap<float> y(output.data(), outputs);
    y.noalias() =
        ConstEigenMatrixMap<float>(weights.data(),
                                   inputs,
                                   outputs).transpose()
        * ConstEigenVectorMap<float>(input.data(), inputs);
#endif
    const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ?
                                                          val : 0.0f; };
    for (unsigned int o = 0; o < outputs; o++) {
        auto val = biases[o] + output[o];
        if (ReLU) {
            val = lambda_ReLU(val);
        }
        output[o] = val;
    }

    return output;
}

template <size_t spatial_size>
void batchnorm(const size_t channels,
               std::vector<float>& data,
               const float* const means,
               const float* const stddivs,
               const float* const eltwise = nullptr) {
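    // Applies the folded batchnorm out = ReLU(stddiv * (x - mean)), where
    // stddivs already holds 1/sqrt(variance + epsilon) thanks to
    // process_bn_var(). `eltwise` optionally adds a residual input
    // before the ReLU.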
    const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ?
                                                          val : 0.0f; };
    for (auto c = size_t{0}; c < channels; ++c) {
        const auto mean = means[c];
        const auto scale_stddiv = stddivs[c];
        const auto arr = &data[c * spatial_size];

        if (eltwise == nullptr) {
            // Classical BN
            for (auto b = size_t{0}; b < spatial_size; b++) {
                arr[b] = lambda_ReLU(scale_stddiv * (arr[b] - mean));
            }
        } else {
            // BN + residual add
            const auto res = &eltwise[c * spatial_size];
            for (auto b = size_t{0}; b < spatial_size; b++) {
                arr[b] = lambda_ReLU((scale_stddiv * (arr[b] - mean)) + res[b]);
            }
        }
    }
}

#ifdef USE_OPENCL_SELFCHECK
void Network::compare_net_outputs(const Netresult& data,
                                  const Netresult& ref) {
    // Calculates the L2 norm of the difference between data and ref.
    constexpr auto max_error = 0.2f;

    auto error = 0.0f;

    for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) {
        const auto diff = data.policy[idx] - ref.policy[idx];
        error += diff * diff;
    }
    const auto diff_pass = data.policy_pass - ref.policy_pass;
    const auto diff_winrate = data.winrate - ref.winrate;
    error += diff_pass * diff_pass;
    error += diff_winrate * diff_winrate;

    error = std::sqrt(error);

    if (error > max_error || std::isnan(error)) {
        printf("Error in OpenCL calculation: Update your device's OpenCL drivers "
               "or reduce the number of games played simultaneously.\n");
        throw std::runtime_error("OpenCL self-check mismatch.");
    }
}
#endif

std::vector<float> softmax(const std::vector<float>& input,
                           const float temperature = 1.0f) {
    auto output = std::vector<float>{};
    output.reserve(input.size());

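    // Subtract the maximum input before exponentiating, for numerical
    // stability; the shift cancels out after normalization.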
    const auto alpha = *std::max_element(cbegin(input), cend(input));
    auto denom = 0.0f;

    for (const auto in_val : input) {
        auto val = std::exp((in_val - alpha) / temperature);
        denom += val;
        output.push_back(val);
    }

    for (auto& out : output) {
        out /= denom;
    }

    return output;
}

bool Network::probe_cache(const GameState* const state,
                          Network::Netresult& result) {
    if (m_nncache.lookup(state->board.get_hash(), result)) {
        return true;
    }
    // If we are not generating a self-play game, try to find
    // symmetric positions in the cache during the early opening.
    if (!cfg_noise && !cfg_random_cnt
        && state->get_movenum()
           < (state->get_timecontrol().opening_moves(BOARD_SIZE) / 2)) {
        for (auto sym = 0; sym < Network::NUM_SYMMETRIES; ++sym) {
            if (sym == Network::IDENTITY_SYMMETRY) {
                continue;
            }
            const auto hash = state->get_symmetry_hash(sym);
            if (m_nncache.lookup(hash, result)) {
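                // The cached result is for the transformed position, so
                // map the policy back through the symmetry table.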
                decltype(result.policy) corrected_policy;
                for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; ++idx) {
                    const auto sym_idx = symmetry_nn_idx_table[sym][idx];
                    corrected_policy[idx] = result.policy[sym_idx];
                }
                result.policy = std::move(corrected_policy);
                return true;
            }
        }
    }

    return false;
}

Network::Netresult Network::get_output(
    const GameState* const state, const Ensemble ensemble, const int symmetry,
    const bool read_cache, const bool write_cache, const bool force_selfcheck) {
    Netresult result;
    if (state->board.get_boardsize() != BOARD_SIZE) {
        return result;
    }

    if (read_cache) {
        // See if we already have this in the cache.
        if (probe_cache(state, result)) {
            return result;
        }
    }

    if (ensemble == DIRECT) {
        assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
        result = get_output_internal(state, symmetry);
    } else if (ensemble == AVERAGE) {
        assert(symmetry == -1);
        for (auto sym = 0; sym < NUM_SYMMETRIES; ++sym) {
            auto tmpresult = get_output_internal(state, sym);
            result.winrate +=
                tmpresult.winrate / static_cast<float>(NUM_SYMMETRIES);
            result.policy_pass +=
                tmpresult.policy_pass / static_cast<float>(NUM_SYMMETRIES);

            for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) {
                result.policy[idx] +=
                    tmpresult.policy[idx] / static_cast<float>(NUM_SYMMETRIES);
            }
        }
    } else {
        assert(ensemble == RANDOM_SYMMETRY);
        assert(symmetry == -1);
        const auto rand_sym = Random::get_Rng().randfix<NUM_SYMMETRIES>();
        result = get_output_internal(state, rand_sym);
#ifdef USE_OPENCL_SELFCHECK
        // Both implementations are available, so self-check the OpenCL
        // driver by running both with a probability of 1/2000.
        // The self-check is done here because this is the only place NN
        // evaluation happens during actual gameplay.
        if (m_forward_cpu != nullptr
            && (force_selfcheck || Random::get_Rng().randfix<SELFCHECK_PROBABILITY>() == 0)
        ) {
            auto result_ref = get_output_internal(state, rand_sym, true);
            compare_net_outputs(result, result_ref);
        }
#else
        (void)force_selfcheck;
#endif
    }

    // The v2 format (ELF OpenGo) returns the value for black, not the
    // side to move.
    if (m_value_head_not_stm) {
        if (state->board.get_to_move() == FastBoard::WHITE) {
            result.winrate = 1.0f - result.winrate;
        }
    }

    if (write_cache) {
        // Insert the result into the cache.
        m_nncache.insert(state->board.get_hash(), result);
    }

    return result;
}

Network::Netresult Network::get_output_internal(
    const GameState* const state, const int symmetry, bool selfcheck) {
    assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
    constexpr auto width = BOARD_SIZE;
    constexpr auto height = BOARD_SIZE;

    const auto input_data = gather_features(state, symmetry);
    std::vector<float> policy_data(OUTPUTS_POLICY * width * height);
    std::vector<float> value_data(OUTPUTS_VALUE * width * height);
#ifdef USE_OPENCL_SELFCHECK
    if (selfcheck) {
        m_forward_cpu->forward(input_data, policy_data, value_data);
    } else {
        m_forward->forward(input_data, policy_data, value_data);
    }
#else
    m_forward->forward(input_data, policy_data, value_data);
    (void) selfcheck;
#endif

    // Get the moves
    batchnorm<NUM_INTERSECTIONS>(OUTPUTS_POLICY, policy_data,
        m_bn_pol_w1.data(), m_bn_pol_w2.data());
    const auto policy_out =
        innerproduct<OUTPUTS_POLICY * NUM_INTERSECTIONS, POTENTIAL_MOVES, false>(
            policy_data, m_ip_pol_w, m_ip_pol_b);
    const auto outputs = softmax(policy_out, cfg_softmax_temp);

    // Now get the value
    batchnorm<NUM_INTERSECTIONS>(OUTPUTS_VALUE, value_data,
        m_bn_val_w1.data(), m_bn_val_w2.data());
    const auto winrate_data =
        innerproduct<OUTPUTS_VALUE * NUM_INTERSECTIONS, VALUE_LAYER, true>(
            value_data, m_ip1_val_w, m_ip1_val_b);
    const auto winrate_out =
        innerproduct<VALUE_LAYER, 1, false>(winrate_data, m_ip2_val_w, m_ip2_val_b);

    // Map TanH output range [-1..1] to [0..1] range
    const auto winrate = (1.0f + std::tanh(winrate_out[0])) / 2.0f;

    Netresult result;

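    // The features were gathered for the symmetry-transformed board, so
    // write each policy output back to its original intersection.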
    for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) {
        const auto sym_idx = symmetry_nn_idx_table[symmetry][idx];
        result.policy[sym_idx] = outputs[idx];
    }

    result.policy_pass = outputs[NUM_INTERSECTIONS];
    result.winrate = winrate;

    return result;
}

void Network::show_heatmap(const FastState* const state,
                           const Netresult& result,
                           const bool topmoves) {
    std::vector<std::string> display_map;
    std::string line;

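    // Print the policy prior for each empty intersection in per-mille.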
    for (unsigned int y = 0; y < BOARD_SIZE; y++) {
        for (unsigned int x = 0; x < BOARD_SIZE; x++) {
            auto policy = 0;
            const auto vertex = state->board.get_vertex(x, y);
            if (state->board.get_state(vertex) == FastBoard::EMPTY) {
                policy = result.policy[y * BOARD_SIZE + x] * 1000;
            }

            line += boost::str(boost::format("%3d ") % policy);
        }

        display_map.push_back(line);
        line.clear();
    }

    for (int i = display_map.size() - 1; i >= 0; --i) {
        myprintf("%s\n", display_map[i].c_str());
    }
    const auto pass_policy = int(result.policy_pass * 1000);
    myprintf("pass: %d\n", pass_policy);
    myprintf("winrate: %f\n", result.winrate);

    if (topmoves) {
        std::vector<Network::PolicyVertexPair> moves;
        for (auto i = 0; i < NUM_INTERSECTIONS; i++) {
            const auto x = i % BOARD_SIZE;
            const auto y = i / BOARD_SIZE;
            const auto vertex = state->board.get_vertex(x, y);
            if (state->board.get_state(vertex) == FastBoard::EMPTY) {
                moves.emplace_back(result.policy[i], vertex);
            }
        }
        moves.emplace_back(result.policy_pass, FastBoard::PASS);

        std::stable_sort(rbegin(moves), rend(moves));

        auto cum = 0.0f;
        for (const auto& move : moves) {
            if (cum > 0.85f || move.first < 0.01f) break;
            myprintf("%1.3f (%s)\n",
                     move.first,
                     state->board.move_to_text(move.second).c_str());
            cum += move.first;
        }
    }
}

void Network::fill_input_plane_pair(const FullBoard& board,
                                    std::vector<float>::iterator black,
                                    std::vector<float>::iterator white,
                                    const int symmetry) {
    for (auto idx = 0; idx < NUM_INTERSECTIONS; idx++) {
        const auto sym_idx = symmetry_nn_idx_table[symmetry][idx];
        const auto x = sym_idx % BOARD_SIZE;
        const auto y = sym_idx / BOARD_SIZE;
        const auto color = board.get_state(x, y);
        if (color == FastBoard::BLACK) {
            black[idx] = 1.0f;
        } else if (color == FastBoard::WHITE) {
            white[idx] = 1.0f;
        }
    }
}

std::vector<float> Network::gather_features(const GameState* const state,
                                            const int symmetry) {
    assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);
    auto input_data = std::vector<float>(INPUT_CHANNELS * NUM_INTERSECTIONS);

    const auto to_move = state->get_to_move();
    const auto blacks_move = to_move == FastBoard::BLACK;

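    // Plane layout: the first INPUT_MOVES planes hold the stones of the
    // side to move over the last INPUT_MOVES positions, the next
    // INPUT_MOVES planes hold the opponent's stones, and the final two
    // planes are all ones when black or white is to move, respectively.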
    const auto black_it = blacks_move ?
                          begin(input_data) :
                          begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS;
    const auto white_it = blacks_move ?
                          begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS :
                          begin(input_data);
    const auto to_move_it = blacks_move ?
        begin(input_data) + 2 * INPUT_MOVES * NUM_INTERSECTIONS :
        begin(input_data) + (2 * INPUT_MOVES + 1) * NUM_INTERSECTIONS;

    const auto moves = std::min<size_t>(state->get_movenum() + 1, INPUT_MOVES);
    // Go back in time, fill history boards
    for (auto h = size_t{0}; h < moves; h++) {
        // collect white, black occupation planes
        fill_input_plane_pair(state->get_past_board(h),
                              black_it + h * NUM_INTERSECTIONS,
                              white_it + h * NUM_INTERSECTIONS,
                              symmetry);
    }

    std::fill(to_move_it, to_move_it + NUM_INTERSECTIONS, 1.0f);

    return input_data;
}

std::pair<int, int> Network::get_symmetry(const std::pair<int, int>& vertex,
                                          const int symmetry,
                                          const int board_size) {
    auto x = vertex.first;
    auto y = vertex.second;
    assert(x >= 0 && x < board_size);
    assert(y >= 0 && y < board_size);
    assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES);

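    // The three symmetry bits select a transpose (bit 2), a horizontal
    // mirror (bit 1), and a vertical mirror (bit 0), generating all
    // 8 symmetries of the square board.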
    if ((symmetry & 4) != 0) {
        std::swap(x, y);
    }

    if ((symmetry & 2) != 0) {
        x = board_size - x - 1;
    }

    if ((symmetry & 1) != 0) {
        y = board_size - y - 1;
    }

    assert(x >= 0 && x < board_size);
    assert(y >= 0 && y < board_size);
    assert(symmetry != IDENTITY_SYMMETRY || vertex == std::make_pair(x, y));
    return {x, y};
}

size_t Network::get_estimated_size() {
    if (estimated_size != 0) {
        return estimated_size;
    }
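    // Rough byte count of all weights; computed once and cached.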
    auto result = size_t{0};

    const auto lambda_vector_size = [](const std::vector<std::vector<float>>& v) {
        auto result = size_t{0};
        for (auto it = begin(v); it != end(v); ++it) {
            result += it->size() * sizeof(float);
        }
        return result;
    };

    result += lambda_vector_size(m_fwd_weights->m_conv_weights);
    result += lambda_vector_size(m_fwd_weights->m_conv_biases);
    result += lambda_vector_size(m_fwd_weights->m_batchnorm_means);
    result += lambda_vector_size(m_fwd_weights->m_batchnorm_stddevs);

    result += m_fwd_weights->m_conv_pol_w.size() * sizeof(float);
    result += m_fwd_weights->m_conv_pol_b.size() * sizeof(float);

    // Policy head
    result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w1
    result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w2
    result += OUTPUTS_POLICY * NUM_INTERSECTIONS
                             * POTENTIAL_MOVES * sizeof(float); // m_ip_pol_w
    result += POTENTIAL_MOVES * sizeof(float); // m_ip_pol_b

    // Value head
    result += m_fwd_weights->m_conv_val_w.size() * sizeof(float);
    result += m_fwd_weights->m_conv_val_b.size() * sizeof(float);
    result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w1
    result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w2

    result += OUTPUTS_VALUE * NUM_INTERSECTIONS
                            * VALUE_LAYER * sizeof(float); // m_ip1_val_w
    result += VALUE_LAYER * sizeof(float); // m_ip1_val_b

    result += VALUE_LAYER * sizeof(float); // m_ip2_val_w
    result += sizeof(float); // m_ip2_val_b
    return estimated_size = result;
}

size_t Network::get_estimated_cache_size() {
    return m_nncache.get_estimated_size();
}

void Network::nncache_resize(int max_count) {
    return m_nncache.resize(max_count);
}