/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;
27 
/*
 * Benchmark a number of (m x n) * (n x n) matrix multiplications.
 * A number of parallel FC nodes are created, one per core, so that each
 * core handles one weight matrix. These are then chained together into
 * multiple layers: after each layer, the output tensor is fed as input
 * to the next layer.
 */
class Int8GemmParallelBench : public Benchmark {
  /// Matrices (host-side staging buffers sized in setup(); only their sizes
  /// matter here — the actual graph tensors are int8 placeholders).
  std::vector<float> a;
  std::vector<float> b;
  std::vector<float> c;

  /// Dimensions expressed in libjit's format.
  /// aDims = {m, k} is the per-core input shape, cDims = {m, n} the output.
  size_t aDims[2];
  size_t cDims[2];
  /// Number of chained FC layers per core.
  size_t numLayers_;
  PlaceholderBindings bindings_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  /// Number of concurrent runNetwork() requests issued per run() call.
  size_t asyncLaunchSize_;
  /// Number of parallel FC chains (one per core).
  size_t numCores_;
  /// Backend name forwarded to the DeviceConfig (e.g. "CPU").
  const char *backendStr_;
  /// Optional device id string; nullptr means "use the default device".
  const char *devId_;

public:
  /// Note: constructor parameters intentionally shadow the members of the
  /// same name; the mem-initializer list resolves each member from the
  /// corresponding parameter.
  Int8GemmParallelBench(size_t m, size_t n, size_t numLayers_,
                        size_t asyncLaunchSize_, size_t numCores_,
                        const char *backendStr_, const char *devId_)
      : aDims{m, n}, cDims{m, n}, numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
        backendStr_(backendStr_), devId_(devId_) {}

  /// Builds the benchmark graph and hands it to a freshly created
  /// HostManager. Called once before the timed run() calls.
  void setup() override {

    // Setup host manager with a single device of the requested backend.
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
    printf("set up host manager\n");

    // NOTE: aDims and cDims are both {m, n}, so k == n here — the weight
    // matrices are square (n x n), matching the class-level comment.
    dim_t m = cDims[0];
    dim_t n = cDims[1];
    dim_t k = aDims[1];
    a.resize(m * k);
    b.resize(k * n);
    c.resize(m * n);

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");
    printf("set up module \n");

    // Per-core graph state: cur[core] tracks the head of core's FC chain.
    std::vector<Node *> cur(numCores_);
    std::vector<Placeholder *> weights(numCores_);
    std::vector<Placeholder *> bias(numCores_);
    std::vector<Node *> fc(numCores_);
    std::vector<Placeholder *> input(numCores_);
    std::vector<Placeholder *> output(numCores_);

    printf("set up inputs and outputs");
    // One int8 input/output placeholder pair per core (scale 1.0, offset 0).
    for (size_t core = 0; core < numCores_; core++) {
      input[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {m, k}, 1.0, 0,
                                 "input_" + std::to_string(core), false);
      output[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {m, n}, 1.0, 0,
                                 "output_" + std::to_string(core), false);
      cur[core] = input[core];
    }

    printf("set up weights and bias");
    // Chain numLayers_ FC nodes per core. Fresh weight/bias placeholders are
    // created (and zero-filled) for every layer; the weights/bias vectors
    // only need to hold the current layer's placeholders.
    for (size_t layer = 0; layer < numLayers_; layer++) {
      for (size_t core = 0; core < numCores_; core++) {
        weights[core] =
            mod->createPlaceholder(ElemKind::Int8QTy, {k, n}, 1.0, 0,
                                   "weights_" + std::to_string(core), false);
        bias[core] =
            mod->createPlaceholder(ElemKind::Int32QTy, {n}, 1.0, 0,
                                   "bias_" + std::to_string(core), false);
        bindings_.allocate(weights[core])->getHandle<int8_t>().clear(0);
        bindings_.allocate(bias[core])->getHandle<int32_t>().clear(0);
        fc[core] = fn->createFullyConnected(
            "fc" + std::to_string(core) + "_" + std::to_string(layer),
            cur[core], weights[core], bias[core]);
        cur[core] = fc[core];
      }
    }
    printf("save output");
    for (size_t core = 0; core < numCores_; core++) {
      fn->createSave("save" + std::to_string(core), cur[core], output[core]);
    }

    // Freeze everything except the per-core inputs/outputs into constants so
    // the optimizer can fold the zero-filled weights/biases.
    for (size_t core = 0; core < numCores_; core++) {
      ::glow::convertPlaceholdersToConstants(fn, bindings_,
                                             {
                                                 input[core],
                                                 output[core],
                                             });
    }

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

  /// One timed iteration: fires asyncLaunchSize_ concurrent inference
  /// requests and blocks until all of them complete.
  void run() override {
    printf("Running module");
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;
    for (auto &runPromise : promises) {
      std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
      futures.push_back(runPromise.get_future());
      // Capturing runPromise by reference is safe: `promises` outlives the
      // fut.wait() loop below, which does not return until every callback
      // has fired.
      hostManager_->runNetwork(
          "singleNode", std::move(contextPtr),
          [&runPromise](runtime::RunIdentifierTy, Error err,
                        std::unique_ptr<ExecutionContext> /* contextPtr */) {
            EXIT_ON_ERR(std::move(err));
            runPromise.set_value();
          });
    }
    for (auto &fut : futures) {
      fut.wait();
    }
  }

  void teardown() override {}

  /// Total GFLOP count for ONE launch: 2*m*n*k multiply-accumulates per FC,
  /// times numLayers_ layers, times numCores_ parallel chains.
  double gflops() const {
    return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9;
  }
};
161 
main(int argc,char * argv[])162 int main(int argc, char *argv[]) {
163   size_t m = atoi(argv[1]);
164   size_t n = atoi(argv[2]);
165   size_t numLayers = atoi(argv[3]);
166   size_t reps = atoi(argv[4]);
167   size_t asyncLaunches = atoi(argv[5]);
168   size_t numCores = atoi(argv[6]);
169   const char *backendStr = argv[7];
170   char *dev_id = nullptr;
171 
172   printf("Int8GEMMParallel Microbenchmark\n");
173   printf(
174       "Usage: Int8GemmParallelBench m(Int) n(Int) numLayers(Int) numReps(Int) "
175       "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n");
176   assert(argc == 8 || argc == 9);
177   if (argc > 8) {
178     dev_id = argv[8];
179     printf("Setting backend device: \"%s\"\n", dev_id);
180   }
181   printf("Start Int8GemmParallelBench\n");
182   Int8GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr,
183                           dev_id);
184   auto times = bench(&b, reps);
185   for (auto t : times) {
186     printf("BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
187            "2.6lf,%5.2lf\n",
188            m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
189            t / asyncLaunches, b.gflops() * asyncLaunches / t);
190   }
191   double min = *(std::min_element(times.begin(), times.end()));
192   size_t midElt = times.size() / 2;
193   std::nth_element(times.begin(), times.begin() + midElt, times.end());
194   double median = times[midElt];
195   double median_runtime = median / ((double)asyncLaunches);
196   double min_runtime = min / ((double)asyncLaunches);
197   printf("BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
198          "2.6lf,%2.6lf,%5.2lf, %5.2lf\n",
199          m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
200          median_runtime, min_runtime, b.gflops() / median_runtime,
201          b.gflops() / min_runtime);
202 }
203