1 /**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include <array>
17 #include <cstdlib>
18 #include <future>
19 #include <random>
20
21 #include "Bench.h"
22
23 #include "glow/ExecutionEngine/ExecutionEngine.h"
24 #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
25
26 using namespace glow;
27
/*
 * Benchmark a number of (m x n) * (n x n) Int8 matrix multiplications.
 * One FullyConnected (FC) node is created per core, and each core handles its
 * own weight matrix. These FC nodes are then chained together in multiple
 * layers: after each layer, the output tensor is passed as the input to the
 * next layer.
 */
35 class Int8GemmParallelBench : public Benchmark {
36 /// Matrices.
37 std::vector<float> a;
38 std::vector<float> b;
39 std::vector<float> c;
40
41 /// Dimensions expressed in libjit's format.
42 size_t aDims[2];
43 size_t cDims[2];
44 size_t numLayers_;
45 PlaceholderBindings bindings_;
46 std::unique_ptr<runtime::HostManager> hostManager_;
47 size_t asyncLaunchSize_;
48 size_t numCores_;
49 const char *backendStr_;
50 const char *devId_;
51
52 public:
Int8GemmParallelBench(size_t m,size_t n,size_t numLayers_,size_t asyncLaunchSize_,size_t numCores_,const char * backendStr_,const char * devId_)53 Int8GemmParallelBench(size_t m, size_t n, size_t numLayers_,
54 size_t asyncLaunchSize_, size_t numCores_,
55 const char *backendStr_, const char *devId_)
56 : aDims{m, n}, cDims{m, n}, numLayers_(numLayers_),
57 asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
58 backendStr_(backendStr_), devId_(devId_) {}
59
setup()60 void setup() override {
61
62 // Setup host manager
63 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
64 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
65 if (devId_ != nullptr) {
66 config->parameters["DeviceID"] = devId_;
67 }
68 configs.push_back(std::move(config));
69 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
70 printf("set up host manager\n");
71
72 dim_t m = cDims[0];
73 dim_t n = cDims[1];
74 dim_t k = aDims[1];
75 a.resize(m * k);
76 b.resize(k * n);
77 c.resize(m * n);
78
79 std::unique_ptr<Module> mod(new Module);
80 auto fn = mod->createFunction("singleNode");
81 printf("set up module \n");
82
83 std::vector<Node *> cur(numCores_);
84 std::vector<Placeholder *> weights(numCores_);
85 std::vector<Placeholder *> bias(numCores_);
86 std::vector<Node *> fc(numCores_);
87 std::vector<Placeholder *> input(numCores_);
88 std::vector<Placeholder *> output(numCores_);
89
90 printf("set up inputs and outputs");
91 for (size_t core = 0; core < numCores_; core++) {
92 input[core] =
93 mod->createPlaceholder(ElemKind::Int8QTy, {m, k}, 1.0, 0,
94 "input_" + std::to_string(core), false);
95 output[core] =
96 mod->createPlaceholder(ElemKind::Int8QTy, {m, n}, 1.0, 0,
97 "output_" + std::to_string(core), false);
98 cur[core] = input[core];
99 }
100
101 printf("set up weights and bias");
102 for (size_t layer = 0; layer < numLayers_; layer++) {
103 for (size_t core = 0; core < numCores_; core++) {
104 weights[core] =
105 mod->createPlaceholder(ElemKind::Int8QTy, {k, n}, 1.0, 0,
106 "weights_" + std::to_string(core), false);
107 bias[core] =
108 mod->createPlaceholder(ElemKind::Int32QTy, {n}, 1.0, 0,
109 "bias_" + std::to_string(core), false);
110 bindings_.allocate(weights[core])->getHandle<int8_t>().clear(0);
111 bindings_.allocate(bias[core])->getHandle<int32_t>().clear(0);
112 fc[core] = fn->createFullyConnected(
113 "fc" + std::to_string(core) + "_" + std::to_string(layer),
114 cur[core], weights[core], bias[core]);
115 cur[core] = fc[core];
116 }
117 }
118 printf("save output");
119 for (size_t core = 0; core < numCores_; core++) {
120 fn->createSave("save" + std::to_string(core), cur[core], output[core]);
121 }
122
123 for (size_t core = 0; core < numCores_; core++) {
124 ::glow::convertPlaceholdersToConstants(fn, bindings_,
125 {
126 input[core],
127 output[core],
128 });
129 }
130
131 CompilationContext ctx;
132 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
133 }
134
run()135 void run() override {
136 printf("Running module");
137 std::vector<std::promise<void>> promises(asyncLaunchSize_);
138 std::vector<std::future<void>> futures;
139 for (auto &runPromise : promises) {
140 std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
141 futures.push_back(runPromise.get_future());
142 hostManager_->runNetwork(
143 "singleNode", std::move(contextPtr),
144 [&runPromise](runtime::RunIdentifierTy, Error err,
145 std::unique_ptr<ExecutionContext> /* contextPtr */) {
146 EXIT_ON_ERR(std::move(err));
147 runPromise.set_value();
148 });
149 }
150 for (auto &fut : futures) {
151 fut.wait();
152 }
153 }
154
teardown()155 void teardown() override {}
156
gflops() const157 double gflops() const {
158 return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9;
159 }
160 };
161
main(int argc,char * argv[])162 int main(int argc, char *argv[]) {
163 size_t m = atoi(argv[1]);
164 size_t n = atoi(argv[2]);
165 size_t numLayers = atoi(argv[3]);
166 size_t reps = atoi(argv[4]);
167 size_t asyncLaunches = atoi(argv[5]);
168 size_t numCores = atoi(argv[6]);
169 const char *backendStr = argv[7];
170 char *dev_id = nullptr;
171
172 printf("Int8GEMMParallel Microbenchmark\n");
173 printf(
174 "Usage: Int8GemmParallelBench m(Int) n(Int) numLayers(Int) numReps(Int) "
175 "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n");
176 assert(argc == 8 || argc == 9);
177 if (argc > 8) {
178 dev_id = argv[8];
179 printf("Setting backend device: \"%s\"\n", dev_id);
180 }
181 printf("Start Int8GemmParallelBench\n");
182 Int8GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr,
183 dev_id);
184 auto times = bench(&b, reps);
185 for (auto t : times) {
186 printf("BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
187 "2.6lf,%5.2lf\n",
188 m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
189 t / asyncLaunches, b.gflops() * asyncLaunches / t);
190 }
191 double min = *(std::min_element(times.begin(), times.end()));
192 size_t midElt = times.size() / 2;
193 std::nth_element(times.begin(), times.begin() + midElt, times.end());
194 double median = times[midElt];
195 double median_runtime = median / ((double)asyncLaunches);
196 double min_runtime = min / ((double)asyncLaunches);
197 printf("BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
198 "2.6lf,%2.6lf,%5.2lf, %5.2lf\n",
199 m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
200 median_runtime, min_runtime, b.gflops() / median_runtime,
201 b.gflops() / min_runtime);
202 }
203