1 /**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "HabanaDeviceManager.h"
18
19 #include "glow/Runtime/StatsExporter.h"
20
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/FormatVariadic.h"
23 #include "llvm/Support/raw_ostream.h"
24
25 #include "synapse.h"
26
27 #include <glog/logging.h>
28 #include <limits>
29
30 using namespace glow;
31 using namespace glow::runtime;
32
33 namespace glow {
34 namespace runtime {
35
36 unsigned GlowHabanaMemory = 7 << 20; // 7 GB.
37
38 static llvm::cl::opt<unsigned, /* ExternalStorage */ true> GlowHabanaMemoryOpt(
39 "glow-habana-memory",
40 llvm::cl::desc("Amount of DRAM to allocate per Habana device in kilobytes"),
41 llvm::cl::location(GlowHabanaMemory));
42
createHabanaDeviceManager(const DeviceConfig & config)43 DeviceManager *createHabanaDeviceManager(const DeviceConfig &config) {
44 return new HabanaDeviceManager(config);
45 }
46 } // namespace runtime
47 } // namespace glow
48
49 // Initialization of static class variables.
50 unsigned HabanaDeviceManager::numActiveDevices_{0};
51 std::mutex HabanaDeviceManager::synapseMtx_;
52 std::atomic<RunIdentifierTy> HabanaDeviceManager::runIdentifier_;
53
HabanaDeviceManager(const DeviceConfig & config,unsigned numRunners,unsigned numWaiters)54 HabanaDeviceManager::HabanaDeviceManager(const DeviceConfig &config,
55 unsigned numRunners,
56 unsigned numWaiters)
57 : DeviceManager(config), numRunners_(numRunners), numWaiters_(numWaiters) {}
58
~HabanaDeviceManager()59 HabanaDeviceManager::~HabanaDeviceManager() {
60 // If a device was never successfully acquired, there's nothing to clean up.
61 if (deviceId_ == INVALID_DEVICE) {
62 return;
63 }
64 std::lock_guard<std::mutex> lock(synapseMtx_);
65 numActiveDevices_--;
66 statsExporterRegistry_->incrementCounter(kDevicesUsedHabana, -1);
67
68 // Explicitly clear this map to force synFree of the managed IOBuffers to
69 // happen now, before we synReleaseDevice. Otherwise synReleaseDevice will
70 // free the buffers, and then the destructor will try to do it again.
71 functions_.clear();
72 chk_kill(synReleaseDevice(deviceId_));
73
74 // If this is the last HabanaDeviceManager to be destroyed, destroy the
75 // Synapse API.
76 if (numActiveDevices_ == 0) {
77 chk_kill(synDestroy());
78 }
79 }
80
init()81 Error HabanaDeviceManager::init() {
82 std::lock_guard<std::mutex> lock(synapseMtx_);
83
84 // If this is the first HabanaDeviceManager to be created, initialize the
85 // Synapse API.
86 if (numActiveDevices_ == 0) {
87 LOG(INFO) << "Using version " << synGetVersion();
88 // This environment variable tells Synapse to allow enqueueing tensors that
89 // are smaller than the declared size, which offers a significant savings
90 // in PCI traffic for embedding lookups.
91 setenv("IGNORE_ENQUEUE_SIZE_VALIDATION", "1", /*overwrite*/ 1);
92 chk(synInitialize());
93 }
94
95 // Acquire a device to work with for the lifetime of this instance.
96 synStatus status = synAcquireDevice(&deviceId_, nullptr);
97 if (status != synSuccess) {
98 RETURN_ERR("Failed to acquire device");
99 }
100
101 numActiveDevices_++;
102 statsExporterRegistry_->incrementCounter(kDevicesUsedHabana);
103
104 // Fetch initial memory information.
105 RETURN_IF_ERR(updateMemoryUsage());
106
107 // Create thread pools for running functions and waiting on function results.
108 runPool_ = glow::make_unique<ThreadPool>(numRunners_);
109 waitPool_ = glow::make_unique<ThreadPool>(numWaiters_);
110
111 if (!runPool_ || !waitPool_) {
112 RETURN_ERR("Failed to create HabanaDeviceManager thread pools");
113 }
114
115 return Error::success();
116 }
117
updateMemoryUsage()118 Error HabanaDeviceManager::updateMemoryUsage() {
119 // TODO: Use synGetMemInfo once implemented.
120
121 // Use GlowHabanaMemory if it is defined from GFLAGS or llvm params,
122 // otherwise, fall back to what config says.
123 uint64_t defaultMemory = 7 << 20;
124 if (GlowHabanaMemory == defaultMemory && config_.getDeviceMemory() != 0) {
125 totalMemory_ = config_.getDeviceMemory();
126 } else {
127 totalMemory_ = uint64_t{GlowHabanaMemory} * 1024;
128 }
129 freeMemory_ = totalMemory_;
130
131 // Account for the size used by each function loaded on the card.
132 for (const auto &pr : functions_) {
133 const auto &functionMeta = pr.second;
134 const auto &runtimeBundle = functionMeta.function->getRuntimeBundle();
135 freeMemory_ -= runtimeBundle.getConstantWeightSize();
136 freeMemory_ -= runtimeBundle.getMutableWeightSize();
137 }
138
139 return Error::success();
140 }
141
addNetwork(const Module * module,FunctionMapTy functions,ReadyCBTy readyCB)142 void HabanaDeviceManager::addNetwork(const Module *module,
143 FunctionMapTy functions,
144 ReadyCBTy readyCB) {
145 DCHECK(readyCB != nullptr);
146
147 std::unique_lock<std::mutex> lk(instanceMtx_);
148 for (const auto &func : functions) {
149 // Check if a function with the same name has already been added.
150 if (functions_.count(func.first) != 0) {
151 lk.unlock();
152 readyCB(module,
153 MAKE_ERR(strFormat(
154 "Failed to add network: already have a function called %s",
155 func.first.c_str())));
156 return;
157 }
158
159 uint64_t topologyId = 0;
160 HabanaFunction *habanaFunction = static_cast<HabanaFunction *>(func.second);
161
162 // Load the recipe (created during compilation) and store the resultant
163 // topology ID. This is the reference that will be used lated to "activate"
164 // this function and make it executable.
165 synStatus status = synFail;
166
167 {
168 std::lock_guard<std::mutex> lock(synapseMtx_);
169 status = synLoadRecipe(deviceId_, habanaFunction->getRecipeName().c_str(),
170 &topologyId);
171 }
172
173 if (auto err = chk_make_err(status)) {
174 LOG(ERROR) << "Unable to load recipe " << habanaFunction->getRecipeName()
175 << " for function " << func.first << ".";
176 // TODO: Unload functions that were loaded successfully.
177 lk.unlock();
178 readyCB(module, std::move(err));
179 return;
180 }
181
182 // Insert the function into functions_.
183 bool inserted = false;
184 std::tie(std::ignore, inserted) = functions_.insert(std::make_pair(
185 func.first,
186 HabanaFunctionMeta{topologyId, habanaFunction,
187 glow::make_unique<HabanaIOBufferPool>(
188 deviceId_, habanaFunction->getInputs(),
189 habanaFunction->getOutputs())}));
190
191 if (!inserted) {
192 // TODO: Unload functions that were loaded successfully.
193 lk.unlock();
194 readyCB(module, MAKE_ERR(strFormat(
195 "Unable to add function %s to HabanaDeviceManager",
196 func.first.c_str())));
197 return;
198 }
199
200 // Optimistically activate the topology if nothing else is loaded.
201 cv_.wait(lk, [this] { return inflightRequests_ == 0; });
202 if (auto err = chk_make_err(synActivateTopology(deviceId_, topologyId))) {
203 lk.unlock();
204 readyCB(module, std::move(err));
205 return;
206 }
207 activeTopo_ = topologyId;
208 }
209
210 lk.unlock();
211
212 // Update memory information after loading all the functions.
213 if (auto err = updateMemoryUsage()) {
214 readyCB(module, std::move(err));
215 return;
216 }
217
218 readyCB(module, Error::success());
219 }
220
evictNetwork(std::string functionName,EvictFunctionCBTy evictCB)221 void HabanaDeviceManager::evictNetwork(std::string functionName,
222 EvictFunctionCBTy evictCB) {
223 DCHECK(evictCB != nullptr);
224
225 std::unique_lock<std::mutex> lk(instanceMtx_);
226
227 // Check if a network with the given name exists on the device.
228 if (functions_.count(functionName) == 0) {
229 lk.unlock();
230 evictCB(functionName,
231 MAKE_ERR(strFormat(
232 "Failed to evict network: function called %s was not added",
233 functionName.c_str())));
234 return;
235 }
236
237 // Unload the topology ID corresponding to the function.
238 synStatus status = synFail;
239 uint64_t topologyId = functions_[functionName].topologyId;
240
241 {
242 std::lock_guard<std::mutex> lock(synapseMtx_);
243 status = synUnloadTopology(deviceId_, topologyId);
244 if (topologyId == activeTopo_) {
245 activeTopo_ = INVALID_TOPOLOGY;
246 }
247 }
248
249 if (auto err = chk_make_err(status)) {
250 LOG(ERROR) << "Unable to unload function " << functionName;
251 lk.unlock();
252 evictCB(functionName, std::move(err));
253 return;
254 }
255
256 // Erase the function from the functions_ map.
257 auto numErased = functions_.erase(functionName);
258
259 if (numErased == 0) {
260 lk.unlock();
261 evictCB(functionName,
262 MAKE_ERR(strFormat(
263 "Unable to evict function %s from HabanaDeviceManager",
264 functionName.c_str())));
265 return;
266 }
267
268 lk.unlock();
269
270 // Update memory information after evicting the function.
271 if (auto err = updateMemoryUsage()) {
272 evictCB(functionName, std::move(err));
273 return;
274 }
275
276 evictCB(functionName, Error::success());
277 }
278
runFunctionImpl(RunIdentifierTy runId,std::string functionName,std::unique_ptr<ExecutionContext> ctx,runtime::ResultCBTy resultCB)279 void HabanaDeviceManager::runFunctionImpl(RunIdentifierTy runId,
280 std::string functionName,
281 std::unique_ptr<ExecutionContext> ctx,
282 runtime::ResultCBTy resultCB) {
283 DCHECK(resultCB != nullptr);
284
285 TRACE_EVENT_SCOPE_NAMED(ctx->getTraceContext(), TraceLevel::RUNTIME,
286 "HabanaDM::runnerThread", trEvent);
287
288 /// Habana DeviceManager doesn't support Device Resident Tensors.
289 ctx->getPlaceholderBindings()->ensureOnHost();
290
291 if (ctx->getTraceContext()) {
292 ctx->getTraceContext()->setThreadName(
293 llvm::formatv("Habana {0} (enqueue)", deviceId_).str());
294 }
295 // Try to find the function with the given name in functions_.
296 uint64_t topologyId;
297 HabanaFunction *function;
298 HabanaIOBufferPool *ioBufferPool;
299 {
300 std::lock_guard<std::mutex> lock(instanceMtx_);
301 auto it = functions_.find(functionName);
302 if (it == functions_.end()) {
303 resultCB(runId,
304 MAKE_ERR(strFormat(
305 "Failed to run function: function called %s was not added",
306 functionName.c_str())),
307 std::move(ctx));
308 return;
309 }
310
311 topologyId = (it->second).topologyId;
312 function = (it->second).function;
313 ioBufferPool = (it->second).ioBufferPool.get();
314 }
315
316 // If we need to switch topos, wait to drain the queue.
317 {
318 std::unique_lock<std::mutex> lock(instanceMtx_);
319 if (topologyId != activeTopo_) {
320 // FIXME: This can starve inactive topos.
321 cv_.wait(lock, [this] { return inflightRequests_ == 0; });
322 const auto activateTopoRes = synActivateTopology(deviceId_, topologyId);
323 if (auto err = chk_make_err(activateTopoRes)) {
324 LOG(ERROR) << "synActivateTopology failed with status "
325 << activateTopoRes;
326 trEvent.addArg(
327 "error", llvm::formatv("synActivateTopology failed with status {0}",
328 activateTopoRes)
329 .str());
330 TRACE_EVENT_SCOPE_END_NAMED(trEvent);
331 resultCB(runId, std::move(err), std::move(ctx));
332 return;
333 }
334 activeTopo_ = topologyId;
335 }
336 inflightRequests_++;
337 }
338
339 // Execute the function.
340 auto deviceBindings =
341 glow::make_unique<HabanaBindings>(deviceId_, topologyId);
342 deviceBindings->setIOBuffer(ioBufferPool->get());
343 ctx->setDeviceBindings(std::move(deviceBindings));
344
345 auto executeErr = function->execute(ctx.get());
346 if (executeErr) {
347 trEvent.addArg("error", "execute() failed");
348 TRACE_EVENT_SCOPE_END_NAMED(trEvent);
349 resultCB(runId, std::move(executeErr), std::move(ctx));
350 return;
351 }
352
353 // Give the handle to the wait thread pool to wait on and call the callback
354 // for.
355 waitPool_->submit([this, runId, function, ioBufferPool,
356 functionName = std::move(functionName),
357 ctx = std::move(ctx),
358 resultCB = std::move(resultCB)]() mutable {
359 DCHECK(resultCB != nullptr);
360
361 TRACE_EVENT_SCOPE(ctx->getTraceContext(), TraceLevel::RUNTIME,
362 "HabanaDM::waiterThread");
363 if (ctx->getTraceContext()) {
364 ctx->getTraceContext()->setThreadName(
365 llvm::formatv("Habana {0} (waiter)", deviceId_).str());
366 }
367
368 TRACE_EVENT_BEGIN(ctx->getTraceContext(), TraceLevel::RUNTIME, "wait");
369 auto &habanaHandle =
370 static_cast<HabanaBindings *>(ctx->getDeviceBindings())->getHandle();
371 bool ok = habanaHandle.wait();
372 std::unique_ptr<HabanaIOBuffer> ioBuffer =
373 static_cast<HabanaBindings *>(ctx->getDeviceBindings())->getIOBuffer();
374 TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::RUNTIME, "wait");
375
376 // Notify anything waiting for a topo switch.
377 {
378 std::lock_guard<std::mutex> lock(this->instanceMtx_);
379 inflightRequests_--;
380 }
381 cv_.notify_one();
382
383 if (!ok) {
384 // Return the IO buffer to the IO buffer pool.
385 ioBufferPool->put(std::move(ioBuffer));
386
387 resultCB(runId,
388 MAKE_ERR(strFormat("Failed to execute function %s",
389 functionName.c_str())),
390 std::move(ctx));
391 } else {
392 // Copy the execution outputs from the designated IO buffer back to the
393 // PlaceholderBindings inside ctx.
394 TRACE_EVENT_SCOPE_NAMED(ctx->getTraceContext(), TraceLevel::RUNTIME,
395 "copyOutputs", coEvent);
396 auto bindings = ctx->getPlaceholderBindings();
397 size_t tensors{0}, bytes{0};
398 for (const auto &ph : function->getOutputs()) {
399 auto *tensor = bindings->get(ph);
400 if (!tensor) {
401 tensor =
402 bindings->get(bindings->getPlaceholderByNameSlow(ph->getName()));
403 }
404 tensors++;
405
406 if (auto ioBufferDataOrErr = ioBuffer->get(ph)) {
407 memcpy(tensor->getUnsafePtr(), *ioBufferDataOrErr,
408 ph->getType()->getSizeInBytes());
409 bytes += ph->getType()->getSizeInBytes();
410 } else {
411 // Return the IO buffer to the IO buffer pool.
412 ioBufferPool->put(std::move(ioBuffer));
413 coEvent.addArg("tensors", std::to_string(tensors));
414 coEvent.addArg("bytes", std::to_string(bytes));
415 coEvent.addArg("missingTensor", ph->getName().str());
416 TRACE_EVENT_SCOPE_END_NAMED(coEvent);
417 resultCB(runId, ioBufferDataOrErr.takeError(), std::move(ctx));
418 return;
419 }
420 }
421 coEvent.addArg("tensors", std::to_string(tensors));
422 coEvent.addArg("bytes", std::to_string(bytes));
423 TRACE_EVENT_SCOPE_END_NAMED(coEvent);
424
425 // Return the IO buffer to the IO buffer pool.
426 ioBufferPool->put(std::move(ioBuffer));
427 resultCB(runId, Error::success(), std::move(ctx));
428 }
429 });
430 }
431
432 RunIdentifierTy
runFunction(std::string functionName,std::unique_ptr<ExecutionContext> ctx,runtime::ResultCBTy resultCB)433 HabanaDeviceManager::runFunction(std::string functionName,
434 std::unique_ptr<ExecutionContext> ctx,
435 runtime::ResultCBTy resultCB) {
436 DCHECK(resultCB != nullptr);
437
438 RunIdentifierTy runId = runIdentifier_++;
439 runPool_->submit([this, runId, functionName = std::move(functionName),
440 ctx = std::move(ctx),
441 resultCB = std::move(resultCB)]() mutable {
442 runFunctionImpl(runId, std::move(functionName), std::move(ctx),
443 std::move(resultCB));
444 });
445 return runId;
446 }
447
stop(bool block)448 Error HabanaDeviceManager::stop(bool block) {
449 runPool_->stop(block);
450 waitPool_->stop(block);
451 return Error::success();
452 }
453
getMaximumMemory() const454 uint64_t HabanaDeviceManager::getMaximumMemory() const { return totalMemory_; }
455
getAvailableMemory() const456 uint64_t HabanaDeviceManager::getAvailableMemory() const { return freeMemory_; }
457
isMemoryAvailable(uint64_t estimate) const458 bool HabanaDeviceManager::isMemoryAvailable(uint64_t estimate) const {
459 return estimate <= freeMemory_;
460 }
461
getDeviceInfo() const462 DeviceInfo HabanaDeviceManager::getDeviceInfo() const {
463 DeviceInfo info = DeviceInfo();
464 info.sramCapacity = 50 * 1024 * 1024;
465 info.peakCompute = 0.45 * 1024 * 1024 * 1024 * 1024;
466 info.peakDramBw = 30.0 * 1024 * 1024 * 1024;
467 info.peakSramBw = 1024.0 * 1024 * 1024 * 1024;
468 info.peakPCIeBw = 16.0 * 1024 * 1024 * 1024;
469 return info;
470 }
471