1 /** 2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef GLOW_BACKENDS_HABANA_HABANADEVICEMANAGER_H 17 #define GLOW_BACKENDS_HABANA_HABANADEVICEMANAGER_H 18 19 #include "HabanaFunction.h" 20 #include "HabanaUtils.h" 21 22 #include "glow/Backends/DeviceManager.h" 23 #include "glow/Runtime/RuntimeTypes.h" 24 #include "glow/Support/ThreadPool.h" 25 26 #include "synapse.h" 27 28 #include <atomic> 29 #include <mutex> 30 #include <string> 31 #include <unordered_map> 32 33 namespace glow { 34 namespace runtime { 35 36 /// This class implements the DeviceManager interface for 37 /// Habana devices. 38 class HabanaDeviceManager : public DeviceManager { 39 using DeviceId = uint32_t; 40 using TopologyId = uint64_t; 41 42 static constexpr auto INVALID_DEVICE = std::numeric_limits<DeviceId>::max(); 43 static constexpr auto INVALID_TOPOLOGY = 44 std::numeric_limits<TopologyId>::max(); 45 46 /// String constant for logging number of in-use devices. 47 static constexpr const char *kDevicesUsedHabana = "glow.devices_used.habana"; 48 49 /// The ID of the device managed by this instance. 50 DeviceId deviceId_{INVALID_DEVICE}; 51 /// The available memory on the device. 52 uint64_t freeMemory_{0}; 53 /// The total memory on the device. 54 uint64_t totalMemory_{0}; 55 /// Mutex for accessing instance members (activeTopologyId_, functions_, 56 /// activeEnqueues_). 57 std::mutex instanceMtx_; 58 59 /// Thread pool for executing functions. 60 std::unique_ptr<ThreadPool> runPool_; 61 /// Thread pool for waiting on the results of executing functions. 62 std::unique_ptr<ThreadPool> waitPool_; 63 /// The default number of workers in run pool (overridable). 64 constexpr static unsigned kNumRunners = 1; 65 /// The default number of workers in wait pool (overridable). 66 constexpr static unsigned kNumWaiters = 1; 67 /// The number of workers in run pool. 68 unsigned numRunners_{kNumRunners}; 69 /// The number of workers in wait pool. 70 unsigned numWaiters_{kNumWaiters}; 71 72 /// Track active topology on this device. 73 TopologyId activeTopo_{INVALID_TOPOLOGY}; 74 /// Number of requests in flight. Used to block topo switching. 75 unsigned inflightRequests_{0}; 76 /// Condition variable for signaling queue drain. 77 std::condition_variable cv_; 78 79 /// This struct wraps a topology ID with its corresponding HabanaFunction so 80 /// that only one map is needed to keep track of both. 81 struct HabanaFunctionMeta { 82 /// The topology ID of the function. This is returned by the Synapse API 83 /// after loading a recipe. 84 uint64_t topologyId; 85 /// The HabanaFunction corresponding to topologyId. This is needed in order 86 /// to call HabanaFunction::executeOnDevice after loading and activating a 87 /// topology. 88 HabanaFunction *function; 89 /// A pool of IO buffers to use during execution. 90 std::unique_ptr<HabanaIOBufferPool> ioBufferPool; 91 }; 92 93 /// A map from function name -> HabanaFunctionMeta. Its keys are the 94 /// names of all functions added to the device manager. 95 std::unordered_map<std::string, HabanaFunctionMeta> functions_; 96 97 /// The total number of active Habana devices among all HabanaDeviceManager 98 /// instances. This is used to determine which instance should 99 /// initialize/destroy the Synapse API in the constructor/destructor. 100 static unsigned numActiveDevices_; 101 /// Mutex for guarding access to Synapse API. 102 static std::mutex synapseMtx_; 103 /// Identifier for next run. 104 static std::atomic<RunIdentifierTy> runIdentifier_; 105 106 /// Helper method for running a function. runFunction submits a lambda that 107 /// calls this to runPool_ so that it can return immediately without taking up 108 /// the calling thread for too long. 109 void runFunctionImpl(RunIdentifierTy runId, std::string functionName, 110 std::unique_ptr<ExecutionContext> ctx, 111 runtime::ResultCBTy resultCB); 112 113 /// Update the totalMemory_ and freeMemory_ counts for the device based once 114 /// per-function memory estimates. This function is not thread safe and should 115 /// only be invoked while holding synapseLock. 116 Error updateMemoryUsage(); 117 118 public: 119 /// Constructor. 120 HabanaDeviceManager(const DeviceConfig &config, 121 unsigned numRunners = kNumRunners, 122 unsigned numWaiters = kNumWaiters); 123 124 /// Destructor. 125 virtual ~HabanaDeviceManager(); 126 127 /// See DeviceManager and QueueBackedDeviceManager for the documentation of 128 /// the interface below. 129 Error init() override; 130 131 void addNetwork(const Module *module, FunctionMapTy functions, 132 ReadyCBTy readyCB) override; 133 134 void evictNetwork(std::string functionName, 135 EvictFunctionCBTy evictCB) override; 136 137 RunIdentifierTy runFunction(std::string functionName, 138 std::unique_ptr<ExecutionContext> ctx, 139 runtime::ResultCBTy resultCB) override; 140 141 Error stop(bool block) override; 142 143 uint64_t getMaximumMemory() const override; 144 uint64_t getAvailableMemory() const override; 145 bool isMemoryAvailable(uint64_t estimate) const override; 146 147 /// Returns the DeviceInfo for this device containing peak limits for 148 /// compute and bandwidths (used in partitioning). 149 DeviceInfo getDeviceInfo() const override; 150 }; 151 152 /// Factory function for creating a HabanaDeviceManager. 153 DeviceManager *createHabanaDeviceManager(const DeviceConfig &config); 154 155 } // namespace runtime 156 } // namespace glow 157 158 #endif // GLOW_BACKENDS_HABANADEVICEMANAGER_H 159