1 /**
2  * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef GLOW_BACKENDS_HABANA_HABANADEVICEMANAGER_H
17 #define GLOW_BACKENDS_HABANA_HABANADEVICEMANAGER_H
18 
19 #include "HabanaFunction.h"
20 #include "HabanaUtils.h"
21 
22 #include "glow/Backends/DeviceManager.h"
23 #include "glow/Runtime/RuntimeTypes.h"
24 #include "glow/Support/ThreadPool.h"
25 
26 #include "synapse.h"
27 
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <limits>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
32 
33 namespace glow {
34 namespace runtime {
35 
36 /// This class implements the DeviceManager interface for
37 /// Habana devices.
38 class HabanaDeviceManager : public DeviceManager {
39   using DeviceId = uint32_t;
40   using TopologyId = uint64_t;
41 
42   static constexpr auto INVALID_DEVICE = std::numeric_limits<DeviceId>::max();
43   static constexpr auto INVALID_TOPOLOGY =
44       std::numeric_limits<TopologyId>::max();
45 
46   /// String constant for logging number of in-use devices.
47   static constexpr const char *kDevicesUsedHabana = "glow.devices_used.habana";
48 
49   /// The ID of the device managed by this instance.
50   DeviceId deviceId_{INVALID_DEVICE};
51   /// The available memory on the device.
52   uint64_t freeMemory_{0};
53   /// The total memory on the device.
54   uint64_t totalMemory_{0};
55   /// Mutex for accessing instance members (activeTopologyId_, functions_,
56   /// activeEnqueues_).
57   std::mutex instanceMtx_;
58 
59   /// Thread pool for executing functions.
60   std::unique_ptr<ThreadPool> runPool_;
61   /// Thread pool for waiting on the results of executing functions.
62   std::unique_ptr<ThreadPool> waitPool_;
63   /// The default number of workers in run pool (overridable).
64   constexpr static unsigned kNumRunners = 1;
65   /// The default number of workers in wait pool (overridable).
66   constexpr static unsigned kNumWaiters = 1;
67   /// The number of workers in run pool.
68   unsigned numRunners_{kNumRunners};
69   /// The number of workers in wait pool.
70   unsigned numWaiters_{kNumWaiters};
71 
72   /// Track active topology on this device.
73   TopologyId activeTopo_{INVALID_TOPOLOGY};
74   /// Number of requests in flight.  Used to block topo switching.
75   unsigned inflightRequests_{0};
76   /// Condition variable for signaling queue drain.
77   std::condition_variable cv_;
78 
79   /// This struct wraps a topology ID with its corresponding HabanaFunction so
80   /// that only one map is needed to keep track of both.
81   struct HabanaFunctionMeta {
82     /// The topology ID of the function. This is returned by the Synapse API
83     /// after loading a recipe.
84     uint64_t topologyId;
85     /// The HabanaFunction corresponding to topologyId. This is needed in order
86     /// to call HabanaFunction::executeOnDevice after loading and activating a
87     /// topology.
88     HabanaFunction *function;
89     /// A pool of IO buffers to use during execution.
90     std::unique_ptr<HabanaIOBufferPool> ioBufferPool;
91   };
92 
93   /// A map from function name -> HabanaFunctionMeta. Its keys are the
94   /// names of all functions added to the device manager.
95   std::unordered_map<std::string, HabanaFunctionMeta> functions_;
96 
97   /// The total number of active Habana devices among all HabanaDeviceManager
98   /// instances. This is used to determine which instance should
99   /// initialize/destroy the Synapse API in the constructor/destructor.
100   static unsigned numActiveDevices_;
101   /// Mutex for guarding access to Synapse API.
102   static std::mutex synapseMtx_;
103   /// Identifier for next run.
104   static std::atomic<RunIdentifierTy> runIdentifier_;
105 
106   /// Helper method for running a function. runFunction submits a lambda that
107   /// calls this to runPool_ so that it can return immediately without taking up
108   /// the calling thread for too long.
109   void runFunctionImpl(RunIdentifierTy runId, std::string functionName,
110                        std::unique_ptr<ExecutionContext> ctx,
111                        runtime::ResultCBTy resultCB);
112 
113   /// Update the totalMemory_ and freeMemory_ counts for the device based once
114   /// per-function memory estimates. This function is not thread safe and should
115   /// only be invoked while holding synapseLock.
116   Error updateMemoryUsage();
117 
118 public:
119   /// Constructor.
120   HabanaDeviceManager(const DeviceConfig &config,
121                       unsigned numRunners = kNumRunners,
122                       unsigned numWaiters = kNumWaiters);
123 
124   /// Destructor.
125   virtual ~HabanaDeviceManager();
126 
127   /// See DeviceManager and QueueBackedDeviceManager for the documentation of
128   /// the interface below.
129   Error init() override;
130 
131   void addNetwork(const Module *module, FunctionMapTy functions,
132                   ReadyCBTy readyCB) override;
133 
134   void evictNetwork(std::string functionName,
135                     EvictFunctionCBTy evictCB) override;
136 
137   RunIdentifierTy runFunction(std::string functionName,
138                               std::unique_ptr<ExecutionContext> ctx,
139                               runtime::ResultCBTy resultCB) override;
140 
141   Error stop(bool block) override;
142 
143   uint64_t getMaximumMemory() const override;
144   uint64_t getAvailableMemory() const override;
145   bool isMemoryAvailable(uint64_t estimate) const override;
146 
147   /// Returns the DeviceInfo for this device containing peak limits for
148   /// compute and bandwidths (used in partitioning).
149   DeviceInfo getDeviceInfo() const override;
150 };
151 
/// Factory function for creating a HabanaDeviceManager from \p config.
/// \returns the new device manager as a raw DeviceManager pointer.
/// NOTE(review): presumably the caller takes ownership of the returned
/// object -- confirm against the definition and its call sites.
DeviceManager *createHabanaDeviceManager(const DeviceConfig &config);
154 
155 } // namespace runtime
156 } // namespace glow
157 
#endif // GLOW_BACKENDS_HABANA_HABANADEVICEMANAGER_H
159