1 /*
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "HostManagerOnnxifi.h"
18 #include "glow/Runtime/DeferredWeightLoader.h"
19 #include "glow/Runtime/RequestData.h"
20
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/FileSystem.h"
23
24 namespace glow {
25 extern bool GlowDumpCompilationLog;
26 namespace onnxifi {
27
28 extern bool GlowSaveOnnxifiModel;
29
// Global configuration knobs for the ONNXIFI host manager. Most are backed by
// the llvm::cl command-line options registered later in this file (via
// llvm::cl::location) and are read when networks are compiled or run.

// Number of devices to create; 0 means use every discovered device.
int32_t GlowNumDevices = 0;
// SparseNN partitioning scheme tuning knobs (cards, per-card SLS budget,
// core counts); forwarded into cctx.optimizationOpts in addNetwork().
int32_t GlowSparseNNPartitioningSchemeNumCards = 1;
int64_t GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard = 0;
int32_t GlowSparseNNPartitioningSchemeNumCoresSLS = 1;
int32_t GlowSparseNNPartitioningSchemeNumCoresOther = 1;
// Tracing: when enabled, merged traces are dumped to a temp file after every
// GlowNumDebugTracesPerDump runs (see HostManagerGraph::run).
bool GlowDumpDebugTraces = false;
int32_t GlowNumDebugTracesPerDump = 100;
// Try to use all available devices on the host.
bool GlowSaturateHost = false;
// FP16 precision-conversion controls mapped onto PrecisionConfiguration.
bool GlowFP16 = false;
bool GlowFP16Placeholders = true;
bool GlowFP16Constants = true;
// Dump the final graph after compilation.
bool GlowDumpGraph = false;
// DAG optimizer controls (placement/parallelization tagging algorithms and
// parallel chunk count); forwarded in addNetwork() when enabled.
bool GlowUseDAGOptimizer = false;
std::string GlowDAGOptimizerPlacementTaggingAlgorithm = "None";
std::string GlowDAGOptimizerParallelizationTaggingAlgorithm = "None";
int32_t GlowDAGOptimizerNumParallelChunks = 1;
// Additional FP16 precision knobs (fused scale/offset, SLS accumulation,
// clipping behavior).
bool GlowFusedScaleOffsetFP16 = false;
bool GlowForceSLSAccumFP16 = false;
bool GlowClipFP16 = false;
bool GlowClipFP16SkipInputs = true;
// SparseNN partitioning feature flags.
bool GlowUseSparseNNPartitioningScheme = false;
bool GlowSparseNNPartitioningAddSLSConcats = false;
bool GlowSparseNNPartitioningBalancePerfModel = false;
bool GlowSparseNNPartitioningPairLNWithSLS = false;
// HostManager request/queue/executor limits (see createHostManager()).
size_t GlowMaxActiveRequests = 48;
size_t GlowMaxActiveRequestsPerInstance = 48;
size_t GlowMaxQueueSize = 100;
size_t GlowExecutorThreads = 10;
// Serialize the compiled DAG after optimization and partitioning.
bool GlowSaveOnnxifiDAG = false;
// Delay constant modification until after optimizations (records constant
// folding for DAG serialization).
bool GlowDelayAndRecordConstantModification = false;
60
// Command-line option registrations. Each opt stores directly into the
// corresponding global flag defined earlier in this file via
// llvm::cl::location, so the flags can also be set programmatically.
static llvm::cl::opt<int32_t, true>
    GlowNumDevicesOpt("glow-num-devices",
                      llvm::cl::desc("Number of devices for Glow backend"),
                      llvm::cl::location(GlowNumDevices));

static llvm::cl::opt<bool, true>
    GlowDumpDebugTracesOpt("glow-dump-debug-traces",
                           llvm::cl::desc("Dump a trace of each run to /tmp"),
                           llvm::cl::location(GlowDumpDebugTraces));

static llvm::cl::opt<bool, true> GlowSaturateHostOpt(
    "glow-saturate-host",
    llvm::cl::desc("Try to use all available devices on the host"),
    llvm::cl::location(GlowSaturateHost));

static llvm::cl::opt<int32_t, true> GlowSparseNNPartitioningSchemeNumCardsOpt(
    "glow_snn_partitioning_num_cards",
    llvm::cl::desc("Number of cards for SparseNNPartitioningScheme"),
    llvm::cl::location(GlowSparseNNPartitioningSchemeNumCards));

static llvm::cl::opt<int64_t, true>
    GlowSparseNNPartitioningSchemeSLSTableKBytesPerCardOpt(
        "glow_snn_partitioning_kbytes_per_card",
        llvm::cl::desc("SLS KBytes per card for SparseNNPartitioningScheme"),
        llvm::cl::location(
            GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard));

static llvm::cl::opt<int32_t, true>
    GlowSparseNNPartitioningSchemeNumCoresSLSOpt(
        "glow_snn_partitioning_num_cores_sls",
        llvm::cl::desc(
            "Number of cores for SLS for SparseNNPartitioningScheme"),
        llvm::cl::location(GlowSparseNNPartitioningSchemeNumCoresSLS));

static llvm::cl::opt<int32_t, true>
    GlowSparseNNPartitioningSchemeNumCoresOtherOpt(
        "glow_snn_partitioning_num_cores_other",
        llvm::cl::desc(
            "Number of cores for other for SparseNNPartitioningScheme"),
        llvm::cl::location(GlowSparseNNPartitioningSchemeNumCoresOther));

static llvm::cl::opt<bool, true> GlowUseSparseNNPartitioningSchemeOpt(
    "glow_use_sparsenn_partitioning_scheme",
    llvm::cl::desc("Whether to use SparseNNPartitioningScheme"),
    llvm::cl::location(GlowUseSparseNNPartitioningScheme));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningAddSLSConcatsOpt(
    "glow_sparsenn_partitioning_add_sls_concats",
    llvm::cl::desc("Add extra concats inside of SLS partitions for more "
                   "efficient inter-partitition transfers"),
    llvm::cl::location(GlowSparseNNPartitioningAddSLSConcats));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningBalancePerfModelOpt(
    "glow_sparsenn_partitioning_balance_perf_model",
    llvm::cl::desc("Balance SLS tables across cards using a perf model"),
    llvm::cl::location(GlowSparseNNPartitioningBalancePerfModel));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningPairLNWithSLSOpt(
    "glow_sparsenn_partitioning_pair_ln_with_sls",
    llvm::cl::desc("Place layer normalization nodes immediately following SLS "
                   "into SLS partition"),
    llvm::cl::location(GlowSparseNNPartitioningPairLNWithSLS));
123
124 std::unique_ptr<runtime::HostManager>
createHostManager(llvm::StringRef backendName)125 HostManagerBackend::createHostManager(llvm::StringRef backendName) {
126 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
127 // If GlowNumDevices is set specify that many devices, otherwise use all
128 // discovered devices.
129 if (GlowNumDevices) {
130 for (int i = 0; i < GlowNumDevices; i++) {
131 auto config = glow::make_unique<runtime::DeviceConfig>(backendName);
132 config->deviceID = i;
133 configs.push_back(std::move(config));
134 }
135 } else {
136 configs = runtime::DeviceManager::generateDeviceConfigs(backendName);
137 }
138
139 runtime::HostConfig hostConfig;
140 hostConfig.maxActiveRequests = GlowMaxActiveRequests;
141 hostConfig.maxQueueSize = GlowMaxQueueSize;
142 hostConfig.executorThreads = GlowExecutorThreads;
143
144 return glow::make_unique<runtime::HostManager>(std::move(configs),
145 hostConfig);
146 }
147
runNetwork(const Graph * graph,std::unique_ptr<ExecutionContext> context,runtime::ResultCBTy callback,uint64_t priority)148 void HostManagerBackend::runNetwork(const Graph *graph,
149 std::unique_ptr<ExecutionContext> context,
150 runtime::ResultCBTy callback,
151 uint64_t priority) {
152 DCHECK(callback != nullptr);
153
154 auto hostManagerGraph = static_cast<const HostManagerGraph *>(graph);
155 hostManager_->runNetwork(hostManagerGraph->getName(), std::move(context),
156 std::move(callback), priority);
157 }
158
addNetwork(std::unique_ptr<Module> module,void * deferredBlobReader,runtime::PrePartitionedConfig * PPC)159 onnxStatus HostManagerBackend::addNetwork(std::unique_ptr<Module> module,
160 void *deferredBlobReader,
161 runtime::PrePartitionedConfig *PPC) {
162 CompilationContext cctx;
163 PrecisionConfiguration &precConfig = cctx.precisionConfig;
164 cctx.prepartitionedConfig = PPC;
165 cctx.maxActiveRequestsPerInstance = GlowMaxActiveRequestsPerInstance;
166
167 if (deferredBlobReader) {
168 // Initialize loader and set field in cctx.
169 auto loader = runtime::DeferredLoader()->getLoader();
170 if (!loader) {
171 LOG(INFO) << "Blob reader provided but no loader registered!";
172 return ONNXIFI_STATUS_INTERNAL_ERROR;
173 }
174
175 // Generate a map of type date for all static placeholders.
176 std::map<std::string, Type> staticPlaceholderTypes;
177 for (auto PH : module->getPlaceholders()) {
178 if (PH->isStatic()) {
179 staticPlaceholderTypes[std::string(PH->getName())] = *PH->getType();
180 }
181 }
182 loader->setTypeInfo(std::move(staticPlaceholderTypes));
183 auto err = loader->setSrc(deferredBlobReader);
184 if (ERR_TO_BOOL(std::move(err))) {
185 return ONNXIFI_STATUS_INTERNAL_ERROR;
186 }
187
188 cctx.deferredWeightLoader = loader;
189 // Signal that we want to fold convertTo and Quantize into static
190 // Placeholders.
191 cctx.optimizationOpts.foldStaticPlaceholderConversions = true;
192 }
193
194 if (GlowFP16) {
195 precConfig.convertToFP16 = GlowFP16;
196 LOG(INFO) << "Conversion to fp16 enabled";
197 }
198 if (GlowFP16Placeholders) {
199 precConfig.convertPlaceholdersToFP16 = GlowFP16Placeholders;
200 LOG(INFO) << "Conversion of Placeholders to fp16 enabled";
201 }
202 if (GlowFP16Constants) {
203 precConfig.convertConstantsToFP16 = GlowFP16Constants;
204 LOG(INFO) << "Conversion of Constants to fp16 enabled";
205 }
206 if (GlowFusedScaleOffsetFP16) {
207 precConfig.convertFusedToFP16 = GlowFusedScaleOffsetFP16;
208 LOG(INFO) << "Conversion of fused scales/offsets to fp16 enabled";
209 }
210 if (GlowClipFP16) {
211 precConfig.clipFP16 = GlowClipFP16;
212 LOG(INFO) << "Clipping to fp16 enabled";
213 }
214 if (GlowClipFP16SkipInputs) {
215 precConfig.clipFP16SkipInputs = GlowClipFP16SkipInputs;
216 LOG(INFO) << "Skipping clipping for fp16 Node inputs fp16";
217 }
218 if (GlowForceSLSAccumFP16) {
219 precConfig.forceFP16AccumSLS = GlowForceSLSAccumFP16;
220 LOG(INFO) << "Forcing all SLS/SLWS ops to use FP16 accumulation enabled";
221 }
222 if (GlowDumpCompilationLog) {
223 cctx.compilationLogPrefix = "glow-onnxifi";
224 }
225 if (GlowUseSparseNNPartitioningScheme) {
226 cctx.optimizationOpts.useSparseNNPartitioningScheme = true;
227 cctx.optimizationOpts.sparseNNPartitioningAddSLSConcats =
228 GlowSparseNNPartitioningAddSLSConcats;
229 cctx.optimizationOpts.sparseNNPartitioningBalancePerfModel =
230 GlowSparseNNPartitioningBalancePerfModel;
231 cctx.optimizationOpts.sparseNNPartitioningPairLNWithSLS =
232 GlowSparseNNPartitioningPairLNWithSLS;
233 cctx.optimizationOpts.sparseNNPartitioningSchemeNumCards =
234 GlowSparseNNPartitioningSchemeNumCards;
235 cctx.optimizationOpts.sparseNNPartitioningSchemeSLSTableKBytesPerCard =
236 GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard;
237 cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresSLS =
238 GlowSparseNNPartitioningSchemeNumCoresSLS;
239 cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresOther =
240 GlowSparseNNPartitioningSchemeNumCoresOther;
241 }
242 if (GlowDumpGraph) {
243 cctx.dumpFinalGraph = true;
244 }
245 if (GlowUseDAGOptimizer) {
246 LOG(INFO) << "Will call the DAG optimizer.";
247 cctx.callDAGOptimizer = true;
248 cctx.optimizationOpts.DAGOptimizerPlacementTaggingAlgorithm =
249 GlowDAGOptimizerPlacementTaggingAlgorithm;
250 cctx.optimizationOpts.DAGOptimizerParallelizationTaggingAlgorithm =
251 GlowDAGOptimizerParallelizationTaggingAlgorithm;
252 cctx.optimizationOpts.DAGOptimizerNumParallelChunks =
253 GlowDAGOptimizerNumParallelChunks;
254 }
255 if (GlowSaveOnnxifiDAG) {
256 LOG(INFO) << "Serializing DAG after optimization and partitioning.";
257 cctx.serializeCompiledDAG = true;
258 }
259 if (GlowDelayAndRecordConstantModification) {
260 LOG(INFO) << "Delaying constant modification until after optimizations, "
261 "including recording constant folding for DAG serialization.";
262 cctx.optimizationOpts.delayAndRecordConstantModification = true;
263 }
264 cctx.saturateHost = GlowSaturateHost;
265
266 auto err = hostManager_->addNetwork(std::move(module), cctx);
267
268 if (ERR_TO_BOOL(std::move(err))) {
269 return ONNXIFI_STATUS_INTERNAL_ERROR;
270 }
271
272 return ONNXIFI_STATUS_SUCCESS;
273 }
274
removeNetwork(const Graph * graph)275 onnxStatus HostManagerBackend::removeNetwork(const Graph *graph) {
276 auto hostManagerGraph = static_cast<const HostManagerGraph *>(graph);
277 auto error = hostManager_->removeNetwork(hostManagerGraph->getName());
278
279 if (ERR_TO_BOOL(std::move(error))) {
280 return ONNXIFI_STATUS_INTERNAL_ERROR;
281 }
282
283 return ONNXIFI_STATUS_SUCCESS;
284 }
285
286 onnxStatus
initGraph(const void * onnxModel,size_t onnxModelSize,uint32_t weightCount,const onnxTensorDescriptorV1 * weightDescriptors,uint32_t maxSeqLength,void * deferedBlobReader)287 HostManagerGraph::initGraph(const void *onnxModel, size_t onnxModelSize,
288 uint32_t weightCount,
289 const onnxTensorDescriptorV1 *weightDescriptors,
290 uint32_t maxSeqLength, void *deferedBlobReader) {
291
292 netName_ = strFormat("onnxifi_function_%lu", makeUniqueGraphId());
293
294 std::unique_ptr<Module> module = glow::make_unique<Module>();
295 runtime::PrePartitionedConfig PPC;
296
297 std::unique_ptr<ONNXIFIModelLoader> loader;
298 auto loaderOrErr = ONNXIFIModelLoader::parse(
299 onnxModel, onnxModelSize, weightCount, weightDescriptors, *module,
300 netName_, &PPC, true /*loadInputsAsPlaceholdersForOnnx*/,
301 backendPtr_->getUseOnnx());
302 if (loaderOrErr) {
303 loader = std::move(*loaderOrErr);
304 } else {
305 LOG(ERROR) << "Error when loading model: "
306 << ERR_TO_STRING(loaderOrErr.takeError());
307 return ONNXIFI_STATUS_INVALID_MODEL;
308 }
309
310 bindPlaceholders(*loader);
311 setZeroLengthSequence(maxSeqLength);
312 // Make sure the pool is ready to go.
313 for (auto &obj : onnxInputToPlaceholder_) {
314 tensorPool_.reserve(obj.second->getType(), 10);
315 }
316
317 if (GlowSaveOnnxifiModel) {
318 for (Function *F : module->getFunctions()) {
319 saveOnnxifiModel(F);
320 }
321 }
322
323 return static_cast<HostManagerBackend *>(backendPtr_)
324 ->addNetwork(std::move(module), deferedBlobReader, &PPC);
325 }
326
327 namespace {
dumpTraces(TraceContext * traceContext)328 void dumpTraces(TraceContext *traceContext) {
329 CHECK(traceContext);
330 llvm::SmallString<64> path;
331 auto tempFileRes =
332 llvm::sys::fs::createTemporaryFile("glow-trace", "json", path);
333 if (tempFileRes.value() != 0) {
334 LOG(ERROR) << "Failed to create temp file for Glow trace events: "
335 << tempFileRes;
336 } else {
337 traceContext->dump(path);
338 }
339 }
340
341 } // namespace
342
/// Asynchronously run this graph with \p ctx. Signals \p outputEvent with the
/// final onnxStatus when the inference completes, and (if tracing is active)
/// fills \p traceEvents with the run's trace data. Always returns
/// ONNXIFI_STATUS_SUCCESS immediately; real success/failure is delivered
/// through outputEvent.
onnxStatus HostManagerGraph::run(std::unique_ptr<ExecutionContext> ctx,
                                 EventPtr outputEvent,
                                 onnxTraceEventList *traceEvents) {
  // Capture the caller's thread id and start timestamp now so the async
  // "glow e2e" trace span can be attributed to the calling thread later.
  auto threadId = threads::getThreadId();
  auto startTime = TraceEvent::now();

  // Attach the app-level request id (when present) as a trace attribute.
  auto *data = ::glow::runtime::RequestData::get();
  std::map<std::string, std::string> attributes;
  if (data) {
    attributes["app level request id"] =
        llvm::formatv("{0}", data->appLevelRequestId);
  }

  // The callback below runs on a HostManager thread once inference finishes;
  // it owns the captured state (attributes moved in, `this` for trace merge).
  backendPtr_->runNetwork(
      this, std::move(ctx),
      [outputEvent, traceEvents, threadId, startTime,
       attributes = std::move(attributes),
       this](runtime::RunIdentifierTy runId, Error err,
             std::unique_ptr<ExecutionContext> ctx) mutable {
        TRACE_EVENT_SCOPE(ctx->getTraceContext(), TraceLevel::RUNTIME,
                          "Onnxifi::callback");
        // If an Error occurred then log it in ERR_TO_BOOL and signal the output
        // event.
        if (ERR_TO_BOOL(std::move(err))) {
          outputEvent->signal(ONNXIFI_STATUS_INTERNAL_ERROR);
          return;
        }

        // End the current trace event before we convert TraceEvents to the ONNX
        // format.
        TRACE_EVENT_SCOPE_END();

        auto *traceContext = ctx->getTraceContext();
        if (traceContext) {
          // We want to log the async start event with the original caller's
          // threadId. This way, chrome UI will put the async event next to the
          // caller thread.
          traceContext->logTraceEvent("glow e2e", TraceLevel::RUNTIME,
                                      TraceEvent::BeginType, startTime,
                                      attributes, threadId, runId);
          traceContext->logTraceEvent("glow e2e", TraceLevel::RUNTIME,
                                      TraceEvent::EndType, TraceEvent::now(),
                                      attributes, threadId, runId);
          setTraceEvents(traceEvents, traceContext);
        }

        // Signal to caller that the inference is completed.
        outputEvent->signal(ONNXIFI_STATUS_SUCCESS);

        if (traceContext && GlowDumpDebugTraces) {
          // Dumping traces to a file can take a while. So avoid tracesMutex_
          // while we call dumpTraces.
          std::unique_ptr<TraceContext> toDump;
          {
            // Under the lock: merge this run's events into the shared context
            // and, every GlowNumDebugTracesPerDump runs, steal the merged
            // context so it can be dumped outside the lock.
            std::unique_lock<std::mutex> lock(tracesMutex_);
            if (!mergedTraceContext_) {
              mergedTraceContext_ =
                  glow::make_unique<TraceContext>(TraceLevel::STANDARD);
            }
            mergedTraceContext_->merge(traceContext);

            if (++numTracesToDump_ >= GlowNumDebugTracesPerDump) {
              numTracesToDump_ = 0;
              toDump.reset(mergedTraceContext_.release());
            }
          }

          if (toDump) {
            dumpTraces(toDump.get());
          }
        }
      });

  return ONNXIFI_STATUS_SUCCESS;
}
418
~HostManagerGraph()419 HostManagerGraph::~HostManagerGraph() {
420 // Remove network from the Backend
421 backendPtr_->removeNetwork(this);
422
423 if (GlowDumpDebugTraces) {
424 std::unique_lock<std::mutex> lock(tracesMutex_);
425 if (mergedTraceContext_ && numTracesToDump_ > 0) {
426 dumpTraces(mergedTraceContext_.get());
427 }
428 }
429 }
430
makeUniqueGraphId()431 size_t HostManagerGraph::makeUniqueGraphId() {
432 static std::atomic<size_t> nextId{0};
433 return nextId++;
434 }
435
436 } // namespace onnxifi
437 } // namespace glow
438