1 /*
2  * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "HostManagerOnnxifi.h"
18 #include "glow/Runtime/DeferredWeightLoader.h"
19 #include "glow/Runtime/RequestData.h"
20 
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/FileSystem.h"
23 
namespace glow {
// Defined elsewhere in Glow; when set, addNetwork attaches a compilation-log
// prefix to the compilation context.
extern bool GlowDumpCompilationLog;
namespace onnxifi {

// Defined elsewhere; when set, initGraph serializes each loaded Function.
extern bool GlowSaveOnnxifiModel;

// Flag storage for the ONNXIFI backend. These are plain globals so they can be
// set either programmatically or via the llvm::cl options registered below
// (which use cl::location to write through to these variables).
int32_t GlowNumDevices = 0; // 0 => use all discovered devices.
int32_t GlowSparseNNPartitioningSchemeNumCards = 1;
int64_t GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard = 0;
int32_t GlowSparseNNPartitioningSchemeNumCoresSLS = 1;
int32_t GlowSparseNNPartitioningSchemeNumCoresOther = 1;
bool GlowDumpDebugTraces = false;
// Number of merged run traces accumulated before flushing one dump file.
int32_t GlowNumDebugTracesPerDump = 100;
bool GlowSaturateHost = false;
// Precision conversion knobs; copied into the PrecisionConfiguration in
// HostManagerBackend::addNetwork.
bool GlowFP16 = false;
bool GlowFP16Placeholders = true;
bool GlowFP16Constants = true;
bool GlowDumpGraph = false;
bool GlowUseDAGOptimizer = false;
std::string GlowDAGOptimizerPlacementTaggingAlgorithm = "None";
std::string GlowDAGOptimizerParallelizationTaggingAlgorithm = "None";
int32_t GlowDAGOptimizerNumParallelChunks = 1;
bool GlowFusedScaleOffsetFP16 = false;
bool GlowForceSLSAccumFP16 = false;
bool GlowClipFP16 = false;
bool GlowClipFP16SkipInputs = true;
// SparseNN partitioning knobs; copied into cctx.optimizationOpts in
// HostManagerBackend::addNetwork when the scheme is enabled.
bool GlowUseSparseNNPartitioningScheme = false;
bool GlowSparseNNPartitioningAddSLSConcats = false;
bool GlowSparseNNPartitioningBalancePerfModel = false;
bool GlowSparseNNPartitioningPairLNWithSLS = false;
// Host-wide scheduling limits; copied into runtime::HostConfig.
size_t GlowMaxActiveRequests = 48;
size_t GlowMaxActiveRequestsPerInstance = 48;
size_t GlowMaxQueueSize = 100;
size_t GlowExecutorThreads = 10;
bool GlowSaveOnnxifiDAG = false;
bool GlowDelayAndRecordConstantModification = false;
60 
// Command-line registrations for the globals above. Each opt uses
// llvm::cl::location so the flag and the global share a single storage
// location; parsing a flag writes directly into the corresponding variable.
static llvm::cl::opt<int32_t, true>
    GlowNumDevicesOpt("glow-num-devices",
                      llvm::cl::desc("Number of devices for Glow backend"),
                      llvm::cl::location(GlowNumDevices));

static llvm::cl::opt<bool, true>
    GlowDumpDebugTracesOpt("glow-dump-debug-traces",
                           llvm::cl::desc("Dump a trace of each run to /tmp"),
                           llvm::cl::location(GlowDumpDebugTraces));

static llvm::cl::opt<bool, true> GlowSaturateHostOpt(
    "glow-saturate-host",
    llvm::cl::desc("Try to use all available devices on the host"),
    llvm::cl::location(GlowSaturateHost));

static llvm::cl::opt<int32_t, true> GlowSparseNNPartitioningSchemeNumCardsOpt(
    "glow_snn_partitioning_num_cards",
    llvm::cl::desc("Number of cards for SparseNNPartitioningScheme"),
    llvm::cl::location(GlowSparseNNPartitioningSchemeNumCards));

static llvm::cl::opt<int64_t, true>
    GlowSparseNNPartitioningSchemeSLSTableKBytesPerCardOpt(
        "glow_snn_partitioning_kbytes_per_card",
        llvm::cl::desc("SLS KBytes per card for SparseNNPartitioningScheme"),
        llvm::cl::location(
            GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard));

static llvm::cl::opt<int32_t, true>
    GlowSparseNNPartitioningSchemeNumCoresSLSOpt(
        "glow_snn_partitioning_num_cores_sls",
        llvm::cl::desc(
            "Number of cores for SLS for SparseNNPartitioningScheme"),
        llvm::cl::location(GlowSparseNNPartitioningSchemeNumCoresSLS));

static llvm::cl::opt<int32_t, true>
    GlowSparseNNPartitioningSchemeNumCoresOtherOpt(
        "glow_snn_partitioning_num_cores_other",
        llvm::cl::desc(
            "Number of cores for other for SparseNNPartitioningScheme"),
        llvm::cl::location(GlowSparseNNPartitioningSchemeNumCoresOther));

static llvm::cl::opt<bool, true> GlowUseSparseNNPartitioningSchemeOpt(
    "glow_use_sparsenn_partitioning_scheme",
    llvm::cl::desc("Whether to use SparseNNPartitioningScheme"),
    llvm::cl::location(GlowUseSparseNNPartitioningScheme));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningAddSLSConcatsOpt(
    "glow_sparsenn_partitioning_add_sls_concats",
    llvm::cl::desc("Add extra concats inside of SLS partitions for more "
                   "efficient inter-partitition transfers"),
    llvm::cl::location(GlowSparseNNPartitioningAddSLSConcats));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningBalancePerfModelOpt(
    "glow_sparsenn_partitioning_balance_perf_model",
    llvm::cl::desc("Balance SLS tables across cards using a perf model"),
    llvm::cl::location(GlowSparseNNPartitioningBalancePerfModel));

static llvm::cl::opt<bool, true> GlowSparseNNPartitioningPairLNWithSLSOpt(
    "glow_sparsenn_partitioning_pair_ln_with_sls",
    llvm::cl::desc("Place layer normalization nodes immediately following SLS "
                   "into SLS partition"),
    llvm::cl::location(GlowSparseNNPartitioningPairLNWithSLS));
123 
124 std::unique_ptr<runtime::HostManager>
createHostManager(llvm::StringRef backendName)125 HostManagerBackend::createHostManager(llvm::StringRef backendName) {
126   std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
127   // If GlowNumDevices is set specify that many devices, otherwise use all
128   // discovered devices.
129   if (GlowNumDevices) {
130     for (int i = 0; i < GlowNumDevices; i++) {
131       auto config = glow::make_unique<runtime::DeviceConfig>(backendName);
132       config->deviceID = i;
133       configs.push_back(std::move(config));
134     }
135   } else {
136     configs = runtime::DeviceManager::generateDeviceConfigs(backendName);
137   }
138 
139   runtime::HostConfig hostConfig;
140   hostConfig.maxActiveRequests = GlowMaxActiveRequests;
141   hostConfig.maxQueueSize = GlowMaxQueueSize;
142   hostConfig.executorThreads = GlowExecutorThreads;
143 
144   return glow::make_unique<runtime::HostManager>(std::move(configs),
145                                                  hostConfig);
146 }
147 
runNetwork(const Graph * graph,std::unique_ptr<ExecutionContext> context,runtime::ResultCBTy callback,uint64_t priority)148 void HostManagerBackend::runNetwork(const Graph *graph,
149                                     std::unique_ptr<ExecutionContext> context,
150                                     runtime::ResultCBTy callback,
151                                     uint64_t priority) {
152   DCHECK(callback != nullptr);
153 
154   auto hostManagerGraph = static_cast<const HostManagerGraph *>(graph);
155   hostManager_->runNetwork(hostManagerGraph->getName(), std::move(context),
156                            std::move(callback), priority);
157 }
158 
/// Compile and register all Functions in \p module with the HostManager.
/// Translates the Glow* command-line globals into a CompilationContext,
/// optionally wiring in a deferred-weight loader when \p deferredBlobReader is
/// non-null. \p PPC carries prepartitioning metadata parsed from the model.
/// Returns ONNXIFI_STATUS_SUCCESS, or ONNXIFI_STATUS_INTERNAL_ERROR if loader
/// setup or HostManager::addNetwork fails.
onnxStatus HostManagerBackend::addNetwork(std::unique_ptr<Module> module,
                                          void *deferredBlobReader,
                                          runtime::PrePartitionedConfig *PPC) {
  CompilationContext cctx;
  PrecisionConfiguration &precConfig = cctx.precisionConfig;
  cctx.prepartitionedConfig = PPC;
  cctx.maxActiveRequestsPerInstance = GlowMaxActiveRequestsPerInstance;

  if (deferredBlobReader) {
    // Initialize loader and set field in cctx.
    auto loader = runtime::DeferredLoader()->getLoader();
    if (!loader) {
      LOG(INFO) << "Blob reader provided but no loader registered!";
      return ONNXIFI_STATUS_INTERNAL_ERROR;
    }

    // Generate a map of type info for all static placeholders, so the loader
    // knows what to convert deferred weights into.
    std::map<std::string, Type> staticPlaceholderTypes;
    for (auto PH : module->getPlaceholders()) {
      if (PH->isStatic()) {
        staticPlaceholderTypes[std::string(PH->getName())] = *PH->getType();
      }
    }
    loader->setTypeInfo(std::move(staticPlaceholderTypes));
    auto err = loader->setSrc(deferredBlobReader);
    if (ERR_TO_BOOL(std::move(err))) {
      return ONNXIFI_STATUS_INTERNAL_ERROR;
    }

    cctx.deferredWeightLoader = loader;
    // Signal that we want to fold convertTo and Quantize into static
    // Placeholders.
    cctx.optimizationOpts.foldStaticPlaceholderConversions = true;
  }

  // Translate precision-related globals into the PrecisionConfiguration.
  // Each flag is only written when enabled so the config's defaults stand
  // otherwise; every enablement is logged for post-hoc debugging.
  if (GlowFP16) {
    precConfig.convertToFP16 = GlowFP16;
    LOG(INFO) << "Conversion to fp16 enabled";
  }
  if (GlowFP16Placeholders) {
    precConfig.convertPlaceholdersToFP16 = GlowFP16Placeholders;
    LOG(INFO) << "Conversion of Placeholders to fp16 enabled";
  }
  if (GlowFP16Constants) {
    precConfig.convertConstantsToFP16 = GlowFP16Constants;
    LOG(INFO) << "Conversion of Constants to fp16 enabled";
  }
  if (GlowFusedScaleOffsetFP16) {
    precConfig.convertFusedToFP16 = GlowFusedScaleOffsetFP16;
    LOG(INFO) << "Conversion of fused scales/offsets to fp16 enabled";
  }
  if (GlowClipFP16) {
    precConfig.clipFP16 = GlowClipFP16;
    LOG(INFO) << "Clipping to fp16 enabled";
  }
  if (GlowClipFP16SkipInputs) {
    precConfig.clipFP16SkipInputs = GlowClipFP16SkipInputs;
    LOG(INFO) << "Skipping clipping for fp16 Node inputs fp16";
  }
  if (GlowForceSLSAccumFP16) {
    precConfig.forceFP16AccumSLS = GlowForceSLSAccumFP16;
    LOG(INFO) << "Forcing all SLS/SLWS ops to use FP16 accumulation enabled";
  }
  if (GlowDumpCompilationLog) {
    cctx.compilationLogPrefix = "glow-onnxifi";
  }
  // SparseNN partitioning parameters are only meaningful when the scheme
  // itself is enabled, so they are copied as a group.
  if (GlowUseSparseNNPartitioningScheme) {
    cctx.optimizationOpts.useSparseNNPartitioningScheme = true;
    cctx.optimizationOpts.sparseNNPartitioningAddSLSConcats =
        GlowSparseNNPartitioningAddSLSConcats;
    cctx.optimizationOpts.sparseNNPartitioningBalancePerfModel =
        GlowSparseNNPartitioningBalancePerfModel;
    cctx.optimizationOpts.sparseNNPartitioningPairLNWithSLS =
        GlowSparseNNPartitioningPairLNWithSLS;
    cctx.optimizationOpts.sparseNNPartitioningSchemeNumCards =
        GlowSparseNNPartitioningSchemeNumCards;
    cctx.optimizationOpts.sparseNNPartitioningSchemeSLSTableKBytesPerCard =
        GlowSparseNNPartitioningSchemeSLSTableKBytesPerCard;
    cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresSLS =
        GlowSparseNNPartitioningSchemeNumCoresSLS;
    cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresOther =
        GlowSparseNNPartitioningSchemeNumCoresOther;
  }
  if (GlowDumpGraph) {
    cctx.dumpFinalGraph = true;
  }
  if (GlowUseDAGOptimizer) {
    LOG(INFO) << "Will call the DAG optimizer.";
    cctx.callDAGOptimizer = true;
    cctx.optimizationOpts.DAGOptimizerPlacementTaggingAlgorithm =
        GlowDAGOptimizerPlacementTaggingAlgorithm;
    cctx.optimizationOpts.DAGOptimizerParallelizationTaggingAlgorithm =
        GlowDAGOptimizerParallelizationTaggingAlgorithm;
    cctx.optimizationOpts.DAGOptimizerNumParallelChunks =
        GlowDAGOptimizerNumParallelChunks;
  }
  if (GlowSaveOnnxifiDAG) {
    LOG(INFO) << "Serializing DAG after optimization and partitioning.";
    cctx.serializeCompiledDAG = true;
  }
  if (GlowDelayAndRecordConstantModification) {
    LOG(INFO) << "Delaying constant modification until after optimizations, "
                 "including recording constant folding for DAG serialization.";
    cctx.optimizationOpts.delayAndRecordConstantModification = true;
  }
  cctx.saturateHost = GlowSaturateHost;

  auto err = hostManager_->addNetwork(std::move(module), cctx);

  if (ERR_TO_BOOL(std::move(err))) {
    return ONNXIFI_STATUS_INTERNAL_ERROR;
  }

  return ONNXIFI_STATUS_SUCCESS;
}
274 
removeNetwork(const Graph * graph)275 onnxStatus HostManagerBackend::removeNetwork(const Graph *graph) {
276   auto hostManagerGraph = static_cast<const HostManagerGraph *>(graph);
277   auto error = hostManager_->removeNetwork(hostManagerGraph->getName());
278 
279   if (ERR_TO_BOOL(std::move(error))) {
280     return ONNXIFI_STATUS_INTERNAL_ERROR;
281   }
282 
283   return ONNXIFI_STATUS_SUCCESS;
284 }
285 
286 onnxStatus
initGraph(const void * onnxModel,size_t onnxModelSize,uint32_t weightCount,const onnxTensorDescriptorV1 * weightDescriptors,uint32_t maxSeqLength,void * deferedBlobReader)287 HostManagerGraph::initGraph(const void *onnxModel, size_t onnxModelSize,
288                             uint32_t weightCount,
289                             const onnxTensorDescriptorV1 *weightDescriptors,
290                             uint32_t maxSeqLength, void *deferedBlobReader) {
291 
292   netName_ = strFormat("onnxifi_function_%lu", makeUniqueGraphId());
293 
294   std::unique_ptr<Module> module = glow::make_unique<Module>();
295   runtime::PrePartitionedConfig PPC;
296 
297   std::unique_ptr<ONNXIFIModelLoader> loader;
298   auto loaderOrErr = ONNXIFIModelLoader::parse(
299       onnxModel, onnxModelSize, weightCount, weightDescriptors, *module,
300       netName_, &PPC, true /*loadInputsAsPlaceholdersForOnnx*/,
301       backendPtr_->getUseOnnx());
302   if (loaderOrErr) {
303     loader = std::move(*loaderOrErr);
304   } else {
305     LOG(ERROR) << "Error when loading model: "
306                << ERR_TO_STRING(loaderOrErr.takeError());
307     return ONNXIFI_STATUS_INVALID_MODEL;
308   }
309 
310   bindPlaceholders(*loader);
311   setZeroLengthSequence(maxSeqLength);
312   // Make sure the pool is ready to go.
313   for (auto &obj : onnxInputToPlaceholder_) {
314     tensorPool_.reserve(obj.second->getType(), 10);
315   }
316 
317   if (GlowSaveOnnxifiModel) {
318     for (Function *F : module->getFunctions()) {
319       saveOnnxifiModel(F);
320     }
321   }
322 
323   return static_cast<HostManagerBackend *>(backendPtr_)
324       ->addNetwork(std::move(module), deferedBlobReader, &PPC);
325 }
326 
327 namespace {
dumpTraces(TraceContext * traceContext)328 void dumpTraces(TraceContext *traceContext) {
329   CHECK(traceContext);
330   llvm::SmallString<64> path;
331   auto tempFileRes =
332       llvm::sys::fs::createTemporaryFile("glow-trace", "json", path);
333   if (tempFileRes.value() != 0) {
334     LOG(ERROR) << "Failed to create temp file for Glow trace events: "
335                << tempFileRes;
336   } else {
337     traceContext->dump(path);
338   }
339 }
340 
341 } // namespace
342 
/// Kick off an asynchronous run of this graph. \p ctx carries input/output
/// bindings; \p outputEvent is signaled with the final status when the run
/// completes; \p traceEvents, if tracing is active, receives the run's trace
/// events converted to the ONNXIFI format. Returns immediately with
/// ONNXIFI_STATUS_SUCCESS — completion/failure is reported via outputEvent.
/// NOTE(review): the completion lambda captures `this`; it presumably cannot
/// outlive the graph because the destructor removes the network first —
/// confirm against HostManager's shutdown ordering.
onnxStatus HostManagerGraph::run(std::unique_ptr<ExecutionContext> ctx,
                                 EventPtr outputEvent,
                                 onnxTraceEventList *traceEvents) {
  // Capture the caller's thread id and start time now so the e2e trace event
  // can be attributed to the requesting thread, not the callback's.
  auto threadId = threads::getThreadId();
  auto startTime = TraceEvent::now();

  // Propagate the app-level request id (when present) into the trace
  // attributes for cross-system correlation.
  auto *data = ::glow::runtime::RequestData::get();
  std::map<std::string, std::string> attributes;
  if (data) {
    attributes["app level request id"] =
        llvm::formatv("{0}", data->appLevelRequestId);
  }

  backendPtr_->runNetwork(
      this, std::move(ctx),
      [outputEvent, traceEvents, threadId, startTime,
       attributes = std::move(attributes),
       this](runtime::RunIdentifierTy runId, Error err,
             std::unique_ptr<ExecutionContext> ctx) mutable {
        TRACE_EVENT_SCOPE(ctx->getTraceContext(), TraceLevel::RUNTIME,
                          "Onnxifi::callback");
        // If an Error occurred then log it in ERR_TO_BOOL and signal the output
        // event.
        if (ERR_TO_BOOL(std::move(err))) {
          outputEvent->signal(ONNXIFI_STATUS_INTERNAL_ERROR);
          return;
        }

        // End the current trace event before we convert TraceEvents to the ONNX
        // format.
        TRACE_EVENT_SCOPE_END();

        auto *traceContext = ctx->getTraceContext();
        if (traceContext) {
          // We want to log the async start event with the original caller's
          // threadId. This way, chrome UI will put the async event next to the
          // caller thread.
          traceContext->logTraceEvent("glow e2e", TraceLevel::RUNTIME,
                                      TraceEvent::BeginType, startTime,
                                      attributes, threadId, runId);
          traceContext->logTraceEvent("glow e2e", TraceLevel::RUNTIME,
                                      TraceEvent::EndType, TraceEvent::now(),
                                      attributes, threadId, runId);
          setTraceEvents(traceEvents, traceContext);
        }

        // Signal to caller that the inference is completed.
        outputEvent->signal(ONNXIFI_STATUS_SUCCESS);

        if (traceContext && GlowDumpDebugTraces) {
          // Dumping traces to a file can take a while. So avoid tracesMutex_
          // while we call dumpTraces.
          std::unique_ptr<TraceContext> toDump;
          {
            // Under the lock: merge this run's events and, once the batch
            // threshold (GlowNumDebugTracesPerDump) is reached, steal the
            // merged context for dumping outside the critical section.
            std::unique_lock<std::mutex> lock(tracesMutex_);
            if (!mergedTraceContext_) {
              mergedTraceContext_ =
                  glow::make_unique<TraceContext>(TraceLevel::STANDARD);
            }
            mergedTraceContext_->merge(traceContext);

            if (++numTracesToDump_ >= GlowNumDebugTracesPerDump) {
              numTracesToDump_ = 0;
              toDump.reset(mergedTraceContext_.release());
            }
          }

          if (toDump) {
            dumpTraces(toDump.get());
          }
        }
      });

  return ONNXIFI_STATUS_SUCCESS;
}
418 
~HostManagerGraph()419 HostManagerGraph::~HostManagerGraph() {
420   // Remove network from the Backend
421   backendPtr_->removeNetwork(this);
422 
423   if (GlowDumpDebugTraces) {
424     std::unique_lock<std::mutex> lock(tracesMutex_);
425     if (mergedTraceContext_ && numTracesToDump_ > 0) {
426       dumpTraces(mergedTraceContext_.get());
427     }
428   }
429 }
430 
makeUniqueGraphId()431 size_t HostManagerGraph::makeUniqueGraphId() {
432   static std::atomic<size_t> nextId{0};
433   return nextId++;
434 }
435 
436 } // namespace onnxifi
437 } // namespace glow
438