1 /**
2  * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
17 #define GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
18 
19 #include "glow/Backends/BackendOptions.h"
20 #include "glow/Graph/PlaceholderBindings.h"
21 #include "glow/Quantization/Base/Base.h"
22 #include "glow/Support/Error.h"
23 
24 namespace glow {
25 namespace runtime {
26 struct PartitionConfig;
27 struct PrePartitionedConfig;
28 class DeferredWeightLoader;
29 } // namespace runtime
30 
31 /// Configuration for different precision modes.
32 struct PrecisionConfiguration {
33   /// Enum for what kind of transformation should be done for Quantization.
34   enum class QuantizationMode {
35     None,     /// Perform no transformations for quantization.
36     Quantize, /// Quantize the graph using previously gathered statistics.
37     Profile,  /// Add profiling nodes for quantization statistics gathering.
38   } quantMode{QuantizationMode::None};
39 
40   /// Configuration for Profiling.
41   quantization::ProfilingConfiguration profConfig;
42 
43   /// Configuration for Quantization.
44   quantization::QuantizationConfiguration quantConfig;
45 
46   /// Enum for what kind of float16 format should be used.
47   enum class Float16Format {
48     None,     /// No float16 format should be used.
49     FP16,     /// FP16 format for float16 should be used.
50     BFloat16, /// FP16 format for float16 should be used.
51   } float16Format{
52       Float16Format::FP16}; /// If convertToFp16, float16 format to be used.
53 
54   /// Whether to convert the FloatTy to Float16Ty in the Function.
55   bool convertToFP16{false};
56 
57   /// Whether to convert UInt8FusedQTy to UInt8FusedFP16QTy in the Function.
58   bool convertFusedToFP16{false};
59 
60   /// If convertToFP16, whether to convert input Placeholders.
61   bool convertPlaceholdersToFP16{false};
62 
63   /// If convertToFP16, whether to convert Constants.
64   bool convertConstantsToFP16{false};
65 
66   /// If convertToFP16, whether to clip out-of-range FP values to the min/max of
67   /// fp16.
68   bool clipFP16{false};
69 
70   /// If clipFP16, whether to skip clipping inputs of Nodes.
71   bool clipFP16SkipInputs{false};
72 
73   /// Whether to force FP16 accumulation for the SLS family of ops.
74   bool forceFP16AccumSLS{true};
75 
76   /// Used during Quantization and convertToFP16 to keep the original precision
77   /// of specific node kinds (i.e. quantization/FP16 conversion would be skipped
78   /// for any node kinds found here). Used during profiling to prevent nodes
79   /// from being lowered before instrumenting the graph (e.g. do not lower group
80   /// convolutions for profiling; see `-do-not-lower-nodes-for-profiling` in
81   /// docs/Quantization.md).
82   KindSet precisionModeKindSet;
83 
84   /// Whether to use the precisionModeKindSet as a whitelist instead of the
85   /// default blacklist. Currently only supported for convertToFP16.
86   bool useSetAsWhitelist{false};
87 
88   /// Converts a float16 \p format into an ElemKind.
getElementTypePrecisionConfiguration89   static ElemKind getElementType(Float16Format format) {
90     switch (format) {
91     case Float16Format::FP16:
92       return ElemKind::Float16Ty;
93     case Float16Format::BFloat16:
94       return ElemKind::BFloat16Ty;
95     default:
96       llvm_unreachable("Unknown float16 format");
97     }
98   }
99 };
100 
/// Shorthand alias so callers can name the quantization mode without spelling
/// out the enclosing PrecisionConfiguration struct.
using QuantizationMode = PrecisionConfiguration::QuantizationMode;
102 
103 /// Options relevant to optimizations during compilation.
104 struct OptimizationOptions {
105   /// Only lower, i.e. skip optimizations and precision transformations. Used
106   /// for testing.
107   llvm::SmallSet<Function *, 1> onlyLowerFuns;
108 
109   /// If true, perform compile-time computation of constant operations.
110   bool enableConstantFolding{true};
111 
112   /// If true, before any Function optimization, all the Constants will be
113   /// temporarily replaced by Placeholders, preventing the Constants from being
114   /// modified during the normal optimization pipeline. The original Constants
115   /// will be put back in place automatically afterward, and then Constant
116   /// Folding will be run.
117   bool delayAndRecordConstantModification{false};
118 
119   /// If true, this will merge ConvertTo and Quantize nodes into inputs and
120   /// outputs of the Function. This means modifying the types of Placeholders
121   /// and SaveNodes if they have a corresponding ElemKind conversion (ConvertTo,
122   /// Quantize, Dequantize nodes). Note that this must be accompanied by
123   /// modifying the Tensors backing Placeholders at runtime.
124   bool foldElemKindConversionIntoIO{false};
125 
126   /// If true this will fold convertTo and Quantize nodes into only static
127   /// placeholders. The conversion of the Tensors will be handled by the
128   /// provisioner.
129   bool foldStaticPlaceholderConversions{false};
130 
131   /// If true, this will direct the partitioner to use SparseNN partitioning
132   /// scheme
133   bool useSparseNNPartitioningScheme{false};
134 
135   /// If true, SparseNN partiitoning scheme will add extra concats to the
136   /// SLS partition for more efficient inter-partition transfers
137   bool sparseNNPartitioningAddSLSConcats{false};
138 
139   /// If true, SparseNN partiitoning scheme will balance SLS tables across
140   /// cards using a performance model
141   bool sparseNNPartitioningBalancePerfModel{false};
142 
143   /// If true, SparseNN partiitoning scheme will move Layer Normalization
144   /// nodes immediately following SLS into SLS partitions
145   bool sparseNNPartitioningPairLNWithSLS{false};
146 
147   /// The number of cards over which to split SLS tables when using SparseNN
148   /// partitioning scheme
149   unsigned int sparseNNPartitioningSchemeNumCards{1};
150 
151   /// The number of bytes to allocate per card for SLS tables when using
152   /// the SparseNN partitioning scheme
153   unsigned int sparseNNPartitioningSchemeSLSTableKBytesPerCard{0};
154 
155   /// The number of cores to assign to SLS partition when using SparseNN
156   /// partitioning scheme
157   unsigned int sparseNNPartitioningSchemeNumCoresSLS{1};
158 
159   /// The number of cores to assign to non-SLS partition when using SparseNN
160   /// partitioning scheme
161   unsigned int sparseNNPartitioningSchemeNumCoresOther{1};
162 
163   /// The algorithm used for Placement tagging in DAG Optimizer
164   std::string DAGOptimizerPlacementTaggingAlgorithm;
165 
166   /// The algorithm used for Parallelization tagging in DAG Optimizer
167   std::string DAGOptimizerParallelizationTaggingAlgorithm;
168 
169   /// The number of parallel chunks used in DAG Optimizer parallelization
170   int32_t DAGOptimizerNumParallelChunks;
171 
172   /// If true does int64 to int32 type demotion if backend supports for specific
173   /// nodes.
174   bool enableTypeDemotion{true};
175 
176   /// If true, optimizations are allowed to change quantization scale/offset.
177   bool enableQuantParamChanges{false};
178 };
179 
180 /// Meta information produced during the compilation. Whereas the compile
181 /// options should be interpreted as input variables for the compilation, the
182 /// below structure is output information produced by the compilation process.
/// Meta information produced during the compilation. Whereas the compile
/// options should be interpreted as input variables for the compilation, the
/// below structure is output information produced by the compilation process.
struct CompilationInfo {
  /// The hash of the graph before the lowering stage; recorded so later stages
  /// can detect whether the pre-lowering graph changed.
  llvm::hash_code graphPreLowerHash{0};
};
187 
188 /// Context for compilation.
189 struct CompilationContext {
190   /// Used during Profiling.
191   PlaceholderBindings *bindings{nullptr};
192 
193   /// Allows the user to specify user defined partitioning.
194   runtime::PartitionConfig *partitionConfig{nullptr};
195 
196   /// Allows a loader to store a pre-partitioned config.
197   runtime::PrePartitionedConfig *prepartitionedConfig{nullptr};
198 
199   /// If true the HostManager will try to use all available devices on the host.
200   bool saturateHost{false};
201 
202   /// Number of max active requests per instance of this network.
203   unsigned maxActiveRequestsPerInstance{48};
204 
205   /// Used during Quantization and Profiling.
206   LoweredInfoMap *loweredInfoMap{nullptr};
207 
208   /// Select whether in Training or Inference mode.
209   enum class CompilationMode {
210     Train, /// Compile the graph in preparation for training.
211     Infer, /// Compile the graph for inference. Notice that this operation
212            /// changes the graph in a way that is not reversible.
213     NumCompilationModes, /// Used to count the number of CompilationModes.
214   } compMode{CompilationMode::Infer};
215 
216   /// Options for the Backend to use.
217   BackendOptions backendOpts;
218 
219   /// Options for the optimizations to use.
220   OptimizationOptions optimizationOpts;
221 
222   /// Configuration for different precision modes.
223   PrecisionConfiguration precisionConfig;
224 
225   /// Information produced during compilation.
226   CompilationInfo info;
227 
228   /// How to annotate the compilation log filename.
229   std::string compilationLogPrefix{"glow"};
230 
231   /// Pointer to deferredWeightLoader object, this is used for large model
232   /// support.
233   runtime::DeferredWeightLoader *deferredWeightLoader{nullptr};
234 
235   /// Whether to print out issues/logging during compilation. Used for example
236   /// to disable printing issues encountered during ConstantFolding.
237   bool verboseCompile{true};
238 
239   /// Call dumpDag on each Function passed to the backend for compilation.
240   bool dumpFinalGraph = false;
241 
242   /// Whether to skip stripping the module.
243   bool skipModuleStrip{false};
244 
245   /// Enables Peer to Peer Tensor optimization.
246   bool enableP2P{false};
247 
248   /// Enables Device Resident Tensor optimization.
249   bool enableDRT{false};
250 
251   /// Number of times a function should be replicated on a device. This is
252   /// enabled for single partition networks. For advanced replication setups use
253   /// user-defined partitioning.
254   unsigned replicationCount{1};
255 
256   /// Whether to serialize the DAG that has been optimized and partitioned.
257   bool serializeCompiledDAG{false};
258 
259   /// Whether to call the DAG optimizer after the DAG is created in HostManager.
260   bool callDAGOptimizer{false};
261 
262   CompilationContext(PlaceholderBindings *bindings_ = nullptr,
263                      LoweredInfoMap *loweredInfoMap_ = nullptr)
bindingsCompilationContext264       : bindings(bindings_), loweredInfoMap(loweredInfoMap_) {}
265 
266   /// \returns an error if the CompilationContext is malformed for whatever
267   /// configuration it is set up for, otherwise returns success.
verifyCompilationContext268   Error verify() const {
269     RETURN_ERR_IF_NOT(!precisionConfig.useSetAsWhitelist ||
270                           precisionConfig.convertToFP16,
271                       "Can only use the precisionModeKindSet as a whitelist in "
272                       "convertToFP16 mode.");
273 
274     switch (precisionConfig.quantMode) {
275     case QuantizationMode::Profile:
276       RETURN_ERR_IF_NOT(bindings,
277                         ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
278                         "In Profiling mode, but bindings was not set.\n");
279 
280       RETURN_ERR_IF_NOT(loweredInfoMap,
281                         ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
282                         "In Profiling mode, but loweredInfoMap was not set.\n");
283 
284       RETURN_ERR_IF_NOT(!precisionConfig.convertToFP16,
285                         ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
286                         "Converting to FP16 while profiling is unsupported.\n");
287       break;
288 
289     case QuantizationMode::Quantize:
290       RETURN_ERR_IF_NOT(
291           loweredInfoMap, ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
292           "In Quantization mode, but loweredInfoMap was not set.\n");
293       break;
294 
295     case QuantizationMode::None:
296       break;
297     }
298 
299     RETURN_ERR_IF_NOT(!(optimizationOpts.foldElemKindConversionIntoIO &&
300                         optimizationOpts.delayAndRecordConstantModification),
301                       "Cannot currently perform elem kind merging into PHs "
302                       "when also preventing constant modification.");
303 
304     RETURN_ERR_IF_NOT(!(serializeCompiledDAG &&
305                         !optimizationOpts.delayAndRecordConstantModification),
306                       "When serializing the compiled DAG, must also enable "
307                       "delayAndRecordConstantModification.");
308 
309     return Error::success();
310   }
311 };
312 
/// Shorthand alias so callers can name the compilation mode without spelling
/// out the enclosing CompilationContext struct.
using CompilationMode = CompilationContext::CompilationMode;
314 
315 }; // namespace glow
316 
317 #endif // GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
318