/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
#define GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H

#include "glow/Backends/BackendOptions.h"
#include "glow/Graph/PlaceholderBindings.h"
#include "glow/Quantization/Base/Base.h"
#include "glow/Support/Error.h"

#include "llvm/ADT/SmallSet.h"

namespace glow {
namespace runtime {
struct PartitionConfig;
struct PrePartitionedConfig;
class DeferredWeightLoader;
} // namespace runtime

/// Configuration for different precision modes.
struct PrecisionConfiguration {
  /// Enum for what kind of transformation should be done for Quantization.
  enum class QuantizationMode {
    None,     /// Perform no transformations for quantization.
    Quantize, /// Quantize the graph using previously gathered statistics.
    Profile,  /// Add profiling nodes for quantization statistics gathering.
  } quantMode{QuantizationMode::None};

  /// Configuration for Profiling.
  quantization::ProfilingConfiguration profConfig;

  /// Configuration for Quantization.
  quantization::QuantizationConfiguration quantConfig;

  /// Enum for what kind of float16 format should be used.
  enum class Float16Format {
    None,     /// No float16 format should be used.
    FP16,     /// FP16 format for float16 should be used.
    BFloat16, /// BFloat16 format for float16 should be used.
  } float16Format{
      Float16Format::FP16}; /// If convertToFP16, float16 format to be used.

  /// Whether to convert the FloatTy to Float16Ty in the Function.
  bool convertToFP16{false};

  /// Whether to convert UInt8FusedQTy to UInt8FusedFP16QTy in the Function.
  bool convertFusedToFP16{false};

  /// If convertToFP16, whether to convert input Placeholders.
  bool convertPlaceholdersToFP16{false};

  /// If convertToFP16, whether to convert Constants.
  bool convertConstantsToFP16{false};

  /// If convertToFP16, whether to clip out-of-range FP values to the min/max
  /// of fp16.
  bool clipFP16{false};

  /// If clipFP16, whether to skip clipping inputs of Nodes.
  bool clipFP16SkipInputs{false};

  /// Whether to force FP16 accumulation for the SLS family of ops.
  bool forceFP16AccumSLS{true};

  /// Used during Quantization and convertToFP16 to keep the original
  /// precision of specific node kinds (i.e. quantization/FP16 conversion
  /// would be skipped for any node kinds found here). Used during profiling
  /// to prevent nodes from being lowered before instrumenting the graph
  /// (e.g. do not lower group convolutions for profiling; see
  /// `-do-not-lower-nodes-for-profiling` in docs/Quantization.md).
  KindSet precisionModeKindSet;

  /// Whether to use the precisionModeKindSet as a whitelist instead of the
  /// default blacklist. Currently only supported for convertToFP16.
  bool useSetAsWhitelist{false};

  /// Converts a float16 \p format into an ElemKind.
  static ElemKind getElementType(Float16Format format) {
    switch (format) {
    case Float16Format::FP16:
      return ElemKind::Float16Ty;
    case Float16Format::BFloat16:
      return ElemKind::BFloat16Ty;
    default:
      llvm_unreachable("Unknown float16 format");
    }
  }
};

using QuantizationMode = PrecisionConfiguration::QuantizationMode;
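// Illustrative sketch (not part of the API): enabling FP16 conversion in a
// PrecisionConfiguration and mapping the chosen float16 format to an element
// type via getElementType(). The flags shown exist above; the surrounding
// setup and variable names are hypothetical.
//
//   PrecisionConfiguration precConfig;
//   precConfig.convertToFP16 = true;
//   precConfig.convertConstantsToFP16 = true;
//   precConfig.float16Format =
//       PrecisionConfiguration::Float16Format::BFloat16;
//   ElemKind k =
//       PrecisionConfiguration::getElementType(precConfig.float16Format);
//   // k == ElemKind::BFloat16Ty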
/// Options relevant to optimizations during compilation.
struct OptimizationOptions {
  /// Only lower, i.e. skip optimizations and precision transformations. Used
  /// for testing.
  llvm::SmallSet<std::string, 1> onlyLowerFuns;

  /// If true, perform compile-time computation of constant operations.
  bool enableConstantFolding{true};

  /// If true, before any Function optimization, all the Constants will be
  /// temporarily replaced by Placeholders, preventing the Constants from
  /// being modified during the normal optimization pipeline. The original
  /// Constants will be put back in place automatically afterward, and then
  /// Constant Folding will be run.
  bool delayAndRecordConstantModification{false};

  /// If true, this will merge ConvertTo and Quantize nodes into inputs and
  /// outputs of the Function. This means modifying the types of Placeholders
  /// and SaveNodes if they have a corresponding ElemKind conversion
  /// (ConvertTo, Quantize, Dequantize nodes). Note that this must be
  /// accompanied by modifying the Tensors backing Placeholders at runtime.
  bool foldElemKindConversionIntoIO{false};

  /// If true, this will fold ConvertTo and Quantize nodes into only static
  /// placeholders. The conversion of the Tensors will be handled by the
  /// provisioner.
  bool foldStaticPlaceholderConversions{false};

  /// If true, this will direct the partitioner to use the SparseNN
  /// partitioning scheme.
  bool useSparseNNPartitioningScheme{false};

  /// If true, the SparseNN partitioning scheme will add extra concats to the
  /// SLS partition for more efficient inter-partition transfers.
  bool sparseNNPartitioningAddSLSConcats{false};

  /// If true, the SparseNN partitioning scheme will balance SLS tables
  /// across cards using a performance model.
  bool sparseNNPartitioningBalancePerfModel{false};

  /// If true, the SparseNN partitioning scheme will move Layer Normalization
  /// nodes immediately following SLS into SLS partitions.
  bool sparseNNPartitioningPairLNWithSLS{false};

  /// The number of cards over which to split SLS tables when using the
  /// SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCards{1};

  /// The number of kilobytes to allocate per card for SLS tables when using
  /// the SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeSLSTableKBytesPerCard{0};

  /// The number of cores to assign to the SLS partition when using the
  /// SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresSLS{1};

  /// The number of cores to assign to the non-SLS partition when using the
  /// SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresOther{1};

  /// The algorithm used for Placement tagging in the DAG Optimizer.
  std::string DAGOptimizerPlacementTaggingAlgorithm;

  /// The algorithm used for Parallelization tagging in the DAG Optimizer.
  std::string DAGOptimizerParallelizationTaggingAlgorithm;

  /// The number of parallel chunks used in DAG Optimizer parallelization.
  int32_t DAGOptimizerNumParallelChunks;

  /// If true, performs int64-to-int32 type demotion for specific nodes if
  /// the backend supports it.
  bool enableTypeDemotion{true};

  /// If true, optimizations are allowed to change quantization scale/offset.
  bool enableQuantParamChanges{false};
};
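// Illustrative sketch (not part of the API): enabling the SparseNN
// partitioning scheme with a per-card SLS table budget. The field names
// exist above; the concrete values are hypothetical.
//
//   OptimizationOptions optOpts;
//   optOpts.useSparseNNPartitioningScheme = true;
//   optOpts.sparseNNPartitioningSchemeNumCards = 2;
//   optOpts.sparseNNPartitioningSchemeSLSTableKBytesPerCard = 1024;
//   optOpts.sparseNNPartitioningSchemeNumCoresSLS = 4;
//   optOpts.sparseNNPartitioningSchemeNumCoresOther = 2;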
/// Meta information produced during the compilation. Whereas the compile
/// options should be interpreted as input variables for the compilation, the
/// below structure is output information produced by the compilation process.
struct CompilationInfo {
  /// The hash of the graph before the lowering stage.
  llvm::hash_code graphPreLowerHash{0};
};

/// Context for compilation.
struct CompilationContext {
  /// Used during Profiling.
  PlaceholderBindings *bindings{nullptr};

  /// Allows the user to specify user-defined partitioning.
  runtime::PartitionConfig *partitionConfig{nullptr};

  /// Allows a loader to store a pre-partitioned config.
  runtime::PrePartitionedConfig *prepartitionedConfig{nullptr};

  /// If true, the HostManager will try to use all available devices on the
  /// host.
  bool saturateHost{false};

  /// Number of max active requests per instance of this network.
  unsigned maxActiveRequestsPerInstance{48};

  /// Used during Quantization and Profiling.
  LoweredInfoMap *loweredInfoMap{nullptr};

  /// Select whether in Training or Inference mode.
  enum class CompilationMode {
    Train, /// Compile the graph in preparation for training.
    Infer, /// Compile the graph for inference. Notice that this operation
           /// changes the graph in a way that is not reversible.
    NumCompilationModes, /// Used to count the number of CompilationModes.
  } compMode{CompilationMode::Infer};

  /// Options for the Backend to use.
  BackendOptions backendOpts;

  /// Options for the optimizations to use.
  OptimizationOptions optimizationOpts;

  /// Configuration for different precision modes.
  PrecisionConfiguration precisionConfig;

  /// Information produced during compilation.
  CompilationInfo info;

  /// How to annotate the compilation log filename.
  std::string compilationLogPrefix{"glow"};

  /// Pointer to the DeferredWeightLoader object; this is used for large
  /// model support.
  runtime::DeferredWeightLoader *deferredWeightLoader{nullptr};

  /// Whether to print out issues/logging during compilation. Used, for
  /// example, to disable printing issues encountered during ConstantFolding.
  bool verboseCompile{true};

  /// Call dumpDag on each Function passed to the backend for compilation.
  bool dumpFinalGraph{false};

  /// Whether to skip stripping the module.
  bool skipModuleStrip{false};

  /// Enables Peer-to-Peer Tensor optimization.
  bool enableP2P{false};

  /// Enables Device-Resident Tensor optimization.
  bool enableDRT{false};

  /// Number of times a function should be replicated on a device. This is
  /// enabled for single-partition networks. For advanced replication setups,
  /// use user-defined partitioning.
  unsigned replicationCount{1};

  /// Whether to serialize the DAG that has been optimized and partitioned.
  bool serializeCompiledDAG{false};

  /// Whether to call the DAG optimizer after the DAG is created in
  /// HostManager.
  bool callDAGOptimizer{false};

  CompilationContext(PlaceholderBindings *bindings_ = nullptr,
                     LoweredInfoMap *loweredInfoMap_ = nullptr)
      : bindings(bindings_), loweredInfoMap(loweredInfoMap_) {}
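  // Illustrative sketch (not part of the API): setting up a context for
  // profiling-based quantization and checking it with verify() below. The
  // bindings/loweredMap objects are assumed to outlive the context; the
  // variable names are hypothetical.
  //
  //   PlaceholderBindings bindings;
  //   LoweredInfoMap loweredMap;
  //   CompilationContext cctx(&bindings, &loweredMap);
  //   cctx.precisionConfig.quantMode = QuantizationMode::Profile;
  //   EXIT_ON_ERR(cctx.verify());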
  /// \returns an error if the CompilationContext is malformed for whatever
  /// configuration it is set up for, otherwise returns success.
  Error verify() const {
    RETURN_ERR_IF_NOT(!precisionConfig.useSetAsWhitelist ||
                          precisionConfig.convertToFP16,
                      "Can only use the precisionModeKindSet as a whitelist "
                      "in convertToFP16 mode.");

    switch (precisionConfig.quantMode) {
    case QuantizationMode::Profile:
      RETURN_ERR_IF_NOT(bindings,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but bindings was not set.\n");

      RETURN_ERR_IF_NOT(loweredInfoMap,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but loweredInfoMap was not set.\n");

      RETURN_ERR_IF_NOT(!precisionConfig.convertToFP16,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "Converting to FP16 while profiling is unsupported.\n");
      break;

    case QuantizationMode::Quantize:
      RETURN_ERR_IF_NOT(
          loweredInfoMap, ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
          "In Quantization mode, but loweredInfoMap was not set.\n");
      break;

    case QuantizationMode::None:
      break;
    }

    RETURN_ERR_IF_NOT(!(optimizationOpts.foldElemKindConversionIntoIO &&
                        optimizationOpts.delayAndRecordConstantModification),
                      "Cannot currently perform elem kind merging into PHs "
                      "when also preventing constant modification.");

    RETURN_ERR_IF_NOT(!(serializeCompiledDAG &&
                        !optimizationOpts.delayAndRecordConstantModification),
                      "When serializing the compiled DAG, must also enable "
                      "delayAndRecordConstantModification.");

    return Error::success();
  }
};

using CompilationMode = CompilationContext::CompilationMode;

} // namespace glow

#endif // GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H