//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVM operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"

using namespace mlir;

namespace {

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding NVVM op.
  ///
  /// Converts the `width` argument into an activeMask (a bitmask which
  /// specifies which threads participate in the shuffle) and a maskAndClamp
  /// (specifying the highest lane which participates in the shuffle).
  ///
  ///     %one = llvm.constant(1 : i32) : i32
  ///     %shl = llvm.shl %one, %width : i32
  ///     %active_mask = llvm.sub %shl, %one : i32
  ///     %mask_and_clamp = llvm.sub %width, %one : i32
  ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
  ///         %mask_and_clamp : !llvm<"{ float, i1 }">
  ///     %shfl_value = llvm.extractvalue %shfl[0 : index] :
  ///         !llvm<"{ float, i1 }">
  ///     %shfl_pred = llvm.extractvalue %shfl[1 : index] :
  ///         !llvm<"{ float, i1 }">
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    gpu::ShuffleOpAdaptor adaptor(operands);

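    // The NVVM shuffle intrinsic returns a struct of the shuffled value and an
    // i1 predicate that is true if the source lane was in range.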
    auto valueTy = adaptor.value().getType();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    auto predTy = IntegerType::get(rewriter.getContext(), 1);
    auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
                                                     {valueTy, predTy});

    Value one = rewriter.create<LLVM::ConstantOp>(
        loc, int32Type, rewriter.getI32IntegerAttr(1));
    // Bit mask of active lanes: `(1 << activeWidth) - 1`.
    Value activeMask = rewriter.create<LLVM::SubOp>(
        loc, int32Type,
        rewriter.create<LLVM::ShlOp>(loc, int32Type, one, adaptor.width()),
        one);
    // Clamp lane: `activeWidth - 1`
    Value maskAndClamp =
        rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);

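    // Setting the return_value_and_is_valid unit attribute requests the
    // {value, predicate} result form of the shuffle intrinsic.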
    auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
    Value shfl = rewriter.create<NVVM::ShflBflyOp>(
        loc, resultTy, activeMask, adaptor.value(), adaptor.offset(),
        maskAndClamp, returnValueAndIsValidAttr);
    Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
        loc, valueTy, shfl, rewriter.getIndexArrayAttr(0));
    Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
        loc, predTy, shfl, rewriter.getIndexArrayAttr(1));

    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"

/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
/// This pass only handles device code and is not meant to be run on GPU host
/// code.
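///
/// For example, `%tid = gpu.thread_id x` is rewritten (roughly) to
/// `nvvm.read.ptx.sreg.tid.x`, followed by a sign-extension or truncation of
/// the i32 result to the configured index bitwidth.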
struct LowerGpuOpsToNVVMOpsPass
    : public ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  LowerGpuOpsToNVVMOpsPass() = default;
  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
    this->indexBitwidth = indexBitwidth;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        m.getContext(),
        DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.emitCWrappers = true;
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    // MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory
    // space 5 for private memory attributions, but NVVM represents private
    // memory allocations as local `alloca`s in the default address space. This
    // converter drops the private memory space to support the use case above.
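    // For example, under this rule `memref<4xf32, 5>` is converted as if it
    // were `memref<4xf32>` in the default memory space.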
    LLVMTypeConverter converter(m.getContext(), options);
    converter.addConversion([&](MemRefType type) -> Optional<Type> {
      if (type.getMemorySpaceAsInt() !=
          gpu::GPUDialect::getPrivateAddressSpace())
        return llvm::None;
      return converter.convertType(MemRefType::Builder(type).setMemorySpace(0));
    });

    // Lowering for MMAMatrixType.
    converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
      // The number of items in structToReturn depends on the dataType and the
      // MMA operand that this operation is associated with.
      llvm::DenseMap<StringRef, int64_t> numElemsPerThreadF16,
          numElemsPerThreadF32;
      numElemsPerThreadF16["AOp"] = 8;
      numElemsPerThreadF16["BOp"] = 8;
      numElemsPerThreadF16["COp"] = 4;
      numElemsPerThreadF32["AOp"] = 8;
      numElemsPerThreadF32["BOp"] = 8;
      numElemsPerThreadF32["COp"] = 8;
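      // For example, an f16 `COp` fragment holds 4 elements per thread, so it
      // converts to a literal struct of four `vector<2xf16>` values.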
      Type structToReturn;
      if (type.getElementType().isF16()) {
        // Number of f16 elements that fit in 32 bits.
        unsigned vecSize = 2;
        Type vec = VectorType::get(vecSize, FloatType::getF16(&getContext()));
        unsigned size = numElemsPerThreadF16[type.getOperand()];
        SmallVector<Type> elements(size, vec);
        structToReturn =
            LLVM::LLVMStructType::getLiteral(&getContext(), elements);
      } else if (type.getElementType().isF32()) {
        unsigned size = numElemsPerThreadF32[type.getOperand()];
        SmallVector<Type> elements(size, FloatType::getF32(&getContext()));
        structToReturn =
            LLVM::LLVMStructType::getLiteral(&getContext(), elements);
      }
      return structToReturn;
    });

    RewritePatternSet patterns(m.getContext());
    RewritePatternSet llvmPatterns(m.getContext());

    // Apply in-dialect lowering first. In-dialect lowering will replace ops
    // which need to be lowered further, which is not supported by a single
    // conversion pass.
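    // For example, `gpu.all_reduce` is expanded here into lower-level GPU
    // dialect ops such as `gpu.shuffle`, which the partial conversion below
    // then lowers to NVVM.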
    populateGpuRewritePatterns(patterns);
    (void)applyPatternsAndFoldGreedily(m, std::move(patterns));

    populateStdToLLVMConversionPatterns(converter, llvmPatterns);
    populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
    LLVMConversionTarget target(getContext());
    configureGpuToNVVMConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
  }
};

} // anonymous namespace

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
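  // These LLVM math intrinsics are not supported when targeting NVVM; marking
  // them illegal ensures the libdevice call lowerings registered in
  // populateGpuToNVVMConversionPatterns are used instead.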
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                               RewritePatternSet &patterns) {
  populateWithGenerated(patterns);
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                       NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
           GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
                                       NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
           GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
                                       NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                       NVVM::GridDimYOp, NVVM::GridDimZOp>,
           GPUShuffleOpLowering, GPUReturnOpLowering>(converter);

  // Explicitly drop memory space when lowering private memory
  // attributions since NVVM models it as `alloca`s in the default
  // memory space and does not support `alloca`s with addrspace(5).
  patterns.add<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      Identifier::get(NVVM::NVVMDialect::getKernelFuncAttrName(),
                      &converter.getContext()));

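  // Lower math ops to calls into the libdevice math library. The first name is
  // the f32 variant and the second the f64 variant; e.g. `math.sqrt` on f32
  // becomes (roughly) a call to `__nv_sqrtf`.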
  patterns.add<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                             "__nv_fabs");
  patterns.add<OpToFuncCallLowering<math::AtanOp>>(converter, "__nv_atanf",
                                                   "__nv_atan");
  patterns.add<OpToFuncCallLowering<math::Atan2Op>>(converter, "__nv_atan2f",
                                                    "__nv_atan2");
  patterns.add<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
                                              "__nv_ceil");
  patterns.add<OpToFuncCallLowering<math::CosOp>>(converter, "__nv_cosf",
                                                  "__nv_cos");
  patterns.add<OpToFuncCallLowering<math::ExpOp>>(converter, "__nv_expf",
                                                  "__nv_exp");
  patterns.add<OpToFuncCallLowering<math::Exp2Op>>(converter, "__nv_exp2f",
                                                   "__nv_exp2");
  patterns.add<OpToFuncCallLowering<math::ExpM1Op>>(converter, "__nv_expm1f",
                                                    "__nv_expm1");
  patterns.add<OpToFuncCallLowering<FloorFOp>>(converter, "__nv_floorf",
                                               "__nv_floor");
  patterns.add<OpToFuncCallLowering<math::LogOp>>(converter, "__nv_logf",
                                                  "__nv_log");
  patterns.add<OpToFuncCallLowering<math::Log1pOp>>(converter, "__nv_log1pf",
                                                    "__nv_log1p");
  patterns.add<OpToFuncCallLowering<math::Log10Op>>(converter, "__nv_log10f",
                                                    "__nv_log10");
  patterns.add<OpToFuncCallLowering<math::Log2Op>>(converter, "__nv_log2f",
                                                   "__nv_log2");
  patterns.add<OpToFuncCallLowering<math::PowFOp>>(converter, "__nv_powf",
                                                   "__nv_pow");
  patterns.add<OpToFuncCallLowering<math::RsqrtOp>>(converter, "__nv_rsqrtf",
                                                    "__nv_rsqrt");
  patterns.add<OpToFuncCallLowering<math::SinOp>>(converter, "__nv_sinf",
                                                  "__nv_sin");
  patterns.add<OpToFuncCallLowering<math::SqrtOp>>(converter, "__nv_sqrtf",
                                                   "__nv_sqrt");
  patterns.add<OpToFuncCallLowering<math::TanhOp>>(converter, "__nv_tanhf",
                                                   "__nv_tanh");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
}