//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVMIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"

using namespace mlir;

namespace {

34 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
35 using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
36
37 /// Lowers a shuffle to the corresponding NVVM op.
38 ///
39 /// Convert the `width` argument into an activeMask (a bitmask which specifies
40 /// which threads participate in the shuffle) and a maskAndClamp (specifying
41 /// the highest lane which participates in the shuffle).
42 ///
43 /// %one = llvm.constant(1 : i32) : i32
44 /// %shl = llvm.shl %one, %width : i32
45 /// %active_mask = llvm.sub %shl, %one : i32
46 /// %mask_and_clamp = llvm.sub %width, %one : i32
47 /// %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
48 /// %mask_and_clamp : !llvm<"{ float, i1 }">
49 /// %shfl_value = llvm.extractvalue %shfl[0 : index] :
50 /// !llvm<"{ float, i1 }">
51 /// %shfl_pred = llvm.extractvalue %shfl[1 : index] :
52 /// !llvm<"{ float, i1 }">
53 LogicalResult
matchAndRewrite__anon4772837e0111::GPUShuffleOpLowering54 matchAndRewrite(gpu::ShuffleOp op, ArrayRef<Value> operands,
55 ConversionPatternRewriter &rewriter) const override {
56 Location loc = op->getLoc();
57 gpu::ShuffleOpAdaptor adaptor(operands);
58
59 auto valueTy = adaptor.value().getType();
60 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
61 auto predTy = IntegerType::get(rewriter.getContext(), 1);
62 auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
63 {valueTy, predTy});
64
65 Value one = rewriter.create<LLVM::ConstantOp>(
66 loc, int32Type, rewriter.getI32IntegerAttr(1));
67 // Bit mask of active lanes: `(1 << activeWidth) - 1`.
68 Value activeMask = rewriter.create<LLVM::SubOp>(
69 loc, int32Type,
70 rewriter.create<LLVM::ShlOp>(loc, int32Type, one, adaptor.width()),
71 one);
72 // Clamp lane: `activeWidth - 1`
73 Value maskAndClamp =
74 rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);
75
76 auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
77 Value shfl = rewriter.create<NVVM::ShflBflyOp>(
78 loc, resultTy, activeMask, adaptor.value(), adaptor.offset(),
79 maskAndClamp, returnValueAndIsValidAttr);
80 Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
81 loc, valueTy, shfl, rewriter.getIndexArrayAttr(0));
82 Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
83 loc, predTy, shfl, rewriter.getIndexArrayAttr(1));
84
85 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
86 return success();
87 }
88 };

/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"

93 /// A pass that replaces all occurrences of GPU device operations with their
94 /// corresponding NVVM equivalent.
95 ///
96 /// This pass only handles device code and is not meant to be run on GPU host
97 /// code.
98 struct LowerGpuOpsToNVVMOpsPass
99 : public ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
100 LowerGpuOpsToNVVMOpsPass() = default;
LowerGpuOpsToNVVMOpsPass__anon4772837e0111::LowerGpuOpsToNVVMOpsPass101 LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
102 this->indexBitwidth = indexBitwidth;
103 }
104
runOnOperation__anon4772837e0111::LowerGpuOpsToNVVMOpsPass105 void runOnOperation() override {
106 gpu::GPUModuleOp m = getOperation();
107
108 /// Customize the bitwidth used for the device side index computations.
109 LowerToLLVMOptions options = {/*useBarePtrCallConv =*/false,
110 /*emitCWrappers =*/true,
111 /*indexBitwidth =*/indexBitwidth,
112 /*useAlignedAlloc =*/false};
113
114 /// MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory
115 /// space 5 for private memory attributions, but NVVM represents private
116 /// memory allocations as local `alloca`s in the default address space. This
117 /// converter drops the private memory space to support the use case above.
118 LLVMTypeConverter converter(m.getContext(), options);
119 converter.addConversion([&](MemRefType type) -> Optional<Type> {
120 if (type.getMemorySpace() != gpu::GPUDialect::getPrivateAddressSpace())
121 return llvm::None;
122 return converter.convertType(MemRefType::Builder(type).setMemorySpace(0));
123 });
124
125 OwningRewritePatternList patterns, llvmPatterns;
126
127 // Apply in-dialect lowering first. In-dialect lowering will replace ops
128 // which need to be lowered further, which is not supported by a single
129 // conversion pass.
130 populateGpuRewritePatterns(m.getContext(), patterns);
131 applyPatternsAndFoldGreedily(m, std::move(patterns));
132
133 populateStdToLLVMConversionPatterns(converter, llvmPatterns);
134 populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
135 LLVMConversionTarget target(getContext());
136 configureGpuToNVVMConversionLegality(target);
137 if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
138 signalPassFailure();
139 }
140 };

} // anonymous namespace

configureGpuToNVVMConversionLegality(ConversionTarget & target)144 void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
145 target.addIllegalOp<FuncOp>();
146 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
147 target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
148 target.addIllegalDialect<gpu::GPUDialect>();
149 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
150 LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
151 LLVM::PowOp, LLVM::SinOp, LLVM::SqrtOp>();
152
153 // TODO: Remove once we support replacing non-root ops.
154 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
155 }
populateGpuToNVVMConversionPatterns(LLVMTypeConverter & converter,OwningRewritePatternList & patterns)157 void mlir::populateGpuToNVVMConversionPatterns(
158 LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
159 populateWithGenerated(converter.getDialect()->getContext(), patterns);
160 patterns
161 .insert<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
162 NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
163 GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
164 NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
165 GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
166 NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
167 GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
168 NVVM::GridDimYOp, NVVM::GridDimZOp>,
169 GPUShuffleOpLowering, GPUReturnOpLowering,
170 // Explicitly drop memory space when lowering private memory
171 // attributions since NVVM models it as `alloca`s in the default
172 // memory space and does not support `alloca`s with addrspace(5).
173 GPUFuncOpLowering<0>>(converter);
174 patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
175 "__nv_fabs");
176 patterns.insert<OpToFuncCallLowering<AtanOp>>(converter, "__nv_atanf",
177 "__nv_atan");
178 patterns.insert<OpToFuncCallLowering<Atan2Op>>(converter, "__nv_atan2f",
179 "__nv_atan2");
180 patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
181 "__nv_ceil");
182 patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__nv_cosf",
183 "__nv_cos");
184 patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__nv_expf",
185 "__nv_exp");
186 patterns.insert<OpToFuncCallLowering<FloorFOp>>(converter, "__nv_floorf",
187 "__nv_floor");
188 patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__nv_logf",
189 "__nv_log");
190 patterns.insert<OpToFuncCallLowering<Log1pOp>>(converter, "__nv_log1pf",
191 "__nv_log1p");
192 patterns.insert<OpToFuncCallLowering<Log10Op>>(converter, "__nv_log10f",
193 "__nv_log10");
194 patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__nv_log2f",
195 "__nv_log2");
196 patterns.insert<OpToFuncCallLowering<PowFOp>>(converter, "__nv_powf",
197 "__nv_pow");
198 patterns.insert<OpToFuncCallLowering<RsqrtOp>>(converter, "__nv_rsqrtf",
199 "__nv_rsqrt");
200 patterns.insert<OpToFuncCallLowering<SinOp>>(converter, "__nv_sinf",
201 "__nv_sin");
202 patterns.insert<OpToFuncCallLowering<SqrtOp>>(converter, "__nv_sqrtf",
203 "__nv_sqrt");
204 patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__nv_tanhf",
205 "__nv_tanh");
206 }
208 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth)209 mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
210 return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
211 }