//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
      return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic lowers to an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_atomic_load_inc_32:
    case Intrinsic::nvvm_atomic_load_dec_32:

    case Intrinsic::nvvm_atomic_add_gen_f_cta:
    case Intrinsic::nvvm_atomic_add_gen_f_sys:
    case Intrinsic::nvvm_atomic_add_gen_i_cta:
    case Intrinsic::nvvm_atomic_add_gen_i_sys:
    case Intrinsic::nvvm_atomic_and_gen_i_cta:
    case Intrinsic::nvvm_atomic_and_gen_i_sys:
    case Intrinsic::nvvm_atomic_cas_gen_i_cta:
    case Intrinsic::nvvm_atomic_cas_gen_i_sys:
    case Intrinsic::nvvm_atomic_dec_gen_i_cta:
    case Intrinsic::nvvm_atomic_dec_gen_i_sys:
    case Intrinsic::nvvm_atomic_inc_gen_i_cta:
    case Intrinsic::nvvm_atomic_inc_gen_i_sys:
    case Intrinsic::nvvm_atomic_max_gen_i_cta:
    case Intrinsic::nvvm_atomic_max_gen_i_sys:
    case Intrinsic::nvvm_atomic_min_gen_i_cta:
    case Intrinsic::nvvm_atomic_min_gen_i_sys:
    case Intrinsic::nvvm_atomic_or_gen_i_cta:
    case Intrinsic::nvvm_atomic_or_gen_i_sys:
    case Intrinsic::nvvm_atomic_exch_gen_i_cta:
    case Intrinsic::nvvm_atomic_exch_gen_i_sys:
    case Intrinsic::nvvm_atomic_xor_gen_i_cta:
    case Intrinsic::nvvm_atomic_xor_gen_i_sys:
      return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
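  // Kernel (__global__) arguments, by contrast, are uniform: every thread of
  // the launch receives the same parameter values.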
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. They are executed sequentially
    // across all threads in a warp, so an earlier-executed thread may see
    // different memory inputs than a later-executed thread. For example,
    // suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
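  //
  // For example (see the switch below), @llvm.nvvm.fmax.f may be replaced by
  // the target-generic @llvm.maxnum.f32 only when ftz is off,
  // @llvm.nvvm.fmax.ftz.f only when ftz is on, and @llvm.nvvm.fmax.d under
  // either setting.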
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    Optional<Intrinsic::ID> IID;
    Optional<Instruction::CastOps> CastOp;
    Optional<Instruction::BinaryOps> BinaryOp;
    Optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
        : IID(IID), FtzRequirement(FtzReq) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInst with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case.  For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the rz
    // (round to zero) versions of the nvvm conversion intrinsics, even though
    // almost everything else here uses the rn (round to nearest even) nvvm ops.
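    //
    // For example, a call to @llvm.nvvm.f2i.rz is rewritten as a plain
    // "fptosi float %x to i32", which likewise rounds toward zero.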
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

      // We do not currently simplify intrinsics that give an approximate
      // answer. These include:
      //
      //   - nvvm_cos_approx_{f,ftz_f}
      //   - nvvm_ex2_approx_{d,f,ftz_f}
      //   - nvvm_lg2_approx_{d,f,ftz_f}
      //   - nvvm_sin_approx_{f,ftz_f}
      //   - nvvm_sqrt_approx_{f,ftz_f}
      //   - nvvm_rsqrt_approx_{d,f,ftz_f}
      //   - nvvm_div_approx_{ftz_d,ftz_f,f}
      //   - nvvm_rcp_approx_ftz_d
      //
      // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
      // means that fastmath is enabled in the intrinsic.  Unfortunately only
      // binary operators (currently) have a fastmath bit in SelectionDAG, so
      // this information gets lost and we can't select on it.
      //
      // TODO: div and rcp are lowered to a binary op, so in theory we could
      // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirement is not satisfied by the function's ftz state, we
  // can bail out now.  (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any function attributes, as
  // FtzRequirementTy will be FTZ_Any.)
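  //
  // For example, a function carrying "denormal-fp-math-f32"="preserve-sign"
  // (or "preserve-sign,preserve-sign") is treated as ftz-enabled here, while
  // "ieee" or an absent attribute leaves ftz off.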
  if (Action.FtzRequirement != FTZ_Any) {
    StringRef Attr = II->getFunction()
                         ->getFnAttribute("denormal-fp-math-f32")
                         .getValueAsString();
    DenormalMode Mode = parseDenormalFPAttribute(Attr);
    bool FtzEnabled = Mode.Output != DenormalMode::IEEE;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->arg_operands());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
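    // E.g., a call to @llvm.nvvm.rcp.rn.f(%x) becomes the IR equivalent of
    // 1.0 / %x (an fdiv with a constant 1.0 numerator).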
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

Optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return None;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
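    // (For example, a 64-bit add is typically emitted as a pair of 32-bit
    // adds chained through a carry.)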
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, SE, UP);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold.  This partially unrolls small loops, which the PTX-to-SASS
  // compiler often unrolls anyway, so unrolling earlier in LLVM can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}