1 //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "NVPTXTargetTransformInfo.h"
10 #include "NVPTXUtilities.h"
11 #include "llvm/Analysis/LoopInfo.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/Analysis/ValueTracking.h"
14 #include "llvm/CodeGen/BasicTTIImpl.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/TargetLowering.h"
17 #include "llvm/IR/IntrinsicsNVPTX.h"
18 #include "llvm/Support/Debug.h"
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "NVPTXtti"
22 
23 // Whether the given intrinsic reads threadIdx.x/y/z.
24 static bool readsThreadIndex(const IntrinsicInst *II) {
25   switch (II->getIntrinsicID()) {
26     default: return false;
27     case Intrinsic::nvvm_read_ptx_sreg_tid_x:
28     case Intrinsic::nvvm_read_ptx_sreg_tid_y:
29     case Intrinsic::nvvm_read_ptx_sreg_tid_z:
30       return true;
31   }
32 }
33 
34 static bool readsLaneId(const IntrinsicInst *II) {
35   return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
36 }
37 
38 // Whether the given intrinsic is an atomic instruction in PTX.
39 static bool isNVVMAtomic(const IntrinsicInst *II) {
40   switch (II->getIntrinsicID()) {
41     default: return false;
42     case Intrinsic::nvvm_atomic_load_inc_32:
43     case Intrinsic::nvvm_atomic_load_dec_32:
44 
45     case Intrinsic::nvvm_atomic_add_gen_f_cta:
46     case Intrinsic::nvvm_atomic_add_gen_f_sys:
47     case Intrinsic::nvvm_atomic_add_gen_i_cta:
48     case Intrinsic::nvvm_atomic_add_gen_i_sys:
49     case Intrinsic::nvvm_atomic_and_gen_i_cta:
50     case Intrinsic::nvvm_atomic_and_gen_i_sys:
51     case Intrinsic::nvvm_atomic_cas_gen_i_cta:
52     case Intrinsic::nvvm_atomic_cas_gen_i_sys:
53     case Intrinsic::nvvm_atomic_dec_gen_i_cta:
54     case Intrinsic::nvvm_atomic_dec_gen_i_sys:
55     case Intrinsic::nvvm_atomic_inc_gen_i_cta:
56     case Intrinsic::nvvm_atomic_inc_gen_i_sys:
57     case Intrinsic::nvvm_atomic_max_gen_i_cta:
58     case Intrinsic::nvvm_atomic_max_gen_i_sys:
59     case Intrinsic::nvvm_atomic_min_gen_i_cta:
60     case Intrinsic::nvvm_atomic_min_gen_i_sys:
61     case Intrinsic::nvvm_atomic_or_gen_i_cta:
62     case Intrinsic::nvvm_atomic_or_gen_i_sys:
63     case Intrinsic::nvvm_atomic_exch_gen_i_cta:
64     case Intrinsic::nvvm_atomic_exch_gen_i_sys:
65     case Intrinsic::nvvm_atomic_xor_gen_i_cta:
66     case Intrinsic::nvvm_atomic_xor_gen_i_sys:
67       return true;
68   }
69 }
70 
71 bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
72   // Without inter-procedural analysis, we conservatively assume that arguments
73   // to __device__ functions are divergent.
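  // Arguments to kernel (__global__) functions, by contrast, are uniform,
  // since every thread in the launch receives the same parameter values.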
74   if (const Argument *Arg = dyn_cast<Argument>(V))
75     return !isKernelFunction(*Arg->getParent());
76 
77   if (const Instruction *I = dyn_cast<Instruction>(V)) {
78     // Without pointer analysis, we conservatively assume values loaded from
79     // generic or local address space are divergent.
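    // (Each thread has its own local memory, and a generic pointer may point
    // into it, so the loaded value can differ from thread to thread.)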
80     if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
81       unsigned AS = LI->getPointerAddressSpace();
82       return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
83     }
    // Atomic instructions may cause divergence. Atomics are executed
    // sequentially across all threads in a warp, so a thread that executes
    // earlier may see different memory contents than one that executes later.
    // For example, suppose *a = 0 initially.
88     //
89     //   atom.global.add.s32 d, [a], 1
90     //
91     // returns 0 for the first thread that enters the critical region, and 1 for
92     // the second thread.
93     if (I->isAtomic())
94       return true;
95     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx or the lane ID are obviously
      // divergent.
97       if (readsThreadIndex(II) || readsLaneId(II))
98         return true;
99       // Handle the NVPTX atomic intrinsics that cannot be represented as an
100       // atomic IR instruction.
101       if (isNVVMAtomic(II))
102         return true;
103     }
104     // Conservatively consider the return value of function calls as divergent.
105     // We could analyze callees with bodies more precisely using
106     // inter-procedural analysis.
107     if (isa<CallInst>(I))
108       return true;
109   }
110 
111   return false;
112 }
113 
114 // Convert NVVM intrinsics to target-generic LLVM code where possible.
115 static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
116   // Each NVVM intrinsic we can simplify can be replaced with one of:
117   //
118   //  * an LLVM intrinsic,
119   //  * an LLVM cast operation,
120   //  * an LLVM binary operation, or
121   //  * ad-hoc LLVM IR for the particular operation.
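  //
  // For example, @llvm.nvvm.ceil.d can be replaced with the target-generic
  // @llvm.ceil intrinsic overloaded on double.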
122 
123   // Some transformations are only valid when the module's
124   // flush-denormals-to-zero (ftz) setting is true/false, whereas other
125   // transformations are valid regardless of the module's ftz setting.
126   enum FtzRequirementTy {
127     FTZ_Any,       // Any ftz setting is ok.
128     FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
129     FTZ_MustBeOff, // Transformation is valid only if ftz is off.
130   };
131   // Classes of NVVM intrinsics that can't be replaced one-to-one with a
132   // target-generic intrinsic, cast op, or binary op but that we can nonetheless
133   // simplify.
134   enum SpecialCase {
135     SPC_Reciprocal,
136   };
137 
138   // SimplifyAction is a poor-man's variant (plus an additional flag) that
139   // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
140   struct SimplifyAction {
141     // Invariant: At most one of these Optionals has a value.
142     Optional<Intrinsic::ID> IID;
143     Optional<Instruction::CastOps> CastOp;
144     Optional<Instruction::BinaryOps> BinaryOp;
145     Optional<SpecialCase> Special;
146 
147     FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type: half operations consult "denormal-fp-math", while float
    // operations consult "denormal-fp-math-f32".
150     bool IsHalfTy = false;
151 
152     SimplifyAction() = default;
153 
154     SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
155                    bool IsHalfTy = false)
156         : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}
157 
158     // Cast operations don't have anything to do with FTZ, so we skip that
159     // argument.
160     SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
161 
162     SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
163         : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
164 
165     SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
166         : Special(Special), FtzRequirement(FtzReq) {}
167   };
168 
169   // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInst with target-generic LLVM IR.
171   const SimplifyAction Action = [II]() -> SimplifyAction {
172     switch (II->getIntrinsicID()) {
173     // NVVM intrinsics that map directly to LLVM intrinsics.
174     case Intrinsic::nvvm_ceil_d:
175       return {Intrinsic::ceil, FTZ_Any};
176     case Intrinsic::nvvm_ceil_f:
177       return {Intrinsic::ceil, FTZ_MustBeOff};
178     case Intrinsic::nvvm_ceil_ftz_f:
179       return {Intrinsic::ceil, FTZ_MustBeOn};
180     case Intrinsic::nvvm_fabs_d:
181       return {Intrinsic::fabs, FTZ_Any};
182     case Intrinsic::nvvm_fabs_f:
183       return {Intrinsic::fabs, FTZ_MustBeOff};
184     case Intrinsic::nvvm_fabs_ftz_f:
185       return {Intrinsic::fabs, FTZ_MustBeOn};
186     case Intrinsic::nvvm_floor_d:
187       return {Intrinsic::floor, FTZ_Any};
188     case Intrinsic::nvvm_floor_f:
189       return {Intrinsic::floor, FTZ_MustBeOff};
190     case Intrinsic::nvvm_floor_ftz_f:
191       return {Intrinsic::floor, FTZ_MustBeOn};
192     case Intrinsic::nvvm_fma_rn_d:
193       return {Intrinsic::fma, FTZ_Any};
194     case Intrinsic::nvvm_fma_rn_f:
195       return {Intrinsic::fma, FTZ_MustBeOff};
196     case Intrinsic::nvvm_fma_rn_ftz_f:
197       return {Intrinsic::fma, FTZ_MustBeOn};
198     case Intrinsic::nvvm_fma_rn_f16:
199       return {Intrinsic::fma, FTZ_MustBeOff, true};
200     case Intrinsic::nvvm_fma_rn_ftz_f16:
201       return {Intrinsic::fma, FTZ_MustBeOn, true};
202     case Intrinsic::nvvm_fma_rn_f16x2:
203       return {Intrinsic::fma, FTZ_MustBeOff, true};
204     case Intrinsic::nvvm_fma_rn_ftz_f16x2:
205       return {Intrinsic::fma, FTZ_MustBeOn, true};
206     case Intrinsic::nvvm_fmax_d:
207       return {Intrinsic::maxnum, FTZ_Any};
208     case Intrinsic::nvvm_fmax_f:
209       return {Intrinsic::maxnum, FTZ_MustBeOff};
210     case Intrinsic::nvvm_fmax_ftz_f:
211       return {Intrinsic::maxnum, FTZ_MustBeOn};
212     case Intrinsic::nvvm_fmax_nan_f:
213       return {Intrinsic::maximum, FTZ_MustBeOff};
214     case Intrinsic::nvvm_fmax_ftz_nan_f:
215       return {Intrinsic::maximum, FTZ_MustBeOn};
216     case Intrinsic::nvvm_fmax_f16:
217       return {Intrinsic::maxnum, FTZ_MustBeOff, true};
218     case Intrinsic::nvvm_fmax_ftz_f16:
219       return {Intrinsic::maxnum, FTZ_MustBeOn, true};
220     case Intrinsic::nvvm_fmax_f16x2:
221       return {Intrinsic::maxnum, FTZ_MustBeOff, true};
222     case Intrinsic::nvvm_fmax_ftz_f16x2:
223       return {Intrinsic::maxnum, FTZ_MustBeOn, true};
224     case Intrinsic::nvvm_fmax_nan_f16:
225       return {Intrinsic::maximum, FTZ_MustBeOff, true};
226     case Intrinsic::nvvm_fmax_ftz_nan_f16:
227       return {Intrinsic::maximum, FTZ_MustBeOn, true};
228     case Intrinsic::nvvm_fmax_nan_f16x2:
229       return {Intrinsic::maximum, FTZ_MustBeOff, true};
230     case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
231       return {Intrinsic::maximum, FTZ_MustBeOn, true};
232     case Intrinsic::nvvm_fmin_d:
233       return {Intrinsic::minnum, FTZ_Any};
234     case Intrinsic::nvvm_fmin_f:
235       return {Intrinsic::minnum, FTZ_MustBeOff};
236     case Intrinsic::nvvm_fmin_ftz_f:
237       return {Intrinsic::minnum, FTZ_MustBeOn};
238     case Intrinsic::nvvm_fmin_nan_f:
239       return {Intrinsic::minimum, FTZ_MustBeOff};
240     case Intrinsic::nvvm_fmin_ftz_nan_f:
241       return {Intrinsic::minimum, FTZ_MustBeOn};
242     case Intrinsic::nvvm_fmin_f16:
243       return {Intrinsic::minnum, FTZ_MustBeOff, true};
244     case Intrinsic::nvvm_fmin_ftz_f16:
245       return {Intrinsic::minnum, FTZ_MustBeOn, true};
246     case Intrinsic::nvvm_fmin_f16x2:
247       return {Intrinsic::minnum, FTZ_MustBeOff, true};
248     case Intrinsic::nvvm_fmin_ftz_f16x2:
249       return {Intrinsic::minnum, FTZ_MustBeOn, true};
250     case Intrinsic::nvvm_fmin_nan_f16:
251       return {Intrinsic::minimum, FTZ_MustBeOff, true};
252     case Intrinsic::nvvm_fmin_ftz_nan_f16:
253       return {Intrinsic::minimum, FTZ_MustBeOn, true};
254     case Intrinsic::nvvm_fmin_nan_f16x2:
255       return {Intrinsic::minimum, FTZ_MustBeOff, true};
256     case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
257       return {Intrinsic::minimum, FTZ_MustBeOn, true};
258     case Intrinsic::nvvm_round_d:
259       return {Intrinsic::round, FTZ_Any};
260     case Intrinsic::nvvm_round_f:
261       return {Intrinsic::round, FTZ_MustBeOff};
262     case Intrinsic::nvvm_round_ftz_f:
263       return {Intrinsic::round, FTZ_MustBeOn};
264     case Intrinsic::nvvm_sqrt_rn_d:
265       return {Intrinsic::sqrt, FTZ_Any};
266     case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case.  For most intrinsics, foo_ftz_f is the
268       // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
269       // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
270       // the versions with explicit ftz-ness.
271       return {Intrinsic::sqrt, FTZ_Any};
272     case Intrinsic::nvvm_sqrt_rn_f:
273       return {Intrinsic::sqrt, FTZ_MustBeOff};
274     case Intrinsic::nvvm_sqrt_rn_ftz_f:
275       return {Intrinsic::sqrt, FTZ_MustBeOn};
276     case Intrinsic::nvvm_trunc_d:
277       return {Intrinsic::trunc, FTZ_Any};
278     case Intrinsic::nvvm_trunc_f:
279       return {Intrinsic::trunc, FTZ_MustBeOff};
280     case Intrinsic::nvvm_trunc_ftz_f:
281       return {Intrinsic::trunc, FTZ_MustBeOn};
282 
283     // NVVM intrinsics that map to LLVM cast operations.
284     //
    // Note that LLVM's target-generic conversion operators correspond to the
    // rz (round toward zero) versions of the nvvm conversion intrinsics, even
    // though almost everything else here uses the rn (round to nearest even)
    // nvvm ops.
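    // For example, @llvm.nvvm.f2i.rz(float %x) becomes
    // "fptosi float %x to i32".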
288     case Intrinsic::nvvm_d2i_rz:
289     case Intrinsic::nvvm_f2i_rz:
290     case Intrinsic::nvvm_d2ll_rz:
291     case Intrinsic::nvvm_f2ll_rz:
292       return {Instruction::FPToSI};
293     case Intrinsic::nvvm_d2ui_rz:
294     case Intrinsic::nvvm_f2ui_rz:
295     case Intrinsic::nvvm_d2ull_rz:
296     case Intrinsic::nvvm_f2ull_rz:
297       return {Instruction::FPToUI};
298     case Intrinsic::nvvm_i2d_rz:
299     case Intrinsic::nvvm_i2f_rz:
300     case Intrinsic::nvvm_ll2d_rz:
301     case Intrinsic::nvvm_ll2f_rz:
302       return {Instruction::SIToFP};
303     case Intrinsic::nvvm_ui2d_rz:
304     case Intrinsic::nvvm_ui2f_rz:
305     case Intrinsic::nvvm_ull2d_rz:
306     case Intrinsic::nvvm_ull2f_rz:
307       return {Instruction::UIToFP};
308 
309     // NVVM intrinsics that map to LLVM binary ops.
310     case Intrinsic::nvvm_add_rn_d:
311       return {Instruction::FAdd, FTZ_Any};
312     case Intrinsic::nvvm_add_rn_f:
313       return {Instruction::FAdd, FTZ_MustBeOff};
314     case Intrinsic::nvvm_add_rn_ftz_f:
315       return {Instruction::FAdd, FTZ_MustBeOn};
316     case Intrinsic::nvvm_mul_rn_d:
317       return {Instruction::FMul, FTZ_Any};
318     case Intrinsic::nvvm_mul_rn_f:
319       return {Instruction::FMul, FTZ_MustBeOff};
320     case Intrinsic::nvvm_mul_rn_ftz_f:
321       return {Instruction::FMul, FTZ_MustBeOn};
322     case Intrinsic::nvvm_div_rn_d:
323       return {Instruction::FDiv, FTZ_Any};
324     case Intrinsic::nvvm_div_rn_f:
325       return {Instruction::FDiv, FTZ_MustBeOff};
326     case Intrinsic::nvvm_div_rn_ftz_f:
327       return {Instruction::FDiv, FTZ_MustBeOn};
328 
    // The remaining cases are NVVM intrinsics that map to LLVM idioms but
    // need special handling.
331     //
332     // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
333     // as well.
334     case Intrinsic::nvvm_rcp_rn_d:
335       return {SPC_Reciprocal, FTZ_Any};
336     case Intrinsic::nvvm_rcp_rn_f:
337       return {SPC_Reciprocal, FTZ_MustBeOff};
338     case Intrinsic::nvvm_rcp_rn_ftz_f:
339       return {SPC_Reciprocal, FTZ_MustBeOn};
340 
341       // We do not currently simplify intrinsics that give an approximate
342       // answer. These include:
343       //
344       //   - nvvm_cos_approx_{f,ftz_f}
345       //   - nvvm_ex2_approx_{d,f,ftz_f}
346       //   - nvvm_lg2_approx_{d,f,ftz_f}
347       //   - nvvm_sin_approx_{f,ftz_f}
348       //   - nvvm_sqrt_approx_{f,ftz_f}
349       //   - nvvm_rsqrt_approx_{d,f,ftz_f}
350       //   - nvvm_div_approx_{ftz_d,ftz_f,f}
351       //   - nvvm_rcp_approx_ftz_d
352       //
353       // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
354       // means that fastmath is enabled in the intrinsic.  Unfortunately only
355       // binary operators (currently) have a fastmath bit in SelectionDAG, so
356       // this information gets lost and we can't select on it.
357       //
358       // TODO: div and rcp are lowered to a binary op, so these we could in
359       // theory lower them to "fast fdiv".
360 
361     default:
362       return {};
363     }
364   }();
365 
  // If Action.FtzRequirement is not satisfied by the function's ftz state, we
  // can bail out now.  (Notice that when II is not a simplifiable NVVM
  // intrinsic, we don't have to look up any attributes, as FtzRequirement
  // will be FTZ_Any.)
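  // The attribute value is an "output,input" pair; for example
  // "preserve-sign,preserve-sign" flushes denormal outputs to zero (ftz on),
  // while "ieee,ieee" keeps them (ftz off).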
370   if (Action.FtzRequirement != FTZ_Any) {
371     const char *AttrName =
372         Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32";
373     StringRef Attr =
374         II->getFunction()->getFnAttribute(AttrName).getValueAsString();
375     DenormalMode Mode = parseDenormalFPAttribute(Attr);
376     bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
377 
378     if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
379       return nullptr;
380   }
381 
382   // Simplify to target-generic intrinsic.
383   if (Action.IID) {
384     SmallVector<Value *, 4> Args(II->args());
385     // All the target-generic intrinsics currently of interest to us have one
386     // type argument, equal to that of the nvvm intrinsic's argument.
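    // For example, replacing @llvm.nvvm.fma.rn.f16x2 fetches the declaration
    // of @llvm.fma.v2f16.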
387     Type *Tys[] = {II->getArgOperand(0)->getType()};
388     return CallInst::Create(
389         Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
390   }
391 
392   // Simplify to target-generic binary op.
393   if (Action.BinaryOp)
394     return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
395                                   II->getArgOperand(1), II->getName());
396 
397   // Simplify to target-generic cast op.
398   if (Action.CastOp)
399     return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
400                             II->getName());
401 
402   // All that's left are the special cases.
403   if (!Action.Special)
404     return nullptr;
405 
406   switch (*Action.Special) {
407   case SPC_Reciprocal:
    // Simplify reciprocal: rcp.rn(x) computes 1.0 / x rounded to nearest,
    // which is exactly what an ordinary fdiv produces.
409     return BinaryOperator::Create(
410         Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
411         II->getArgOperand(0), II->getName());
412   }
413   llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
414 }
415 
416 Optional<Instruction *>
417 NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
418   if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
419     return I;
420   }
421   return None;
422 }
423 
424 InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
425     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
426     TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
427     TTI::OperandValueProperties Opd1PropInfo,
428     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
429     const Instruction *CxtI) {
430   // Legalize the type.
431   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
432 
433   int ISD = TLI->InstructionOpcodeToISD(Opcode);
434 
435   switch (ISD) {
436   default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
440   case ISD::ADD:
441   case ISD::MUL:
442   case ISD::XOR:
443   case ISD::OR:
444   case ISD::AND:
445     // The machine code (SASS) simulates an i64 with two i32. Therefore, we
446     // estimate that arithmetic operations on i64 are twice as expensive as
447     // those on types that can fit into one machine register.
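    // For example, an i64 add is typically emulated with a 32-bit add that
    // produces a carry followed by a 32-bit add that consumes it.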
448     if (LT.second.SimpleTy == MVT::i64)
449       return 2 * LT.first;
450     // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
454   }
455 }
456 
457 void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
458                                            TTI::UnrollingPreferences &UP,
459                                            OptimizationRemarkEmitter *ORE) {
460   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
461 
  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold.  This partially unrolls small loops, which are often
  // unrolled anyway by the PTX-to-SASS compiler; unrolling them earlier can
  // be beneficial.
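  // Partial unrolling covers loops whose trip count is known at compile time
  // but too large to unroll fully; runtime unrolling additionally covers
  // loops whose trip count is only known at run time.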
466   UP.Partial = UP.Runtime = true;
467   UP.PartialThreshold = UP.Threshold / 4;
468 }
469 
470 void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
471                                          TTI::PeelingPreferences &PP) {
472   BaseT::getPeelingPreferences(L, SE, PP);
473 }
474