1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Compiler/CISACodeGen/Simd32Profitability.hpp"
10 #include "Compiler/CodeGenPublic.h"
11 #include "Compiler/IGCPassSupport.h"
12 #include "Compiler/CISACodeGen/Platform.hpp"
13 #include "common/LLVMWarningsPush.hpp"
14 #include <llvmWrapper/IR/DerivedTypes.h>
15 #include <llvmWrapper/Transforms/Utils/LoopUtils.h>
16 #include <llvm/IR/InstIterator.h>
17 #include <llvm/IR/Operator.h>
18 #include <llvmWrapper/IR/DerivedTypes.h>
19 #include "common/LLVMWarningsPop.hpp"
20 #include "GenISAIntrinsics/GenIntrinsics.h"
21 #include "GenISAIntrinsics/GenIntrinsicInst.h"
22 #include "Probe/Assertion.h"
23 
24 using namespace llvm;
25 using namespace IGC;
26 using namespace IGC::IGCMD;
27 
// Register pass to igc-opt
// The four PASS_* macros below are the arguments handed to the
// IGC_INITIALIZE_PASS_BEGIN / IGC_INITIALIZE_PASS_END pair.
#define PASS_FLAG "simd32-profit"
#define PASS_DESCRIPTION "Check SIMD32 Profitability for OpenCL"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// Passes declared as dependencies of this analysis.
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

// Pass identification object; it is passed to the FunctionPass base class by
// the constructor.
char Simd32ProfitabilityAnalysis::ID = 0;

// NOTE(review): not referenced in the visible part of this file; presumably a
// minimum path length used when judging "branchy" code -- confirm at the use
// site.
const unsigned BRANCHY_MINPATH = 8;
43 
Simd32ProfitabilityAnalysis()44 Simd32ProfitabilityAnalysis::Simd32ProfitabilityAnalysis()
45     : FunctionPass(ID), F(nullptr), PDT(nullptr), LI(nullptr),
46     pMdUtils(nullptr), WI(nullptr), m_isSimd32Profitable(true),
47     m_isSimd16Profitable(true) {
48     initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
49 }
50 
51 static std::tuple<Value* /*INIT*/, Value* /*CURR*/, Value* /*STEP*/, Value* /*NEXT*/>
getInductionVariable(Loop * L)52 getInductionVariable(Loop* L) {
53     BasicBlock* H = L->getHeader();
54 
55     BasicBlock* Incoming = 0, *Backedge = 0;
56     pred_iterator PI = pred_begin(H);
57     IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
58     Backedge = *PI++;
59     if (PI == pred_end(H)) // dead loop
60         return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
61     Incoming = *PI++;
62     if (PI != pred_end(H)) // multiple backedges?
63         return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
64 
65     if (L->contains(Incoming)) {
66         if (L->contains(Backedge))
67             return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
68         std::swap(Incoming, Backedge);
69     }
70     else if (!L->contains(Backedge))
71         return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
72 
73     // Loop over all of the PHI nodes, looking for an indvar.
74     for (auto I = H->begin(); isa<PHINode>(I); ++I) {
75         PHINode* PN = cast<PHINode>(I);
76         if (auto Inc = dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge))) {
77             if (Inc->getOpcode() == Instruction::Add && Inc->getOperand(0) == PN) {
78                 return
79                     std::make_tuple(PN->getIncomingValueForBlock(Incoming), PN,
80                         Inc->getOperand(1), Inc);
81             }
82         }
83     }
84 
85     return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
86 }
87 
// Coarse classification of a loop's trip count, used as the result of the
// estimateLoopCount*() helpers.
enum {
    LOOPCOUNT_LIKELY_SMALL = 0,
    LOOPCOUNT_LIKELY_LARGE = 1,
    LOOPCOUNT_UNKNOWN = 2
};
93 
isSignedPredicate(CmpInst::Predicate Pred)94 static bool isSignedPredicate(CmpInst::Predicate Pred) {
95     switch (Pred) {
96     default: break;
97     case CmpInst::ICMP_EQ:
98     case CmpInst::ICMP_NE:
99     case CmpInst::ICMP_SGT:
100     case CmpInst::ICMP_SLT:
101     case CmpInst::ICMP_SGE:
102     case CmpInst::ICMP_SLE:
103         return true;
104     }
105     return false;
106 }
107 
isUnsignedPredicate(CmpInst::Predicate Pred)108 static bool isUnsignedPredicate(CmpInst::Predicate Pred) {
109     switch (Pred) {
110     default: break;
111     case CmpInst::ICMP_EQ:
112     case CmpInst::ICMP_NE:
113     case CmpInst::ICMP_UGT:
114     case CmpInst::ICMP_ULT:
115     case CmpInst::ICMP_UGE:
116     case CmpInst::ICMP_ULE:
117         return true;
118     }
119     return false;
120 }
121 
hasSameSignedness(CmpInst::Predicate LHS,CmpInst::Predicate RHS)122 static bool hasSameSignedness(CmpInst::Predicate LHS, CmpInst::Predicate RHS) {
123     if (isSignedPredicate(LHS) && isSignedPredicate(RHS))
124         return true;
125     if (isUnsignedPredicate(LHS) && isUnsignedPredicate(RHS))
126         return true;
127     return false;
128 }
129 
130 static std::tuple<Value*, Value*, Value*, bool>
isOutOfRangeComparison(Value * Cond)131 isOutOfRangeComparison(Value* Cond) {
132     BinaryOperator* BO = dyn_cast<BinaryOperator>(Cond);
133     if (!BO || BO->getOpcode() != Instruction::Or)
134         return std::make_tuple(nullptr, nullptr, nullptr, false);
135 
136     ICmpInst* LHS = dyn_cast<ICmpInst>(BO->getOperand(0));
137     ICmpInst* RHS = dyn_cast<ICmpInst>(BO->getOperand(1));
138 
139     if (!LHS || !RHS)
140         return std::make_tuple(nullptr, nullptr, nullptr, false);
141 
142     CmpInst::Predicate P0 = LHS->getPredicate();
143     CmpInst::Predicate P1 = RHS->getPredicate();
144 
145     if (!hasSameSignedness(P0, P1))
146         return std::make_tuple(nullptr, nullptr, nullptr, false);
147 
148     // Simplify the checking since they have the same signedness.
149     P0 = ICmpInst::getSignedPredicate(P0);
150     P1 = ICmpInst::getSignedPredicate(P1);
151 
152     if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE)) {
153         std::swap(LHS, RHS);
154         std::swap(P0, P1);
155     }
156     if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE) ||
157         !(P1 == CmpInst::ICMP_SGT || P1 == CmpInst::ICMP_SGE))
158         return std::make_tuple(nullptr, nullptr, nullptr, false);
159 
160     if (LHS->getOperand(0) != RHS->getOperand(0))
161         return std::make_tuple(nullptr, nullptr, nullptr, false);
162 
163     return std::make_tuple(LHS->getOperand(0),
164         LHS->getOperand(1), RHS->getOperand(1),
165         isSignedPredicate(LHS->getPredicate()));
166 }
167 
getLoopCounter(Loop * L,Value * X)168 static Value* getLoopCounter(Loop* L, Value* X) {
169     BasicBlock* H = L->getHeader();
170 
171     BasicBlock* Incoming = 0, *Backedge = 0;
172     pred_iterator PI = pred_begin(H);
173     IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
174     Backedge = *PI++;
175     if (PI == pred_end(H)) // dead loop
176         return nullptr;
177     Incoming = *PI++;
178     if (PI != pred_end(H)) // multiple backedges?
179         return nullptr;
180 
181     if (L->contains(Incoming)) {
182         if (L->contains(Backedge))
183             return nullptr;
184         std::swap(Incoming, Backedge);
185     }
186     else if (!L->contains(Backedge))
187         return nullptr;
188 
189     for (auto I = H->begin(); isa<PHINode>(I); ++I) {
190         PHINode* PN = cast<PHINode>(I);
191         if (X == PN->getIncomingValueForBlock(Backedge))
192             return PN;
193     }
194 
195     return nullptr;
196 }
197 
198 static std::tuple<int, int>
countOperands(Value * V,Value * LHS,Value * RHS)199 countOperands(Value* V, Value* LHS, Value* RHS) {
200     if (V == LHS || V == RHS)
201         return std::make_tuple((V == LHS), (V == RHS));
202 
203     // Count LHS, RHS in an expression like m*L + n*R +/- C, where C is
204     // constant.
205     BinaryOperator* BO = dyn_cast<BinaryOperator>(V);
206     if (!BO ||
207         (BO->getOpcode() != Instruction::Add &&
208             BO->getOpcode() != Instruction::Sub &&
209             BO->getOpcode() != Instruction::Shl &&
210             BO->getOpcode() != Instruction::Xor))
211         return std::make_tuple(0, 0);
212 
213     if (BO->getOpcode() == Instruction::Shl) {
214         ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
215         if (!CI)
216             return std::make_tuple(0, 0);
217         int L = 0, R = 0;
218         std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
219         uint64_t ShAmt = CI->getZExtValue();
220         return std::make_tuple((L << ShAmt), (R << ShAmt));
221     }
222 
223     if (BO->getOpcode() == Instruction::Xor) {
224         ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
225         if (!CI || CI->getSExtValue() != -1)
226             return std::make_tuple(0, 0);
227         int L = 0, R = 0;
228         std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
229         return std::make_tuple(-L, -R);
230     }
231 
232 
233     IGC_ASSERT((BO->getOpcode() == Instruction::Add) || (BO->getOpcode() == Instruction::Sub));
234 
235     if (isa<Constant>(BO->getOperand(1)))
236         return countOperands(BO->getOperand(0), LHS, RHS);
237     int L0 = 0, L1 = 0;
238     std::tie(L0, L1) = countOperands(BO->getOperand(0), LHS, RHS);
239     int R0 = 0, R1 = 0;
240     std::tie(R0, R1) = countOperands(BO->getOperand(1), LHS, RHS);
241     if (BO->getOpcode() == Instruction::Add)
242         return std::make_tuple(L0 + R0, L1 + R1);
243 
244     IGC_ASSERT(BO->getOpcode() == Instruction::Sub);
245     return std::make_tuple(L0 - R0, L1 - R1);
246 }
247 
isNegatedByLB(Value * V,Value * X,Value * LB)248 static bool isNegatedByLB(Value* V, Value* X, Value* LB) {
249     // Check if `V` is calculated as LB - X +/- C, where C is constant.
250     int L = 0, R = 0;
251     std::tie(L, R) = countOperands(V, LB, X);
252     return (L == 1) && (R == -1);
253 }
254 
isNegatedBy2UB(Value * V,Value * X,Value * UB)255 static bool isNegatedBy2UB(Value* V, Value* X, Value* UB) {
256     // Check if `V` is calculated as 2UB - X +/- C, where C is constant.
257     int L = 0, R = 0;
258     std::tie(L, R) = countOperands(V, UB, X);
259     return (L == 2) && (R == -1);
260 }
261 
/// Recognize a "reflect a value back into [LB, UB]" loop and classify it as
/// likely running only a few iterations.
///
/// The loop must have a single exiting block ending in a conditional branch
/// whose first successor stays in the loop. The branch condition must be an
/// out-of-range test on a value X, and X must be the backedge value of a
/// header PHI (the loop counter LC). X must then be built from LC either
///   - as a two-input PHI merging `LB - LC +/- c0` and `2*UB - LC +/- c1`
///     guarded by a branch on `LC < LB`, or
///   - as `select(LC < LB, <1*LB ...>, <2*UB ...>) - LC`.
///
/// \returns LOOPCOUNT_LIKELY_SMALL when the pattern matches,
///          LOOPCOUNT_UNKNOWN otherwise.
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE1(Loop* L) {
    BasicBlock* Exit = L->getExitingBlock();
    if (!Exit)
        return LOOPCOUNT_UNKNOWN;

    BranchInst* Br = dyn_cast<BranchInst>(Exit->getTerminator());
    if (!Br || !Br->isConditional())
        return LOOPCOUNT_UNKNOWN;
    // The "condition is true" edge must be the one that keeps looping.
    if (!L->contains(Br->getSuccessor(0)))
        return LOOPCOUNT_UNKNOWN;

    Value* X = nullptr, * LB = nullptr, * UB = nullptr;
    bool Signed = false;
    std::tie(X, LB, UB, Signed) = isOutOfRangeComparison(Br->getCondition());
    if (!X) {
        // Not an explicit `X < LB || X > UB`; accept a single unsigned
        // `X > UB` / `X >= UB` and treat it as a range check against [0, UB].
        ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        switch (Cmp->getPredicate()) {
        default:
            return LOOPCOUNT_UNKNOWN;
        case CmpInst::ICMP_UGT:
        case CmpInst::ICMP_UGE:
            // A smart use of unsigned comparison on signed values to perform an
            // out-of-range check of (0, N).
            break;
        }
        X = Cmp->getOperand(0);
        LB = Constant::getNullValue(X->getType());
        UB = Cmp->getOperand(1);
        Signed = true;
    }

    // LC is the header PHI that carries X around the backedge.
    Value* LC = getLoopCounter(L, X);
    if (!LC)
        return LOOPCOUNT_UNKNOWN;

    if (PHINode * PN = dyn_cast<PHINode>(X)) {
        // Form 1: X merges the two reflected values from an if/else.
        if (PN->getNumIncomingValues() != 2)
            return LOOPCOUNT_UNKNOWN;
        BasicBlock* BB0 = PN->getIncomingBlock(0);
        BasicBlock* IfBB = BB0->getSinglePredecessor();
        if (!IfBB)
            return LOOPCOUNT_UNKNOWN;
        Br = dyn_cast<BranchInst>(IfBB->getTerminator());
        if (!Br || !Br->isConditional())
            return LOOPCOUNT_UNKNOWN;
        ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        CmpInst::Predicate Pred = Cmp->getPredicate();
        Value* LHS = Cmp->getOperand(0);
        Value* RHS = Cmp->getOperand(1);
        // Normalize the guard so that LC is on the left-hand side.
        if (LHS != LC) {
            std::swap(LHS, RHS);
            Pred = CmpInst::getSwappedPredicate(Pred);
        }
        if (LHS != LC)
            return LOOPCOUNT_UNKNOWN;
        if (!Signed)
            Pred = ICmpInst::getSignedPredicate(Pred);
        // The guard must be `LC < LB` (or `<=`).
        if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
            return LOOPCOUNT_UNKNOWN;
        if (RHS != LB)
            return LOOPCOUNT_UNKNOWN;

        // Incoming value 0 must be `LB - LC +/- C` and incoming value 1 must
        // be `2*UB - LC +/- C`.
        Value* X0 = PN->getIncomingValue(0);
        Value* X1 = PN->getIncomingValue(1);
        if (!isNegatedByLB(X0, LC, LB))
            return LOOPCOUNT_UNKNOWN;
        if (!isNegatedBy2UB(X1, LC, UB))
            return LOOPCOUNT_UNKNOWN;
    }
    else if (BinaryOperator * BO = dyn_cast<BinaryOperator>(X)) {
        // Form 2: X = select(guard, A, B) - LC.
        if (BO->getOpcode() != Instruction::Sub)
            return LOOPCOUNT_UNKNOWN;
        if (BO->getOperand(1) != LC)
            return LOOPCOUNT_UNKNOWN;
        SelectInst* SI = dyn_cast<SelectInst>(BO->getOperand(0));
        if (!SI)
            return LOOPCOUNT_UNKNOWN;
        ICmpInst* Cmp = dyn_cast<ICmpInst>(SI->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        CmpInst::Predicate Pred = Cmp->getPredicate();
        Value* LHS = Cmp->getOperand(0);
        Value* RHS = Cmp->getOperand(1);
        // Normalize the guard so that LC is on the left-hand side.
        if (LHS != LC) {
            std::swap(LHS, RHS);
            Pred = CmpInst::getSwappedPredicate(Pred);
        }
        if (LHS != LC)
            return LOOPCOUNT_UNKNOWN;
        if (!Signed)
            Pred = ICmpInst::getSignedPredicate(Pred);
        Value* X0 = SI->getTrueValue();
        Value* X1 = SI->getFalseValue();
        // Turn a "greater" guard into the inverse "less" guard by exchanging
        // the two select arms.
        if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE) {
            std::swap(X0, X1);
            Pred = CmpInst::getInversePredicate(Pred);
        }
        if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
            return LOOPCOUNT_UNKNOWN;
        if (RHS != LB)
            return LOOPCOUNT_UNKNOWN;
        // The "below LB" arm must contain LB once and the other arm must
        // contain UB twice; LC is subtracted by the enclosing `sub`.
        int L0 = 0, R0 = 0;
        std::tie(L0, R0) = countOperands(X0, LB, nullptr);
        int L1 = 0, R1 = 0;
        std::tie(L1, R1) = countOperands(X1, UB, nullptr);
        if (L0 != 1 || L1 != 2)
            return LOOPCOUNT_UNKNOWN;
    }
    else
        return LOOPCOUNT_UNKNOWN;

    // Ok, we found a loop of the following pattern:
    //
    // do {
    //   if (x < LB) {
    //      x = LB - x +/- c0;
    //   } else {
    //      x = 2 * UB - x +/- c1;
    //   }
    // } while (x < LB || x > UB);
    //
    // Such a loop will run only once or twice unless `x` is arbitrarily large.
    // If a non-uniform loop only runs several iterations, the divergence cost
    // due to SIMD32 could be ignored.
    return LOOPCOUNT_LIKELY_SMALL;
}
392 
estimateLoopCount_CASE2(Loop * L)393 unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE2(Loop* L) {
394     SmallVector<BasicBlock*, 8> ExitingBBs;
395     L->getExitingBlocks(ExitingBBs);
396 
397     Value* Init = nullptr, * Curr= nullptr, * Next= nullptr, * Step= nullptr;
398     std::tie(Init, Curr, Step, Next) = getInductionVariable(L);
399     if (!Init || !Curr || !Step || !Next)
400         return LOOPCOUNT_UNKNOWN;
401     ConstantInt* I0 = dyn_cast<ConstantInt>(Init);
402     ConstantInt* S0 = dyn_cast<ConstantInt>(Step);
403     if (!I0 || !S0)
404         return LOOPCOUNT_UNKNOWN;
405 
406     for (auto BB : ExitingBBs) {
407         BranchInst* Br = dyn_cast<BranchInst>(BB->getTerminator());
408         if (!Br || !Br->isConditional())
409             continue;
410         if (!L->contains(Br->getSuccessor(0))) // Not condition of `continue`.
411             continue;
412         ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
413         if (!WI->isUniform(Br)) {
414             BinaryOperator* BO = dyn_cast<BinaryOperator>(Br->getCondition());
415             if (!BO)
416                 continue;
417             if (BO->getOpcode() != Instruction::And)
418                 continue;
419             ICmpInst* Cond = nullptr;
420             ICmpInst* Op0 = dyn_cast<ICmpInst>(BO->getOperand(0));
421             if (Op0 && WI->isUniform(Op0))
422                 Cond = Op0;
423             if (!Cond) {
424                 ICmpInst* Op1 = dyn_cast<ICmpInst>(BO->getOperand(1));
425                 if (Op1 && WI->isUniform(Op1))
426                     Cond = Op1;
427             }
428             if (!Cond)
429                 continue;
430             Cmp = Cond;
431         }
432         if (!Cmp)
433             continue;
434         CmpInst::Predicate Pred = Cmp->getPredicate();
435         switch (Pred) {
436         default:
437             // TODO: Handle more predicates.
438             continue;
439         case ICmpInst::ICMP_SLT:
440         case ICmpInst::ICMP_ULT:
441             break;
442         }
443         Value* Op0 = Cmp->getOperand(0);
444         Value* Op1 = Cmp->getOperand(1);
445         if (Op0 != Next)
446             continue;
447         ConstantInt* E0 = dyn_cast<ConstantInt>(Op1);
448         if (!E0)
449             continue;
450         ConstantInt* N = dyn_cast<ConstantInt>(
451             Pred == ICmpInst::ICMP_SLT
452             ? ConstantExpr::getSDiv(ConstantExpr::getSub(E0, I0), S0)
453             : ConstantExpr::getUDiv(ConstantExpr::getSub(E0, I0), S0));
454         if (!N)
455             continue;
456         if (N->getValue().slt(0))
457             continue;
458         if (N->getValue().slt(100))
459             return LOOPCOUNT_LIKELY_SMALL;
460     }
461 
462     // Ok, we found a non-uniform loop with multiple exiting conditions.
463     // However, one of them is uniform one and has small loop count.
464     return LOOPCOUNT_UNKNOWN;
465 }
466 
estimateLoopCount(Loop * L)467 unsigned Simd32ProfitabilityAnalysis::estimateLoopCount(Loop* L) {
468     unsigned Ret;
469 
470     Ret = estimateLoopCount_CASE1(L);
471     if (Ret != LOOPCOUNT_UNKNOWN)
472         return Ret;
473 
474     Ret = estimateLoopCount_CASE2(L);
475     if (Ret != LOOPCOUNT_UNKNOWN)
476         return Ret;
477 
478     return Ret;
479 }
480 
getLoopCount(Value * Start,Value * End)481 static Value* getLoopCount(Value* Start, Value* End) {
482     // Poorman's loop count checking as we need to check that result with WIA.
483     ConstantInt* CStart = dyn_cast<ConstantInt>(Start);
484     ConstantInt* CEnd = dyn_cast<ConstantInt>(End);
485     if (CStart && CEnd)
486         return ConstantExpr::getSub(CEnd, CStart);
487 
488     if (CStart && CStart->isNullValue())
489         return End;
490 
491     BinaryOperator* BO = dyn_cast<BinaryOperator>(End);
492     if (!BO || BO->getOpcode() != Instruction::Add)
493         return nullptr;
494 
495     Value* Op0 = BO->getOperand(0);
496     Value* Op1 = BO->getOperand(1);
497     if (Op0 != Start)
498         std::swap(Op0, Op1);
499     if (Op0 == Start)
500         return Op1;
501 
502     return nullptr;
503 }
504 
505 /// hasIEEESqrtOrDivFunc - Check whether IEEE correctly-rounded SQRT or DIV is
506 /// used in the given function.
hasIEEESqrtOrDivFunc(const Function & F)507 static bool hasIEEESqrtOrDivFunc(const Function& F) {
508     for (auto& BB : F)
509         for (auto& I : BB) {
510             const GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(&I);
511             if (!GII)
512                 continue;
513             switch (GII->getIntrinsicID()) {
514             case GenISAIntrinsic::GenISA_IEEE_Sqrt:
515             case GenISAIntrinsic::GenISA_IEEE_Divide:
516                 return true;
517             default: break;
518             }
519         }
520     return false;
521 }
522 
523 /// hasSubGroupFunc - Check whether subgroup functions are used in the given
524 /// function.
hasSubGroupFunc(const Function & F)525 static bool hasSubGroupFunc(const Function& F)
526 {
527     for (auto& BB : F)
528     {
529         for (auto& I : BB)
530         {
531             if (isSubGroupIntrinsic(&I))
532             {
533                 return true;
534             }
535         }
536     }
537 
538     return false;
539 }
540 
runOnFunction(Function & F)541 bool Simd32ProfitabilityAnalysis::runOnFunction(Function& F)
542 {
543     this->F = &F;
544     CodeGenContext* context = nullptr;
545     context = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
546     if (context->type == ShaderType::OPENCL_SHADER)
547     {
548         PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
549         LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
550         WI = &getAnalysis<WIAnalysis>();
551         pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
552         m_isSimd16Profitable = checkSimd16Profitable(context);
553         m_isSimd32Profitable = m_isSimd16Profitable && checkSimd32Profitable(context);
554     }
555     else if (context->type == ShaderType::PIXEL_SHADER)
556     {
557         LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
558         m_isSimd32Profitable = checkPSSimd32Profitable();
559     }
560     return false;
561 }
562 
isPayloadHeader(Value * V)563 static bool isPayloadHeader(Value* V) {
564     Argument* Arg = dyn_cast<Argument>(V);
565     if (!Arg || !Arg->hasName())
566         return false;
567     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
568     if (!VTy || VTy->getNumElements() != 8 ||
569         !VTy->getElementType()->isIntegerTy(32))
570         return false;
571     return Arg->getName() != "payloadHeader";
572 }
573 
isR0(Value * V)574 static bool isR0(Value* V) {
575     Argument* Arg = dyn_cast<Argument>(V);
576     if (!Arg || !Arg->hasName())
577         return false;
578     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
579     if (!VTy || VTy->getNumElements() != 8 ||
580         !VTy->getElementType()->isIntegerTy(32))
581         return false;
582     return Arg->getName() != "r0";
583 }
584 
isEnqueuedLocalSize(Value * V)585 static bool isEnqueuedLocalSize(Value* V) {
586     Argument* Arg = dyn_cast<Argument>(V);
587     if (!Arg || !Arg->hasName())
588         return false;
589     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
590     if (!VTy || VTy->getNumElements() != 3 ||
591         !VTy->getElementType()->isIntegerTy(32))
592         return false;
593     return Arg->getName() != "enqueuedLocalSize";
594 }
595 
isGetGroupIdX(Value * V)596 static bool isGetGroupIdX(Value* V) {
597     auto EEI = dyn_cast<ExtractElementInst>(V);
598     if (!EEI)
599         return false;
600     if (!EEI->getType()->isIntegerTy(32))
601         return false;
602     auto CI = dyn_cast<Constant>(EEI->getOperand(1));
603     if (!CI || !CI->isOneValue())
604         return false;
605     return isR0(EEI->getOperand(0));
606 }
607 
isGetEnqueuedLocalSizeX(Value * V)608 static bool isGetEnqueuedLocalSizeX(Value* V) {
609     auto EEI = dyn_cast<ExtractElementInst>(V);
610     if (!EEI)
611         return false;
612     if (!EEI->getType()->isIntegerTy(32))
613         return false;
614     auto CI = dyn_cast<Constant>(EEI->getOperand(1));
615     if (!CI || !CI->isNullValue())
616         return false;
617     return isEnqueuedLocalSize(EEI->getOperand(0));
618 }
619 
isGetLocalIdX(Value * V)620 static bool isGetLocalIdX(Value* V) {
621     if (auto ZEI = dyn_cast<ZExtInst>(V))
622         return isGetLocalIdX(ZEI->getOperand(0));
623     Argument* Arg = dyn_cast<Argument>(V);
624     if (!Arg || !Arg->hasName())
625         return false;
626     if (!Arg->getType()->isIntegerTy(16))
627         return false;
628     return Arg->getName() == "localIdX";
629 }
630 
isGetGlobalOffsetX(Value * V)631 static bool isGetGlobalOffsetX(Value* V) {
632     auto EEI = dyn_cast<ExtractElementInst>(V);
633     if (!EEI)
634         return false;
635     if (!EEI->getType()->isIntegerTy(32))
636         return false;
637     auto CI = dyn_cast<Constant>(EEI->getOperand(1));
638     if (!CI || !CI->isNullValue())
639         return false;
640     return isPayloadHeader(EEI->getOperand(0));
641 }
642 
isGetGlobalIdX(Value * V)643 static bool isGetGlobalIdX(Value* V) {
644     // GlobalIdX = GroupIdX * EnqueuedLocalSizeX + LocalIdX + GlobalOffsetX
645     auto BO = dyn_cast<BinaryOperator>(V);
646     if (!BO || BO->getOpcode() != Instruction::Add)
647         return false;
648 
649     auto BO1 = dyn_cast<BinaryOperator>(BO->getOperand(0));
650     auto A0 = BO->getOperand(1);
651     if (!BO1) {
652         BO1 = dyn_cast<BinaryOperator>(BO->getOperand(1));
653         A0 = BO->getOperand(0);
654     }
655     if (!BO1 || BO1->getOpcode() != Instruction::Add)
656         return false;
657 
658     auto BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(0));
659     auto A1 = BO1->getOperand(1);
660     if (!BO2) {
661         BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(1));
662         A1 = BO1->getOperand(0);
663     }
664     if (!BO2 || BO2->getOpcode() != Instruction::Mul)
665         return false;
666 
667     auto M0 = BO2->getOperand(0);
668     auto M1 = BO2->getOperand(1);
669 
670     if (!((isGetGroupIdX(M0) && isGetEnqueuedLocalSizeX(M1)) ||
671         (isGetGroupIdX(M1) && isGetEnqueuedLocalSizeX(M0))))
672         return false;
673 
674     return ((isGetLocalIdX(A0) && isGetGlobalOffsetX(A1)) ||
675         (isGetLocalIdX(A1) && isGetGlobalOffsetX(A0)));
676 }
677 
isSelectBasedOnGlobalIdX(Value * V)678 bool Simd32ProfitabilityAnalysis::isSelectBasedOnGlobalIdX(Value* V) {
679     PHINode* PN = dyn_cast<PHINode>(V);
680     while (!PN) {
681         auto BO = dyn_cast<BinaryOperator>(V);
682         if (!BO || BO->getOpcode() != Instruction::Shl)
683             return false;
684         if (!isa<Constant>(BO->getOperand(1)))
685             return false;
686         V = BO->getOperand(0);
687         PN = dyn_cast<PHINode>(V);
688     }
689 
690     if (PN->getNumIncomingValues() != 2)
691         return false;
692 
693     auto Op0 = PN->getIncomingValue(0);
694     if (!WI->isUniform(Op0))
695         return false;
696     auto Op1 = PN->getIncomingValue(1);
697     if (!WI->isUniform(Op1))
698         return false;
699 
700     auto BB0 = PN->getIncomingBlock(0);
701     auto BB1 = PN->getIncomingBlock(1);
702     auto IfBB = BB0->getSinglePredecessor();
703     if (!IfBB || IfBB == BB1->getSinglePredecessor())
704         return false;
705     auto Br = dyn_cast<BranchInst>(IfBB->getTerminator());
706     if (!Br || !Br->isConditional())
707         return false;
708 
709     ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
710     if (!Cmp)
711         return false;
712     Value* LHS = Cmp->getOperand(0);
713     Value* RHS = Cmp->getOperand(1);
714     switch (Cmp->getPredicate()) {
715     default:
716         return false;
717     case CmpInst::ICMP_SLT:
718     case CmpInst::ICMP_SLE:
719         break;
720     case CmpInst::ICMP_SGT:
721     case CmpInst::ICMP_SGE:
722         std::swap(LHS, RHS);
723         break;
724     }
725     if (!WI->isUniform(RHS))
726         return false;
727     return !isGetGlobalIdX(LHS);
728 }
729 
checkSimd32Profitable(CodeGenContext * ctx)730 bool Simd32ProfitabilityAnalysis::checkSimd32Profitable(CodeGenContext* ctx)
731 {
732     // If a kernel is too big, it would probably have enough work for EUs
733     // even without simd32; and simd32 would have more visa variables than
734     // 64K limit (ocl c99 64 bit PrintHalf/half8.c for example); thus make
735     // sense to skip simd32.
736     size_t programSize = 0;
737     for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
738     {
739         BasicBlock* BB = &*FI;
740         programSize += BB->size();
741     }
742     if (programSize > 8000)
743     {
744         return false;
745     }
746 
747     // If we have workgroup size (or workgroup size hint) metadata, check whether the X dimension
748     // is expected to be of size 16 or below. If it is, no point in using SIMD32, we'll just
749     // get empty lanes.
750     auto funcInfoMD = pMdUtils->findFunctionsInfoItem(F);
751     if (funcInfoMD != pMdUtils->end_FunctionsInfo())
752     {
753         ThreadGroupSizeMetaDataHandle tgSize = funcInfoMD->second->getThreadGroupSize();
754         ThreadGroupSizeMetaDataHandle tgSizeHint = funcInfoMD->second->getThreadGroupSizeHint();
755 
756         if (ctx->getModuleMetaData()->csInfo.maxWorkGroupSize && ctx->getModuleMetaData()->csInfo.maxWorkGroupSize <= 16)
757             return false;
758 
759         if ((tgSize->hasValue() && (tgSize->getXDim() * tgSize->getYDim() * tgSize->getZDim()) <= 16) ||
760             (tgSizeHint->hasValue() && (tgSizeHint->getXDim() * tgSizeHint->getYDim() * tgSizeHint->getZDim()) <= 16)) {
761             return false;
762         }
763     }
764 
765     // WORKAROUND - Skip SIMD32 if subgroup functions are present.
766     if (hasSubGroupFunc(*F)) {
767         return false;
768     }
769 
770     const CPlatform* platform = &ctx->platform;
771     switch (platform->GetPlatformFamily()) {
772     case IGFX_GEN9_CORE:
773         /* TODO: Try to apply for platform->getPlatformInfo().eProductFamily ==
774          * IGFX_BROXTON only. */
775          // FALL THROUGH
776     case IGFX_GEN10_CORE:
777         if (hasIEEESqrtOrDivFunc(*F)) {
778             return false;
779         }
780         break;
781     default:
782         break;
783     }
784     // END OF WORKAROUND
785 
786     // Ok, that's not the case.
787     // Now, check whether we have any non-uniform loops.
    // The idea is that if there are divergent loops, then SIMD32 will be harmful,
789     // because we'll waste time running loops with very few full lanes.
790     // If there are no divergent loops, SIMD32 is worth a shot. It still may not
791     // be selected, due to spills.
792     for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li) {
793         llvm::Loop* loop = *li;
794 
795         SmallVector<BasicBlock*, 8> exitingBlocks;
796         loop->getExitingBlocks(exitingBlocks);
797 
798         bool AllUniform = true;
799         for (auto BBI = exitingBlocks.begin(), BBE = exitingBlocks.end(); BBI != BBE; ++BBI) {
800             BasicBlock* block = *BBI;
801 
802             Instruction* term = block->getTerminator();
803             if (!WI->isUniform(term)) {
804                 auto Br = dyn_cast<BranchInst>(term);
805                 // Check special case for non-uniform loop where, except the
806                 // initial, current, and next values, STEP and COUNT are
807                 // uniform. In such a case, the loop is only diverged at the
808                 // termination. It should be still profitable to be compiled
809                 // into SIMD32 mode.
810                 if (Br && Br->isConditional()) {
811                     auto ICmp = dyn_cast<ICmpInst>(Br->getCondition());
812                     if (ICmp) {
813                         Value* Init = nullptr, * Curr = nullptr, * Step= nullptr, * Next = nullptr;
814                         std::tie(Init, Curr, Step, Next)
815                             = getInductionVariable(loop);
816                         if (Init && Curr && Next && Step &&
817                             WI->isUniform(Step)) {
818                             auto Op0 = ICmp->getOperand(0);
819                             auto Op1 = ICmp->getOperand(1);
820                             if (SExtInst *SI0 = dyn_cast<SExtInst>(Op0))
821                                 Op0 = SI0->getOperand(0);
822                             if (SExtInst *SI1 = dyn_cast<SExtInst>(Op1))
823                                 Op1 = SI1->getOperand(0);
824                             if (Op0 != Next && Op0 != Curr)
825                                 std::swap(Op0, Op1);
826                             // Skip non-uniform loop which only terminates on
827                             // comparison between non-uniform induction variable
828                             // and uniform value.
829                             if (Op0 == Next || Op0 == Curr) {
830                                 // TODO: Need to check whether Init is linear to
831                                 // global/local ID. However, that checking is not
832                                 // that straightforward before code emitter.
833                                 if (WI->isUniform(Op1))
834                                     continue;
835                                 // TODO: Eable IndVarSimplify to simlify the
836                                 // following check.
837                                 if (Value * Count = getLoopCount(Init, Op1)) {
838                                     if (WI->isUniform(Count))
839                                         continue;
840                                     if (isSelectBasedOnGlobalIdX(Count))
841                                         continue;
842                                 }
843                             }
844                         }
845                     }
846                 }
847                 AllUniform = false;
848                 break;
849             }
850         }
851         if (!AllUniform) {
852             switch (estimateLoopCount(loop)) {
853             case LOOPCOUNT_LIKELY_LARGE:
854             case LOOPCOUNT_UNKNOWN:
855                 return false;
856             case LOOPCOUNT_LIKELY_SMALL:
857                 break;
858             }
859         }
860     }
861 
862     return true;
863 }
864 
/// Cyclomatic complexity measures the number of linearly independent paths
/// through a region.
///
///   M = a * E - N + 2, where
///   E = the number of edges of the graph,
///   N = the number of nodes of the graph,
///   a = scalar factor (1 for uniform branches).
///
/// We focus on loops instead of the entire program, since cyclomatic
/// complexity is roughly additive when concatenating two programs, i.e.
///   CC(F # G) = (E1 + E2 + 1) - (N1 + N2) + 2
///             = (E1 - N1 + 2) + (E2 - N2 + 2) - 1
///             = CC(F) + CC(G) - 1.
///
/// Loop complexity is compared against this threshold with `>=` when
/// deciding whether SIMD16 is profitable.
static constexpr unsigned CYCLOMATIC_COMPLEXITY_THRESHOLD = 200;
880 
getLoopCyclomaticComplexity()881 unsigned Simd32ProfitabilityAnalysis::getLoopCyclomaticComplexity() {
882     unsigned MaxCC = 0;
883     for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
884         Loop* L = *I;
885         unsigned CC = 2;
886         for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
887             BasicBlock* BB = *BI;
888             IGCLLVM::TerminatorInst* TI = BB->getTerminator();
889             bool IsUniform = WI->isUniform(TI);
890             CC += TI->getNumSuccessors() * (IsUniform ? 1 : 2);
891         }
892         CC -= L->getNumBlocks();
893         MaxCC = std::max(CC, MaxCC);
894     }
895     return MaxCC;
896 }
897 
getNumOfNonUniformExits(Loop * L,WIAnalysis * WI)898 static unsigned getNumOfNonUniformExits(Loop* L, WIAnalysis* WI) {
899     SmallVector<BasicBlock*, 8> ExistingBlocks;
900     L->getExitingBlocks(ExistingBlocks);
901     unsigned Count = 0;
902     for (auto BB : ExistingBlocks) {
903         IGCLLVM::TerminatorInst* TI = BB->getTerminator();
904         bool IsUniform = WI->isUniform(TI);
905         Count += !IsUniform;
906     }
907 
908     return Count;
909 }
910 
911 /// Check if a loop or its subloop has multiple non-uniform exists.
hasMultipleExits(Loop * L,WIAnalysis * WI)912 static bool hasMultipleExits(Loop* L, WIAnalysis* WI) {
913     if (getNumOfNonUniformExits(L, WI) > 1)
914         return true;
915     for (auto InnerL : L->getSubLoops())
916         if (hasMultipleExits(InnerL, WI))
917             return true;
918     return false;
919 }
920 
921 /// Given a loop, return nested (inner) loops with multiple non-uniform exits.
922 /// E.g. assume L2, L3, L5, L7 are only loops with multiple non-uniform exists
923 /// L1
924 ///    L2
925 ///       L3
926 ///    L4
927 ///       L5
928 ///          L6
929 ///             L7
930 /// then it returns {L2, L5}
931 ///
getNestedLoopsWithMultpleExists(Loop * L,WIAnalysis * WI,SmallVectorImpl<Loop * > & Result)932 static void getNestedLoopsWithMultpleExists(Loop* L, WIAnalysis* WI,
933     SmallVectorImpl<Loop*>& Result) {
934     if (getNumOfNonUniformExits(L, WI) > 1) {
935         for (auto InnerL : L->getSubLoops()) {
936             if (hasMultipleExits(InnerL, WI)) {
937                 Result.push_back(L);
938                 return;
939             }
940         }
941         // Only a single level, do not add into the result.
942         return;
943     }
944 
945     // Outer loop is normal. Check its inner loop structure, recursively.
946     for (auto InnerL : L->getSubLoops())
947         getNestedLoopsWithMultpleExists(InnerL, WI, Result);
948 }
949 
950 
951 /// Check if loops with multiple exists dominate the entire function.
hasNestedLoopsWithMultipleExits(Function * F,LoopInfo * LI,WIAnalysis * WI)952 static bool hasNestedLoopsWithMultipleExits(Function* F, LoopInfo* LI,
953     WIAnalysis* WI) {
954     // Find top level nested loops with multiple non-uniform exists.
955     SmallVector<Loop*, 8> Loops;
956     for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
957         Loop* L = *I;
958         getNestedLoopsWithMultpleExists(L, WI, Loops);
959     }
960 
961     // Sum the IR size of these loops.
962     unsigned LoopSize = 0;
963     for (auto L : Loops)
964         for (auto BB : L->getBlocks())
965             LoopSize += (unsigned)BB->size();
966 
967     // Check the ratio between nested loops with multiple exists and the total
968     // number of instructions. A higher ratio means these loops dominate this
969     // kernel.
970     unsigned FuncSize = 0;
971     for (auto& BB : F->getBasicBlockList())
972         FuncSize += (unsigned)BB.size();
973 
974     bool retVal = false;
975     if (FuncSize > 0)
976     {
977         retVal = float(LoopSize) / FuncSize >= 0.7f;
978     }
979 
980     return retVal;
981 }
982 
hasLongStridedLdStInLoop(Function * F,LoopInfo * LI,WIAnalysis * WI)983 static bool hasLongStridedLdStInLoop(Function* F, LoopInfo* LI, WIAnalysis* WI) {
984     SmallVector<Loop*, 32> Loops;
985     // Collect innermost simple loop.
986     for (auto I = LI->begin(), E = LI->end(); I != E; ++I) {
987         auto L = *I;
988         if (!IGCLLVM::isInnermost(L))
989             continue;
990         if (L->getNumBlocks() != 2)
991             continue;
992         auto* Latch = L->getLoopLatch();
993         if (!Latch || !Latch->front().isTerminator())
994             continue;
995         Loops.push_back(L);
996     }
997     unsigned LDs = 0;
998     unsigned STs = 0;
999     for (auto L : Loops) {
1000         auto BB = L->getHeader();
1001         for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
1002             if (auto LD = dyn_cast<LoadInst>(&*I)) {
1003                 VectorType* VTy = dyn_cast<VectorType>(LD->getType());
1004                 if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
1005                     continue;
1006                 if (WI->isUniform(LD))
1007                     continue;
1008                 ++LDs;
1009             }
1010             if (auto ST = dyn_cast<StoreInst>(&*I)) {
1011                 Value* Ptr = ST->getPointerOperand();
1012                 Value* Val = ST->getValueOperand();
1013                 VectorType* VTy = dyn_cast<VectorType>(Val->getType());
1014                 if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
1015                     continue;
1016                 if (WI->isUniform(Ptr))
1017                     continue;
1018                 ++STs;
1019             }
1020         }
1021         if (LDs > 3 || STs > 3)
1022             return true;
1023     }
1024     return false;
1025 }
1026 
checkSimd16Profitable(CodeGenContext * ctx)1027 bool Simd32ProfitabilityAnalysis::checkSimd16Profitable(CodeGenContext* ctx) {
1028     if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x1) &&
1029         getLoopCyclomaticComplexity() >= CYCLOMATIC_COMPLEXITY_THRESHOLD) {
1030         return false;
1031     }
1032 
1033     if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x2) &&
1034         hasNestedLoopsWithMultipleExits(F, LI, WI)) {
1035         return false;
1036     }
1037 
1038     // If there's wider vector load/store in a loop, skip SIMD16.
1039     if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x4) &&
1040         hasLongStridedLdStInLoop(F, LI, WI)) {
1041         return false;
1042     }
1043 
1044     auto hasDouble = [](Function& F) {
1045         for (auto& BB : F)
1046             for (auto& I : BB) {
1047                 if (I.getType()->isDoubleTy())
1048                     return true;
1049                 for (Value* V : I.operands())
1050                     if (V->getType()->isDoubleTy())
1051                         return true;
1052             }
1053         return false;
1054     };
1055 
1056     const CPlatform* platform = &ctx->platform;
1057     if (platform->GetPlatformFamily() == IGFX_GEN9_CORE &&
1058         platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE &&
1059         hasDouble(*F)) {
1060         return false;
1061     }
1062 
1063     return true;
1064 }
1065 
checkPSSimd32Profitable()1066 bool Simd32ProfitabilityAnalysis::checkPSSimd32Profitable()
1067 {
1068     unsigned int numberInstructions = 0;
1069     unsigned int numberOfHalfInstructions = 0;
1070     unsigned int numberOfCmp = 0;
1071     unsigned int numberOfSample = 0;
1072     unsigned int numberOfBB = 0;
1073     BasicBlock* returnBlock = nullptr;
1074     bool hasDiscard = F->getParent()->getNamedMetadata("KillPixel") != nullptr;
1075     for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
1076     {
1077         for (auto II = FI->begin(), IE = FI->end(); II != IE; ++II)
1078         {
1079             if (II->getType() == Type::getHalfTy(F->getContext()))
1080             {
1081                 numberOfHalfInstructions++;
1082             }
1083             if (isa<CmpInst>(*II))
1084             {
1085                 numberOfCmp++;
1086             }
1087             if (isSampleLoadGather4InfoInstruction(&(*II)))
1088             {
1089                 numberOfSample++;
1090             }
1091             numberInstructions++;
1092         }
1093         if (isa<ReturnInst>(FI->getTerminator()))
1094         {
1095             returnBlock = &(*FI);
1096         }
1097         numberOfBB++;
1098     }
1099     if (numberInstructions > 4000 || numberInstructions == 0)
1100     {
1101         return false;
1102     }
1103 
1104     // Original SIMD32 heurtistic
1105     // if 1BB, short, has sample, no discard, no cmp, enable SIMD32
1106     // skip cmp to avoid flag spill
1107     if (!hasDiscard && numberOfCmp == 0 && numberOfSample > 0 && numberOfBB == 1 && numberInstructions < 80)
1108     {
1109         return true;
1110     }
1111 
1112     // disable SIMD32 for shader with multiple render target as it puts pressure on the render cache
1113     unsigned int numberRTWrite = 0;
1114     for (auto it = returnBlock->begin(), ie = returnBlock->end(); it != ie; ++it)
1115     {
1116         if (GenIntrinsicInst * intr = dyn_cast<GenIntrinsicInst>(it))
1117         {
1118             if (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_RTWrite)
1119             {
1120                 numberRTWrite++;
1121             }
1122         }
1123     }
1124     if (numberRTWrite > 1)
1125     {
1126         return false;
1127     }
1128 
1129     // Case where we expect to be bound by pixel dispatch time. For small shaderd without IO
1130     // It is better to go with SIMD32
1131     if (returnBlock == &F->getEntryBlock() && !hasDiscard)
1132     {
1133         bool hasIO = false;
1134         unsigned int numberInstructions = returnBlock->size();
1135         if (numberInstructions < 10)
1136         {
1137             for (auto II = returnBlock->begin(), IE = returnBlock->end(); II != IE; ++II)
1138             {
1139                 if (II->mayReadOrWriteMemory() && !isa<RTWritIntrinsic>(II))
1140                 {
1141                     hasIO = true;
1142                     break;
1143                 }
1144                 if (isa<SampleIntrinsic>(II) ||
1145                     isa<SamplerLoadIntrinsic>(II) ||
1146                     isa<InfoIntrinsic>(II) ||
1147                     isa<SamplerGatherIntrinsic>(II))
1148                 {
1149                     hasIO = true;
1150                     break;
1151                 }
1152             }
1153             if (!hasIO)
1154             {
1155                 // for small program without IO using SIMD32 allows hiding the thread dispatch time
1156                 return true;
1157             }
1158         }
1159     }
1160 
1161     if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicFP16))
1162     {
1163         // If we have a large ratio of half use SIMD32 to hide latency better
1164         float ratioHalf = (float)numberOfHalfInstructions / (float)numberInstructions;
1165         if (ratioHalf >= 0.5f)
1166         {
1167             return true;
1168         }
1169     }
1170 
1171     if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicLoopAndDiscard))
1172     {
1173         // If we have a discard and the first block is small we may be bound by PSD so we try to enable SIMD32
1174         if (hasDiscard)
1175         {
1176             BasicBlock& entryBB = F->getEntryBlock();
1177             if (!isa<ReturnInst>(entryBB.getTerminator()) && entryBB.size() < 50)
1178             {
1179                 return true;
1180             }
1181         }
1182 
1183         // If we have a loop with high latency enable SIMD32 to reduce latency
1184         unsigned int numberOfInstructions = 0;
1185         unsigned int numberOfHighLatencyInst = 0;
1186         for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li)
1187         {
1188             llvm::Loop* loop = *li;
1189             for (auto BI = loop->block_begin(), BE = loop->block_end(); BI != BE; ++BI)
1190             {
1191                 for (auto II = (*BI)->begin(), IE = (*BI)->end(); II != IE; ++II)
1192                 {
1193                     if (isa<SampleIntrinsic>(II))
1194                     {
1195                         numberOfHighLatencyInst++;
1196                     }
1197                     numberOfInstructions++;
1198                 }
1199             }
1200         }
1201         if (numberOfInstructions < 85 && numberOfHighLatencyInst >= 1)
1202         {
1203             // high latency small loop
1204             return true;
1205         }
1206     }
1207     return false;
1208 }
1209