1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "Compiler/CISACodeGen/Simd32Profitability.hpp"
10 #include "Compiler/CodeGenPublic.h"
11 #include "Compiler/IGCPassSupport.h"
12 #include "Compiler/CISACodeGen/Platform.hpp"
13 #include "common/LLVMWarningsPush.hpp"
14 #include <llvmWrapper/IR/DerivedTypes.h>
15 #include <llvmWrapper/Transforms/Utils/LoopUtils.h>
16 #include <llvm/IR/InstIterator.h>
17 #include <llvm/IR/Operator.h>
19 #include "common/LLVMWarningsPop.hpp"
20 #include "GenISAIntrinsics/GenIntrinsics.h"
21 #include "GenISAIntrinsics/GenIntrinsicInst.h"
22 #include "Probe/Assertion.h"
23
24 using namespace llvm;
25 using namespace IGC;
26 using namespace IGC::IGCMD;
27
28 // Register pass to igc-opt
29 #define PASS_FLAG "simd32-profit"
30 #define PASS_DESCRIPTION "Check SIMD32 Profitability for OpenCL"
31 #define PASS_CFG_ONLY false
32 #define PASS_ANALYSIS true
33 IGC_INITIALIZE_PASS_BEGIN(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
34 IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
35 IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
36 IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
37 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
38 IGC_INITIALIZE_PASS_END(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
39
40 char Simd32ProfitabilityAnalysis::ID = 0;
41
42 const unsigned BRANCHY_MINPATH = 8;
43
Simd32ProfitabilityAnalysis::Simd32ProfitabilityAnalysis()
45 : FunctionPass(ID), F(nullptr), PDT(nullptr), LI(nullptr),
46 pMdUtils(nullptr), WI(nullptr), m_isSimd32Profitable(true),
47 m_isSimd16Profitable(true) {
48 initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
49 }
50
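// getInductionVariable recognizes a simple "i = i + STEP" recurrence rooted at
// a header PHI and returns the tuple (INIT, CURR, STEP, NEXT), or all nullptrs
// on failure. As an illustrative sketch (an assumed example, not taken from any
// particular kernel), it matches loops shaped like:
//
//   int i = INIT;            // INIT - incoming value from the preheader
//   do {                     // CURR - the header PHI for i
//       ...
//       i = i + STEP;        // NEXT - the add feeding the back edge
//   } while (...);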
51 static std::tuple<Value* /*INIT*/, Value* /*CURR*/, Value* /*STEP*/, Value* /*NEXT*/>
getInductionVariable(Loop* L) {
53 BasicBlock* H = L->getHeader();
54
55 BasicBlock* Incoming = 0, *Backedge = 0;
56 pred_iterator PI = pred_begin(H);
57 IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
58 Backedge = *PI++;
59 if (PI == pred_end(H)) // dead loop
60 return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
61 Incoming = *PI++;
62 if (PI != pred_end(H)) // multiple backedges?
63 return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
64
65 if (L->contains(Incoming)) {
66 if (L->contains(Backedge))
67 return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
68 std::swap(Incoming, Backedge);
69 }
70 else if (!L->contains(Backedge))
71 return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
72
73 // Loop over all of the PHI nodes, looking for an indvar.
74 for (auto I = H->begin(); isa<PHINode>(I); ++I) {
75 PHINode* PN = cast<PHINode>(I);
76 if (auto Inc = dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge))) {
77 if (Inc->getOpcode() == Instruction::Add && Inc->getOperand(0) == PN) {
78 return
79 std::make_tuple(PN->getIncomingValueForBlock(Incoming), PN,
80 Inc->getOperand(1), Inc);
81 }
82 }
83 }
84
85 return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
86 }
87
88 enum {
89 LOOPCOUNT_LIKELY_SMALL,
90 LOOPCOUNT_LIKELY_LARGE,
91 LOOPCOUNT_UNKNOWN
92 };
93
static bool isSignedPredicate(CmpInst::Predicate Pred) {
95 switch (Pred) {
96 default: break;
97 case CmpInst::ICMP_EQ:
98 case CmpInst::ICMP_NE:
99 case CmpInst::ICMP_SGT:
100 case CmpInst::ICMP_SLT:
101 case CmpInst::ICMP_SGE:
102 case CmpInst::ICMP_SLE:
103 return true;
104 }
105 return false;
106 }
107
static bool isUnsignedPredicate(CmpInst::Predicate Pred) {
109 switch (Pred) {
110 default: break;
111 case CmpInst::ICMP_EQ:
112 case CmpInst::ICMP_NE:
113 case CmpInst::ICMP_UGT:
114 case CmpInst::ICMP_ULT:
115 case CmpInst::ICMP_UGE:
116 case CmpInst::ICMP_ULE:
117 return true;
118 }
119 return false;
120 }
121
static bool hasSameSignedness(CmpInst::Predicate LHS, CmpInst::Predicate RHS) {
123 if (isSignedPredicate(LHS) && isSignedPredicate(RHS))
124 return true;
125 if (isUnsignedPredicate(LHS) && isUnsignedPredicate(RHS))
126 return true;
127 return false;
128 }
129
130 static std::tuple<Value*, Value*, Value*, bool>
isOutOfRangeComparison(Value* Cond) {
132 BinaryOperator* BO = dyn_cast<BinaryOperator>(Cond);
133 if (!BO || BO->getOpcode() != Instruction::Or)
134 return std::make_tuple(nullptr, nullptr, nullptr, false);
135
136 ICmpInst* LHS = dyn_cast<ICmpInst>(BO->getOperand(0));
137 ICmpInst* RHS = dyn_cast<ICmpInst>(BO->getOperand(1));
138
139 if (!LHS || !RHS)
140 return std::make_tuple(nullptr, nullptr, nullptr, false);
141
142 CmpInst::Predicate P0 = LHS->getPredicate();
143 CmpInst::Predicate P1 = RHS->getPredicate();
144
145 if (!hasSameSignedness(P0, P1))
146 return std::make_tuple(nullptr, nullptr, nullptr, false);
147
148 // Simplify the checking since they have the same signedness.
149 P0 = ICmpInst::getSignedPredicate(P0);
150 P1 = ICmpInst::getSignedPredicate(P1);
151
152 if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE)) {
153 std::swap(LHS, RHS);
154 std::swap(P0, P1);
155 }
156 if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE) ||
157 !(P1 == CmpInst::ICMP_SGT || P1 == CmpInst::ICMP_SGE))
158 return std::make_tuple(nullptr, nullptr, nullptr, false);
159
160 if (LHS->getOperand(0) != RHS->getOperand(0))
161 return std::make_tuple(nullptr, nullptr, nullptr, false);
162
163 return std::make_tuple(LHS->getOperand(0),
164 LHS->getOperand(1), RHS->getOperand(1),
165 isSignedPredicate(LHS->getPredicate()));
166 }
167
static Value* getLoopCounter(Loop* L, Value* X) {
169 BasicBlock* H = L->getHeader();
170
171 BasicBlock* Incoming = 0, *Backedge = 0;
172 pred_iterator PI = pred_begin(H);
173 IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
174 Backedge = *PI++;
175 if (PI == pred_end(H)) // dead loop
176 return nullptr;
177 Incoming = *PI++;
178 if (PI != pred_end(H)) // multiple backedges?
179 return nullptr;
180
181 if (L->contains(Incoming)) {
182 if (L->contains(Backedge))
183 return nullptr;
184 std::swap(Incoming, Backedge);
185 }
186 else if (!L->contains(Backedge))
187 return nullptr;
188
189 for (auto I = H->begin(); isa<PHINode>(I); ++I) {
190 PHINode* PN = cast<PHINode>(I);
191 if (X == PN->getIncomingValueForBlock(Backedge))
192 return PN;
193 }
194
195 return nullptr;
196 }
197
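// countOperands returns how many times LHS and RHS occur (with sign) in an
// expression of the form m*LHS + n*RHS +/- C. A few illustrative cases
// (hypothetical operands, shown only to document the contract):
//
//   countOperands(L - R + 4, L, R)    -> (1, -1)
//   countOperands((R << 1) - X, R, X) -> (2, -1)
//   countOperands(X ^ -1, L, X)       -> (0, -1)   // xor with -1 negates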
198 static std::tuple<int, int>
countOperands(Value* V, Value* LHS, Value* RHS) {
200 if (V == LHS || V == RHS)
201 return std::make_tuple((V == LHS), (V == RHS));
202
203 // Count LHS, RHS in an expression like m*L + n*R +/- C, where C is
204 // constant.
205 BinaryOperator* BO = dyn_cast<BinaryOperator>(V);
206 if (!BO ||
207 (BO->getOpcode() != Instruction::Add &&
208 BO->getOpcode() != Instruction::Sub &&
209 BO->getOpcode() != Instruction::Shl &&
210 BO->getOpcode() != Instruction::Xor))
211 return std::make_tuple(0, 0);
212
213 if (BO->getOpcode() == Instruction::Shl) {
214 ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
215 if (!CI)
216 return std::make_tuple(0, 0);
217 int L = 0, R = 0;
218 std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
219 uint64_t ShAmt = CI->getZExtValue();
220 return std::make_tuple((L << ShAmt), (R << ShAmt));
221 }
222
223 if (BO->getOpcode() == Instruction::Xor) {
224 ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
225 if (!CI || CI->getSExtValue() != -1)
226 return std::make_tuple(0, 0);
227 int L = 0, R = 0;
228 std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
229 return std::make_tuple(-L, -R);
230 }
231
232
233 IGC_ASSERT((BO->getOpcode() == Instruction::Add) || (BO->getOpcode() == Instruction::Sub));
234
235 if (isa<Constant>(BO->getOperand(1)))
236 return countOperands(BO->getOperand(0), LHS, RHS);
237 int L0 = 0, L1 = 0;
238 std::tie(L0, L1) = countOperands(BO->getOperand(0), LHS, RHS);
239 int R0 = 0, R1 = 0;
240 std::tie(R0, R1) = countOperands(BO->getOperand(1), LHS, RHS);
241 if (BO->getOpcode() == Instruction::Add)
242 return std::make_tuple(L0 + R0, L1 + R1);
243
244 IGC_ASSERT(BO->getOpcode() == Instruction::Sub);
245 return std::make_tuple(L0 - R0, L1 - R1);
246 }
247
static bool isNegatedByLB(Value* V, Value* X, Value* LB) {
249 // Check if `V` is calculated as LB - X +/- C, where C is constant.
250 int L = 0, R = 0;
251 std::tie(L, R) = countOperands(V, LB, X);
252 return (L == 1) && (R == -1);
253 }
254
static bool isNegatedBy2UB(Value* V, Value* X, Value* UB) {
256 // Check if `V` is calculated as 2UB - X +/- C, where C is constant.
257 int L = 0, R = 0;
258 std::tie(L, R) = countOperands(V, UB, X);
259 return (L == 2) && (R == -1);
260 }
261
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE1(Loop* L) {
263 BasicBlock* Exit = L->getExitingBlock();
264 if (!Exit)
265 return LOOPCOUNT_UNKNOWN;
266
267 BranchInst* Br = dyn_cast<BranchInst>(Exit->getTerminator());
268 if (!Br || !Br->isConditional())
269 return LOOPCOUNT_UNKNOWN;
270 if (!L->contains(Br->getSuccessor(0)))
271 return LOOPCOUNT_UNKNOWN;
272
273 Value* X = nullptr, * LB = nullptr, * UB = nullptr;
274 bool Signed = false;
275 std::tie(X, LB, UB, Signed) = isOutOfRangeComparison(Br->getCondition());
276 if (!X) {
277 ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
278 if (!Cmp)
279 return LOOPCOUNT_UNKNOWN;
280 switch (Cmp->getPredicate()) {
281 default:
282 return LOOPCOUNT_UNKNOWN;
283 case CmpInst::ICMP_UGT:
284 case CmpInst::ICMP_UGE:
            // A smart use of unsigned comparison on signed values to perform an
            // out-of-range check of (0, N).
287 break;
288 }
289 X = Cmp->getOperand(0);
290 LB = Constant::getNullValue(X->getType());
291 UB = Cmp->getOperand(1);
292 Signed = true;
293 }
294
295 Value* LC = getLoopCounter(L, X);
296 if (!LC)
297 return LOOPCOUNT_UNKNOWN;
298
299 if (PHINode * PN = dyn_cast<PHINode>(X)) {
300 if (PN->getNumIncomingValues() != 2)
301 return LOOPCOUNT_UNKNOWN;
302 BasicBlock* BB0 = PN->getIncomingBlock(0);
303 BasicBlock* IfBB = BB0->getSinglePredecessor();
304 if (!IfBB)
305 return LOOPCOUNT_UNKNOWN;
306 Br = dyn_cast<BranchInst>(IfBB->getTerminator());
307 if (!Br || !Br->isConditional())
308 return LOOPCOUNT_UNKNOWN;
309 ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
310 if (!Cmp)
311 return LOOPCOUNT_UNKNOWN;
312 CmpInst::Predicate Pred = Cmp->getPredicate();
313 Value* LHS = Cmp->getOperand(0);
314 Value* RHS = Cmp->getOperand(1);
315 if (LHS != LC) {
316 std::swap(LHS, RHS);
317 Pred = CmpInst::getSwappedPredicate(Pred);
318 }
319 if (LHS != LC)
320 return LOOPCOUNT_UNKNOWN;
321 if (!Signed)
322 Pred = ICmpInst::getSignedPredicate(Pred);
323 if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
324 return LOOPCOUNT_UNKNOWN;
325 if (RHS != LB)
326 return LOOPCOUNT_UNKNOWN;
327
328 Value* X0 = PN->getIncomingValue(0);
329 Value* X1 = PN->getIncomingValue(1);
330 if (!isNegatedByLB(X0, LC, LB))
331 return LOOPCOUNT_UNKNOWN;
332 if (!isNegatedBy2UB(X1, LC, UB))
333 return LOOPCOUNT_UNKNOWN;
334 }
335 else if (BinaryOperator * BO = dyn_cast<BinaryOperator>(X)) {
336 if (BO->getOpcode() != Instruction::Sub)
337 return LOOPCOUNT_UNKNOWN;
338 if (BO->getOperand(1) != LC)
339 return LOOPCOUNT_UNKNOWN;
340 SelectInst* SI = dyn_cast<SelectInst>(BO->getOperand(0));
341 if (!SI)
342 return LOOPCOUNT_UNKNOWN;
343 ICmpInst* Cmp = dyn_cast<ICmpInst>(SI->getCondition());
344 if (!Cmp)
345 return LOOPCOUNT_UNKNOWN;
346 CmpInst::Predicate Pred = Cmp->getPredicate();
347 Value* LHS = Cmp->getOperand(0);
348 Value* RHS = Cmp->getOperand(1);
349 if (LHS != LC) {
350 std::swap(LHS, RHS);
351 Pred = CmpInst::getSwappedPredicate(Pred);
352 }
353 if (LHS != LC)
354 return LOOPCOUNT_UNKNOWN;
355 if (!Signed)
356 Pred = ICmpInst::getSignedPredicate(Pred);
357 Value* X0 = SI->getTrueValue();
358 Value* X1 = SI->getFalseValue();
359 if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE) {
360 std::swap(X0, X1);
361 Pred = CmpInst::getInversePredicate(Pred);
362 }
363 if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
364 return LOOPCOUNT_UNKNOWN;
365 if (RHS != LB)
366 return LOOPCOUNT_UNKNOWN;
367 int L0 = 0, R0 = 0;
368 std::tie(L0, R0) = countOperands(X0, LB, nullptr);
369 int L1 = 0, R1 = 0;
370 std::tie(L1, R1) = countOperands(X1, UB, nullptr);
371 if (L0 != 1 || L1 != 2)
372 return LOOPCOUNT_UNKNOWN;
373 }
374 else
375 return LOOPCOUNT_UNKNOWN;
376
377 // Ok, we found a loop of the following pattern:
378 //
379 // do {
380 // if (x < 0) {
381 // x = 0 - x +/- c0;
382 // } else {
383 // x = 2 * UB - x +/- c1;
384 // }
385 // } while (x < LB || x > UB);
386 //
    // Such a loop runs only once or twice unless `x` is arbitrarily large. If a
    // non-uniform loop runs only a few iterations, the divergence cost of
    // SIMD32 can be ignored.
390 return LOOPCOUNT_LIKELY_SMALL;
391 }
392
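// estimateLoopCount_CASE2 targets counted loops whose bounds are compile-time
// constants. A minimal sketch of the shape being matched (assumed example):
//
//   for (int i = I0; i < E0; i += S0) { ... }
//
// With I0, S0 and E0 all constants, the trip count is roughly (E0 - I0) / S0;
// fewer than 100 iterations is reported as LOOPCOUNT_LIKELY_SMALL.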
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE2(Loop* L) {
394 SmallVector<BasicBlock*, 8> ExitingBBs;
395 L->getExitingBlocks(ExitingBBs);
396
397 Value* Init = nullptr, * Curr= nullptr, * Next= nullptr, * Step= nullptr;
398 std::tie(Init, Curr, Step, Next) = getInductionVariable(L);
399 if (!Init || !Curr || !Step || !Next)
400 return LOOPCOUNT_UNKNOWN;
401 ConstantInt* I0 = dyn_cast<ConstantInt>(Init);
402 ConstantInt* S0 = dyn_cast<ConstantInt>(Step);
403 if (!I0 || !S0)
404 return LOOPCOUNT_UNKNOWN;
405
406 for (auto BB : ExitingBBs) {
407 BranchInst* Br = dyn_cast<BranchInst>(BB->getTerminator());
408 if (!Br || !Br->isConditional())
409 continue;
        if (!L->contains(Br->getSuccessor(0))) // Not the loop-continue condition.
411 continue;
412 ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
413 if (!WI->isUniform(Br)) {
414 BinaryOperator* BO = dyn_cast<BinaryOperator>(Br->getCondition());
415 if (!BO)
416 continue;
417 if (BO->getOpcode() != Instruction::And)
418 continue;
419 ICmpInst* Cond = nullptr;
420 ICmpInst* Op0 = dyn_cast<ICmpInst>(BO->getOperand(0));
421 if (Op0 && WI->isUniform(Op0))
422 Cond = Op0;
423 if (!Cond) {
424 ICmpInst* Op1 = dyn_cast<ICmpInst>(BO->getOperand(1));
425 if (Op1 && WI->isUniform(Op1))
426 Cond = Op1;
427 }
428 if (!Cond)
429 continue;
430 Cmp = Cond;
431 }
432 if (!Cmp)
433 continue;
434 CmpInst::Predicate Pred = Cmp->getPredicate();
435 switch (Pred) {
436 default:
437 // TODO: Handle more predicates.
438 continue;
439 case ICmpInst::ICMP_SLT:
440 case ICmpInst::ICMP_ULT:
441 break;
442 }
443 Value* Op0 = Cmp->getOperand(0);
444 Value* Op1 = Cmp->getOperand(1);
445 if (Op0 != Next)
446 continue;
447 ConstantInt* E0 = dyn_cast<ConstantInt>(Op1);
448 if (!E0)
449 continue;
450 ConstantInt* N = dyn_cast<ConstantInt>(
451 Pred == ICmpInst::ICMP_SLT
452 ? ConstantExpr::getSDiv(ConstantExpr::getSub(E0, I0), S0)
453 : ConstantExpr::getUDiv(ConstantExpr::getSub(E0, I0), S0));
454 if (!N)
455 continue;
456 if (N->getValue().slt(0))
457 continue;
458 if (N->getValue().slt(100))
459 return LOOPCOUNT_LIKELY_SMALL;
460 }
461
    // This is a non-uniform loop with multiple exiting conditions, but none of
    // them is a uniform condition with a provably small trip count.
464 return LOOPCOUNT_UNKNOWN;
465 }
466
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount(Loop* L) {
468 unsigned Ret;
469
470 Ret = estimateLoopCount_CASE1(L);
471 if (Ret != LOOPCOUNT_UNKNOWN)
472 return Ret;
473
474 Ret = estimateLoopCount_CASE2(L);
475 if (Ret != LOOPCOUNT_UNKNOWN)
476 return Ret;
477
478 return Ret;
479 }
480
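// getLoopCount is a poor man's symbolic "End - Start". A few illustrative
// results (hypothetical operands, for documentation only):
//
//   getLoopCount(c1, c2)        -> constant c2 - c1
//   getLoopCount(0, n)          -> n
//   getLoopCount(s, s + count)  -> count
//
// Any other shape returns nullptr.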
static Value* getLoopCount(Value* Start, Value* End) {
    // Poor man's loop-count computation; the result still needs to be checked with WIA.
483 ConstantInt* CStart = dyn_cast<ConstantInt>(Start);
484 ConstantInt* CEnd = dyn_cast<ConstantInt>(End);
485 if (CStart && CEnd)
486 return ConstantExpr::getSub(CEnd, CStart);
487
488 if (CStart && CStart->isNullValue())
489 return End;
490
491 BinaryOperator* BO = dyn_cast<BinaryOperator>(End);
492 if (!BO || BO->getOpcode() != Instruction::Add)
493 return nullptr;
494
495 Value* Op0 = BO->getOperand(0);
496 Value* Op1 = BO->getOperand(1);
497 if (Op0 != Start)
498 std::swap(Op0, Op1);
499 if (Op0 == Start)
500 return Op1;
501
502 return nullptr;
503 }
504
505 /// hasIEEESqrtOrDivFunc - Check whether IEEE correctly-rounded SQRT or DIV is
506 /// used in the given function.
static bool hasIEEESqrtOrDivFunc(const Function& F) {
508 for (auto& BB : F)
509 for (auto& I : BB) {
510 const GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(&I);
511 if (!GII)
512 continue;
513 switch (GII->getIntrinsicID()) {
514 case GenISAIntrinsic::GenISA_IEEE_Sqrt:
515 case GenISAIntrinsic::GenISA_IEEE_Divide:
516 return true;
517 default: break;
518 }
519 }
520 return false;
521 }
522
523 /// hasSubGroupFunc - Check whether subgroup functions are used in the given
524 /// function.
static bool hasSubGroupFunc(const Function& F)
526 {
527 for (auto& BB : F)
528 {
529 for (auto& I : BB)
530 {
531 if (isSubGroupIntrinsic(&I))
532 {
533 return true;
534 }
535 }
536 }
537
538 return false;
539 }
540
bool Simd32ProfitabilityAnalysis::runOnFunction(Function& F)
542 {
543 this->F = &F;
544 CodeGenContext* context = nullptr;
545 context = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
546 if (context->type == ShaderType::OPENCL_SHADER)
547 {
548 PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
549 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
550 WI = &getAnalysis<WIAnalysis>();
551 pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
552 m_isSimd16Profitable = checkSimd16Profitable(context);
553 m_isSimd32Profitable = m_isSimd16Profitable && checkSimd32Profitable(context);
554 }
555 else if (context->type == ShaderType::PIXEL_SHADER)
556 {
557 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
558 m_isSimd32Profitable = checkPSSimd32Profitable();
559 }
560 return false;
561 }
562
static bool isPayloadHeader(Value* V) {
564 Argument* Arg = dyn_cast<Argument>(V);
565 if (!Arg || !Arg->hasName())
566 return false;
567 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
568 if (!VTy || VTy->getNumElements() != 8 ||
569 !VTy->getElementType()->isIntegerTy(32))
570 return false;
    return Arg->getName() == "payloadHeader";
572 }
573
static bool isR0(Value* V) {
575 Argument* Arg = dyn_cast<Argument>(V);
576 if (!Arg || !Arg->hasName())
577 return false;
578 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
579 if (!VTy || VTy->getNumElements() != 8 ||
580 !VTy->getElementType()->isIntegerTy(32))
581 return false;
    return Arg->getName() == "r0";
583 }
584
static bool isEnqueuedLocalSize(Value* V) {
586 Argument* Arg = dyn_cast<Argument>(V);
587 if (!Arg || !Arg->hasName())
588 return false;
589 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
590 if (!VTy || VTy->getNumElements() != 3 ||
591 !VTy->getElementType()->isIntegerTy(32))
592 return false;
    return Arg->getName() == "enqueuedLocalSize";
594 }
595
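// The helpers below pattern-match how OpenCL work-item built-ins are lowered by
// IGC (as assumed by this analysis): group id X is element 1 of the "r0"
// argument, enqueued local size X and global offset X are element 0 of the
// "enqueuedLocalSize" and "payloadHeader" vectors respectively, and local id X
// arrives as a 16-bit kernel argument named "localIdX".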
static bool isGetGroupIdX(Value* V) {
597 auto EEI = dyn_cast<ExtractElementInst>(V);
598 if (!EEI)
599 return false;
600 if (!EEI->getType()->isIntegerTy(32))
601 return false;
602 auto CI = dyn_cast<Constant>(EEI->getOperand(1));
603 if (!CI || !CI->isOneValue())
604 return false;
605 return isR0(EEI->getOperand(0));
606 }
607
static bool isGetEnqueuedLocalSizeX(Value* V) {
609 auto EEI = dyn_cast<ExtractElementInst>(V);
610 if (!EEI)
611 return false;
612 if (!EEI->getType()->isIntegerTy(32))
613 return false;
614 auto CI = dyn_cast<Constant>(EEI->getOperand(1));
615 if (!CI || !CI->isNullValue())
616 return false;
617 return isEnqueuedLocalSize(EEI->getOperand(0));
618 }
619
static bool isGetLocalIdX(Value* V) {
621 if (auto ZEI = dyn_cast<ZExtInst>(V))
622 return isGetLocalIdX(ZEI->getOperand(0));
623 Argument* Arg = dyn_cast<Argument>(V);
624 if (!Arg || !Arg->hasName())
625 return false;
626 if (!Arg->getType()->isIntegerTy(16))
627 return false;
628 return Arg->getName() == "localIdX";
629 }
630
static bool isGetGlobalOffsetX(Value* V) {
632 auto EEI = dyn_cast<ExtractElementInst>(V);
633 if (!EEI)
634 return false;
635 if (!EEI->getType()->isIntegerTy(32))
636 return false;
637 auto CI = dyn_cast<Constant>(EEI->getOperand(1));
638 if (!CI || !CI->isNullValue())
639 return false;
640 return isPayloadHeader(EEI->getOperand(0));
641 }
642
static bool isGetGlobalIdX(Value* V) {
644 // GlobalIdX = GroupIdX * EnqueuedLocalSizeX + LocalIdX + GlobalOffsetX
645 auto BO = dyn_cast<BinaryOperator>(V);
646 if (!BO || BO->getOpcode() != Instruction::Add)
647 return false;
648
649 auto BO1 = dyn_cast<BinaryOperator>(BO->getOperand(0));
650 auto A0 = BO->getOperand(1);
651 if (!BO1) {
652 BO1 = dyn_cast<BinaryOperator>(BO->getOperand(1));
653 A0 = BO->getOperand(0);
654 }
655 if (!BO1 || BO1->getOpcode() != Instruction::Add)
656 return false;
657
658 auto BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(0));
659 auto A1 = BO1->getOperand(1);
660 if (!BO2) {
661 BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(1));
662 A1 = BO1->getOperand(0);
663 }
664 if (!BO2 || BO2->getOpcode() != Instruction::Mul)
665 return false;
666
667 auto M0 = BO2->getOperand(0);
668 auto M1 = BO2->getOperand(1);
669
670 if (!((isGetGroupIdX(M0) && isGetEnqueuedLocalSizeX(M1)) ||
671 (isGetGroupIdX(M1) && isGetEnqueuedLocalSizeX(M0))))
672 return false;
673
674 return ((isGetLocalIdX(A0) && isGetGlobalOffsetX(A1)) ||
675 (isGetLocalIdX(A1) && isGetGlobalOffsetX(A0)));
676 }
677
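// isSelectBasedOnGlobalIdX checks whether V is (possibly a constant shift of) a
// PHI whose two uniform incoming values are chosen by comparing get_global_id(0)
// against a uniform bound. A rough OpenCL-level sketch of the shape being
// matched (hypothetical names):
//
//   size_t count = (get_global_id(0) < limit) ? uniformA : uniformB;
//   ... count << C ...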
bool Simd32ProfitabilityAnalysis::isSelectBasedOnGlobalIdX(Value* V) {
679 PHINode* PN = dyn_cast<PHINode>(V);
680 while (!PN) {
681 auto BO = dyn_cast<BinaryOperator>(V);
682 if (!BO || BO->getOpcode() != Instruction::Shl)
683 return false;
684 if (!isa<Constant>(BO->getOperand(1)))
685 return false;
686 V = BO->getOperand(0);
687 PN = dyn_cast<PHINode>(V);
688 }
689
690 if (PN->getNumIncomingValues() != 2)
691 return false;
692
693 auto Op0 = PN->getIncomingValue(0);
694 if (!WI->isUniform(Op0))
695 return false;
696 auto Op1 = PN->getIncomingValue(1);
697 if (!WI->isUniform(Op1))
698 return false;
699
700 auto BB0 = PN->getIncomingBlock(0);
701 auto BB1 = PN->getIncomingBlock(1);
702 auto IfBB = BB0->getSinglePredecessor();
703 if (!IfBB || IfBB == BB1->getSinglePredecessor())
704 return false;
705 auto Br = dyn_cast<BranchInst>(IfBB->getTerminator());
706 if (!Br || !Br->isConditional())
707 return false;
708
709 ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
710 if (!Cmp)
711 return false;
712 Value* LHS = Cmp->getOperand(0);
713 Value* RHS = Cmp->getOperand(1);
714 switch (Cmp->getPredicate()) {
715 default:
716 return false;
717 case CmpInst::ICMP_SLT:
718 case CmpInst::ICMP_SLE:
719 break;
720 case CmpInst::ICMP_SGT:
721 case CmpInst::ICMP_SGE:
722 std::swap(LHS, RHS);
723 break;
724 }
725 if (!WI->isUniform(RHS))
726 return false;
    return isGetGlobalIdX(LHS);
728 }
729
bool Simd32ProfitabilityAnalysis::checkSimd32Profitable(CodeGenContext* ctx)
731 {
    // If a kernel is too big, it probably has enough work for the EUs even
    // without SIMD32; moreover, SIMD32 would exceed the 64K vISA variable
    // limit (e.g. the OpenCL C99 64-bit PrintHalf/half8.c test), so it makes
    // sense to skip SIMD32.
736 size_t programSize = 0;
737 for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
738 {
739 BasicBlock* BB = &*FI;
740 programSize += BB->size();
741 }
742 if (programSize > 8000)
743 {
744 return false;
745 }
746
    // If we have work-group size (or work-group size hint) metadata, check whether the
    // work-group is expected to contain 16 work-items or fewer. If it is, there is no
    // point in using SIMD32; we would just get empty lanes.
750 auto funcInfoMD = pMdUtils->findFunctionsInfoItem(F);
751 if (funcInfoMD != pMdUtils->end_FunctionsInfo())
752 {
753 ThreadGroupSizeMetaDataHandle tgSize = funcInfoMD->second->getThreadGroupSize();
754 ThreadGroupSizeMetaDataHandle tgSizeHint = funcInfoMD->second->getThreadGroupSizeHint();
755
756 if (ctx->getModuleMetaData()->csInfo.maxWorkGroupSize && ctx->getModuleMetaData()->csInfo.maxWorkGroupSize <= 16)
757 return false;
758
759 if ((tgSize->hasValue() && (tgSize->getXDim() * tgSize->getYDim() * tgSize->getZDim()) <= 16) ||
760 (tgSizeHint->hasValue() && (tgSizeHint->getXDim() * tgSizeHint->getYDim() * tgSizeHint->getZDim()) <= 16)) {
761 return false;
762 }
763 }
764
765 // WORKAROUND - Skip SIMD32 if subgroup functions are present.
766 if (hasSubGroupFunc(*F)) {
767 return false;
768 }
769
770 const CPlatform* platform = &ctx->platform;
771 switch (platform->GetPlatformFamily()) {
772 case IGFX_GEN9_CORE:
773 /* TODO: Try to apply for platform->getPlatformInfo().eProductFamily ==
774 * IGFX_BROXTON only. */
775 // FALL THROUGH
776 case IGFX_GEN10_CORE:
777 if (hasIEEESqrtOrDivFunc(*F)) {
778 return false;
779 }
780 break;
781 default:
782 break;
783 }
784 // END OF WORKAROUND
785
786 // Ok, that's not the case.
787 // Now, check whether we have any non-uniform loops.
    // The idea is that if there are divergent loops, then SIMD32 will be harmful,
789 // because we'll waste time running loops with very few full lanes.
790 // If there are no divergent loops, SIMD32 is worth a shot. It still may not
791 // be selected, due to spills.
792 for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li) {
793 llvm::Loop* loop = *li;
794
795 SmallVector<BasicBlock*, 8> exitingBlocks;
796 loop->getExitingBlocks(exitingBlocks);
797
798 bool AllUniform = true;
799 for (auto BBI = exitingBlocks.begin(), BBE = exitingBlocks.end(); BBI != BBE; ++BBI) {
800 BasicBlock* block = *BBI;
801
802 Instruction* term = block->getTerminator();
803 if (!WI->isUniform(term)) {
804 auto Br = dyn_cast<BranchInst>(term);
                // Check the special case of a non-uniform loop where, apart from
                // the initial, current, and next values, STEP and COUNT are
                // uniform. Such a loop diverges only at its termination, so it
                // should still be profitable to compile in SIMD32 mode.
810 if (Br && Br->isConditional()) {
811 auto ICmp = dyn_cast<ICmpInst>(Br->getCondition());
812 if (ICmp) {
813 Value* Init = nullptr, * Curr = nullptr, * Step= nullptr, * Next = nullptr;
814 std::tie(Init, Curr, Step, Next)
815 = getInductionVariable(loop);
816 if (Init && Curr && Next && Step &&
817 WI->isUniform(Step)) {
818 auto Op0 = ICmp->getOperand(0);
819 auto Op1 = ICmp->getOperand(1);
820 if (SExtInst *SI0 = dyn_cast<SExtInst>(Op0))
821 Op0 = SI0->getOperand(0);
822 if (SExtInst *SI1 = dyn_cast<SExtInst>(Op1))
823 Op1 = SI1->getOperand(0);
824 if (Op0 != Next && Op0 != Curr)
825 std::swap(Op0, Op1);
                            // Skip a non-uniform loop that terminates only on a
                            // comparison between the non-uniform induction
                            // variable and a uniform value.
829 if (Op0 == Next || Op0 == Curr) {
830 // TODO: Need to check whether Init is linear to
831 // global/local ID. However, that checking is not
832 // that straightforward before code emitter.
833 if (WI->isUniform(Op1))
834 continue;
                                // TODO: Enable IndVarSimplify to simplify the
                                // following check.
837 if (Value * Count = getLoopCount(Init, Op1)) {
838 if (WI->isUniform(Count))
839 continue;
840 if (isSelectBasedOnGlobalIdX(Count))
841 continue;
842 }
843 }
844 }
845 }
846 }
847 AllUniform = false;
848 break;
849 }
850 }
851 if (!AllUniform) {
852 switch (estimateLoopCount(loop)) {
853 case LOOPCOUNT_LIKELY_LARGE:
854 case LOOPCOUNT_UNKNOWN:
855 return false;
856 case LOOPCOUNT_LIKELY_SMALL:
857 break;
858 }
859 }
860 }
861
862 return true;
863 }
864
/// Cyclomatic complexity measures the number of linearly independent paths
866 /// through a region.
867 ///
868 /// M = a * E - N + 2 where
869 /// E = the number of edges of the graph
870 /// N = the number of nodes of the graph
871 /// a = scalar factor (1 for uniform branches).
872 ///
873 /// We focus on loops instead of the entire program, since cyclomatic
874 /// complexity is roughly linear when concatenating two programs, i.e.
875 /// CC(F # G) = (E1 + E2 + 1) - (N1 + N2) + 2
876 /// = (E1 - N1 + 2) + (E2 - N2 + 2) - 1
877 /// = CC(F) + CC(G) - 1.
878 ///
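/// As a worked example (illustration only): a single-block loop whose back edge
/// is a non-uniform conditional branch yields CC = 2 + 2 * 2 - 1 = 5.
///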
879 static const unsigned CYCLOMATIC_COMPLEXITY_THRESHOLD = 200;
880
unsigned Simd32ProfitabilityAnalysis::getLoopCyclomaticComplexity() {
882 unsigned MaxCC = 0;
883 for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
884 Loop* L = *I;
885 unsigned CC = 2;
886 for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
887 BasicBlock* BB = *BI;
888 IGCLLVM::TerminatorInst* TI = BB->getTerminator();
889 bool IsUniform = WI->isUniform(TI);
890 CC += TI->getNumSuccessors() * (IsUniform ? 1 : 2);
891 }
892 CC -= L->getNumBlocks();
893 MaxCC = std::max(CC, MaxCC);
894 }
895 return MaxCC;
896 }
897
static unsigned getNumOfNonUniformExits(Loop* L, WIAnalysis* WI) {
899 SmallVector<BasicBlock*, 8> ExistingBlocks;
900 L->getExitingBlocks(ExistingBlocks);
901 unsigned Count = 0;
902 for (auto BB : ExistingBlocks) {
903 IGCLLVM::TerminatorInst* TI = BB->getTerminator();
904 bool IsUniform = WI->isUniform(TI);
905 Count += !IsUniform;
906 }
907
908 return Count;
909 }
910
/// Check if a loop or any of its subloops has multiple non-uniform exits.
static bool hasMultipleExits(Loop* L, WIAnalysis* WI) {
913 if (getNumOfNonUniformExits(L, WI) > 1)
914 return true;
915 for (auto InnerL : L->getSubLoops())
916 if (hasMultipleExits(InnerL, WI))
917 return true;
918 return false;
919 }
920
921 /// Given a loop, return nested (inner) loops with multiple non-uniform exits.
/// E.g. assume L2, L3, L5, L7 are the only loops with multiple non-uniform exits:
923 /// L1
924 /// L2
925 /// L3
926 /// L4
927 /// L5
928 /// L6
929 /// L7
930 /// then it returns {L2, L5}
931 ///
static void getNestedLoopsWithMultpleExists(Loop* L, WIAnalysis* WI,
933 SmallVectorImpl<Loop*>& Result) {
934 if (getNumOfNonUniformExits(L, WI) > 1) {
935 for (auto InnerL : L->getSubLoops()) {
936 if (hasMultipleExits(InnerL, WI)) {
937 Result.push_back(L);
938 return;
939 }
940 }
        // Only a single level of multiple-exit loops; do not add it to the result.
942 return;
943 }
944
945 // Outer loop is normal. Check its inner loop structure, recursively.
946 for (auto InnerL : L->getSubLoops())
947 getNestedLoopsWithMultpleExists(InnerL, WI, Result);
948 }
949
950
/// Check if loops with multiple exits dominate the entire function.
static bool hasNestedLoopsWithMultipleExits(Function* F, LoopInfo* LI,
953 WIAnalysis* WI) {
    // Find top-level nested loops with multiple non-uniform exits.
955 SmallVector<Loop*, 8> Loops;
956 for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
957 Loop* L = *I;
958 getNestedLoopsWithMultpleExists(L, WI, Loops);
959 }
960
961 // Sum the IR size of these loops.
962 unsigned LoopSize = 0;
963 for (auto L : Loops)
964 for (auto BB : L->getBlocks())
965 LoopSize += (unsigned)BB->size();
966
    // Check the ratio between nested loops with multiple exits and the total
968 // number of instructions. A higher ratio means these loops dominate this
969 // kernel.
970 unsigned FuncSize = 0;
971 for (auto& BB : F->getBasicBlockList())
972 FuncSize += (unsigned)BB.size();
973
974 bool retVal = false;
975 if (FuncSize > 0)
976 {
977 retVal = float(LoopSize) / FuncSize >= 0.7f;
978 }
979
980 return retVal;
981 }
982
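/// hasLongStridedLdStInLoop - Look at innermost two-block loops and count the
/// non-uniform vector loads/stores wider than 128 bits in the loop header; more
/// than three of either kind is taken as a sign of heavy register pressure.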
static bool hasLongStridedLdStInLoop(Function* F, LoopInfo* LI, WIAnalysis* WI) {
984 SmallVector<Loop*, 32> Loops;
985 // Collect innermost simple loop.
986 for (auto I = LI->begin(), E = LI->end(); I != E; ++I) {
987 auto L = *I;
988 if (!IGCLLVM::isInnermost(L))
989 continue;
990 if (L->getNumBlocks() != 2)
991 continue;
992 auto* Latch = L->getLoopLatch();
993 if (!Latch || !Latch->front().isTerminator())
994 continue;
995 Loops.push_back(L);
996 }
997 unsigned LDs = 0;
998 unsigned STs = 0;
999 for (auto L : Loops) {
1000 auto BB = L->getHeader();
1001 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
1002 if (auto LD = dyn_cast<LoadInst>(&*I)) {
1003 VectorType* VTy = dyn_cast<VectorType>(LD->getType());
1004 if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
1005 continue;
1006 if (WI->isUniform(LD))
1007 continue;
1008 ++LDs;
1009 }
1010 if (auto ST = dyn_cast<StoreInst>(&*I)) {
1011 Value* Ptr = ST->getPointerOperand();
1012 Value* Val = ST->getValueOperand();
1013 VectorType* VTy = dyn_cast<VectorType>(Val->getType());
1014 if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
1015 continue;
1016 if (WI->isUniform(Ptr))
1017 continue;
1018 ++STs;
1019 }
1020 }
1021 if (LDs > 3 || STs > 3)
1022 return true;
1023 }
1024 return false;
1025 }
1026
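// checkSimd16Profitable applies heuristics gated by bits of the
// OCLSIMD16SelectionMask flag:
//   0x1 - reject kernels whose loops exceed the cyclomatic complexity threshold,
//   0x2 - reject kernels dominated by nested loops with multiple non-uniform exits,
//   0x4 - reject kernels with long strided vector loads/stores in loops.
// Independently of the mask, kernels using double on Geminilake are rejected.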
bool Simd32ProfitabilityAnalysis::checkSimd16Profitable(CodeGenContext* ctx) {
1028 if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x1) &&
1029 getLoopCyclomaticComplexity() >= CYCLOMATIC_COMPLEXITY_THRESHOLD) {
1030 return false;
1031 }
1032
1033 if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x2) &&
1034 hasNestedLoopsWithMultipleExits(F, LI, WI)) {
1035 return false;
1036 }
1037
    // If there is a long strided (wider than 128-bit) vector load/store in a loop, skip SIMD16.
1039 if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x4) &&
1040 hasLongStridedLdStInLoop(F, LI, WI)) {
1041 return false;
1042 }
1043
1044 auto hasDouble = [](Function& F) {
1045 for (auto& BB : F)
1046 for (auto& I : BB) {
1047 if (I.getType()->isDoubleTy())
1048 return true;
1049 for (Value* V : I.operands())
1050 if (V->getType()->isDoubleTy())
1051 return true;
1052 }
1053 return false;
1054 };
1055
1056 const CPlatform* platform = &ctx->platform;
1057 if (platform->GetPlatformFamily() == IGFX_GEN9_CORE &&
1058 platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE &&
1059 hasDouble(*F)) {
1060 return false;
1061 }
1062
1063 return true;
1064 }
1065
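// checkPSSimd32Profitable - pixel-shader specific heuristics, roughly:
//  * shaders above 4000 instructions (or with none) never get SIMD32;
//  * a short single-BB shader with sampling, no discard and no cmp does;
//  * more than one render-target write disables it;
//  * small shaders without IO, fp16-heavy shaders, or small high-latency loops
//    (behind the PSSIMD32Heuristic* flags) enable it.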
bool Simd32ProfitabilityAnalysis::checkPSSimd32Profitable()
1067 {
1068 unsigned int numberInstructions = 0;
1069 unsigned int numberOfHalfInstructions = 0;
1070 unsigned int numberOfCmp = 0;
1071 unsigned int numberOfSample = 0;
1072 unsigned int numberOfBB = 0;
1073 BasicBlock* returnBlock = nullptr;
1074 bool hasDiscard = F->getParent()->getNamedMetadata("KillPixel") != nullptr;
1075 for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
1076 {
1077 for (auto II = FI->begin(), IE = FI->end(); II != IE; ++II)
1078 {
1079 if (II->getType() == Type::getHalfTy(F->getContext()))
1080 {
1081 numberOfHalfInstructions++;
1082 }
1083 if (isa<CmpInst>(*II))
1084 {
1085 numberOfCmp++;
1086 }
1087 if (isSampleLoadGather4InfoInstruction(&(*II)))
1088 {
1089 numberOfSample++;
1090 }
1091 numberInstructions++;
1092 }
1093 if (isa<ReturnInst>(FI->getTerminator()))
1094 {
1095 returnBlock = &(*FI);
1096 }
1097 numberOfBB++;
1098 }
1099 if (numberInstructions > 4000 || numberInstructions == 0)
1100 {
1101 return false;
1102 }
1103
    // Original SIMD32 heuristic:
1105 // if 1BB, short, has sample, no discard, no cmp, enable SIMD32
1106 // skip cmp to avoid flag spill
1107 if (!hasDiscard && numberOfCmp == 0 && numberOfSample > 0 && numberOfBB == 1 && numberInstructions < 80)
1108 {
1109 return true;
1110 }
1111
    // Disable SIMD32 for shaders with multiple render targets, as they put pressure on the render cache.
1113 unsigned int numberRTWrite = 0;
1114 for (auto it = returnBlock->begin(), ie = returnBlock->end(); it != ie; ++it)
1115 {
1116 if (GenIntrinsicInst * intr = dyn_cast<GenIntrinsicInst>(it))
1117 {
1118 if (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_RTWrite)
1119 {
1120 numberRTWrite++;
1121 }
1122 }
1123 }
1124 if (numberRTWrite > 1)
1125 {
1126 return false;
1127 }
1128
    // Case where we expect to be bound by pixel dispatch time. For small shaders
    // without IO it is better to go with SIMD32.
1131 if (returnBlock == &F->getEntryBlock() && !hasDiscard)
1132 {
1133 bool hasIO = false;
1134 unsigned int numberInstructions = returnBlock->size();
1135 if (numberInstructions < 10)
1136 {
1137 for (auto II = returnBlock->begin(), IE = returnBlock->end(); II != IE; ++II)
1138 {
1139 if (II->mayReadOrWriteMemory() && !isa<RTWritIntrinsic>(II))
1140 {
1141 hasIO = true;
1142 break;
1143 }
1144 if (isa<SampleIntrinsic>(II) ||
1145 isa<SamplerLoadIntrinsic>(II) ||
1146 isa<InfoIntrinsic>(II) ||
1147 isa<SamplerGatherIntrinsic>(II))
1148 {
1149 hasIO = true;
1150 break;
1151 }
1152 }
1153 if (!hasIO)
1154 {
                // For a small program without IO, SIMD32 helps hide the thread dispatch time.
1156 return true;
1157 }
1158 }
1159 }
1160
1161 if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicFP16))
1162 {
        // If a large fraction of the instructions are half precision, use SIMD32 to hide latency better.
1164 float ratioHalf = (float)numberOfHalfInstructions / (float)numberInstructions;
1165 if (ratioHalf >= 0.5f)
1166 {
1167 return true;
1168 }
1169 }
1170
1171 if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicLoopAndDiscard))
1172 {
1173 // If we have a discard and the first block is small we may be bound by PSD so we try to enable SIMD32
1174 if (hasDiscard)
1175 {
1176 BasicBlock& entryBB = F->getEntryBlock();
1177 if (!isa<ReturnInst>(entryBB.getTerminator()) && entryBB.size() < 50)
1178 {
1179 return true;
1180 }
1181 }
1182
1183 // If we have a loop with high latency enable SIMD32 to reduce latency
1184 unsigned int numberOfInstructions = 0;
1185 unsigned int numberOfHighLatencyInst = 0;
1186 for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li)
1187 {
1188 llvm::Loop* loop = *li;
1189 for (auto BI = loop->block_begin(), BE = loop->block_end(); BI != BE; ++BI)
1190 {
1191 for (auto II = (*BI)->begin(), IE = (*BI)->end(); II != IE; ++II)
1192 {
1193 if (isa<SampleIntrinsic>(II))
1194 {
1195 numberOfHighLatencyInst++;
1196 }
1197 numberOfInstructions++;
1198 }
1199 }
1200 }
1201 if (numberOfInstructions < 85 && numberOfHighLatencyInst >= 1)
1202 {
1203 // high latency small loop
1204 return true;
1205 }
1206 }
1207 return false;
1208 }
1209