1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "AdaptorCommon/ImplicitArgs.hpp"
10 #include "Compiler/CISACodeGen/WIAnalysis.hpp"
11 #include "Compiler/CISACodeGen/helper.h"
12 #include "Compiler/CodeGenContextWrapper.hpp"
13 #include "Compiler/CodeGenPublic.h"
14 #include "Compiler/IGCPassSupport.h"
15 #include "common/debug/Debug.hpp"
16 #include "common/igc_regkeys.hpp"
17 #include "GenISAIntrinsics/GenIntrinsicInst.h"
18 #include "common/LLVMWarningsPush.hpp"
19 #include <llvm/IR/Function.h>
20 #include <llvm/IR/CFG.h>
21 #include <llvm/Support/CommandLine.h>
22 #include <llvm/Support/Debug.h>
23 #include <llvm/IR/Constants.h>
24 #include "common/LLVMWarningsPop.hpp"
25 #include <string>
26 #include <stack>
27 #include <sstream>
28 #include "Probe/Assertion.h"
29
30 using namespace llvm;
31 using namespace IGC;
32 using namespace IGC::IGCMD;
33 using namespace IGC::Debug;
34
35 static cl::opt<bool> PrintWiaCheck(
36 "print-wia-check", cl::init(false), cl::Hidden,
37 cl::desc("Debug wia-check analysis"));
38
39 // Register pass to igc-opt
40 #define PASS_FLAG "igc-wi-analysis"
41 #define PASS_DESCRIPTION "WIAnalysis provides work item dependency info"
42 #define PASS_CFG_ONLY true
43 #define PASS_ANALYSIS true
44 IGC_INITIALIZE_PASS_BEGIN(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
45 IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
46 IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
47 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
48 IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
49 IGC_INITIALIZE_PASS_DEPENDENCY(TranslationTable)
50 IGC_INITIALIZE_PASS_END(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
51
52 char WIAnalysis::ID = 0;
53
WIAnalysis()54 WIAnalysis::WIAnalysis() : FunctionPass(ID)
55 {
56 initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
57 }
58
59 const unsigned int WIAnalysisRunner::MinIndexBitwidthToPreserve = 16;
60
61 // For dumpping WIA info per each invocation
62 DenseMap<const Function*, int> WIAnalysisRunner::m_funcInvocationId;
63
64 /// Define shorter names for dependencies, for clarity of the conversion maps
65 /// Note that only UGL/UWG/UTH/RND are supported.
66 #define UGL WIAnalysis::UNIFORM_GLOBAL
67 #define UWG WIAnalysis::UNIFORM_WORKGROUP
68 #define UTH WIAnalysis::UNIFORM_THREAD
69 #define SEQ WIAnalysis::CONSECUTIVE
70 #define PTR WIAnalysis::PTR_CONSECUTIVE
71 #define STR WIAnalysis::STRIDED
72 #define RND WIAnalysis::RANDOM
73
74 static const char* const dep_str[] = {
75 "uniform_global",
76 "uniform_workgroup",
77 "uniform_thread",
78 "consecu",
79 "p_conse",
80 "strided",
81 "random "
82 };
83
84 /// Dependency maps (define output dependency according to 2 input deps)
85
86 static const WIAnalysis::WIDependancy
87 add_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
88 /* UGL, UWG, UTH, SEQ, PTR, STR, RND */
89 /* UGL */ {UGL, UWG, UTH, SEQ, PTR, STR, RND},
90 /* UWG */ {UWG, UWG, UTH, SEQ, PTR, STR, RND},
91 /* UTH */ {UTH, UTH, UTH, SEQ, PTR, STR, RND},
92 /* SEQ */ {SEQ, SEQ, SEQ, STR, STR, STR, RND},
93 /* PTR */ {PTR, PTR, PTR, STR, STR, STR, RND},
94 /* STR */ {STR, STR, STR, STR, STR, STR, RND},
95 /* RND */ {RND, RND, RND, RND, RND, RND, RND}
96 };
97
98 static const WIAnalysis::WIDependancy
99 sub_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
100 /* UGL, UWG, UTH, SEQ, PTR, STR, RND */
101 /* UGL */ {UGL, UWG, UTH, STR, RND, RND, RND},
102 /* UWG */ {UWG, UWG, UTH, STR, RND, RND, RND},
103 /* UTH */ {UTH, UTH, UTH, STR, RND, RND, RND},
104 /* SEQ */ {SEQ, SEQ, SEQ, RND, RND, RND, RND},
105 /* PTR */ {PTR, PTR, PTR, RND, RND, RND, RND},
106 /* STR */ {STR, STR, STR, RND, RND, RND, RND},
107 /* RND */ {RND, RND, RND, RND, RND, RND, RND}
108 };
109
110
111 static const WIAnalysis::WIDependancy
112 mul_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
113 /* UGL, UWG, UTH, SEQ, PTR, STR, RND */
114 /* UGL */ {UGL, UWG, UTH, STR, STR, STR, RND},
115 /* UWG */ {UWG, UWG, UTH, STR, STR, STR, RND},
116 /* UTH */ {UTH, UTH, UTH, STR, STR, STR, RND},
117 /* SEQ */ {STR, STR, STR, RND, RND, RND, RND},
118 /* PTR */ {STR, STR, STR, RND, RND, RND, RND},
119 /* STR */ {STR, STR, STR, RND, RND, RND, RND},
120 /* RND */ {RND, RND, RND, RND, RND, RND, RND}
121 };
122
123 // select is to have a weaker dep of two
124 static const WIAnalysis::WIDependancy
125 select_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
126 /* UGL, UWG, UTH, SEQ, PTR, STR, RND */
127 /* UGL */ {UGL, UWG, UTH, STR, STR, STR, RND},
128 /* UWG */ {UWG, UWG, UTH, STR, STR, STR, RND},
129 /* UTH */ {UTH, UTH, UTH, STR, STR, STR, RND},
130 /* SEQ */ {STR, STR, STR, SEQ, STR, STR, RND},
131 /* PTR */ {STR, STR, STR, STR, PTR, STR, RND},
132 /* STR */ {STR, STR, STR, STR, STR, STR, RND},
133 /* RND */ {RND, RND, RND, RND, RND, RND, RND}
134 };
135
136 static const WIAnalysis::WIDependancy
137 gep_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
138 /* ptr\index UGL, UWG, UTH, SEQ, PTR, STR, RND */
139 /* UGL */ {UGL, UWG, UTH, PTR, RND, RND, RND},
140 /* UWG */ {UWG, UWG, UTH, PTR, RND, RND, RND},
141 /* UTH */ {UTH, UTH, UTH, PTR, RND, RND, RND},
142 /* SEQ */ {RND, RND, RND, RND, RND, RND, RND},
143 /* PTR */ {PTR, PTR, PTR, RND, RND, RND, RND},
144 /* STR */ {RND, RND, RND, RND, RND, RND, RND},
145 /* RND */ {RND, RND, RND, RND, RND, RND, RND}
146 };
147
148 // For better readability, the rank of a dependency is used to compare two dependencies
149 // to see which of them is weaker or stronger.
150 //
151 // Dependancy rank : an integer value for each Dependancy, starting from 0.
152 // Property of rank: the lower (smaller) the rank, the stronger the dependancy.
153 //
154 // Currently, enum value of each dependency is used exactly as its rank.
depRank(WIAnalysis::WIDependancy D)155 inline int depRank(WIAnalysis::WIDependancy D) { return (int)D; }
156
157 namespace IGC {
158 /// @Brief, given a conditional branch and its immediate post dominator,
159 /// find its influence-region and partial joins within the influence region
160 class BranchInfo
161 {
162 public:
163 BranchInfo(const IGCLLVM::TerminatorInst* inst, const llvm::BasicBlock* ipd);
164
165 void print(llvm::raw_ostream& OS) const;
166
167 const IGCLLVM::TerminatorInst* cbr;
168 const llvm::BasicBlock* full_join;
169 llvm::DenseSet<llvm::BasicBlock*> influence_region;
170 llvm::SmallPtrSet<llvm::BasicBlock*, 4> partial_joins;
171 llvm::BasicBlock* fork_blk;
172 };
173 } // namespace IGC
174
print(raw_ostream & OS,const Module *) const175 void WIAnalysisRunner::print(raw_ostream& OS, const Module*) const
176 {
177 DenseMap<BasicBlock*, int> BBIDs;
178 int id = 0;
179 for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I, ++id) {
180 BasicBlock* BB = &*I;
181 BBIDs[BB] = id;
182 }
183
184 std::stringstream ss;
185 ss << "WIAnalysis: " << m_func->getName().str();
186 Banner(OS, ss.str());
187
188 OS << "Args: \n";
189 for (Function::arg_iterator I = m_func->arg_begin(), E = m_func->arg_end();
190 I != E; ++I) {
191 Value* AVal = &*I;
192
193 if (m_depMap.GetAttributeWithoutCreating(AVal) != m_depMap.end())
194 OS << " " << dep_str[m_depMap.GetAttributeWithoutCreating(AVal)] << " " << *AVal << "\n";
195 else
196 OS << " unknown " << *AVal << "\n";
197
198 }
199 OS << "\n";
200
201 for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I) {
202 BasicBlock* BB = &*I;
203 OS << "BB:" << BBIDs[BB];
204 if (BB->hasName())
205 OS << " " << BB->getName();
206 OS << " ; preds =";
207 bool isFirst = true;
208 for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
209 BasicBlock* pred = *PI;
210 OS << ((isFirst) ? " " : ", ") << "BB:" << BBIDs[pred] << " ";
211 if (pred->hasName())
212 OS << pred->getName();
213 isFirst = false;
214 }
215 {
216 auto dep = getCFDependency(BB);
217 OS << "[ " << dep_str[dep] << " ]";
218 }
219 OS << "\n";
220 for (BasicBlock::iterator it = BB->begin(), ie = BB->end(); it != ie; ++it) {
221 Instruction* I = &*it;
222 if (m_depMap.GetAttributeWithoutCreating(I) != m_depMap.end())
223 {
224 OS << " " << dep_str[m_depMap.GetAttributeWithoutCreating(I)] << " " << *I;
225 }
226 else
227 {
228 OS << " unknown " << *I;
229 }
230 if (I->isTerminator()) {
231 IGCLLVM::TerminatorInst* TI = dyn_cast<IGCLLVM::TerminatorInst>(I);
232 OS << " [";
233 for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
234 BasicBlock* succ = TI->getSuccessor(i);
235 OS << " BB:" << BBIDs[succ];
236 }
237 OS << " ]";
238 }
239 OS << "\n";
240 }
241 OS << "\n";
242 }
243 }
244
lock_print()245 void WIAnalysisRunner::lock_print()
246 {
247 IGC::Debug::DumpLock();
248 {
249 int id = m_funcInvocationId[m_func]++;
250 std::stringstream ss;
251 ss << m_func->getName().str() << "_WIAnalysis_" << id;
252 auto name =
253 DumpName(IGC::Debug::GetShaderOutputName())
254 .Hash(m_CGCtx->hash)
255 .Type(m_CGCtx->type)
256 .Pass(ss.str().c_str())
257 .Extension("txt");
258 print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
259 }
260 IGC::Debug::DumpUnlock();
261 }
262
263 // Used for dumpping into a file with a fixed name while running in debugger
dump() const264 void WIAnalysisRunner::dump() const
265 {
266 auto name =
267 DumpName(IGC::Debug::GetShaderOutputName())
268 .Hash(m_CGCtx->hash)
269 .Type(m_CGCtx->type)
270 .Pass("WIAnalysis")
271 .Extension("txt");
272 print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
273 }
274
init(llvm::Function * F,llvm::DominatorTree * DT,llvm::PostDominatorTree * PDT,IGC::IGCMD::MetaDataUtils * MDUtils,IGC::CodeGenContext * CGCtx,IGC::ModuleMetaData * ModMD,IGC::TranslationTable * TransTable)275 void WIAnalysisRunner::init(
276 llvm::Function* F,
277 llvm::DominatorTree* DT,
278 llvm::PostDominatorTree* PDT,
279 IGC::IGCMD::MetaDataUtils* MDUtils,
280 IGC::CodeGenContext* CGCtx,
281 IGC::ModuleMetaData* ModMD,
282 IGC::TranslationTable* TransTable)
283 {
284 m_func = F;
285 this->DT = DT;
286 this->PDT = PDT;
287 m_pMdUtils = MDUtils;
288 m_CGCtx = CGCtx;
289 m_ModMD = ModMD;
290 m_TT = TransTable;
291 }
292
run()293 bool WIAnalysisRunner::run()
294 {
295 auto& F = *m_func;
296 if (m_pMdUtils->findFunctionsInfoItem(&F) == m_pMdUtils->end_FunctionsInfo())
297 return false;
298
299 m_depMap.Initialize(m_TT);
300 m_TT->RegisterListener(&m_depMap);
301
302 m_changed1.clear();
303 m_changed2.clear();
304 m_pChangedNew = &m_changed1;
305 m_pChangedOld = &m_changed2;
306 m_ctrlBranches.clear();
307
308 m_storeDepMap.clear();
309 m_allocaDepMap.clear();
310 m_forcedUniforms.clear();
311
312 updateArgsDependency(&F);
313
314 if (!IGC_IS_FLAG_ENABLED(DisableUniformAnalysis))
315 {
316 // Compute the first iteration of the WI-dep according to ordering
317 // instructions this ordering is generally good (as it ususally correlates
318 // well with dominance).
319 inst_iterator it = inst_begin(F);
320 inst_iterator e = inst_end(F);
321 for (; it != e; ++it)
322 {
323 calculate_dep(&*it);
324 }
325
326 // Recursively check if WI-dep changes and if so reclaculates
327 // the WI-dep and marks the users for re-checking.
328 // This procedure is guranteed to converge since WI-dep can only
329 // become less unifrom (uniform->consecutive->ptr->stride->random).
330 updateDeps();
331
332 // sweep the dataflow started from those GenISA_vectorUniform,
333 // force all the insert-elements and phi-nodes to uniform
334 std::set<const Value*> visited;
335 while (!m_forcedUniforms.empty())
336 {
337 const Value* V = m_forcedUniforms.back();
338 m_forcedUniforms.pop_back();
339 visited.insert(V);
340 for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI)
341 {
342 const Value* use = (*UI);
343 if (!visited.count(use) && use->getType() == V->getType())
344 {
345 if (auto INS = dyn_cast<InsertElementInst>(use))
346 {
347 if (!isUniform(use))
348 m_depMap.SetAttribute(INS, WIAnalysis::UNIFORM_THREAD);
349 m_forcedUniforms.push_back(use);
350 }
351 else if (auto PHI = dyn_cast<PHINode>(use))
352 {
353 if (!isUniform(use))
354 m_depMap.SetAttribute(PHI, WIAnalysis::UNIFORM_THREAD);
355 m_forcedUniforms.push_back(use);
356 }
357 }
358 }
359 }
360 }
361
362 if (IGC_IS_FLAG_ENABLED(DumpWIA))
363 {
364 // Dump into a unique file under dump dir, per each function invocation.
365 lock_print();
366 }
367
368 // Original print to stdout
369 // Need igc key PrintToConsole and llvm flag -print-wia-check
370 if (PrintWiaCheck)
371 {
372 print(ods());
373 }
374 return false;
375 }
376
runOnFunction(Function & F)377 bool WIAnalysis::runOnFunction(Function& F)
378 {
379 auto* MDUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
380 auto* DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
381 auto* PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
382 auto* CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
383 auto* ModMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
384 auto* pTT = &getAnalysis<TranslationTable>();
385
386 Runner.init(&F, DT, PDT, MDUtils, CGCtx, ModMD, pTT);
387 return Runner.run();
388 }
389
updateDeps()390 void WIAnalysisRunner::updateDeps()
391 {
392 // As lonst as we have values to update
393 while (!m_pChangedNew->empty())
394 {
395 // swap between changedSet pointers - recheck the newChanged(now old)
396 std::swap(m_pChangedNew, m_pChangedOld);
397 // clear the newChanged set so it will be filled with the users of
398 // instruction which their WI-dep canged during the current iteration
399 m_pChangedNew->clear();
400
401 // update all changed values
402 std::vector<const Value*>::iterator it = m_pChangedOld->begin();
403 std::vector<const Value*>::iterator e = m_pChangedOld->end();
404 for (; it != e; ++it)
405 {
406 // remove first instruction
407 // calculate its new dependencey value
408 calculate_dep(*it);
409 }
410 }
411 }
412
isInstructionSimple(const Instruction * inst)413 bool WIAnalysisRunner::isInstructionSimple(const Instruction* inst)
414 {
415 // avoid changing cb load to sampler load, since sampler load
416 // has longer latency.
417 if (isa<LoadInst>(inst))
418 {
419 return false;
420 }
421
422 if (isa<UnaryInstruction>(inst) ||
423 isa<BinaryOperator>(inst) ||
424 isa<CmpInst>(inst) ||
425 isa<SelectInst>(inst))
426 {
427 return true;
428 }
429 if (IsMathIntrinsic(GetOpCode((Instruction*)inst)))
430 {
431 return true;
432 }
433
434 return false;
435 }
436
needToBeUniform(const Value * val)437 bool WIAnalysisRunner::needToBeUniform(const Value* val)
438 {
439 for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI)
440 {
441 if (const RTWritIntrinsic * use = dyn_cast<RTWritIntrinsic>(*UI))
442 {
443 if (use->getSampleIndex() == val || use->getBlendStateIndex() == val)
444 {
445 return true;
446 }
447 }
448 // TODO add sampler cases
449 }
450 return false;
451 }
452
allUsesRandom(const Value * val)453 bool WIAnalysisRunner::allUsesRandom(const Value* val)
454 {
455 for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI)
456 {
457 const Value* use = (*UI);
458 if (getDependency(use) != WIAnalysis::RANDOM)
459 {
460 return false;
461 }
462 }
463 return true;
464 }
465
updateArgsDependency(llvm::Function * pF)466 void WIAnalysisRunner::updateArgsDependency(llvm::Function* pF)
467 {
468 /*
469 Function Signature: define void @kernel(
470 [OCL function args...],
471 [implicit args...],
472 [push analysis args...])
473
474 Example push analysis args:
475 float %urb_read_0, float %urb_read_1, float %urb_read_2, float %urb_read_3, float %urb_read_4
476
477 Metadata Generated:
478 !igc.pushanalysis.wi.info = !{!3, !4, !5, !6, !7}
479
480 !3 = metadata !{metadata !"urb_read_0", i32 0, i32 4}
481 !4 = metadata !{metadata !"urb_read_1", i32 1, i32 4}
482 !5 = metadata !{metadata !"urb_read_2", i32 2, i32 4}
483 !6 = metadata !{metadata !"urb_read_3", i32 3, i32 4}
484 !7 = metadata !{metadata !"urb_read_4", i32 4, i32 4}
485
486 Assumption is that the order of metadata matches the order of arguments in function.
487 */
488
489 // For a subroutine, conservatively assume that all user provided arguments
490 // are random. Note that all other functions are treated as kernels.
491 // To enable subroutine for other FEs, we need to update this check.
492 bool IsSubroutine = !isEntryFunc(m_pMdUtils, pF) || isNonEntryMultirateShader(pF);
493
494 ImplicitArgs implicitArgs(*pF, m_pMdUtils);
495 int implicitArgStart = (unsigned)(pF->arg_size()
496 - implicitArgs.size()
497 - (IsSubroutine ? 0 : m_ModMD->pushInfo.pushAnalysisWIInfos.size()));
498 IGC_ASSERT_MESSAGE(implicitArgStart >= 0, "Function arg size does not match meta data and push args.");
499
500 llvm::Function::arg_iterator ai, ae;
501 ai = pF->arg_begin();
502 ae = pF->arg_end();
503
504 // 1. add all kernel function args as uniform, or
505 // add all subroutine function args as random
506 for (int i = 0; i < implicitArgStart; ++i, ++ai)
507 {
508 IGC_ASSERT(ai != ae);
509 incUpdateDepend(&(*ai), IsSubroutine ? WIAnalysis::RANDOM : WIAnalysis::UNIFORM_GLOBAL);
510 }
511
512 // 2. add implicit args
513 // By default, local IDs are not uniform. But if we know that the runtime dispatchs
514 // order (intel_reqd_workgroup_walk_order()) and work group size (reqd_work_group_size()),
515 // we may derive that some of local IDs are uniform.
516 bool localX_uniform = false, localY_uniform = false, localZ_uniform = false;
517 // DispatchOCLWGInLinearOrder should be removed after testing the guarded code.
518 if (!IsSubroutine &&
519 IGC_IS_FLAG_ENABLED(DispatchOCLWGInLinearOrder))
520 {
521 checkLocalIdUniform(pF, localX_uniform, localY_uniform, localZ_uniform);
522 }
523
524 for (unsigned i = 0; i < implicitArgs.size(); ++i, ++ai)
525 {
526 IGC_ASSERT(ai != ae);
527 const ImplicitArg& iArg = implicitArgs[ai->getArgNo() - implicitArgStart];
528 WIAnalysis::WIDependancy dependency = iArg.getDependency();
529 if ((localX_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_X) ||
530 (localY_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Y) ||
531 (localZ_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Z)) {
532 // todo: may improve it to have UNIFORM_WORKGROUP
533 dependency = WIAnalysis::UNIFORM_THREAD;
534 }
535
536 incUpdateDepend(&(*ai), dependency);
537 }
538
539 // 3. add push analysis args
540 if (!IsSubroutine)
541 {
542 for (unsigned i = 0; i < m_ModMD->pushInfo.pushAnalysisWIInfos.size(); ++i, ++ai)
543 {
544 IGC_ASSERT(ai != ae);
545 WIAnalysis::WIDependancy dependency =
546 static_cast<WIAnalysis::WIDependancy>(m_ModMD->pushInfo.pushAnalysisWIInfos[i].argDependency);
547 incUpdateDepend(&(*ai), dependency);
548 }
549 }
550 }
551
print(llvm::raw_ostream & OS,const llvm::Module * M) const552 void WIAnalysis::print(
553 llvm::raw_ostream& OS, const llvm::Module* M) const
554 {
555 Runner.print(OS, M);
556 }
557
dump() const558 void WIAnalysis::dump() const
559 {
560 Runner.dump();
561 }
562
incUpdateDepend(const llvm::Value * val,WIDependancy dep)563 void WIAnalysis::incUpdateDepend(const llvm::Value* val, WIDependancy dep)
564 {
565 Runner.incUpdateDepend(val, dep);
566 }
567
whichDepend(const llvm::Value * val)568 WIAnalysis::WIDependancy WIAnalysis::whichDepend(const llvm::Value* val)
569 {
570 return Runner.whichDepend(val);
571 }
572
isUniform(const Value * val) const573 bool WIAnalysis::isUniform(const Value* val) const
574 {
575 return Runner.isUniform(val);
576 }
577
isGlobalUniform(const Value * val)578 bool WIAnalysis::isGlobalUniform(const Value* val)
579 {
580 return Runner.isGlobalUniform(val);
581 }
582
isWorkGroupOrGlobalUniform(const Value * val)583 bool WIAnalysis::isWorkGroupOrGlobalUniform(const Value* val)
584 {
585 return Runner.isWorkGroupOrGlobalUniform(val);
586 }
587
insideDivergentCF(const Value * val) const588 bool WIAnalysis::insideDivergentCF(const Value* val) const
589 {
590 return Runner.insideDivergentCF(val);
591 }
592
insideWorkgroupDivergentCF(const Value * val) const593 bool WIAnalysis::insideWorkgroupDivergentCF(const Value* val) const
594 {
595 return Runner.insideWorkgroupDivergentCF(val);
596 }
597
whichDepend(const Value * val) const598 WIAnalysis::WIDependancy WIAnalysisRunner::whichDepend(const Value* val) const
599 {
600 IGC_ASSERT_MESSAGE(m_pChangedNew->empty(), "set should be empty before query");
601 IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
602 if (isa<Constant>(val))
603 {
604 return WIAnalysis::UNIFORM_GLOBAL;
605 }
606 else if (isa<StaticConstantPatchIntrinsic>(val))
607 {
608 return WIAnalysis::UNIFORM_GLOBAL;
609 }
610 auto EL = m_depMap.GetAttributeWithoutCreating(val);
611 if (IGC_IS_FLAG_ENABLED(DisableUniformAnalysis))
612 {
613 if (EL == m_depMap.end())
614 {
615 return WIAnalysis::RANDOM;
616 }
617 }
618 IGC_ASSERT(EL != m_depMap.end());
619 return EL;
620 }
621
isUniform(const Value * val) const622 bool WIAnalysisRunner::isUniform(const Value* val) const
623 {
624 if (!hasDependency(val))
625 return false;
626
627 return WIAnalysis::isDepUniform(whichDepend(val));
628 }
629
isWorkGroupOrGlobalUniform(const Value * val) const630 bool WIAnalysisRunner::isWorkGroupOrGlobalUniform(const Value* val) const
631 {
632 if (!hasDependency(val))
633 return false;
634 WIAnalysis::WIDependancy dep = whichDepend(val);
635 return dep == WIAnalysis::UNIFORM_GLOBAL ||
636 dep == WIAnalysis::UNIFORM_WORKGROUP;
637 }
638
isGlobalUniform(const Value * val) const639 bool WIAnalysisRunner::isGlobalUniform(const Value* val) const
640 {
641 if (!hasDependency(val))
642 return false;
643 WIAnalysis::WIDependancy dep = whichDepend(val);
644 return dep == WIAnalysis::UNIFORM_GLOBAL;
645 }
646
getCFDependency(const BasicBlock * BB) const647 WIAnalysis::WIDependancy WIAnalysisRunner::getCFDependency(const BasicBlock* BB) const
648 {
649 auto II = m_ctrlBranches.find(BB);
650 if (II == m_ctrlBranches.end())
651 return WIAnalysis::UNIFORM_GLOBAL;
652
653 WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
654 for (auto* BI : II->second)
655 {
656 auto newDep = whichDepend(BI);
657 if (depRank(dep) < depRank(newDep))
658 dep = newDep;
659 }
660
661 return dep;
662 }
663
insideWorkgroupDivergentCF(const Value * val) const664 bool WIAnalysisRunner::insideWorkgroupDivergentCF(const Value* val) const
665 {
666 if (auto* I = dyn_cast<Instruction>(val))
667 {
668 auto dep = getCFDependency(I->getParent());
669 return depRank(dep) > WIAnalysis::UNIFORM_WORKGROUP;
670 }
671
672 return false;
673 }
674
getDependency(const Value * val)675 WIAnalysis::WIDependancy WIAnalysisRunner::getDependency(const Value* val)
676 {
677 if (m_depMap.GetAttributeWithoutCreating(val) == m_depMap.end())
678 {
679 // Make sure that constants are not added in the map.
680 if (!isa<Instruction>(val) && !isa<Argument>(val))
681 {
682 return WIAnalysis::UNIFORM_GLOBAL;
683 }
684 // Don't expect this happens, let's assertion fail
685 IGC_ASSERT_MESSAGE(0, "Dependence for 'val' should bave been set already!");
686 }
687 IGC_ASSERT(m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
688 return m_depMap.GetAttributeWithoutCreating(val);
689 }
690
hasDependency(const Value * val) const691 bool WIAnalysisRunner::hasDependency(const Value* val) const
692 {
693
694 if (!isa<Instruction>(val) && !isa<Argument>(val))
695 {
696 return true;
697 }
698 return (m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
699 }
700
HasPhiUse(const llvm::Value * inst)701 static bool HasPhiUse(const llvm::Value* inst)
702 {
703 for (auto UI = inst->user_begin(), E = inst->user_end(); UI != E; ++UI)
704 {
705 if (llvm::isa<llvm::PHINode>(*UI))
706 {
707 return true;
708 }
709 }
710 return false;
711 }
712
calculate_dep(const Value * val)713 void WIAnalysisRunner::calculate_dep(const Value* val)
714 {
715 IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
716
717 // Not an instruction, must be a constant or an argument
718 // Could this vector type be of a constant which
719 // is not uniform ?
720 IGC_ASSERT_MESSAGE(isa<Instruction>(val), "Could we reach here with non instruction value?");
721
722 const Instruction* const inst = dyn_cast<Instruction>(val);
723 IGC_ASSERT_MESSAGE(nullptr != inst, "This Value is not an Instruction");
724 if (inst)
725 {
726 bool hasOriginal = hasDependency(inst);
727 WIAnalysis::WIDependancy orig;
728 // We only calculate dependency on unset instructions if all their operands
729 // were already given dependency. This is good for compile time since these
730 // instructions will be visited again after the operands dependency is set.
731 // An exception are phi nodes since they can be the ancestor of themselves in
732 // the def-use chain. Note that in this case we force the phi to have the
733 // pre-header value already calculated.
734 //
735 // Another case is that an inst might be set under control dependence (for example, phi)
736 // before any of its operands have been set. In this case, we will skip here. Here
737 // is the example (derived from ocl scheduler):
738 // B0: (p) goto Bt
739 // B1: goto Bf
740 // L B2: x.lcssa = phi (x.0, Bn) // B2: partial join
741 // ...
742 // Bt: ...
743 // ...
744 // Bf:
745 // ...
746 // goto Bm (out of loop)
747 // Bn:
748 // x.0 = ...
749 // goto B2
750 // Bm: ...
751 // ...
752 // B_ipd ( iPDOM(B0) = B_ipd)
753 //
754 // B0's branch instruction has random dependency, which triggers control dependence calculation.
755 // B2 is a partial join in InfluenceRegion. Thus its phi is marked as random, but its operand
756 // x.0 is still not set yet.
757 unsigned int unsetOpNum = 0;
758 for (unsigned i = 0; i < inst->getNumOperands(); ++i)
759 {
760 if (!hasDependency(inst->getOperand(i))) unsetOpNum++;
761 }
762 if (isa<PHINode>(inst))
763 {
764 // We do not calculate PhiNode with all incoming values unset.
765 //
766 // This seems right as we don't expect a phi that only depends upon other
767 // phi's (if it happens, those phis form a cycle dependency) so any phi's
768 // calculation will eventually be triggered from calculating a non-phi one
769 // which the phi depends upon.
770 if (unsetOpNum == inst->getNumOperands()) return;
771 }
772 else
773 {
774 // We do not calculate non-PhiNode instruction that have unset operands
775 if (unsetOpNum > 0) return;
776
777 // We have all operands set. Check a special case from calculate_dep for
778 // binary ops (see the details below). It checks for ASHR+ADD and ASHR+SHL
779 // cases, and in particular it accesses dependency for ADD operands. It
780 // could happen these operands are not processed yet and in such case
781 // getDependency raises the assertion. Thus check if dependency is set.
782 // Currently we need to check dependency for ASHR->ADD operands only.
783 // For SHR, its operands are checked to be constant so skip this case.
784 // This code could be extended further depending on requirements.
785 if (inst->getOpcode() == Instruction::AShr)
786 {
787 BinaryOperator* op0 = dyn_cast<BinaryOperator>(inst->getOperand(0));
788 if (op0 && op0->getOpcode() == Instruction::Add &&
789 !hasDependency(op0->getOperand(1)))
790 {
791 return;
792 }
793 }
794 }
795
796 if (!hasOriginal)
797 {
798 orig = WIAnalysis::UNIFORM_GLOBAL;
799 }
800 else
801 {
802 orig = m_depMap.GetAttributeWithoutCreating(inst);
803
804 // if inst is already marked random, it cannot get better
805 if (orig == WIAnalysis::RANDOM)
806 {
807 return;
808 }
809 }
810
811 WIAnalysis::WIDependancy dep = orig;
812
813 // LLVM does not have compile time polymorphisms
814 // TODO: to make things faster we may want to sort the list below according
815 // to the order of their probability of appearance.
816 if (const BinaryOperator* BI = dyn_cast<BinaryOperator>(inst)) dep = calculate_dep(BI);
817 else if (const CallInst* CI = dyn_cast<CallInst>(inst)) dep = calculate_dep(CI);
818 else if (isa<CmpInst>(inst)) dep = calculate_dep_simple(inst);
819 else if (isa<ExtractElementInst>(inst)) dep = calculate_dep_simple(inst);
820 else if (const GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(inst)) dep = calculate_dep(GEP);
821 else if (isa<InsertElementInst>(inst)) dep = calculate_dep_simple(inst);
822 else if (isa<InsertValueInst>(inst)) dep = calculate_dep_simple(inst);
823 else if (const PHINode* Phi = dyn_cast<PHINode>(inst)) dep = calculate_dep(Phi);
824 else if (isa<ShuffleVectorInst>(inst)) dep = calculate_dep_simple(inst);
825 else if (isa<StoreInst>(inst)) dep = calculate_dep_simple(inst);
826 else if (inst->isTerminator()) dep = calculate_dep_terminator(dyn_cast<IGCLLVM::TerminatorInst>(inst));
827 else if (const SelectInst* SI = dyn_cast<SelectInst>(inst)) dep = calculate_dep(SI);
828 else if (const AllocaInst* AI = dyn_cast<AllocaInst>(inst)) dep = calculate_dep(AI);
829 else if (const CastInst* CI = dyn_cast<CastInst>(inst)) dep = calculate_dep(CI);
830 else if (isa<ExtractValueInst>(inst)) dep = calculate_dep_simple(inst);
831 else if (const LoadInst* LI = dyn_cast<LoadInst>(inst)) dep = calculate_dep(LI);
832 else if (const VAArgInst* VAI = dyn_cast<VAArgInst>(inst)) dep = calculate_dep(VAI);
833 #if LLVM_VERSION_MAJOR >= 10
834 else if (inst->getOpcode() == Instruction::FNeg) dep = calculate_dep_simple(inst);
835 #endif
836
837 if (m_func->hasFnAttribute("KMPLOCK"))
838 {
839 dep = WIAnalysis::UNIFORM_THREAD;
840 }
841
842 // If the value was changed in this calculation
843 if (!hasOriginal || dep != orig)
844 {
845 // i1 instructions used in phi cannot be uniform as it may prevent us from removing the phi of 1
846 if (inst->getType()->isIntegerTy(1) && WIAnalysis::isDepUniform(dep) && HasPhiUse(inst))
847 {
848 dep = WIAnalysis::RANDOM;
849 }
850 // Update dependence of this instruction if dep is weaker than orig.
851 // Note depRank(orig) could be higher than depRank(dep) for phi.
852 // (Algo will never decrease the rank of a value.)
853 WIAnalysis::WIDependancy newDep = depRank(orig) < depRank(dep) ? dep : orig;
854 if (!hasOriginal || newDep != orig)
855 {
856 // update only if it is a new dep
857 updateDepMap(inst, newDep);
858 }
859 // divergent branch, trigger updates due to control-dependence
860 if (inst->isTerminator() && dep != WIAnalysis::UNIFORM_GLOBAL)
861 {
862 update_cf_dep(dyn_cast<IGCLLVM::TerminatorInst>(inst));
863 }
864 }
865 }
866 }
867
isRegionInvariant(const llvm::Instruction * defi,BranchInfo * brInfo,unsigned level)868 bool WIAnalysisRunner::isRegionInvariant(const llvm::Instruction* defi, BranchInfo* brInfo, unsigned level)
869 {
870 if (level >= 4)
871 {
872 return false;
873 }
874 if (isa<PHINode>(defi))
875 {
876 return false;
877 }
878 const unsigned nOps = defi->getNumOperands();
879 for (unsigned i = 0; i < nOps; ++i)
880 {
881 Value* op = defi->getOperand(i);
882 Instruction* srci = dyn_cast<Instruction>(op);
883 if (srci)
884 {
885 if (!brInfo->influence_region.count(srci->getParent()))
886 {
887 // go on to check the next operand
888 continue;
889 }
890 else if (!isRegionInvariant(srci, brInfo, level + 1))
891 {
892 return false;
893 }
894 }
895 }
896 return true;
897 }
898
update_cf_dep(const IGCLLVM::TerminatorInst * inst)899 void WIAnalysisRunner::update_cf_dep(const IGCLLVM::TerminatorInst* inst)
900 {
901 IGC_ASSERT(hasDependency(inst));
902 WIBaseClass::WIDependancy instDep = getDependency(inst);
903
904 BasicBlock* blk = (BasicBlock*)(inst->getParent());
905 BasicBlock* ipd = PDT->getNode(blk)->getIDom()->getBlock();
906 // a branch can have NULL immediate post-dominator when a function
907 // has multiple exits in llvm-ir
908 // compute influence region and the partial-joins
909 BranchInfo br_info(inst, ipd);
910 // debug: dump influence region and partial-joins
911 // br_info.print(ods());
912
913 // check dep-type for every phi in the full join
914 if (ipd)
915 {
916 updatePHIDepAtJoin(ipd, &br_info);
917 }
918 // check dep-type for every phi in the partial-joins
919 for (SmallPtrSet<BasicBlock*, 4>::iterator join_it = br_info.partial_joins.begin(),
920 join_e = br_info.partial_joins.end(); join_it != join_e; ++join_it)
921 {
922 updatePHIDepAtJoin(*join_it, &br_info);
923 }
924
925 // walk through all the instructions in the influence-region
926 // update the dep-type based upon its uses
927 DenseSet<BasicBlock*>::iterator blk_it = br_info.influence_region.begin();
928 DenseSet<BasicBlock*>::iterator blk_e = br_info.influence_region.end();
929 for (; blk_it != blk_e; ++blk_it)
930 {
931 BasicBlock* def_blk = *blk_it;
932 // add the branch into the controlling-branch set of the block
933 // if the block is in the influence-region
934 IGC_ASSERT(def_blk != br_info.full_join);
935 m_ctrlBranches[def_blk].insert(inst);
936
937 for (BasicBlock::iterator I = def_blk->begin(), E = def_blk->end(); I != E; ++I)
938 {
939 Instruction* defi = &(*I);
940 if (hasDependency(defi) && depRank(getDependency(defi)) >= depRank(instDep ))
941 {
942 // defi is already weaker than or equal to inst (br), do nothing.
943 continue;
944 }
945
946 if (const auto* st = dyn_cast<StoreInst>(defi))
947 {
948 // If we encounter a store in divergent control flow,
949 // we need to process the associated alloca (if any) again
950 // because it might need to be RANDOM.
951 auto it = m_storeDepMap.find(st);
952 if (it != m_storeDepMap.end())
953 m_pChangedNew->push_back(it->second);
954 }
955
956 // This is an optimization that tries to detect instruction
957 // not really affected by control-flow divergency because
958 // all the sources are outside the region.
959 // However this is only as good as we can get because we
960 // only search limited depth
961 if (isRegionInvariant(defi, &br_info, 0))
962 {
963 continue;
964 }
965 // We need to look at where the use is in order to decide
966 // we should make def to be "random" when loop is not in
967 // LCSSA form because we do not have LCSSA phi-nodes.
968 // 1) if use is in the full-join
969 // 2) if use is even outside the full-join
970 // 3) if use is in partial-join but def is not in partial-join
971 Value::use_iterator use_it = defi->use_begin();
972 Value::use_iterator use_e = defi->use_end();
973 for (; use_it != use_e; ++use_it)
974 {
975 Instruction* user = dyn_cast<Instruction>((*use_it).getUser());
976 IGC_ASSERT(user);
977 BasicBlock* user_blk = user->getParent();
978 PHINode* phi = dyn_cast<PHINode>(user);
979 if (phi)
980 {
981 // another place we assume all critical edges have been
982 // split and phi-move will be placed on those splitters
983 user_blk = phi->getIncomingBlock(*use_it);
984 }
985 if (user_blk == def_blk)
986 {
987 // local def-use, not related to control-dependence
988 continue; // skip
989 }
990 if (user_blk == br_info.full_join ||
991 !br_info.influence_region.count(user_blk) ||
992 (br_info.partial_joins.count(user_blk) &&
993 !br_info.partial_joins.count(def_blk))
994 )
995 {
996 updateDepMap(defi, instDep);
997 // break out of the use loop
998 // since def is changed to RANDOM, all uses will be changed later
999 break;
1000 }
1001 } // end of usei loop
1002 } // end of defi loop within a block
1003 } // end of influence-region block loop
1004 }
1005
updatePHIDepAtJoin(BasicBlock * blk,BranchInfo * brInfo)1006 void WIAnalysisRunner::updatePHIDepAtJoin(BasicBlock* blk, BranchInfo* brInfo)
1007 {
1008 // This is to bring down PHI's dep to br's dep.
1009 // If PHI's dep is already weaker than br's dep, do nothing.
1010 IGC_ASSERT(hasDependency(brInfo->cbr));
1011 WIAnalysis::WIDependancy brDep = getDependency(brInfo->cbr);
1012
1013 for (BasicBlock::iterator I = blk->begin(), E = blk->end(); I != E; ++I)
1014 {
1015 Instruction* defi = &(*I);
1016 PHINode* phi = dyn_cast<PHINode>(defi);
1017 if (!phi)
1018 {
1019 break;
1020 }
1021 if (hasDependency(phi) && depRank(getDependency(phi)) >= depRank(brDep))
1022 {
1023 // phi's dep is already the same or weaker, do nothing.
1024 continue;
1025 }
1026 Value* trickySrc = nullptr;
1027 for (unsigned predIdx = 0; predIdx < phi->getNumOperands(); ++predIdx)
1028 {
1029 Value* srcVal = phi->getOperand(predIdx);
1030 Instruction* defi = dyn_cast<Instruction>(srcVal);
1031 if (defi && brInfo->influence_region.count(defi->getParent()))
1032 {
1033 updateDepMap(phi, brDep);
1034 break;
1035 }
1036 else
1037 {
1038 // if the src is an immed, or an argument, or defined outside,
1039 // think about the phi-move that can be placed in the incoming block.
1040 // this phi should be random if we have two different src-values like that.
1041 // this is one place where we assume all critical edges have been split
1042 BasicBlock* predBlk = phi->getIncomingBlock(predIdx);
1043 if (brInfo->influence_region.count(predBlk))
1044 {
1045 if (!trickySrc)
1046 {
1047 trickySrc = srcVal;
1048 }
1049 else if (trickySrc != srcVal)
1050 {
1051 updateDepMap(phi, brDep);
1052 break;
1053 }
1054 }
1055 }
1056 }
1057 }
1058 }
1059
updateDepMap(const Instruction * inst,WIAnalysis::WIDependancy dep)1060 void WIAnalysisRunner::updateDepMap(const Instruction* inst, WIAnalysis::WIDependancy dep)
1061 {
1062 // Save the new value of this instruction
1063 m_depMap.SetAttribute(inst, dep);
1064 // Register for update all of the dependent values of this updated
1065 // instruction.
1066 Value::const_user_iterator it = inst->user_begin();
1067 Value::const_user_iterator e = inst->user_end();
1068 for (; it != e; ++it)
1069 {
1070 m_pChangedNew->push_back(*it);
1071 }
1072 if (const StoreInst * st = dyn_cast<StoreInst>(inst))
1073 {
1074 auto it = m_storeDepMap.find(st);
1075 if (it != m_storeDepMap.end())
1076 {
1077 m_pChangedNew->push_back(it->second);
1078 }
1079 }
1080
1081 if (dep == WIAnalysis::RANDOM)
1082 {
1083 EOPCODE eopcode = GetOpCode((Instruction*)inst);
1084 if (eopcode == llvm_insert)
1085 {
1086 updateInsertElements((const InsertElementInst*)inst);
1087 }
1088 }
1089 }
1090
1091 /// if one of insert-element is random, turn all the insert-elements into random
updateInsertElements(const InsertElementInst * inst)1092 void WIAnalysisRunner::updateInsertElements(const InsertElementInst* inst)
1093 {
1094 /// find the first one in the sequence
1095 InsertElementInst* curInst = (InsertElementInst*)inst;
1096 InsertElementInst* srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
1097 while (srcInst)
1098 {
1099 if (hasDependency(srcInst) && getDependency(srcInst) == WIAnalysis::RANDOM)
1100 return;
1101 curInst = srcInst;
1102 srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
1103 }
1104 if (curInst != inst)
1105 {
1106 m_depMap.SetAttribute(curInst, WIAnalysis::RANDOM);
1107 Value::user_iterator it = curInst->user_begin();
1108 Value::user_iterator e = curInst->user_end();
1109 for (; it != e; ++it)
1110 {
1111 m_pChangedNew->push_back(*it);
1112 }
1113 }
1114 }
1115
calculate_dep_simple(const Instruction * I)1116 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_simple(const Instruction* I)
1117 {
1118 // simply check that all operands are uniform, if so return uniform, else random
1119 const unsigned nOps = I->getNumOperands();
1120 WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
1121 for (unsigned i = 0; i < nOps; ++i)
1122 {
1123 const Value* op = I->getOperand(i);
1124 WIAnalysis::WIDependancy D = getDependency(op);
1125 dep = add_conversion[dep][D];
1126 if (dep == WIAnalysis::RANDOM)
1127 {
1128 break;
1129 }
1130 }
1131 return dep;
1132 }
1133
calculate_dep(const LoadInst * inst)1134 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const LoadInst* inst)
1135 {
1136 return calculate_dep_simple(inst);
1137 }
1138
calculate_dep(const BinaryOperator * inst)1139 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(
1140 const BinaryOperator* inst)
1141 {
1142 // Calculate the dependency type for each of the operands
1143 Value* op0 = inst->getOperand(0);
1144 Value* op1 = inst->getOperand(1);
1145
1146 WIAnalysis::WIDependancy dep0 = getDependency(op0);
1147 IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
1148 WIAnalysis::WIDependancy dep1 = getDependency(op1);
1149 IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
1150
1151 // For whatever binary operation,
1152 // uniform returns uniform
1153 WIAnalysis::WIDependancy dep = select_conversion[dep0][dep1];
1154 if (WIAnalysis::isDepUniform(dep))
1155 {
1156 return dep;
1157 }
1158
1159 // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
1160 // The pattern (and (X, C)), where C preserves the lower k bits of the value,
1161 // is often used for truncating of numbers in 64bit. We assume that the index
1162 // properties are not hurt by this.
1163 if (inst->getOpcode() == Instruction::And)
1164 {
1165 ConstantInt* C0 = dyn_cast<ConstantInt>(inst->getOperand(0));
1166 ConstantInt* C1 = dyn_cast<ConstantInt>(inst->getOperand(1));
1167 // Use any of the constants. Instcombine places constants on Op1
1168 // so try Op1 first.
1169 if (C1 || C0)
1170 {
1171 ConstantInt* C = C1 ? C1 : C0;
1172 dep = C1 ? dep0 : dep1;
1173 // Cannot look at bit pattern of huge integers.
1174 if (C->getBitWidth() < 65)
1175 {
1176 uint64_t val = C->getZExtValue();
1177 uint64_t ptr_mask = (1 << MinIndexBitwidthToPreserve) - 1;
1178 // Zero all bits above the lower k bits that we are interested in
1179 val &= (ptr_mask);
1180 // Make sure that all of the remaining bits are active
1181 if (val == ptr_mask)
1182 {
1183 return dep;
1184 }
1185 }
1186 }
1187 }
1188
1189 // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
1190 // The pattern (ashr (shl X, C)C) is used for truncating of numbers in 64bit
1191 // The constant C must leave at least 32bits of the original number
1192 if (inst->getOpcode() == Instruction::AShr)
1193 {
1194 BinaryOperator* SHL = dyn_cast<BinaryOperator>(inst->getOperand(0));
1195 // We also allow add of uniform value between the ashr and shl instructions
1196 // since instcombine creates this pattern when adding a constant.
1197 // The shl forces all low bits to be zero, so there can be no carry to the
1198 // high bits due to the addition. Addition with uniform preservs WI-dep.
1199 if (SHL && SHL->getOpcode() == Instruction::Add)
1200 {
1201 Value* addedVal = SHL->getOperand(1);
1202 if (WIAnalysis::isDepUniform(getDependency(addedVal)))
1203 {
1204 SHL = dyn_cast<BinaryOperator>(SHL->getOperand(0));
1205 }
1206 }
1207
1208 if (SHL && SHL->getOpcode() == Instruction::Shl)
1209 {
1210 ConstantInt* c_ashr = dyn_cast<ConstantInt>(inst->getOperand(1));
1211 ConstantInt* c_shl = dyn_cast<ConstantInt>(SHL->getOperand(1));
1212 const IntegerType* AshrTy = cast<IntegerType>(inst->getType());
1213 if (c_ashr && c_shl && c_ashr->getZExtValue() == c_shl->getZExtValue())
1214 {
1215 // If wordWidth - shift_width >= 32 bits
1216 if ((AshrTy->getBitWidth() - c_shl->getZExtValue()) >= MinIndexBitwidthToPreserve)
1217 {
1218 // return the dep of the original X
1219 return getDependency(SHL->getOperand(0));
1220 }
1221 }
1222 }
1223 }
1224
1225 switch (inst->getOpcode())
1226 {
1227 // Addition simply adds the stride value, except for ptr_consecutive
1228 // which is promoted to strided.
1229 // Another exception is when we subtract the tid: 1 - X which turns the
1230 // tid order to random.
1231 case Instruction::Add:
1232 case Instruction::FAdd:
1233 return add_conversion[dep0][dep1];
1234 case Instruction::Sub:
1235 case Instruction::FSub:
1236 return sub_conversion[dep0][dep1];
1237
1238 case Instruction::Mul:
1239 case Instruction::FMul:
1240 case Instruction::Shl:
1241 if (WIAnalysis::isDepUniform(dep0) || WIAnalysis::isDepUniform(dep1))
1242 {
1243 // If one of the sides is uniform, then we can adopt
1244 // the other side (stride*uniform is still stride).
1245 // stride size is K, where K is the uniform input.
1246 // An exception to this is ptr_consecutive, which is
1247 // promoted to strided.
1248 return mul_conversion[dep0][dep1];
1249 }
1250 default:
1251 //TODO: Support more arithmetic if needed
1252 return WIAnalysis::RANDOM;
1253 }
1254 return WIAnalysis::RANDOM;
1255 }
1256
calculate_dep(const CallInst * inst)1257 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst)
1258 {
1259 // handle 3D specific intrinsics
1260 EOPCODE intrinsic_name = GetOpCode((Instruction*)(inst));
1261 GenISAIntrinsic::ID GII_id = GenISAIntrinsic::no_intrinsic;
1262 if (const GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(inst))
1263 {
1264 GII_id = GII->getIntrinsicID();
1265 }
1266
1267 const llvm::IntrinsicInst* llvmintrin = dyn_cast<llvm::IntrinsicInst>(inst);
1268 if (llvmintrin != nullptr &&
1269 (llvmintrin->getIntrinsicID() == llvm::Intrinsic::stacksave ||
1270 llvmintrin->getIntrinsicID() == llvm::Intrinsic::stackrestore)) {
1271 return WIAnalysis::UNIFORM_THREAD;
1272 }
1273
1274 if (IsMathIntrinsic(intrinsic_name) ||
1275 intrinsic_name == llvm_input ||
1276 intrinsic_name == llvm_sgv ||
1277 intrinsic_name == llvm_shaderinputvec ||
1278 intrinsic_name == llvm_getbufferptr ||
1279 intrinsic_name == llvm_runtimeValue ||
1280 intrinsic_name == llvm_getMessagePhaseX ||
1281 intrinsic_name == llvm_getMessagePhaseXV ||
1282 intrinsic_name == llvm_surfaceinfo ||
1283 intrinsic_name == llvm_simdSize ||
1284 intrinsic_name == llvm_resinfoptr ||
1285 intrinsic_name == llvm_sampleinfoptr ||
1286 intrinsic_name == llvm_ldrawvector_indexed ||
1287 intrinsic_name == llvm_ldraw_indexed ||
1288 intrinsic_name == llvm_cycleCounter ||
1289 intrinsic_name == llvm_waveShuffleIndex ||
1290 intrinsic_name == llvm_waveBallot ||
1291 intrinsic_name == llvm_waveAll ||
1292 intrinsic_name == llvm_waveClustered ||
1293 intrinsic_name == llvm_ld_ptr ||
1294 (IGC_IS_FLAG_DISABLED(DisableUniformTypedAccess) && intrinsic_name == llvm_typed_read) ||
1295 intrinsic_name == llvm_add_pair ||
1296 intrinsic_name == llvm_sub_pair ||
1297 intrinsic_name == llvm_mul_pair ||
1298 intrinsic_name == llvm_ptr_to_pair ||
1299 intrinsic_name == llvm_pair_to_ptr ||
1300 intrinsic_name == llvm_fma ||
1301 GII_id == GenISAIntrinsic::GenISA_uitof_rtz ||
1302 GII_id == GenISAIntrinsic::GenISA_ftobf ||
1303 GII_id == GenISAIntrinsic::GenISA_bftof ||
1304 GII_id == GenISAIntrinsic::GenISA_2fto2bf ||
1305 GII_id == GenISAIntrinsic::GenISA_dual_subslice_id ||
1306 GII_id == GenISAIntrinsic::GenISA_getSR0 ||
1307 GII_id == GenISAIntrinsic::GenISA_getSR0_0 ||
1308 GII_id == GenISAIntrinsic::GenISA_mul_rtz ||
1309 GII_id == GenISAIntrinsic::GenISA_fma_rtz ||
1310 GII_id == GenISAIntrinsic::GenISA_fma_rtp ||
1311 GII_id == GenISAIntrinsic::GenISA_fma_rtn ||
1312 GII_id == GenISAIntrinsic::GenISA_add_rtz ||
1313 GII_id == GenISAIntrinsic::GenISA_slice_id ||
1314 GII_id == GenISAIntrinsic::GenISA_subslice_id ||
1315 GII_id == GenISAIntrinsic::GenISA_dual_subslice_id ||
1316 GII_id == GenISAIntrinsic::GenISA_eu_id ||
1317 GII_id == GenISAIntrinsic::GenISA_eu_thread_id ||
1318 GII_id == GenISAIntrinsic::GenISA_hw_thread_id ||
1319 GII_id == GenISAIntrinsic::GenISA_hw_thread_id_alloca ||
1320 GII_id == GenISAIntrinsic::GenISA_StackAlloca ||
1321 GII_id == GenISAIntrinsic::GenISA_vectorUniform ||
1322 GII_id == GenISAIntrinsic::GenISA_getR0 ||
1323 GII_id == GenISAIntrinsic::GenISA_getPayloadHeader ||
1324 GII_id == GenISAIntrinsic::GenISA_getWorkDim ||
1325 GII_id == GenISAIntrinsic::GenISA_getNumWorkGroups ||
1326 GII_id == GenISAIntrinsic::GenISA_getLocalSize ||
1327 GII_id == GenISAIntrinsic::GenISA_getGlobalSize ||
1328 GII_id == GenISAIntrinsic::GenISA_getEnqueuedLocalSize ||
1329 GII_id == GenISAIntrinsic::GenISA_getLocalID_X ||
1330 GII_id == GenISAIntrinsic::GenISA_getLocalID_Y ||
1331 GII_id == GenISAIntrinsic::GenISA_getLocalID_Z ||
1332 GII_id == GenISAIntrinsic::GenISA_getPrivateBase ||
1333 GII_id == GenISAIntrinsic::GenISA_getPrintfBuffer ||
1334 GII_id == GenISAIntrinsic::GenISA_getStageInGridOrigin ||
1335 GII_id == GenISAIntrinsic::GenISA_getStageInGridSize ||
1336 GII_id == GenISAIntrinsic::GenISA_getSyncBuffer ||
1337 GII_id == GenISAIntrinsic::GenISA_GetImplicitBufferPtr ||
1338 GII_id == GenISAIntrinsic::GenISA_GetLocalIdBufferPtr)
1339 {
1340 switch (GII_id)
1341 {
1342 default:
1343 break;
1344 case GenISAIntrinsic::GenISA_vectorUniform:
1345 // collect the seeds for forcing uniform vectors
1346 m_forcedUniforms.push_back(inst);
1347 return WIAnalysis::UNIFORM_THREAD;
1348 case GenISAIntrinsic::GenISA_getSR0:
1349 case GenISAIntrinsic::GenISA_getSR0_0:
1350 case GenISAIntrinsic::GenISA_eu_id:
1351 case GenISAIntrinsic::GenISA_hw_thread_id:
1352 return WIAnalysis::UNIFORM_THREAD;
1353 case GenISAIntrinsic::GenISA_slice_id:
1354 case GenISAIntrinsic::GenISA_subslice_id:
1355 case GenISAIntrinsic::GenISA_dual_subslice_id:
1356 // Make sure they are UNIFORM_WORKGROUP
1357 //return WIAnalysis::UNIFORM_WORKGROUP;
1358 return WIAnalysis::UNIFORM_THREAD;
1359 case GenISAIntrinsic::GenISA_GetImplicitBufferPtr:
1360 case GenISAIntrinsic::GenISA_GetLocalIdBufferPtr:
1361 return WIAnalysis::UNIFORM_THREAD;
1362 case GenISAIntrinsic::GenISA_getR0:
1363 case GenISAIntrinsic::GenISA_getPayloadHeader:
1364 case GenISAIntrinsic::GenISA_getWorkDim:
1365 case GenISAIntrinsic::GenISA_getNumWorkGroups:
1366 case GenISAIntrinsic::GenISA_getLocalSize:
1367 case GenISAIntrinsic::GenISA_getGlobalSize:
1368 case GenISAIntrinsic::GenISA_getEnqueuedLocalSize:
1369 case GenISAIntrinsic::GenISA_getLocalID_X:
1370 case GenISAIntrinsic::GenISA_getLocalID_Y:
1371 case GenISAIntrinsic::GenISA_getLocalID_Z:
1372 case GenISAIntrinsic::GenISA_getPrivateBase:
1373 case GenISAIntrinsic::GenISA_getPrintfBuffer:
1374 case GenISAIntrinsic::GenISA_getStageInGridOrigin:
1375 case GenISAIntrinsic::GenISA_getStageInGridSize:
1376 case GenISAIntrinsic::GenISA_getSyncBuffer:
1377 return ImplicitArgs::getArgDep(GII_id);
1378 }
1379
1380 if (intrinsic_name == llvm_input ||
1381 intrinsic_name == llvm_shaderinputvec)
1382 {
1383 e_interpolation mode = (e_interpolation)cast<ConstantInt>(inst->getOperand(1))->getZExtValue();
1384 if (mode != EINTERPOLATION_CONSTANT
1385 )
1386 {
1387 return WIAnalysis::RANDOM;
1388 }
1389 }
1390
1391
1392 if (intrinsic_name == llvm_sgv)
1393 {
1394 SGVUsage usage = (SGVUsage)cast<ConstantInt>(inst->getOperand(0))->getZExtValue();
1395 if ((usage != VFACE
1396 ) &&
1397 usage != ACTUAL_COARSE_SIZE_X &&
1398 usage != ACTUAL_COARSE_SIZE_Y &&
1399 usage != THREAD_GROUP_ID_X &&
1400 usage != THREAD_GROUP_ID_Y &&
1401 usage != THREAD_GROUP_ID_Z &&
1402 usage != XP0 &&
1403 usage != XP1 &&
1404 usage != XP2
1405 )
1406 {
1407 return WIAnalysis::RANDOM;
1408 }
1409 }
1410 if (intrinsic_name == llvm_getMessagePhaseX ||
1411 intrinsic_name == llvm_getMessagePhaseXV)
1412 {
1413 return WIAnalysis::UNIFORM_THREAD;
1414 }
1415
1416 if (intrinsic_name == llvm_waveShuffleIndex)
1417 {
1418 Value* op0 = inst->getArgOperand(0);
1419 Value* op1 = inst->getArgOperand(1);
1420 WIAnalysis::WIDependancy dep0 = getDependency(op0);
1421 IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
1422 WIAnalysis::WIDependancy dep1 = getDependency(op1);
1423 IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
1424 bool isUniform0 = WIAnalysis::isDepUniform(dep0);
1425 bool isUniform1 = WIAnalysis::isDepUniform(dep1);
1426 if ((isUniform0 && isUniform1) || (!isUniform0 && !isUniform1))
1427 {
1428 // Select worse one
1429 return select_conversion[dep0][dep1];
1430 }
1431 else
1432 {
1433 // Select uniform one if only one is uniform
1434 return isUniform0 ? dep0 : dep1;
1435 }
1436 }
1437
1438 if (GII_id == GenISAIntrinsic::GenISA_StackAlloca)
1439 {
1440 WIAnalysis::WIDependancy dep0 = WIAnalysis::UNIFORM_THREAD;
1441 Value* op0 = inst->getArgOperand(0);
1442 WIAnalysis::WIDependancy dep1 = getDependency(op0);
1443 // Select worse one
1444 return select_conversion[dep0][dep1];
1445 }
1446
1447 if (intrinsic_name == llvm_waveBallot || intrinsic_name == llvm_waveAll)
1448 {
1449 return WIAnalysis::UNIFORM_THREAD;
1450 }
1451
1452 if (intrinsic_name == llvm_waveClustered)
1453 {
1454 const unsigned clusterSize = static_cast<unsigned>(
1455 cast<llvm::ConstantInt>(inst->getArgOperand(2))->getZExtValue());
1456
1457 constexpr unsigned maxSimdSize = 32;
1458 if (clusterSize == maxSimdSize)
1459 {
1460 // TODO: do the same for SIMD8 and SIMD16 if possible.
1461 return WIAnalysis::UNIFORM_THREAD;
1462 }
1463 else
1464 {
1465 return WIAnalysis::RANDOM;
1466 }
1467 }
1468
1469
1470 // Iterate over all input dependencies. If all are uniform - propagate it.
1471 // otherwise - return RANDOM
1472 unsigned numParams = inst->getNumArgOperands();
1473 WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
1474 for (unsigned i = 0; i < numParams; ++i)
1475 {
1476 Value* op = inst->getArgOperand(i);
1477 WIAnalysis::WIDependancy tdep = getDependency(op);
1478 dep = select_conversion[dep][tdep];
1479 if (dep == WIAnalysis::RANDOM)
1480 {
1481 break; // Uniformity check failed. no need to continue
1482 }
1483 }
1484 return dep;
1485 }
1486 return WIAnalysis::RANDOM;
1487 }
1488
calculate_dep(const GetElementPtrInst * inst)1489 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(
1490 const GetElementPtrInst* inst)
1491 {
1492 const Value* opPtr = inst->getOperand(0);
1493 WIAnalysis::WIDependancy dep = getDependency(opPtr);
1494 // running over the all indices arguments except for the last
1495 // here we assume the pointer is the first operand
1496 unsigned num = inst->getNumIndices();
1497 for (unsigned i = 1; i < num; ++i)
1498 {
1499 const Value* op = inst->getOperand(i);
1500 WIAnalysis::WIDependancy tdep = getDependency(op);
1501 dep = select_conversion[dep][tdep];
1502 if (!WIAnalysis::isDepUniform(dep))
1503 {
1504 return WIAnalysis::RANDOM;
1505 }
1506 }
1507 const Value* lastInd = inst->getOperand(num);
1508 WIAnalysis::WIDependancy lastIndDep = getDependency(lastInd);
1509 return gep_conversion[dep][lastIndDep];
1510 }
1511
calculate_dep(const PHINode * inst)1512 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const PHINode* inst)
1513 {
1514 unsigned num = inst->getNumIncomingValues();
1515 bool foundFirst = 0;
1516 WIAnalysis::WIDependancy totalDep = WIAnalysis::WIDependancy::INVALID;
1517
1518 for (unsigned i = 0; i < num; ++i)
1519 {
1520 Value* op = inst->getIncomingValue(i);
1521 if (hasDependency(op))
1522 {
1523 if (!foundFirst)
1524 {
1525 totalDep = getDependency(op);
1526 }
1527 else
1528 {
1529 totalDep = select_conversion[totalDep][getDependency(op)];
1530 }
1531 foundFirst = 1;
1532 }
1533 }
1534
1535 IGC_ASSERT_MESSAGE(foundFirst, "We should not reach here with All incoming values are unset");
1536
1537 return totalDep;
1538 }
1539
calculate_dep_terminator(const IGCLLVM::TerminatorInst * inst)1540 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_terminator(
1541 const IGCLLVM::TerminatorInst* inst)
1542 {
1543 // Instruction has no return value
1544 // Just need to know if this inst is uniform or not
1545 // because we may want to avoid predication if the control flows
1546 // in the function are uniform...
1547 switch (inst->getOpcode())
1548 {
1549 case Instruction::Br:
1550 {
1551 const BranchInst* brInst = cast<BranchInst>(inst);
1552 if (brInst->isConditional())
1553 {
1554 // Conditional branch is uniform, if its condition is uniform
1555 Value* op = brInst->getCondition();
1556 WIAnalysis::WIDependancy dep = getDependency(op);
1557 if (WIAnalysis::isDepUniform(dep))
1558 {
1559 return dep;
1560 }
1561 return WIAnalysis::RANDOM;
1562 }
1563 // Unconditional branch is non TID-dependent
1564 return WIAnalysis::UNIFORM_GLOBAL;
1565 }
1566 //Return instructions are unconditional
1567 case Instruction::Ret:
1568 return WIAnalysis::UNIFORM_GLOBAL;
1569 case Instruction::Unreachable:
1570 return WIAnalysis::UNIFORM_GLOBAL;
1571 case Instruction::IndirectBr:
1572 return WIAnalysis::RANDOM;
1573 // TODO: Define the dependency requirements of indirectBr
1574 case Instruction::Switch:
1575 return WIAnalysis::RANDOM;
1576 // TODO: Should this depend only on the condition, like branch?
1577 default:
1578 return WIAnalysis::RANDOM;
1579 }
1580 }
1581
calculate_dep(const SelectInst * inst)1582 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const SelectInst* inst)
1583 {
1584 Value* op0 = inst->getOperand(0); // mask
1585 WIAnalysis::WIDependancy dep0 = getDependency(op0);
1586 if (WIAnalysis::isDepUniform(dep0))
1587 {
1588 Value* op1 = inst->getOperand(1);
1589 Value* op2 = inst->getOperand(2);
1590 WIAnalysis::WIDependancy dep1 = getDependency(op1);
1591 WIAnalysis::WIDependancy dep2 = getDependency(op2);
1592 // In case of constant scalar select we can choose according to the mask.
1593 if (ConstantInt * C = dyn_cast<ConstantInt>(op0))
1594 {
1595 uint64_t val = C->getZExtValue();
1596 if (val) return dep1;
1597 else return dep2;
1598 }
1599 // Select the "weaker" dep, but if only one dep is ptr_consecutive,
1600 // it must be promoted to strided ( as this data may
1601 // propagate to Load/Store instructions.
1602 WIAnalysis::WIDependancy tDep = select_conversion[dep1][dep2];
1603 return select_conversion[dep0][tDep];
1604 }
1605 // In case the mask is non-uniform the select outcome can be a combination
1606 // so we don't know nothing about it.
1607 return WIAnalysis::RANDOM;
1608 }
1609
TrackAllocaDep(const Value * I,AllocaDep & dep)1610 bool WIAnalysisRunner::TrackAllocaDep(const Value* I, AllocaDep& dep)
1611 {
1612 bool trackable = true;
1613 for (Value::const_user_iterator use_it = I->user_begin(), use_e = I->user_end(); use_it != use_e; ++use_it)
1614 {
1615 if (const GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(*use_it))
1616 {
1617 trackable &= TrackAllocaDep(gep, dep);
1618 }
1619 else if (const llvm::LoadInst * pLoad = llvm::dyn_cast<llvm::LoadInst>(*use_it))
1620 {
1621 trackable &= (pLoad->isSimple());
1622 }
1623 else if (const llvm::StoreInst * pStore = llvm::dyn_cast<llvm::StoreInst>(*use_it))
1624 {
1625 trackable &= (pStore->isSimple());
1626 // Not supported case: GEP instruction is the stored value of the StoreInst
1627 trackable &= (pStore->getValueOperand() != I);
1628 dep.stores.push_back(pStore);
1629 }
1630 else if (const llvm::BitCastInst * pBitCast = llvm::dyn_cast<llvm::BitCastInst>(*use_it))
1631 {
1632 trackable &= TrackAllocaDep(pBitCast, dep);
1633 }
1634 else if (const llvm::AddrSpaceCastInst* pAddrCast = llvm::dyn_cast<llvm::AddrSpaceCastInst>(*use_it))
1635 {
1636 trackable &= TrackAllocaDep(pAddrCast, dep);
1637 }
1638 else if (const GenIntrinsicInst* intr = dyn_cast<GenIntrinsicInst>(*use_it))
1639 {
1640 GenISAIntrinsic::ID IID = intr->getIntrinsicID();
1641 if (IID == GenISAIntrinsic::GenISA_assume_uniform)
1642 {
1643 dep.assume_uniform = true;
1644 }
1645 else
1646 trackable = false;
1647 }
1648 else if (const IntrinsicInst * intr = dyn_cast<IntrinsicInst>(*use_it))
1649 {
1650 llvm::Intrinsic::ID IID = intr->getIntrinsicID();
1651 if (IID != llvm::Intrinsic::lifetime_start &&
1652 IID != llvm::Intrinsic::lifetime_end)
1653 {
1654 trackable = false;
1655 }
1656 else if (IID == llvm::Intrinsic::lifetime_start)
1657 dep.lifetimes.push_back(intr);
1658 }
1659 else
1660 {
1661 // This is some other instruction. Right now we don't want to handle these
1662 trackable = false;
1663 }
1664 }
1665 return trackable;
1666 }
1667
1668
calculate_dep(const AllocaInst * inst)1669 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const AllocaInst* inst)
1670 {
1671 if (m_CGCtx->platform.getWATable().WaNoA32ByteScatteredStatelessMessages)
1672 {
1673 // avoid generating A32 byte scatter on platforms not supporting it
1674 return WIAnalysis::RANDOM;
1675 }
1676 if (!hasDependency(inst))
1677 {
1678 AllocaDep dep;
1679 dep.assume_uniform = false;
1680 bool trackable = TrackAllocaDep(inst, dep);
1681
1682 if ( trackable || dep.assume_uniform)
1683 {
1684 m_allocaDepMap.insert(std::make_pair(inst, dep));
1685 for (auto it : dep.stores)
1686 {
1687 m_storeDepMap.insert(std::make_pair(&(*it), inst));
1688 }
1689 }
1690 }
1691 auto depIt = m_allocaDepMap.find(inst);
1692 if (depIt == m_allocaDepMap.end())
1693 {
1694 // If we haven't been able to track the dependency of the alloca make it random
1695 return WIAnalysis::RANDOM;
1696 }
1697 // find assume-uniform
1698 if (depIt->second.assume_uniform)
1699 {
1700 return WIAnalysis::UNIFORM_THREAD;
1701 }
1702 // find the common dominator block among all the life-time starts
1703 // that can be considered as the nearest logical location for alloca.
1704 const BasicBlock* CommonDomBB = nullptr;
1705 for (auto *SI : depIt->second.lifetimes)
1706 {
1707 auto BB = SI->getParent();
1708 IGC_ASSERT(BB);
1709 if (!CommonDomBB)
1710 CommonDomBB = BB;
1711 else
1712 CommonDomBB = DT->findNearestCommonDominator(CommonDomBB, BB);
1713 }
1714 if (!CommonDomBB)
1715 {
1716 CommonDomBB = inst->getParent();
1717 }
1718 // if any store is not uniform, then alloca is not uniform
1719 // if any store is affected by a divergent branch after alloca,
1720 // then alloca is also not uniform
1721 for (auto *SI : depIt->second.stores)
1722 {
1723 if (hasDependency(SI))
1724 {
1725 if (!WIAnalysis::isDepUniform(getDependency(SI)))
1726 {
1727 return WIAnalysis::RANDOM;
1728 }
1729
1730 if (auto I = m_ctrlBranches.find(SI->getParent());
1731 I != m_ctrlBranches.end())
1732 {
1733 auto& Branches = I->second;
1734 WIAnalysis::WIDependancy cntrDep = WIAnalysis::UNIFORM_GLOBAL;
1735 for (auto* BrI : Branches)
1736 {
1737 // exclude those branches that dominates alloca
1738 if (!DT->dominates(BrI, CommonDomBB))
1739 {
1740 // select a weaker one
1741 IGC_ASSERT(hasDependency(BrI));
1742 cntrDep = select_conversion[cntrDep][getDependency(BrI)];
1743 if (cntrDep == WIAnalysis::RANDOM)
1744 break;
1745 }
1746 }
1747 if (cntrDep == WIAnalysis::RANDOM)
1748 return WIAnalysis::RANDOM;
1749 }
1750 }
1751 }
1752
1753 return WIAnalysis::UNIFORM_THREAD;
1754 }
1755
calculate_dep(const CastInst * inst)1756 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CastInst* inst)
1757 {
1758 Value* op0 = inst->getOperand(0);
1759 WIAnalysis::WIDependancy dep0 = getDependency(op0);
1760
1761 // independent remains independent
1762 if (WIAnalysis::isDepUniform(dep0)) return dep0;
1763
1764 switch (inst->getOpcode())
1765 {
1766 case Instruction::SExt:
1767 case Instruction::FPTrunc:
1768 case Instruction::FPExt:
1769 case Instruction::PtrToInt:
1770 case Instruction::IntToPtr:
1771 case Instruction::AddrSpaceCast:
1772 case Instruction::UIToFP:
1773 case Instruction::FPToUI:
1774 case Instruction::FPToSI:
1775 case Instruction::SIToFP:
1776 return dep0;
1777 case Instruction::BitCast:
1778 case Instruction::ZExt:
1779 return WIAnalysis::RANDOM;
1780 // FIXME:: assumes that the value does not cross the +/- border - risky !!!!
1781 case Instruction::Trunc: {
1782 const Type* destType = inst->getDestTy();
1783 const IntegerType* intType = dyn_cast<IntegerType>(destType);
1784 if (intType && (intType->getBitWidth() >= MinIndexBitwidthToPreserve))
1785 {
1786 return dep0;
1787 }
1788 return WIAnalysis::RANDOM;
1789 }
1790 default:
1791 IGC_ASSERT_MESSAGE(0, "no such opcode");
1792 // never get here
1793 return WIAnalysis::RANDOM;
1794 }
1795 }
1796
calculate_dep(const VAArgInst * inst)1797 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const VAArgInst* inst)
1798 {
1799 IGC_ASSERT_MESSAGE(0, "Are we supporting this ??");
1800 return WIAnalysis::RANDOM;
1801 }
1802
1803 // Set IsLxUniform/IsLyUniform/IsLxUniform to true if they are uniform;
1804 // do nothing otherwise.
checkLocalIdUniform(Function * F,bool & IsLxUniform,bool & IsLyUniform,bool & IsLzUniform)1805 void WIAnalysisRunner::checkLocalIdUniform(
1806 Function* F,
1807 bool& IsLxUniform,
1808 bool& IsLyUniform,
1809 bool& IsLzUniform)
1810 {
1811 if (m_CGCtx->type != ShaderType::OPENCL_SHADER)
1812 {
1813 return;
1814 }
1815
1816 FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(F);
1817 ModuleMetaData* modMD = m_CGCtx->getModuleMetaData();
1818 auto funcMD = modMD->FuncMD.find(F);
1819
1820 int32_t WO_0 = -1, WO_1 = -1, WO_2 = -1;
1821 if (funcMD != modMD->FuncMD.end())
1822 {
1823 WorkGroupWalkOrderMD workGroupWalkOrder = funcMD->second.workGroupWalkOrder;
1824 if (workGroupWalkOrder.dim0 || workGroupWalkOrder.dim1 || workGroupWalkOrder.dim2)
1825 {
1826 WO_0 = workGroupWalkOrder.dim0;
1827 WO_1 = workGroupWalkOrder.dim1;
1828 WO_2 = workGroupWalkOrder.dim2;
1829 }
1830 }
1831
1832 uint32_t simdSize = 0;
1833 SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize();
1834 if (subGroupSize->hasValue())
1835 {
1836 simdSize = (uint32_t)subGroupSize->getSIMD_size();
1837 }
1838 simdSize = simdSize >= 8 ? simdSize : 32;
1839
1840 int32_t X = -1, Y = -1, Z = -1;
1841 ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
1842 if (threadGroupSize->hasValue())
1843 {
1844 X = (int32_t)threadGroupSize->getXDim();
1845 Y = (int32_t)threadGroupSize->getYDim();
1846 Z = (int32_t)threadGroupSize->getZDim();
1847
1848 }
1849
1850 if (WO_0 == 0 && ((X / simdSize) * simdSize) == X)
1851 {
1852 // each thread will have Y and Z unchanged.
1853 IsLyUniform = true;
1854 IsLzUniform = true;
1855 }
1856 else if (WO_0 == 1 && ((Y / simdSize) * simdSize) == Y)
1857 {
1858 // each thread will have X and Z unchanged.
1859 IsLxUniform = true;
1860 IsLzUniform = true;
1861 }
1862 else if (WO_0 == 2 && ((Z / simdSize) * simdSize) == Z)
1863 {
1864 // each thread will have X and Y unchanged.
1865 IsLxUniform = true;
1866 IsLyUniform = true;
1867 }
1868
1869 if (X == 1)
1870 {
1871 IsLxUniform = true;
1872 }
1873 if (Y == 1)
1874 {
1875 IsLyUniform = true;
1876 }
1877 if (Z == 1)
1878 {
1879 IsLzUniform = true;
1880 }
1881
1882 if (IGC_IS_FLAG_ENABLED(DispatchOCLWGInLinearOrder) ||
1883 (WO_0 == 0 && WO_1 == 1 && WO_2 == 2))
1884 {
1885 // linear order dispatch
1886 uint32_t XxY = X * Y;
1887 if (X > 0 && (X % simdSize) == 0)
1888 {
1889 // X is multiple of simdSize
1890 IsLyUniform = true;
1891 IsLzUniform = true;
1892 }
1893 else if (X > 0 && Y > 0 && (XxY % simdSize) == 0)
1894 {
1895 // X*Y is multiple of simdSize
1896 IsLzUniform = true;
1897 }
1898 }
1899 }
1900
BranchInfo(const IGCLLVM::TerminatorInst * inst,const BasicBlock * ipd)1901 BranchInfo::BranchInfo(const IGCLLVM::TerminatorInst* inst, const BasicBlock* ipd)
1902 : cbr(inst),
1903 full_join(ipd)
1904 {
1905 const BasicBlock* fork_blk = inst->getParent();
1906 IGC_ASSERT_MESSAGE(cbr == fork_blk->getTerminator(), "block terminator mismatch");
1907
1908 if (cbr->getNumSuccessors() != 2) {
1909 std::set<const BasicBlock*> Reached;
1910 for (auto SI = succ_begin(fork_blk),
1911 SE = succ_end(fork_blk); SI != SE; ++SI) {
1912 auto Succ = *SI;
1913 if (Succ == full_join)
1914 continue;
1915 std::set<const BasicBlock*> Visited;
1916 std::stack<const BasicBlock*> WorkSet;
1917 WorkSet.push(Succ);
1918 while (!WorkSet.empty()) {
1919 const BasicBlock* BB = WorkSet.top();
1920 WorkSet.pop();
1921 Visited.insert(BB);
1922 influence_region.insert(const_cast<BasicBlock*>(BB));
1923 if (Reached.count(BB))
1924 partial_joins.insert(const_cast<BasicBlock*>(BB));
1925 for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
1926 auto SBB = *I;
1927 if (SBB != full_join && !Visited.count(SBB))
1928 WorkSet.push(SBB);
1929 }
1930 }
1931 // Merge Visited into Reached.
1932 for (auto BB : Visited)
1933 Reached.insert(BB);
1934 }
1935 }
1936 else {
1937 std::set<BasicBlock*> f_set, t_set;
1938 std::stack<BasicBlock*> work_set;
1939 if (cbr->getSuccessor(0) != full_join)
1940 {
1941 work_set.push(cbr->getSuccessor(0));
1942 while (!work_set.empty())
1943 {
1944 BasicBlock* cur_blk = work_set.top();
1945 work_set.pop();
1946 f_set.insert(cur_blk);
1947 influence_region.insert(cur_blk);
1948 for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); SI != E; ++SI)
1949 {
1950 BasicBlock* succ_blk = (*SI);
1951 if (succ_blk != full_join && !f_set.count(succ_blk))
1952 {
1953 work_set.push(succ_blk);
1954 }
1955 }
1956 }
1957 }
1958 if (cbr->getSuccessor(1) != full_join)
1959 {
1960 work_set.push(cbr->getSuccessor(1));
1961 while (!work_set.empty())
1962 {
1963 BasicBlock* cur_blk = work_set.top();
1964 work_set.pop();
1965 t_set.insert(cur_blk);
1966 influence_region.insert(cur_blk);
1967 if (f_set.count(cur_blk))
1968 {
1969 partial_joins.insert(cur_blk);
1970 }
1971 for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); SI != E; ++SI)
1972 {
1973 BasicBlock* succ_blk = (*SI);
1974 if (succ_blk != full_join && !t_set.count(succ_blk))
1975 {
1976 work_set.push(succ_blk);
1977 }
1978 }
1979 }
1980 }
1981 }
1982 }
1983
print(raw_ostream & OS) const1984 void BranchInfo::print(raw_ostream& OS) const
1985 {
1986 OS << "\nCBR: " << *cbr;
1987 OS << "\nIPD: ";
1988 if (full_join)
1989 {
1990 full_join->print(IGC::Debug::ods());
1991 }
1992 OS << "\nPartial Joins:";
1993 SmallPtrSet<BasicBlock*, 4>::iterator join_it = partial_joins.begin();
1994 SmallPtrSet<BasicBlock*, 4>::iterator join_e = partial_joins.end();
1995 for (; join_it != join_e; ++join_it)
1996 {
1997 BasicBlock* cur_blk = *join_it;
1998 OS << "\n ";
1999 cur_blk->print(IGC::Debug::ods());
2000 }
2001 OS << "\nInfluence Region:";
2002 DenseSet<BasicBlock*>::const_iterator blk_it = influence_region.begin();
2003 DenseSet<BasicBlock*>::const_iterator blk_e = influence_region.end();
2004 for (; blk_it != blk_e; ++blk_it)
2005 {
2006 BasicBlock* cur_blk = *blk_it;
2007 OS << "\n ";
2008 cur_blk->print(IGC::Debug::ods());
2009 }
2010 OS << "\n";
2011 }
2012
2013 extern "C"
2014 {
createWIAnalysisPass()2015 void* createWIAnalysisPass()
2016 {
2017 return new WIAnalysis();
2018 }
2019 }
2020