1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "AdaptorCommon/ImplicitArgs.hpp"
10 #include "Compiler/CISACodeGen/WIAnalysis.hpp"
11 #include "Compiler/CISACodeGen/helper.h"
12 #include "Compiler/CodeGenContextWrapper.hpp"
13 #include "Compiler/CodeGenPublic.h"
14 #include "Compiler/IGCPassSupport.h"
15 #include "common/debug/Debug.hpp"
16 #include "common/igc_regkeys.hpp"
17 #include "GenISAIntrinsics/GenIntrinsicInst.h"
18 #include "common/LLVMWarningsPush.hpp"
19 #include <llvm/IR/Function.h>
20 #include <llvm/IR/CFG.h>
21 #include <llvm/Support/CommandLine.h>
22 #include <llvm/Support/Debug.h>
23 #include <llvm/IR/Constants.h>
24 #include "common/LLVMWarningsPop.hpp"
25 #include <string>
26 #include <stack>
27 #include <sstream>
28 #include "Probe/Assertion.h"
29 
30 using namespace llvm;
31 using namespace IGC;
32 using namespace IGC::IGCMD;
33 using namespace IGC::Debug;
34 
// LLVM command-line flag (-print-wia-check): when set (together with the IGC
// PrintToConsole key, see run()), the analysis result is printed to the debug
// output stream.
static cl::opt<bool> PrintWiaCheck(
    "print-wia-check", cl::init(false), cl::Hidden,
    cl::desc("Debug wia-check analysis"));
38 
// Register pass to igc-opt
#define PASS_FLAG "igc-wi-analysis"
#define PASS_DESCRIPTION "WIAnalysis provides work item dependency info"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(TranslationTable)
IGC_INITIALIZE_PASS_END(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

// LLVM pass identification token; its address (not its value) identifies the pass.
char WIAnalysis::ID = 0;
53 
// Default constructor: registers this pass with LLVM's global PassRegistry so
// it can be constructed by flag (see PASS_FLAG above).
WIAnalysis::WIAnalysis() : FunctionPass(ID)
{
    initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
}
58 
// Minimum index bitwidth worth preserving — usage is outside this chunk;
// presumably a narrowing threshold, confirm at the use sites.
const unsigned int WIAnalysisRunner::MinIndexBitwidthToPreserve = 16;

// For dumping WIA info per each invocation (see lock_print), keyed per function.
DenseMap<const Function*, int> WIAnalysisRunner::m_funcInvocationId;

/// Define shorter names for dependencies, for clarity of the conversion maps
/// Note that only UGL/UWG/UTH/RND are supported.
#define UGL WIAnalysis::UNIFORM_GLOBAL
#define UWG WIAnalysis::UNIFORM_WORKGROUP
#define UTH WIAnalysis::UNIFORM_THREAD
#define SEQ WIAnalysis::CONSECUTIVE
#define PTR WIAnalysis::PTR_CONSECUTIVE
#define STR WIAnalysis::STRIDED
#define RND WIAnalysis::RANDOM

// Printable dependency names; indexed directly by WIDependancy, so the order
// here must match the enum declaration order.
static const char* const dep_str[] = {
  "uniform_global",
  "uniform_workgroup",
  "uniform_thread",
  "consecu",
  "p_conse",
  "strided",
  "random "
};
83 
/// Dependency maps (define output dependency according to 2 input deps)
/// Each table is indexed as table[lhs_dep][rhs_dep] and yields the dependency
/// of the corresponding binary operation's result.

// Result dependency of lhs + rhs (also used for subtract-like add patterns).
static const WIAnalysis::WIDependancy
add_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, SEQ, PTR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, SEQ, PTR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, SEQ, PTR, STR, RND},
    /* SEQ */  {SEQ, SEQ, SEQ, STR, STR, STR, RND},
    /* PTR */  {PTR, PTR, PTR, STR, STR, STR, RND},
    /* STR */  {STR, STR, STR, STR, STR, STR, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
};

// Result dependency of lhs - rhs. Note e.g. SEQ - SEQ is RND (difference of two
// consecutive values need not follow any tracked pattern per this table).
static const WIAnalysis::WIDependancy
sub_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, RND, RND, RND},
    /* UWG */  {UWG, UWG, UTH, STR, RND, RND, RND},
    /* UTH */  {UTH, UTH, UTH, STR, RND, RND, RND},
    /* SEQ */  {SEQ, SEQ, SEQ, RND, RND, RND, RND},
    /* PTR */  {PTR, PTR, PTR, RND, RND, RND, RND},
    /* STR */  {STR, STR, STR, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
};


// Result dependency of lhs * rhs: uniform * consecutive scales the stride,
// hence STR; two non-uniform inputs are RND.
static const WIAnalysis::WIDependancy
mul_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, STR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, STR, STR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, STR, STR, STR, RND},
    /* SEQ */  {STR, STR, STR, RND, RND, RND, RND},
    /* PTR */  {STR, STR, STR, RND, RND, RND, RND},
    /* STR */  {STR, STR, STR, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
};

// select is to have a weaker dep of two
static const WIAnalysis::WIDependancy
select_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, STR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, STR, STR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, STR, STR, STR, RND},
    /* SEQ */  {STR, STR, STR, SEQ, STR, STR, RND},
    /* PTR */  {STR, STR, STR, STR, PTR, STR, RND},
    /* STR */  {STR, STR, STR, STR, STR, STR, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
};

// GEP result dependency: rows are the pointer operand, columns the index.
// Uniform pointer + consecutive index yields a pointer-consecutive result.
static const WIAnalysis::WIDependancy
gep_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /* ptr\index UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, PTR, RND, RND, RND},
    /* UWG */  {UWG, UWG, UTH, PTR, RND, RND, RND},
    /* UTH */  {UTH, UTH, UTH, PTR, RND, RND, RND},
    /* SEQ */  {RND, RND, RND, RND, RND, RND, RND},
    /* PTR */  {PTR, PTR, PTR, RND, RND, RND, RND},
    /* STR */  {RND, RND, RND, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
};

// For better readability, the rank of a dependency is used to compare two dependencies
// to see which of them is weaker or stronger.
//
//   Dependancy rank : an integer value for each Dependancy, starting from 0.
//   Property of rank: the lower (smaller) the rank, the stronger the dependancy.
//
// Currently, enum value of each dependency is used exactly as its rank.
inline int depRank(WIAnalysis::WIDependancy D) { return (int)D; }
156 
namespace IGC {
    /// @brief Given a conditional branch and its immediate post dominator,
    /// find its influence-region and partial joins within the influence region
    class BranchInfo
    {
    public:
        BranchInfo(const IGCLLVM::TerminatorInst* inst, const llvm::BasicBlock* ipd);

        void print(llvm::raw_ostream& OS) const;

        const IGCLLVM::TerminatorInst* cbr;   // the conditional branch (ctor's 'inst')
        const llvm::BasicBlock* full_join;    // immediate post-dominator (ctor's 'ipd')
        // Blocks whose execution is controlled by 'cbr'.
        llvm::DenseSet<llvm::BasicBlock*> influence_region;
        // Join points reached inside the influence region before 'full_join'.
        llvm::SmallPtrSet<llvm::BasicBlock*, 4> partial_joins;
        llvm::BasicBlock* fork_blk;           // presumably the block containing 'cbr' — confirm in ctor
    };
} // namespace IGC
174 
/// Pretty-print the analysis result for m_func: first the dependency of every
/// function argument, then each basic block (with its predecessors and
/// control-flow dependency) followed by the dependency of each instruction.
void WIAnalysisRunner::print(raw_ostream& OS, const Module*) const
{
    // Assign a stable numeric id to each basic block in function order, so
    // blocks can be referenced as "BB:<id>" throughout the listing.
    DenseMap<BasicBlock*, int> BBIDs;
    int id = 0;
    for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I, ++id) {
        BasicBlock* BB = &*I;
        BBIDs[BB] = id;
    }

    std::stringstream ss;
    ss << "WIAnalysis: " << m_func->getName().str();
    Banner(OS, ss.str());

    // Arguments: print recorded dependency, or "unknown" if never computed.
    OS << "Args: \n";
    for (Function::arg_iterator I = m_func->arg_begin(), E = m_func->arg_end();
        I != E; ++I) {
        Value* AVal = &*I;

        if (m_depMap.GetAttributeWithoutCreating(AVal) != m_depMap.end())
            OS << "    " << dep_str[m_depMap.GetAttributeWithoutCreating(AVal)] << " " << *AVal << "\n";
        else
            OS << "  unknown " << *AVal << "\n";

    }
    OS << "\n";

    for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I) {
        BasicBlock* BB = &*I;
        OS << "BB:" << BBIDs[BB];
        if (BB->hasName())
            OS << " " << BB->getName();
        OS << "       ; preds =";
        bool isFirst = true;
        for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
            BasicBlock* pred = *PI;
            OS << ((isFirst) ? " " : ", ") << "BB:" << BBIDs[pred] << "  ";
            if (pred->hasName())
                OS << pred->getName();
            isFirst = false;
        }
        {
            // Control-flow dependency of the block itself (see getCFDependency).
            auto dep = getCFDependency(BB);
            OS << "[ " << dep_str[dep] << " ]";
        }
        OS << "\n";
        for (BasicBlock::iterator it = BB->begin(), ie = BB->end(); it != ie; ++it) {
            Instruction* I = &*it;
            if (m_depMap.GetAttributeWithoutCreating(I) != m_depMap.end())
            {
                OS << "  " << dep_str[m_depMap.GetAttributeWithoutCreating(I)] << " " << *I;
            }
            else
            {
                OS << "  unknown " << *I;
            }
            // For terminators, also list successor block ids.
            if (I->isTerminator()) {
                IGCLLVM::TerminatorInst* TI = dyn_cast<IGCLLVM::TerminatorInst>(I);
                OS << " [";
                for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
                    BasicBlock* succ = TI->getSuccessor(i);
                    OS << " BB:" << BBIDs[succ];
                }
                OS << " ]";
            }
            OS << "\n";
        }
        OS << "\n";
    }
}
244 
lock_print()245 void WIAnalysisRunner::lock_print()
246 {
247     IGC::Debug::DumpLock();
248     {
249         int id = m_funcInvocationId[m_func]++;
250         std::stringstream ss;
251         ss << m_func->getName().str() << "_WIAnalysis_" << id;
252         auto name =
253             DumpName(IGC::Debug::GetShaderOutputName())
254             .Hash(m_CGCtx->hash)
255             .Type(m_CGCtx->type)
256             .Pass(ss.str().c_str())
257             .Extension("txt");
258         print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
259     }
260     IGC::Debug::DumpUnlock();
261 }
262 
263 // Used for dumpping into a file with a fixed name while running in debugger
dump() const264 void WIAnalysisRunner::dump() const
265 {
266     auto name =
267         DumpName(IGC::Debug::GetShaderOutputName())
268         .Hash(m_CGCtx->hash)
269         .Type(m_CGCtx->type)
270         .Pass("WIAnalysis")
271         .Extension("txt");
272     print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
273 }
274 
/// Bind the runner to a function and the analyses/metadata it consumes.
/// Pure state capture: no computation happens until run() is called.
void WIAnalysisRunner::init(
    llvm::Function* F,
    llvm::DominatorTree* DT,
    llvm::PostDominatorTree* PDT,
    IGC::IGCMD::MetaDataUtils* MDUtils,
    IGC::CodeGenContext* CGCtx,
    IGC::ModuleMetaData* ModMD,
    IGC::TranslationTable* TransTable)
{
    m_func = F;
    this->DT = DT;
    this->PDT = PDT;
    m_pMdUtils = MDUtils;
    m_CGCtx = CGCtx;
    m_ModMD = ModMD;
    m_TT = TransTable;
}
292 
/// Main driver: computes work-item dependency for every value in m_func.
/// Returns false always (analysis passes do not modify the IR).
bool WIAnalysisRunner::run()
{
    auto& F = *m_func;
    // Skip functions with no metadata entry (not a compiled kernel/function).
    if (m_pMdUtils->findFunctionsInfoItem(&F) == m_pMdUtils->end_FunctionsInfo())
        return false;

    m_depMap.Initialize(m_TT);
    m_TT->RegisterListener(&m_depMap);

    // Reset all per-run worklists and caches.
    m_changed1.clear();
    m_changed2.clear();
    m_pChangedNew = &m_changed1;
    m_pChangedOld = &m_changed2;
    m_ctrlBranches.clear();

    m_storeDepMap.clear();
    m_allocaDepMap.clear();
    m_forcedUniforms.clear();

    // Seed dependencies for the function arguments (kernel args, implicit
    // args, push-analysis args).
    updateArgsDependency(&F);

    if (!IGC_IS_FLAG_ENABLED(DisableUniformAnalysis))
    {
        // Compute the first iteration of the WI-dep according to ordering
        // instructions; this ordering is generally good (as it usually
        // correlates well with dominance).
        inst_iterator it = inst_begin(F);
        inst_iterator  e = inst_end(F);
        for (; it != e; ++it)
        {
            calculate_dep(&*it);
        }

        // Recursively check if WI-dep changes and if so recalculates
        // the WI-dep and marks the users for re-checking.
        // This procedure is guaranteed to converge since WI-dep can only
        // become less uniform (uniform->consecutive->ptr->stride->random).
        updateDeps();

        // sweep the dataflow started from those GenISA_vectorUniform,
        // force all the insert-elements and phi-nodes to uniform
        std::set<const Value*> visited;
        while (!m_forcedUniforms.empty())
        {
            const Value* V = m_forcedUniforms.back();
            m_forcedUniforms.pop_back();
            visited.insert(V);
            for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI)
            {
                const Value* use = (*UI);
                // Only propagate through same-typed insert-element/phi chains.
                if (!visited.count(use) && use->getType() == V->getType())
                {
                    if (auto INS = dyn_cast<InsertElementInst>(use))
                    {
                        if (!isUniform(use))
                            m_depMap.SetAttribute(INS, WIAnalysis::UNIFORM_THREAD);
                        m_forcedUniforms.push_back(use);
                    }
                    else if (auto PHI = dyn_cast<PHINode>(use))
                    {
                        if (!isUniform(use))
                            m_depMap.SetAttribute(PHI, WIAnalysis::UNIFORM_THREAD);
                        m_forcedUniforms.push_back(use);
                    }
                }
            }
        }
    }

    if (IGC_IS_FLAG_ENABLED(DumpWIA))
    {
        // Dump into a unique file under dump dir, per each function invocation.
        lock_print();
    }

    // Original print to stdout
    //   Need igc key PrintToConsole and llvm flag -print-wia-check
    if (PrintWiaCheck)
    {
        print(ods());
    }
    return false;
}
376 
runOnFunction(Function & F)377 bool WIAnalysis::runOnFunction(Function& F)
378 {
379     auto* MDUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
380     auto* DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
381     auto* PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
382     auto* CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
383     auto* ModMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
384     auto* pTT = &getAnalysis<TranslationTable>();
385 
386     Runner.init(&F, DT, PDT, MDUtils, CGCtx, ModMD, pTT);
387     return Runner.run();
388 }
389 
updateDeps()390 void WIAnalysisRunner::updateDeps()
391 {
392     // As lonst as we have values to update
393     while (!m_pChangedNew->empty())
394     {
395         // swap between changedSet pointers - recheck the newChanged(now old)
396         std::swap(m_pChangedNew, m_pChangedOld);
397         // clear the newChanged set so it will be filled with the users of
398         // instruction which their WI-dep canged during the current iteration
399         m_pChangedNew->clear();
400 
401         // update all changed values
402         std::vector<const Value*>::iterator it = m_pChangedOld->begin();
403         std::vector<const Value*>::iterator e = m_pChangedOld->end();
404         for (; it != e; ++it)
405         {
406             // remove first instruction
407             // calculate its new dependencey value
408             calculate_dep(*it);
409         }
410     }
411 }
412 
isInstructionSimple(const Instruction * inst)413 bool WIAnalysisRunner::isInstructionSimple(const Instruction* inst)
414 {
415     // avoid changing cb load to sampler load, since sampler load
416     // has longer latency.
417     if (isa<LoadInst>(inst))
418     {
419         return false;
420     }
421 
422     if (isa<UnaryInstruction>(inst) ||
423         isa<BinaryOperator>(inst) ||
424         isa<CmpInst>(inst) ||
425         isa<SelectInst>(inst))
426     {
427         return true;
428     }
429     if (IsMathIntrinsic(GetOpCode((Instruction*)inst)))
430     {
431         return true;
432     }
433 
434     return false;
435 }
436 
needToBeUniform(const Value * val)437 bool WIAnalysisRunner::needToBeUniform(const Value* val)
438 {
439     for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI)
440     {
441         if (const RTWritIntrinsic * use = dyn_cast<RTWritIntrinsic>(*UI))
442         {
443             if (use->getSampleIndex() == val || use->getBlendStateIndex() == val)
444             {
445                 return true;
446             }
447         }
448         // TODO add sampler cases
449     }
450     return false;
451 }
452 
allUsesRandom(const Value * val)453 bool WIAnalysisRunner::allUsesRandom(const Value* val)
454 {
455     for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI)
456     {
457         const Value* use = (*UI);
458         if (getDependency(use) != WIAnalysis::RANDOM)
459         {
460             return false;
461         }
462     }
463     return true;
464 }
465 
/// Seed the dependency of every function argument, walking the argument list
/// in its fixed layout order: explicit (user) args, then implicit args, then
/// push-analysis args.
void WIAnalysisRunner::updateArgsDependency(llvm::Function* pF)
{
    /*
    Function Signature: define void @kernel(
        [OCL function args...],
        [implicit args...],
        [push analysis args...])

    Example push analysis args:
        float %urb_read_0, float %urb_read_1, float %urb_read_2, float %urb_read_3, float %urb_read_4

    Metadata Generated:
    !igc.pushanalysis.wi.info = !{!3, !4, !5, !6, !7}

    !3 = metadata !{metadata !"urb_read_0", i32 0, i32 4}
    !4 = metadata !{metadata !"urb_read_1", i32 1, i32 4}
    !5 = metadata !{metadata !"urb_read_2", i32 2, i32 4}
    !6 = metadata !{metadata !"urb_read_3", i32 3, i32 4}
    !7 = metadata !{metadata !"urb_read_4", i32 4, i32 4}

    Assumption is that the order of metadata matches the order of arguments in function.
    */

    // For a subroutine, conservatively assume that all user provided arguments
    // are random. Note that all other functions are treated as kernels.
    // To enable subroutine for other FEs, we need to update this check.
    bool IsSubroutine = !isEntryFunc(m_pMdUtils, pF) || isNonEntryMultirateShader(pF);

    // Index of the first implicit argument = total args minus implicit args
    // minus (for kernels only) push-analysis args.
    ImplicitArgs implicitArgs(*pF, m_pMdUtils);
    int implicitArgStart = (unsigned)(pF->arg_size()
        - implicitArgs.size()
        - (IsSubroutine ? 0 : m_ModMD->pushInfo.pushAnalysisWIInfos.size()));
    IGC_ASSERT_MESSAGE(implicitArgStart >= 0, "Function arg size does not match meta data and push args.");

    llvm::Function::arg_iterator ai, ae;
    ai = pF->arg_begin();
    ae = pF->arg_end();

    // 1. add all kernel function args as uniform, or
    //    add all subroutine function args as random
    for (int i = 0; i < implicitArgStart; ++i, ++ai)
    {
        IGC_ASSERT(ai != ae);
        incUpdateDepend(&(*ai), IsSubroutine ? WIAnalysis::RANDOM : WIAnalysis::UNIFORM_GLOBAL);
    }

    // 2. add implicit args
    //    By default, local IDs are not uniform. But if we know that the runtime dispatchs
    //    order (intel_reqd_workgroup_walk_order()) and work group size (reqd_work_group_size()),
    //    we may derive that some of local IDs are uniform.
    bool localX_uniform = false, localY_uniform = false, localZ_uniform = false;
    // DispatchOCLWGInLinearOrder should be removed after testing the guarded code.
    if (!IsSubroutine &&
        IGC_IS_FLAG_ENABLED(DispatchOCLWGInLinearOrder))
    {
        checkLocalIdUniform(pF, localX_uniform, localY_uniform, localZ_uniform);
    }

    for (unsigned i = 0; i < implicitArgs.size(); ++i, ++ai)
    {
        IGC_ASSERT(ai != ae);
        // Implicit-arg table is indexed by the arg's position relative to
        // the first implicit argument.
        const ImplicitArg& iArg = implicitArgs[ai->getArgNo() - implicitArgStart];
        WIAnalysis::WIDependancy dependency = iArg.getDependency();
        if ((localX_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_X) ||
            (localY_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Y) ||
            (localZ_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Z)) {
            // todo: may improve it to have UNIFORM_WORKGROUP
            dependency = WIAnalysis::UNIFORM_THREAD;
        }

        incUpdateDepend(&(*ai), dependency);
    }

    // 3. add push analysis args
    if (!IsSubroutine)
    {
        for (unsigned i = 0; i < m_ModMD->pushInfo.pushAnalysisWIInfos.size(); ++i, ++ai)
        {
            IGC_ASSERT(ai != ae);
            // Dependency comes directly from the push-analysis metadata.
            WIAnalysis::WIDependancy dependency =
                static_cast<WIAnalysis::WIDependancy>(m_ModMD->pushInfo.pushAnalysisWIInfos[i].argDependency);
            incUpdateDepend(&(*ai), dependency);
        }
    }
}
551 
// ---------------------------------------------------------------------------
// Thin WIAnalysis pass wrappers: each one forwards directly to the embedded
// WIAnalysisRunner, which holds the actual implementation and state.
// ---------------------------------------------------------------------------

// Forward printing to the runner.
void WIAnalysis::print(
    llvm::raw_ostream& OS, const llvm::Module* M) const
{
    Runner.print(OS, M);
}

// Forward fixed-name debug dump to the runner.
void WIAnalysis::dump() const
{
    Runner.dump();
}

// Forward incremental dependency update to the runner.
void WIAnalysis::incUpdateDepend(const llvm::Value* val, WIDependancy dep)
{
    Runner.incUpdateDepend(val, dep);
}

// Forward dependency query to the runner.
WIAnalysis::WIDependancy WIAnalysis::whichDepend(const llvm::Value* val)
{
    return Runner.whichDepend(val);
}

// Forward uniformity (any scope) query to the runner.
bool WIAnalysis::isUniform(const Value* val) const
{
    return Runner.isUniform(val);
}

// Forward global-uniformity query to the runner.
bool WIAnalysis::isGlobalUniform(const Value* val)
{
    return Runner.isGlobalUniform(val);
}

// Forward workgroup-or-global uniformity query to the runner.
bool WIAnalysis::isWorkGroupOrGlobalUniform(const Value* val)
{
    return Runner.isWorkGroupOrGlobalUniform(val);
}

// Forward divergent-control-flow query to the runner.
bool WIAnalysis::insideDivergentCF(const Value* val) const
{
    return Runner.insideDivergentCF(val);
}

// Forward workgroup-scope divergent-control-flow query to the runner.
bool WIAnalysis::insideWorkgroupDivergentCF(const Value* val) const
{
    return Runner.insideWorkgroupDivergentCF(val);
}
597 
whichDepend(const Value * val) const598 WIAnalysis::WIDependancy WIAnalysisRunner::whichDepend(const Value* val) const
599 {
600     IGC_ASSERT_MESSAGE(m_pChangedNew->empty(), "set should be empty before query");
601     IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
602     if (isa<Constant>(val))
603     {
604         return WIAnalysis::UNIFORM_GLOBAL;
605     }
606     else if (isa<StaticConstantPatchIntrinsic>(val))
607     {
608         return WIAnalysis::UNIFORM_GLOBAL;
609     }
610     auto EL = m_depMap.GetAttributeWithoutCreating(val);
611     if (IGC_IS_FLAG_ENABLED(DisableUniformAnalysis))
612     {
613         if (EL == m_depMap.end())
614         {
615             return WIAnalysis::RANDOM;
616         }
617     }
618     IGC_ASSERT(EL != m_depMap.end());
619     return EL;
620 }
621 
isUniform(const Value * val) const622 bool WIAnalysisRunner::isUniform(const Value* val) const
623 {
624     if (!hasDependency(val))
625         return false;
626 
627     return WIAnalysis::isDepUniform(whichDepend(val));
628 }
629 
isWorkGroupOrGlobalUniform(const Value * val) const630 bool WIAnalysisRunner::isWorkGroupOrGlobalUniform(const Value* val) const
631 {
632     if (!hasDependency(val))
633         return false;
634     WIAnalysis::WIDependancy dep = whichDepend(val);
635     return dep == WIAnalysis::UNIFORM_GLOBAL ||
636            dep == WIAnalysis::UNIFORM_WORKGROUP;
637 }
638 
isGlobalUniform(const Value * val) const639 bool WIAnalysisRunner::isGlobalUniform(const Value* val) const
640 {
641     if (!hasDependency(val))
642         return false;
643     WIAnalysis::WIDependancy dep = whichDepend(val);
644     return dep == WIAnalysis::UNIFORM_GLOBAL;
645 }
646 
getCFDependency(const BasicBlock * BB) const647 WIAnalysis::WIDependancy WIAnalysisRunner::getCFDependency(const BasicBlock* BB) const
648 {
649     auto II = m_ctrlBranches.find(BB);
650     if (II == m_ctrlBranches.end())
651         return WIAnalysis::UNIFORM_GLOBAL;
652 
653     WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
654     for (auto* BI : II->second)
655     {
656         auto newDep = whichDepend(BI);
657         if (depRank(dep) < depRank(newDep))
658             dep = newDep;
659     }
660 
661     return dep;
662 }
663 
insideWorkgroupDivergentCF(const Value * val) const664 bool WIAnalysisRunner::insideWorkgroupDivergentCF(const Value* val) const
665 {
666     if (auto* I = dyn_cast<Instruction>(val))
667     {
668         auto dep = getCFDependency(I->getParent());
669         return depRank(dep) > WIAnalysis::UNIFORM_WORKGROUP;
670     }
671 
672     return false;
673 }
674 
getDependency(const Value * val)675 WIAnalysis::WIDependancy WIAnalysisRunner::getDependency(const Value* val)
676 {
677     if (m_depMap.GetAttributeWithoutCreating(val) == m_depMap.end())
678     {
679         // Make sure that constants are not added in the map.
680         if (!isa<Instruction>(val) && !isa<Argument>(val))
681         {
682             return WIAnalysis::UNIFORM_GLOBAL;
683         }
684         // Don't expect this happens, let's assertion fail
685         IGC_ASSERT_MESSAGE(0, "Dependence for 'val' should bave been set already!");
686     }
687     IGC_ASSERT(m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
688     return m_depMap.GetAttributeWithoutCreating(val);
689 }
690 
hasDependency(const Value * val) const691 bool WIAnalysisRunner::hasDependency(const Value* val) const
692 {
693 
694     if (!isa<Instruction>(val) && !isa<Argument>(val))
695     {
696         return true;
697     }
698     return (m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
699 }
700 
HasPhiUse(const llvm::Value * inst)701 static bool HasPhiUse(const llvm::Value* inst)
702 {
703     for (auto UI = inst->user_begin(), E = inst->user_end(); UI != E; ++UI)
704     {
705         if (llvm::isa<llvm::PHINode>(*UI))
706         {
707             return true;
708         }
709     }
710     return false;
711 }
712 
calculate_dep(const Value * val)713 void WIAnalysisRunner::calculate_dep(const Value* val)
714 {
715     IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
716 
717     // Not an instruction, must be a constant or an argument
718     // Could this vector type be of a constant which
719     // is not uniform ?
720     IGC_ASSERT_MESSAGE(isa<Instruction>(val), "Could we reach here with non instruction value?");
721 
722     const Instruction* const inst = dyn_cast<Instruction>(val);
723     IGC_ASSERT_MESSAGE(nullptr != inst, "This Value is not an Instruction");
724     if (inst)
725     {
726         bool hasOriginal = hasDependency(inst);
727         WIAnalysis::WIDependancy orig;
728         // We only calculate dependency on unset instructions if all their operands
729         // were already given dependency. This is good for compile time since these
730         // instructions will be visited again after the operands dependency is set.
731         // An exception are phi nodes since they can be the ancestor of themselves in
732         // the def-use chain. Note that in this case we force the phi to have the
733         // pre-header value already calculated.
734         //
735         // Another case is that an inst might be set under control dependence (for example, phi)
736         // before any of its operands have been set. In this case, we will skip here. Here
737         // is the example (derived from ocl scheduler):
738         //      B0:  (p) goto Bt
739         //      B1:  goto Bf
740         //  L   B2:  x.lcssa = phi (x.0, Bn)      // B2: partial join
741         //      ...
742         //      Bt: ...
743         //      ...
744         //      Bf:
745         //      ...
746         //          goto Bm (out of loop)
747         //      Bn:
748         //          x.0 = ...
749         //          goto  B2
750         //      Bm:  ...
751         //      ...
752         //      B_ipd  ( iPDOM(B0) = B_ipd)
753         //
754         // B0's branch instruction has random dependency, which triggers control dependence calculation.
755         // B2 is a partial join in InfluenceRegion. Thus its phi is marked as random, but its operand
756         // x.0 is still not set yet.
757         unsigned int unsetOpNum = 0;
758         for (unsigned i = 0; i < inst->getNumOperands(); ++i)
759         {
760             if (!hasDependency(inst->getOperand(i))) unsetOpNum++;
761         }
762         if (isa<PHINode>(inst))
763         {
764             // We do not calculate PhiNode with all incoming values unset.
765             //
766             // This seems right as we don't expect a phi that only depends upon other
767             // phi's (if it happens, those phis form a cycle dependency) so any phi's
768             // calculation will eventually be triggered from calculating a non-phi one
769             // which the phi depends upon.
770             if (unsetOpNum == inst->getNumOperands()) return;
771         }
772         else
773         {
774             // We do not calculate non-PhiNode instruction that have unset operands
775             if (unsetOpNum > 0) return;
776 
777             // We have all operands set. Check a special case from calculate_dep for
778             // binary ops (see the details below). It checks for ASHR+ADD and ASHR+SHL
779             // cases, and in particular it accesses dependency for ADD operands. It
780             // could happen these operands are not processed yet and in such case
781             // getDependency raises the assertion. Thus check if dependency is set.
782             // Currently we need to check dependency for ASHR->ADD operands only.
783             // For SHR, its operands are checked to be constant so skip this case.
784             // This code could be extended further depending on requirements.
785             if (inst->getOpcode() == Instruction::AShr)
786             {
787                 BinaryOperator* op0 = dyn_cast<BinaryOperator>(inst->getOperand(0));
788                 if (op0 && op0->getOpcode() == Instruction::Add &&
789                     !hasDependency(op0->getOperand(1)))
790                 {
791                     return;
792                 }
793             }
794         }
795 
796         if (!hasOriginal)
797         {
798             orig = WIAnalysis::UNIFORM_GLOBAL;
799         }
800         else
801         {
802             orig = m_depMap.GetAttributeWithoutCreating(inst);
803 
804             // if inst is already marked random, it cannot get better
805             if (orig == WIAnalysis::RANDOM)
806             {
807                 return;
808             }
809         }
810 
811         WIAnalysis::WIDependancy dep = orig;
812 
813         // LLVM does not have compile time polymorphisms
814         // TODO: to make things faster we may want to sort the list below according
815         // to the order of their probability of appearance.
816         if (const BinaryOperator* BI = dyn_cast<BinaryOperator>(inst))              dep = calculate_dep(BI);
817         else if (const CallInst* CI = dyn_cast<CallInst>(inst))                     dep = calculate_dep(CI);
818         else if (isa<CmpInst>(inst))                                                dep = calculate_dep_simple(inst);
819         else if (isa<ExtractElementInst>(inst))                                     dep = calculate_dep_simple(inst);
820         else if (const GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(inst))  dep = calculate_dep(GEP);
821         else if (isa<InsertElementInst>(inst))                                      dep = calculate_dep_simple(inst);
822         else if (isa<InsertValueInst>(inst))                                        dep = calculate_dep_simple(inst);
823         else if (const PHINode* Phi = dyn_cast<PHINode>(inst))                      dep = calculate_dep(Phi);
824         else if (isa<ShuffleVectorInst>(inst))                                      dep = calculate_dep_simple(inst);
825         else if (isa<StoreInst>(inst))                                              dep = calculate_dep_simple(inst);
826         else if (inst->isTerminator())                                              dep = calculate_dep_terminator(dyn_cast<IGCLLVM::TerminatorInst>(inst));
827         else if (const SelectInst* SI = dyn_cast<SelectInst>(inst))                 dep = calculate_dep(SI);
828         else if (const AllocaInst* AI = dyn_cast<AllocaInst>(inst))                 dep = calculate_dep(AI);
829         else if (const CastInst* CI = dyn_cast<CastInst>(inst))                     dep = calculate_dep(CI);
830         else if (isa<ExtractValueInst>(inst))                                       dep = calculate_dep_simple(inst);
831         else if (const LoadInst* LI = dyn_cast<LoadInst>(inst))                     dep = calculate_dep(LI);
832         else if (const VAArgInst* VAI = dyn_cast<VAArgInst>(inst))                  dep = calculate_dep(VAI);
833 #if LLVM_VERSION_MAJOR >= 10
834         else if (inst->getOpcode() == Instruction::FNeg)                            dep = calculate_dep_simple(inst);
835 #endif
836 
837         if (m_func->hasFnAttribute("KMPLOCK"))
838         {
839             dep = WIAnalysis::UNIFORM_THREAD;
840         }
841 
842         // If the value was changed in this calculation
843         if (!hasOriginal || dep != orig)
844         {
845             // i1 instructions used in phi cannot be uniform as it may prevent us from removing the phi of 1
846             if (inst->getType()->isIntegerTy(1) && WIAnalysis::isDepUniform(dep) && HasPhiUse(inst))
847             {
848                 dep = WIAnalysis::RANDOM;
849             }
850             // Update dependence of this instruction if dep is weaker than orig.
851             // Note depRank(orig) could be higher than depRank(dep) for phi.
852             // (Algo will never decrease the rank of a value.)
853             WIAnalysis::WIDependancy newDep = depRank(orig) < depRank(dep) ? dep : orig;
854             if (!hasOriginal || newDep != orig)
855             {
856                 // update only if it is a new dep
857                 updateDepMap(inst, newDep);
858             }
859             // divergent branch, trigger updates due to control-dependence
860             if (inst->isTerminator() && dep != WIAnalysis::UNIFORM_GLOBAL)
861             {
862                 update_cf_dep(dyn_cast<IGCLLVM::TerminatorInst>(inst));
863             }
864         }
865     }
866 }
867 
isRegionInvariant(const llvm::Instruction * defi,BranchInfo * brInfo,unsigned level)868 bool WIAnalysisRunner::isRegionInvariant(const llvm::Instruction* defi, BranchInfo* brInfo, unsigned level)
869 {
870     if (level >= 4)
871     {
872         return false;
873     }
874     if (isa<PHINode>(defi))
875     {
876         return false;
877     }
878     const unsigned nOps = defi->getNumOperands();
879     for (unsigned i = 0; i < nOps; ++i)
880     {
881         Value* op = defi->getOperand(i);
882         Instruction* srci = dyn_cast<Instruction>(op);
883         if (srci)
884         {
885             if (!brInfo->influence_region.count(srci->getParent()))
886             {
887                 // go on to check the next operand
888                 continue;
889             }
890             else if (!isRegionInvariant(srci, brInfo, level + 1))
891             {
892                 return false;
893             }
894         }
895     }
896     return true;
897 }
898 
// Propagate control-flow divergence from a divergent terminator 'inst' to the
// instructions and phi-nodes it control-depends on. Computes the branch's
// influence region and join points (BranchInfo), demotes phis at the joins,
// and demotes any value defined inside the region that is used at/after a
// join (needed because the IR is not assumed to be in LCSSA form).
void WIAnalysisRunner::update_cf_dep(const IGCLLVM::TerminatorInst* inst)
{
    IGC_ASSERT(hasDependency(inst));
    WIBaseClass::WIDependancy instDep = getDependency(inst);

    BasicBlock* blk = (BasicBlock*)(inst->getParent());
    BasicBlock* ipd = PDT->getNode(blk)->getIDom()->getBlock();
    // a branch can have NULL immediate post-dominator when a function
    // has multiple exits in llvm-ir
    // compute influence region and the partial-joins
    BranchInfo br_info(inst, ipd);
    // debug: dump influence region and partial-joins
    // br_info.print(ods());

    // check dep-type for every phi in the full join
    if (ipd)
    {
        updatePHIDepAtJoin(ipd, &br_info);
    }
    // check dep-type for every phi in the partial-joins
    for (SmallPtrSet<BasicBlock*, 4>::iterator join_it = br_info.partial_joins.begin(),
        join_e = br_info.partial_joins.end(); join_it != join_e; ++join_it)
    {
        updatePHIDepAtJoin(*join_it, &br_info);
    }

    // walk through all the instructions in the influence-region
    // update the dep-type based upon its uses
    DenseSet<BasicBlock*>::iterator blk_it = br_info.influence_region.begin();
    DenseSet<BasicBlock*>::iterator blk_e = br_info.influence_region.end();
    for (; blk_it != blk_e; ++blk_it)
    {
        BasicBlock* def_blk = *blk_it;
        // add the branch into the controlling-branch set of the block
        // if the block is in the influence-region
        IGC_ASSERT(def_blk != br_info.full_join);
        m_ctrlBranches[def_blk].insert(inst);

        for (BasicBlock::iterator I = def_blk->begin(), E = def_blk->end(); I != E; ++I)
        {
            Instruction* defi = &(*I);
            if (hasDependency(defi) && depRank(getDependency(defi)) >= depRank(instDep ))
            {
                // defi is already weaker than or equal to inst (br), do nothing.
                continue;
            }

            if (const auto* st = dyn_cast<StoreInst>(defi))
            {
                // If we encounter a store in divergent control flow,
                // we need to process the associated alloca (if any) again
                // because it might need to be RANDOM.
                auto it = m_storeDepMap.find(st);
                if (it != m_storeDepMap.end())
                    m_pChangedNew->push_back(it->second);
            }

            // This is an optimization that tries to detect instruction
            // not really affected by control-flow divergency because
            // all the sources are outside the region.
            // However this is only as good as we can get because we
            // only search limited depth
            if (isRegionInvariant(defi, &br_info, 0))
            {
                continue;
            }
            // We need to look at where the use is in order to decide
            // we should make def to be "random" when loop is not in
            // LCSSA form because we do not have LCSSA phi-nodes.
            // 1) if use is in the full-join
            // 2) if use is even outside the full-join
            // 3) if use is in partial-join but def is not in partial-join
            Value::use_iterator use_it = defi->use_begin();
            Value::use_iterator use_e = defi->use_end();
            for (; use_it != use_e; ++use_it)
            {
                Instruction* user = dyn_cast<Instruction>((*use_it).getUser());
                IGC_ASSERT(user);
                BasicBlock* user_blk = user->getParent();
                PHINode* phi = dyn_cast<PHINode>(user);
                if (phi)
                {
                    // another place we assume all critical edges have been
                    // split and phi-move will be placed on those splitters
                    user_blk = phi->getIncomingBlock(*use_it);
                }
                if (user_blk == def_blk)
                {
                    // local def-use, not related to control-dependence
                    continue; // skip
                }
                if (user_blk == br_info.full_join ||
                    !br_info.influence_region.count(user_blk) ||
                    (br_info.partial_joins.count(user_blk) &&
                     !br_info.partial_joins.count(def_blk))
                   )
                {
                    updateDepMap(defi, instDep);
                    // break out of the use loop
                    // since def is changed to RANDOM, all uses will be changed later
                    break;
                }
            } // end of usei loop
        } // end of defi loop within a block
    } // end of influence-region block loop
}
1005 
updatePHIDepAtJoin(BasicBlock * blk,BranchInfo * brInfo)1006 void WIAnalysisRunner::updatePHIDepAtJoin(BasicBlock* blk, BranchInfo* brInfo)
1007 {
1008     // This is to bring down PHI's dep to br's dep.
1009     // If PHI's dep is already weaker than br's dep, do nothing.
1010     IGC_ASSERT(hasDependency(brInfo->cbr));
1011     WIAnalysis::WIDependancy brDep = getDependency(brInfo->cbr);
1012 
1013     for (BasicBlock::iterator I = blk->begin(), E = blk->end(); I != E; ++I)
1014     {
1015         Instruction* defi = &(*I);
1016         PHINode* phi = dyn_cast<PHINode>(defi);
1017         if (!phi)
1018         {
1019             break;
1020         }
1021         if (hasDependency(phi) && depRank(getDependency(phi)) >= depRank(brDep))
1022         {
1023             // phi's dep is already the same or weaker, do nothing.
1024             continue;
1025         }
1026         Value* trickySrc = nullptr;
1027         for (unsigned predIdx = 0; predIdx < phi->getNumOperands(); ++predIdx)
1028         {
1029             Value* srcVal = phi->getOperand(predIdx);
1030             Instruction* defi = dyn_cast<Instruction>(srcVal);
1031             if (defi && brInfo->influence_region.count(defi->getParent()))
1032             {
1033                 updateDepMap(phi, brDep);
1034                 break;
1035             }
1036             else
1037             {
1038                 // if the src is an immed, or an argument, or defined outside,
1039                 // think about the phi-move that can be placed in the incoming block.
1040                 // this phi should be random if we have two different src-values like that.
1041                 // this is one place where we assume all critical edges have been split
1042                 BasicBlock* predBlk = phi->getIncomingBlock(predIdx);
1043                 if (brInfo->influence_region.count(predBlk))
1044                 {
1045                     if (!trickySrc)
1046                     {
1047                         trickySrc = srcVal;
1048                     }
1049                     else if (trickySrc != srcVal)
1050                     {
1051                         updateDepMap(phi, brDep);
1052                         break;
1053                     }
1054                 }
1055             }
1056         }
1057     }
1058 }
1059 
updateDepMap(const Instruction * inst,WIAnalysis::WIDependancy dep)1060 void WIAnalysisRunner::updateDepMap(const Instruction* inst, WIAnalysis::WIDependancy dep)
1061 {
1062     // Save the new value of this instruction
1063     m_depMap.SetAttribute(inst, dep);
1064     // Register for update all of the dependent values of this updated
1065     // instruction.
1066     Value::const_user_iterator it = inst->user_begin();
1067     Value::const_user_iterator e = inst->user_end();
1068     for (; it != e; ++it)
1069     {
1070         m_pChangedNew->push_back(*it);
1071     }
1072     if (const StoreInst * st = dyn_cast<StoreInst>(inst))
1073     {
1074         auto it = m_storeDepMap.find(st);
1075         if (it != m_storeDepMap.end())
1076         {
1077             m_pChangedNew->push_back(it->second);
1078         }
1079     }
1080 
1081     if (dep == WIAnalysis::RANDOM)
1082     {
1083         EOPCODE eopcode = GetOpCode((Instruction*)inst);
1084         if (eopcode == llvm_insert)
1085         {
1086             updateInsertElements((const InsertElementInst*)inst);
1087         }
1088     }
1089 }
1090 
1091 /// if one of insert-element is random, turn all the insert-elements into random
updateInsertElements(const InsertElementInst * inst)1092 void WIAnalysisRunner::updateInsertElements(const InsertElementInst* inst)
1093 {
1094     /// find the first one in the sequence
1095     InsertElementInst* curInst = (InsertElementInst*)inst;
1096     InsertElementInst* srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
1097     while (srcInst)
1098     {
1099         if (hasDependency(srcInst) && getDependency(srcInst) == WIAnalysis::RANDOM)
1100             return;
1101         curInst = srcInst;
1102         srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
1103     }
1104     if (curInst != inst)
1105     {
1106         m_depMap.SetAttribute(curInst, WIAnalysis::RANDOM);
1107         Value::user_iterator it = curInst->user_begin();
1108         Value::user_iterator e = curInst->user_end();
1109         for (; it != e; ++it)
1110         {
1111             m_pChangedNew->push_back(*it);
1112         }
1113     }
1114 }
1115 
calculate_dep_simple(const Instruction * I)1116 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_simple(const Instruction* I)
1117 {
1118     // simply check that all operands are uniform, if so return uniform, else random
1119     const unsigned nOps = I->getNumOperands();
1120     WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
1121     for (unsigned i = 0; i < nOps; ++i)
1122     {
1123         const Value* op = I->getOperand(i);
1124         WIAnalysis::WIDependancy D = getDependency(op);
1125         dep = add_conversion[dep][D];
1126         if (dep == WIAnalysis::RANDOM)
1127         {
1128             break;
1129         }
1130     }
1131     return dep;
1132 }
1133 
calculate_dep(const LoadInst * inst)1134 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const LoadInst* inst)
1135 {
1136     return calculate_dep_simple(inst);
1137 }
1138 
calculate_dep(const BinaryOperator * inst)1139 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(
1140     const BinaryOperator* inst)
1141 {
1142     // Calculate the dependency type for each of the operands
1143     Value* op0 = inst->getOperand(0);
1144     Value* op1 = inst->getOperand(1);
1145 
1146     WIAnalysis::WIDependancy dep0 = getDependency(op0);
1147     IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
1148     WIAnalysis::WIDependancy dep1 = getDependency(op1);
1149     IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
1150 
1151     // For whatever binary operation,
1152     // uniform returns uniform
1153     WIAnalysis::WIDependancy dep = select_conversion[dep0][dep1];
1154     if (WIAnalysis::isDepUniform(dep))
1155     {
1156         return dep;
1157     }
1158 
1159     // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
1160     // The pattern (and (X, C)), where C preserves the lower k bits of the value,
1161     // is often used for truncating of numbers in 64bit. We assume that the index
1162     // properties are not hurt by this.
1163     if (inst->getOpcode() == Instruction::And)
1164     {
1165         ConstantInt* C0 = dyn_cast<ConstantInt>(inst->getOperand(0));
1166         ConstantInt* C1 = dyn_cast<ConstantInt>(inst->getOperand(1));
1167         // Use any of the constants. Instcombine places constants on Op1
1168         // so try Op1 first.
1169         if (C1 || C0)
1170         {
1171             ConstantInt* C = C1 ? C1 : C0;
1172             dep = C1 ? dep0 : dep1;
1173             // Cannot look at bit pattern of huge integers.
1174             if (C->getBitWidth() < 65)
1175             {
1176                 uint64_t val = C->getZExtValue();
1177                 uint64_t ptr_mask = (1 << MinIndexBitwidthToPreserve) - 1;
1178                 // Zero all bits above the lower k bits that we are interested in
1179                 val &= (ptr_mask);
1180                 // Make sure that all of the remaining bits are active
1181                 if (val == ptr_mask)
1182                 {
1183                     return dep;
1184                 }
1185             }
1186         }
1187     }
1188 
1189     // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
1190     // The pattern (ashr (shl X, C)C) is used for truncating of numbers in 64bit
1191     // The constant C must leave at least 32bits of the original number
1192     if (inst->getOpcode() == Instruction::AShr)
1193     {
1194         BinaryOperator* SHL = dyn_cast<BinaryOperator>(inst->getOperand(0));
1195         // We also allow add of uniform value between the ashr and shl instructions
1196         // since instcombine creates this pattern when adding a constant.
1197         // The shl forces all low bits to be zero, so there can be no carry to the
1198         // high bits due to the addition. Addition with uniform preservs WI-dep.
1199         if (SHL && SHL->getOpcode() == Instruction::Add)
1200         {
1201             Value* addedVal = SHL->getOperand(1);
1202             if (WIAnalysis::isDepUniform(getDependency(addedVal)))
1203             {
1204                 SHL = dyn_cast<BinaryOperator>(SHL->getOperand(0));
1205             }
1206         }
1207 
1208         if (SHL && SHL->getOpcode() == Instruction::Shl)
1209         {
1210             ConstantInt* c_ashr = dyn_cast<ConstantInt>(inst->getOperand(1));
1211             ConstantInt* c_shl = dyn_cast<ConstantInt>(SHL->getOperand(1));
1212             const IntegerType* AshrTy = cast<IntegerType>(inst->getType());
1213             if (c_ashr && c_shl && c_ashr->getZExtValue() == c_shl->getZExtValue())
1214             {
1215                 // If wordWidth - shift_width >= 32 bits
1216                 if ((AshrTy->getBitWidth() - c_shl->getZExtValue()) >= MinIndexBitwidthToPreserve)
1217                 {
1218                     // return the dep of the original X
1219                     return getDependency(SHL->getOperand(0));
1220                 }
1221             }
1222         }
1223     }
1224 
1225     switch (inst->getOpcode())
1226     {
1227         // Addition simply adds the stride value, except for ptr_consecutive
1228         // which is promoted to strided.
1229         // Another exception is when we subtract the tid: 1 - X which turns the
1230         // tid order to random.
1231     case Instruction::Add:
1232     case Instruction::FAdd:
1233         return add_conversion[dep0][dep1];
1234     case Instruction::Sub:
1235     case Instruction::FSub:
1236         return sub_conversion[dep0][dep1];
1237 
1238     case Instruction::Mul:
1239     case Instruction::FMul:
1240     case Instruction::Shl:
1241         if (WIAnalysis::isDepUniform(dep0) || WIAnalysis::isDepUniform(dep1))
1242         {
1243             // If one of the sides is uniform, then we can adopt
1244             // the other side (stride*uniform is still stride).
1245             // stride size is K, where K is the uniform input.
1246             // An exception to this is ptr_consecutive, which is
1247             // promoted to strided.
1248             return mul_conversion[dep0][dep1];
1249         }
1250     default:
1251         //TODO: Support more arithmetic if needed
1252         return WIAnalysis::RANDOM;
1253     }
1254     return WIAnalysis::RANDOM;
1255 }
1256 
calculate_dep(const CallInst * inst)1257 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst)
1258 {
1259     // handle 3D specific intrinsics
1260     EOPCODE intrinsic_name = GetOpCode((Instruction*)(inst));
1261     GenISAIntrinsic::ID GII_id = GenISAIntrinsic::no_intrinsic;
1262     if (const GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(inst))
1263     {
1264         GII_id = GII->getIntrinsicID();
1265     }
1266 
1267     const llvm::IntrinsicInst* llvmintrin = dyn_cast<llvm::IntrinsicInst>(inst);
1268     if (llvmintrin != nullptr &&
1269         (llvmintrin->getIntrinsicID() == llvm::Intrinsic::stacksave ||
1270          llvmintrin->getIntrinsicID() == llvm::Intrinsic::stackrestore)) {
1271         return WIAnalysis::UNIFORM_THREAD;
1272     }
1273 
1274     if (IsMathIntrinsic(intrinsic_name) ||
1275         intrinsic_name == llvm_input ||
1276         intrinsic_name == llvm_sgv ||
1277         intrinsic_name == llvm_shaderinputvec ||
1278         intrinsic_name == llvm_getbufferptr ||
1279         intrinsic_name == llvm_runtimeValue ||
1280         intrinsic_name == llvm_getMessagePhaseX ||
1281         intrinsic_name == llvm_getMessagePhaseXV ||
1282         intrinsic_name == llvm_surfaceinfo ||
1283         intrinsic_name == llvm_simdSize ||
1284         intrinsic_name == llvm_resinfoptr ||
1285         intrinsic_name == llvm_sampleinfoptr ||
1286         intrinsic_name == llvm_ldrawvector_indexed ||
1287         intrinsic_name == llvm_ldraw_indexed ||
1288         intrinsic_name == llvm_cycleCounter ||
1289         intrinsic_name == llvm_waveShuffleIndex ||
1290         intrinsic_name == llvm_waveBallot ||
1291         intrinsic_name == llvm_waveAll ||
1292         intrinsic_name == llvm_waveClustered ||
1293         intrinsic_name == llvm_ld_ptr ||
1294         (IGC_IS_FLAG_DISABLED(DisableUniformTypedAccess) && intrinsic_name == llvm_typed_read) ||
1295         intrinsic_name == llvm_add_pair ||
1296         intrinsic_name == llvm_sub_pair ||
1297         intrinsic_name == llvm_mul_pair ||
1298         intrinsic_name == llvm_ptr_to_pair ||
1299         intrinsic_name == llvm_pair_to_ptr ||
1300         intrinsic_name == llvm_fma ||
1301         GII_id == GenISAIntrinsic::GenISA_uitof_rtz ||
1302         GII_id == GenISAIntrinsic::GenISA_ftobf ||
1303         GII_id == GenISAIntrinsic::GenISA_bftof ||
1304         GII_id == GenISAIntrinsic::GenISA_2fto2bf ||
1305         GII_id == GenISAIntrinsic::GenISA_dual_subslice_id ||
1306         GII_id == GenISAIntrinsic::GenISA_getSR0   ||
1307         GII_id == GenISAIntrinsic::GenISA_getSR0_0 ||
1308         GII_id == GenISAIntrinsic::GenISA_mul_rtz  ||
1309         GII_id == GenISAIntrinsic::GenISA_fma_rtz  ||
1310         GII_id == GenISAIntrinsic::GenISA_fma_rtp ||
1311         GII_id == GenISAIntrinsic::GenISA_fma_rtn ||
1312         GII_id == GenISAIntrinsic::GenISA_add_rtz  ||
1313         GII_id == GenISAIntrinsic::GenISA_slice_id ||
1314         GII_id == GenISAIntrinsic::GenISA_subslice_id  ||
1315         GII_id == GenISAIntrinsic::GenISA_dual_subslice_id ||
1316         GII_id == GenISAIntrinsic::GenISA_eu_id        ||
1317         GII_id == GenISAIntrinsic::GenISA_eu_thread_id ||
1318         GII_id == GenISAIntrinsic::GenISA_hw_thread_id ||
1319         GII_id == GenISAIntrinsic::GenISA_hw_thread_id_alloca ||
1320         GII_id == GenISAIntrinsic::GenISA_StackAlloca ||
1321         GII_id == GenISAIntrinsic::GenISA_vectorUniform ||
1322         GII_id == GenISAIntrinsic::GenISA_getR0 ||
1323         GII_id == GenISAIntrinsic::GenISA_getPayloadHeader ||
1324         GII_id == GenISAIntrinsic::GenISA_getWorkDim ||
1325         GII_id == GenISAIntrinsic::GenISA_getNumWorkGroups ||
1326         GII_id == GenISAIntrinsic::GenISA_getLocalSize ||
1327         GII_id == GenISAIntrinsic::GenISA_getGlobalSize ||
1328         GII_id == GenISAIntrinsic::GenISA_getEnqueuedLocalSize ||
1329         GII_id == GenISAIntrinsic::GenISA_getLocalID_X ||
1330         GII_id == GenISAIntrinsic::GenISA_getLocalID_Y ||
1331         GII_id == GenISAIntrinsic::GenISA_getLocalID_Z ||
1332         GII_id == GenISAIntrinsic::GenISA_getPrivateBase ||
1333         GII_id == GenISAIntrinsic::GenISA_getPrintfBuffer ||
1334         GII_id == GenISAIntrinsic::GenISA_getStageInGridOrigin ||
1335         GII_id == GenISAIntrinsic::GenISA_getStageInGridSize ||
1336         GII_id == GenISAIntrinsic::GenISA_getSyncBuffer ||
1337         GII_id == GenISAIntrinsic::GenISA_GetImplicitBufferPtr ||
1338         GII_id == GenISAIntrinsic::GenISA_GetLocalIdBufferPtr)
1339     {
1340         switch (GII_id)
1341         {
1342         default:
1343             break;
1344         case GenISAIntrinsic::GenISA_vectorUniform:
1345             // collect the seeds for forcing uniform vectors
1346             m_forcedUniforms.push_back(inst);
1347             return WIAnalysis::UNIFORM_THREAD;
1348         case GenISAIntrinsic::GenISA_getSR0:
1349         case GenISAIntrinsic::GenISA_getSR0_0:
1350         case GenISAIntrinsic::GenISA_eu_id:
1351         case GenISAIntrinsic::GenISA_hw_thread_id:
1352             return WIAnalysis::UNIFORM_THREAD;
1353         case GenISAIntrinsic::GenISA_slice_id:
1354         case GenISAIntrinsic::GenISA_subslice_id:
1355         case GenISAIntrinsic::GenISA_dual_subslice_id:
1356             // Make sure they are UNIFORM_WORKGROUP
1357             //return WIAnalysis::UNIFORM_WORKGROUP;
1358             return WIAnalysis::UNIFORM_THREAD;
1359         case GenISAIntrinsic::GenISA_GetImplicitBufferPtr:
1360         case GenISAIntrinsic::GenISA_GetLocalIdBufferPtr:
1361             return WIAnalysis::UNIFORM_THREAD;
1362         case GenISAIntrinsic::GenISA_getR0:
1363         case GenISAIntrinsic::GenISA_getPayloadHeader:
1364         case GenISAIntrinsic::GenISA_getWorkDim:
1365         case GenISAIntrinsic::GenISA_getNumWorkGroups:
1366         case GenISAIntrinsic::GenISA_getLocalSize:
1367         case GenISAIntrinsic::GenISA_getGlobalSize:
1368         case GenISAIntrinsic::GenISA_getEnqueuedLocalSize:
1369         case GenISAIntrinsic::GenISA_getLocalID_X:
1370         case GenISAIntrinsic::GenISA_getLocalID_Y:
1371         case GenISAIntrinsic::GenISA_getLocalID_Z:
1372         case GenISAIntrinsic::GenISA_getPrivateBase:
1373         case GenISAIntrinsic::GenISA_getPrintfBuffer:
1374         case GenISAIntrinsic::GenISA_getStageInGridOrigin:
1375         case GenISAIntrinsic::GenISA_getStageInGridSize:
1376         case GenISAIntrinsic::GenISA_getSyncBuffer:
1377             return ImplicitArgs::getArgDep(GII_id);
1378         }
1379 
1380         if (intrinsic_name == llvm_input ||
1381             intrinsic_name == llvm_shaderinputvec)
1382         {
1383             e_interpolation mode = (e_interpolation)cast<ConstantInt>(inst->getOperand(1))->getZExtValue();
1384             if (mode != EINTERPOLATION_CONSTANT
1385                 )
1386             {
1387                 return WIAnalysis::RANDOM;
1388             }
1389         }
1390 
1391 
1392         if (intrinsic_name == llvm_sgv)
1393         {
1394             SGVUsage usage = (SGVUsage)cast<ConstantInt>(inst->getOperand(0))->getZExtValue();
1395             if ((usage != VFACE
1396                 ) &&
1397                 usage != ACTUAL_COARSE_SIZE_X &&
1398                 usage != ACTUAL_COARSE_SIZE_Y &&
1399                 usage != THREAD_GROUP_ID_X &&
1400                 usage != THREAD_GROUP_ID_Y &&
1401                 usage != THREAD_GROUP_ID_Z &&
1402                 usage != XP0 &&
1403                 usage != XP1 &&
1404                 usage != XP2
1405                 )
1406             {
1407                 return WIAnalysis::RANDOM;
1408             }
1409         }
1410         if (intrinsic_name == llvm_getMessagePhaseX ||
1411             intrinsic_name == llvm_getMessagePhaseXV)
1412         {
1413             return WIAnalysis::UNIFORM_THREAD;
1414         }
1415 
1416         if (intrinsic_name == llvm_waveShuffleIndex)
1417         {
1418             Value* op0 = inst->getArgOperand(0);
1419             Value* op1 = inst->getArgOperand(1);
1420             WIAnalysis::WIDependancy dep0 = getDependency(op0);
1421             IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
1422             WIAnalysis::WIDependancy dep1 = getDependency(op1);
1423             IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
1424             bool isUniform0 = WIAnalysis::isDepUniform(dep0);
1425             bool isUniform1 = WIAnalysis::isDepUniform(dep1);
1426             if ((isUniform0 && isUniform1) || (!isUniform0 && !isUniform1))
1427             {
1428                 // Select worse one
1429                 return select_conversion[dep0][dep1];
1430             }
1431             else
1432             {
1433                 // Select uniform one if only one is uniform
1434                 return isUniform0 ? dep0 : dep1;
1435             }
1436         }
1437 
1438         if (GII_id == GenISAIntrinsic::GenISA_StackAlloca)
1439         {
1440             WIAnalysis::WIDependancy dep0 = WIAnalysis::UNIFORM_THREAD;
1441             Value* op0 = inst->getArgOperand(0);
1442             WIAnalysis::WIDependancy dep1 = getDependency(op0);
1443             // Select worse one
1444             return select_conversion[dep0][dep1];
1445         }
1446 
1447         if (intrinsic_name == llvm_waveBallot || intrinsic_name == llvm_waveAll)
1448         {
1449             return WIAnalysis::UNIFORM_THREAD;
1450         }
1451 
1452         if (intrinsic_name == llvm_waveClustered)
1453         {
1454             const unsigned clusterSize = static_cast<unsigned>(
1455                 cast<llvm::ConstantInt>(inst->getArgOperand(2))->getZExtValue());
1456 
1457             constexpr unsigned maxSimdSize = 32;
1458             if (clusterSize == maxSimdSize)
1459             {
1460                 // TODO: do the same for SIMD8 and SIMD16 if possible.
1461                 return WIAnalysis::UNIFORM_THREAD;
1462             }
1463             else
1464             {
1465                 return WIAnalysis::RANDOM;
1466             }
1467         }
1468 
1469 
1470         // Iterate over all input dependencies. If all are uniform - propagate it.
1471         // otherwise - return RANDOM
1472         unsigned numParams = inst->getNumArgOperands();
1473         WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
1474         for (unsigned i = 0; i < numParams; ++i)
1475         {
1476             Value* op = inst->getArgOperand(i);
1477             WIAnalysis::WIDependancy tdep = getDependency(op);
1478             dep = select_conversion[dep][tdep];
1479             if (dep == WIAnalysis::RANDOM)
1480             {
1481                 break; // Uniformity check failed. no need to continue
1482             }
1483         }
1484         return dep;
1485     }
1486     return WIAnalysis::RANDOM;
1487 }
1488 
calculate_dep(const GetElementPtrInst * inst)1489 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(
1490     const GetElementPtrInst* inst)
1491 {
1492     const Value* opPtr = inst->getOperand(0);
1493     WIAnalysis::WIDependancy dep = getDependency(opPtr);
1494     // running over the all indices arguments except for the last
1495     // here we assume the pointer is the first operand
1496     unsigned num = inst->getNumIndices();
1497     for (unsigned i = 1; i < num; ++i)
1498     {
1499         const Value* op = inst->getOperand(i);
1500         WIAnalysis::WIDependancy tdep = getDependency(op);
1501         dep = select_conversion[dep][tdep];
1502         if (!WIAnalysis::isDepUniform(dep))
1503         {
1504             return WIAnalysis::RANDOM;
1505         }
1506     }
1507     const Value* lastInd = inst->getOperand(num);
1508     WIAnalysis::WIDependancy lastIndDep = getDependency(lastInd);
1509     return gep_conversion[dep][lastIndDep];
1510 }
1511 
calculate_dep(const PHINode * inst)1512 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const PHINode* inst)
1513 {
1514     unsigned num = inst->getNumIncomingValues();
1515     bool foundFirst = 0;
1516     WIAnalysis::WIDependancy totalDep = WIAnalysis::WIDependancy::INVALID;
1517 
1518     for (unsigned i = 0; i < num; ++i)
1519     {
1520         Value* op = inst->getIncomingValue(i);
1521         if (hasDependency(op))
1522         {
1523             if (!foundFirst)
1524             {
1525                 totalDep = getDependency(op);
1526             }
1527             else
1528             {
1529                 totalDep = select_conversion[totalDep][getDependency(op)];
1530             }
1531             foundFirst = 1;
1532         }
1533     }
1534 
1535     IGC_ASSERT_MESSAGE(foundFirst, "We should not reach here with All incoming values are unset");
1536 
1537     return totalDep;
1538 }
1539 
calculate_dep_terminator(const IGCLLVM::TerminatorInst * inst)1540 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_terminator(
1541     const IGCLLVM::TerminatorInst* inst)
1542 {
1543     // Instruction has no return value
1544     // Just need to know if this inst is uniform or not
1545     // because we may want to avoid predication if the control flows
1546     // in the function are uniform...
1547     switch (inst->getOpcode())
1548     {
1549     case Instruction::Br:
1550     {
1551         const BranchInst* brInst = cast<BranchInst>(inst);
1552         if (brInst->isConditional())
1553         {
1554             // Conditional branch is uniform, if its condition is uniform
1555             Value* op = brInst->getCondition();
1556             WIAnalysis::WIDependancy dep = getDependency(op);
1557             if (WIAnalysis::isDepUniform(dep))
1558             {
1559                 return dep;
1560             }
1561             return WIAnalysis::RANDOM;
1562         }
1563         // Unconditional branch is non TID-dependent
1564         return WIAnalysis::UNIFORM_GLOBAL;
1565     }
1566     //Return instructions are unconditional
1567     case Instruction::Ret:
1568         return WIAnalysis::UNIFORM_GLOBAL;
1569     case Instruction::Unreachable:
1570         return WIAnalysis::UNIFORM_GLOBAL;
1571     case Instruction::IndirectBr:
1572         return WIAnalysis::RANDOM;
1573         // TODO: Define the dependency requirements of indirectBr
1574     case Instruction::Switch:
1575         return WIAnalysis::RANDOM;
1576         // TODO: Should this depend only on the condition, like branch?
1577     default:
1578         return WIAnalysis::RANDOM;
1579     }
1580 }
1581 
calculate_dep(const SelectInst * inst)1582 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const SelectInst* inst)
1583 {
1584     Value* op0 = inst->getOperand(0); // mask
1585     WIAnalysis::WIDependancy dep0 = getDependency(op0);
1586     if (WIAnalysis::isDepUniform(dep0))
1587     {
1588         Value* op1 = inst->getOperand(1);
1589         Value* op2 = inst->getOperand(2);
1590         WIAnalysis::WIDependancy dep1 = getDependency(op1);
1591         WIAnalysis::WIDependancy dep2 = getDependency(op2);
1592         // In case of constant scalar select we can choose according to the mask.
1593         if (ConstantInt * C = dyn_cast<ConstantInt>(op0))
1594         {
1595             uint64_t val = C->getZExtValue();
1596             if (val) return dep1;
1597             else return dep2;
1598         }
1599         // Select the "weaker" dep, but if only one dep is ptr_consecutive,
1600         // it must be promoted to strided ( as this data may
1601         // propagate to Load/Store instructions.
1602         WIAnalysis::WIDependancy tDep = select_conversion[dep1][dep2];
1603         return select_conversion[dep0][tDep];
1604     }
1605     // In case the mask is non-uniform the select outcome can be a combination
1606     // so we don't know nothing about it.
1607     return WIAnalysis::RANDOM;
1608 }
1609 
TrackAllocaDep(const Value * I,AllocaDep & dep)1610 bool WIAnalysisRunner::TrackAllocaDep(const Value* I, AllocaDep& dep)
1611 {
1612     bool trackable = true;
1613     for (Value::const_user_iterator use_it = I->user_begin(), use_e = I->user_end(); use_it != use_e; ++use_it)
1614     {
1615         if (const GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(*use_it))
1616         {
1617             trackable &= TrackAllocaDep(gep, dep);
1618         }
1619         else if (const llvm::LoadInst * pLoad = llvm::dyn_cast<llvm::LoadInst>(*use_it))
1620         {
1621             trackable &= (pLoad->isSimple());
1622         }
1623         else if (const llvm::StoreInst * pStore = llvm::dyn_cast<llvm::StoreInst>(*use_it))
1624         {
1625             trackable &= (pStore->isSimple());
1626             // Not supported case: GEP instruction is the stored value of the StoreInst
1627             trackable &= (pStore->getValueOperand() != I);
1628             dep.stores.push_back(pStore);
1629         }
1630         else if (const llvm::BitCastInst * pBitCast = llvm::dyn_cast<llvm::BitCastInst>(*use_it))
1631         {
1632             trackable &= TrackAllocaDep(pBitCast, dep);
1633         }
1634         else if (const llvm::AddrSpaceCastInst* pAddrCast = llvm::dyn_cast<llvm::AddrSpaceCastInst>(*use_it))
1635         {
1636             trackable &= TrackAllocaDep(pAddrCast, dep);
1637         }
1638         else if (const GenIntrinsicInst* intr = dyn_cast<GenIntrinsicInst>(*use_it))
1639         {
1640             GenISAIntrinsic::ID IID = intr->getIntrinsicID();
1641             if (IID == GenISAIntrinsic::GenISA_assume_uniform)
1642             {
1643                 dep.assume_uniform = true;
1644             }
1645             else
1646                 trackable = false;
1647         }
1648         else if (const IntrinsicInst * intr = dyn_cast<IntrinsicInst>(*use_it))
1649         {
1650             llvm::Intrinsic::ID  IID = intr->getIntrinsicID();
1651             if (IID != llvm::Intrinsic::lifetime_start &&
1652                 IID != llvm::Intrinsic::lifetime_end)
1653             {
1654                 trackable = false;
1655             }
1656             else if (IID == llvm::Intrinsic::lifetime_start)
1657               dep.lifetimes.push_back(intr);
1658         }
1659         else
1660         {
1661             // This is some other instruction. Right now we don't want to handle these
1662             trackable = false;
1663         }
1664     }
1665     return trackable;
1666 }
1667 
1668 
// Compute the work-item dependency of an alloca by inspecting all of its
// uses (via TrackAllocaDep). The alloca is UNIFORM_THREAD only when every
// tracked store to it is uniform and no divergent branch controls any of
// those stores; otherwise it is RANDOM.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const AllocaInst* inst)
{
    if (m_CGCtx->platform.getWATable().WaNoA32ByteScatteredStatelessMessages)
    {
        // avoid generating A32 byte scatter on platforms not supporting it
        return WIAnalysis::RANDOM;
    }
    if (!hasDependency(inst))
    {
        // First visit: analyze and cache the alloca's use pattern.
        AllocaDep dep;
        dep.assume_uniform = false;
        bool trackable = TrackAllocaDep(inst, dep);

        if ( trackable || dep.assume_uniform)
        {
            // Record the alloca, and map each store back to it so a later
            // change in a store's dependency can re-trigger this
            // alloca's evaluation.
            m_allocaDepMap.insert(std::make_pair(inst, dep));
            for (auto it : dep.stores)
            {
                m_storeDepMap.insert(std::make_pair(&(*it), inst));
            }
        }
    }
    auto depIt = m_allocaDepMap.find(inst);
    if (depIt == m_allocaDepMap.end())
    {
        // If we haven't been able to track the dependency of the alloca make it random
        return WIAnalysis::RANDOM;
    }
    // find assume-uniform
    if (depIt->second.assume_uniform)
    {
        return WIAnalysis::UNIFORM_THREAD;
    }
    // find the common dominator block among all the life-time starts
    // that can be considered as the nearest logical location for alloca.
    const BasicBlock* CommonDomBB = nullptr;
    for (auto *SI : depIt->second.lifetimes)
    {
        auto BB = SI->getParent();
        IGC_ASSERT(BB);
        if (!CommonDomBB)
            CommonDomBB = BB;
        else
            CommonDomBB = DT->findNearestCommonDominator(CommonDomBB, BB);
    }
    if (!CommonDomBB)
    {
        // No lifetime markers: fall back to the alloca's own block.
        CommonDomBB = inst->getParent();
    }
    // if any store is not uniform, then alloca is not uniform
    // if any store is affected by a divergent branch after alloca,
    // then alloca is also not uniform
    for (auto *SI : depIt->second.stores)
    {
        if (hasDependency(SI))
        {
            if (!WIAnalysis::isDepUniform(getDependency(SI)))
            {
                return WIAnalysis::RANDOM;
            }

            // The store's block lies in the influence region of one or
            // more branches; combine their dependencies.
            if (auto I = m_ctrlBranches.find(SI->getParent());
                I != m_ctrlBranches.end())
            {
                auto& Branches = I->second;
                WIAnalysis::WIDependancy cntrDep = WIAnalysis::UNIFORM_GLOBAL;
                for (auto* BrI : Branches)
                {
                    // exclude those branches that dominates alloca
                    if (!DT->dominates(BrI, CommonDomBB))
                    {
                        // select a weaker one
                        IGC_ASSERT(hasDependency(BrI));
                        cntrDep = select_conversion[cntrDep][getDependency(BrI)];
                        if (cntrDep == WIAnalysis::RANDOM)
                            break;
                    }
                }
                if (cntrDep == WIAnalysis::RANDOM)
                    return WIAnalysis::RANDOM;
            }
        }
    }

    return WIAnalysis::UNIFORM_THREAD;
}
1755 
calculate_dep(const CastInst * inst)1756 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CastInst* inst)
1757 {
1758     Value* op0 = inst->getOperand(0);
1759     WIAnalysis::WIDependancy dep0 = getDependency(op0);
1760 
1761     // independent remains independent
1762     if (WIAnalysis::isDepUniform(dep0)) return dep0;
1763 
1764     switch (inst->getOpcode())
1765     {
1766     case Instruction::SExt:
1767     case Instruction::FPTrunc:
1768     case Instruction::FPExt:
1769     case Instruction::PtrToInt:
1770     case Instruction::IntToPtr:
1771     case Instruction::AddrSpaceCast:
1772     case Instruction::UIToFP:
1773     case Instruction::FPToUI:
1774     case Instruction::FPToSI:
1775     case Instruction::SIToFP:
1776         return dep0;
1777     case Instruction::BitCast:
1778     case Instruction::ZExt:
1779         return WIAnalysis::RANDOM;
1780         // FIXME:: assumes that the value does not cross the +/- border - risky !!!!
1781     case Instruction::Trunc: {
1782         const Type* destType = inst->getDestTy();
1783         const IntegerType* intType = dyn_cast<IntegerType>(destType);
1784         if (intType && (intType->getBitWidth() >= MinIndexBitwidthToPreserve))
1785         {
1786             return dep0;
1787         }
1788         return WIAnalysis::RANDOM;
1789     }
1790     default:
1791         IGC_ASSERT_MESSAGE(0, "no such opcode");
1792         // never get here
1793         return WIAnalysis::RANDOM;
1794     }
1795 }
1796 
calculate_dep(const VAArgInst * inst)1797 WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const VAArgInst* inst)
1798 {
1799     IGC_ASSERT_MESSAGE(0, "Are we supporting this ??");
1800     return WIAnalysis::RANDOM;
1801 }
1802 
// Set IsLxUniform/IsLyUniform/IsLzUniform to true if they are uniform;
// do nothing otherwise.
void WIAnalysisRunner::checkLocalIdUniform(
    Function* F,
    bool& IsLxUniform,
    bool& IsLyUniform,
    bool& IsLzUniform)
{
    // Only OpenCL kernels carry the metadata (walk order, sub-group size,
    // thread-group size) this check relies on.
    if (m_CGCtx->type != ShaderType::OPENCL_SHADER)
    {
        return;
    }

    FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(F);
    ModuleMetaData* modMD = m_CGCtx->getModuleMetaData();
    auto funcMD = modMD->FuncMD.find(F);

    // Work-group walk order; -1 means "unknown".
    int32_t WO_0 = -1, WO_1 = -1, WO_2 = -1;
    if (funcMD != modMD->FuncMD.end())
    {
        WorkGroupWalkOrderMD workGroupWalkOrder = funcMD->second.workGroupWalkOrder;
        // An all-zero walk order is treated as "not specified".
        if (workGroupWalkOrder.dim0 || workGroupWalkOrder.dim1 || workGroupWalkOrder.dim2)
        {
            WO_0 = workGroupWalkOrder.dim0;
            WO_1 = workGroupWalkOrder.dim1;
            WO_2 = workGroupWalkOrder.dim2;
        }
    }

    uint32_t simdSize = 0;
    SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize();
    if (subGroupSize->hasValue())
    {
        simdSize = (uint32_t)subGroupSize->getSIMD_size();
    }
    // Unknown or sub-8 SIMD size: conservatively assume the widest (32),
    // so the divisibility checks below only succeed when they would hold
    // for any dispatch width.
    simdSize = simdSize >= 8 ? simdSize : 32;

    // Thread-group dimensions; -1 means "unknown". NOTE(review): in the
    // checks below the signed X/Y/Z are mixed with the unsigned simdSize,
    // which makes the divisibility tests fail for -1 — presumably
    // intentional; confirm.
    int32_t X = -1, Y = -1, Z = -1;
    ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
    if (threadGroupSize->hasValue())
    {
        X = (int32_t)threadGroupSize->getXDim();
        Y = (int32_t)threadGroupSize->getYDim();
        Z = (int32_t)threadGroupSize->getZDim();

    }

    // If the first-walked dimension's extent is a multiple of the SIMD
    // size, the other two local-id channels do not vary within a thread.
    if (WO_0 == 0 && ((X / simdSize) * simdSize) == X)
    {
        // each thread will have Y and Z unchanged.
        IsLyUniform = true;
        IsLzUniform = true;
    }
    else if (WO_0 == 1 && ((Y / simdSize) * simdSize) == Y)
    {
        // each thread will have X and Z unchanged.
        IsLxUniform = true;
        IsLzUniform = true;
    }
    else if (WO_0 == 2 && ((Z / simdSize) * simdSize) == Z)
    {
        // each thread will have X and Y unchanged.
        IsLxUniform = true;
        IsLyUniform = true;
    }

    // A dimension of extent 1 is trivially uniform.
    if (X == 1)
    {
        IsLxUniform = true;
    }
    if (Y == 1)
    {
        IsLyUniform = true;
    }
    if (Z == 1)
    {
        IsLzUniform = true;
    }

    // Additional cases when dispatch is known to be in linear (x,y,z)
    // order, either forced by flag or specified by the walk order.
    if (IGC_IS_FLAG_ENABLED(DispatchOCLWGInLinearOrder) ||
        (WO_0 == 0 && WO_1 == 1 && WO_2 == 2))
    {
        // linear order dispatch
        uint32_t XxY = X * Y;
        if (X > 0 && (X % simdSize) == 0)
        {
            // X is multiple of simdSize
            IsLyUniform = true;
            IsLzUniform = true;
        }
        else if (X > 0 && Y > 0 && (XxY % simdSize) == 0)
        {
            // X*Y is multiple of simdSize
            IsLzUniform = true;
        }
    }
}
1900 
// Build the influence region and partial-join set for a divergent branch.
// 'inst' is the divergent terminator, 'ipd' its full join (where all
// paths reconverge); traversal below never crosses the full join.
BranchInfo::BranchInfo(const IGCLLVM::TerminatorInst* inst, const BasicBlock* ipd)
    : cbr(inst),
    full_join(ipd)
{
    const BasicBlock* fork_blk = inst->getParent();
    IGC_ASSERT_MESSAGE(cbr == fork_blk->getTerminator(), "block terminator mismatch");

    if (cbr->getNumSuccessors() != 2) {
        // General case (e.g. switch): flood-fill from each successor up to
        // the full join. Every visited block belongs to the influence
        // region; a block reachable from more than one successor is a
        // partial join.
        std::set<const BasicBlock*> Reached;
        for (auto SI = succ_begin(fork_blk),
            SE = succ_end(fork_blk); SI != SE; ++SI) {
            auto Succ = *SI;
            if (Succ == full_join)
                continue;
            std::set<const BasicBlock*> Visited;
            std::stack<const BasicBlock*> WorkSet;
            WorkSet.push(Succ);
            while (!WorkSet.empty()) {
                const BasicBlock* BB = WorkSet.top();
                WorkSet.pop();
                Visited.insert(BB);
                influence_region.insert(const_cast<BasicBlock*>(BB));
                // Seen from an earlier successor too => partial join.
                if (Reached.count(BB))
                    partial_joins.insert(const_cast<BasicBlock*>(BB));
                for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
                    auto SBB = *I;
                    if (SBB != full_join && !Visited.count(SBB))
                        WorkSet.push(SBB);
                }
            }
            // Merge Visited into Reached.
            for (auto BB : Visited)
                Reached.insert(BB);
        }
    }
    else {
        // Two-successor branch: walk the first side into f_set, then the
        // second side into t_set; any block seen on both sides is a
        // partial join.
        std::set<BasicBlock*> f_set, t_set;
        std::stack<BasicBlock*> work_set;
        if (cbr->getSuccessor(0) != full_join)
        {
            work_set.push(cbr->getSuccessor(0));
            while (!work_set.empty())
            {
                BasicBlock* cur_blk = work_set.top();
                work_set.pop();
                f_set.insert(cur_blk);
                influence_region.insert(cur_blk);
                for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); SI != E; ++SI)
                {
                    BasicBlock* succ_blk = (*SI);
                    if (succ_blk != full_join && !f_set.count(succ_blk))
                    {
                        work_set.push(succ_blk);
                    }
                }
            }
        }
        if (cbr->getSuccessor(1) != full_join)
        {
            work_set.push(cbr->getSuccessor(1));
            while (!work_set.empty())
            {
                BasicBlock* cur_blk = work_set.top();
                work_set.pop();
                t_set.insert(cur_blk);
                influence_region.insert(cur_blk);
                // Also reachable from successor 0 => partial join.
                if (f_set.count(cur_blk))
                {
                    partial_joins.insert(cur_blk);
                }
                for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); SI != E; ++SI)
                {
                    BasicBlock* succ_blk = (*SI);
                    if (succ_blk != full_join && !t_set.count(succ_blk))
                    {
                        work_set.push(succ_blk);
                    }
                }
            }
        }
    }
}
1983 
print(raw_ostream & OS) const1984 void BranchInfo::print(raw_ostream& OS) const
1985 {
1986     OS << "\nCBR: " << *cbr;
1987     OS << "\nIPD: ";
1988     if (full_join)
1989     {
1990         full_join->print(IGC::Debug::ods());
1991     }
1992     OS << "\nPartial Joins:";
1993     SmallPtrSet<BasicBlock*, 4>::iterator join_it = partial_joins.begin();
1994     SmallPtrSet<BasicBlock*, 4>::iterator join_e = partial_joins.end();
1995     for (; join_it != join_e; ++join_it)
1996     {
1997         BasicBlock* cur_blk = *join_it;
1998         OS << "\n    ";
1999         cur_blk->print(IGC::Debug::ods());
2000     }
2001     OS << "\nInfluence Region:";
2002     DenseSet<BasicBlock*>::const_iterator blk_it = influence_region.begin();
2003     DenseSet<BasicBlock*>::const_iterator blk_e = influence_region.end();
2004     for (; blk_it != blk_e; ++blk_it)
2005     {
2006         BasicBlock* cur_blk = *blk_it;
2007         OS << "\n    ";
2008         cur_blk->print(IGC::Debug::ods());
2009     }
2010     OS << "\n";
2011 }
2012 
2013 extern "C"
2014 {
createWIAnalysisPass()2015     void* createWIAnalysisPass()
2016     {
2017         return new WIAnalysis();
2018     }
2019 }
2020