/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//===-------- CoalescingEngine.cpp - Pass and analysis for coalescing-------===//
//
//  Intel Extension to LLVM core
//===----------------------------------------------------------------------===//
//
// This pass implements the coalescing idea that originated in USC, but
// re-designed to work on (and take advantage of) SSA form.
//
//===----------------------------------------------------------------------===//
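//
// Illustrative sketch (hypothetical IR, not taken from this pass): for a
// payload-consuming instruction such as a GenISA URB-write intrinsic,
//
//   call void @llvm.genx.GenISA.URBWrite(..., %v0, %v1, %v2)
//
// the engine tries to back %v0..%v2 with a single contiguous variable
// (a 'CCTuple'), so the payload can later be emitted without copy 'mov's.
//
//===----------------------------------------------------------------------===//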

#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/CISACodeGen/helper.h"
#include "PayloadMapping.hpp"
#include "common/debug/Debug.hpp"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/InstVisitor.h>
#include <llvm/ADT/SmallVector.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::Debug;
using namespace IGC::IGCMD;

char CoalescingEngine::ID = 0;
#define PASS_FLAG "CoalescingEngine"
#define PASS_DESCRIPTION "coalesce moves coming from payloads, insert and extract element"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenPatternMatch)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DeSSA)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

namespace IGC
{

    CoalescingEngine::CoalescingEngine() : FunctionPass(ID)
    {
        initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
    }

    void CoalescingEngine::getAnalysisUsage(llvm::AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<llvm::DominatorTreeWrapperPass>();
        AU.addRequired<WIAnalysis>();
        AU.addRequired<LiveVarsAnalysis>();
        AU.addRequired<CodeGenPatternMatch>();
        AU.addRequired<DeSSA>();
        AU.addRequired<MetaDataUtilsWrapper>();
        AU.addRequired<CodeGenContextWrapper>();
    }

    void CoalescingEngine::CCTuple::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple";

        auto IT = OffsetToCCMap.begin();
        while (IT != OffsetToCCMap.end())
        {
            OS << IT->first << " : ";
            IT++;
        }

    }

    void CoalescingEngine::CCTuple::dump() const {
        print(ods());
    }

    void CoalescingEngine::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple list: \n";


        for (auto itr = m_CCTupleList.begin(),
            iend = m_CCTupleList.end();
            itr != iend; ++itr)
        {
            CCTuple* ccTuple = *itr;
            ccTuple->print(OS);
        }

        OS << "CC Tuple list end. \n";

    }

    void CoalescingEngine::dump() const {
        print(ods());
    }

    /// \brief Entry point.
    bool CoalescingEngine::runOnFunction(Function& MF) {
        if (IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing))
        {
            return false;
        }

        MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        m_ModuleMetadata = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        m_CodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        if (!isEntryFunc(pMdUtils, &MF))
        {
            return false;
        }

        auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        m_Platform = pCtx->platform;
        m_PayloadMapping = PayloadMapping(pCtx);

        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
        WIA = &getAnalysis<WIAnalysis>();
        CG = &getAnalysis<CodeGenPatternMatch>();
        LV = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
        if (IGC_IS_FLAG_DISABLED(DisableDeSSA)) {
            m_DeSSA = &getAnalysis<DeSSA>();
        }
        else {
            m_DeSSA = nullptr;
        }

        for (Function::iterator I = MF.begin(), E = MF.end();
            I != E; ++I) {
            ProcessBlock(&(*I));
        }

        for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
            DE = df_end(DT->getRootNode()); DI != DE; ++DI) {

            IncrementalCoalesce(DI->getBlock());
        }
        return false;
    }

    /// Coalesce instructions within a single BB.
    void CoalescingEngine::IncrementalCoalesce(BasicBlock* MBB)
    {
        std::vector<Instruction*>& DefInstrs = BBProcessingDefs[MBB];
        std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LV));

        for (std::vector<Instruction*>::const_iterator BBI = DefInstrs.begin(),
            BBE = DefInstrs.end(); BBI != BBE; ++BBI)
        {
            Instruction* DefMI = *BBI;

            auto RI = ValueNodeMap.find((Value*)DefMI);
            if (RI == ValueNodeMap.end()) {
                //Intrinsics themselves are not mapped to a node, they are
                //tuple generators.

                //NOTE: Please leave this commented code snippet! It is useful when debugging coalescing regressions:
                //      (used to isolate coalescing to a particular shader to check failing/passing using binary search)
                //DWORD hash = (DWORD) static_cast<IGC::CodeGenContext&>(MBB->getParent()->getContext()).hash.getAsmHash();
                ////if (hash >= 0xA27Affff && hash <= 0xA3CFFF00)
                //if (hash >= 0xA3C00000 && hash <= 0xA3CFFFFF)
                //{
                //    continue;
                //}

                if (GenIntrinsicInst * intrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(DefMI))
                {
                    GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
                    if ((isURBWriteIntrinsic(intrinsic) && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_URB))) ||
                        (IID == GenISAIntrinsic::GenISA_RTWrite && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT))))
                    {
                        ProcessTuple(DefMI);
                    }
                }

                if (isSampleInstruction(DefMI) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_Sample))
                {
                    ProcessTuple(DefMI);
                }
                else if (isLdInstruction(DefMI))
                {
                    ProcessTuple(DefMI);
                }
            }
        }
    }

    static bool IsBindlessSampler(Instruction* inst)
    {
        if (SampleIntrinsic * sample = dyn_cast<SampleIntrinsic>(inst))
        {
            Value* sampler = sample->getSamplerValue();
            uint bufferIndex = 0;
            bool directIndexing = false;
            if (sampler->getType()->isPointerTy())
            {
                BufferType bufType = DecodeAS4GFXResource(
                    sampler->getType()->getPointerAddressSpace(), directIndexing, bufferIndex);
                if (bufType == BINDLESS_SAMPLER)
                {
                    return true;
                }
            }
        }
        return false;
    }

    /// Core algorithm.
    void CoalescingEngine::ProcessTuple(Instruction* tupleGeneratingInstruction)
    {
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        // A bindless sampler always has a header.
        bool needsHeader = IsBindlessSampler(tupleGeneratingInstruction);

        //Early exit when split-send is available and this instruction generates
        //at most a two-element payload.
        if (m_Platform.supportSplitSend() && !needsHeader)
        {
            if (numOperands < 3)
            {
                return;
            }
        }
        else
        {
            if (numOperands < 2)
            {
                return;
            }

        }

        IGC_ASSERT(numOperands >= 2);

        //No result, but has the side effect of updating the split mapping.
        DecideSplit(tupleGeneratingInstruction);

        uint numSplits = GetNumSplitParts(tupleGeneratingInstruction);
        for (uint numPart = 0; numPart < numSplits; numPart++)
        {
            SetCurrentPart(tupleGeneratingInstruction, numPart);
            const uint numOperands = GetNumPayloadElements(tupleGeneratingInstruction);
            bool isAnyNodeAnchored = false, isAnyNodeCoalescable = false;
            SmallPtrSet<CCTuple*, 8> touchedTuplesSet;
            SmallVector<CCTuple*, 8> touchedTuples;

            //Step: Prepare.
            PrepareTuple(
                numOperands,
                tupleGeneratingInstruction,
                //out:
                touchedTuplesSet,
                touchedTuples,
                isAnyNodeAnchored,
                isAnyNodeCoalescable);

            if (!isAnyNodeCoalescable) {
                continue;
            }


            //So at this point, there is at least one node that is a candidate for coalescing.
            //It might already be part of a CCtuple, or it might be as yet unconstrained.

            //A tuple generating instruction enforces a sequential order of the registers allocated to values.
            //Each value might belong to (up to) one CCtuple.
            //If it belongs to a CCTuple, it has a unique offset in that CCTuple.
            //Three cases are considered:
            //1) No value has a CCtuple assigned -> create a new CCTuple and assign offsets.
            //    a) if the platform supports split-send, then some (arbitrary) split point should be determined
            //2) At least one value belongs to a CCtuple (we say it is 'anchored' (constrained) to that tuple)
            //   and all values belong to the same CCtuple
            //3) More than one value belongs to a tuple, and they belong to different (>1) tuples: NOT implemented


            //Case 1) : create a new CCTuple
            if (!isAnyNodeAnchored)
            {
                CreateTuple(numOperands, tupleGeneratingInstruction);
            }
            else {
                //Case 2) : at least one value is 'anchored' (constrained) to a CCtuple
                //Sub-case: All arguments belong to the same tuple.
                if (touchedTuples.size() == 1) {
                    // Step I: find an anchoring element.
                    // If arguments of this tuple-instruction are already constrained by a CCtuple, it means
                    // that they also constrain the tuple that would be generated by this payload.
                    // They 'anchor' the new tuple, so pick the element that anchors the tuple and
                    // determine its offset.

                    //E.g. :
                    // ..     v1 (0) v2 (1) CCtuple
                    // v0 (0) v1 (1) v2 (2) (this payload)
                    // v1 will have: thisTupleStartOffset = 1, rootTupleStartOffset = 0
                    // CCtuple will be 'grown' with element v0, that is at relative offset =  -1
                    // v0(-1) v1 (0) v2 (1) CCtuple

                    CCTuple* ccTuple = NULL;
                    int rootTupleStartOffset = 0;
                    int thisTupleStartOffset = 0;

                    DetermineAnchor(
                        numOperands,
                        tupleGeneratingInstruction,
                        //out:
                        ccTuple,
                        rootTupleStartOffset,
                        thisTupleStartOffset);

                    IGC_ASSERT(ccTuple);

                    /* Step II: Interference checking */
                    int offsetDiff = rootTupleStartOffset - thisTupleStartOffset;

                    bool interferes = false;

                    //Heuristic: we do not want to grow the ccTuple size too much
                    // [c0 c1 c2 c3  c4  c5  c6  c7]
                    //(offsetDiff=4)[e0' e1' e2' e3' e4' e5' e6' e7']
                    // here, tuple numElements = 8, and offsetDiff = 4, so it would result in a
                    // 12-element CC tuple if joined together. We want to prevent growing
                    // beyond MaxTupleSize elements.
                    if (abs(offsetDiff) > 0 && ccTuple->GetNumElements() >= MaxTupleSize)
                    {
                        interferes = true;
                    }

                    ProcessInterferencesElementFunctor interferencesFunctor(
                        false /*forceEviction */,
                        tupleGeneratingInstruction,
                        offsetDiff,
                        ccTuple,
                        this);

                    if (!interferes)
                    {
                        interferes = InterferenceCheck(
                            numOperands,
                            tupleGeneratingInstruction,
                            offsetDiff,
                            ccTuple,
                            //out:
                            &interferencesFunctor);

                        if (!interferes &&
                            (ccTuple->HasNonHomogeneousElements() ||
                                m_PayloadMapping.HasNonHomogeneousPayloadElements(
                                    tupleGeneratingInstruction)
                                )
                            )
                        {
                            interferes = CheckIntersectionForNonHomogeneous(
                                numOperands,
                                tupleGeneratingInstruction,
                                offsetDiff,
                                ccTuple);
                        }
                    }

                    //If 'interferes' is false, it means that the whole transaction could be committed,
                    //provided that elements in dominatorsForDisplacement are displaced, and other nodes are attached.
                    //If 'interferes' is true, then no element will be attached to the ccTuple.
                    if (!interferes) {
                        SmallPtrSet<Value*, 8> touchedValuesSet;
                        for (uint i = 0; i < numOperands; i++) {
                            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

                            ccTuple->CheckIndexForNonInitializedSlot(i + offsetDiff);

                            if (touchedValuesSet.count(val)) {
                                continue;
                            }
                            else {
                                touchedValuesSet.insert(val);
                            }

                            if (IsValConstOrIsolated(val)) {
                                //do nothing
                            }
                            else {
                                Value* RootV = getRegRoot(val);
                                if (!RootV) //bail out if isolated
                                    continue;

                                //We do not expect to get an Argument here, it must have been isolated.
                                IGC_ASSERT(dyn_cast<llvm::Instruction>(val));
                                IGC_ASSERT(ValueNodeMap.count(val));
                                //auto RI = ValueNodeMap.find(val);
                                //ElementNode *Node = RI->second;
                                ElementNode* Node = ValueNodeMap[val];

                                if (!interferencesFunctor.GetComputedValuesForIsolation().count(val))
                                {
                                    ccTuple->AttachNodeAtIndex(i + offsetDiff, Node, *this);
                                    IGC_ASSERT(getRegRoot(val));
                                }
                            }
                        }//for
                    }
                    else  // (interferes) is true
                    {
                        CreateTuple(numOperands, tupleGeneratingInstruction);
                    }
                }
                else {
                    //Case 3)
                    //More than one value belongs to a tuple, and they belong to different (>1) tuples.
                    //FIXME: not implemented yet
                    //Need to implement code for joining two non-overlapping tuples.
                    //This probably does not bring much benefit, but is nevertheless complex to implement.
                    //Having two congruence class tuples, we need to find whether the piecewise congruence
                    //classes of the tuples interfere. Here a 'two-set interference check' due to Boissinot
                    //could be used (first sorting the elements).
                }
            }
        }
    }

    ///
    bool CoalescingEngine::CheckIntersectionForNonHomogeneous(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        const int offsetDiff,
        CCTuple* ccTuple)
    {
        //Picturesque description:
        // Hi - homogeneous slots
        // <left_part>[H_0 H_1 .. H_n]<right_part>

        //ccTuple:
        // <left_part> [H_0  H_1  ..  H_n]<right_part>
        //
        //1) this tuple instruction contains a non-homogeneous part:
        // <left_part'>[H_0' H_1' .. H_n']<right_part'>
        //2) or this tuple instruction does not contain a non-homogeneous part
        //             [H_0' H_1' .. H_n']

        bool interferes = false;
        if (ccTuple->HasNonHomogeneousElements())
        {
            // Finding a supremum instruction for a homogeneous part is implemented
            // only for render target write instructions (RTWritIntrinsic).
            // The only other possible combination of non-homogeneous instructions comprises
            // RTWritIntrinsic and RTDualBlendSourceIntrinsic.
            // In some cases there is an opportunity to coalesce their payloads, but there is
            // a danger that they are compiled in different SIMD modes, so the safe assumption
            // is that they cannot be coalesced.
            // A comparison of the payloads of RTWritIntrinsic and RTDualBlendSourceIntrinsic:
            // +----------------------------+----------------------------+
            // | RTWritIntrinsic            | RTDualBlendSourceIntrinsic |
            // +----------------------------+----------------------------+
            // | src0 Alpha (optional)      | src0 Alpha (unavailable)*  |
            // +----------------------------+----------------------------+
            // | output mask (optional)     | output mask (optional)     |
            // +----------------------------+----------------------------+
            // | -                          | R0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | G0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | B0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | A0 channel                 |
            // +----------------------------+----------------------------+
            // | R0 channel                 | R1 channel                 |
            // +----------------------------+----------------------------+
            // | G0 channel                 | G1 channel                 |
            // +----------------------------+----------------------------+
            // | B0 channel                 | B1 channel                 |
            // +----------------------------+----------------------------+
            // | A0 channel                 | A1 channel                 |
            // +----------------------------+----------------------------+
            // | Depth (optional)           | Depth (optional)           |
            // +----------------------------+----------------------------+
            // | Stencil (optional)         | Stencil (optional)         |
            // +----------------------------+----------------------------+
            // All optional fields except for src0 Alpha are consistent for both instructions called in a fragment shader.
            // * RTDualBlendSourceIntrinsic doesn't have such an argument but it is defined in its payload.
            if (offsetDiff == 0 &&
                ccTuple->GetNumElements() == numOperands &&
                llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) &&
                llvm::isa<llvm::RTWritIntrinsic>(tupleGeneratingInstruction))
            {
                if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
                {
                    //TODO: test whether a 'supremum' of the TGI and the ccTuple root element can be obtained
                    const Instruction* supremum = m_PayloadMapping.GetSupremumOfNonHomogeneousPart(
                        tupleGeneratingInstruction,
                        ccTuple->GetRoot());
                    if (supremum == nullptr)
                    {
                        interferes = true;
                    }
                }
            }
            else
            {
                //Since ccTuple contains a non-homogeneous part, and this homogeneous
                //part is not perfectly aligned to ccTuple, we cannot reason about
                //interferences safely, thus assume it interferes.
                interferes = true;
            }
        }
        else
        {
            //ccTuple:
            //             [H_0  H_1  ..  H_n]
            //
            //1) this tuple instruction contains a non-homogeneous part:
            // <left_part'>[H_0' H_1' .. H_n']<right_part'>

            if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands)
                {
                    ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
                }
                else
                {
                    interferes = true;
                }
            }
        }

        return interferes;
    }

    void CoalescingEngine::DecideSplit(Instruction* tupleGeneratingInstruction)
    {
        if (m_PayloadMapping.DoPeelFirstElement(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = 1;
            return;
        }

        uint splitPoint = 0;
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        //No sense entering here if we have fewer than 2 operands.
        IGC_ASSERT(numOperands >= 2);
        Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, 0);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = 1;
        }

        val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, numOperands - 1);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = numOperands - 1;
        }

        if (splitPoint > 0 && m_PayloadMapping.DoesAllowSplit(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = splitPoint;
        }

    }
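
    // Worked example (hypothetical payload, illustration only): for a
    // 4-element payload {C, v1, v2, v3} whose first element C is a constant,
    // the code above records splitPoint = 1, so, provided the mapping allows
    // splitting, part 0 covers {C} and part 1 covers {v1, v2, v3}; the lone
    // constant part then fails the coalescability check and is skipped.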


    /// Determines if any value that is an argument of a (possible) tuple generating instruction
    /// can participate in payload coalescing. If so, it also determines if there are any
    /// 'anchored' (constrained) values and the number of constraining tuples.
    void CoalescingEngine::PrepareTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        SmallPtrSet<CCTuple*, 8> & touchedTuplesSet,
        SmallVector<CCTuple*, 8> & touchedTuples,
        bool& isAnyNodeAnchored,
        bool& isAnyNodeCoalescable)
    {
        uint numConstants = 0;
        //Step: Prepare.
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val) || !getRegRoot(val))
            {
                numConstants++;
                continue;
            }

            Value* RootV = getRegRoot(val);
            IGC_ASSERT(RootV);
            {
                isAnyNodeCoalescable = true;

                IGC_ASSERT(ValueNodeMap.count(RootV));
                auto RI = ValueNodeMap.find(RootV);
                ElementNode* Node = RI->second;

                auto CCI = NodeCCTupleMap.find(Node);
                if (CCI != NodeCCTupleMap.end())
                {
                    CCTuple* ccTuple = NodeCCTupleMap[Node];
                    if (!touchedTuplesSet.count(ccTuple))
                    {
                        touchedTuplesSet.insert(ccTuple);
                        touchedTuples.push_back(ccTuple);
                    }

                    isAnyNodeAnchored = true;
                }

                // Do not coalesce values having an excessive number of uses.
                // This often introduces many anti-dependencies.
                const unsigned MAX_USE_COUNT = 20;
                if (val->getNumUses() >= MAX_USE_COUNT)
                {
                    isAnyNodeAnchored = true;
                }
            }
        }

        //heuristic:
        //we do not want to create 'wide' root variables where most of the slots
        //will not be coalesced anyway (due to immediate constants/isolated variables);
        //vISA would not benefit, while this increases register pressure and makes
        //coloring with the round-robin heuristic harder
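        //For example (hypothetical counts): an 8-element payload with 5
        //immediate operands satisfies 5 > 8/2, so coalescing is suppressed
        //below unless the payload has non-homogeneous elements.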
        if (numOperands > 2 && numConstants > (numOperands / 2))
        {
            if (!m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                //force non-coalescing:
                isAnyNodeCoalescable = false;
            }
        }
    }

    /// Creates a fresh tuple.
    /// A fresh tuple consists of nodes which do not have any tuple assigned yet.
    /// Precondition: at least one value is coalescable (otherwise this does not make sense).
    void CoalescingEngine::CreateTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction)
    {
        CCTuple* ccTuple = new CCTuple();
        m_CCTupleList.push_back(ccTuple);

        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val))
            {
                //It is important to initialize the CC with empty slots, in order to get the proper
                //size of the vISA variable that will back the CCtuple corresponding to the payload.
                ccTuple->ResizeBounds(i);
                continue;
            }

            Value* RootV = getRegRoot(val);
            if (!RootV) { //isolated
                ccTuple->ResizeBounds(i);
                continue;
            }

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI == NodeCCTupleMap.end())
            {
                NodeCCTupleMap[RootNode] = ccTuple;
                NodeOffsetMap[RootNode] = i;
                ccTuple->InitializeIndexWithCCRoot(i, RootNode);
                CurrentDominatingParent[RootV] = RootV;
                ImmediateDominatingParent[RootV] = NULL;
            }
            else
            {
                //This must have been a value that is copied to the payload multiple times.
                //OR: The new CCtuple has been isolated, and this is the element that
                //belongs to another tuple.
                //Since this value belongs to another tuple, we should not increase the
                //current tuple's size.
                //ccTuple->ResizeBounds(i);
            }
        } // loop over arguments

        if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
        {
            ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
        }
    }

    /// Given a tuple generating instruction, picks out an 'anchored' (constrained)
    /// value and determines the relative offset.
    /// output:
    /// ccTuple              - the tuple that is constraining the picked value
    /// thisTupleStartOffset - the offset of the value in the current tuple generating
    ///                        instruction
    /// rootTupleStartOffset - the offset of the value in the constraining tuple
    /// Assumption: it has already been checked that all values belong to exactly one tuple!
    /// (otherwise the result might pick an arbitrary tuple)
    void CoalescingEngine::DetermineAnchor(
        const uint numOperands,
        const Instruction* tupleGeneratingInstruction,
        CCTuple*& ccTuple,
        int& rootTupleStartOffset,
        int& thisTupleStartOffset)
    {
        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val)) {
                continue;
            }
            Value* RootV = getRegRoot(val);
            if (!RootV)
                continue;

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI != NodeCCTupleMap.end()) {
                //Element is constrained by a ccTuple.
                IGC_ASSERT(NodeOffsetMap.count(RootNode));
                rootTupleStartOffset = NodeOffsetMap[RootNode];
                thisTupleStartOffset = i;
                ccTuple = NodeCCTupleMap[RootNode];
                //Just a sanity check:
                IGC_ASSERT(RootNode == ccTuple->GetCCNodeWithIndex(NodeOffsetMap[RootNode]));
                break;
            }

        }//for
    }

    /// Prepares the slot for moves (e.g. induced by const, isolated, uniform-to-vector)
    /// into the payload by evicting any value that is currently occupying this slot.
    /// The assumption is that a heuristic has determined it is profitable to do so.
    /// input: bool evictFullCongruenceClass - tells whether the full congruence class
    /// should be evicted, or only the element that occupies the slot for the lifetime
    /// of a 'mov' (the latter is sufficient for issuing a mov to the payload slot).
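    /// Sketch (hypothetical congruence class): if the class {a, b, c} occupies
    /// the slot and evictFullCongruenceClass is true, a, b and c are all
    /// isolated; if it is false, only the member currently dominating 'inst'
    /// is isolated, which frees the slot just long enough for the copy.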
    void CoalescingEngine::PrepareInsertionSlot(
        CCTuple* ccTuple,
        const int index,
        Instruction* inst,
        const bool evictFullCongruenceClass)
    {
        ElementNode* RootNode = ccTuple->OffsetToCCMap[index];
        IGC_ASSERT(RootNode);

        if (evictFullCongruenceClass) {
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);

            while (NewParent) {
                if (getRegRoot(NewParent)) {
                    isolateReg(NewParent);
                }
                NewParent = ImmediateDominatingParent[NewParent];
            }

            //It might turn out that RootNode does not dominate 'inst',
            //since it is in another branch of the DT,
            //so do not forget to evict (isolate) it as well.
            if (getRegRoot(RootNode->value)) {
                isolateReg(RootNode->value);
            }

        }
        else {
            //Evict the dominating parent from the CC.
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);
            if (NewParent)
                isolateReg(NewParent);
        }
    }

    bool CoalescingEngine::IsInsertionSlotAvailable(
        CCTuple* ccTuple,
        const int index,
        Instruction* tupleGeneratingInstruction,
        const bool canExtend)
    {
        bool available = true;

        //ccTuple->OffsetToCCMap.count(index) == 0
        if (!canExtend)
        {
            const int tupleNumElements = ccTuple->GetNumElements();

            if (!(0 <= index && index < tupleNumElements))
            {
                return false;
            }
        }

        ElementNode* nodeToCheck = ccTuple->GetCCNodeWithIndex(index);

        if (nodeToCheck == NULL) {
            //tuple   :  X       CC1   CC2
            //payload :  <slot>   v1        v2
            //Means: the tuple has no live interval (X) that is live at the point the payload is
            //defined. Thus, the given 'register slot' can be used to issue a 'mov'
            //with an immediate (or isolated) value.
        }
        else {
            //tuple   :  CC0       CC1   CC2
            // ....       | <- lifetime
            //payload :  <slot>    v1       v2
            // .....      | <- lifetime
            //Here we need to check whether the value in the CC is live
            //and intersects with the desired <slot>.

            //Sanity check: tuple contains the root element.

            //IGC_ASSERT(getRegRoot(nodeToCheck->value) == nodeToCheck->value);
            Value* RootV = nodeToCheck->value;
            Value* NewParent = GetActualDominatingParent(RootV, tupleGeneratingInstruction);
            //FIXME: what if we do not have pure SSA form? Look at the de-ssa comment.
            if (NewParent && LV->isLiveAt(NewParent, tupleGeneratingInstruction)) {
                available = false;
                //interferes = true;
                //break;
            }
        }

        return available;
    }

    /// Processes the tuple elements in sequential order.
    /// A function object is passed as an argument, which responds to
    /// specific events (whether a given payload element is const/isolated/copied, etc.).
    /// It is meant to be used as a template method in various contexts where different
    /// functors can be passed, but the walking mechanism is the same.
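    /// For illustration (hypothetical functor, not part of this file): a
    /// functor that merely counts constant payload slots could implement
    ///     bool visitConstant() override { ++numConstants; return true; }
    /// where returning true continues the walk over the remaining elements
    /// and returning false aborts it, as the loop below shows.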
    void CoalescingEngine::ProcessElements(const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ElementFunctor* functor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;

        for (uint i = 0; i < numOperands; i++) {
            functor->SetIndex(i);
            Value* val = GetPayloadElementToValueMapping(tupleInst, i);
            if (touchedValuesSet.count(val)) {
                if (functor->visitCopy())
                    continue;
                else
                    break;
            }
            else {
                touchedValuesSet.insert(val);
            }
            //Case 1: Constant
            if (llvm::isa<llvm::Constant>(val)) {
                if (functor->visitConstant())
                    continue;
                else
                    break;
            }
            else //Case 2: Variable
            {
                if (IsValIsolated(val) || !getRegRoot(val))
                {
                    //If the value is isolated, a copy into the <slot> will be necessary.
                    if (functor->visitIsolated())
                        continue;
                    else
                        break;
                }

                IGC_ASSERT(ValueNodeMap.count(val));
                ElementNode* Node = ValueNodeMap[val];
                ElementNode* ccRootNode = ccTuple->GetCCNodeWithIndex(i + offsetDiff);

                //Interferes if and only if: live at this definition point AND has a different value
                if (ccRootNode == Node) {
                    //No interference, since it is the same value. We have got a reuse of the same value in
                    //different tuples.
                    if (functor->visitAnchored())
                        continue;
                    else
                        break;
                }
                else
                {
                    IGC_ASSERT(llvm::isa<llvm::Instruction>(val));
                    //Here comes the meat, and the most interesting case:
                    if (ccRootNode != NULL)
                    {
                        Value* CCRootV = ccRootNode->value;
                        Value* dominating = nullptr;
                        Value* dominated = nullptr;

                        SymmetricInterferenceTest(
                            CCRootV, dyn_cast<llvm::Instruction>(val),
                            dominating,
                            dominated);

                        //Case 1):
                        //   (dominating=null)
                        //
                        //!   (value)
                        //
                        //    CC dominance tree:
                        //!   v67 <- (dominated)
                        //    ^
                        //    |
                        //!   v190
                        //
                        //!   (tupleInst)
                        // The value has no dominator inside the CC class, but dominates elements in the CC class
                        // and its lifetime spans until (but not including) tupleInst.
                        if (dominating == nullptr && dominated != nullptr)
                        {
                            //IGC_ASSERT(LV->isLiveAt(val, tupleInst));
                            //Interference check: value is live at dominated
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated != nullptr)
                        {
                            //Case 2):
                            // CC dominance tree
                            //!  v67
                            //    ^
                            //    |
                            //!  v121 (dominating)
                            //    ^
                            //!   |  <-----(value)
                            //    |
                            //!  v190 <- (dominated)
                            //TODO: it would be possible to 'fit' the value between v121 and v190 in some
                            //      cases, but this would require a redesign of the 'dominance forest' logic
                            //      (regarding current dominating and immediate dominator handling).
                            //      For now, just assume there is an interference.
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated == nullptr)
                        {
                            //Case 3):
                            // CC dominance tree
                            //!  v121
                            //    ^
                            //    |
                            //!  v190 <- (dominating)
                            //    |
                            //!   |    ----- (value)
                            //    |
                            //    | (dominated == null)
                            if (LV->isLiveAt(dominating, dyn_cast<llvm::Instruction>(val)))
                            {
                                if (functor->visitInterfering(val, false))
                                    continue;
                                else
                                    break;
                            }
                        }
                        else
                        {
                            //It is possible that the ccRootNode is not empty, but all
                            //elements that belong to it have been isolated in previously
                            //dominating tuples.
                            //IGC_ASSERT(0);
                        }
                    }
                    else
                    {
                        //ccRootNode == NULL -> congruence class not occupied yet
                        if (functor->visitPackedNonInterfering(val))
                            continue;
                        else
                            break;
                    }
                } //if( ccRootNode == Node )
            } //if( llvm::isa<llvm::Constant>( val ) )
        } //for loop
    }

    /// Here we have two major differences from the implementation in de-ssa:
    /// 1) Transactional character: even if one 'element' of the CC-tuple does not interfere,
    /// some other element in the CC-tuple might be interfering, rendering the whole group
    /// non-'coalescable'.
    /// 2) Values in payload coalescing are by default isolated, so if there is an interference,
    /// do nothing; there is no need for explicit isolation.
    bool CoalescingEngine::InterferenceCheck(
        const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ProcessInterferencesElementFunctor* interferencesFunctor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;
        GatherWeightElementFunctor gatherFunctor;
        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, &gatherFunctor);
        bool forceEviction =
            (gatherFunctor.GetNumInsertionSlotsRequired() + gatherFunctor.GetNumNeedsDisplacement() <=
                gatherFunctor.GetNumAlignedAnchors());


        interferencesFunctor->SetForceEviction(forceEviction);

        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, interferencesFunctor);

        return interferencesFunctor->IsInterfering();
    }

    /// Given an instruction and numOperands (corresponding to the number of coalesce-partaking
    /// payload elements), return whether any 'value' that is an argument of the intrinsic
    /// takes part in a coalescing CC tuple. If yes, return a non-null pointer to the
    /// representative tuple for this coalescing.
    /// output: zeroBasedPayloadElementOffset - the offset of this instruction's payload
    /// relative to the containing ccTuple - this is necessary to correctly slice the
    /// relevant part of the payload from the ccTuple (e.g. for preparing a payload for URB writes)
    ///     offset [payload]
    ///     [ccTuple ...... ]
    /// note: obviously, if the return value is null, then the meaning of zeroBasedPayloadElementOffset
    /// is undefined
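    /// Numeric sketch (hypothetical values): if the payload element at
    /// index 1 is mapped to a node at CCTuple offset 4 and the tuple's left
    /// bound is 2, then zeroBasedPayloadElementOffset = (4 - 2) - 1 = 1,
    /// i.e. the payload starts one element into the tuple-backed variable.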
    CoalescingEngine::CCTuple* CoalescingEngine::IsAnyValueCoalescedInCCTuple(
        llvm::Instruction* inst,
        const uint numOperands,
        int& zeroBasedPayloadElementOffset,
        Value*& outVal)
    {
        outVal = nullptr;
        zeroBasedPayloadElementOffset = 0; //provide some meaningful default
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        uint index = 0;
        while (index < numOperands)
        {
            Value* val = GetPayloadElementToValueMapping(inst, index);

            if (!isa<Constant>(val) && GetValueCCTupleMapping(val))
            {
                ccTuple = GetValueCCTupleMapping(val);

                Value* valRoot = getRegRoot(val);
                IGC_ASSERT(valRoot);
                ElementNode* Node = ValueNodeMap[valRoot];
                int offset = NodeOffsetMap[Node];
                zeroBasedPayloadElementOffset = (offset - ccTuple->GetLeftBound()) - index;
                outVal = val;
                return ccTuple;
            }
            index++;
        }
        return ccTuple;
    }

    /// Return true if the payload tuple might be completed (either by directly
    /// aliasing or by inserting an appropriate copy instruction).
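    /// E.g. (hypothetical offsets): with payloadToCCTupleRelativeOffset = 1,
    /// payload element i must either already be coalesced into ccTuple slot
    /// i + 1, or that slot must be free so that a copy 'mov' can fill it;
    /// any mismatch makes the payload not covered.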
    bool CoalescingEngine::IsPayloadCovered(
        llvm::Instruction* inst,
        CCTuple* ccTuple,
        const uint numOperands,
        const int payloadToCCTupleRelativeOffset)
    {
        uint relativeOffset = 0;
        bool payloadCovered = false;

        if (ccTuple)
        {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            //index = 0;
            payloadCovered = true;
            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = GetPayloadElementToValueMapping(inst, index);
                const int ccIndex = index + payloadToCCTupleRelativeOffset;

                if (touchedValuesSet.count(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                    continue;
                }
                else {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                }
                else {
                    if (CoalescingEngine::CCTuple * thisCCTuple = GetValueCCTupleMapping(val)) {
                        if (thisCCTuple == ccTuple) {
                            relativeOffset = GetValueOffsetInCCTuple(val);
                            if (relativeOffset != (ccIndex)) {
                                payloadCovered = false;
                                break;
                            }
                        }
                        else {
                            //different cc tuple
                            payloadCovered = false;
                            break;
                        }

                    }
                    else {
                        if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                            payloadCovered = false;
                            break;
                        }
                    }
                }//if constant
            }//for
        } //if ccTuple

        return payloadCovered;
    }


    /// Uses the tuple information provided by the coalescing engine in order
    /// to generate an optimized sequence of 'movs' for preparing a payload.
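    /// Two outcomes (sketch of the logic below): if the payload is covered by
    /// a CCTuple, only constant/isolated/non-coalesced elements are copied
    /// into the tuple-backed variable and payloadOffsetInBytes records where
    /// the payload's slice starts; otherwise a fresh "CEExplicitPayload"
    /// variable is allocated and every element is copied into it.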
PrepareExplicitPayload(CShader * outProgram,CEncoder * encoder,SIMDMode simdMode,const DataLayout * pDL,llvm::Instruction * inst,int & payloadOffsetInBytes)1137     CVariable* CoalescingEngine::PrepareExplicitPayload(
1138         CShader* outProgram,
1139         CEncoder* encoder,
1140         SIMDMode simdMode,
1141         const DataLayout* pDL,
1142         llvm::Instruction* inst,
1143         int& payloadOffsetInBytes)
1144     {
1145         SetCurrentPart(inst, 0);
1146         const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
1147         const uint grfSize = outProgram->GetContext()->platform.getGRFSize();
1148         const uint numSubVarsPerOperand = numLanes(simdMode) / (grfSize / 4);
1149         IGC_ASSERT(numSubVarsPerOperand == 1 || numSubVarsPerOperand == 2);
1150         CoalescingEngine::CCTuple* ccTuple = nullptr;
1151         int payloadToCCTupleRelativeOffset = 0;
1152         Value* dummyValPtr = nullptr;
1153         ccTuple = IsAnyValueCoalescedInCCTuple(inst, numOperands, payloadToCCTupleRelativeOffset, dummyValPtr);
1154         bool payloadCovered = IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
1155         bool payloadOffsetComputed = false;
1156         payloadOffsetInBytes = 0;
1157         CVariable* payload = nullptr;
1158 
1159         if (payloadToCCTupleRelativeOffset < 0)
1160         {
1161             payloadCovered = false;
1162         }
1163 
1164         //Check for EOT payload.
1165         if (payloadCovered) {
1166             if (IGCLLVM::TerminatorInst * terminator = inst->getParent()->getTerminator())
1167             {
1168                 if (terminator != &(*terminator->getParent()->begin()))
1169                 {
1170                     IGC_ASSERT(dyn_cast<GenIntrinsicInst>(inst));
1171                     IGC_ASSERT(terminator->getPrevNode());
1172                     if (terminator->getPrevNode() == inst)
1173                     {
1174                         //heuristic:
1175                         //this inst is an EOT node. Be very careful about payload size:
1176                         //coalescing: hard constraint
1177                         if (ccTuple->GetNumElements() > MaxTupleSize)
1178                         {
1179                             payloadCovered = false;
1180                         }
1181                     }
1182                 }
1183             }
1184         }

        if (payloadCovered) {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, index);

                CVariable* dataElement = outProgram->GetSymbol(val);
                //FIXME: need to do additional checks for size
                uint subVar = payloadToCCTupleRelativeOffset + index * numSubVarsPerOperand;
                encoder->SetDstSubVar(subVar);
                payload = outProgram->GetCCTupleToVariableMapping(ccTuple);

                //A value that already appeared in this payload must be copied
                //into its additional slot; only the first occurrence can be
                //aliased in place.
                if (touchedValuesSet.count(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                    continue;
                }
                touchedValuesSet.insert(val);

                if (IsValConstOrIsolated(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                }
                else
                {
                    if (GetValueCCTupleMapping(val))
                    {
                        //The value already lives inside the coalesced tuple,
                        //so record the payload offset once instead of copying.
                        if (!payloadOffsetComputed)
                        {
                            payloadOffsetComputed = true;

                            payloadOffsetInBytes = payloadToCCTupleRelativeOffset *
                                GetSingleElementWidth(simdMode, pDL, val);
                        }
                    }
                    else
                    {
                        //This branch also covers the case where getRegRoot(val)
                        //returns null (the value is isolated).
                        encoder->Copy(payload, dataElement);
                        encoder->Push();
                    }
                } //if const-or-isolated
            } //for each payload element

        }
        else
        {
            payload = outProgram->GetNewVariable(numOperands * numLanes(simdMode), ISA_TYPE_F,
                outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD,
                "CEExplicitPayload");

            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
                CVariable* data = outProgram->GetSymbol(val);
                encoder->SetDstSubVar(i * numSubVarsPerOperand);
                encoder->Copy(payload, data);
                encoder->Push();
            }
        }
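        //At this point the payload either aliases an existing CCTuple variable
        //(with payloadOffsetInBytes pointing into it) or is a freshly created
        //"CEExplicitPayload" variable filled element by element above.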

        return payload;
    }


    CoalescingEngine::ElementNode*
        CoalescingEngine::ElementNode::getLeader() {
        ElementNode* N = this;
        ElementNode* Parent = parent.getPointer();
        ElementNode* Grandparent = Parent->parent.getPointer();

        while (Parent != Grandparent) {
            N->parent.setPointer(Grandparent);
            N = Grandparent;
            Parent = Parent->parent.getPointer();
            Grandparent = Parent->parent.getPointer();
        }

        return Parent;
    }
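    //getLeader() performs union-find path compression ("path halving"): each
    //node inspected on the way to the root is re-pointed at its grandparent.
    //Hypothetical trace: given the parent chain a -> b -> c -> d (d is the
    //self-parented root), looking up a re-links a to c and c to d, returns d,
    //and leaves subsequent lookups on the chain near-constant.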

    Value* CoalescingEngine::getRegRoot(Value* Val) const {
        auto RI = ValueNodeMap.find(Val);
        if (RI == ValueNodeMap.end())
            return nullptr;
        ElementNode* Node = RI->second;
        if (Node->parent.getInt() & ElementNode::kRegisterIsolatedFlag)
            return nullptr;
        return Node->getLeader()->value;
    }

    Value* CoalescingEngine::getPHIRoot(Instruction* PHI) const {
        IGC_ASSERT(isa<PHINode>(PHI));
        auto RI = ValueNodeMap.find(PHI);
        IGC_ASSERT(RI != ValueNodeMap.end());
        ElementNode* DestNode = RI->second;
        if (DestNode->parent.getInt() & ElementNode::kPHIIsolatedFlag)
            return nullptr;
        for (unsigned i = 0; i < PHI->getNumOperands(); i++) {
            Value* SrcVal = PHI->getOperand(i);
            if (isa<Instruction>(SrcVal)) { //skip constant sources
                Value* SrcRoot = getRegRoot(SrcVal);
                if (SrcRoot)
                    return SrcRoot;
            }
        }
        return nullptr;
    }
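    //A PHI's root is taken from its first incoming value that is neither a
    //constant nor isolated; returning null signals that the PHI joins no
    //existing coalescing class.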

    void CoalescingEngine::unionRegs(Value* Val1, Value* Val2) {
        ElementNode* Node1 = ValueNodeMap[Val1]->getLeader();
        ElementNode* Node2 = ValueNodeMap[Val2]->getLeader();

        if (Node1->rank > Node2->rank) {
            Node2->parent.setPointer(Node1->getLeader());
        }
        else if (Node1->rank < Node2->rank) {
            Node1->parent.setPointer(Node2->getLeader());
        }
        else if (Node1 != Node2) {
            Node2->parent.setPointer(Node1->getLeader());
            Node1->rank++;
        }
    }
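    //Classic union-by-rank: the lower-rank root is attached under the
    //higher-rank root, and rank grows only when two equal-rank roots merge.
    //Together with the path halving in getLeader(), this keeps find/union
    //effectively amortized-constant for the values seen by this pass.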

    //For now, return true if V is DeSSA-aliased/InsEltMap-ed/phi-coalesced.
    bool CoalescingEngine::isCoalescedByDeSSA(Value* V) const
    {
        return m_DeSSA && m_DeSSA->getRootValue(V);
    }

    void CoalescingEngine::ProcessBlock(llvm::BasicBlock* bb)
    {
        //Visit instructions top to bottom.
        for (llvm::Instruction& inst : *bb)
        {
            visit(inst);
        }
    }

    bool CoalescingEngine::MatchSingleInstruction(llvm::GenIntrinsicInst* inst)
    {
        GenISAIntrinsic::ID IID = inst->getIntrinsicID();
        if (isSampleInstruction(inst) ||
            isLdInstruction(inst) ||
            isURBWriteIntrinsic(inst) ||
            IID == GenISAIntrinsic::GenISA_RTWrite)
        {
            uint numOperands = inst->getNumOperands();
            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = inst->getOperand(i);
                if (llvm::isa<llvm::Constant>(val))
                {
                    continue;
                }

                //Create a union-find node for each non-constant operand and
                //remember its defining instruction for per-block processing.
                if (!ValueNodeMap.count(val))
                {
                    Instruction* DefMI = dyn_cast<Instruction>(val);
                    if (DefMI)
                    {
                        BBProcessingDefs[DefMI->getParent()].push_back(DefMI);
                    }
                    ValueNodeMap[val] = new (Allocator) ElementNode(val);
                }
            }

            //Add the instruction itself to the processing stack as well.
            BBProcessingDefs[inst->getParent()].push_back(inst);
        }

        return true;
    }
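    //Matching seeds the engine: only payload-bearing intrinsics (sample, ld,
    //URB write, RT write) introduce union-find nodes, and their operands'
    //defining instructions are queued in BBProcessingDefs so that later tuple
    //construction can walk each basic block's candidates in order.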

    void CoalescingEngine::visitCallInst(llvm::CallInst& I)
    {
        //It does not matter if an instruction fails to match.
        if (GenIntrinsicInst* CI = llvm::dyn_cast<GenIntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case GenISAIntrinsic::GenISA_ROUNDNE:
            case GenISAIntrinsic::GenISA_imulH:
            case GenISAIntrinsic::GenISA_umulH:
            case GenISAIntrinsic::GenISA_uaddc:
            case GenISAIntrinsic::GenISA_usubb:
            case GenISAIntrinsic::GenISA_bfrev:
                break;
            case GenISAIntrinsic::GenISA_intatomicraw:
            case GenISAIntrinsic::GenISA_floatatomicraw:
            case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_dwordatomicstructured:
            case GenISAIntrinsic::GenISA_floatatomicstructured:
            case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_intatomictyped:
            case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
            case GenISAIntrinsic::GenISA_ldstructured:
            case GenISAIntrinsic::GenISA_atomiccounterinc:
            case GenISAIntrinsic::GenISA_atomiccounterpredec:
                break;
            case GenISAIntrinsic::GenISA_GradientX:
            case GenISAIntrinsic::GenISA_GradientY:
            case GenISAIntrinsic::GenISA_GradientXfine:
            case GenISAIntrinsic::GenISA_GradientYfine:
                break;
            default:
                //No dedicated pattern for the remaining GenISA intrinsics;
                //try to match them as single instructions.
                MatchSingleInstruction(CI);
                break;
            }
        }
        else if (IntrinsicInst* CI = llvm::dyn_cast<IntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case Intrinsic::sqrt:
            case Intrinsic::log2:
            case Intrinsic::cos:
            case Intrinsic::sin:
            case Intrinsic::pow:
            case Intrinsic::floor:
            case Intrinsic::ceil:
            case Intrinsic::trunc:
            case Intrinsic::maxnum:
            case Intrinsic::minnum:
            case Intrinsic::exp2:
                break;
            default:
                break;
            }
        }
    }

    //The remaining visitor overrides are intentional no-ops: the engine only
    //seeds coalescing from the payload intrinsics handled in visitCallInst.
    void CoalescingEngine::visitCastInst(llvm::CastInst& I)
    {
    }

    void CoalescingEngine::visitBinaryOperator(llvm::BinaryOperator& I)
    {
    }

    void CoalescingEngine::visitCmpInst(llvm::CmpInst& I)
    {
    }

    void CoalescingEngine::visitPHINode(llvm::PHINode& I)
    {
    }

    void CoalescingEngine::visitUnaryInstruction(llvm::UnaryInstruction& I)
    {
    }

    void CoalescingEngine::visitSelectInst(llvm::SelectInst& I)
    {
    }

    void CoalescingEngine::visitBitCastInst(llvm::BitCastInst& I)
    {
    }

    void CoalescingEngine::visitInstruction(llvm::Instruction& I)
    {
    }

    void CoalescingEngine::visitLoadInst(LoadInst& I)
    {
    }

    void CoalescingEngine::visitStoreInst(StoreInst& I)
    {
    }

} //namespace IGC