/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//===-------- CoalescingEngine.cpp - Pass and analysis for coalescing-------===//
//
// Intel Extension to LLVM core
//===----------------------------------------------------------------------===//
//
// This pass is an implementation of the coalescing idea that originated in USC,
// re-designed to work on (and take advantage of) SSA form.
//
//===----------------------------------------------------------------------===//

#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/CISACodeGen/helper.h"
#include "PayloadMapping.hpp"
#include "common/debug/Debug.hpp"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/InstVisitor.h>
#include <llvm/ADT/SmallVector.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::Debug;
using namespace IGC::IGCMD;

char CoalescingEngine::ID = 0;
#define PASS_FLAG "CoalescingEngine"
#define PASS_DESCRIPTION "coalesce moves coming from payloads, insert and extract element"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenPatternMatch)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DeSSA)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

namespace IGC
{
    CoalescingEngine::CoalescingEngine() : FunctionPass(ID)
    {
        initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
    }

    void CoalescingEngine::getAnalysisUsage(llvm::AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<llvm::DominatorTreeWrapperPass>();
        AU.addRequired<WIAnalysis>();
        AU.addRequired<LiveVarsAnalysis>();
        AU.addRequired<CodeGenPatternMatch>();
        AU.addRequired<DeSSA>();
        AU.addRequired<MetaDataUtilsWrapper>();
        AU.addRequired<CodeGenContextWrapper>();
    }

    void CoalescingEngine::CCTuple::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple";

        auto IT = OffsetToCCMap.begin();
        while (IT != OffsetToCCMap.end())
        {
            OS << IT->first << " : ";
            IT++;
        }
    }

    void CoalescingEngine::CCTuple::dump() const {
        print(ods());
    }

    void CoalescingEngine::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple list: \n";

        for (auto itr = m_CCTupleList.begin(),
            iend = m_CCTupleList.end();
            itr != iend; ++itr)
        {
            CCTuple* ccTuple = *itr;
            ccTuple->print(OS);
        }

        OS << "CC Tuple list end. \n";
    }

    void CoalescingEngine::dump() const {
        print(ods());
    }

    /// \brief Entry point.
    bool CoalescingEngine::runOnFunction(Function& MF) {
        if (IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing))
        {
            return false;
        }

        MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        m_ModuleMetadata = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        m_CodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        if (!isEntryFunc(pMdUtils, &MF))
        {
            return false;
        }

        auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        m_Platform = pCtx->platform;
        m_PayloadMapping = PayloadMapping(pCtx);

        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
        WIA = &getAnalysis<WIAnalysis>();
        CG = &getAnalysis<CodeGenPatternMatch>();
        LV = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
        if (IGC_IS_FLAG_DISABLED(DisableDeSSA)) {
            m_DeSSA = &getAnalysis<DeSSA>();
        }
        else {
            m_DeSSA = nullptr;
        }

        for (Function::iterator I = MF.begin(), E = MF.end();
            I != E; ++I) {
            ProcessBlock(&(*I));
        }

        for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
            DE = df_end(DT->getRootNode()); DI != DE; ++DI) {

            IncrementalCoalesce(DI->getBlock());
        }
        return false;
    }
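
    // Informal sketch of the overall pass flow (commentary only):
    //
    //   runOnFunction(F):
    //     for each BB in F:             ProcessBlock(BB)        // collect payload-
    //                                                           // relevant defs
    //     for each BB in dom-tree DFS:  IncrementalCoalesce(BB) // sort defs,
    //                                                           // ProcessTuple
    //
    // Note that the pass never mutates the IR (runOnFunction always returns
    // false); it only builds the CCTuple mapping that PrepareExplicitPayload
    // consumes later when the payload is actually emitted.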

    /// Coalesce instructions within a single BB.
    void CoalescingEngine::IncrementalCoalesce(BasicBlock* MBB)
    {
        std::vector<Instruction*>& DefInstrs = BBProcessingDefs[MBB];
        std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LV));

        for (std::vector<Instruction*>::const_iterator BBI = DefInstrs.begin(),
            BBE = DefInstrs.end(); BBI != BBE; ++BBI)
        {
            Instruction* DefMI = *BBI;

            auto RI = ValueNodeMap.find((Value*)DefMI);
            if (RI == ValueNodeMap.end()) {
                //Intrinsics themselves are not mapped to a node, they are
                //tuple generators.

                //NOTE: Please leave this commented code snippet! It is useful when
                //debugging coalescing regressions (used to isolate coalescing to a
                //particular shader to check failing/passing using binary search
                //over the shader hash range).
                //DWORD hash = (DWORD) static_cast<IGC::CodeGenContext&>(MBB->getParent()->getContext()).hash.getAsmHash();
                ////if (hash >= 0xA27Affff && hash <= 0xA3CFFF00)
                //if (hash >= 0xA3C00000 && hash <= 0xA3CFFFFF)
                //{
                //    continue;
                //}

                if (GenIntrinsicInst * intrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(DefMI))
                {
                    GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
                    if ((isURBWriteIntrinsic(intrinsic) && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_URB))) ||
                        (IID == GenISAIntrinsic::GenISA_RTWrite && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT))))
                    {
                        ProcessTuple(DefMI);
                    }
                }

                if (isSampleInstruction(DefMI) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_Sample))
                {
                    ProcessTuple(DefMI);
                }
                else if (isLdInstruction(DefMI))
                {
                    ProcessTuple(DefMI);
                }
            }
        }
    }

    static bool IsBindlessSampler(Instruction* inst)
    {
        if (SampleIntrinsic * sample = dyn_cast<SampleIntrinsic>(inst))
        {
            Value* sampler = sample->getSamplerValue();
            uint bufferIndex = 0;
            bool directIndexing = false;
            if (sampler->getType()->isPointerTy())
            {
                BufferType bufType = DecodeAS4GFXResource(
                    sampler->getType()->getPointerAddressSpace(), directIndexing, bufferIndex);
                if (bufType == BINDLESS_SAMPLER)
                {
                    return true;
                }
            }
        }
        return false;
    }
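
    // Illustration (hypothetical values): a URB write whose payload elements
    // map to values v0..v3 is a 'tuple generating instruction'. Coalescing
    // succeeds when v0..v3 can be given consecutive offsets in one CCTuple:
    //
    //   URBWrite(..., v0, v1, v2, v3)
    //   CCTuple: [ v0 | v1 | v2 | v3 ]   (offsets 0..3)
    //
    // so the payload can be formed in place instead of with copy 'mov's.
    // ProcessTuple below decides whether such an assignment is possible, or
    // whether existing constraints (values anchored in other tuples,
    // interfering live ranges, constants) force a fresh tuple or prevent
    // coalescing altogether.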

    /// Core algorithm.
    void CoalescingEngine::ProcessTuple(Instruction* tupleGeneratingInstruction)
    {
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        // a bindless sampler always has a header
        bool needsHeader = IsBindlessSampler(tupleGeneratingInstruction);

        //Early exit if split-send is available and this instruction is certain
        //to generate a payload of fewer than three elements.
        if (m_Platform.supportSplitSend() && !needsHeader)
        {
            if (numOperands < 3)
            {
                return;
            }
        }
        else
        {
            if (numOperands < 2)
            {
                return;
            }
        }

        IGC_ASSERT(numOperands >= 2);

        //No result, but has the side effect of updating the split mapping.
        DecideSplit(tupleGeneratingInstruction);

        uint numSplits = GetNumSplitParts(tupleGeneratingInstruction);
        for (uint numPart = 0; numPart < numSplits; numPart++)
        {
            SetCurrentPart(tupleGeneratingInstruction, numPart);
            const uint numOperands = GetNumPayloadElements(tupleGeneratingInstruction);
            bool isAnyNodeAnchored = false, isAnyNodeCoalescable = false;
            SmallPtrSet<CCTuple*, 8> touchedTuplesSet;
            SmallVector<CCTuple*, 8> touchedTuples;

            //Step: Prepare.
            PrepareTuple(
                numOperands,
                tupleGeneratingInstruction,
                //out:
                touchedTuplesSet,
                touchedTuples,
                isAnyNodeAnchored,
                isAnyNodeCoalescable);

            if (!isAnyNodeCoalescable) {
                continue;
            }

            //So at this point, there is at least one node that is a candidate for
            //coalescing. It might already be part of a CCTuple, or it might be as
            //yet unconstrained.

            //A tuple generating instruction enforces a sequential order of registers
            //allocated to values. Each value might belong to (up to) one CCTuple.
            //If it belongs to a CCTuple, it has a unique offset in that CCTuple.
            //Three cases are considered:
            //1) No value has a CCTuple assigned -> create a new CCTuple and assign offsets.
            //   a) if the platform supports split-send, then some (arbitrary) split
            //      point should be determined
            //2) At least one value belongs to a CCTuple (we say it is 'anchored'
            //   (constrained) to that tuple) and all values belong to the same CCTuple
            //3) More than one value belongs to a tuple, and they belong to different
            //   (>1) tuples: NOT implemented

            //Case 1) : create a new CCTuple
            if (!isAnyNodeAnchored)
            {
                CreateTuple(numOperands, tupleGeneratingInstruction);
            }
            else {
                //Case 2) : at least one value is 'anchored' (constrained) to a CCTuple
                //Sub-case: all arguments belong to the same tuple.
                if (touchedTuples.size() == 1) {
                    // Step I: find an anchoring element.
                    // If arguments of this tuple-instruction are already constrained
                    // by a CCTuple, it means that they also constrain the tuple that
                    // would be generated by this payload. They 'anchor' the new tuple,
                    // so pick the element that anchors the tuple and determine its
                    // offset.

                    //E.g. :
                    //         v1 (0) v2 (1)   CCTuple
                    // v0 (0)  v1 (1) v2 (2)   (this payload)
                    // v1 will have: thisTupleStartOffset = 1, rootTupleStartOffset = 0
                    // The CCTuple will be 'grown' with element v0, which is at relative offset = -1:
                    // v0 (-1) v1 (0) v2 (1)   CCTuple

                    CCTuple* ccTuple = nullptr;
                    int rootTupleStartOffset = 0;
                    int thisTupleStartOffset = 0;

                    DetermineAnchor(
                        numOperands,
                        tupleGeneratingInstruction,
                        //out:
                        ccTuple,
                        rootTupleStartOffset,
                        thisTupleStartOffset);

                    IGC_ASSERT(ccTuple);

                    /* Step II: Interference checking */
                    int offsetDiff = rootTupleStartOffset - thisTupleStartOffset;

                    bool interferes = false;

                    //Heuristic: we do not want to grow the ccTuple size too much
                    //               [c0  c1  c2  c3  c4  c5  c6  c7]
                    //(offsetDiff=4)                 [e0' e1' e2' e3' e4' e5' e6' e7']
                    //here, tuple numElements = 8 and offsetDiff = 4, so joining them
                    //would result in a 12-element CC tuple. We want to prevent growing
                    //beyond MaxTupleSize elements.
                    if (abs(offsetDiff) > 0 && ccTuple->GetNumElements() >= MaxTupleSize)
                    {
                        interferes = true;
                    }

                    ProcessInterferencesElementFunctor interferencesFunctor(
                        false /*forceEviction */,
                        tupleGeneratingInstruction,
                        offsetDiff,
                        ccTuple,
                        this);

                    if (!interferes)
                    {
                        interferes = InterferenceCheck(
                            numOperands,
                            tupleGeneratingInstruction,
                            offsetDiff,
                            ccTuple,
                            //out:
                            &interferencesFunctor);

                        if (!interferes &&
                            (ccTuple->HasNonHomogeneousElements() ||
                                m_PayloadMapping.HasNonHomogeneousPayloadElements(
                                    tupleGeneratingInstruction)
                                )
                            )
                        {
                            interferes = CheckIntersectionForNonHomogeneous(
                                numOperands,
                                tupleGeneratingInstruction,
                                offsetDiff,
                                ccTuple);
                        }
                    }

                    //If 'interferes' is false, the whole transaction can be committed,
                    //provided that elements in dominatorsForDisplacement are displaced
                    //and other nodes are attached.
                    //If 'interferes' is true, then no element will be attached to the ccTuple.
                    if (!interferes) {
                        SmallPtrSet<Value*, 8> touchedValuesSet;
                        for (uint i = 0; i < numOperands; i++) {
                            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

                            ccTuple->CheckIndexForNonInitializedSlot(i + offsetDiff);

                            if (touchedValuesSet.count(val)) {
                                continue;
                            }
                            else {
                                touchedValuesSet.insert(val);
                            }

                            if (IsValConstOrIsolated(val)) {
                                //do nothing
                            }
                            else {
                                Value* RootV = getRegRoot(val);
                                if (!RootV) //bail out if isolated
                                    continue;

                                //We do not expect to get an Argument here; it must have been isolated.
                                IGC_ASSERT(dyn_cast<llvm::Instruction>(val));
                                IGC_ASSERT(ValueNodeMap.count(val));
                                ElementNode* Node = ValueNodeMap[val];

                                if (!interferencesFunctor.GetComputedValuesForIsolation().count(val))
                                {
                                    ccTuple->AttachNodeAtIndex(i + offsetDiff, Node, *this);
                                    IGC_ASSERT(getRegRoot(val));
                                }
                            }
                        }//for
                    }
                    else // (interferes) is true
                    {
                        CreateTuple(numOperands, tupleGeneratingInstruction);
                    }
                }
                else {
                    //Case 3)
                    //More than one value belongs to a tuple, and they belong to
                    //different (>1) tuples.
                    //FIXME: not implemented yet.
                    //We would need code for joining two non-overlapping tuples.
                    //This probably does not bring much benefit, but is nevertheless
                    //complex to implement: having two congruence-class tuples, we need
                    //to find whether the piecewise congruence classes of the tuples
                    //interfere. Here a 'two-set interference check' due to Boissinot
                    //could be used (after first sorting the elements).
                }
            }
        }
    }
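
    // Worked example of the anchoring arithmetic above (same scenario as the
    // comment in Case 2): the existing CCTuple holds v1 at offset 0 and v2 at
    // offset 1, and the current payload is (v0, v1, v2). Picking v1 as the
    // anchor gives rootTupleStartOffset = 0 and thisTupleStartOffset = 1, so
    // offsetDiff = 0 - 1 = -1. If the interference check passes, v0 is
    // attached at index 0 + offsetDiff = -1 and the tuple grows leftwards:
    //   before: [ v1(0) | v2(1) ]
    //   after : [ v0(-1) | v1(0) | v2(1) ]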

    bool CoalescingEngine::CheckIntersectionForNonHomogeneous(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        const int offsetDiff,
        CCTuple* ccTuple)
    {
        //Picturesque description:
        // Hi - homogeneous slots
        // <left_part>[H_0 H_1 .. H_n]<right_part>

        //ccTuple:
        // <left_part> [H_0 H_1 .. H_n]<right_part>
        //
        //1) this tuple instruction contains a non-homogeneous part:
        //   <left_part'>[H_0' H_1' .. H_n']<right_part'>
        //2) or this tuple instruction does not contain a non-homogeneous part:
        //   [H_0' H_1' .. H_n']

        bool interferes = false;
        if (ccTuple->HasNonHomogeneousElements())
        {
            // Finding a supremum instruction for a homogeneous part is implemented
            // only for render target write instructions (RTWritIntrinsic).
            // The only other possible combination of non-homogeneous instructions
            // comprises RTWritIntrinsic and RTDualBlendSourceIntrinsic.
            // In some cases there is an opportunity to coalesce their payloads, but
            // there is a danger that they are compiled in different SIMD modes, so
            // the safe assumption is that they cannot be coalesced.
            // A comparison of the payloads of RTWritIntrinsic and RTDualBlendSourceIntrinsic:
            // +----------------------------+----------------------------+
            // | RTWritIntrinsic            | RTDualBlendSourceIntrinsic |
            // +----------------------------+----------------------------+
            // | src0 Alpha (optional)      | src0 Alpha (unavailable)*  |
            // +----------------------------+----------------------------+
            // | output mask (optional)     | output mask (optional)     |
            // +----------------------------+----------------------------+
            // | -                          | R0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | G0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | B0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | A0 channel                 |
            // +----------------------------+----------------------------+
            // | R0 channel                 | R1 channel                 |
            // +----------------------------+----------------------------+
            // | G0 channel                 | G1 channel                 |
            // +----------------------------+----------------------------+
            // | B0 channel                 | B1 channel                 |
            // +----------------------------+----------------------------+
            // | A0 channel                 | A1 channel                 |
            // +----------------------------+----------------------------+
            // | Depth (optional)           | Depth (optional)           |
            // +----------------------------+----------------------------+
            // | Stencil (optional)         | Stencil (optional)         |
            // +----------------------------+----------------------------+
            // All optional fields except src0 Alpha are consistent for both
            // instructions when called in a fragment shader.
            // * RTDualBlendSourceIntrinsic doesn't have such an argument, but the
            //   field is defined in its payload.
            if (offsetDiff == 0 &&
                ccTuple->GetNumElements() == numOperands &&
                llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) &&
                llvm::isa<llvm::RTWritIntrinsic>(tupleGeneratingInstruction))
            {
                if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
                {
                    //TODO: test whether a 'supremum' of the TGI and the ccTuple root
                    //element can be obtained.
                    const Instruction* supremum = m_PayloadMapping.GetSupremumOfNonHomogeneousPart(
                        tupleGeneratingInstruction,
                        ccTuple->GetRoot());
                    if (!supremum)
                    {
                        interferes = true;
                    }
                }
            }
            else
            {
                //Since ccTuple contains a non-homogeneous part, and this homogeneous
                //part is not perfectly aligned to ccTuple, we cannot reason about
                //interferences safely; thus assume it interferes.
                interferes = true;
            }
        }
        else
        {
            //ccTuple:
            // [H_0 H_1 .. H_n]
            //
            //1) this tuple instruction contains a non-homogeneous part:
            //   <left_part'>[H_0' H_1' .. H_n']<right_part'>
            if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands)
                {
                    ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
                }
                else
                {
                    interferes = true;
                }
            }
        }

        return interferes;
    }

    void CoalescingEngine::DecideSplit(Instruction* tupleGeneratingInstruction)
    {
        if (m_PayloadMapping.DoPeelFirstElement(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = 1;
            return;
        }

        uint splitPoint = 0;
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        //No sense entering here if we have fewer than 2 operands.
        IGC_ASSERT(numOperands >= 2);
        Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, 0);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = 1;
        }

        val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, numOperands - 1);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = numOperands - 1;
        }

        if (splitPoint > 0 && m_PayloadMapping.DoesAllowSplit(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = splitPoint;
        }
    }
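
    // Illustration of the split decision above (hypothetical payload): for
    // (c, v1, v2, v3) where the first element c is a constant or isolated
    // value, splitPoint becomes 1, i.e. the tuple is considered in two parts
    // [c] and [v1 v2 v3]. Symmetrically, a constant/isolated last element
    // yields splitPoint = numOperands - 1. The split is only recorded when
    // the payload mapping allows splitting for this instruction.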

    /// Determines if any value that is an argument of a (possible) tuple generating
    /// instruction can participate in payload coalescing. If so, it also determines
    /// whether there are any 'anchored' (constrained) values and the number of
    /// constraining tuples.
    void CoalescingEngine::PrepareTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        SmallPtrSet<CCTuple*, 8> & touchedTuplesSet,
        SmallVector<CCTuple*, 8> & touchedTuples,
        bool& isAnyNodeAnchored,
        bool& isAnyNodeCoalescable)
    {
        uint numConstants = 0;
        //Step: Prepare.
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val) || !getRegRoot(val))
            {
                numConstants++;
                continue;
            }

            Value* RootV = getRegRoot(val);
            IGC_ASSERT(RootV);
            {
                isAnyNodeCoalescable = true;

                IGC_ASSERT(ValueNodeMap.count(RootV));
                auto RI = ValueNodeMap.find(RootV);
                ElementNode* Node = RI->second;

                auto CCI = NodeCCTupleMap.find(Node);
                if (CCI != NodeCCTupleMap.end())
                {
                    CCTuple* ccTuple = NodeCCTupleMap[Node];
                    if (!touchedTuplesSet.count(ccTuple))
                    {
                        touchedTuplesSet.insert(ccTuple);
                        touchedTuples.push_back(ccTuple);
                    }

                    isAnyNodeAnchored = true;
                }

                //Do not coalesce values having an excessive number of uses.
                //This often introduces many anti-dependencies.
                const unsigned MAX_USE_COUNT = 20;
                if (val->getNumUses() >= MAX_USE_COUNT)
                {
                    isAnyNodeAnchored = true;
                }
            }
        }

        //Heuristic: we do not want to create 'wide' root variables where most of
        //the slots will be non-coalesced anyway (due to immediate constants or
        //isolated variables). vISA will not benefit, while this increases register
        //pressure and makes coloring with the round-robin heuristic harder.
        if (numOperands > 2 && numConstants > (numOperands / 2))
        {
            if (!m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                //force non-coalescing:
                isAnyNodeCoalescable = false;
            }
        }
    }
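
    // Example of the final heuristic above (hypothetical payload): with
    // numOperands = 4 and three elements being immediates or isolated values
    // (numConstants = 3 > 4/2), isAnyNodeCoalescable is forced to false and
    // ProcessTuple gives up on this instruction rather than allocating a
    // wide, mostly-empty root variable.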

    /// Creates a fresh tuple.
    /// A fresh tuple consists of nodes which do not have any tuple assigned yet.
    /// Precondition: at least one value is coalescable (otherwise this makes no sense).
    void CoalescingEngine::CreateTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction)
    {
        CCTuple* ccTuple = new CCTuple();
        m_CCTupleList.push_back(ccTuple);

        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val))
            {
                //It is important to initialize the CC with empty slots in order to
                //get the proper size of the vISA variable that will back the
                //CCTuple corresponding to this payload.
                ccTuple->ResizeBounds(i);
                continue;
            }

            Value* RootV = getRegRoot(val);
            if (!RootV) { //isolated
                ccTuple->ResizeBounds(i);
                continue;
            }

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI == NodeCCTupleMap.end())
            {
                NodeCCTupleMap[RootNode] = ccTuple;
                NodeOffsetMap[RootNode] = i;
                ccTuple->InitializeIndexWithCCRoot(i, RootNode);
                CurrentDominatingParent[RootV] = RootV;
                ImmediateDominatingParent[RootV] = nullptr;
            }
            else
            {
                //This must have been a value that is copied to the payload multiple
                //times, OR the new CCTuple has been isolated and this element
                //belongs to another tuple. Since the value belongs to another tuple,
                //we arguably should not increase the current tuple's size:
                //ccTuple->ResizeBounds(i);
            }
        } // loop over arguments

        if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
        {
            ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
        }
    }
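
    // Illustrative outcome of CreateTuple for a payload (v0, 3.0f, v2), where
    // v0 and v2 are coalescable and 3.0f is an immediate (names hypothetical;
    // the real maps are keyed by each value's ElementNode, written informally
    // here by value):
    //   NodeCCTupleMap[v0] = T,  NodeOffsetMap[v0] = 0
    //   slot 1: left as an empty placeholder (ResizeBounds only), reserving
    //           space in the backing vISA variable for the immediate's mov
    //   NodeCCTupleMap[v2] = T,  NodeOffsetMap[v2] = 2
    // v0 and v2 also become roots of their dominance-forest chains via
    // CurrentDominatingParent / ImmediateDominatingParent.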

    /// Given a tuple generating instruction, picks out an 'anchored' (constrained)
    /// value and determines the relative offsets.
    /// output:
    /// ccTuple - the tuple that constrains the picked value
    /// thisTupleStartOffset - the offset of the value in the current tuple
    ///                        generating instruction
    /// rootTupleStartOffset - the offset of the value in the constraining tuple
    /// Assumption: it has already been checked that all values belong to exactly
    /// one tuple (otherwise the result might pick an arbitrary tuple).
    void CoalescingEngine::DetermineAnchor(
        const uint numOperands,
        const Instruction* tupleGeneratingInstruction,
        CCTuple*& ccTuple,
        int& rootTupleStartOffset,
        int& thisTupleStartOffset)
    {
        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val)) {
                continue;
            }
            Value* RootV = getRegRoot(val);
            if (!RootV)
                continue;

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI != NodeCCTupleMap.end()) {
                //Element is constrained by a ccTuple.
                IGC_ASSERT(NodeOffsetMap.count(RootNode));
                rootTupleStartOffset = NodeOffsetMap[RootNode];
                thisTupleStartOffset = i;
                ccTuple = NodeCCTupleMap[RootNode];
                //Just a sanity check:
                IGC_ASSERT(RootNode == ccTuple->GetCCNodeWithIndex(NodeOffsetMap[RootNode]));
                break;
            }
        }//for
    }

    /// Prepares the slot for moves (e.g. induced by const, isolated, or
    /// uniform-to-vector values) into the payload by evicting any value that
    /// currently occupies this slot.
    /// The assumption is that a heuristic has determined it is profitable to do so.
    /// input: bool evictFullCongruenceClass - tells whether the full congruence
    /// class should be evicted, or only the element that occupies the tuple slot
    /// for the lifetime of a 'mov' (the latter is sufficient for issuing a mov to
    /// the payload slot).
    void CoalescingEngine::PrepareInsertionSlot(
        CCTuple* ccTuple,
        const int index,
        Instruction* inst,
        const bool evictFullCongruenceClass)
    {
        ElementNode* RootNode = ccTuple->OffsetToCCMap[index];
        IGC_ASSERT(RootNode);

        if (evictFullCongruenceClass) {
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);

            while (NewParent) {
                if (getRegRoot(NewParent)) {
                    isolateReg(NewParent);
                }
                NewParent = ImmediateDominatingParent[NewParent];
            }

            //It might turn out that RootNode does not dominate 'inst' since it is
            //in another branch of the DT; do not forget to isolate it as well.
            if (getRegRoot(RootNode->value)) {
                isolateReg(RootNode->value);
            }
        }
        else {
            //Evict the dominating parent from the CC.
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);
            if (NewParent)
                isolateReg(NewParent);
        }
    }
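
    // Eviction sketch for PrepareInsertionSlot above (hypothetical chain): if
    // the congruence class rooted at R has the dominance chain R -> a -> b,
    // where b is the closest member dominating 'inst', then with
    // evictFullCongruenceClass == true the loop isolates b, then a (walking
    // ImmediateDominatingParent upwards), and finally isolates R itself,
    // since R may sit in a sibling branch of the dominator tree. With
    // evictFullCongruenceClass == false, only the dominating parent b is
    // isolated, which frees the slot just for the duration of the copy mov.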

    bool CoalescingEngine::IsInsertionSlotAvailable(
        CCTuple* ccTuple,
        const int index,
        Instruction* tupleGeneratingInstruction,
        const bool canExtend)
    {
        bool available = true;

        //ccTuple->OffsetToCCMap.count(index) == 0
        if (!canExtend)
        {
            const int tupleNumElements = ccTuple->GetNumElements();

            if (!(0 <= index && index < tupleNumElements))
            {
                return false;
            }
        }

        ElementNode* nodeToCheck = ccTuple->GetCCNodeWithIndex(index);

        if (nodeToCheck == nullptr) {
            //tuple   : X      CC1 CC2
            //payload : <slot> v1  v2
            //Means: the tuple has no live interval (X) that is live at the point
            //the payload is defined. A given 'register slot' can thus be used to
            //issue a 'mov' with an immediate (or isolated) value.
        }
        else {
            //tuple   : CC0    CC1 CC2
            //           .... |        <- lifetime
            //payload : <slot> v1  v2
            //           ..... |       <- lifetime
            //Here we need to check whether the value in the CC is live
            //and intersects with the desired <slot>.

            //Sanity check: the tuple contains the root element.
            //IGC_ASSERT(getRegRoot(nodeToCheck->value) == nodeToCheck->value);
            Value* RootV = nodeToCheck->value;
            Value* NewParent = GetActualDominatingParent(RootV, tupleGeneratingInstruction);
            //FIXME: what if we do not have pure SSA form? Look at the de-ssa comment.
            if (NewParent && LV->isLiveAt(NewParent, tupleGeneratingInstruction)) {
                available = false;
            }
        }

        return available;
    }

    /// Processes the tuple elements in sequential order.
    /// A function object is passed as an argument and responds to specific events
    /// (whether a given payload element is const/isolated/copied etc.).
    /// It is meant to be used as a template method in various contexts where
    /// different functors can be passed, but the walking mechanism is the same.
    void CoalescingEngine::ProcessElements(const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ElementFunctor* functor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;

        for (uint i = 0; i < numOperands; i++) {
            functor->SetIndex(i);
            Value* val = GetPayloadElementToValueMapping(tupleInst, i);
            if (touchedValuesSet.count(val)) {
                if (functor->visitCopy())
                    continue;
                else
                    break;
            }
            else {
                touchedValuesSet.insert(val);
            }
            //Case 1: Constant
            if (llvm::isa<llvm::Constant>(val)) {
                if (functor->visitConstant())
                    continue;
                else
                    break;
            }
            else //Case 2: Variable
            {
                if (IsValIsolated(val) || !getRegRoot(val))
                {
                    //If the value is isolated, a copy to <slot> will be necessary.
                    if (functor->visitIsolated())
                        continue;
                    else
                        break;
                }

                IGC_ASSERT(ValueNodeMap.count(val));
                ElementNode* Node = ValueNodeMap[val];
                ElementNode* ccRootNode = ccTuple->GetCCNodeWithIndex(i + offsetDiff);

                //Interferes if and only if: live at this definition point AND has a different value.
                if (ccRootNode == Node) {
                    //No interference, since it is the same value. We have a reuse of
                    //the same value in a different tuple.
                    if (functor->visitAnchored())
                        continue;
                    else
                        break;
                }
                else
                {
                    IGC_ASSERT(llvm::isa<llvm::Instruction>(val));
                    //Here comes the meat, and the most interesting case:
                    if (ccRootNode != nullptr)
                    {
                        Value* CCRootV = ccRootNode->value;
                        Value* dominating = nullptr;
                        Value* dominated = nullptr;

                        SymmetricInterferenceTest(
                            CCRootV, dyn_cast<llvm::Instruction>(val),
                            dominating,
                            dominated);

                        //Case 1):
                        // (dominating == null)
                        //
                        //  ! (value)
                        //
                        //  CC dominance tree:
                        //  ! v67 <- (dominated)
                        //    ^
                        //    |
                        //  ! v190
                        //
                        //  ! (tupleInst)
                        // The value has no dominator inside the CC class, but dominates
                        // elements in the CC class, and its lifetime spans until (but
                        // not including) tupleInst.
                        if (dominating == nullptr && dominated != nullptr)
                        {
                            //IGC_ASSERT(LV->isLiveAt(val, tupleInst));
                            //Interference check: value is live at dominated.
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated != nullptr)
                        {
                            //Case 2):
                            //  CC dominance tree
                            //  ! v67
                            //    ^
                            //    |
                            //  ! v121 (dominating)
                            //    ^
                            //  ! | <-----(value)
                            //    |
                            //  ! v190 <- (dominated)
                            //TODO: it would be possible to 'fit' (value) between v121
                            // and v190 in some cases, but this would require a redesign
                            // of the 'dominance forest' logic (regarding the current-
                            // dominating and immediate-dominator handling).
                            // For now, just assume there is an interference.
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated == nullptr)
                        {
                            //Case 3):
                            //  CC dominance tree
                            //  ! v121
                            //    ^
                            //    |
                            //  ! v190 <- (dominating)
                            //    |
                            //  ! | ----- (value)
                            //    |
                            //    | (dominated == null)
                            if (LV->isLiveAt(dominating, dyn_cast<llvm::Instruction>(val)))
                            {
                                if (functor->visitInterfering(val, false))
                                    continue;
                                else
                                    break;
                            }
                        }
                        else
                        {
                            //It is possible that the ccRootNode is not empty, but all
                            //elements that belong to it have been isolated in previously
                            //dominating tuples.
                            //IGC_ASSERT(0);
                        }
                    }
                    else
                    {
                        //ccRootNode == NULL -> congruence class not occupied yet.
                        if (functor->visitPackedNonInterfering(val))
                            continue;
                        else
                            break;
                    }
                } //if (ccRootNode == Node)
            } //if (llvm::isa<llvm::Constant>(val))
        } //for loop
    }
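
    // Minimal sketch of an ElementFunctor-style visitor (illustrative only;
    // the actual interface is declared in CoalescingEngine.hpp and may differ
    // in detail). Each visit* hook returns true to continue the walk or false
    // to abort it, which is exactly how ProcessElements above interprets the
    // return values:
    //
    //   struct CountingFunctor : public CoalescingEngine::ElementFunctor
    //   {
    //       int numInterfering = 0;
    //       void SetIndex(int) override {}
    //       bool visitCopy() override { return true; }
    //       bool visitConstant() override { return true; }
    //       bool visitIsolated() override { return true; }
    //       bool visitAnchored() override { return true; }
    //       bool visitPackedNonInterfering(llvm::Value*) override { return true; }
    //       bool visitInterfering(llvm::Value*, const bool) override
    //       {
    //           ++numInterfering; //just count; never abort the walk
    //           return true;
    //       }
    //   };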

    /// Here we have two major differences from the implementation in de-ssa:
    /// 1) Transactional character: even if one 'element' of the CC-tuple does not
    ///    interfere, some other element in the CC-tuple might, rendering the whole
    ///    group non-coalescable.
    /// 2) Values in payload coalescing are isolated by default, so if there is an
    ///    interference we do nothing; no explicit isolation is needed.
    bool CoalescingEngine::InterferenceCheck(
        const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ProcessInterferencesElementFunctor* interferencesFunctor)
    {
        GatherWeightElementFunctor gatherFunctor;
        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, &gatherFunctor);
        bool forceEviction =
            (gatherFunctor.GetNumInsertionSlotsRequired() + gatherFunctor.GetNumNeedsDisplacement() <=
                gatherFunctor.GetNumAlignedAnchors());

        interferencesFunctor->SetForceEviction(forceEviction);

        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, interferencesFunctor);

        return interferencesFunctor->IsInterfering();
    }

    /// Given an instruction and numOperands (corresponding to the number of
    /// coalesce-partaking payload elements), return whether any 'value' that is an
    /// argument of the intrinsic takes part in a coalescing CC tuple. If yes,
    /// return a non-null pointer to the representative tuple for this coalescing.
    /// output: zeroBasedPayloadElementOffset - the offset of this instruction's
    /// payload relative to the containing ccTuple - this is necessary to correctly
    /// slice the relevant part of the payload from the ccTuple (e.g. for preparing
    /// a payload for URB writes)
    ///          offset  [payload]
    ///          [ccTuple ...... ]
    /// note: obviously, if the return value is null, then the meaning of
    /// zeroBasedPayloadElementOffset is undefined
    CoalescingEngine::CCTuple* CoalescingEngine::IsAnyValueCoalescedInCCTuple(
        llvm::Instruction* inst,
        const uint numOperands,
        int& zeroBasedPayloadElementOffset,
        Value*& outVal)
    {
        outVal = nullptr;
        zeroBasedPayloadElementOffset = 0; //provide a meaningful default
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        uint index = 0;
        while (index < numOperands)
        {
            Value* val = GetPayloadElementToValueMapping(inst, index);

            if (!isa<Constant>(val) && GetValueCCTupleMapping(val))
            {
                ccTuple = GetValueCCTupleMapping(val);

                Value* valRoot = getRegRoot(val);
                IGC_ASSERT(valRoot);
                ElementNode* Node = ValueNodeMap[valRoot];
                int offset = NodeOffsetMap[Node];
                zeroBasedPayloadElementOffset = (offset - ccTuple->GetLeftBound()) - index;
                outVal = val;
                return ccTuple;
            }
            index++;
        }
        return ccTuple;
    }
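
    // Offset arithmetic example for the function above (hypothetical numbers):
    // if the payload element at index = 2 is found at offset 5 inside a ccTuple
    // whose left bound is 1, then
    //   zeroBasedPayloadElementOffset = (5 - 1) - 2 = 2,
    // i.e. this instruction's payload starts two element-slots into the
    // ccTuple's allocated range.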

    /// Return true if the payload tuple can be completed (either by directly
    /// aliasing or by inserting an appropriate copy instruction).
    bool CoalescingEngine::IsPayloadCovered(
        llvm::Instruction* inst,
        CCTuple* ccTuple,
        const uint numOperands,
        const int payloadToCCTupleRelativeOffset)
    {
        uint relativeOffset = 0;
        bool payloadCovered = false;

        if (ccTuple)
        {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            payloadCovered = true;
            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = GetPayloadElementToValueMapping(inst, index);
                const int ccIndex = index + payloadToCCTupleRelativeOffset;

                if (touchedValuesSet.count(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                    continue;
                }
                else {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                }
                else {
                    if (CoalescingEngine::CCTuple * thisCCTuple = GetValueCCTupleMapping(val)) {
                        if (thisCCTuple == ccTuple) {
                            relativeOffset = GetValueOffsetInCCTuple(val);
                            if (relativeOffset != (ccIndex)) {
                                payloadCovered = false;
                                break;
                            }
                        }
                        else {
                            //different cc tuple
                            payloadCovered = false;
                            break;
                        }
                    }
                    else {
                        if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                            payloadCovered = false;
                            break;
                        }
                    }
                }//if constant
            }//for
        }//if ccTuple

        return payloadCovered;
    }
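
    // Coverage example (illustrative): with payloadToCCTupleRelativeOffset = 1
    // and payload (v1, v2, c), the loop above walks ccIndex = 1, 2, 3. The
    // payload is 'covered' if v1 and v2 sit at exactly those offsets in this
    // very ccTuple, and slot 3 holds nothing live at the instruction (so the
    // constant c can be copied in); any tuple or offset mismatch bails out
    // with payloadCovered = false.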

    /// Uses the tuple information provided by the coalescing engine in order
    /// to generate an optimized sequence of 'movs' for preparing a payload.
    CVariable* CoalescingEngine::PrepareExplicitPayload(
        CShader* outProgram,
        CEncoder* encoder,
        SIMDMode simdMode,
        const DataLayout* pDL,
        llvm::Instruction* inst,
        int& payloadOffsetInBytes)
    {
        SetCurrentPart(inst, 0);
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
        const uint grfSize = outProgram->GetContext()->platform.getGRFSize();
        const uint numSubVarsPerOperand = numLanes(simdMode) / (grfSize / 4);
        IGC_ASSERT(numSubVarsPerOperand == 1 || numSubVarsPerOperand == 2);
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        int payloadToCCTupleRelativeOffset = 0;
        Value* dummyValPtr = nullptr;
        ccTuple = IsAnyValueCoalescedInCCTuple(inst, numOperands, payloadToCCTupleRelativeOffset, dummyValPtr);
        bool payloadCovered = IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
        bool payloadOffsetComputed = false;
        payloadOffsetInBytes = 0;
        CVariable* payload = nullptr;

        if (payloadToCCTupleRelativeOffset < 0)
        {
            payloadCovered = false;
        }

        //Check for an EOT payload.
        if (payloadCovered) {
            if (IGCLLVM::TerminatorInst * terminator = inst->getParent()->getTerminator())
            {
                if (terminator != &(*terminator->getParent()->begin()))
                {
                    IGC_ASSERT(dyn_cast<GenIntrinsicInst>(inst));
                    IGC_ASSERT(terminator->getPrevNode());
                    if (terminator->getPrevNode() == inst)
                    {
                        //heuristic: this inst is an EOT node. Be very careful about
                        //the payload size; for coalescing this is a hard constraint.
                        if (ccTuple->GetNumElements() > MaxTupleSize)
                        {
                            payloadCovered = false;
                        }
                    }
                }
            }
        }

        if (payloadCovered) {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, index);

                CVariable* dataElement = outProgram->GetSymbol(val);
                //FIXME: need to do additional checks for size
                uint subVar = payloadToCCTupleRelativeOffset + index * numSubVarsPerOperand;
                encoder->SetDstSubVar(subVar);
                payload = outProgram->GetCCTupleToVariableMapping(ccTuple);

                if (touchedValuesSet.count(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                    continue;
                }
                else {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                }
                else
                {
                    if (GetValueCCTupleMapping(val))
                    {
                        if (!payloadOffsetComputed)
                        {
                            payloadOffsetComputed = true;

                            payloadOffsetInBytes = payloadToCCTupleRelativeOffset *
                                GetSingleElementWidth(simdMode, pDL, val);
                        }
                    }
                    else
                    {
                        //this one actually encompasses the case for !getRegRoot(val)
                        encoder->Copy(payload, dataElement);
                        encoder->Push();
                    }
                }//if constant
            }//for
        }
        else
        {
            payload = outProgram->GetNewVariable(numOperands * numLanes(simdMode), ISA_TYPE_F,
                outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD,
                "CEExplicitPayload");

            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
                CVariable* data = outProgram->GetSymbol(val);
                encoder->SetDstSubVar(i * numSubVarsPerOperand);
                encoder->Copy(payload, data);
                encoder->Push();
            }
        }

        return payload;
    }

    CoalescingEngine::ElementNode* CoalescingEngine::ElementNode::getLeader() {
        ElementNode* N = this;
        ElementNode* Parent = parent.getPointer();
        ElementNode* Grandparent = Parent->parent.getPointer();

        while (Parent != Grandparent) {
            N->parent.setPointer(Grandparent);
            N = Grandparent;
            Parent = Parent->parent.getPointer();
            Grandparent = Parent->parent.getPointer();
        }

        return Parent;
    }

    Value* CoalescingEngine::getRegRoot(Value* Val) const {
        auto RI = ValueNodeMap.find(Val);
        if (RI == ValueNodeMap.end())
            return nullptr;
        ElementNode* Node = RI->second;
        if (Node->parent.getInt() & ElementNode::kRegisterIsolatedFlag)
            return nullptr;
        return Node->getLeader()->value;
    }

    Value* CoalescingEngine::getPHIRoot(Instruction* PHI) const {
        IGC_ASSERT(dyn_cast<PHINode>(PHI));
        auto RI = ValueNodeMap.find(PHI);
        IGC_ASSERT(RI != ValueNodeMap.end());
        ElementNode* DestNode = RI->second;
        if (DestNode->parent.getInt() & ElementNode::kPHIIsolatedFlag)
            return nullptr;
        for (unsigned i = 0; i < PHI->getNumOperands(); i++) {
            Value* SrcVal = PHI->getOperand(i);
            if (isa<Instruction>(SrcVal)) { // skip constant sources
                Value* SrcRoot = getRegRoot(SrcVal);
                if (SrcRoot)
                    return SrcRoot;
            }
        }
        return nullptr;
    }
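
    // getLeader/getRegRoot/getPHIRoot above and unionRegs below form a classic
    // union-find (disjoint-set) structure over ElementNodes: getLeader performs
    // path halving, unionRegs unions by rank. Informal example (hypothetical
    // values a, b, c, all present in ValueNodeMap):
    //
    //   unionRegs(a, b);     // equal ranks: b's class now hangs under a,
    //                        // and a's rank becomes 1
    //   unionRegs(a, c);     // c joins the class rooted at a
    //   getRegRoot(b);       // returns a, compressing paths along the way
    //
    // Isolation is encoded in the PointerIntPair flag bits
    // (kRegisterIsolatedFlag / kPHIIsolatedFlag), which getRegRoot and
    // getPHIRoot check before chasing leaders.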

    void CoalescingEngine::unionRegs(Value* Val1, Value* Val2) {
        ElementNode* Node1 = ValueNodeMap[Val1]->getLeader();
        ElementNode* Node2 = ValueNodeMap[Val2]->getLeader();

        if (Node1->rank > Node2->rank) {
            Node2->parent.setPointer(Node1->getLeader());
        }
        else if (Node1->rank < Node2->rank) {
            Node1->parent.setPointer(Node2->getLeader());
        }
        else if (Node1 != Node2) {
            Node2->parent.setPointer(Node1->getLeader());
            Node1->rank++;
        }
    }

    // For now, return true if V is dessa-aliased/InsEltMap-ed/phi-coalesced.
    bool CoalescingEngine::isCoalescedByDeSSA(Value* V) const
    {
        if (m_DeSSA && m_DeSSA->getRootValue(V))
            return true;
        return false;
    }

    void CoalescingEngine::ProcessBlock(llvm::BasicBlock* bb)
    {
        llvm::BasicBlock::InstListType& instructionList = bb->getInstList();
        llvm::BasicBlock::InstListType::iterator I, E;

        //Loop through instructions top to bottom
        for (I = instructionList.begin(), E = instructionList.end(); I != E; ++I) {
            llvm::Instruction& inst = (*I);
            visit(inst);
        }
    }

    bool CoalescingEngine::MatchSingleInstruction(llvm::GenIntrinsicInst* inst)
    {
        GenISAIntrinsic::ID IID = inst->getIntrinsicID();
        if (isSampleInstruction(inst) ||
            isLdInstruction(inst) ||
            isURBWriteIntrinsic(inst) ||
            IID == GenISAIntrinsic::GenISA_RTWrite)
        {
            uint numOperands = inst->getNumOperands();
            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = inst->getOperand(i);
                if (llvm::isa<llvm::Constant>(val))
                {
                    continue;
                }

                if (!ValueNodeMap.count(val))
                {
                    Instruction* DefMI = dyn_cast<Instruction>(val);
                    if (DefMI)
                    {
                        BBProcessingDefs[DefMI->getParent()].push_back(DefMI);
                    }
                    ValueNodeMap[val] = new (Allocator) ElementNode(val);
                }
            }

            //Add the instruction itself to the processing stack as well.
            BBProcessingDefs[inst->getParent()].push_back(inst);
        }

        return true;
    }

    void CoalescingEngine::visitCallInst(llvm::CallInst& I)
    {
        //we do not care if we do not match
        if (GenIntrinsicInst * CI = llvm::dyn_cast<GenIntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case GenISAIntrinsic::GenISA_ROUNDNE:
            case GenISAIntrinsic::GenISA_imulH:
            case GenISAIntrinsic::GenISA_umulH:
            case GenISAIntrinsic::GenISA_uaddc:
            case GenISAIntrinsic::GenISA_usubb:
            case GenISAIntrinsic::GenISA_bfrev:
                break;
            case GenISAIntrinsic::GenISA_intatomicraw:
            case GenISAIntrinsic::GenISA_floatatomicraw:
            case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_dwordatomicstructured:
            case GenISAIntrinsic::GenISA_floatatomicstructured:
            case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_intatomictyped:
            case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
            case GenISAIntrinsic::GenISA_ldstructured:
            case GenISAIntrinsic::GenISA_atomiccounterinc:
            case GenISAIntrinsic::GenISA_atomiccounterpredec:
                break;
            case GenISAIntrinsic::GenISA_GradientX:
            case GenISAIntrinsic::GenISA_GradientY:
            case GenISAIntrinsic::GenISA_GradientXfine:
            case GenISAIntrinsic::GenISA_GradientYfine:
                break;
            default:
                MatchSingleInstruction(CI);
                // no pattern for the rest of the intrinsics
                break;
            }
        }
        else if (IntrinsicInst * CI = llvm::dyn_cast<IntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case Intrinsic::sqrt:
            case Intrinsic::log2:
            case Intrinsic::cos:
            case Intrinsic::sin:
            case Intrinsic::pow:
            case Intrinsic::floor:
            case Intrinsic::ceil:
            case Intrinsic::trunc:
            case Intrinsic::maxnum:
            case Intrinsic::minnum:
                break;
            case Intrinsic::exp2:
                break;

            default:
                break;
            }
        }
    }

    void CoalescingEngine::visitCastInst(llvm::CastInst& I)
    {
    }

    void CoalescingEngine::visitBinaryOperator(llvm::BinaryOperator& I)
    {
    }

    void CoalescingEngine::visitCmpInst(llvm::CmpInst& I)
    {
    }

    void CoalescingEngine::visitPHINode(llvm::PHINode& I)
    {
    }

    void CoalescingEngine::visitUnaryInstruction(llvm::UnaryInstruction& I)
    {
    }

    void CoalescingEngine::visitSelectInst(llvm::SelectInst& I)
    {
    }

    void CoalescingEngine::visitBitCastInst(llvm::BitCastInst& I)
    {
    }

    void CoalescingEngine::visitInstruction(llvm::Instruction& I)
    {
    }

    void CoalescingEngine::visitLoadInst(LoadInst& I)
    {
    }

    void CoalescingEngine::visitStoreInst(StoreInst& I)
    {
    }

} //namespace IGC