/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//===-------- CoalescingEngine.cpp - Pass and analysis for coalescing -------===//
//
// Intel Extension to LLVM core
//===----------------------------------------------------------------------===//
//
// This pass implements the coalescing idea that originated in USC, but
// re-designed to work on (and take advantage of) SSA form.
//
//===----------------------------------------------------------------------===//

#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/CISACodeGen/helper.h"
#include "PayloadMapping.hpp"
#include "common/debug/Debug.hpp"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/InstVisitor.h>
#include <llvm/ADT/DepthFirstIterator.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::Debug;
using namespace IGC::IGCMD;

char CoalescingEngine::ID = 0;
#define PASS_FLAG "CoalescingEngine"
#define PASS_DESCRIPTION "coalesce moves coming from payloads, insert and extract element"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenPatternMatch)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DeSSA)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

namespace IGC
{
    CoalescingEngine::CoalescingEngine() : FunctionPass(ID)
    {
        initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
    }

    void CoalescingEngine::getAnalysisUsage(llvm::AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<CodeGenContextWrapper>();
        AU.addRequired<WIAnalysis>();
        AU.addRequired<LiveVarsAnalysis>();
        AU.addRequired<CodeGenPatternMatch>();
        AU.addRequired<DominatorTreeWrapperPass>();
        AU.addRequired<DeSSA>();
        AU.addRequired<MetaDataUtilsWrapper>();
    }

    void CoalescingEngine::CCTuple::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple";
        auto IT = OffsetToCCMap.begin();
        while (IT != OffsetToCCMap.end())
        {
            OS << IT->first << " : ";
            IT++;
        }
    }

    void CoalescingEngine::CCTuple::dump() const
    {
        print(ods());
    }

    void CoalescingEngine::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple list: \n";
        for (auto itr = m_CCTupleList.begin(), iend = m_CCTupleList.end(); itr != iend; ++itr)
        {
            CCTuple* ccTuple = *itr;
            ccTuple->print(OS);
        }
        OS << "CC Tuple list end. \n";
    }

    void CoalescingEngine::dump() const
    {
        print(ods());
    }

    /// \brief Entry point.
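    /// Walks all basic blocks to collect candidate tuple-generating instructions
    /// (ProcessBlock / MatchSingleInstruction), then visits the dominator tree in
    /// depth-first order and coalesces the collected definitions per block
    /// (IncrementalCoalesce). The pass only builds analysis structures and never
    /// mutates the IR, so it always returns false.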
    bool CoalescingEngine::runOnFunction(Function& MF)
    {
        if (IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing))
        {
            return false;
        }

        MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        m_ModuleMetadata = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        m_CodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        if (!isEntryFunc(pMdUtils, &MF))
        {
            return false;
        }
        auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        m_Platform = pCtx->platform;
        m_PayloadMapping = PayloadMapping(pCtx);

        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
        WIA = &getAnalysis<WIAnalysis>();
        CG = &getAnalysis<CodeGenPatternMatch>();
        LV = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
        if (IGC_IS_FLAG_DISABLED(DisableDeSSA))
        {
            m_DeSSA = &getAnalysis<DeSSA>();
        }
        else
        {
            m_DeSSA = nullptr;
        }

        for (Function::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
        {
            ProcessBlock(&(*I));
        }

        for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
            DE = df_end(DT->getRootNode()); DI != DE; ++DI)
        {
            IncrementalCoalesce(DI->getBlock());
        }

        return false;
    }

    /// Coalesce instructions within a single BB.
    void CoalescingEngine::IncrementalCoalesce(BasicBlock* MBB)
    {
        std::vector<Instruction*>& DefInstrs = BBProcessingDefs[MBB];
        std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LV));

        for (std::vector<Instruction*>::const_iterator BBI = DefInstrs.begin(),
            BBE = DefInstrs.end(); BBI != BBE; ++BBI)
        {
            Instruction* DefMI = *BBI;

            auto RI = ValueNodeMap.find((Value*)DefMI);
            if (RI == ValueNodeMap.end())
            {
                //Intrinsics themselves are not mapped to a node, they are
                //tuple generators.

                //NOTE: Please leave this commented code snippet! It is useful when debugging coalescing regressions:
                // (used to isolate coalescing to a particular shader to check failing/passing using binary search)
                //DWORD hash = (DWORD) static_cast(MBB->getParent()->getContext()).hash.getAsmHash();
                ////if (hash >= 0xA27Affff && hash <= 0xA3CFFF00)
                //if (hash >= 0xA3C00000 && hash <= 0xA3CFFFFF)
                //{
                //    continue;
                //}

                if (GenIntrinsicInst* intrinsic = llvm::dyn_cast<GenIntrinsicInst>(DefMI))
                {
                    GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
                    if ((isURBWriteIntrinsic(intrinsic) && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_URB))) ||
                        (IID == GenISAIntrinsic::GenISA_RTWrite && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT))))
                    {
                        ProcessTuple(DefMI);
                    }
                }

                if (isSampleInstruction(DefMI) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_Sample))
                {
                    ProcessTuple(DefMI);
                }
                else if (isLdInstruction(DefMI))
                {
                    ProcessTuple(DefMI);
                }
            }
        }
    }

    static bool IsBindlessSampler(Instruction* inst)
    {
        if (SampleIntrinsic* sample = dyn_cast<SampleIntrinsic>(inst))
        {
            Value* sampler = sample->getSamplerValue();
            uint bufferIndex = 0;
            bool directIndexing = false;
            if (sampler->getType()->isPointerTy())
            {
                BufferType bufType = DecodeAS4GFXResource(
                    sampler->getType()->getPointerAddressSpace(),
                    directIndexing,
                    bufferIndex);
                if (bufType == BINDLESS_SAMPLER)
                {
                    return true;
                }
            }
        }
        return false;
    }

    /// Core algorithm.
    void CoalescingEngine::ProcessTuple(Instruction* tupleGeneratingInstruction)
    {
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        // A bindless sampler always has a header.
        bool needsHeader = IsBindlessSampler(tupleGeneratingInstruction);

        //Early exit in the case where split-send is available and this instruction
        //is sure to generate a payload of at most two elements.
        if (m_Platform.supportSplitSend() && !needsHeader)
        {
            if (numOperands < 3)
            {
                return;
            }
        }
        else
        {
            if (numOperands < 2)
            {
                return;
            }
        }

        IGC_ASSERT(numOperands >= 2);

        //No result, but has the side effect of updating the split mapping.
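        //Illustrative sketch (hypothetical 4-element payload on a split-send platform):
        //  payload: v0 v1 v2 v3, where v0 is an immediate constant.
        //  DecideSplit may record a split point of 1, so part 0 = {v0} and
        //  part 1 = {v1 v2 v3}; each part is then processed below as its own tuple.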
        DecideSplit(tupleGeneratingInstruction);
        uint numSplits = GetNumSplitParts(tupleGeneratingInstruction);
        for (uint numPart = 0; numPart < numSplits; numPart++)
        {
            SetCurrentPart(tupleGeneratingInstruction, numPart);
            const uint numOperands = GetNumPayloadElements(tupleGeneratingInstruction);
            bool isAnyNodeAnchored = false, isAnyNodeCoalescable = false;
            SmallPtrSet<CCTuple*, 8> touchedTuplesSet;
            SmallVector<CCTuple*, 8> touchedTuples;

            //Step: Prepare.
            PrepareTuple(
                numOperands,
                tupleGeneratingInstruction,
                //out:
                touchedTuplesSet,
                touchedTuples,
                isAnyNodeAnchored,
                isAnyNodeCoalescable);

            if (!isAnyNodeCoalescable)
            {
                continue;
            }

            //At this point there is at least one node that is a candidate for coalescing.
            //It might already be part of a CCtuple, or it might still be unconstrained.

            //A tuple generating instruction enforces a sequential order of registers allocated to values.
            //Each value might belong to (up to) one CCtuple.
            //If it belongs to a CCtuple, it has a unique offset in that CCtuple.

            //Three cases are considered:
            //1) No value has a CCtuple assigned -> create a new CCTuple and assign offsets.
            //   a) if the platform supports split-send, then some (arbitrary) split point should be determined
            //2) At least one value belongs to a CCtuple (we say it is 'anchored' (constrained) to that tuple)
            //   and all values belong to the same CCtuple.
            //3) More than one value belongs to a tuple, and they belong to different (>1) tuples: NOT implemented.

            //Case 1) : create a new CCTuple
            if (!isAnyNodeAnchored)
            {
                CreateTuple(numOperands, tupleGeneratingInstruction);
            }
            else
            {
                //Case 2) : at least one value is 'anchored' (constrained) to a CCtuple.
                //Sub-case: all arguments belong to the same tuple.
                if (touchedTuples.size() == 1)
                {
                    // Step I: find an anchoring element.
                    // If arguments of this tuple-instruction are already constrained by a CCtuple, it means
                    // that they also constrain the tuple that would be generated by this payload.
                    // They 'anchor' the new tuple, so pick the element that anchors the tuple and
                    // determine its offset.
                    //E.g. :
                    //        .. v1 (0) v2 (1)   CCtuple
                    // v0 (0) v1 (1) v2 (2)      (this payload)
                    // v1 will have: thisTupleStartOffset = 1, rootTupleStartOffset = 0
                    // CCtuple will be 'grown' with element v0, which is at relative offset = -1
                    // v0(-1) v1 (0) v2 (1)      CCtuple
                    CCTuple* ccTuple = NULL;
                    int rootTupleStartOffset = 0;
                    int thisTupleStartOffset = 0;
                    DetermineAnchor(
                        numOperands,
                        tupleGeneratingInstruction,
                        //out:
                        ccTuple,
                        rootTupleStartOffset,
                        thisTupleStartOffset);

                    IGC_ASSERT(ccTuple);

                    /* Step II: Interference checking */
                    int offsetDiff = rootTupleStartOffset - thisTupleStartOffset;
                    bool interferes = false;

                    //Heuristic: we do not want to grow the ccTuple size too much
                    //               [c0  c1  c2  c3  c4  c5  c6  c7 ]
                    //(offsetDiff=4) [e0' e1' e2' e3' e4' e5' e6' e7']
                    // here, tuple numElements = 8, and offsetDiff = 4, so it would result in a
                    // 12 element CC tuple if joined together. We want to prevent growing
                    // more than MaxTupleSize elements.
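                    // E.g. with the 8-element tuple sketched above and offsetDiff = 4, the joined
                    // tuple would need 12 slots, so it is treated as interfering and a fresh
                    // tuple is created further below instead.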
                    if (abs(offsetDiff) > 0 && ccTuple->GetNumElements() >= MaxTupleSize)
                    {
                        interferes = true;
                    }

                    ProcessInterferencesElementFunctor interferencesFunctor(
                        false /*forceEviction */,
                        tupleGeneratingInstruction,
                        offsetDiff,
                        ccTuple,
                        this);

                    if (!interferes)
                    {
                        interferes = InterferenceCheck(
                            numOperands,
                            tupleGeneratingInstruction,
                            offsetDiff,
                            ccTuple,
                            //out:
                            &interferencesFunctor);

                        if (!interferes &&
                            (ccTuple->HasNonHomogeneousElements() ||
                             m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction)))
                        {
                            interferes = CheckIntersectionForNonHomogeneous(
                                numOperands,
                                tupleGeneratingInstruction,
                                offsetDiff,
                                ccTuple);
                        }
                    }

                    //If 'interferes' is false, it means that the whole transaction can be committed,
                    //provided that elements in dominatorsForDisplacement are displaced, and other nodes are attached.
                    //If 'interferes' is true, then no element will be attached to the ccTuple.
                    if (!interferes)
                    {
                        SmallPtrSet<Value*, 8> touchedValuesSet;
                        for (uint i = 0; i < numOperands; i++)
                        {
                            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
                            ccTuple->CheckIndexForNonInitializedSlot(i + offsetDiff);

                            if (touchedValuesSet.count(val))
                            {
                                continue;
                            }
                            else
                            {
                                touchedValuesSet.insert(val);
                            }

                            if (IsValConstOrIsolated(val))
                            {
                                //do nothing
                            }
                            else
                            {
                                Value* RootV = getRegRoot(val);
                                if (!RootV) //bail out if isolated
                                    continue;

                                //We do not expect to get an Argument here, it must have been isolated.
                                IGC_ASSERT(dyn_cast<Instruction>(val));
                                IGC_ASSERT(ValueNodeMap.count(val));
                                //auto RI = ValueNodeMap.find(val);
                                //ElementNode *Node = RI->second;
                                ElementNode* Node = ValueNodeMap[val];

                                if (!interferencesFunctor.GetComputedValuesForIsolation().count(val))
                                {
                                    ccTuple->AttachNodeAtIndex(i + offsetDiff, Node, *this);
                                    IGC_ASSERT(getRegRoot(val));
                                }
                            }
                        }//for
                    }
                    else // (interferes) is true
                    {
                        CreateTuple(numOperands, tupleGeneratingInstruction);
                    }
                }
                else
                {
                    //Case 3)
                    //More than one value belongs to a tuple, and they belong to different (>1) tuples.
                    //FIXME: not implemented yet.
                    //Need to implement code for joining two non-overlapping tuples.
                    //This probably does not bring much benefit, but is nevertheless complex to implement.
                    //Having two congruence class tuples, we need to find whether the piecewise congruence classes
                    //of the tuples interfere. Here a 'two-set interference check' due to Boissinot could be
                    //used (first sorting the elements).
                }
            }
        }
    }

    ///
    bool CoalescingEngine::CheckIntersectionForNonHomogeneous(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        const int offsetDiff,
        CCTuple* ccTuple)
    {
        //Picturesque description:
        //     Hi - homogeneous slots
        //     [H_0 H_1 .. H_n]
        //ccTuple:
        //     [H_0 H_1 .. H_n]
        //
        //1) this tuple instruction contains a non-homogeneous part:
        //     [H_0' H_1' .. H_n']
        //2) or this tuple instruction does not contain a non-homogeneous part:
        //     [H_0' H_1' .. H_n']
        bool interferes = false;
        if (ccTuple->HasNonHomogeneousElements())
        {
            // Finding a supremum instruction for a homogeneous part is implemented
            // only for render target write instructions (RTWritIntrinsic).
            // The only other possible combination for non-homogeneous instructions comprises
            // RTWritIntrinsic and RTDualBlendSourceIntrinsic.
            // In some cases there is an opportunity to coalesce their payloads, but there is
            // a danger that they are compiled in different SIMD modes, so the safe assumption is
            // that they cannot be coalesced.
            // A comparison of the payloads of RTWritIntrinsic and RTDualBlendSourceIntrinsic:
            // +----------------------------+----------------------------+
            // |       RTWritIntrinsic      | RTDualBlendSourceIntrinsic |
            // +----------------------------+----------------------------+
            // | src0 Alpha (optional)      | src0 Alpha (unavailable)*  |
            // +----------------------------+----------------------------+
            // | output mask (optional)     | output mask (optional)     |
            // +----------------------------+----------------------------+
            // | -                          | R0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | G0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | B0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | A0 channel                 |
            // +----------------------------+----------------------------+
            // | R0 channel                 | R1 channel                 |
            // +----------------------------+----------------------------+
            // | G0 channel                 | G1 channel                 |
            // +----------------------------+----------------------------+
            // | B0 channel                 | B1 channel                 |
            // +----------------------------+----------------------------+
            // | A0 channel                 | A1 channel                 |
            // +----------------------------+----------------------------+
            // | Depth (optional)           | Depth (optional)           |
            // +----------------------------+----------------------------+
            // | Stencil (optional)         | Stencil (optional)         |
            // +----------------------------+----------------------------+
            // All optional fields except for src0 Alpha are consistent for both instructions when called in a fragment shader.
            // * RTDualBlendSourceIntrinsic doesn't have such an argument, but the field is defined in its payload.
            if (offsetDiff == 0 &&
                ccTuple->GetNumElements() == numOperands &&
                llvm::isa<RTWritIntrinsic>(ccTuple->GetRoot()) &&
                llvm::isa<RTWritIntrinsic>(tupleGeneratingInstruction))
            {
                if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
                {
                    //TODO: test whether a 'supremum' of the TGI and the ccTuple root element can be obtained
                    const Instruction* supremum = m_PayloadMapping.GetSupremumOfNonHomogeneousPart(
                        tupleGeneratingInstruction,
                        ccTuple->GetRoot());
                    if (supremum)
                    {
                    }
                    else
                    {
                        interferes = true;
                    }
                }
            }
            else
            {
                //Since the ccTuple contains non-homogeneous elements, and this homogeneous
                //part is not perfectly aligned to the ccTuple, we cannot reason about
                //interferences safely, thus assume it interferes.
                interferes = true;
            }
        }
        else
        {
            //ccTuple:
            //     [H_0 H_1 .. H_n]
            //
            //1) this tuple instruction contains a non-homogeneous part:
            //     [H_0' H_1' .. H_n']
            if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands)
                {
                    ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
                }
                else
                {
                    interferes = true;
                }
            }
        }

        return interferes;
    }

    void CoalescingEngine::DecideSplit(Instruction* tupleGeneratingInstruction)
    {
        if (m_PayloadMapping.DoPeelFirstElement(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = 1;
            return;
        }

        uint splitPoint = 0;
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        //No sense entering here if we have less than 2 operands.
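        //A split point is only recorded when the first or the last payload element is
        //a constant/isolated value and the mapping allows splitting (see below); peeling
        //that element off lets the remaining contiguous elements still form one tuple.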
        IGC_ASSERT(numOperands >= 2);

        Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, 0);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = 1;
        }

        val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, numOperands - 1);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = numOperands - 1;
        }

        if (splitPoint > 0 && m_PayloadMapping.DoesAllowSplit(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = splitPoint;
        }
    }

    /// Determines if any value that is an argument of a (possible) tuple generating instruction
    /// can participate in payload coalescing. If so, it also determines if there are any
    /// 'anchored' (constrained) values and the number of constraining tuples.
    void CoalescingEngine::PrepareTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        SmallPtrSet<CCTuple*, 8>& touchedTuplesSet,
        SmallVector<CCTuple*, 8>& touchedTuples,
        bool& isAnyNodeAnchored,
        bool& isAnyNodeCoalescable)
    {
        uint numConstants = 0;

        //Step: Prepare.
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val) || !getRegRoot(val))
            {
                numConstants++;
                continue;
            }

            Value* RootV = getRegRoot(val);
            IGC_ASSERT(RootV);
            {
                isAnyNodeCoalescable = true;

                IGC_ASSERT(ValueNodeMap.count(RootV));
                auto RI = ValueNodeMap.find(RootV);
                ElementNode* Node = RI->second;

                auto CCI = NodeCCTupleMap.find(Node);
                if (CCI != NodeCCTupleMap.end())
                {
                    CCTuple* ccTuple = NodeCCTupleMap[Node];
                    if (!touchedTuplesSet.count(ccTuple))
                    {
                        touchedTuplesSet.insert(ccTuple);
                        touchedTuples.push_back(ccTuple);
                    }

                    isAnyNodeAnchored = true;
                }

                // Do not coalesce values that have an excessive number of uses.
                // This often introduces many anti-dependencies.
                const unsigned MAX_USE_COUNT = 20;
                if (val->getNumUses() >= MAX_USE_COUNT)
                {
                    isAnyNodeAnchored = true;
                }
            }
        }

        //Heuristic:
        //We do not want to create 'wide' root variables where most of the slots
        //will be non-coalesced anyway (due to immediate constants/isolated variables).
        //vISA will not benefit, while this increases register pressure and makes coloring
        //with the round-robin heuristic harder.
        if (numOperands > 2 && numConstants > (numOperands / 2))
        {
            if (!m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                //force non-coalescing:
                isAnyNodeCoalescable = false;
            }
        }
    }

    /// Creates a fresh tuple.
    /// A fresh tuple consists of nodes which do not have any tuple assigned yet.
    /// Precondition: at least one value is coalescable (otherwise it does not make sense).
    void CoalescingEngine::CreateTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction)
    {
        CCTuple* ccTuple = new CCTuple();
        m_CCTupleList.push_back(ccTuple);

        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val))
            {
                //It is important to initialize the CC with empty slots, in order to get the proper
                //size of the vISA variable that will back the CCtuple that corresponds to the payload.
                ccTuple->ResizeBounds(i);
                continue;
            }

            Value* RootV = getRegRoot(val);
            if (!RootV)
            {
                //isolated
                ccTuple->ResizeBounds(i);
                continue;
            }

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI == NodeCCTupleMap.end())
            {
                NodeCCTupleMap[RootNode] = ccTuple;
                NodeOffsetMap[RootNode] = i;
                ccTuple->InitializeIndexWithCCRoot(i, RootNode);
                CurrentDominatingParent[RootV] = RootV;
                ImmediateDominatingParent[RootV] = NULL;
            }
            else
            {
                //This must have been a value that is copied to the payload multiple times.
                //OR: the new CCtuple has been isolated, and this is an element that
                //belongs to another tuple.
                //Since this value belongs to another tuple, we should not increase the
                //current tuple's size, no?
                //ccTuple->ResizeBounds(i);
            }
        } // loop over arguments

        if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
        {
            ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
        }
    }

    /// Given a tuple generating instruction, picks out an 'anchored' (constrained)
    /// value and determines the relative offsets.
    /// output:
    /// ccTuple              - the tuple that is constraining the picked value
    /// thisTupleStartOffset - the offset of the value in the current tuple generating
    ///                        instruction
    /// rootTupleStartOffset - the offset of the value in the constraining tuple
    /// Assumption: it has already been checked that all values belong to exactly one tuple!
    /// (otherwise the result might pick an arbitrary tuple)
    void CoalescingEngine::DetermineAnchor(
        const uint numOperands,
        const Instruction* tupleGeneratingInstruction,
        CCTuple*& ccTuple,
        int& rootTupleStartOffset,
        int& thisTupleStartOffset)
    {
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
            if (IsValConstOrIsolated(val))
            {
                continue;
            }

            Value* RootV = getRegRoot(val);
            if (!RootV)
                continue;

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI != NodeCCTupleMap.end())
            {
                //Element is constrained by a ccTuple.
                IGC_ASSERT(NodeOffsetMap.count(RootNode));
                rootTupleStartOffset = NodeOffsetMap[RootNode];
                thisTupleStartOffset = i;
                ccTuple = NodeCCTupleMap[RootNode];

                //Just a sanity check:
                IGC_ASSERT(RootNode == ccTuple->GetCCNodeWithIndex(NodeOffsetMap[RootNode]));
                break;
            }
        }//for
    }

    /// Prepares the slot for moves (e.g. induced by const, isolated, or uniform-to-vector values)
    /// into the payload by evicting any value that currently occupies this slot.
    /// The assumption is that the heuristic has determined it is profitable to do so.
    /// input: bool evictFullCongruenceClass - tells whether the full congruence class
    /// should be evicted, or only the element that occupies the tuple instruction
    /// for the lifetime of a 'mov' (this is sufficient for issuing a mov to the payload slot).
    void CoalescingEngine::PrepareInsertionSlot(
        CCTuple* ccTuple,
        const int index,
        Instruction* inst,
        const bool evictFullCongruenceClass)
    {
        ElementNode* RootNode = ccTuple->OffsetToCCMap[index];
        IGC_ASSERT(RootNode);

        if (evictFullCongruenceClass)
        {
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);
            while (NewParent)
            {
                if (getRegRoot(NewParent))
                {
                    isolateReg(NewParent);
                }
                NewParent = ImmediateDominatingParent[NewParent];
            }

            //It might turn out that rootNode does not dominate 'inst'
            //since it is in another branch of the DT,
            //so do not forget to delete it as well.
            if (getRegRoot(RootNode->value))
            {
                isolateReg(RootNode->value);
            }
        }
        else
        {
            //Evict the dominating parent from the CC.
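            //Only the member of the congruence class that currently dominates 'inst' is
            //isolated here; that is enough to keep the slot free for the lifetime of the
            //copy that will be emitted into it.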
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);
            if (NewParent)
                isolateReg(NewParent);
        }
    }

    bool CoalescingEngine::IsInsertionSlotAvailable(
        CCTuple* ccTuple,
        const int index,
        Instruction* tupleGeneratingInstruction,
        const bool canExtend)
    {
        bool available = true;

        //ccTuple->OffsetToCCMap.count(index) == 0
        if (!canExtend)
        {
            const int tupleNumElements = ccTuple->GetNumElements();
            if (!(0 <= index && index < tupleNumElements))
            {
                return false;
            }
        }

        ElementNode* nodeToCheck = ccTuple->GetCCNodeWithIndex(index);
        if (nodeToCheck == NULL)
        {
            //tuple   : X   CC1 CC2
            //payload : v1  v2
            //Means: the tuple has no live interval (X) that is live at the point the payload is
            //defined. It means that the given 'register slot' can be used to issue a 'mov'
            //with an immediate (or isolated) value.
        }
        else
        {
            //tuple   : CC0 CC1 CC2
            //           .... |      <- lifetime
            //payload : v1  v2
            //           ..... |     <- lifetime
            //Here we need to check whether the value in the CC is live at the tuple
            //generating instruction and thus intersects with the desired slot.

            //Sanity check: the tuple contains the root element.
            //IGC_ASSERT(getRegRoot(nodeToCheck->value) == nodeToCheck->value);
            Value* RootV = nodeToCheck->value;
            Value* NewParent = GetActualDominatingParent(RootV, tupleGeneratingInstruction);
            //FIXME: what if we do not have pure SSA form? Look at the de-ssa comment.
            if (NewParent && LV->isLiveAt(NewParent, tupleGeneratingInstruction))
            {
                available = false;
                //interferes = true;
                //break;
            }
        }

        return available;
    }

    /// Processes the tuple elements in sequential order.
    /// A function object is passed as an argument, which responds to
    /// specific events (whether a given payload element is const/isolated/copied etc.).
    /// It is meant to be used as a template method in various contexts where different
    /// functors can be passed, but the walking mechanism is the same.
    void CoalescingEngine::ProcessElements(
        const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ElementFunctor* functor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;
        for (uint i = 0; i < numOperands; i++)
        {
            functor->SetIndex(i);

            Value* val = GetPayloadElementToValueMapping(tupleInst, i);
            if (touchedValuesSet.count(val))
            {
                if (functor->visitCopy())
                    continue;
                else
                    break;
            }
            else
            {
                touchedValuesSet.insert(val);
            }

            //Case 1: Constant
            if (llvm::isa<Constant>(val))
            {
                if (functor->visitConstant())
                    continue;
                else
                    break;
            }
            else //Case 2: Variable
            {
                if (IsValIsolated(val) || !getRegRoot(val))
                {
                    //If the value is isolated, a copy will be necessary.
                    if (functor->visitIsolated())
                        continue;
                    else
                        break;
                }

                IGC_ASSERT(ValueNodeMap.count(val));
                ElementNode* Node = ValueNodeMap[val];
                ElementNode* ccRootNode = ccTuple->GetCCNodeWithIndex(i + offsetDiff);

                //Interferes if and only if: live at this definition point AND has a different value.
                if (ccRootNode == Node)
                {
                    //No interference, since it is the same value. We have a reuse of the same value in
                    //a different tuple.
                    if (functor->visitAnchored())
                        continue;
                    else
                        break;
                }
                else
                {
                    IGC_ASSERT(llvm::isa<Instruction>(val));
                    //Here comes the meat, and the most interesting case:
                    if (ccRootNode != NULL)
                    {
                        Value* CCRootV = ccRootNode->value;
                        Value* dominating = nullptr;
                        Value* dominated = nullptr;
                        SymmetricInterferenceTest(
                            CCRootV,
                            dyn_cast<Instruction>(val),
                            dominating,
                            dominated);

                        //Case 1):
                        // (dominating == null)
                        //
                        //!     (value)
                        //
                        // CC dominance tree:
                        //! v67 <- (dominated)
                        //   ^
                        //   |
                        //! v190
                        //
                        //! (tupleInst)
                        // The value has no dominator inside the CC class, but dominates elements in the CC class,
                        // and its lifetime spans until (but not including) tupleInst.
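                        //In each of the three cases below the element is reported through
                        //functor->visitInterfering(), and the functor's return value decides
                        //whether the walk continues with the next element or aborts the tuple.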
                        if (dominating == nullptr && dominated != nullptr)
                        {
                            //IGC_ASSERT(LV->isLiveAt(val, tupleInst));
                            //Interference check: value is live at dominated
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated != nullptr)
                        {
                            //Case 2):
                            // CC dominance tree
                            //! v67
                            //   ^
                            //   |
                            //! v121 (dominating)
                            //   ^
                            //!  |  <----- (value)
                            //   |
                            //! v190 <- (dominated)
                            //TODO: it would be possible to 'fit' the value between v121 and v190 in some
                            // cases, but this would require a redesign of the 'dominance forest' logic
                            // (regarding the current dominating and immediate dominator handling).
                            // For now, just assume there is an interference.
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated == nullptr)
                        {
                            //Case 3):
                            // CC dominance tree
                            //! v121
                            //   ^
                            //   |
                            //! v190 <- (dominating)
                            //   |
                            //!  |  ----- (value)
                            //   |
                            //   |  (dominated == null)
                            if (LV->isLiveAt(dominating, dyn_cast<Instruction>(val)))
                            {
                                if (functor->visitInterfering(val, false))
                                    continue;
                                else
                                    break;
                            }
                        }
                        else
                        {
                            //It is possible that the ccRootNode is not empty, but all
                            //elements that belong to it have been isolated in previously
                            //dominating tuples.
                            //IGC_ASSERT(0);
                        }
                    }
                    else
                    {
                        //ccRootNode == NULL -> congruence class slot not occupied yet
                        if (functor->visitPackedNonInterfering(val))
                            continue;
                        else
                            break;
                    }
                } //if( ccRootNode == Node )
            } //if( llvm::isa<Constant>( val ) )
        } //for loop
    }

    /// Here we have two major differences from the implementation in de-ssa:
    /// 1) Transactional character: even if one 'element' of the CC-tuple does not interfere,
    ///    some other element in the CC-tuple might be interfering, rendering the whole group
    ///    non-'coalescable'.
    /// 2) Values in payload coalescing are by default isolated, so if there is an interference,
    ///    do nothing; there is no need for explicit isolation.
    bool CoalescingEngine::InterferenceCheck(
        const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ProcessInterferencesElementFunctor* interferencesFunctor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;

        GatherWeightElementFunctor gatherFunctor;
        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, &gatherFunctor);
        bool forceEviction =
            (gatherFunctor.GetNumInsertionSlotsRequired() + gatherFunctor.GetNumNeedsDisplacement() <=
             gatherFunctor.GetNumAlignedAnchors());

        interferencesFunctor->SetForceEviction(forceEviction);
        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, interferencesFunctor);

        return interferencesFunctor->IsInterfering();
    }

    /// Given an instruction and numOperands (corresponding to the number of coalesce-partaking
    /// payload elements), return whether any 'value' that is an argument of the intrinsic
    /// takes part in a coalescing CC tuple. If yes, return a non-null pointer to
    /// the representative tuple for this coalescing.
    /// output: zeroBasedPayloadElementOffset - the offset of this instruction's payload
    /// relative to the containing ccTuple - this is necessary to correctly slice
    /// the relevant part of the payload from the ccTuple (e.g. for preparing a payload for URB writes)
    ///          offset [payload]
    /// [ccTuple ..............]
    /// note: obviously, if the return value is null, then the meaning of zeroBasedPayloadElementOffset
    /// is undefined
    CoalescingEngine::CCTuple* CoalescingEngine::IsAnyValueCoalescedInCCTuple(
        llvm::Instruction* inst,
        const uint numOperands,
        int& zeroBasedPayloadElementOffset,
        Value*& outVal)
    {
        outVal = nullptr;
        zeroBasedPayloadElementOffset = 0; //provide some meaningful default
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        uint index = 0;
        while (index < numOperands)
        {
            Value* val = GetPayloadElementToValueMapping(inst, index);

            if (!isa<Constant>(val) && GetValueCCTupleMapping(val))
            {
                ccTuple = GetValueCCTupleMapping(val);

                Value* valRoot = getRegRoot(val);
                IGC_ASSERT(valRoot);
                ElementNode* Node = ValueNodeMap[valRoot];
                int offset = NodeOffsetMap[Node];
                zeroBasedPayloadElementOffset = (offset - ccTuple->GetLeftBound()) - index;
                outVal = val;
                return ccTuple;
            }
            index++;
        }
        return ccTuple;
    }

    /// Return true if the payload tuple might be completed (either by directly
    /// aliasing or by inserting an appropriate copy instruction).
    bool CoalescingEngine::IsPayloadCovered(
        llvm::Instruction* inst,
        CCTuple* ccTuple,
        const uint numOperands,
        const int payloadToCCTupleRelativeOffset)
    {
        uint relativeOffset = 0;
        bool payloadCovered = false;

        if (ccTuple)
        {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            //index = 0;
            payloadCovered = true;
            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = GetPayloadElementToValueMapping(inst, index);
                const int ccIndex = index + payloadToCCTupleRelativeOffset;

                if (touchedValuesSet.count(val))
                {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false))
                    {
                        payloadCovered = false;
                        break;
                    }
                    continue;
                }
                else
                {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val))
                {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false))
                    {
                        payloadCovered = false;
                        break;
                    }
                }
                else
                {
                    if (CoalescingEngine::CCTuple* thisCCTuple = GetValueCCTupleMapping(val))
                    {
                        if (thisCCTuple == ccTuple)
                        {
                            relativeOffset = GetValueOffsetInCCTuple(val);
                            if (relativeOffset != (ccIndex))
                            {
                                payloadCovered = false;
                                break;
                            }
                        }
                        else
                        {
                            //different cc tuple
                            payloadCovered = false;
                            break;
                        }
                    }
                    else
                    {
                        if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false))
                        {
                            payloadCovered = false;
                            break;
                        }
                    }
                } //if constant
            } //for
        } //if cctuple

        return payloadCovered;
    }

    /// Uses the tuple information provided by the coalescing engine in order
    /// to generate an optimized sequence of 'movs' for preparing a payload.
    CVariable* CoalescingEngine::PrepareExplicitPayload(
        CShader* outProgram,
        CEncoder* encoder,
        SIMDMode simdMode,
        const DataLayout* pDL,
        llvm::Instruction* inst,
        int& payloadOffsetInBytes)
    {
        SetCurrentPart(inst, 0);
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
        const uint grfSize = outProgram->GetContext()->platform.getGRFSize();
        const uint numSubVarsPerOperand = numLanes(simdMode) / (grfSize / 4);
        IGC_ASSERT(numSubVarsPerOperand == 1 || numSubVarsPerOperand == 2);

        CoalescingEngine::CCTuple* ccTuple = nullptr;
        int payloadToCCTupleRelativeOffset = 0;
        Value* dummyValPtr = nullptr;

        ccTuple = IsAnyValueCoalescedInCCTuple(inst, numOperands, payloadToCCTupleRelativeOffset, dummyValPtr);
        bool payloadCovered = IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
        bool payloadOffsetComputed = false;
        payloadOffsetInBytes = 0;
        CVariable* payload = nullptr;

        if (payloadToCCTupleRelativeOffset < 0)
        {
            payloadCovered = false;
        }

        //Check for an EOT payload.
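        //If this instruction is the last one before the block terminator, it is treated as a
        //potential EOT (end-of-thread) send, so an oversized coalesced tuple is rejected
        //below and a fresh payload variable is built instead.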
        if (payloadCovered)
        {
            if (IGCLLVM::TerminatorInst* terminator = inst->getParent()->getTerminator())
            {
                if (terminator != &(*terminator->getParent()->begin()))
                {
                    IGC_ASSERT(dyn_cast<GenIntrinsicInst>(inst));
                    IGC_ASSERT(terminator->getPrevNode());
                    if (terminator->getPrevNode() == inst)
                    {
                        //Heuristic:
                        //this inst is an EOT node. Be very careful about the payload size:
                        //coalescing: hard constraint
                        if (ccTuple->GetNumElements() > MaxTupleSize)
                        {
                            payloadCovered = false;
                        }
                    }
                }
            }
        }

        if (payloadCovered)
        {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, index);
                CVariable* dataElement = outProgram->GetSymbol(val);
                //FIXME: need to do additional checks for size
                uint subVar = payloadToCCTupleRelativeOffset + index * numSubVarsPerOperand;
                encoder->SetDstSubVar(subVar);
                payload = outProgram->GetCCTupleToVariableMapping(ccTuple);

                if (touchedValuesSet.count(val))
                {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                    continue;
                }
                else
                {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val))
                {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                }
                else
                {
                    if (GetValueCCTupleMapping(val))
                    {
                        if (!payloadOffsetComputed)
                        {
                            payloadOffsetComputed = true;
                            payloadOffsetInBytes = payloadToCCTupleRelativeOffset *
                                GetSingleElementWidth(simdMode, pDL, val);
                        }
                    }
                    else
                    {
                        //this one actually encompasses the case for !getRegRoot(val)
                        encoder->Copy(payload, dataElement);
                        encoder->Push();
                    }
                } //if constant
            } //for
        }
        else
        {
            payload = outProgram->GetNewVariable(
                numOperands * numLanes(simdMode),
                ISA_TYPE_F,
                outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD,
                "CEExplicitPayload");

            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
                CVariable* data = outProgram->GetSymbol(val);
                encoder->SetDstSubVar(i * numSubVarsPerOperand);
                encoder->Copy(payload, data);
                encoder->Push();
            }
        }

        return payload;
    }

    CoalescingEngine::ElementNode* CoalescingEngine::ElementNode::getLeader()
    {
        ElementNode* N = this;
        ElementNode* Parent = parent.getPointer();
        ElementNode* Grandparent = Parent->parent.getPointer();

        while (Parent != Grandparent)
        {
            N->parent.setPointer(Grandparent);
            N = Grandparent;
            Parent = Parent->parent.getPointer();
            Grandparent = Parent->parent.getPointer();
        }

        return Parent;
    }

    Value* CoalescingEngine::getRegRoot(Value* Val) const
    {
        auto RI = ValueNodeMap.find(Val);
        if (RI == ValueNodeMap.end())
            return nullptr;
        ElementNode* Node = RI->second;
        if (Node->parent.getInt() & ElementNode::kRegisterIsolatedFlag)
            return 0x0;
        return Node->getLeader()->value;
    }

    Value* CoalescingEngine::getPHIRoot(Instruction* PHI) const
    {
        IGC_ASSERT(dyn_cast<PHINode>(PHI));
        auto RI = ValueNodeMap.find(PHI);
        IGC_ASSERT(RI != ValueNodeMap.end());
        ElementNode* DestNode = RI->second;
        if (DestNode->parent.getInt() & ElementNode::kPHIIsolatedFlag)
            return 0x0;
        for (unsigned i = 0; i < PHI->getNumOperands(); i++)
        {
            Value* SrcVal = PHI->getOperand(i);
            if (isa<Instruction>(SrcVal))
            {
                // skip constant sources
                Value* SrcRoot = getRegRoot(SrcVal);
                if (SrcRoot)
                    return SrcRoot;
            }
        }
        return 0x0;
    }

    void CoalescingEngine::unionRegs(Value* Val1, Value* Val2)
    {
        ElementNode* Node1 = ValueNodeMap[Val1]->getLeader();
        ElementNode* Node2 = ValueNodeMap[Val2]->getLeader();

        if (Node1->rank > Node2->rank)
        {
            Node2->parent.setPointer(Node1->getLeader());
        }
        else if (Node1->rank < Node2->rank)
        {
            Node1->parent.setPointer(Node2->getLeader());
        }
        else if (Node1 != Node2)
        {
            Node2->parent.setPointer(Node1->getLeader());
            Node1->rank++;
        }
    }
    // For now, return true if V is dessa-aliased/InsEltMap-ed/phi-coalesced.
    bool CoalescingEngine::isCoalescedByDeSSA(Value* V) const
    {
        if (m_DeSSA && m_DeSSA->getRootValue(V))
            return true;
        return false;
    }

    void CoalescingEngine::ProcessBlock(llvm::BasicBlock* bb)
    {
        llvm::BasicBlock::InstListType& instructionList = bb->getInstList();
        llvm::BasicBlock::InstListType::iterator I, E;

        //Loop through the instructions top to bottom.
        for (I = instructionList.begin(), E = instructionList.end(); I != E; ++I)
        {
            llvm::Instruction& inst = (*I);
            visit(inst);
        }
    }

    bool CoalescingEngine::MatchSingleInstruction(llvm::GenIntrinsicInst* inst)
    {
        GenISAIntrinsic::ID IID = inst->getIntrinsicID();
        if (isSampleInstruction(inst) ||
            isLdInstruction(inst) ||
            isURBWriteIntrinsic(inst) ||
            IID == GenISAIntrinsic::GenISA_RTWrite)
        {
            uint numOperands = inst->getNumOperands();
            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = inst->getOperand(i);
                if (llvm::isa<Constant>(val))
                {
                    continue;
                }

                if (!ValueNodeMap.count(val))
                {
                    Instruction* DefMI = dyn_cast<Instruction>(val);
                    if (DefMI)
                    {
                        BBProcessingDefs[DefMI->getParent()].push_back(DefMI);
                    }
                    ValueNodeMap[val] = new (Allocator) ElementNode(val);
                }
            }

            //Add the instruction itself to the processing stack as well.
            BBProcessingDefs[inst->getParent()].push_back(inst);
        }

        return true;
    }

    void CoalescingEngine::visitCallInst(llvm::CallInst& I)
    {
        //we do not care if we do not match
        //bool match = true;

        if (GenIntrinsicInst* CI = llvm::dyn_cast<GenIntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case GenISAIntrinsic::GenISA_ROUNDNE:
            case GenISAIntrinsic::GenISA_imulH:
            case GenISAIntrinsic::GenISA_umulH:
            case GenISAIntrinsic::GenISA_uaddc:
            case GenISAIntrinsic::GenISA_usubb:
            case GenISAIntrinsic::GenISA_bfrev:
                break;
            case GenISAIntrinsic::GenISA_intatomicraw:
            case GenISAIntrinsic::GenISA_floatatomicraw:
            case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_dwordatomicstructured:
            case GenISAIntrinsic::GenISA_floatatomicstructured:
            case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_intatomictyped:
            case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
            case GenISAIntrinsic::GenISA_ldstructured:
            case GenISAIntrinsic::GenISA_atomiccounterinc:
            case GenISAIntrinsic::GenISA_atomiccounterpredec:
                break;
            case GenISAIntrinsic::GenISA_GradientX:
            case GenISAIntrinsic::GenISA_GradientY:
            case GenISAIntrinsic::GenISA_GradientXfine:
            case GenISAIntrinsic::GenISA_GradientYfine:
                break;
            default:
                MatchSingleInstruction(CI); // no pattern for the rest of the intrinsics
                break;
            }
        }
        else if (IntrinsicInst* CI = llvm::dyn_cast<IntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case Intrinsic::sqrt:
            case Intrinsic::log2:
            case Intrinsic::cos:
            case Intrinsic::sin:
            case Intrinsic::pow:
            case Intrinsic::floor:
            case Intrinsic::ceil:
            case Intrinsic::trunc:
            case Intrinsic::maxnum:
            case Intrinsic::minnum:
                break;
            case Intrinsic::exp2:
                break;
            default:
                break;
            }
        }
    }

    void CoalescingEngine::visitCastInst(llvm::CastInst& I)
    {
    }

    void CoalescingEngine::visitBinaryOperator(llvm::BinaryOperator& I)
    {
    }

    void CoalescingEngine::visitCmpInst(llvm::CmpInst& I)
    {
    }

    void CoalescingEngine::visitPHINode(llvm::PHINode& I)
    {
    }

    void CoalescingEngine::visitUnaryInstruction(llvm::UnaryInstruction& I)
    {
    }

    void CoalescingEngine::visitSelectInst(llvm::SelectInst& I)
    {
    }

    void CoalescingEngine::visitBitCastInst(llvm::BitCastInst& I)
    {
    }

    void CoalescingEngine::visitInstruction(llvm::Instruction& I)
    {
    }

    void CoalescingEngine::visitLoadInst(LoadInst& I)
    {
    }
    void CoalescingEngine::visitStoreInst(StoreInst& I)
    {
    }

} //namespace IGC