/*========================== begin_copyright_notice ============================

Copyright (C) 2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "SplitAlignedScalars.h"
#include "G4_Opcode.h"

using namespace vISA;

bool SplitAlignedScalars::canReplaceDst(G4_INST* inst)
{
    // acc has special rules, so skip optimizing instructions using acc
    if (inst->useAcc())
        return false;

    // whitelist of supported instructions
    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        inst->isPseudoKill())
    {
        return true;
    }

    return false;
}

bool SplitAlignedScalars::canReplaceSrc(G4_INST* inst, unsigned int idx)
{
    auto opcode = inst->opcode();

    if (inst->useAcc())
        return false;

    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        opcode == G4_mad ||
        inst->isPseudoUse())
    {
        return true;
    }

    return false;
}

std::vector<G4_Declare*> SplitAlignedScalars::gatherCandidates()
{
    std::vector<G4_Declare*> candidates;

    unsigned int lexId = 0;
    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            inst->setLexicalId(lexId++);

            auto dst = inst->getDst();
            if (dst && dst->isDstRegRegion())
            {
                auto dstTopDcl = dst->getTopDcl();
                if (dstTopDcl && isDclCandidate(dstTopDcl))
                {
                    auto& Data = dclData[dstTopDcl];
                    Data.defs.push_back(std::pair(bb, inst));
                    if (Data.firstDef == 0)
                        Data.firstDef = lexId;

                    // disallow cases where the scalar def has a predicate
                    if (inst->getPredicate())
                    {
                        Data.allowed = false;
                    }

                    // Disallow splitting any variable used as a send dst,
                    // because adding a copy after the send causes a
                    // back-to-back dependency. If the scheduler cannot hide
                    // it, it can lead to a performance penalty.
                    if (inst->isSend())
                    {
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() != dstTopDcl->getByteSize())
                    {
                        // we require that the dst opnd writes the complete topdcl
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() < 4)
                    {
                        // bytes, words, and half floats may have many HW
                        // restrictions and are usually not set as GRF aligned
                        Data.allowed = false;
                    }

                    auto dstDcl = dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
                    if (dstDcl->getAliasDeclare())
                    {
                        // disallow the case where topdcl is scalar but the
                        // alias dcl is not, as it may be smaller in size;
                        // e.g., topdcl may be :uq and the alias of type :ud.
                        if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
                            Data.allowed = false;
                    }

                    if (dst->getHorzStride() != 1)
                    {
                        // dst hstride != 1 to accommodate some alignment restriction
                        Data.allowed = false;
                    }

                    if (!inst->isSend())
                    {
                        // Check whether dst type size != src type size.
                        // Disallow the optimization if the dst type differs
                        // from the src type, as that entails alignment
                        // requirements this pass may not honor.
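                        // For example (illustrative; V10/V11 are hypothetical):
                        //   mov (1) V10(0,0)<1>:uq V11(0,0)<0;1,0>:ud
                        // mixes 8-byte and 4-byte operands, so V10 would not
                        // be split.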
                        for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
                        {
                            auto src = inst->getSrc(i);
                            if (!src->isSrcRegRegion())
                                continue;
                            if (dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (src && src->isSrcRegRegion())
                {
                    auto srcTopDcl = src->getTopDcl();
                    if (srcTopDcl && isDclCandidate(srcTopDcl))
                    {
                        auto& Data = dclData[srcTopDcl];
                        Data.uses.push_back(std::make_tuple(bb, inst, i));
                        if (Data.lastUse < lexId)
                            Data.lastUse = lexId;

                        if (!inst->isSend() && dst)
                        {
                            // mixed types have alignment requirements
                            if (dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }
        }
    }

    for (auto dcl : kernel.Declares)
    {
        if (dcl != dcl->getRootDeclare())
            continue;
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            if ((*dclDataIt).second.allowed)
                candidates.push_back(dcl);
        }
    }

    return candidates;
}

// Return the number of movs needed when applying the optimization on dcl.
// This method estimates dynamic movs by considering loops; each loop nest
// level is expected to run a constant number of iterations.
unsigned int SplitAlignedScalars::computeNumMovs(G4_Declare* dcl)
{
    unsigned int numMovsNeeded = 0;
    auto& Data = dclData[dcl];
    for (auto def : Data.defs)
    {
        if (!canReplaceDst(def.second))
        {
            auto bb = def.first;
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }
    for (auto use : Data.uses)
    {
        if (!canReplaceSrc(std::get<1>(use), std::get<2>(use)))
        {
            auto bb = std::get<0>(use);
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }
    return numMovsNeeded;
}

void SplitAlignedScalars::pruneCandidates(std::vector<G4_Declare*>& candidates)
{
    // This method is a cost function that decides which aligned scalar
    // variables from the candidates list get split. It returns the list of
    // final candidates in the passed-by-ref argument.
    // We first sort candidates in descending order of spill cost.
    // Next, we determine the number of movs needed if a given candidate is
    // split. This is an estimate that takes into account loop nesting levels.
    // If the accumulated mov count stays below a threshold then splitting is
    // allowed. Note that we always split a candidate if it was spilled by the
    // current RA iteration or if the variable was marked as callee saved (in
    // case it crossed an fcall boundary).
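    // Illustrative arithmetic (numbers are hypothetical): if estimateInstCount()
    // below returns 10000 weighted instructions and BloatAllowed is 1%, then
    // candidates are accepted until totalMovsNeeded reaches 100; spilled and
    // callee-save-biased candidates bypass this budget entirely.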
    auto compareSpillCost = [&](G4_Declare* dcl1, G4_Declare* dcl2)
    {
        auto it1 = dclSpillCost.find(dcl1);
        auto it2 = dclSpillCost.find(dcl2);
        float cost1 = 0, cost2 = 0;
        if (it1 != dclSpillCost.end())
            cost1 = (*it1).second;
        if (it2 != dclSpillCost.end())
            cost2 = (*it2).second;

        return cost1 < cost2;
    };

    std::list<G4_Declare*> candidateList;
    std::for_each(candidates.begin(), candidates.end(),
        [&](G4_Declare* dcl) { candidateList.push_back(dcl); });

    // First re-order candidates based on spill cost in descending order
    candidateList.sort(compareSpillCost);
    std::reverse(candidateList.begin(), candidateList.end());

    for (auto it = candidateList.begin(); it != candidateList.end();)
    {
        auto dcl = *it;
        auto dclDataIt = dclData.find(dcl);
        bool erase = true;
        if (dclDataIt != dclData.end())
        {
            auto& Data = dclData[dcl];
            if (heuristic(*it, Data))
                erase = false;
        }
        if (erase)
        {
            it = candidateList.erase(it);
            continue;
        }
        ++it;
    }

    auto estimateInstCount = [&]()
    {
        auto instCount = 0;
        for (auto bb : kernel.fg.getBBList())
        {
            auto loop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (loop)
                instCount += (loop->getNestingLevel() * EstimatedLoopTripCount) * bb->size();
            else
                instCount += bb->size();
        }
        return instCount;
    };

    candidates.clear();
    unsigned int totalMovsNeeded = 0;
    unsigned int estimatedInstCount = estimateInstCount();
    for (auto candidate : candidateList)
    {
        bool isCandidate = false;
        auto numMovsNeeded = computeNumMovs(candidate);

        // Allow any candidate that was marked as spilled, that needs no movs,
        // or that was marked as requiring callee-save allocation.
        if (spilledDclSet.find(candidate) != spilledDclSet.end() ||
            calleeSaveBiased.find(candidate) != calleeSaveBiased.end() ||
            numMovsNeeded == 0)
            isCandidate = true;
        else
        {
            // Mark as a candidate for splitting only if doing so doesn't
            // increase the mov count above a set threshold.
            if ((totalMovsNeeded + numMovsNeeded) < (unsigned int)((float)estimatedInstCount * BloatAllowed))
                isCandidate = true;
        }

        if (isCandidate)
        {
            candidates.push_back(candidate);
            totalMovsNeeded += numMovsNeeded;
        }
    }
}

bool SplitAlignedScalars::isDclCandidate(G4_Declare* dcl)
{
    if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
        dcl->getNumElems() == 1 &&
        !dcl->getAddressed() &&
        !dcl->getIsPartialDcl() &&
        !dcl->getRegVar()->isPhyRegAssigned() &&
        !dcl->getAliasDeclare() &&
        !dcl->isInput() &&
        !dcl->isOutput() &&
        !dcl->isPayloadLiveOut() &&
        !dcl->isDoNotSpill() &&
        gra.getSubRegAlign(dcl) == GRFALIGN)
        return true;
    return false;
}

bool SplitAlignedScalars::heuristic(G4_Declare* dcl, Data& d)
{
    if (d.getDUMaxDist() < MinOptDist)
        return false;

    if (!d.allowed)
    {
        return false;
    }

    return true;
}

// T is either G4_SrcRegRegion or G4_DstRegRegion
template<class T>
G4_Declare* SplitAlignedScalars::getDclForRgn(T* rgn, G4_Declare* newTopDcl)
{
    // newTopDcl is the replacement dcl computed by this pass to replace rgn.
    // If rgn's dcl is not aliased then return newTopDcl, as the new rgn can
    // use it directly.
    // If rgn's dcl is aliased, create a new alias dcl based off newTopDcl
    // with the correct offset and return it.
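    // Illustrative case (hypothetical dcls): if rgn is based on alias dcl A
    // (:ud at byte offset 4 of its topdcl), the function returns a fresh temp
    // dcl aliased to newTopDcl at that same offset, so rgn's reg/subreg
    // offsets remain valid after substitution.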
    auto rgnDcl = rgn->getBase()->asRegVar()->getDeclare();

    if (rgn->getTopDcl() == rgnDcl)
        return newTopDcl;

    auto newAliasDcl = kernel.fg.builder->createTempVar(rgnDcl->getNumElems(),
        rgnDcl->getElemType(), rgnDcl->getSubRegAlign());
    newAliasDcl->setAliasDeclare(newTopDcl, rgnDcl->getOffsetFromBase());

    return newAliasDcl;
}

void SplitAlignedScalars::run()
{
    auto getNewDcl = [&](G4_Declare* oldDcl)
    {
        auto newTopDcl = oldNewDcls[oldDcl];
        if (newTopDcl == nullptr)
        {
            newTopDcl = kernel.fg.builder->createTempVar(oldDcl->getNumElems(),
                oldDcl->getElemType(), G4_SubReg_Align::Any);
            oldNewDcls[oldDcl] = newTopDcl;
        }
        return newTopDcl;
    };

    auto getTypeExecSize = [&](G4_Type type)
    {
        if (TypeSize(type) < 8)
            return std::make_tuple(1, type);

        if (kernel.fg.builder->noInt64())
            return std::make_tuple(2, Type_UD);

        return std::make_tuple(1, Type_UQ);
    };

    auto candidates = gatherCandidates();

    pruneCandidates(candidates);

    for (auto dcl : candidates)
    {
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            oldNewDcls[dcl] = nullptr;
            numDclsReplaced++;
        }
    }

    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin(), instItEnd = bb->end(); instIt != instItEnd;)
        {
            auto newInstIt = instIt;
            auto inst = (*instIt);

            auto dst = inst->getDst();
            if (dst &&
                oldNewDcls.find(dst->getTopDcl()) != oldNewDcls.end())
            {
                auto oldTopDcl = dst->getTopDcl();
                auto newTopDcl = getNewDcl(oldTopDcl);
                auto newDcl = getDclForRgn(dst, newTopDcl);

                if (canReplaceDst(inst))
                {
                    auto newDstRgn = kernel.fg.builder->createDst(newDcl->getRegVar(),
                        dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
                    inst->setDest(newDstRgn);
                }
                else
                {
                    // found an instruction where dst has to be GRF aligned,
                    // so we keep the old dst but insert a copy into the new dcl
                    auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(),
                        oldTopDcl->getElemType(), oldTopDcl->getSubRegAlign());
                    newAlignedTmpTopDcl->copyAlign(oldTopDcl);
                    auto newAlignedVar = getDclForRgn(dst, newAlignedTmpTopDcl);
                    auto dstRgn = kernel.fg.builder->createDst(newAlignedVar->getRegVar(),
                        dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
                    inst->setDest(dstRgn);

                    // emit a copy to store data to the original non-aligned scalar
                    unsigned int execSize = 1;
                    G4_Type typeToUse = Type_UD;
                    std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());

                    auto src = kernel.fg.builder->createSrc(dstRgn->getBase(),
                        dstRgn->getRegOff(), dstRgn->getSubRegOff(),
                        execSize == g4::SIMD1 ?
                            kernel.fg.builder->getRegionScalar() :
                            kernel.fg.builder->getRegionStride1(),
                        typeToUse);
                    auto dstRgnOfCopy = kernel.fg.builder->createDst(newDcl->getRegVar(),
                        dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), typeToUse);

                    G4_Predicate* dupPred = nullptr;
                    if (inst->getPredicate())
                        dupPred = kernel.fg.builder->createPredicate(*inst->getPredicate());

                    auto newInst = kernel.fg.builder->createInternalInst(dupPred, G4_mov, nullptr,
                        g4::NOSAT, G4_ExecSize(execSize), dstRgnOfCopy, src, nullptr, InstOpt_WriteEnable);
                    newInstIt = bb->insertAfter(instIt, newInst);

                    gra.addNoRemat(newInst);

                    numMovsAdded++;
                }
                changesMade = true;
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (!src || !src->isSrcRegRegion())
                    continue;

                auto srcRgn = src->asSrcRegRegion();
                if (oldNewDcls.find(srcRgn->getTopDcl()) != oldNewDcls.end())
                {
                    auto oldTopDcl = srcRgn->getTopDcl();
                    auto newTopDcl = getNewDcl(oldTopDcl);
                    auto newDcl = getDclForRgn(srcRgn, newTopDcl);

                    if (canReplaceSrc(inst, i))
                    {
                        auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(),
                            srcRgn->getRegAccess(), newDcl->getRegVar(),
                            srcRgn->getRegOff(), srcRgn->getSubRegOff(),
                            srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newSrcRgn, i);
                    }
                    else
                    {
                        // create a new aligned tmp
                        auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(),
                            oldTopDcl->getElemType(), oldTopDcl->getSubRegAlign());
                        newAlignedTmpTopDcl->copyAlign(oldTopDcl);

                        unsigned int execSize = 1;
                        G4_Type typeToUse = Type_UD;
                        std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());

                        // copy oldDcl into newAlignedTmpTopDcl
                        auto tmpDst = kernel.fg.builder->createDst(newAlignedTmpTopDcl->getRegVar(), typeToUse);
                        auto src = kernel.fg.builder->createSrc(newTopDcl->getRegVar(), 0, 0,
                            execSize == 1 ? kernel.fg.builder->getRegionScalar() :
                                kernel.fg.builder->getRegionStride1(),
                            typeToUse);
                        auto copy = kernel.fg.builder->createMov(G4_ExecSize(execSize), tmpDst, src,
                            InstOpt_WriteEnable, false);

                        // now create the src out of tmpDst
                        auto dclToUse = getDclForRgn(srcRgn, newAlignedTmpTopDcl);

                        auto newAlignedSrc = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(),
                            srcRgn->getRegAccess(), dclToUse->getRegVar(),
                            srcRgn->getRegOff(), srcRgn->getSubRegOff(),
                            srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newAlignedSrc, i);
                        bb->insertBefore(instIt, copy);

                        // this copy shouldn't be rematerialized
                        gra.addNoRemat(copy);

                        numMovsAdded++;
                    }
                    changesMade = true;
                }
            }

            instIt = ++newInstIt;
        }
    }
}

void SplitAlignedScalars::dump(std::ostream& os)
{
    os << "# GRF aligned scalar dcls replaced: " << numDclsReplaced << std::endl;
    os << "# movs added: " << numMovsAdded << std::endl;
}