1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
#include "MCSOptimization.hpp"
#include "IGCPassSupport.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/WorkaroundAnalysisPass.h"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include <set>
#include "common/LLVMWarningsPush.hpp"
#include "llvm/IR/Function.h"
#include <llvm/ADT/SetVector.h>
#include <llvm/IR/InstVisitor.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Support/Casting.h>
#include "common/LLVMWarningsPop.hpp"
#include "common/IGCIRBuilder.h"
#include "common/igc_regkeys.hpp"
#include "Probe/Assertion.h"
26 
27 using namespace llvm;
28 using namespace IGC;
/************************************************************************
This transformation is not safe in general. It can be applied only in these cases:
- We know that the resource is MCS compressed
- We know that we don't access an out-of-bounds sample index
************************************************************************/
34 class MCSOptimization : public FunctionPass, public InstVisitor<MCSOptimization>
35 {
36 public:
MCSOptimization()37     MCSOptimization() : FunctionPass(ID) {}
38     bool runOnFunction(Function& F);
39     void visitCallInst(llvm::CallInst& I);
getAnalysisUsage(llvm::AnalysisUsage & AU) const40     void getAnalysisUsage(llvm::AnalysisUsage& AU) const
41     {
42         AU.addRequired<CodeGenContextWrapper>();
43     }
getPassName() const44     virtual llvm::StringRef getPassName() const
45     {
46         return "MCSOptimization";
47     }
48 
49     static char ID;
50     bool m_changed;
51 
52 private:
shaderSamplesCompressedSurfaces(CodeGenContext * ctx)53     bool shaderSamplesCompressedSurfaces(CodeGenContext* ctx)
54     {
55         ModuleMetaData* modMD = ctx->getModuleMetaData();
56         for (unsigned int i = 0; i < NUM_SHADER_RESOURCE_VIEW_SIZE; i++)
57         {
58             if (modMD->m_ShaderResourceViewMcsMask[i] != 0)
59             {
60                 return true;
61             }
62         }
63         return false;
64     }
65 protected:
66 };
67 
68 char MCSOptimization::ID = 0;
69 
runOnFunction(Function & F)70 bool MCSOptimization::runOnFunction(Function& F)
71 {
72 
73     if (IGC_IS_FLAG_ENABLED(DisableMCSOpt))
74     {
75         return false;
76     }
77     m_changed = false;
78     visit(F);
79     return m_changed;
80 }
81 
visitCallInst(llvm::CallInst & I)82 void MCSOptimization::visitCallInst(llvm::CallInst& I)
83 {
84     Function* F = I.getParent()->getParent();
85     IGCIRBuilder<> IRB(F->getContext());
86 
87     if (LdmcsInstrinsic * ldMcs = dyn_cast<LdmcsInstrinsic>(&I))
88     {
89         CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
90 
91         {
92             if (!shaderSamplesCompressedSurfaces(ctx))
93             {
94                 return;
95             }
96 
97             llvm::Value* textureArgValue = ldMcs->getTextureValue();
98             uint textureIndex;
99             if (textureArgValue->getType()->isPointerTy())
100             {
101                 uint addrSpace = textureArgValue->getType()->getPointerAddressSpace();
102                 uint bufferIndex = 0;
103                 bool directIdx = false;
104                 DecodeAS4GFXResource(addrSpace, directIdx, bufferIndex);
105                 textureIndex = bufferIndex;
106             }
107             else
108             {
109                 textureIndex = int_cast<uint>(GetImmediateVal(textureArgValue));
110             }
111 
112             const unsigned int shaderResourceViewMcsMaskIndex = textureIndex / BITS_PER_QWORD;
113             const unsigned long long resourceViewMcsMaskElement = ctx->getModuleMetaData()->m_ShaderResourceViewMcsMask[shaderResourceViewMcsMaskIndex];
114             const unsigned int resourceViewMaskTextureBit = textureIndex % BITS_PER_QWORD;
115             IGC_ASSERT_MESSAGE(textureIndex <= 127, "Texture index is incorrectly extracted from ld_mcs");
116 
117             unsigned long long resultBit = resourceViewMcsMaskElement >> resourceViewMaskTextureBit;
118             if ((resultBit & 1) == 0)
119             {
120                 return;
121             }
122         }
123         ExtractElementInst* EEI = nullptr;
124         for (auto useItr : ldMcs->users())
125         {
126             if (ExtractElementInst * ee1 = dyn_cast<ExtractElementInst>(useItr))
127             {
128                 if (ConstantInt * channel = dyn_cast<ConstantInt>(ee1->getOperand(1)))
129                 {
130                     if (channel->isZero())
131                     {
132                         EEI = ee1;
133                         break;
134                     }
135                 }
136             }
137         }
138 
139         if (EEI != nullptr)
140         {
141             if (EEI->hasOneUse())
142                 return; //only one use of EEI -- noOptimization
143 
144             LdmsInstrinsic* firstUse = nullptr;
145 
146             for (auto it = EEI->getIterator(); it != EEI->getParent()->end(); ++it)
147             {
148                 if (LdmsInstrinsic * ldmsIntr = dyn_cast<LdmsInstrinsic>(&*it))
149                 {
150                     if (ldmsIntr->getOperand(1) == dyn_cast<Value>(EEI))
151                     {
152                         //first use and in the def's BB
153                         firstUse = ldmsIntr;
154                         break;
155                     }
156                 }
157             }
158 
159             if (!firstUse)
160                 return;
161 
162             //collect all blocks where this EEI insts is getting used
163             std::set<BasicBlock*> useBlocks;
164             for (auto BitcastUses = EEI->user_begin(); BitcastUses != EEI->user_end(); BitcastUses++)
165             {
166                 Instruction* ldmsInst = dyn_cast<Instruction>(*BitcastUses);
167                 if (ldmsInst)
168                 {
169                     if (dyn_cast<ConstantInt>(ldmsInst->getOperand(0)))
170                     {
171                         useBlocks.insert(ldmsInst->getParent());
172                     }
173                     else
174                     {
175                         return;
176                     }
177                 }
178             }
179 
180             //iterate over useBlocks.
181             //For each useBlock, collect all the ldms insts present within the use block corresponding to this EEI
182             for (auto BB : useBlocks)
183             {
184                 std::vector<LdmsInstrinsic*> ldmsInstsToMove;
185                 for (auto inst = BB->begin(); inst != BB->end(); inst++)
186                 {
187                     if (LdmsInstrinsic * ldmsIntr = dyn_cast<LdmsInstrinsic>(inst))
188                     {
189                         if (ldmsIntr->getOperand(1) == dyn_cast<Value>(EEI))
190                         {
191                             if (ldmsIntr == firstUse)
192                                 continue; //don't move the first use into the then block , need it for phi Node
193                             ldmsInstsToMove.push_back(ldmsIntr);
194                         }
195                     }
196                 }
197 
198                 //this is added because clubbing all ld2dms into a single then block
199                 //increases register pressure and causes spilling
200                 int instClubThreshold = IGC_GET_FLAG_VALUE(ld2dmsInstsClubbingThreshold); //# ld2dms insts that can be moved into the then block
201                 //int instClubThreshold = 2;
202                 bool allInstsWillBeMoved = false;
203 
204                 while (!allInstsWillBeMoved)
205                 {
206                     std::vector<LdmsInstrinsic*> ldmsInstsToClub;
207                     //Threshold is more than # of insts that are to be moved. So move all.
208                     if (instClubThreshold >= static_cast<int>(ldmsInstsToMove.size()))
209                     {
210                         ldmsInstsToClub = ldmsInstsToMove;
211                         allInstsWillBeMoved = true;
212                     }
213                     else
214                     {
215                         //pick the first 0-threshold # of insts and move them only
216                         for (int i = 0; i < instClubThreshold; i++)
217                         {
218                             ldmsInstsToClub.push_back(ldmsInstsToMove[i]);
219                         }
220                         ldmsInstsToMove.erase(ldmsInstsToMove.begin(), ldmsInstsToMove.begin() + instClubThreshold);
221                     }
222 
223                     //split the block into a new then block
224                     BasicBlock* ldmsUseBB = nullptr; //second entry to the phi node
225                     BasicBlock* thenBlock = nullptr;
226                     IGCLLVM::TerminatorInst* thenBlockTerminator = nullptr;
227                     if (ldmsInstsToClub.size() != 0)
228                     {
229                         LdmsInstrinsic* ldmsUse = ldmsInstsToClub[0];
230                         ldmsUseBB = ldmsUse->getParent();
231                         IRB.SetInsertPoint(ldmsUse);
232                         Value* ValueisMCSNotZero = nullptr;
233                         for (unsigned int i = 0; i < ldmsUse->getNumMcsOperands(); i++)
234                         {
235                             Value* mcs = firstUse->getMcsOperand(i);
236                             Value* cnd1 = IRB.CreateICmpNE(mcs, ConstantInt::get(mcs->getType(), 0));
237                             if (ValueisMCSNotZero == nullptr)
238                             {
239                                 ValueisMCSNotZero = cnd1;
240                             }
241                             else
242                             {
243                                 ValueisMCSNotZero = IRB.CreateOr(ValueisMCSNotZero, cnd1);
244                             }
245                         }
246                         thenBlockTerminator = SplitBlockAndInsertIfThen(ValueisMCSNotZero, ldmsUse, false);
247                         thenBlock = thenBlockTerminator->getParent();
248                     }
249 
250                     //Move the collected ldms insts into the then block and insert their phi nodes in the successor of the then block
251                     if (thenBlockTerminator)
252                     {
253                         for (auto instToMove : ldmsInstsToClub)
254                         {
255                             instToMove->moveBefore(thenBlockTerminator);
256                             IRB.SetInsertPoint(&*(thenBlockTerminator->getSuccessor(0)->begin()));
257                             PHINode* PN = IRB.CreatePHI(instToMove->getType(), 2);
258                             instToMove->replaceAllUsesWith(PN);
259                             PN->addIncoming(instToMove, thenBlock);
260                             PN->addIncoming(firstUse, ldmsUseBB);
261                             m_changed = true;
262                         }
263                     }
264 
265                 }
266             }
267             m_changed = true;
268         }
269     }
270 }
271 
272 namespace IGC {
273 #define PASS_FLAG "optimize ld2ms message assuming resources are always compressed"
274 #define PASS_DESCRIPTION "This is an optimization pass for ld2dms message "
275 #define PASS_CFG_ONLY false
276 #define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(MCSOptimization,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)277     IGC_INITIALIZE_PASS_BEGIN(MCSOptimization, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
278         IGC_INITIALIZE_PASS_END(MCSOptimization, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
279 
280         FunctionPass* CreateMCSOptimization()
281     {
282         return new MCSOptimization();
283     }
284 }
285