1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "PullConstantHeuristics.hpp"
10 
11 #include "Platform.hpp"
12 
13 using namespace llvm;
14 using namespace IGC;
15 
16 
// Per-instruction cycle weights consumed by estimateShaderLifetime().
// One weight per instruction class counted by the heuristic.
static constexpr unsigned EUInstCycleCount = 2;        // cost charged per estimated EU instruction
static constexpr unsigned SendInstCycleCount = 190;    // cost charged per send (sample-type) message
static constexpr unsigned RTWriteInstCycleCount = 190; // cost charged per render-target write
20 
21 char PullConstantHeuristics::ID = 0;
22 
23 #define PASS_FLAG "Analyse shader to determine push const threshold"
24 #define PASS_DESCRIPTION "This is an analysis pass for pulling constants for short shaders "
25 #define PASS_CFG_ONLY true
26 #define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PullConstantHeuristics,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)27 IGC_INITIALIZE_PASS_BEGIN(PullConstantHeuristics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
28 IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
29 IGC_INITIALIZE_PASS_END(PullConstantHeuristics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
30 
31 
32 //needs to be fine-tuned after obtaining feedback from visa
33 static unsigned estimateShaderLifetime(unsigned int EUCnt, unsigned SendMsgCnt, unsigned RTWriteCnt)
34 {
35     return EUCnt * EUInstCycleCount + SendMsgCnt * SendInstCycleCount + RTWriteCnt * RTWriteInstCycleCount;
36 }
37 
isSendMessage(GenIntrinsicInst * inst)38 static bool isSendMessage(GenIntrinsicInst* inst)
39 {
40     if (isSampleInstruction(inst) || isSampleLoadGather4InfoInstruction(inst))
41     {
42         return true;
43     }
44     return false;
45 }
46 
47 //approximating EU insts count - TODO: need a better way to do this
getEUInstEstimate(Instruction * inst)48 static unsigned getEUInstEstimate(Instruction* inst)
49 {
50     //handle ALU, Logical, and load-store insts
51     //Presently we're restricting to shaders with 1 BB only, i.e, short shaders
52     //But this handles branch insts as well (in case of multiple BB support in future)
53     if (inst->getOpcode() <= Instruction::Fence)
54     {
55         return 1;
56     }
57 
58     //handling remaining inst types
59     switch (inst->getOpcode())
60     {
61     case Instruction::FCmp:
62     case Instruction::ICmp:
63     case Instruction::Select:
64     case Instruction::Ret:
65         return 1;
66     default:
67         //bitcast insts don't make an EU inst in visa
68         return 0;
69     }
70 }
71 
72 //estimate EU, SendMsg and RTWrite insts required by the PS
getInstStats(const Function & F)73 static std::tuple<unsigned, unsigned, unsigned> getInstStats(const Function& F) {
74     unsigned EUInstCnt = 0;
75     unsigned int SendMsgInstCnt = 0;
76     unsigned int RTWriteInstCnt = 0;
77     for (auto BBI = F.getBasicBlockList().begin(); BBI != F.getBasicBlockList().end(); BBI++)
78     {
79         llvm::BasicBlock* BB = const_cast<llvm::BasicBlock*>(&*BBI);
80         for (auto II = BB->begin(); II != BB->end(); II++)
81         {
82             if (llvm::GenIntrinsicInst * pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(II))
83             {
84                 if (isSendMessage(pIntrinsic))
85                     SendMsgInstCnt++;
86                 else if (pIntrinsic->getIntrinsicID() == GenISAIntrinsic::GenISA_RTWrite)
87                     RTWriteInstCnt++;
88             }
89             else
90             {
91                 EUInstCnt += getEUInstEstimate(&*II);
92             }
93         }
94     }
95     return std::make_tuple(EUInstCnt, SendMsgInstCnt, RTWriteInstCnt);
96 }
97 
98 //Pixel Shader Dispatch can be bottleneck if
99 //    thread_payload_size > max(simd4_sample_instr, simd4_eu_instr / 16, simd4_rt_write * 2, shader_lifetime / 56)
getPSDBottleNeckThreshold(const Function & F)100 unsigned int PullConstantHeuristics::getPSDBottleNeckThreshold(const Function& F)
101 {
102     CodeGenContext* ctx = nullptr;
103     ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
104     const unsigned numThreadsPerSubslice = ctx->platform.getMaxNumberThreadPerSubslice(); //read from ctx.platform
105     const unsigned roofLineIPC = 16;
106     const unsigned pixelRate = 2;
107 
108     unsigned EstimatedEUInstCnt = 0;
109     unsigned SendMsgInstCnt = 0;
110     unsigned RTWriteInstCnt = 0;
111     std::tie(EstimatedEUInstCnt, SendMsgInstCnt, RTWriteInstCnt) = getInstStats(F);
112 
113     unsigned shaderLifetime = estimateShaderLifetime(EstimatedEUInstCnt, SendMsgInstCnt, RTWriteInstCnt);
114 
115     //calculate payload size threshold assuming SIMD16 to stop pushing constants
116     unsigned SIMD4_EU_Cnt = EstimatedEUInstCnt * 4;
117     unsigned SIMD4_Sample_Cnt = SendMsgInstCnt * 4;
118     unsigned SIMD4_RT_Write_Cnt = RTWriteInstCnt * 4;
119     unsigned payloadThreshold = std::max(std::max(SIMD4_Sample_Cnt, SIMD4_EU_Cnt / roofLineIPC),
120         std::max(SIMD4_RT_Write_Cnt * pixelRate, shaderLifetime / numThreadsPerSubslice));
121     return payloadThreshold;
122 }
123 
124 //currentPayloadSize = payloadHeaderSIMD16 + payloadBarySIMD16 + inputSize;
125 //
getCurrentPayloadSizeEstimate(const Function & F)126 static unsigned getCurrentPayloadSizeEstimate(const Function& F)
127 {
128     unsigned payloadHeaderSIMD16 = 3;
129     unsigned payloadBarySIMD16 = 4;
130     unsigned inputGRFSize = 1;
131 
132     //helper variables
133     unsigned maxValueFromInputVec = 0;
134     std::set<unsigned> countOfDifferentBary;
135     for (auto BBI = F.getBasicBlockList().begin(); BBI != F.getBasicBlockList().end(); BBI++)
136     {
137         llvm::BasicBlock* BB = const_cast<llvm::BasicBlock*>(&*BBI);
138         for (auto II = BB->begin(); II != BB->end(); II++)
139         {
140             if (llvm::GenIntrinsicInst * pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(II))
141             {
142                 if (pIntrinsic->getIntrinsicID() == GenISAIntrinsic::GenISA_DCL_inputVec)
143                 {
144                     countOfDifferentBary.insert((unsigned)llvm::cast<llvm::ConstantInt>(II->getOperand(1))->getZExtValue());
145                     maxValueFromInputVec = std::max(maxValueFromInputVec, (unsigned)llvm::cast<llvm::ConstantInt>(II->getOperand(0))->getZExtValue());
146                 }
147             }
148         }
149     }
150     payloadBarySIMD16 = countOfDifferentBary.size() * 4;
151     inputGRFSize = (unsigned)(maxValueFromInputVec / 2) % 2 == 0 ? (maxValueFromInputVec / 2) : ((maxValueFromInputVec / 2) + 1);
152     return payloadHeaderSIMD16 + payloadBarySIMD16 + inputGRFSize;
153 }
154 
runOnModule(Module & M)155 bool PullConstantHeuristics::runOnModule(Module& M)
156 {
157     if (IGC_IS_FLAG_ENABLED(DisablePullConstantHeuristics))
158     {
159         return false;
160     }
161     CodeGenContext* ctx = nullptr;
162     ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
163     if (ctx->type == ShaderType::PIXEL_SHADER && ctx->platform.hasPSDBottleneck())
164     {
165         for (auto& F : M)
166         {
167             if (F.getBasicBlockList().size() == 1)
168             {
169                 BasicBlock* BB = &(*F.begin());
170                 if (BB->getInstList().size() < 200)
171                 {//short shaders
172                     int PSDBottleNeckThreshold = getPSDBottleNeckThreshold(F);
173                     int PayloadWithoutConstants = getCurrentPayloadSizeEstimate(F);
174                     int maxPayload_Regkey = (IGC_GET_FLAG_VALUE(PayloadSizeThreshold));
175                     PSDBottleNeckThreshold = PSDBottleNeckThreshold > maxPayload_Regkey ?
176                         maxPayload_Regkey : PSDBottleNeckThreshold;
177 
178                     int threshold = PSDBottleNeckThreshold - PayloadWithoutConstants;
179                     threshold = threshold < 0 ? 0 : threshold;
180                     thresholdMap.insert(std::make_pair(&F, threshold));
181                 }
182             }
183         }
184 
185     }
186     return false;//does not modify IR
187 }
188