1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "common/LLVMUtils.h"
10 #include "common/IGCIRBuilder.h"
11 #include "PixelShaderLowering.hpp"
12 #include "GenISAIntrinsics/GenIntrinsics.h"
13 #include "Compiler/IGCPassSupport.h"
14 #include "Probe/Assertion.h"
15 
16 using namespace llvm;
17 
18 //#define DEBUG_BLEND_TO_DISCARD
19 
20 namespace IGC
21 {
22 
// Pass registration for PixelShaderAddMask. The PASS_* macros are temporary
// arguments for the IGC_INITIALIZE_PASS_* helpers and are undefined again right
// after use so that the second pass in this file can redefine them.
// NOTE(review): PASS_ANALYSIS is 'true' although PixelShaderAddMask::runOnFunction
// rewrites instructions — confirm this flag is intended.
#define PASS_FLAG "igc-pixel-shader-addmask"
#define PASS_DESCRIPTION "Pixel shader lowering pass"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
    IGC_INITIALIZE_PASS_BEGIN(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
        IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
        IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
        IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
        IGC_INITIALIZE_PASS_END(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS

    // Pass identifier; its address is handed to the FunctionPass base class
    // by the constructor.
    char PixelShaderAddMask::ID = 0;
38 
PixelShaderAddMask()39 PixelShaderAddMask::PixelShaderAddMask() :
40     FunctionPass(ID)
41 {
42     initializePixelShaderAddMaskPass(*PassRegistry::getPassRegistry());
43 }
44 
runOnFunction(llvm::Function & F)45 bool PixelShaderAddMask::runOnFunction(llvm::Function& F)
46 {
47     m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
48 
49     Module* mod = F.getParent();
50     bool hasDiscard;
51 
52     hasDiscard = (mod->getNamedMetadata("KillPixel") != nullptr);
53     m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
54     IGCMD::MetaDataUtils* pMdUtils =
55         getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
56 
57     if (!hasDiscard || pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
58     {
59         return false;
60     }
61 
62     Instruction* globalMask = nullptr;
63     Instruction* updateMask = nullptr;
64 
65     unsigned numUpdateMask = 0;
66 
67     for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
68     {
69         for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
70         {
71             if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_InitDiscardMask))
72             {
73                 globalMask = &(*II);
74             }
75             else
76                 if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_UpdateDiscardMask))
77                 {
78                     numUpdateMask++;
79                     updateMask = &(*II);
80                 }
81         }
82     }
83     if (!globalMask)
84     {
85         return false;
86     }
87 
88     if (F.size() == 1 && numUpdateMask == 1)
89     {
90         // handle special case function has 1 BB and 1 discard, then we
91         // can directly use the discard condition for RTWrite, no need to
92         // generate GetPixelMask.
93         Value* discardCond = updateMask->getOperand(1);
94         updateMask->eraseFromParent();
95         globalMask->eraseFromParent();
96         Value* mask = nullptr;
97 
98         for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
99         {
100             RTWritIntrinsic* rtw;
101             RTDualBlendSourceIntrinsic* drt;
102 
103             for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
104             {
105                 if ((rtw = dyn_cast<RTWritIntrinsic>(II)))
106                 {
107                     IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
108                     if (!mask)
109                     {
110                         mask = BinaryOperator::CreateNot(discardCond, "", rtw);
111                     }
112 
113                     rtw->setPMask(mask);
114                 }
115                 else
116                     if ((drt = dyn_cast<RTDualBlendSourceIntrinsic>(II)))
117                     {
118                         IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
119                         if (!mask)
120                         {
121                             mask = BinaryOperator::CreateNot(discardCond, "", drt);
122                         }
123 
124                         drt->setPMask(mask);
125                     }
126             }
127         }
128     }
129     else
130     {
131         globalMask->moveBefore(globalMask->getParent()->getFirstNonPHI());
132 
133         Function* getMaskF;
134         getMaskF = GenISAIntrinsic::getDeclaration(mod,
135             GenISAIntrinsic::GenISA_GetPixelMask);
136 
137         Value* mask = nullptr;
138 
139         for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
140         {
141             RTWritIntrinsic* rtw;
142             RTDualBlendSourceIntrinsic* drt;
143 
144             mask = nullptr;
145             for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
146             {
147                 if ((rtw = dyn_cast<RTWritIntrinsic>(II)) && globalMask)
148                 {
149                     if (!mask)
150                     {
151                         mask = CallInst::Create(getMaskF, { globalMask }, "", rtw);
152                     }
153                     IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
154                     rtw->setPMask(mask);
155                 }
156                 else
157                     if ((drt = dyn_cast<RTDualBlendSourceIntrinsic>(II)) && globalMask)
158                     {
159                         if (!mask)
160                         {
161                             mask = CallInst::Create(getMaskF, { globalMask }, "", drt);
162                         }
163                         IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
164                         drt->setPMask(mask);
165                     }
166             }
167         }
168     }
169 
170     return false;
171 }
172 
// Pass identifier; its address is handed to the FunctionPass base class by
// the constructor.
char PixelShaderLowering::ID = 0;

// Register pass to igc-opt
// The PASS_* macros are temporary arguments for the IGC_INITIALIZE_PASS_*
// helpers and are undefined again right after use.
// NOTE(review): PASS_ANALYSIS is 'true' although PixelShaderLowering rewrites
// the function — confirm this flag is intended.
#define PASS_FLAG "igc-pixel-shader-lowering"
#define PASS_DESCRIPTION "This is the pixel shader lowering pass "
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
    IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
    IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
    IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
    IGC_INITIALIZE_PASS_END(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
189 
190     PixelShaderLowering::PixelShaderLowering() :
191     FunctionPass(ID),
192     m_module(nullptr),
193     PDT(nullptr),
194     m_ReturnBlock(nullptr),
195     SkipSrc0Alpha(false),
196     m_dualSrcBlendEnabled(false),
197     uavPixelSync(false)
198 {
199     initializePixelShaderLoweringPass(*PassRegistry::getPassRegistry());
200 }
201 
runOnFunction(llvm::Function & F)202 bool PixelShaderLowering::runOnFunction(llvm::Function& F)
203 {
204     m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
205     IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
206     if (!isEntryFunc(pMdUtils, &F))
207     {
208         return false;
209     }
210     m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
211 
212     for (llvm::Function::iterator bb = F.begin(), be = F.end(); bb != be; ++bb)
213     {
214         if (llvm::isa<llvm::ReturnInst>(bb->getTerminator()))
215         {
216             m_ReturnBlock = &(*bb);
217             break;
218         }
219     }
220     if (m_ReturnBlock == nullptr)
221     {
222         F.begin()->getTerminator()->eraseFromParent();
223         ReturnInst::Create(F.getContext(), &(*F.begin()));
224         m_ReturnBlock = &(*F.begin());
225     }
226     m_outputBlock = nullptr;
227 
228     m_module = F.getParent();
229     ColorOutputArray colors;
230     DebugLocArray debugLocs;
231     Value* depth = nullptr;
232     Value* mask = nullptr;
233     Value* src0Alpha = nullptr;
234     Value* stencil = nullptr;
235 
236     // src0Alphas need not be sent when renderTargetBlending metadata is disabled
237     // this means alpha to coverage and alpha test is disabled
238     // this also means the render target blending is disabled
239     SkipSrc0Alpha = m_modMD->psInfo.SkipSrc0Alpha || IGC_IS_FLAG_ENABLED(ForceDisableSrc0Alpha);
240 
241     // Check whether metadata indicates that dual source blending should be disabled
242     bool dualSourceBlendingDisabled =
243         IGC_IS_FLAG_ENABLED(DisableDualBlendSource) ||
244         m_modMD->psInfo.DualSourceBlendingDisabled;
245 
246     m_dualSrcBlendEnabled = !dualSourceBlendingDisabled;
247 
248     m_isPerSample = false;
249 
250     m_hasDiscard = (m_module->getNamedMetadata("KillPixel") != nullptr);
251 
252     // In case we are using intrinsic retrieve the output
253     FindIntrinsicOutput(colors, depth, stencil, mask, src0Alpha, debugLocs);
254 
255     if (uavPixelSync)
256     {
257         // Emitting a fence to ensure that the uav write is completed before an EOT is issued
258         IRBuilder<> builder(F.getContext());
259 
260         bool fenceFlushNone = 0;
261         EmitMemoryFence(builder, fenceFlushNone);
262     }
263 
264     // EmitRender target write intrinsic
265     EmitRTWrite(colors, depth, stencil, mask, src0Alpha, debugLocs);
266 
267     Function* pixelPhase = nullptr;
268     Function* coarsePhase = nullptr;
269     NamedMDNode* coarseNode = F.getParent()->getNamedMetadata(NAMED_METADATA_COARSE_PHASE);
270     NamedMDNode* pixelNode = F.getParent()->getNamedMetadata(NAMED_METADATA_PIXEL_PHASE);
271     bool cfgChanged = false;
272     if (coarseNode)
273     {
274         coarsePhase = mdconst::dyn_extract<Function>(coarseNode->getOperand(0)->getOperand(0));
275     }
276     if (pixelNode)
277     {
278         pixelPhase = mdconst::dyn_extract<Function>(pixelNode->getOperand(0)->getOperand(0));
279     }
280 
281     if (&F == coarsePhase && pixelPhase != nullptr && mask != nullptr)
282     {
283         EmitCoarseMask(mask);
284     }
285     return cfgChanged;
286 }
287 
FindIntrinsicOutput(ColorOutputArray & colors,Value * & depth,Value * & stencil,Value * & mask,Value * & src0Alpha,DebugLocArray & debugLocs)288 void PixelShaderLowering::FindIntrinsicOutput(
289     ColorOutputArray& colors,
290     Value*& depth,
291     Value*& stencil,
292     Value*& mask,
293     Value*& src0Alpha,
294     DebugLocArray& debugLocs)
295 {
296     constexpr uint cMaxInputs = 32;
297     constexpr uint cMaxInputComponents = cMaxInputs * 4;
298     std::bitset<cMaxInputComponents> inputComponentsUsed;
299     std::bitset<cMaxInputs> isLinearInterpolation;
300 
301     llvm::Instruction* primId = nullptr;
302     llvm::Instruction* pointCoordX = nullptr;
303     llvm::Instruction* pointCoordY = nullptr;
304     SmallVector<GenIntrinsicInst*, 4> outputInstructions;
305     SmallVector<Instruction*, 4> instructionToRemove;
306     Function& F = *m_ReturnBlock->getParent();
307     Value* btrue = llvm::ConstantInt::get(Type::getInt1Ty(m_module->getContext()), true);
308 
309     m_modMD->psInfo.colorOutputMask.resize(USC::NUM_PSHADER_OUTPUT_REGISTERS);
310 
311     for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
312     {
313         for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
314         {
315             if (GenIntrinsicInst * inst = dyn_cast<GenIntrinsicInst>(II))
316             {
317                 GenISAIntrinsic::ID IID = inst->getIntrinsicID();
318                 if (IID == GenISAIntrinsic::GenISA_uavSerializeAll ||
319                     IID == GenISAIntrinsic::GenISA_uavSerializeOnResID)
320                 {
321                     uavPixelSync = true;
322                 }
323                 else if (IID == GenISAIntrinsic::GenISA_OUTPUT)
324                 {
325                     m_outputBlock = inst->getParent();
326                     outputInstructions.push_back(inst);
327                     uint outputType = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(4))->getZExtValue();
328                     IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
329                         outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
330                         outputType == SHADER_OUTPUT_TYPE_STENCIL ||
331                         outputType == SHADER_OUTPUT_TYPE_OMASK);
332 
333                     //Need to save debug location
334                     debugLocs.push_back(((Instruction*)inst)->getDebugLoc());
335 
336                     // delete the output
337                     instructionToRemove.push_back(inst);
338                 }
339                 else if (IID == GenISAIntrinsic::GenISA_DCL_SystemValue)
340                 {
341                     SGVUsage usage = (SGVUsage)
342                         llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
343                     if (usage == PRIMITIVEID)
344                     {
345                         primId = inst;
346                     }
347                     else if (usage == POINT_COORD_X)
348                     {
349                         pointCoordX = inst;
350                     }
351                     else if (usage == POINT_COORD_Y)
352                     {
353                         pointCoordY = inst;
354                     }
355                     else if (usage == POSITION_X || usage == POSITION_Y)
356                     {
357                         LowerPositionInput(inst, usage);
358                     }
359                     else if (usage == SAMPLEINDEX)
360                     {
361                         m_isPerSample = true;
362                     }
363                 }
364                 else if (IID == GenISAIntrinsic::GenISA_DCL_inputVec)
365                 {
366                     uint setupIndex =
367                         (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
368 
369                     IGC_ASSERT_MESSAGE(setupIndex < cMaxInputComponents, "Max inputs cannot be greater than 32 x 4");
370                     inputComponentsUsed.set(setupIndex);
371 
372                     e_interpolation mode = (e_interpolation)
373                         llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
374                     switch (mode)
375                     {
376                     case EINTERPOLATION_CONSTANT:
377                         IGC_ASSERT(!isLinearInterpolation.test(setupIndex / 4));
378                         break;
379                     case EINTERPOLATION_LINEARSAMPLE:
380                     case EINTERPOLATION_LINEARNOPERSPECTIVESAMPLE:
381                         m_isPerSample = true;
382                         // fall through
383                     case EINTERPOLATION_LINEAR:
384                     case EINTERPOLATION_LINEARCENTROID:
385                     case EINTERPOLATION_LINEARNOPERSPECTIVE:
386                     case EINTERPOLATION_LINEARNOPERSPECTIVECENTROID:
387                         isLinearInterpolation.set(setupIndex / 4);
388                         break;
389                     case EINTERPOLATION_UNDEFINED:
390                     case EINTERPOLATION_VERTEX:
391                     default:
392                         IGC_ASSERT_MESSAGE(0, "Unexpected Pixel Shader input interpolation mode.");
393                     }
394                 }
395             }
396         }
397     }
398     if (primId)
399     {
400         // When PrimitiveId input is present in shader IGC allocates an additional input and returns
401         // information about the PrimitiveID input to UMD (to program SBE). This new input component
402         // is created with constant interpolation and cannot be placed in a (4-dword) location that
403         // has linearly interpolated components. Alernatively code in MarkConstantInterpolation()
404         // could be modified to ignore the additional input created for PrimitveID.
405         unsigned int location;
406         for (location = 0; location < cMaxInputComponents; location++)
407         {
408             if (inputComponentsUsed.test(location) == false &&
409                 isLinearInterpolation.test(location / 4) == false)
410             {
411                 break;
412             }
413         }
414         Value* arguments[] =
415         {
416             ConstantInt::get(Type::getInt32Ty(m_module->getContext()), location),
417             ConstantInt::get(Type::getInt32Ty(m_module->getContext()), EINTERPOLATION_CONSTANT),
418         };
419         CallInst* in = GenIntrinsicInst::Create(
420             GenISAIntrinsic::getDeclaration(
421                 m_module,
422                 GenISAIntrinsic::GenISA_DCL_inputVec,
423                 Type::getFloatTy(m_module->getContext())),
424             arguments,
425             "",
426             primId);
427         in->setDebugLoc(primId->getDebugLoc());
428         primId->replaceAllUsesWith(in);
429         NamedMDNode* primIdMD = m_module->getOrInsertNamedMetadata("PrimIdLocation");
430 
431         Constant* cval = ConstantInt::get(
432             Type::getInt32Ty(m_module->getContext()), location);
433         llvm::MDNode* locationNd = llvm::MDNode::get(
434             m_module->getContext(),
435             ConstantAsMetadata::get(cval));
436         primIdMD->addOperand(locationNd);
437     }
438     if (pointCoordX || pointCoordY)
439     {
440         // Although PointCoords needs only 2 DWORDs, IGC must allocate 4 additional input and returns
441         // information about the PointCoord input to UMD (to program SBE). These new input components
442         // are created with linear interpolation and must be placed in an empty attribute index (4 DWORDs).
443         unsigned int location;
444         for (location = 0; location < cMaxInputComponents; location += 4)
445         {
446             bool isAttributeIndexEmpty =
447                 inputComponentsUsed.test(location) == false &&
448                 inputComponentsUsed.test(location + 1) == false &&
449                 inputComponentsUsed.test(location + 2) == false &&
450                 inputComponentsUsed.test(location + 3) == false;
451             if (isAttributeIndexEmpty)
452             {
453                 isLinearInterpolation.set(location / 4);
454                 break;
455             }
456         }
457         IGC_ASSERT(location < cMaxInputComponents);
458 
459         llvm::Instruction* inputPointCoords[] = { pointCoordX, pointCoordY };
460         for (unsigned int i = 0; i < sizeof(inputPointCoords) / sizeof(inputPointCoords[0]); i++)
461         {
462             if (inputPointCoords[i] == nullptr)
463             {
464                 continue;
465             }
466             Value* arguments[] =
467             {
468                 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), location + i),
469                 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), EINTERPOLATION_LINEAR),
470             };
471             CallInst* in = GenIntrinsicInst::Create(
472                 GenISAIntrinsic::getDeclaration(
473                     m_module,
474                     GenISAIntrinsic::GenISA_DCL_inputVec,
475                     Type::getFloatTy(m_module->getContext())),
476                 arguments,
477                 "",
478                 inputPointCoords[i]);
479             in->setDebugLoc(inputPointCoords[i]->getDebugLoc());
480             inputPointCoords[i]->replaceAllUsesWith(in);
481             instructionToRemove.push_back(inputPointCoords[i]);
482         }
483 
484         NamedMDNode* PointCoordMD = m_module->getOrInsertNamedMetadata("PointCoordLocation");
485         Constant* cval = ConstantInt::get(
486             Type::getInt32Ty(m_module->getContext()), location);
487         llvm::MDNode* locationNd = llvm::MDNode::get(
488             m_module->getContext(),
489             ConstantAsMetadata::get(cval));
490         PointCoordMD->addOperand(locationNd);
491 
492     }
493     for (GenIntrinsicInst* pInst : outputInstructions)
494     {
495         uint outputType = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(4))->getZExtValue();
496         if (outputType == SHADER_OUTPUT_TYPE_DEFAULT)
497         {
498             uint RTIndex = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(5))->getZExtValue();
499 
500             unsigned mask = 0;
501             // if any of the color channel is undef, initialize it
502             // to 0 for color compression perf.
503             for (int i = 0; i < 4; i++)
504             {
505                 if (isa<UndefValue>(pInst->getOperand(i)))
506                 {
507                     if (i == 3 &&
508                         IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
509                     {
510                         // if it's alpha, then set default value to
511                         // color.r, see IGC-959.
512                         pInst->setOperand(i, pInst->getOperand(0));
513                     }
514                     else
515                     {
516                         pInst->setOperand(i,
517                             ConstantFP::get(pInst->getOperand(i)->getType(), 0.0f));
518                     }
519                 }
520                 else
521                 {
522                     mask |= 1 << i;
523                 }
524             }
525             if (RTIndex == 0)
526             {
527                 src0Alpha = pInst->getOperand(3);
528             }
529             m_modMD->psInfo.colorOutputMask[RTIndex] = mask;
530             ColorOutput data;
531             data.RTindex = RTIndex;
532             data.color[0] = pInst->getOperand(0);
533             data.color[1] = pInst->getOperand(1);
534             data.color[2] = pInst->getOperand(2);
535             data.color[3] = pInst->getOperand(3);
536             data.mask = btrue;
537             data.blendStateIndex = nullptr;
538             data.bb = pInst->getParent();
539             colors.push_back(data);
540         }
541         else if (outputType == SHADER_OUTPUT_TYPE_DEPTHOUT)
542         {
543             depth = pInst->getOperand(0);
544         }
545         else if (outputType == SHADER_OUTPUT_TYPE_STENCIL)
546         {
547             stencil = pInst->getOperand(0);
548         }
549         else if (outputType == SHADER_OUTPUT_TYPE_OMASK)
550         {
551             mask = pInst->getOperand(0);
552         }
553     }
554     for (unsigned int i = 0; i < instructionToRemove.size(); i++)
555     {
556         instructionToRemove[i]->eraseFromParent();
557     }
558 }
559 
EmitMemoryFence(IRBuilder<> & builder,bool forceFlushNone)560 void PixelShaderLowering::EmitMemoryFence(IRBuilder<>& builder, bool forceFlushNone)
561 {
562     Value* trueValue = builder.getInt1(true);
563     Value* falseValue = builder.getInt1(false);
564 
565     Value* arguments[] =
566     {
567         trueValue,
568         falseValue,
569         falseValue,
570         falseValue,
571         falseValue,
572         trueValue,
573         falseValue,
574     };
575 
576     CallInst* memFence = GenIntrinsicInst::Create(GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_memoryfence),
577         arguments,
578         "",
579         m_ReturnBlock->getTerminator());
580 }
581 
addRTWrite(BasicBlock * bbToAdd,Value * src0Alpha,Value * oMask,ColorOutput & color,Value * depth,Value * stencil)582 CallInst* PixelShaderLowering::addRTWrite(
583     BasicBlock* bbToAdd, Value* src0Alpha,
584     Value* oMask, ColorOutput& color,
585     Value* depth, Value* stencil)
586 {
587     bool isHF = false;
588     Value* undefSrc0Alpha = nullptr;
589     Value* r = color.color[0];
590     Value* g = color.color[1];
591     Value* b = color.color[2];
592     Value* a = color.color[3];
593 
594     //True if src0Alpha exists and renderTargetBlendingDisabled is false
595     bool needsSrc0Alpha = ((src0Alpha && color.RTindex > 0) && (!SkipSrc0Alpha) && src0Alpha != color.color[3]);
596     bool src0AlphaIsHF = (needsSrc0Alpha && isa<FPExtInst>(src0Alpha)) || !needsSrc0Alpha;
597 
598     if (m_cgCtx->platform.supportFP16() &&
599         (llvm::isa<llvm::FPExtInst>(r) &&
600             llvm::isa<llvm::FPExtInst>(g) &&
601             llvm::isa<llvm::FPExtInst>(b) &&
602             llvm::isa<llvm::FPExtInst>(a)) &&
603         src0AlphaIsHF &&
604         !SkipSrc0Alpha)
605     {
606 
607         FPExtInst* rInst = llvm::cast<llvm::FPExtInst>(r);
608         FPExtInst* gInst = llvm::cast<llvm::FPExtInst>(g);
609         FPExtInst* bInst = llvm::cast<llvm::FPExtInst>(b);
610         FPExtInst* aInst = llvm::cast<llvm::FPExtInst>(a);
611         FPExtInst* src0AlphaInst = nullptr;
612 
613         if (needsSrc0Alpha &&
614             llvm::isa<llvm::FPExtInst>(src0Alpha))
615             src0AlphaInst = llvm::cast<llvm::FPExtInst>(src0Alpha);
616 
617         r = rInst->getOperand(0);
618 
619         g = gInst->getOperand(0);
620 
621         b = bInst->getOperand(0);
622 
623         a = aInst->getOperand(0);
624 
625         if (src0AlphaInst)
626         {
627             src0Alpha = src0AlphaInst->getOperand(0);
628         }
629         isHF = true;
630     }
631 
632     if (r->getType()->isHalfTy())
633     {
634         isHF = true;
635     }
636 
637     /*
638         In case src0Alpha comes from a HF RT Write
639         */
640     IRBuilder<> builder(bbToAdd->getTerminator());
641     if (!isHF &&
642         needsSrc0Alpha &&
643         src0Alpha->getType()->isHalfTy())
644     {
645         if (llvm::isa<llvm::FPTruncInst>(src0Alpha))
646         {
647             src0Alpha = (llvm::cast<llvm::FPTruncInst>(src0Alpha))->getOperand(0);
648         }
649         else
650         {
651             src0Alpha = builder.CreateFPExt(src0Alpha, builder.getFloatTy());
652         }
653     }
654     else if (isHF &&
655         needsSrc0Alpha &&
656         src0Alpha->getType()->isFloatTy())
657     {
658         /*
659             reverse, src0Alpha comes from half float in to float RT Write
660             */
661         if (llvm::isa<llvm::FPExtInst>(src0Alpha))
662         {
663             src0Alpha = (llvm::cast<llvm::FPExtInst>(src0Alpha))->getOperand(0);
664         }
665         else
666         {
667             src0Alpha = builder.CreateFPTrunc(src0Alpha, llvm::Type::getHalfTy(m_module->getContext()));
668         }
669     }
670 
671     if (isHF)
672         undefSrc0Alpha = llvm::UndefValue::get(Type::getHalfTy(m_module->getContext()));
673     else
674         undefSrc0Alpha = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
675 
676     Type* i32t = Type::getInt32Ty(m_module->getContext());
677     Type* i1t = Type::getInt1Ty(m_module->getContext());
678     Value* undef = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
679     Value* iundef = llvm::UndefValue::get(i32t);
680     Value* i1true = ConstantInt::get(i1t, 1);
681     Value* i1false = ConstantInt::get(i1t, 0);
682     Value* vrtIdx = ConstantInt::get(i32t, color.RTindex);
683     Value* vblendIdx = color.blendStateIndex ? color.blendStateIndex : vrtIdx;
684     Value* hasOmask = (oMask || m_modMD->psInfo.outputMask) ? i1true : i1false;
685     Value* hasDepth = (depth || m_modMD->psInfo.outputDepth) ? i1true : i1false;
686     Value* hasStencil = (stencil || m_modMD->psInfo.outputStencil) ? i1true : i1false;
687 
688     Value* arguments[] = {
689         needsSrc0Alpha ? src0Alpha : undefSrc0Alpha,    // 0
690         oMask ? oMask : undef,                          // 1 - oMask
691         color.mask,                                     // 2 - pMask
692         r, g, b, a,                                     // 3,4,5,6
693         depth ? depth : undef,                          // 7
694         stencil ? stencil : undef,                      // 8
695         vrtIdx,                                         // 9 - RT index
696         vblendIdx,                                      // 10 - blend state index
697         hasOmask,                                       // 11
698         hasDepth,                                       // 12
699         hasStencil,                                     // 13
700         i1false,                                        // 14 - per sample
701         iundef                                          // 15 - sample idx
702     };
703 
704     Function* frtw;
705 
706     if (isHF)
707     {
708         frtw = GenISAIntrinsic::getDeclaration(m_module,
709             GenISAIntrinsic::GenISA_RTWrite,
710             Type::getHalfTy(this->m_module->getContext()));
711     }
712     else
713     {
714         frtw = GenISAIntrinsic::getDeclaration(m_module,
715             GenISAIntrinsic::GenISA_RTWrite,
716             Type::getFloatTy(this->m_module->getContext()));
717     }
718 
719     return GenIntrinsicInst::Create(frtw, arguments, "",
720         bbToAdd->getTerminator());
721 }
722 
723 #ifdef DEBUG_BLEND_TO_DISCARD
724 // debug function
dbgPrintBlendOptMode(uint64_t hash,std::vector<int> & blendOpt,unsigned ncolors)725 static void dbgPrintBlendOptMode(uint64_t hash,
726     std::vector<int>& blendOpt, unsigned ncolors)
727 {
728     static const char* blendOptName[] =
729     {
730         "BLEND_OPTIMIZATION_NONE",
731         "BLEND_OPTIMIZATION_SRC_ALPHA",
732         "BLEND_OPTIMIZATION_INV_SRC_ALPHA",
733         "BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY",
734         "BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY",
735         "BLEND_OPTIMIZATION_SRC_COLOR_ZERO",
736         "BLEND_OPTIMIZATION_SRC_COLOR_ONE",
737         "BLEND_OPTIMIZATION_SRC_BOTH_ZERO",
738         "BLEND_OPTIMIZATION_SRC_BOTH_ONE",
739         "BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO",
740         "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE",
741         "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE"
742     };
743     bool doprint = false;
744     for (unsigned i = 0; i < ncolors; i++)
745     {
746         if (blendOpt[i] != USC::BLEND_OPTIMIZATION_NONE)
747             doprint = true;
748     }
749     if (doprint)
750     {
751         printf("%016llx blend opt[%d]:\n", hash, ncolors);
752         for (unsigned i = 0; i < ncolors; i++)
753         {
754             printf("  %s\n", blendOptName[blendOpt[i]]);
755         }
756     }
757 }
758 #endif
759 
EmitRTWrite(ColorOutputArray & colors,Value * depth,Value * stencil,Value * oMask,Value * src0Alpha,DebugLocArray & debugLocs)760 void PixelShaderLowering::EmitRTWrite(
761     ColorOutputArray& colors, Value* depth, Value* stencil,
762     Value* oMask, Value* src0Alpha, DebugLocArray& debugLocs)
763 {
764     if (!m_hasDiscard)
765     {
766         // no discard found
767         //IGC_ASSERT(m_module->getNamedMetadata("KillPixel") == nullptr);
768 
769         // check blend to discard optimization and generate mask for each
770         // render target output
771         std::vector<int>& blendOpt = m_modMD->psInfo.blendOptimizationMode;
772 #ifdef DEBUG_BLEND_TO_DISCARD
773         dbgPrintBlendOptMode(m_cgCtx->hash.getAsmHash(), blendOpt, colors.size());
774 #endif
775 
776         if (blendOpt.size() && !useDualSrcBlend(colors))
777         {
778             bool hasDiscard = false;
779 
780             unsigned maxRTIndex = 0;
781             for (unsigned i = 0; i < colors.size(); i++)
782             {
783                 if (maxRTIndex < colors[i].RTindex)
784                 {
785                     maxRTIndex = colors[i].RTindex;
786                 }
787             }
788 
789             for (unsigned i = 0; i < colors.size(); i++)
790             {
791                 USC::BLEND_OPTIMIZATION_MODE blendOptMode =
792                     static_cast<USC::BLEND_OPTIMIZATION_MODE>(blendOpt[i]);
793 
794                 // Only do blend to fill if the shader is persample, hardware
795                 // already does blend to fill for other cases.
796                 bool enableBlendToFill =
797                     m_cgCtx->m_DriverInfo.SupportBlendToFillOpt() &&
798                     maxRTIndex <= 4 && m_isPerSample;
799 
800                 if (optBlendState(blendOptMode, colors[i], enableBlendToFill))
801                 {
802                     // for blend to discard opt, we need to force earlyz
803                     hasDiscard = true;
804                     m_modMD->psInfo.forceEarlyZ = true;
805                 }
806             }
807 
808             if (hasDiscard)
809             {
810                 m_module->getOrInsertNamedMetadata("KillPixel");
811             }
812         }
813     }
814 
815     uint32_t RTindexVal = -1;
816     //According to Spec, the RT Write instruction must follow this order : dual source followed by single source
817     if (useDualSrcBlend(colors))
818     {
819         //If RT0 is executed first when size is 2
820         if (colors[0].RTindex == 0 && colors[1].RTindex == 1)
821         {
822             RTindexVal = 0;
823         }
824         else if (colors[0].RTindex == 1 && colors[1].RTindex == 0)
825         {
826 
827             RTindexVal = 1;
828         }
829     }
830 
831     if (RTindexVal != -1)
832     {
833         //dual source RTWrite first
834         colors[RTindexVal].inst = addDualBlendWrite(
835             colors[RTindexVal].bb,
836             oMask,
837             colors[RTindexVal],
838             colors[1 - RTindexVal],
839             depth, stencil, 0);
840         colors[RTindexVal].inst->setDebugLoc(debugLocs[RTindexVal]);
841 
842         //Single source RTWrite
843         colors[1 - RTindexVal].inst = addRTWrite(
844             colors[1 - RTindexVal].bb,
845             src0Alpha,
846             oMask, colors[1 - RTindexVal],
847             depth,
848             stencil);
849         colors[1 - RTindexVal].inst->setDebugLoc(debugLocs[1 - RTindexVal]);
850     }
851     else
852     {
853         for (unsigned int i = 0; i < colors.size(); i++)
854         {
855             colors[i].inst = addRTWrite(
856                 colors[i].bb,
857                 src0Alpha,
858                 oMask, colors[i],
859                 depth,
860                 stencil);
861 
862             colors[i].inst->setDebugLoc(debugLocs[i]);
863         }
864     }
865 
866     // pick up 1 RTWrite and move it to return block, so we don't need to
867     // generate an additional null surface write for EOT.
868     if (m_hasDiscard)
869     {
870         moveRTWritesToReturnBlock(colors);
871     }
872 
873     checkAndCreateNullRTWrite(oMask, depth, stencil);
874 }
875 
fixHFSource(IRBuilder<> & builder,Value * val)876 inline Value* fixHFSource(IRBuilder<>& builder, Value* val)
877 {
878     if (val->getType()->isFloatTy())
879         return val;
880 
881     if (llvm::isa<llvm::FPTruncInst>(val))
882     {
883         return (llvm::cast<llvm::FPTruncInst>(val))->getOperand(0);
884     }
885     else
886     {
887         return builder.CreateFPExt(val, builder.getFloatTy());
888     }
889 }
890 
addDualBlendWrite(BasicBlock * bbToAdd,Value * oMask,ColorOutput & color0,ColorOutput & color1,Value * depth,Value * stencil,uint index)891 CallInst* PixelShaderLowering::addDualBlendWrite(
892     BasicBlock* bbToAdd, Value* oMask,
893     ColorOutput& color0, ColorOutput& color1,
894     Value* depth, Value* stencil, uint index)
895 {
896     bool isFP16 = false;
897     bool isFP32 = false;
898     Value* pMask = color0.mask;
899     Value* r0 = color0.color[0];
900     Value* g0 = color0.color[1];
901     Value* b0 = color0.color[2];
902     Value* a0 = color0.color[3];
903     Value* r1 = color1.color[0];
904     Value* g1 = color1.color[1];
905     Value* b1 = color1.color[2];
906     Value* a1 = color1.color[3];
907 
908     IGC_ASSERT(color0.mask == color1.mask);
909 
910     //assuming types are consistent
911     if (r0->getType()->isHalfTy() ||
912         r1->getType()->isHalfTy())
913     {
914         isFP16 = true;
915     }
916 
917     if (r0->getType()->isFloatTy() ||
918         r1->getType()->isFloatTy())
919     {
920         isFP32 = true;
921     }
922 
923     /*
924         if we are combining FP32 and FP16 RT writes
925         promote everything to FP32
926         Three Cases:
927         Case 1) Immediate, extend to FP32 Immediate.
928         Case 2) FP16 Not Immediate. Not result to FPTrunc. Add FPExt Instruction
929         Case 3) FP16 Not Immediate. Result of FPTrunc. Use src of FPTrunc
930     */
931     if (isFP16 && isFP32)
932     {
933         IRBuilder<> builder(bbToAdd->getTerminator());
934         r0 = fixHFSource(builder, r0);
935         g0 = fixHFSource(builder, g0);
936         b0 = fixHFSource(builder, b0);
937         a0 = fixHFSource(builder, a0);
938         r1 = fixHFSource(builder, r1);
939         g1 = fixHFSource(builder, g1);
940         b1 = fixHFSource(builder, b1);
941         a1 = fixHFSource(builder, a1);
942     }
943 
944     Type* i32t = Type::getInt32Ty(m_module->getContext());
945     Type* i1t = Type::getInt1Ty(m_module->getContext());
946     Value* undef = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
947     Value* iundef = llvm::UndefValue::get(i32t);
948     Value* i1true = ConstantInt::get(i1t, 1);
949     Value* i1false = ConstantInt::get(i1t, 0);
950 
951     Value* arguments[] = {
952         oMask ? oMask : undef,          // 0 - oMask
953         pMask,                          // 1 - pMask
954         r0, g0, b0, a0,                 // 2, 3, 4, 5
955         r1, g1, b1, a1,                 // 6, 7, 8, 9
956         depth ? depth : undef,          // 10
957         stencil ? stencil : undef,      // 11
958         ConstantInt::get(i32t, index),  // 12 - RT index
959         oMask ? i1true : i1false,       // 13
960         depth ? i1true : i1false,       // 14
961         stencil ? i1true : i1false,     // 15
962         i1false,                        // 16 - per sample
963         iundef,                         // 17 - sample index
964     };
965     return GenIntrinsicInst::Create(
966         GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_RTDualBlendSource, r0->getType()),
967         arguments,
968         "",
969         bbToAdd->getTerminator());
970 }
971 
EmitCoarseMask(llvm::Value * mask)972 void PixelShaderLowering::EmitCoarseMask(llvm::Value* mask)
973 {
974     Type* floatTy = Type::getFloatTy(m_module->getContext());
975     Value* undef = llvm::UndefValue::get(floatTy);
976     Value* oMaskType =
977         ConstantInt::get(Type::getInt32Ty(m_module->getContext()), SHADER_OUTPUT_TYPE_OMASK);
978     Value* zero = ConstantInt::get(Type::getInt32Ty(m_module->getContext()), 0);
979     Value* arguments[] =
980     {
981         mask,
982         undef,
983         undef,
984         undef,
985         oMaskType,
986         zero,
987     };
988 
989     GenIntrinsicInst::Create(
990         GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_OUTPUT, floatTy),
991         arguments,
992         "",
993         m_ReturnBlock->getTerminator());
994 }
995 
LowerPositionInput(GenIntrinsicInst * positionInstr,uint usage)996 void PixelShaderLowering::LowerPositionInput(GenIntrinsicInst* positionInstr, uint usage)
997 {
998     IRBuilder<> builder(positionInstr);
999     Function* positionIntr = GenISAIntrinsic::getDeclaration(m_module,
1000         usage == POSITION_X ? GenISAIntrinsic::GenISA_PixelPositionX : GenISAIntrinsic::GenISA_PixelPositionY);
1001     Value* intPosition = builder.CreateCall(positionIntr);
1002     Value* floatPosition = positionInstr;
1003     if (floatPosition->hasOneUse())
1004     {
1005         if (BinaryOperator * fadd = dyn_cast<BinaryOperator>(*floatPosition->user_begin()))
1006         {
1007             if (ConstantFP * cst = dyn_cast<ConstantFP>(fadd->getOperand(1)))
1008             {
1009                 float constant = cst->getValueAPF().convertToFloat();
1010                 if (constant >= 0.0f && constant < 1.f)
1011                 {
1012                     floatPosition = fadd;
1013                 }
1014             }
1015         }
1016     }
1017     if (floatPosition->hasOneUse())
1018     {
1019         Value* v = *floatPosition->user_begin();
1020         if (v->getType()->isIntegerTy(32) && (isa<FPToUIInst>(v) || isa<FPToSIInst>(v)))
1021         {
1022             for (auto UI = v->user_begin(), UE = v->user_end(); UI != UE;)
1023             {
1024                 Value* use = *UI++;
1025                 if (TruncInst * truncI = dyn_cast<TruncInst>(use))
1026                 {
1027                     truncI->replaceAllUsesWith(builder.CreateZExtOrTrunc(intPosition, truncI->getType()));
1028                 }
1029             }
1030             if (!v->user_empty())
1031             {
1032                 v->replaceAllUsesWith(builder.CreateZExt(intPosition, v->getType()));
1033             }
1034             return;
1035         }
1036     }
1037     positionInstr->replaceAllUsesWith(builder.CreateUIToFP(intPosition, positionInstr->getType()));
1038 }
1039 
1040 // Based on blend state, check color output and discard them if possible.
optBlendState(USC::BLEND_OPTIMIZATION_MODE blendOpt,ColorOutput & colorOut,bool enableBlendToFill)1041 bool PixelShaderLowering::optBlendState(
1042     USC::BLEND_OPTIMIZATION_MODE blendOpt,
1043     ColorOutput& colorOut,
1044     bool enableBlendToFill)
1045 {
1046     Function* fBallot = GenISAIntrinsic::getDeclaration(m_module,
1047         GenISAIntrinsic::GenISA_WaveBallot);
1048 
1049     bool enableBlendToDiscard =
1050         IGC_IS_FLAG_ENABLED(EnableBlendToDiscard) &&
1051         m_cgCtx->platform.enableBlendToDiscardAndFill();
1052     enableBlendToFill = enableBlendToFill &&
1053         IGC_IS_FLAG_ENABLED(EnableBlendToFill) &&
1054         m_cgCtx->platform.enableBlendToDiscardAndFill();
1055 
1056     bool hasDiscard = false;
1057 
1058     if (m_modMD->psInfo.outputDepth || m_modMD->psInfo.outputStencil)
1059     {
1060         enableBlendToDiscard = false;
1061     }
1062 
1063     IGCIRBuilder<> irb(m_ReturnBlock->getTerminator());
1064 
1065     switch (blendOpt)
1066     {
1067 
1068     case USC::BLEND_OPTIMIZATION_SRC_ALPHA:
1069     {
1070         // discard: src.a == 0, fill: src.a == 1
1071 
1072         if (enableBlendToDiscard)
1073         {
1074             Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
1075             Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
1076             colorOut.mask = ane0;
1077             hasDiscard = true;
1078         }
1079 
1080         if (enableBlendToFill)
1081         {
1082             // ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
1083             Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
1084             Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1085             Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
1086             Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
1087             colorOut.blendStateIndex = irb.CreateSelect(any,
1088                 irb.getInt32(colorOut.RTindex),
1089                 irb.getInt32(colorOut.RTindex + 4));
1090             m_modMD->psInfo.blendToFillEnabled = true;
1091         }
1092         return hasDiscard;
1093     }
1094 
1095     case USC::BLEND_OPTIMIZATION_INV_SRC_ALPHA:
1096     {
1097         // discard: src.a == 1, fill: src.a == 0
1098         Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
1099 
1100         if (enableBlendToDiscard)
1101         {
1102             Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1103             colorOut.mask = ane1;
1104             hasDiscard = true;
1105         }
1106 
1107         if (enableBlendToFill)
1108         {
1109             // ifall(src.a == 0) ? RTIndex + 4 : RTIndex
1110             // ifany(src.a != 0) ? RTIndex : RTIndex + 4
1111             Value* ai = irb.CreateBitCast(colorOut.color[3], irb.getInt32Ty());
1112             Value* ane0 = irb.CreateICmpNE(ai, irb.getInt32(0));
1113             Value* ane0_ballot = irb.CreateCall(fBallot, { ane0 });
1114             Value* any = irb.CreateICmpNE(ane0_ballot, irb.getInt32(0));
1115             colorOut.blendStateIndex = irb.CreateSelect(any,
1116                 irb.getInt32(colorOut.RTindex),
1117                 irb.getInt32(colorOut.RTindex + 4));
1118             m_modMD->psInfo.blendToFillEnabled = true;
1119         }
1120         return hasDiscard;
1121     }
1122 
1123     case USC::BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY:
1124     {
1125         // discard: src.a == 0
1126         if (enableBlendToDiscard)
1127         {
1128             Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
1129             Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
1130             colorOut.mask = ane0;
1131             hasDiscard = true;
1132         }
1133         return hasDiscard;
1134     }
1135 
1136     case USC::BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY:
1137     {
1138         // fill: src.a == 1
1139         if (enableBlendToFill)
1140         {
1141             // ifall(src.a == 1.0) ? RTIndex + 4 : RTIndex
1142             // ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
1143             Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
1144             Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1145             Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
1146             Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
1147             colorOut.blendStateIndex = irb.CreateSelect(any,
1148                 irb.getInt32(colorOut.RTindex),
1149                 irb.getInt32(colorOut.RTindex + 4));
1150             m_modMD->psInfo.blendToFillEnabled = true;
1151         }
1152         return false;
1153     }
1154 
1155     case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO:
1156     {
1157         // discard: src.rgb == 0
1158         if (enableBlendToDiscard)
1159         {
1160             colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1161             hasDiscard = true;
1162         }
1163         return hasDiscard;
1164     }
1165 
1166     case USC::BLEND_OPTIMIZATION_SRC_COLOR_ONE:
1167     {
1168         // discard if src.rgb == 1
1169         if (enableBlendToDiscard)
1170         {
1171             ConstantFP* f1 = cast<ConstantFP>(
1172                 ConstantFP::get(colorOut.color[0]->getType(), 1.0));
1173 
1174             Value* rne1 = fcmpUNEConst(irb, colorOut.color[0], f1);
1175             Value* gne1 = fcmpUNEConst(irb, colorOut.color[1], f1);
1176             Value* bne1 = fcmpUNEConst(irb, colorOut.color[2], f1);
1177 
1178             colorOut.mask = createOr(irb, bne1, createOr(irb, rne1, gne1));
1179             hasDiscard = true;
1180         }
1181         return hasDiscard;
1182     }
1183 
1184     case USC::BLEND_OPTIMIZATION_SRC_BOTH_ZERO:
1185     {
1186         // discard: src.rgba == 0
1187         if (enableBlendToDiscard)
1188         {
1189             colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 4);
1190 
1191             hasDiscard = true;
1192         }
1193         return hasDiscard;
1194     }
1195 
1196     case USC::BLEND_OPTIMIZATION_SRC_BOTH_ONE:
1197     {
1198         // discard if src.rgba == 1
1199         if (enableBlendToDiscard)
1200         {
1201             Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
1202 
1203             Value* rne1 = irb.CreateFCmpUNE(colorOut.color[0], f1);
1204             Value* gne1 = irb.CreateFCmpUNE(colorOut.color[1], f1);
1205             Value* bne1 = irb.CreateFCmpUNE(colorOut.color[2], f1);
1206             Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1207             colorOut.mask = irb.CreateOr(ane1, irb.CreateOr(bne1, irb.CreateOr(rne1, gne1)));
1208             hasDiscard = true;
1209         }
1210         return hasDiscard;
1211     }
1212 
1213     case USC::BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO:
1214     {
1215         // discard: src.a == 0 || src.rgb == 0
1216         if (enableBlendToDiscard)
1217         {
1218             Value* a = colorOut.color[3];
1219             Constant* f0 = ConstantFP::get(a->getType(), 0.0);
1220 
1221             Value* ane0 = irb.CreateFCmpUNE(a, f0);
1222 
1223             Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1224 
1225             colorOut.mask = irb.CreateAnd(ane0, cne0);
1226             hasDiscard = true;
1227         }
1228         return hasDiscard;
1229     }
1230 
1231     case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE:
1232     {
1233         // discard: src.rgb == 0 && src.a == 1
1234         // equivalently mask = (r|g|b != 0) || (a != 1)
1235         if (enableBlendToDiscard)
1236         {
1237             Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1238 
1239             Value* a = colorOut.color[3];
1240             Constant* f1 = ConstantFP::get(a->getType(), 1.0);
1241             Value* ane1 = irb.CreateFCmpUNE(a, f1);
1242 
1243             colorOut.mask = irb.CreateOr(cne0, ane1);
1244             hasDiscard = true;
1245         }
1246 
1247         return hasDiscard;
1248     }
1249 
1250     case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE:
1251     {
1252         // Discard: src.rgb == 0 and don't compute src.a
1253         // equivalently mask = (r|g|b != 0)
1254         if (enableBlendToDiscard)
1255         {
1256             colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1257             hasDiscard = true;
1258         }
1259 
1260         // set output alpha as output.r, see IGC-959
1261         if (IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
1262         {
1263             colorOut.color[3] = colorOut.color[0];
1264         }
1265         else
1266         {
1267             colorOut.color[3] = ConstantFP::get(
1268                 colorOut.color[3]->getType(), 0.0);
1269         }
1270 
1271         return hasDiscard;
1272     }
1273 
1274     default:
1275         return false;
1276     }
1277 }
1278 
moveRTWriteToBlock(CallInst * call,SmallVector<BasicBlock *,8> & predBB,BasicBlock * toBB,llvm::DenseMap<llvm::Value *,llvm::PHINode * > & valueToPhiMap)1279 void PixelShaderLowering::moveRTWriteToBlock(
1280     CallInst* call, SmallVector<BasicBlock*, 8> & predBB, BasicBlock* toBB,
1281     llvm::DenseMap<llvm::Value*, llvm::PHINode*>& valueToPhiMap)
1282 {
1283     unsigned numPredBB = predBB.size();
1284     if (numPredBB > 1)
1285     {
1286         for (unsigned i = 0; i < call->getNumArgOperands(); i++)
1287         {
1288             if (Instruction * inst = dyn_cast<Instruction>(call->getArgOperand(i)))
1289             {
1290                 auto it = valueToPhiMap.find(inst);
1291                 if (it != valueToPhiMap.end())
1292                 {
1293                     call->setArgOperand(i, it->second);
1294                     continue;
1295                 }
1296 
1297                 PHINode* phi = PHINode::Create(
1298                     inst->getType(), numPredBB, "", &(*toBB->begin()));
1299                 valueToPhiMap[inst] = phi;
1300                 for (unsigned j = 0; j < numPredBB; j++)
1301                 {
1302                     Value* inVal;
1303                     if (predBB[j] == call->getParent())
1304                     {
1305                         inVal = inst;
1306                     }
1307                     else
1308                     {
1309                         inVal = UndefValue::get(inst->getType());
1310                     }
1311                     phi->addIncoming(inVal, predBB[j]);
1312                 }
1313                 call->setArgOperand(i, phi);
1314             }
1315         }
1316     }
1317 
1318     call->removeFromParent();
1319     call->insertBefore(toBB->getTerminator());
1320 }
1321 
moveRTWritesToReturnBlock(const ColorOutputArray & colors)1322 void PixelShaderLowering::moveRTWritesToReturnBlock(
1323     const ColorOutputArray& colors)
1324 {
1325     if (colors.size())
1326     {
1327         IGC_ASSERT(colors[0].inst != nullptr);
1328         SmallVector<BasicBlock*, 8> predBB;
1329         DenseMap<Value*, PHINode*> valueToPhiMap;
1330         for (auto PI = pred_begin(m_ReturnBlock), PE = pred_end(m_ReturnBlock);
1331             PI != PE; ++PI)
1332         {
1333             predBB.push_back(*PI);
1334         }
1335 
1336         if (useDualSrcBlend(colors))
1337         {
1338             // For SIMD16 PS thread with two output colors must send
1339             // messages in the following sequence for each RT: SIMD8 dual
1340             // source RTW message (low); SIMD8 dual source RTW message
1341             // (high); SIMD16 single src RTW message with second color.
1342             CallInst* const dualSourceRTW =
1343                 isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[0].inst : colors[1].inst;
1344             CallInst* const singleSourceRTW =
1345                 isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[1].inst : colors[0].inst;
1346 
1347             IGC_ASSERT(isa<RTWritIntrinsic>(singleSourceRTW));
1348             IGC_ASSERT(isa<RTDualBlendSourceIntrinsic>(dualSourceRTW));
1349 
1350             moveRTWriteToBlock(dualSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
1351             moveRTWriteToBlock(singleSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
1352         }
1353         else
1354         {
1355             moveRTWriteToBlock(colors[0].inst, predBB, m_ReturnBlock, valueToPhiMap);
1356         }
1357     }
1358 }
1359 
createPhiForRTWrite(Value * val,smallvector<BasicBlock *,8> & predBB,BasicBlock * toBB)1360 PHINode* PixelShaderLowering::createPhiForRTWrite(Value* val,
1361     smallvector<BasicBlock*, 8> & predBB, BasicBlock* toBB)
1362 {
1363     PHINode* phi = PHINode::Create(
1364         val->getType(), predBB.size(), "", &(*toBB->begin()));
1365     for (auto* BB : predBB)
1366     {
1367         Value* inVal;
1368         if (BB == m_outputBlock)
1369             inVal = val;
1370         else
1371             inVal = UndefValue::get(val->getType());
1372         phi->addIncoming(inVal, BB);
1373     }
1374     return phi;
1375 }
1376 
1377 // create a null surface write in return block if there's no one
checkAndCreateNullRTWrite(Value * oMask,Value * depth,Value * stencil)1378 void PixelShaderLowering::checkAndCreateNullRTWrite(
1379     Value* oMask, Value* depth, Value* stencil)
1380 {
1381     bool hasRTW = false;
1382     for (auto& I : *m_ReturnBlock)
1383     {
1384         if (isa<RTWritIntrinsic>(&I) ||
1385             isa<RTDualBlendSourceIntrinsic>(&I))
1386         {
1387             hasRTW = true;
1388             break;
1389         }
1390     }
1391 
1392     if (!hasRTW)
1393     {
1394         Value* undef = UndefValue::get(Type::getFloatTy(m_module->getContext()));
1395         ColorOutput color;
1396         color.color[0] = color.color[1] = color.color[2] = color.color[3] = undef;
1397         color.mask = ConstantInt::get(Type::getInt1Ty(m_module->getContext()), true);
1398         color.RTindex = -1;
1399         color.blendStateIndex = nullptr;
1400 
1401         if (m_outputBlock != m_ReturnBlock)
1402         {
1403             smallvector<BasicBlock*, 8> predBB;
1404 
1405             for (auto PI = pred_begin(m_ReturnBlock), PE = pred_end(m_ReturnBlock);
1406                 PI != PE; ++PI)
1407             {
1408                 predBB.push_back(*PI);
1409             }
1410             if (predBB.size() > 1)
1411             {
1412                 if (oMask)
1413                 {
1414                     oMask = createPhiForRTWrite(oMask, predBB, m_ReturnBlock);
1415                 }
1416                 if (depth)
1417                 {
1418                     depth = createPhiForRTWrite(depth, predBB, m_ReturnBlock);
1419                 }
1420                 if (stencil)
1421                 {
1422                     stencil = createPhiForRTWrite(stencil, predBB, m_ReturnBlock);
1423                 }
1424             }
1425         }
1426         addRTWrite(
1427             m_ReturnBlock,
1428             undef,
1429             oMask, color,
1430             depth, stencil);
1431     }
1432 }
1433 
1434 ///////////////////////////////////////////////////////////////////////
1435 // Lower discard intrinsics
1436 ///////////////////////////////////////////////////////////////////////
1437 
// Registration of the DiscardLowering pass. The four PASS_* macros only feed
// the IGC_INITIALIZE_PASS_* invocations below and are undefined right after.
#define PASS_FLAG "igc-lower-discard"
#define PASS_DESCRIPTION "Lower discard intrinsics"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
// Declared dependencies: MetaDataUtilsWrapper and CodeGenContextWrapper.
IGC_INITIALIZE_PASS_BEGIN(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
    IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
    IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
    IGC_INITIALIZE_PASS_END(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS

    // Pass identifier; it is passed to the FunctionPass base constructor.
    char DiscardLowering::ID = 0;
1452 
DiscardLowering()1453 DiscardLowering::DiscardLowering()
1454     : FunctionPass(ID)
1455 {
1456     initializeDiscardLoweringPass(*PassRegistry::getPassRegistry());
1457 }
1458 
lowerDiscards(Function & F)1459 bool DiscardLowering::lowerDiscards(Function& F)
1460 {
1461     if (m_discards.empty() && m_isHelperInvocationCalls.empty())
1462     {
1463         return false;
1464     }
1465 
1466     m_earlyRet = BasicBlock::Create(m_module->getContext(), "DiscardRet", &F);
1467 
1468     // add OUTPUT_PIXELMASK call to track discard conditions
1469     IRBuilder<> irb(m_earlyRet);
1470 
1471     irb.CreateRetVoid();
1472 
1473     if (m_retBB)
1474     {
1475         m_retBB->getTerminator()->eraseFromParent();
1476         BranchInst::Create(m_earlyRet, m_retBB);
1477     }
1478     m_retBB = m_earlyRet;
1479 
1480     Function* fInitMask = GenISAIntrinsic::getDeclaration(m_module,
1481         GenISAIntrinsic::GenISA_InitDiscardMask);
1482     Function* fSetMask = GenISAIntrinsic::getDeclaration(m_module,
1483         GenISAIntrinsic::GenISA_UpdateDiscardMask);
1484 
1485     Value* discardMask = CallInst::Create(fInitMask, llvm::None, "",
1486         m_entryBB->getFirstNonPHI());
1487 
1488     bool killsPixels = false;
1489 
1490     for (auto discard : m_discards)
1491     {
1492         IGC_ASSERT(discard->isGenIntrinsic(GenISAIntrinsic::GenISA_discard));
1493         killsPixels = true;
1494 
1495         BasicBlock* bbDiscard;
1496         BasicBlock* bbAfter;
1497 
1498         bbDiscard = discard->getParent();
1499 
1500         BasicBlock::iterator bi = discard->getIterator();
1501         ++bi;
1502         bbAfter = bbDiscard->splitBasicBlock(
1503             bi, "postDiscard");
1504 
1505         // erase the branch inserted by splitBasicBLock
1506         bbDiscard->getTerminator()->eraseFromParent();
1507 
1508         // create conditional branch to early ret
1509         IRBuilder<> B(discard);
1510 
1511         // call discard(%dcond)
1512         // -->
1513         // UpdatePixelMask(%globalMask, %dcond) ; update discard pixel mask in dmask
1514         // %all = WaveBallot(%dcond)
1515         // %1 = icmp eq i32 %all, -1    ; if.all %dcond returnBB
1516         // br %1, returnBB, postDiscardBB
1517 
1518         Value* discardCond = discard->getOperand(0);
1519         Value* v = B.CreateCall(fSetMask, { discardMask, discardCond });
1520 
1521         B.CreateCondBr(v, m_earlyRet, bbAfter);
1522     }
1523 
1524     if (killsPixels)
1525     {
1526         m_module->getOrInsertNamedMetadata("KillPixel");
1527     }
1528 
1529     for (auto inst : m_isHelperInvocationCalls)
1530     {
1531         IRBuilder<> B(inst);
1532         Function* getPixelMask = GenISAIntrinsic::getDeclaration(m_module,
1533             GenISAIntrinsic::GenISA_GetPixelMask);
1534         llvm::Value* pixelMask = B.CreateCall(getPixelMask, { discardMask });
1535         inst->replaceAllUsesWith(B.CreateNot(pixelMask));
1536         inst->eraseFromParent();
1537     }
1538 
1539     for (auto discard : m_discards)
1540     {
1541         discard->eraseFromParent();
1542     }
1543 
1544 
1545     return true;
1546 }
1547 
runOnFunction(Function & F)1548 bool DiscardLowering::runOnFunction(Function& F)
1549 {
1550     IGCMD::MetaDataUtils* pMdUtils = nullptr;
1551     pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
1552     if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
1553     {
1554         return false;
1555     }
1556 
1557     m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
1558     m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
1559 
1560     m_entryBB = &F.getEntryBlock();
1561     m_module = F.getParent();
1562 
1563     // find return block
1564     for (auto& bb : F)
1565     {
1566         if (llvm::isa<llvm::ReturnInst>(bb.getTerminator()))
1567         {
1568             m_retBB = &bb;
1569             break;
1570         }
1571     }
1572 
1573     SmallVector<GenIntrinsicInst*, 4> discardToDel;
1574 
1575     for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
1576     {
1577         for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
1578         {
1579             GenIntrinsicInst* inst = dyn_cast<GenIntrinsicInst>(II);
1580             if (inst)
1581             {
1582                 if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_discard))
1583                 {
1584                     // get rid of discard(false)
1585                     if (ConstantInt * cval = dyn_cast<ConstantInt>(inst->getOperand(0)))
1586                     {
1587                         if (cval->isZero())
1588                         {
1589                             discardToDel.push_back(inst);
1590                             continue;
1591                         }
1592                     }
1593                     m_discards.push_back(inst);
1594                 }
1595                 else if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_IsHelperInvocation))
1596                 {
1597                     m_isHelperInvocationCalls.push_back(inst);
1598                 }
1599                 else
1600                     if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_OUTPUT))
1601                     {
1602                         // Check whether PS output omask/depth/stencil and save to
1603                         // metadata, since after discard lowering, the OUTPUT
1604                         // could become dead code and get cleaned. While we need to
1605                         // know it when creating null surface write.
1606                         uint outputType = (uint)llvm::cast<llvm::ConstantInt>(
1607                             inst->getOperand(4))->getZExtValue();
1608                         IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
1609                             outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
1610                             outputType == SHADER_OUTPUT_TYPE_STENCIL ||
1611                             outputType == SHADER_OUTPUT_TYPE_OMASK);
1612                         switch (outputType)
1613                         {
1614                         case SHADER_OUTPUT_TYPE_DEPTHOUT:
1615                             m_modMD->psInfo.outputDepth = true;
1616                             break;
1617                         case SHADER_OUTPUT_TYPE_STENCIL:
1618                             m_modMD->psInfo.outputStencil = true;
1619                             break;
1620                         case SHADER_OUTPUT_TYPE_OMASK:
1621                             m_modMD->psInfo.outputMask = true;
1622                             break;
1623                         default:
1624                             break;
1625                         }
1626                     }
1627             }
1628         }
1629     }
1630 
1631     for (auto I : discardToDel)
1632     {
1633         I->eraseFromParent();
1634     }
1635 
1636 
1637     Function* samplePhaseEntry = nullptr;
1638     Function* pixelPhaseEntry = nullptr;
1639     NamedMDNode* pixelNode = F.getParent()->getNamedMetadata("pixel_phase");
1640     NamedMDNode* sampleNode = F.getParent()->getNamedMetadata("sample_phase");
1641     if (sampleNode)
1642     {
1643         samplePhaseEntry = mdconst::dyn_extract<Function>(
1644             sampleNode->getOperand(0)->getOperand(0));
1645     }
1646     if (pixelNode)
1647     {
1648         pixelPhaseEntry = mdconst::dyn_extract<Function>(
1649             pixelNode->getOperand(0)->getOperand(0));
1650     }
1651 
1652     bool cfgChanged = false;
1653 
1654     // For multirate PS, discard lowering runs twice: first on the sample
1655     // phase entry before the link multi rate pass, then on the pixel entry
1656     // after the link multi rate pass. This check makes sure that, before the
1657     // link multi rate pass, discards are lowered only on the sample phase entry.
1658     if (samplePhaseEntry == nullptr || pixelPhaseEntry != &F)
1659     {
1660         cfgChanged = lowerDiscards(F);
1661     }
1662     m_discards.clear();
1663 
1664 #ifdef DEBUG_DISCARD_OPT
1665     DumpLLVMIR(getAnalysis<CodeGenContextWrapper>().getCodeGenContext(), "discard");
1666 #endif
1667 
1668     return cfgChanged;
1669 }
1670 
1671 }//namespace IGC
1672