1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "common/LLVMUtils.h"
10 #include "common/IGCIRBuilder.h"
11 #include "PixelShaderLowering.hpp"
12 #include "GenISAIntrinsics/GenIntrinsics.h"
13 #include "Compiler/IGCPassSupport.h"
14 #include "Probe/Assertion.h"
15
16 using namespace llvm;
17
18 //#define DEBUG_BLEND_TO_DISCARD
19
20 namespace IGC
21 {
22
// Registers the PixelShaderAddMask pass and declares the wrapper passes it
// depends on. The PASS_* macros are scratch definitions consumed by the
// IGC_INITIALIZE_PASS_* macros below and are undefined again right after use.
// NOTE(review): PASS_DESCRIPTION describes a "lowering" pass although this
// registers the add-mask pass, and PASS_ANALYSIS is true although
// runOnFunction rewrites IR - confirm both are intended.
#define PASS_FLAG "igc-pixel-shader-addmask"
#define PASS_DESCRIPTION "Pixel shader lowering pass"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS

// Pass identification object; handed to the FunctionPass base class by the
// constructor.
char PixelShaderAddMask::ID = 0;
38
PixelShaderAddMask()39 PixelShaderAddMask::PixelShaderAddMask() :
40 FunctionPass(ID)
41 {
42 initializePixelShaderAddMaskPass(*PassRegistry::getPassRegistry());
43 }
44
runOnFunction(llvm::Function & F)45 bool PixelShaderAddMask::runOnFunction(llvm::Function& F)
46 {
47 m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
48
49 Module* mod = F.getParent();
50 bool hasDiscard;
51
52 hasDiscard = (mod->getNamedMetadata("KillPixel") != nullptr);
53 m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
54 IGCMD::MetaDataUtils* pMdUtils =
55 getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
56
57 if (!hasDiscard || pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
58 {
59 return false;
60 }
61
62 Instruction* globalMask = nullptr;
63 Instruction* updateMask = nullptr;
64
65 unsigned numUpdateMask = 0;
66
67 for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
68 {
69 for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
70 {
71 if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_InitDiscardMask))
72 {
73 globalMask = &(*II);
74 }
75 else
76 if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_UpdateDiscardMask))
77 {
78 numUpdateMask++;
79 updateMask = &(*II);
80 }
81 }
82 }
83 if (!globalMask)
84 {
85 return false;
86 }
87
88 if (F.size() == 1 && numUpdateMask == 1)
89 {
90 // handle special case function has 1 BB and 1 discard, then we
91 // can directly use the discard condition for RTWrite, no need to
92 // generate GetPixelMask.
93 Value* discardCond = updateMask->getOperand(1);
94 updateMask->eraseFromParent();
95 globalMask->eraseFromParent();
96 Value* mask = nullptr;
97
98 for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
99 {
100 RTWritIntrinsic* rtw;
101 RTDualBlendSourceIntrinsic* drt;
102
103 for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
104 {
105 if ((rtw = dyn_cast<RTWritIntrinsic>(II)))
106 {
107 IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
108 if (!mask)
109 {
110 mask = BinaryOperator::CreateNot(discardCond, "", rtw);
111 }
112
113 rtw->setPMask(mask);
114 }
115 else
116 if ((drt = dyn_cast<RTDualBlendSourceIntrinsic>(II)))
117 {
118 IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
119 if (!mask)
120 {
121 mask = BinaryOperator::CreateNot(discardCond, "", drt);
122 }
123
124 drt->setPMask(mask);
125 }
126 }
127 }
128 }
129 else
130 {
131 globalMask->moveBefore(globalMask->getParent()->getFirstNonPHI());
132
133 Function* getMaskF;
134 getMaskF = GenISAIntrinsic::getDeclaration(mod,
135 GenISAIntrinsic::GenISA_GetPixelMask);
136
137 Value* mask = nullptr;
138
139 for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
140 {
141 RTWritIntrinsic* rtw;
142 RTDualBlendSourceIntrinsic* drt;
143
144 mask = nullptr;
145 for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
146 {
147 if ((rtw = dyn_cast<RTWritIntrinsic>(II)) && globalMask)
148 {
149 if (!mask)
150 {
151 mask = CallInst::Create(getMaskF, { globalMask }, "", rtw);
152 }
153 IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
154 rtw->setPMask(mask);
155 }
156 else
157 if ((drt = dyn_cast<RTDualBlendSourceIntrinsic>(II)) && globalMask)
158 {
159 if (!mask)
160 {
161 mask = CallInst::Create(getMaskF, { globalMask }, "", drt);
162 }
163 IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
164 drt->setPMask(mask);
165 }
166 }
167 }
168 }
169
170 return false;
171 }
172
// Pass identification object; handed to the FunctionPass base class by the
// constructor.
char PixelShaderLowering::ID = 0;

// Register pass to igc-opt
// The PASS_* macros are scratch definitions consumed by the
// IGC_INITIALIZE_PASS_* macros below and are undefined again right after use.
// NOTE(review): PASS_ANALYSIS is true although runOnFunction rewrites IR -
// confirm this is intended.
#define PASS_FLAG "igc-pixel-shader-lowering"
#define PASS_DESCRIPTION "This is the pixel shader lowering pass "
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
189
190 PixelShaderLowering::PixelShaderLowering() :
191 FunctionPass(ID),
192 m_module(nullptr),
193 PDT(nullptr),
194 m_ReturnBlock(nullptr),
195 SkipSrc0Alpha(false),
196 m_dualSrcBlendEnabled(false),
197 uavPixelSync(false)
198 {
199 initializePixelShaderLoweringPass(*PassRegistry::getPassRegistry());
200 }
201
runOnFunction(llvm::Function & F)202 bool PixelShaderLowering::runOnFunction(llvm::Function& F)
203 {
204 m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
205 IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
206 if (!isEntryFunc(pMdUtils, &F))
207 {
208 return false;
209 }
210 m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
211
212 for (llvm::Function::iterator bb = F.begin(), be = F.end(); bb != be; ++bb)
213 {
214 if (llvm::isa<llvm::ReturnInst>(bb->getTerminator()))
215 {
216 m_ReturnBlock = &(*bb);
217 break;
218 }
219 }
220 if (m_ReturnBlock == nullptr)
221 {
222 F.begin()->getTerminator()->eraseFromParent();
223 ReturnInst::Create(F.getContext(), &(*F.begin()));
224 m_ReturnBlock = &(*F.begin());
225 }
226 m_outputBlock = nullptr;
227
228 m_module = F.getParent();
229 ColorOutputArray colors;
230 DebugLocArray debugLocs;
231 Value* depth = nullptr;
232 Value* mask = nullptr;
233 Value* src0Alpha = nullptr;
234 Value* stencil = nullptr;
235
236 // src0Alphas need not be sent when renderTargetBlending metadata is disabled
237 // this means alpha to coverage and alpha test is disabled
238 // this also means the render target blending is disabled
239 SkipSrc0Alpha = m_modMD->psInfo.SkipSrc0Alpha || IGC_IS_FLAG_ENABLED(ForceDisableSrc0Alpha);
240
241 // Check whether metadata indicates that dual source blending should be disabled
242 bool dualSourceBlendingDisabled =
243 IGC_IS_FLAG_ENABLED(DisableDualBlendSource) ||
244 m_modMD->psInfo.DualSourceBlendingDisabled;
245
246 m_dualSrcBlendEnabled = !dualSourceBlendingDisabled;
247
248 m_isPerSample = false;
249
250 m_hasDiscard = (m_module->getNamedMetadata("KillPixel") != nullptr);
251
252 // In case we are using intrinsic retrieve the output
253 FindIntrinsicOutput(colors, depth, stencil, mask, src0Alpha, debugLocs);
254
255 if (uavPixelSync)
256 {
257 // Emitting a fence to ensure that the uav write is completed before an EOT is issued
258 IRBuilder<> builder(F.getContext());
259
260 bool fenceFlushNone = 0;
261 EmitMemoryFence(builder, fenceFlushNone);
262 }
263
264 // EmitRender target write intrinsic
265 EmitRTWrite(colors, depth, stencil, mask, src0Alpha, debugLocs);
266
267 Function* pixelPhase = nullptr;
268 Function* coarsePhase = nullptr;
269 NamedMDNode* coarseNode = F.getParent()->getNamedMetadata(NAMED_METADATA_COARSE_PHASE);
270 NamedMDNode* pixelNode = F.getParent()->getNamedMetadata(NAMED_METADATA_PIXEL_PHASE);
271 bool cfgChanged = false;
272 if (coarseNode)
273 {
274 coarsePhase = mdconst::dyn_extract<Function>(coarseNode->getOperand(0)->getOperand(0));
275 }
276 if (pixelNode)
277 {
278 pixelPhase = mdconst::dyn_extract<Function>(pixelNode->getOperand(0)->getOperand(0));
279 }
280
281 if (&F == coarsePhase && pixelPhase != nullptr && mask != nullptr)
282 {
283 EmitCoarseMask(mask);
284 }
285 return cfgChanged;
286 }
287
FindIntrinsicOutput(ColorOutputArray & colors,Value * & depth,Value * & stencil,Value * & mask,Value * & src0Alpha,DebugLocArray & debugLocs)288 void PixelShaderLowering::FindIntrinsicOutput(
289 ColorOutputArray& colors,
290 Value*& depth,
291 Value*& stencil,
292 Value*& mask,
293 Value*& src0Alpha,
294 DebugLocArray& debugLocs)
295 {
296 constexpr uint cMaxInputs = 32;
297 constexpr uint cMaxInputComponents = cMaxInputs * 4;
298 std::bitset<cMaxInputComponents> inputComponentsUsed;
299 std::bitset<cMaxInputs> isLinearInterpolation;
300
301 llvm::Instruction* primId = nullptr;
302 llvm::Instruction* pointCoordX = nullptr;
303 llvm::Instruction* pointCoordY = nullptr;
304 SmallVector<GenIntrinsicInst*, 4> outputInstructions;
305 SmallVector<Instruction*, 4> instructionToRemove;
306 Function& F = *m_ReturnBlock->getParent();
307 Value* btrue = llvm::ConstantInt::get(Type::getInt1Ty(m_module->getContext()), true);
308
309 m_modMD->psInfo.colorOutputMask.resize(USC::NUM_PSHADER_OUTPUT_REGISTERS);
310
311 for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
312 {
313 for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
314 {
315 if (GenIntrinsicInst * inst = dyn_cast<GenIntrinsicInst>(II))
316 {
317 GenISAIntrinsic::ID IID = inst->getIntrinsicID();
318 if (IID == GenISAIntrinsic::GenISA_uavSerializeAll ||
319 IID == GenISAIntrinsic::GenISA_uavSerializeOnResID)
320 {
321 uavPixelSync = true;
322 }
323 else if (IID == GenISAIntrinsic::GenISA_OUTPUT)
324 {
325 m_outputBlock = inst->getParent();
326 outputInstructions.push_back(inst);
327 uint outputType = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(4))->getZExtValue();
328 IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
329 outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
330 outputType == SHADER_OUTPUT_TYPE_STENCIL ||
331 outputType == SHADER_OUTPUT_TYPE_OMASK);
332
333 //Need to save debug location
334 debugLocs.push_back(((Instruction*)inst)->getDebugLoc());
335
336 // delete the output
337 instructionToRemove.push_back(inst);
338 }
339 else if (IID == GenISAIntrinsic::GenISA_DCL_SystemValue)
340 {
341 SGVUsage usage = (SGVUsage)
342 llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
343 if (usage == PRIMITIVEID)
344 {
345 primId = inst;
346 }
347 else if (usage == POINT_COORD_X)
348 {
349 pointCoordX = inst;
350 }
351 else if (usage == POINT_COORD_Y)
352 {
353 pointCoordY = inst;
354 }
355 else if (usage == POSITION_X || usage == POSITION_Y)
356 {
357 LowerPositionInput(inst, usage);
358 }
359 else if (usage == SAMPLEINDEX)
360 {
361 m_isPerSample = true;
362 }
363 }
364 else if (IID == GenISAIntrinsic::GenISA_DCL_inputVec)
365 {
366 uint setupIndex =
367 (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
368
369 IGC_ASSERT_MESSAGE(setupIndex < cMaxInputComponents, "Max inputs cannot be greater than 32 x 4");
370 inputComponentsUsed.set(setupIndex);
371
372 e_interpolation mode = (e_interpolation)
373 llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
374 switch (mode)
375 {
376 case EINTERPOLATION_CONSTANT:
377 IGC_ASSERT(!isLinearInterpolation.test(setupIndex / 4));
378 break;
379 case EINTERPOLATION_LINEARSAMPLE:
380 case EINTERPOLATION_LINEARNOPERSPECTIVESAMPLE:
381 m_isPerSample = true;
382 // fall through
383 case EINTERPOLATION_LINEAR:
384 case EINTERPOLATION_LINEARCENTROID:
385 case EINTERPOLATION_LINEARNOPERSPECTIVE:
386 case EINTERPOLATION_LINEARNOPERSPECTIVECENTROID:
387 isLinearInterpolation.set(setupIndex / 4);
388 break;
389 case EINTERPOLATION_UNDEFINED:
390 case EINTERPOLATION_VERTEX:
391 default:
392 IGC_ASSERT_MESSAGE(0, "Unexpected Pixel Shader input interpolation mode.");
393 }
394 }
395 }
396 }
397 }
398 if (primId)
399 {
400 // When PrimitiveId input is present in shader IGC allocates an additional input and returns
401 // information about the PrimitiveID input to UMD (to program SBE). This new input component
402 // is created with constant interpolation and cannot be placed in a (4-dword) location that
403 // has linearly interpolated components. Alernatively code in MarkConstantInterpolation()
404 // could be modified to ignore the additional input created for PrimitveID.
405 unsigned int location;
406 for (location = 0; location < cMaxInputComponents; location++)
407 {
408 if (inputComponentsUsed.test(location) == false &&
409 isLinearInterpolation.test(location / 4) == false)
410 {
411 break;
412 }
413 }
414 Value* arguments[] =
415 {
416 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), location),
417 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), EINTERPOLATION_CONSTANT),
418 };
419 CallInst* in = GenIntrinsicInst::Create(
420 GenISAIntrinsic::getDeclaration(
421 m_module,
422 GenISAIntrinsic::GenISA_DCL_inputVec,
423 Type::getFloatTy(m_module->getContext())),
424 arguments,
425 "",
426 primId);
427 in->setDebugLoc(primId->getDebugLoc());
428 primId->replaceAllUsesWith(in);
429 NamedMDNode* primIdMD = m_module->getOrInsertNamedMetadata("PrimIdLocation");
430
431 Constant* cval = ConstantInt::get(
432 Type::getInt32Ty(m_module->getContext()), location);
433 llvm::MDNode* locationNd = llvm::MDNode::get(
434 m_module->getContext(),
435 ConstantAsMetadata::get(cval));
436 primIdMD->addOperand(locationNd);
437 }
438 if (pointCoordX || pointCoordY)
439 {
440 // Although PointCoords needs only 2 DWORDs, IGC must allocate 4 additional input and returns
441 // information about the PointCoord input to UMD (to program SBE). These new input components
442 // are created with linear interpolation and must be placed in an empty attribute index (4 DWORDs).
443 unsigned int location;
444 for (location = 0; location < cMaxInputComponents; location += 4)
445 {
446 bool isAttributeIndexEmpty =
447 inputComponentsUsed.test(location) == false &&
448 inputComponentsUsed.test(location + 1) == false &&
449 inputComponentsUsed.test(location + 2) == false &&
450 inputComponentsUsed.test(location + 3) == false;
451 if (isAttributeIndexEmpty)
452 {
453 isLinearInterpolation.set(location / 4);
454 break;
455 }
456 }
457 IGC_ASSERT(location < cMaxInputComponents);
458
459 llvm::Instruction* inputPointCoords[] = { pointCoordX, pointCoordY };
460 for (unsigned int i = 0; i < sizeof(inputPointCoords) / sizeof(inputPointCoords[0]); i++)
461 {
462 if (inputPointCoords[i] == nullptr)
463 {
464 continue;
465 }
466 Value* arguments[] =
467 {
468 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), location + i),
469 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), EINTERPOLATION_LINEAR),
470 };
471 CallInst* in = GenIntrinsicInst::Create(
472 GenISAIntrinsic::getDeclaration(
473 m_module,
474 GenISAIntrinsic::GenISA_DCL_inputVec,
475 Type::getFloatTy(m_module->getContext())),
476 arguments,
477 "",
478 inputPointCoords[i]);
479 in->setDebugLoc(inputPointCoords[i]->getDebugLoc());
480 inputPointCoords[i]->replaceAllUsesWith(in);
481 instructionToRemove.push_back(inputPointCoords[i]);
482 }
483
484 NamedMDNode* PointCoordMD = m_module->getOrInsertNamedMetadata("PointCoordLocation");
485 Constant* cval = ConstantInt::get(
486 Type::getInt32Ty(m_module->getContext()), location);
487 llvm::MDNode* locationNd = llvm::MDNode::get(
488 m_module->getContext(),
489 ConstantAsMetadata::get(cval));
490 PointCoordMD->addOperand(locationNd);
491
492 }
493 for (GenIntrinsicInst* pInst : outputInstructions)
494 {
495 uint outputType = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(4))->getZExtValue();
496 if (outputType == SHADER_OUTPUT_TYPE_DEFAULT)
497 {
498 uint RTIndex = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(5))->getZExtValue();
499
500 unsigned mask = 0;
501 // if any of the color channel is undef, initialize it
502 // to 0 for color compression perf.
503 for (int i = 0; i < 4; i++)
504 {
505 if (isa<UndefValue>(pInst->getOperand(i)))
506 {
507 if (i == 3 &&
508 IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
509 {
510 // if it's alpha, then set default value to
511 // color.r, see IGC-959.
512 pInst->setOperand(i, pInst->getOperand(0));
513 }
514 else
515 {
516 pInst->setOperand(i,
517 ConstantFP::get(pInst->getOperand(i)->getType(), 0.0f));
518 }
519 }
520 else
521 {
522 mask |= 1 << i;
523 }
524 }
525 if (RTIndex == 0)
526 {
527 src0Alpha = pInst->getOperand(3);
528 }
529 m_modMD->psInfo.colorOutputMask[RTIndex] = mask;
530 ColorOutput data;
531 data.RTindex = RTIndex;
532 data.color[0] = pInst->getOperand(0);
533 data.color[1] = pInst->getOperand(1);
534 data.color[2] = pInst->getOperand(2);
535 data.color[3] = pInst->getOperand(3);
536 data.mask = btrue;
537 data.blendStateIndex = nullptr;
538 data.bb = pInst->getParent();
539 colors.push_back(data);
540 }
541 else if (outputType == SHADER_OUTPUT_TYPE_DEPTHOUT)
542 {
543 depth = pInst->getOperand(0);
544 }
545 else if (outputType == SHADER_OUTPUT_TYPE_STENCIL)
546 {
547 stencil = pInst->getOperand(0);
548 }
549 else if (outputType == SHADER_OUTPUT_TYPE_OMASK)
550 {
551 mask = pInst->getOperand(0);
552 }
553 }
554 for (unsigned int i = 0; i < instructionToRemove.size(); i++)
555 {
556 instructionToRemove[i]->eraseFromParent();
557 }
558 }
559
EmitMemoryFence(IRBuilder<> & builder,bool forceFlushNone)560 void PixelShaderLowering::EmitMemoryFence(IRBuilder<>& builder, bool forceFlushNone)
561 {
562 Value* trueValue = builder.getInt1(true);
563 Value* falseValue = builder.getInt1(false);
564
565 Value* arguments[] =
566 {
567 trueValue,
568 falseValue,
569 falseValue,
570 falseValue,
571 falseValue,
572 trueValue,
573 falseValue,
574 };
575
576 CallInst* memFence = GenIntrinsicInst::Create(GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_memoryfence),
577 arguments,
578 "",
579 m_ReturnBlock->getTerminator());
580 }
581
addRTWrite(BasicBlock * bbToAdd,Value * src0Alpha,Value * oMask,ColorOutput & color,Value * depth,Value * stencil)582 CallInst* PixelShaderLowering::addRTWrite(
583 BasicBlock* bbToAdd, Value* src0Alpha,
584 Value* oMask, ColorOutput& color,
585 Value* depth, Value* stencil)
586 {
587 bool isHF = false;
588 Value* undefSrc0Alpha = nullptr;
589 Value* r = color.color[0];
590 Value* g = color.color[1];
591 Value* b = color.color[2];
592 Value* a = color.color[3];
593
594 //True if src0Alpha exists and renderTargetBlendingDisabled is false
595 bool needsSrc0Alpha = ((src0Alpha && color.RTindex > 0) && (!SkipSrc0Alpha) && src0Alpha != color.color[3]);
596 bool src0AlphaIsHF = (needsSrc0Alpha && isa<FPExtInst>(src0Alpha)) || !needsSrc0Alpha;
597
598 if (m_cgCtx->platform.supportFP16() &&
599 (llvm::isa<llvm::FPExtInst>(r) &&
600 llvm::isa<llvm::FPExtInst>(g) &&
601 llvm::isa<llvm::FPExtInst>(b) &&
602 llvm::isa<llvm::FPExtInst>(a)) &&
603 src0AlphaIsHF &&
604 !SkipSrc0Alpha)
605 {
606
607 FPExtInst* rInst = llvm::cast<llvm::FPExtInst>(r);
608 FPExtInst* gInst = llvm::cast<llvm::FPExtInst>(g);
609 FPExtInst* bInst = llvm::cast<llvm::FPExtInst>(b);
610 FPExtInst* aInst = llvm::cast<llvm::FPExtInst>(a);
611 FPExtInst* src0AlphaInst = nullptr;
612
613 if (needsSrc0Alpha &&
614 llvm::isa<llvm::FPExtInst>(src0Alpha))
615 src0AlphaInst = llvm::cast<llvm::FPExtInst>(src0Alpha);
616
617 r = rInst->getOperand(0);
618
619 g = gInst->getOperand(0);
620
621 b = bInst->getOperand(0);
622
623 a = aInst->getOperand(0);
624
625 if (src0AlphaInst)
626 {
627 src0Alpha = src0AlphaInst->getOperand(0);
628 }
629 isHF = true;
630 }
631
632 if (r->getType()->isHalfTy())
633 {
634 isHF = true;
635 }
636
637 /*
638 In case src0Alpha comes from a HF RT Write
639 */
640 IRBuilder<> builder(bbToAdd->getTerminator());
641 if (!isHF &&
642 needsSrc0Alpha &&
643 src0Alpha->getType()->isHalfTy())
644 {
645 if (llvm::isa<llvm::FPTruncInst>(src0Alpha))
646 {
647 src0Alpha = (llvm::cast<llvm::FPTruncInst>(src0Alpha))->getOperand(0);
648 }
649 else
650 {
651 src0Alpha = builder.CreateFPExt(src0Alpha, builder.getFloatTy());
652 }
653 }
654 else if (isHF &&
655 needsSrc0Alpha &&
656 src0Alpha->getType()->isFloatTy())
657 {
658 /*
659 reverse, src0Alpha comes from half float in to float RT Write
660 */
661 if (llvm::isa<llvm::FPExtInst>(src0Alpha))
662 {
663 src0Alpha = (llvm::cast<llvm::FPExtInst>(src0Alpha))->getOperand(0);
664 }
665 else
666 {
667 src0Alpha = builder.CreateFPTrunc(src0Alpha, llvm::Type::getHalfTy(m_module->getContext()));
668 }
669 }
670
671 if (isHF)
672 undefSrc0Alpha = llvm::UndefValue::get(Type::getHalfTy(m_module->getContext()));
673 else
674 undefSrc0Alpha = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
675
676 Type* i32t = Type::getInt32Ty(m_module->getContext());
677 Type* i1t = Type::getInt1Ty(m_module->getContext());
678 Value* undef = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
679 Value* iundef = llvm::UndefValue::get(i32t);
680 Value* i1true = ConstantInt::get(i1t, 1);
681 Value* i1false = ConstantInt::get(i1t, 0);
682 Value* vrtIdx = ConstantInt::get(i32t, color.RTindex);
683 Value* vblendIdx = color.blendStateIndex ? color.blendStateIndex : vrtIdx;
684 Value* hasOmask = (oMask || m_modMD->psInfo.outputMask) ? i1true : i1false;
685 Value* hasDepth = (depth || m_modMD->psInfo.outputDepth) ? i1true : i1false;
686 Value* hasStencil = (stencil || m_modMD->psInfo.outputStencil) ? i1true : i1false;
687
688 Value* arguments[] = {
689 needsSrc0Alpha ? src0Alpha : undefSrc0Alpha, // 0
690 oMask ? oMask : undef, // 1 - oMask
691 color.mask, // 2 - pMask
692 r, g, b, a, // 3,4,5,6
693 depth ? depth : undef, // 7
694 stencil ? stencil : undef, // 8
695 vrtIdx, // 9 - RT index
696 vblendIdx, // 10 - blend state index
697 hasOmask, // 11
698 hasDepth, // 12
699 hasStencil, // 13
700 i1false, // 14 - per sample
701 iundef // 15 - sample idx
702 };
703
704 Function* frtw;
705
706 if (isHF)
707 {
708 frtw = GenISAIntrinsic::getDeclaration(m_module,
709 GenISAIntrinsic::GenISA_RTWrite,
710 Type::getHalfTy(this->m_module->getContext()));
711 }
712 else
713 {
714 frtw = GenISAIntrinsic::getDeclaration(m_module,
715 GenISAIntrinsic::GenISA_RTWrite,
716 Type::getFloatTy(this->m_module->getContext()));
717 }
718
719 return GenIntrinsicInst::Create(frtw, arguments, "",
720 bbToAdd->getTerminator());
721 }
722
#ifdef DEBUG_BLEND_TO_DISCARD
// debug function
// Prints the blend optimization mode of the first ncolors entries of blendOpt
// (clamped to the size of blendOpt), prefixed by the shader hash. Nothing is
// printed when every inspected entry is BLEND_OPTIMIZATION_NONE.
static void dbgPrintBlendOptMode(uint64_t hash,
    std::vector<int>& blendOpt, unsigned ncolors)
{
    static const char* const blendOptName[] =
    {
        "BLEND_OPTIMIZATION_NONE",
        "BLEND_OPTIMIZATION_SRC_ALPHA",
        "BLEND_OPTIMIZATION_INV_SRC_ALPHA",
        "BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY",
        "BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO",
        "BLEND_OPTIMIZATION_SRC_COLOR_ONE",
        "BLEND_OPTIMIZATION_SRC_BOTH_ZERO",
        "BLEND_OPTIMIZATION_SRC_BOTH_ONE",
        "BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE"
    };
    const unsigned numNames = sizeof(blendOptName) / sizeof(blendOptName[0]);

    // The caller passes the number of color outputs, which is not tied to the
    // number of metadata entries: never read past the end of blendOpt.
    const unsigned count = (ncolors < blendOpt.size()) ?
        ncolors : static_cast<unsigned>(blendOpt.size());

    bool doprint = false;
    for (unsigned i = 0; i < count; i++)
    {
        if (blendOpt[i] != USC::BLEND_OPTIMIZATION_NONE)
        {
            doprint = true;
            break;
        }
    }
    if (!doprint)
    {
        return;
    }

    // uint64_t is not necessarily unsigned long long: cast to match %llx.
    printf("%016llx blend opt[%u]:\n", static_cast<unsigned long long>(hash), ncolors);
    for (unsigned i = 0; i < count; i++)
    {
        const int mode = blendOpt[i];
        const bool known = (mode >= 0) && (static_cast<unsigned>(mode) < numNames);
        printf(" %s\n", known ? blendOptName[mode] : "<unknown blend optimization mode>");
    }
}
#endif
759
EmitRTWrite(ColorOutputArray & colors,Value * depth,Value * stencil,Value * oMask,Value * src0Alpha,DebugLocArray & debugLocs)760 void PixelShaderLowering::EmitRTWrite(
761 ColorOutputArray& colors, Value* depth, Value* stencil,
762 Value* oMask, Value* src0Alpha, DebugLocArray& debugLocs)
763 {
764 if (!m_hasDiscard)
765 {
766 // no discard found
767 //IGC_ASSERT(m_module->getNamedMetadata("KillPixel") == nullptr);
768
769 // check blend to discard optimization and generate mask for each
770 // render target output
771 std::vector<int>& blendOpt = m_modMD->psInfo.blendOptimizationMode;
772 #ifdef DEBUG_BLEND_TO_DISCARD
773 dbgPrintBlendOptMode(m_cgCtx->hash.getAsmHash(), blendOpt, colors.size());
774 #endif
775
776 if (blendOpt.size() && !useDualSrcBlend(colors))
777 {
778 bool hasDiscard = false;
779
780 unsigned maxRTIndex = 0;
781 for (unsigned i = 0; i < colors.size(); i++)
782 {
783 if (maxRTIndex < colors[i].RTindex)
784 {
785 maxRTIndex = colors[i].RTindex;
786 }
787 }
788
789 for (unsigned i = 0; i < colors.size(); i++)
790 {
791 USC::BLEND_OPTIMIZATION_MODE blendOptMode =
792 static_cast<USC::BLEND_OPTIMIZATION_MODE>(blendOpt[i]);
793
794 // Only do blend to fill if the shader is persample, hardware
795 // already does blend to fill for other cases.
796 bool enableBlendToFill =
797 m_cgCtx->m_DriverInfo.SupportBlendToFillOpt() &&
798 maxRTIndex <= 4 && m_isPerSample;
799
800 if (optBlendState(blendOptMode, colors[i], enableBlendToFill))
801 {
802 // for blend to discard opt, we need to force earlyz
803 hasDiscard = true;
804 m_modMD->psInfo.forceEarlyZ = true;
805 }
806 }
807
808 if (hasDiscard)
809 {
810 m_module->getOrInsertNamedMetadata("KillPixel");
811 }
812 }
813 }
814
815 uint32_t RTindexVal = -1;
816 //According to Spec, the RT Write instruction must follow this order : dual source followed by single source
817 if (useDualSrcBlend(colors))
818 {
819 //If RT0 is executed first when size is 2
820 if (colors[0].RTindex == 0 && colors[1].RTindex == 1)
821 {
822 RTindexVal = 0;
823 }
824 else if (colors[0].RTindex == 1 && colors[1].RTindex == 0)
825 {
826
827 RTindexVal = 1;
828 }
829 }
830
831 if (RTindexVal != -1)
832 {
833 //dual source RTWrite first
834 colors[RTindexVal].inst = addDualBlendWrite(
835 colors[RTindexVal].bb,
836 oMask,
837 colors[RTindexVal],
838 colors[1 - RTindexVal],
839 depth, stencil, 0);
840 colors[RTindexVal].inst->setDebugLoc(debugLocs[RTindexVal]);
841
842 //Single source RTWrite
843 colors[1 - RTindexVal].inst = addRTWrite(
844 colors[1 - RTindexVal].bb,
845 src0Alpha,
846 oMask, colors[1 - RTindexVal],
847 depth,
848 stencil);
849 colors[1 - RTindexVal].inst->setDebugLoc(debugLocs[1 - RTindexVal]);
850 }
851 else
852 {
853 for (unsigned int i = 0; i < colors.size(); i++)
854 {
855 colors[i].inst = addRTWrite(
856 colors[i].bb,
857 src0Alpha,
858 oMask, colors[i],
859 depth,
860 stencil);
861
862 colors[i].inst->setDebugLoc(debugLocs[i]);
863 }
864 }
865
866 // pick up 1 RTWrite and move it to return block, so we don't need to
867 // generate an additional null surface write for EOT.
868 if (m_hasDiscard)
869 {
870 moveRTWritesToReturnBlock(colors);
871 }
872
873 checkAndCreateNullRTWrite(oMask, depth, stencil);
874 }
875
fixHFSource(IRBuilder<> & builder,Value * val)876 inline Value* fixHFSource(IRBuilder<>& builder, Value* val)
877 {
878 if (val->getType()->isFloatTy())
879 return val;
880
881 if (llvm::isa<llvm::FPTruncInst>(val))
882 {
883 return (llvm::cast<llvm::FPTruncInst>(val))->getOperand(0);
884 }
885 else
886 {
887 return builder.CreateFPExt(val, builder.getFloatTy());
888 }
889 }
890
addDualBlendWrite(BasicBlock * bbToAdd,Value * oMask,ColorOutput & color0,ColorOutput & color1,Value * depth,Value * stencil,uint index)891 CallInst* PixelShaderLowering::addDualBlendWrite(
892 BasicBlock* bbToAdd, Value* oMask,
893 ColorOutput& color0, ColorOutput& color1,
894 Value* depth, Value* stencil, uint index)
895 {
896 bool isFP16 = false;
897 bool isFP32 = false;
898 Value* pMask = color0.mask;
899 Value* r0 = color0.color[0];
900 Value* g0 = color0.color[1];
901 Value* b0 = color0.color[2];
902 Value* a0 = color0.color[3];
903 Value* r1 = color1.color[0];
904 Value* g1 = color1.color[1];
905 Value* b1 = color1.color[2];
906 Value* a1 = color1.color[3];
907
908 IGC_ASSERT(color0.mask == color1.mask);
909
910 //assuming types are consistent
911 if (r0->getType()->isHalfTy() ||
912 r1->getType()->isHalfTy())
913 {
914 isFP16 = true;
915 }
916
917 if (r0->getType()->isFloatTy() ||
918 r1->getType()->isFloatTy())
919 {
920 isFP32 = true;
921 }
922
923 /*
924 if we are combining FP32 and FP16 RT writes
925 promote everything to FP32
926 Three Cases:
927 Case 1) Immediate, extend to FP32 Immediate.
928 Case 2) FP16 Not Immediate. Not result to FPTrunc. Add FPExt Instruction
929 Case 3) FP16 Not Immediate. Result of FPTrunc. Use src of FPTrunc
930 */
931 if (isFP16 && isFP32)
932 {
933 IRBuilder<> builder(bbToAdd->getTerminator());
934 r0 = fixHFSource(builder, r0);
935 g0 = fixHFSource(builder, g0);
936 b0 = fixHFSource(builder, b0);
937 a0 = fixHFSource(builder, a0);
938 r1 = fixHFSource(builder, r1);
939 g1 = fixHFSource(builder, g1);
940 b1 = fixHFSource(builder, b1);
941 a1 = fixHFSource(builder, a1);
942 }
943
944 Type* i32t = Type::getInt32Ty(m_module->getContext());
945 Type* i1t = Type::getInt1Ty(m_module->getContext());
946 Value* undef = llvm::UndefValue::get(Type::getFloatTy(m_module->getContext()));
947 Value* iundef = llvm::UndefValue::get(i32t);
948 Value* i1true = ConstantInt::get(i1t, 1);
949 Value* i1false = ConstantInt::get(i1t, 0);
950
951 Value* arguments[] = {
952 oMask ? oMask : undef, // 0 - oMask
953 pMask, // 1 - pMask
954 r0, g0, b0, a0, // 2, 3, 4, 5
955 r1, g1, b1, a1, // 6, 7, 8, 9
956 depth ? depth : undef, // 10
957 stencil ? stencil : undef, // 11
958 ConstantInt::get(i32t, index), // 12 - RT index
959 oMask ? i1true : i1false, // 13
960 depth ? i1true : i1false, // 14
961 stencil ? i1true : i1false, // 15
962 i1false, // 16 - per sample
963 iundef, // 17 - sample index
964 };
965 return GenIntrinsicInst::Create(
966 GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_RTDualBlendSource, r0->getType()),
967 arguments,
968 "",
969 bbToAdd->getTerminator());
970 }
971
EmitCoarseMask(llvm::Value * mask)972 void PixelShaderLowering::EmitCoarseMask(llvm::Value* mask)
973 {
974 Type* floatTy = Type::getFloatTy(m_module->getContext());
975 Value* undef = llvm::UndefValue::get(floatTy);
976 Value* oMaskType =
977 ConstantInt::get(Type::getInt32Ty(m_module->getContext()), SHADER_OUTPUT_TYPE_OMASK);
978 Value* zero = ConstantInt::get(Type::getInt32Ty(m_module->getContext()), 0);
979 Value* arguments[] =
980 {
981 mask,
982 undef,
983 undef,
984 undef,
985 oMaskType,
986 zero,
987 };
988
989 GenIntrinsicInst::Create(
990 GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_OUTPUT, floatTy),
991 arguments,
992 "",
993 m_ReturnBlock->getTerminator());
994 }
995
LowerPositionInput(GenIntrinsicInst * positionInstr,uint usage)996 void PixelShaderLowering::LowerPositionInput(GenIntrinsicInst* positionInstr, uint usage)
997 {
998 IRBuilder<> builder(positionInstr);
999 Function* positionIntr = GenISAIntrinsic::getDeclaration(m_module,
1000 usage == POSITION_X ? GenISAIntrinsic::GenISA_PixelPositionX : GenISAIntrinsic::GenISA_PixelPositionY);
1001 Value* intPosition = builder.CreateCall(positionIntr);
1002 Value* floatPosition = positionInstr;
1003 if (floatPosition->hasOneUse())
1004 {
1005 if (BinaryOperator * fadd = dyn_cast<BinaryOperator>(*floatPosition->user_begin()))
1006 {
1007 if (ConstantFP * cst = dyn_cast<ConstantFP>(fadd->getOperand(1)))
1008 {
1009 float constant = cst->getValueAPF().convertToFloat();
1010 if (constant >= 0.0f && constant < 1.f)
1011 {
1012 floatPosition = fadd;
1013 }
1014 }
1015 }
1016 }
1017 if (floatPosition->hasOneUse())
1018 {
1019 Value* v = *floatPosition->user_begin();
1020 if (v->getType()->isIntegerTy(32) && (isa<FPToUIInst>(v) || isa<FPToSIInst>(v)))
1021 {
1022 for (auto UI = v->user_begin(), UE = v->user_end(); UI != UE;)
1023 {
1024 Value* use = *UI++;
1025 if (TruncInst * truncI = dyn_cast<TruncInst>(use))
1026 {
1027 truncI->replaceAllUsesWith(builder.CreateZExtOrTrunc(intPosition, truncI->getType()));
1028 }
1029 }
1030 if (!v->user_empty())
1031 {
1032 v->replaceAllUsesWith(builder.CreateZExt(intPosition, v->getType()));
1033 }
1034 return;
1035 }
1036 }
1037 positionInstr->replaceAllUsesWith(builder.CreateUIToFP(intPosition, positionInstr->getType()));
1038 }
1039
1040 // Based on blend state, check color output and discard them if possible.
optBlendState(USC::BLEND_OPTIMIZATION_MODE blendOpt,ColorOutput & colorOut,bool enableBlendToFill)1041 bool PixelShaderLowering::optBlendState(
1042 USC::BLEND_OPTIMIZATION_MODE blendOpt,
1043 ColorOutput& colorOut,
1044 bool enableBlendToFill)
1045 {
1046 Function* fBallot = GenISAIntrinsic::getDeclaration(m_module,
1047 GenISAIntrinsic::GenISA_WaveBallot);
1048
1049 bool enableBlendToDiscard =
1050 IGC_IS_FLAG_ENABLED(EnableBlendToDiscard) &&
1051 m_cgCtx->platform.enableBlendToDiscardAndFill();
1052 enableBlendToFill = enableBlendToFill &&
1053 IGC_IS_FLAG_ENABLED(EnableBlendToFill) &&
1054 m_cgCtx->platform.enableBlendToDiscardAndFill();
1055
1056 bool hasDiscard = false;
1057
1058 if (m_modMD->psInfo.outputDepth || m_modMD->psInfo.outputStencil)
1059 {
1060 enableBlendToDiscard = false;
1061 }
1062
1063 IGCIRBuilder<> irb(m_ReturnBlock->getTerminator());
1064
1065 switch (blendOpt)
1066 {
1067
1068 case USC::BLEND_OPTIMIZATION_SRC_ALPHA:
1069 {
1070 // discard: src.a == 0, fill: src.a == 1
1071
1072 if (enableBlendToDiscard)
1073 {
1074 Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
1075 Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
1076 colorOut.mask = ane0;
1077 hasDiscard = true;
1078 }
1079
1080 if (enableBlendToFill)
1081 {
1082 // ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
1083 Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
1084 Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1085 Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
1086 Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
1087 colorOut.blendStateIndex = irb.CreateSelect(any,
1088 irb.getInt32(colorOut.RTindex),
1089 irb.getInt32(colorOut.RTindex + 4));
1090 m_modMD->psInfo.blendToFillEnabled = true;
1091 }
1092 return hasDiscard;
1093 }
1094
1095 case USC::BLEND_OPTIMIZATION_INV_SRC_ALPHA:
1096 {
1097 // discard: src.a == 1, fill: src.a == 0
1098 Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
1099
1100 if (enableBlendToDiscard)
1101 {
1102 Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1103 colorOut.mask = ane1;
1104 hasDiscard = true;
1105 }
1106
1107 if (enableBlendToFill)
1108 {
1109 // ifall(src.a == 0) ? RTIndex + 4 : RTIndex
1110 // ifany(src.a != 0) ? RTIndex : RTIndex + 4
1111 Value* ai = irb.CreateBitCast(colorOut.color[3], irb.getInt32Ty());
1112 Value* ane0 = irb.CreateICmpNE(ai, irb.getInt32(0));
1113 Value* ane0_ballot = irb.CreateCall(fBallot, { ane0 });
1114 Value* any = irb.CreateICmpNE(ane0_ballot, irb.getInt32(0));
1115 colorOut.blendStateIndex = irb.CreateSelect(any,
1116 irb.getInt32(colorOut.RTindex),
1117 irb.getInt32(colorOut.RTindex + 4));
1118 m_modMD->psInfo.blendToFillEnabled = true;
1119 }
1120 return hasDiscard;
1121 }
1122
1123 case USC::BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY:
1124 {
1125 // discard: src.a == 0
1126 if (enableBlendToDiscard)
1127 {
1128 Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
1129 Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
1130 colorOut.mask = ane0;
1131 hasDiscard = true;
1132 }
1133 return hasDiscard;
1134 }
1135
1136 case USC::BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY:
1137 {
1138 // fill: src.a == 1
1139 if (enableBlendToFill)
1140 {
1141 // ifall(src.a == 1.0) ? RTIndex + 4 : RTIndex
1142 // ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
1143 Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
1144 Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1145 Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
1146 Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
1147 colorOut.blendStateIndex = irb.CreateSelect(any,
1148 irb.getInt32(colorOut.RTindex),
1149 irb.getInt32(colorOut.RTindex + 4));
1150 m_modMD->psInfo.blendToFillEnabled = true;
1151 }
1152 return false;
1153 }
1154
1155 case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO:
1156 {
1157 // discard: src.rgb == 0
1158 if (enableBlendToDiscard)
1159 {
1160 colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1161 hasDiscard = true;
1162 }
1163 return hasDiscard;
1164 }
1165
1166 case USC::BLEND_OPTIMIZATION_SRC_COLOR_ONE:
1167 {
1168 // discard if src.rgb == 1
1169 if (enableBlendToDiscard)
1170 {
1171 ConstantFP* f1 = cast<ConstantFP>(
1172 ConstantFP::get(colorOut.color[0]->getType(), 1.0));
1173
1174 Value* rne1 = fcmpUNEConst(irb, colorOut.color[0], f1);
1175 Value* gne1 = fcmpUNEConst(irb, colorOut.color[1], f1);
1176 Value* bne1 = fcmpUNEConst(irb, colorOut.color[2], f1);
1177
1178 colorOut.mask = createOr(irb, bne1, createOr(irb, rne1, gne1));
1179 hasDiscard = true;
1180 }
1181 return hasDiscard;
1182 }
1183
1184 case USC::BLEND_OPTIMIZATION_SRC_BOTH_ZERO:
1185 {
1186 // discard: src.rgba == 0
1187 if (enableBlendToDiscard)
1188 {
1189 colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 4);
1190
1191 hasDiscard = true;
1192 }
1193 return hasDiscard;
1194 }
1195
1196 case USC::BLEND_OPTIMIZATION_SRC_BOTH_ONE:
1197 {
1198 // discard if src.rgba == 1
1199 if (enableBlendToDiscard)
1200 {
1201 Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
1202
1203 Value* rne1 = irb.CreateFCmpUNE(colorOut.color[0], f1);
1204 Value* gne1 = irb.CreateFCmpUNE(colorOut.color[1], f1);
1205 Value* bne1 = irb.CreateFCmpUNE(colorOut.color[2], f1);
1206 Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
1207 colorOut.mask = irb.CreateOr(ane1, irb.CreateOr(bne1, irb.CreateOr(rne1, gne1)));
1208 hasDiscard = true;
1209 }
1210 return hasDiscard;
1211 }
1212
1213 case USC::BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO:
1214 {
1215 // discard: src.a == 0 || src.rgb == 0
1216 if (enableBlendToDiscard)
1217 {
1218 Value* a = colorOut.color[3];
1219 Constant* f0 = ConstantFP::get(a->getType(), 0.0);
1220
1221 Value* ane0 = irb.CreateFCmpUNE(a, f0);
1222
1223 Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1224
1225 colorOut.mask = irb.CreateAnd(ane0, cne0);
1226 hasDiscard = true;
1227 }
1228 return hasDiscard;
1229 }
1230
1231 case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE:
1232 {
1233 // discard: src.rgb == 0 && src.a == 1
1234 // equivalently mask = (r|g|b != 0) || (a != 1)
1235 if (enableBlendToDiscard)
1236 {
1237 Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1238
1239 Value* a = colorOut.color[3];
1240 Constant* f1 = ConstantFP::get(a->getType(), 1.0);
1241 Value* ane1 = irb.CreateFCmpUNE(a, f1);
1242
1243 colorOut.mask = irb.CreateOr(cne0, ane1);
1244 hasDiscard = true;
1245 }
1246
1247 return hasDiscard;
1248 }
1249
1250 case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE:
1251 {
1252 // Discard: src.rgb == 0 and don't compute src.a
1253 // equivalently mask = (r|g|b != 0)
1254 if (enableBlendToDiscard)
1255 {
1256 colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
1257 hasDiscard = true;
1258 }
1259
1260 // set output alpha as output.r, see IGC-959
1261 if (IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
1262 {
1263 colorOut.color[3] = colorOut.color[0];
1264 }
1265 else
1266 {
1267 colorOut.color[3] = ConstantFP::get(
1268 colorOut.color[3]->getType(), 0.0);
1269 }
1270
1271 return hasDiscard;
1272 }
1273
1274 default:
1275 return false;
1276 }
1277 }
1278
moveRTWriteToBlock(CallInst * call,SmallVector<BasicBlock *,8> & predBB,BasicBlock * toBB,llvm::DenseMap<llvm::Value *,llvm::PHINode * > & valueToPhiMap)1279 void PixelShaderLowering::moveRTWriteToBlock(
1280 CallInst* call, SmallVector<BasicBlock*, 8> & predBB, BasicBlock* toBB,
1281 llvm::DenseMap<llvm::Value*, llvm::PHINode*>& valueToPhiMap)
1282 {
1283 unsigned numPredBB = predBB.size();
1284 if (numPredBB > 1)
1285 {
1286 for (unsigned i = 0; i < call->getNumArgOperands(); i++)
1287 {
1288 if (Instruction * inst = dyn_cast<Instruction>(call->getArgOperand(i)))
1289 {
1290 auto it = valueToPhiMap.find(inst);
1291 if (it != valueToPhiMap.end())
1292 {
1293 call->setArgOperand(i, it->second);
1294 continue;
1295 }
1296
1297 PHINode* phi = PHINode::Create(
1298 inst->getType(), numPredBB, "", &(*toBB->begin()));
1299 valueToPhiMap[inst] = phi;
1300 for (unsigned j = 0; j < numPredBB; j++)
1301 {
1302 Value* inVal;
1303 if (predBB[j] == call->getParent())
1304 {
1305 inVal = inst;
1306 }
1307 else
1308 {
1309 inVal = UndefValue::get(inst->getType());
1310 }
1311 phi->addIncoming(inVal, predBB[j]);
1312 }
1313 call->setArgOperand(i, phi);
1314 }
1315 }
1316 }
1317
1318 call->removeFromParent();
1319 call->insertBefore(toBB->getTerminator());
1320 }
1321
moveRTWritesToReturnBlock(const ColorOutputArray & colors)1322 void PixelShaderLowering::moveRTWritesToReturnBlock(
1323 const ColorOutputArray& colors)
1324 {
1325 if (colors.size())
1326 {
1327 IGC_ASSERT(colors[0].inst != nullptr);
1328 SmallVector<BasicBlock*, 8> predBB;
1329 DenseMap<Value*, PHINode*> valueToPhiMap;
1330 for (auto PI = pred_begin(m_ReturnBlock), PE = pred_end(m_ReturnBlock);
1331 PI != PE; ++PI)
1332 {
1333 predBB.push_back(*PI);
1334 }
1335
1336 if (useDualSrcBlend(colors))
1337 {
1338 // For SIMD16 PS thread with two output colors must send
1339 // messages in the following sequence for each RT: SIMD8 dual
1340 // source RTW message (low); SIMD8 dual source RTW message
1341 // (high); SIMD16 single src RTW message with second color.
1342 CallInst* const dualSourceRTW =
1343 isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[0].inst : colors[1].inst;
1344 CallInst* const singleSourceRTW =
1345 isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[1].inst : colors[0].inst;
1346
1347 IGC_ASSERT(isa<RTWritIntrinsic>(singleSourceRTW));
1348 IGC_ASSERT(isa<RTDualBlendSourceIntrinsic>(dualSourceRTW));
1349
1350 moveRTWriteToBlock(dualSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
1351 moveRTWriteToBlock(singleSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
1352 }
1353 else
1354 {
1355 moveRTWriteToBlock(colors[0].inst, predBB, m_ReturnBlock, valueToPhiMap);
1356 }
1357 }
1358 }
1359
createPhiForRTWrite(Value * val,smallvector<BasicBlock *,8> & predBB,BasicBlock * toBB)1360 PHINode* PixelShaderLowering::createPhiForRTWrite(Value* val,
1361 smallvector<BasicBlock*, 8> & predBB, BasicBlock* toBB)
1362 {
1363 PHINode* phi = PHINode::Create(
1364 val->getType(), predBB.size(), "", &(*toBB->begin()));
1365 for (auto* BB : predBB)
1366 {
1367 Value* inVal;
1368 if (BB == m_outputBlock)
1369 inVal = val;
1370 else
1371 inVal = UndefValue::get(val->getType());
1372 phi->addIncoming(inVal, BB);
1373 }
1374 return phi;
1375 }
1376
1377 // create a null surface write in return block if there's no one
checkAndCreateNullRTWrite(Value * oMask,Value * depth,Value * stencil)1378 void PixelShaderLowering::checkAndCreateNullRTWrite(
1379 Value* oMask, Value* depth, Value* stencil)
1380 {
1381 bool hasRTW = false;
1382 for (auto& I : *m_ReturnBlock)
1383 {
1384 if (isa<RTWritIntrinsic>(&I) ||
1385 isa<RTDualBlendSourceIntrinsic>(&I))
1386 {
1387 hasRTW = true;
1388 break;
1389 }
1390 }
1391
1392 if (!hasRTW)
1393 {
1394 Value* undef = UndefValue::get(Type::getFloatTy(m_module->getContext()));
1395 ColorOutput color;
1396 color.color[0] = color.color[1] = color.color[2] = color.color[3] = undef;
1397 color.mask = ConstantInt::get(Type::getInt1Ty(m_module->getContext()), true);
1398 color.RTindex = -1;
1399 color.blendStateIndex = nullptr;
1400
1401 if (m_outputBlock != m_ReturnBlock)
1402 {
1403 smallvector<BasicBlock*, 8> predBB;
1404
1405 for (auto PI = pred_begin(m_ReturnBlock), PE = pred_end(m_ReturnBlock);
1406 PI != PE; ++PI)
1407 {
1408 predBB.push_back(*PI);
1409 }
1410 if (predBB.size() > 1)
1411 {
1412 if (oMask)
1413 {
1414 oMask = createPhiForRTWrite(oMask, predBB, m_ReturnBlock);
1415 }
1416 if (depth)
1417 {
1418 depth = createPhiForRTWrite(depth, predBB, m_ReturnBlock);
1419 }
1420 if (stencil)
1421 {
1422 stencil = createPhiForRTWrite(stencil, predBB, m_ReturnBlock);
1423 }
1424 }
1425 }
1426 addRTWrite(
1427 m_ReturnBlock,
1428 undef,
1429 oMask, color,
1430 depth, stencil);
1431 }
1432 }
1433
1434 ///////////////////////////////////////////////////////////////////////
1435 // Lower discard intrinsics
1436 ///////////////////////////////////////////////////////////////////////
1437
// Registration of the DiscardLowering pass. The PASS_* macros are only
// arguments for the IGC_INITIALIZE_PASS_* invocations below and are #undef'd
// immediately afterwards so they do not leak into the rest of the file.
#define PASS_FLAG "igc-lower-discard"
#define PASS_DESCRIPTION "Lower discard intrinsics"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// Wrappers queried through getAnalysis<>() in DiscardLowering::runOnFunction.
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS

// Pass identifier; handed to the FunctionPass base class by the constructor.
char DiscardLowering::ID = 0;
1452
DiscardLowering()1453 DiscardLowering::DiscardLowering()
1454 : FunctionPass(ID)
1455 {
1456 initializeDiscardLoweringPass(*PassRegistry::getPassRegistry());
1457 }
1458
lowerDiscards(Function & F)1459 bool DiscardLowering::lowerDiscards(Function& F)
1460 {
1461 if (m_discards.empty() && m_isHelperInvocationCalls.empty())
1462 {
1463 return false;
1464 }
1465
1466 m_earlyRet = BasicBlock::Create(m_module->getContext(), "DiscardRet", &F);
1467
1468 // add OUTPUT_PIXELMASK call to track discard conditions
1469 IRBuilder<> irb(m_earlyRet);
1470
1471 irb.CreateRetVoid();
1472
1473 if (m_retBB)
1474 {
1475 m_retBB->getTerminator()->eraseFromParent();
1476 BranchInst::Create(m_earlyRet, m_retBB);
1477 }
1478 m_retBB = m_earlyRet;
1479
1480 Function* fInitMask = GenISAIntrinsic::getDeclaration(m_module,
1481 GenISAIntrinsic::GenISA_InitDiscardMask);
1482 Function* fSetMask = GenISAIntrinsic::getDeclaration(m_module,
1483 GenISAIntrinsic::GenISA_UpdateDiscardMask);
1484
1485 Value* discardMask = CallInst::Create(fInitMask, llvm::None, "",
1486 m_entryBB->getFirstNonPHI());
1487
1488 bool killsPixels = false;
1489
1490 for (auto discard : m_discards)
1491 {
1492 IGC_ASSERT(discard->isGenIntrinsic(GenISAIntrinsic::GenISA_discard));
1493 killsPixels = true;
1494
1495 BasicBlock* bbDiscard;
1496 BasicBlock* bbAfter;
1497
1498 bbDiscard = discard->getParent();
1499
1500 BasicBlock::iterator bi = discard->getIterator();
1501 ++bi;
1502 bbAfter = bbDiscard->splitBasicBlock(
1503 bi, "postDiscard");
1504
1505 // erase the branch inserted by splitBasicBLock
1506 bbDiscard->getTerminator()->eraseFromParent();
1507
1508 // create conditional branch to early ret
1509 IRBuilder<> B(discard);
1510
1511 // call discard(%dcond)
1512 // -->
1513 // UpdatePixelMask(%globalMask, %dcond) ; update discard pixel mask in dmask
1514 // %all = WaveBallot(%dcond)
1515 // %1 = icmp eq i32 %all, -1 ; if.all %dcond returnBB
1516 // br %1, returnBB, postDiscardBB
1517
1518 Value* discardCond = discard->getOperand(0);
1519 Value* v = B.CreateCall(fSetMask, { discardMask, discardCond });
1520
1521 B.CreateCondBr(v, m_earlyRet, bbAfter);
1522 }
1523
1524 if (killsPixels)
1525 {
1526 m_module->getOrInsertNamedMetadata("KillPixel");
1527 }
1528
1529 for (auto inst : m_isHelperInvocationCalls)
1530 {
1531 IRBuilder<> B(inst);
1532 Function* getPixelMask = GenISAIntrinsic::getDeclaration(m_module,
1533 GenISAIntrinsic::GenISA_GetPixelMask);
1534 llvm::Value* pixelMask = B.CreateCall(getPixelMask, { discardMask });
1535 inst->replaceAllUsesWith(B.CreateNot(pixelMask));
1536 inst->eraseFromParent();
1537 }
1538
1539 for (auto discard : m_discards)
1540 {
1541 discard->eraseFromParent();
1542 }
1543
1544
1545 return true;
1546 }
1547
runOnFunction(Function & F)1548 bool DiscardLowering::runOnFunction(Function& F)
1549 {
1550 IGCMD::MetaDataUtils* pMdUtils = nullptr;
1551 pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
1552 if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
1553 {
1554 return false;
1555 }
1556
1557 m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
1558 m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
1559
1560 m_entryBB = &F.getEntryBlock();
1561 m_module = F.getParent();
1562
1563 // find return block
1564 for (auto& bb : F)
1565 {
1566 if (llvm::isa<llvm::ReturnInst>(bb.getTerminator()))
1567 {
1568 m_retBB = &bb;
1569 break;
1570 }
1571 }
1572
1573 SmallVector<GenIntrinsicInst*, 4> discardToDel;
1574
1575 for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
1576 {
1577 for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
1578 {
1579 GenIntrinsicInst* inst = dyn_cast<GenIntrinsicInst>(II);
1580 if (inst)
1581 {
1582 if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_discard))
1583 {
1584 // get rid of discard(false)
1585 if (ConstantInt * cval = dyn_cast<ConstantInt>(inst->getOperand(0)))
1586 {
1587 if (cval->isZero())
1588 {
1589 discardToDel.push_back(inst);
1590 continue;
1591 }
1592 }
1593 m_discards.push_back(inst);
1594 }
1595 else if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_IsHelperInvocation))
1596 {
1597 m_isHelperInvocationCalls.push_back(inst);
1598 }
1599 else
1600 if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_OUTPUT))
1601 {
1602 // Check whether PS output omask/depth/stencil and save to
1603 // metadata, since after discard lowering, the OUTPUT
1604 // could become dead code and get cleaned. While we need to
1605 // know it when creating null surface write.
1606 uint outputType = (uint)llvm::cast<llvm::ConstantInt>(
1607 inst->getOperand(4))->getZExtValue();
1608 IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
1609 outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
1610 outputType == SHADER_OUTPUT_TYPE_STENCIL ||
1611 outputType == SHADER_OUTPUT_TYPE_OMASK);
1612 switch (outputType)
1613 {
1614 case SHADER_OUTPUT_TYPE_DEPTHOUT:
1615 m_modMD->psInfo.outputDepth = true;
1616 break;
1617 case SHADER_OUTPUT_TYPE_STENCIL:
1618 m_modMD->psInfo.outputStencil = true;
1619 break;
1620 case SHADER_OUTPUT_TYPE_OMASK:
1621 m_modMD->psInfo.outputMask = true;
1622 break;
1623 default:
1624 break;
1625 }
1626 }
1627 }
1628 }
1629 }
1630
1631 for (auto I : discardToDel)
1632 {
1633 I->eraseFromParent();
1634 }
1635
1636
1637 Function* samplePhaseEntry = nullptr;
1638 Function* pixelPhaseEntry = nullptr;
1639 NamedMDNode* pixelNode = F.getParent()->getNamedMetadata("pixel_phase");
1640 NamedMDNode* sampleNode = F.getParent()->getNamedMetadata("sample_phase");
1641 if (sampleNode)
1642 {
1643 samplePhaseEntry = mdconst::dyn_extract<Function>(
1644 sampleNode->getOperand(0)->getOperand(0));
1645 }
1646 if (pixelNode)
1647 {
1648 pixelPhaseEntry = mdconst::dyn_extract<Function>(
1649 pixelNode->getOperand(0)->getOperand(0));
1650 }
1651
1652 bool cfgChanged = false;
1653
1654 // For multirate PS, we will run discard lowering twice, first on sample
1655 // phase entry before link multi rate pass, second on pixel entry after
1656 // link multi rate pass. The check is to make sure only lower discards on
1657 // sample phase entry before link multi rate pass.
1658 if (samplePhaseEntry == nullptr || pixelPhaseEntry != &F)
1659 {
1660 cfgChanged = lowerDiscards(F);
1661 }
1662 m_discards.clear();
1663
1664 #ifdef DEBUG_DISCARD_OPT
1665 DumpLLVMIR(getAnalysis<CodeGenContextWrapper>().getCodeGenContext(), "discard");
1666 #endif
1667
1668 return cfgChanged;
1669 }
1670
1671 }//namespace IGC
1672