1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Compiler/LowPrecisionOptPass.hpp"
10 #include "Compiler/CodeGenContextWrapper.hpp"
11 #include "Compiler/CodeGenPublic.h"
12 #include "Compiler/CISACodeGen/helper.h"
13 #include "Compiler/IGCPassSupport.h"
14 
15 #include "llvmWrapper/IR/DerivedTypes.h"
16 #include "common/LLVMWarningsPush.hpp"
17 #include <llvm/Support/CommandLine.h>
18 #include <llvm/IR/Instructions.h>
19 #include <llvm/IR/InstIterator.h>
20 #include "common/LLVMWarningsPop.hpp"
21 
22 #include "GenISAIntrinsics/GenIntrinsicInst.h"
23 #include "common/IGCIRBuilder.h"
24 using namespace llvm;
25 using namespace IGC;
26 using namespace IGC::IGCMD;
27 using namespace GenISAIntrinsic;
28 
29 char LowPrecisionOpt::ID = 0;
30 
31 // Register pass to igc-opt
32 #define PASS_FLAG "igc-low-precision-opt"
33 #define PASS_DESCRIPTION "Low Precision Opt"
34 #define PASS_CFG_ONLY false
35 #define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(LowPrecisionOpt,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)36 IGC_INITIALIZE_PASS_BEGIN(LowPrecisionOpt, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
37 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
38 IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
39 IGC_INITIALIZE_PASS_END(LowPrecisionOpt, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
40 
41 LowPrecisionOpt::LowPrecisionOpt() : FunctionPass(ID)
42 {
43     initializeLowPrecisionOptPass(*PassRegistry::getPassRegistry());
44     m_func_llvm_GenISA_DCL_inputVec_f16 = nullptr;
45     m_func_llvm_GenISA_DCL_inputVec_f32 = nullptr;
46     m_currFunction = nullptr;
47     func_llvm_floor_f32 = nullptr;
48 }
49 
runOnFunction(Function & F)50 bool LowPrecisionOpt::runOnFunction(Function& F)
51 {
52     m_changed = false;
53     CodeGenContextWrapper* pCtxWrapper = &getAnalysis<CodeGenContextWrapper>();
54     CodeGenContext* ctx = pCtxWrapper->getCodeGenContext();
55 
56     MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
57 
58     if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
59     {
60         return m_changed;
61     }
62     llvm::IGCIRBuilder<> builder(F.getContext());
63     m_builder = &builder;
64     m_currFunction = &F;
65     shdrType = ctx->type;
66     bundles.clear();
67     m_simplifyAlu = true;
68     m_changeSample = false;
69     visit(F);
70     // change sampler only after we simplified fext + ftrunc
71     m_simplifyAlu = false;
72     m_changeSample = true;
73     visit(F);
74     std::sort(bundles.begin(), bundles.end(), cmpOperator);
75     auto bundleEnd = bundles.end();
76     for (auto bundle = bundles.begin(); bundle != bundleEnd; ++bundle)
77     {
78         (*bundle).cInst->moveBefore(&(*(m_currFunction->getEntryBlock().begin())));
79         (*bundle).fpTrunc->moveBefore(&(*(m_currFunction->getEntryBlock().begin())));
80     }
81     return m_changed;
82 }
83 
visitFPExtInst(llvm::FPExtInst & I)84 void LowPrecisionOpt::visitFPExtInst(llvm::FPExtInst& I)
85 {
86     if (!m_simplifyAlu)
87     {
88         return;
89     }
90     if (I.getOperand(0)->getType()->isHalfTy())
91     {
92         Instruction* I0 = dyn_cast<Instruction>(I.getOperand(0));
93         llvm::GenIntrinsicInst* callInst = llvm::dyn_cast<llvm::GenIntrinsicInst>(I.getOperand(0));
94 
95         if (I0 && I0->getOpcode() == Instruction::FPTrunc && I.getDestTy() == I0->getOperand(0)->getType())
96         {
97             I.replaceAllUsesWith(I0->getOperand(0));
98             I.eraseFromParent();
99             m_changed = true;
100         }
101         else if (callInst && callInst->hasOneUse())
102         {
103             GenISAIntrinsic::ID ID = callInst->getIntrinsicID();
104             if (ID == GenISAIntrinsic::GenISA_DCL_ShaderInputVec || ID == GenISAIntrinsic::GenISA_DCL_inputVec)
105             {
106                 /*
107                 Catches a pattern where we have a lowp input, then extend it back up. This
108                 generates mixed mode instructions and so it's better to keep it as PLN.
109                 Example if it's used directly in the sample instruction before CNL.
110                 */
111 
112                 if (m_func_llvm_GenISA_DCL_inputVec_f32 == nullptr)
113                 {
114                     m_func_llvm_GenISA_DCL_inputVec_f32 = llvm::GenISAIntrinsic::getDeclaration(
115                         m_currFunction->getParent(),
116                         ID,
117                         Type::getFloatTy(m_builder->getContext()));
118                 }
119 
120                 m_builder->SetInsertPoint(callInst);
121                 Value* v = m_builder->CreateCall2(m_func_llvm_GenISA_DCL_inputVec_f32, callInst->getOperand(0), callInst->getOperand(1));
122 #if VALUE_NAME_ENABLE
123                 v->setName(callInst->getName());
124 #endif
125                 I.replaceAllUsesWith(v);
126                 I.eraseFromParent();
127                 callInst->eraseFromParent();
128                 m_changed = true;
129             }
130         }
131     }
132 }
133 
visitFPTruncInst(llvm::FPTruncInst & I)134 void LowPrecisionOpt::visitFPTruncInst(llvm::FPTruncInst& I)
135 {
136     if (!m_simplifyAlu)
137     {
138         return;
139     }
140     llvm::GenIntrinsicInst* cInst = llvm::dyn_cast<llvm::GenIntrinsicInst>(I.getOperand(0));
141 
142     if (cInst &&
143         cInst->getIntrinsicID() == GenISAIntrinsic::GenISA_RuntimeValue)
144     {
145         if (!IGC_IS_FLAG_ENABLED(HoistPSConstBufferValues) ||
146             shdrType != ShaderType::PIXEL_SHADER)
147             return;
148         moveBundle bundle;
149         bundle.index = (uint)llvm::cast<llvm::ConstantInt>(cInst->getOperand(0))->getZExtValue();
150         bundle.cInst = cInst;
151         bundle.fpTrunc = &I;
152         bundles.push_back(bundle);
153     }
154 }
155 
156 // If all the uses of a sampler instruction are converted to a different floating point type
157 // try to propagate the type in the sampler
propagateSamplerType(llvm::GenIntrinsicInst & I)158 bool LowPrecisionOpt::propagateSamplerType(llvm::GenIntrinsicInst& I)
159 {
160     if (IGC_IS_FLAG_DISABLED(UpConvertF16Sampler) && cast<VectorType>(I.getType())->getElementType()->isHalfTy())
161     {
162         return false;
163     }
164 
165     IGC::CodeGenContext& CGContext = *getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
166     if (!CGContext.platform.supportFP16())
167     {
168         return false;
169     }
170 
171     Type* eltTy = NULL;
172     bool isFloatType = false;
173 
174     if (I.getType()->isVectorTy())
175     {
176         eltTy = cast<VectorType>(I.getType())->getElementType();
177         isFloatType = cast<VectorType>(I.getType())->getElementType()->isFloatTy();
178     }
179     else
180     {
181         eltTy = I.getType();
182         isFloatType = I.getType()->isFloatTy();
183     }
184 
185     Type* newDstType = nullptr;
186     if (eltTy->isFloatingPointTy())
187     {
188         // check that all uses are extractelement followed by fpext
189         newDstType = isFloatType ?
190             m_builder->getHalfTy() : m_builder->getFloatTy();
191         for (auto use = I.user_begin(); use != I.user_end(); ++use)
192         {
193             auto extractElt = dyn_cast<ExtractElementInst>(*use);
194             if (!(extractElt && extractElt->hasOneUse()))
195             {
196                 return false;
197             }
198             auto fpExtOrTrunc = dyn_cast<CastInst>(*extractElt->user_begin());
199 
200             if (!(fpExtOrTrunc && fpExtOrTrunc->getType() == newDstType))
201             {
202                 return false;
203             }
204         }
205     }
206     else if (eltTy == m_builder->getInt32Ty())
207     {
208         // check if we can lower the sampler return to 16-bit
209         newDstType = m_builder->getInt16Ty();
210         for (auto use = I.user_begin(); use != I.user_end(); ++use)
211         {
212             auto extractElt = dyn_cast<ExtractElementInst>(*use);
213             if (!(extractElt && extractElt->hasOneUse()))
214             {
215                 return false;
216             }
217             auto isUpperBitClear = [this](User* U)
218             {
219                 // match the pattern
220                 // %scalar59 = extractelement <4 x i32> % 83, i32 3
221                 // % 84 = and i32 %scalar59, 65535
222                 if (U->getType() != m_builder->getInt32Ty())
223                 {
224                     return false;
225                 }
226                 auto andInst = dyn_cast<BinaryOperator>(U);
227                 if (!andInst || andInst->getOpcode() != BinaryOperator::And)
228                 {
229                     return false;
230                 }
231                 auto andSrc1 = dyn_cast<ConstantInt>(andInst->getOperand(1));
232                 if (!andSrc1 || andSrc1->getZExtValue() != 0xFFFF)
233                 {
234                     return false;
235                 }
236                 return true;
237             };
238 
239             auto Use = *extractElt->user_begin();
240             bool isInt32to16Trunc = dyn_cast<TruncInst>(Use) && Use->getType() == m_builder->getInt16Ty();
241             if (!isInt32to16Trunc && !isUpperBitClear(Use))
242             {
243                 return false;
244             }
245         }
246     }
247     else
248     {
249         return false;
250     }
251 
252     unsigned int numberOfElements = 1;
253 
254     if (I.getType()->isVectorTy())
255     {
256         numberOfElements = int_cast<unsigned int>(cast<IGCLLVM::FixedVectorType>(I.getType())->getNumElements());
257     }
258 
259     llvm::SmallVector<llvm::Type*, 4> overloadTys;
260     auto retTy = IGCLLVM::FixedVectorType::get(newDstType, numberOfElements);
261     overloadTys.push_back(retTy);
262     auto ID = I.getIntrinsicID();
263     switch (ID)
264     {
265     case GenISAIntrinsic::GenISA_sampleptr:
266     case GenISAIntrinsic::GenISA_sampleBptr:
267     case GenISAIntrinsic::GenISA_sampleCptr:
268     case GenISAIntrinsic::GenISA_sampleDptr:
269     case GenISAIntrinsic::GenISA_sampleDCptr:
270     case GenISAIntrinsic::GenISA_sampleLptr:
271     case GenISAIntrinsic::GenISA_sampleLCptr:
272     case GenISAIntrinsic::GenISA_sampleBCptr:
273         // 4 overloaded tys: ret, arg0, resource, sampler
274         overloadTys.push_back(I.getArgOperand(0)->getType());
275         overloadTys.push_back(cast<SampleIntrinsic>(&I)->getTextureValue()->getType());
276         overloadTys.push_back(cast<SampleIntrinsic>(&I)->getSamplerValue()->getType());
277         break;
278     case GenISAIntrinsic::GenISA_ldptr:
279         overloadTys.push_back(cast<SamplerLoadIntrinsic>(&I)->getTextureValue()->getType());
280         break;
281     case GenISAIntrinsic::GenISA_ldmsptr:
282         overloadTys.push_back(cast<SamplerLoadIntrinsic>(&I)->getTextureValue()->getType());
283         break;
284     case GenISAIntrinsic::GenISA_gather4ptr:
285     case GenISAIntrinsic::GenISA_gather4Cptr:
286     case GenISAIntrinsic::GenISA_gather4POptr:
287     case GenISAIntrinsic::GenISA_gather4POCptr:
288         // 4 overloaded tys: ret, arg0, resource, sampler
289         overloadTys.push_back(I.getArgOperand(0)->getType());
290         overloadTys.push_back(cast<SamplerGatherIntrinsic>(&I)->getTextureValue()->getType());
291         overloadTys.push_back(cast<SamplerGatherIntrinsic>(&I)->getSamplerValue()->getType());
292         break;
293     default:
294         return false;
295     }
296 
297     Function* newSample = GenISAIntrinsic::getDeclaration(
298         m_currFunction->getParent(), I.getIntrinsicID(), overloadTys);
299     llvm::SmallVector<llvm::Value*, 8> newArgs;
300     for (unsigned int i = 0, argSize = I.getNumArgOperands(); i < argSize; i++)
301     {
302         newArgs.push_back(I.getArgOperand(i));
303     }
304     m_builder->SetInsertPoint(&I);
305     auto newCall = m_builder->CreateCall(newSample, newArgs);
306 
307     for (auto use = I.user_begin(); use != I.user_end(); ++use)
308     {
309         ExtractElementInst* extractElt = cast<ExtractElementInst>(*use);
310         m_builder->SetInsertPoint(extractElt);
311 
312         Value* extractUse = *extractElt->user_begin();
313         Value* newExtract = m_builder->CreateExtractElement(newCall, extractElt->getIndexOperand());
314         if (extractUse->getType()->isFloatingPointTy())
315         {
316             extractUse->replaceAllUsesWith(newExtract);
317         }
318         else
319         {
320             if (dyn_cast<TruncInst>(extractUse))
321             {
322                 // replace trunc with new extractElt
323                 extractUse->replaceAllUsesWith(newExtract);
324             }
325             else
326             {
327                 // replace and with zext
328                 Value* zextInst = m_builder->CreateZExt(newExtract, m_builder->getInt32Ty());
329                 extractUse->replaceAllUsesWith(zextInst);
330             }
331         }
332     }
333     return true;
334 }
335 
visitIntrinsicInst(llvm::IntrinsicInst & I)336 void LowPrecisionOpt::visitIntrinsicInst(llvm::IntrinsicInst& I)
337 {
338     if (!m_simplifyAlu)
339     {
340         return;
341     }
342     if (I.getIntrinsicID() != llvm::Intrinsic::floor ||
343         I.getType() != Type::getHalfTy(m_builder->getContext()))
344         return;
345 
346     auto src = I.getOperand(0);
347     m_builder->SetInsertPoint(&I);
348 
349     auto fpTrunc = llvm::dyn_cast <llvm::FPTruncInst>(src);
350     if (fpTrunc)
351     {
352         src = fpTrunc->getOperand(0);
353     }
354     else
355     {
356         src = m_builder->CreateFPExt(src, m_builder->getFloatTy());
357     }
358 
359     if (!func_llvm_floor_f32)
360         func_llvm_floor_f32 = llvm::Intrinsic::getDeclaration(m_currFunction->getParent(), Intrinsic::floor, m_builder->getFloatTy());
361 
362     auto floor32 = m_builder->CreateCall(func_llvm_floor_f32, src);
363 #if VALUE_NAME_ENABLE
364     floor32->setName(I.getName());
365 #endif
366 
367     if (I.hasOneUse())
368     {
369         auto hfSub = llvm::dyn_cast<llvm::BinaryOperator>(*I.user_begin());
370 
371         if (hfSub && hfSub->getOpcode() == llvm::Instruction::BinaryOps::FSub)
372         {
373             if (hfSub->getOperand(0) == I.getOperand(0))
374             {
375                 auto fSub = m_builder->CreateFSub(src, floor32, hfSub->getName());
376                 auto fpdst = m_builder->CreateFPTrunc(fSub, Type::getHalfTy(m_builder->getContext()));
377                 hfSub->replaceAllUsesWith(fpdst);
378             }
379         }
380     }
381     else
382     {
383         auto fpdst = m_builder->CreateFPTrunc(floor32, Type::getHalfTy(m_builder->getContext()));
384         I.replaceAllUsesWith(fpdst);
385         I.eraseFromParent();
386     }
387 
388 }
389 
390 /*FP16SamplerOptimization*/
visitCallInst(CallInst & I)391 void LowPrecisionOpt::visitCallInst(CallInst& I)
392 {
393     if (!m_changeSample)
394     {
395         return;
396     }
397     if (isSampleLoadGather4InfoInstruction(&I))
398     {
399         bool changed = propagateSamplerType(*cast<GenIntrinsicInst>(&I));
400         if (changed)
401         {
402             return;
403         }
404     }
405 }
406