1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "Compiler/LowPrecisionOptPass.hpp"
10 #include "Compiler/CodeGenContextWrapper.hpp"
11 #include "Compiler/CodeGenPublic.h"
12 #include "Compiler/CISACodeGen/helper.h"
13 #include "Compiler/IGCPassSupport.h"
14
15 #include "llvmWrapper/IR/DerivedTypes.h"
16 #include "common/LLVMWarningsPush.hpp"
17 #include <llvm/Support/CommandLine.h>
18 #include <llvm/IR/Instructions.h>
19 #include <llvm/IR/InstIterator.h>
20 #include "common/LLVMWarningsPop.hpp"
21
22 #include "GenISAIntrinsics/GenIntrinsicInst.h"
23 #include "common/IGCIRBuilder.h"
24 using namespace llvm;
25 using namespace IGC;
26 using namespace IGC::IGCMD;
27 using namespace GenISAIntrinsic;
28
29 char LowPrecisionOpt::ID = 0;
30
31 // Register pass to igc-opt
32 #define PASS_FLAG "igc-low-precision-opt"
33 #define PASS_DESCRIPTION "Low Precision Opt"
34 #define PASS_CFG_ONLY false
35 #define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(LowPrecisionOpt,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)36 IGC_INITIALIZE_PASS_BEGIN(LowPrecisionOpt, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
37 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
38 IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
39 IGC_INITIALIZE_PASS_END(LowPrecisionOpt, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
40
41 LowPrecisionOpt::LowPrecisionOpt() : FunctionPass(ID)
42 {
43 initializeLowPrecisionOptPass(*PassRegistry::getPassRegistry());
44 m_func_llvm_GenISA_DCL_inputVec_f16 = nullptr;
45 m_func_llvm_GenISA_DCL_inputVec_f32 = nullptr;
46 m_currFunction = nullptr;
47 func_llvm_floor_f32 = nullptr;
48 }
49
runOnFunction(Function & F)50 bool LowPrecisionOpt::runOnFunction(Function& F)
51 {
52 m_changed = false;
53 CodeGenContextWrapper* pCtxWrapper = &getAnalysis<CodeGenContextWrapper>();
54 CodeGenContext* ctx = pCtxWrapper->getCodeGenContext();
55
56 MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
57
58 if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
59 {
60 return m_changed;
61 }
62 llvm::IGCIRBuilder<> builder(F.getContext());
63 m_builder = &builder;
64 m_currFunction = &F;
65 shdrType = ctx->type;
66 bundles.clear();
67 m_simplifyAlu = true;
68 m_changeSample = false;
69 visit(F);
70 // change sampler only after we simplified fext + ftrunc
71 m_simplifyAlu = false;
72 m_changeSample = true;
73 visit(F);
74 std::sort(bundles.begin(), bundles.end(), cmpOperator);
75 auto bundleEnd = bundles.end();
76 for (auto bundle = bundles.begin(); bundle != bundleEnd; ++bundle)
77 {
78 (*bundle).cInst->moveBefore(&(*(m_currFunction->getEntryBlock().begin())));
79 (*bundle).fpTrunc->moveBefore(&(*(m_currFunction->getEntryBlock().begin())));
80 }
81 return m_changed;
82 }
83
visitFPExtInst(llvm::FPExtInst & I)84 void LowPrecisionOpt::visitFPExtInst(llvm::FPExtInst& I)
85 {
86 if (!m_simplifyAlu)
87 {
88 return;
89 }
90 if (I.getOperand(0)->getType()->isHalfTy())
91 {
92 Instruction* I0 = dyn_cast<Instruction>(I.getOperand(0));
93 llvm::GenIntrinsicInst* callInst = llvm::dyn_cast<llvm::GenIntrinsicInst>(I.getOperand(0));
94
95 if (I0 && I0->getOpcode() == Instruction::FPTrunc && I.getDestTy() == I0->getOperand(0)->getType())
96 {
97 I.replaceAllUsesWith(I0->getOperand(0));
98 I.eraseFromParent();
99 m_changed = true;
100 }
101 else if (callInst && callInst->hasOneUse())
102 {
103 GenISAIntrinsic::ID ID = callInst->getIntrinsicID();
104 if (ID == GenISAIntrinsic::GenISA_DCL_ShaderInputVec || ID == GenISAIntrinsic::GenISA_DCL_inputVec)
105 {
106 /*
107 Catches a pattern where we have a lowp input, then extend it back up. This
108 generates mixed mode instructions and so it's better to keep it as PLN.
109 Example if it's used directly in the sample instruction before CNL.
110 */
111
112 if (m_func_llvm_GenISA_DCL_inputVec_f32 == nullptr)
113 {
114 m_func_llvm_GenISA_DCL_inputVec_f32 = llvm::GenISAIntrinsic::getDeclaration(
115 m_currFunction->getParent(),
116 ID,
117 Type::getFloatTy(m_builder->getContext()));
118 }
119
120 m_builder->SetInsertPoint(callInst);
121 Value* v = m_builder->CreateCall2(m_func_llvm_GenISA_DCL_inputVec_f32, callInst->getOperand(0), callInst->getOperand(1));
122 #if VALUE_NAME_ENABLE
123 v->setName(callInst->getName());
124 #endif
125 I.replaceAllUsesWith(v);
126 I.eraseFromParent();
127 callInst->eraseFromParent();
128 m_changed = true;
129 }
130 }
131 }
132 }
133
visitFPTruncInst(llvm::FPTruncInst & I)134 void LowPrecisionOpt::visitFPTruncInst(llvm::FPTruncInst& I)
135 {
136 if (!m_simplifyAlu)
137 {
138 return;
139 }
140 llvm::GenIntrinsicInst* cInst = llvm::dyn_cast<llvm::GenIntrinsicInst>(I.getOperand(0));
141
142 if (cInst &&
143 cInst->getIntrinsicID() == GenISAIntrinsic::GenISA_RuntimeValue)
144 {
145 if (!IGC_IS_FLAG_ENABLED(HoistPSConstBufferValues) ||
146 shdrType != ShaderType::PIXEL_SHADER)
147 return;
148 moveBundle bundle;
149 bundle.index = (uint)llvm::cast<llvm::ConstantInt>(cInst->getOperand(0))->getZExtValue();
150 bundle.cInst = cInst;
151 bundle.fpTrunc = &I;
152 bundles.push_back(bundle);
153 }
154 }
155
156 // If all the uses of a sampler instruction are converted to a different floating point type
157 // try to propagate the type in the sampler
propagateSamplerType(llvm::GenIntrinsicInst & I)158 bool LowPrecisionOpt::propagateSamplerType(llvm::GenIntrinsicInst& I)
159 {
160 if (IGC_IS_FLAG_DISABLED(UpConvertF16Sampler) && cast<VectorType>(I.getType())->getElementType()->isHalfTy())
161 {
162 return false;
163 }
164
165 IGC::CodeGenContext& CGContext = *getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
166 if (!CGContext.platform.supportFP16())
167 {
168 return false;
169 }
170
171 Type* eltTy = NULL;
172 bool isFloatType = false;
173
174 if (I.getType()->isVectorTy())
175 {
176 eltTy = cast<VectorType>(I.getType())->getElementType();
177 isFloatType = cast<VectorType>(I.getType())->getElementType()->isFloatTy();
178 }
179 else
180 {
181 eltTy = I.getType();
182 isFloatType = I.getType()->isFloatTy();
183 }
184
185 Type* newDstType = nullptr;
186 if (eltTy->isFloatingPointTy())
187 {
188 // check that all uses are extractelement followed by fpext
189 newDstType = isFloatType ?
190 m_builder->getHalfTy() : m_builder->getFloatTy();
191 for (auto use = I.user_begin(); use != I.user_end(); ++use)
192 {
193 auto extractElt = dyn_cast<ExtractElementInst>(*use);
194 if (!(extractElt && extractElt->hasOneUse()))
195 {
196 return false;
197 }
198 auto fpExtOrTrunc = dyn_cast<CastInst>(*extractElt->user_begin());
199
200 if (!(fpExtOrTrunc && fpExtOrTrunc->getType() == newDstType))
201 {
202 return false;
203 }
204 }
205 }
206 else if (eltTy == m_builder->getInt32Ty())
207 {
208 // check if we can lower the sampler return to 16-bit
209 newDstType = m_builder->getInt16Ty();
210 for (auto use = I.user_begin(); use != I.user_end(); ++use)
211 {
212 auto extractElt = dyn_cast<ExtractElementInst>(*use);
213 if (!(extractElt && extractElt->hasOneUse()))
214 {
215 return false;
216 }
217 auto isUpperBitClear = [this](User* U)
218 {
219 // match the pattern
220 // %scalar59 = extractelement <4 x i32> % 83, i32 3
221 // % 84 = and i32 %scalar59, 65535
222 if (U->getType() != m_builder->getInt32Ty())
223 {
224 return false;
225 }
226 auto andInst = dyn_cast<BinaryOperator>(U);
227 if (!andInst || andInst->getOpcode() != BinaryOperator::And)
228 {
229 return false;
230 }
231 auto andSrc1 = dyn_cast<ConstantInt>(andInst->getOperand(1));
232 if (!andSrc1 || andSrc1->getZExtValue() != 0xFFFF)
233 {
234 return false;
235 }
236 return true;
237 };
238
239 auto Use = *extractElt->user_begin();
240 bool isInt32to16Trunc = dyn_cast<TruncInst>(Use) && Use->getType() == m_builder->getInt16Ty();
241 if (!isInt32to16Trunc && !isUpperBitClear(Use))
242 {
243 return false;
244 }
245 }
246 }
247 else
248 {
249 return false;
250 }
251
252 unsigned int numberOfElements = 1;
253
254 if (I.getType()->isVectorTy())
255 {
256 numberOfElements = int_cast<unsigned int>(cast<IGCLLVM::FixedVectorType>(I.getType())->getNumElements());
257 }
258
259 llvm::SmallVector<llvm::Type*, 4> overloadTys;
260 auto retTy = IGCLLVM::FixedVectorType::get(newDstType, numberOfElements);
261 overloadTys.push_back(retTy);
262 auto ID = I.getIntrinsicID();
263 switch (ID)
264 {
265 case GenISAIntrinsic::GenISA_sampleptr:
266 case GenISAIntrinsic::GenISA_sampleBptr:
267 case GenISAIntrinsic::GenISA_sampleCptr:
268 case GenISAIntrinsic::GenISA_sampleDptr:
269 case GenISAIntrinsic::GenISA_sampleDCptr:
270 case GenISAIntrinsic::GenISA_sampleLptr:
271 case GenISAIntrinsic::GenISA_sampleLCptr:
272 case GenISAIntrinsic::GenISA_sampleBCptr:
273 // 4 overloaded tys: ret, arg0, resource, sampler
274 overloadTys.push_back(I.getArgOperand(0)->getType());
275 overloadTys.push_back(cast<SampleIntrinsic>(&I)->getTextureValue()->getType());
276 overloadTys.push_back(cast<SampleIntrinsic>(&I)->getSamplerValue()->getType());
277 break;
278 case GenISAIntrinsic::GenISA_ldptr:
279 overloadTys.push_back(cast<SamplerLoadIntrinsic>(&I)->getTextureValue()->getType());
280 break;
281 case GenISAIntrinsic::GenISA_ldmsptr:
282 overloadTys.push_back(cast<SamplerLoadIntrinsic>(&I)->getTextureValue()->getType());
283 break;
284 case GenISAIntrinsic::GenISA_gather4ptr:
285 case GenISAIntrinsic::GenISA_gather4Cptr:
286 case GenISAIntrinsic::GenISA_gather4POptr:
287 case GenISAIntrinsic::GenISA_gather4POCptr:
288 // 4 overloaded tys: ret, arg0, resource, sampler
289 overloadTys.push_back(I.getArgOperand(0)->getType());
290 overloadTys.push_back(cast<SamplerGatherIntrinsic>(&I)->getTextureValue()->getType());
291 overloadTys.push_back(cast<SamplerGatherIntrinsic>(&I)->getSamplerValue()->getType());
292 break;
293 default:
294 return false;
295 }
296
297 Function* newSample = GenISAIntrinsic::getDeclaration(
298 m_currFunction->getParent(), I.getIntrinsicID(), overloadTys);
299 llvm::SmallVector<llvm::Value*, 8> newArgs;
300 for (unsigned int i = 0, argSize = I.getNumArgOperands(); i < argSize; i++)
301 {
302 newArgs.push_back(I.getArgOperand(i));
303 }
304 m_builder->SetInsertPoint(&I);
305 auto newCall = m_builder->CreateCall(newSample, newArgs);
306
307 for (auto use = I.user_begin(); use != I.user_end(); ++use)
308 {
309 ExtractElementInst* extractElt = cast<ExtractElementInst>(*use);
310 m_builder->SetInsertPoint(extractElt);
311
312 Value* extractUse = *extractElt->user_begin();
313 Value* newExtract = m_builder->CreateExtractElement(newCall, extractElt->getIndexOperand());
314 if (extractUse->getType()->isFloatingPointTy())
315 {
316 extractUse->replaceAllUsesWith(newExtract);
317 }
318 else
319 {
320 if (dyn_cast<TruncInst>(extractUse))
321 {
322 // replace trunc with new extractElt
323 extractUse->replaceAllUsesWith(newExtract);
324 }
325 else
326 {
327 // replace and with zext
328 Value* zextInst = m_builder->CreateZExt(newExtract, m_builder->getInt32Ty());
329 extractUse->replaceAllUsesWith(zextInst);
330 }
331 }
332 }
333 return true;
334 }
335
visitIntrinsicInst(llvm::IntrinsicInst & I)336 void LowPrecisionOpt::visitIntrinsicInst(llvm::IntrinsicInst& I)
337 {
338 if (!m_simplifyAlu)
339 {
340 return;
341 }
342 if (I.getIntrinsicID() != llvm::Intrinsic::floor ||
343 I.getType() != Type::getHalfTy(m_builder->getContext()))
344 return;
345
346 auto src = I.getOperand(0);
347 m_builder->SetInsertPoint(&I);
348
349 auto fpTrunc = llvm::dyn_cast <llvm::FPTruncInst>(src);
350 if (fpTrunc)
351 {
352 src = fpTrunc->getOperand(0);
353 }
354 else
355 {
356 src = m_builder->CreateFPExt(src, m_builder->getFloatTy());
357 }
358
359 if (!func_llvm_floor_f32)
360 func_llvm_floor_f32 = llvm::Intrinsic::getDeclaration(m_currFunction->getParent(), Intrinsic::floor, m_builder->getFloatTy());
361
362 auto floor32 = m_builder->CreateCall(func_llvm_floor_f32, src);
363 #if VALUE_NAME_ENABLE
364 floor32->setName(I.getName());
365 #endif
366
367 if (I.hasOneUse())
368 {
369 auto hfSub = llvm::dyn_cast<llvm::BinaryOperator>(*I.user_begin());
370
371 if (hfSub && hfSub->getOpcode() == llvm::Instruction::BinaryOps::FSub)
372 {
373 if (hfSub->getOperand(0) == I.getOperand(0))
374 {
375 auto fSub = m_builder->CreateFSub(src, floor32, hfSub->getName());
376 auto fpdst = m_builder->CreateFPTrunc(fSub, Type::getHalfTy(m_builder->getContext()));
377 hfSub->replaceAllUsesWith(fpdst);
378 }
379 }
380 }
381 else
382 {
383 auto fpdst = m_builder->CreateFPTrunc(floor32, Type::getHalfTy(m_builder->getContext()));
384 I.replaceAllUsesWith(fpdst);
385 I.eraseFromParent();
386 }
387
388 }
389
390 /*FP16SamplerOptimization*/
visitCallInst(CallInst & I)391 void LowPrecisionOpt::visitCallInst(CallInst& I)
392 {
393 if (!m_changeSample)
394 {
395 return;
396 }
397 if (isSampleLoadGather4InfoInstruction(&I))
398 {
399 bool changed = propagateSamplerType(*cast<GenIntrinsicInst>(&I));
400 if (changed)
401 {
402 return;
403 }
404 }
405 }
406